Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/arm64/arm64_emitter.h           |   12
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_00.c        |    6
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_0f.c        |    2
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_67.c        |    2
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_67_32.c     |    2
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_consts.c    |    2
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_consts.h    |    2
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_f20f.c      |    2
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_helper.c    |   22
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_helper.h    |   13
-rw-r--r--  src/dynarec/arm64/updateflags_arm64.c       |  135
-rw-r--r--  src/dynarec/arm64/updateflags_arm64_pass.c  | 1005
-rw-r--r--  src/dynarec/dynarec_arch.h                  |    2
-rw-r--r--  src/dynarec/dynarec_native_functions.c      |    2
14 files changed, 1183 insertions, 26 deletions
diff --git a/src/dynarec/arm64/arm64_emitter.h b/src/dynarec/arm64/arm64_emitter.h
index c1359e1b..9703ac95 100644
--- a/src/dynarec/arm64/arm64_emitter.h
+++ b/src/dynarec/arm64/arm64_emitter.h
@@ -47,10 +47,12 @@ int convert_bitmask(uint64_t bitmask);
 #define ADDx_REG(Rd, Rn, Rm)                EMIT(ADDSUB_REG_gen(1, 0, 0, 0b00, Rm, 0, Rn, Rd))
 #define ADDSx_REG(Rd, Rn, Rm)              FEMIT(ADDSUB_REG_gen(1, 0, 1, 0b00, Rm, 0, Rn, Rd))
 #define ADDx_REG_LSL(Rd, Rn, Rm, lsl)       EMIT(ADDSUB_REG_gen(1, 0, 0, 0b00, Rm, lsl, Rn, Rd))
+#define ADDx_REG_LSR(Rd, Rn, Rm, lsr)       EMIT(ADDSUB_REG_gen(1, 0, 0, 0b01, Rm, lsr, Rn, Rd))
 #define ADDz_REG_LSL(Rd, Rn, Rm, lsl)       EMIT(ADDSUB_REG_gen(rex.is32bits?0:1, 0, 0, 0b00, Rm, lsl, Rn, Rd))
 #define ADDw_REG(Rd, Rn, Rm)                EMIT(ADDSUB_REG_gen(0, 0, 0, 0b00, Rm, 0, Rn, Rd))
 #define ADDSw_REG(Rd, Rn, Rm)              FEMIT(ADDSUB_REG_gen(0, 0, 1, 0b00, Rm, 0, Rn, Rd))
 #define ADDw_REG_LSL(Rd, Rn, Rm, lsl)       EMIT(ADDSUB_REG_gen(0, 0, 0, 0b00, Rm, lsl, Rn, Rd))
+#define ADDw_REG_LSR(Rd, Rn, Rm, lsr)       EMIT(ADDSUB_REG_gen(0, 0, 0, 0b01, Rm, lsr, Rn, Rd))
 #define ADDSw_REG_LSL(Rd, Rn, Rm, lsl)     FEMIT(ADDSUB_REG_gen(0, 0, 1, 0b00, Rm, lsl, Rn, Rd))
 #define ADDxw_REG(Rd, Rn, Rm)               EMIT(ADDSUB_REG_gen(rex.w, 0, 0, 0b00, Rm, 0, Rn, Rd))
 #define ADDz_REG(Rd, Rn, Rm)                EMIT(ADDSUB_REG_gen(rex.is32bits?0:1, 0, 0, 0b00, Rm, 0, Rn, Rd))
@@ -120,9 +122,11 @@ int convert_bitmask(uint64_t bitmask);
 #define SBCSw_REG(Rd, Rn, Rm)      FEMIT(ADDSUBC_gen(0, 1, 1, Rm, Rn, Rd))
 #define SBCSxw_REG(Rd, Rn, Rm)     FEMIT(ADDSUBC_gen(rex.w, 1, 1, Rm, Rn, Rd))
 
-#define SUB_ext(sf, op, S, Rm, option, imm3, Rn, Rd)    ((sf)<<31 | (op)<<30 | (S)<<29 | 0b01011<<24 | 1<<21 | (Rm)<<16 | (option)<<13 | (imm3)<<10 | (Rn)<<5 | (Rd))
-#define SUBxw_UXTB(Rd, Rn, Rm)      EMIT(SUB_ext(rex.w, 1, 0, Rm, 0b000, 0, Rn, Rd))
-#define SUBw_UXTB(Rd, Rn, Rm)       EMIT(SUB_ext(0, 1, 0, Rm, 0b000, 0, Rn, Rd))
+#define ADDSUB_ext(sf, op, S, Rm, option, imm3, Rn, Rd)    ((sf)<<31 | (op)<<30 | (S)<<29 | 0b01011<<24 | 1<<21 | (Rm)<<16 | (option)<<13 | (imm3)<<10 | (Rn)<<5 | (Rd))
+#define SUBxw_UXTB(Rd, Rn, Rm)      EMIT(ADDSUB_ext(rex.w, 1, 0, Rm, 0b000, 0, Rn, Rd))
+#define SUBw_UXTB(Rd, Rn, Rm)       EMIT(ADDSUB_ext(0, 1, 0, Rm, 0b000, 0, Rn, Rd))
+#define ADDw_UXTH(Rd, Rn, Rm)       EMIT(ADDSUB_ext(0, 0, 0, Rm, 0b001, 0, Rn, Rd))
+#define ADDx_UXTW(Rd, Rn, Rm)       EMIT(ADDSUB_ext(1, 0, 0, Rm, 0b010, 0, Rn, Rd))
 
 // CCMP compare if cond is true, set nzcv if false
 #define CCMP_reg(sf, Rm, cond, Rn, nzcv)    ((sf)<<31 | 1<<30 | 1<<29 | 0b11010010<<21 | (Rm)<<16 | (cond)<<12 | (Rn)<<5 | (nzcv))
@@ -160,6 +164,8 @@ int convert_bitmask(uint64_t bitmask);
 
 #define LDS_gen(size, op1, imm12, Rn, Rt)       ((size)<<30 | 0b111<<27 | (op1)<<24 | 0b10<<22 | (imm12)<<10 | (Rn)<<5 | (Rt))
 #define LDRSW_U12(Rt, Rn, imm12)          EMIT(LDS_gen(0b10, 0b01, ((uint32_t)((imm12)>>2))&0xfff, Rn, Rt))
+#define LDRSH_U12(Rt, Rn, imm12)          EMIT(LDS_gen(0b01, 0b01, ((uint32_t)((imm12)>>1))&0xfff, Rn, Rt))
+#define LDRSB_U12(Rt, Rn, imm12)          EMIT(LDS_gen(0b00, 0b01, ((uint32_t)(imm12))&0xfff, Rn, Rt))
 
 #define LDR_REG_gen(size, Rm, option, S, Rn, Rt)    ((size)<<30 | 0b111<<27 | 0b01<<22 | 1<<21 | (Rm)<<16 | (option)<<13 | (S)<<12 | (0b10)<<10 | (Rn)<<5 | (Rt))
 #define LDRx_REG(Rt, Rn, Rm)            EMIT(LDR_REG_gen(0b11, Rm, 0b011, 0, Rn, Rt))
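
As a quick sanity check on the renamed ADDSUB_ext encoder, the bit packing can be reproduced stand-alone. The sketch below is not part of the patch; the expected word was worked out by hand from the A64 "add/sub (extended register)" encoding, so treat it as illustrative:

#include <assert.h>
#include <stdint.h>

/* same field packing as the ADDSUB_ext macro above */
static uint32_t addsub_ext(uint32_t sf, uint32_t op, uint32_t S, uint32_t Rm,
                           uint32_t option, uint32_t imm3, uint32_t Rn, uint32_t Rd)
{
    return (sf<<31) | (op<<30) | (S<<29) | (0b01011<<24) | (1<<21)
         | (Rm<<16) | (option<<13) | (imm3<<10) | (Rn<<5) | Rd;
}

int main(void)
{
    /* ADDx_UXTW(4, 4, 2) should encode "add x4, x4, w2, uxtw" */
    assert(addsub_ext(1, 0, 0, 2, 0b010, 0, 4, 4) == 0x8B224084u);
    return 0;
}
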
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c
index a547f2a1..90f633bd 100644
--- a/src/dynarec/arm64/dynarec_arm64_00.c
+++ b/src/dynarec/arm64/dynarec_arm64_00.c
@@ -1086,7 +1086,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     jump_to_next(dyn, addr+i8, 0, ninst, rex.is32bits); \
                 } else {                                                \
                     /* inside the block, cache transform */             \
-                    CacheTransform(dyn, ninst, cacheupd, x1, x2, x3);   \
+                    CacheTransform(dyn, ninst, cacheupd);               \
                     i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);\
                     SKIP_SEVL(i32);                                     \
                     B(i32);                                             \
@@ -3293,7 +3293,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         fpu_purgecache(dyn, ninst, 1, x1, x2, x3);      \
                     jump_to_next(dyn, addr+i8, 0, ninst, rex.is32bits); \
                 } else {                                                \
-                    CacheTransform(dyn, ninst, cacheupd, x1, x2, x3);   \
+                    CacheTransform(dyn, ninst, cacheupd);               \
                     i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);    \
                     SKIP_SEVL(i32);                                     \
                     Bcond(c__, i32);                                    \
@@ -3527,7 +3527,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     jump_to_next(dyn, j64, 0, ninst, rex.is32bits);
                 } else {
                     // inside the block
-                    CacheTransform(dyn, ninst, CHECK_CACHE(), x1, x2, x3);
+                    CacheTransform(dyn, ninst, CHECK_CACHE());
                     tmp = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);
                     SKIP_SEVL(tmp);
                     if(tmp==4) {
diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c
index ecb48925..205efd1e 100644
--- a/src/dynarec/arm64/dynarec_arm64_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_0f.c
@@ -1699,7 +1699,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         fpu_purgecache(dyn, ninst, 1, x1, x2, x3);      \
                     jump_to_next(dyn, j64, 0, ninst, rex.is32bits);     \
                 } else {                                                \
-                    CacheTransform(dyn, ninst, cacheupd, x1, x2, x3);   \
+                    CacheTransform(dyn, ninst, cacheupd);               \
                     i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);    \
                     SKIP_SEVL(i32);                                     \
                     B(i32);                                             \
diff --git a/src/dynarec/arm64/dynarec_arm64_67.c b/src/dynarec/arm64/dynarec_arm64_67.c
index c0eaf181..05b18944 100644
--- a/src/dynarec/arm64/dynarec_arm64_67.c
+++ b/src/dynarec/arm64/dynarec_arm64_67.c
@@ -1444,7 +1444,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         fpu_purgecache(dyn, ninst, 1, x1, x2, x3);      \
                     jump_to_next(dyn, addr+i8, 0, ninst, rex.is32bits); \
                 } else {                                                \
-                    CacheTransform(dyn, ninst, cacheupd, x1, x2, x3);   \
+                    CacheTransform(dyn, ninst, cacheupd);               \
                     i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);\
                     SKIP_SEVL(i32);                                     \
                     B(i32);                                             \
diff --git a/src/dynarec/arm64/dynarec_arm64_67_32.c b/src/dynarec/arm64/dynarec_arm64_67_32.c
index ec1fa1e5..bb394e71 100644
--- a/src/dynarec/arm64/dynarec_arm64_67_32.c
+++ b/src/dynarec/arm64/dynarec_arm64_67_32.c
@@ -100,7 +100,7 @@ uintptr_t dynarec64_67_32(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
                         fpu_purgecache(dyn, ninst, 1, x1, x2, x3);      \
                     jump_to_next(dyn, addr+i8, 0, ninst, rex.is32bits); \
                 } else {                                                \
-                    CacheTransform(dyn, ninst, cacheupd, x1, x2, x3);   \
+                    CacheTransform(dyn, ninst, cacheupd);               \
                     i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);\
                     SKIP_SEVL(i32);                                     \
                     B(i32);                                             \
diff --git a/src/dynarec/arm64/dynarec_arm64_consts.c b/src/dynarec/arm64/dynarec_arm64_consts.c
index d30c0bd2..5a6e6d78 100644
--- a/src/dynarec/arm64/dynarec_arm64_consts.c
+++ b/src/dynarec/arm64/dynarec_arm64_consts.c
@@ -100,7 +100,7 @@ uintptr_t getConst(arm64_consts_t which)
         case const_helper_getcpu: return (uintptr_t)helper_getcpu;
         case const_cpuid: return (uintptr_t)my_cpuid;
         case const_getsegmentbase: return (uintptr_t)GetSegmentBaseEmu;
-        case const_updateflags: return (uintptr_t)UpdateFlags;
+        case const_updateflags_arm64: return (uintptr_t)create_updateflags();
         case const_reset_fpu: return (uintptr_t)reset_fpu;
         case const_sha1msg2: return (uintptr_t)sha1msg2;
         case const_sha1rnds4: return (uintptr_t)sha1rnds4;
diff --git a/src/dynarec/arm64/dynarec_arm64_consts.h b/src/dynarec/arm64/dynarec_arm64_consts.h
index 9ba65aca..2543699e 100644
--- a/src/dynarec/arm64/dynarec_arm64_consts.h
+++ b/src/dynarec/arm64/dynarec_arm64_consts.h
@@ -64,7 +64,7 @@ typedef enum arm64_consts_s {
     const_helper_getcpu,
     const_cpuid,
     const_getsegmentbase,
-    const_updateflags,
+    const_updateflags_arm64,
     const_reset_fpu,
     const_sha1msg2,
     const_sha1rnds4,
diff --git a/src/dynarec/arm64/dynarec_arm64_f20f.c b/src/dynarec/arm64/dynarec_arm64_f20f.c
index bd103e93..a60da0b6 100644
--- a/src/dynarec/arm64/dynarec_arm64_f20f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f20f.c
@@ -468,7 +468,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         fpu_purgecache(dyn, ninst, 1, x1, x2, x3);      \
                     jump_to_next(dyn, j64, 0, ninst, rex.is32bits);     \
                 } else {                                                \
-                    CacheTransform(dyn, ninst, cacheupd, x1, x2, x3);   \
+                    CacheTransform(dyn, ninst, cacheupd);               \
                     i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);    \
                     SKIP_SEVL(i32);                                     \
                     B(i32);                                             \
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 74f353cd..d71603d4 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -2492,7 +2492,7 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
     }
     MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
 }
-static void flagsCacheTransform(dynarec_arm_t* dyn, int ninst, int s1)
+static void flagsCacheTransform(dynarec_arm_t* dyn, int ninst)
 {
     int j64;
     int jmp = dyn->insts[ninst].x64.jmp_insts;
@@ -2516,15 +2516,16 @@ static void flagsCacheTransform(dynarec_arm_t* dyn, int ninst, int s1)
     }
     if(go) {
         if(dyn->f.pending!=SF_PENDING) {
-            LDRw_U12(s1, xEmu, offsetof(x64emu_t, df));
+            LDRw_U12(x1, xEmu, offsetof(x64emu_t, df));
             j64 = (GETMARKF2)-(dyn->native_size);
-            CBZw(s1, j64);
+            CBZw(x1, j64);
         }
         if(dyn->insts[ninst].need_nat_flags)
-            MRS_nzcv(s1);
-        CALL_(const_updateflags, -1, s1);
+            MRS_nzcv(x6);
+        TABLE64C(x1, const_updateflags_arm64);
+        BLR(x1);
         if(dyn->insts[ninst].need_nat_flags)
-            MSR_nzcv(s1);
+            MSR_nzcv(x6);
         MARKF2;
     }
 }
@@ -2607,13 +2608,14 @@ static void nativeFlagsTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2)
     MESSAGE(LOG_DUMP, "\t---- Native Flags transform\n");
 }
 
-void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3) {
+// Might use all Scratch registers!
+void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd) {
     if(cacheupd&1)
-        flagsCacheTransform(dyn, ninst, s1);
+        flagsCacheTransform(dyn, ninst);
     if(cacheupd&2)
-        fpuCacheTransform(dyn, ninst, s1, s2, s3);
+        fpuCacheTransform(dyn, ninst, x1, x2, x3);
     if(cacheupd&4)
-        nativeFlagsTransform(dyn, ninst, s1, s2);
+        nativeFlagsTransform(dyn, ninst, x1, x2);
 }
 
 void fpu_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
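
The transform now claims its own scratch registers (x1..x3, with x6 holding NZCV across the call), so callers stop passing s1..s3. The host-flags save/restore around the BLR boils down to the pattern below, shown as an AArch64-only GCC/Clang inline-asm sketch with a stand-in for the emitted block (not box64 code):

#include <stdint.h>
#include <stdio.h>

static void updateflags_block(void) { puts("flags recomputed"); } /* stand-in */

static void call_preserving_nzcv(void)
{
    uint64_t nzcv;
    __asm__ volatile("mrs %0, nzcv" : "=r"(nzcv) : : "memory");       /* like MRS_nzcv(x6) */
    updateflags_block();                                              /* TABLE64C + BLR    */
    __asm__ volatile("msr nzcv, %0" : : "r"(nzcv) : "cc", "memory");  /* like MSR_nzcv(x6) */
}

int main(void) { call_preserving_nzcv(); return 0; }
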
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 8ba2407f..aa67cc49 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -4,6 +4,7 @@
 // undef to get Close to SSE Float->int conversions
 //#define PRECISE_CVT
 
+#ifndef STEP_PASS
 #if STEP == 0
 #include "dynarec_arm64_pass0.h"
 #elif STEP == 1
@@ -13,6 +14,8 @@
 #elif STEP == 3
 #include "dynarec_arm64_pass3.h"
 #endif
+#define STEP_PASS
+#endif
 
 #include "debug.h"
 #include "arm64_emitter.h"
@@ -1136,7 +1139,8 @@
         MOVZw(S, (N));                                                                                                          \
         STRw_U12(S, xEmu, offsetof(x64emu_t, df));                                                                              \
         if (dyn->f.pending == SF_PENDING && dyn->insts[ninst].x64.need_after && !(dyn->insts[ninst].x64.need_after & X_PEND)) { \
-            CALL_I(const_updateflags);                                                                                          \
+            TABLE64C(x6, const_updateflags_arm64);                                                                              \
+            BLR(x6);                                                                                                            \
             dyn->f.pending = SF_SET;                                                                                            \
             SET_NODF();                                                                                                         \
         }                                                                                                                       \
@@ -1158,7 +1162,8 @@
             j64 = (GETMARKF)-(dyn->native_size);        \
             CBZw(x3, j64);                              \
         }                                               \
-        CALL_I(const_updateflags);                      \
+        TABLE64C(x6, const_updateflags_arm64);          \
+        BLR(x6);                                        \
         MARKF;                                          \
         dyn->f.pending = SF_SET;                        \
         SET_DFOK();                                     \
@@ -1287,6 +1292,8 @@
 
 #define native_pass        STEPNAME(native_pass)
 
+#define updateflags_pass   STEPNAME(updateflags_pass)
+
 #define dynarec64_00       STEPNAME(dynarec64_00)
 #define dynarec64_0F       STEPNAME(dynarec64_0F)
 #define dynarec64_64       STEPNAME(dynarec64_64)
@@ -1650,7 +1657,7 @@ int sse_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
 // purge ymm_zero mask according to purge_ymm
 void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, uint16_t mask, int s1);
 
-void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3);
+void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd);
 
 void arm64_move32(dynarec_arm_t* dyn, int ninst, int reg, uint32_t val);
 void arm64_move64(dynarec_arm_t* dyn, int ninst, int reg, uint64_t val);
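
The new STEP_PASS guard lets updateflags_arm64_pass.c define its own EMIT/SETMARK and still include dynarec_helper.h without pulling in the regular dynarec_arm64_passN.h header. The passes themselves rely on the usual STEPNAME token-pasting trick, compiling one body once per pass; a minimal stand-alone illustration (illustrative names, not box64's build):

#include <stdio.h>

#define STEP 2                      /* selected per compilation of the body */
#define STEPNAME3(a, b) a##b
#define STEPNAME2(a, b) STEPNAME3(a, b)
#define STEPNAME(a)     STEPNAME2(a, STEP)

/* one body, one symbol per pass: updateflags_pass0 .. updateflags_pass3 */
static void STEPNAME(updateflags_pass)(void)
{
    printf("running pass %d\n", STEP);
}

int main(void)
{
    STEPNAME(updateflags_pass)();   /* expands to updateflags_pass2() */
    return 0;
}
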
diff --git a/src/dynarec/arm64/updateflags_arm64.c b/src/dynarec/arm64/updateflags_arm64.c
new file mode 100644
index 00000000..2e2af8e9
--- /dev/null
+++ b/src/dynarec/arm64/updateflags_arm64.c
@@ -0,0 +1,135 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+
+#include "os.h"
+#include "debug.h"
+#include "box64context.h"
+#include "custommem.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynablock.h"
+#include "../dynablock_private.h"
+
+#include "dynarec_native.h"
+#include "../dynarec_arch.h"
+
+void updateflags_pass0(dynarec_arm_t* dyn, uint64_t jmp_df[]);
+void updateflags_pass1(dynarec_arm_t* dyn, uint64_t jmp_df[]);
+void updateflags_pass2(dynarec_arm_t* dyn, uint64_t jmp_df[]);
+void updateflags_pass3(dynarec_arm_t* dyn, uint64_t jmp_df[]);
+
+static dynablock_t* updateflags_arm64 = NULL;
+
+static uint8_t dummy_code[] = {0x90, 0xc3}; // dummy x86 code (NOP; RET) so the UpdateFlags dynablock points to something
+
+void* create_updateflags()
+{
+    if(updateflags_arm64)
+        return updateflags_arm64->block;
+    uint64_t jmp_df[d_unknown+1] = {0};
+    dynarec_arm_t helper = {0};
+    instruction_arm64_t insts[1] = {0};
+    helper.insts = insts;
+    helper.need_dump = BOX64ENV(dynarec_dump);
+    helper.cap = 1;
+    helper.f.dfnone = 1;
+    helper.f.pending = SF_NODF;
+    helper.insts[0].x64.gen_flags = X_ALL;
+    // pass 0
+    updateflags_pass0(&helper, jmp_df);
+    // check if all flags are handled
+    int ok = 1;
+    for(int i=d_none; i<d_unknown; ++i)
+        if(!jmp_df[i]) {
+            printf_log(LOG_NONE, "Error, UpdateFlags case %d is not handled, will crash later\n", i);
+            ok = 0;
+        }
+    // pass 1
+    updateflags_pass1(&helper, jmp_df);
+    // pass 2
+    helper.native_size = 0;
+    updateflags_pass2(&helper, jmp_df);
+    // alloc memory for pass3
+    size_t native_size = (helper.native_size+7)&~7;   // round the size...
+    size_t sz = sizeof(void*) + native_size + helper.table64size*sizeof(uint64_t) + 4*sizeof(void*) +  0  +  0  +  0  + sizeof(dynablock_t) + 0;
+    //           dynablock_t*     block (arm insts)            table64               jmpnext code instsize arch callrets     dynablock      relocs
+    void* actual_p = (void*)AllocDynarecMap((uintptr_t)&dummy_code, sz, 1);   // arbitrary address
+    if(actual_p==NULL) {
+        dynarec_log(LOG_INFO, "AllocDynarecMap(%zu) failed, canceling UpdateFlags block\n", sz);
+        return NULL;
+    }
+    void* p = (void*)(((uintptr_t)actual_p) + sizeof(void*));
+    void* tablestart = p + native_size;
+    void* next = tablestart + helper.table64size*sizeof(uint64_t);
+    void* instsize = next + 4*sizeof(void*);
+    void* arch = instsize + 0;
+    void* callrets = arch + 0;
+    helper.block = p;
+    dynablock_t* block = (dynablock_t*)(callrets+0);
+    memset(block, 0, sizeof(dynablock_t));
+    void* relocs = helper.need_reloc?(block+1):NULL;
+    // fill the block
+    block->x64_addr = &dummy_code;
+    block->isize = 0;
+    block->actual_block = actual_p;
+    helper.relocs = relocs;
+    block->relocs = relocs;
+    block->table64size = helper.table64size;
+    helper.native_start = (uintptr_t)p;
+    helper.tablestart = (uintptr_t)tablestart;
+    helper.jmp_next = (uintptr_t)next+sizeof(void*);
+    helper.instsize = (instsize_t*)instsize;
+    *(dynablock_t**)actual_p = block;
+    helper.table64cap = helper.table64size;
+    helper.table64 = (uint64_t*)helper.tablestart;
+    helper.callrets = (callret_t*)callrets;
+    block->table64 = helper.table64;
+    helper.callret_size = 0;
+    // pass 3, emit (log emit native opcode)
+    if(helper.need_dump) {
+        dynarec_log(LOG_NONE, "%s%04d|Emitting %zu bytes for UpdateFlags", (helper.need_dump>1)?"\e[01;36m":"", GetTID(), helper.native_size);
+        PrintFunctionAddr(helper.start, " => ");
+        dynarec_log_prefix(0, LOG_NONE, "%s\n", (helper.need_dump>1)?"\e[m":"");
+    }
+    helper.native_size = 0;
+    updateflags_pass3(&helper, jmp_df);
+    helper.jmp_sz = helper.jmp_cap = 0;
+    helper.jmps = NULL;
+    // keep size of instructions for signal handling
+    block->instsize = instsize;
+    helper.table64 = NULL;
+    helper.instsize = NULL;
+    helper.predecessor = NULL;
+    block->size = sz;
+    block->isize = helper.size;
+    block->block = p;
+    block->jmpnext = next+sizeof(void*);
+    block->always_test = helper.always_test;
+    block->dirty = block->always_test;
+    block->is32bits = 0;
+    block->relocsize = helper.reloc_size*sizeof(uint32_t);
+    block->arch = NULL;
+    block->arch_size = 0;
+    block->callret_size = helper.callret_size;
+    block->callrets = helper.callrets;
+    block->native_size = native_size;
+    *(dynablock_t**)next = block;
+    *(void**)(next+3*sizeof(void*)) = NULL;
+    CreateJmpNext(block->jmpnext, next+3*sizeof(void*));
+    ClearCache(block->jmpnext, 4*sizeof(void*));
+    block->x64_size = 0;
+    // all done...
+    ClearCache(actual_p+sizeof(void*), native_size);   // need to clear the cache before execution...
+
+    updateflags_arm64 = block;
+    return block->block;
+}
\ No newline at end of file
diff --git a/src/dynarec/arm64/updateflags_arm64_pass.c b/src/dynarec/arm64/updateflags_arm64_pass.c
new file mode 100644
index 00000000..8b4dfe39
--- /dev/null
+++ b/src/dynarec/arm64/updateflags_arm64_pass.c
@@ -0,0 +1,1005 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+
+#include "os.h"
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "x64trace.h"
+#include "dynablock.h"
+#include "dynarec_native.h"
+#include "../dynablock_private.h"
+#include "custommem.h"
+#include "x64test.h"
+#include "pe_tools.h"
+
+#include "../dynarec_arch.h"
+
+#if STEP == 0
+    #define EMIT(A)         dyn->native_size+=4
+    #define SETMARK(A)      jmp_df[A] = 1
+#elif STEP == 1
+    #define EMIT(A)         do {} while (0)
+    #define SETMARK(A)      jmp_df[A] = 0
+#elif STEP == 2
+    #define EMIT(A)         dyn->native_size+=4
+    #define SETMARK(A)      jmp_df[A] = dyn->native_size
+#elif STEP == 3
+    #define MESSAGE(A, ...)                                                   \
+        do {                                                                  \
+            if (dyn->need_dump) dynarec_log_prefix(0, LOG_NONE, __VA_ARGS__); \
+        } while (0)
+    #define EMIT(A)                                         \
+        do{                                                 \
+            if(dyn->need_dump) print_opcode(dyn, ninst, (uint32_t)(A)); \
+            *(uint32_t*)(dyn->block) = (uint32_t)(A);       \
+            dyn->block += 4; dyn->native_size += 4;         \
+            dyn->insts[ninst].size2 += 4;                   \
+        }while(0)
+    #define SETMARK(A)      MESSAGE(LOG_DUMP, "Mark(%d)=%p\n", A, dyn->block)
+#else
+#error Meh!
+#endif
+#define STEP_PASS
+#include "../dynarec_helper.h"
+
+/*
+    Generates a dynablock that does UpdateFlags. On entry x0 must be the
+    x64emu_t* (xEmu); x1 is loaded with df.
+    It reads the current df, resets df to none, and jumps to the correct
+    handler through a static jump table.
+    Only the x1..x5 regs are used; no saving of SIMD regs is needed.
+    LR is used for the return, and x0 needs to be correctly set up as xEmu.
+    The native flags will be clobbered.
+*/
+
+void updateflags_pass(dynarec_arm_t* dyn, uint64_t jmp_df[])
+{
+    int ninst = 0;
+    rex_t rex = {0};
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, df));
+    STRw_U12(xZR, xEmu, offsetof(x64emu_t, df));
+    CMPSw_U12(x1, d_unknown);
+    Bcond(cLT, 4+4);
+    RET(xLR);
+    ADR_S20(x2, 4+8);
+    ADDx_REG_LSL(x1, x2, x1, 2);
+    BR(x1);
+    for(int i=d_none; i<d_unknown; ++i)
+        B(jmp_df[i] - dyn->native_size);
+SETMARK(d_none);
+    RET(xLR);
+SETMARK(d_add8);
+SETMARK(d_add8b);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRB_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    emit_add8(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_add16);
+SETMARK(d_add16b);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRH_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    emit_add16(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_add32);
+SETMARK(d_add32b);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    rex.w = 0;
+    emit_add32(dyn, ninst, rex, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_add64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRx_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    rex.w = 1;
+    emit_add32(dyn, ninst, rex, x1, x2, x3, x4);
+    rex.w = 0;
+    RET(xLR);
+SETMARK(d_and8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV32w(x2, 0xff);
+    emit_and8(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_and16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV32w(x2, 0xffff);
+    emit_and16(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_and32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV32w(x2, 0xffffffff);
+    rex.w = 0;
+    emit_and32(dyn, ninst, rex, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_and64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV64x(x2, 0xffffffffffffffffULL);
+    rex.w = 1;
+    emit_and32(dyn, ninst, rex, x1, x2, x3, x4);
+    rex.w = 0;
+    RET(xLR);
+SETMARK(d_dec8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    emit_dec8(dyn, ninst, x1, x3, x4);
+    RET(xLR);
+SETMARK(d_dec16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    emit_dec16(dyn, ninst, x1, x3, x4);
+    RET(xLR);
+SETMARK(d_dec32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    rex.w = 0;
+    emit_dec32(dyn, ninst, rex, x1, x3, x4);
+    RET(xLR);
+SETMARK(d_dec64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    rex.w = 1;
+    emit_dec32(dyn, ninst, rex, x1, x3, x4);
+    rex.w = 0;
+    RET(xLR);
+SETMARK(d_inc8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    emit_inc8(dyn, ninst, x1, x3, x4);
+    RET(xLR);
+SETMARK(d_inc16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    emit_inc16(dyn, ninst, x1, x3, x4);
+    RET(xLR);
+SETMARK(d_inc32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    rex.w = 0;
+    emit_inc32(dyn, ninst, rex, x1, x3, x4);
+    RET(xLR);
+SETMARK(d_inc64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    rex.w = 1;
+    emit_inc32(dyn, ninst, rex, x1, x3, x4);
+    rex.w = 0;
+    RET(xLR);
+SETMARK(d_imul8);
+    LDRSH_U12(x1, xEmu, offsetof(x64emu_t, res));
+    ASRxw(x2, x1, 8);
+    CMPSw_REG_ASR(x2, x1, 16);
+    CSETw(x3, cNE);
+    BFIw(xFlags, x3, F_CF, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    if(!BOX64ENV(cputype)) {
+        LSRw(x2, x1, 7);
+        BFIw(xFlags, x2, F_SF, 1);
+        BFCw(xFlags, F_ZF, 1);
+        BFCw(xFlags, F_AF, 1);
+        emit_pf(dyn, ninst, x1, x4);
+    }
+    RET(xLR);
+SETMARK(d_imul16);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    ASRw(x2, x1, 16);
+    CMPSw_REG_ASR(x2, x1, 31);
+    CSETw(x3, cNE);
+    BFIw(xFlags, x3, F_CF, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    if(!BOX64ENV(cputype)) {
+        LSRw(x2, x1, 15);
+        BFIw(xFlags, x2, F_SF, 1);
+        BFCw(xFlags, F_ZF, 1);
+        BFCw(xFlags, F_AF, 1);
+        emit_pf(dyn, ninst, x1, x4);
+    }
+    RET(xLR);
+SETMARK(d_imul32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    CMPSw_REG_ASR(x2, x1, 31);
+    CSETw(x3, cNE);
+    BFIw(xFlags, x3, F_CF, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    if(!BOX64ENV(cputype)) {
+        LSRw(x2, x1, 31);
+        BFIw(xFlags, x2, F_SF, 1);
+        BFCw(xFlags, F_ZF, 1);
+        BFCw(xFlags, F_AF, 1);
+        emit_pf(dyn, ninst, x1, x4);
+    }
+    RET(xLR);
+SETMARK(d_imul64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LDRx_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    CMPSx_REG_ASR(x2, x1, 63);
+    CSETw(x3, cNE);
+    BFIw(xFlags, x3, F_CF, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    if(!BOX64ENV(cputype)) {
+        LSRx(x2, x1, 63);
+        BFIw(xFlags, x2, F_SF, 1);
+        BFCw(xFlags, F_ZF, 1);
+        BFCw(xFlags, F_AF, 1);
+        emit_pf(dyn, ninst, x1, x4);
+    }
+    RET(xLR);
+SETMARK(d_or8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV32w(x2, 0);
+    emit_or8(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_or16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV32w(x2, 0);
+    emit_or16(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_or32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV32w(x2, 0);
+    rex.w = 0;
+    emit_or32(dyn, ninst, rex, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_or64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV64x(x2, 0);
+    rex.w = 1;
+    emit_or32(dyn, ninst, rex, x1, x2, x3, x4);
+    rex.w = 0;
+    RET(xLR);
+SETMARK(d_mul8);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, res));
+    CMPSw_REG_LSR(xZR, x1, 8);
+    CSETw(x3, cNE);
+    BFIw(xFlags, x3, F_CF, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    if(!BOX64ENV(cputype)) {
+        LSRw(x2, x1, 7);
+        BFIw(xFlags, x2, F_SF, 1);
+        BFCw(xFlags, F_ZF, 1);
+        BFCw(xFlags, F_AF, 1);
+        emit_pf(dyn, ninst, x1, x4);
+    }
+    RET(xLR);
+SETMARK(d_mul16);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    CMPSw_REG_LSR(xZR, x1, 16);
+    CSETw(x3, cNE);
+    BFIw(xFlags, x3, F_CF, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    if(!BOX64ENV(cputype)) {
+        LSRw(x2, x1, 15);
+        BFIw(xFlags, x2, F_SF, 1);
+        BFCw(xFlags, F_ZF, 1);
+        BFCw(xFlags, F_AF, 1);
+        emit_pf(dyn, ninst, x1, x4);
+    }
+    RET(xLR);
+SETMARK(d_mul32);
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    CMPSw_U12(x2, 0);
+    CSETw(x3, cNE);
+    BFIw(xFlags, x3, F_CF, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    if(!BOX64ENV(cputype)) {
+        LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+        LSRw(x2, x1, 31);
+        BFIw(xFlags, x2, F_SF, 1);
+        BFCw(xFlags, F_ZF, 1);
+        BFCw(xFlags, F_AF, 1);
+        emit_pf(dyn, ninst, x1, x4);
+    }
+    RET(xLR);
+SETMARK(d_mul64);
+    LDRx_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    CMPSx_U12(x2, 0);
+    CSETw(x3, cNE);
+    BFIw(xFlags, x3, F_CF, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    if(!BOX64ENV(cputype)) {
+        LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+        LSRx(x2, x1, 63);
+        BFIw(xFlags, x2, F_SF, 1);
+        BFCw(xFlags, F_ZF, 1);
+        BFCw(xFlags, F_AF, 1);
+        emit_pf(dyn, ninst, x1, x4);
+    }
+    RET(xLR);
+SETMARK(d_neg8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    emit_neg8(dyn, ninst, x1, x3, x4);
+    RET(xLR);
+SETMARK(d_neg16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    emit_neg16(dyn, ninst, x1, x3, x4);
+    RET(xLR);
+SETMARK(d_neg32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    rex.w = 0;
+    emit_neg32(dyn, ninst, rex, x1, x3, x4);
+    RET(xLR);
+SETMARK(d_neg64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    rex.w = 1;
+    emit_neg32(dyn, ninst, rex, x1, x3, x4);
+    rex.w = 0;
+    RET(xLR);
+SETMARK(d_shl8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRB_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    ANDSw_mask(x2, x2, 0, 4);
+    Bcond(cNE, 4+4);
+    RET(xLR);
+    emit_shl8(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_shl16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRH_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    ANDSw_mask(x2, x2, 0, 4);
+    Bcond(cNE, 4+4);
+    RET(xLR);
+    emit_shl16(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_shl32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    ANDSw_mask(x2, x2, 0, 4);
+    Bcond(cNE, 4+4);
+    RET(xLR);
+    rex.w = 0;
+    emit_shl32(dyn, ninst, rex, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_shl64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRx_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    ANDSw_mask(x2, x2, 0, 5);
+    Bcond(cNE, 4+4);
+    RET(xLR);
+    rex.w = 1;
+    emit_shl32(dyn, ninst, rex, x1, x2, x3, x4);
+    rex.w = 0;
+    RET(xLR);
+SETMARK(d_shr8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRB_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    ANDSw_mask(x2, x2, 0, 4);
+    Bcond(cNE, 4+4);
+    RET(xLR);
+    emit_shr8(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_shr16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRH_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    ANDSw_mask(x2, x2, 0, 4);
+    Bcond(cNE, 4+4);
+    RET(xLR);
+    emit_shr16(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_shr32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    ANDSw_mask(x2, x2, 0, 4);
+    Bcond(cNE, 4+4);
+    RET(xLR);
+    rex.w = 0;
+    emit_shr32(dyn, ninst, rex, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_shr64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRx_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    ANDSw_mask(x2, x2, 0, 5);
+    Bcond(cNE, 4+4);
+    RET(xLR);
+    rex.w = 1;
+    emit_shr32(dyn, ninst, rex, x1, x2, x3, x4);
+    rex.w = 0;
+    RET(xLR);
+SETMARK(d_sar8);
+    LDRSB_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRB_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    ANDSw_mask(x2, x2, 0, 4);
+    Bcond(cNE, 4+4);
+    RET(xLR);
+    emit_sar8(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_sar16);
+    LDRSH_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRH_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    ANDSw_mask(x2, x2, 0, 4);
+    Bcond(cNE, 4+4);
+    RET(xLR);
+    emit_sar16(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_sar32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    ANDSw_mask(x2, x2, 0, 4);
+    Bcond(cNE, 4+4);
+    RET(xLR);
+    rex.w = 0;
+    emit_sar32(dyn, ninst, rex, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_sar64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRx_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    ANDSw_mask(x2, x2, 0, 5);
+    Bcond(cNE, 4+4);
+    RET(xLR);
+    rex.w = 1;
+    emit_sar32(dyn, ninst, rex, x1, x2, x3, x4);
+    rex.w = 0;
+    RET(xLR);
+SETMARK(d_sub8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRB_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    emit_sub8(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_sub16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRH_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    emit_sub16(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_sub32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    rex.w = 0;
+    emit_sub32(dyn, ninst, rex, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_sub64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRx_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    rex.w = 1;
+    emit_sub32(dyn, ninst, rex, x1, x2, x3, x4);
+    rex.w = 0;
+    RET(xLR);
+SETMARK(d_xor8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV32w(x2, 0);
+    emit_xor8(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_xor16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV32w(x2, 0);
+    emit_xor16(dyn, ninst, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_xor32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV32w(x2, 0);
+    rex.w = 0;
+    emit_xor32(dyn, ninst, rex, x1, x2, x3, x4);
+    RET(xLR);
+SETMARK(d_xor64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV64x(x2, 0);
+    rex.w = 1;
+    emit_xor32(dyn, ninst, rex, x1, x2, x3, x4);
+    rex.w = 0;
+    RET(xLR);
+SETMARK(d_cmp8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRB_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5);
+    RET(xLR);
+SETMARK(d_cmp16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRH_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    emit_cmp16(dyn, ninst, x1, x2, x3, x4, x5);
+    RET(xLR);
+SETMARK(d_cmp32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    rex.w = 0;
+    emit_cmp32(dyn, ninst, rex, x1, x2, x3, x4, x5);
+    RET(xLR);
+SETMARK(d_cmp64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, op1));
+    LDRx_U12(x2, xEmu, offsetof(x64emu_t, op2));
+    rex.w = 1;
+    emit_cmp32(dyn, ninst, rex, x1, x2, x3, x4, x5);
+    rex.w = 0;
+    RET(xLR);
+SETMARK(d_tst8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV32w(x2, 0xff);
+    emit_test8(dyn, ninst, x1, x2, x3, x4, x5);
+    RET(xLR);
+SETMARK(d_tst16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV32w(x2, 0xffff);
+    emit_test16(dyn, ninst, x1, x2, x3, x4, x5);
+    RET(xLR);
+SETMARK(d_tst32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV32w(x2, 0xffffffff);
+    rex.w = 0;
+    emit_test32(dyn, ninst, rex, x1, x2, x3, x4, x5);
+    RET(xLR);
+SETMARK(d_tst64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+    MOV64x(x2, 0xffffffffffffffffULL);
+    rex.w = 1;
+    emit_test32(dyn, ninst, rex, x1, x2, x3, x4, x5);
+    rex.w = 0;
+    RET(xLR);
+// for ADC & SBB, the emit_adcX cannot be used because the CF state is not saved
+SETMARK(d_adc8);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, res));
+    BFXILw(xFlags, x1, 8, 1);   //F_CF
+    LSRw(x2, x1, 7);
+    BFIw(xFlags, x2, F_SF, 1);
+    TSTw_mask(x1, 0, 7);    // mask 0xff
+    CSETw(x2, cEQ);
+    BFIw(xFlags, x2, F_ZF, 1);
+    LDRB_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRB_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    ANDw_REG(x4, x2, x3);   // op1 & op2
+    ORRw_REG(x2, x2, x3);   // op1 | op2
+    BICw_REG(x2, x2, x1);   // ~res & (op1 | op2)
+    ORRw_REG(x2, x2, x4); // CC
+    LSRw(x3, x2, 3);
+    BFIw(xFlags, x3, F_AF, 1);
+    LSRw(x3, x2, 6);
+    EORw_REG_LSR(x3, x3, x3, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_adc16);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    BFXILw(xFlags, x1, 16, 1);   //F_CF
+    LSRw(x2, x1, 15);
+    BFIw(xFlags, x2, F_SF, 1);
+    TSTw_mask(x1, 0, 15);    // mask 0xffff
+    CSETw(x2, cEQ);
+    BFIw(xFlags, x2, F_ZF, 1);
+    LDRH_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRH_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    ANDw_REG(x4, x2, x3);   // op1 & op2
+    ORRw_REG(x2, x2, x3);   // op1 | op2
+    BICw_REG(x2, x2, x1);   // ~res & (op1 | op2)
+    ORRw_REG(x2, x2, x4); // CC
+    LSRw(x3, x2, 3);
+    BFIw(xFlags, x3, F_AF, 1);
+    LSRw(x3, x2, 14);
+    EORw_REG_LSR(x3, x3, x3, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_adc32);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+    BFXILx(xFlags, x1, 32, 1);   //F_CF
+    LSRw(x2, x1, 31);
+    BFIw(xFlags, x2, F_SF, 1);
+    TSTw_REG(x1, x1);
+    CSETw(x2, cEQ);
+    BFIw(xFlags, x2, F_ZF, 1);
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRw_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    ANDw_REG(x4, x2, x3);   // op1 & op2
+    ORRw_REG(x2, x2, x3);   // op1 | op2
+    BICw_REG(x2, x2, x1);   // ~res & (op1 | op2)
+    ORRw_REG(x2, x2, x4); // CC
+    LSRw(x3, x2, 3);
+    BFIw(xFlags, x3, F_AF, 1);
+    LSRw(x3, x2, 30);
+    EORw_REG_LSR(x3, x3, x3, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_adc32b);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LSRw(x2, x1, 31);
+    BFIw(xFlags, x2, F_SF, 1);
+    TSTw_REG(x1, x1);
+    CSETw(x2, cEQ);
+    BFIw(xFlags, x2, F_ZF, 1);
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRw_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    ADDw_REG(x4, x2, x3);
+    CMPSw_REG(x1, x4);
+    CSETw(x4, cNE);
+    ADDx_REG(x4, x4, x2);
+    ADDx_REG(x4, x4, x3);
+    BFXILx(xFlags, x4, 32, 1);   //F_CF
+    ANDw_REG(x4, x2, x3);   // op1 & op2
+    ORRw_REG(x2, x2, x3);   // op1 | op2
+    BICw_REG(x2, x2, x1);   // ~res & (op1 | op2)
+    ORRw_REG(x2, x2, x4); // CC
+    LSRw(x3, x2, 3);
+    BFIw(xFlags, x3, F_AF, 1);
+    LSRw(x3, x2, 30);
+    EORw_REG_LSR(x3, x3, x3, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_adc64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LSRx(x2, x1, 63);
+    BFIw(xFlags, x2, F_SF, 1);
+    TSTx_REG(x1, x1);
+    CSETw(x2, cEQ);
+    BFIw(xFlags, x2, F_ZF, 1);
+    LDRx_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRx_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    ADDx_REG(x4, x2, x3);
+    CMPSx_REG(x1, x4);
+    CSETw(x4, cNE);
+    ADDx_UXTW(x4, x4, x2);
+    ADDx_UXTW(x4, x4, x3);  // x4 = lo
+    LSRx(x4, x4, 32);
+    ADDx_REG_LSR(x4, x4, x2, 32);
+    ADDx_REG_LSR(x4, x4, x3, 32);   // hi
+    BFXILx(xFlags, x4, 32, 1);   //F_CF
+    ANDx_REG(x4, x2, x3);   // op1 & op2
+    ORRx_REG(x2, x2, x3);   // op1 | op2
+    BICx_REG(x2, x2, x1);   // ~res & (op1 | op2)
+    ORRx_REG(x2, x2, x4); // CC
+    LSRx(x3, x2, 3);
+    BFIx(xFlags, x3, F_AF, 1);
+    LSRx(x3, x2, 62);
+    EORw_REG_LSR(x3, x3, x3, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_sbb8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LSRw(x2, x1, 7);
+    BFIw(xFlags, x2, F_SF, 1);
+    TSTw_mask(x1, 0, 7);    // mask 0xff
+    CSETw(x2, cEQ);
+    BFIw(xFlags, x2, F_ZF, 1);
+    LDRB_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRB_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    BICw_REG(x4, x3, x2);   // ~op1 & op2
+    ORNw_REG(x2, x3, x2);   // ~op1 | op2
+    ANDw_REG(x2, x2, x1);   // res & (~op1 | op2)
+    ORRw_REG(x2, x2, x4); // CC
+    BFXILw(xFlags, x2, 7, 1);
+    LSRw(x3, x2, 3);
+    BFIw(xFlags, x3, F_AF, 1);
+    LSRw(x3, x2, 6);
+    EORw_REG_LSR(x3, x3, x3, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_sbb16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LSRw(x2, x1, 15);
+    BFIw(xFlags, x2, F_SF, 1);
+    TSTw_mask(x1, 0, 15);    // mask 0xffff
+    CSETw(x2, cEQ);
+    BFIw(xFlags, x2, F_ZF, 1);
+    LDRH_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRH_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    BICw_REG(x4, x3, x2);   // ~op1 & op2
+    ORNw_REG(x2, x3, x2);   // ~op1 | op2
+    ANDw_REG(x2, x2, x1);   // res & (~op1 | op2)
+    ORRw_REG(x2, x2, x4); // CC
+    BFXILw(xFlags, x2, 15, 1);
+    LSRw(x3, x2, 3);
+    BFIw(xFlags, x3, F_AF, 1);
+    LSRw(x3, x2, 14);
+    EORw_REG_LSR(x3, x3, x3, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_sbb32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LSRw(x2, x1, 31);
+    BFIw(xFlags, x2, F_SF, 1);
+    TSTw_REG(x1, x1);
+    CSETw(x2, cEQ);
+    BFIw(xFlags, x2, F_ZF, 1);
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRw_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    BICw_REG(x4, x3, x2);   // ~op1 & op2
+    ORNw_REG(x2, x3, x2);   // ~op1 | op2
+    ANDw_REG(x2, x2, x1);   // res & (~op1 | op2)
+    ORRw_REG(x2, x2, x4); // CC
+    BFXILw(xFlags, x2, 31, 1);
+    LSRw(x3, x2, 3);
+    BFIw(xFlags, x3, F_AF, 1);
+    LSRw(x3, x2, 30);
+    EORw_REG_LSR(x3, x3, x3, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_sbb64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LSRx(x2, x1, 63);
+    BFIw(xFlags, x2, F_SF, 1);
+    TSTx_REG(x1, x1);
+    CSETw(x2, cEQ);
+    BFIw(xFlags, x2, F_ZF, 1);
+    LDRx_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRx_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    BICx_REG(x4, x3, x2);   // ~op1 & op2
+    ORNx_REG(x2, x3, x2);   // ~op1 | op2
+    ANDx_REG(x2, x2, x1);   // res & (~op1 | op2)
+    ORRx_REG(x2, x2, x4); // CC
+    BFXILx(xFlags, x2, 63, 1);
+    LSRw(x3, x2, 3);
+    BFIw(xFlags, x3, F_AF, 1);
+    LSRx(x3, x2, 62);
+    EORw_REG_LSR(x3, x3, x3, 1);
+    BFIw(xFlags, x3, F_OF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_rol8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, res));
+    if(BOX64ENV(cputype)) {
+        EORw_REG_LSR(x2, x1, x1, 7);
+        BFIw(xFlags, x2, F_OF, 1);
+    } else {
+        LDRB_U12(x2, xEmu, offsetof(x64emu_t, op1));
+        LSLw_IMM(x3, x2, 6);
+        EORw_REG_LSR(x3, x3, x3, 1);
+        BFIw(xFlags, x3, F_OF, 1);
+    }
+    BFXILw(xFlags, x1, 0, 1);
+    RET(xLR);
+SETMARK(d_rol16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, res));
+    if(BOX64ENV(cputype)) {
+        EORw_REG_LSR(x2, x1, x1, 15);
+        BFIw(xFlags, x2, F_OF, 1);
+    } else {
+        LDRH_U12(x2, xEmu, offsetof(x64emu_t, op1));
+        LSLw_IMM(x3, x2, 14);
+        EORw_REG_LSR(x3, x3, x3, 1);
+        BFIw(xFlags, x3, F_OF, 1);
+    }
+    BFXILw(xFlags, x1, 0, 1);
+    RET(xLR);
+SETMARK(d_rol32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    if(BOX64ENV(cputype)) {
+        EORw_REG_LSR(x2, x1, x1, 31);
+        BFIw(xFlags, x2, F_OF, 1);
+    } else {
+        LDRw_U12(x2, xEmu, offsetof(x64emu_t, op1));
+        LSLw_IMM(x3, x2, 30);
+        EORw_REG_LSR(x3, x3, x3, 1);
+        BFIw(xFlags, x3, F_OF, 1);
+    }
+    BFXILw(xFlags, x1, 0, 1);
+    RET(xLR);
+SETMARK(d_rol64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+    if(BOX64ENV(cputype)) {
+        EORx_REG_LSR(x2, x1, x1, 63);
+        BFIw(xFlags, x2, F_OF, 1);
+    } else {
+        LDRx_U12(x2, xEmu, offsetof(x64emu_t, op1));
+        LSLx_IMM(x3, x2, 62);
+        EORw_REG_LSR(x3, x3, x3, 1);
+        BFIw(xFlags, x3, F_OF, 1);
+    }
+    BFXILw(xFlags, x1, 0, 1);
+    RET(xLR);
+SETMARK(d_ror8);
+    LDRB_U12(x1, xEmu, offsetof(x64emu_t, res));
+    if(BOX64ENV(cputype)) {
+        LSLw_IMM(x2, x1, 6);
+        EORw_REG_LSR(x3, x2, x2, 1);
+        BFIw(xFlags, x3, F_OF, 1);
+    } else {
+        LDRB_U12(x2, xEmu, offsetof(x64emu_t, op1));
+        EORw_REG_LSR(x3, x2, x2, 7);
+        BFIw(xFlags, x3, F_OF, 1);
+    }
+    BFXILw(xFlags, x1, 7, 1);
+    RET(xLR);
+SETMARK(d_ror16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, res));
+    if(BOX64ENV(cputype)) {
+        LSLw_IMM(x2, x1, 14);
+        EORw_REG_LSR(x3, x2, x2, 1);
+        BFIw(xFlags, x3, F_OF, 1);
+    } else {
+        LDRH_U12(x2, xEmu, offsetof(x64emu_t, op1));
+        EORw_REG_LSR(x3, x2, x2, 15);
+        BFIw(xFlags, x3, F_OF, 1);
+    }
+    BFXILw(xFlags, x1, 15, 1);
+    RET(xLR);
+SETMARK(d_ror32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    if(BOX64ENV(cputype)) {
+        LSLw_IMM(x2, x1, 30);
+        EORw_REG_LSR(x3, x2, x2, 1);
+        BFIw(xFlags, x3, F_OF, 1);
+    } else {
+        LDRw_U12(x2, xEmu, offsetof(x64emu_t, op1));
+        EORw_REG_LSR(x3, x2, x2, 31);
+        BFIw(xFlags, x3, F_OF, 1);
+    }
+    BFXILw(xFlags, x1, 31, 1);
+    RET(xLR);
+SETMARK(d_ror64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+    if(BOX64ENV(cputype)) {
+        LSLx_IMM(x2, x1, 62);
+        EORw_REG_LSR(x3, x2, x2, 1);
+        BFIw(xFlags, x3, F_OF, 1);
+    } else {
+        LDRx_U12(x2, xEmu, offsetof(x64emu_t, op1));
+        EORx_REG_LSR(x3, x2, x2, 63);
+        BFIw(xFlags, x3, F_OF, 1);
+    }
+    BFXILx(xFlags, x1, 63, 1);
+    RET(xLR);
+SETMARK(d_shrd16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LDRH_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRH_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    CBNZw(x3, 4+4);
+    RET(xLR);
+    if(BOX64ENV(cputype)) {
+        LSRw(x4, x1, 14);
+        EORw_REG_LSR(x4, x4, x4, 1);
+        BFIw(xFlags, x4, F_OF, 1);
+        ORRw_mask(xFlags, xFlags, 28, 0); // mask = 0x10
+    } else {
+        ANDw_mask(x4, x3, 0, 3);    // mask = 0x0f
+        SUBw_U12(x4, x4, 16);
+        NEGw_REG(x4, x4);
+        LSRw_REG(x4, x1, x4);
+        EORw_REG_LSR(x4, x4, x2, 15);
+        BFIw(xFlags, x4, F_OF, 1);
+        BFCw(xFlags, F_AF, 1);
+    }
+    SUBw_U12(x4, x3, 1);
+    LSRw_REG(x4, x2, x4);
+    if(BOX64ENV(cputype)) {
+        CMPSw_U12(x3, 15);
+        CSELw(x4, x4, xZR, cGT);
+    }
+    BFIw(xFlags, x4, F_CF, 1);
+    LSRw(x4, x1, 15);
+    BFIw(xFlags, x4, F_SF, 1);
+    TSTw_mask(x1, 0, 15);
+    CSETw(x4, cEQ);
+    BFIw(xFlags, x4, F_ZF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_shrd32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRw_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    CBNZw(x3, 4+4);
+    RET(xLR);
+    if(BOX64ENV(cputype)) {
+        LSRw(x4, x1, 30);
+        EORw_REG_LSR(x4, x4, x4, 1);
+        BFIw(xFlags, x4, F_OF, 1);
+        ORRw_mask(xFlags, xFlags, 28, 0); // mask = 0x10
+    } else {
+        SUBw_U12(x4, x3, 32);
+        NEGw_REG(x4, x4);
+        LSRw_REG(x4, x1, x4);
+        EORw_REG_LSR(x4, x4, x2, 31);
+        BFIw(xFlags, x4, F_OF, 1);
+        BFCw(xFlags, F_AF, 1);
+    }
+    SUBw_U12(x4, x3, 1);
+    LSRw_REG(x4, x2, x4);
+    BFIw(xFlags, x4, F_CF, 1);
+    LSRw(x4, x1, 31);
+    BFIw(xFlags, x4, F_SF, 1);
+    TSTw_REG(x1, x1);
+    CSETw(x4, cEQ);
+    BFIw(xFlags, x4, F_ZF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_shrd64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LDRx_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRx_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    CBNZw(x3, 4+4);
+    RET(xLR);
+    if(BOX64ENV(cputype)) {
+        LSRx(x4, x1, 62);
+        EORw_REG_LSR(x4, x4, x4, 1);
+        BFIw(xFlags, x4, F_OF, 1);
+        ORRw_mask(xFlags, xFlags, 28, 0); // mask = 0x10
+    } else {
+        SUBw_U12(x4, x3, 64);
+        NEGw_REG(x4, x4);
+        LSRx_REG(x4, x1, x4);
+        EORx_REG_LSR(x4, x4, x2, 63);
+        BFIw(xFlags, x4, F_OF, 1);
+        BFCw(xFlags, F_AF, 1);
+    }
+    SUBx_U12(x4, x3, 1);
+    LSRx_REG(x4, x2, x4);
+    BFIw(xFlags, x4, F_CF, 1);
+    LSRx(x4, x1, 63);
+    BFIw(xFlags, x4, F_SF, 1);
+    TSTx_REG(x1, x1);
+    CSETw(x4, cEQ);
+    BFIw(xFlags, x4, F_ZF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_shld16);
+    LDRH_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LDRH_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRH_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    CBNZw(x3, 4+4);
+    RET(xLR);
+    SUBw_U12(x4, x3, 16);
+    NEGw_REG(x4, x4);
+    LSRw_REG(x4, x2, x4);
+    BFIw(xFlags, x4, F_CF, 1);
+    if(BOX64ENV(cputype)) {
+        EORw_REG_LSR(x4, xFlags, x1, 15);
+        CMPSw_U12(x3, 15);
+        CSELw(x4, x4, xFlags, cGT);
+        BFIw(xFlags, x4, F_OF, 1);
+        ORRw_mask(xFlags, xFlags, 28, 0); // mask = 0x10
+    } else {
+        LSRw(x4, x2, 14);
+        EORw_REG_LSR(x4, x4, x4, 1);
+        BFIw(xFlags, x4, F_OF, 1);
+        BFCw(xFlags, F_AF, 1);
+    }
+    LSRw(x4, x1, 15);
+    BFIw(xFlags, x4, F_SF, 1);
+    TSTw_mask(x1, 0, 15);
+    CSETw(x4, cEQ);
+    BFIw(xFlags, x4, F_ZF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_shld32);
+    LDRw_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LDRw_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRw_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    CBNZw(x3, 4+4);
+    RET(xLR);
+    SUBw_U12(x4, x3, 32);
+    NEGw_REG(x4, x4);
+    LSRw_REG(x4, x2, x4);
+    BFIw(xFlags, x4, F_CF, 1);
+    if(BOX64ENV(cputype)) {
+        EORw_REG_LSR(x4, xFlags, x1, 31);
+        BFIw(xFlags, x4, F_OF, 1);
+        ORRw_mask(xFlags, xFlags, 28, 0); // mask = 0x10
+    } else {
+        LSRw(x4, x2, 30);
+        EORw_REG_LSR(x4, x4, x4, 1);
+        BFIw(xFlags, x4, F_OF, 1);
+        BFCw(xFlags, F_AF, 1);
+    }
+    LSRw(x4, x1, 31);
+    BFIw(xFlags, x4, F_SF, 1);
+    TSTw_REG(x1, x1);
+    CSETw(x4, cEQ);
+    BFIw(xFlags, x4, F_ZF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+SETMARK(d_shld64);
+    LDRx_U12(x1, xEmu, offsetof(x64emu_t, res));
+    LDRx_U12(x2, xEmu, offsetof(x64emu_t, op1));
+    LDRx_U12(x3, xEmu, offsetof(x64emu_t, op2));
+    CBNZx(x3, 4+4);
+    RET(xLR);
+    MOV32w(x4, 64);
+    SUBw_REG(x4, x4, x3);
+    LSRx_REG(x4, x2, x4);
+    BFIw(xFlags, x4, F_CF, 1);
+    if(BOX64ENV(cputype)) {
+        EORx_REG_LSR(x4, xFlags, x1, 63);
+        BFIw(xFlags, x4, F_OF, 1);
+        ORRw_mask(xFlags, xFlags, 28, 0); // mask = 0x10
+    } else {
+        LSRx(x4, x2, 62);
+        EORw_REG_LSR(x4, x4, x4, 1);
+        BFIw(xFlags, x4, F_OF, 1);
+        BFCw(xFlags, F_AF, 1);
+    }
+    LSRx(x4, x1, 63);
+    BFIw(xFlags, x4, F_SF, 1);
+    TSTx_REG(x1, x1);
+    CSETw(x4, cEQ);
+    BFIw(xFlags, x4, F_ZF, 1);
+    emit_pf(dyn, ninst, x1, x4);
+    RET(xLR);
+// all done!
+}
\ No newline at end of file
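
Control flow of the emitted block is a dense jump table keyed on the deferred-flags code: load df, reset it to d_none, bounds-check, then one indirect branch. The same shape in plain C, with an abbreviated stand-in enum and handlers rather than the real d_* list:

#include <stdio.h>

enum { d_none, d_add8, d_sub8, d_unknown };       /* abbreviated stand-in */

typedef struct { int df; } emu_t;

static void upd_none(emu_t* e) { (void)e; }
static void upd_add8(emu_t* e) { (void)e; puts("recompute ADD8 flags"); }
static void upd_sub8(emu_t* e) { (void)e; puts("recompute SUB8 flags"); }

static void (*const table[])(emu_t*) = { upd_none, upd_add8, upd_sub8 };

void update_flags(emu_t* e)
{
    int df = e->df;
    e->df = d_none;                               /* STRw_U12(xZR, ..., df) */
    if (df < 0 || df >= d_unknown) return;        /* CMPS + Bcond + RET     */
    table[df](e);                                 /* ADR + ADD LSL#2 + BR   */
}

int main(void)
{
    emu_t e = { .df = d_add8 };
    update_flags(&e);
    return 0;
}
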
diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h
index 761f8166..6df0b53c 100644
--- a/src/dynarec/dynarec_arch.h
+++ b/src/dynarec/dynarec_arch.h
@@ -34,6 +34,8 @@
 #define ARCH_UNALIGNED(A, B) arch_unaligned(A, B)
 extern uint32_t arm64_crc(void* p, uint32_t len);
 #define ARCH_CRC(A, B)  if(cpuext.crc32) return arm64_crc(A, B)
+extern void* create_updateflags();
+#define ARCH_UPDATEFLAGS()      create_updateflags()
 
 #define ARCH_NOP    0b11010101000000110010000000011111
 #define ARCH_UDF    0xcafe
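
ARCH_UPDATEFLAGS() gives the arch-generic dynarec a hook to obtain (and lazily build) the native helper. The call site is not part of this diff, so the consumer below is purely illustrative, with a stub standing in for create_updateflags():

#include <stdio.h>

static void* create_updateflags(void) { return (void*)0x1234; }  /* stub */
#define ARCH_UPDATEFLAGS() create_updateflags()

static void* resolve_updateflags(void)
{
#ifdef ARCH_UPDATEFLAGS
    return ARCH_UPDATEFLAGS();   /* arm64: the lazily emitted dynablock */
#else
    return NULL;                 /* arch without a native helper */
#endif
}

int main(void)
{
    printf("updateflags block at %p\n", resolve_updateflags());
    return 0;
}
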
diff --git a/src/dynarec/dynarec_native_functions.c b/src/dynarec/dynarec_native_functions.c
index 79e558d6..49ea4a40 100644
--- a/src/dynarec/dynarec_native_functions.c
+++ b/src/dynarec/dynarec_native_functions.c
@@ -38,7 +38,7 @@ void native_fstp(x64emu_t* emu, void* p)
 void native_print_armreg(x64emu_t* emu, uintptr_t reg, uintptr_t n)
 {
     (void)emu;
-    dynarec_log(LOG_DEBUG, "R%lu=0x%lx (%lu)\n", n, reg, reg);
+    dynarec_log(LOG_INFO, "Debug Register R%lu=0x%lx (%lu)\n", n, reg, reg);
 }
 
 void native_f2xm1(x64emu_t* emu)