about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_private.h 2
-rw-r--r-- src/dynarec/dynarec_native_pass.c 12
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_00_3.c 10
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_0f.c 10
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_f20f.c 4
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.h 1843
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_private.h 6
-rw-r--r-- src/dynarec/rv64/rv64_emitter.h 152
8 files changed, 1080 insertions, 959 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index 6ac79d7a..acbbe2e8 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -126,7 +126,7 @@ int Table64(dynarec_arm_t *dyn, uint64_t val, int pass);  // add a value to tabl
 
 void CreateJmpNext(void* addr, void* next);
 
-#define GO_TRACE(A, B)      \
+#define GO_TRACE(A, B, s0)  \
     GETIP(addr);            \
     MOVx_REG(x1, xRIP);     \
     STORE_XEMU_CALL(xRIP);  \
diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index f5ee2501..0562fb6b 100644
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -88,17 +88,17 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
         if(box64_dynarec_test) {
             MESSAGE(LOG_DUMP, "TEST INIT ----\n");
             fpu_reflectcache(dyn, ninst, x1, x2, x3);
-            GO_TRACE(x64test_init, 1);
+            GO_TRACE(x64test_init, 1, x1);
             fpu_unreflectcache(dyn, ninst, x1, x2, x3);
             MESSAGE(LOG_DUMP, "----------\n");
         }
 #ifdef HAVE_TRACE
         else if(my_context->dec && box64_dynarec_trace) {
-        if((trace_end == 0) 
+        if((trace_end == 0)
             || ((ip >= trace_start) && (ip < trace_end)))  {
                 MESSAGE(LOG_DUMP, "TRACE ----\n");
                 fpu_reflectcache(dyn, ninst, x1, x2, x3);
-                GO_TRACE(PrintTrace, 1);
+                GO_TRACE(PrintTrace, 1, x1);
                 fpu_unreflectcache(dyn, ninst, x1, x2, x3);
                 MESSAGE(LOG_DUMP, "----------\n");
             }
@@ -200,7 +200,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
             if(*(uint32_t*)addr!=0) {   // check if need to continue (but is next 4 bytes are 0, stop)
                 uintptr_t next = get_closest_next(dyn, addr);
                 if(next && (
-                    (((next-addr)<15) && is_nops(dyn, addr, next-addr)) 
+                    (((next-addr)<15) && is_nops(dyn, addr, next-addr))
                     /*||(((next-addr)<30) && is_instructions(dyn, addr, next-addr))*/ ))
                 {
                     ok = 1;
@@ -232,7 +232,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
             }
         #endif
         if(ok<0)  {
-            ok = 0; need_epilog=1; 
+            ok = 0; need_epilog=1;
             #if STEP == 0
             if(ninst) {
                 --ninst;
@@ -256,7 +256,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
         }
         ++ninst;
         #if STEP == 0
-        if(ok && (((box64_dynarec_bigblock<stopblock) && !isJumpTableDefault64((void*)addr)) 
+        if(ok && (((box64_dynarec_bigblock<stopblock) && !isJumpTableDefault64((void*)addr))
             || (addr>=box64_nodynarec_start && addr<box64_nodynarec_end)))
         #else
         if(ok && (ninst==dyn->size))
diff --git a/src/dynarec/rv64/dynarec_rv64_00_3.c b/src/dynarec/rv64/dynarec_rv64_00_3.c
index dece35af..efe58f61 100644
--- a/src/dynarec/rv64/dynarec_rv64_00_3.c
+++ b/src/dynarec/rv64/dynarec_rv64_00_3.c
@@ -319,7 +319,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         addr+=8+8;
                     } else {
                         GETIP(ip+1); // read the 0xCC
-                        STORE_XEMU_CALL();
+                        STORE_XEMU_CALL(x3);
                         ADDI(x1, xEmu, (uint32_t)offsetof(x64emu_t, ip)); // setup addr as &emu->ip
                         CALL_S(x64Int3, -1);
                         LOAD_XEMU_CALL();
@@ -360,7 +360,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 NOTEST(x1);
                 SMEND();
                 GETIP(addr);
-                STORE_XEMU_CALL();
+                STORE_XEMU_CALL(x3);
                 CALL_S(x86Syscall, -1);
                 LOAD_XEMU_CALL();
                 TABLE64(x3, addr); // expected return address
@@ -368,13 +368,13 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 LW(x1, xEmu, offsetof(x64emu_t, quit));
                 BEQ_NEXT(x1, xZR);
                 MARK;
-                LOAD_XEMU_REM();
+                LOAD_XEMU_REM(x3);
                 jump_to_epilog(dyn, 0, xRIP, ninst);
             } else {
                 INST_NAME("INT n");
                 SETFLAGS(X_ALL, SF_SET); // Hack to set flags in "don't care" state
                 GETIP(ip);
-                STORE_XEMU_CALL();
+                STORE_XEMU_CALL(x3);
                 CALL(native_priv, -1);
                 LOAD_XEMU_CALL();
                 jump_to_epilog(dyn, 0, xRIP, ninst);
@@ -723,7 +723,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         dyn->last_ip = addr;
                     } else {
                         GETIP_(dyn->insts[ninst].natcall); // read the 0xCC already
-                        STORE_XEMU_CALL();
+                        STORE_XEMU_CALL(x3);
                         ADDI(x1, xEmu, (uint32_t)offsetof(x64emu_t, ip)); // setup addr as &emu->ip
                         CALL_S(x64Int3, -1);
                         LOAD_XEMU_CALL();
diff --git a/src/dynarec/rv64/dynarec_rv64_0f.c b/src/dynarec/rv64/dynarec_rv64_0f.c
index d7e4eeb8..32211e31 100644
--- a/src/dynarec/rv64/dynarec_rv64_0f.c
+++ b/src/dynarec/rv64/dynarec_rv64_0f.c
@@ -64,7 +64,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             addr = fakeed(dyn, addr, ninst, nextop);
             SETFLAGS(X_ALL, SF_SET); // Hack to set flags in "don't care" state
             GETIP(ip);
-            STORE_XEMU_CALL();
+            STORE_XEMU_CALL(x3);
             CALL(native_ud, -1);
             LOAD_XEMU_CALL();
             jump_to_epilog(dyn, 0, xRIP, ninst);
@@ -77,7 +77,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             NOTEST(x1);
             SMEND();
             GETIP(addr);
-            STORE_XEMU_CALL();
+            STORE_XEMU_CALL(x3);
             CALL_S(x64Syscall, -1);
             LOAD_XEMU_CALL();
             TABLE64(x3, addr); // expected return address
@@ -85,7 +85,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             LW(w1, xEmu, offsetof(x64emu_t, quit));
             CBZ_NEXT(w1);
             MARK;
-            LOAD_XEMU_REM();
+            LOAD_XEMU_REM(x3);
             jump_to_epilog(dyn, 0, xRIP, ninst);
             break;
 
@@ -93,7 +93,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("WBINVD");
             SETFLAGS(X_ALL, SF_SET); // Hack to set flags in "don't care" state
             GETIP(ip);
-            STORE_XEMU_CALL();
+            STORE_XEMU_CALL(x3);
             CALL(native_ud, -1);
             LOAD_XEMU_CALL();
             jump_to_epilog(dyn, 0, xRIP, ninst);
@@ -105,7 +105,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("UD2");
             SETFLAGS(X_ALL, SF_SET); // Hack to set flags in "don't care" state
             GETIP(ip);
-            STORE_XEMU_CALL();
+            STORE_XEMU_CALL(x3);
             CALL(native_ud, -1);
             LOAD_XEMU_CALL();
             jump_to_epilog(dyn, 0, xRIP, ninst);
diff --git a/src/dynarec/rv64/dynarec_rv64_f20f.c b/src/dynarec/rv64/dynarec_rv64_f20f.c
index ac3da811..e2ce3aac 100644
--- a/src/dynarec/rv64/dynarec_rv64_f20f.c
+++ b/src/dynarec/rv64/dynarec_rv64_f20f.c
@@ -152,7 +152,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     addr = fakeed(dyn, addr, ninst, nextop);
                     SETFLAGS(X_ALL, SF_SET);    // Hack to set flags in "don't care" state
                     GETIP(ip);
-                    STORE_XEMU_CALL();
+                    STORE_XEMU_CALL(x3);
                     CALL(native_ud, -1);
                     LOAD_XEMU_CALL();
                     jump_to_epilog(dyn, 0, xRIP, ninst);
@@ -165,7 +165,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     addr = fakeed(dyn, addr, ninst, nextop);
                     SETFLAGS(X_ALL, SF_SET);    // Hack to set flags in "don't care" state
                     GETIP(ip);
-                    STORE_XEMU_CALL();
+                    STORE_XEMU_CALL(x3);
                     CALL(native_ud, -1);
                     LOAD_XEMU_CALL();
                     jump_to_epilog(dyn, 0, xRIP, ninst);
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 04a20308..02bc5211 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -20,439 +20,511 @@
 
 #define F8      *(uint8_t*)(addr++)
 #define F8S     *(int8_t*)(addr++)
-#define F16     *(uint16_t*)(addr+=2, addr-2)
-#define F16S    *(int16_t*)(addr+=2, addr-2)
-#define F32     *(uint32_t*)(addr+=4, addr-4)
-#define F32S    *(int32_t*)(addr+=4, addr-4)
-#define F32S64  (uint64_t)(int64_t)F32S
-#define F64     *(uint64_t*)(addr+=8, addr-8)
-#define PK(a)   *(uint8_t*)(addr+a)
-#define PK16(a)   *(uint16_t*)(addr+a)
-#define PK32(a)   *(uint32_t*)(addr+a)
-#define PK64(a)   *(uint64_t*)(addr+a)
-#define PKip(a)   *(uint8_t*)(ip+a)
+#define F16     *(uint16_t*)(addr += 2, addr - 2)
+#define F16S    *(int16_t*)(addr += 2, addr - 2)
+#define F32     *(uint32_t*)(addr += 4, addr - 4)
+#define F32S    *(int32_t*)(addr += 4, addr - 4)
+#define F32S64  (uint64_t)(int64_t) F32S
+#define F64     *(uint64_t*)(addr += 8, addr - 8)
+#define PK(a)   *(uint8_t*)(addr + a)
+#define PK16(a) *(uint16_t*)(addr + a)
+#define PK32(a) *(uint32_t*)(addr + a)
+#define PK64(a) *(uint64_t*)(addr + a)
+#define PKip(a) *(uint8_t*)(ip + a)
 
 
 // Strong mem emulation helpers
 // Sequence of Read will trigger a DMB on "first" read if strongmem is 2
 // Squence of Write will trigger a DMB on "last" write if strongmem is 1
 // Opcode will read
-#define SMREAD()    if(!dyn->smread && box64_dynarec_strongmem>1) {SMDMB();}
+#define SMREAD() \
+    if (!dyn->smread && box64_dynarec_strongmem > 1) { SMDMB(); }
 // Opcode will read with option forced lock
-#define SMREADLOCK(lock)    if(lock || (!dyn->smread && box64_dynarec_strongmem>1)) {SMDMB();}
+#define SMREADLOCK(lock) \
+    if (lock || (!dyn->smread && box64_dynarec_strongmem > 1)) { SMDMB(); }
 // Opcode migh read (depend on nextop)
-#define SMMIGHTREAD()   if(!MODREG) {SMREAD();}
+#define SMMIGHTREAD() \
+    if (!MODREG) { SMREAD(); }
 // Opcode has wrote
-#define SMWRITE()   dyn->smwrite=1
+#define SMWRITE() dyn->smwrite = 1
 // Opcode has wrote (strongmem>1 only)
-#define SMWRITE2()   if(box64_dynarec_strongmem>1) dyn->smwrite=1
+#define SMWRITE2() \
+    if (box64_dynarec_strongmem > 1) dyn->smwrite = 1
 // Opcode has wrote with option forced lock
-#define SMWRITELOCK(lock)   if(lock) {SMDMB();} else dyn->smwrite=1
+#define SMWRITELOCK(lock) \
+    if (lock) {           \
+        SMDMB();          \
+    } else                \
+        dyn->smwrite = 1
 // Opcode migh have wrote (depend on nextop)
-#define SMMIGHTWRITE()   if(!MODREG) {SMWRITE();}
+#define SMMIGHTWRITE() \
+    if (!MODREG) { SMWRITE(); }
 // Start of sequence
-#define SMSTART()   SMEND()
+#define SMSTART() SMEND()
 // End of sequence
-#define SMEND()     if(dyn->smwrite && box64_dynarec_strongmem) {FENCE();} dyn->smwrite=0; dyn->smread=0;
+#define SMEND()                                               \
+    if (dyn->smwrite && box64_dynarec_strongmem) { FENCE(); } \
+    dyn->smwrite = 0;                                         \
+    dyn->smread = 0;
 // Force a Data memory barrier (for LOCK: prefix)
-#define SMDMB()     FENCE(); dyn->smwrite=0; dyn->smread=1
+#define SMDMB()       \
+    FENCE();          \
+    dyn->smwrite = 0; \
+    dyn->smread = 1
 
-//LOCK_* define
-#define LOCK_LOCK   (int*)1
+// LOCK_* define
+#define LOCK_LOCK (int*)1
 
 // GETGD    get x64 register in gd
-#define GETGD   gd = xRAX+((nextop&0x38)>>3)+(rex.r<<3)
+#define GETGD gd = xRAX + ((nextop & 0x38) >> 3) + (rex.r << 3)
 // GETED can use r1 for ed, and r2 for wback. wback is 0 if ed is xEAX..xEDI
-#define GETED(D)  if(MODREG) {                          \
-                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
-                    wback = 0;                          \
-                } else {                                \
-                    SMREAD()                            \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
-                    LDxw(x1, wback, fixedaddress);      \
-                    ed = x1;                            \
-                }
+#define GETED(D)                                                                                \
+    if (MODREG) {                                                                               \
+        ed = xRAX + (nextop & 7) + (rex.b << 3);                                                \
+        wback = 0;                                                                              \
+    } else {                                                                                    \
+        SMREAD()                                                                                \
+        addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
+        LDxw(x1, wback, fixedaddress);                                                          \
+        ed = x1;                                                                                \
+    }
 // GETSED can use r1 for ed, and r2 for wback. ed will be sign extended!
-#define GETSED(D)  if(MODREG) {                         \
-                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
-                    wback = 0;                          \
-                    if(!rex.w) {                        \
-                        ADDW(x1, ed, xZR);              \
-                        ed = x1;                        \
-                    }                                   \
-                } else {                                \
-                    SMREAD()                            \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
-                    if(rex.w)                           \
-                        LD(x1, wback, fixedaddress);    \
-                    else                                \
-                        LW(x1, wback, fixedaddress);    \
-                    ed = x1;                            \
-                }
+#define GETSED(D)                                                                               \
+    if (MODREG) {                                                                               \
+        ed = xRAX + (nextop & 7) + (rex.b << 3);                                                \
+        wback = 0;                                                                              \
+        if (!rex.w) {                                                                           \
+            ADDW(x1, ed, xZR);                                                                  \
+            ed = x1;                                                                            \
+        }                                                                                       \
+    } else {                                                                                    \
+        SMREAD()                                                                                \
+        addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
+        if (rex.w)                                                                              \
+            LD(x1, wback, fixedaddress);                                                        \
+        else                                                                                    \
+            LW(x1, wback, fixedaddress);                                                        \
+        ed = x1;                                                                                \
+    }
 // GETEDx can use r1 for ed, and r2 for wback. wback is 0 if ed is xEAX..xEDI
-#define GETEDx(D) if(MODREG) {                          \
-                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
-                    wback = 0;                          \
-                } else {                                \
-                    SMREAD()                            \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
-                    LD(x1, wback, fixedaddress);        \
-                    ed = x1;                            \
-                }
-#define GETEDz(D) if(MODREG) {                          \
-                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
-                    wback = 0;                          \
-                } else {                                \
-                    SMREAD()                            \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
-                    LDz(x1, wback, fixedaddress);       \
-                    ed = x1;                            \
-                }
+#define GETEDx(D)                                                                               \
+    if (MODREG) {                                                                               \
+        ed = xRAX + (nextop & 7) + (rex.b << 3);                                                \
+        wback = 0;                                                                              \
+    } else {                                                                                    \
+        SMREAD()                                                                                \
+        addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
+        LD(x1, wback, fixedaddress);                                                            \
+        ed = x1;                                                                                \
+    }
+#define GETEDz(D)                                                                               \
+    if (MODREG) {                                                                               \
+        ed = xRAX + (nextop & 7) + (rex.b << 3);                                                \
+        wback = 0;                                                                              \
+    } else {                                                                                    \
+        SMREAD()                                                                                \
+        addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
+        LDz(x1, wback, fixedaddress);                                                           \
+        ed = x1;                                                                                \
+    }
 // GETED32 can use r1 for ed, and r2 for wback. wback is 0 if ed is xEAX..xEDI
-#define GETED32(D)  if(MODREG) {                        \
-                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
-                    wback = 0;                          \
-                } else {                                \
-                    SMREAD()                            \
-                    addr = geted32(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
-                    LDxw(x1, wback, fixedaddress);      \
-                    ed = x1;                            \
-                }
-//GETEDH can use hint for ed, and x1 or x2 for wback (depending on hint), might also use x3. wback is 0 if ed is xEAX..xEDI
-#define GETEDH(hint, D) if(MODREG) {                    \
-                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
-                    wback = 0;                          \
-                } else {                                \
-                    SMREAD();                           \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, (hint==x2)?x1:x2, (hint==x1)?x1:x3, &fixedaddress, rex, NULL, 1, D); \
-                    LDxw(hint, wback, fixedaddress);    \
-                    ed = hint;                          \
-                }
-//GETEDW can use hint for wback and ret for ed. wback is 0 if ed is xEAX..xEDI
-#define GETEDW(hint, ret, D)   if(MODREG) {             \
-                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
-                    MV(ret, ed);                        \
-                    wback = 0;                          \
-                } else {                                \
-                    SMREAD();                           \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, (hint==x2)?x1:x2, (hint==x1)?x1:x3, &fixedaddress, rex, NULL, 0, D); \
-                    ed = ret;                           \
-                    LDxw(ed, wback, fixedaddress);      \
-                }
+#define GETED32(D)                                                                                \
+    if (MODREG) {                                                                                 \
+        ed = xRAX + (nextop & 7) + (rex.b << 3);                                                  \
+        wback = 0;                                                                                \
+    } else {                                                                                      \
+        SMREAD()                                                                                  \
+        addr = geted32(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
+        LDxw(x1, wback, fixedaddress);                                                            \
+        ed = x1;                                                                                  \
+    }
+// GETEDH can use hint for ed, and x1 or x2 for wback (depending on hint), might also use x3. wback is 0 if ed is xEAX..xEDI
+#define GETEDH(hint, D)                                                                                                                 \
+    if (MODREG) {                                                                                                                       \
+        ed = xRAX + (nextop & 7) + (rex.b << 3);                                                                                        \
+        wback = 0;                                                                                                                      \
+    } else {                                                                                                                            \
+        SMREAD();                                                                                                                       \
+        addr = geted(dyn, addr, ninst, nextop, &wback, (hint == x2) ? x1 : x2, (hint == x1) ? x1 : x3, &fixedaddress, rex, NULL, 1, D); \
+        LDxw(hint, wback, fixedaddress);                                                                                                \
+        ed = hint;                                                                                                                      \
+    }
+// GETEDW can use hint for wback and ret for ed. wback is 0 if ed is xEAX..xEDI
+#define GETEDW(hint, ret, D)                                                                                                            \
+    if (MODREG) {                                                                                                                       \
+        ed = xRAX + (nextop & 7) + (rex.b << 3);                                                                                        \
+        MV(ret, ed);                                                                                                                    \
+        wback = 0;                                                                                                                      \
+    } else {                                                                                                                            \
+        SMREAD();                                                                                                                       \
+        addr = geted(dyn, addr, ninst, nextop, &wback, (hint == x2) ? x1 : x2, (hint == x1) ? x1 : x3, &fixedaddress, rex, NULL, 0, D); \
+        ed = ret;                                                                                                                       \
+        LDxw(ed, wback, fixedaddress);                                                                                                  \
+    }
 // GETGW extract x64 register in gd, that is i
-#define GETGW(i) gd = xRAX+((nextop&0x38)>>3)+(rex.r<<3); ZEXTH(i, gd); gd = i;
-//GETEWW will use i for ed, and can use w for wback.
-#define GETEWW(w, i, D) if(MODREG) {        \
-                    wback = xRAX+(nextop&7)+(rex.b<<3);\
-                    ZEXTH(i, wback);        \
-                    ed = i;                 \
-                    wb1 = 0;                \
-                } else {                    \
-                    SMREAD();               \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, w, i, &fixedaddress, rex, NULL, 1, D); \
-                    LHU(i, wback, fixedaddress);\
-                    ed = i;                 \
-                    wb1 = 1;                \
-                }
-//GETEW will use i for ed, and can use r3 for wback.
+#define GETGW(i)                                       \
+    gd = xRAX + ((nextop & 0x38) >> 3) + (rex.r << 3); \
+    ZEXTH(i, gd);                                      \
+    gd = i;
+// GETEWW will use i for ed, and can use w for wback.
+#define GETEWW(w, i, D)                                                                       \
+    if (MODREG) {                                                                             \
+        wback = xRAX + (nextop & 7) + (rex.b << 3);                                           \
+        ZEXTH(i, wback);                                                                      \
+        ed = i;                                                                               \
+        wb1 = 0;                                                                              \
+    } else {                                                                                  \
+        SMREAD();                                                                             \
+        addr = geted(dyn, addr, ninst, nextop, &wback, w, i, &fixedaddress, rex, NULL, 1, D); \
+        LHU(i, wback, fixedaddress);                                                          \
+        ed = i;                                                                               \
+        wb1 = 1;                                                                              \
+    }
+// GETEW will use i for ed, and can use r3 for wback.
 #define GETEW(i, D) GETEWW(x3, i, D)
-//GETSEW will use i for ed, and can use r3 for wback. This is the Signed version
-#define GETSEW(i, D) if(MODREG) {           \
-                    wback = xRAX+(nextop&7)+(rex.b<<3);\
-                    if(rv64_zbb) SEXTH(i, wback); else {SLLI(i, wback, 48); SRAI(i, i, 48);}\
-                    ed = i;                 \
-                    wb1 = 0;                \
-                } else {                    \
-                    SMREAD();               \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, i, &fixedaddress, rex, NULL, 1, D); \
-                    LH(i, wback, fixedaddress); \
-                    ed = i;                 \
-                    wb1 = 1;                \
-                }
+// GETSEW will use i for ed, and can use r3 for wback. This is the Signed version
+#define GETSEW(i, D)                                                                           \
+    if (MODREG) {                                                                              \
+        wback = xRAX + (nextop & 7) + (rex.b << 3);                                            \
+        if (rv64_zbb)                                                                          \
+            SEXTH(i, wback);                                                                   \
+        else {                                                                                 \
+            SLLI(i, wback, 48);                                                                \
+            SRAI(i, i, 48);                                                                    \
+        }                                                                                      \
+        ed = i;                                                                                \
+        wb1 = 0;                                                                               \
+    } else {                                                                                   \
+        SMREAD();                                                                              \
+        addr = geted(dyn, addr, ninst, nextop, &wback, x3, i, &fixedaddress, rex, NULL, 1, D); \
+        LH(i, wback, fixedaddress);                                                            \
+        ed = i;                                                                                \
+        wb1 = 1;                                                                               \
+    }
 // Write ed back to original register / memory
-#define EWBACK       EWBACKW(ed)
+#define EWBACK EWBACKW(ed)
 // Write w back to original register / memory (w needs to be 16bits only!)
-#define EWBACKW(w)   if(wb1) {SH(w, wback, fixedaddress); SMWRITE();} else {SRLI(wback, wback, 16); SLLI(wback, wback, 16); OR(wback, wback, w);}
+#define EWBACKW(w)                  \
+    if (wb1) {                      \
+        SH(w, wback, fixedaddress); \
+        SMWRITE();                  \
+    } else {                        \
+        SRLI(wback, wback, 16);     \
+        SLLI(wback, wback, 16);     \
+        OR(wback, wback, w);        \
+    }
 // Write back gd in correct register (gd needs to be 16bits only!)
-#define GWBACK       do{int g=xRAX+((nextop&0x38)>>3)+(rex.r<<3); SRLI(g, g, 16); SLLI(g, g, 16); OR(g, g, gd);}while(0)
-
-//GETEDO can use r1 for ed, and r2 for wback. wback is 0 if ed is xEAX..xEDI
-#define GETEDO(O, D, S)   if(MODREG) {                  \
-                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
-                    wback = 0;                          \
-                } else {                                \
-                    SMREAD();                           \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, S, &fixedaddress, rex, NULL, 1, D); \
-                    ADD(S, wback, O);                   \
-                    LDxw(x1, S, fixedaddress);          \
-                    ed = x1;                            \
-                }
-#define WBACKO(O)   if(wback) {ADD(O, wback, O); SDxw(ed, O, 0); SMWRITE2();}
+#define GWBACK                                                \
+    do {                                                      \
+        int g = xRAX + ((nextop & 0x38) >> 3) + (rex.r << 3); \
+        SRLI(g, g, 16); /* with SLLI: clears bits 15..0 */    \
+        SLLI(g, g, 16);                                       \
+        OR(g, g, gd); /* merge the 16-bit value in gd */      \
+    } while (0)
+
+// GETEDO loads Ed from (address + O). It uses x1 for ed, passes x2 and S to geted and leaves address + O in S. wback is 0 if ed is xEAX..xEDI
+#define GETEDO(O, D, S)                                                                        \
+    if (MODREG) {                                                                              \
+        ed = xRAX + (nextop & 7) + (rex.b << 3);                                               \
+        wback = 0; /* tells WBACKO there is no memory operand to store */                      \
+    } else {                                                                                   \
+        SMREAD();                                                                              \
+        addr = geted(dyn, addr, ninst, nextop, &wback, x2, S, &fixedaddress, rex, NULL, 1, D); \
+        ADD(S, wback, O);                                                                      \
+        LDxw(x1, S, fixedaddress);                                                             \
+        ed = x1;                                                                               \
+    }
+#define WBACKO(O)                                                             \
+    if (wback) { /* wback == 0: ed is a register, nothing to store */         \
+        ADD(O, wback, O); /* clobbers O: O = wback + O */                     \
+        SDxw(ed, O, fixedaddress); /* same displacement as the GETEDO load */ \
+        SMWRITE2();                                                           \
+    }
 
 // FAKEED like GETED, but doesn't get anything
-#define FAKEED  if(!MODREG) {   \
-                    addr = fakeed(dyn, addr, ninst, nextop); \
-                }
+#define FAKEED                                   \
+    if (!MODREG) { /* memory form only */        \
+        addr = fakeed(dyn, addr, ninst, nextop); \
+    }
 
 // GETGW extract x64 register in gd, that is i, Signed extented
-#define GETSGW(i) gd = xRAX+((nextop&0x38)>>3)+(rex.r<<3); SLLIW(i, gd, 16); SRAIW(i, i, 16); gd = i;
+#define GETSGW(i)                                      \
+    gd = xRAX + ((nextop & 0x38) >> 3) + (rex.r << 3); \
+    SLLIW(i, gd, 16); /* with SRAIW: sign-extend */    \
+    SRAIW(i, i, 16);                                   \
+    gd = i; /* gd now refers to i, not the x64 register */
 
 // Write back ed in wback (if wback not 0)
-#define WBACK       if(wback) {SDxw(ed, wback, fixedaddress); SMWRITE();}
+#define WBACK                              \
+    if (wback) { /* memory operand only */ \
+        SDxw(ed, wback, fixedaddress);     \
+        SMWRITE();                         \
+    }
 
 // GETEB will use i for ed, and can use r3 for wback.
-#define GETEB(i, D) if(MODREG) {                \
-                    if(rex.rex) {               \
-                        wback = xRAX+(nextop&7)+(rex.b<<3);     \
-                        wb2 = 0;                \
-                    } else {                    \
-                        wback = (nextop&7);     \
-                        wb2 = (wback>>2)*8;     \
-                        wback = xRAX+(wback&3); \
-                    }                           \
-                    if (wb2) {                  \
-                        if (rv64_xtheadbb) {    \
-                            TH_EXTU(i, wback, 15, 8);   \
-                        } else {                \
-                            MV(i, wback);       \
-                            SRLI(i, i, wb2);    \
-                            ANDI(i, i, 0xff);   \
-                        }                       \
-                    } else ANDI(i, wback, 0xff);\
-                    wb1 = 0;                    \
-                    ed = i;                     \
-                } else {                        \
-                    SMREAD();                   \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \
-                    LBU(i, wback, fixedaddress);\
-                    wb1 = 1;                    \
-                    ed = i;                     \
-                }
-//GETEBO will use i for ed, i is also Offset, and can use r3 for wback.
-#define GETEBO(i, D) if(MODREG) {               \
-                    if(rex.rex) {               \
-                        wback = xRAX+(nextop&7)+(rex.b<<3);     \
-                        wb2 = 0;                \
-                    } else {                    \
-                        wback = (nextop&7);     \
-                        wb2 = (wback>>2)*8;     \
-                        wback = xRAX+(wback&3); \
-                    }                           \
-                    if (wb2) {                  \
-                        if (rv64_xtheadbb) {    \
-                            TH_EXTU(i, wback, 15, 8);   \
-                        } else {                \
-                            MV(i, wback);       \
-                            SRLI(i, i, wb2);    \
-                            ANDI(i, i, 0xff);   \
-                        }                       \
-                    } else ANDI(i, wback, 0xff);\
-                    wb1 = 0;                    \
-                    ed = i;                     \
-                } else {                        \
-                    SMREAD();                   \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \
-                    ADD(x3, wback, i);          \
-                    if(wback!=x3) wback = x3;   \
-                    LBU(i, wback, fixedaddress);\
-                    wb1 = 1;                    \
-                    ed = i;                     \
-                }
-//GETSEB sign extend EB, will use i for ed, and can use r3 for wback.
-#define GETSEB(i, D) if(MODREG) {                \
-                    if(rex.rex) {               \
-                        wback = xRAX+(nextop&7)+(rex.b<<3);     \
-                        wb2 = 0;                \
-                    } else {                    \
-                        wback = (nextop&7);     \
-                        wb2 = (wback>>2)*8;     \
-                        wback = xRAX+(wback&3); \
-                    }                           \
-                    MV(i, wback);               \
-                    SLLIW(i, i, 24-wb2);        \
-                    SRAIW(i, i, 24);            \
-                    wb1 = 0;                    \
-                    ed = i;                     \
-                } else {                        \
-                    SMREAD();                   \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, D); \
-                    LB(i, wback, fixedaddress); \
-                    wb1 = 1;                    \
-                    ed = i;                     \
-                }
+#define GETEB(i, D)                                                                             \
+    if (MODREG) { /* register operand: wback = host reg, wb2 = bit offset of the byte */        \
+        if (rex.rex) {                                                                          \
+            wback = xRAX + (nextop & 7) + (rex.b << 3);                                         \
+            wb2 = 0;                                                                            \
+        } else { /* no REX: encodings 4..7 select bits 15..8 of registers 0..3 */               \
+            wback = (nextop & 7);                                                               \
+            wb2 = (wback >> 2) * 8;                                                             \
+            wback = xRAX + (wback & 3);                                                         \
+        }                                                                                       \
+        if (wb2) {                                                                              \
+            if (rv64_xtheadbb) {                                                                \
+                TH_EXTU(i, wback, 15, 8);                                                       \
+            } else {                                                                            \
+                SRLI(i, wback, wb2); /* shift straight out of wback, no MV copy needed */       \
+                ANDI(i, i, 0xff);                                                               \
+            }                                                                                   \
+        } else {                                                                                \
+            ANDI(i, wback, 0xff);                                                               \
+        }                                                                                       \
+        wb1 = 0;                                                                                \
+        ed = i;                                                                                 \
+    } else { /* memory operand: wb1 = 1 tells EBBACK to store instead of merging */             \
+        SMREAD();                                                                               \
+        addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \
+        LBU(i, wback, fixedaddress);                                                            \
+        wb1 = 1;                                                                                \
+        ed = i;                                                                                 \
+    }
+// GETEBO will use i for ed; on entry i holds the Offset added to the address. x3 ends up holding wback for a memory operand.
+#define GETEBO(i, D)                                                                            \
+    if (MODREG) { /* register operand: wback = host reg, wb2 = bit offset of the byte */        \
+        if (rex.rex) {                                                                          \
+            wback = xRAX + (nextop & 7) + (rex.b << 3);                                         \
+            wb2 = 0;                                                                            \
+        } else { /* no REX: encodings 4..7 select bits 15..8 of registers 0..3 */               \
+            wback = (nextop & 7);                                                               \
+            wb2 = (wback >> 2) * 8;                                                             \
+            wback = xRAX + (wback & 3);                                                         \
+        }                                                                                       \
+        if (wb2) {                                                                              \
+            if (rv64_xtheadbb) {                                                                \
+                TH_EXTU(i, wback, 15, 8);                                                       \
+            } else {                                                                            \
+                SRLI(i, wback, wb2); /* shift straight out of wback, no MV copy needed */       \
+                ANDI(i, i, 0xff);                                                               \
+            }                                                                                   \
+        } else {                                                                                \
+            ANDI(i, wback, 0xff);                                                               \
+        }                                                                                       \
+        wb1 = 0;                                                                                \
+        ed = i;                                                                                 \
+    } else { /* memory operand: wb1 = 1 tells EBBACK to store instead of merging */             \
+        SMREAD();                                                                               \
+        addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \
+        ADD(x3, wback, i);                                                                      \
+        wback = x3; /* the address (base + offset) now always lives in x3 */                    \
+        LBU(i, wback, fixedaddress);                                                            \
+        wb1 = 1;                                                                                \
+        ed = i;                                                                                 \
+    }
+// GETSEB sign extend EB, will use i for ed. NOTE(review): geted gets x2, x3 here (GETEB passes x3, x2), so wback may not be r3 - confirm.
+#define GETSEB(i, D)                                                                            \
+    if (MODREG) {                                                                               \
+        if (rex.rex) {                                                                          \
+            wback = xRAX + (nextop & 7) + (rex.b << 3);                                         \
+            wb2 = 0;                                                                            \
+        } else {                                                                                \
+            wback = (nextop & 7);                                                               \
+            wb2 = (wback >> 2) * 8;                                                             \
+            wback = xRAX + (wback & 3);                                                         \
+        }                                                                                       \
+        /* sign-extend byte wb2/8 of wback: move it to bits 31..24, then shift back */          \
+        SLLIW(i, wback, 24 - wb2);                                                              \
+        SRAIW(i, i, 24);                                                                        \
+        wb1 = 0;                                                                                \
+        ed = i;                                                                                 \
+    } else {                                                                                    \
+        SMREAD();                                                                               \
+        addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, D); \
+        LB(i, wback, fixedaddress);                                                             \
+        wb1 = 1;                                                                                \
+        ed = i;                                                                                 \
+    }
 // GETEB32 will use i for ed, and can use r3 for wback.
-#define GETEB32(i, D) if(MODREG) {                \
-                    if(rex.rex) {               \
-                        wback = xRAX+(nextop&7)+(rex.b<<3);     \
-                        wb2 = 0;                \
-                    } else {                    \
-                        wback = (nextop&7);     \
-                        wb2 = (wback>>2)*8;     \
-                        wback = xRAX+(wback&3); \
-                    }                           \
-                    if (wb2) {                  \
-                        if (rv64_xtheadbb) {    \
-                            TH_EXTU(i, wback, 15, 8);   \
-                        } else {                \
-                            MV(i, wback);       \
-                            SRLI(i, i, wb2);    \
-                            ANDI(i, i, 0xff);   \
-                        }                       \
-                    } else ANDI(i, wback, 0xff);\
-                    wb1 = 0;                    \
-                    ed = i;                     \
-                } else {                        \
-                    SMREAD();                   \
-                    addr = geted32(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \
-                    LBU(i, wback, fixedaddress);\
-                    wb1 = 1;                    \
-                    ed = i;                     \
-                }
-
-//GETGB will use i for gd
-#define GETGB(i) if(rex.rex) {  \
-                    gb1 = xRAX+((nextop&0x38)>>3)+(rex.r<<3);   \
-                    gb2 = 0;                    \
-                } else {                        \
-                    gd = (nextop&0x38)>>3;      \
-                    gb2 = ((gd&4)>>2);          \
-                    gb1 = xRAX+(gd&3);          \
-                }                               \
-                gd = i;                         \
-                if (gb2) {                      \
-                    if (rv64_xtheadbb) {        \
-                        TH_EXTU(gd, gb1, 15, 8);\
-                    } else {                    \
-                        MV(gd, gb1);            \
-                        SRLI(gd, gd, 8);        \
-                        ANDI(gd, gd, 0xff);     \
-                    }                           \
-                } else ANDI(gd, gb1, 0xff);
+#define GETEB32(i, D)                                                                             \
+    if (MODREG) { /* register operand: wback = host reg, wb2 = bit offset of the byte */          \
+        if (rex.rex) {                                                                            \
+            wback = xRAX + (nextop & 7) + (rex.b << 3);                                           \
+            wb2 = 0;                                                                              \
+        } else { /* no REX: encodings 4..7 select bits 15..8 of registers 0..3 */                 \
+            wback = (nextop & 7);                                                                 \
+            wb2 = (wback >> 2) * 8;                                                               \
+            wback = xRAX + (wback & 3);                                                           \
+        }                                                                                         \
+        if (wb2) {                                                                                \
+            if (rv64_xtheadbb) {                                                                  \
+                TH_EXTU(i, wback, 15, 8);                                                         \
+            } else {                                                                              \
+                SRLI(i, wback, wb2); /* shift straight out of wback, no MV copy needed */         \
+                ANDI(i, i, 0xff);                                                                 \
+            }                                                                                     \
+        } else {                                                                                  \
+            ANDI(i, wback, 0xff);                                                                 \
+        }                                                                                         \
+        wb1 = 0;                                                                                  \
+        ed = i;                                                                                   \
+    } else { /* memory operand: wb1 = 1 tells EBBACK to store instead of merging */               \
+        SMREAD();                                                                                 \
+        addr = geted32(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \
+        LBU(i, wback, fixedaddress);                                                              \
+        wb1 = 1;                                                                                  \
+        ed = i;                                                                                   \
+    }
+
+// GETGB will use i for gd; gb1 is the host register and gb2 is set when the byte is bits 15..8 of it
+#define GETGB(i)                                            \
+    if (rex.rex) {                                          \
+        gb1 = xRAX + ((nextop & 0x38) >> 3) + (rex.r << 3); \
+        gb2 = 0;                                            \
+    } else {                                                \
+        gd = (nextop & 0x38) >> 3;                          \
+        gb2 = ((gd & 4) >> 2);                              \
+        gb1 = xRAX + (gd & 3);                              \
+    }                                                       \
+    gd = i;                                                 \
+    if (gb2) {                                              \
+        if (rv64_xtheadbb) {                                \
+            TH_EXTU(gd, gb1, 15, 8);                        \
+        } else {                                            \
+            SRLI(gd, gb1, 8); /* no MV copy needed */       \
+            ANDI(gd, gd, 0xff);                             \
+        }                                                   \
+    } else {                                                \
+        ANDI(gd, gb1, 0xff);                                \
+    }
 
 // Write gb (gd) back to original register / memory, using s1 as scratch
-#define GBBACK(s1) if(gb2) {                            \
-                    MOV64x(s1, 0xffffffffffff00ffLL);   \
-                    AND(gb1, gb1, s1);                  \
-                    SLLI(s1, gd, 8);                    \
-                    OR(gb1, gb1, s1);                   \
-                } else {                                \
-                    ANDI(gb1, gb1, ~0xff);              \
-                    OR(gb1, gb1, gd);                   \
-                }
+#define GBBACK(s1)                                     \
+    if (gb2) { /* gd goes to bits 15..8 of gb1 */      \
+        MOV64x(s1, 0xffffffffffff00ffLL);              \
+        AND(gb1, gb1, s1);                             \
+        SLLI(s1, gd, 8);                               \
+        OR(gb1, gb1, s1);                              \
+    } else { /* gd goes to bits 7..0 of gb1 */         \
+        ANDI(gb1, gb1, ~0xff);                         \
+        OR(gb1, gb1, gd); /* gd is not masked here */  \
+    }
 
 // Write eb (ed) back to original register / memory, using s1 as scratch
-#define EBBACK(s1, c) if(wb1) {                         \
-                    SB(ed, wback, fixedaddress);        \
-                    SMWRITE();                          \
-                } else if(wb2) {                        \
-                    MOV64x(s1, 0xffffffffffff00ffLL);   \
-                    AND(wback, wback, s1);              \
-                    if (c) {ANDI(ed, ed, 0xff);}        \
-                    SLLI(s1, ed, 8);                    \
-                    OR(wback, wback, s1);               \
-                } else {                                \
-                    ANDI(wback, wback, ~0xff);          \
-                    if (c) {ANDI(ed, ed, 0xff);}        \
-                    OR(wback, wback, ed);               \
-                }
+#define EBBACK(s1, c) /* c != 0: mask ed to 8 bits before merging */ \
+    if (wb1) { /* memory operand */                                  \
+        SB(ed, wback, fixedaddress);                                 \
+        SMWRITE();                                                   \
+    } else if (wb2) { /* bits 15..8 of wback */                      \
+        MOV64x(s1, 0xffffffffffff00ffLL);                            \
+        AND(wback, wback, s1);                                       \
+        if (c) { ANDI(ed, ed, 0xff); }                               \
+        SLLI(s1, ed, 8);                                             \
+        OR(wback, wback, s1);                                        \
+    } else { /* bits 7..0 of wback */                                \
+        ANDI(wback, wback, ~0xff);                                   \
+        if (c) { ANDI(ed, ed, 0xff); }                               \
+        OR(wback, wback, ed);                                        \
+    }
 
 // Get direction with size Z and based of F_DF flag, on register r ready for load/store fetching
 // using s as scratch.
-#define GETDIR(r, s, Z)             \
-    MOV32w(r, Z); /* mask=1<<10 */  \
-    ANDI(s, xFlags, 1<<F_DF);       \
-    BEQZ(s, 8);                     \
-    SUB(r, xZR, r);                 \
+#define GETDIR(r, s, Z)                                                    \
+    MOV32w(r, Z);               /* r = +Z by default */                    \
+    ANDI(s, xFlags, 1 << F_DF); /* mask=1<<10 */                           \
+    BEQZ(s, 8);                 /* F_DF clear: presumably skips the SUB */ \
+    SUB(r, xZR, r);             /* otherwise r = xZR - r, i.e. -Z */
 
 // Generic get GD, but reg value in gd (R_RAX is not added)
-#define GETG        gd = ((nextop&0x38)>>3)+(rex.r<<3)
+#define GETG gd = ((nextop & 0x38) >> 3) + (rex.r << 3) /* bits 5..3 of nextop, plus rex.r as bit 3 */
 
 // Get GX as a Single (might use x2)
-#define GETGXSS(a)                      \
-    gd = ((nextop&0x38)>>3)+(rex.r<<3); \
+#define GETGXSS(a) /* last sse_get_reg arg: 1 = single */ \
+    gd = ((nextop & 0x38) >> 3) + (rex.r << 3);           \
     a = sse_get_reg(dyn, ninst, x2, gd, 1)
 
 // Get GX as a Single (might use x2), no fetching old value
-#define GETGXSS_empty(a)                \
-    gd = ((nextop&0x38)>>3)+(rex.r<<3); \
+#define GETGXSS_empty(a) /* last sse_get_reg_empty arg: 1 = single */ \
+    gd = ((nextop & 0x38) >> 3) + (rex.r << 3);                       \
     a = sse_get_reg_empty(dyn, ninst, x2, gd, 1)
 
 // Get GX as a Double (might use x2)
-#define GETGXSD(a)                      \
-    gd = ((nextop&0x38)>>3)+(rex.r<<3); \
+#define GETGXSD(a) /* last sse_get_reg arg: 0 = double */ \
+    gd = ((nextop & 0x38) >> 3) + (rex.r << 3);           \
     a = sse_get_reg(dyn, ninst, x2, gd, 0)
 
 // Get GX as a Double (might use x2), no fetching old value
-#define GETGXSD_empty(a)                \
-    gd = ((nextop&0x38)>>3)+(rex.r<<3); \
+#define GETGXSD_empty(a) /* last sse_get_reg_empty arg: 0 = double */ \
+    gd = ((nextop & 0x38) >> 3) + (rex.r << 3);                       \
     a = sse_get_reg_empty(dyn, ninst, x2, gd, 0)
 
 // Get Ex as a single, not a quad (warning, x1 get used, x2 might too)
-#define GETEXSS(a, D)                                                                                   \
-    if(MODREG) {                                                                                        \
-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1);                                      \
-    } else {                                                                                            \
-        SMREAD();                                                                                       \
-        a = fpu_get_scratch(dyn);                                                                       \
-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, D);            \
-        FLW(a, ed, fixedaddress);                                                                       \
+#define GETEXSS(a, D)                                                                        \
+    if (MODREG) {                                                                            \
+        a = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 1);                     \
+    } else { /* memory form: ed holds the address, a is an FPU scratch */                    \
+        SMREAD();                                                                            \
+        a = fpu_get_scratch(dyn);                                                            \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, D); \
+        FLW(a, ed, fixedaddress); /* 32-bit float load */                                    \
     }
 
 // Get Ex as a double, not a quad (warning, x1 get used, x2 might too)
-#define GETEXSD(a, D)                                                                                   \
-    if(MODREG) {                                                                                        \
-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);                                      \
-    } else {                                                                                            \
-        SMREAD();                                                                                       \
-        a = fpu_get_scratch(dyn);                                                                       \
-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, D);            \
-        FLD(a, ed, fixedaddress);                                                                       \
+#define GETEXSD(a, D)                                                                        \
+    if (MODREG) {                                                                            \
+        a = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0);                     \
+    } else { /* memory form: ed holds the address, a is an FPU scratch */                    \
+        SMREAD();                                                                            \
+        a = fpu_get_scratch(dyn);                                                            \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, D); \
+        FLD(a, ed, fixedaddress); /* 64-bit float load */                                    \
     }
 
 // Will get pointer to GX in general register a, will purge SS or SD if loaded. can use gback as load address
-#define GETGX()                             \
-    gd = ((nextop&0x38)>>3)+(rex.r<<3);     \
-    sse_forget_reg(dyn, ninst, gd);         \
-    gback = xEmu;                           \
+#define GETGX()                                            \
+    gd = ((nextop & 0x38) >> 3) + (rex.r << 3);            \
+    sse_forget_reg(dyn, ninst, gd);                        \
+    gback = xEmu; /* GX is addressed as xEmu + gdoffset */ \
     gdoffset = offsetof(x64emu_t, xmm[gd])
 
 // Get Ex address in general register a, will purge SS or SD if it's reg and is loaded. May use x3. Use wback as load address!
-#define GETEX(a, D)                                                                                     \
-    if(MODREG) {                                                                                        \
-        ed = (nextop&7)+(rex.b<<3);                                                                     \
-        sse_forget_reg(dyn, ninst, ed);                                                                 \
-        fixedaddress = offsetof(x64emu_t, xmm[ed]);                                                     \
-        wback = xEmu;                                                                                   \
-    } else {                                                                                            \
-        SMREAD();                                                                                       \
-        ed=16;                                                                                          \
-        addr = geted(dyn, addr, ninst, nextop, &wback, a, x3, &fixedaddress, rex, NULL, 1, D);          \
+#define GETEX(a, D)                                                                            \
+    if (MODREG) { /* register form: addressed as xEmu + offsetof(xmm[ed]) */                   \
+        ed = (nextop & 7) + (rex.b << 3);                                                      \
+        sse_forget_reg(dyn, ninst, ed);                                                        \
+        fixedaddress = offsetof(x64emu_t, xmm[ed]);                                            \
+        wback = xEmu;                                                                          \
+    } else {                                                                                   \
+        SMREAD();                                                                              \
+        ed = 16; /* outside 0..15; presumably flags a memory operand */                        \
+        addr = geted(dyn, addr, ninst, nextop, &wback, a, x3, &fixedaddress, rex, NULL, 1, D); \
     }
 
-#define GETGM()                             \
-    gd = ((nextop&0x38)>>3);                \
-    mmx_forget_reg(dyn, ninst, gd);         \
-    gback = xEmu;                           \
+#define GETGM()                                              \
+    gd = ((nextop & 0x38) >> 3); /* no rex.r: 3-bit index */ \
+    mmx_forget_reg(dyn, ninst, gd);                          \
+    gback = xEmu;                                            \
     gdoffset = offsetof(x64emu_t, mmx[gd])
 
 // Get EM, might use x3
-#define GETEM(a, D)                                                                             \
-    if(MODREG) {                                                                                \
-        ed = (nextop&7);                                                                        \
-        mmx_forget_reg(dyn, ninst, ed);                                                         \
-        fixedaddress = offsetof(x64emu_t, mmx[ed]);                                             \
-        wback = xEmu;                                                                           \
-    } else {                                                                                    \
-        SMREAD();                                                                               \
-        ed=8;                                                                                   \
-        addr = geted(dyn, addr, ninst, nextop, &wback, a, x3, &fixedaddress, rex, NULL, 1, D);  \
+#define GETEM(a, D)                                                                            \
+    if (MODREG) { /* register form: addressed as xEmu + offsetof(mmx[ed]) */                   \
+        ed = (nextop & 7);                                                                     \
+        mmx_forget_reg(dyn, ninst, ed);                                                        \
+        fixedaddress = offsetof(x64emu_t, mmx[ed]);                                            \
+        wback = xEmu;                                                                          \
+    } else {                                                                                   \
+        SMREAD();                                                                              \
+        ed = 8; /* outside 0..7; presumably flags a memory operand */                          \
+        addr = geted(dyn, addr, ninst, nextop, &wback, a, x3, &fixedaddress, rex, NULL, 1, D); \
     }
 
-#define SSE_LOOP_D_ITEM(GX1, EX1, F, i) \
-    LWU(GX1, gback, gdoffset+i*4);      \
-    LWU(EX1, wback, fixedaddress+i*4);  \
-    F;                                  \
-    SW(GX1, gback, gdoffset+i*4);
+#define SSE_LOOP_D_ITEM(GX1, EX1, F, i) /* i is parenthesised: may be an expression */ \
+    LWU(GX1, gback, gdoffset + (i) * 4); /* lane i of GX, zero-extended */             \
+    LWU(EX1, wback, fixedaddress + (i) * 4); /* lane i of EX, zero-extended */         \
+    F; /* caller-supplied operation */                                                 \
+    SW(GX1, gback, gdoffset + (i) * 4); /* store what F left in GX1 */
 
 // Loop for SSE opcode that use 32bits value and write to GX.
 #define SSE_LOOP_D(GX1, EX1, F)     \
@@ -461,11 +533,11 @@
     SSE_LOOP_D_ITEM(GX1, EX1, F, 2) \
     SSE_LOOP_D_ITEM(GX1, EX1, F, 3)
 
-#define SSE_LOOP_DS_ITEM(GX1, EX1, F, i) \
-    LW(GX1, gback, gdoffset+i*4);        \
-    LW(EX1, wback, fixedaddress+i*4);    \
-    F;                                   \
-    SW(GX1, gback, gdoffset+i*4);
+#define SSE_LOOP_DS_ITEM(GX1, EX1, F, i)  \
+    LW(GX1, gback, gdoffset + i * 4);     \
+    LW(EX1, wback, fixedaddress + i * 4); \
+    F;                                    \
+    SW(GX1, gback, gdoffset + i * 4);
 
 // Loop for SSE opcode that use 32bits value and write to GX.
 #define SSE_LOOP_DS(GX1, EX1, F)     \
@@ -474,34 +546,34 @@
     SSE_LOOP_DS_ITEM(GX1, EX1, F, 2) \
     SSE_LOOP_DS_ITEM(GX1, EX1, F, 3)
 
-#define MMX_LOOP_W(GX1, EX1, F)            \
-    for (int i=0; i<4; ++i) {              \
-        LHU(GX1, gback, gdoffset+i*2);     \
-        LHU(EX1, wback, fixedaddress+i*2); \
-        F;                                 \
-        SH(GX1, gback, gdoffset+i*2);      \
+#define MMX_LOOP_W(GX1, EX1, F)                \
+    for (int i = 0; i < 4; ++i) {              \
+        LHU(GX1, gback, gdoffset + i * 2);     \
+        LHU(EX1, wback, fixedaddress + i * 2); \
+        F;                                     \
+        SH(GX1, gback, gdoffset + i * 2);      \
     }
 
-#define SSE_LOOP_W(GX1, EX1, F)            \
-    for (int i=0; i<8; ++i) {              \
-        LHU(GX1, gback, gdoffset+i*2);     \
-        LHU(EX1, wback, fixedaddress+i*2); \
-        F;                                 \
-        SH(GX1, gback, gdoffset+i*2);      \
+#define SSE_LOOP_W(GX1, EX1, F)                \
+    for (int i = 0; i < 8; ++i) {              \
+        LHU(GX1, gback, gdoffset + i * 2);     \
+        LHU(EX1, wback, fixedaddress + i * 2); \
+        F;                                     \
+        SH(GX1, gback, gdoffset + i * 2);      \
     }
 
-#define SSE_LOOP_WS(GX1, EX1, F)          \
-    for (int i=0; i<8; ++i) {             \
-        LH(GX1, gback, gdoffset+i*2);     \
-        LH(EX1, wback, fixedaddress+i*2); \
-        F;                                \
-        SH(GX1, gback, gdoffset+i*2);     \
+#define SSE_LOOP_WS(GX1, EX1, F)              \
+    for (int i = 0; i < 8; ++i) {             \
+        LH(GX1, gback, gdoffset + i * 2);     \
+        LH(EX1, wback, fixedaddress + i * 2); \
+        F;                                    \
+        SH(GX1, gback, gdoffset + i * 2);     \
     }
 
-#define SSE_LOOP_D_S_ITEM(EX1, F, i)    \
-    LWU(EX1, wback, fixedaddress+i*4);  \
-    F;                                  \
-    SW(EX1, wback, fixedaddress+i*4);
+#define SSE_LOOP_D_S_ITEM(EX1, F, i)       \
+    LWU(EX1, wback, fixedaddress + i * 4); \
+    F;                                     \
+    SW(EX1, wback, fixedaddress + i * 4);
 
 // Loop for SSE opcode that use 32bits value and write to EX.
 #define SSE_LOOP_D_S(EX1, F)     \
@@ -510,11 +582,11 @@
     SSE_LOOP_D_S_ITEM(EX1, F, 2) \
     SSE_LOOP_D_S_ITEM(EX1, F, 3)
 
-#define SSE_LOOP_Q_ITEM(GX1, EX1, F, i) \
-    LD(GX1, gback, gdoffset+i*8);       \
-    LD(EX1, wback, fixedaddress+i*8);   \
-    F;                                  \
-    SD(GX1, gback, gdoffset+i*8);
+#define SSE_LOOP_Q_ITEM(GX1, EX1, F, i)   \
+    LD(GX1, gback, gdoffset + i * 8);     \
+    LD(EX1, wback, fixedaddress + i * 8); \
+    F;                                    \
+    SD(GX1, gback, gdoffset + i * 8);
 
 // Loop for SSE opcode that use 64bits value and write to GX.
 #define SSE_LOOP_Q(GX1, EX1, F)     \
@@ -522,11 +594,11 @@
     SSE_LOOP_Q_ITEM(GX1, EX1, F, 1)
 
 
-#define SSE_LOOP_FQ_ITEM(GX1, EX1, F, i)            \
-    FLD(v0, gback, gdoffset+i*8);                   \
-    FLD(v1, wback, fixedaddress+i*8);               \
-    F;                                              \
-    FSD(v0, gback, gdoffset+i*8);
+#define SSE_LOOP_FQ_ITEM(GX1, EX1, F, i)  \
+    FLD(v0, gback, gdoffset + i * 8);     \
+    FLD(v1, wback, fixedaddress + i * 8); \
+    F;                                    \
+    FSD(v0, gback, gdoffset + i * 8);
 
 #define SSE_LOOP_FQ(GX1, EX1, F)     \
     v0 = fpu_get_scratch(dyn);       \
@@ -535,18 +607,18 @@
     SSE_LOOP_FQ_ITEM(GX1, EX1, F, 1)
 
 
-#define SSE_LOOP_MV_Q_ITEM(s, i)      \
-    LD(s, wback, fixedaddress+i*8);   \
-    SD(s, gback, gdoffset+i*8);
+#define SSE_LOOP_MV_Q_ITEM(s, i)        \
+    LD(s, wback, fixedaddress + i * 8); \
+    SD(s, gback, gdoffset + i * 8);
 
 // Loop for SSE opcode that moves 64bits value from wback to gback, use s as scratch.
 #define SSE_LOOP_MV_Q(s)     \
     SSE_LOOP_MV_Q_ITEM(s, 0) \
     SSE_LOOP_MV_Q_ITEM(s, 1)
 
-#define SSE_LOOP_MV_Q_ITEM2(s, i)     \
-    LD(s, gback, gdoffset+i*8);       \
-    SD(s, wback, fixedaddress+i*8);
+#define SSE_LOOP_MV_Q_ITEM2(s, i)   \
+    LD(s, gback, gdoffset + i * 8); \
+    SD(s, wback, fixedaddress + i * 8);
 
 // Loop for SSE opcode that moves 64bits value from gback to wback, use s as scratch.
 #define SSE_LOOP_MV_Q2(s)     \
@@ -563,7 +635,7 @@
 // R0 will not be pushed/popd if ret is -2. Flags are not save/restored
 #define CALL_S(F, ret) call_c(dyn, ninst, F, x6, ret, 0, 0)
 
-#define MARKi(i)   dyn->insts[ninst].mark[i] = dyn->native_size
+#define MARKi(i)    dyn->insts[ninst].mark[i] = dyn->native_size
 #define GETMARKi(i) dyn->insts[ninst].mark[i]
 #define MARK        MARKi(0)
 #define GETMARK     GETMARKi(0)
@@ -572,57 +644,57 @@
 #define MARK3       MARKi(2)
 #define GETMARK3    GETMARKi(2)
 
-#define MARKFi(i)   dyn->insts[ninst].markf[i] = dyn->native_size
+#define MARKFi(i)    dyn->insts[ninst].markf[i] = dyn->native_size
 #define GETMARKFi(i) dyn->insts[ninst].markf[i]
-#define MARKF       MARKFi(0)
-#define GETMARKF    GETMARKFi(0)
-#define MARKF2      MARKFi(1)
-#define GETMARKF2   GETMARKFi(1)
+#define MARKF        MARKFi(0)
+#define GETMARKF     GETMARKFi(0)
+#define MARKF2       MARKFi(1)
+#define GETMARKF2    GETMARKFi(1)
 
 #define MARKSEG     dyn->insts[ninst].markseg = dyn->native_size
 #define GETMARKSEG  dyn->insts[ninst].markseg
 #define MARKLOCK    dyn->insts[ninst].marklock = dyn->native_size
 #define GETMARKLOCK dyn->insts[ninst].marklock
 
-#define Bxx_gen(OP, M, reg1, reg2)      \
-    j64 = GET##M - dyn->native_size;    \
-    B##OP (reg1, reg2, j64)
+#define Bxx_gen(OP, M, reg1, reg2)   \
+    j64 = GET##M - dyn->native_size; \
+    B##OP(reg1, reg2, j64)
 
-#define Bxx_geni(OP, M, reg1, reg2, i)      \
-    j64 = GET##M##i(i) - dyn->native_size;    \
-    B##OP (reg1, reg2, j64)
+#define Bxx_geni(OP, M, reg1, reg2, i)     \
+    j64 = GET##M##i(i) - dyn->native_size; \
+    B##OP(reg1, reg2, j64)
 
 // Branch to MARK if reg1==reg2 (use j64)
-#define BEQ_MARK(reg1, reg2) Bxx_gen(EQ, MARK, reg1, reg2)
+#define BEQ_MARK(reg1, reg2)     Bxx_gen(EQ, MARK, reg1, reg2)
 #define BEQ_MARKi(reg1, reg2, i) Bxx_geni(EQ, MARK, reg1, reg2, i)
 // Branch to MARK if reg1!=reg2 (use j64)
-#define BNE_MARK(reg1, reg2) Bxx_gen(NE, MARK, reg1, reg2)
+#define BNE_MARK(reg1, reg2)     Bxx_gen(NE, MARK, reg1, reg2)
 #define BNE_MARKi(reg1, reg2, i) Bxx_geni(NE, MARK, reg1, reg2, i)
 // Branch to MARK if reg1!=0 (use j64)
-#define BNEZ_MARK(reg) BNE_MARK(reg, xZR)
+#define BNEZ_MARK(reg)     BNE_MARK(reg, xZR)
 #define BNEZ_MARKi(reg, i) BNE_MARKi(reg, xZR, i)
 // Branch to MARK instruction unconditionnal (use j64)
-#define B_MARK_nocond   Bxx_gen(__, MARK, 0, 0)
-#define B_MARKi_nocond   Bxx_geni(__, MARK, 0, 0, i)
+#define B_MARK_nocond  Bxx_gen(__, MARK, 0, 0)
+#define B_MARKi_nocond Bxx_geni(__, MARK, 0, 0, i)
 // Branch to MARK if reg1<reg2 (use j64)
-#define BLT_MARK(reg1, reg2) Bxx_gen(LT, MARK, reg1, reg2)
+#define BLT_MARK(reg1, reg2)  Bxx_gen(LT, MARK, reg1, reg2)
 #define BLT_MARKi(reg1, reg2) Bxx_geni(LT, MARK, reg1, reg2, i)
 // Branch to MARK if reg1<reg2 (use j64)
-#define BLTU_MARK(reg1, reg2) Bxx_gen(LTU, MARK, reg1, reg2)
+#define BLTU_MARK(reg1, reg2)  Bxx_gen(LTU, MARK, reg1, reg2)
 #define BLTU_MARKi(reg1, reg2) Bxx_geni(LTU, MARK, reg1, reg2, i)
 // Branch to MARK if reg1>=reg2 (use j64)
-#define BGE_MARK(reg1, reg2) Bxx_gen(GE, MARK, reg1, reg2)
+#define BGE_MARK(reg1, reg2)  Bxx_gen(GE, MARK, reg1, reg2)
 #define BGE_MARKi(reg1, reg2) Bxx_geni(GE, MARK, reg1, reg2, i)
 // Branch to MARK2 if reg1==reg2 (use j64)
-#define BEQ_MARK2(reg1, reg2) Bxx_gen(EQ, MARK2, reg1,reg2)
+#define BEQ_MARK2(reg1, reg2) Bxx_gen(EQ, MARK2, reg1, reg2)
 // Branch to MARK2 if reg1!=reg2 (use j64)
-#define BNE_MARK2(reg1, reg2) Bxx_gen(NE, MARK2, reg1,reg2)
+#define BNE_MARK2(reg1, reg2) Bxx_gen(NE, MARK2, reg1, reg2)
 // Branch to MARK2 if reg1!=0 (use j64)
 #define BNEZ_MARK2(reg) BNE_MARK2(reg, xZR)
 // Branch to MARK2 if reg1<reg2 (use j64)
-#define BLT_MARK2(reg1, reg2) Bxx_gen(LT, MARK2, reg1,reg2)
+#define BLT_MARK2(reg1, reg2) Bxx_gen(LT, MARK2, reg1, reg2)
 // Branch to MARK2 instruction unconditionnal (use j64)
-#define B_MARK2_nocond  Bxx_gen(__, MARK2, 0, 0)
+#define B_MARK2_nocond Bxx_gen(__, MARK2, 0, 0)
 // Branch to MARK3 if reg1==reg2 (use j64)
 #define BEQ_MARK3(reg1, reg2) Bxx_gen(EQ, MARK3, reg1, reg2)
 // Branch to MARK3 if reg1!=reg2 (use j64)
@@ -630,124 +702,157 @@
 // Branch to MARK3 if reg1!=0 (use j64)
 #define BNEZ_MARK3(reg) BNE_MARK3(reg, xZR)
 // Branch to MARK3 instruction unconditionnal (use j64)
-#define B_MARK3_nocond  Bxx_gen(__, MARK3, 0, 0)
+#define B_MARK3_nocond Bxx_gen(__, MARK3, 0, 0)
 // Branch to MARKLOCK if reg1!=reg2 (use j64)
 #define BNE_MARKLOCK(reg1, reg2) Bxx_gen(NE, MARKLOCK, reg1, reg2)
 // Branch to MARKLOCK if reg1!=0 (use j64)
 #define BNEZ_MARKLOCK(reg) BNE_MARKLOCK(reg, xZR)
 
 // Branch to NEXT if reg1==reg2 (use j64)
-#define BEQ_NEXT(reg1, reg2)           \
-    j64 = (dyn->insts)?(dyn->insts[ninst].epilog-(dyn->native_size)):0; \
+#define BEQ_NEXT(reg1, reg2)                                                  \
+    j64 = (dyn->insts) ? (dyn->insts[ninst].epilog - (dyn->native_size)) : 0; \
     BEQ(reg1, reg2, j64)
 
 // Branch to NEXT if reg1==0 (use j64)
-#define CBZ_NEXT(reg1)                 \
-    j64 = (dyn->insts)?(dyn->insts[ninst].epilog-(dyn->native_size)):0; \
+#define CBZ_NEXT(reg1)                                                        \
+    j64 = (dyn->insts) ? (dyn->insts[ninst].epilog - (dyn->native_size)) : 0; \
     BEQ(reg1, xZR, j64)
 // Branch to NEXT if reg1!=0 (use j64)
-#define CBNZ_NEXT(reg1)                \
-    j64 = (dyn->insts)?(dyn->insts[ninst].epilog-(dyn->native_size)):0; \
+#define CBNZ_NEXT(reg1)                                                       \
+    j64 = (dyn->insts) ? (dyn->insts[ninst].epilog - (dyn->native_size)) : 0; \
     BNE(reg1, xZR, j64)
 // Branch to next instruction unconditionnal (use j64)
-#define B_NEXT_nocond                                               \
-    j64 = (dyn->insts)?(dyn->insts[ninst].epilog-(dyn->native_size)):0;\
+#define B_NEXT_nocond                                                         \
+    j64 = (dyn->insts) ? (dyn->insts[ninst].epilog - (dyn->native_size)) : 0; \
     B(j64)
 
 // Branch to MARKSEG if reg is 0 (use j64)
-#define CBZ_MARKSEG(reg)    \
-    j64 = GETMARKSEG-(dyn->native_size);   \
+#define CBZ_MARKSEG(reg)                   \
+    j64 = GETMARKSEG - (dyn->native_size); \
     BEQZ(reg, j64);
 // Branch to MARKSEG if reg is not 0 (use j64)
-#define CBNZ_MARKSEG(reg)              \
-    j64 = GETMARKSEG-(dyn->native_size);   \
+#define CBNZ_MARKSEG(reg)                  \
+    j64 = GETMARKSEG - (dyn->native_size); \
     BNEZ(reg, j64);
 
-#define IFX(A)  if((dyn->insts[ninst].x64.gen_flags&(A)))
-#define IFX_PENDOR0  if((dyn->insts[ninst].x64.gen_flags&(X_PEND) || !dyn->insts[ninst].x64.gen_flags))
-#define IFXX(A) if((dyn->insts[ninst].x64.gen_flags==(A)))
-#define IFX2X(A, B) if((dyn->insts[ninst].x64.gen_flags==(A) || dyn->insts[ninst].x64.gen_flags==(B) || dyn->insts[ninst].x64.gen_flags==((A)|(B))))
-#define IFXN(A, B)  if((dyn->insts[ninst].x64.gen_flags&(A) && !(dyn->insts[ninst].x64.gen_flags&(B))))
+#define IFX(A)      if ((dyn->insts[ninst].x64.gen_flags & (A)))
+#define IFX_PENDOR0 if ((dyn->insts[ninst].x64.gen_flags & (X_PEND) || !dyn->insts[ninst].x64.gen_flags))
+#define IFXX(A)     if ((dyn->insts[ninst].x64.gen_flags == (A)))
+#define IFX2X(A, B) if ((dyn->insts[ninst].x64.gen_flags == (A) || dyn->insts[ninst].x64.gen_flags == (B) || dyn->insts[ninst].x64.gen_flags == ((A) | (B))))
+#define IFXN(A, B)  if ((dyn->insts[ninst].x64.gen_flags & (A) && !(dyn->insts[ninst].x64.gen_flags & (B))))
 
-#define STORE_REG(A)    SD(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
-#define LOAD_REG(A)     LD(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
+#define STORE_REG(A) SD(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
+#define LOAD_REG(A)  LD(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
 
 // Need to also store current value of some register, as they may be used by functions like setjmp
-#define STORE_XEMU_CALL()   \
-    STORE_REG(RBX);         \
-    STORE_REG(RDX);         \
-    STORE_REG(RSP);         \
-    STORE_REG(RBP);         \
-    STORE_REG(RDI);         \
-    STORE_REG(RSI);         \
-    STORE_REG(R8);          \
-    STORE_REG(R9);          \
-    STORE_REG(R10);         \
-    STORE_REG(R11);         \
-
-#define LOAD_XEMU_CALL()    \
-
-#define LOAD_XEMU_REM()     \
-    LOAD_REG(RBX);          \
-    LOAD_REG(RDX);          \
-    LOAD_REG(RSP);          \
-    LOAD_REG(RBP);          \
-    LOAD_REG(RDI);          \
-    LOAD_REG(RSI);          \
-    LOAD_REG(R8);           \
-    LOAD_REG(R9);           \
-    LOAD_REG(R10);          \
-    LOAD_REG(R11);          \
-
-
-#define SET_DFNONE()    if(!dyn->f.dfnone) {SW(xZR, xEmu, offsetof(x64emu_t, df)); dyn->f.dfnone=1;}
-#define SET_DF(S, N)     if((N)!=d_none) {MOV_U12(S, (N)); SW(S, xEmu, offsetof(x64emu_t, df)); dyn->f.dfnone=0;} else SET_DFNONE()
-#define SET_NODF()          dyn->f.dfnone = 0
-#define SET_DFOK()          dyn->f.dfnone = 1
-
-#define CLEAR_FLAGS() IFX(X_ALL) {ANDI(xFlags, xFlags, ~((1UL<<F_AF) | (1UL<<F_CF) | (1UL<<F_OF2) | (1UL<<F_ZF) | (1UL<<F_SF) | (1UL<<F_PF)));}
-
-#define CALC_SUB_FLAGS(op1_, op2, res, scratch1, scratch2, width)         \
-    IFX(X_AF | X_CF | X_OF) {                                             \
-        /* calc borrow chain */                                           \
-        /* bc = (res & (~op1 | op2)) | (~op1 & op2) */                    \
-        OR(scratch1, op1_, op2);                                          \
-        AND(scratch2, res, scratch1);                                     \
-        AND(op1_, op1_, op2);                                             \
-        OR(scratch2, scratch2, op1_);                                     \
-        IFX(X_AF) {                                                       \
-            /* af = bc & 0x8 */                                           \
-            ANDI(scratch1, scratch2, 8);                                  \
-            BEQZ(scratch1, 8);                                            \
-            ORI(xFlags, xFlags, 1 << F_AF);                               \
-        }                                                                 \
-        IFX(X_CF) {                                                       \
-            /* cf = bc & (1<<(width-1)) */                                \
-            if ((width) == 8) {                                           \
-                ANDI(scratch1, scratch2, 0x80);                           \
-            } else {                                                      \
-                SRLI(scratch1, scratch2, (width)-1);                      \
-                if(width!=64) ANDI(scratch1, scratch1, 1);                \
-            }                                                             \
-            BEQZ(scratch1, 8);                                            \
-            ORI(xFlags, xFlags, 1 << F_CF);                               \
-        }                                                                 \
-        IFX(X_OF) {                                                       \
-            /* of = ((bc >> (width-2)) ^ (bc >> (width-1))) & 0x1; */     \
-            SRLI(scratch1, scratch2, (width)-2);                          \
-            SRLI(scratch2, scratch1, 1);                                  \
-            XOR(scratch1, scratch1, scratch2);                            \
-            ANDI(scratch1, scratch1, 1);                                  \
-            BEQZ(scratch1, 8);                                            \
-            ORI(xFlags, xFlags, 1 << F_OF2);                              \
-        }                                                                 \
+#define STORE_XEMU_CALL(s0)                             \
+    if (rv64_xtheadmempair) {                           \
+        ADDI(s0, xEmu, offsetof(x64emu_t, regs[_RSP])); \
+        TH_SDD(xRDX, xRBX, xEmu, 1);                    \
+        TH_SDD(xRSP, xRBP, s0, 0);                      \
+        TH_SDD(xRSI, xRDI, s0, 1);                      \
+        TH_SDD(xR8, xR9, s0, 2);                        \
+        TH_SDD(xR10, xR11, s0, 3);                      \
+    } else {                                            \
+        STORE_REG(RBX);                                 \
+        STORE_REG(RDX);                                 \
+        STORE_REG(RSP);                                 \
+        STORE_REG(RBP);                                 \
+        STORE_REG(RDI);                                 \
+        STORE_REG(RSI);                                 \
+        STORE_REG(R8);                                  \
+        STORE_REG(R9);                                  \
+        STORE_REG(R10);                                 \
+        STORE_REG(R11);                                 \
+    }
+
+#define LOAD_XEMU_CALL()
+
+#define LOAD_XEMU_REM(s0)                               \
+    if (rv64_xtheadmempair) {                           \
+        ADDI(s0, xEmu, offsetof(x64emu_t, regs[_RSP])); \
+        TH_LDD(xRDX, xRBX, xEmu, 1);                    \
+        TH_LDD(xRSP, xRBP, s0, 0);                      \
+        TH_LDD(xRSI, xRDI, s0, 1);                      \
+        TH_LDD(xR8, xR9, s0, 2);                        \
+        TH_LDD(xR10, xR11, s0, 3);                      \
+    } else {                                            \
+        LOAD_REG(RBX);                                  \
+        LOAD_REG(RDX);                                  \
+        LOAD_REG(RSP);                                  \
+        LOAD_REG(RBP);                                  \
+        LOAD_REG(RDI);                                  \
+        LOAD_REG(RSI);                                  \
+        LOAD_REG(R8);                                   \
+        LOAD_REG(R9);                                   \
+        LOAD_REG(R10);                                  \
+        LOAD_REG(R11);                                  \
+    }
+
+
+#define SET_DFNONE()                           \
+    if (!dyn->f.dfnone) {                      \
+        SW(xZR, xEmu, offsetof(x64emu_t, df)); \
+        dyn->f.dfnone = 1;                     \
+    }
+#define SET_DF(S, N)                         \
+    if ((N) != d_none) {                     \
+        MOV_U12(S, (N));                     \
+        SW(S, xEmu, offsetof(x64emu_t, df)); \
+        dyn->f.dfnone = 0;                   \
+    } else                                   \
+        SET_DFNONE()
+#define SET_NODF() dyn->f.dfnone = 0
+#define SET_DFOK() dyn->f.dfnone = 1
+
+#define CLEAR_FLAGS() \
+    IFX(X_ALL) { ANDI(xFlags, xFlags, ~((1UL << F_AF) | (1UL << F_CF) | (1UL << F_OF2) | (1UL << F_ZF) | (1UL << F_SF) | (1UL << F_PF))); }
+
+#define CALC_SUB_FLAGS(op1_, op2, res, scratch1, scratch2, width)     \
+    IFX(X_AF | X_CF | X_OF)                                           \
+    {                                                                 \
+        /* calc borrow chain */                                       \
+        /* bc = (res & (~op1 | op2)) | (~op1 & op2) */                \
+        OR(scratch1, op1_, op2); /* presumably op1_ == ~op1 here */   \
+        AND(scratch2, res, scratch1);                                 \
+        AND(op1_, op1_, op2); /* overwrites op1_ */                   \
+        OR(scratch2, scratch2, op1_);                                 \
+        IFX(X_AF)                                                     \
+        {                                                             \
+            /* af = bc & 0x8 */                                       \
+            ANDI(scratch1, scratch2, 8);                              \
+            BEQZ(scratch1, 8);                                        \
+            ORI(xFlags, xFlags, 1 << F_AF);                           \
+        }                                                             \
+        IFX(X_CF)                                                     \
+        {                                                             \
+            /* cf = bc & (1<<(width-1)) */                            \
+            if ((width) == 8) {                                       \
+                ANDI(scratch1, scratch2, 0x80);                       \
+            } else {                                                  \
+                SRLI(scratch1, scratch2, (width)-1);                  \
+                if ((width) != 64) ANDI(scratch1, scratch1, 1);       \
+            }                                                         \
+            BEQZ(scratch1, 8);                                        \
+            ORI(xFlags, xFlags, 1 << F_CF);                           \
+        }                                                             \
+        IFX(X_OF)                                                     \
+        {                                                             \
+            /* of = ((bc >> (width-2)) ^ (bc >> (width-1))) & 0x1; */ \
+            SRLI(scratch1, scratch2, (width)-2);                      \
+            SRLI(scratch2, scratch1, 1);                              \
+            XOR(scratch1, scratch1, scratch2);                        \
+            ANDI(scratch1, scratch1, 1);                              \
+            BEQZ(scratch1, 8);                                        \
+            ORI(xFlags, xFlags, 1 << F_OF2);                          \
+        }                                                             \
     }
 
 // Adjust the xFlags bit 11 -> bit 5, result in reg (can be xFlags, but not s1)
-#define FLAGS_ADJUST_FROM11(reg, s1)\
-    ANDI(reg, xFlags, ~(1<<5));     \
-    SRLI(s1, reg, 11-5);            \
-    ANDI(s1, s1, 1<<5);             \
+#define FLAGS_ADJUST_FROM11(reg, s1) \
+    ANDI(reg, xFlags, ~(1 << 5));    \
+    SRLI(s1, reg, 11 - 5);           \
+    ANDI(s1, s1, 1 << 5);            \
     OR(reg, reg, s1)
 
 // Adjust the xFlags bit 5 -> bit 11, src and dst can be the same (and can be xFlags, but not s1)
@@ -755,8 +860,8 @@
     LUI(s1, 0xFFFFF);                   \
     ADDIW(s1, s1, 0x7DF);               \
     AND(s1, src, s1);                   \
-    ANDI(dst, src, 1<<5);               \
-    SLLI(dst, dst, 11-5);               \
+    ANDI(dst, src, 1 << 5);             \
+    SLLI(dst, dst, 11 - 5);             \
     OR(dst, dst, s1)
 
 #ifndef MAYSETFLAGS
@@ -764,37 +869,39 @@
 #endif
 
 #ifndef READFLAGS
-#define READFLAGS(A) \
-    if(((A)!=X_PEND && dyn->f.pending!=SF_SET)          \
-    && (dyn->f.pending!=SF_SET_PENDING)) {              \
-        if(dyn->f.pending!=SF_PENDING) {                \
-            LD(x3, xEmu, offsetof(x64emu_t, df));       \
-            j64 = (GETMARKF)-(dyn->native_size);        \
-            BEQ(x3, xZR, j64);                          \
-        }                                               \
-        CALL_(UpdateFlags, -1, 0);                      \
-        FLAGS_ADJUST_FROM11(xFlags, x3);                \
-        MARKF;                                          \
-        dyn->f.pending = SF_SET;                        \
-        SET_DFOK();                                     \
+#define READFLAGS(A) /* emit a guarded UpdateFlags call unless flags are already set */        \
+    if (((A) != X_PEND && dyn->f.pending != SF_SET)                                            \
+        && (dyn->f.pending != SF_SET_PENDING)) {                                               \
+        if (dyn->f.pending != SF_PENDING) {                                                    \
+            LWU(x3, xEmu, offsetof(x64emu_t, df)); /* df is stored with SW, so load 32 bits */ \
+            j64 = (GETMARKF) - (dyn->native_size);                                             \
+            BEQ(x3, xZR, j64); /* df == 0 (what SET_DFNONE stores): skip the call */           \
+        }                                                                                      \
+        CALL_(UpdateFlags, -1, 0);                                                             \
+        FLAGS_ADJUST_FROM11(xFlags, x3);                                                       \
+        MARKF;                                                                                 \
+        dyn->f.pending = SF_SET;                                                               \
+        SET_DFOK();                                                                            \
     }
 #endif
 
 #ifndef SETFLAGS
-#define SETFLAGS(A, B)                                                                          \
-    if(dyn->f.pending!=SF_SET                                                                   \
-    && ((B)&SF_SUB)                                                                             \
-    && (dyn->insts[ninst].x64.gen_flags&(~(A))))                                                \
-        READFLAGS(((dyn->insts[ninst].x64.gen_flags&X_PEND)?X_ALL:dyn->insts[ninst].x64.gen_flags)&(~(A)));\
-    if(dyn->insts[ninst].x64.gen_flags) switch(B) {                                             \
-        case SF_SUBSET:                                                                         \
-        case SF_SET: dyn->f.pending = SF_SET; break;                                            \
-        case SF_PENDING: dyn->f.pending = SF_PENDING; break;                                    \
-        case SF_SUBSET_PENDING:                                                                 \
-        case SF_SET_PENDING:                                                                    \
-            dyn->f.pending = (dyn->insts[ninst].x64.gen_flags&X_PEND)?SF_SET_PENDING:SF_SET;    \
-            break;                                                                              \
-    } else dyn->f.pending = SF_SET
+#define SETFLAGS(A, B)                                                                                              \
+    if (dyn->f.pending != SF_SET                                                                                    \
+        && ((B)&SF_SUB)                                                                                             \
+        && (dyn->insts[ninst].x64.gen_flags & (~(A))))                                                              \
+        READFLAGS(((dyn->insts[ninst].x64.gen_flags & X_PEND) ? X_ALL : dyn->insts[ninst].x64.gen_flags) & (~(A))); \
+    if (dyn->insts[ninst].x64.gen_flags) switch (B) {                                                               \
+            case SF_SUBSET:                                                                                         \
+            case SF_SET: dyn->f.pending = SF_SET; break;                                                            \
+            case SF_PENDING: dyn->f.pending = SF_PENDING; break;                                                    \
+            case SF_SUBSET_PENDING:                                                                                 \
+            case SF_SET_PENDING:                                                                                    \
+                dyn->f.pending = (dyn->insts[ninst].x64.gen_flags & X_PEND) ? SF_SET_PENDING : SF_SET;              \
+                break;                                                                                              \
+        }                                                                                                           \
+    else                                                                                                            \
+        dyn->f.pending = SF_SET
 #endif
 #ifndef JUMP
 #define JUMP(A, C)
@@ -805,14 +912,24 @@
 #ifndef BARRIER_NEXT
 #define BARRIER_NEXT(A)
 #endif
-#define UFLAG_OP1(A) if(dyn->insts[ninst].x64.gen_flags) {SDxw(A, xEmu, offsetof(x64emu_t, op1));}
-#define UFLAG_OP2(A) if(dyn->insts[ninst].x64.gen_flags) {SDxw(A, xEmu, offsetof(x64emu_t, op2));}
-#define UFLAG_OP12(A1, A2) if(dyn->insts[ninst].x64.gen_flags) {SDxw(A1, xEmu, offsetof(x64emu_t, op1));SDxw(A2, xEmu, offsetof(x64emu_t, op2));}
-#define UFLAG_RES(A) if(dyn->insts[ninst].x64.gen_flags) {SDxw(A, xEmu, offsetof(x64emu_t, res));}
-#define UFLAG_DF(r, A) if(dyn->insts[ninst].x64.gen_flags) {SET_DF(r, A)}
-#define UFLAG_IF if(dyn->insts[ninst].x64.gen_flags)
+#define UFLAG_OP1(A) \
+    if (dyn->insts[ninst].x64.gen_flags) { SDxw(A, xEmu, offsetof(x64emu_t, op1)); }
+#define UFLAG_OP2(A) \
+    if (dyn->insts[ninst].x64.gen_flags) { SDxw(A, xEmu, offsetof(x64emu_t, op2)); }
+#define UFLAG_OP12(A1, A2)                       \
+    if (dyn->insts[ninst].x64.gen_flags) {       \
+        SDxw(A1, xEmu, offsetof(x64emu_t, op1)); \
+        SDxw(A2, xEmu, offsetof(x64emu_t, op2)); \
+    }
+#define UFLAG_RES(A) \
+    if (dyn->insts[ninst].x64.gen_flags) { SDxw(A, xEmu, offsetof(x64emu_t, res)); }
+#define UFLAG_DF(r, A) \
+    if (dyn->insts[ninst].x64.gen_flags) { SET_DF(r, A) }
+#define UFLAG_IF if (dyn->insts[ninst].x64.gen_flags)
 #ifndef DEFAULT
-#define DEFAULT      *ok = -1; BARRIER(2)
+#define DEFAULT \
+    *ok = -1;   \
+    BARRIER(2)
 #endif
 
 #ifndef TABLE64
@@ -830,206 +947,208 @@
 #define GETIP_(A)
 #else
 // put value in the Table64 even if not using it for now to avoid difference between Step2 and Step3. Needs to be optimized later...
-#define GETIP(A)                                        \
-    if(dyn->last_ip && ((A)-dyn->last_ip)<2048) {       \
-        uint64_t _delta_ip = (A)-dyn->last_ip;          \
-        dyn->last_ip += _delta_ip;                      \
-        if(_delta_ip) {                                 \
-            ADDI(xRIP, xRIP, _delta_ip);                \
-        }                                               \
-    } else {                                            \
-        dyn->last_ip = (A);                             \
-        if(dyn->last_ip<0xffffffff) {                   \
-            MOV64x(xRIP, dyn->last_ip);                 \
-        } else                                          \
-            TABLE64(xRIP, dyn->last_ip);                \
+#define GETIP(A)                                     \
+    if (dyn->last_ip && ((A)-dyn->last_ip) < 2048) { \
+        uint64_t _delta_ip = (A)-dyn->last_ip;       \
+        dyn->last_ip += _delta_ip;                   \
+        if (_delta_ip) {                             \
+            ADDI(xRIP, xRIP, _delta_ip);             \
+        }                                            \
+    } else {                                         \
+        dyn->last_ip = (A);                          \
+        if (dyn->last_ip < 0xffffffff) {             \
+            MOV64x(xRIP, dyn->last_ip);              \
+        } else                                       \
+            TABLE64(xRIP, dyn->last_ip);             \
     }
 #define GETIP_(A)                                       \
-    if(dyn->last_ip && ((A)-dyn->last_ip)<2048) {       \
+    if (dyn->last_ip && ((A)-dyn->last_ip) < 2048) {    \
         int64_t _delta_ip = (A)-dyn->last_ip;           \
-        if(_delta_ip) {ADDI(xRIP, xRIP, _delta_ip);}    \
+        if (_delta_ip) { ADDI(xRIP, xRIP, _delta_ip); } \
     } else {                                            \
-        if((A)<0xffffffff) {                            \
+        if ((A) < 0xffffffff) {                         \
             MOV64x(xRIP, (A));                          \
         } else                                          \
             TABLE64(xRIP, (A));                         \
     }
 #endif
-#define CLEARIP()   dyn->last_ip=0
+#define CLEARIP() dyn->last_ip = 0
 
 #if STEP < 2
-#define PASS2IF(A, B) if(A)
+#define PASS2IF(A, B) if (A)
 #elif STEP == 2
-#define PASS2IF(A, B) if(A) dyn->insts[ninst].pass2choice = B; if(dyn->insts[ninst].pass2choice == B)
+#define PASS2IF(A, B)                         \
+    if (A) dyn->insts[ninst].pass2choice = B; \
+    if (dyn->insts[ninst].pass2choice == B)
 #else
-#define PASS2IF(A, B) if(dyn->insts[ninst].pass2choice == B)
+#define PASS2IF(A, B) if (dyn->insts[ninst].pass2choice == B)
 #endif
 
-#define MODREG  ((nextop&0xC0)==0xC0)
+#define MODREG ((nextop & 0xC0) == 0xC0)
 
 void rv64_epilog(void);
 void rv64_epilog_fast(void);
 void* rv64_next(x64emu_t* emu, uintptr_t addr);
 
 #ifndef STEPNAME
-#define STEPNAME3(N,M) N##M
-#define STEPNAME2(N,M) STEPNAME3(N,M)
-#define STEPNAME(N) STEPNAME2(N, STEP)
+#define STEPNAME3(N, M) N##M
+#define STEPNAME2(N, M) STEPNAME3(N, M)
+#define STEPNAME(N)     STEPNAME2(N, STEP)
 #endif
 
-#define native_pass        STEPNAME(native_pass)
-
-#define dynarec64_00       STEPNAME(dynarec64_00)
-#define dynarec64_00_0     STEPNAME(dynarec64_00_0)
-#define dynarec64_00_1     STEPNAME(dynarec64_00_1)
-#define dynarec64_00_2     STEPNAME(dynarec64_00_2)
-#define dynarec64_00_3     STEPNAME(dynarec64_00_3)
-#define dynarec64_0F       STEPNAME(dynarec64_0F)
-#define dynarec64_64       STEPNAME(dynarec64_64)
-#define dynarec64_65       STEPNAME(dynarec64_65)
-#define dynarec64_66       STEPNAME(dynarec64_66)
-#define dynarec64_67       STEPNAME(dynarec64_67)
-#define dynarec64_D8       STEPNAME(dynarec64_D8)
-#define dynarec64_D9       STEPNAME(dynarec64_D9)
-#define dynarec64_DA       STEPNAME(dynarec64_DA)
-#define dynarec64_DB       STEPNAME(dynarec64_DB)
-#define dynarec64_DC       STEPNAME(dynarec64_DC)
-#define dynarec64_DD       STEPNAME(dynarec64_DD)
-#define dynarec64_DE       STEPNAME(dynarec64_DE)
-#define dynarec64_DF       STEPNAME(dynarec64_DF)
-#define dynarec64_F0       STEPNAME(dynarec64_F0)
-#define dynarec64_660F     STEPNAME(dynarec64_660F)
-#define dynarec64_6664     STEPNAME(dynarec64_6664)
-#define dynarec64_66F0     STEPNAME(dynarec64_66F0)
-#define dynarec64_F20F     STEPNAME(dynarec64_F20F)
-#define dynarec64_F30F     STEPNAME(dynarec64_F30F)
-
-#define geted           STEPNAME(geted)
-#define geted32         STEPNAME(geted32)
-#define geted16         STEPNAME(geted16)
-#define jump_to_epilog  STEPNAME(jump_to_epilog)
-#define jump_to_epilog_fast  STEPNAME(jump_to_epilog_fast)
-#define jump_to_next    STEPNAME(jump_to_next)
-#define ret_to_epilog   STEPNAME(ret_to_epilog)
-#define retn_to_epilog  STEPNAME(retn_to_epilog)
-#define iret_to_epilog  STEPNAME(iret_to_epilog)
-#define call_c          STEPNAME(call_c)
-#define call_n          STEPNAME(call_n)
-#define grab_segdata    STEPNAME(grab_segdata)
-#define emit_cmp8       STEPNAME(emit_cmp8)
-#define emit_cmp16      STEPNAME(emit_cmp16)
-#define emit_cmp32      STEPNAME(emit_cmp32)
-#define emit_cmp8_0     STEPNAME(emit_cmp8_0)
-#define emit_cmp16_0    STEPNAME(emit_cmp16_0)
-#define emit_cmp32_0    STEPNAME(emit_cmp32_0)
-#define emit_test8      STEPNAME(emit_test8)
-#define emit_test16     STEPNAME(emit_test16)
-#define emit_test32     STEPNAME(emit_test32)
-#define emit_test32c    STEPNAME(emit_test32)
-#define emit_add32      STEPNAME(emit_add32)
-#define emit_add32c     STEPNAME(emit_add32c)
-#define emit_add8       STEPNAME(emit_add8)
-#define emit_add8c      STEPNAME(emit_add8c)
-#define emit_sub32      STEPNAME(emit_sub32)
-#define emit_sub32c     STEPNAME(emit_sub32c)
-#define emit_sub8       STEPNAME(emit_sub8)
-#define emit_sub8c      STEPNAME(emit_sub8c)
-#define emit_or32       STEPNAME(emit_or32)
-#define emit_or32c      STEPNAME(emit_or32c)
-#define emit_xor32      STEPNAME(emit_xor32)
-#define emit_xor32c     STEPNAME(emit_xor32c)
-#define emit_and32      STEPNAME(emit_and32)
-#define emit_and32c     STEPNAME(emit_and32c)
-#define emit_or8        STEPNAME(emit_or8)
-#define emit_or8c       STEPNAME(emit_or8c)
-#define emit_xor8       STEPNAME(emit_xor8)
-#define emit_xor8c      STEPNAME(emit_xor8c)
-#define emit_and8       STEPNAME(emit_and8)
-#define emit_and8c      STEPNAME(emit_and8c)
-#define emit_add16      STEPNAME(emit_add16)
-#define emit_add16c     STEPNAME(emit_add16c)
-#define emit_sub16      STEPNAME(emit_sub16)
-#define emit_sub16c     STEPNAME(emit_sub16c)
-#define emit_or16       STEPNAME(emit_or16)
-#define emit_or16c      STEPNAME(emit_or16c)
-#define emit_xor16      STEPNAME(emit_xor16)
-#define emit_xor16c     STEPNAME(emit_xor16c)
-#define emit_and16      STEPNAME(emit_and16)
-#define emit_and16c     STEPNAME(emit_and16c)
-#define emit_inc32      STEPNAME(emit_inc32)
-#define emit_inc16      STEPNAME(emit_inc16)
-#define emit_inc8       STEPNAME(emit_inc8)
-#define emit_dec32      STEPNAME(emit_dec32)
-#define emit_dec16      STEPNAME(emit_dec16)
-#define emit_dec8       STEPNAME(emit_dec8)
-#define emit_adc32      STEPNAME(emit_adc32)
-#define emit_adc32c     STEPNAME(emit_adc32c)
-#define emit_adc8       STEPNAME(emit_adc8)
-#define emit_adc8c      STEPNAME(emit_adc8c)
-#define emit_adc16      STEPNAME(emit_adc16)
-#define emit_adc16c     STEPNAME(emit_adc16c)
-#define emit_sbb32      STEPNAME(emit_sbb32)
-#define emit_sbb32c     STEPNAME(emit_sbb32c)
-#define emit_sbb8       STEPNAME(emit_sbb8)
-#define emit_sbb8c      STEPNAME(emit_sbb8c)
-#define emit_sbb16      STEPNAME(emit_sbb16)
-#define emit_sbb16c     STEPNAME(emit_sbb16c)
-#define emit_neg32      STEPNAME(emit_neg32)
-#define emit_neg16      STEPNAME(emit_neg16)
-#define emit_neg8       STEPNAME(emit_neg8)
-#define emit_shl32      STEPNAME(emit_shl32)
-#define emit_shl32c     STEPNAME(emit_shl32c)
-#define emit_shr32      STEPNAME(emit_shr32)
-#define emit_shr32c     STEPNAME(emit_shr32c)
-#define emit_sar32c     STEPNAME(emit_sar32c)
-#define emit_rol32      STEPNAME(emit_rol32)
-#define emit_ror32      STEPNAME(emit_ror32)
-#define emit_rol32c     STEPNAME(emit_rol32c)
-#define emit_ror32c     STEPNAME(emit_ror32c)
-#define emit_shrd32c    STEPNAME(emit_shrd32c)
-#define emit_shld32c    STEPNAME(emit_shld32c)
-
-#define emit_pf         STEPNAME(emit_pf)
-
-#define x87_do_push     STEPNAME(x87_do_push)
-#define x87_do_push_empty STEPNAME(x87_do_push_empty)
-#define x87_do_pop      STEPNAME(x87_do_pop)
-#define x87_get_current_cache   STEPNAME(x87_get_current_cache)
-#define x87_get_cache   STEPNAME(x87_get_cache)
-#define x87_get_extcache STEPNAME(x87_get_extcache)
-#define x87_get_st      STEPNAME(x87_get_st)
-#define x87_get_st_empty  STEPNAME(x87_get_st)
-#define x87_refresh     STEPNAME(x87_refresh)
-#define x87_forget      STEPNAME(x87_forget)
-#define x87_reget_st    STEPNAME(x87_reget_st)
-#define x87_stackcount  STEPNAME(x87_stackcount)
-#define x87_swapreg     STEPNAME(x87_swapreg)
-#define x87_setround    STEPNAME(x87_setround)
-#define x87_restoreround STEPNAME(x87_restoreround)
-#define sse_setround    STEPNAME(sse_setround)
-#define mmx_get_reg     STEPNAME(mmx_get_reg)
-#define mmx_get_reg_empty STEPNAME(mmx_get_reg_empty)
-#define mmx_forget_reg   STEPNAME(mmx_forget_reg)
-#define sse_get_reg     STEPNAME(sse_get_reg)
-#define sse_get_reg_empty STEPNAME(sse_get_reg_empty)
-#define sse_forget_reg   STEPNAME(sse_forget_reg)
-#define sse_purge07cache STEPNAME(sse_purge07cache)
-
-#define fpu_pushcache   STEPNAME(fpu_pushcache)
-#define fpu_popcache    STEPNAME(fpu_popcache)
-#define fpu_reset       STEPNAME(fpu_reset)
-#define fpu_reset_cache STEPNAME(fpu_reset_cache)
+#define native_pass STEPNAME(native_pass)
+
+#define dynarec64_00   STEPNAME(dynarec64_00)
+#define dynarec64_00_0 STEPNAME(dynarec64_00_0)
+#define dynarec64_00_1 STEPNAME(dynarec64_00_1)
+#define dynarec64_00_2 STEPNAME(dynarec64_00_2)
+#define dynarec64_00_3 STEPNAME(dynarec64_00_3)
+#define dynarec64_0F   STEPNAME(dynarec64_0F)
+#define dynarec64_64   STEPNAME(dynarec64_64)
+#define dynarec64_65   STEPNAME(dynarec64_65)
+#define dynarec64_66   STEPNAME(dynarec64_66)
+#define dynarec64_67   STEPNAME(dynarec64_67)
+#define dynarec64_D8   STEPNAME(dynarec64_D8)
+#define dynarec64_D9   STEPNAME(dynarec64_D9)
+#define dynarec64_DA   STEPNAME(dynarec64_DA)
+#define dynarec64_DB   STEPNAME(dynarec64_DB)
+#define dynarec64_DC   STEPNAME(dynarec64_DC)
+#define dynarec64_DD   STEPNAME(dynarec64_DD)
+#define dynarec64_DE   STEPNAME(dynarec64_DE)
+#define dynarec64_DF   STEPNAME(dynarec64_DF)
+#define dynarec64_F0   STEPNAME(dynarec64_F0)
+#define dynarec64_660F STEPNAME(dynarec64_660F)
+#define dynarec64_6664 STEPNAME(dynarec64_6664)
+#define dynarec64_66F0 STEPNAME(dynarec64_66F0)
+#define dynarec64_F20F STEPNAME(dynarec64_F20F)
+#define dynarec64_F30F STEPNAME(dynarec64_F30F)
+
+#define geted               STEPNAME(geted)
+#define geted32             STEPNAME(geted32)
+#define geted16             STEPNAME(geted16)
+#define jump_to_epilog      STEPNAME(jump_to_epilog)
+#define jump_to_epilog_fast STEPNAME(jump_to_epilog_fast)
+#define jump_to_next        STEPNAME(jump_to_next)
+#define ret_to_epilog       STEPNAME(ret_to_epilog)
+#define retn_to_epilog      STEPNAME(retn_to_epilog)
+#define iret_to_epilog      STEPNAME(iret_to_epilog)
+#define call_c              STEPNAME(call_c)
+#define call_n              STEPNAME(call_n)
+#define grab_segdata        STEPNAME(grab_segdata)
+#define emit_cmp8           STEPNAME(emit_cmp8)
+#define emit_cmp16          STEPNAME(emit_cmp16)
+#define emit_cmp32          STEPNAME(emit_cmp32)
+#define emit_cmp8_0         STEPNAME(emit_cmp8_0)
+#define emit_cmp16_0        STEPNAME(emit_cmp16_0)
+#define emit_cmp32_0        STEPNAME(emit_cmp32_0)
+#define emit_test8          STEPNAME(emit_test8)
+#define emit_test16         STEPNAME(emit_test16)
+#define emit_test32         STEPNAME(emit_test32)
+#define emit_test32c        STEPNAME(emit_test32)
+#define emit_add32          STEPNAME(emit_add32)
+#define emit_add32c         STEPNAME(emit_add32c)
+#define emit_add8           STEPNAME(emit_add8)
+#define emit_add8c          STEPNAME(emit_add8c)
+#define emit_sub32          STEPNAME(emit_sub32)
+#define emit_sub32c         STEPNAME(emit_sub32c)
+#define emit_sub8           STEPNAME(emit_sub8)
+#define emit_sub8c          STEPNAME(emit_sub8c)
+#define emit_or32           STEPNAME(emit_or32)
+#define emit_or32c          STEPNAME(emit_or32c)
+#define emit_xor32          STEPNAME(emit_xor32)
+#define emit_xor32c         STEPNAME(emit_xor32c)
+#define emit_and32          STEPNAME(emit_and32)
+#define emit_and32c         STEPNAME(emit_and32c)
+#define emit_or8            STEPNAME(emit_or8)
+#define emit_or8c           STEPNAME(emit_or8c)
+#define emit_xor8           STEPNAME(emit_xor8)
+#define emit_xor8c          STEPNAME(emit_xor8c)
+#define emit_and8           STEPNAME(emit_and8)
+#define emit_and8c          STEPNAME(emit_and8c)
+#define emit_add16          STEPNAME(emit_add16)
+#define emit_add16c         STEPNAME(emit_add16c)
+#define emit_sub16          STEPNAME(emit_sub16)
+#define emit_sub16c         STEPNAME(emit_sub16c)
+#define emit_or16           STEPNAME(emit_or16)
+#define emit_or16c          STEPNAME(emit_or16c)
+#define emit_xor16          STEPNAME(emit_xor16)
+#define emit_xor16c         STEPNAME(emit_xor16c)
+#define emit_and16          STEPNAME(emit_and16)
+#define emit_and16c         STEPNAME(emit_and16c)
+#define emit_inc32          STEPNAME(emit_inc32)
+#define emit_inc16          STEPNAME(emit_inc16)
+#define emit_inc8           STEPNAME(emit_inc8)
+#define emit_dec32          STEPNAME(emit_dec32)
+#define emit_dec16          STEPNAME(emit_dec16)
+#define emit_dec8           STEPNAME(emit_dec8)
+#define emit_adc32          STEPNAME(emit_adc32)
+#define emit_adc32c         STEPNAME(emit_adc32c)
+#define emit_adc8           STEPNAME(emit_adc8)
+#define emit_adc8c          STEPNAME(emit_adc8c)
+#define emit_adc16          STEPNAME(emit_adc16)
+#define emit_adc16c         STEPNAME(emit_adc16c)
+#define emit_sbb32          STEPNAME(emit_sbb32)
+#define emit_sbb32c         STEPNAME(emit_sbb32c)
+#define emit_sbb8           STEPNAME(emit_sbb8)
+#define emit_sbb8c          STEPNAME(emit_sbb8c)
+#define emit_sbb16          STEPNAME(emit_sbb16)
+#define emit_sbb16c         STEPNAME(emit_sbb16c)
+#define emit_neg32          STEPNAME(emit_neg32)
+#define emit_neg16          STEPNAME(emit_neg16)
+#define emit_neg8           STEPNAME(emit_neg8)
+#define emit_shl32          STEPNAME(emit_shl32)
+#define emit_shl32c         STEPNAME(emit_shl32c)
+#define emit_shr32          STEPNAME(emit_shr32)
+#define emit_shr32c         STEPNAME(emit_shr32c)
+#define emit_sar32c         STEPNAME(emit_sar32c)
+#define emit_rol32          STEPNAME(emit_rol32)
+#define emit_ror32          STEPNAME(emit_ror32)
+#define emit_rol32c         STEPNAME(emit_rol32c)
+#define emit_ror32c         STEPNAME(emit_ror32c)
+#define emit_shrd32c        STEPNAME(emit_shrd32c)
+#define emit_shld32c        STEPNAME(emit_shld32c)
+
+#define emit_pf STEPNAME(emit_pf)
+
+#define x87_do_push           STEPNAME(x87_do_push)
+#define x87_do_push_empty     STEPNAME(x87_do_push_empty)
+#define x87_do_pop            STEPNAME(x87_do_pop)
+#define x87_get_current_cache STEPNAME(x87_get_current_cache)
+#define x87_get_cache         STEPNAME(x87_get_cache)
+#define x87_get_extcache      STEPNAME(x87_get_extcache)
+#define x87_get_st            STEPNAME(x87_get_st)
+#define x87_get_st_empty      STEPNAME(x87_get_st)
+#define x87_refresh           STEPNAME(x87_refresh)
+#define x87_forget            STEPNAME(x87_forget)
+#define x87_reget_st          STEPNAME(x87_reget_st)
+#define x87_stackcount        STEPNAME(x87_stackcount)
+#define x87_swapreg           STEPNAME(x87_swapreg)
+#define x87_setround          STEPNAME(x87_setround)
+#define x87_restoreround      STEPNAME(x87_restoreround)
+#define sse_setround          STEPNAME(sse_setround)
+#define mmx_get_reg           STEPNAME(mmx_get_reg)
+#define mmx_get_reg_empty     STEPNAME(mmx_get_reg_empty)
+#define mmx_forget_reg        STEPNAME(mmx_forget_reg)
+#define sse_get_reg           STEPNAME(sse_get_reg)
+#define sse_get_reg_empty     STEPNAME(sse_get_reg_empty)
+#define sse_forget_reg        STEPNAME(sse_forget_reg)
+#define sse_purge07cache      STEPNAME(sse_purge07cache)
+
+#define fpu_pushcache       STEPNAME(fpu_pushcache)
+#define fpu_popcache        STEPNAME(fpu_popcache)
+#define fpu_reset           STEPNAME(fpu_reset)
+#define fpu_reset_cache     STEPNAME(fpu_reset_cache)
 #define fpu_propagate_stack STEPNAME(fpu_propagate_stack)
-#define fpu_purgecache  STEPNAME(fpu_purgecache)
-#define mmx_purgecache  STEPNAME(mmx_purgecache)
-#define x87_purgecache  STEPNAME(x87_purgecache)
-#define sse_purgecache  STEPNAME(sse_purgecache)
-#define fpu_reflectcache STEPNAME(fpu_reflectcache)
-#define fpu_unreflectcache STEPNAME(fpu_unreflectcache)
+#define fpu_purgecache      STEPNAME(fpu_purgecache)
+#define mmx_purgecache      STEPNAME(mmx_purgecache)
+#define x87_purgecache      STEPNAME(x87_purgecache)
+#define sse_purgecache      STEPNAME(sse_purgecache)
+#define fpu_reflectcache    STEPNAME(fpu_reflectcache)
+#define fpu_unreflectcache  STEPNAME(fpu_unreflectcache)
 
-#define CacheTransform       STEPNAME(CacheTransform)
-#define rv64_move64     STEPNAME(rv64_move64)
-#define rv64_move32     STEPNAME(rv64_move32)
+#define CacheTransform STEPNAME(CacheTransform)
+#define rv64_move64    STEPNAME(rv64_move64)
+#define rv64_move32    STEPNAME(rv64_move32)
 
 /* setup r2 to address pointed by */
 uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta);
@@ -1038,7 +1157,7 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
 uintptr_t geted32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta);
 
 /* setup r2 to address pointed by */
-//uintptr_t geted16(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int absmax, uint32_t mask, int s);
+// uintptr_t geted16(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int absmax, uint32_t mask, int s);
 
 
 // generic x64 helper
@@ -1082,15 +1201,15 @@ void emit_xor8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s
 void emit_and8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
 void emit_and8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
 void emit_add16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
-//void emit_add16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
+// void emit_add16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
 void emit_sub16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
-//void emit_sub16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
+// void emit_sub16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
 void emit_or16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
-//void emit_or16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
+// void emit_or16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
 void emit_xor16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
-//void emit_xor16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
+// void emit_xor16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
 void emit_and16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
-//void emit_and16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
+// void emit_and16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
 void emit_inc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5);
 void emit_inc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
 void emit_inc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
@@ -1098,17 +1217,17 @@ void emit_dec32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 void emit_dec16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
 void emit_dec8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
 void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5, int s6);
-//void emit_adc32c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
+// void emit_adc32c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
 void emit_adc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
 void emit_adc8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5, int s6);
 void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
-//void emit_adc16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
+// void emit_adc16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
 void emit_sbb32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5);
-//void emit_sbb32c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
+// void emit_sbb32c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
 void emit_sbb8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
 void emit_sbb8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5, int s6);
 void emit_sbb16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
-//void emit_sbb16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
+// void emit_sbb16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
 void emit_neg32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3);
 void emit_neg16(dynarec_rv64_t* dyn, int ninst, int s1, int s3, int s4);
 void emit_neg8(dynarec_rv64_t* dyn, int ninst, int s1, int s3, int s4);
@@ -1166,44 +1285,44 @@ void rv64_move64(dynarec_rv64_t* dyn, int ninst, int reg, int64_t val);
 void rv64_move32(dynarec_rv64_t* dyn, int ninst, int reg, int32_t val, int zeroup);
 
 #if STEP < 2
-#define CHECK_CACHE()   0
+#define CHECK_CACHE() 0
 #else
-#define CHECK_CACHE()   (cacheupd = CacheNeedsTransform(dyn, ninst))
+#define CHECK_CACHE() (cacheupd = CacheNeedsTransform(dyn, ninst))
 #endif
 #define extcache_st_coherency STEPNAME(extcache_st_coherency)
 int extcache_st_coherency(dynarec_rv64_t* dyn, int ninst, int a, int b);
 
 #if STEP == 0
-#define ST_IS_F(A)          0
-#define X87_COMBINE(A, B)   EXT_CACHE_ST_D
-#define X87_ST0             EXT_CACHE_ST_D
-#define X87_ST(A)           EXT_CACHE_ST_D
+#define ST_IS_F(A)        0
+#define X87_COMBINE(A, B) EXT_CACHE_ST_D
+#define X87_ST0           EXT_CACHE_ST_D
+#define X87_ST(A)         EXT_CACHE_ST_D
 #elif STEP == 1
-#define ST_IS_F(A) (extcache_get_current_st(dyn, ninst, A)==EXT_CACHE_ST_F)
+#define ST_IS_F(A)        (extcache_get_current_st(dyn, ninst, A) == EXT_CACHE_ST_F)
 #define X87_COMBINE(A, B) extcache_combine_st(dyn, ninst, A, B)
-#define X87_ST0     extcache_get_current_st(dyn, ninst, 0)
-#define X87_ST(A)   extcache_get_current_st(dyn, ninst, A)
+#define X87_ST0           extcache_get_current_st(dyn, ninst, 0)
+#define X87_ST(A)         extcache_get_current_st(dyn, ninst, A)
 #else
-#define ST_IS_F(A) (extcache_get_st(dyn, ninst, A)==EXT_CACHE_ST_F)
+#define ST_IS_F(A) (extcache_get_st(dyn, ninst, A) == EXT_CACHE_ST_F)
 #if STEP == 3
 #define X87_COMBINE(A, B) extcache_st_coherency(dyn, ninst, A, B)
 #else
 #define X87_COMBINE(A, B) extcache_get_st(dyn, ninst, A)
 #endif
-#define X87_ST0     extcache_get_st(dyn, ninst, 0)
-#define X87_ST(A)   extcache_get_st(dyn, ninst, A)
+#define X87_ST0   extcache_get_st(dyn, ninst, 0)
+#define X87_ST(A) extcache_get_st(dyn, ninst, A)
 #endif
 
-//MMX helpers
-// get float register for a MMX reg, create the entry if needed
+// MMX helpers
+//  get float register for a MMX reg, create the entry if needed
 int mmx_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a);
 // get float register for a MMX reg, but don't try to synch it if it needed to be created
 int mmx_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a);
 // forget float register for a MMX reg, create the entry if needed
 void mmx_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
 
-//SSE/SSE2 helpers
-// get float register for a SSE reg, create the entry if needed
+// SSE/SSE2 helpers
+//  get float register for a SSE reg, create the entry if needed
 int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
 // get float register for a SSE reg, but don't try to synch it if it needed to be created
 int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
@@ -1238,12 +1357,12 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
 uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int seg, int* ok, int* need_epilog);
-//uintptr_t dynarec64_65(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep,int* ok, int* need_epilog);
+// uintptr_t dynarec64_65(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep,int* ok, int* need_epilog);
 uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_D8(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
-//uintptr_t dynarec64_DA(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+// uintptr_t dynarec64_DA(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_DC(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_DD(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
@@ -1259,139 +1378,123 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
 #if STEP < 2
 #define PASS2(A)
 #else
-#define PASS2(A)   A
+#define PASS2(A) A
 #endif
 
 #if STEP < 3
 #define PASS3(A)
 #else
-#define PASS3(A)   A
+#define PASS3(A) A
 #endif
 
 #if STEP < 3
-#define MAYUSE(A)   (void)A
+#define MAYUSE(A) (void)A
 #else
 #define MAYUSE(A)
 #endif
 
 // GOCOND will use x1 and x3
-#define GOCOND(B, T1, T2)                                   \
-    case B+0x0:                                             \
-        INST_NAME(T1 "O " T2);                              \
-        GO( ANDI(x1, xFlags, 1<<F_OF2)                      \
-            , EQZ, NEZ, X_OF)                               \
-        break;                                              \
-    case B+0x1:                                             \
-        INST_NAME(T1 "NO " T2);                             \
-        GO( ANDI(x1, xFlags, 1<<F_OF2)                      \
-            , NEZ, EQZ, X_OF)                               \
-        break;                                              \
-    case B+0x2:                                             \
-        INST_NAME(T1 "C " T2);                              \
-        GO( ANDI(x1, xFlags, 1<<F_CF)                       \
-            , EQZ, NEZ, X_CF)                               \
-        break;                                              \
-    case B+0x3:                                             \
-        INST_NAME(T1 "NC " T2);                             \
-        GO( ANDI(x1, xFlags, 1<<F_CF)                       \
-            , NEZ, EQZ, X_CF)                               \
-        break;                                              \
-    case B+0x4:                                             \
-        INST_NAME(T1 "Z " T2);                              \
-        GO( ANDI(x1, xFlags, 1<<F_ZF)                       \
-            , EQZ, NEZ, X_ZF)                               \
-        break;                                              \
-    case B+0x5:                                             \
-        INST_NAME(T1 "NZ " T2);                             \
-        GO( ANDI(x1, xFlags, 1<<F_ZF)                       \
-            , NEZ, EQZ, X_ZF)                               \
-        break;                                              \
-    case B+0x6:                                             \
-        INST_NAME(T1 "BE " T2);                             \
-        GO( ANDI(x1, xFlags, (1<<F_CF)|(1<<F_ZF))           \
-            , EQZ, NEZ, X_CF|X_ZF)                          \
-        break;                                              \
-    case B+0x7:                                             \
-        INST_NAME(T1 "NBE " T2);                            \
-        GO( ANDI(x1, xFlags, (1<<F_CF)|(1<<F_ZF))           \
-            , NEZ, EQZ, X_CF|X_ZF)                          \
-        break;                                              \
-    case B+0x8:                                             \
-        INST_NAME(T1 "S " T2);                              \
-        GO( ANDI(x1, xFlags, 1<<F_SF)                       \
-            , EQZ, NEZ, X_SF)                               \
-        break;                                              \
-    case B+0x9:                                             \
-        INST_NAME(T1 "NS " T2);                             \
-        GO( ANDI(x1, xFlags, 1<<F_SF)                       \
-            , NEZ, EQZ, X_SF)                               \
-        break;                                              \
-    case B+0xA:                                             \
-        INST_NAME(T1 "P " T2);                              \
-        GO( ANDI(x1, xFlags, 1<<F_PF)                       \
-            , EQZ, NEZ, X_PF)                               \
-        break;                                              \
-    case B+0xB:                                             \
-        INST_NAME(T1 "NP " T2);                             \
-        GO( ANDI(x1, xFlags, 1<<F_PF)                       \
-            , NEZ, EQZ, X_PF)                               \
-        break;                                              \
-    case B+0xC:                                             \
-        INST_NAME(T1 "L " T2);                              \
-        GO( SRLI(x1, xFlags, F_SF-F_OF2);                   \
-            XOR(x1, x1, xFlags);                            \
-            ANDI(x1, x1, 1<<F_OF2)                          \
-            , EQZ, NEZ, X_SF|X_OF)                          \
-        break;                                              \
-    case B+0xD:                                             \
-        INST_NAME(T1 "GE " T2);                             \
-        GO( SRLI(x1, xFlags, F_SF-F_OF2);                   \
-            XOR(x1, x1, xFlags);                            \
-            ANDI(x1, x1, 1<<F_OF2)                          \
-            , NEZ, EQZ, X_SF|X_OF)                          \
-        break;                                              \
-    case B+0xE:                                             \
-        INST_NAME(T1 "LE " T2);                             \
-        GO( SRLI(x1, xFlags, F_SF-F_OF2);                   \
-            XOR(x1, x1, xFlags);                            \
-            ANDI(x1, x1, 1<<F_OF2);                         \
-            ANDI(x3, xFlags, 1<<F_ZF);                      \
-            OR(x1, x1, x3);                                 \
-            ANDI(x1, x1, (1<<F_OF2) | (1<<F_ZF))            \
-            , EQZ, NEZ, X_SF|X_OF|X_ZF)                     \
-        break;                                              \
-    case B+0xF:                                             \
-        INST_NAME(T1 "G " T2);                              \
-        GO( SRLI(x1, xFlags, F_SF-F_OF2);                   \
-            XOR(x1, x1, xFlags);                            \
-            ANDI(x1, x1, 1<<F_OF2);                         \
-            ANDI(x3, xFlags, 1<<F_ZF);                      \
-            OR(x1, x1, x3);                                 \
-            ANDI(x1, x1, (1<<F_OF2) | (1<<F_ZF))            \
-            , NEZ, EQZ, X_SF|X_OF|X_ZF)                     \
+#define GOCOND(B, T1, T2)                                                           \
+    case B + 0x0:                                                                   \
+        INST_NAME(T1 "O " T2);                                                      \
+        GO(ANDI(x1, xFlags, 1 << F_OF2), EQZ, NEZ, X_OF)                            \
+        break;                                                                      \
+    case B + 0x1:                                                                   \
+        INST_NAME(T1 "NO " T2);                                                     \
+        GO(ANDI(x1, xFlags, 1 << F_OF2), NEZ, EQZ, X_OF)                            \
+        break;                                                                      \
+    case B + 0x2:                                                                   \
+        INST_NAME(T1 "C " T2);                                                      \
+        GO(ANDI(x1, xFlags, 1 << F_CF), EQZ, NEZ, X_CF)                             \
+        break;                                                                      \
+    case B + 0x3:                                                                   \
+        INST_NAME(T1 "NC " T2);                                                     \
+        GO(ANDI(x1, xFlags, 1 << F_CF), NEZ, EQZ, X_CF)                             \
+        break;                                                                      \
+    case B + 0x4:                                                                   \
+        INST_NAME(T1 "Z " T2);                                                      \
+        GO(ANDI(x1, xFlags, 1 << F_ZF), EQZ, NEZ, X_ZF)                             \
+        break;                                                                      \
+    case B + 0x5:                                                                   \
+        INST_NAME(T1 "NZ " T2);                                                     \
+        GO(ANDI(x1, xFlags, 1 << F_ZF), NEZ, EQZ, X_ZF)                             \
+        break;                                                                      \
+    case B + 0x6:                                                                   \
+        INST_NAME(T1 "BE " T2);                                                     \
+        GO(ANDI(x1, xFlags, (1 << F_CF) | (1 << F_ZF)), EQZ, NEZ, X_CF | X_ZF)      \
+        break;                                                                      \
+    case B + 0x7:                                                                   \
+        INST_NAME(T1 "NBE " T2);                                                    \
+        GO(ANDI(x1, xFlags, (1 << F_CF) | (1 << F_ZF)), NEZ, EQZ, X_CF | X_ZF)      \
+        break;                                                                      \
+    case B + 0x8:                                                                   \
+        INST_NAME(T1 "S " T2);                                                      \
+        GO(ANDI(x1, xFlags, 1 << F_SF), EQZ, NEZ, X_SF)                             \
+        break;                                                                      \
+    case B + 0x9:                                                                   \
+        INST_NAME(T1 "NS " T2);                                                     \
+        GO(ANDI(x1, xFlags, 1 << F_SF), NEZ, EQZ, X_SF)                             \
+        break;                                                                      \
+    case B + 0xA:                                                                   \
+        INST_NAME(T1 "P " T2);                                                      \
+        GO(ANDI(x1, xFlags, 1 << F_PF), EQZ, NEZ, X_PF)                             \
+        break;                                                                      \
+    case B + 0xB:                                                                   \
+        INST_NAME(T1 "NP " T2);                                                     \
+        GO(ANDI(x1, xFlags, 1 << F_PF), NEZ, EQZ, X_PF)                             \
+        break;                                                                      \
+    case B + 0xC:                                                                   \
+        INST_NAME(T1 "L " T2);                                                      \
+        GO(SRLI(x1, xFlags, F_SF - F_OF2);                                          \
+            XOR(x1, x1, xFlags);                                                    \
+            ANDI(x1, x1, 1 << F_OF2), EQZ, NEZ, X_SF | X_OF)                        \
+        break;                                                                      \
+    case B + 0xD:                                                                   \
+        INST_NAME(T1 "GE " T2);                                                     \
+        GO(SRLI(x1, xFlags, F_SF - F_OF2);                                          \
+            XOR(x1, x1, xFlags);                                                    \
+            ANDI(x1, x1, 1 << F_OF2), NEZ, EQZ, X_SF | X_OF)                        \
+        break;                                                                      \
+    case B + 0xE:                                                                   \
+        INST_NAME(T1 "LE " T2);                                                     \
+        GO(SRLI(x1, xFlags, F_SF - F_OF2);                                          \
+            XOR(x1, x1, xFlags);                                                    \
+            ANDI(x1, x1, 1 << F_OF2);                                               \
+            ANDI(x3, xFlags, 1 << F_ZF);                                            \
+            OR(x1, x1, x3);                                                         \
+            ANDI(x1, x1, (1 << F_OF2) | (1 << F_ZF)), EQZ, NEZ, X_SF | X_OF | X_ZF) \
+        break;                                                                      \
+    case B + 0xF:                                                                   \
+        INST_NAME(T1 "G " T2);                                                      \
+        GO(SRLI(x1, xFlags, F_SF - F_OF2);                                          \
+            XOR(x1, x1, xFlags);                                                    \
+            ANDI(x1, x1, 1 << F_OF2);                                               \
+            ANDI(x3, xFlags, 1 << F_ZF);                                            \
+            OR(x1, x1, x3);                                                         \
+            ANDI(x1, x1, (1 << F_OF2) | (1 << F_ZF)), NEZ, EQZ, X_SF | X_OF | X_ZF) \
         break
 
-#define NOTEST(s1)                                          \
-    if(box64_dynarec_test) {                                \
-        SW(xZR, xEmu, offsetof(x64emu_t, test.test));       \
-        SW(xZR, xEmu, offsetof(x64emu_t, test.clean));      \
+#define NOTEST(s1)                                     \
+    if (box64_dynarec_test) {                          \
+        SW(xZR, xEmu, offsetof(x64emu_t, test.test));  \
+        SW(xZR, xEmu, offsetof(x64emu_t, test.clean)); \
     }
-#define SKIPTEST(s1)                                        \
-    if(box64_dynarec_test) {                                \
-        SW(xZR, xEmu, offsetof(x64emu_t, test.clean));      \
+#define SKIPTEST(s1)                                   \
+    if (box64_dynarec_test) {                          \
+        SW(xZR, xEmu, offsetof(x64emu_t, test.clean)); \
     }
-#define GOTEST(s1, s2)                                      \
-    if(box64_dynarec_test) {                                \
-        MOV32w(s2, 1);                                      \
-        SW(s2, xEmu, offsetof(x64emu_t, test.test));        \
+#define GOTEST(s1, s2)                               \
+    if (box64_dynarec_test) {                        \
+        MOV32w(s2, 1);                               \
+        SW(s2, xEmu, offsetof(x64emu_t, test.test)); \
     }
 
-#define GETREX()                                \
-    rex.rex = 0;                                \
-    if(!rex.is32bits)                           \
-        while(opcode>=0x40 && opcode<=0x4f) {   \
-            rex.rex = opcode;                   \
-            opcode = F8;                        \
+#define GETREX()                                   \
+    rex.rex = 0;                                   \
+    if (!rex.is32bits)                             \
+        while (opcode >= 0x40 && opcode <= 0x4f) { \
+            rex.rex = opcode;                      \
+            opcode = F8;                           \
         }
 
 #endif //__DYNAREC_RV64_HELPER_H__
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index d377309d..b7058c9a 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -139,12 +139,12 @@ int Table64(dynarec_rv64_t *dyn, uint64_t val, int pass);  // add a value to tab
 
 void CreateJmpNext(void* addr, void* next);
 
-#define GO_TRACE(A, B)      \
+#define GO_TRACE(A, B, s0)  \
     GETIP(addr);            \
     MV(A1, xRIP);           \
-    STORE_XEMU_CALL();      \
+    STORE_XEMU_CALL(s0);    \
     MOV64x(A2, B);          \
     CALL(A, -1);            \
     LOAD_XEMU_CALL()
 
-#endif //__DYNAREC_RV64_PRIVATE_H_
\ No newline at end of file
+#endif //__DYNAREC_RV64_PRIVATE_H_
diff --git a/src/dynarec/rv64/rv64_emitter.h b/src/dynarec/rv64/rv64_emitter.h
index e7608781..7a1b3f4a 100644
--- a/src/dynarec/rv64/rv64_emitter.h
+++ b/src/dynarec/rv64/rv64_emitter.h
@@ -30,42 +30,42 @@ f18–27  fs2–11  FP saved registers              Callee
 f28–31  ft8–11  FP temporaries                  Caller
 */
 // x86 Register mapping
-#define xRAX 16
-#define xRCX 17
-#define xRDX 18
-#define xRBX 19
-#define xRSP 20
-#define xRBP 21
-#define xRSI 22
-#define xRDI 23
-#define xR8 24
-#define xR9 25
-#define xR10 26
-#define xR11 27
-#define xR12 28
-#define xR13 29
-#define xR14 30
-#define xR15 31
+#define xRAX   16
+#define xRCX   17
+#define xRDX   18
+#define xRBX   19
+#define xRSP   20
+#define xRBP   21
+#define xRSI   22
+#define xRDI   23
+#define xR8    24
+#define xR9    25
+#define xR10   26
+#define xR11   27
+#define xR12   28
+#define xR13   29
+#define xR14   30
+#define xR15   31
 #define xFlags 8
-#define xRIP 7
+#define xRIP   7
 
 // 32bits version
-#define wEAX xRAX
-#define wECX xRCX
-#define wEDX xRDX
-#define wEBX xRBX
-#define wESP xRSP
-#define wEBP xRBP
-#define wESI xRSI
-#define wEDI xRDI
-#define wR8 xR8
-#define wR9 xR9
-#define wR10 xR10
-#define wR11 xR11
-#define wR12 xR12
-#define wR13 xR13
-#define wR14 xR14
-#define wR15 xR15
+#define wEAX   xRAX
+#define wECX   xRCX
+#define wEDX   xRDX
+#define wEBX   xRBX
+#define wESP   xRSP
+#define wEBP   xRBP
+#define wESI   xRSI
+#define wEDI   xRDI
+#define wR8    xR8
+#define wR9    xR9
+#define wR10   xR10
+#define wR11   xR11
+#define wR12   xR12
+#define wR13   xR13
+#define wR14   xR14
+#define wR15   xR15
 #define wFlags xFlags
 // scratch registers
 #define x1 11
@@ -129,11 +129,11 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define ZEROUP(r) AND(r, r, xMASK)
 
 #define R_type(funct7, rs2, rs1, funct3, rd, opcode) ((funct7) << 25 | (rs2) << 20 | (rs1) << 15 | (funct3) << 12 | (rd) << 7 | (opcode))
-#define I_type(imm12, rs1, funct3, rd, opcode) ((imm12) << 20 | (rs1) << 15 | (funct3) << 12 | (rd) << 7 | (opcode))
-#define S_type(imm12, rs2, rs1, funct3, opcode) (((imm12) >> 5) << 25 | (rs2) << 20 | (rs1) << 15 | (funct3) << 12 | ((imm12)&31) << 7 | (opcode))
-#define B_type(imm13, rs2, rs1, funct3, opcode) ((((imm13) >> 12) & 1) << 31 | (((imm13) >> 5) & 63) << 25 | (rs2) << 20 | (rs1) << 15 | (funct3) << 12 | (((imm13) >> 1) & 15) << 8 | (((imm13) >> 11) & 1) << 7 | (opcode))
-#define U_type(imm32, rd, opcode) (((imm32) >> 12) << 12 | (rd) << 7 | (opcode))
-#define J_type(imm21, rd, opcode) ((((imm21) >> 20) & 1) << 31 | (((imm21) >> 1) & 0b1111111111) << 21 | (((imm21) >> 11) & 1) << 20 | (((imm21) >> 12) & 0b11111111) << 12 | (rd) << 7 | (opcode))
+#define I_type(imm12, rs1, funct3, rd, opcode)       ((imm12) << 20 | (rs1) << 15 | (funct3) << 12 | (rd) << 7 | (opcode))
+#define S_type(imm12, rs2, rs1, funct3, opcode)      (((imm12) >> 5) << 25 | (rs2) << 20 | (rs1) << 15 | (funct3) << 12 | ((imm12)&31) << 7 | (opcode))
+#define B_type(imm13, rs2, rs1, funct3, opcode)      ((((imm13) >> 12) & 1) << 31 | (((imm13) >> 5) & 63) << 25 | (rs2) << 20 | (rs1) << 15 | (funct3) << 12 | (((imm13) >> 1) & 15) << 8 | (((imm13) >> 11) & 1) << 7 | (opcode))
+#define U_type(imm32, rd, opcode)                    (((imm32) >> 12) << 12 | (rd) << 7 | (opcode))
+#define J_type(imm21, rd, opcode)                    ((((imm21) >> 20) & 1) << 31 | (((imm21) >> 1) & 0b1111111111) << 21 | (((imm21) >> 11) & 1) << 20 | (((imm21) >> 12) & 0b11111111) << 12 | (rd) << 7 | (opcode))
 
 // RV32I
 // put imm20 in the [31:12] bits of rd, zero [11:0] and sign extend bits31
@@ -144,7 +144,7 @@ f28–31  ft8–11  FP temporaries                  Caller
 
 #define JAL_gen(rd, imm21) J_type(imm21, rd, 0b1101111)
 // Unconditional branch, no return address set
-#define B(imm21) EMIT(JAL_gen(xZR, imm21))
+#define B(imm21)               EMIT(JAL_gen(xZR, imm21))
 #define B__(reg1, reg2, imm21) B(imm21)
 // Unconditional branch, return set to xRA
 #define JAL(imm21) EMIT(JAL_gen(xRA, imm21))
@@ -248,10 +248,10 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define SNEZ(rd, rs1) SLTU(rd, xZR, rs1)
 
 
-#define BEQ(rs1, rs2, imm13) EMIT(B_type(imm13, rs2, rs1, 0b000, 0b1100011))
-#define BNE(rs1, rs2, imm13) EMIT(B_type(imm13, rs2, rs1, 0b001, 0b1100011))
-#define BLT(rs1, rs2, imm13) EMIT(B_type(imm13, rs2, rs1, 0b100, 0b1100011))
-#define BGE(rs1, rs2, imm13) EMIT(B_type(imm13, rs2, rs1, 0b101, 0b1100011))
+#define BEQ(rs1, rs2, imm13)  EMIT(B_type(imm13, rs2, rs1, 0b000, 0b1100011))
+#define BNE(rs1, rs2, imm13)  EMIT(B_type(imm13, rs2, rs1, 0b001, 0b1100011))
+#define BLT(rs1, rs2, imm13)  EMIT(B_type(imm13, rs2, rs1, 0b100, 0b1100011))
+#define BGE(rs1, rs2, imm13)  EMIT(B_type(imm13, rs2, rs1, 0b101, 0b1100011))
 #define BLTU(rs1, rs2, imm13) EMIT(B_type(imm13, rs2, rs1, 0b110, 0b1100011))
 #define BGEU(rs1, rs2, imm13) EMIT(B_type(imm13, rs2, rs1, 0b111, 0b1100011))
 
@@ -377,10 +377,10 @@ f28–31  ft8–11  FP temporaries                  Caller
     }
 
 #define FENCE_gen(pred, succ) (((pred) << 24) | ((succ) << 20) | 0b0001111)
-#define FENCE() EMIT(FENCE_gen(3, 3))
+#define FENCE()               EMIT(FENCE_gen(3, 3))
 
 #define FENCE_I_gen() ((0b001 << 12) | 0b0001111)
-#define FENCE_I() EMIT(FENCE_I_gen())
+#define FENCE_I()     EMIT(FENCE_I_gen())
 
 #define EBREAK() EMIT(I_type(1, 0, 0, 0, 0b1110011))
 
@@ -476,9 +476,9 @@ f28–31  ft8–11  FP temporaries                  Caller
         SRAIW(rd, rs1, imm); \
     }
 
-#define CSRRW(rd, rs1, csr) EMIT(I_type(csr, rs1, 0b001, rd, 0b1110011))
-#define CSRRS(rd, rs1, csr) EMIT(I_type(csr, rs1, 0b010, rd, 0b1110011))
-#define CSRRC(rd, rs1, csr) EMIT(I_type(csr, rs1, 0b011, rd, 0b1110011))
+#define CSRRW(rd, rs1, csr)  EMIT(I_type(csr, rs1, 0b001, rd, 0b1110011))
+#define CSRRS(rd, rs1, csr)  EMIT(I_type(csr, rs1, 0b010, rd, 0b1110011))
+#define CSRRC(rd, rs1, csr)  EMIT(I_type(csr, rs1, 0b011, rd, 0b1110011))
 #define CSRRWI(rd, imm, csr) EMIT(I_type(csr, imm, 0b101, rd, 0b1110011))
 #define CSRRSI(rd, imm, csr) EMIT(I_type(csr, imm, 0b110, rd, 0b1110011))
 #define CSRRCI(rd, imm, csr) EMIT(I_type(csr, imm, 0b111, rd, 0b1110011))
@@ -493,10 +493,10 @@ f28–31  ft8–11  FP temporaries                  Caller
 // rd =(upper) rs1 * rs2 (both unsigned)
 #define MULHU(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b011, rd, 0b0110011))
 // rd =(upper) rs1 / rs2
-#define DIV(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b100, rd, 0b0110011))
+#define DIV(rd, rs1, rs2)  EMIT(R_type(0b0000001, rs2, rs1, 0b100, rd, 0b0110011))
 #define DIVU(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b101, rd, 0b0110011))
 // rd = rs1 mod rs2
-#define REM(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b110, rd, 0b0110011))
+#define REM(rd, rs1, rs2)  EMIT(R_type(0b0000001, rs2, rs1, 0b110, rd, 0b0110011))
 #define REMU(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b111, rd, 0b0110011))
 
 // RV64M
@@ -505,29 +505,29 @@ f28–31  ft8–11  FP temporaries                  Caller
 // rd = rs1 * rs2
 #define MULxw(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b000, rd, rex.w ? 0b0110011 : 0b0111011))
 // rd = rs1 / rs2
-#define DIVW(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b100, rd, 0b0111011))
-#define DIVxw(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b100, rd, rex.w ? 0b0110011 : 0b0111011))
-#define DIVUW(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b101, rd, 0b0111011))
+#define DIVW(rd, rs1, rs2)   EMIT(R_type(0b0000001, rs2, rs1, 0b100, rd, 0b0111011))
+#define DIVxw(rd, rs1, rs2)  EMIT(R_type(0b0000001, rs2, rs1, 0b100, rd, rex.w ? 0b0110011 : 0b0111011))
+#define DIVUW(rd, rs1, rs2)  EMIT(R_type(0b0000001, rs2, rs1, 0b101, rd, 0b0111011))
 #define DIVUxw(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b101, rd, rex.w ? 0b0110011 : 0b0111011))
 // rd = rs1 mod rs2
-#define REMW(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b110, rd, 0b0111011))
-#define REMxw(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b110, rd, rex.w ? 0b0110011 : 0b0111011))
-#define REMUW(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b111, rd, 0b0111011))
+#define REMW(rd, rs1, rs2)   EMIT(R_type(0b0000001, rs2, rs1, 0b110, rd, 0b0111011))
+#define REMxw(rd, rs1, rs2)  EMIT(R_type(0b0000001, rs2, rs1, 0b110, rd, rex.w ? 0b0110011 : 0b0111011))
+#define REMUW(rd, rs1, rs2)  EMIT(R_type(0b0000001, rs2, rs1, 0b111, rd, 0b0111011))
 #define REMUxw(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b111, rd, rex.w ? 0b0110011 : 0b0111011))
 
 #define AQ_RL(f5, aq, rl) ((f5 << 2) | ((aq & 1) << 1) | (rl & 1))
 
 // RV32A
-#define LR_W(rd, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010, rd, 0b0101111))
+#define LR_W(rd, rs1, aq, rl)      EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010, rd, 0b0101111))
 #define SC_W(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b010, rd, 0b0101111))
 
 #define AMOSWAP_W(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00001, aq, rl), rs2, rs1, 0b010, rd, 0b0101111))
 
 // RV64A
-#define LR_D(rd, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b011, rd, 0b0101111))
+#define LR_D(rd, rs1, aq, rl)      EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b011, rd, 0b0101111))
 #define SC_D(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b011, rd, 0b0101111))
 
-#define LRxw(rd, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010 | rex.w, rd, 0b0101111))
+#define LRxw(rd, rs1, aq, rl)      EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010 | rex.w, rd, 0b0101111))
 #define SCxw(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b010 | rex.w, rd, 0b0101111))
 
 #define AMOSWAP_D(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00001, aq, rl), rs2, rs1, 0b011, rd, 0b0101111))
@@ -563,7 +563,7 @@ f28–31  ft8–11  FP temporaries                  Caller
 // Round to Nearest, ties to Max Magnitude
 #define RD_RMM 0b100
 // In instruction’s rm field, selects dynamic rounding mode;
-#define RD_RM 0b111
+#define RD_RM  0b111
 #define RD_DYN RD_RM
 
 // load single precision from rs1+imm12 to frd
@@ -595,7 +595,7 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define FSUBS(frd, frs1, frs2) EMIT(R_type(0b0000100, frs2, frs1, 0b000, frd, 0b1010011))
 #define FMULS(frd, frs1, frs2) EMIT(R_type(0b0001000, frs2, frs1, 0b000, frd, 0b1010011))
 #define FDIVS(frd, frs1, frs2) EMIT(R_type(0b0001100, frs2, frs1, 0b000, frd, 0b1010011))
-#define FSQRTS(frd, frs1) EMIT(R_type(0b0101100, 0b00000, frs1, 0b000, frd, 0b1010011))
+#define FSQRTS(frd, frs1)      EMIT(R_type(0b0101100, 0b00000, frs1, 0b000, frd, 0b1010011))
 #define FMINS(frd, frs1, frs2) EMIT(R_type(0b0010100, frs2, frs1, 0b000, frd, 0b1010011))
 #define FMAXS(frd, frs1, frs2) EMIT(R_type(0b0010100, frs2, frs1, 0b001, frd, 0b1010011))
 
@@ -652,7 +652,7 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define FSUBD(frd, frs1, frs2) EMIT(R_type(0b0000101, frs2, frs1, 0b000, frd, 0b1010011))
 #define FMULD(frd, frs1, frs2) EMIT(R_type(0b0001001, frs2, frs1, 0b000, frd, 0b1010011))
 #define FDIVD(frd, frs1, frs2) EMIT(R_type(0b0001101, frs2, frs1, 0b000, frd, 0b1010011))
-#define FSQRTD(frd, frs1) EMIT(R_type(0b0101101, 0b00000, frs1, 0b000, frd, 0b1010011))
+#define FSQRTD(frd, frs1)      EMIT(R_type(0b0101101, 0b00000, frs1, 0b000, frd, 0b1010011))
 #define FMIND(frd, frs1, frs2) EMIT(R_type(0b0010101, frs2, frs1, 0b000, frd, 0b1010011))
 #define FMAXD(frd, frs1, frs2) EMIT(R_type(0b0010101, frs2, frs1, 0b001, frd, 0b1010011))
 
@@ -1029,11 +1029,29 @@ f28–31  ft8–11  FP temporaries                  Caller
 // rd2 := mem[addr+15:addr+8]
 #define TH_LDD(rd1, rd2, rs1, imm2) EMIT(R_type(0b1111100 | ((imm2)&0b11), rd2, rs1, 0b100, rd1, 0b0001011))
 
-// TODO
-// th.lwd rd1, rd2, (rs1), imm2, 3 Load two signed 32-bit values
-// th.lwud rd1, rd2, (rs1), imm2, 3 Load two unsigned 32-bit values
-// th.sdd rd1, rd2, (rs1), imm2, 4 Store two 64-bit values
-// th.swd rd1, rd2, (rs1), imm2, 3 Store two 32-bit values
+// Load two signed 32-bit values from memory into two GPRs.
+// addr := rs1 + (zero_extend(imm2) << 3)
+// reg[rd1] := sign_extend(mem[addr+3:addr])
+// reg[rd2] := sign_extend(mem[addr+7:addr+4])
+#define TH_LWD(rd1, rd2, rs1, imm2) EMIT(R_type(0b1110000 | ((imm2)&0b11), rd2, rs1, 0b100, rd1, 0b0001011))
+
+// Load two unsigned 32-bit values from memory into two GPRs.
+// addr := rs1 + (zero_extend(imm2) << 3)
+// reg[rd1] := zero_extend(mem[addr+3:addr])
+// reg[rd2] := zero_extend(mem[addr+7:addr+4])
+#define TH_LWUD(rd1, rd2, rs1, imm2) EMIT(R_type(0b1111000 | ((imm2)&0b11), rd2, rs1, 0b100, rd1, 0b0001011))
+
+// Store two 64-bit values to memory from two GPRs.
+// addr := rs1 + (zero_extend(imm2) << 4)
+// mem[addr+7:addr] := reg[rd1]
+// mem[addr+15:addr+8] := reg[rd2]
+#define TH_SDD(rd1, rd2, rs1, imm2) EMIT(R_type(0b1111100 | ((imm2)&0b11), rd2, rs1, 0b101, rd1, 0b0001011))
+
+// Store two 32-bit values to memory from two GPRs.
+// addr := rs1 + (zero_extend(imm2) << 3)
+// mem[addr+3:addr] := reg[rd1][31:0]
+// mem[addr+7:addr+4] := reg[rd2][31:0]
+#define TH_SWD(rd1, rd2, rs1, imm2) EMIT(R_type(0b1110000 | ((imm2)&0b11), rd2, rs1, 0b101, rd1, 0b0001011))
 
 // XTheadFMemIdx - Indexed memory operations for floating-point registers