about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Yang Liu <liuyang22@iscas.ac.cn> 2023-09-25 04:20:21 +0800
committer GitHub <noreply@github.com> 2023-09-24 22:20:21 +0200
commit c00cd1d6271b313ea459d0b782bc392af74e1598 (patch)
tree 392f37492bb8aca62ee40e747a59787e0a34a982 /src
parent ab65641eae5c6b6440ab7defe4b798953249a1d3 (diff)
download box64-c00cd1d6271b313ea459d0b782bc392af74e1598.tar.gz
         box64-c00cd1d6271b313ea459d0b782bc392af74e1598.zip
[RV64_DYNAREC] Added more support for XTheadBb extension (#989)
* Reformat

* Use TH_FF0 for LZCNT

* Reformat

* Reformat

* Added MOVBE tests

* Reformat

* Added a new REVxw macro

* Refined test24

* Refined test24

* Fix bugs

* [INTERPRETER] Fixed MOVBE

* [DYNAREC_ARM64] Fix MOVBE
Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_660f.c    32
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_0f.c      1060
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_660f.c    1639
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_f30f.c     221
-rw-r--r--  src/dynarec/rv64/rv64_emitter.h          955
-rw-r--r--  src/emu/x64run660f.c                      48
6 files changed, 2045 insertions, 1910 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index 2d80e68d..0013517e 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -169,7 +169,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             FAKEED;

             break;

-        

+

         case 0x28:

             INST_NAME("MOVAPD Gx,Ex");

             nextop = F8;

@@ -857,7 +857,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         BFIx(gd, x1, 0, 16);

                     } else {

                         SMREAD();

-                        addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, (1<<2)-1, rex, NULL, 0, 0);

+                        addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<1, (1<<1)-1, rex, NULL, 0, 0);

                         LDH(x1, ed, fixedaddress);

                         REV16x(x1, x1);

                         BFIx(gd, x1, 0, 16);

@@ -873,7 +873,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         BFIx(ed, x1, 0, 16);

                     } else {

                         SMREAD();

-                        addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, (1<<2)-1, rex, NULL, 0, 0);

+                        addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<1, (1<<1)-1, rex, NULL, 0, 0);

                         REV16x(x1, gd);

                         STH(x1, ed, fixedaddress);

                     }

@@ -1007,7 +1007,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     GETEX(q1, 0, 1);

                     u8 = F8;

                     if(u8>31) {

-                        VEORQ(q0, q0, q0);    

+                        VEORQ(q0, q0, q0);

                     } else if(u8>15) {

                         d0 = fpu_get_scratch(dyn);

                         VEORQ(d0, d0, d0);

@@ -1121,7 +1121,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         sse_forget_reg(dyn, ninst, gd);

                         MOV32w(x1, gd); // gx

                         if(MODREG) {

-                            ed = (nextop&7)+(rex.b<<3); 

+                            ed = (nextop&7)+(rex.b<<3);

                             sse_forget_reg(dyn, ninst, ed);

                             MOV32w(x2, ed);

                             MOV32w(x3, 0);  //p = NULL

@@ -1145,7 +1145,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     sse_forget_reg(dyn, ninst, gd);

                     MOV32w(x1, gd); // gx

                     if(MODREG) {

-                        ed = (nextop&7)+(rex.b<<3); 

+                        ed = (nextop&7)+(rex.b<<3);

                         sse_forget_reg(dyn, ninst, ed);

                         MOV32w(x2, ed);

                         MOV32w(x3, 0);  //p = NULL

@@ -1195,7 +1195,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             LSRx(x1, x1, 63);

             LSRx(gd, gd, 63);

             BFIx(gd, x1, 1, 1);

-            break;    

+            break;

         case 0x51:

             INST_NAME("SQRTPD Gx, Ex");

             nextop = F8;

@@ -1215,7 +1215,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 VFSQRTQD(q1, q0);

             }

             break;

-     

+

         case 0x54:

             INST_NAME("ANDPD Gx, Ex");

             nextop = F8;

@@ -2195,21 +2195,21 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 case 0: VFCMEQQD(v0, v0, v1); break;   // Equal

                 case 1: VFCMGTQD(v0, v1, v0); break;   // Less than

                 case 2: VFCMGEQD(v0, v1, v0); break;   // Less or equal

-                case 3: VFCMEQQD(v0, v0, v0); 

+                case 3: VFCMEQQD(v0, v0, v0);

                         if(v0!=v1) {

-                            q0 = fpu_get_scratch(dyn); 

-                            VFCMEQQD(q0, v1, v1); 

+                            q0 = fpu_get_scratch(dyn);

+                            VFCMEQQD(q0, v1, v1);

                             VANDQ(v0, v0, q0);

                         }

-                        VMVNQ(v0, v0); 

+                        VMVNQ(v0, v0);

                         break;   // NaN (NaN is not equal to himself)

                 case 4: VFCMEQQD(v0, v0, v1); VMVNQ(v0, v0); break;   // Not Equal (or unordered on ARM, not on X86...)

                 case 5: VFCMGTQD(v0, v1, v0); VMVNQ(v0, v0); break;   // Greater or equal or unordered

                 case 6: VFCMGEQD(v0, v1, v0); VMVNQ(v0, v0); break;   // Greater or unordered

-                case 7: VFCMEQQD(v0, v0, v0); 

+                case 7: VFCMEQQD(v0, v0, v0);

                         if(v0!=v1) {

-                            q0 = fpu_get_scratch(dyn); 

-                            VFCMEQQD(q0, v1, v1); 

+                            q0 = fpu_get_scratch(dyn);

+                            VFCMEQQD(q0, v1, v1);

                             VANDQ(v0, v0, q0);

                         }

                         break;   // not NaN

@@ -2386,7 +2386,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             UADDLV_8(q1, q1);   // accumalte

             VMOVBto(x1, q1, 0);

             BFIx(gd, x1, 8, 8);

-            break;        

+            break;

         case 0xD8:

             INST_NAME("PSUBUSB Gx, Ex");

             nextop = F8;

diff --git a/src/dynarec/rv64/dynarec_rv64_0f.c b/src/dynarec/rv64/dynarec_rv64_0f.c
index 8e85cc8e..227ffda5 100644
--- a/src/dynarec/rv64/dynarec_rv64_0f.c
+++ b/src/dynarec/rv64/dynarec_rv64_0f.c
@@ -26,7 +26,8 @@
 
 uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog)
 {
-    (void)ip; (void)need_epilog;
+    (void)ip;
+    (void)need_epilog;
 
     uint8_t opcode = F8;
     uint8_t nextop, u8;
@@ -55,13 +56,13 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
     MAYUSE(j64);
     MAYUSE(cacheupd);
 
-    switch(opcode) {
+    switch (opcode) {
 
         case 0x01:
             INST_NAME("FAKE xgetbv");
             nextop = F8;
             addr = fakeed(dyn, addr, ninst, nextop);
-            SETFLAGS(X_ALL, SF_SET);    // Hack to set flags in "don't care" state
+            SETFLAGS(X_ALL, SF_SET); // Hack to set flags in "don't care" state
             GETIP(ip);
             STORE_XEMU_CALL();
             CALL(native_ud, -1);
@@ -90,7 +91,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
 
         case 0x09:
             INST_NAME("WBINVD");
-            SETFLAGS(X_ALL, SF_SET);    // Hack to set flags in "don't care" state
+            SETFLAGS(X_ALL, SF_SET); // Hack to set flags in "don't care" state
             GETIP(ip);
             STORE_XEMU_CALL();
             CALL(native_ud, -1);
@@ -102,7 +103,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
 
         case 0x0B:
             INST_NAME("UD2");
-            SETFLAGS(X_ALL, SF_SET);    // Hack to set flags in "don't care" state
+            SETFLAGS(X_ALL, SF_SET); // Hack to set flags in "don't care" state
             GETIP(ip);
             STORE_XEMU_CALL();
             CALL(native_ud, -1);
@@ -114,13 +115,13 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
 
         case 0x0D:
             nextop = F8;
-            switch((nextop>>3)&7) {
+            switch ((nextop >> 3) & 7) {
                 case 1:
                     INST_NAME("PREFETCHW");
                     // nop without Zicbom, Zicbop, Zicboz extensions
                     FAKEED;
                     break;
-                default:    //???
+                default: //???
                     DEFAULT;
             }
             break;
@@ -130,31 +131,31 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            LD(x3, wback, fixedaddress+0);
-            LD(x4, wback, fixedaddress+8);
-            SD(x3, gback, gdoffset+0);
-            SD(x4, gback, gdoffset+8);
+            LD(x3, wback, fixedaddress + 0);
+            LD(x4, wback, fixedaddress + 8);
+            SD(x3, gback, gdoffset + 0);
+            SD(x4, gback, gdoffset + 8);
             break;
         case 0x11:
             INST_NAME("MOVUPS Ex,Gx");
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            LD(x3, gback, gdoffset+0);
-            LD(x4, gback, gdoffset+8);
-            SD(x3, wback, fixedaddress+0);
-            SD(x4, wback, fixedaddress+8);
-            if(!MODREG)
+            LD(x3, gback, gdoffset + 0);
+            LD(x4, gback, gdoffset + 8);
+            SD(x3, wback, fixedaddress + 0);
+            SD(x4, wback, fixedaddress + 8);
+            if (!MODREG)
                 SMWRITE2();
             break;
         case 0x12:
             nextop = F8;
-            if(MODREG) {
+            if (MODREG) {
                 INST_NAME("MOVHLPS Gx,Ex");
                 GETGX();
                 GETEX(x2, 0);
-                LD(x3, wback, fixedaddress+8);
-                SD(x3, gback, gdoffset+0);
+                LD(x3, wback, fixedaddress + 8);
+                SD(x3, gback, gdoffset + 0);
             } else {
                 INST_NAME("MOVLPS Gx,Ex");
                 GETEXSD(v0, 0);
@@ -167,9 +168,9 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            LD(x3, gback, gdoffset+0);
-            SD(x3, wback, fixedaddress+0);
-            if(!MODREG)
+            LD(x3, gback, gdoffset + 0);
+            SD(x3, wback, fixedaddress + 0);
+            if (!MODREG)
                 SMWRITE2();
             break;
         case 0x14:
@@ -177,30 +178,30 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            LWU(x5, gback, gdoffset+1*4);
-            LWU(x3, wback, fixedaddress+0);
-            LWU(x4, wback, fixedaddress+4);
-            SW(x4, gback, gdoffset+3*4);
-            SW(x5, gback, gdoffset+2*4);
-            SW(x3, gback, gdoffset+1*4);
+            LWU(x5, gback, gdoffset + 1 * 4);
+            LWU(x3, wback, fixedaddress + 0);
+            LWU(x4, wback, fixedaddress + 4);
+            SW(x4, gback, gdoffset + 3 * 4);
+            SW(x5, gback, gdoffset + 2 * 4);
+            SW(x3, gback, gdoffset + 1 * 4);
             break;
         case 0x15:
             INST_NAME("UNPCKHPS Gx,Ex");
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            LWU(x3, wback, fixedaddress+2*4);
-            LWU(x4, wback, fixedaddress+3*4);
-            LWU(x5, gback, gdoffset+2*4);
-            LWU(x6, gback, gdoffset+3*4);
-            SW(x5, gback, gdoffset+0*4);
-            SW(x3, gback, gdoffset+1*4);
-            SW(x6, gback, gdoffset+2*4);
-            SW(x4, gback, gdoffset+3*4);
+            LWU(x3, wback, fixedaddress + 2 * 4);
+            LWU(x4, wback, fixedaddress + 3 * 4);
+            LWU(x5, gback, gdoffset + 2 * 4);
+            LWU(x6, gback, gdoffset + 3 * 4);
+            SW(x5, gback, gdoffset + 0 * 4);
+            SW(x3, gback, gdoffset + 1 * 4);
+            SW(x6, gback, gdoffset + 2 * 4);
+            SW(x4, gback, gdoffset + 3 * 4);
             break;
         case 0x16:
             nextop = F8;
-            if(MODREG) {
+            if (MODREG) {
                 INST_NAME("MOVLHPS Gx,Ex");
             } else {
                 INST_NAME("MOVHPS Gx,Ex");
@@ -208,35 +209,35 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             }
             GETGX();
             GETEX(x2, 0);
-            LD(x4, wback, fixedaddress+0);
-            SD(x4, gback, gdoffset+8);
+            LD(x4, wback, fixedaddress + 0);
+            SD(x4, gback, gdoffset + 8);
             break;
         case 0x17:
             INST_NAME("MOVHPS Ex,Gx");
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            LD(x4, gback, gdoffset+8);
-            SD(x4, wback, fixedaddress+0);
-            if(!MODREG)
+            LD(x4, gback, gdoffset + 8);
+            SD(x4, wback, fixedaddress + 0);
+            if (!MODREG)
                 SMWRITE2();
             break;
         case 0x18:
             nextop = F8;
-            if((nextop&0xC0)==0xC0) {
+            if ((nextop & 0xC0) == 0xC0) {
                 INST_NAME("NOP (multibyte)");
             } else
-            switch((nextop>>3)&7) {
-                case 0:
-                case 1:
-                case 2:
-                case 3:
-                    INST_NAME("PREFETCHh Ed");
-                    FAKEED;
-                    break;
-                default:
-                    INST_NAME("NOP (multibyte)");
-                    FAKEED;
+                switch ((nextop >> 3) & 7) {
+                    case 0:
+                    case 1:
+                    case 2:
+                    case 3:
+                        INST_NAME("PREFETCHh Ed");
+                        FAKEED;
+                        break;
+                    default:
+                        INST_NAME("NOP (multibyte)");
+                        FAKEED;
                 }
             break;
 
@@ -259,7 +260,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q2(x3);
-            if(!MODREG)
+            if (!MODREG)
                 SMWRITE2();
             break;
         case 0x2A:
@@ -269,10 +270,10 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEM(x2, 0);
             d0 = fpu_get_scratch(dyn);
             u8 = sse_setround(dyn, ninst, x4, x5);
-            for (int i=0; i<2; ++i) {
-                LW(x3, wback, fixedaddress+i*4);
+            for (int i = 0; i < 2; ++i) {
+                LW(x3, wback, fixedaddress + i * 4);
                 FCVTSW(d0, x3, RD_DYN);
-                FSW(d0, gback, gdoffset+i*4);
+                FSW(d0, gback, gdoffset + i * 4);
             }
             x87_restoreround(dyn, ninst, u8);
             break;
@@ -281,10 +282,10 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            LD(x3, gback, gdoffset+0);
-            LD(x4, gback, gdoffset+8);
-            SD(x3, wback, fixedaddress+0);
-            SD(x4, wback, fixedaddress+8);
+            LD(x3, gback, gdoffset + 0);
+            LD(x4, gback, gdoffset + 8);
+            SD(x3, wback, fixedaddress + 0);
+            SD(x4, wback, fixedaddress + 8);
             break;
         case 0x2C:
             INST_NAME("CVTTPS2PI Gm,Ex");
@@ -292,20 +293,20 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETGM();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
-            for (int i=0; i<2; ++i) {
-                if(!box64_dynarec_fastround) {
-                    FSFLAGSI(0);  // // reset all bits
+            for (int i = 0; i < 2; ++i) {
+                if (!box64_dynarec_fastround) {
+                    FSFLAGSI(0); // // reset all bits
                 }
-                FLW(d0, wback, fixedaddress+i*4);
+                FLW(d0, wback, fixedaddress + i * 4);
                 FCVTWS(x1, d0, RD_RTZ);
-                if(!box64_dynarec_fastround) {
-                    FRFLAGS(x5);   // get back FPSR to check the IOC bit
-                    ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+                if (!box64_dynarec_fastround) {
+                    FRFLAGS(x5); // get back FPSR to check the IOC bit
+                    ANDI(x5, x5, (1 << FR_NV) | (1 << FR_OF));
                     BEQ_MARKi(x5, xZR, i);
                     MOV32w(x1, 0x80000000);
                     MARKi(i);
                 }
-                SW(x1, gback, gdoffset+i*4);
+                SW(x1, gback, gdoffset + i * 4);
             }
             break;
         case 0x2D:
@@ -315,27 +316,31 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             u8 = sse_setround(dyn, ninst, x6, x4);
-            for (int i=0; i<2; ++i) {
-                if(!box64_dynarec_fastround) {
-                    FSFLAGSI(0);  // // reset all bits
+            for (int i = 0; i < 2; ++i) {
+                if (!box64_dynarec_fastround) {
+                    FSFLAGSI(0); // // reset all bits
                 }
-                FLW(d0, wback, fixedaddress+i*4);
+                FLW(d0, wback, fixedaddress + i * 4);
                 FCVTWS(x1, d0, RD_DYN);
-                if(!box64_dynarec_fastround) {
-                    FRFLAGS(x5);   // get back FPSR to check the IOC bit
-                    ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+                if (!box64_dynarec_fastround) {
+                    FRFLAGS(x5); // get back FPSR to check the IOC bit
+                    ANDI(x5, x5, (1 << FR_NV) | (1 << FR_OF));
                     BEQ_MARKi(x5, xZR, i);
                     MOV32w(x1, 0x80000000);
                     MARKi(i);
                 }
-                SW(x1, gback, gdoffset+i*4);
+                SW(x1, gback, gdoffset + i * 4);
             }
             x87_restoreround(dyn, ninst, u8);
             break;
         case 0x2E:
             // no special check...
         case 0x2F:
-            if(opcode==0x2F) {INST_NAME("COMISS Gx, Ex");} else {INST_NAME("UCOMISS Gx, Ex");}
+            if (opcode == 0x2F) {
+                INST_NAME("COMISS Gx, Ex");
+            } else {
+                INST_NAME("UCOMISS Gx, Ex");
+            }
             SETFLAGS(X_ALL, SF_SET);
             SET_DFNONE();
             nextop = F8;
@@ -343,160 +348,61 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEXSS(v0, 0);
             CLEAR_FLAGS();
             // if isnan(d0) || isnan(v0)
-            IFX(X_ZF | X_PF | X_CF) {
+            IFX(X_ZF | X_PF | X_CF)
+            {
                 FEQS(x3, d0, d0);
                 FEQS(x2, v0, v0);
                 AND(x2, x2, x3);
                 BNE_MARK(x2, xZR);
-                ORI(xFlags, xFlags, (1<<F_ZF) | (1<<F_PF) | (1<<F_CF));
+                ORI(xFlags, xFlags, (1 << F_ZF) | (1 << F_PF) | (1 << F_CF));
                 B_NEXT_nocond;
             }
             MARK;
             // else if isless(d0, v0)
-            IFX(X_CF) {
+            IFX(X_CF)
+            {
                 FLTS(x2, d0, v0);
                 BEQ_MARK2(x2, xZR);
-                ORI(xFlags, xFlags, 1<<F_CF);
+                ORI(xFlags, xFlags, 1 << F_CF);
                 B_NEXT_nocond;
             }
             MARK2;
             // else if d0 == v0
-            IFX(X_ZF) {
+            IFX(X_ZF)
+            {
                 FEQS(x2, d0, v0);
                 CBZ_NEXT(x2);
-                ORI(xFlags, xFlags, 1<<F_ZF);
+                ORI(xFlags, xFlags, 1 << F_ZF);
             }
             break;
         case 0x31:
             INST_NAME("RDTSC");
             NOTEST(x1);
             MESSAGE(LOG_DUMP, "Need Optimization\n");
-            CALL(ReadTSC, x3);   // will return the u64 in x3
+            CALL(ReadTSC, x3); // will return the u64 in x3
             SRLI(xRDX, x3, 32);
-            AND(xRAX, x3, 32);   // wipe upper part
+            AND(xRAX, x3, 32); // wipe upper part
             break;
         case 0x38:
-            //SSE3
-            nextop=F8;
-            switch(nextop) {
+            // SSE3
+            nextop = F8;
+            switch (nextop) {
                 case 0xF0:
                     INST_NAME("MOVBE Gd, Ed");
-                    nextop=F8;
+                    nextop = F8;
                     GETGD;
                     SMREAD();
                     addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
                     LDxw(gd, ed, fixedaddress);
-                    if (rv64_zbb) {
-                        REV8(gd, gd);
-                        if (!rex.w) {
-                            SRLI(gd, gd, 32);
-                        }
-                    } else {
-                        if (rex.w) {
-                            MOV_U12(x2, 0xff);
-                            SLLI(x1, gd, 56);
-                            SRLI(x3, gd, 56);
-                            SRLI(x4, gd, 40);
-                            SLLI(x2, x2, 8);
-                            AND(x4, x4, x2);
-                            OR(x1, x1, x3);
-                            OR(x1, x1, x4);
-                            SLLI(x3, gd, 40);
-                            SLLI(x4, x2, 40);
-                            AND(x3, x3, x4);
-                            OR(x1, x1, x3);
-
-                            SRLI(x3, gd, 24);
-                            SLLI(x4, x2, 8);
-                            AND(x3, x3, x4);
-                            OR(x1, x1, x3);
-                            SLLI(x3, gd, 24);
-                            SLLI(x4, x2, 32);
-                            AND(x3, x3, x4);
-                            OR(x1, x1, x3);
-
-                            SRLI(x3, gd, 8);
-                            SLLI(x4, x2, 16);
-                            AND(x3, x3, x4);
-                            OR(x1, x1, x3);
-                            SLLI(x3, gd, 8);
-                            SLLI(x4, x2, 24);
-                            AND(x3, x3, x4);
-                            OR(gd, x1, x3);
-                        } else {
-                            MOV_U12(x2, 0xff);
-                            SLLIW(x2, x2, 8);
-                            SLLIW(x1, gd, 24);
-                            SRLIW(x3, gd, 24);
-                            SRLIW(x4, gd, 8);
-                            AND(x4, x4, x2);
-                            OR(x1, x1, x3);
-                            OR(x1, x1, x4);
-                            SLLIW(gd, gd, 8);
-                            LUI(x2, 0xff0);
-                            AND(gd, gd, x2);
-                            OR(gd, gd, x1);
-                        }
-                    }
+                    REV8xw(gd, gd, x1, x2, x3, x4);
                     break;
                 case 0xF1:
                     INST_NAME("MOVBE Ed, Gd");
-                    nextop=F8;
+                    nextop = F8;
                     GETGD;
                     SMREAD();
                     addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
-                    if (rv64_zbb) {
-                        REV8(x1, gd);
-                        if (!rex.w) {
-                            SRLI(x1, x1, 32);
-                        }
-                    } else {
-                        if (rex.w) {
-                            MOV_U12(x2, 0xff);
-                            SLLI(x1, gd, 56);
-                            SRLI(x3, gd, 56);
-                            SRLI(x4, gd, 40);
-                            SLLI(x2, x2, 8);
-                            AND(x4, x4, x2);
-                            OR(x1, x1, x3);
-                            OR(x1, x1, x4);
-                            SLLI(x3, gd, 40);
-                            SLLI(x4, x2, 40);
-                            AND(x3, x3, x4);
-                            OR(x1, x1, x3);
-
-                            SRLI(x3, gd, 24);
-                            SLLI(x4, x2, 8);
-                            AND(x3, x3, x4);
-                            OR(x1, x1, x3);
-                            SLLI(x3, gd, 24);
-                            SLLI(x4, x2, 32);
-                            AND(x3, x3, x4);
-                            OR(x1, x1, x3);
-
-                            SRLI(x3, gd, 8);
-                            SLLI(x4, x2, 16);
-                            AND(x3, x3, x4);
-                            OR(x1, x1, x3);
-                            SLLI(x3, gd, 8);
-                            SLLI(x4, x2, 24);
-                            AND(x3, x3, x4);
-                            OR(x1, x1, x3);
-                        } else {
-                            MOV_U12(x2, 0xff);
-                            SLLIW(x2, x2, 8);
-                            SLLIW(x1, gd, 24);
-                            SRLIW(x3, gd, 24);
-                            SRLIW(x4, gd, 8);
-                            AND(x4, x4, x2);
-                            OR(x1, x1, x3);
-                            OR(x1, x1, x4);
-                            SLLIW(x3, gd, 8);
-                            LUI(x2, 0xff0);
-                            AND(x3, x3, x2);
-                            OR(x1, x1, x3);
-                        }
-                    }
+                    REV8xw(x1, gd, x1, x2, x3, x4);
                     SDxw(x1, wback, fixedaddress);
                     break;
                 default:
@@ -504,34 +410,34 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             }
             break;
 
-        #define GO(GETFLAGS, NO, YES, F)            \
-            READFLAGS(F);                           \
-            GETFLAGS;                               \
-            nextop=F8;                              \
-            GETGD;                                  \
-            if(MODREG) {                            \
-                ed = xRAX+(nextop&7)+(rex.b<<3);    \
-                B##NO(x1, 8);                       \
-                MV(gd, ed);                         \
-            } else {                                \
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); \
-                B##NO(x1, 8);                       \
-                LDxw(gd, ed, fixedaddress);         \
-            }                                       \
-            if(!rex.w) ZEROUP(gd);
+#define GO(GETFLAGS, NO, YES, F)                                                             \
+    READFLAGS(F);                                                                            \
+    GETFLAGS;                                                                                \
+    nextop = F8;                                                                             \
+    GETGD;                                                                                   \
+    if (MODREG) {                                                                            \
+        ed = xRAX + (nextop & 7) + (rex.b << 3);                                             \
+        B##NO(x1, 8);                                                                        \
+        MV(gd, ed);                                                                          \
+    } else {                                                                                 \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); \
+        B##NO(x1, 8);                                                                        \
+        LDxw(gd, ed, fixedaddress);                                                          \
+    }                                                                                        \
+    if (!rex.w) ZEROUP(gd);
 
-        GOCOND(0x40, "CMOV", "Gd, Ed");
-        #undef GO
+            GOCOND(0x40, "CMOV", "Gd, Ed");
+#undef GO
         case 0x50:
             INST_NAME("MOVMSKPS Gd, Ex");
             nextop = F8;
             GETGD;
             GETEX(x1, 0);
             XOR(gd, gd, gd);
-            for(int i=0; i<4; ++i) {
-                LWU(x2, wback, fixedaddress+i*4);
-                SRLI(x2, x2, 31-i);
-                if (i>0) ANDI(x2, x2, 1<<i);
+            for (int i = 0; i < 4; ++i) {
+                LWU(x2, wback, fixedaddress + i * 4);
+                SRLI(x2, x2, 31 - i);
+                if (i > 0) ANDI(x2, x2, 1 << i);
                 OR(gd, gd, x2);
             }
             break;
@@ -541,10 +447,10 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
-            for(int i=0; i<4; ++i) {
-                FLW(d0, wback, fixedaddress+4*i);
+            for (int i = 0; i < 4; ++i) {
+                FLW(d0, wback, fixedaddress + 4 * i);
                 FSQRTS(d0, d0);
-                FSW(d0, gback, gdoffset+4*i);
+                FSW(d0, gback, gdoffset + 4 * i);
             }
             break;
         case 0x52:
@@ -561,23 +467,23 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             if (!box64_dynarec_fastnan) {
                 FCVTSW(v0, xZR, RD_DYN);
             }
-            for(int i=0; i<4; ++i) {
-                FLW(s0, wback, fixedaddress+i*4);
+            for (int i = 0; i < 4; ++i) {
+                FLW(s0, wback, fixedaddress + i * 4);
                 if (!box64_dynarec_fastnan) {
                     FLES(x3, v0, s0); // s0 >= 0.0f?
-                    BNEZ(x3, 6*4);
+                    BNEZ(x3, 6 * 4);
                     FEQS(x3, s0, s0); // isnan(s0)?
-                    BEQZ(x3, 2*4);
+                    BEQZ(x3, 2 * 4);
                     // s0 is negative, so generate a NaN
                     FDIVS(s0, s1, v0);
                     // s0 is a NaN, just copy it
-                    FSW(s0, gback, gdoffset+i*4);
-                    J(4*4);
+                    FSW(s0, gback, gdoffset + i * 4);
+                    J(4 * 4);
                     // do regular computation
                 }
                 FSQRTS(s0, s0);
                 FDIVS(s0, s1, s0);
-                FSW(s0, gback, gdoffset+i*4);
+                FSW(s0, gback, gdoffset + i * 4);
             }
             break;
         case 0x53:
@@ -589,17 +495,17 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             d1 = fpu_get_scratch(dyn);
             LUI(x3, 0x3f800);
             FMVWX(d0, x3); // 1.0f
-            for(int i=0; i<4; ++i) {
-                FLW(d1, wback, fixedaddress+4*i);
+            for (int i = 0; i < 4; ++i) {
+                FLW(d1, wback, fixedaddress + 4 * i);
                 FDIVS(d1, d0, d1);
-                FSW(d1, gback, gdoffset+4*i);
+                FSW(d1, gback, gdoffset + 4 * i);
             }
             break;
         case 0x54:
             INST_NAME("ANDPS Gx, Ex");
             nextop = F8;
-            gd = ((nextop&0x38)>>3)+(rex.r<<3);
-            if(!(MODREG && gd==(nextop&7)+(rex.b<<3))) {
+            gd = ((nextop & 0x38) >> 3) + (rex.r << 3);
+            if (!(MODREG && gd == (nextop & 7) + (rex.b << 3))) {
                 GETGX();
                 GETEX(x2, 0);
                 SSE_LOOP_Q(x3, x4, AND(x3, x3, x4));
@@ -615,8 +521,8 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0x56:
             INST_NAME("ORPS Gx, Ex");
             nextop = F8;
-            gd = ((nextop&0x38)>>3)+(rex.r<<3);
-            if(!(MODREG && gd==(nextop&7)+(rex.b<<3))) {
+            gd = ((nextop & 0x38) >> 3) + (rex.r << 3);
+            if (!(MODREG && gd == (nextop & 7) + (rex.b << 3))) {
                 GETGX();
                 GETEX(x2, 0);
                 SSE_LOOP_Q(x3, x4, OR(x3, x3, x4));
@@ -625,13 +531,12 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0x57:
             INST_NAME("XORPS Gx, Ex");
             nextop = F8;
-            //TODO: it might be possible to check if SS or SD are used and not purge them to optimize a bit
+            // TODO: it might be possible to check if SS or SD are used and not purge them to optimize a bit
             GETGX();
-            if(MODREG && gd==(nextop&7)+(rex.b<<3))
-            {
+            if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
                 // just zero dest
-                SD(xZR, gback, gdoffset+0);
-                SD(xZR, gback, gdoffset+8);
+                SD(xZR, gback, gdoffset + 0);
+                SD(xZR, gback, gdoffset + 8);
             } else {
                 GETEX(x2, 0);
                 SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
@@ -644,12 +549,12 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
-            for(int i=0; i<4; ++i) {
+            for (int i = 0; i < 4; ++i) {
                 // GX->f[i] += EX->f[i];
-                FLW(s0, wback, fixedaddress+i*4);
-                FLW(s1, gback, gdoffset+i*4);
+                FLW(s0, wback, fixedaddress + i * 4);
+                FLW(s1, gback, gdoffset + i * 4);
                 FADDS(s1, s1, s0);
-                FSW(s1, gback, gdoffset+i*4);
+                FSW(s1, gback, gdoffset + i * 4);
             }
             break;
         case 0x59:
@@ -659,12 +564,12 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
-            for(int i=0; i<4; ++i) {
+            for (int i = 0; i < 4; ++i) {
                 // GX->f[i] *= EX->f[i];
-                FLW(s0, wback, fixedaddress+i*4);
-                FLW(s1, gback, gdoffset+i*4);
+                FLW(s0, wback, fixedaddress + i * 4);
+                FLW(s1, gback, gdoffset + i * 4);
                 FMULS(s1, s1, s0);
-                FSW(s1, gback, gdoffset+i*4);
+                FSW(s1, gback, gdoffset + i * 4);
             }
             break;
         case 0x5A:
@@ -675,11 +580,11 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
             FLW(s0, wback, fixedaddress);
-            FLW(s1, wback, fixedaddress+4);
+            FLW(s1, wback, fixedaddress + 4);
             FCVTDS(s0, s0);
             FCVTDS(s1, s1);
-            FSD(s0, gback, gdoffset+0);
-            FSD(s1, gback, gdoffset+8);
+            FSD(s0, gback, gdoffset + 0);
+            FSD(s1, gback, gdoffset + 8);
             break;
         case 0x5B:
             INST_NAME("CVTDQ2PS Gx, Ex");
@@ -687,10 +592,10 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETGX();
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
-            for (int i=0; i<4; ++i) {
-                LW(x3, wback, fixedaddress+i*4);
+            for (int i = 0; i < 4; ++i) {
+                LW(x3, wback, fixedaddress + i * 4);
                 FCVTSW(s0, x3, RD_RNE);
-                FSW(s0, gback, gdoffset+i*4);
+                FSW(s0, gback, gdoffset + i * 4);
             }
             break;
         case 0x5C:
@@ -700,12 +605,12 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
-            for(int i=0; i<4; ++i) {
+            for (int i = 0; i < 4; ++i) {
                 // GX->f[i] -= EX->f[i];
-                FLW(s0, wback, fixedaddress+i*4);
-                FLW(s1, gback, gdoffset+i*4);
+                FLW(s0, wback, fixedaddress + i * 4);
+                FLW(s1, gback, gdoffset + i * 4);
                 FSUBS(s1, s1, s0);
-                FSW(s1, gback, gdoffset+i*4);
+                FSW(s1, gback, gdoffset + i * 4);
             }
             break;
         case 0x5D:
@@ -715,20 +620,20 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
-            for(int i=0; i<4; ++i) {
-                FLW(s0, wback, fixedaddress+i*4);
-                FLW(s1, gback, gdoffset+i*4);
-                if(!box64_dynarec_fastnan) {
+            for (int i = 0; i < 4; ++i) {
+                FLW(s0, wback, fixedaddress + i * 4);
+                FLW(s1, gback, gdoffset + i * 4);
+                if (!box64_dynarec_fastnan) {
                     FEQS(x3, s0, s0);
                     FEQS(x4, s1, s1);
                     AND(x3, x3, x4);
                     BEQZ(x3, 12);
                     FLTS(x3, s0, s1);
                     BEQZ(x3, 8);
-                    FSW(s0, gback, gdoffset+i*4);
+                    FSW(s0, gback, gdoffset + i * 4);
                 } else {
                     FMINS(s1, s1, s0);
-                    FSW(s1, gback, gdoffset+i*4);
+                    FSW(s1, gback, gdoffset + i * 4);
                 }
             }
             break;
@@ -739,12 +644,12 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
-            for(int i=0; i<4; ++i) {
+            for (int i = 0; i < 4; ++i) {
                 // GX->f[i] /= EX->f[i];
-                FLW(s0, wback, fixedaddress+i*4);
-                FLW(s1, gback, gdoffset+i*4);
+                FLW(s0, wback, fixedaddress + i * 4);
+                FLW(s1, gback, gdoffset + i * 4);
                 FDIVS(s1, s1, s0);
-                FSW(s1, gback, gdoffset+i*4);
+                FSW(s1, gback, gdoffset + i * 4);
             }
             break;
         case 0x5F:
@@ -754,20 +659,20 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
-            for(int i=0; i<4; ++i) {
-                FLW(s0, wback, fixedaddress+i*4);
-                FLW(s1, gback, gdoffset+i*4);
-                if(!box64_dynarec_fastnan) {
+            for (int i = 0; i < 4; ++i) {
+                FLW(s0, wback, fixedaddress + i * 4);
+                FLW(s1, gback, gdoffset + i * 4);
+                if (!box64_dynarec_fastnan) {
                     FEQS(x3, s0, s0);
                     FEQS(x4, s1, s1);
                     AND(x3, x3, x4);
                     BEQZ(x3, 12);
                     FLTS(x3, s1, s0);
                     BEQZ(x3, 8);
-                    FSW(s0, gback, gdoffset+i*4);
+                    FSW(s0, gback, gdoffset + i * 4);
                 } else {
                     FMAXS(s1, s1, s0);
-                    FSW(s1, gback, gdoffset+i*4);
+                    FSW(s1, gback, gdoffset + i * 4);
                 }
             }
             break;
@@ -775,23 +680,23 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("PUNPCKLBW Gm,Em");
             nextop = F8;
             GETGM();
-            for(int i=3; i>0; --i) { // 0 is untouched
+            for (int i = 3; i > 0; --i) { // 0 is untouched
                 // GX->ub[2 * i] = GX->ub[i];
-                LBU(x3, gback, gdoffset+i);
-                SB(x3, gback, gdoffset+2*i);
+                LBU(x3, gback, gdoffset + i);
+                SB(x3, gback, gdoffset + 2 * i);
             }
-            if (MODREG && gd==(nextop&7)) {
-                for(int i=0; i<4; ++i) {
+            if (MODREG && gd == (nextop & 7)) {
+                for (int i = 0; i < 4; ++i) {
                     // GX->ub[2 * i + 1] = GX->ub[2 * i];
-                    LBU(x3, gback, gdoffset+2*i);
-                    SB(x3, gback, gdoffset+2*i+1);
+                    LBU(x3, gback, gdoffset + 2 * i);
+                    SB(x3, gback, gdoffset + 2 * i + 1);
                 }
             } else {
                 GETEM(x2, 0);
-                for(int i=0; i<4; ++i) {
+                for (int i = 0; i < 4; ++i) {
                     // GX->ub[2 * i + 1] = EX->ub[i];
-                    LBU(x3, wback, fixedaddress+i);
-                    SB(x3, gback, gdoffset+2*i+1);
+                    LBU(x3, wback, fixedaddress + i);
+                    SB(x3, gback, gdoffset + 2 * i + 1);
                 }
             }
             break;
@@ -801,14 +706,14 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETGM();
             GETEM(x2, 0);
             // GM->uw[3] = EM->uw[1];
-            LHU(x3, wback, fixedaddress+2*1);
-            SH(x3, gback, gdoffset+2*3);
+            LHU(x3, wback, fixedaddress + 2 * 1);
+            SH(x3, gback, gdoffset + 2 * 3);
             // GM->uw[2] = GM->uw[1];
-            LHU(x3, gback, gdoffset+2*1);
-            SH(x3, gback, gdoffset+2*2);
+            LHU(x3, gback, gdoffset + 2 * 1);
+            SH(x3, gback, gdoffset + 2 * 2);
             // GM->uw[1] = EM->uw[0];
-            LHU(x3, wback, fixedaddress+2*0);
-            SH(x3, gback, gdoffset+2*1);
+            LHU(x3, wback, fixedaddress + 2 * 0);
+            SH(x3, gback, gdoffset + 2 * 1);
             break;
         case 0x62:
             INST_NAME("PUNPCKLDQ Gm, Em");
@@ -817,38 +722,38 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEM(x2, 0);
             // GM->ud[1] = EM->ud[0];
             LWU(x3, wback, fixedaddress);
-            SW(x3, gback, gdoffset+4*1);
+            SW(x3, gback, gdoffset + 4 * 1);
             break;
         case 0x67:
             INST_NAME("PACKUSWB Gm, Em");
             nextop = F8;
             GETGM();
             ADDI(x5, xZR, 0xFF);
-            for(int i=0; i<4; ++i) {
+            for (int i = 0; i < 4; ++i) {
                 // GX->ub[i] = (GX->sw[i]<0)?0:((GX->sw[i]>0xff)?0xff:GX->sw[i]);
-                LH(x3, gback, gdoffset+i*2);
+                LH(x3, gback, gdoffset + i * 2);
                 BGE(x5, x3, 8);
                 ADDI(x3, xZR, 0xFF);
                 NOT(x4, x3);
                 SRAI(x4, x4, 63);
                 AND(x3, x3, x4);
-                SB(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset + i);
             }
-            if (MODREG && gd==(nextop&7)) {
+            if (MODREG && gd == (nextop & 7)) {
                 // GM->ud[1] = GM->ud[0];
-                LW(x3, gback, gdoffset+0*4);
-                SW(x3, gback, gdoffset+1*4);
+                LW(x3, gback, gdoffset + 0 * 4);
+                SW(x3, gback, gdoffset + 1 * 4);
             } else {
                 GETEM(x1, 0);
-                for(int i=0; i<4; ++i) {
+                for (int i = 0; i < 4; ++i) {
                     // GX->ub[4+i] = (EX->sw[i]<0)?0:((EX->sw[i]>0xff)?0xff:EX->sw[i]);
-                    LH(x3, wback, fixedaddress+i*2);
+                    LH(x3, wback, fixedaddress + i * 2);
                     BGE(x5, x3, 8);
                     ADDI(x3, xZR, 0xFF);
                     NOT(x4, x3);
                     SRAI(x4, x4, 63);
                     AND(x3, x3, x4);
-                    SB(x3, gback, gdoffset+4+i);
+                    SB(x3, gback, gdoffset + 4 + i);
                 }
             }
             break;
@@ -856,23 +761,23 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("PUNPCKHBW Gm,Em");
             nextop = F8;
             GETGM();
-            for(int i=0; i<4; ++i) {
+            for (int i = 0; i < 4; ++i) {
                 // GX->ub[2 * i] = GX->ub[i + 4];
-                LBU(x3, gback, gdoffset+i+4);
-                SB(x3, gback, gdoffset+2*i);
+                LBU(x3, gback, gdoffset + i + 4);
+                SB(x3, gback, gdoffset + 2 * i);
             }
-            if (MODREG && gd==(nextop&7)) {
-                for(int i=0; i<4; ++i) {
+            if (MODREG && gd == (nextop & 7)) {
+                for (int i = 0; i < 4; ++i) {
                     // GX->ub[2 * i + 1] = GX->ub[2 * i];
-                    LBU(x3, gback, gdoffset+2*i);
-                    SB(x3, gback, gdoffset+2*i+1);
+                    LBU(x3, gback, gdoffset + 2 * i);
+                    SB(x3, gback, gdoffset + 2 * i + 1);
                 }
             } else {
                 GETEM(x2, 0);
-                for(int i=0; i<4; ++i) {
+                for (int i = 0; i < 4; ++i) {
                     // GX->ub[2 * i + 1] = EX->ub[i + 4];
-                    LBU(x3, wback, fixedaddress+i+4);
-                    SB(x3, gback, gdoffset+2*i+1);
+                    LBU(x3, wback, fixedaddress + i + 4);
+                    SB(x3, gback, gdoffset + 2 * i + 1);
                 }
             }
             break;
@@ -880,23 +785,23 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("PUNPCKHWD Gm,Em");
             nextop = F8;
             GETGM();
-            for(int i=0; i<2; ++i) {
+            for (int i = 0; i < 2; ++i) {
                 // GX->uw[2 * i] = GX->uw[i + 2];
-                LHU(x3, gback, gdoffset+(i+2)*2);
-                SH(x3, gback, gdoffset+2*i*2);
+                LHU(x3, gback, gdoffset + (i + 2) * 2);
+                SH(x3, gback, gdoffset + 2 * i * 2);
             }
-            if (MODREG && gd==(nextop&7)) {
-                for(int i=0; i<2; ++i) {
+            if (MODREG && gd == (nextop & 7)) {
+                for (int i = 0; i < 2; ++i) {
                     // GX->uw[2 * i + 1] = GX->uw[2 * i];
-                    LHU(x3, gback, gdoffset+2*i*2);
-                    SH(x3, gback, gdoffset+(2*i+1)*2);
+                    LHU(x3, gback, gdoffset + 2 * i * 2);
+                    SH(x3, gback, gdoffset + (2 * i + 1) * 2);
                 }
             } else {
                 GETEM(x1, 0);
-                for(int i=0; i<2; ++i) {
+                for (int i = 0; i < 2; ++i) {
                     // GX->uw[2 * i + 1] = EX->uw[i + 2];
-                    LHU(x3, wback, fixedaddress+(i+2)*2);
-                    SH(x3, gback, gdoffset+(2*i+1)*2);
+                    LHU(x3, wback, fixedaddress + (i + 2) * 2);
+                    SH(x3, gback, gdoffset + (2 * i + 1) * 2);
                 }
             }
             break;
@@ -906,30 +811,33 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEM(x1, 0);
             GETGM();
             // GM->ud[0] = GM->ud[1];
-            LWU(x3, gback, gdoffset+1*4);
-            SW(x3, gback, gdoffset+0*4);
-            if (!(MODREG && (gd==ed))) {
+            LWU(x3, gback, gdoffset + 1 * 4);
+            SW(x3, gback, gdoffset + 0 * 4);
+            if (!(MODREG && (gd == ed))) {
                 // GM->ud[1] = EM->ud[1];
-                LWU(x3, wback, fixedaddress+1*4);
-                SW(x3, gback, gdoffset+1*4);
+                LWU(x3, wback, fixedaddress + 1 * 4);
+                SW(x3, gback, gdoffset + 1 * 4);
             }
             break;
         case 0x6E:
             INST_NAME("MOVD Gm, Ed");
             nextop = F8;
             GETGM();
-            if(MODREG) {
-                ed = xRAX + (nextop&7) + (rex.b<<3);
+            if (MODREG) {
+                ed = xRAX + (nextop & 7) + (rex.b << 3);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 1, 0);
-                if(rex.w) {
+                if (rex.w) {
                     LD(x4, ed, fixedaddress);
                 } else {
                     LW(x4, ed, fixedaddress);
                 }
                 ed = x4;
             }
-            if(rex.w) SD(ed, gback, gdoffset+0); else SW(ed, gback, gdoffset+0);
+            if (rex.w)
+                SD(ed, gback, gdoffset + 0);
+            else
+                SW(ed, gback, gdoffset + 0);
             break;
         case 0x6F:
             INST_NAME("MOVQ Gm, Em");
@@ -937,24 +845,24 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETGM();
             GETEM(x2, 0);
             LD(x3, wback, fixedaddress);
-            SD(x3, gback, gdoffset+0);
+            SD(x3, gback, gdoffset + 0);
             break;
         case 0x71:
             nextop = F8;
-            switch((nextop>>3)&7) {
+            switch ((nextop >> 3) & 7) {
                 case 2:
                     INST_NAME("PSRLW Em, Ib");
                     GETEM(x1, 1);
                     u8 = F8;
-                    if (u8>15) {
+                    if (u8 > 15) {
                         // just zero dest
                         SD(xZR, wback, fixedaddress);
-                    } else if(u8) {
-                        for (int i=0; i<4; ++i) {
+                    } else if (u8) {
+                        for (int i = 0; i < 4; ++i) {
                             // EX->uw[i] >>= u8;
-                            LHU(x3, wback, fixedaddress+i*2);
+                            LHU(x3, wback, fixedaddress + i * 2);
                             SRLI(x3, x3, u8);
-                            SH(x3, wback, fixedaddress+i*2);
+                            SH(x3, wback, fixedaddress + i * 2);
                         }
                     }
                     break;
@@ -962,13 +870,13 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     INST_NAME("PSRAW Em, Ib");
                     GETEM(x1, 1);
                     u8 = F8;
-                    if(u8>15) u8=15;
-                    if(u8) {
-                        for (int i=0; i<4; ++i) {
+                    if (u8 > 15) u8 = 15;
+                    if (u8) {
+                        for (int i = 0; i < 4; ++i) {
                             // EX->sw[i] >>= u8;
-                            LH(x3, wback, fixedaddress+i*2);
+                            LH(x3, wback, fixedaddress + i * 2);
                             SRAI(x3, x3, u8);
-                            SH(x3, wback, fixedaddress+i*2);
+                            SH(x3, wback, fixedaddress + i * 2);
                         }
                     }
                     break;
@@ -976,15 +884,15 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     INST_NAME("PSLLW Em, Ib");
                     GETEM(x1, 1);
                     u8 = F8;
-                    if (u8>15) {
+                    if (u8 > 15) {
                         // just zero dest
-                        SD(xZR, wback, fixedaddress+0);
-                    } else if(u8) {
-                        for (int i=0; i<4; ++i) {
+                        SD(xZR, wback, fixedaddress + 0);
+                    } else if (u8) {
+                        for (int i = 0; i < 4; ++i) {
                             // EX->uw[i] <<= u8;
-                            LHU(x3, wback, fixedaddress+i*2);
+                            LHU(x3, wback, fixedaddress + i * 2);
                             SLLI(x3, x3, u8);
-                            SH(x3, wback, fixedaddress+i*2);
+                            SH(x3, wback, fixedaddress + i * 2);
                         }
                     }
                     break;
@@ -1005,76 +913,76 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             // empty MMX, FPU now usable
             mmx_purgecache(dyn, ninst, 0, x1);
             /*emu->top = 0;
-            emu->fpu_stack = 0;*/ //TODO: Check if something is needed here?
+            emu->fpu_stack = 0;*/
+            // TODO: Check if something is needed here?
             break;
         case 0x7F:
             INST_NAME("MOVQ Em, Gm");
             nextop = F8;
             GETGM();
             GETEM(x2, 0);
-            LD(x3, gback, gdoffset+0);
+            LD(x3, gback, gdoffset + 0);
             SD(x3, wback, fixedaddress);
             break;
-        #define GO(GETFLAGS, NO, YES, F)   \
-            READFLAGS(F);                                               \
-            i32_ = F32S;                                                \
-            BARRIER(BARRIER_MAYBE);                                     \
-            JUMP(addr+i32_, 1);                                         \
-            GETFLAGS;                                                   \
-            if(dyn->insts[ninst].x64.jmp_insts==-1 ||                   \
-                CHECK_CACHE()) {                                        \
-                /* out of the block */                                  \
-                i32 = dyn->insts[ninst].epilog-(dyn->native_size);      \
-                B##NO##_safe(x1, i32);                                  \
-                if(dyn->insts[ninst].x64.jmp_insts==-1) {               \
-                    if(!(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT))  \
-                        fpu_purgecache(dyn, ninst, 1, x1, x2, x3);      \
-                    jump_to_next(dyn, addr+i32_, 0, ninst);             \
-                } else {                                                \
-                    CacheTransform(dyn, ninst, cacheupd, x1, x2, x3);   \
-                    i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);    \
-                    B(i32);                                             \
-                }                                                       \
-            } else {                                                    \
-                /* inside the block */                                  \
-                i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);    \
-                B##YES##_safe(x1, i32);                                 \
-            }
+#define GO(GETFLAGS, NO, YES, F) /* body for the GOCOND(0x80, "J", "Id") invocation that follows; #undef'd right after */ \
+    READFLAGS(F);                                                                           \
+    i32_ = F32S; /* offset that is added to addr to form the jump target */                 \
+    BARRIER(BARRIER_MAYBE);                                                                 \
+    JUMP(addr + i32_, 1);                                                                   \
+    GETFLAGS;                                                                               \
+    if (dyn->insts[ninst].x64.jmp_insts == -1 || CHECK_CACHE()) {                           \
+        /* out of the block (jmp_insts == -1), or CHECK_CACHE() is true */                  \
+        i32 = dyn->insts[ninst].epilog - (dyn->native_size);                                \
+        B##NO##_safe(x1, i32); /* NO-condition branch by the epilog-relative offset */      \
+        if (dyn->insts[ninst].x64.jmp_insts == -1) {                                        \
+            if (!(dyn->insts[ninst].x64.barrier & BARRIER_FLOAT)) /* no float barrier */    \
+                fpu_purgecache(dyn, ninst, 1, x1, x2, x3);                                  \
+            jump_to_next(dyn, addr + i32_, 0, ninst);                                       \
+        } else { /* target instruction is known: CacheTransform, then branch to it */       \
+            CacheTransform(dyn, ninst, cacheupd, x1, x2, x3);                               \
+            i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address - (dyn->native_size); \
+            B(i32);                                                                         \
+        }                                                                                   \
+    } else {                                                                                \
+        /* inside the block: YES-condition branch directly to the target instruction */     \
+        i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address - (dyn->native_size);     \
+        B##YES##_safe(x1, i32);                                                             \
+    }
 
-        GOCOND(0x80, "J", "Id");
-        #undef GO
+            GOCOND(0x80, "J", "Id");
+#undef GO
 
-        #define GO(GETFLAGS, NO, YES, F)                \
-            READFLAGS(F);                               \
-            GETFLAGS;                                   \
-            nextop=F8;                                  \
-            S##YES(x3, x1);                             \
-            if(MODREG) {                                \
-                if(rex.rex) {                           \
-                    eb1= xRAX+(nextop&7)+(rex.b<<3);    \
-                    eb2 = 0;                            \
-                } else {                                \
-                    ed = (nextop&7);                    \
-                    eb2 = (ed>>2)*8;                    \
-                    eb1 = xRAX+(ed&3);                  \
-                }                                       \
-                if (eb2) {                              \
-                    LUI(x1, 0xffff0);                   \
-                    ORI(x1, x1, 0xff);                  \
-                    AND(eb1, eb1, x1);                  \
-                    SLLI(x3, x3, 8);                    \
-                } else {                                \
-                    ANDI(eb1, eb1, 0xf00);              \
-                }                                       \
-                OR(eb1, eb1, x3);                       \
-            } else {                                    \
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress,rex, NULL, 1, 0); \
-                SB(x3, ed, fixedaddress);               \
-                SMWRITE();                              \
-            }
+#define GO(GETFLAGS, NO, YES, F) /* body for the GOCOND(0x90, "SET", "Eb") invocation that follows; #undef'd right after */ \
+    READFLAGS(F);                                                                            \
+    GETFLAGS;                                                                                \
+    nextop = F8;                                                                             \
+    S##YES(x3, x1); /* x3 receives the value that is merged or stored below */               \
+    if (MODREG) {                                                                            \
+        if (rex.rex) { /* REX present: register (nextop & 7) + (rex.b << 3), bit offset 0 */ \
+            eb1 = xRAX + (nextop & 7) + (rex.b << 3);                                        \
+            eb2 = 0;                                                                         \
+        } else { /* no REX: encodings 4-7 map to bit offset 8 of registers 0-3 */            \
+            ed = (nextop & 7);                                                               \
+            eb2 = (ed >> 2) * 8;                                                             \
+            eb1 = xRAX + (ed & 3);                                                           \
+        }                                                                                    \
+        if (eb2) { /* NOTE(review): x1 presumably becomes a mask clearing bits 8-15; confirm LUI sign-extension */ \
+            LUI(x1, 0xffff0);                                                                \
+            ORI(x1, x1, 0xff);                                                               \
+            AND(eb1, eb1, x1);                                                               \
+            SLLI(x3, x3, 8); /* shift the new value left by 8 before merging */              \
+        } else { /* NOTE(review): 0xf00 presumably sign-extends as a 12-bit immediate, clearing the low byte; confirm */ \
+            ANDI(eb1, eb1, 0xf00);                                                           \
+        }                                                                                    \
+        OR(eb1, eb1, x3); /* merge the new value into the destination register */            \
+    } else { /* memory destination: resolve the address, then store one byte */              \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); \
+        SB(x3, ed, fixedaddress);                                                            \
+        SMWRITE();                                                                           \
+    }
 
-        GOCOND(0x90, "SET", "Eb");
-        #undef GO
+            GOCOND(0x90, "SET", "Eb");
+#undef GO
 
         case 0xA2:
             INST_NAME("CPUID");
@@ -1091,20 +999,20 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             SET_DFNONE();
             nextop = F8;
             GETGD;
-            if(MODREG) {
-                ed = xRAX+(nextop&7)+(rex.b<<3);
+            if (MODREG) {
+                ed = xRAX + (nextop & 7) + (rex.b << 3);
             } else {
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, x1, &fixedaddress, rex, NULL, 1, 0);
-                SRAIxw(x1, gd, 5+rex.w); // r1 = (gd>>5)
-                ADDSL(x3, wback, x1, 2+rex.w, x1);
+                SRAIxw(x1, gd, 5 + rex.w); // r1 = (gd>>5)
+                ADDSL(x3, wback, x1, 2 + rex.w, x1);
                 LDxw(x1, x3, fixedaddress);
                 ed = x1;
             }
-            ANDI(x2, gd, rex.w?0x3f:0x1f);
+            ANDI(x2, gd, rex.w ? 0x3f : 0x1f);
             SRL(x4, ed, x2);
             ANDI(x4, x4, 1);
-            ANDI(xFlags, xFlags, ~1);   //F_CF is 1
+            ANDI(xFlags, xFlags, ~1); // F_CF is 1
             OR(xFlags, xFlags, x4);
             break;
         case 0xA4:
@@ -1124,13 +1032,13 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             GETGD;
             if (MODREG) {
-                ed = xRAX+(nextop&7)+(rex.b<<3);
+                ed = xRAX + (nextop & 7) + (rex.b << 3);
                 wback = 0;
             } else {
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, x1, &fixedaddress, rex, NULL, 1, 0);
-                SRAI(x1, gd, 5+rex.w);
-                ADDSL(x3, wback, x1, 2+rex.w, x1);
+                SRAI(x1, gd, 5 + rex.w);
+                ADDSL(x3, wback, x1, 2 + rex.w, x1);
                 LDxw(x1, x3, fixedaddress);
                 ed = x1;
                 wback = x3;
@@ -1147,7 +1055,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             ADDI(x3, xZR, 1);
             SLL(x3, x3, x2);
             OR(ed, ed, x3);
-            if(wback) {
+            if (wback) {
                 SDxw(ed, wback, fixedaddress);
                 SMWRITE();
             }
@@ -1159,36 +1067,34 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETED(1);
             GETGD;
             u8 = F8;
-            u8&=(rex.w?0x3f:0x1f);
+            u8 &= (rex.w ? 0x3f : 0x1f);
             emit_shrd32c(dyn, ninst, rex, ed, gd, u8, x3, x4);
             WBACK;
             break;
         case 0xAE:
             nextop = F8;
-            if((nextop&0xF8)==0xE8) {
+            if ((nextop & 0xF8) == 0xE8) {
                 INST_NAME("LFENCE");
                 SMDMB();
-            } else
-            if((nextop&0xF8)==0xF0) {
+            } else if ((nextop & 0xF8) == 0xF0) {
                 INST_NAME("MFENCE");
                 SMDMB();
-            } else
-            if((nextop&0xF8)==0xF8) {
+            } else if ((nextop & 0xF8) == 0xF8) {
                 INST_NAME("SFENCE");
                 SMDMB();
             } else {
-                switch((nextop>>3)&7) {
+                switch ((nextop >> 3) & 7) {
                     case 0:
                         INST_NAME("FXSAVE Ed");
                         MESSAGE(LOG_DUMP, "Need Optimization\n");
                         SKIPTEST(x1);
                         fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
-                        if(MODREG) {
+                        if (MODREG) {
                             DEFAULT;
                         } else {
                             addr = geted(dyn, addr, ninst, nextop, &ed, x1, x3, &fixedaddress, rex, NULL, 0, 0);
-                            if(ed!=x1) {MV(x1, ed);}
-                            CALL(rex.w?((void*)fpu_fxsave64):((void*)fpu_fxsave32), -1);
+                            if (ed != x1) { MV(x1, ed); }
+                            CALL(rex.w ? ((void*)fpu_fxsave64) : ((void*)fpu_fxsave32), -1);
                         }
                         break;
                     case 1:
@@ -1196,19 +1102,19 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         MESSAGE(LOG_DUMP, "Need Optimization\n");
                         SKIPTEST(x1);
                         fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
-                        if(MODREG) {
+                        if (MODREG) {
                             DEFAULT;
                         } else {
                             addr = geted(dyn, addr, ninst, nextop, &ed, x1, x3, &fixedaddress, rex, NULL, 0, 0);
-                            if(ed!=x1) {MV(x1, ed);}
-                            CALL(rex.w?((void*)fpu_fxrstor64):((void*)fpu_fxrstor32), -1);
+                            if (ed != x1) { MV(x1, ed); }
+                            CALL(rex.w ? ((void*)fpu_fxrstor64) : ((void*)fpu_fxrstor32), -1);
                         }
                         break;
                     case 2:
                         INST_NAME("LDMXCSR Md");
                         GETED(0);
                         SW(ed, xEmu, offsetof(x64emu_t, mxcsr));
-                        if(box64_sse_flushto0) {
+                        if (box64_sse_flushto0) {
                             // TODO: applyFlushTo0 also needs to add RISC-V support.
                         }
                         break;
@@ -1222,7 +1128,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         INST_NAME("CLFLUSH Ed");
                         MESSAGE(LOG_DUMP, "Need Optimization?\n");
                         addr = geted(dyn, addr, ninst, nextop, &wback, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                        if(wback!=A1) {
+                        if (wback != A1) {
                             MV(A1, wback);
                         }
                         CALL_(native_clflush, -1, 0);
@@ -1238,26 +1144,32 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             GETGD;
             GETED(0);
-            if(rex.w) {
+            if (rex.w) {
                 // 64bits imul
-                UFLAG_IF {
+                UFLAG_IF
+                {
                     MULH(x3, gd, ed);
                     MUL(gd, gd, ed);
                     UFLAG_OP1(x3);
                     UFLAG_RES(gd);
                     UFLAG_DF(x3, d_imul64);
-                } else {
+                }
+                else
+                {
                     MULxw(gd, gd, ed);
                 }
             } else {
                 // 32bits imul
-                UFLAG_IF {
+                UFLAG_IF
+                {
                     MUL(gd, gd, ed);
                     UFLAG_RES(gd);
                     SRLI(x3, gd, 32);
                     UFLAG_OP1(x3);
                     UFLAG_DF(x3, d_imul32);
-                } else {
+                }
+                else
+                {
                     MULxw(gd, gd, ed);
                 }
                 SLLI(gd, gd, 32);
@@ -1270,14 +1182,14 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             SET_DFNONE();
             nextop = F8;
             GETGD;
-            if(MODREG) {
-                ed = xRAX+(nextop&7)+(rex.b<<3);
+            if (MODREG) {
+                ed = xRAX + (nextop & 7) + (rex.b << 3);
                 wback = 0;
             } else {
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
-                SRAI(x1, gd, 5+rex.w);
-                ADDSL(x3, wback, x1, 2+rex.w, x1);
+                SRAI(x1, gd, 5 + rex.w);
+                ADDSL(x3, wback, x1, 2 + rex.w, x1);
                 LDxw(x1, x3, fixedaddress);
                 ed = x1;
                 wback = x3;
@@ -1295,7 +1207,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             SLL(x5, x5, x2);
             NOT(x5, x5);
             AND(ed, ed, x5);
-            if(wback) {
+            if (wback) {
                 SDxw(ed, wback, fixedaddress);
                 SMWRITE();
             }
@@ -1304,14 +1216,14 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("MOVZX Gd, Eb");
             nextop = F8;
             GETGD;
-            if(MODREG) {
-                if(rex.rex) {
-                    eb1 = xRAX+(nextop&7)+(rex.b<<3);
-                    eb2 = 0;                \
+            if (MODREG) {
+                if (rex.rex) {
+                    eb1 = xRAX + (nextop & 7) + (rex.b << 3);
+                    eb2 = 0;
                 } else {
-                    ed = (nextop&7);
-                    eb1 = xRAX+(ed&3);  // Ax, Cx, Dx or Bx
-                    eb2 = (ed&4)>>2;    // L or H
+                    ed = (nextop & 7);
+                    eb1 = xRAX + (ed & 3); // Ax, Cx, Dx or Bx
+                    eb2 = (ed & 4) >> 2;   // L or H
                 }
                 if (eb2) {
                     SRLI(gd, eb1, 8);
@@ -1329,8 +1241,8 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("MOVZX Gd, Ew");
             nextop = F8;
             GETGD;
-            if(MODREG) {
-                ed = xRAX+(nextop&7)+(rex.b<<3);
+            if (MODREG) {
+                ed = xRAX + (nextop & 7) + (rex.b << 3);
                 ZEXTH(gd, ed);
             } else {
                 SMREAD();
@@ -1340,14 +1252,14 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             break;
         case 0xBA:
             nextop = F8;
-            switch((nextop>>3)&7) {
+            switch ((nextop >> 3) & 7) {
                 case 4:
                     INST_NAME("BT Ed, Ib");
                     SETFLAGS(X_CF, SF_SUBSET);
                     SET_DFNONE();
                     GETED(1);
                     u8 = F8;
-                    u8&=rex.w?0x3f:0x1f;
+                    u8 &= rex.w ? 0x3f : 0x1f;
                     SRLIxw(x3, ed, u8);
                     ANDI(x3, x3, 1); // F_CF is 1
                     ANDI(xFlags, xFlags, ~1);
@@ -1359,19 +1271,19 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     SET_DFNONE();
                     GETED(1);
                     u8 = F8;
-                    u8&=(rex.w?0x3f:0x1f);
-                    ORI(xFlags, xFlags, 1<<F_CF);
+                    u8 &= (rex.w ? 0x3f : 0x1f);
+                    ORI(xFlags, xFlags, 1 << F_CF);
                     if (u8 <= 10) {
-                        ANDI(x6, ed, 1<<u8);
+                        ANDI(x6, ed, 1 << u8);
                         BNE_MARK(x6, xZR);
-                        ANDI(xFlags, xFlags, ~(1<<F_CF));
-                        XORI(ed, ed, 1<<u8);
+                        ANDI(xFlags, xFlags, ~(1 << F_CF));
+                        XORI(ed, ed, 1 << u8);
                     } else {
                         ORI(x6, xZR, 1);
                         SLLI(x6, x6, u8);
                         AND(x4, ed, x6);
                         BNE_MARK(x4, xZR);
-                        ANDI(xFlags, xFlags, ~(1<<F_CF));
+                        ANDI(xFlags, xFlags, ~(1 << F_CF));
                         XOR(ed, ed, x6);
                     }
                     if (wback) {
@@ -1386,19 +1298,19 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     SET_DFNONE();
                     GETED(1);
                     u8 = F8;
-                    u8&=(rex.w?0x3f:0x1f);
-                    ANDI(xFlags, xFlags, ~(1<<F_CF));
+                    u8 &= (rex.w ? 0x3f : 0x1f);
+                    ANDI(xFlags, xFlags, ~(1 << F_CF));
                     if (u8 <= 10) {
-                        ANDI(x6, ed, 1<<u8);
+                        ANDI(x6, ed, 1 << u8);
                         BEQ_MARK(x6, xZR);
-                        ORI(xFlags, xFlags, 1<<F_CF);
-                        XORI(ed, ed, 1<<u8);
+                        ORI(xFlags, xFlags, 1 << F_CF);
+                        XORI(ed, ed, 1 << u8);
                     } else {
                         ORI(x6, xZR, 1);
                         SLLI(x6, x6, u8);
                         AND(x6, ed, x6);
                         BEQ_MARK(x6, xZR);
-                        ORI(xFlags, xFlags, 1<<F_CF);
+                        ORI(xFlags, xFlags, 1 << F_CF);
                         XOR(ed, ed, x6);
                     }
                     if (wback) {
@@ -1413,7 +1325,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     SET_DFNONE();
                     GETED(1);
                     u8 = F8;
-                    u8&=rex.w?0x3f:0x1f;
+                    u8 &= rex.w ? 0x3f : 0x1f;
                     SRLIxw(x3, ed, u8);
                     ANDI(x3, x3, 1); // F_CF is 1
                     ANDI(xFlags, xFlags, ~1);
@@ -1424,7 +1336,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         MOV64xw(x3, (1LL << u8));
                         XOR(ed, ed, x3);
                     }
-                    if(wback) {
+                    if (wback) {
                         SDxw(ed, wback, fixedaddress);
                         SMWRITE();
                     }
@@ -1440,13 +1352,13 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             GETGD;
             if (MODREG) {
-                ed = xRAX+(nextop&7)+(rex.b<<3);
+                ed = xRAX + (nextop & 7) + (rex.b << 3);
                 wback = 0;
             } else {
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, x1, &fixedaddress, rex, NULL, 1, 0);
-                SRAI(x1, gd, 5+rex.w);
-                ADDSL(x3, wback, x1, 2+rex.w, x1);
+                SRAI(x1, gd, 5 + rex.w);
+                ADDSL(x3, wback, x1, 2 + rex.w, x1);
                 LDxw(x1, x3, fixedaddress);
                 ed = x1;
                 wback = x3;
@@ -1463,7 +1375,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             ADDI(x3, xZR, 1);
             SLL(x3, x3, x2);
             XOR(ed, ed, x3);
-            if(wback) {
+            if (wback) {
                 SDxw(ed, wback, fixedaddress);
                 SMWRITE();
             }
@@ -1475,27 +1387,27 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             GETED(0);
             GETGD;
-            if(!rex.w && MODREG) {
+            if (!rex.w && MODREG) {
                 AND(x4, ed, xMASK);
                 ed = x4;
             }
             BNE_MARK(ed, xZR);
-            ORI(xFlags, xFlags, 1<<F_ZF);
+            ORI(xFlags, xFlags, 1 << F_ZF);
             B_NEXT_nocond;
             MARK;
-            if(rv64_zbb) {
+            if (rv64_zbb) {
                 CTZxw(gd, ed);
             } else {
                 NEG(x2, ed);
                 AND(x2, x2, ed);
                 TABLE64(x3, 0x03f79d71b4ca8b09ULL);
                 MUL(x2, x2, x3);
-                SRLI(x2, x2, 64-6);
+                SRLI(x2, x2, 64 - 6);
                 TABLE64(x1, (uintptr_t)&deBruijn64tab);
                 ADD(x1, x1, x2);
                 LBU(gd, x1, 0);
             }
-            ANDI(xFlags, xFlags, ~(1<<F_ZF));
+            ANDI(xFlags, xFlags, ~(1 << F_ZF));
             break;
         case 0xBD:
             INST_NAME("BSR Gd, Ed");
@@ -1504,44 +1416,44 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             GETED(0);
             GETGD;
-            if(!rex.w && MODREG) {
+            if (!rex.w && MODREG) {
                 AND(x4, ed, xMASK);
                 ed = x4;
             }
             BNE_MARK(ed, xZR);
-            ORI(xFlags, xFlags, 1<<F_ZF);
+            ORI(xFlags, xFlags, 1 << F_ZF);
             B_NEXT_nocond;
             MARK;
-            ANDI(xFlags, xFlags, ~(1<<F_ZF));
-            if(rv64_zbb) {
-                MOV32w(x1, rex.w?63:31);
+            ANDI(xFlags, xFlags, ~(1 << F_ZF));
+            if (rv64_zbb) {
+                MOV32w(x1, rex.w ? 63 : 31);
                 CLZxw(gd, ed);
                 SUB(gd, x1, gd);
             } else {
-                if(ed!=gd)
+                if (ed != gd)
                     u8 = gd;
                 else
                     u8 = x1;
                 ADDI(u8, xZR, 0);
-                if(rex.w) {
+                if (rex.w) {
                     MV(x2, ed);
                     SRLI(x3, x2, 32);
-                    BEQZ(x3, 4+2*4);
+                    BEQZ(x3, 4 + 2 * 4);
                     ADDI(u8, u8, 32);
                     MV(x2, x3);
                 } else {
                     AND(x2, ed, xMASK);
                 }
                 SRLI(x3, x2, 16);
-                BEQZ(x3, 4+2*4);
+                BEQZ(x3, 4 + 2 * 4);
                 ADDI(u8, u8, 16);
                 MV(x2, x3);
                 SRLI(x3, x2, 8);
-                BEQZ(x3, 4+2*4);
+                BEQZ(x3, 4 + 2 * 4);
                 ADDI(u8, u8, 8);
                 MV(x2, x3);
                 SRLI(x3, x2, 4);
-                BEQZ(x3, 4+2*4);
+                BEQZ(x3, 4 + 2 * 4);
                 ADDI(u8, u8, 4);
                 MV(x2, x3);
                 ANDI(x2, x2, 0b1111);
@@ -1555,31 +1467,31 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("MOVSX Gd, Eb");
             nextop = F8;
             GETGD;
-            if(MODREG) {
-                if(rex.rex) {
-                    wback = xRAX+(nextop&7)+(rex.b<<3);
+            if (MODREG) {
+                if (rex.rex) {
+                    wback = xRAX + (nextop & 7) + (rex.b << 3);
                     wb2 = 0;
                 } else {
-                    wback = (nextop&7);
-                    wb2 = (wback>>2)*8;
-                    wback = xRAX+(wback&3);
+                    wback = (nextop & 7);
+                    wb2 = (wback >> 2) * 8;
+                    wback = xRAX + (wback & 3);
                 }
-                SLLI(gd, wback, 56-wb2);
+                SLLI(gd, wback, 56 - wb2);
                 SRAI(gd, gd, 56);
             } else {
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x1, &fixedaddress, rex, NULL, 1, 0);
                 LB(gd, ed, fixedaddress);
             }
-            if(!rex.w)
+            if (!rex.w)
                 ZEROUP(gd);
             break;
         case 0xBF:
             INST_NAME("MOVSX Gd, Ew");
             nextop = F8;
             GETGD;
-            if(MODREG) {
-                ed = xRAX+(nextop&7)+(rex.b<<3);
+            if (MODREG) {
+                ed = xRAX + (nextop & 7) + (rex.b << 3);
                 SLLI(gd, ed, 48);
                 SRAI(gd, gd, 48);
             } else {
@@ -1587,7 +1499,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x1, &fixedaddress, rex, NULL, 1, 0);
                 LH(gd, ed, fixedaddress);
             }
-            if(!rex.w)
+            if (!rex.w)
                 ZEROUP(gd);
             break;
         case 0xC2:
@@ -1598,12 +1510,12 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             u8 = F8;
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
-            for(int i=0; i<4; ++i) {
-                FLW(d0, gback, gdoffset+i*4);
-                FLW(d1, wback, fixedaddress+i*4);
-                if ((u8&7) == 0) {                                      // Equal
+            for (int i = 0; i < 4; ++i) {
+                FLW(d0, gback, gdoffset + i * 4);
+                FLW(d1, wback, fixedaddress + i * 4);
+                if ((u8 & 7) == 0) { // Equal
                     FEQS(x3, d0, d1);
-                } else if ((u8&7) == 4) {                               // Not Equal or unordered
+                } else if ((u8 & 7) == 4) { // Not Equal or unordered
                     FEQS(x3, d0, d1);
                     XORI(x3, x3, 1);
                 } else {
@@ -1612,41 +1524,47 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     FEQS(x3, d1, d1);
                     AND(x3, x3, x4);
 
-                    switch(u8&7) {
-                    case 1: BEQ_MARK(x3, xZR); FLTS(x3, d0, d1); break; // Less than
-                    case 2: BEQ_MARK(x3, xZR); FLES(x3, d0, d1); break; // Less or equal
-                    case 3: XORI(x3, x3, 1); break;                     // NaN
-                    case 5: {                                           // Greater or equal or unordered
-                        BEQ(x3, xZR, 12); // MARK2
-                        FLES(x3, d1, d0);
-                        J(8); // MARK;
-                        break;
-                    }
-                    case 6: {                                           // Greater or unordered, test inverted, N!=V so unordered or less than (inverted)
-                        BEQ(x3, xZR, 12); // MARK2
-                        FLTS(x3, d1, d0);
-                        J(8); // MARK;
-                        break;
-                    }
-                    case 7: break;                                      // Not NaN
+                    switch (u8 & 7) {
+                        case 1:
+                            BEQ_MARK(x3, xZR);
+                            FLTS(x3, d0, d1);
+                            break; // Less than
+                        case 2:
+                            BEQ_MARK(x3, xZR);
+                            FLES(x3, d0, d1);
+                            break;                      // Less or equal
+                        case 3: XORI(x3, x3, 1); break; // NaN
+                        case 5: {                       // Greater or equal or unordered
+                            BEQ(x3, xZR, 12);           // MARK2
+                            FLES(x3, d1, d0);
+                            J(8); // MARK;
+                            break;
+                        }
+                        case 6: {             // Greater or unordered, test inverted, N!=V so unordered or less than (inverted)
+                            BEQ(x3, xZR, 12); // MARK2
+                            FLTS(x3, d1, d0);
+                            J(8); // MARK;
+                            break;
+                        }
+                        case 7: break; // Not NaN
                     }
 
                     // MARK2;
-                    if ((u8&7) == 5 || (u8&7) == 6) {
+                    if ((u8 & 7) == 5 || (u8 & 7) == 6) {
                         MOV32w(x3, 1);
                     }
                     // MARK;
                 }
                 NEG(x3, x3);
-                SW(x3, gback, gdoffset+i*4);
+                SW(x3, gback, gdoffset + i * 4);
             }
             break;
         case 0xC3:
             INST_NAME("MOVNTI Ed, Gd");
             nextop = F8;
             GETGD;
-            if(MODREG) {
-                MVxw(xRAX+(nextop&7)+(rex.b<<3), gd);
+            if (MODREG) {
+                MVxw(xRAX + (nextop & 7) + (rex.b << 3), gd);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
                 SDxw(gd, ed, fixedaddress);
@@ -1660,19 +1578,19 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             u8 = F8;
             int32_t idx;
 
-            idx = (u8>>(0*2))&3;
-            LWU(x3, gback, gdoffset+idx*4);
-            idx = (u8>>(1*2))&3;
-            LWU(x4, gback, gdoffset+idx*4);
-            idx = (u8>>(2*2))&3;
-            LWU(x5, wback, fixedaddress+idx*4);
-            idx = (u8>>(3*2))&3;
-            LWU(x6, wback, fixedaddress+idx*4);
+            idx = (u8 >> (0 * 2)) & 3;
+            LWU(x3, gback, gdoffset + idx * 4);
+            idx = (u8 >> (1 * 2)) & 3;
+            LWU(x4, gback, gdoffset + idx * 4);
+            idx = (u8 >> (2 * 2)) & 3;
+            LWU(x5, wback, fixedaddress + idx * 4);
+            idx = (u8 >> (3 * 2)) & 3;
+            LWU(x6, wback, fixedaddress + idx * 4);
 
-            SW(x3, gback, gdoffset+0*4);
-            SW(x4, gback, gdoffset+1*4);
-            SW(x5, gback, gdoffset+2*4);
-            SW(x6, gback, gdoffset+3*4);
+            SW(x3, gback, gdoffset + 0 * 4);
+            SW(x4, gback, gdoffset + 1 * 4);
+            SW(x5, gback, gdoffset + 2 * 4);
+            SW(x6, gback, gdoffset + 3 * 4);
             break;
 
         case 0xC8:
@@ -1682,62 +1600,22 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0xCC:
         case 0xCD:
         case 0xCE:
-        case 0xCF:                  /* BSWAP reg */
+        case 0xCF: /* BSWAP reg */
             INST_NAME("BSWAP Reg");
-            gd = xRAX+(opcode&7)+(rex.b<<3);
-            if(rv64_zbb) {
-                REV8(gd, gd);
-                if(!rex.w)
-                    SRLI(gd, gd, 32);
-            } else {
-                gback = gd;
-                if (!rex.w) {
-                    AND(x4, gd, xMASK);
-                    gd = x4;
-                }
-                ANDI(x1, gd, 0xff);
-                SLLI(x1, x1, (rex.w?64:32)-8);
-                SRLI(x2, gd, 8);
-                ANDI(x3, x2, 0xff);
-                SLLI(x3, x3, (rex.w?64:32)-16);
-                OR(x1, x1, x3);
-                SRLI(x2, gd, 16);
-                ANDI(x3, x2, 0xff);
-                SLLI(x3, x3, (rex.w?64:32)-24);
-                OR(x1, x1, x3);
-                SRLI(x2, gd, 24);
-                if(rex.w) {
-                    ANDI(x3, x2, 0xff);
-                    SLLI(x3, x3, 64-32);
-                    OR(x1, x1, x3);
-                    SRLI(x2, gd, 32);
-                    ANDI(x3, x2, 0xff);
-                    SLLI(x3, x3, 64-40);
-                    OR(x1, x1, x3);
-                    SRLI(x2, gd, 40);
-                    ANDI(x3, x2, 0xff);
-                    SLLI(x3, x3, 64-48);
-                    OR(x1, x1, x3);
-                    SRLI(x2, gd, 48);
-                    ANDI(x3, x2, 0xff);
-                    SLLI(x3, x3, 64-56);
-                    OR(x1, x1, x3);
-                    SRLI(x2, gd, 56);
-                }
-                OR(gback, x1, x2);
-            }
+            gd = xRAX + (opcode & 7) + (rex.b << 3);
+            REV8xw(gd, gd, x1, x2, x3, x4);
             break;
         case 0xE5:
             INST_NAME("PMULHW Gm,Em");
             nextop = F8;
             GETGM();
             GETEM(x2, 0);
-            for(int i=0; i<4; ++i) {
-                LH(x3, gback, gdoffset+2*i);
-                LH(x4, wback, fixedaddress+2*i);
+            for (int i = 0; i < 4; ++i) {
+                LH(x3, gback, gdoffset + 2 * i);
+                LH(x4, wback, fixedaddress + 2 * i);
                 MULW(x3, x3, x4);
                 SRAIW(x3, x3, 16);
-                SH(x3, gback, gdoffset+2*i);
+                SH(x3, gback, gdoffset + 2 * i);
             }
             break;
         case 0xED:
@@ -1745,35 +1623,35 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             GETGM();
             GETEM(x2, 0);
-            for(int i=0; i<4; ++i) {
+            for (int i = 0; i < 4; ++i) {
                 // tmp32s = (int32_t)GX->sw[i] + EX->sw[i];
                 // GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s);
-                LH(x3, gback, gdoffset+2*i);
-                LH(x4, wback, fixedaddress+2*i);
+                LH(x3, gback, gdoffset + 2 * i);
+                LH(x4, wback, fixedaddress + 2 * i);
                 ADDW(x3, x3, x4);
                 LUI(x4, 0xFFFF8); // -32768
                 BGE(x3, x4, 12);
-                SH(x4, gback, gdoffset+2*i);
-                J(20); // continue
+                SH(x4, gback, gdoffset + 2 * i);
+                J(20);      // continue
                 LUI(x4, 8); // 32768
                 BLT(x3, x4, 8);
                 ADDIW(x3, x4, -1);
-                SH(x3, gback, gdoffset+2*i);
+                SH(x3, gback, gdoffset + 2 * i);
             }
             break;
         case 0xEF:
             INST_NAME("PXOR Gm,Em");
             nextop = F8;
             GETGM();
-            if(MODREG && gd==(nextop&7)) {
+            if (MODREG && gd == (nextop & 7)) {
                 // just zero dest
-                SD(xZR, gback, gdoffset+0);
+                SD(xZR, gback, gdoffset + 0);
             } else {
                 GETEM(x2, 0);
-                LD(x3, gback, gdoffset+0);
+                LD(x3, gback, gdoffset + 0);
                 LD(x4, wback, fixedaddress);
                 XOR(x3, x3, x4);
-                SD(x3, gback, gdoffset+0);
+                SD(x3, gback, gdoffset + 0);
             }
             break;
         case 0xF9:
diff --git a/src/dynarec/rv64/dynarec_rv64_660f.c b/src/dynarec/rv64/dynarec_rv64_660f.c
index 7baddf4f..bfdbbaa5 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f.c
@@ -23,7 +23,8 @@
 
 uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog)
 {
-    (void)ip; (void)need_epilog;
+    (void)ip;
+    (void)need_epilog;
 
     uint8_t opcode = F8;
     uint8_t nextop, u8, s8;
@@ -49,7 +50,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
 
     static const int8_t round_round[] = { RD_RNE, RD_RDN, RD_RUP, RD_RTZ };
 
-    switch(opcode) {
+    switch (opcode) {
         case 0x10:
             INST_NAME("MOVUPD Gx,Ex");
             nextop = F8;
@@ -63,13 +64,13 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x1, 0);
             GETGX();
             SSE_LOOP_MV_Q2(x3);
-            if(!MODREG) SMWRITE2();
+            if (!MODREG) SMWRITE2();
             break;
         case 0x12:
             INST_NAME("MOVLPD Gx, Eq");
             nextop = F8;
             GETGX();
-            if(MODREG) {
+            if (MODREG) {
                 // access register instead of memory is bad opcode!
                 DEFAULT;
                 return addr;
@@ -77,19 +78,19 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             SMREAD();
             addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
             LD(x3, wback, fixedaddress);
-            SD(x3, gback, gdoffset+0);
+            SD(x3, gback, gdoffset + 0);
             break;
         case 0x13:
             INST_NAME("MOVLPD Eq, Gx");
             nextop = F8;
             GETGX();
-            if(MODREG) {
+            if (MODREG) {
                 // access register instead of memory is bad opcode!
                 DEFAULT;
                 return addr;
             }
             addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
-            LD(x3, gback, gdoffset+0);
+            LD(x3, gback, gdoffset + 0);
             SD(x3, wback, fixedaddress);
             SMWRITE2();
             break;
@@ -99,8 +100,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             // GX->q[1] = EX->q[0];
-            LD(x3, wback, fixedaddress+0);
-            SD(x3, gback, gdoffset+8);
+            LD(x3, wback, fixedaddress + 0);
+            SD(x3, gback, gdoffset + 8);
             break;
         case 0x15:
             INST_NAME("UNPCKHPD Gx, Ex");
@@ -108,17 +109,17 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x1, 0);
             GETGX();
             // GX->q[0] = GX->q[1];
-            LD(x3, gback, gdoffset+8);
-            SD(x3, gback, gdoffset+0);
+            LD(x3, gback, gdoffset + 8);
+            SD(x3, gback, gdoffset + 0);
             // GX->q[1] = EX->q[1];
-            LD(x3, wback, fixedaddress+8);
-            SD(x3, gback, gdoffset+8);
+            LD(x3, wback, fixedaddress + 8);
+            SD(x3, gback, gdoffset + 8);
             break;
         case 0x16:
             INST_NAME("MOVHPD Gx, Eq");
             nextop = F8;
             GETGX();
-            if(MODREG) {
+            if (MODREG) {
                 // access register instead of memory is bad opcode!
                 DEFAULT;
                 return addr;
@@ -126,7 +127,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             SMREAD();
             addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
             LD(x3, wback, fixedaddress);
-            SD(x3, gback, gdoffset+8);
+            SD(x3, gback, gdoffset + 8);
             break;
         case 0x1F:
             INST_NAME("NOP (multibyte)");
@@ -146,7 +147,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x1, 0);
             GETGX();
             SSE_LOOP_MV_Q2(x3);
-            if(!MODREG) SMWRITE2();
+            if (!MODREG) SMWRITE2();
             break;
         case 0x2A:
             INST_NAME("CVTPI2PD Gx,Em");
@@ -154,10 +155,10 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEM(x2, 0);
             d0 = fpu_get_scratch(dyn);
-            for (int i=0; i<2; ++i) {
-                LW(x1, wback, fixedaddress+i*4);
+            for (int i = 0; i < 2; ++i) {
+                LW(x1, wback, fixedaddress + i * 4);
                 FCVTDW(d0, x1, RD_RTZ);
-                FSD(d0, gback, gdoffset+i*8);
+                FSD(d0, gback, gdoffset + i * 8);
             }
             break;
         case 0x2B:
@@ -173,20 +174,20 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGM();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
-            for (int i=0; i<2; ++i) {
-                if(!box64_dynarec_fastround) {
-                    FSFLAGSI(0);  // // reset all bits
+            for (int i = 0; i < 2; ++i) {
+                if (!box64_dynarec_fastround) {
+                    FSFLAGSI(0); // reset all bits
                 }
-                FLD(d0, wback, fixedaddress+i*8);
+                FLD(d0, wback, fixedaddress + i * 8);
                 FCVTWD(x1, d0, RD_RTZ);
-                if(!box64_dynarec_fastround) {
-                    FRFLAGS(x5);   // get back FPSR to check the IOC bit
-                    ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+                if (!box64_dynarec_fastround) {
+                    FRFLAGS(x5); // read back fflags to check the invalid (NV) and overflow (OF) bits
+                    ANDI(x5, x5, (1 << FR_NV) | (1 << FR_OF));
                     BEQ_MARKi(x5, xZR, i);
                     MOV32w(x1, 0x80000000);
                     MARKi(i);
                 }
-                SW(x1, gback, gdoffset+i*4);
+                SW(x1, gback, gdoffset + i * 4);
             }
             break;
         case 0x2D:
@@ -196,27 +197,31 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             u8 = sse_setround(dyn, ninst, x4, x5);
-            for (int i=0; i<2; ++i) {
-                if(!box64_dynarec_fastround) {
-                    FSFLAGSI(0);  // // reset all bits
+            for (int i = 0; i < 2; ++i) {
+                if (!box64_dynarec_fastround) {
+                    FSFLAGSI(0); // reset all bits
                 }
-                FLD(d0, wback, fixedaddress+i*8);
+                FLD(d0, wback, fixedaddress + i * 8);
                 FCVTWD(x1, d0, RD_DYN);
-                if(!box64_dynarec_fastround) {
-                    FRFLAGS(x5);   // get back FPSR to check the IOC bit
-                    ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+                if (!box64_dynarec_fastround) {
+                    FRFLAGS(x5); // read back fflags to check the invalid (NV) and overflow (OF) bits
+                    ANDI(x5, x5, (1 << FR_NV) | (1 << FR_OF));
                     BEQ_MARKi(x5, xZR, i);
                     MOV32w(x1, 0x80000000);
                     MARKi(i);
                 }
-                SW(x1, gback, gdoffset+i*4);
+                SW(x1, gback, gdoffset + i * 4);
             }
             x87_restoreround(dyn, ninst, u8);
             break;
         case 0x2E:
             // no special check...
         case 0x2F:
-            if(opcode==0x2F) {INST_NAME("COMISD Gx, Ex");} else {INST_NAME("UCOMISD Gx, Ex");}
+            if (opcode == 0x2F) {
+                INST_NAME("COMISD Gx, Ex");
+            } else {
+                INST_NAME("UCOMISD Gx, Ex");
+            }
             SETFLAGS(X_ALL, SF_SET);
             SET_DFNONE();
             nextop = F8;
@@ -224,33 +229,36 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEXSD(v0, 0);
             CLEAR_FLAGS();
             // if isnan(d0) || isnan(v0)
-            IFX(X_ZF | X_PF | X_CF) {
+            IFX(X_ZF | X_PF | X_CF)
+            {
                 FEQD(x3, d0, d0);
                 FEQD(x2, v0, v0);
                 AND(x2, x2, x3);
                 BNE_MARK(x2, xZR);
-                ORI(xFlags, xFlags, (1<<F_ZF) | (1<<F_PF) | (1<<F_CF));
+                ORI(xFlags, xFlags, (1 << F_ZF) | (1 << F_PF) | (1 << F_CF));
                 B_NEXT_nocond;
             }
             MARK;
             // else if isless(d0, v0)
-            IFX(X_CF) {
+            IFX(X_CF)
+            {
                 FLTD(x2, d0, v0);
                 BEQ_MARK2(x2, xZR);
-                ORI(xFlags, xFlags, 1<<F_CF);
+                ORI(xFlags, xFlags, 1 << F_CF);
                 B_NEXT_nocond;
             }
             MARK2;
             // else if d0 == v0
-            IFX(X_ZF) {
+            IFX(X_ZF)
+            {
                 FEQD(x2, d0, v0);
                 CBZ_NEXT(x2);
-                ORI(xFlags, xFlags, 1<<F_ZF);
+                ORI(xFlags, xFlags, 1 << F_ZF);
             }
             break;
-        case 0x38:  // SSSE3 opcodes
+        case 0x38: // SSSE3 opcodes
             nextop = F8;
-            switch(nextop) {
+            switch (nextop) {
                 case 0x00:
                     INST_NAME("PSHUFB Gx, Ex");
                     nextop = F8;
@@ -261,46 +269,46 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     ADDI(x5, xEmu, offsetof(x64emu_t, scratch));
 
                     // perserve gd
-                    LD(x3, gback, gdoffset+0);
-                    LD(x4, gback, gdoffset+8);
+                    LD(x3, gback, gdoffset + 0);
+                    LD(x4, gback, gdoffset + 8);
                     SD(x3, x5, 0);
                     SD(x4, x5, 8);
 
-                    for (int i=0; i<16; ++i) {
-                        LBU(x3, wback, fixedaddress+i);
+                    for (int i = 0; i < 16; ++i) {
+                        LBU(x3, wback, fixedaddress + i);
                         ANDI(x4, x3, 128);
                         BEQZ(x4, 12);
-                        SB(xZR, gback, gdoffset+i);
+                        SB(xZR, gback, gdoffset + i);
                         BEQZ(xZR, 20); // continue
                         ANDI(x4, x3, 15);
                         ADD(x4, x4, x5);
                         LBU(x4, x4, 0);
-                        SB(x4, gback, gdoffset+i);
+                        SB(x4, gback, gdoffset + i);
                     }
                     break;
                 case 0x01:
                     INST_NAME("PHADDW Gx, Ex");
                     nextop = F8;
                     GETGX();
-                    for (int i=0; i<4; ++i) {
+                    for (int i = 0; i < 4; ++i) {
                         // GX->sw[i] = GX->sw[i*2+0]+GX->sw[i*2+1];
-                        LH(x3, gback, gdoffset+2*(i*2+0));
-                        LH(x4, gback, gdoffset+2*(i*2+1));
+                        LH(x3, gback, gdoffset + 2 * (i * 2 + 0));
+                        LH(x4, gback, gdoffset + 2 * (i * 2 + 1));
                         ADDW(x3, x3, x4);
-                        SH(x3, gback, gdoffset+2*i);
+                        SH(x3, gback, gdoffset + 2 * i);
                     }
-                    if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
+                    if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
                         // GX->q[1] = GX->q[0];
-                        LD(x3, gback, gdoffset+0);
-                        SD(x3, gback, gdoffset+8);
+                        LD(x3, gback, gdoffset + 0);
+                        SD(x3, gback, gdoffset + 8);
                     } else {
                         GETEX(x2, 0);
-                        for (int i=0; i<4; ++i) {
+                        for (int i = 0; i < 4; ++i) {
                             // GX->sw[4+i] = EX->sw[i*2+0] + EX->sw[i*2+1];
-                            LH(x3, wback, fixedaddress+2*(i*2+0));
-                            LH(x4, wback, fixedaddress+2*(i*2+1));
+                            LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
+                            LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
                             ADDW(x3, x3, x4);
-                            SH(x3, gback, gdoffset+2*(4+i));
+                            SH(x3, gback, gdoffset + 2 * (4 + i));
                         }
                     }
                     break;
@@ -309,31 +317,31 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     // GX->sd[0] += GX->sd[1];
-                    LW(x3, gback, gdoffset+0*4);
-                    LW(x4, gback, gdoffset+1*4);
+                    LW(x3, gback, gdoffset + 0 * 4);
+                    LW(x4, gback, gdoffset + 1 * 4);
                     ADDW(x3, x3, x4);
-                    SW(x3, gback, gdoffset+0*4);
+                    SW(x3, gback, gdoffset + 0 * 4);
                     // GX->sd[1] = GX->sd[2] + GX->sd[3];
-                    LW(x3, gback, gdoffset+2*4);
-                    LW(x4, gback, gdoffset+3*4);
+                    LW(x3, gback, gdoffset + 2 * 4);
+                    LW(x4, gback, gdoffset + 3 * 4);
                     ADDW(x3, x3, x4);
-                    SW(x3, gback, gdoffset+1*4);
-                    if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
+                    SW(x3, gback, gdoffset + 1 * 4);
+                    if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
                         // GX->q[1] = GX->q[0];
-                        LD(x3, gback, gdoffset+0);
-                        SD(x3, gback, gdoffset+8);
+                        LD(x3, gback, gdoffset + 0);
+                        SD(x3, gback, gdoffset + 8);
                     } else {
                         GETEX(x2, 0);
                         // GX->sd[2] = EX->sd[0] + EX->sd[1];
-                        LW(x3, wback, fixedaddress+0*4);
-                        LW(x4, wback, fixedaddress+1*4);
+                        LW(x3, wback, fixedaddress + 0 * 4);
+                        LW(x4, wback, fixedaddress + 1 * 4);
                         ADDW(x3, x3, x4);
-                        SW(x3, gback, gdoffset+2*4);
+                        SW(x3, gback, gdoffset + 2 * 4);
                         // GX->sd[3] = EX->sd[2] + EX->sd[3];
-                        LW(x3, wback, fixedaddress+2*4);
-                        LW(x4, wback, fixedaddress+3*4);
+                        LW(x3, wback, fixedaddress + 2 * 4);
+                        LW(x4, wback, fixedaddress + 3 * 4);
                         ADDW(x3, x3, x4);
-                        SW(x3, gback, gdoffset+3*4);
+                        SW(x3, gback, gdoffset + 3 * 4);
                     }
                     break;
 
@@ -344,24 +352,24 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     GETEX(x2, 0);
                     MOV64x(x5, 32767);
                     MOV64x(x6, -32768);
-                    for(int i=0; i<8; ++i) {
-                        LBU(x3, gback, gdoffset+i*2);
-                        LB(x4, wback, fixedaddress+i*2);
+                    for (int i = 0; i < 8; ++i) {
+                        LBU(x3, gback, gdoffset + i * 2);
+                        LB(x4, wback, fixedaddress + i * 2);
                         MUL(x9, x3, x4);
-                        LBU(x3, gback, gdoffset+i*2+1);
-                        LB(x4, wback, fixedaddress+i*2+1);
+                        LBU(x3, gback, gdoffset + i * 2 + 1);
+                        LB(x4, wback, fixedaddress + i * 2 + 1);
                         MUL(x3, x3, x4);
                         ADD(x3, x3, x9);
-                        if(rv64_zbb) {
+                        if (rv64_zbb) {
                             MIN(x3, x3, x5);
                             MAX(x3, x3, x6);
                         } else {
-                            BLT(x3, x5, 4+4);
+                            BLT(x3, x5, 4 + 4);
                             MV(x3, x5);
-                            BLT(x6, x3, 4+4);
+                            BLT(x6, x3, 4 + 4);
                             MV(x3, x6);
                         }
-                        SH(x3, gback, gdoffset+i*2);
+                        SH(x3, gback, gdoffset + i * 2);
                     }
                     break;
 
@@ -370,14 +378,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<16; ++i) {
-                        LB(x3, gback, gdoffset+i);
-                        LB(x4, wback, fixedaddress+i);
-                        BGE(x4, xZR, 4+4);
+                    for (int i = 0; i < 16; ++i) {
+                        LB(x3, gback, gdoffset + i);
+                        LB(x4, wback, fixedaddress + i);
+                        BGE(x4, xZR, 4 + 4);
                         NEG(x3, x3);
-                        BNE(x4, xZR, 4+4);
+                        BNE(x4, xZR, 4 + 4);
                         MOV_U12(x3, 0);
-                        SB(x3, gback, gdoffset+i);
+                        SB(x3, gback, gdoffset + i);
                     }
                     break;
                 case 0x09:
@@ -385,14 +393,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<8; ++i) {
-                        LH(x3, gback, gdoffset+i*2);
-                        LH(x4, wback, fixedaddress+i*2);
-                        BGE(x4, xZR, 4+4);
+                    for (int i = 0; i < 8; ++i) {
+                        LH(x3, gback, gdoffset + i * 2);
+                        LH(x4, wback, fixedaddress + i * 2);
+                        BGE(x4, xZR, 4 + 4);
                         NEG(x3, x3);
-                        BNE(x4, xZR, 4+4);
+                        BNE(x4, xZR, 4 + 4);
                         MOV_U12(x3, 0);
-                        SH(x3, gback, gdoffset+i*2);
+                        SH(x3, gback, gdoffset + i * 2);
                     }
                     break;
                 case 0x0A:
@@ -400,14 +408,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<4; ++i) {
-                        LW(x3, gback, gdoffset+i*4);
-                        LW(x4, wback, fixedaddress+i*4);
-                        BGE(x4, xZR, 4+4);
+                    for (int i = 0; i < 4; ++i) {
+                        LW(x3, gback, gdoffset + i * 4);
+                        LW(x4, wback, fixedaddress + i * 4);
+                        BGE(x4, xZR, 4 + 4);
                         NEG(x3, x3);
-                        BNE(x4, xZR, 4+4);
+                        BNE(x4, xZR, 4 + 4);
                         ADDI(x3, xZR, 0);
-                        SW(x3, gback, gdoffset+i*4);
+                        SW(x3, gback, gdoffset + i * 4);
                     }
                     break;
                 case 0x0B:
@@ -415,14 +423,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<8; ++i) {
-                        LH(x3, gback, gdoffset+i*2);
-                        LH(x4, wback, fixedaddress+i*2);
+                    for (int i = 0; i < 8; ++i) {
+                        LH(x3, gback, gdoffset + i * 2);
+                        LH(x4, wback, fixedaddress + i * 2);
                         MUL(x3, x3, x4);
                         SRAI(x3, x3, 14);
                         ADDI(x3, x3, 1);
                         SRAI(x3, x3, 1);
-                        SH(x3, gback, gdoffset+i*2);
+                        SH(x3, gback, gdoffset + i * 2);
                     }
                     break;
                 case 0x10:
@@ -431,11 +439,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     GETGX();
                     GETEX(x2, 0);
                     sse_forget_reg(dyn, ninst, 0); // forget xmm[0]
-                    for (int i=0; i<16; ++i) {
-                        LB(x3, xEmu, offsetof(x64emu_t, xmm[0])+i);
+                    for (int i = 0; i < 16; ++i) {
+                        LB(x3, xEmu, offsetof(x64emu_t, xmm[0]) + i);
                         BGE(x3, xZR, 12); // continue
-                        LBU(x3, wback, fixedaddress+i);
-                        SB(x3, gback, gdoffset+i);
+                        LBU(x3, wback, fixedaddress + i);
+                        SB(x3, gback, gdoffset + i);
                         // continue
                     }
                     break;
@@ -447,29 +455,32 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     GETEX(x2, 0);
                     CLEAR_FLAGS();
                     SET_DFNONE();
-                    IFX(X_ZF|X_CF) {
-                        LD(x5, wback, fixedaddress+0);
-                        LD(x6, wback, fixedaddress+8);
+                    IFX(X_ZF | X_CF)
+                    {
+                        LD(x5, wback, fixedaddress + 0);
+                        LD(x6, wback, fixedaddress + 8);
 
-                        IFX(X_ZF) {
-                            LD(x3, gback, gdoffset+0);
-                            LD(x4, gback, gdoffset+8);
+                        IFX(X_ZF)
+                        {
+                            LD(x3, gback, gdoffset + 0);
+                            LD(x4, gback, gdoffset + 8);
                             AND(x3, x3, x5);
                             AND(x4, x4, x6);
                             OR(x3, x3, x4);
                             BNEZ(x3, 8);
-                            ORI(xFlags, xFlags, 1<<F_ZF);
+                            ORI(xFlags, xFlags, 1 << F_ZF);
                         }
-                        IFX(X_CF) {
-                            LD(x3, gback, gdoffset+0);
+                        IFX(X_CF)
+                        {
+                            LD(x3, gback, gdoffset + 0);
                             NOT(x3, x3);
-                            LD(x4, gback, gdoffset+8);
+                            LD(x4, gback, gdoffset + 8);
                             NOT(x4, x4);
                             AND(x3, x3, x5);
                             AND(x4, x4, x6);
                             OR(x3, x3, x4);
                             BNEZ(x3, 8);
-                            ORI(xFlags, xFlags, 1<<F_CF);
+                            ORI(xFlags, xFlags, 1 << F_CF);
                         }
                     }
                     break;
@@ -479,11 +490,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<16; ++i) {
-                        LB(x4, wback, fixedaddress+i);
-                        BGE(x4, xZR, 4+4);
+                    for (int i = 0; i < 16; ++i) {
+                        LB(x4, wback, fixedaddress + i);
+                        BGE(x4, xZR, 4 + 4);
                         NEG(x4, x4);
-                        SB(x4, gback, gdoffset+i);
+                        SB(x4, gback, gdoffset + i);
                     }
                     break;
                 case 0x1D:
@@ -491,11 +502,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<8; ++i) {
-                        LH(x4, wback, fixedaddress+i*2);
-                        BGE(x4, xZR, 4+4);
+                    for (int i = 0; i < 8; ++i) {
+                        LH(x4, wback, fixedaddress + i * 2);
+                        BGE(x4, xZR, 4 + 4);
                         NEG(x4, x4);
-                        SH(x4, gback, gdoffset+i*2);
+                        SH(x4, gback, gdoffset + i * 2);
                     }
                     break;
                 case 0x1E:
@@ -503,12 +514,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    MOV64x(x5, ~(1<<31));
-                    for(int i=0; i<4; ++i) {
-                        LW(x4, wback, fixedaddress+i*4);
-                        BGE(x4, xZR, 4+4);
+                    MOV64x(x5, ~(1 << 31));
+                    for (int i = 0; i < 4; ++i) {
+                        LW(x4, wback, fixedaddress + i * 4);
+                        BGE(x4, xZR, 4 + 4);
                         NEG(x4, x4);
-                        SW(x4, gback, gdoffset+i*4);
+                        SW(x4, gback, gdoffset + i * 4);
                     }
                     break;
 
@@ -518,35 +529,36 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     GETGX();
                     GETEX(x2, 0);
                     MOV64x(x5, 65535);
-                    for(int i=0; i<4; ++i) {
-                        LW(x3, gback, gdoffset+i*4);
-                        if(rv64_zbb) {
+                    for (int i = 0; i < 4; ++i) {
+                        LW(x3, gback, gdoffset + i * 4);
+                        if (rv64_zbb) {
                             MIN(x3, x3, x5);
                             MAX(x3, x3, xZR);
                         } else {
-                            BGE(x3, xZR, 4+4);
+                            BGE(x3, xZR, 4 + 4);
                             MV(x3, xZR);
-                            BLT(x3, x5, 4+4);
+                            BLT(x3, x5, 4 + 4);
                             MV(x3, x5);
                         }
-                        SH(x3, gback, gdoffset+i*2);
+                        SH(x3, gback, gdoffset + i * 2);
                     }
-                    if(MODREG && gd==ed) {
-                        LD(x3, gback, gdoffset+0);
-                        SD(x3, gback, gdoffset+8);
-                    } else for(int i=0; i<4; ++i) {
-                        LW(x3, wback, fixedaddress+i*4);
-                        if(rv64_zbb) {
-                            MIN(x3, x3, x5);
-                            MAX(x3, x3, xZR);
-                        } else {
-                            BGE(x3, xZR, 4+4);
-                            MV(x3, xZR);
-                            BLT(x3, x5, 4+4);
-                            MV(x3, x5);
+                    if (MODREG && gd == ed) {
+                        LD(x3, gback, gdoffset + 0);
+                        SD(x3, gback, gdoffset + 8);
+                    } else
+                        for (int i = 0; i < 4; ++i) {
+                            LW(x3, wback, fixedaddress + i * 4);
+                            if (rv64_zbb) {
+                                MIN(x3, x3, x5);
+                                MAX(x3, x3, xZR);
+                            } else {
+                                BGE(x3, xZR, 4 + 4);
+                                MV(x3, xZR);
+                                BLT(x3, x5, 4 + 4);
+                                MV(x3, x5);
+                            }
+                            SH(x3, gback, gdoffset + 8 + i * 2);
                         }
-                        SH(x3, gback, gdoffset+8+i*2);
-                    }
                     break;
 
                 case 0x30:
@@ -554,9 +566,9 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=7; i>=0; --i) {
-                        LBU(x3, wback, fixedaddress+i);
-                        SH(x3, gback, gdoffset+i*2);
+                    for (int i = 7; i >= 0; --i) {
+                        LBU(x3, wback, fixedaddress + i);
+                        SH(x3, gback, gdoffset + i * 2);
                     }
                     break;
                 case 0x31:
@@ -564,9 +576,9 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=3; i>=0; --i) {
-                        LBU(x3, wback, fixedaddress+i);
-                        SW(x3, gback, gdoffset+i*4);
+                    for (int i = 3; i >= 0; --i) {
+                        LBU(x3, wback, fixedaddress + i);
+                        SW(x3, gback, gdoffset + i * 4);
                     }
                     break;
                 case 0x32:
@@ -574,9 +586,9 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=1; i>=0; --i) {
-                        LBU(x3, wback, fixedaddress+i);
-                        SD(x3, gback, gdoffset+i*8);
+                    for (int i = 1; i >= 0; --i) {
+                        LBU(x3, wback, fixedaddress + i);
+                        SD(x3, gback, gdoffset + i * 8);
                     }
                     break;
                 case 0x33:
@@ -584,9 +596,9 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=3; i>=0; --i) {
-                        LHU(x3, wback, fixedaddress+i*2);
-                        SW(x3, gback, gdoffset+i*4);
+                    for (int i = 3; i >= 0; --i) {
+                        LHU(x3, wback, fixedaddress + i * 2);
+                        SW(x3, gback, gdoffset + i * 4);
                     }
                     break;
                 case 0x34:
@@ -594,9 +606,9 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=1; i>=0; --i) {
-                        LHU(x3, wback, fixedaddress+i*2);
-                        SD(x3, gback, gdoffset+i*8);
+                    for (int i = 1; i >= 0; --i) {
+                        LHU(x3, wback, fixedaddress + i * 2);
+                        SD(x3, gback, gdoffset + i * 8);
                     }
                     break;
                 case 0x35:
@@ -604,106 +616,130 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=1; i>=0; --i) {
-                        LWU(x3, wback, fixedaddress+i*4);
-                        SD(x3, gback, gdoffset+i*8);
+                    for (int i = 1; i >= 0; --i) {
+                        LWU(x3, wback, fixedaddress + i * 4);
+                        SD(x3, gback, gdoffset + i * 8);
                     }
                     break;
 
                 case 0x38:
-                    INST_NAME("PMINSB Gx, Ex");  // SSE4 opcode!
+                    INST_NAME("PMINSB Gx, Ex"); // SSE4 opcode!
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<16; ++i) {
-                        LB(x3, gback, gdoffset+i);
-                        LB(x4, wback, fixedaddress+i);
-                        if(rv64_zbb) MIN(x4, x3, x4); else BLT(x3, x4, 4+4);
-                        SB(x4, gback, gdoffset+i);
+                    for (int i = 0; i < 16; ++i) {
+                        LB(x3, gback, gdoffset + i);
+                        LB(x4, wback, fixedaddress + i);
+                        if (rv64_zbb)
+                            MIN(x4, x3, x4);
+                        else
+                            BLT(x3, x4, 4 + 4);
+                        SB(x4, gback, gdoffset + i);
                     }
                     break;
                 case 0x39:
-                    INST_NAME("PMINSD Gx, Ex");  // SSE4 opcode!
+                    INST_NAME("PMINSD Gx, Ex"); // SSE4 opcode!
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<4; ++i) {
-                        LW(x3, gback, gdoffset+i*4);
-                        LW(x4, wback, fixedaddress+i*4);
-                        if(rv64_zbb) MIN(x4, x3, x4); else BLT(x3, x4, 4+4);
-                        SW(x4, gback, gdoffset+i*4);
+                    for (int i = 0; i < 4; ++i) {
+                        LW(x3, gback, gdoffset + i * 4);
+                        LW(x4, wback, fixedaddress + i * 4);
+                        if (rv64_zbb)
+                            MIN(x4, x3, x4);
+                        else
+                            BLT(x3, x4, 4 + 4);
+                        SW(x4, gback, gdoffset + i * 4);
                     }
                     break;
                 case 0x3A:
-                    INST_NAME("PMINUW Gx, Ex");  // SSE4 opcode!
+                    INST_NAME("PMINUW Gx, Ex"); // SSE4 opcode!
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<8; ++i) {
-                        LHU(x3, gback, gdoffset+i*2);
-                        LHU(x4, wback, fixedaddress+i*2);
-                        if(rv64_zbb) MINU(x4, x3, x4); else BLTU(x3, x4, 4+4);
-                        SH(x4, gback, gdoffset+i*2);
+                    for (int i = 0; i < 8; ++i) {
+                        LHU(x3, gback, gdoffset + i * 2);
+                        LHU(x4, wback, fixedaddress + i * 2);
+                        if (rv64_zbb)
+                            MINU(x4, x3, x4);
+                        else
+                            BLTU(x3, x4, 4 + 4);
+                        SH(x4, gback, gdoffset + i * 2);
                     }
                     break;
                 case 0x3B:
-                    INST_NAME("PMINUD Gx, Ex");  // SSE4 opcode!
+                    INST_NAME("PMINUD Gx, Ex"); // SSE4 opcode!
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<4; ++i) {
-                        LWU(x3, gback, gdoffset+i*4);
-                        LWU(x4, wback, fixedaddress+i*4);
-                        if(rv64_zbb) MINU(x4, x3, x4); else BLTU(x3, x4, 4+4);
-                        SW(x4, gback, gdoffset+i*4);
+                    for (int i = 0; i < 4; ++i) {
+                        LWU(x3, gback, gdoffset + i * 4);
+                        LWU(x4, wback, fixedaddress + i * 4);
+                        if (rv64_zbb)
+                            MINU(x4, x3, x4);
+                        else
+                            BLTU(x3, x4, 4 + 4);
+                        SW(x4, gback, gdoffset + i * 4);
                     }
                     break;
                 case 0x3C:
-                    INST_NAME("PMAXSB Gx, Ex");  // SSE4 opcode!
+                    INST_NAME("PMAXSB Gx, Ex"); // SSE4 opcode!
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<16; ++i) {
-                        LB(x3, gback, gdoffset+i);
-                        LB(x4, wback, fixedaddress+i);
-                        if(rv64_zbb) MAX(x4, x3, x4); else BLT(x4, x3, 4+4);
-                        SB(x4, gback, gdoffset+i);
+                    for (int i = 0; i < 16; ++i) {
+                        LB(x3, gback, gdoffset + i);
+                        LB(x4, wback, fixedaddress + i);
+                        if (rv64_zbb)
+                            MAX(x4, x3, x4);
+                        else
+                            BLT(x4, x3, 4 + 4);
+                        SB(x4, gback, gdoffset + i);
                     }
                     break;
                 case 0x3D:
-                    INST_NAME("PMAXSD Gx, Ex");  // SSE4 opcode!
+                    INST_NAME("PMAXSD Gx, Ex"); // SSE4 opcode!
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<4; ++i) {
-                        LW(x3, gback, gdoffset+i*4);
-                        LW(x4, wback, fixedaddress+i*4);
-                        if(rv64_zbb) MAX(x4, x3, x4); else BLT(x4, x3, 4+4);
-                        SW(x4, gback, gdoffset+i*4);
+                    for (int i = 0; i < 4; ++i) {
+                        LW(x3, gback, gdoffset + i * 4);
+                        LW(x4, wback, fixedaddress + i * 4);
+                        if (rv64_zbb)
+                            MAX(x4, x3, x4);
+                        else
+                            BLT(x4, x3, 4 + 4);
+                        SW(x4, gback, gdoffset + i * 4);
                     }
                     break;
                 case 0x3E:
-                    INST_NAME("PMAXUW Gx, Ex");  // SSE4 opcode!
+                    INST_NAME("PMAXUW Gx, Ex"); // SSE4 opcode!
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<8; ++i) {
-                        LHU(x3, gback, gdoffset+i*2);
-                        LHU(x4, wback, fixedaddress+i*2);
-                        if(rv64_zbb) MAXU(x4, x3, x4); else BLTU(x4, x3, 4+4);
-                        SH(x4, gback, gdoffset+i*2);
+                    for (int i = 0; i < 8; ++i) {
+                        LHU(x3, gback, gdoffset + i * 2);
+                        LHU(x4, wback, fixedaddress + i * 2);
+                        if (rv64_zbb)
+                            MAXU(x4, x3, x4);
+                        else
+                            BLTU(x4, x3, 4 + 4);
+                        SH(x4, gback, gdoffset + i * 2);
                     }
                     break;
                 case 0x3F:
-                    INST_NAME("PMAXUD Gx, Ex");  // SSE4 opcode!
+                    INST_NAME("PMAXUD Gx, Ex"); // SSE4 opcode!
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<4; ++i) {
-                        LWU(x3, gback, gdoffset+i*4);
-                        LWU(x4, wback, fixedaddress+i*4);
-                        if(rv64_zbb) MAXU(x4, x3, x4); else BLTU(x4, x3, 4+4);
-                        SW(x4, gback, gdoffset+i*4);
+                    for (int i = 0; i < 4; ++i) {
+                        LWU(x3, gback, gdoffset + i * 4);
+                        LWU(x4, wback, fixedaddress + i * 4);
+                        if (rv64_zbb)
+                            MAXU(x4, x3, x4);
+                        else
+                            BLTU(x4, x3, 4 + 4);
+                        SW(x4, gback, gdoffset + i * 4);
                     }
                     break;
                 case 0x40:
@@ -711,15 +747,15 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
-                    for(int i=0; i<4; ++i) {
-                        LW(x3, gback, gdoffset+i*4);
-                        LW(x4, wback, fixedaddress+i*4);
+                    for (int i = 0; i < 4; ++i) {
+                        LW(x3, gback, gdoffset + i * 4);
+                        LW(x4, wback, fixedaddress + i * 4);
                         MUL(x3, x3, x4);
-                        SW(x3, gback, gdoffset+i*4);
+                        SW(x3, gback, gdoffset + i * 4);
                     }
                     break;
                 case 0xDB:
-                    INST_NAME("AESIMC Gx, Ex");  // AES-NI
+                    INST_NAME("AESIMC Gx, Ex"); // AES-NI
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0);
@@ -729,7 +765,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     CALL(native_aesimc, -1);
                     break;
                 case 0xDC:
-                    INST_NAME("AESENC Gx, Ex");  // AES-NI
+                    INST_NAME("AESENC Gx, Ex"); // AES-NI
                     nextop = F8;
                     GETG;
                     sse_forget_reg(dyn, ninst, gd);
@@ -740,7 +776,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
                     break;
                 case 0xDD:
-                    INST_NAME("AESENCLAST Gx, Ex");  // AES-NI
+                    INST_NAME("AESENCLAST Gx, Ex"); // AES-NI
                     nextop = F8;
                     GETG;
                     sse_forget_reg(dyn, ninst, gd);
@@ -751,7 +787,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
                     break;
                 case 0xDE:
-                    INST_NAME("AESDEC Gx, Ex");  // AES-NI
+                    INST_NAME("AESDEC Gx, Ex"); // AES-NI
                     nextop = F8;
                     GETG;
                     sse_forget_reg(dyn, ninst, gd);
@@ -763,7 +799,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     break;
 
                 case 0xDF:
-                    INST_NAME("AESDECLAST Gx, Ex");  // AES-NI
+                    INST_NAME("AESDECLAST Gx, Ex"); // AES-NI
                     nextop = F8;
                     GETG;
                     sse_forget_reg(dyn, ninst, gd);
@@ -775,14 +811,17 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     break;
                 case 0xF0:
                     INST_NAME("MOVBE Gw, Ew");
-                    nextop=F8;
+                    nextop = F8;
                     GETGD;
                     SMREAD();
-                    addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 1);
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 1, 0);
                     LHU(x1, ed, fixedaddress);
                     if (rv64_zbb) {
                         REV8(x1, x1);
                         SRLI(x1, x1, 48);
+                    } else if (rv64_xtheadbb) {
+                        TH_REVW(x1, x1);
+                        SRLI(x1, x1, 16);
                     } else {
                         ANDI(x2, x1, 0xff);
                         SLLI(x2, x2, 8);
@@ -791,17 +830,20 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     }
                     LUI(x2, 0xffff0);
                     AND(gd, gd, x2);
-                    OR(gd, gd, x1);                    
+                    OR(gd, gd, x1);
                     break;
                 case 0xF1:
                     INST_NAME("MOVBE Ew, Gw");
-                    nextop=F8;
+                    nextop = F8;
                     GETGD;
                     SMREAD();
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 0, 1);  
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, 0);
                     if (rv64_zbb) {
                         REV8(x1, gd);
                         SRLI(x1, x1, 48);
+                    } else if (rv64_xtheadbb) {
+                        TH_REVW(x1, gd);
+                        SRLI(x1, x1, 16);
                     } else {
                         ANDI(x1, gd, 0xff);
                         SLLI(x1, x1, 8);
@@ -815,9 +857,9 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     DEFAULT;
             }
             break;
-        case 0x3A:  // these are some more SSSE3+ opcodes
+        case 0x3A: // these are some more SSSE3+ opcodes
             opcode = F8;
-            switch(opcode) {
+            switch (opcode) {
                 case 0x0B:
                     INST_NAME("ROUNDSD Gx, Ex, Ib");
                     nextop = F8;
@@ -828,7 +870,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     u8 = F8;
                     FEQD(x2, d0, d0);
                     BNEZ_MARK(x2);
-                    if (v0!=d0) FMVD(v0, d0);
+                    if (v0 != d0) FMVD(v0, d0);
                     B_NEXT_nocond;
                     MARK; // d0 is not nan
                     FABSD(v1, d0);
@@ -836,16 +878,16 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     FCVTDL(d1, x3, RD_RTZ);
                     FLTD(x3, v1, d1);
                     BNEZ_MARK2(x3);
-                    if (v0!=d0) FMVD(v0, d0);
+                    if (v0 != d0) FMVD(v0, d0);
                     B_NEXT_nocond;
                     MARK2;
-                    if(u8&4) {
+                    if (u8 & 4) {
                         u8 = sse_setround(dyn, ninst, x4, x2);
                         FCVTLD(x5, d0, RD_DYN);
                         FCVTDL(v0, x5, RD_RTZ);
                         x87_restoreround(dyn, ninst, u8);
                     } else {
-                        FCVTLD(x5, d0, round_round[u8&3]);
+                        FCVTLD(x5, d0, round_round[u8 & 3]);
                         FCVTDL(v0, x5, RD_RTZ);
                     }
                     break;
@@ -871,20 +913,20 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     FLTD(x4, v1, d1);
                     BNEZ(x4, 8);
                     B_MARK_nocond;
-                    if(u8&4) {
+                    if (u8 & 4) {
                         u8 = sse_setround(dyn, ninst, x4, x5);
                         FCVTLD(x5, d0, RD_DYN);
                         FCVTDL(d0, x5, RD_RTZ);
                         x87_restoreround(dyn, ninst, u8);
                     } else {
-                        FCVTLD(x5, d0, round_round[u8&3]);
+                        FCVTLD(x5, d0, round_round[u8 & 3]);
                         FCVTDL(d0, x5, RD_RTZ);
                     }
                     MARK;
-                    FSD(d0, gback, gdoffset+0);
+                    FSD(d0, gback, gdoffset + 0);
 
                     // i = 1
-                    FLD(d0, wback, fixedaddress+8);
+                    FLD(d0, wback, fixedaddress + 8);
                     FEQD(x4, d0, d0);
                     BNEZ(x4, 8);
                     B_MARK2_nocond;
@@ -893,17 +935,17 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     FLTD(x4, v1, d1);
                     BNEZ(x4, 8);
                     B_MARK2_nocond;
-                    if(u8&4) {
+                    if (u8 & 4) {
                         u8 = sse_setround(dyn, ninst, x4, x5);
                         FCVTLD(x5, d0, RD_DYN);
                         FCVTDL(d0, x5, RD_RTZ);
                         x87_restoreround(dyn, ninst, u8);
                     } else {
-                        FCVTLD(x5, d0, round_round[u8&3]);
+                        FCVTLD(x5, d0, round_round[u8 & 3]);
                         FCVTDL(d0, x5, RD_RTZ);
                     }
                     MARK2;
-                    FSD(d0, gback, gdoffset+8);
+                    FSD(d0, gback, gdoffset + 8);
                     break;
                 case 0x0E:
                     INST_NAME("PBLENDW Gx, Ex, Ib");
@@ -912,34 +954,34 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     GETEX(x2, 1);
                     u8 = F8;
                     i32 = 0;
-                    if (MODREG && gd==ed) break;
+                    if (MODREG && gd == ed) break;
                     while (u8)
-                        if(u8&1) {
-                            if(!(i32&1) && u8&2) {
-                                if(!(i32&3) && (u8&0xf)==0xf) {
+                        if (u8 & 1) {
+                            if (!(i32 & 1) && u8 & 2) {
+                                if (!(i32 & 3) && (u8 & 0xf) == 0xf) {
                                     // whole 64bits
-                                    LD(x3, wback, fixedaddress+8*(i32>>2));
-                                    SD(x3, gback, gdoffset+8*(i32>>2));
-                                    i32+=4;
-                                    u8>>=4;
+                                    LD(x3, wback, fixedaddress + 8 * (i32 >> 2));
+                                    SD(x3, gback, gdoffset + 8 * (i32 >> 2));
+                                    i32 += 4;
+                                    u8 >>= 4;
                                 } else {
                                     // 32bits
-                                    LWU(x3, wback, fixedaddress+4*(i32>>1));
-                                    SW(x3, gback, gdoffset+4*(i32>>1));
-                                    i32+=2;
-                                    u8>>=2;
+                                    LWU(x3, wback, fixedaddress + 4 * (i32 >> 1));
+                                    SW(x3, gback, gdoffset + 4 * (i32 >> 1));
+                                    i32 += 2;
+                                    u8 >>= 2;
                                 }
                             } else {
                                 // 16 bits
-                                LHU(x3, wback, fixedaddress+2*i32);
-                                SH(x3, gback, gdoffset+2*i32);
+                                LHU(x3, wback, fixedaddress + 2 * i32);
+                                SH(x3, gback, gdoffset + 2 * i32);
                                 i32++;
-                                u8>>=1;
+                                u8 >>= 1;
                             }
                         } else {
                             // nope
                             i32++;
-                            u8>>=1;
+                            u8 >>= 1;
                         }
                     break;
                 case 0x0F:
@@ -951,38 +993,42 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     sse_forget_reg(dyn, ninst, x5);
                     ADDI(x5, xEmu, offsetof(x64emu_t, scratch));
                     // perserve gd
-                    LD(x3, gback, gdoffset+0);
-                    LD(x4, gback, gdoffset+8);
+                    LD(x3, gback, gdoffset + 0);
+                    LD(x4, gback, gdoffset + 8);
                     SD(x3, x5, 0);
                     SD(x4, x5, 8);
-                    if(u8>31) {
-                        SD(xZR, gback, gdoffset+0);
-                        SD(xZR, gback, gdoffset+8);
+                    if (u8 > 31) {
+                        SD(xZR, gback, gdoffset + 0);
+                        SD(xZR, gback, gdoffset + 8);
                     } else {
-                        for (int i=0; i<16; ++i, ++u8) {
-                            if (u8>15) {
-                                if(u8>31) {
-                                    SB(xZR, gback, gdoffset+i);
+                        for (int i = 0; i < 16; ++i, ++u8) {
+                            if (u8 > 15) {
+                                if (u8 > 31) {
+                                    SB(xZR, gback, gdoffset + i);
                                     continue;
-                                }
-                                else LBU(x3, x5, u8-16);
+                                } else
+                                    LBU(x3, x5, u8 - 16);
                             } else {
-                                LBU(x3, wback, fixedaddress+u8);
+                                LBU(x3, wback, fixedaddress + u8);
                             }
-                            SB(x3, gback, gdoffset+i);
+                            SB(x3, gback, gdoffset + i);
                         }
                     }
                     break;
                 case 0x16:
-                    if(rex.w) {INST_NAME("PEXTRQ Ed, Gx, Ib");} else {INST_NAME("PEXTRD Ed, Gx, Ib");}
+                    if (rex.w) {
+                        INST_NAME("PEXTRQ Ed, Gx, Ib");
+                    } else {
+                        INST_NAME("PEXTRD Ed, Gx, Ib");
+                    }
                     nextop = F8;
                     GETGX();
                     GETED(1);
                     u8 = F8;
-                    if(rex.w)
-                        LD(ed, gback, gdoffset+8*(u8&1));
+                    if (rex.w)
+                        LD(ed, gback, gdoffset + 8 * (u8 & 1));
                     else
-                        LWU(ed, gback, gdoffset+4*(u8&3));
+                        LWU(ed, gback, gdoffset + 4 * (u8 & 3));
                     if (wback) {
                         SDxw(ed, wback, fixedaddress);
                         SMWRITE2();
@@ -994,7 +1040,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     GETGX();
                     GETED(1);
                     u8 = F8;
-                    SB(ed, gback, gdoffset+u8&0xF);
+                    SB(ed, gback, gdoffset + u8 & 0xF);
                     break;
                 case 0x21:
                     INST_NAME("INSERTPS GX, EX, Ib");
@@ -1002,14 +1048,17 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     GETGX();
                     GETEX(x2, 1);
                     u8 = F8;
-                    if(MODREG) s8 = (u8>>6)&3; else s8 = 0;
+                    if (MODREG)
+                        s8 = (u8 >> 6) & 3;
+                    else
+                        s8 = 0;
                     // GX->ud[(tmp8u>>4)&3] = EX->ud[tmp8s];
-                    LWU(x3, wback, fixedaddress+4*s8);
-                    SW(x3, gback, gdoffset+4*(u8>>4));
-                    for(int i=0; i<4; ++i) {
-                        if(u8&(1<<i))
+                    LWU(x3, wback, fixedaddress + 4 * s8);
+                    SW(x3, gback, gdoffset + 4 * (u8 >> 4));
+                    for (int i = 0; i < 4; ++i) {
+                        if (u8 & (1 << i))
                             // GX->ud[i] = 0;
-                            SW(xZR, gback, gdoffset+4*i);
+                            SW(xZR, gback, gdoffset + 4 * i);
                     }
                     break;
                 case 0x22:
@@ -1018,10 +1067,10 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     GETGX();
                     GETED(1);
                     u8 = F8;
-                    if(rex.w) {
-                        SD(ed, gback, gdoffset+8*(u8&0x1));
+                    if (rex.w) {
+                        SD(ed, gback, gdoffset + 8 * (u8 & 0x1));
                     } else {
-                        SW(ed, gback, gdoffset+4*(u8&0x3));
+                        SW(ed, gback, gdoffset + 4 * (u8 & 0x3));
                     }
                     break;
                 case 0x44:
@@ -1030,15 +1079,15 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     GETG;
                     sse_forget_reg(dyn, ninst, gd);
                     MOV32w(x1, gd); // gx
-                    if(MODREG) {
-                        ed = (nextop&7)+(rex.b<<3);
+                    if (MODREG) {
+                        ed = (nextop & 7) + (rex.b << 3);
                         sse_forget_reg(dyn, ninst, ed);
                         MOV32w(x2, ed);
-                        MOV32w(x3, 0);  // p = NULL
+                        MOV32w(x3, 0); // p = NULL
                     } else {
                         MOV32w(x2, 0);
                         addr = geted(dyn, addr, ninst, nextop, &ed, x3, x5, &fixedaddress, rex, NULL, 0, 1);
-                        if(ed!=x3) {
+                        if (ed != x3) {
                             MV(x3, ed);
                         }
                     }
@@ -1047,20 +1096,20 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     CALL(native_pclmul, -1);
                     break;
                 case 0xDF:
-                    INST_NAME("AESKEYGENASSIST Gx, Ex, Ib");  // AES-NI
+                    INST_NAME("AESKEYGENASSIST Gx, Ex, Ib"); // AES-NI
                     nextop = F8;
                     GETG;
                     sse_forget_reg(dyn, ninst, gd);
                     MOV32w(x1, gd); // gx
-                    if(MODREG) {
-                        ed = (nextop&7)+(rex.b<<3);
+                    if (MODREG) {
+                        ed = (nextop & 7) + (rex.b << 3);
                         sse_forget_reg(dyn, ninst, ed);
                         MOV32w(x2, ed);
-                        MOV32w(x3, 0);  //p = NULL
+                        MOV32w(x3, 0); // p = NULL
                     } else {
                         MOV32w(x2, 0);
                         addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 1);
-                        if(ed!=x3) {
+                        if (ed != x3) {
                             MV(x3, ed);
                         }
                     }
@@ -1068,41 +1117,41 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     MOV32w(x4, u8);
                     CALL(native_aeskeygenassist, -1);
                     break;
-            default:
+                default:
                     DEFAULT;
             }
             break;
-        #define GO(GETFLAGS, NO, YES, F)            \
-            READFLAGS(F);                           \
-            GETFLAGS;                               \
-            nextop=F8;                              \
-            GETGD;                                  \
-            if(MODREG) {                            \
-                ed = xRAX+(nextop&7)+(rex.b<<3);    \
-                ZEXTH(x4, ed);                      \
-                ed = x4;                            \
-            } else {                                \
-                SMREAD();                           \
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); \
-                LHU(x4, ed, fixedaddress);          \
-                ed = x4;                            \
-            }                                       \
-            B##NO(x1, 4+3*4);                       \
-            LUI(x3, 0xffff0);                       \
-            AND(gd, gd, x3);                        \
-            OR(gd, gd, ed);
+#define GO(GETFLAGS, NO, YES, F)                                                             \
+    READFLAGS(F);                                                                            \
+    GETFLAGS;                                                                                \
+    nextop = F8;                                                                             \
+    GETGD;                                                                                   \
+    if (MODREG) {                                                                            \
+        ed = xRAX + (nextop & 7) + (rex.b << 3);                                             \
+        ZEXTH(x4, ed);                                                                       \
+        ed = x4;                                                                             \
+    } else {                                                                                 \
+        SMREAD();                                                                            \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); \
+        LHU(x4, ed, fixedaddress);                                                           \
+        ed = x4;                                                                             \
+    }                                                                                        \
+    B##NO(x1, 4 + 3 * 4);                                                                    \
+    LUI(x3, 0xffff0);                                                                        \
+    AND(gd, gd, x3);                                                                         \
+    OR(gd, gd, ed);
 
-        GOCOND(0x40, "CMOV", "Gw, Ew");
-        #undef GO
+            GOCOND(0x40, "CMOV", "Gw, Ew");
+#undef GO
         case 0x50:
             INST_NAME("PMOVMSKD Gd, Ex");
             nextop = F8;
             GETGD;
             GETEX(x1, 0);
             MV(gd, xZR);
-            for(int i=0; i<2; ++i) {
+            for (int i = 0; i < 2; ++i) {
                 // GD->dword[0] |= ((EX->q[i]>>63)&1)<<i;
-                LD(x2, wback, fixedaddress+8*i);
+                LD(x2, wback, fixedaddress + 8 * i);
                 SRLI(x2, x2, 63);
                 if (i) SLLI(x2, x2, 1);
                 OR(gd, gd, x2);
@@ -1114,21 +1163,21 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
-            if(!box64_dynarec_fastnan) {
+            if (!box64_dynarec_fastnan) {
                 d1 = fpu_get_scratch(dyn);
                 FMVDX(d1, xZR);
             }
-            for (int i=0; i<2; ++i) {
-                FLD(d0, wback, fixedaddress+i*8);
-                if(!box64_dynarec_fastnan) {
+            for (int i = 0; i < 2; ++i) {
+                FLD(d0, wback, fixedaddress + i * 8);
+                if (!box64_dynarec_fastnan) {
                     FLTD(x3, d0, d1);
                 }
                 FSQRTD(d0, d0);
-                if(!box64_dynarec_fastnan) {
+                if (!box64_dynarec_fastnan) {
                     BEQ(x3, xZR, 8);
                     FNEGD(d0, d0);
                 }
-                FSD(d0, gback, gdoffset+i*8);
+                FSD(d0, gback, gdoffset + i * 8);
             }
             break;
         case 0x54:
@@ -1165,12 +1214,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x1, 0);
             GETGX();
             SSE_LOOP_FQ(x3, x4, {
-                if(!box64_dynarec_fastnan) {
+                if (!box64_dynarec_fastnan) {
                     FEQD(x3, v0, v0);
                     FEQD(x4, v1, v1);
                 }
                 FADDD(v0, v0, v1);
-                if(!box64_dynarec_fastnan) {
+                if (!box64_dynarec_fastnan) {
                     AND(x3, x3, x4);
                     BEQZ(x3, 16);
                     FEQD(x3, v0, v0);
@@ -1185,12 +1234,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x1, 0);
             GETGX();
             SSE_LOOP_FQ(x3, x4, {
-                if(!box64_dynarec_fastnan) {
+                if (!box64_dynarec_fastnan) {
                     FEQD(x3, v0, v0);
                     FEQD(x4, v1, v1);
                 }
                 FMULD(v0, v0, v1);
-                if(!box64_dynarec_fastnan) {
+                if (!box64_dynarec_fastnan) {
                     AND(x3, x3, x4);
                     BEQZ(x3, 16);
                     FEQD(x3, v0, v0);
@@ -1206,15 +1255,15 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             // GX->f[0] = EX->d[0];
-            FLD(d0, wback, fixedaddress+0);
+            FLD(d0, wback, fixedaddress + 0);
             FCVTSD(d0, d0);
-            FSD(d0, gback, gdoffset+0);
+            FSD(d0, gback, gdoffset + 0);
             // GX->f[1] = EX->d[1];
-            FLD(d0, wback, fixedaddress+8);
+            FLD(d0, wback, fixedaddress + 8);
             FCVTSD(d0, d0);
-            FSD(d0, gback, gdoffset+4);
+            FSD(d0, gback, gdoffset + 4);
             // GX->q[1] = 0;
-            SD(xZR, gback, gdoffset+8);
+            SD(xZR, gback, gdoffset + 8);
             break;
         case 0x5B:
             INST_NAME("CVTPS2DQ Gx, Ex");
@@ -1223,14 +1272,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             u8 = sse_setround(dyn, ninst, x6, x4);
-            for (int i=0; i<4; ++i) {
-                FLW(d0, wback, fixedaddress+4*i);
+            for (int i = 0; i < 4; ++i) {
+                FLW(d0, wback, fixedaddress + 4 * i);
                 FCVTLS(x3, d0, RD_DYN);
                 SEXT_W(x5, x3);
                 SUB(x5, x5, x3);
                 BEQZ(x5, 8);
                 LUI(x3, 0x80000); // INT32_MIN
-                SW(x3, gback, gdoffset+4*i);
+                SW(x3, gback, gdoffset + 4 * i);
             }
             x87_restoreround(dyn, ninst, u8);
             break;
@@ -1240,12 +1289,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x1, 0);
             GETGX();
             SSE_LOOP_FQ(x3, x4, {
-                if(!box64_dynarec_fastnan) {
+                if (!box64_dynarec_fastnan) {
                     FEQD(x3, v0, v0);
                     FEQD(x4, v1, v1);
                 }
                 FSUBD(v0, v0, v1);
-                if(!box64_dynarec_fastnan) {
+                if (!box64_dynarec_fastnan) {
                     AND(x3, x3, x4);
                     BEQZ(x3, 16);
                     FEQD(x3, v0, v0);
@@ -1261,16 +1310,16 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
-            for (int i=0; i<2; ++i) {
-                FLD(d0, gback, gdoffset+8*i);
-                FLD(d1, wback, fixedaddress+8*i);
+            for (int i = 0; i < 2; ++i) {
+                FLD(d0, gback, gdoffset + 8 * i);
+                FLD(d1, wback, fixedaddress + 8 * i);
                 FEQD(x3, d0, d0);
                 FEQD(x4, d1, d1);
                 AND(x3, x3, x4);
                 BEQ(x3, xZR, 12);
                 FLTD(x3, d1, d0);
                 BEQ(x3, xZR, 8); // continue
-                FSD(d1, gback, gdoffset+8*i);
+                FSD(d1, gback, gdoffset + 8 * i);
             }
             break;
         case 0x5E:
@@ -1279,12 +1328,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x1, 0);
             GETGX();
             SSE_LOOP_FQ(x3, x4, {
-                if(!box64_dynarec_fastnan) {
+                if (!box64_dynarec_fastnan) {
                     FEQD(x3, v0, v0);
                     FEQD(x4, v1, v1);
                 }
                 FDIVD(v0, v0, v1);
-                if(!box64_dynarec_fastnan) {
+                if (!box64_dynarec_fastnan) {
                     AND(x3, x3, x4);
                     BEQZ(x3, 16);
                     FEQD(x3, v0, v0);
@@ -1300,39 +1349,39 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
-            for (int i=0; i<2; ++i) {
-                FLD(d0, gback, gdoffset+8*i);
-                FLD(d1, wback, fixedaddress+8*i);
+            for (int i = 0; i < 2; ++i) {
+                FLD(d0, gback, gdoffset + 8 * i);
+                FLD(d1, wback, fixedaddress + 8 * i);
                 FEQD(x3, d0, d0);
                 FEQD(x4, d1, d1);
                 AND(x3, x3, x4);
                 BEQ(x3, xZR, 12);
                 FLTD(x3, d0, d1);
                 BEQ(x3, xZR, 8); // continue
-                FSD(d1, gback, gdoffset+8*i);
+                FSD(d1, gback, gdoffset + 8 * i);
             }
             break;
         case 0x60:
             INST_NAME("PUNPCKLBW Gx,Ex");
             nextop = F8;
             GETGX();
-            for(int i=7; i>0; --i) { // 0 is untouched
+            for (int i = 7; i > 0; --i) { // 0 is untouched
                 // GX->ub[2 * i] = GX->ub[i];
-                LBU(x3, gback, gdoffset+i);
-                SB(x3, gback, gdoffset+2*i);
+                LBU(x3, gback, gdoffset + i);
+                SB(x3, gback, gdoffset + 2 * i);
             }
-            if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
-                for(int i=0; i<8; ++i) {
+            if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
+                for (int i = 0; i < 8; ++i) {
                     // GX->ub[2 * i + 1] = GX->ub[2 * i];
-                    LBU(x3, gback, gdoffset+2*i);
-                    SB(x3, gback, gdoffset+2*i+1);
+                    LBU(x3, gback, gdoffset + 2 * i);
+                    SB(x3, gback, gdoffset + 2 * i + 1);
                 }
             } else {
                 GETEX(x1, 0);
-                for(int i=0; i<8; ++i) {
+                for (int i = 0; i < 8; ++i) {
                     // GX->ub[2 * i + 1] = EX->ub[i];
-                    LBU(x3, wback, fixedaddress+i);
-                    SB(x3, gback, gdoffset+2*i+1);
+                    LBU(x3, wback, fixedaddress + i);
+                    SB(x3, gback, gdoffset + 2 * i + 1);
                 }
             }
             break;
@@ -1340,23 +1389,23 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("PUNPCKLWD Gx,Ex");
             nextop = F8;
             GETGX();
-            for(int i=3; i>0; --i) {
+            for (int i = 3; i > 0; --i) {
                 // GX->uw[2 * i] = GX->uw[i];
-                LHU(x3, gback, gdoffset+i*2);
-                SH(x3, gback, gdoffset+2*i*2);
+                LHU(x3, gback, gdoffset + i * 2);
+                SH(x3, gback, gdoffset + 2 * i * 2);
             }
-            if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
-                for(int i=0; i<4; ++i) {
+            if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
+                for (int i = 0; i < 4; ++i) {
                     // GX->uw[2 * i + 1] = GX->uw[2 * i];
-                    LHU(x3, gback, gdoffset+2*i*2);
-                    SH(x3, gback, gdoffset+(2*i+1)*2);
+                    LHU(x3, gback, gdoffset + 2 * i * 2);
+                    SH(x3, gback, gdoffset + (2 * i + 1) * 2);
                 }
             } else {
                 GETEX(x1, 0);
-                for(int i=0; i<4; ++i) {
+                for (int i = 0; i < 4; ++i) {
                     // GX->uw[2 * i + 1] = EX->uw[i];
-                    LHU(x3, wback, fixedaddress+i*2);
-                    SH(x3, gback, gdoffset+(2*i+1)*2);
+                    LHU(x3, wback, fixedaddress + i * 2);
+                    SH(x3, gback, gdoffset + (2 * i + 1) * 2);
                 }
             }
             break;
@@ -1366,14 +1415,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x1, 0);
             GETGX();
             // GX->ud[3] = EX->ud[1];
-            LWU(x3, wback, fixedaddress+1*4);
-            SW(x3, gback, gdoffset+3*4);
+            LWU(x3, wback, fixedaddress + 1 * 4);
+            SW(x3, gback, gdoffset + 3 * 4);
             // GX->ud[2] = GX->ud[1];
-            LWU(x3, gback, gdoffset+1*4);
-            SW(x3, gback, gdoffset+2*4);
+            LWU(x3, gback, gdoffset + 1 * 4);
+            SW(x3, gback, gdoffset + 2 * 4);
             // GX->ud[1] = EX->ud[0];
-            LWU(x3, wback, fixedaddress+0*4);
-            SW(x3, gback, gdoffset+1*4);
+            LWU(x3, wback, fixedaddress + 0 * 4);
+            SW(x3, gback, gdoffset + 1 * 4);
             break;
         case 0x63:
             INST_NAME("PACKSSWB Gx, Ex");
@@ -1382,48 +1431,49 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x2, 0);
             MOV64x(x5, 127);
             MOV64x(x6, -128);
-            for(int i=0; i<8; ++i) {
-                LH(x3, gback, gdoffset+i*2);
-                if(rv64_zbb) {
+            for (int i = 0; i < 8; ++i) {
+                LH(x3, gback, gdoffset + i * 2);
+                if (rv64_zbb) {
                     MIN(x3, x3, x5);
                     MAX(x3, x3, x6);
                 } else {
-                    BLT(x3, x5, 4+4);
+                    BLT(x3, x5, 4 + 4);
                     MV(x3, x5);
-                    BGE(x3, x6, 4+4);
+                    BGE(x3, x6, 4 + 4);
                     MV(x3, x6);
                 }
-                SB(x3, gback, gdoffset+i);
-            }
-            if(MODREG && gd==ed) {
-                LD(x3, gback, gdoffset+0);
-                SD(x3, gback, gdoffset+8);
-            } else for(int i=0; i<8; ++i) {
-                LH(x3, wback, fixedaddress+i*2);
-                if(rv64_zbb) {
-                    MIN(x3, x3, x5);
-                    MAX(x3, x3, x6);
-                } else {
-                    BLT(x3, x5, 4+4);
-                    MV(x3, x5);
-                    BGE(x3, x6, 4+4);
-                    MV(x3, x6);
+                SB(x3, gback, gdoffset + i);
+            }
+            if (MODREG && gd == ed) {
+                LD(x3, gback, gdoffset + 0);
+                SD(x3, gback, gdoffset + 8);
+            } else
+                for (int i = 0; i < 8; ++i) {
+                    LH(x3, wback, fixedaddress + i * 2);
+                    if (rv64_zbb) {
+                        MIN(x3, x3, x5);
+                        MAX(x3, x3, x6);
+                    } else {
+                        BLT(x3, x5, 4 + 4);
+                        MV(x3, x5);
+                        BGE(x3, x6, 4 + 4);
+                        MV(x3, x6);
+                    }
+                    SB(x3, gback, gdoffset + 8 + i);
                 }
-                SB(x3, gback, gdoffset+8+i);
-            }
             break;
         case 0x64:
             INST_NAME("PCMPGTB Gx,Ex");
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<16; ++i) {
+            for (int i = 0; i < 16; ++i) {
                 // GX->ub[i] = (GX->sb[i]>EX->sb[i])?0xFF:0x00;
-                LB(x3, wback, fixedaddress+i);
-                LB(x4, gback, gdoffset+i);
+                LB(x3, wback, fixedaddress + i);
+                LB(x4, gback, gdoffset + i);
                 SLT(x3, x3, x4);
                 NEG(x3, x3);
-                SB(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset + i);
             }
             break;
         case 0x65:
@@ -1431,13 +1481,13 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<8; ++i) {
+            for (int i = 0; i < 8; ++i) {
                 // GX->uw[i] = (GX->sw[i]>EX->sw[i])?0xFFFF:0x0000;
-                LH(x3, wback, fixedaddress+i*2);
-                LH(x4, gback, gdoffset+i*2);
+                LH(x3, wback, fixedaddress + i * 2);
+                LH(x4, gback, gdoffset + i * 2);
                 SLT(x3, x3, x4);
                 NEG(x3, x3);
-                SH(x3, gback, gdoffset+i*2);
+                SH(x3, gback, gdoffset + i * 2);
             }
             break;
         case 0x66:
@@ -1452,31 +1502,31 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             ADDI(x5, xZR, 0xFF);
-            for(int i=0; i<8; ++i) {
+            for (int i = 0; i < 8; ++i) {
                 // GX->ub[i] = (GX->sw[i]<0)?0:((GX->sw[i]>0xff)?0xff:GX->sw[i]);
-                LH(x3, gback, gdoffset+i*2);
+                LH(x3, gback, gdoffset + i * 2);
                 BGE(x5, x3, 8);
                 ADDI(x3, xZR, 0xFF);
                 NOT(x4, x3);
                 SRAI(x4, x4, 63);
                 AND(x3, x3, x4);
-                SB(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset + i);
             }
-            if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
+            if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
                 // GX->q[1] = GX->q[0];
-                LD(x3, gback, gdoffset+0*8);
-                SD(x3, gback, gdoffset+1*8);
+                LD(x3, gback, gdoffset + 0 * 8);
+                SD(x3, gback, gdoffset + 1 * 8);
             } else {
                 GETEX(x1, 0);
-                for(int i=0; i<8; ++i) {
+                for (int i = 0; i < 8; ++i) {
                     // GX->ub[8+i] = (EX->sw[i]<0)?0:((EX->sw[i]>0xff)?0xff:EX->sw[i]);
-                    LH(x3, wback, fixedaddress+i*2);
+                    LH(x3, wback, fixedaddress + i * 2);
                     BGE(x5, x3, 8);
                     ADDI(x3, xZR, 0xFF);
                     NOT(x4, x3);
                     SRAI(x4, x4, 63);
                     AND(x3, x3, x4);
-                    SB(x3, gback, gdoffset+8+i);
+                    SB(x3, gback, gdoffset + 8 + i);
                 }
             }
             break;
@@ -1484,23 +1534,23 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("PUNPCKHBW Gx,Ex");
             nextop = F8;
             GETGX();
-            for(int i=0; i<8; ++i) {
+            for (int i = 0; i < 8; ++i) {
                 // GX->ub[2 * i] = GX->ub[i + 8];
-                LBU(x3, gback, gdoffset+i+8);
-                SB(x3, gback, gdoffset+2*i);
+                LBU(x3, gback, gdoffset + i + 8);
+                SB(x3, gback, gdoffset + 2 * i);
             }
-            if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
-                for(int i=0; i<8; ++i) {
+            if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
+                for (int i = 0; i < 8; ++i) {
                     // GX->ub[2 * i + 1] = GX->ub[2 * i];
-                    LBU(x3, gback, gdoffset+2*i);
-                    SB(x3, gback, gdoffset+2*i+1);
+                    LBU(x3, gback, gdoffset + 2 * i);
+                    SB(x3, gback, gdoffset + 2 * i + 1);
                 }
             } else {
                 GETEX(x2, 0);
-                for(int i=0; i<8; ++i) {
+                for (int i = 0; i < 8; ++i) {
                     // GX->ub[2 * i + 1] = EX->ub[i + 8];
-                    LBU(x3, wback, fixedaddress+i+8);
-                    SB(x3, gback, gdoffset+2*i+1);
+                    LBU(x3, wback, fixedaddress + i + 8);
+                    SB(x3, gback, gdoffset + 2 * i + 1);
                 }
             }
             break;
@@ -1508,23 +1558,23 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("PUNPCKHWD Gx,Ex");
             nextop = F8;
             GETGX();
-            for(int i=0; i<4; ++i) {
+            for (int i = 0; i < 4; ++i) {
                 // GX->uw[2 * i] = GX->uw[i + 4];
-                LHU(x3, gback, gdoffset+(i+4)*2);
-                SH(x3, gback, gdoffset+2*i*2);
+                LHU(x3, gback, gdoffset + (i + 4) * 2);
+                SH(x3, gback, gdoffset + 2 * i * 2);
             }
-            if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
-                for(int i=0; i<4; ++i) {
+            if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
+                for (int i = 0; i < 4; ++i) {
                     // GX->uw[2 * i + 1] = GX->uw[2 * i];
-                    LHU(x3, gback, gdoffset+2*i*2);
-                    SH(x3, gback, gdoffset+(2*i+1)*2);
+                    LHU(x3, gback, gdoffset + 2 * i * 2);
+                    SH(x3, gback, gdoffset + (2 * i + 1) * 2);
                 }
             } else {
                 GETEX(x1, 0);
-                for(int i=0; i<4; ++i) {
+                for (int i = 0; i < 4; ++i) {
                     // GX->uw[2 * i + 1] = EX->uw[i + 4];
-                    LHU(x3, wback, fixedaddress+(i+4)*2);
-                    SH(x3, gback, gdoffset+(2*i+1)*2);
+                    LHU(x3, wback, fixedaddress + (i + 4) * 2);
+                    SH(x3, gback, gdoffset + (2 * i + 1) * 2);
                 }
             }
             break;
@@ -1534,18 +1584,18 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x1, 0);
             GETGX();
             // GX->ud[0] = GX->ud[2];
-            LWU(x3, gback, gdoffset+2*4);
-            SW(x3, gback, gdoffset+0*4);
+            LWU(x3, gback, gdoffset + 2 * 4);
+            SW(x3, gback, gdoffset + 0 * 4);
             // GX->ud[1] = EX->ud[2];
-            LWU(x3, wback, fixedaddress+2*4);
-            SW(x3, gback, gdoffset+1*4);
+            LWU(x3, wback, fixedaddress + 2 * 4);
+            SW(x3, gback, gdoffset + 1 * 4);
             // GX->ud[2] = GX->ud[3];
-            LWU(x3, gback, gdoffset+3*4);
-            SW(x3, gback, gdoffset+2*4);
+            LWU(x3, gback, gdoffset + 3 * 4);
+            SW(x3, gback, gdoffset + 2 * 4);
             // GX->ud[3] = EX->ud[3];
-            if (!(MODREG && (gd==ed))) {
-                LWU(x3, wback, fixedaddress+3*4);
-                SW(x3, gback, gdoffset+3*4);
+            if (!(MODREG && (gd == ed))) {
+                LWU(x3, wback, fixedaddress + 3 * 4);
+                SW(x3, gback, gdoffset + 3 * 4);
             }
             break;
         case 0x6B:
@@ -1554,29 +1604,29 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             MOV64x(x5, 32768);
             NEG(x6, x5);
-            for(int i=0; i<4; ++i) {
+            for (int i = 0; i < 4; ++i) {
                 // GX->sw[i] = (GX->sd[i]<-32768)?-32768:((GX->sd[i]>32767)?32767:GX->sd[i]);
-                LW(x3, gback, gdoffset+i*4);
-                BGE(x5, x3, 8);
+                LW(x3, gback, gdoffset + i * 4);
+                BLT(x3, x5, 8); // skip the clamp only when sd < 32768; BGE(x5, x3) also skipped sd == 32768, which SH then stored as 0x8000
                 ADDI(x3, x5, -1);
                 BGE(x3, x6, 8);
                 MV(x3, x6);
-                SH(x3, gback, gdoffset+i*2);
+                SH(x3, gback, gdoffset + i * 2);
             }
-            if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
+            if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
                 // GX->q[1] = GX->q[0];
-                LD(x3, gback, gdoffset+0*8);
-                SD(x3, gback, gdoffset+1*8);
+                LD(x3, gback, gdoffset + 0 * 8);
+                SD(x3, gback, gdoffset + 1 * 8);
             } else {
                 GETEX(x1, 0);
-                for(int i=0; i<4; ++i) {
+                for (int i = 0; i < 4; ++i) {
                     // GX->sw[4+i] = (EX->sd[i]<-32768)?-32768:((EX->sd[i]>32767)?32767:EX->sd[i]);
-                    LW(x3, wback, fixedaddress+i*4);
-                    BGE(x5, x3, 8);
+                    LW(x3, wback, fixedaddress + i * 4);
+                    BLT(x3, x5, 8); // same upper-bound fix as the GX loop: clamp to 32767 whenever sd >= 32768
                     ADDI(x3, x5, -1);
                     BGE(x3, x6, 8);
                     MV(x3, x6);
-                    SH(x3, gback, gdoffset+(4+i)*2);
+                    SH(x3, gback, gdoffset + (4 + i) * 2);
                 }
             }
             break;
@@ -1584,13 +1634,13 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("PUNPCKLQDQ Gx,Ex");
             nextop = F8;
             GETGX();
-            if(MODREG) {
-                v1 = sse_get_reg(dyn, ninst, x2, (nextop&7)+(rex.b<<3), 0);
-                FSD(v1, gback, gdoffset+8);
+            if (MODREG) {
+                v1 = sse_get_reg(dyn, ninst, x2, (nextop & 7) + (rex.b << 3), 0);
+                FSD(v1, gback, gdoffset + 8);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0);
-                LD(x3, ed, fixedaddress+0);
-                SD(x3, gback, gdoffset+8);
+                LD(x3, ed, fixedaddress + 0);
+                SD(x3, gback, gdoffset + 8);
             }
             break;
         case 0x6D:
@@ -1598,27 +1648,27 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            LD(x3, gback, gdoffset+8);
-            SD(x3, gback, gdoffset+0);
-            LD(x3, wback, fixedaddress+8);
-            SD(x3, gback, gdoffset+8);
+            LD(x3, gback, gdoffset + 8);
+            SD(x3, gback, gdoffset + 0);
+            LD(x3, wback, fixedaddress + 8);
+            SD(x3, gback, gdoffset + 8);
             break;
         case 0x6E:
             INST_NAME("MOVD Gx, Ed");
             nextop = F8;
-            if(rex.w) {
+            if (rex.w) {
                 GETGXSD_empty(v0);
             } else {
                 GETGXSS_empty(v0);
             }
             GETED(0);
-            if(rex.w) {
+            if (rex.w) {
                 FMVDX(v0, ed);
             } else {
                 FMVWX(v0, ed);
-                SW(xZR, xEmu, offsetof(x64emu_t, xmm[gd])+4);
+                SW(xZR, xEmu, offsetof(x64emu_t, xmm[gd]) + 4);
             }
-            SD(xZR, xEmu, offsetof(x64emu_t, xmm[gd])+8);
+            SD(xZR, xEmu, offsetof(x64emu_t, xmm[gd]) + 8);
             break;
         case 0x6F:
             INST_NAME("MOVDQA Gx,Ex");
@@ -1635,37 +1685,37 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             u8 = F8;
             int32_t idx;
 
-            idx = (u8>>(0*2))&3;
-            LWU(x3, wback, fixedaddress+idx*4);
-            idx = (u8>>(1*2))&3;
-            LWU(x4, wback, fixedaddress+idx*4);
-            idx = (u8>>(2*2))&3;
-            LWU(x5, wback, fixedaddress+idx*4);
-            idx = (u8>>(3*2))&3;
-            LWU(x6, wback, fixedaddress+idx*4);
+            idx = (u8 >> (0 * 2)) & 3;
+            LWU(x3, wback, fixedaddress + idx * 4);
+            idx = (u8 >> (1 * 2)) & 3;
+            LWU(x4, wback, fixedaddress + idx * 4);
+            idx = (u8 >> (2 * 2)) & 3;
+            LWU(x5, wback, fixedaddress + idx * 4);
+            idx = (u8 >> (3 * 2)) & 3;
+            LWU(x6, wback, fixedaddress + idx * 4);
 
-            SW(x3, gback, gdoffset+0*4);
-            SW(x4, gback, gdoffset+1*4);
-            SW(x5, gback, gdoffset+2*4);
-            SW(x6, gback, gdoffset+3*4);
+            SW(x3, gback, gdoffset + 0 * 4);
+            SW(x4, gback, gdoffset + 1 * 4);
+            SW(x5, gback, gdoffset + 2 * 4);
+            SW(x6, gback, gdoffset + 3 * 4);
             break;
         case 0x71:
             nextop = F8;
-            switch((nextop>>3)&7) {
+            switch ((nextop >> 3) & 7) {
                 case 2:
                     INST_NAME("PSRLW Ex, Ib");
                     GETEX(x1, 1);
                     u8 = F8;
-                    if (u8>15) {
+                    if (u8 > 15) {
                         // just zero dest
-                        SD(xZR, wback, fixedaddress+0);
-                        SD(xZR, wback, fixedaddress+8);
-                    } else if(u8) {
-                        for (int i=0; i<8; ++i) {
+                        SD(xZR, wback, fixedaddress + 0);
+                        SD(xZR, wback, fixedaddress + 8);
+                    } else if (u8) {
+                        for (int i = 0; i < 8; ++i) {
                             // EX->uw[i] >>= u8;
-                            LHU(x3, wback, fixedaddress+i*2);
+                            LHU(x3, wback, fixedaddress + i * 2);
                             SRLI(x3, x3, u8);
-                            SH(x3, wback, fixedaddress+i*2);
+                            SH(x3, wback, fixedaddress + i * 2);
                         }
                     }
                     break;
@@ -1673,13 +1723,13 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     INST_NAME("PSRAW Ex, Ib");
                     GETEX(x1, 1);
                     u8 = F8;
-                    if(u8>15) u8=15;
-                    if(u8) {
-                        for (int i=0; i<8; ++i) {
+                    if (u8 > 15) u8 = 15;
+                    if (u8) {
+                        for (int i = 0; i < 8; ++i) {
                             // EX->sw[i] >>= u8;
-                            LH(x3, wback, fixedaddress+i*2);
+                            LH(x3, wback, fixedaddress + i * 2);
                             SRAI(x3, x3, u8);
-                            SH(x3, wback, fixedaddress+i*2);
+                            SH(x3, wback, fixedaddress + i * 2);
                         }
                     }
                     break;
@@ -1687,16 +1737,16 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     INST_NAME("PSLLW Ex, Ib");
                     GETEX(x1, 1);
                     u8 = F8;
-                    if (u8>15) {
+                    if (u8 > 15) {
                         // just zero dest
-                        SD(xZR, wback, fixedaddress+0);
-                        SD(xZR, wback, fixedaddress+8);
-                    } else if(u8) {
-                        for (int i=0; i<8; ++i) {
+                        SD(xZR, wback, fixedaddress + 0);
+                        SD(xZR, wback, fixedaddress + 8);
+                    } else if (u8) {
+                        for (int i = 0; i < 8; ++i) {
                             // EX->uw[i] <<= u8;
-                            LHU(x3, wback, fixedaddress+i*2);
+                            LHU(x3, wback, fixedaddress + i * 2);
                             SLLI(x3, x3, u8);
-                            SH(x3, wback, fixedaddress+i*2);
+                            SH(x3, wback, fixedaddress + i * 2);
                         }
                     }
                     break;
@@ -1707,17 +1757,17 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             break;
         case 0x72:
             nextop = F8;
-            switch((nextop>>3)&7) {
+            switch ((nextop >> 3) & 7) {
                 case 2:
                     INST_NAME("PSRLD Ex, Ib");
                     GETEX(x1, 1);
                     u8 = F8;
-                    if(u8) {
-                        if (u8>31) {
+                    if (u8) {
+                        if (u8 > 31) {
                             // just zero dest
-                            SD(xZR, wback, fixedaddress+0);
-                            SD(xZR, wback, fixedaddress+8);
-                        } else if(u8) {
+                            SD(xZR, wback, fixedaddress + 0);
+                            SD(xZR, wback, fixedaddress + 8);
+                        } else if (u8) {
                             SSE_LOOP_D_S(x3, SRLI(x3, x3, u8));
                         }
                     }
@@ -1726,7 +1776,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     INST_NAME("PSRAD Ex, Ib");
                     GETEX(x1, 1);
                     u8 = F8;
-                    if(u8>31) u8=31;
+                    if (u8 > 31) u8 = 31;
                     if (u8) {
                         SSE_LOOP_D_S(x3, SRAIW(x3, x3, u8));
                     }
@@ -1735,12 +1785,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     INST_NAME("PSLLD Ex, Ib");
                     GETEX(x1, 1);
                     u8 = F8;
-                    if(u8) {
-                        if (u8>31) {
+                    if (u8) {
+                        if (u8 > 31) {
                             // just zero dest
-                            SD(xZR, wback, fixedaddress+0);
-                            SD(xZR, wback, fixedaddress+8);
-                        } else if(u8) {
+                            SD(xZR, wback, fixedaddress + 0);
+                            SD(xZR, wback, fixedaddress + 8);
+                        } else if (u8) {
                             SSE_LOOP_D_S(x3, SLLI(x3, x3, u8));
                         }
                     }
@@ -1751,50 +1801,50 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             break;
         case 0x73:
             nextop = F8;
-            switch((nextop>>3)&7) {
+            switch ((nextop >> 3) & 7) {
                 case 2:
                     INST_NAME("PSRLQ Ex, Ib");
                     GETEX(x1, 1);
                     u8 = F8;
-                    if(!u8) break;
-                    if(u8>63) {
+                    if (!u8) break;
+                    if (u8 > 63) {
                         // just zero dest
-                        SD(xZR, wback, fixedaddress+0);
-                        SD(xZR, wback, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress + 0);
+                        SD(xZR, wback, fixedaddress + 8);
                     } else {
-                        LD(x3, wback, fixedaddress+0);
-                        LD(x4, wback, fixedaddress+8);
+                        LD(x3, wback, fixedaddress + 0);
+                        LD(x4, wback, fixedaddress + 8);
                         SRLI(x3, x3, u8);
                         SRLI(x4, x4, u8);
-                        SD(x3, wback, fixedaddress+0);
-                        SD(x4, wback, fixedaddress+8);
+                        SD(x3, wback, fixedaddress + 0);
+                        SD(x4, wback, fixedaddress + 8);
                     }
                     break;
                 case 3:
                     INST_NAME("PSRLDQ Ex, Ib");
                     GETEX(x1, 1);
                     u8 = F8;
-                    if(!u8) break;
-                    if(u8>15) {
+                    if (!u8) break;
+                    if (u8 > 15) {
                         // just zero dest
-                        SD(xZR, wback, fixedaddress+0);
-                        SD(xZR, wback, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress + 0);
+                        SD(xZR, wback, fixedaddress + 8);
                     } else {
-                        u8*=8;
+                        u8 *= 8;
                         if (u8 < 64) {
-                            LD(x3, wback, fixedaddress+0);
-                            LD(x4, wback, fixedaddress+8);
+                            LD(x3, wback, fixedaddress + 0);
+                            LD(x4, wback, fixedaddress + 8);
                             SRLI(x3, x3, u8);
-                            SLLI(x5, x4, 64-u8);
+                            SLLI(x5, x4, 64 - u8);
                             OR(x3, x3, x5);
-                            SD(x3, wback, fixedaddress+0);
+                            SD(x3, wback, fixedaddress + 0);
                             SRLI(x4, x4, u8);
-                            SD(x4, wback, fixedaddress+8);
+                            SD(x4, wback, fixedaddress + 8);
                         } else {
-                            LD(x3, wback, fixedaddress+8);
-                            if (u8-64 > 0) { SRLI(x3, x3, u8-64); }
-                            SD(x3, wback, fixedaddress+0);
-                            SD(xZR, wback, fixedaddress+8);
+                            LD(x3, wback, fixedaddress + 8);
+                            if (u8 - 64 > 0) { SRLI(x3, x3, u8 - 64); }
+                            SD(x3, wback, fixedaddress + 0);
+                            SD(xZR, wback, fixedaddress + 8);
                         }
                     }
                     break;
@@ -1802,45 +1852,45 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     INST_NAME("PSLLQ Ex, Ib");
                     GETEX(x1, 1);
                     u8 = F8;
-                    if(!u8) break;
-                    if(u8>63) {
+                    if (!u8) break;
+                    if (u8 > 63) {
                         // just zero dest
-                        SD(xZR, wback, fixedaddress+0);
-                        SD(xZR, wback, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress + 0);
+                        SD(xZR, wback, fixedaddress + 8);
                     } else {
-                        LD(x3, wback, fixedaddress+0);
-                        LD(x4, wback, fixedaddress+8);
+                        LD(x3, wback, fixedaddress + 0);
+                        LD(x4, wback, fixedaddress + 8);
                         SLLI(x3, x3, u8);
                         SLLI(x4, x4, u8);
-                        SD(x3, wback, fixedaddress+0);
-                        SD(x4, wback, fixedaddress+8);
+                        SD(x3, wback, fixedaddress + 0);
+                        SD(x4, wback, fixedaddress + 8);
                     }
                     break;
                 case 7:
                     INST_NAME("PSLLDQ Ex, Ib");
                     GETEX(x1, 1);
                     u8 = F8;
-                    if(!u8) break;
-                    if(u8>15) {
+                    if (!u8) break;
+                    if (u8 > 15) {
                         // just zero dest
-                        SD(xZR, wback, fixedaddress+0);
-                        SD(xZR, wback, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress + 0);
+                        SD(xZR, wback, fixedaddress + 8);
                     } else {
-                        u8*=8;
+                        u8 *= 8;
                         if (u8 < 64) {
-                            LD(x3, wback, fixedaddress+0);
-                            LD(x4, wback, fixedaddress+8);
+                            LD(x3, wback, fixedaddress + 0);
+                            LD(x4, wback, fixedaddress + 8);
                             SLLI(x4, x4, u8);
-                            SRLI(x5, x3, 64-u8);
+                            SRLI(x5, x3, 64 - u8);
                             OR(x4, x4, x5);
-                            SD(x4, wback, fixedaddress+8);
+                            SD(x4, wback, fixedaddress + 8);
                             SLLI(x3, x3, u8);
-                            SD(x3, wback, fixedaddress+0);
+                            SD(x3, wback, fixedaddress + 0);
                         } else {
-                            LD(x3, wback, fixedaddress+0);
-                            if (u8-64 > 0) { SLLI(x3, x3, u8-64); }
-                            SD(x3, wback, fixedaddress+8);
-                            SD(xZR, wback, fixedaddress+0);
+                            LD(x3, wback, fixedaddress + 0);
+                            if (u8 - 64 > 0) { SLLI(x3, x3, u8 - 64); }
+                            SD(x3, wback, fixedaddress + 8);
+                            SD(xZR, wback, fixedaddress + 0);
                         }
                     }
                     break;
@@ -1853,13 +1903,13 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for (int i=0; i<16; ++i) {
-                LBU(x3, gback, gdoffset+i);
-                LBU(x4, wback, fixedaddress+i);
+            for (int i = 0; i < 16; ++i) {
+                LBU(x3, gback, gdoffset + i);
+                LBU(x4, wback, fixedaddress + i);
                 SUB(x3, x3, x4);
                 SEQZ(x3, x3);
                 NEG(x3, x3);
-                SB(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset + i);
             }
             break;
         case 0x75:
@@ -1882,63 +1932,63 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
-            FLD(d0, gback, gdoffset+0);
-            FLD(d1, gback, gdoffset+8);
-            if(!box64_dynarec_fastnan) {
+            FLD(d0, gback, gdoffset + 0);
+            FLD(d1, gback, gdoffset + 8);
+            if (!box64_dynarec_fastnan) {
                 FEQD(x3, d0, d0);
                 FEQD(x4, d1, d1);
                 AND(x3, x3, x4);
             }
             FADDD(d0, d0, d1);
-            if(!box64_dynarec_fastnan) {
+            if (!box64_dynarec_fastnan) {
                 FEQD(x4, d0, d0);
                 BEQZ(x3, 12);
                 BNEZ(x4, 8);
                 FNEGD(d0, d0);
             }
-            FSD(d0, gback, gdoffset+0);
-            if(MODREG && gd==(nextop&7)+(rex.b<<3)) {
-                FSD(d0, gback, gdoffset+8);
+            FSD(d0, gback, gdoffset + 0);
+            if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
+                FSD(d0, gback, gdoffset + 8);
             } else {
                 GETEX(x2, 0);
-                FLD(d0, wback, fixedaddress+0);
-                FLD(d1, wback, fixedaddress+8);
-                if(!box64_dynarec_fastnan) {
+                FLD(d0, wback, fixedaddress + 0);
+                FLD(d1, wback, fixedaddress + 8);
+                if (!box64_dynarec_fastnan) {
                     FEQD(x3, d0, d0);
                     FEQD(x4, d1, d1);
                     AND(x3, x3, x4);
                 }
                 FADDD(d0, d0, d1);
-                if(!box64_dynarec_fastnan) {
+                if (!box64_dynarec_fastnan) {
                     FEQD(x4, d0, d0);
                     BEQZ(x3, 12);
                     BNEZ(x4, 8);
                     FNEGD(d0, d0);
                 }
-                FSD(d0, gback, gdoffset+8);
+                FSD(d0, gback, gdoffset + 8);
             }
             break;
         case 0x7E:
             INST_NAME("MOVD Ed,Gx");
             nextop = F8;
             GETGX();
-            if(rex.w) {
-                if(MODREG) {
-                    ed = xRAX + (nextop&7) + (rex.b<<3);
-                    LD(ed, gback, gdoffset+0);
+            if (rex.w) {
+                if (MODREG) {
+                    ed = xRAX + (nextop & 7) + (rex.b << 3);
+                    LD(ed, gback, gdoffset + 0);
                 } else {
                     addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0);
-                    LD(x3, gback, gdoffset+0);
+                    LD(x3, gback, gdoffset + 0);
                     SD(x3, ed, fixedaddress);
                     SMWRITE2();
                 }
             } else {
-                if(MODREG) {
-                    ed = xRAX + (nextop&7) + (rex.b<<3);
-                    LWU(ed, gback, gdoffset+0);
+                if (MODREG) {
+                    ed = xRAX + (nextop & 7) + (rex.b << 3);
+                    LWU(ed, gback, gdoffset + 0);
                 } else {
                     addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0);
-                    LWU(x3, gback, gdoffset+0);
+                    LWU(x3, gback, gdoffset + 0);
                     SW(x3, ed, fixedaddress);
                     SMWRITE2();
                 }
@@ -1950,7 +2000,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q2(x3);
-            if(!MODREG) SMWRITE2();
+            if (!MODREG) SMWRITE2();
             break;
         case 0xAF:
             INST_NAME("IMUL Gw,Ew");
@@ -1969,17 +2019,17 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("MOVSX Gw, Eb");
             nextop = F8;
             GETGD;
-            if(MODREG) {
-                if(rex.rex) {
-                    ed = xRAX+(nextop&7)+(rex.b<<3);
-                    eb1=ed;
-                    eb2=0;
+            if (MODREG) {
+                if (rex.rex) {
+                    ed = xRAX + (nextop & 7) + (rex.b << 3);
+                    eb1 = ed;
+                    eb2 = 0;
                 } else {
-                    ed = (nextop&7);
-                    eb1 = xRAX+(ed&3);  // Ax, Cx, Dx or Bx
-                    eb2 = (ed&4)>>2;    // L or H
+                    ed = (nextop & 7);
+                    eb1 = xRAX + (ed & 3); // Ax, Cx, Dx or Bx
+                    eb2 = (ed & 4) >> 2;   // L or H
                 }
-                SLLI(x1, eb1, 56-eb2*8);
+                SLLI(x1, eb1, 56 - eb2 * 8);
                 SRAI(x1, x1, 56);
             } else {
                 SMREAD();
@@ -2000,12 +2050,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             u8 = F8;
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
-            for(int i=0; i<2; ++i) {
-                FLD(d0, gback, gdoffset+8*i);
-                FLD(d1, wback, fixedaddress+8*i);
-                if ((u8&7) == 0) {                                      // Equal
+            for (int i = 0; i < 2; ++i) {
+                FLD(d0, gback, gdoffset + 8 * i);
+                FLD(d1, wback, fixedaddress + 8 * i);
+                if ((u8 & 7) == 0) { // Equal
                     FEQD(x3, d0, d1);
-                } else if ((u8&7) == 4) {                               // Not Equal or unordered
+                } else if ((u8 & 7) == 4) { // Not Equal or unordered
                     FEQD(x3, d0, d1);
                     XORI(x3, x3, 1);
                 } else {
@@ -2014,33 +2064,39 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     FEQD(x3, d1, d1);
                     AND(x3, x3, x4);
 
-                    switch(u8&7) {
-                    case 1: BEQ_MARK(x3, xZR); FLTD(x3, d0, d1); break; // Less than
-                    case 2: BEQ_MARK(x3, xZR); FLED(x3, d0, d1); break; // Less or equal
-                    case 3: XORI(x3, x3, 1); break;                     // NaN
-                    case 5: {                                           // Greater or equal or unordered
-                        BEQ(x3, xZR, 12); // MARK2
-                        FLED(x3, d1, d0);
-                        J(8); // MARK;
-                        break;
-                    }
-                    case 6: {                                           // Greater or unordered
-                        BEQ(x3, xZR, 12); // MARK2
-                        FLTD(x3, d1, d0);
-                        J(8); // MARK;
-                        break;
-                    }
-                    case 7: break;                                      // Not NaN
+                    switch (u8 & 7) {
+                        case 1:
+                            BEQ_MARK(x3, xZR);
+                            FLTD(x3, d0, d1);
+                            break; // Less than
+                        case 2:
+                            BEQ_MARK(x3, xZR);
+                            FLED(x3, d0, d1);
+                            break;                      // Less or equal
+                        case 3: XORI(x3, x3, 1); break; // NaN
+                        case 5: {                       // Greater or equal or unordered
+                            BEQ(x3, xZR, 12);           // MARK2
+                            FLED(x3, d1, d0);
+                            J(8); // MARK;
+                            break;
+                        }
+                        case 6: {             // Greater or unordered
+                            BEQ(x3, xZR, 12); // MARK2
+                            FLTD(x3, d1, d0);
+                            J(8); // MARK;
+                            break;
+                        }
+                        case 7: break; // Not NaN
                     }
 
                     // MARK2;
-                    if ((u8&7) == 5 || (u8&7) == 6) {
+                    if ((u8 & 7) == 5 || (u8 & 7) == 6) {
                         MOV32w(x3, 1);
                     }
                     // MARK;
                 }
                 NEG(x3, x3);
-                SD(x3, gback, gdoffset+8*i);
+                SD(x3, gback, gdoffset + 8 * i);
             }
             break;
         case 0xC4:
@@ -2049,7 +2105,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETED(1);
             GETGX();
             u8 = (F8)&7;
-            SH(ed, gback, gdoffset+u8*2);
+            SH(ed, gback, gdoffset + u8 * 2);
             break;
         case 0xC5:
             INST_NAME("PEXTRW Gd,Ex,Ib");
@@ -2057,7 +2113,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGD;
             GETEX(x1, 0);
             u8 = (F8)&7;
-            LHU(gd, wback, fixedaddress+u8*2);
+            LHU(gd, wback, fixedaddress + u8 * 2);
             break;
         case 0xC6:
             INST_NAME("SHUFPD Gx, Ex, Ib");
@@ -2065,15 +2121,15 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 1);
             u8 = F8;
-            if (MODREG && gd==(nextop&7)+(rex.b<<3) && u8==0) {
-                LD(x3, gback, gdoffset+0);
-                SD(x3, gback, gdoffset+8);
+            if (MODREG && gd == (nextop & 7) + (rex.b << 3) && u8 == 0) {
+                LD(x3, gback, gdoffset + 0);
+                SD(x3, gback, gdoffset + 8);
                 break;
             }
-            LD(x3, gback, gdoffset+8*(u8&1));
-            LD(x4, wback, fixedaddress+8*((u8>>1)&1));
-            SD(x3, gback, gdoffset+0);
-            SD(x4, gback, gdoffset+8);
+            LD(x3, gback, gdoffset + 8 * (u8 & 1));
+            LD(x4, wback, fixedaddress + 8 * ((u8 >> 1) & 1));
+            SD(x3, gback, gdoffset + 0);
+            SD(x4, gback, gdoffset + 8);
             break;
         case 0xD1:
             INST_NAME("PSRLW Gx,Ex");
@@ -2083,14 +2139,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             LD(x3, wback, fixedaddress);
             ADDI(x4, xZR, 16);
             BLTU_MARK(x3, x4);
-            SD(xZR, gback, gdoffset+0);
-            SD(xZR, gback, gdoffset+8);
+            SD(xZR, gback, gdoffset + 0);
+            SD(xZR, gback, gdoffset + 8);
             B_NEXT_nocond;
             MARK;
-            for (int i=0; i<8; ++i) {
-                LHU(x5, gback, gdoffset+2*i);
+            for (int i = 0; i < 8; ++i) {
+                LHU(x5, gback, gdoffset + 2 * i);
                 SRLW(x5, x5, x3);
-                SH(x5, gback, gdoffset+2*i);
+                SH(x5, gback, gdoffset + 2 * i);
             }
             break;
         case 0xD2:
@@ -2101,14 +2157,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             LD(x3, wback, fixedaddress);
             ADDI(x4, xZR, 32);
             BLTU_MARK(x3, x4);
-            SD(xZR, gback, gdoffset+0);
-            SD(xZR, gback, gdoffset+8);
+            SD(xZR, gback, gdoffset + 0);
+            SD(xZR, gback, gdoffset + 8);
             B_NEXT_nocond;
             MARK;
-            for (int i=0; i<4; ++i) {
-                LWU(x5, gback, gdoffset+4*i);
+            for (int i = 0; i < 4; ++i) {
+                LWU(x5, gback, gdoffset + 4 * i);
                 SRLW(x5, x5, x3);
-                SW(x5, gback, gdoffset+4*i);
+                SW(x5, gback, gdoffset + 4 * i);
             }
             break;
         case 0xD3:
@@ -2119,14 +2175,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             LD(x3, wback, fixedaddress);
             ADDI(x4, xZR, 64);
             BLTU_MARK(x3, x4);
-            SD(xZR, gback, gdoffset+0);
-            SD(xZR, gback, gdoffset+8);
+            SD(xZR, gback, gdoffset + 0);
+            SD(xZR, gback, gdoffset + 8);
             B_NEXT_nocond;
             MARK;
-            for (int i=0; i<2; ++i) {
-                LD(x5, gback, gdoffset+8*i);
+            for (int i = 0; i < 2; ++i) {
+                LD(x5, gback, gdoffset + 8 * i);
                 SRL(x5, x5, x3);
-                SD(x5, gback, gdoffset+8*i);
+                SD(x5, gback, gdoffset + 8 * i);
             }
             break;
         case 0xD4:
@@ -2141,11 +2197,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<8; ++i) {
-                LH(x3, gback, gdoffset+2*i);
-                LH(x4, wback, fixedaddress+2*i);
+            for (int i = 0; i < 8; ++i) {
+                LH(x3, gback, gdoffset + 2 * i);
+                LH(x4, wback, fixedaddress + 2 * i);
                 MULW(x3, x3, x4);
-                SH(x3, gback, gdoffset+2*i);
+                SH(x3, gback, gdoffset + 2 * i);
             }
             break;
         case 0xD6:
@@ -2153,9 +2209,9 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGXSD(d0);
             GETEX(x2, 0);
-            FSD(d0, wback, fixedaddress+0);
+            FSD(d0, wback, fixedaddress + 0);
             if (MODREG) {
-                SD(xZR, wback, fixedaddress+8);
+                SD(xZR, wback, fixedaddress + 8);
             } else {
                 SMWRITE2();
             }
@@ -2166,8 +2222,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x2, 0);
             GETGD;
             MV(gd, xZR);
-            for (int i=0; i<16; ++i) {
-                LB(x1, wback, fixedaddress+i);
+            for (int i = 0; i < 16; ++i) {
+                LB(x1, wback, fixedaddress + i);
                 SLT(x3, x1, xZR);
                 if (i > 0) SLLI(x3, x3, i);
                 OR(gd, gd, x3);
@@ -2178,14 +2234,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<16; ++i) {
-                LBU(x3, gback, gdoffset+i);
-                LBU(x4, wback, fixedaddress+i);
+            for (int i = 0; i < 16; ++i) {
+                LBU(x3, gback, gdoffset + i);
+                LBU(x4, wback, fixedaddress + i);
                 SUB(x3, x3, x4);
                 NOT(x4, x3);
                 SRAI(x4, x4, 63);
                 AND(x3, x3, x4);
-                SB(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset + i);
             }
             break;
         case 0xD9:
@@ -2200,12 +2256,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for (int i=0; i<16; ++i) {
-                LBU(x3, gback, gdoffset+i);
-                LBU(x4, wback, fixedaddress+i);
+            for (int i = 0; i < 16; ++i) {
+                LBU(x3, gback, gdoffset + i);
+                LBU(x4, wback, fixedaddress + i);
                 BLTU(x3, x4, 8);
                 MV(x3, x4);
-                SB(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset + i);
             }
             break;
         case 0xDB:
@@ -2221,13 +2277,13 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             ADDI(x5, xZR, 0xFF);
-            for(int i=0; i<16; ++i) {
-                LBU(x3, gback, gdoffset+i);
-                LBU(x4, wback, fixedaddress+i);
+            for (int i = 0; i < 16; ++i) {
+                LBU(x3, gback, gdoffset + i);
+                LBU(x4, wback, fixedaddress + i);
                 ADD(x3, x3, x4);
                 BLT(x3, x5, 8);
                 ADDI(x3, xZR, 0xFF);
-                SB(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset + i);
             }
             break;
         case 0xDD:
@@ -2235,16 +2291,16 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<8; ++i) {
+            for (int i = 0; i < 8; ++i) {
                 // tmp32s = (int32_t)GX->uw[i] + EX->uw[i];
                 // GX->uw[i] = (tmp32s>65535)?65535:tmp32s;
-                LHU(x3, gback, gdoffset+i*2);
-                LHU(x4, wback, fixedaddress+i*2);
+                LHU(x3, gback, gdoffset + i * 2);
+                LHU(x4, wback, fixedaddress + i * 2);
                 ADDW(x3, x3, x4);
                 MOV32w(x4, 65536);
                 BLT(x3, x4, 8);
                 ADDIW(x3, x4, -1);
-                SH(x3, gback, gdoffset+i*2);
+                SH(x3, gback, gdoffset + i * 2);
             }
             break;
         case 0xDE:
@@ -2252,12 +2308,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for (int i=0; i<16; ++i) {
-                LBU(x3, gback, gdoffset+i);
-                LBU(x4, wback, fixedaddress+i);
+            for (int i = 0; i < 16; ++i) {
+                LBU(x3, gback, gdoffset + i);
+                LBU(x4, wback, fixedaddress + i);
                 BLTU(x4, x3, 8);
                 MV(x3, x4);
-                SB(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset + i);
             }
             break;
         case 0xDF:
@@ -2267,18 +2323,18 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, NOT(x3, x3); AND(x3, x3, x4));
             break;
-         case 0xE0:
+        case 0xE0:
             INST_NAME("PAVGB Gx, Ex");
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for (int i=0; i<16; ++i) {
-                LBU(x3, gback, gdoffset+i);
-                LBU(x4, wback, fixedaddress+i);
+            for (int i = 0; i < 16; ++i) {
+                LBU(x3, gback, gdoffset + i);
+                LBU(x4, wback, fixedaddress + i);
                 ADDW(x3, x3, x4);
                 ADDIW(x3, x3, 1);
                 SRAIW(x3, x3, 1);
-                SB(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset + i);
             }
             break;
         case 0xE1:
@@ -2290,10 +2346,10 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             LD(x3, wback, fixedaddress);
             BLTU(x3, x4, 8);
             SUBI(x3, x4, 1);
-            for (int i=0; i<8; ++i) {
-                LH(x4, gback, gdoffset+2*i);
+            for (int i = 0; i < 8; ++i) {
+                LH(x4, gback, gdoffset + 2 * i);
                 SRAW(x4, x4, x3);
-                SH(x4, gback, gdoffset+2*i);
+                SH(x4, gback, gdoffset + 2 * i);
             }
             break;
         case 0xE2:
@@ -2305,10 +2361,10 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             LD(x3, wback, fixedaddress);
             BLTU(x3, x4, 8);
             SUBI(x3, x4, 1);
-            for (int i=0; i<4; ++i) {
-                LW(x4, gback, gdoffset+4*i);
+            for (int i = 0; i < 4; ++i) {
+                LW(x4, gback, gdoffset + 4 * i);
                 SRAW(x4, x4, x3);
-                SW(x4, gback, gdoffset+4*i);
+                SW(x4, gback, gdoffset + 4 * i);
             }
             break;
         case 0xE3:
@@ -2316,13 +2372,13 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for (int i=0; i<8; ++i) {
-                LHU(x3, gback, gdoffset+2*i);
-                LHU(x4, wback, fixedaddress+2*i);
+            for (int i = 0; i < 8; ++i) {
+                LHU(x3, gback, gdoffset + 2 * i);
+                LHU(x4, wback, fixedaddress + 2 * i);
                 ADDW(x3, x3, x4);
                 ADDIW(x3, x3, 1);
                 SRAIW(x3, x3, 1);
-                SH(x3, gback, gdoffset+2*i);
+                SH(x3, gback, gdoffset + 2 * i);
             }
             break;
         case 0xE4:
@@ -2330,12 +2386,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<8; ++i) {
-                LHU(x3, gback, gdoffset+2*i);
-                LHU(x4, wback, fixedaddress+2*i);
+            for (int i = 0; i < 8; ++i) {
+                LHU(x3, gback, gdoffset + 2 * i);
+                LHU(x4, wback, fixedaddress + 2 * i);
                 MULW(x3, x3, x4);
                 SRLIW(x3, x3, 16);
-                SH(x3, gback, gdoffset+2*i);
+                SH(x3, gback, gdoffset + 2 * i);
             }
             break;
         case 0xE5:
@@ -2343,12 +2399,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<8; ++i) {
-                LH(x3, gback, gdoffset+2*i);
-                LH(x4, wback, fixedaddress+2*i);
+            for (int i = 0; i < 8; ++i) {
+                LH(x3, gback, gdoffset + 2 * i);
+                LH(x4, wback, fixedaddress + 2 * i);
                 MULW(x3, x3, x4);
                 SRAIW(x3, x3, 16);
-                SH(x3, gback, gdoffset+2*i);
+                SH(x3, gback, gdoffset + 2 * i);
             }
             break;
         case 0xE6:
@@ -2358,31 +2414,31 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x2, 0);
             v0 = fpu_get_scratch(dyn);
             v1 = fpu_get_scratch(dyn);
-            FLD(v0, wback, fixedaddress+0);
-            FLD(v1, wback, fixedaddress+8);
-            if(!box64_dynarec_fastround) {
-                FSFLAGSI(0);  // // reset all bits
+            FLD(v0, wback, fixedaddress + 0);
+            FLD(v1, wback, fixedaddress + 8);
+            if (!box64_dynarec_fastround) {
+                FSFLAGSI(0); // // reset all bits
             }
             FCVTWD(x3, v0, RD_RTZ);
-            if(!box64_dynarec_fastround) {
-                FRFLAGS(x5);   // get back FPSR to check the IOC bit
-                ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+            if (!box64_dynarec_fastround) {
+                FRFLAGS(x5); // get back FPSR to check the IOC bit
+                ANDI(x5, x5, (1 << FR_NV) | (1 << FR_OF));
                 BEQ_MARK(x5, xZR);
                 MOV32w(x3, 0x80000000);
                 MARK;
-                FSFLAGSI(0);  // // reset all bits
+                FSFLAGSI(0); // // reset all bits
             }
             FCVTWD(x4, v1, RD_RTZ);
-            if(!box64_dynarec_fastround) {
-                FRFLAGS(x5);   // get back FPSR to check the IOC bit
-                ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+            if (!box64_dynarec_fastround) {
+                FRFLAGS(x5); // get back FPSR to check the IOC bit
+                ANDI(x5, x5, (1 << FR_NV) | (1 << FR_OF));
                 BEQ_MARK2(x5, xZR);
                 MOV32w(x4, 0x80000000);
                 MARK2;
             }
-            SW(x3, gback, gdoffset+0);
-            SW(x4, gback, gdoffset+4);
-            SD(xZR, gback, gdoffset+8);
+            SW(x3, gback, gdoffset + 0);
+            SW(x4, gback, gdoffset + 4);
+            SD(xZR, gback, gdoffset + 8);
             break;
         case 0xE7:
             INST_NAME("MOVNTDQ Ex, Gx");
@@ -2396,23 +2452,23 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<16; ++i) {
+            for (int i = 0; i < 16; ++i) {
                 // tmp16s = (int16_t)GX->sb[i] - EX->sb[i];
                 // GX->sb[i] = (tmp16s<-128)?-128:((tmp16s>127)?127:tmp16s);
-                LB(x3, gback, gdoffset+i);
-                LB(x4, wback, fixedaddress+i);
+                LB(x3, gback, gdoffset + i);
+                LB(x4, wback, fixedaddress + i);
                 SUBW(x3, x3, x4);
                 SLLIW(x3, x3, 16);
                 SRAIW(x3, x3, 16);
                 ADDI(x4, xZR, 0x7f);
-                BLT(x3, x4, 12);     // tmp16s>127?
-                SB(x4, gback, gdoffset+i);
-                J(24);               // continue
+                BLT(x3, x4, 12); // tmp16s>127?
+                SB(x4, gback, gdoffset + i);
+                J(24); // continue
                 ADDI(x4, xZR, 0xf80);
-                BLT(x4, x3, 12);     // tmp16s<-128?
-                SB(x4, gback, gdoffset+i);
-                J(8);                // continue
-                SB(x3, gback, gdoffset+i);
+                BLT(x4, x3, 12); // tmp16s<-128?
+                SB(x4, gback, gdoffset + i);
+                J(8); // continue
+                SB(x3, gback, gdoffset + i);
             }
             break;
         case 0xE9:
@@ -2420,20 +2476,20 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<8; ++i) {
+            for (int i = 0; i < 8; ++i) {
                 // tmp32s = (int32_t)GX->sw[i] - EX->sw[i];
                 // GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s);
-                LH(x3, gback, gdoffset+2*i);
-                LH(x4, wback, fixedaddress+2*i);
+                LH(x3, gback, gdoffset + 2 * i);
+                LH(x4, wback, fixedaddress + 2 * i);
                 SUBW(x3, x3, x4);
                 LUI(x4, 0xFFFF8); // -32768
                 BGE(x3, x4, 12);
-                SH(x4, gback, gdoffset+2*i);
-                J(20); // continue
+                SH(x4, gback, gdoffset + 2 * i);
+                J(20);      // continue
                 LUI(x4, 8); // 32768
                 BLT(x3, x4, 8);
                 ADDIW(x3, x4, -1);
-                SH(x3, gback, gdoffset+2*i);
+                SH(x3, gback, gdoffset + 2 * i);
             }
             break;
         case 0xEA:
@@ -2441,12 +2497,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for (int i=0; i<8; ++i) {
-                LH(x3, gback, gdoffset+2*i);
-                LH(x4, wback, fixedaddress+2*i);
+            for (int i = 0; i < 8; ++i) {
+                LH(x3, gback, gdoffset + 2 * i);
+                LH(x4, wback, fixedaddress + 2 * i);
                 BLT(x3, x4, 8);
                 MV(x3, x4);
-                SH(x3, gback, gdoffset+2*i);
+                SH(x3, gback, gdoffset + 2 * i);
             }
             break;
         case 0xEB:
@@ -2461,23 +2517,23 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<16; ++i) {
+            for (int i = 0; i < 16; ++i) {
                 // tmp16s = (int16_t)GX->sb[i] + EX->sb[i];
                 // GX->sb[i] = (tmp16s>127)?127:((tmp16s<-128)?-128:tmp16s);
-                LB(x3, gback, gdoffset+i);
-                LB(x4, wback, fixedaddress+i);
+                LB(x3, gback, gdoffset + i);
+                LB(x4, wback, fixedaddress + i);
                 ADDW(x3, x3, x4);
                 SLLIW(x3, x3, 16);
                 SRAIW(x3, x3, 16);
                 ADDI(x4, xZR, 0x7f);
-                BLT(x3, x4, 12);     // tmp16s>127?
-                SB(x4, gback, gdoffset+i);
-                J(24);               // continue
+                BLT(x3, x4, 12); // tmp16s>127?
+                SB(x4, gback, gdoffset + i);
+                J(24); // continue
                 ADDI(x4, xZR, 0xf80);
-                BLT(x4, x3, 12);     // tmp16s<-128?
-                SB(x4, gback, gdoffset+i);
-                J(8);                // continue
-                SB(x3, gback, gdoffset+i);
+                BLT(x4, x3, 12); // tmp16s<-128?
+                SB(x4, gback, gdoffset + i);
+                J(8); // continue
+                SB(x3, gback, gdoffset + i);
             }
             break;
         case 0xED:
@@ -2485,20 +2541,20 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<8; ++i) {
+            for (int i = 0; i < 8; ++i) {
                 // tmp32s = (int32_t)GX->sw[i] + EX->sw[i];
                 // GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s);
-                LH(x3, gback, gdoffset+2*i);
-                LH(x4, wback, fixedaddress+2*i);
+                LH(x3, gback, gdoffset + 2 * i);
+                LH(x4, wback, fixedaddress + 2 * i);
                 ADDW(x3, x3, x4);
                 LUI(x4, 0xFFFF8); // -32768
                 BGE(x3, x4, 12);
-                SH(x4, gback, gdoffset+2*i);
-                J(20); // continue
+                SH(x4, gback, gdoffset + 2 * i);
+                J(20);      // continue
                 LUI(x4, 8); // 32768
                 BLT(x3, x4, 8);
                 ADDIW(x3, x4, -1);
-                SH(x3, gback, gdoffset+2*i);
+                SH(x3, gback, gdoffset + 2 * i);
             }
             break;
         case 0xEE:
@@ -2512,11 +2568,10 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("PXOR Gx, Ex");
             nextop = F8;
             GETGX();
-            if(MODREG && gd==(nextop&7)+(rex.b<<3))
-            {
+            if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
                 // just zero dest
-                SD(xZR, gback, gdoffset+0);
-                SD(xZR, gback, gdoffset+8);
+                SD(xZR, gback, gdoffset + 0);
+                SD(xZR, gback, gdoffset + 8);
             } else {
                 GETEX(x2, 0);
                 SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
@@ -2528,17 +2583,17 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             ADDI(x4, xZR, 16);
-            LD(x3, wback, fixedaddress+0);
+            LD(x3, wback, fixedaddress + 0);
             BLTU_MARK(x3, x4);
             // just zero dest
-            SD(xZR, gback, gdoffset+0);
-            SD(xZR, gback, gdoffset+8);
+            SD(xZR, gback, gdoffset + 0);
+            SD(xZR, gback, gdoffset + 8);
             B_NEXT_nocond;
             MARK;
-            for (int i=0; i<8; ++i) {
-                LHU(x4, gback, gdoffset+2*i);
+            for (int i = 0; i < 8; ++i) {
+                LHU(x4, gback, gdoffset + 2 * i);
                 SLLW(x4, x4, x3);
-                SH(x4, gback, gdoffset+2*i);
+                SH(x4, gback, gdoffset + 2 * i);
             }
             break;
         case 0xF2:
@@ -2547,17 +2602,17 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             ADDI(x4, xZR, 32);
-            LD(x3, wback, fixedaddress+0);
+            LD(x3, wback, fixedaddress + 0);
             BLTU_MARK(x3, x4);
             // just zero dest
-            SD(xZR, gback, gdoffset+0);
-            SD(xZR, gback, gdoffset+8);
+            SD(xZR, gback, gdoffset + 0);
+            SD(xZR, gback, gdoffset + 8);
             B_NEXT_nocond;
             MARK;
-            for (int i=0; i<4; ++i) {
-                LWU(x4, gback, gdoffset+4*i);
+            for (int i = 0; i < 4; ++i) {
+                LWU(x4, gback, gdoffset + 4 * i);
                 SLLW(x4, x4, x3);
-                SW(x4, gback, gdoffset+4*i);
+                SW(x4, gback, gdoffset + 4 * i);
             }
             break;
         case 0xF3:
@@ -2566,17 +2621,17 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             ADDI(x4, xZR, 64);
-            LD(x3, wback, fixedaddress+0);
+            LD(x3, wback, fixedaddress + 0);
             BLTU_MARK(x3, x4);
             // just zero dest
-            SD(xZR, gback, gdoffset+0);
-            SD(xZR, gback, gdoffset+8);
+            SD(xZR, gback, gdoffset + 0);
+            SD(xZR, gback, gdoffset + 8);
             B_NEXT_nocond;
             MARK;
-            for (int i=0; i<2; ++i) {
-                LD(x4, gback, gdoffset+8*i);
+            for (int i = 0; i < 2; ++i) {
+                LD(x4, gback, gdoffset + 8 * i);
                 SLL(x4, x4, x3);
-                SD(x4, gback, gdoffset+8*i);
+                SD(x4, gback, gdoffset + 8 * i);
             }
             break;
         case 0xF4:
@@ -2585,32 +2640,32 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             // GX->q[1] = (uint64_t)EX->ud[2]*GX->ud[2];
-            LWU(x3, gback, gdoffset+2*4);
-            LWU(x4, wback, fixedaddress+2*4);
+            LWU(x3, gback, gdoffset + 2 * 4);
+            LWU(x4, wback, fixedaddress + 2 * 4);
             MUL(x3, x3, x4);
-            SD(x3, gback, gdoffset+8);
+            SD(x3, gback, gdoffset + 8);
             // GX->q[0] = (uint64_t)EX->ud[0]*GX->ud[0];
-            LWU(x3, gback, gdoffset+0*4);
-            LWU(x4, wback, fixedaddress+0*4);
+            LWU(x3, gback, gdoffset + 0 * 4);
+            LWU(x4, wback, fixedaddress + 0 * 4);
             MUL(x3, x3, x4);
-            SD(x3, gback, gdoffset+0);
+            SD(x3, gback, gdoffset + 0);
             break;
         case 0xF5:
             INST_NAME("PMADDWD Gx, Ex");
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for (int i=0; i<4; ++i) {
+            for (int i = 0; i < 4; ++i) {
                 // GX->sd[i] = (int32_t)(GX->sw[i*2+0])*EX->sw[i*2+0] +
                 //             (int32_t)(GX->sw[i*2+1])*EX->sw[i*2+1];
-                LH(x3, gback, gdoffset+2*(i*2+0));
-                LH(x4, wback, fixedaddress+2*(i*2+0));
+                LH(x3, gback, gdoffset + 2 * (i * 2 + 0));
+                LH(x4, wback, fixedaddress + 2 * (i * 2 + 0));
                 MULW(x5, x3, x4);
-                LH(x3, gback, gdoffset+2*(i*2+1));
-                LH(x4, wback, fixedaddress+2*(i*2+1));
+                LH(x3, gback, gdoffset + 2 * (i * 2 + 1));
+                LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
                 MULW(x6, x3, x4);
                 ADDW(x5, x5, x6);
-                SW(x5, gback, gdoffset+4*i);
+                SW(x5, gback, gdoffset + 4 * i);
             }
             break;
         case 0xF6:
@@ -2619,18 +2674,18 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             MV(x6, xZR);
-            for (int i=0; i<16; ++i) {
-                LBU(x3, gback, gdoffset+i);
-                LBU(x4, wback, fixedaddress+i);
+            for (int i = 0; i < 16; ++i) {
+                LBU(x3, gback, gdoffset + i);
+                LBU(x4, wback, fixedaddress + i);
                 SUBW(x3, x3, x4);
                 SRAIW(x5, x3, 31);
                 XOR(x3, x5, x3);
                 SUBW(x3, x3, x5);
                 ANDI(x3, x3, 0xff);
                 ADDW(x6, x6, x3);
-                if (i==7 || i == 15) {
-                    SD(x6, gback, gdoffset+i+1-8);
-                    if (i==7) MV(x6, xZR);
+                if (i == 7 || i == 15) {
+                    SD(x6, gback, gdoffset + i + 1 - 8);
+                    if (i == 7) MV(x6, xZR);
                 }
             }
             break;
@@ -2639,12 +2694,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<16; ++i) {
+            for (int i = 0; i < 16; ++i) {
                 // GX->sb[i] -= EX->sb[i];
-                LB(x3, wback, fixedaddress+i);
-                LB(x4, gback, gdoffset+i);
+                LB(x3, wback, fixedaddress + i);
+                LB(x4, gback, gdoffset + i);
                 SUB(x3, x4, x3);
-                SB(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset + i);
             }
             break;
         case 0xF9:
@@ -2673,12 +2728,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            for(int i=0; i<16; ++i) {
+            for (int i = 0; i < 16; ++i) {
                 // GX->sb[i] += EX->sb[i];
-                LB(x3, gback, gdoffset+i);
-                LB(x4, wback, fixedaddress+i);
+                LB(x3, gback, gdoffset + i);
+                LB(x4, wback, fixedaddress + i);
                 ADDW(x3, x3, x4);
-                SB(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset + i);
             }
             break;
         case 0xFD:
diff --git a/src/dynarec/rv64/dynarec_rv64_f30f.c b/src/dynarec/rv64/dynarec_rv64_f30f.c
index 0c0676e0..9007e46e 100644
--- a/src/dynarec/rv64/dynarec_rv64_f30f.c
+++ b/src/dynarec/rv64/dynarec_rv64_f30f.c
@@ -24,7 +24,8 @@
 
 uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog)
 {
-    (void)ip; (void)need_epilog;
+    (void)ip;
+    (void)need_epilog;
 
     uint8_t opcode = F8;
     uint8_t nextop, u8;
@@ -46,14 +47,14 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
     MAYUSE(v1);
     MAYUSE(j64);
 
-    switch(opcode) {
+    switch (opcode) {
         case 0x10:
             INST_NAME("MOVSS Gx, Ex");
             nextop = F8;
             GETG;
-            if(MODREG) {
+            if (MODREG) {
                 v0 = sse_get_reg(dyn, ninst, x1, gd, 1);
-                q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 1);
+                q0 = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 1);
                 FMVS(v0, q0);
             } else {
                 v0 = sse_get_reg_empty(dyn, ninst, x1, gd, 1);
@@ -61,8 +62,8 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 8, 0);
                 FLW(v0, ed, fixedaddress);
                 // reset upper part
-                SW(xZR, xEmu, offsetof(x64emu_t, xmm[gd])+4);
-                SD(xZR, xEmu, offsetof(x64emu_t, xmm[gd])+8);
+                SW(xZR, xEmu, offsetof(x64emu_t, xmm[gd]) + 4);
+                SD(xZR, xEmu, offsetof(x64emu_t, xmm[gd]) + 8);
             }
             break;
         case 0x11:
@@ -70,8 +71,8 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETG;
             v0 = sse_get_reg(dyn, ninst, x1, gd, 1);
-            if(MODREG) {
-                q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 1);
+            if (MODREG) {
+                q0 = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 1);
                 FMVS(q0, v0);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, 0);
@@ -88,12 +89,12 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
 
             // GX->ud[1] = GX->ud[0] = EX->ud[0];
             // GX->ud[3] = GX->ud[2] = EX->ud[2];
-            LD(x3, wback, fixedaddress+0);
-            SD(x3, gback, gdoffset+0);
-            SD(x3, gback, gdoffset+4);
-            LD(x3, wback, fixedaddress+8);
-            SD(x3, gback, gdoffset+8);
-            SD(x3, gback, gdoffset+12);
+            LD(x3, wback, fixedaddress + 0);
+            SD(x3, gback, gdoffset + 0);
+            SD(x3, gback, gdoffset + 4);
+            LD(x3, wback, fixedaddress + 8);
+            SD(x3, gback, gdoffset + 8);
+            SD(x3, gback, gdoffset + 12);
             break;
         case 0x1E:
             INST_NAME("NOP / ENDBR32 / ENDBR64");
@@ -106,7 +107,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGXSS(v0);
             GETED(0);
-            if(rex.w) {
+            if (rex.w) {
                 FCVTSL(v0, ed, RD_RNE);
             } else {
                 FCVTSW(v0, ed, RD_RNE);
@@ -118,17 +119,17 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGD;
             GETEXSS(d0, 0);
-            if(!box64_dynarec_fastround) {
-                FSFLAGSI(0);  // // reset all bits
+            if (!box64_dynarec_fastround) {
+                FSFLAGSI(0); // // reset all bits
             }
             FCVTSxw(gd, d0, RD_RTZ);
-            if(!rex.w)
+            if (!rex.w)
                 ZEROUP(gd);
-            if(!box64_dynarec_fastround) {
-                FRFLAGS(x5);   // get back FPSR to check the IOC bit
-                ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+            if (!box64_dynarec_fastround) {
+                FRFLAGS(x5); // get back FPSR to check the IOC bit
+                ANDI(x5, x5, (1 << FR_NV) | (1 << FR_OF));
                 CBZ_NEXT(x5);
-                if(rex.w) {
+                if (rex.w) {
                     MOV64x(gd, 0x8000000000000000LL);
                 } else {
                     MOV32w(gd, 0x80000000);
@@ -140,19 +141,19 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGD;
             GETEXSS(d0, 0);
-            if(!box64_dynarec_fastround) {
-                FSFLAGSI(0);  // // reset all bits
+            if (!box64_dynarec_fastround) {
+                FSFLAGSI(0); // // reset all bits
             }
             u8 = sse_setround(dyn, ninst, x5, x6);
             FCVTSxw(gd, d0, RD_DYN);
             x87_restoreround(dyn, ninst, u8);
-            if(!rex.w)
+            if (!rex.w)
                 ZEROUP(gd);
-            if(!box64_dynarec_fastround) {
-                FRFLAGS(x5);   // get back FPSR to check the IOC bit
-                ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+            if (!box64_dynarec_fastround) {
+                FRFLAGS(x5); // get back FPSR to check the IOC bit
+                ANDI(x5, x5, (1 << FR_NV) | (1 << FR_OF));
                 CBZ_NEXT(x5);
-                if(rex.w) {
+                if (rex.w) {
                     MOV64x(gd, 0x8000000000000000LL);
                 } else {
                     MOV32w(gd, 0x80000000);
@@ -257,31 +258,31 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             u8 = F8;
             int32_t idx;
 
-            idx = 4+((u8>>(0*2))&3);
-            LHU(x3, wback, fixedaddress+idx*2);
-            idx = 4+((u8>>(1*2))&3);
-            LHU(x4, wback, fixedaddress+idx*2);
-            idx = 4+((u8>>(2*2))&3);
-            LHU(x5, wback, fixedaddress+idx*2);
-            idx = 4+((u8>>(3*2))&3);
-            LHU(x6, wback, fixedaddress+idx*2);
+            idx = 4 + ((u8 >> (0 * 2)) & 3);
+            LHU(x3, wback, fixedaddress + idx * 2);
+            idx = 4 + ((u8 >> (1 * 2)) & 3);
+            LHU(x4, wback, fixedaddress + idx * 2);
+            idx = 4 + ((u8 >> (2 * 2)) & 3);
+            LHU(x5, wback, fixedaddress + idx * 2);
+            idx = 4 + ((u8 >> (3 * 2)) & 3);
+            LHU(x6, wback, fixedaddress + idx * 2);
 
-            SH(x3, gback, gdoffset+(4+0)*2);
-            SH(x4, gback, gdoffset+(4+1)*2);
-            SH(x5, gback, gdoffset+(4+2)*2);
-            SH(x6, gback, gdoffset+(4+3)*2);
+            SH(x3, gback, gdoffset + (4 + 0) * 2);
+            SH(x4, gback, gdoffset + (4 + 1) * 2);
+            SH(x5, gback, gdoffset + (4 + 2) * 2);
+            SH(x6, gback, gdoffset + (4 + 3) * 2);
 
-            if (!(MODREG && (gd==ed))) {
-                LD(x3, wback, fixedaddress+0);
-                SD(x3, gback, gdoffset+0);
+            if (!(MODREG && (gd == ed))) {
+                LD(x3, wback, fixedaddress + 0);
+                SD(x3, gback, gdoffset + 0);
             }
             break;
         case 0x7E:
             INST_NAME("MOVQ Gx, Ex");
             nextop = F8;
             // Will load Gx as SD. Is that a good choice?
-            if(MODREG) {
-                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 0);
+            if (MODREG) {
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0);
                 GETGXSD_empty(v0);
                 FMVD(v0, v1);
             } else {
@@ -290,7 +291,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, 0);
                 FLD(v0, ed, fixedaddress);
             }
-            SD(xZR, xEmu, offsetof(x64emu_t, xmm[gd])+8);
+            SD(xZR, xEmu, offsetof(x64emu_t, xmm[gd]) + 8);
             break;
         case 0x7F:
             INST_NAME("MOVDQU Ex,Gx");
@@ -298,7 +299,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q2(x3);
-            if(!MODREG) SMWRITE2();
+            if (!MODREG) SMWRITE2();
             break;
 
         case 0x5B:
@@ -307,19 +308,19 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             v0 = fpu_get_scratch(dyn);
-            for(int i=0; i<4; ++i) {
-                if(!box64_dynarec_fastround) {
+            for (int i = 0; i < 4; ++i) {
+                if (!box64_dynarec_fastround) {
                     FSFLAGSI(0); // reset all bits
                 }
-                FLW(v0, wback, fixedaddress+i*4);
+                FLW(v0, wback, fixedaddress + i * 4);
                 FCVTWS(x3, v0, RD_RTZ);
-                if(!box64_dynarec_fastround) {
-                    FRFLAGS(x5);   // get back FPSR to check the IOC bit
-                    ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+                if (!box64_dynarec_fastround) {
+                    FRFLAGS(x5); // get back FPSR to check the IOC bit
+                    ANDI(x5, x5, (1 << FR_NV) | (1 << FR_OF));
                     BEQZ(x5, 8);
                     MOV32w(x3, 0x80000000);
                 }
-                SW(x3, gback, gdoffset+i*4);
+                SW(x3, gback, gdoffset + i * 4);
             }
             break;
         case 0xB8:
@@ -329,17 +330,17 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETED(0);
             GETGD;
-            if(!rex.w && MODREG) {
+            if (!rex.w && MODREG) {
                 AND(x4, ed, xMASK);
                 ed = x4;
             }
             CLEAR_FLAGS();
             BNE_MARK(ed, xZR);
-            ORI(xFlags, xFlags, 1<<F_ZF);
+            ORI(xFlags, xFlags, 1 << F_ZF);
             MOV32w(gd, 0);
             B_NEXT_nocond;
             MARK;
-            if(rv64_zbb) {
+            if (rv64_zbb) {
                 CPOPxw(gd, ed);
             } else {
                 TABLE64(x1, 0x5555555555555555uLL);
@@ -371,75 +372,77 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETED(0);
             GETGD;
-            if(!rex.w && MODREG) {
+            if (!rex.w && MODREG) {
                 AND(x4, ed, xMASK);
                 ed = x4;
             }
-            ANDI(xFlags, xFlags, ~((1<<F_ZF) | (1<<F_CF)));
+            ANDI(xFlags, xFlags, ~((1 << F_ZF) | (1 << F_CF)));
             BNE_MARK(ed, xZR);
-            ORI(xFlags, xFlags, 1<<F_CF);
-            MOV32w(gd, rex.w?64:32);
+            ORI(xFlags, xFlags, 1 << F_CF);
+            MOV32w(gd, rex.w ? 64 : 32);
             B_NEXT_nocond;
             MARK;
-            if(rv64_zbb) {
+            if (rv64_zbb) {
                 CTZxw(gd, ed);
             } else {
                 NEG(x2, ed);
                 AND(x2, x2, ed);
                 TABLE64(x3, 0x03f79d71b4ca8b09ULL);
                 MUL(x2, x2, x3);
-                SRLI(x2, x2, 64-6);
+                SRLI(x2, x2, 64 - 6);
                 TABLE64(x1, (uintptr_t)&deBruijn64tab);
                 ADD(x1, x1, x2);
                 LBU(gd, x1, 0);
             }
-            BNE(gd, xZR, 4+4);
-            ORI(xFlags, xFlags, 1<<F_ZF);
+            BNE(gd, xZR, 4 + 4);
+            ORI(xFlags, xFlags, 1 << F_ZF);
             break;
         case 0xBD:
             INST_NAME("LZCNT Gd, Ed");
-            SETFLAGS(X_ZF|X_CF, SF_SUBSET);
+            SETFLAGS(X_ZF | X_CF, SF_SUBSET);
             SET_DFNONE();
             nextop = F8;
             GETED(0);
             GETGD;
-            if(!rex.w && MODREG) {
+            if (!rex.w && MODREG) {
                 AND(x4, ed, xMASK);
                 ed = x4;
             }
             BNE_MARK(ed, xZR);
-            MOV32w(gd, rex.w?64:32);
-            ANDI(xFlags, xFlags, ~(1<<F_ZF));
-            ORI(xFlags, xFlags, 1<<F_CF);
+            MOV32w(gd, rex.w ? 64 : 32);
+            ANDI(xFlags, xFlags, ~(1 << F_ZF));
+            ORI(xFlags, xFlags, 1 << F_CF);
             B_NEXT_nocond;
             MARK;
-            if(rv64_zbb) {
+            if (rv64_zbb) {
                 CLZxw(gd, ed);
+            } else if (rv64_xtheadbb) {
+                TH_FF0(gd, ed);
             } else {
-                if(ed!=gd)
+                if (ed != gd)
                     u8 = gd;
                 else
                     u8 = x1;
-                ADDI(u8, xZR, rex.w?63:31);
-                if(rex.w) {
+                ADDI(u8, xZR, rex.w ? 63 : 31);
+                if (rex.w) {
                     MV(x2, ed);
                     SRLI(x3, x2, 32);
-                    BEQZ(x3, 4+2*4);
+                    BEQZ(x3, 4 + 2 * 4);
                     SUBI(u8, u8, 32);
                     MV(x2, x3);
                 } else {
                     AND(x2, ed, xMASK);
                 }
                 SRLI(x3, x2, 16);
-                BEQZ(x3, 4+2*4);
+                BEQZ(x3, 4 + 2 * 4);
                 SUBI(u8, u8, 16);
                 MV(x2, x3);
                 SRLI(x3, x2, 8);
-                BEQZ(x3, 4+2*4);
+                BEQZ(x3, 4 + 2 * 4);
                 SUBI(u8, u8, 8);
                 MV(x2, x3);
                 SRLI(x3, x2, 4);
-                BEQZ(x3, 4+2*4);
+                BEQZ(x3, 4 + 2 * 4);
                 SUBI(u8, u8, 4);
                 MV(x2, x3);
                 ANDI(x2, x2, 0b1111);
@@ -449,9 +452,9 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 SUB(gd, u8, x2);
                 MARK2;
             }
-            ANDI(xFlags, xFlags, ~((1<<F_ZF) | (1<<F_CF)));
-            BNE(gd, xZR, 4+4);
-            ORI(xFlags, xFlags, 1<<F_ZF);
+            ANDI(xFlags, xFlags, ~((1 << F_ZF) | (1 << F_CF)));
+            BNE(gd, xZR, 4 + 4);
+            ORI(xFlags, xFlags, 1 << F_ZF);
             break;
 
         case 0xC2:
@@ -460,9 +463,9 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGXSS(d0);
             GETEXSS(d1, 1);
             u8 = F8;
-            if ((u8&7) == 0) {                                      // Equal
+            if ((u8 & 7) == 0) { // Equal
                 FEQS(x2, d0, d1);
-            } else if ((u8&7) == 4) {                               // Not Equal or unordered
+            } else if ((u8 & 7) == 4) { // Not Equal or unordered
                 FEQS(x2, d0, d1);
                 XORI(x2, x2, 1);
             } else {
@@ -471,27 +474,33 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 FEQS(x2, d1, d1);
                 AND(x2, x2, x3);
 
-                switch(u8&7) {
-                case 1: BEQ_MARK(x2, xZR); FLTS(x2, d0, d1); break; // Less than
-                case 2: BEQ_MARK(x2, xZR); FLES(x2, d0, d1); break; // Less or equal
-                case 3: XORI(x2, x2, 1); break;                     // NaN
-                case 5: {                                           // Greater or equal or unordered
-                    BEQ_MARK2(x2, xZR);
-                    FLES(x2, d1, d0);
-                    B_MARK_nocond;
-                    break;
-                }
-                case 6: {                                           // Greater or unordered, test inverted, N!=V so unordered or less than (inverted)
-                    BEQ_MARK2(x2, xZR);
-                    FLTS(x2, d1, d0);
-                    B_MARK_nocond;
-                    break;
-                }
-                case 7: break;                                      // Not NaN
+                switch (u8 & 7) {
+                    case 1:
+                        BEQ_MARK(x2, xZR);
+                        FLTS(x2, d0, d1);
+                        break; // Less than
+                    case 2:
+                        BEQ_MARK(x2, xZR);
+                        FLES(x2, d0, d1);
+                        break;                      // Less or equal
+                    case 3: XORI(x2, x2, 1); break; // NaN
+                    case 5: {                       // Greater or equal or unordered
+                        BEQ_MARK2(x2, xZR);
+                        FLES(x2, d1, d0);
+                        B_MARK_nocond;
+                        break;
+                    }
+                    case 6: { // Greater or unordered, test inverted, N!=V so unordered or less than (inverted)
+                        BEQ_MARK2(x2, xZR);
+                        FLTS(x2, d1, d0);
+                        B_MARK_nocond;
+                        break;
+                    }
+                    case 7: break; // Not NaN
                 }
 
                 MARK2;
-                if ((u8&7) == 5 || (u8&7) == 6) {
+                if ((u8 & 7) == 5 || (u8 & 7) == 6) {
                     MOV32w(x2, 1);
                 }
                 MARK;
@@ -507,12 +516,12 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x2, 0);
             q0 = fpu_get_scratch(dyn);
             q1 = fpu_get_scratch(dyn);
-            LW(x3, wback, fixedaddress+0);
-            LW(x4, wback, fixedaddress+4);
+            LW(x3, wback, fixedaddress + 0);
+            LW(x4, wback, fixedaddress + 4);
             FCVTDW(q0, x3, RD_RTZ);
             FCVTDW(q1, x4, RD_RTZ);
-            FSD(q0, gback, gdoffset+0);
-            FSD(q1, gback, gdoffset+8);
+            FSD(q0, gback, gdoffset + 0);
+            FSD(q1, gback, gdoffset + 8);
             break;
 
         default:
diff --git a/src/dynarec/rv64/rv64_emitter.h b/src/dynarec/rv64/rv64_emitter.h
index 1caebc90..63e62ad1 100644
--- a/src/dynarec/rv64/rv64_emitter.h
+++ b/src/dynarec/rv64/rv64_emitter.h
@@ -30,637 +30,829 @@ f18–27  fs2–11  FP saved registers              Callee
 f28–31  ft8–11  FP temporaries                  Caller
 */
 // x86 Register mapping
-#define xRAX    16
-#define xRCX    17
-#define xRDX    18
-#define xRBX    19
-#define xRSP    20
-#define xRBP    21
-#define xRSI    22
-#define xRDI    23
-#define xR8     24
-#define xR9     25
-#define xR10    26
-#define xR11    27
-#define xR12    28
-#define xR13    29
-#define xR14    30
-#define xR15    31
-#define xFlags  8
-#define xRIP    7
+#define xRAX 16
+#define xRCX 17
+#define xRDX 18
+#define xRBX 19
+#define xRSP 20
+#define xRBP 21
+#define xRSI 22
+#define xRDI 23
+#define xR8 24
+#define xR9 25
+#define xR10 26
+#define xR11 27
+#define xR12 28
+#define xR13 29
+#define xR14 30
+#define xR15 31
+#define xFlags 8
+#define xRIP 7
 
 // 32bits version
-#define wEAX    xRAX
-#define wECX    xRCX
-#define wEDX    xRDX
-#define wEBX    xRBX
-#define wESP    xRSP
-#define wEBP    xRBP
-#define wESI    xRSI
-#define wEDI    xRDI
-#define wR8     xR8
-#define wR9     xR9
-#define wR10    xR10
-#define wR11    xR11
-#define wR12    xR12
-#define wR13    xR13
-#define wR14    xR14
-#define wR15    xR15
-#define wFlags  xFlags
+#define wEAX xRAX
+#define wECX xRCX
+#define wEDX xRDX
+#define wEBX xRBX
+#define wESP xRSP
+#define wEBP xRBP
+#define wESI xRSI
+#define wEDI xRDI
+#define wR8 xR8
+#define wR9 xR9
+#define wR10 xR10
+#define wR11 xR11
+#define wR12 xR12
+#define wR13 xR13
+#define wR14 xR14
+#define wR15 xR15
+#define wFlags xFlags
 // scratch registers
-#define x1      11
-#define x2      12
-#define x3      13
-#define x4      14
-#define x5      15
-#define x6      6
-#define x9      9
+#define x1 11
+#define x2 12
+#define x3 13
+#define x4 14
+#define x5 15
+#define x6 6
+#define x9 9
 // used to clear the upper 32bits
-#define xMASK   5
+#define xMASK 5
 // 32bits version of scratch
-#define w1      x1
-#define w2      x2
-#define w3      x3
-#define w4      x4
-#define w5      x5
-#define w6      x6
+#define w1 x1
+#define w2 x2
+#define w3 x3
+#define w4 x4
+#define w5 x5
+#define w6 x6
 // emu is r10
-#define xEmu    10
+#define xEmu 10
 // RV64 RA
-#define xRA     1
-#define xSP     2
+#define xRA 1
+#define xSP 2
 // RV64 args
-#define A0      10
-#define A1      11
-#define A2      12
-#define A3      13
-#define A4      14
-#define A5      15
-#define A6      16
-#define A7      17
+#define A0 10
+#define A1 11
+#define A2 12
+#define A3 13
+#define A4 14
+#define A5 15
+#define A6 16
+#define A7 17
 // xZR reg is 0
-#define xZR     0
-#define wZR     xZR
+#define xZR 0
+#define wZR xZR
 
 // replacement for F_OF internaly, using a reserved bit. Need to use F_OF2 internaly, never F_OF directly!
-#define F_OF2   F_res3
+#define F_OF2 F_res3
 
 // split a 32bits value in 20bits + 12bits, adjust the upper part is 12bits is negative
-#define SPLIT20(A)  (((A)+0x800)>>12)
-#define SPLIT12(A)  ((A)&0xfff)
+#define SPLIT20(A) (((A) + 0x800) >> 12)
+#define SPLIT12(A) ((A)&0xfff)
 
 // MOV64x/MOV32w is quite complex, so use a function for this
-#define MOV64x(A, B)    rv64_move64(dyn, ninst, A, B)
-#define MOV32w(A, B)    rv64_move32(dyn, ninst, A, B, 1)
-#define MOV64xw(A, B)   if(rex.w) {MOV64x(A, B);} else {MOV32w(A, B);}
-#define MOV64z(A, B)    if(rex.is32bits) {MOV32w(A, B);} else {MOV64x(A, B);}
+#define MOV64x(A, B) rv64_move64(dyn, ninst, A, B)
+#define MOV32w(A, B) rv64_move32(dyn, ninst, A, B, 1)
+#define MOV64xw(A, B) \
+    if (rex.w) {      \
+        MOV64x(A, B); \
+    } else {          \
+        MOV32w(A, B); \
+    }
+#define MOV64z(A, B)    \
+    if (rex.is32bits) { \
+        MOV32w(A, B);   \
+    } else {            \
+        MOV64x(A, B);   \
+    }
 
 // ZERO the upper part
-#define ZEROUP(r)       AND(r, r, xMASK)
+#define ZEROUP(r) AND(r, r, xMASK)
 
-#define R_type(funct7, rs2, rs1, funct3, rd, opcode)    ((funct7)<<25 | (rs2)<<20 | (rs1)<<15 | (funct3)<<12 | (rd)<<7 | (opcode))
-#define I_type(imm12, rs1, funct3, rd, opcode)    ((imm12)<<20 | (rs1)<<15 | (funct3)<<12 | (rd)<<7 | (opcode))
-#define S_type(imm12, rs2, rs1, funct3, opcode)    (((imm12)>>5)<<25 | (rs2)<<20 | (rs1)<<15 | (funct3)<<12 | ((imm12)&31)<<7 | (opcode))
-#define B_type(imm13, rs2, rs1, funct3, opcode)      ((((imm13)>>12)&1)<<31 | (((imm13)>>5)&63)<<25 | (rs2)<<20 | (rs1)<<15 | (funct3)<<12 | (((imm13)>>1)&15)<<8 | (((imm13)>>11)&1)<<7 | (opcode))
-#define U_type(imm32, rd, opcode)   (((imm32)>>12)<<12 | (rd)<<7 | (opcode))
-#define J_type(imm21, rd, opcode)    ((((imm21)>>20)&1)<<31 | (((imm21)>>1)&0b1111111111)<<21 | (((imm21)>>11)&1)<<20 | (((imm21)>>12)&0b11111111)<<12 | (rd)<<7 | (opcode))
+#define R_type(funct7, rs2, rs1, funct3, rd, opcode) ((funct7) << 25 | (rs2) << 20 | (rs1) << 15 | (funct3) << 12 | (rd) << 7 | (opcode))
+#define I_type(imm12, rs1, funct3, rd, opcode) ((imm12) << 20 | (rs1) << 15 | (funct3) << 12 | (rd) << 7 | (opcode))
+#define S_type(imm12, rs2, rs1, funct3, opcode) (((imm12) >> 5) << 25 | (rs2) << 20 | (rs1) << 15 | (funct3) << 12 | ((imm12)&31) << 7 | (opcode))
+#define B_type(imm13, rs2, rs1, funct3, opcode) ((((imm13) >> 12) & 1) << 31 | (((imm13) >> 5) & 63) << 25 | (rs2) << 20 | (rs1) << 15 | (funct3) << 12 | (((imm13) >> 1) & 15) << 8 | (((imm13) >> 11) & 1) << 7 | (opcode))
+#define U_type(imm32, rd, opcode) (((imm32) >> 12) << 12 | (rd) << 7 | (opcode))
+#define J_type(imm21, rd, opcode) ((((imm21) >> 20) & 1) << 31 | (((imm21) >> 1) & 0b1111111111) << 21 | (((imm21) >> 11) & 1) << 20 | (((imm21) >> 12) & 0b11111111) << 12 | (rd) << 7 | (opcode))
 
 // RV32I
 // put imm20 in the [31:12] bits of rd, zero [11:0] and sign extend bits31
-#define LUI(rd, imm20)                 EMIT(U_type((imm20)<<12, rd, 0b0110111))
+#define LUI(rd, imm20) EMIT(U_type((imm20) << 12, rd, 0b0110111))
 
 // put PC+imm20 in rd
-#define AUIPC(rd, imm20)               EMIT(U_type((imm20)<<12, rd, 0b0010111))
+#define AUIPC(rd, imm20) EMIT(U_type((imm20) << 12, rd, 0b0010111))
 
-#define JAL_gen(rd, imm21)             J_type(imm21, rd, 0b1101111)
+#define JAL_gen(rd, imm21) J_type(imm21, rd, 0b1101111)
 // Unconditional branch, no return address set
-#define B(imm21)                       EMIT(JAL_gen(xZR, imm21))
-#define B__(reg1, reg2, imm21)         B(imm21)
+#define B(imm21) EMIT(JAL_gen(xZR, imm21))
+#define B__(reg1, reg2, imm21) B(imm21)
 // Unconditional branch, return set to xRA
-#define JAL(imm21)                     EMIT(JAL_gen(xRA, imm21))
+#define JAL(imm21) EMIT(JAL_gen(xRA, imm21))
 // Unconditional branch, without link
-#define J(imm21)                       EMIT(JAL_gen(xZR, imm21))
+#define J(imm21) EMIT(JAL_gen(xZR, imm21))
 
-#define JALR_gen(rd, rs1, imm12)       I_type(imm12, rs1, 0b000, rd, 0b1100111)
+#define JALR_gen(rd, rs1, imm12) I_type(imm12, rs1, 0b000, rd, 0b1100111)
 // Unconditionnal branch to r, no return address set
-#define BR(r)                          EMIT(JALR_gen(xZR, r, 0))
+#define BR(r) EMIT(JALR_gen(xZR, r, 0))
 // Unconditionnal branch to r+i12, no return address set
-#define BR_I12(r, imm12)               EMIT(JALR_gen(xZR, r, (imm12)&0b111111111111))
+#define BR_I12(r, imm12) EMIT(JALR_gen(xZR, r, (imm12)&0b111111111111))
 // Unconditionnal branch to r, return address set to xRA
-#define JALR(r)                        EMIT(JALR_gen(xRA, r, 0))
+#define JALR(r) EMIT(JALR_gen(xRA, r, 0))
 // Unconditionnal branch to r+i12, return address set to xRA
-#define JALR_I12(r, imm12)             EMIT(JALR_gen(xRA, r, (imm12)&0b111111111111))
+#define JALR_I12(r, imm12) EMIT(JALR_gen(xRA, r, (imm12)&0b111111111111))
 
 // rd = rs1 + imm12
-#define ADDI(rd, rs1, imm12)        EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, 0b0010011))
+#define ADDI(rd, rs1, imm12) EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, 0b0010011))
 // rd = rs1 - imm12 (pseudo instruction)
-#define SUBI(rd, rs1, imm12)        EMIT(I_type((-(imm12))&0b111111111111, rs1, 0b000, rd, 0b0010011))
+#define SUBI(rd, rs1, imm12) EMIT(I_type((-(imm12)) & 0b111111111111, rs1, 0b000, rd, 0b0010011))
 // rd = (rs1<imm12)?1:0
-#define SLTI(rd, rs1, imm12)        EMIT(I_type((imm12)&0b111111111111, rs1, 0b010, rd, 0b0010011))
+#define SLTI(rd, rs1, imm12) EMIT(I_type((imm12)&0b111111111111, rs1, 0b010, rd, 0b0010011))
 // rd = (rs1<imm12)?1:0 unsigned
-#define SLTIU(rd, rs1, imm12)       EMIT(I_type((imm12)&0b111111111111, rs1, 0b011, rd, 0b0010011))
+#define SLTIU(rd, rs1, imm12) EMIT(I_type((imm12)&0b111111111111, rs1, 0b011, rd, 0b0010011))
 // rd = rs1 ^ imm12
-#define XORI(rd, rs1, imm12)        EMIT(I_type((imm12)&0b111111111111, rs1, 0b100, rd, 0b0010011))
+#define XORI(rd, rs1, imm12) EMIT(I_type((imm12)&0b111111111111, rs1, 0b100, rd, 0b0010011))
 // rd = rs1 | imm12
-#define ORI(rd, rs1, imm12)         EMIT(I_type((imm12)&0b111111111111, rs1, 0b110, rd, 0b0010011))
+#define ORI(rd, rs1, imm12) EMIT(I_type((imm12)&0b111111111111, rs1, 0b110, rd, 0b0010011))
 // rd = rs1 & imm12
-#define ANDI(rd, rs1, imm12)        EMIT(I_type((imm12)&0b111111111111, rs1, 0b111, rd, 0b0010011))
+#define ANDI(rd, rs1, imm12) EMIT(I_type((imm12)&0b111111111111, rs1, 0b111, rd, 0b0010011))
 
 // rd = imm12
-#define MOV_U12(rd, imm12)          ADDI(rd, xZR, imm12)
+#define MOV_U12(rd, imm12) ADDI(rd, xZR, imm12)
 // nop
-#define NOP()                       ADDI(xZR, xZR, 0)
+#define NOP() ADDI(xZR, xZR, 0)
 
 // rd = rs1 + rs2
-#define ADD(rd, rs1, rs2)           EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, 0b0110011))
+#define ADD(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, 0b0110011))
 // rd = rs1 + rs2
-#define ADDW(rd, rs1, rs2)          EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, 0b0111011))
+#define ADDW(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, 0b0111011))
 // rd = rs1 + rs2
-#define ADDxw(rd, rs1, rs2)         EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, rex.w?0b0110011:0b0111011))
+#define ADDxw(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, rex.w ? 0b0110011 : 0b0111011))
 // rd = rs1 + rs2
-#define ADDz(rd, rs1, rs2)          EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, rex.is32bits?0b0111011:0b0110011))
+#define ADDz(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, rex.is32bits ? 0b0111011 : 0b0110011))
 // rd = rs1 - rs2
-#define SUB(rd, rs1, rs2)           EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, 0b0110011))
+#define SUB(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, 0b0110011))
 // rd = rs1 - rs2
-#define SUBW(rd, rs1, rs2)          EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, 0b0111011))
+#define SUBW(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, 0b0111011))
 // rd = rs1 - rs2
-#define SUBxw(rd, rs1, rs2)         EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, rex.w?0b0110011:0b0111011))
+#define SUBxw(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, rex.w ? 0b0110011 : 0b0111011))
 // rd = rs1 - rs2
-#define SUBz(rd, rs1, rs2)          EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, rex.is32bits?0b0111011:0b0110011))
+#define SUBz(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, rex.is32bits ? 0b0111011 : 0b0110011))
 // rd = rs1<<rs2
-#define SLL(rd, rs1, rs2)           EMIT(R_type(0b0000000, rs2, rs1, 0b001, rd, 0b0110011))
+#define SLL(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b001, rd, 0b0110011))
 // rd = (rs1<rs2)?1:0
-#define SLT(rd, rs1, rs2)           EMIT(R_type(0b0000000, rs2, rs1, 0b010, rd, 0b0110011))
+#define SLT(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b010, rd, 0b0110011))
 // rd = (rs1<rs2)?1:0 Unsigned
-#define SLTU(rd, rs1, rs2)          EMIT(R_type(0b0000000, rs2, rs1, 0b011, rd, 0b0110011))
+#define SLTU(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b011, rd, 0b0110011))
 // rd = rs1 ^ rs2
-#define XOR(rd, rs1, rs2)           EMIT(R_type(0b0000000, rs2, rs1, 0b100, rd, 0b0110011))
+#define XOR(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b100, rd, 0b0110011))
 // rd = rs1 ^ rs2
-#define XORxw(rd, rs1, rs2)         do{ XOR(rd, rs1, rs2); if (!rex.w) ZEROUP(rd); }while(0)
+#define XORxw(rd, rs1, rs2)     \
+    do {                        \
+        XOR(rd, rs1, rs2);      \
+        if (!rex.w) ZEROUP(rd); \
+    } while (0)
 // rd = rs1>>rs2 logical
-#define SRL(rd, rs1, rs2)           EMIT(R_type(0b0000000, rs2, rs1, 0b101, rd, 0b0110011))
+#define SRL(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b101, rd, 0b0110011))
 // rd = rs1>>rs2 arithmetic
-#define SRA(rd, rs1, rs2)           EMIT(R_type(0b0100000, rs2, rs1, 0b101, rd, 0b0110011))
+#define SRA(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b101, rd, 0b0110011))
 // rd = rs1 | rs2
-#define OR(rd, rs1, rs2)            EMIT(R_type(0b0000000, rs2, rs1, 0b110, rd, 0b0110011))
+#define OR(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b110, rd, 0b0110011))
 // rd = rs1 & rs2
-#define AND(rd, rs1, rs2)           EMIT(R_type(0b0000000, rs2, rs1, 0b111, rd, 0b0110011))
+#define AND(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b111, rd, 0b0110011))
 
 // rd = rs1 (pseudo instruction)
-#define MV(rd, rs1)                 ADDI(rd, rs1, 0)
+#define MV(rd, rs1) ADDI(rd, rs1, 0)
 // rd = rs1 (pseudo instruction)
-#define MVxw(rd, rs1)               if(rex.w) {MV(rd, rs1);} else {AND(rd, rs1, xMASK);}
+#define MVxw(rd, rs1)        \
+    if (rex.w) {             \
+        MV(rd, rs1);         \
+    } else {                 \
+        AND(rd, rs1, xMASK); \
+    }
 // rd = rs1 (pseudo instruction)
-#define MVz(rd, rs1)               if(rex.is32bits) {AND(rd, rs1, xMASK);} else {MV(rd, rs1);}
+#define MVz(rd, rs1)         \
+    if (rex.is32bits) {      \
+        AND(rd, rs1, xMASK); \
+    } else {                 \
+        MV(rd, rs1);         \
+    }
 // rd = !rs1
-#define NOT(rd, rs1)                XORI(rd, rs1, -1)
+#define NOT(rd, rs1) XORI(rd, rs1, -1)
 // rd = -rs1
-#define NEG(rd, rs1)                SUB(rd, xZR, rs1)
+#define NEG(rd, rs1) SUB(rd, xZR, rs1)
 // rd = -rs1
-#define NEGxw(rd, rs1)              SUBxw(rd, xZR, rs1)
+#define NEGxw(rd, rs1) SUBxw(rd, xZR, rs1)
 // rd = rs1 == 0
-#define SEQZ(rd, rs1)               SLTIU(rd, rs1, 1)
+#define SEQZ(rd, rs1) SLTIU(rd, rs1, 1)
 // rd = rs1 != 0
-#define SNEZ(rd, rs1)               SLTU(rd, xZR, rs1)
+#define SNEZ(rd, rs1) SLTU(rd, xZR, rs1)
 
 
-#define BEQ(rs1, rs2, imm13)       EMIT(B_type(imm13, rs2, rs1, 0b000, 0b1100011))
-#define BNE(rs1, rs2, imm13)       EMIT(B_type(imm13, rs2, rs1, 0b001, 0b1100011))
-#define BLT(rs1, rs2, imm13)       EMIT(B_type(imm13, rs2, rs1, 0b100, 0b1100011))
-#define BGE(rs1, rs2, imm13)       EMIT(B_type(imm13, rs2, rs1, 0b101, 0b1100011))
-#define BLTU(rs1, rs2, imm13)      EMIT(B_type(imm13, rs2, rs1, 0b110, 0b1100011))
-#define BGEU(rs1, rs2, imm13)      EMIT(B_type(imm13, rs2, rs1, 0b111, 0b1100011))
+#define BEQ(rs1, rs2, imm13) EMIT(B_type(imm13, rs2, rs1, 0b000, 0b1100011))
+#define BNE(rs1, rs2, imm13) EMIT(B_type(imm13, rs2, rs1, 0b001, 0b1100011))
+#define BLT(rs1, rs2, imm13) EMIT(B_type(imm13, rs2, rs1, 0b100, 0b1100011))
+#define BGE(rs1, rs2, imm13) EMIT(B_type(imm13, rs2, rs1, 0b101, 0b1100011))
+#define BLTU(rs1, rs2, imm13) EMIT(B_type(imm13, rs2, rs1, 0b110, 0b1100011))
+#define BGEU(rs1, rs2, imm13) EMIT(B_type(imm13, rs2, rs1, 0b111, 0b1100011))
 
 // TODO: Find a better way to have conditionnal jumps? Imm is a relative jump address, so the the 2nd jump needs to be addapted
-#define BEQ_safe(rs1, rs2, imm)     if((imm)>-0x1000 && (imm)<0x1000) {BEQ(rs1, rs2, imm); NOP();} else {BNE(rs1, rs2, 8); B(imm-4);}
-#define BNE_safe(rs1, rs2, imm)     if((imm)>-0x1000 && (imm)<0x1000) {BNE(rs1, rs2, imm); NOP();} else {BEQ(rs1, rs2, 8); B(imm-4);}
-#define BLT_safe(rs1, rs2, imm)     if((imm)>-0x1000 && (imm)<0x1000) {BLT(rs1, rs2, imm); NOP();} else {BGE(rs2, rs1, 8); B(imm-4);}
-#define BGE_safe(rs1, rs2, imm)     if((imm)>-0x1000 && (imm)<0x1000) {BGE(rs1, rs2, imm); NOP();} else {BLT(rs2, rs1, 8); B(imm-4);}
-#define BLTU_safe(rs1, rs2, imm)    if((imm)>-0x1000 && (imm)<0x1000) {BLTU(rs1, rs2, imm); NOP();} else {BGEU(rs2, rs1, 8); B(imm-4);}
-#define BGEU_safe(rs1, rs2, imm)    if((imm)>-0x1000 && (imm)<0x1000) {BGEU(rs1, rs2, imm); NOP();} else {BLTU(rs2, rs1, 8); B(imm-4);}
+#define BEQ_safe(rs1, rs2, imm)                             \
+    if ((imm) > -0x1000 && (imm) < 0x1000) {                \
+        BEQ(rs1, rs2, imm);                                 \
+        NOP();                                              \
+    } else {                                                \
+        BNE(rs1, rs2, 8); /* !(rs1 == rs2): skip the B */  \
+        B((imm) - 4);                                       \
+    }
+#define BNE_safe(rs1, rs2, imm)                             \
+    if ((imm) > -0x1000 && (imm) < 0x1000) {                \
+        BNE(rs1, rs2, imm);                                 \
+        NOP();                                              \
+    } else {                                                \
+        BEQ(rs1, rs2, 8); /* !(rs1 != rs2): skip the B */  \
+        B((imm) - 4);                                       \
+    }
+#define BLT_safe(rs1, rs2, imm)                             \
+    if ((imm) > -0x1000 && (imm) < 0x1000) {                \
+        BLT(rs1, rs2, imm);                                 \
+        NOP();                                              \
+    } else {                                                \
+        BGE(rs1, rs2, 8); /* !(rs1 < rs2): skip the B */   \
+        B((imm) - 4);                                       \
+    }
+#define BGE_safe(rs1, rs2, imm)                             \
+    if ((imm) > -0x1000 && (imm) < 0x1000) {                \
+        BGE(rs1, rs2, imm);                                 \
+        NOP();                                              \
+    } else {                                                \
+        BLT(rs1, rs2, 8); /* !(rs1 >= rs2): skip the B */  \
+        B((imm) - 4);                                       \
+    }
+#define BLTU_safe(rs1, rs2, imm)                            \
+    if ((imm) > -0x1000 && (imm) < 0x1000) {                \
+        BLTU(rs1, rs2, imm);                                \
+        NOP();                                              \
+    } else {                                                \
+        BGEU(rs1, rs2, 8); /* !(rs1 < rs2): skip the B */  \
+        B((imm) - 4);                                       \
+    }
+#define BGEU_safe(rs1, rs2, imm)                            \
+    if ((imm) > -0x1000 && (imm) < 0x1000) {                \
+        BGEU(rs1, rs2, imm);                                \
+        NOP();                                              \
+    } else {                                                \
+        BLTU(rs1, rs2, 8); /* !(rs1 >= rs2): skip the B */ \
+        B((imm) - 4);                                       \
+    }
 
-#define BEQZ(rs1, imm13)           BEQ(rs1, 0, imm13)
-#define BNEZ(rs1, imm13)           BNE(rs1, 0, imm13)
+#define BEQZ(rs1, imm13) BEQ(rs1, 0, imm13)
+#define BNEZ(rs1, imm13) BNE(rs1, 0, imm13)
 
-#define BEQZ_safe(rs1, imm)         if((imm)>-0x1000 && (imm)<0x1000) {BEQZ(rs1, imm); NOP();} else {BNEZ(rs1, 8); B(imm-4);}
-#define BNEZ_safe(rs1, imm)         if((imm)>-0x1000 && (imm)<0x1000) {BNEZ(rs1, imm); NOP();} else {BEQZ(rs1, 8); B(imm-4);}
+#define BEQZ_safe(rs1, imm)                                 \
+    if ((imm) > -0x1000 && (imm) < 0x1000) {                \
+        BEQZ(rs1, imm);                                     \
+        NOP();                                              \
+    } else {                                                \
+        BNEZ(rs1, 8); /* !(rs1 == 0): skip the B */         \
+        B((imm) - 4);                                       \
+    }
+#define BNEZ_safe(rs1, imm)                                 \
+    if ((imm) > -0x1000 && (imm) < 0x1000) {                \
+        BNEZ(rs1, imm);                                     \
+        NOP();                                              \
+    } else {                                                \
+        BEQZ(rs1, 8); /* !(rs1 != 0): skip the B */         \
+        B((imm) - 4);                                       \
+    }
 
 // rd = 4-bytes[rs1+imm12] signed extended
-#define LW(rd, rs1, imm12)          EMIT(I_type(imm12, rs1, 0b010, rd, 0b0000011))
+#define LW(rd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b010, rd, 0b0000011))
 // rd = 2-bytes[rs1+imm12] signed extended
-#define LH(rd, rs1, imm12)          EMIT(I_type(imm12, rs1, 0b001, rd, 0b0000011))
+#define LH(rd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b001, rd, 0b0000011))
 // rd = byte[rs1+imm12] signed extended
-#define LB(rd, rs1, imm12)          EMIT(I_type(imm12, rs1, 0b000, rd, 0b0000011))
+#define LB(rd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b000, rd, 0b0000011))
 // rd = 2-bytes[rs1+imm12] zero extended
-#define LHU(rd, rs1, imm12)         EMIT(I_type(imm12, rs1, 0b101, rd, 0b0000011))
+#define LHU(rd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b101, rd, 0b0000011))
 // rd = byte[rs1+imm12] zero extended
-#define LBU(rd, rs1, imm12)         EMIT(I_type(imm12, rs1, 0b100, rd, 0b0000011))
+#define LBU(rd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b100, rd, 0b0000011))
 // byte[rs1+imm12] = rs2
-#define SB(rs2, rs1, imm12)         EMIT(S_type(imm12, rs2, rs1, 0b000, 0b0100011))
+#define SB(rs2, rs1, imm12) EMIT(S_type(imm12, rs2, rs1, 0b000, 0b0100011))
 // 2-bytes[rs1+imm12] = rs2
-#define SH(rs2, rs1, imm12)         EMIT(S_type(imm12, rs2, rs1, 0b001, 0b0100011))
+#define SH(rs2, rs1, imm12) EMIT(S_type(imm12, rs2, rs1, 0b001, 0b0100011))
 // 4-bytes[rs1+imm12] = rs2
-#define SW(rs2, rs1, imm12)         EMIT(S_type(imm12, rs2, rs1, 0b010, 0b0100011))
-
-#define PUSH1(reg)                  do {SD(reg, xRSP, -8); SUBI(xRSP, xRSP, 8);} while(0)
-#define POP1(reg)                   do {LD(reg, xRSP, 0); if (reg!=xRSP) ADDI(xRSP, xRSP, 8);} while(0)
-#define PUSH1_32(reg)               do {SW(reg, xRSP, -4); SUBIW(xRSP, xRSP, 4);} while(0)
-#define POP1_32(reg)                do {LWU(reg, xRSP, 0); if (reg!=xRSP) ADDIW(xRSP, xRSP, 4);} while(0)
-
-#define POP1z(reg)                  if(rex.is32bits) {POP1_32(reg);} else {POP1(reg);}
-#define PUSH1z(reg)                 if(rex.is32bits) {PUSH1_32(reg);} else {PUSH1(reg);}
+#define SW(rs2, rs1, imm12) EMIT(S_type(imm12, rs2, rs1, 0b010, 0b0100011))
+
+#define PUSH1(reg)           \
+    do {                     \
+        SD(reg, xRSP, -8);   \
+        SUBI(xRSP, xRSP, 8); \
+    } while (0)
+#define POP1(reg)                             \
+    do {                                      \
+        LD(reg, xRSP, 0);                     \
+        if (reg != xRSP) ADDI(xRSP, xRSP, 8); \
+    } while (0)
+#define PUSH1_32(reg)         \
+    do {                      \
+        SW(reg, xRSP, -4);    \
+        SUBIW(xRSP, xRSP, 4); \
+    } while (0)
+#define POP1_32(reg)                           \
+    do {                                       \
+        LWU(reg, xRSP, 0);                     \
+        if (reg != xRSP) ADDIW(xRSP, xRSP, 4); \
+    } while (0)
+
+#define POP1z(reg)      \
+    if (rex.is32bits) { \
+        POP1_32(reg);   \
+    } else {            \
+        POP1(reg);      \
+    }
+#define PUSH1z(reg)     \
+    if (rex.is32bits) { \
+        PUSH1_32(reg);  \
+    } else {            \
+        PUSH1(reg);     \
+    }
 
-#define FENCE_gen(pred, succ)       (((pred)<<24) | ((succ)<<20) | 0b0001111)
-#define FENCE()                     EMIT(FENCE_gen(3, 3))
+#define FENCE_gen(pred, succ) (((pred) << 24) | ((succ) << 20) | 0b0001111)
+#define FENCE() EMIT(FENCE_gen(3, 3))
 
-#define FENCE_I_gen()               ((0b001<<12) | 0b0001111)
-#define FENCE_I()                   EMIT(FENCE_I_gen())
+#define FENCE_I_gen() ((0b001 << 12) | 0b0001111)
+#define FENCE_I() EMIT(FENCE_I_gen())
 
-#define EBREAK()                    EMIT(I_type(1, 0, 0, 0, 0b1110011))
+#define EBREAK() EMIT(I_type(1, 0, 0, 0, 0b1110011))
 
 // RV64I
-#define LWU(rd, rs1, imm12)         EMIT(I_type(imm12, rs1, 0b110, rd, 0b0000011))
+#define LWU(rd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b110, rd, 0b0000011))
 
 // rd = [rs1 + imm12]
-#define LD(rd, rs1, imm12)          EMIT(I_type(imm12, rs1, 0b011, rd, 0b0000011))
+#define LD(rd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b011, rd, 0b0000011))
 // rd = [rs1 + imm12]
-#define LDxw(rd, rs1, imm12)        EMIT(I_type(imm12, rs1, 0b011<<(1-rex.w), rd, 0b0000011))
+#define LDxw(rd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b011 << (1 - rex.w), rd, 0b0000011))
 // rd = [rs1 + imm12]
-#define LDz(rd, rs1, imm12)         EMIT(I_type(imm12, rs1, 0b011<<rex.is32bits, rd, 0b0000011))
+#define LDz(rd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b011 << rex.is32bits, rd, 0b0000011))
 // [rs1 + imm12] = rs2
-#define SD(rs2, rs1, imm12)         EMIT(S_type(imm12, rs2, rs1, 0b011, 0b0100011))
+#define SD(rs2, rs1, imm12) EMIT(S_type(imm12, rs2, rs1, 0b011, 0b0100011))
 // [rs1 + imm12] = rs2
-#define SDxw(rs2, rs1, imm12)       EMIT(S_type(imm12, rs2, rs1, 0b010+rex.w, 0b0100011))
+#define SDxw(rs2, rs1, imm12) EMIT(S_type(imm12, rs2, rs1, 0b010 + rex.w, 0b0100011))
 // [rs1 + imm12] = rs2
-#define SDz(rs2, rs1, imm12)        EMIT(S_type(imm12, rs2, rs1, 0b010+(1-rex.is32bits), 0b0100011))
+#define SDz(rs2, rs1, imm12) EMIT(S_type(imm12, rs2, rs1, 0b010 + (1 - rex.is32bits), 0b0100011))
 
 // Shift Left Immediate
-#define SLLI(rd, rs1, imm6)         EMIT(I_type(imm6, rs1, 0b001, rd, 0b0010011))
+#define SLLI(rd, rs1, imm6) EMIT(I_type(imm6, rs1, 0b001, rd, 0b0010011))
 // Shift Right Logical Immediate
-#define SRLI(rd, rs1, imm6)         EMIT(I_type(imm6, rs1, 0b101, rd, 0b0010011))
+#define SRLI(rd, rs1, imm6) EMIT(I_type(imm6, rs1, 0b101, rd, 0b0010011))
 // Shift Right Arithmetic Immediate
-#define SRAI(rd, rs1, imm6)         EMIT(I_type((imm6)|(0b010000<<6), rs1, 0b101, rd, 0b0010011))
+#define SRAI(rd, rs1, imm6) EMIT(I_type((imm6) | (0b010000 << 6), rs1, 0b101, rd, 0b0010011))
 
 // rd = rs1 + imm12
-#define ADDIW(rd, rs1, imm12)       EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, 0b0011011))
+#define ADDIW(rd, rs1, imm12) EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, 0b0011011))
 // rd = rs1 - imm12
-#define SUBIW(rd, rs1, imm12)       EMIT(I_type((-imm12)&0b111111111111, rs1, 0b000, rd, 0b0011011))
+#define SUBIW(rd, rs1, imm12) EMIT(I_type((-(imm12)) & 0b111111111111, rs1, 0b000, rd, 0b0011011))
 // rd = rs1 + imm12
-#define ADDIxw(rd, rs1, imm12)      EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, rex.w?0b0010011:0b0011011))
+#define ADDIxw(rd, rs1, imm12) EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, rex.w ? 0b0010011 : 0b0011011))
 // rd = rs1 + imm12
-#define ADDIz(rd, rs1, imm12)       EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, rex.is32bits?0b0011011:0b0010011))
+#define ADDIz(rd, rs1, imm12) EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, rex.is32bits ? 0b0011011 : 0b0010011))
 
 // rd = rs1 + (rs2 << imm2)
-#define ADDSL(rd, rs1, rs2, imm2, scratch) if (!(imm2)) { \
-        ADD(rd, rs1, rs2);              \
-    } else if (rv64_zba) {              \
-        SHxADD(rd, rs2, imm2, rs1);     \
-    } else if (rv64_xtheadba) {         \
-        TH_ADDSL(rd, rs1, rs2, imm2);   \
-    } else {                            \
-        SLLI(scratch, rs2, imm2);       \
-        ADD(rd, rs1, scratch);          \
-    }                                   \
-
-#define SEXT_W(rd, rs1)             ADDIW(rd, rs1, 0)
+#define ADDSL(rd, rs1, rs2, imm2, scratch) \
+    if (!(imm2)) {                         \
+        ADD(rd, rs1, rs2);                 \
+    } else if (rv64_zba) {                 \
+        SHxADD(rd, rs2, imm2, rs1);        \
+    } else if (rv64_xtheadba) {            \
+        TH_ADDSL(rd, rs1, rs2, imm2);      \
+    } else {                               \
+        SLLI(scratch, rs2, imm2);          \
+        ADD(rd, rs1, scratch);             \
+    }
+
+#define SEXT_W(rd, rs1) ADDIW(rd, rs1, 0)
 
 // rd = rs1<<rs2
-#define SLLW(rd, rs1, rs2)           EMIT(R_type(0b0000000, rs2, rs1, 0b001, rd, 0b0111011))
+#define SLLW(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b001, rd, 0b0111011))
 // rd = rs1>>rs2 logical
-#define SRLW(rd, rs1, rs2)           EMIT(R_type(0b0000000, rs2, rs1, 0b101, rd, 0b0111011))
+#define SRLW(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b101, rd, 0b0111011))
 // rd = rs1>>rs2 arithmetic
-#define SRAW(rd, rs1, rs2)           EMIT(R_type(0b0100000, rs2, rs1, 0b101, rd, 0b0111011))
-
-#define SLLxw(rd, rs1, rs2)          EMIT(R_type(0b0000000, rs2, rs1, 0b001, rd, rex.w?0b0110011:0b0111011))
-#define SRLxw(rd, rs1, rs2)          EMIT(R_type(0b0000000, rs2, rs1, 0b101, rd, rex.w?0b0110011:0b0111011))
-#define SRAxw(rd, rs1, rs2)          if(rex.w) {SRA(rd, rs1, rs2);} else {SRAW(rd, rs1, rs2); ZEROUP(rd);}
+#define SRAW(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b101, rd, 0b0111011))
+
+#define SLLxw(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b001, rd, rex.w ? 0b0110011 : 0b0111011))
+#define SRLxw(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b101, rd, rex.w ? 0b0110011 : 0b0111011))
+#define SRAxw(rd, rs1, rs2) \
+    if (rex.w) {            \
+        SRA(rd, rs1, rs2);  \
+    } else {                \
+        SRAW(rd, rs1, rs2); \
+        ZEROUP(rd);         \
+    }
 
 // Shift Left Immediate, 32-bit, sign-extended
-#define SLLIW(rd, rs1, imm5)        EMIT(I_type(imm5, rs1, 0b001, rd, 0b0011011))
+#define SLLIW(rd, rs1, imm5) EMIT(I_type(imm5, rs1, 0b001, rd, 0b0011011))
 // Shift Left Immediate
-#define SLLIxw(rd, rs1, imm)        if (rex.w) { SLLI(rd, rs1, imm); } else { SLLIW(rd, rs1, imm); }
+#define SLLIxw(rd, rs1, imm) \
+    if (rex.w) {             \
+        SLLI(rd, rs1, imm);  \
+    } else {                 \
+        SLLIW(rd, rs1, imm); \
+    }
 // Shift Right Logical Immediate, 32-bit, sign-extended
-#define SRLIW(rd, rs1, imm5)        EMIT(I_type(imm5, rs1, 0b101, rd, 0b0011011))
+#define SRLIW(rd, rs1, imm5) EMIT(I_type(imm5, rs1, 0b101, rd, 0b0011011))
 // Shift Right Logical Immediate
-#define SRLIxw(rd, rs1, imm)        if (rex.w) { SRLI(rd, rs1, imm); } else { SRLIW(rd, rs1, imm); }
+#define SRLIxw(rd, rs1, imm) \
+    if (rex.w) {             \
+        SRLI(rd, rs1, imm);  \
+    } else {                 \
+        SRLIW(rd, rs1, imm); \
+    }
 // Shift Right Arithmetic Immediate, 32-bit, sign-extended
-#define SRAIW(rd, rs1, imm5)        EMIT(I_type((imm5)|(0b0100000<<5), rs1, 0b101, rd, 0b0011011))
+#define SRAIW(rd, rs1, imm5) EMIT(I_type((imm5) | (0b0100000 << 5), rs1, 0b101, rd, 0b0011011))
 // Shift Right Arithmetic Immediate
-#define SRAIxw(rd, rs1, imm)        if (rex.w) { SRAI(rd, rs1, imm); } else { SRAIW(rd, rs1, imm); }
+#define SRAIxw(rd, rs1, imm) \
+    if (rex.w) {             \
+        SRAI(rd, rs1, imm);  \
+    } else {                 \
+        SRAIW(rd, rs1, imm); \
+    }
 
-#define CSRRW(rd, rs1, csr)         EMIT(I_type(csr, rs1, 0b001, rd, 0b1110011))
-#define CSRRS(rd, rs1, csr)         EMIT(I_type(csr, rs1, 0b010, rd, 0b1110011))
-#define CSRRC(rd, rs1, csr)         EMIT(I_type(csr, rs1, 0b011, rd, 0b1110011))
-#define CSRRWI(rd, imm, csr)        EMIT(I_type(csr, imm, 0b101, rd, 0b1110011))
-#define CSRRSI(rd, imm, csr)        EMIT(I_type(csr, imm, 0b110, rd, 0b1110011))
-#define CSRRCI(rd, imm, csr)        EMIT(I_type(csr, imm, 0b111, rd, 0b1110011))
+#define CSRRW(rd, rs1, csr) EMIT(I_type(csr, rs1, 0b001, rd, 0b1110011))
+#define CSRRS(rd, rs1, csr) EMIT(I_type(csr, rs1, 0b010, rd, 0b1110011))
+#define CSRRC(rd, rs1, csr) EMIT(I_type(csr, rs1, 0b011, rd, 0b1110011))
+#define CSRRWI(rd, imm, csr) EMIT(I_type(csr, imm, 0b101, rd, 0b1110011))
+#define CSRRSI(rd, imm, csr) EMIT(I_type(csr, imm, 0b110, rd, 0b1110011))
+#define CSRRCI(rd, imm, csr) EMIT(I_type(csr, imm, 0b111, rd, 0b1110011))
 
 // RV32M
 // rd =(lower) rs1 * rs2 (both signed)
-#define MUL(rd, rs1, rs2)           EMIT(R_type(0b0000001, rs2, rs1, 0b000, rd, 0b0110011))
+#define MUL(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b000, rd, 0b0110011))
 // rd =(upper) rs1 * rs2 (both signed)
-#define MULH(rd, rs1, rs2)          EMIT(R_type(0b0000001, rs2, rs1, 0b001, rd, 0b0110011))
+#define MULH(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b001, rd, 0b0110011))
 // rd =(upper) (signed)rs1 * (unsigned)rs2
-#define MULHSU(rd, rs1, rs2)        EMIT(R_type(0b0000001, rs2, rs1, 0b010, rd, 0b0110011))
+#define MULHSU(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b010, rd, 0b0110011))
 // rd =(upper) rs1 * rs2 (both unsigned)
-#define MULHU(rd, rs1, rs2)         EMIT(R_type(0b0000001, rs2, rs1, 0b011, rd, 0b0110011))
+#define MULHU(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b011, rd, 0b0110011))
 // rd =(upper) rs1 / rs2
-#define DIV(rd, rs1, rs2)           EMIT(R_type(0b0000001, rs2, rs1, 0b100, rd, 0b0110011))
-#define DIVU(rd, rs1, rs2)          EMIT(R_type(0b0000001, rs2, rs1, 0b101, rd, 0b0110011))
+#define DIV(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b100, rd, 0b0110011))
+#define DIVU(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b101, rd, 0b0110011))
 // rd = rs1 mod rs2
-#define REM(rd, rs1, rs2)           EMIT(R_type(0b0000001, rs2, rs1, 0b110, rd, 0b0110011))
-#define REMU(rd, rs1, rs2)          EMIT(R_type(0b0000001, rs2, rs1, 0b111, rd, 0b0110011))
+#define REM(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b110, rd, 0b0110011))
+#define REMU(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b111, rd, 0b0110011))
 
 // RV64M
 // rd = rs1 * rs2
-#define MULW(rd, rs1, rs2)          EMIT(R_type(0b0000001, rs2, rs1, 0b000, rd, 0b0111011))
+#define MULW(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b000, rd, 0b0111011))
 // rd = rs1 * rs2
-#define MULxw(rd, rs1, rs2)         EMIT(R_type(0b0000001, rs2, rs1, 0b000, rd, rex.w?0b0110011:0b0111011))
+#define MULxw(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b000, rd, rex.w ? 0b0110011 : 0b0111011))
 // rd = rs1 / rs2
-#define DIVW(rd, rs1, rs2)          EMIT(R_type(0b0000001, rs2, rs1, 0b100, rd, 0b0111011))
-#define DIVxw(rd, rs1, rs2)         EMIT(R_type(0b0000001, rs2, rs1, 0b100, rd, rex.w?0b0110011:0b0111011))
-#define DIVUW(rd, rs1, rs2)         EMIT(R_type(0b0000001, rs2, rs1, 0b101, rd, 0b0111011))
-#define DIVUxw(rd, rs1, rs2)        EMIT(R_type(0b0000001, rs2, rs1, 0b101, rd, rex.w?0b0110011:0b0111011))
+#define DIVW(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b100, rd, 0b0111011))
+#define DIVxw(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b100, rd, rex.w ? 0b0110011 : 0b0111011))
+#define DIVUW(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b101, rd, 0b0111011))
+#define DIVUxw(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b101, rd, rex.w ? 0b0110011 : 0b0111011))
 // rd = rs1 mod rs2
-#define REMW(rd, rs1, rs2)          EMIT(R_type(0b0000001, rs2, rs1, 0b110, rd, 0b0111011))
-#define REMxw(rd, rs1, rs2)         EMIT(R_type(0b0000001, rs2, rs1, 0b110, rd, rex.w?0b0110011:0b0111011))
-#define REMUW(rd, rs1, rs2)         EMIT(R_type(0b0000001, rs2, rs1, 0b111, rd, 0b0111011))
-#define REMUxw(rd, rs1, rs2)        EMIT(R_type(0b0000001, rs2, rs1, 0b111, rd, rex.w?0b0110011:0b0111011))
+#define REMW(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b110, rd, 0b0111011))
+#define REMxw(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b110, rd, rex.w ? 0b0110011 : 0b0111011))
+#define REMUW(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b111, rd, 0b0111011))
+#define REMUxw(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b111, rd, rex.w ? 0b0110011 : 0b0111011))
 
-#define AQ_RL(f5, aq, rl) ((f5 << 2) | ((aq&1) << 1) | (rl&1))
+#define AQ_RL(f5, aq, rl) (((f5) << 2) | (((aq) & 1) << 1) | ((rl) & 1)) // f5 sits above the aq (bit 1) and rl (bit 0) flags
 
 // RV32A
-#define LR_W(rd, rs1, aq, rl)       EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010, rd, 0b0101111))
-#define SC_W(rd, rs2, rs1, aq, rl)  EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b010, rd, 0b0101111))
+#define LR_W(rd, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010, rd, 0b0101111))
+#define SC_W(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b010, rd, 0b0101111))
 
-#define AMOSWAP_W(rd, rs2, rs1, aq, rl)  EMIT(R_type(AQ_RL(0b00001, aq, rl), rs2, rs1, 0b010, rd, 0b0101111))
+#define AMOSWAP_W(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00001, aq, rl), rs2, rs1, 0b010, rd, 0b0101111))
 
 // RV64A
-#define LR_D(rd, rs1, aq, rl)       EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b011, rd, 0b0101111))
-#define SC_D(rd, rs2, rs1, aq, rl)  EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b011, rd, 0b0101111))
+#define LR_D(rd, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b011, rd, 0b0101111))
+#define SC_D(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b011, rd, 0b0101111))
 
-#define LRxw(rd, rs1, aq, rl)       EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010|rex.w, rd, 0b0101111))
-#define SCxw(rd, rs2, rs1, aq, rl)  EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b010|rex.w, rd, 0b0101111))
+#define LRxw(rd, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010 | rex.w, rd, 0b0101111))
+#define SCxw(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b010 | rex.w, rd, 0b0101111))
 
 #define AMOSWAP_D(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00001, aq, rl), rs2, rs1, 0b011, rd, 0b0101111))
 
 // RV32F
 // Read round mode
-#define FRRM(rd)                    CSRRS(rd, xZR, 0x002)
+#define FRRM(rd) CSRRS(rd, xZR, 0x002)
 // Swap round mode
-#define FSRM(rd, rs)                CSRRW(rd, rs, 0x002)
+#define FSRM(rd, rs) CSRRW(rd, rs, 0x002)
 // Write FP exception flags, immediate
-#define FSFLAGSI(imm)               CSRRWI(xZR, imm, 0x0001)
+#define FSFLAGSI(imm) CSRRWI(xZR, imm, 0x0001)
 // Read  FP exception flags to rd
-#define FRFLAGS(rd)                 CSRRS(rd, xZR, 0x0001)
+#define FRFLAGS(rd) CSRRS(rd, xZR, 0x0001)
 // Inexact
-#define FR_NX   0
+#define FR_NX 0
 // Underflow
-#define FR_UF   1
+#define FR_UF 1
 // Overflow
-#define FR_OF   2
+#define FR_OF 2
 // Divide by Zero
-#define FR_DZ   3
+#define FR_DZ 3
 // Invalid Operation
-#define FR_NV   4
+#define FR_NV 4
 
 // Round to Nearest, ties to Even
-#define RD_RNE      0b000
+#define RD_RNE 0b000
 // Round towards Zero
-#define RD_RTZ      0b001
+#define RD_RTZ 0b001
 // Round Down (towards −∞)
-#define RD_RDN      0b010
+#define RD_RDN 0b010
 // Round Up (towards +∞)
-#define RD_RUP      0b011
+#define RD_RUP 0b011
 // Round to Nearest, ties to Max Magnitude
-#define RD_RMM      0b100
+#define RD_RMM 0b100
 // In instruction’s rm field, selects dynamic rounding mode;
-#define RD_RM       0b111
-#define RD_DYN      RD_RM
+#define RD_RM 0b111
+#define RD_DYN RD_RM
 
 // load single precision from rs1+imm12 to frd
-#define FLW(frd, rs1, imm12)        EMIT(I_type(imm12, rs1, 0b010, frd, 0b0000111))
+#define FLW(frd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b010, frd, 0b0000111))
 // store single precision frs2 to rs1+imm12
-#define FSW(frs2, rs1, imm12)       EMIT(S_type(imm12, frs2, rs1, 0b010, 0b0100111))
+#define FSW(frs2, rs1, imm12) EMIT(S_type(imm12, frs2, rs1, 0b010, 0b0100111))
 // store rs1 with rs2 sign bit to rd
-#define FSGNJS(rd, rs1, rs2)        EMIT(R_type(0b0010000, rs2, rs1, 0b000, rd, 0b1010011))
+#define FSGNJS(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b000, rd, 0b1010011))
 // move rs1 to rd
-#define FMVS(rd, rs1)               FSGNJS(rd, rs1, rs1)
+#define FMVS(rd, rs1) FSGNJS(rd, rs1, rs1)
 // store rs1 with oposite rs2 sign bit to rd
-#define FSGNJNS(rd, rs1, rs2)       EMIT(R_type(0b0010000, rs2, rs1, 0b001, rd, 0b1010011))
+#define FSGNJNS(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b001, rd, 0b1010011))
 // -rs1 => rd
-#define FNEGS(rd, rs1)              FSGNJNS(rd, rs1, rs1)
+#define FNEGS(rd, rs1) FSGNJNS(rd, rs1, rs1)
 // store rs1 with rs1^rs2 sign bit to rd
-#define FSGNJXS(rd, rs1, rs2)       EMIT(R_type(0b0010000, rs2, rs1, 0b010, rd, 0b1010011))
+#define FSGNJXS(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b010, rd, 0b1010011))
 // |rs1| => rd
-#define FABSS(rd, rs1)              FSGNJXS(rd, rs1, rs1)
+#define FABSS(rd, rs1) FSGNJXS(rd, rs1, rs1)
 // Move from Single
-#define FMVXW(rd, frs1)             EMIT(R_type(0b1110000, 0b00000, frs1, 0b000, rd, 0b1010011))
+#define FMVXW(rd, frs1) EMIT(R_type(0b1110000, 0b00000, frs1, 0b000, rd, 0b1010011))
 // Move to Single
-#define FMVWX(frd, rs1)             EMIT(R_type(0b1111000, 0b00000, rs1, 0b000, frd, 0b1010011))
+#define FMVWX(frd, rs1) EMIT(R_type(0b1111000, 0b00000, rs1, 0b000, frd, 0b1010011))
 // Convert from signed 32bits to Single
-#define FCVTSW(frd, rs1, rm)        EMIT(R_type(0b1101000, 0b00000, rs1, rm, frd, 0b1010011))
+#define FCVTSW(frd, rs1, rm) EMIT(R_type(0b1101000, 0b00000, rs1, rm, frd, 0b1010011))
 // Convert from Single to signed 32bits (trucated)
-#define FCVTWS(rd, frs1, rm)        EMIT(R_type(0b1100000, 0b00000, frs1, rm, rd, 0b1010011))
+#define FCVTWS(rd, frs1, rm) EMIT(R_type(0b1100000, 0b00000, frs1, rm, rd, 0b1010011))
 
-#define FADDS(frd, frs1, frs2)      EMIT(R_type(0b0000000, frs2, frs1, 0b000, frd, 0b1010011))
-#define FSUBS(frd, frs1, frs2)      EMIT(R_type(0b0000100, frs2, frs1, 0b000, frd, 0b1010011))
-#define FMULS(frd, frs1, frs2)      EMIT(R_type(0b0001000, frs2, frs1, 0b000, frd, 0b1010011))
-#define FDIVS(frd, frs1, frs2)      EMIT(R_type(0b0001100, frs2, frs1, 0b000, frd, 0b1010011))
-#define FSQRTS(frd, frs1)           EMIT(R_type(0b0101100, 0b00000, frs1, 0b000, frd, 0b1010011))
-#define FMINS(frd, frs1, frs2)      EMIT(R_type(0b0010100, frs2, frs1, 0b000, frd, 0b1010011))
-#define FMAXS(frd, frs1, frs2)      EMIT(R_type(0b0010100, frs2, frs1, 0b001, frd, 0b1010011))
+#define FADDS(frd, frs1, frs2) EMIT(R_type(0b0000000, frs2, frs1, 0b000, frd, 0b1010011))
+#define FSUBS(frd, frs1, frs2) EMIT(R_type(0b0000100, frs2, frs1, 0b000, frd, 0b1010011))
+#define FMULS(frd, frs1, frs2) EMIT(R_type(0b0001000, frs2, frs1, 0b000, frd, 0b1010011))
+#define FDIVS(frd, frs1, frs2) EMIT(R_type(0b0001100, frs2, frs1, 0b000, frd, 0b1010011))
+#define FSQRTS(frd, frs1) EMIT(R_type(0b0101100, 0b00000, frs1, 0b000, frd, 0b1010011))
+#define FMINS(frd, frs1, frs2) EMIT(R_type(0b0010100, frs2, frs1, 0b000, frd, 0b1010011))
+#define FMAXS(frd, frs1, frs2) EMIT(R_type(0b0010100, frs2, frs1, 0b001, frd, 0b1010011))
 
 // compare
-#define FEQS(rd, frs1, frs2)        EMIT(R_type(0b1010000, frs2, frs1, 0b010, rd, 0b1010011))
-#define FLTS(rd, frs1, frs2)        EMIT(R_type(0b1010000, frs2, frs1, 0b001, rd, 0b1010011))
-#define FLES(rd, frs1, frs2)        EMIT(R_type(0b1010000, frs2, frs1, 0b000, rd, 0b1010011))
+#define FEQS(rd, frs1, frs2) EMIT(R_type(0b1010000, frs2, frs1, 0b010, rd, 0b1010011))
+#define FLTS(rd, frs1, frs2) EMIT(R_type(0b1010000, frs2, frs1, 0b001, rd, 0b1010011))
+#define FLES(rd, frs1, frs2) EMIT(R_type(0b1010000, frs2, frs1, 0b000, rd, 0b1010011))
 
 // RV64F
 // Convert from signed 64bits to Single
-#define FCVTSL(frd, rs1, rm)        EMIT(R_type(0b1101000, 0b00010, rs1, rm, frd, 0b1010011))
+#define FCVTSL(frd, rs1, rm) EMIT(R_type(0b1101000, 0b00010, rs1, rm, frd, 0b1010011))
 // Convert from unsigned 64bits to Single
-#define FCVTSLU(frd, rs1, rm)       EMIT(R_type(0b1101000, 0b00011, rs1, rm, frd, 0b1010011))
+#define FCVTSLU(frd, rs1, rm) EMIT(R_type(0b1101000, 0b00011, rs1, rm, frd, 0b1010011))
 // Convert from Single to signed 64bits
-#define FCVTLS(rd, frs1, rm)        EMIT(R_type(0b1100000, 0b00010, frs1, rm, rd, 0b1010011))
+#define FCVTLS(rd, frs1, rm) EMIT(R_type(0b1100000, 0b00010, frs1, rm, rd, 0b1010011))
 // Convert from Single to unsigned 64bits
-#define FCVTLUS(rd, frs1, rm)       EMIT(R_type(0b1100000, 0b00011, frs1, rm, rd, 0b1010011))
+#define FCVTLUS(rd, frs1, rm) EMIT(R_type(0b1100000, 0b00011, frs1, rm, rd, 0b1010011))
 // onvert from Single to signed 32/64bits (trucated)
-#define FCVTSxw(rd, frs1, rm)       EMIT(R_type(0b1100000, rex.w?0b00010:0b00000, frs1, rm, rd, 0b1010011))
+#define FCVTSxw(rd, frs1, rm) EMIT(R_type(0b1100000, rex.w ? 0b00010 : 0b00000, frs1, rm, rd, 0b1010011))
 
 // RV32D
 // load double precision from rs1+imm12 to frd
-#define FLD(frd, rs1, imm12)        EMIT(I_type(imm12, rs1, 0b011, frd, 0b0000111))
+#define FLD(frd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b011, frd, 0b0000111))
 // store double precision frs2 to rs1+imm12
-#define FSD(frs2, rs1, imm12)       EMIT(S_type(imm12, frs2, rs1, 0b011, 0b0100111))
+#define FSD(frs2, rs1, imm12) EMIT(S_type(imm12, frs2, rs1, 0b011, 0b0100111))
 // Convert Double frs1 to Single frd
-#define FCVTSD(frd, frs1)           EMIT(R_type(0b0100000, 0b00001, frs1, 0b000, frd, 0b1010011))
+#define FCVTSD(frd, frs1) EMIT(R_type(0b0100000, 0b00001, frs1, 0b000, frd, 0b1010011))
 // Convert Single frs1 to Double frd
-#define FCVTDS(frd, frs1)           EMIT(R_type(0b0100001, 0b00000, frs1, 0b000, frd, 0b1010011))
+#define FCVTDS(frd, frs1) EMIT(R_type(0b0100001, 0b00000, frs1, 0b000, frd, 0b1010011))
 // Convert from Double to signed 32bits
-#define FCVTWD(rd, frs1, rm)        EMIT(R_type(0b1100001, 0b00000, frs1, rm, rd, 0b1010011))
+#define FCVTWD(rd, frs1, rm) EMIT(R_type(0b1100001, 0b00000, frs1, rm, rd, 0b1010011))
 // Convert from Double to unsigned 32bits
-#define FCVTWUD(rd, frs1, rm)       EMIT(R_type(0b1100001, 0b00001, frs1, rm, rd, 0b1010011))
+#define FCVTWUD(rd, frs1, rm) EMIT(R_type(0b1100001, 0b00001, frs1, rm, rd, 0b1010011))
 // store rs1 with rs2 sign bit to rd
-#define FSGNJD(rd, rs1, rs2)        EMIT(R_type(0b0010001, rs2, rs1, 0b000, rd, 0b1010011))
+#define FSGNJD(rd, rs1, rs2) EMIT(R_type(0b0010001, rs2, rs1, 0b000, rd, 0b1010011))
 // move rs1 to rd
-#define FMVD(rd, rs1)               FSGNJD(rd, rs1, rs1)
+#define FMVD(rd, rs1) FSGNJD(rd, rs1, rs1)
 // store rs1 with oposite rs2 sign bit to rd
-#define FSGNJND(rd, rs1, rs2)       EMIT(R_type(0b0010001, rs2, rs1, 0b001, rd, 0b1010011))
+#define FSGNJND(rd, rs1, rs2) EMIT(R_type(0b0010001, rs2, rs1, 0b001, rd, 0b1010011))
 // -rs1 => rd
-#define FNEGD(rd, rs1)              FSGNJND(rd, rs1, rs1)
+#define FNEGD(rd, rs1) FSGNJND(rd, rs1, rs1)
 // store rs1 with rs1^rs2 sign bit to rd
-#define FSGNJXD(rd, rs1, rs2)       EMIT(R_type(0b0010001, rs2, rs1, 0b010, rd, 0b1010011))
+#define FSGNJXD(rd, rs1, rs2) EMIT(R_type(0b0010001, rs2, rs1, 0b010, rd, 0b1010011))
 // |rs1| => rd
-#define FABSD(rd, rs1)              FSGNJXD(rd, rs1, rs1)
+#define FABSD(rd, rs1) FSGNJXD(rd, rs1, rs1)
 // Convert from signed 32bits to Double
-#define FCVTDW(frd, rs1, rm)        EMIT(R_type(0b1101001, 0b00000, rs1, rm, frd, 0b1010011))
-
-#define FEQD(rd, frs1, frs2)        EMIT(R_type(0b1010001, frs2, frs1, 0b010, rd, 0b1010011))
-#define FLTD(rd, frs1, frs2)        EMIT(R_type(0b1010001, frs2, frs1, 0b001, rd, 0b1010011))
-#define FLED(rd, frs1, frs2)        EMIT(R_type(0b1010001, frs2, frs1, 0b000, rd, 0b1010011))
-
-#define FADDD(frd, frs1, frs2)      EMIT(R_type(0b0000001, frs2, frs1, 0b000, frd, 0b1010011))
-#define FSUBD(frd, frs1, frs2)      EMIT(R_type(0b0000101, frs2, frs1, 0b000, frd, 0b1010011))
-#define FMULD(frd, frs1, frs2)      EMIT(R_type(0b0001001, frs2, frs1, 0b000, frd, 0b1010011))
-#define FDIVD(frd, frs1, frs2)      EMIT(R_type(0b0001101, frs2, frs1, 0b000, frd, 0b1010011))
-#define FSQRTD(frd, frs1)           EMIT(R_type(0b0101101, 0b00000, frs1, 0b000, frd, 0b1010011))
-#define FMIND(frd, frs1, frs2)      EMIT(R_type(0b0010101, frs2, frs1, 0b000, frd, 0b1010011))
-#define FMAXD(frd, frs1, frs2)      EMIT(R_type(0b0010101, frs2, frs1, 0b001, frd, 0b1010011))
-
-//RV64D
-// Move from Double
-#define FMVXD(rd, frs1)             EMIT(R_type(0b1110001, 0b00000, frs1, 0b000, rd, 0b1010011))
+#define FCVTDW(frd, rs1, rm) EMIT(R_type(0b1101001, 0b00000, rs1, rm, frd, 0b1010011))
+
+#define FEQD(rd, frs1, frs2) EMIT(R_type(0b1010001, frs2, frs1, 0b010, rd, 0b1010011))
+#define FLTD(rd, frs1, frs2) EMIT(R_type(0b1010001, frs2, frs1, 0b001, rd, 0b1010011))
+#define FLED(rd, frs1, frs2) EMIT(R_type(0b1010001, frs2, frs1, 0b000, rd, 0b1010011))
+
+#define FADDD(frd, frs1, frs2) EMIT(R_type(0b0000001, frs2, frs1, 0b000, frd, 0b1010011))
+#define FSUBD(frd, frs1, frs2) EMIT(R_type(0b0000101, frs2, frs1, 0b000, frd, 0b1010011))
+#define FMULD(frd, frs1, frs2) EMIT(R_type(0b0001001, frs2, frs1, 0b000, frd, 0b1010011))
+#define FDIVD(frd, frs1, frs2) EMIT(R_type(0b0001101, frs2, frs1, 0b000, frd, 0b1010011))
+#define FSQRTD(frd, frs1) EMIT(R_type(0b0101101, 0b00000, frs1, 0b000, frd, 0b1010011))
+#define FMIND(frd, frs1, frs2) EMIT(R_type(0b0010101, frs2, frs1, 0b000, frd, 0b1010011))
+#define FMAXD(frd, frs1, frs2) EMIT(R_type(0b0010101, frs2, frs1, 0b001, frd, 0b1010011))
+
+// RV64D
+//  Move from Double
+#define FMVXD(rd, frs1) EMIT(R_type(0b1110001, 0b00000, frs1, 0b000, rd, 0b1010011))
 // Move to Double
-#define FMVDX(frd, rs1)             EMIT(R_type(0b1111001, 0b00000, rs1, 0b000, frd, 0b1010011))
+#define FMVDX(frd, rs1) EMIT(R_type(0b1111001, 0b00000, rs1, 0b000, frd, 0b1010011))
 // Convert from signed 64bits to Double
-#define FCVTDL(frd, rs1, rm)        EMIT(R_type(0b1101001, 0b00010, rs1, rm, frd, 0b1010011))
+#define FCVTDL(frd, rs1, rm) EMIT(R_type(0b1101001, 0b00010, rs1, rm, frd, 0b1010011))
 // Convert from unsigned 64bits to Double
-#define FCVTDLU(frd, rs1, rm)       EMIT(R_type(0b1101001, 0b00011, rs1, rm, frd, 0b1010011))
+#define FCVTDLU(frd, rs1, rm) EMIT(R_type(0b1101001, 0b00011, rs1, rm, frd, 0b1010011))
 // Convert from Double to signed 64bits
-#define FCVTLD(rd, frs1, rm)        EMIT(R_type(0b1100001, 0b00010, frs1, rm, rd, 0b1010011))
+#define FCVTLD(rd, frs1, rm) EMIT(R_type(0b1100001, 0b00010, frs1, rm, rd, 0b1010011))
 // Convert from Double to unsigned 64bits
-#define FCVTLUD(rd, frs1, rm)       EMIT(R_type(0b1100001, 0b00011, frs1, rm, rd, 0b1010011))
+#define FCVTLUD(rd, frs1, rm) EMIT(R_type(0b1100001, 0b00011, frs1, rm, rd, 0b1010011))
 
 // Convert from Double to signed integer
-#define FCVTLDxw(rd, frs1, rm)      EMIT(R_type(0b1100001, 0b00000+(rex.w?0b10:0b00), frs1, rm, rd, 0b1010011))
+#define FCVTLDxw(rd, frs1, rm) EMIT(R_type(0b1100001, 0b00000 + (rex.w ? 0b10 : 0b00), frs1, rm, rd, 0b1010011))
 // Convert from Double to unsigned integer
-#define FCVTLUDxw(rd, frs1, rm)     EMIT(R_type(0b1100001, 0b00001+(rex.w?0b10:0b00), frs1, rm, rd, 0b1010011))
+#define FCVTLUDxw(rd, frs1, rm) EMIT(R_type(0b1100001, 0b00001 + (rex.w ? 0b10 : 0b00), frs1, rm, rd, 0b1010011))
 
-//Zba
-// Add unsigned word (Wz(rs1) + X(rs2))
-#define ADDUW(rd, rs1, rs2)         EMIT(R_type(0b0000100, rs2, rs1, 0b000, rd, 0b0111011))
+// Zba
+//  Add unsigned word (Wz(rs1) + X(rs2))
+#define ADDUW(rd, rs1, rs2) EMIT(R_type(0b0000100, rs2, rs1, 0b000, rd, 0b0111011))
 // Zero-extend Word
-#define ZEXTW(rd, rs1)              ADDUW(rd, rs1, xZR)
+#define ZEXTW(rd, rs1) ADDUW(rd, rs1, xZR)
 // Shift left by 1 and add (rd = X(rs2) + X(rs1)<<1)
-#define SH1ADD(rd, rs1, rs2)        EMIT(R_type(0b0010000, rs2, rs1, 0b010, rd, 0b0110011))
+#define SH1ADD(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b010, rd, 0b0110011))
 // Shift unsigned word left by 1 and add (rd = X(rs2) + Wz(rs1)<<1)
-#define SH1ADDUW(rd, rs1, rs2)      EMIT(R_type(0b0010000, rs2, rs1, 0b010, rd, 0b0111011))
+#define SH1ADDUW(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b010, rd, 0b0111011))
 // Shift left by 2 and add (rd = X(rs2) + X(rs1)<<2)
-#define SH2ADD(rd, rs1, rs2)        EMIT(R_type(0b0010000, rs2, rs1, 0b100, rd, 0b0110011))
+#define SH2ADD(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b100, rd, 0b0110011))
 // Shift unsigned word left by 2 and add (rd = X(rs2) + Wz(rs1)<<2)
-#define SH2ADDUW(rd, rs1, rs2)      EMIT(R_type(0b0010000, rs2, rs1, 0b100, rd, 0b0111011))
+#define SH2ADDUW(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b100, rd, 0b0111011))
 // Shift left by 3 and add (rd = X(rs2) + X(rs1)<<3)
-#define SH3ADD(rd, rs1, rs2)        EMIT(R_type(0b0010000, rs2, rs1, 0b110, rd, 0b0110011))
+#define SH3ADD(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b110, rd, 0b0110011))
 // Shift unsigned word left by 3 and add (rd = X(rs2) + Wz(rs1)<<3)
-#define SH3ADDUW(rd, rs1, rs2)      EMIT(R_type(0b0010000, rs2, rs1, 0b110, rd, 0b0111011))
+#define SH3ADDUW(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b110, rd, 0b0111011))
 // Shift left unsigned word (immediate)
-#define SLLIUW(rd, rs1, imm)        EMIT(R_type(0b0000100, imm, rs1, 0b001, rd, 0b0011011))
+#define SLLIUW(rd, rs1, imm) EMIT(R_type(0b0000100, imm, rs1, 0b001, rd, 0b0011011))
 // Shift left by 1,2 or 3 and add (rd = X(rs2) + X(rs1)<<x)
-#define SHxADD(rd, rs1, x, rs2)        EMIT(R_type(0b0010000, rs2, rs1, (x)<<1, rd, 0b0110011))
+#define SHxADD(rd, rs1, x, rs2) EMIT(R_type(0b0010000, rs2, rs1, (x) << 1, rd, 0b0110011))
 // Shift unsigned word left by 1,2 or 3 and add (rd = X(rs2) + Wz(rs1)<<x)
-#define SHxADDUW(rd, rs1, x, rs2)      EMIT(R_type(0b0010000, rs2, rs1, (x)<<1, rd, 0b0111011))
+#define SHxADDUW(rd, rs1, x, rs2) EMIT(R_type(0b0010000, rs2, rs1, (x) << 1, rd, 0b0111011))
 
-//Zbb
-// AND with reverted operand (rs1 & ~rs2)
-#define ANDN(rd, rs1, rs2)      EMIT(R_type(0b0100000, rs2, rs1, 0b111, rd, 0b0110011))
+// Zbb
+//  AND with reverted operand (rs1 & ~rs2)
+#define ANDN(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b111, rd, 0b0110011))
 // OR with reverted operand (rs1 | ~rs2)
-#define ORN(rd, rs1, rs2)       EMIT(R_type(0b0100000, rs2, rs1, 0b110, rd, 0b0110011))
+#define ORN(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b110, rd, 0b0110011))
 // Exclusive NOR (~(rs1 ^ rs2))
-#define XNOR(rd, rs1, rs2)      EMIT(R_type(0b0100000, rs2, rs1, 0b100, rd, 0b0110011))
+#define XNOR(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b100, rd, 0b0110011))
 // Count leading zero bits
-#define CLZ(rd, rs)             EMIT(R_type(0b0110000, 0b00000, rs, 0b001, rd, 0b0010011))
+#define CLZ(rd, rs) EMIT(R_type(0b0110000, 0b00000, rs, 0b001, rd, 0b0010011))
 // Count leading zero bits in word
-#define CLZW(rd, rs)            EMIT(R_type(0b0110000, 0b00000, rs, 0b001, rd, 0b0011011))
+#define CLZW(rd, rs) EMIT(R_type(0b0110000, 0b00000, rs, 0b001, rd, 0b0011011))
 // Count leading zero bits
-#define CLZxw(rd, rs)           EMIT(R_type(0b0110000, 0b00000, rs, 0b001, rd, rex.w?0b0010011:0b0011011))
+#define CLZxw(rd, rs) EMIT(R_type(0b0110000, 0b00000, rs, 0b001, rd, rex.w ? 0b0010011 : 0b0011011))
 // Count trailing zero bits
-#define CTZ(rd, rs)             EMIT(R_type(0b0110000, 0b00001, rs, 0b001, rd, 0b0010011))
+#define CTZ(rd, rs) EMIT(R_type(0b0110000, 0b00001, rs, 0b001, rd, 0b0010011))
 // Count trailing zero bits in word
-#define CTZW(rd, rs)            EMIT(R_type(0b0110000, 0b00001, rs, 0b001, rd, 0b0011011))
+#define CTZW(rd, rs) EMIT(R_type(0b0110000, 0b00001, rs, 0b001, rd, 0b0011011))
 // Count trailing zero bits
-#define CTZxw(rd, rs)           EMIT(R_type(0b0110000, 0b00001, rs, 0b001, rd, rex.w?0b0010011:0b0011011))
+#define CTZxw(rd, rs) EMIT(R_type(0b0110000, 0b00001, rs, 0b001, rd, rex.w ? 0b0010011 : 0b0011011))
 // Count set bits
-#define CPOP(rd, rs)            EMIT(R_type(0b0110000, 0b00010, rs, 0b001, rd, 0b0010011))
+#define CPOP(rd, rs) EMIT(R_type(0b0110000, 0b00010, rs, 0b001, rd, 0b0010011))
 // Count set bits in word
-#define CPOPW(rd, rs)           EMIT(R_type(0b0110000, 0b00010, rs, 0b001, rd, 0b0011011))
+#define CPOPW(rd, rs) EMIT(R_type(0b0110000, 0b00010, rs, 0b001, rd, 0b0011011))
 // Count set bits
-#define CPOPxw(rd, rs)          EMIT(R_type(0b0110000, 0b00010, rs, 0b001, rd, rex.w?0b0010011:0b0011011))
+#define CPOPxw(rd, rs) EMIT(R_type(0b0110000, 0b00010, rs, 0b001, rd, rex.w ? 0b0010011 : 0b0011011))
 // Maximum
-#define MAX(rd, rs1, rs2)       EMIT(R_type(0b0000101, rs2, rs1, 0b110, rd, 0b0110011))
+#define MAX(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b110, rd, 0b0110011))
 // Unisgned maximum
-#define MAXU(rd, rs1, rs2)      EMIT(R_type(0b0000101, rs2, rs1, 0b111, rd, 0b0110011))
+#define MAXU(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b111, rd, 0b0110011))
 // Minimum
-#define MIN(rd, rs1, rs2)       EMIT(R_type(0b0000101, rs2, rs1, 0b100, rd, 0b0110011))
+#define MIN(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b100, rd, 0b0110011))
 // Unsigned minimum
-#define MINU(rd, rs1, rs2)      EMIT(R_type(0b0000101, rs2, rs1, 0b101, rd, 0b0110011))
+#define MINU(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b101, rd, 0b0110011))
 // Sign-extend byte
-#define SEXTB(rd, rs)           EMIT(R_type(0b0110000, 0b00100, rs, 0b001, rd, 0b0010011))
+#define SEXTB(rd, rs) EMIT(R_type(0b0110000, 0b00100, rs, 0b001, rd, 0b0010011))
 // Sign-extend half-word
-#define SEXTH(rd, rs)           EMIT(R_type(0b0110000, 0b00101, rs, 0b001, rd, 0b0010011))
+#define SEXTH(rd, rs) EMIT(R_type(0b0110000, 0b00101, rs, 0b001, rd, 0b0010011))
 // Zero-extend half-word
-#define ZEXTH_(rd, rs)          EMIT(R_type(0b0000100, 0b00000, rs, 0b100, rd, 0b0111011))
+#define ZEXTH_(rd, rs) EMIT(R_type(0b0000100, 0b00000, rs, 0b100, rd, 0b0111011))
 // Zero-extend half-word
-#define ZEXTH(rd, rs)           if(rv64_zbb) ZEXTH_(rd, rs); else {SLLI(rd, rs, 48); SRLI(rd, rd, 48);}
+#define ZEXTH(rd, rs)     \
+    if (rv64_zbb)         \
+        ZEXTH_(rd, rs);   \
+    else {                \
+        SLLI(rd, rs, 48); \
+        SRLI(rd, rd, 48); \
+    }
 // Rotate left (register)
-#define ROL(rd, rs1, rs2)       EMIT(R_type(0b0110000, rs2, rs1, 0b001, rd, 0b0110011))
+#define ROL(rd, rs1, rs2) EMIT(R_type(0b0110000, rs2, rs1, 0b001, rd, 0b0110011))
 // Rotate left word (register)
-#define ROLW(rd, rs1, rs2)      EMIT(R_type(0b0110000, rs2, rs1, 0b001, rd, 0b0111011))
+#define ROLW(rd, rs1, rs2) EMIT(R_type(0b0110000, rs2, rs1, 0b001, rd, 0b0111011))
 // Rotate left (register)
-#define ROLxw(rd, rs1, rs2)     EMIT(R_type(0b0110000, rs2, rs1, 0b001, rd, rex.w?0b0110011:0b0111011))
+#define ROLxw(rd, rs1, rs2) EMIT(R_type(0b0110000, rs2, rs1, 0b001, rd, rex.w ? 0b0110011 : 0b0111011))
 // Rotate right (register)
-#define ROR(rd, rs1, rs2)       EMIT(R_type(0b0110000, rs2, rs1, 0b101, rd, 0b0110011))
+#define ROR(rd, rs1, rs2) EMIT(R_type(0b0110000, rs2, rs1, 0b101, rd, 0b0110011))
 // Rotate right (immediate)
-#define RORI(rd, rs1, shamt)    EMIT(R_type(0b0110000, shamt, rs1, 0b101, rd, 0b0010011))
+#define RORI(rd, rs1, shamt) EMIT(R_type(0b0110000, shamt, rs1, 0b101, rd, 0b0010011))
 // Rotate right word (immediate)
-#define RORIW(rd, rs1, shamt)   EMIT(R_type(0b0110000, shamt, rs1, 0b101, rd, 0b0011011))
+#define RORIW(rd, rs1, shamt) EMIT(R_type(0b0110000, shamt, rs1, 0b101, rd, 0b0011011))
 // Rotate right (immediate)
-#define RORIxw(rd, rs1, shamt)  EMIT(R_type(0b0110000, shamt, rs1, 0b101, rd, rex.w?0b0010011:0b0011011))
+#define RORIxw(rd, rs1, shamt) EMIT(R_type(0b0110000, shamt, rs1, 0b101, rd, rex.w ? 0b0010011 : 0b0011011))
 // Rotate right word (register)
-#define RORW(rd, rs1, rs2)      EMIT(R_type(0b0110000, rs2, rs1, 0b101, rd, 0b0111011))
+#define RORW(rd, rs1, rs2) EMIT(R_type(0b0110000, rs2, rs1, 0b101, rd, 0b0111011))
 // Rotate right (register)
-#define RORxw(rd, rs1, rs2)     EMIT(R_type(0b0110000, rs2, rs1, 0b101, rd, rex.w?0b0110011:0b0111011))
+#define RORxw(rd, rs1, rs2) EMIT(R_type(0b0110000, rs2, rs1, 0b101, rd, rex.w ? 0b0110011 : 0b0111011))
 // Bitwise OR Combine, byte granule (for all byte, if byte==0, res.byte=0, else res.byte=0xff)
-#define ORCB(rd, rs)            EMIT(I_type(0b001010000111, rs, 0b101, rd, 0b0010011))
+#define ORCB(rd, rs) EMIT(I_type(0b001010000111, rs, 0b101, rd, 0b0010011))
 // Byte-reverse register
-#define REV8(rd, rs)            EMIT(I_type(0b011010111000, rs, 0b101, rd, 0b0010011))
+#define REV8(rd, rs) EMIT(I_type(0b011010111000, rs, 0b101, rd, 0b0010011))
+
+// Byte-reverse register, rd can be the same as rs or s1, but rs cannot be the same as s1.
+#define REV8xw(rd, rs, s1, s2, s3, s4) \
+    if (rv64_zbb) {                    \
+        REV8(rd, rs);                  \
+        if (!rex.w) {                  \
+            SRLI(rd, rd, 32);          \
+        }                              \
+    } else if (rv64_xtheadbb) {        \
+        if (rex.w) {                   \
+            TH_REV(rd, rs);            \
+        } else {                       \
+            TH_REVW(rd, rs);           \
+        }                              \
+    } else {                           \
+        MOV_U12(s2, 0xff);             \
+        if (rex.w) {                   \
+            SLLI(s1, rs, 56);          \
+            SRLI(s3, rs, 56);          \
+            SRLI(s4, rs, 40);          \
+            SLLI(s2, s2, 8);           \
+            AND(s4, s4, s2);           \
+            OR(s1, s1, s3);            \
+            OR(s1, s1, s4);            \
+            SLLI(s3, rs, 40);          \
+            SLLI(s4, s2, 40);          \
+            AND(s3, s3, s4);           \
+            OR(s1, s1, s3);            \
+            SRLI(s3, rs, 24);          \
+            SLLI(s4, s2, 8);           \
+            AND(s3, s3, s4);           \
+            OR(s1, s1, s3);            \
+            SLLI(s3, rs, 24);          \
+            SLLI(s4, s2, 32);          \
+            AND(s3, s3, s4);           \
+            OR(s1, s1, s3);            \
+            SRLI(s3, rs, 8);           \
+            SLLI(s4, s2, 16);          \
+            AND(s3, s3, s4);           \
+            OR(s1, s1, s3);            \
+            SLLI(s3, rs, 8);           \
+            SLLI(s4, s2, 24);          \
+            AND(s3, s3, s4);           \
+            OR(rd, s1, s3);            \
+        } else {                       \
+            SLLIW(s2, s2, 8);          \
+            SLLIW(s1, rs, 24);         \
+            SRLIW(s3, rs, 24);         \
+            SRLIW(s4, rs, 8);          \
+            AND(s4, s4, s2);           \
+            OR(s1, s1, s3);            \
+            OR(s1, s1, s4);            \
+            SLLIW(s3, rs, 8);          \
+            LUI(s2, 0xff0);            \
+            AND(s3, s3, s2);           \
+            OR(rd, s1, s3);            \
+        }                              \
+    }
 
-//Zbc
-// Carry-less multily (low-part)
-#define CLMUL(rd, rs1, rs2)         EMIT(R_type(0b0000101, rs2, rs1, 0b001, rd, 0b0110011))
+// Zbc
+//  Carry-less multiply (low-part)
+#define CLMUL(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b001, rd, 0b0110011))
 // Carry-less multiply (high-part)
-#define CLMULH(rd, rs1, rs2)        EMIT(R_type(0b0000101, rs2, rs1, 0b011, rd, 0b0110011))
+#define CLMULH(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b011, rd, 0b0110011))
 // Carry-less multiply (reversed)
-#define CLMULR(rd, rs1, rs2)        EMIT(R_type(0b0000101, rs2, rs1, 0b010, rd, 0b0110011))
+#define CLMULR(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b010, rd, 0b0110011))
 
-//Zbs
-// encoding of the "imm" on RV64 use a slight different mask, but it will work using R_type with high bit of imm ovewriting low bit op func
-// Single-bit Clear (Register)
-#define BCLR(rd, rs1, rs2)          EMIT(R_type(0b0100100, rs2, rs1, 0b001, rd, 0b0110011))
+// Zbs
+//  encoding of the "imm" on RV64 uses a slightly different mask, but it will work using R_type with the high bit of imm overwriting the low bit of op func
+//  Single-bit Clear (Register)
+#define BCLR(rd, rs1, rs2) EMIT(R_type(0b0100100, rs2, rs1, 0b001, rd, 0b0110011))
 // Single-bit Clear (Immediate)
-#define BCLI(rd, rs1, imm)          EMIT(R_type(0b0100100, imm, rs1, 0b001, rd, 0b0010011))
+#define BCLI(rd, rs1, imm) EMIT(R_type(0b0100100, imm, rs1, 0b001, rd, 0b0010011))
 // Single-bit Extreact (Register)
-#define BEXT(rd, rs1, rs2)          EMIT(R_type(0b0100100, rs2, rs1, 0b101, rd, 0b0110011))
+#define BEXT(rd, rs1, rs2) EMIT(R_type(0b0100100, rs2, rs1, 0b101, rd, 0b0110011))
 // Single-bit Extract (Immediate)
-#define BEXTI(rd, rs1, imm)         EMIT(R_type(0b0100100, imm, rs1, 0b101, rd, 0b0010011))
+#define BEXTI(rd, rs1, imm) EMIT(R_type(0b0100100, imm, rs1, 0b101, rd, 0b0010011))
 // Single-bit Invert (Register)
-#define BINV(rd, rs1, rs2)          EMIT(R_type(0b0110100, rs2, rs1, 0b001, rd, 0b0110011))
+#define BINV(rd, rs1, rs2) EMIT(R_type(0b0110100, rs2, rs1, 0b001, rd, 0b0110011))
 // Single-bit Invert (Immediate)
-#define BINVI(rd, rs1, imm)         EMIT(R_type(0b0110100, imm, rs1, 0b001, rd, 0b0010011))
+#define BINVI(rd, rs1, imm) EMIT(R_type(0b0110100, imm, rs1, 0b001, rd, 0b0010011))
 // Single-bit Set (Register)
-#define BSET(rd, rs1, rs2)          EMIT(R_type(0b0010100, rs2, rs1, 0b001, rd, 0b0110011))
+#define BSET(rd, rs1, rs2) EMIT(R_type(0b0010100, rs2, rs1, 0b001, rd, 0b0110011))
 // Single-bit Set (Immediate)
-#define BSETI(rd, rs1, imm)         EMIT(R_type(0b0010100, imm, rs1, 0b001, rd, 0b0010011))
+#define BSETI(rd, rs1, imm) EMIT(R_type(0b0010100, imm, rs1, 0b001, rd, 0b0010011))
 
 /// THead vendor extension
 /// https://github.com/T-head-Semi/thead-extension-spec/releases
@@ -669,11 +861,12 @@ f28–31  ft8–11  FP temporaries                  Caller
 
 // Add a shifted operand to a second operand.
 // reg[rd] := reg[rs1] + (reg[rs2] << imm2)
-#define TH_ADDSL(rd, rs1, rs2, imm2) EMIT(R_type(imm2&0b11, rs2, rs1, 0b001, rd, 0b0001011))
+#define TH_ADDSL(rd, rs1, rs2, imm2) EMIT(R_type(imm2 & 0b11, rs2, rs1, 0b001, rd, 0b0001011))
 
 // XTheadBb - Basic bit-manipulation
 
-#define TH_SRRIxw(rd, rs1, imm) if(rex.w) { \
+#define TH_SRRIxw(rd, rs1, imm) \
+    if (rex.w) {                \
         TH_SRRI(rd, rs1, imm);  \
     } else {                    \
         TH_SRRIW(rd, rs1, imm); \
@@ -681,20 +874,20 @@ f28–31  ft8–11  FP temporaries                  Caller
 
 // Perform a cyclic right shift.
 // reg[rd] := (reg[rs1] >> imm6) | (reg[rs1] << (xlen - imm6))
-#define TH_SRRI(rd, rs1, imm6) EMIT(I_type(0b000100000000|(imm6&0x3f), rs1, 0b001, rd, 0b0001011))
+#define TH_SRRI(rd, rs1, imm6) EMIT(I_type(0b000100000000 | (imm6 & 0x3f), rs1, 0b001, rd, 0b0001011))
 
 // Perform a cyclic right shift on word operand.
 // data := zext.w(reg[rs1])
 // reg[rd] := (data >> imm5) | (data << (32 - imm5))
-#define TH_SRRIW(rd, rs1, imm5) EMIT(I_type(0b000101000000|(imm5&0x1f), rs1, 0b001, rd, 0b0001011))
+#define TH_SRRIW(rd, rs1, imm5) EMIT(I_type(0b000101000000 | (imm5 & 0x1f), rs1, 0b001, rd, 0b0001011))
 
 // Extract and sign-extend bits.
 // reg[rd] := sign_extend(reg[rs1][imm1:imm2])
-#define TH_EXT(rd, rs1, imm1, imm2) EMIT(I_type(((imm1&0x1f)<<6)|(imm2&0x1f), rs1, 0b010, rd, 0b0001011))
+#define TH_EXT(rd, rs1, imm1, imm2) EMIT(I_type(((imm1 & 0x1f) << 6) | (imm2 & 0x1f), rs1, 0b010, rd, 0b0001011))
 
 // Extract and zero-extend bits.
 // reg[rd] := zero_extend(reg[rs1][imm1:imm2])
-#define TH_EXTU(rd, rs1, imm1, imm2) EMIT(I_type(((imm1&0x1f)<<6)|(imm2&0x1f), rs1, 0b011, rd, 0b0001011))
+#define TH_EXTU(rd, rs1, imm1, imm2) EMIT(I_type(((imm1 & 0x1f) << 6) | (imm2 & 0x1f), rs1, 0b011, rd, 0b0001011))
 
 // Find first '0'-bit
 // for i=xlen..0:
@@ -739,7 +932,7 @@ f28–31  ft8–11  FP temporaries                  Caller
 //   rd := 1
 // else
 //   rd := 0
-#define TH_TST(rd, rs1, imm6) EMIT(I_type(0b100010000000|(imm6&0x3f), rs1, 0b001, rd, 0b0001011))
+#define TH_TST(rd, rs1, imm6) EMIT(I_type(0b100010000000 | (imm6 & 0x3f), rs1, 0b001, rd, 0b0001011))
 
 
 // XTheadCondMov -  Conditional move
@@ -759,7 +952,7 @@ f28–31  ft8–11  FP temporaries                  Caller
 // Load indexed byte, increment address after loading.
 // rd := sign_extend(mem[rs1])
 // rs1 := rs1 + (sign_extend(imm5) << imm2)
-#define TH_LBIA(rd, rs1, imm5, imm2) EMIT(I_type(0b000110000000|((imm2&0b11)<<5)|(imm5&0x1f), rs1, 0b100, rd, 0b0001011))
+#define TH_LBIA(rd, rs1, imm5, imm2) EMIT(I_type(0b000110000000 | ((imm2 & 0b11) << 5) | (imm5 & 0x1f), rs1, 0b100, rd, 0b0001011))
 
 // TODO
 // th.lbib rd, (rs1), imm5, imm2 Load indexed byte
@@ -813,7 +1006,7 @@ f28–31  ft8–11  FP temporaries                  Caller
 // addr := rs1 + (zero_extend(imm2) << 4)
 // rd1 := mem[addr+7:addr]
 // rd2 := mem[addr+15:addr+8]
-#define TH_LDD(rd1, rd2, rs1, imm2) EMIT(R_type(0b1111100|(imm2&0b11), rd2, rs1, 0b100, rd1, 0b0001011))
+#define TH_LDD(rd1, rd2, rs1, imm2) EMIT(R_type(0b1111100 | (imm2 & 0b11), rd2, rs1, 0b100, rd1, 0b0001011))
 
 // TODO
 // th.lwd rd1, rd2, (rs1), imm2, 3 Load two signed 32-bit values
@@ -826,7 +1019,7 @@ f28–31  ft8–11  FP temporaries                  Caller
 // Load indexed double-precision floating point value.
 // addr := rs1 + (rs2 << imm2)
 // rd := fmem[addr+7:addr]
-#define TH_FLRD(rd, rs1, rs2, imm2) EMIT(R_type(0b0110000|(imm2&0b11), rs2, rs1, 0b110, rd, 0b0001011))
+#define TH_FLRD(rd, rs1, rs2, imm2) EMIT(R_type(0b0110000 | (imm2 & 0b11), rs2, rs1, 0b110, rd, 0b0001011))
 
 // TODO
 // th.flrw rd, rs1, rs2, imm2 Load indexed float
diff --git a/src/emu/x64run660f.c b/src/emu/x64run660f.c
index 902772e5..739c9664 100644
--- a/src/emu/x64run660f.c
+++ b/src/emu/x64run660f.c
@@ -27,19 +27,19 @@ static uint8_t ff_mult(uint8_t a, uint8_t b)
 	int retval = 0;

 

 	for(int i = 0; i < 8; i++) {

-		if((b & 1) == 1) 

+		if((b & 1) == 1)

 			retval ^= a;

-		

+

 		if((a & 0x80)) {

 			a <<= 1;

 			a  ^= 0x1b;

 		} else {

 			a <<= 1;

 		}

-		

+

 		b >>= 1;

 	}

-	

+

 	return retval;

 }

 

@@ -514,7 +514,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
                 for(int i=1; i>=0; --i)

                     GX->sq[i] = EX->sd[i];

                 break;

-            

+

             case 0x28:  /* PMULDQ Gx, Ex */

                 nextop = F8;

                 GETEX(0);

@@ -790,15 +790,15 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
                 break;

             case 0xF0: /* MOVBE Gw, Ew */

                 nextop = F8;

-                GETEX(0);

-                GETGX;

-                GX->uw[0] = __builtin_bswap16(EX->uw[0]);

+                GETED(0);

+                GETGD;

+                GD->word[0] = __builtin_bswap16(ED->word[0]);

                 break;

             case 0xF1: /* MOVBE Ew, Gw */

                 nextop = F8;

-                GETEX(0);

-                GETGX;

-                EX->uw[0] = __builtin_bswap16(GX->uw[0]);

+                GETED(0);

+                GETGD;

+                ED->word[0] = __builtin_bswap16(GD->word[0]);

                 break;

             default:

                 return 0;

@@ -1113,7 +1113,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
                 return 0;

         }

         break;

-        

+

     GOCOND(0x40

         , nextop = F8;

         CHECK_FLAGS(emu);

@@ -1285,7 +1285,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
         if (isnan(GX->d[1]) || isnan(EX->d[1]) || isgreater(EX->d[1], GX->d[1]))

             GX->d[1] = EX->d[1];

         break;

-        

+

     case 0x60:  /* PUNPCKLBW Gx,Ex */

         nextop = F8;

         GETEX(0);

@@ -1295,7 +1295,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
         if(GX==EX)

             for(int i=0; i<8; ++i)

                 GX->ub[2 * i + 1] = GX->ub[2 * i];

-        else 

+        else

             for(int i=0; i<8; ++i)

                 GX->ub[2 * i + 1] = EX->ub[i];

         break;

@@ -1538,7 +1538,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
                     } else {

                         EX->q[0] = EX->q[1] >> (tmp8u - 64);

                         EX->q[1] = 0;

-                    }                    

+                    }

                 }

                 break;

             case 6:                 /* PSLLQ Ex, Ib */

@@ -1827,7 +1827,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
             GW->word[0] = EW->word[0];

         break;

 

-    case 0xBA:                      

+    case 0xBA:

         nextop = F8;

         switch((nextop>>3)&7) {

             case 4:                 /* BT Ew,Ib */

@@ -2096,7 +2096,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
         GETGX;

         if(EX->q[0]>15)

             {GX->q[0] = GX->q[1] = 0;}

-        else 

+        else

             {tmp8u=EX->ub[0]; for (int i=0; i<8; ++i) GX->uw[i] >>= tmp8u;}

         break;

     case 0xD2:  /* PSRLD Gx, Ex */

@@ -2105,7 +2105,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
         GETGX;

         if(EX->q[0]>31)

             {GX->q[0] = GX->q[1] = 0;}

-        else 

+        else

             {tmp8u=EX->ub[0]; for (int i=0; i<4; ++i) GX->ud[i] >>= tmp8u;}

         break;

     case 0xD3:  /* PSRLQ Gx, Ex */

@@ -2114,7 +2114,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
         GETGX;

         if(EX->q[0]>63)

             {GX->q[0] = GX->q[1] = 0;}

-        else 

+        else

             {tmp8u=EX->ub[0]; for (int i=0; i<2; ++i) GX->q[i] >>= tmp8u;}

         break;

     case 0xD4:  /* PADDQ Gx,Ex */

@@ -2229,7 +2229,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
         GETEX(0);

         GETGX;

         tmp8u=(EX->q[0]>15)?15:EX->ub[0];

-        for (int i=0; i<8; ++i) 

+        for (int i=0; i<8; ++i)

             GX->sw[i] >>= tmp8u;

         break;

     case 0xE2:  /* PSRAD Gx, Ex */

@@ -2357,7 +2357,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
         GETGX;

         if(EX->q[0]>15)

             {GX->q[0] = GX->q[1] = 0;}

-        else 

+        else

             {tmp8u=EX->ub[0]; for (int i=0; i<8; ++i) GX->uw[i] <<= tmp8u;}

         break;

     case 0xF2:  /* PSLLD Gx, Ex */

@@ -2366,7 +2366,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
         GETGX;

         if(EX->q[0]>31)

             {GX->q[0] = GX->q[1] = 0;}

-        else 

+        else

             {tmp8u=EX->ub[0]; for (int i=0; i<4; ++i) GX->ud[i] <<= tmp8u;}

         break;

     case 0xF3:  /* PSLLQ Gx, Ex */

@@ -2375,7 +2375,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
         GETGX;

         if(EX->q[0]>63)

             {GX->q[0] = GX->q[1] = 0;}

-        else 

+        else

             {tmp8u=EX->ub[0]; for (int i=0; i<2; ++i) GX->q[i] <<= tmp8u;}

         break;

     case 0xF4:  /* PMULUDQ Gx,Ex */

@@ -2468,7 +2468,7 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
         GX->sd[2] += EX->sd[2];

         GX->sd[3] += EX->sd[3];

         break;

-    

+

     default:

         return 0;

     }