about summary refs log tree commit diff stats
path: root/src/dynarec/rv64
diff options
context:
space:
mode:
Diffstat (limited to 'src/dynarec/rv64')
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_00.c 3
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_00_0.c 14
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_00_1.c 125
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_00_2.c 192
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_00_3.c 187
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_0f.c 748
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_64.c 346
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_66.c 188
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_660f.c 1442
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_6664.c 77
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_66f0.c 129
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_67.c 574
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_d8.c 67
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_d9.c 53
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_db.c 60
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_dc.c 119
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_dd.c 179
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_de.c 1
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_df.c 106
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_emit_logic.c 10
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_emit_math.c 212
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_emit_shift.c 50
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_emit_tests.c 5
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_f0.c 178
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_f20f.c 45
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_f30f.c 216
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_functions.c 48
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_functions.h 2
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.c 517
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.h 238
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_pass0.h 10
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_pass1.h 1
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_pass2.h 4
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_pass3.h 6
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_private.h 12
-rw-r--r-- src/dynarec/rv64/rv64_emitter.h 145
-rw-r--r-- src/dynarec/rv64/rv64_epilog.S 43
-rw-r--r-- src/dynarec/rv64/rv64_printer.c 17
-rw-r--r-- src/dynarec/rv64/rv64_prolog.S 43
39 files changed, 5258 insertions, 1154 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_00.c b/src/dynarec/rv64/dynarec_rv64_00.c
index f5bd8af7..5f529fb7 100644
--- a/src/dynarec/rv64/dynarec_rv64_00.c
+++ b/src/dynarec/rv64/dynarec_rv64_00.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 #include <signal.h>
 #include <assert.h>
@@ -26,8 +25,6 @@
 #include "dynarec_rv64_functions.h"
 #include "dynarec_rv64_helper.h"
 
-int isSimpleWrapper(wrapper_t fun);
-
 uintptr_t dynarec64_00(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
 {
     uint8_t opcode;
diff --git a/src/dynarec/rv64/dynarec_rv64_00_0.c b/src/dynarec/rv64/dynarec_rv64_00_0.c
index 0320107d..a0ff3746 100644
--- a/src/dynarec/rv64/dynarec_rv64_00_0.c
+++ b/src/dynarec/rv64/dynarec_rv64_00_0.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 #include <signal.h>
 #include <assert.h>
@@ -26,8 +25,6 @@
 #include "dynarec_rv64_functions.h"
 #include "dynarec_rv64_helper.h"
 
-int isSimpleWrapper(wrapper_t fun);
-
 uintptr_t dynarec64_00_0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
 {
     uint8_t nextop, opcode;
@@ -178,7 +175,7 @@ uintptr_t dynarec64_00_0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGD;
             GETED(0);
-            emit_adc32(dyn, ninst, rex, ed, gd, x3, x4, x5);
+            emit_adc32(dyn, ninst, rex, ed, gd, x3, x4, x5, x6);
             WBACK;
             break;
 
@@ -231,6 +228,15 @@ uintptr_t dynarec64_00_0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             ANDI(xRAX, xRAX, ~0xff);
             OR(xRAX, xRAX, x1);
             break;
+        case 0x1D:
+            INST_NAME("SBB EAX, Id");
+            READFLAGS(X_CF);
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            i64 = F32S;
+            MOV64xw(x2, i64);
+            emit_sbb32(dyn, ninst, rex, xRAX, x2, x3, x4, x5);
+            break;
+
         case 0x20:
             INST_NAME("AND Eb, Gb");
             SETFLAGS(X_ALL, SF_SET_PENDING);
diff --git a/src/dynarec/rv64/dynarec_rv64_00_1.c b/src/dynarec/rv64/dynarec_rv64_00_1.c
index 3abb0444..54ca28f5 100644
--- a/src/dynarec/rv64/dynarec_rv64_00_1.c
+++ b/src/dynarec/rv64/dynarec_rv64_00_1.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 #include <signal.h>
 #include <assert.h>
@@ -53,7 +52,32 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
     MAYUSE(cacheupd);
 
     switch(opcode) {
-
+        case 0x40:
+        case 0x41:
+        case 0x42:
+        case 0x43:
+        case 0x44:
+        case 0x45:
+        case 0x46:
+        case 0x47:
+            INST_NAME("INC Reg (32bits)");
+            SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING);
+            gd = xRAX + (opcode&7);
+            emit_inc32(dyn, ninst, rex, gd, x1, x2, x3, x4);
+            break;
+        case 0x48:
+        case 0x49:
+        case 0x4A:
+        case 0x4B:
+        case 0x4C:
+        case 0x4D:
+        case 0x4E:
+        case 0x4F:
+            INST_NAME("DEC Reg (32bits)");
+            SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING);
+            gd = xRAX + (opcode&7);
+            emit_dec32(dyn, ninst, rex, gd, x1, x2, x3, x4);
+            break;
         case 0x50:
         case 0x51:
         case 0x52:
@@ -64,8 +88,7 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x57:
             INST_NAME("PUSH reg");
             gd = xRAX+(opcode&0x07)+(rex.b<<3);
-            SD(gd, xRSP, -8);
-            SUBI(xRSP, xRSP, 8);
+            PUSH1z(gd);
             break;
         case 0x58:
         case 0x59:
@@ -77,31 +100,65 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5F:
             INST_NAME("POP reg");
             gd = xRAX+(opcode&0x07)+(rex.b<<3);
-            LD(gd, xRSP, 0);
-            if(gd!=xRSP) {
-                ADDI(xRSP, xRSP, 8);
+            POP1z(gd);
+            break;
+
+        case 0x60:
+            if(rex.is32bits) {
+                INST_NAME("PUSHAD");
+                AND(x1, xRSP, xMASK);
+                PUSH1_32(xRAX);
+                PUSH1_32(xRCX);
+                PUSH1_32(xRDX);
+                PUSH1_32(xRBX);
+                PUSH1_32(x1);
+                PUSH1_32(xRBP);
+                PUSH1_32(xRSI);
+                PUSH1_32(xRDI);
+            } else {
+                DEFAULT;
+            }
+            break;
+        case 0x61:
+            if(rex.is32bits) {
+                INST_NAME("POPAD");
+                POP1_32(xRDI);
+                POP1_32(xRSI);
+                POP1_32(xRBP);
+                POP1_32(x1);
+                POP1_32(xRBX);
+                POP1_32(xRDX);
+                POP1_32(xRCX);
+                POP1_32(xRAX);
+            } else {
+                DEFAULT;
             }
             break;
 
         case 0x63:
-            INST_NAME("MOVSXD Gd, Ed");
-            nextop = F8;
-            GETGD;
-            if(rex.w) {
-                if(MODREG) {   // reg <= reg
-                    ADDIW(gd, xRAX+(nextop&7)+(rex.b<<3), 0);
-                } else {                    // mem <= reg
-                    SMREAD();
-                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
-                    LW(gd, ed, fixedaddress);
-                }
+            if(rex.is32bits) {
+                // this is ARPL opcode
+                DEFAULT;
             } else {
-                if(MODREG) {   // reg <= reg
-                    AND(gd, xRAX+(nextop&7)+(rex.b<<3), xMASK);
-                } else {                    // mem <= reg
-                    SMREAD();
-                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
-                    LWU(gd, ed, fixedaddress);
+                INST_NAME("MOVSXD Gd, Ed");
+                nextop = F8;
+                GETGD;
+                if(rex.w) {
+                    if(MODREG) {   // reg <= reg
+                        ADDIW(gd, xRAX+(nextop&7)+(rex.b<<3), 0);
+                    } else {                    // mem <= reg
+                        SMREAD();
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                        LW(gd, ed, fixedaddress);
+                    }
+                } else {
+                    if(MODREG) {   // reg <= reg
+                        AND(gd, xRAX+(nextop&7)+(rex.b<<3), xMASK);
+                    } else {                    // mem <= reg
+                        SMREAD();
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                        LWU(gd, ed, fixedaddress);
+                    }
                 }
             }
             break;
@@ -114,7 +171,9 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x66:
             addr = dynarec64_66(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
             break;
-
+        case 0x67:
+            addr = dynarec64_67(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
         case 0x68:
             INST_NAME("PUSH Id");
             i64 = F32S;
@@ -122,10 +181,10 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 MESSAGE(LOG_DUMP, "PUSH then RET, using indirect\n");
                 TABLE64(x3, addr-4);
                 LW(x1, x3, 0);
-                PUSH1(x1);
+                PUSH1z(x1);
             } else {
-                MOV64x(x3, i64);
-                PUSH1(x3);
+                MOV64z(x3, i64);
+                PUSH1z(x3);
             }
             break;
         case 0x69:
@@ -164,8 +223,8 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x6A:
             INST_NAME("PUSH Ib");
             i64 = F8S;
-            MOV64x(x3, i64);
-            PUSH1(x3);
+            MOV64z(x3, i64);
+            PUSH1z(x3);
             break;
         case 0x6B:
             INST_NAME("IMUL Gd, Ed, Ib");
@@ -179,12 +238,12 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 // 64bits imul
                 UFLAG_IF {
                     MULH(x3, ed, x4);
-                    MULW(gd, ed, x4);
+                    MUL(gd, ed, x4);
                     UFLAG_OP1(x3);
                     UFLAG_RES(gd);
                     UFLAG_DF(x3, d_imul64);
                 } else {
-                    MULxw(gd, ed, x4);
+                    MUL(gd, ed, x4);
                 }
             } else {
                 // 32bits imul
@@ -195,7 +254,7 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     UFLAG_OP1(x3);
                     UFLAG_DF(x3, d_imul32);
                 } else {
-                    MULxw(gd, ed, x4);
+                    MULW(gd, ed, x4);
                 }
                 ZEROUP(gd);
             }
diff --git a/src/dynarec/rv64/dynarec_rv64_00_2.c b/src/dynarec/rv64/dynarec_rv64_00_2.c
index 6f0ef03e..20333f96 100644
--- a/src/dynarec/rv64/dynarec_rv64_00_2.c
+++ b/src/dynarec/rv64/dynarec_rv64_00_2.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 #include <signal.h>
 #include <assert.h>
@@ -26,8 +25,6 @@
 #include "dynarec_rv64_functions.h"
 #include "dynarec_rv64_helper.h"
 
-int isSimpleWrapper(wrapper_t fun);
-
 uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
 {
     uint8_t nextop, opcode;
@@ -72,6 +69,15 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     emit_or8c(dyn, ninst, x1, u8, x2, x4, x5);
                     EBBACK(x5, 0);
                     break;
+                case 2: // ADC
+                    INST_NAME("ADC Eb, Ib");
+                    READFLAGS(X_CF);
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEB(x1, 1);
+                    u8 = F8;
+                    emit_adc8c(dyn, ninst, x1, u8, x2, x4, x5, x6);
+                    EBBACK(x5, 0);
+                    break;
                 case 3: // SBB
                     INST_NAME("SBB Eb, Ib");
                     READFLAGS(X_CF);
@@ -148,7 +154,7 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     GETED((opcode==0x81)?4:1);
                     if(opcode==0x81) i64 = F32S; else i64 = F8S;
                     MOV64xw(x5, i64);
-                    emit_adc32(dyn, ninst, rex, ed, x5, x3, x4, x6);
+                    emit_adc32(dyn, ninst, rex, ed, x5, x3, x4, x6, x9);
                     WBACK;
                     break;
                 case 3: // SBB
@@ -297,7 +303,7 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 ANDI(gd, gb1, 0xff);
                 if(eb2) {
                     MOV64x(x1, 0xffffffffffff00ffLL);
-                    ANDI(x1, eb1, x1);
+                    AND(x1, eb1, x1);
                     SLLI(gd, gd, 8);
                     OR(eb1, x1, gd);
                 } else {
@@ -316,7 +322,7 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGD;
             if(MODREG) {   // reg <= reg
                 MVxw(xRAX+(nextop&7)+(rex.b<<3), gd);
-            } else {                    // mem <= reg
+            } else {       // mem <= reg
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0);
                 SDxw(gd, ed, fixedaddress);
                 SMWRITELOCK(lock);
@@ -391,15 +397,13 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("LEA Gd, Ed");
             nextop=F8;
             GETGD;
-            if(MODREG) {   // reg <= reg? that's an invalid operation
+            if(MODREG) { // reg <= reg? that's an invalid operation
                 DEFAULT;
-            } else {                    // mem <= reg
-                addr = geted(dyn, addr, ninst, nextop, &ed, gd, x1, &fixedaddress, rex, NULL, 0, 0);
-                if(gd!=ed) {    // it's sometimes used as a 3 bytes NOP
-                    MV(gd, ed);
-                }
-                if(!rex.w) {
-                    ZEROUP(gd);   //truncate the higher 32bits as asked
+            } else {     // mem <= reg
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 0, 0);
+                MV(gd, ed);
+                if(!rex.w || rex.is32bits) {
+                    ZEROUP(gd); // truncate the higher 32bits as asked
                 }
             }
             break;
@@ -421,17 +425,17 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("POP Ed");
             nextop = F8;
             if(MODREG) {
-                POP1(xRAX+(nextop&7)+(rex.b<<3));
+                POP1z(xRAX+(nextop&7)+(rex.b<<3));
             } else {
-                POP1(x2); // so this can handle POP [ESP] and maybe some variant too
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0);
+                POP1z(x2); // so this can handle POP [ESP] and maybe some variant too
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, x1, &fixedaddress, rex, &lock, 1, 0);
                 if(ed==xRSP) {
-                    SD(x2, ed, fixedaddress);
+                    SDz(x2, ed, fixedaddress);
                 } else {
                     // complicated to just allow a segfault that can be recovered correctly
-                    SUB(xRSP, xRSP, 8);
-                    SD(x2, ed, fixedaddress);
-                    ADD(xRSP, xRSP, 8);
+                    ADDIz(xRSP, xRSP, rex.is32bits?-4:-8);
+                    SDz(x2, ed, fixedaddress);
+                    ADDIz(xRSP, xRSP, rex.is32bits?4:8);
                 }
             }
             break;
@@ -473,39 +477,68 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 ZEROUP(xRDX);
             }
             break;
+        case 0x9B:
+            INST_NAME("FWAIT");
+            break;
         case 0x9C:
             INST_NAME("PUSHF");
+            NOTEST(x1);
             READFLAGS(X_ALL);
             FLAGS_ADJUST_TO11(x3, xFlags, x2);
-            PUSH1(x3);
+            PUSH1z(x3);
             break;
         case 0x9D:
             INST_NAME("POPF");
             SETFLAGS(X_ALL, SF_SET);
-            POP1(xFlags);
+            POP1z(xFlags);
             FLAGS_ADJUST_FROM11(xFlags, x2);
             MOV32w(x1, 0x3F7FD7);
             AND(xFlags, xFlags, x1);
             ORI(xFlags, xFlags, 0x2);
             SET_DFNONE();
+            if(box64_wine) {    // should this be done all the time?
+                ANDI(x1, xFlags, 1 << F_TF);
+                CBZ_NEXT(x1);
+                MOV64x(xRIP, addr);
+                STORE_XEMU_CALL();
+                CALL(native_singlestep, -1);
+                ANDI(xFlags, xFlags, ~(1 << F_TF));
+            }
+            break;
+        case 0x9F:
+            INST_NAME("LAHF");
+            READFLAGS(X_CF|X_PF|X_AF|X_ZF|X_SF);
+            ANDI(x1, xFlags, 0xFF);
+            SLLI(x1, x1, 8);
+            MOV64x(x2, 0xffffffffffff00ffLL);
+            AND(xRAX, xRAX, x2);
+            OR(xRAX, xRAX, x1);
+            break;
+        case 0xA0:
+            INST_NAME("MOV AL,Ob");
+            if(rex.is32bits) u64 = F32; else u64 = F64;
+            MOV64z(x1, u64);
+            LBU(x1, x1, 0);
+            ANDI(xRAX, xRAX, ~0xff);
+            OR(xRAX, xRAX, x1);
             break;
         case 0xA1:
             INST_NAME("MOV EAX,Od");
-            u64 = F64;
-            MOV64x(x1, u64);
+            if(rex.is32bits) u64 = F32; else u64 = F64;
+            MOV64z(x1, u64);
             LDxw(xRAX, x1, 0);
             break;
         case 0xA2:
             INST_NAME("MOV Ob,AL");
-            u64 = F64;
-            MOV64x(x1, u64);
+            if(rex.is32bits) u64 = F32; else u64 = F64;
+            MOV64z(x1, u64);
             SB(xRAX, x1, 0);
             SMWRITE();
             break;
         case 0xA3:
             INST_NAME("MOV Od,EAX");
-            u64 = F64;
-            MOV64x(x1, u64);
+            if(rex.is32bits) u64 = F32; else u64 = F64;
+            MOV64z(x1, u64);
             SDxw(xRAX, x1, 0);
             SMWRITE();
             break;
@@ -628,6 +661,31 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             MOV64xw(x2, i64);
             emit_test32(dyn, ninst, rex, xRAX, x2, x3, x4, x5);
             break;
+        case 0xAA:
+            if(rep) {
+                INST_NAME("REP STOSB");
+                CBZ_NEXT(xRCX);
+                ANDI(x1, xFlags, 1<<F_DF);
+                BNEZ_MARK2(x1);
+                MARK;   // Part with DF==0
+                SB(xRAX, xRDI, 0);
+                ADDI(xRDI, xRDI, 1);
+                ADDI(xRCX, xRCX, -1);
+                BNEZ_MARK(xRCX);
+                B_NEXT_nocond;
+                MARK2;  // Part with DF==1
+                SB(xRAX, xRDI, 0);
+                ADDI(xRDI, xRDI, -1);
+                ADDI(xRCX, xRCX, -1);
+                BNEZ_MARK2(xRCX);
+                // done
+            } else {
+                INST_NAME("STOSB");
+                GETDIR(x3, x1, 1);
+                SB(xRAX, xRDI, 0);
+                ADD(xRDI, xRDI, x3);
+            }
+            break;
         case 0xAB:
             if(rep) {
                 INST_NAME("REP STOSD");
@@ -653,6 +711,82 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 ADD(xRDI, xRDI, x3);
             }
             break;
+        case 0xAE:
+            switch (rep) {
+            case 1:
+            case 2:
+                if (rep==1) {INST_NAME("REPNZ SCASB");} else {INST_NAME("REPZ SCASB");}
+                MAYSETFLAGS();
+                SETFLAGS(X_ALL, SF_SET_PENDING);
+                CBZ_NEXT(xRCX);
+                ANDI(x1, xRAX, 0xff);
+                ANDI(x2, xFlags, 1<<F_DF);
+                BNEZ_MARK2(x2);
+                MARK;   // Part with DF==0
+                LBU(x2, xRDI, 0);
+                ADDI(xRDI, xRDI, 1);
+                SUBI(xRCX, xRCX, 1);
+                if (rep==1) {BEQ_MARK3(x1, x2);} else {BNE_MARK3(x1, x2);}
+                BNE_MARK(xRCX, xZR);
+                B_MARK3_nocond;
+                MARK2;  // Part with DF==1
+                LBU(x2, xRDI, 0);
+                SUBI(xRDI, xRDI, 1);
+                SUBI(xRCX, xRCX, 1);
+                if (rep==1) {BEQ_MARK3(x1, x2);} else {BNE_MARK3(x1, x2);}
+                BNE_MARK2(xRCX, xZR);
+                MARK3; // end
+                emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5, x6);
+                break;
+            default:
+                INST_NAME("SCASB");
+                SETFLAGS(X_ALL, SF_SET_PENDING);
+                GETDIR(x3, x1, 1);
+                ANDI(x1, xRAX, 0xff);
+                LBU(x2, xRDI, 0);
+                ADD(xRDI, xRDI, x3);
+                emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5, x6);
+                break;
+            }
+            break;
+        case 0xAF:
+            switch (rep) {
+            case 1:
+            case 2:
+                if (rep==1) {INST_NAME("REPNZ SCASD");} else {INST_NAME("REPZ SCASD");}
+                MAYSETFLAGS();
+                SETFLAGS(X_ALL, SF_SET_PENDING);
+                CBZ_NEXT(xRCX);
+                if (rex.w) {MV(x1, xRAX);} else {AND(x1, xRAX, xMASK);}
+                ANDI(x2, xFlags, 1<<F_DF);
+                BNEZ_MARK2(x2);
+                MARK;   // Part with DF==0
+                LDxw(x2, xRDI, 0);
+                ADDI(xRDI, xRDI, rex.w?8:4);
+                SUBI(xRCX, xRCX, 1);
+                if (rep==1) {BEQ_MARK3(x1, x2);} else {BNE_MARK3(x1, x2);}
+                BNE_MARK(xRCX, xZR);
+                B_MARK3_nocond;
+                MARK2;  // Part with DF==1
+                LDxw(x2, xRDI, 0);
+                SUBI(xRDI, xRDI, rex.w?8:4);
+                SUBI(xRCX, xRCX, 1);
+                if (rep==1) {BEQ_MARK3(x1, x2);} else {BNE_MARK3(x1, x2);}
+                BNE_MARK2(xRCX, xZR);
+                MARK3; // end
+                emit_cmp32(dyn, ninst, rex, x1, x2, x3, x4, x5, x6);
+                break;
+            default:
+                INST_NAME("SCASD");
+                SETFLAGS(X_ALL, SF_SET_PENDING);
+                GETDIR(x3, x1, rex.w?8:4);
+                AND(x1, xRAX, xMASK);
+                LDxw(x2, xRDI, 0);
+                ADD(xRDI, xRDI, x3);
+                emit_cmp32(dyn, ninst, rex, x1, x2, x3, x4, x5, x6);
+                break;
+            }
+            break;
         case 0xB0:
         case 0xB1:
         case 0xB2:
diff --git a/src/dynarec/rv64/dynarec_rv64_00_3.c b/src/dynarec/rv64/dynarec_rv64_00_3.c
index a19f3f68..2be53fc8 100644
--- a/src/dynarec/rv64/dynarec_rv64_00_3.c
+++ b/src/dynarec/rv64/dynarec_rv64_00_3.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 #include <signal.h>
 #include <assert.h>
@@ -27,6 +26,7 @@
 #include "dynarec_rv64_helper.h"
 
 int isSimpleWrapper(wrapper_t fun);
+int isRetX87Wrapper(wrapper_t fun);
 
 uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
 {
@@ -66,6 +66,16 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     CALL_(rol8, ed, x3);
                     EBBACK(x5, 0);
                     break;
+                case 1:
+                    INST_NAME("ROR Eb, Ib");
+                    MESSAGE(LOG_DUMP, "Need Optimization\n");
+                    SETFLAGS(X_OF|X_CF, SF_SET);
+                    GETEB(x1, 1);
+                    u8 = F8;
+                    MOV32w(x2, u8);
+                    CALL_(ror8, ed, x3);
+                    EBBACK(x5, 0);
+                    break;
                 case 4:
                 case 6:
                     INST_NAME("SHL Eb, Ib");
@@ -187,7 +197,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             }
             BARRIER(BARRIER_FLOAT);
             i32 = F16;
-            retn_to_epilog(dyn, ninst, i32);
+            retn_to_epilog(dyn, ninst, rex, i32);
             *need_epilog = 0;
             *ok = 0;
             break;
@@ -198,7 +208,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 READFLAGS(X_PEND);  // so instead, force the deferred flags, so it's not too slow, and flags are not lost
             }
             BARRIER(BARRIER_FLOAT);
-            ret_to_epilog(dyn, ninst);
+            ret_to_epilog(dyn, ninst, rex);
             *need_epilog = 0;
             *ok = 0;
             break;
@@ -219,7 +229,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
 
                 if (eb2) {
                     // load a mask to x3 (ffffffffffff00ff)
-                    LUI(x3, 0xffffffffffff0);
+                    LUI(x3, 0xffff0);
                     ORI(x3, x3, 0xff);
                     // apply mask
                     AND(eb1, eb1, x3);
@@ -270,8 +280,8 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
 
         case 0xC9:
             INST_NAME("LEAVE");
-            MV(xRSP, xRBP);
-            POP1(xRBP);
+            MVz(xRSP, xRBP);
+            POP1z(xRBP);
             break;
 
         case 0xCC:
@@ -298,6 +308,9 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     // disabling isSimpleWrapper because all signed value less than 64bits needs to be sign extended
                     // and return value needs to be cleanned up
                     tmp = 0;//isSimpleWrapper(*(wrapper_t*)(addr));
+                    if(isRetX87Wrapper(*(wrapper_t*)(addr)))
+                        // return value will be on the stack, so the stack depth needs to be updated
+                        x87_purgecache(dyn, ninst, 0, x3, x1, x4);
                     if(tmp<0 || tmp>1)
                         tmp=0;  //TODO: removed when FP is in place
                     if((box64_log<2 && !cycle_log) && tmp) {
@@ -336,6 +349,39 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 #endif
             }
             break;
+        case 0xCD:
+            u8 = F8;
+            if (box64_wine && u8 == 0x2D) {
+                INST_NAME("INT 2D");
+                // lets do nothing
+                MESSAGE(LOG_INFO, "INT 2D Windows anti-debug hack\n");
+            } else if (u8 == 0x80) {
+                INST_NAME("32bits SYSCALL");
+                NOTEST(x1);
+                SMEND();
+                GETIP(addr);
+                STORE_XEMU_CALL();
+                CALL_S(x86Syscall, -1);
+                LOAD_XEMU_CALL();
+                TABLE64(x3, addr); // expected return address
+                BNE_MARK(xRIP, x3);
+                LW(x1, xEmu, offsetof(x64emu_t, quit));
+                BEQ_NEXT(x1, xZR);
+                MARK;
+                LOAD_XEMU_REM();
+                jump_to_epilog(dyn, 0, xRIP, ninst);
+            } else {
+                INST_NAME("INT n");
+                SETFLAGS(X_ALL, SF_SET); // Hack to set flags in "don't care" state
+                GETIP(ip);
+                STORE_XEMU_CALL();
+                CALL(native_priv, -1);
+                LOAD_XEMU_CALL();
+                jump_to_epilog(dyn, 0, xRIP, ninst);
+                *need_epilog = 0;
+                *ok = 0;
+            }
+            break;
         case 0xCF:
             INST_NAME("IRET");
             SETFLAGS(X_ALL, SF_SET);    // Not a hack, EFLAGS are restored
@@ -348,6 +394,24 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xD2:  // TODO: Jump if CL is 0
             nextop = F8;
             switch((nextop>>3)&7) {
+                case 0:
+                    if(opcode==0xD0) {
+                        INST_NAME("ROL Eb, 1");
+                        MOV32w(x2, 1);
+                    } else {
+                        INST_NAME("ROL Eb, CL");
+                        ANDI(x2, xRCX, 7);
+                    }
+                    SETFLAGS(X_OF|X_CF, SF_PENDING);
+                    GETEB(x1, 0);
+                    UFLAG_OP12(ed, x2);
+                    SLL(x3, ed, x2);
+                    SRLI(x4, x3, 8);
+                    OR(ed, x3, x4);
+                    EBBACK(x5, 1);
+                    UFLAG_RES(ed);
+                    UFLAG_DF(x3, d_rol8);
+                    break;
                 case 1:
                     if(opcode==0xD0) {
                         INST_NAME("ROR Eb, 1");
@@ -367,6 +431,23 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     UFLAG_RES(ed);
                     UFLAG_DF(x3, d_ror8);
                     break;
+                case 4:
+                case 6:
+                    if(opcode==0xD0) {
+                        INST_NAME("SHL Eb, 1");
+                        MOV32w(x2, 1);
+                    } else {
+                        INST_NAME("SHL Eb, CL");
+                        ANDI(x2, xRCX, 7);
+                    }
+                    SETFLAGS(X_ALL, SF_PENDING);
+                    GETEB(x1, 0);
+                    UFLAG_OP12(ed, x2)
+                    SLL(ed, ed, x2);
+                    EBBACK(x5, 1);
+                    UFLAG_RES(ed);
+                    UFLAG_DF(x3, d_shl8);
+                    break;
                 case 5:
                     if(opcode==0xD0) {
                         INST_NAME("SHR Eb, 1");
@@ -422,6 +503,16 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     WBACK;
                     if(!wback && !rex.w) ZEROUP(ed);
                     break;
+                case 3:
+                    INST_NAME("RCR Ed, 1");
+                    MESSAGE(LOG_DUMP, "Need Optimization\n");
+                    READFLAGS(X_CF);
+                    SETFLAGS(X_OF|X_CF, SF_SET);
+                    MOV32w(x2, 1);
+                    GETEDW(x4, x1, 0);
+                    CALL_(rex.w?((void*)rcr64):((void*)rcr32), ed, x4);
+                    WBACK;
+                    break;
                 case 4:
                 case 6:
                     INST_NAME("SHL Ed, 1");
@@ -517,6 +608,12 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xDB:
             addr = dynarec64_DB(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
             break;
+        case 0xDC:
+            addr = dynarec64_DC(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xDD:
+            addr = dynarec64_DD(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
 
         case 0xDE:
             addr = dynarec64_DE(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
@@ -534,7 +631,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 #endif
             }
             #if STEP < 2
-            if(isNativeCall(dyn, addr+i32, &dyn->insts[ninst].natcall, &dyn->insts[ninst].retn))
+            if(!rex.is32bits && isNativeCall(dyn, addr+i32, &dyn->insts[ninst].natcall, &dyn->insts[ninst].retn))
                 tmp = dyn->insts[ninst].pass2choice = 3;
             else
                 tmp = dyn->insts[ninst].pass2choice = 0;
@@ -564,6 +661,9 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                             tmp=0;  // float paramters not ready!
                     } else
                         tmp=0;
+                    if(dyn->insts[ninst].natcall && isRetX87Wrapper(*(wrapper_t*)(dyn->insts[ninst].natcall+2)))
+                        // return value will be on the stack, so the stack depth needs to be updated
+                        x87_purgecache(dyn, ninst, 0, x3, x1, x4);
                     if((box64_log<2 && !cycle_log) && dyn->insts[ninst].natcall && tmp) {
                         //GETIP(ip+3+8+8); // read the 0xCC
                         call_n(dyn, ninst, *(void**)(dyn->insts[ninst].natcall+2+8), tmp);
@@ -611,12 +711,13 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         *need_epilog = 0;
                         *ok = 0;
                     }
-                    if(addr<0x100000000LL) {
-                        MOV64x(x2, addr);
+
+                    if(rex.is32bits) {
+                        MOV32w(x2, addr);
                     } else {
                         TABLE64(x2, addr);
                     }
-                    PUSH1(x2);
+                    PUSH1z(x2);
                     // TODO: Add support for CALLRET optim
                     /*if(box64_dynarec_callret) {
                         // Push actual return address
@@ -636,16 +737,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         *ok = 0;
                         *need_epilog = 0;
                     }
-                    if(addr+i32==0) {   // self modifying code maybe? so use indirect address fetching
-                        if(addr-4<0x100000000LL) {
-                            MOV64x(x4, addr-4);
-                        } else {
-                            TABLE64(x4, addr-4);
-                        }
-                        LD(x4, x4, 0);
-                        jump_to_next(dyn, 0, x4, ninst);
-                    } else
-                        jump_to_next(dyn, addr+i32, 0, ninst);
+                    jump_to_next(dyn, addr+i32, 0, ninst);
                     break;
             }
             break;
@@ -659,11 +751,11 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 INST_NAME("JMP Ib");
                 i32 = F8S;
             }
-            JUMP(addr+i32, 0);
+            JUMP((uintptr_t)getAlternate((void*)(addr+i32)), 0);
             if(dyn->insts[ninst].x64.jmp_insts==-1) {
                 // out of the block
                 fpu_purgecache(dyn, ninst, 1, x1, x2, x3);
-                jump_to_next(dyn, addr+i32, 0, ninst);
+                jump_to_next(dyn, (uintptr_t)getAlternate((void*)(addr+i32)), 0, ninst);
             } else {
                 // inside the block
                 CacheTransform(dyn, ninst, CHECK_CACHE(), x1, x2, x3);
@@ -681,6 +773,12 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xF0:
             addr = dynarec64_F0(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
             break;
+        case 0xF5:  // CMC: complement the carry flag
+            INST_NAME("CMC");
+            READFLAGS(X_CF);    // the current CF value is an input
+            SETFLAGS(X_CF, SF_SUBSET);  // only CF is (re)defined by this opcode
+            XORI(xFlags, xFlags, 1<<F_CF);  // toggle bit F_CF of the flags register in place
+            break;
         case 0xF6:
             nextop = F8;
             switch((nextop>>3)&7) {
@@ -716,8 +814,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     UFLAG_RES(x1);
                     LUI(x2, 0xffff0);
                     AND(xRAX, xRAX, x2);
-                    SLLI(x1, x1, 48);
-                    SRLI(x1, x1, 48);
+                    ZEXTH(x1, x1);
                     OR(xRAX, xRAX, x1);
                     break;
                 case 5:
@@ -731,8 +828,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     UFLAG_RES(x1);
                     LUI(x2, 0xffff0);
                     AND(xRAX, xRAX, x2);
-                    SLLI(x1, x1, 48);
-                    SRLI(x1, x1, 48);
+                    ZEXTH(x1, x1);
                     OR(xRAX, xRAX, x1);
                     break;
                 case 6:
@@ -840,9 +936,9 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         AND(xRAX, x2, xMASK);
                         ZEROUP(xRDX);
                     } else {
-                        if(ninst 
-                           && dyn->insts[ninst-1].x64.addr 
-                           && *(uint8_t*)(dyn->insts[ninst-1].x64.addr)==0x31 
+                        if(ninst
+                           && dyn->insts[ninst-1].x64.addr
+                           && *(uint8_t*)(dyn->insts[ninst-1].x64.addr)==0x31
                            && *(uint8_t*)(dyn->insts[ninst-1].x64.addr+1)==0xD2) {
                             SET_DFNONE();
                             GETED(0);
@@ -879,7 +975,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         ZEROUP(xRDX);
                     } else {
                         if(ninst && dyn->insts
-                           &&  dyn->insts[ninst-1].x64.addr 
+                           &&  dyn->insts[ninst-1].x64.addr
                            && *(uint8_t*)(dyn->insts[ninst-1].x64.addr)==0x48
                            && *(uint8_t*)(dyn->insts[ninst-1].x64.addr+1)==0x99) {
                             SET_DFNONE()
@@ -970,7 +1066,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     break;
                 case 2: // CALL Ed
                     INST_NAME("CALL Ed");
-                    PASS2IF((box64_dynarec_safeflags>1) || 
+                    PASS2IF((box64_dynarec_safeflags>1) ||
                         ((ninst && dyn->insts[ninst-1].x64.set_flags)
                         || ((ninst>1) && dyn->insts[ninst-2].x64.set_flags)), 1)
                     {
@@ -978,7 +1074,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     } else {
                         SETFLAGS(X_ALL, SF_SET);    //Hack to put flag in "don't care" state
                     }
-                    GETEDx(0);
+                    GETEDz(0);
                     if(box64_dynarec_callret && box64_dynarec_bigblock>1) {
                         BARRIER(BARRIER_FULL);
                     } else {
@@ -1001,22 +1097,41 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         }
                         STPx_S7_preindex(x4, xRIP, xSP, -16);
                     }*/
-                    PUSH1(xRIP);
+                    PUSH1z(xRIP);
                     jump_to_next(dyn, 0, ed, ninst);
                     break;
                 case 4: // JMP Ed
                     INST_NAME("JMP Ed");
                     READFLAGS(X_PEND);
                     BARRIER(BARRIER_FLOAT);
-                    GETEDx(0);
+                    GETEDz(0);
                     jump_to_next(dyn, 0, ed, ninst);
                     *need_epilog = 0;
                     *ok = 0;
                     break;
+                case 5: // JMP FAR Ed: indirect far jump through a memory operand
+                    if(MODREG) {    // register operand is not handled here
+                        DEFAULT;
+                    } else {
+                        INST_NAME("JMP FAR Ed");
+                        READFLAGS(X_PEND);
+                        BARRIER(BARRIER_FLOAT);
+                        SMREAD()
+                        addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 0, 0);
+                        LDxw(x1, wback, 0); // jump target is read from the start of the memory operand
+                        ed = x1;
+                        LHU(x3, wback, rex.w?8:4);  // 16-bit value located right after the 8- or 4-byte target
+                        SW(x3, xEmu, offsetof(x64emu_t, segs[_CS]));    // NOTE(review): this is a 32-bit store; confirm the element width of segs[] (a 16-bit element would want SH)
+                        SW(xZR, xEmu, offsetof(x64emu_t, segs_serial[_CS]));    // zero the serial stored alongside CS
+                        jump_to_epilog(dyn, 0, ed, ninst);
+                        *need_epilog = 0;
+                        *ok = 0;
+                    }
+                    break;
                 case 6: // Push Ed
                     INST_NAME("PUSH Ed");
-                    GETEDx(0);
-                    PUSH1(ed);
+                    GETEDz(0);
+                    PUSH1z(ed);
                     break;
 
                 default:
diff --git a/src/dynarec/rv64/dynarec_rv64_0f.c b/src/dynarec/rv64/dynarec_rv64_0f.c
index a3d9efc1..5c8d7b81 100644
--- a/src/dynarec/rv64/dynarec_rv64_0f.c
+++ b/src/dynarec/rv64/dynarec_rv64_0f.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -42,7 +41,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
     int s0, s1;
     uint64_t tmp64u;
     int64_t j64;
-    int64_t fixedaddress;
+    int64_t fixedaddress, gdoffset;
     int unscaled;
     MAYUSE(wb2);
     MAYUSE(gback);
@@ -113,24 +112,36 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             *ok = 0;
             break;
 
+        case 0x0D:
+            nextop = F8;
+            switch((nextop>>3)&7) {
+                case 1:
+                    INST_NAME("PREFETCHW");
+                    // nop without Zicbom, Zicbop, Zicboz extensions
+                    FAKEED;
+                    break;
+                default:    //???
+                    DEFAULT;
+            }
+            break;
 
         case 0x10:
             INST_NAME("MOVUPS Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             LD(x3, wback, fixedaddress+0);
             LD(x4, wback, fixedaddress+8);
-            SD(x3, gback, 0);
-            SD(x4, gback, 8);
+            SD(x3, gback, gdoffset+0);
+            SD(x4, gback, gdoffset+8);
             break;
         case 0x11:
             INST_NAME("MOVUPS Ex,Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
-            LD(x3, gback, 0);
-            LD(x4, gback, 8);
+            LD(x3, gback, gdoffset+0);
+            LD(x4, gback, gdoffset+8);
             SD(x3, wback, fixedaddress+0);
             SD(x4, wback, fixedaddress+8);
             if(!MODREG)
@@ -140,10 +151,10 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             if(MODREG) {
                 INST_NAME("MOVHLPS Gx,Ex");
-                GETGX(x1);
+                GETGX();
                 GETEX(x2, 0);
                 LD(x3, wback, fixedaddress+8);
-                SD(x3, gback, 0);
+                SD(x3, gback, gdoffset+0);
             } else {
                 INST_NAME("MOVLPS Gx,Ex");
                 GETEXSD(v0, 0);
@@ -154,9 +165,9 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0x13:
             INST_NAME("MOVLPS Ex,Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
-            LD(x3, gback, 0);
+            LD(x3, gback, gdoffset+0);
             SD(x3, wback, fixedaddress+0);
             if(!MODREG)
                 SMWRITE2();
@@ -164,28 +175,28 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0x14:
             INST_NAME("UNPCKLPS Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
-            LWU(x5, gback, 1*4);
+            LWU(x5, gback, gdoffset+1*4);
             LWU(x3, wback, fixedaddress+0);
             LWU(x4, wback, fixedaddress+4);
-            SW(x4, gback, 3*4);
-            SW(x5, gback, 2*4);
-            SW(x3, gback, 1*4);
+            SW(x4, gback, gdoffset+3*4);
+            SW(x5, gback, gdoffset+2*4);
+            SW(x3, gback, gdoffset+1*4);
             break;
         case 0x15:
             INST_NAME("UNPCKHPS Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             LWU(x3, wback, fixedaddress+2*4);
             LWU(x4, wback, fixedaddress+3*4);
-            LWU(x5, gback, 2*4);
-            LWU(x6, gback, 3*4);
-            SW(x5, gback, 0*4);
-            SW(x3, gback, 1*4);
-            SW(x6, gback, 2*4);
-            SW(x4, gback, 3*4);
+            LWU(x5, gback, gdoffset+2*4);
+            LWU(x6, gback, gdoffset+3*4);
+            SW(x5, gback, gdoffset+0*4);
+            SW(x3, gback, gdoffset+1*4);
+            SW(x6, gback, gdoffset+2*4);
+            SW(x4, gback, gdoffset+3*4);
             break;
         case 0x16:
             nextop = F8;
@@ -195,17 +206,17 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 INST_NAME("MOVHPS Gx,Ex");
                 SMREAD();
             }
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             LD(x4, wback, fixedaddress+0);
-            SD(x4, gback, 8);
+            SD(x4, gback, gdoffset+8);
             break;
         case 0x17:
             INST_NAME("MOVHPS Ex,Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
-            LD(x4, gback, 8);
+            LD(x4, gback, gdoffset+8);
             SD(x4, wback, fixedaddress+0);
             if(!MODREG)
                 SMWRITE2();
@@ -217,16 +228,11 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             } else
             switch((nextop>>3)&7) {
                 case 0:
-                    DEFAULT;
-                    break;
                 case 1:
-                    DEFAULT;
-                    break;
                 case 2:
-                    DEFAULT;
-                    break;
                 case 3:
-                    DEFAULT;
+                    INST_NAME("PREFETCHh Ed");
+                    FAKEED;
                     break;
                 default:
                     INST_NAME("NOP (multibyte)");
@@ -243,14 +249,14 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0x28:
             INST_NAME("MOVAPS Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q(x3);
             break;
         case 0x29:
             INST_NAME("MOVAPS Ex,Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q2(x3);
             if(!MODREG)
@@ -260,10 +266,10 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0x2B:
             INST_NAME("MOVNTPS Ex,Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
-            LD(x3, gback, 0);
-            LD(x4, gback, 8);
+            LD(x3, gback, gdoffset+0);
+            LD(x4, gback, gdoffset+8);
             SD(x3, wback, fixedaddress+0);
             SD(x4, wback, fixedaddress+8);
             break;
@@ -304,10 +310,11 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             break;
         case 0x31:
             INST_NAME("RDTSC");
+            NOTEST(x1);
             MESSAGE(LOG_DUMP, "Need Optimization\n");
-            CALL(ReadTSC, xRAX);   // will return the u64 in xEAX
-            SRLI(xRDX, xRAX, 32);
-            ZEROUP(xRAX);   // wipe upper part
+            CALL(ReadTSC, x3);   // will return the u64 in x3
+            SRLI(xRDX, x3, 32);  // EDX = high 32 bits of the counter
+            AND(xRAX, x3, xMASK);   // EAX = low 32 bits; AND takes a register operand, so mask with xMASK instead of the literal 32
             break;
 
 
@@ -342,12 +349,72 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 OR(gd, gd, x2);
             }
             break;
+        case 0x51:
+            INST_NAME("SQRTPS Gx, Ex");
+            nextop = F8;
+            GETGX();
+            GETEX(x2, 0);
+            d0 = fpu_get_scratch(dyn);
+            for(int i=0; i<4; ++i) {
+                FLW(d0, wback, fixedaddress+4*i);
+                FSQRTS(d0, d0);
+                FSW(d0, gback, gdoffset+4*i);
+            }
+            break;
+        case 0x52:  // RSQRTPS: per-lane 1/sqrt(x) on four packed singles
+            INST_NAME("RSQRTPS Gx, Ex");
+            nextop = F8;
+            GETGX();
+            GETEX(x2, 0);
+            s0 = fpu_get_scratch(dyn);
+            s1 = fpu_get_scratch(dyn); // 1.0f
+            v0 = fpu_get_scratch(dyn); // 0.0f
+            // do accurate computation, because riscv doesn't have rsqrt
+            MOV32w(x3, 1);
+            FCVTSW(s1, x3, RD_DYN);
+            if (!box64_dynarec_fastnan) {
+                FCVTSW(v0, xZR, RD_DYN);
+            }
+            for(int i=0; i<4; ++i) {
+                FLW(s0, wback, fixedaddress+i*4);
+                if (!box64_dynarec_fastnan) {
+                    FLES(x3, v0, s0); // s0 >= 0.0f?
+                    BNEZ(x3, 6*4);    // yes: skip ahead to the regular computation
+                    FEQS(x3, s0, s0); // isnan(s0)?
+                    BEQZ(x3, 2*4);    // already a NaN: skip the division and store it as is
+                    // s0 is negative, so generate a NaN: 0.0f/0.0f is NaN (1.0f/0.0f would be +Inf)
+                    FDIVS(s0, v0, v0);
+                    // s0 is a NaN, just copy it
+                    FSW(s0, gback, gdoffset+i*4);
+                    J(4*4);           // jump over the regular computation below
+                    // do regular computation
+                }
+                FSQRTS(s0, s0);
+                FDIVS(s0, s1, s0);
+                FSW(s0, gback, gdoffset+i*4);
+            }
+            break;
+        case 0x53:  // RCPPS: per-lane reciprocal of four packed singles, done with a real divide
+            INST_NAME("RCPPS Gx, Ex");
+            nextop = F8;
+            GETGX();
+            GETEX(x2, 0);
+            d0 = fpu_get_scratch(dyn);  // holds the constant numerator
+            d1 = fpu_get_scratch(dyn);  // per-lane working value
+            LUI(x3, 0x3f800);   // upper-immediate load: x3 = 0x3f800000
+            FMVWX(d0, x3); // 1.0f
+            for(int i=0; i<4; ++i) {
+                FLW(d1, wback, fixedaddress+4*i);   // load source lane i
+                FDIVS(d1, d0, d1);                  // d1 = 1.0f / d1
+                FSW(d1, gback, gdoffset+4*i);       // store into destination lane i
+            }
+            break;
         case 0x54:
             INST_NAME("ANDPS Gx, Ex");
             nextop = F8;
             gd = ((nextop&0x38)>>3)+(rex.r<<3);
             if(!(MODREG && gd==(nextop&7)+(rex.b<<3))) {
-                GETGX(x1);
+                GETGX();
                 GETEX(x2, 0);
                 SSE_LOOP_Q(x3, x4, AND(x3, x3, x4));
             }
@@ -355,7 +422,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0x55:
             INST_NAME("ANDNPS Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, NOT(x3, x3); AND(x3, x3, x4));
             break;
@@ -364,7 +431,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             gd = ((nextop&0x38)>>3)+(rex.r<<3);
             if(!(MODREG && gd==(nextop&7)+(rex.b<<3))) {
-                GETGX(x1);
+                GETGX();
                 GETEX(x2, 0);
                 SSE_LOOP_Q(x3, x4, OR(x3, x3, x4));
             }
@@ -373,12 +440,12 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("XORPS Gx, Ex");
             nextop = F8;
             //TODO: it might be possible to check if SS or SD are used and not purge them to optimize a bit
-            GETGX(x1);
+            GETGX();
             if(MODREG && gd==(nextop&7)+(rex.b<<3))
             {
                 // just zero dest
-                SD(xZR, x1, 0);
-                SD(xZR, x1, 8);
+                SD(xZR, gback, gdoffset+0);
+                SD(xZR, gback, gdoffset+8);
             } else {
                 GETEX(x2, 0);
                 SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
@@ -387,37 +454,37 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0x58:
             INST_NAME("ADDPS Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
             for(int i=0; i<4; ++i) {
                 // GX->f[i] += EX->f[i];
                 FLW(s0, wback, fixedaddress+i*4);
-                FLW(s1, gback, i*4);
+                FLW(s1, gback, gdoffset+i*4);
                 FADDS(s1, s1, s0);
-                FSW(s1, gback, i*4);
+                FSW(s1, gback, gdoffset+i*4);
             }
             break;
         case 0x59:
             INST_NAME("MULPS Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
             for(int i=0; i<4; ++i) {
                 // GX->f[i] *= EX->f[i];
                 FLW(s0, wback, fixedaddress+i*4);
-                FLW(s1, gback, i*4);
+                FLW(s1, gback, gdoffset+i*4);
                 FMULS(s1, s1, s0);
-                FSW(s1, gback, i*4);
+                FSW(s1, gback, gdoffset+i*4);
             }
             break;
         case 0x5A:
             INST_NAME("CVTPS2PD Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
@@ -425,46 +492,46 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             FLW(s1, wback, fixedaddress+4);
             FCVTDS(s0, s0);
             FCVTDS(s1, s1);
-            FSD(s0, gback, 0);
-            FSD(s1, gback, 8);
+            FSD(s0, gback, gdoffset+0);
+            FSD(s1, gback, gdoffset+8);
             break;
         case 0x5B:
             INST_NAME("CVTDQ2PS Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             for (int i=0; i<4; ++i) {
                 LW(x3, wback, fixedaddress+i*4);
                 FCVTSW(s0, x3, RD_RNE);
-                FSW(s0, gback, i*4);
+                FSW(s0, gback, gdoffset+i*4);
             }
             break;
         case 0x5C:
             INST_NAME("SUBPS Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
             for(int i=0; i<4; ++i) {
                 // GX->f[i] -= EX->f[i];
                 FLW(s0, wback, fixedaddress+i*4);
-                FLW(s1, gback, i*4);
+                FLW(s1, gback, gdoffset+i*4);
                 FSUBS(s1, s1, s0);
-                FSW(s1, gback, i*4);
+                FSW(s1, gback, gdoffset+i*4);
             }
             break;
         case 0x5D:
             INST_NAME("MINPS Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
             for(int i=0; i<4; ++i) {
                 FLW(s0, wback, fixedaddress+i*4);
-                FLW(s1, gback, i*4);
+                FLW(s1, gback, gdoffset+i*4);
                 if(!box64_dynarec_fastnan) {
                     FEQS(x3, s0, s0);
                     FEQS(x4, s1, s1);
@@ -472,38 +539,38 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     BEQZ(x3, 12);
                     FLTS(x3, s0, s1);
                     BEQZ(x3, 8);
-                    FSW(s0, gback, i*4);
+                    FSW(s0, gback, gdoffset+i*4);
                 } else {
                     FMINS(s1, s1, s0);
-                    FSW(s1, gback, i*4);
+                    FSW(s1, gback, gdoffset+i*4);
                 }
             }
             break;
         case 0x5E:
             INST_NAME("DIVPS Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
             for(int i=0; i<4; ++i) {
                 // GX->f[i] /= EX->f[i];
                 FLW(s0, wback, fixedaddress+i*4);
-                FLW(s1, gback, i*4);
+                FLW(s1, gback, gdoffset+i*4);
                 FDIVS(s1, s1, s0);
-                FSW(s1, gback, i*4);
+                FSW(s1, gback, gdoffset+i*4);
             }
             break;
         case 0x5F:
             INST_NAME("MAXPS Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             s0 = fpu_get_scratch(dyn);
             s1 = fpu_get_scratch(dyn);
             for(int i=0; i<4; ++i) {
                 FLW(s0, wback, fixedaddress+i*4);
-                FLW(s1, gback, i*4);
+                FLW(s1, gback, gdoffset+i*4);
                 if(!box64_dynarec_fastnan) {
                     FEQS(x3, s0, s0);
                     FEQS(x4, s1, s1);
@@ -511,13 +578,242 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     BEQZ(x3, 12);
                     FLTS(x3, s1, s0);
                     BEQZ(x3, 8);
-                    FSW(s0, gback, i*4);
+                    FSW(s0, gback, gdoffset+i*4);
                 } else {
                     FMAXS(s1, s1, s0);
-                    FSW(s1, gback, i*4);
+                    FSW(s1, gback, gdoffset+i*4);
+                }
+            }
+            break;
+        case 0x60:
+            INST_NAME("PUNPCKLBW Gm,Em");
+            nextop = F8;
+            GETGM();
+            for(int i=3; i>0; --i) { // 0 is untouched
+                // GX->ub[2 * i] = GX->ub[i];
+                LBU(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset+2*i);
+            }
+            if (MODREG && gd==(nextop&7)) {
+                for(int i=0; i<4; ++i) {
+                    // GX->ub[2 * i + 1] = GX->ub[2 * i];
+                    LBU(x3, gback, gdoffset+2*i);
+                    SB(x3, gback, gdoffset+2*i+1);
+                }
+            } else {
+                GETEM(x2, 0);
+                for(int i=0; i<4; ++i) {
+                    // GX->ub[2 * i + 1] = EX->ub[i];
+                    LBU(x3, wback, fixedaddress+i);
+                    SB(x3, gback, gdoffset+2*i+1);
+                }
+            }
+            break;
+        case 0x61:
+            INST_NAME("PUNPCKLWD Gm, Em");
+            nextop = F8;
+            GETGM();
+            GETEM(x2, 0);
+            // GM->uw[3] = EM->uw[1];
+            LHU(x3, wback, fixedaddress+2*1);
+            SH(x3, gback, gdoffset+2*3);
+            // GM->uw[2] = GM->uw[1];
+            LHU(x3, gback, gdoffset+2*1);
+            SH(x3, gback, gdoffset+2*2);
+            // GM->uw[1] = EM->uw[0];
+            LHU(x3, wback, fixedaddress+2*0);
+            SH(x3, gback, gdoffset+2*1);
+            break;
+        case 0x62:
+            INST_NAME("PUNPCKLDQ Gm, Em");
+            nextop = F8;
+            GETGM();
+            GETEM(x2, 0);
+            // GM->ud[1] = EM->ud[0];
+            LWU(x3, wback, fixedaddress);
+            SW(x3, gback, gdoffset+4*1);
+            break;
+        case 0x67:  // PACKUSWB: pack signed words into unsigned bytes with saturation to [0, 0xFF]
+            INST_NAME("PACKUSWB Gm, Em");
+            nextop = F8;
+            GETGM();
+            ADDI(x5, xZR, 0xFF);    // x5 = upper clamp bound, reused by every lane
+            for(int i=0; i<4; ++i) {
+                // GX->ub[i] = (GX->sw[i]<0)?0:((GX->sw[i]>0xff)?0xff:GX->sw[i]);
+                LH(x3, gback, gdoffset+i*2);    // sign-extending halfword load
+                BGE(x5, x3, 8);                 // value <= 0xFF: skip the clamp on the next line
+                ADDI(x3, xZR, 0xFF);            // clamp high
+                NOT(x4, x3);                    // sign bit of x4 is set only when x3 is non-negative
+                SRAI(x4, x4, 63);               // x4 = all ones if x3 >= 0, else zero
+                AND(x3, x3, x4);                // negative values become 0
+                SB(x3, gback, gdoffset+i);
+            }
+            if (MODREG && gd==(nextop&7)) { // source is the same register: its packed bytes were just computed above
+                // GM->ud[1] = GM->ud[0];
+                LW(x3, gback, gdoffset+0*4);
+                SW(x3, gback, gdoffset+1*4);
+            } else {
+                GETEM(x1, 0);
+                for(int i=0; i<4; ++i) {
+                    // GX->ub[4+i] = (EX->sw[i]<0)?0:((EX->sw[i]>0xff)?0xff:EX->sw[i]);
+                    LH(x3, wback, fixedaddress+i*2);    // sign-extending halfword load
+                    BGE(x5, x3, 8);                     // value <= 0xFF: skip the clamp on the next line
+                    ADDI(x3, xZR, 0xFF);                // clamp high
+                    NOT(x4, x3);                        // same negative-to-zero mask as in the first loop
+                    SRAI(x4, x4, 63);
+                    AND(x3, x3, x4);
+                    SB(x3, gback, gdoffset+4+i);
+                }
+            }
+            break;
+        case 0x68:
+            INST_NAME("PUNPCKHBW Gm,Em");
+            nextop = F8;
+            GETGM();
+            for(int i=0; i<4; ++i) {
+                // GX->ub[2 * i] = GX->ub[i + 4];
+                LBU(x3, gback, gdoffset+i+4);
+                SB(x3, gback, gdoffset+2*i);
+            }
+            if (MODREG && gd==(nextop&7)) {
+                for(int i=0; i<4; ++i) {
+                    // GX->ub[2 * i + 1] = GX->ub[2 * i];
+                    LBU(x3, gback, gdoffset+2*i);
+                    SB(x3, gback, gdoffset+2*i+1);
+                }
+            } else {
+                GETEM(x2, 0);
+                for(int i=0; i<4; ++i) {
+                    // GX->ub[2 * i + 1] = EX->ub[i + 4];
+                    LBU(x3, wback, fixedaddress+i+4);
+                    SB(x3, gback, gdoffset+2*i+1);
+                }
+            }
+            break;
+        case 0x69:
+            INST_NAME("PUNPCKHWD Gm,Em");
+            nextop = F8;
+            GETGM();
+            for(int i=0; i<2; ++i) {
+                // GX->uw[2 * i] = GX->uw[i + 2];
+                LHU(x3, gback, gdoffset+(i+2)*2);
+                SH(x3, gback, gdoffset+2*i*2);
+            }
+            if (MODREG && gd==(nextop&7)) {
+                for(int i=0; i<2; ++i) {
+                    // GX->uw[2 * i + 1] = GX->uw[2 * i];
+                    LHU(x3, gback, gdoffset+2*i*2);
+                    SH(x3, gback, gdoffset+(2*i+1)*2);
+                }
+            } else {
+                GETEM(x1, 0);
+                for(int i=0; i<2; ++i) {
+                    // GX->uw[2 * i + 1] = EX->uw[i + 2];
+                    LHU(x3, wback, fixedaddress+(i+2)*2);
+                    SH(x3, gback, gdoffset+(2*i+1)*2);
+                }
+            }
+            break;
+        case 0x6A:
+            INST_NAME("PUNPCKHDQ Gm,Em");
+            nextop = F8;
+            GETEM(x1, 0);
+            GETGM();
+            // GM->ud[0] = GM->ud[1];
+            LWU(x3, gback, gdoffset+1*4);
+            SW(x3, gback, gdoffset+0*4);
+            if (!(MODREG && (gd==ed))) {
+                // GM->ud[1] = EM->ud[1];
+                LWU(x3, wback, fixedaddress+1*4);
+                SW(x3, gback, gdoffset+1*4);
+            }
+            break;
+        case 0x6E:  // MOVD/MOVQ Gm, Ed: move a general register or memory value into an MMX register
+            INST_NAME("MOVD Gm, Ed");
+            nextop = F8;
+            GETGM();
+            if(MODREG) {
+                ed = xRAX + (nextop&7) + (rex.b<<3);    // source is a general-purpose register, used directly
+            } else {
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 1, 0);
+                if(rex.w) {
+                    LD(x4, ed, fixedaddress);
+                } else {
+                    LW(x4, ed, fixedaddress);
                 }
+                ed = x4;    // memory source: continue with the loaded value
+            }
+            if(rex.w) SD(ed, gback, gdoffset+0); else { SW(ed, gback, gdoffset+0); SW(xZR, gback, gdoffset+4); }   // 32-bit form zero-extends: clear the upper dword instead of leaving stale bits
+            break;
+        case 0x6F:
+            INST_NAME("MOVQ Gm, Em");
+            nextop = F8;
+            GETGM();
+            GETEM(x2, 0);
+            LD(x3, wback, fixedaddress);
+            SD(x3, gback, gdoffset+0);
+            break;
+        case 0x71:
+            nextop = F8;
+            switch((nextop>>3)&7) {
+                case 2:
+                    INST_NAME("PSRLW Em, Ib");
+                    GETEM(x1, 1);
+                    u8 = F8;
+                    if (u8>15) {
+                        // just zero dest
+                        SD(xZR, wback, fixedaddress);
+                    } else if(u8) {
+                        for (int i=0; i<4; ++i) {
+                            // EX->uw[i] >>= u8;
+                            LHU(x3, wback, fixedaddress+i*2);
+                            SRLI(x3, x3, u8);
+                            SH(x3, wback, fixedaddress+i*2);
+                        }
+                    }
+                    break;
+                case 4:
+                    INST_NAME("PSRAW Em, Ib");
+                    GETEM(x1, 1);
+                    u8 = F8;
+                    if(u8>15) u8=15;
+                    if(u8) {
+                        for (int i=0; i<4; ++i) {
+                            // EX->sw[i] >>= u8;
+                            LH(x3, wback, fixedaddress+i*2);
+                            SRAI(x3, x3, u8);
+                            SH(x3, wback, fixedaddress+i*2);
+                        }
+                    }
+                    break;
+                case 6:
+                    INST_NAME("PSLLW Em, Ib");
+                    GETEM(x1, 1);
+                    u8 = F8;
+                    if (u8>15) {
+                        // just zero dest
+                        SD(xZR, wback, fixedaddress+0);
+                    } else if(u8) {
+                        for (int i=0; i<4; ++i) {
+                            // EX->uw[i] <<= u8;
+                            LHU(x3, wback, fixedaddress+i*2);
+                            SLLI(x3, x3, u8);
+                            SH(x3, wback, fixedaddress+i*2);
+                        }
+                    }
+                    break;
+                default:
+                    *ok = 0;
+                    DEFAULT;
             }
             break;
+        case 0x75:
+            INST_NAME("PCMPEQW Gm,Em");
+            nextop = F8;
+            GETGM();
+            GETEM(x2, 0);
+            MMX_LOOP_W(x3, x4, SUB(x3, x3, x4); SEQZ(x3, x3); NEG(x3, x3));
+            break;
         case 0x77:
             INST_NAME("EMMS");
             // empty MMX, FPU now usable
@@ -525,7 +821,14 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             /*emu->top = 0;
             emu->fpu_stack = 0;*/ //TODO: Check if something is needed here?
             break;
-
+        case 0x7F: // copy the 64-bit Gm operand to Em
+            INST_NAME("MOVQ Em, Gm");
+            nextop = F8;
+            GETGM();
+            GETEM(x2, 0);
+            LD(x3, gback, gdoffset+0); // read the whole 8-byte source in one load
+            SD(x3, wback, fixedaddress); // NOTE(review): no SMWRITE() follows this store, unlike the BTR memory path in this file - confirm whether it is needed when Em is memory
+            break;
         #define GO(GETFLAGS, NO, YES, F)   \
             READFLAGS(F);                                               \
             i32_ = F32S;                                                \
@@ -570,9 +873,10 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     eb1 = xRAX+(ed&3);                  \
                 }                                       \
                 if (eb2) {                              \
-                    LUI(x1, 0xffffffffffff0);           \
+                    LUI(x1, 0xffff0);                   \
                     ORI(x1, x1, 0xff);                  \
                     AND(eb1, eb1, x1);                  \
+                    SLLI(x3, x3, 8);                    \
                 } else {                                \
                     ANDI(eb1, eb1, 0xf00);              \
                 }                                       \
@@ -585,7 +889,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
 
         GOCOND(0x90, "SET", "Eb");
         #undef GO
-            
+
         case 0xA2:
             INST_NAME("CPUID");
             NOTEST(x1);
@@ -787,7 +1091,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 wback = 0;
             } else {
                 SMREAD();
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
                 SRAI(x1, gd, 5+rex.w);
                 SLLI(x1, x1, 2+rex.w);
                 ADD(x3, wback, x1);
@@ -804,10 +1108,10 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             ANDI(x4, x4, 1); // F_CF is 1
             ANDI(xFlags, xFlags, ~1);
             OR(xFlags, xFlags, x4);
-            ADDI(x3, xZR, 1);
-            SLL(x3, x3, x2);
-            NOT(x3, x3);
-            AND(ed, ed, x3);
+            ADDI(x5, xZR, 1);
+            SLL(x5, x5, x2);
+            NOT(x5, x5);
+            AND(ed, ed, x5);
             if(wback) {
                 SDxw(ed, wback, fixedaddress);
                 SMWRITE();
@@ -844,8 +1148,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETGD;
             if(MODREG) {
                 ed = xRAX+(nextop&7)+(rex.b<<3);
-                SLLI(gd, ed, 48);
-                SRLI(gd, gd, 48);
+                ZEXTH(gd, ed);
             } else {
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
@@ -998,14 +1301,18 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             ORI(xFlags, xFlags, 1<<F_ZF);
             B_NEXT_nocond;
             MARK;
-            NEG(x2, ed);
-            AND(x2, x2, ed);
-            TABLE64(x3, 0x03f79d71b4ca8b09ULL);
-            MUL(x2, x2, x3);
-            SRLI(x2, x2, 64-6);
-            TABLE64(x1, (uintptr_t)&deBruijn64tab);
-            ADD(x1, x1, x2);
-            LBU(gd, x1, 0);
+            if(rv64_zbb) {
+                CTZxw(gd, ed);
+            } else {
+                NEG(x2, ed);
+                AND(x2, x2, ed);
+                TABLE64(x3, 0x03f79d71b4ca8b09ULL);
+                MUL(x2, x2, x3);
+                SRLI(x2, x2, 64-6);
+                TABLE64(x1, (uintptr_t)&deBruijn64tab);
+                ADD(x1, x1, x2);
+                LBU(gd, x1, 0);
+            }
             ANDI(xFlags, xFlags, ~(1<<F_ZF));
             break;
         case 0xBD:
@@ -1024,37 +1331,43 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             B_NEXT_nocond;
             MARK;
             ANDI(xFlags, xFlags, ~(1<<F_ZF));
-            if(ed!=gd)
-                u8 = gd;
-            else
-                u8 = x1;
-            ADDI(u8, xZR, 0);
-            if(rex.w) {
-                MV(x2, ed);
-                SRLI(x3, x2, 32);
+            if(rv64_zbb) {
+                CLZxw(gd, ed);              // read ed before x1 is written: ed may be the x1 scratch (the fallback below only uses x1 when ed==gd) - TODO confirm against GETED
+                MOV32w(x1, rex.w?63:31);    // index of the top bit for the operand width
+                SUB(gd, x1, gd);            // BSR result = (width-1) - leading zero count
+            } else {
+                if(ed!=gd)
+                    u8 = gd;
+                else
+                    u8 = x1;
+                ADDI(u8, xZR, 0);
+                if(rex.w) {
+                    MV(x2, ed);
+                    SRLI(x3, x2, 32);
+                    BEQZ(x3, 4+2*4);
+                    ADDI(u8, u8, 32);
+                    MV(x2, x3);
+                } else {
+                    AND(x2, ed, xMASK);
+                }
+                SRLI(x3, x2, 16);
                 BEQZ(x3, 4+2*4);
-                ADDI(u8, u8, 32);
+                ADDI(u8, u8, 16);
                 MV(x2, x3);
-            } else {
-                AND(x2, ed, xMASK);
+                SRLI(x3, x2, 8);
+                BEQZ(x3, 4+2*4);
+                ADDI(u8, u8, 8);
+                MV(x2, x3);
+                SRLI(x3, x2, 4);
+                BEQZ(x3, 4+2*4);
+                ADDI(u8, u8, 4);
+                MV(x2, x3);
+                ANDI(x2, x2, 0b1111);
+                TABLE64(x3, (uintptr_t)&lead0tab);
+                ADD(x3, x3, x2);
+                LBU(x2, x3, 0);
+                ADD(gd, u8, x2);
             }
-            SRLI(x3, x2, 16);
-            BEQZ(x3, 4+2*4);
-            ADDI(u8, u8, 16);
-            MV(x2, x3);
-            SRLI(x3, x2, 8);
-            BEQZ(x3, 4+2*4);
-            ADDI(u8, u8, 8);
-            MV(x2, x3);
-            SRLI(x3, x2, 4);
-            BEQZ(x3, 4+2*4);
-            ADDI(u8, u8, 4);
-            MV(x2, x3);
-            ANDI(x2, x2, 0b1111); 
-            TABLE64(x3, (uintptr_t)&lead0tab);
-            ADD(x3, x3, x2);
-            LBU(x2, x3, 0);
-            ADD(gd, u8, x2);
             break;
         case 0xBE:
             INST_NAME("MOVSX Gd, Eb");
@@ -1098,13 +1411,13 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0xC2:
             INST_NAME("CMPPS Gx, Ex, Ib");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 1);
             u8 = F8;
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
             for(int i=0; i<4; ++i) {
-                FLW(d0, gback, i*4);
+                FLW(d0, gback, gdoffset+i*4);
                 FLW(d1, wback, fixedaddress+i*4);
                 if ((u8&7) == 0) {                                      // Equal
                     FEQS(x3, d0, d1);
@@ -1135,7 +1448,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     }
                     case 7: break;                                      // Not NaN
                     }
-                    
+
                     // MARK2;
                     if ((u8&7) == 5 || (u8&7) == 6) {
                         MOV32w(x3, 1);
@@ -1143,7 +1456,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     // MARK;
                 }
                 NEG(x3, x3);
-                SW(x3, gback, i*4);
+                SW(x3, gback, gdoffset+i*4);
             }
             break;
         case 0xC3:
@@ -1160,24 +1473,24 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0xC6: // TODO: Optimize this!
             INST_NAME("SHUFPS Gx, Ex, Ib");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 1);
             u8 = F8;
             int32_t idx;
 
             idx = (u8>>(0*2))&3;
-            LWU(x3, gback, idx*4);
+            LWU(x3, gback, gdoffset+idx*4);
             idx = (u8>>(1*2))&3;
-            LWU(x4, gback, idx*4);
+            LWU(x4, gback, gdoffset+idx*4);
             idx = (u8>>(2*2))&3;
             LWU(x5, wback, fixedaddress+idx*4);
             idx = (u8>>(3*2))&3;
             LWU(x6, wback, fixedaddress+idx*4);
 
-            SW(x3, gback, 0*4);
-            SW(x4, gback, 1*4);
-            SW(x5, gback, 2*4);
-            SW(x6, gback, 3*4);
+            SW(x3, gback, gdoffset+0*4);
+            SW(x4, gback, gdoffset+1*4);
+            SW(x5, gback, gdoffset+2*4);
+            SW(x6, gback, gdoffset+3*4);
             break;
 
         case 0xC8:
@@ -1190,90 +1503,111 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0xCF:                  /* BSWAP reg */
             INST_NAME("BSWAP Reg");
             gd = xRAX+(opcode&7)+(rex.b<<3);
-            #if 1
-            ANDI(x1, gd, 0xff);
-            SLLI(x1, x1, (rex.w?64:32)-8);
-            SRLI(x2, gd, 8);
-            ANDI(x3, x2, 0xff);
-            SLLI(x3, x3, (rex.w?64:32)-16);
-            OR(x1, x1, x3);
-            SRLI(x2, gd, 16);
-            ANDI(x3, x2, 0xff);
-            SLLI(x3, x3, (rex.w?64:32)-24);
-            OR(x1, x1, x3);
-            SRLI(x2, gd, 24);
-            if(rex.w) {
-                ANDI(x3, x2, 0xff);
-                SLLI(x3, x3, (rex.w?64:32)-32);
-                OR(x1, x1, x3);
-                SRLI(x2, gd, 32);
-                ANDI(x3, x2, 0xff);
-                SLLI(x3, x3, 64-40);
-                OR(x1, x1, x3);
-                SRLI(x2, gd, 40);
+            if(rv64_zbb) { // Zbb available: a single byte-reverse instruction does the work
+                REV8(gd, gd); // reverses all 8 bytes of the register
+                if(!rex.w)
+                    SRLI(gd, gd, 32); // 32-bit form: the swapped dword ended up in the upper half, move it down (upper half becomes zero)
+            } else {
+                gback = gd; // remember the destination register, gd may be redirected to a scratch just below
+                if (!rex.w) {
+                    AND(x4, gd, xMASK); // work on a masked copy so the byte extraction sees no upper bits (xMASK presumably holds 0xffffffff - TODO confirm)
+                    gd = x4;
+                }
+                ANDI(x1, gd, 0xff);
+                SLLI(x1, x1, (rex.w?64:32)-8);
+                SRLI(x2, gd, 8);
                 ANDI(x3, x2, 0xff);
-                SLLI(x3, x3, 64-48);
+                SLLI(x3, x3, (rex.w?64:32)-16);
                 OR(x1, x1, x3);
-                SRLI(x2, gd, 48);
+                SRLI(x2, gd, 16);
                 ANDI(x3, x2, 0xff);
-                SLLI(x3, x3, 64-56);
+                SLLI(x3, x3, (rex.w?64:32)-24);
                 OR(x1, x1, x3);
-                SRLI(x2, gd, 56);
+                SRLI(x2, gd, 24);
+                if(rex.w) {
+                    ANDI(x3, x2, 0xff);
+                    SLLI(x3, x3, 64-32);
+                    OR(x1, x1, x3);
+                    SRLI(x2, gd, 32);
+                    ANDI(x3, x2, 0xff);
+                    SLLI(x3, x3, 64-40);
+                    OR(x1, x1, x3);
+                    SRLI(x2, gd, 40);
+                    ANDI(x3, x2, 0xff);
+                    SLLI(x3, x3, 64-48);
+                    OR(x1, x1, x3);
+                    SRLI(x2, gd, 48);
+                    ANDI(x3, x2, 0xff);
+                    SLLI(x3, x3, 64-56);
+                    OR(x1, x1, x3);
+                    SRLI(x2, gd, 56);
+                }
+                OR(gback, x1, x2);
             }
-            OR(gd, x1, x2);
-            #else
-            MOV_U12(x1, 0xff);
-            SLLI(x4, x1, 8); // mask 0xff00
-            if (rex.w) {
-                SLLI(x5, x1, 16); // mask 0xff0000
-                SLLI(x6, x1, 24); // mask 0xff000000
-
-                SRLI(x2, gd, 56);
-
-                SRLI(x3, gd, 40);
-                AND(x3, x3, x4);
-                OR(x2, x2, x3);
-
-                SRLI(x3, gd, 24);
-                AND(x3, x3, x5);
-                OR(x2, x2, x3);
-
-                SRLI(x3, gd, 8);
-                AND(x3, x3, x6);
-                OR(x2, x2, x3);
-
-                AND(x3, gd, x6);
-                SLLI(x3, x3, 8);
-                OR(x2, x2, x3);
-
-                AND(x3, gd, x5);
-                SLLI(x3, x3, 24);
-                OR(x2, x2, x3);
-
-                AND(x3, gd, x4);
-                SLLI(x3, x3, 40);
-                OR(x2, x2, x3);
-
-                SLLI(x3, x3, 56);
-                OR(gd, x2, x3);
+            break;
+        case 0xE5: // signed 16x16 multiply per word, keeping the high 16 bits of each 32-bit product
+            INST_NAME("PMULHW Gm,Em");
+            nextop = F8;
+            GETGM();
+            GETEM(x2, 0);
+            for(int i=0; i<4; ++i) {
+                LH(x3, gback, gdoffset+2*i); // sign-extending loads: both operands are treated as signed words
+                LH(x4, wback, fixedaddress+2*i);
+                MULW(x3, x3, x4); // 32-bit product
+                SRAIW(x3, x3, 16); // bring the high half of the product down to bits 0..15
+                SH(x3, gback, gdoffset+2*i);
+            }
+            break;
+        case 0xED: // per-word signed add with saturation to [-32768, 32767]
+            INST_NAME("PADDSW Gm,Em");
+            nextop = F8;
+            GETGM();
+            GETEM(x2, 0);
+            for(int i=0; i<4; ++i) {
+                // tmp32s = (int32_t)GX->sw[i] + EX->sw[i];
+                // GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s);
+                // The branch offsets below are in bytes and assume every macro emits exactly one 4-byte instruction - TODO confirm
+                LH(x3, gback, gdoffset+2*i);
+                LH(x4, wback, fixedaddress+2*i);
+                ADDW(x3, x3, x4); // both inputs are sign-extended words, so the 32-bit sum cannot overflow
+                LUI(x4, 0xFFFF8); // -32768
+                BGE(x3, x4, 12); // sum >= -32768: skip the low clamp (jumps over SH and J to the second LUI)
+                SH(x4, gback, gdoffset+2*i); // underflow: store -32768
+                J(20); // continue: jumps past the final SH of this iteration
+                LUI(x4, 8); // 32768
+                BLT(x3, x4, 8); // sum < 32768: keep it (jumps over the ADDIW to the final SH)
+                ADDIW(x3, x4, -1); // overflow: replace the sum with 32767
+                SH(x3, gback, gdoffset+2*i);
+            }
+            break;
+        case 0xEF:
+            INST_NAME("PXOR Gm,Em");
+            nextop = F8;
+            GETGM();
+            if(MODREG && gd==(nextop&7)) {
+                // just zero dest
+                SD(xZR, gback, gdoffset+0);
             } else {
-                SRLIW(x2, gd, 24);
-
-                SRLIW(x3, gd, 8);
-                AND(x3, x3, x4);
-                OR(x2, x2, x3);
-
-                AND(x3, gd, x4);
-                SLLI(x3, x3, 8);
-                OR(x2, x2, x3);
-
-                AND(x3, gd, x1);
-                SLLI(x3, x3, 24);
-                OR(gd, x2, x3);
+                GETEM(x2, 0);
+                LD(x3, gback, gdoffset+0);
+                LD(x4, wback, fixedaddress);
+                XOR(x3, x3, x4);
+                SD(x3, gback, gdoffset+0);
             }
-            #endif
             break;
-
+        case 0xF9:
+            INST_NAME("PSUBW Gm, Em");
+            nextop = F8;
+            GETGM();
+            GETEM(x2, 0);
+            MMX_LOOP_W(x3, x4, SUBW(x3, x3, x4));
+            break;
+        case 0xFD:
+            INST_NAME("PADDW Gm, Em");
+            nextop = F8;
+            GETGM();
+            GETEM(x2, 0);
+            MMX_LOOP_W(x3, x4, ADDW(x3, x3, x4));
+            break;
         default:
             DEFAULT;
     }
diff --git a/src/dynarec/rv64/dynarec_rv64_64.c b/src/dynarec/rv64/dynarec_rv64_64.c
index 455a8d72..bc3b2c96 100644
--- a/src/dynarec/rv64/dynarec_rv64_64.c
+++ b/src/dynarec/rv64/dynarec_rv64_64.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -23,8 +22,6 @@
 #include "dynarec_rv64_helper.h"
 #include "dynarec_rv64_functions.h"
 
-#define GETG        gd = ((nextop&0x38)>>3)+(rex.r<<3)
-
 uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int seg, int* ok, int* need_epilog)
 {
     (void)ip; (void)rep; (void)need_epilog;
@@ -33,12 +30,13 @@ uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
     uint8_t nextop;
     uint8_t u8;
     uint8_t gd, ed, eb1, eb2, gb1, gb2;
-    uint8_t wback, wb1, wb2, wb;
+    uint8_t gback, wback, wb1, wb2, wb;
     int64_t i64, j64;
+    uint64_t u64;
     int v0, v1;
     int q0;
     int d0;
-    int64_t fixedaddress;
+    int64_t fixedaddress, gdoffset;
     int unscaled;
     MAYUSE(eb1);
     MAYUSE(eb2);
@@ -56,14 +54,85 @@ uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         rep = opcode-0xF1;
         opcode = F8;
     }
-    // REX prefix before the F0 are ignored
-    rex.rex = 0;
-    while(opcode>=0x40 && opcode<=0x4f) {
-        rex.rex = opcode;
-        opcode = F8;
-    }
+
+    GETREX();
 
     switch(opcode) {
+        case 0x03:
+            INST_NAME("ADD Gd, Seg:Ed");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            grab_segdata(dyn, addr, ninst, x4, seg);
+            nextop = F8;
+            GETGD;
+            GETEDO(x4, 0, x5);
+            emit_add32(dyn, ninst, rex, gd, ed, x3, x4, x5);
+            break;
+        case 0x0F:
+            opcode = F8;
+            switch(opcode) {
+                case 0x11:
+                    switch(rep) {
+                        case 0:
+                            INST_NAME("MOVUPS Ex,Gx");
+                            nextop = F8;
+                            GETGX();
+                            GETEX(x2, 0);
+                            if(!MODREG) {
+                                grab_segdata(dyn, addr, ninst, x4, seg);
+                                ADD(x4, x4, wback);
+                                wback = x4;
+                            }
+                            LD(x3, gback, gdoffset+0);
+                            LD(x5, gback, gdoffset+8);
+                            SD(x3, wback, fixedaddress+0);
+                            SD(x5, wback, fixedaddress+8);
+                            if(!MODREG)
+                                SMWRITE2();
+                            break;
+                        case 1:
+                            INST_NAME("MOVSD Ex, Gx");
+                            nextop = F8;
+                            GETG;
+                            v0 = sse_get_reg(dyn, ninst, x1, gd, 0);
+                            if(MODREG) {
+                                ed = (nextop&7)+ (rex.b<<3);
+                                d0 = sse_get_reg(dyn, ninst, x1, ed, 0);
+                                FMVD(d0, v0);
+                            } else {
+                                grab_segdata(dyn, addr, ninst, x4, seg);
+                                addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, 0);
+                                ADD(x4, x4, ed);
+                                ed = x4;
+                                FSD(v0, ed, fixedaddress);
+                                SMWRITE2();
+                            }
+                            break;
+                        case 2:
+                            INST_NAME("MOVSS Ex, Gx");
+                            nextop = F8;
+                            GETG;
+                            v0 = sse_get_reg(dyn, ninst, x1, gd, 1);
+                            if(MODREG) {
+                                q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 1);
+                                FMVS(q0, v0);
+                            } else {
+                                grab_segdata(dyn, addr, ninst, x4, seg);
+                                addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, 0);
+                                ADD(x4, x4, ed);
+                                ed = x4;
+                                FSW(v0, ed, fixedaddress);
+                                SMWRITE2();
+                            }
+                            break;
+                        default:
+                            DEFAULT;
+                    }
+                    break;
+
+                default:
+                    DEFAULT;
+            }
+            break;
         case 0x2B:
             INST_NAME("SUB Gd, Seg:Ed");
             SETFLAGS(X_ALL, SF_SET_PENDING);
@@ -84,6 +153,174 @@ uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             emit_xor32(dyn, ninst, rex, gd, ed, x3, x4);
             break;
 
+        case 0x66:
+            addr = dynarec64_6664(dyn, addr, ip, ninst, rex, seg, ok, need_epilog);
+            break;
+        case 0x80:
+            nextop = F8;
+            switch((nextop>>3)&7) {
+                case 0: // ADD
+                    INST_NAME("ADD Eb, Ib");
+                    grab_segdata(dyn, addr, ninst, x1, seg);
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEBO(x1, 1);
+                    u8 = F8;
+                    emit_add8c(dyn, ninst, x1, u8, x2, x4, x5);
+                    EBBACK(x5, 0);
+                    break;
+                case 1: // OR
+                    INST_NAME("OR Eb, Ib");
+                    grab_segdata(dyn, addr, ninst, x1, seg);
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEBO(x1, 1);
+                    u8 = F8;
+                    emit_or8c(dyn, ninst, x1, u8, x2, x4, x5);
+                    EBBACK(x5, 0);
+                    break;
+                case 2: // ADC
+                    INST_NAME("ADC Eb, Ib");
+                    grab_segdata(dyn, addr, ninst, x1, seg);
+                    READFLAGS(X_CF);
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEBO(x1, 1);
+                    u8 = F8;
+                    emit_adc8c(dyn, ninst, x1, u8, x2, x4, x5, x6);
+                    EBBACK(x5, 0);
+                    break;
+                case 3: // SBB
+                    INST_NAME("SBB Eb, Ib");
+                    grab_segdata(dyn, addr, ninst, x1, seg);
+                    READFLAGS(X_CF);
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEBO(x1, 1);
+                    u8 = F8;
+                    emit_sbb8c(dyn, ninst, x1, u8, x2, x4, x5, x6);
+                    EBBACK(x5, 0);
+                    break;
+                case 4: // AND
+                    INST_NAME("AND Eb, Ib");
+                    grab_segdata(dyn, addr, ninst, x1, seg);
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEBO(x1, 1);
+                    u8 = F8;
+                    emit_and8c(dyn, ninst, x1, u8, x2, x4);
+                    EBBACK(x5, 0);
+                    break;
+                case 5: // SUB
+                    INST_NAME("SUB Eb, Ib");
+                    grab_segdata(dyn, addr, ninst, x1, seg);
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEBO(x1, 1);
+                    u8 = F8;
+                    emit_sub8c(dyn, ninst, x1, u8, x2, x4, x5, x6);
+                    EBBACK(x5, 0);
+                    break;
+                case 6: // XOR
+                    INST_NAME("XOR Eb, Ib");
+                    grab_segdata(dyn, addr, ninst, x1, seg);
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEBO(x1, 1);
+                    u8 = F8;
+                    emit_xor8c(dyn, ninst, x1, u8, x2, x4);
+                    EBBACK(x5, 0);
+                    break;
+                case 7: // CMP
+                    INST_NAME("CMP Eb, Ib");
+                    grab_segdata(dyn, addr, ninst, x1, seg);
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEBO(x1, 1);
+                    u8 = F8;
+                    if(u8) {
+                        MOV32w(x2, u8);
+                        emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5, x6);
+                    } else {
+                        emit_cmp8_0(dyn, ninst, x1, x3, x4);
+                    }
+                    break;
+                default:
+                    DEFAULT;
+            }
+            break;
+        case 0x81:
+        case 0x83:
+            nextop = F8;
+            grab_segdata(dyn, addr, ninst, x6, seg);
+            switch((nextop>>3)&7) {
+                case 0: // ADD
+                    if(opcode==0x81) {INST_NAME("ADD Ed, Id");} else {INST_NAME("ADD Ed, Ib");}
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEDO(x6, (opcode==0x81)?4:1, x5);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    emit_add32c(dyn, ninst, rex, ed, i64, x3, x4, x5, x9);
+                    WBACKO(x6);
+                    break;
+                case 1: // OR
+                    if(opcode==0x81) {INST_NAME("OR Ed, Id");} else {INST_NAME("OR Ed, Ib");}
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEDO(x6, (opcode==0x81)?4:1, x5);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    emit_or32c(dyn, ninst, rex, ed, i64, x3, x4);
+                    WBACKO(x6);
+                    break;
+                case 2: // ADC: add the immediate plus the carry flag to Seg:Ed
+                    if(opcode==0x81) {INST_NAME("ADC Ed, Id");} else {INST_NAME("ADC Ed, Ib");}
+                    READFLAGS(X_CF); // the incoming carry is an input of the operation
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEDO(x6, (opcode==0x81)?4:1, x5); // 4 or 1 matches the size of the immediate read on the next line
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S; // 0x81 has a 32-bit immediate, 0x83 an 8-bit one (presumably both sign-extended, going by the S suffix)
+                    MOV64xw(x5, i64);
+                    SD(x6, xEmu, offsetof(x64emu_t, scratch)); // x6 holds the segment base from grab_segdata but is also passed to emit_adc32 (presumably as a scratch), so park it in emu->scratch
+                    emit_adc32(dyn, ninst, rex, ed, x5, x3, x4, x6, x9);
+                    LD(x6, xEmu, offsetof(x64emu_t, scratch)); // restore the segment base for the write-back
+                    WBACKO(x6);
+                    break;
+                case 3: // SBB
+                    if(opcode==0x81) {INST_NAME("SBB Ed, Id");} else {INST_NAME("SBB Ed, Ib");}
+                    READFLAGS(X_CF);
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEDO(x6, (opcode==0x81)?4:1, x5);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    MOV64xw(x5, i64);
+                    emit_sbb32(dyn, ninst, rex, ed, x5, x3, x4, x9);
+                    WBACKO(x6);
+                    break;
+                case 4: // AND
+                    if(opcode==0x81) {INST_NAME("AND Ed, Id");} else {INST_NAME("AND Ed, Ib");}
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEDO(x6, (opcode==0x81)?4:1, x5);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    emit_and32c(dyn, ninst, rex, ed, i64, x3, x4);
+                    WBACKO(x6);
+                    break;
+                case 5: // SUB
+                    if(opcode==0x81) {INST_NAME("SUB Ed, Id");} else {INST_NAME("SUB Ed, Ib");}
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEDO(x6, (opcode==0x81)?4:1, x5);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    emit_sub32c(dyn, ninst, rex, ed, i64, x3, x4, x5, x9);
+                    WBACKO(x6);
+                    break;
+                case 6: // XOR
+                    if(opcode==0x81) {INST_NAME("XOR Ed, Id");} else {INST_NAME("XOR Ed, Ib");}
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEDO(x6, (opcode==0x81)?4:1, x5);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    emit_xor32c(dyn, ninst, rex, ed, i64, x3, x4);
+                    WBACKO(x6);
+                    break;
+                case 7: // CMP
+                    if(opcode==0x81) {INST_NAME("CMP Ed, Id");} else {INST_NAME("CMP Ed, Ib");}
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETEDO(x6, (opcode==0x81)?4:1, x5);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    if(i64) {
+                        MOV64xw(x2, i64);
+                        emit_cmp32(dyn, ninst, rex, ed, x2, x3, x4, x5, x6);
+                    } else
+                        emit_cmp32_0(dyn, ninst, rex, ed, x3, x4);
+                    break;
+            }
+            break;
         case 0x88:
             INST_NAME("MOV Seg:Eb, Gb");
             grab_segdata(dyn, addr, ninst, x4, seg);
@@ -156,6 +393,81 @@ uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 LDxw(gd, x4, fixedaddress);
             }
             break;
+
+        case 0xA1: // load EAX/RAX from an absolute offset inside the segment selected by seg
+            INST_NAME("MOV EAX,Seg:Od");
+            grab_segdata(dyn, addr, ninst, x4, seg); // x4 = base address of the segment
+            if(rex.is32bits)
+                u64 = F32; // 32-bit code: the offset is encoded on 4 bytes
+            else
+                u64 = F64; // 64-bit code: the offset is encoded on 8 bytes
+            // TODO: could be optimized.
+            MOV64z(x1, u64);
+            ADD(x1, x1, x4); // effective address = segment base + offset
+            LDxw(xRAX, x1, 0);
+            break;
+
+        case 0xA3: // store EAX/RAX to an absolute offset inside the segment selected by seg
+            INST_NAME("MOV Seg:Od,EAX");
+            grab_segdata(dyn, addr, ninst, x4, seg); // x4 = base address of the segment
+            if(rex.is32bits)
+                u64 = F32; // 32-bit code: the offset is encoded on 4 bytes
+            else
+                u64 = F64; // 64-bit code: the offset is encoded on 8 bytes
+            // TODO: could be optimized.
+            MOV64z(x1, u64);
+            ADD(x1, x1, x4); // effective address = segment base + offset
+            SDxw(xRAX, x1, 0);
+            SMWRITE2();
+            break;
+
+        case 0xC6: // store an immediate byte into a byte register or into Seg:[mem]
+            INST_NAME("MOV Seg:Eb, Ib");
+            grab_segdata(dyn, addr, ninst, x4, seg); // x4 = segment base; only the memory form below uses it
+            nextop=F8;
+            if(MODREG) {   // reg <= u8
+                u8 = F8;
+                if(!rex.rex) {
+                    ed = (nextop&7);
+                    eb1 = xRAX+(ed&3);  // Ax, Cx, Dx or Bx
+                    eb2 = (ed&4)>>2;    // L or H
+                } else {
+                    eb1 = xRAX+(nextop&7)+(rex.b<<3); // with a REX prefix every encoding names the low byte of a register
+                    eb2 = 0;
+                }
+
+                if (eb2) {
+                    // load a mask to x3 (ffffffffffff00ff)
+                    LUI(x3, 0xffff0);
+                    ORI(x3, x3, 0xff);
+                    // apply mask
+                    AND(eb1, eb1, x3);
+                    if(u8) { // a zero immediate needs nothing more: the AND already cleared the byte
+                        if((u8<<8)<2048) { // the shifted value still fits the positive range of a 12-bit signed immediate
+                            ADDI(x4, xZR, u8<<8);
+                        } else {
+                            ADDI(x4, xZR, u8); // otherwise build it in two steps: load the byte, then move it to bits 8..15
+                            SLLI(x4, x4, 8);
+                        }
+                        OR(eb1, eb1, x4);
+                    }
+                } else {
+                    ANDI(eb1, eb1, 0xf00);  // mask ffffffffffffff00
+                    ORI(eb1, eb1, u8);
+                }
+            } else {                    // mem <= u8
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 1);
+                u8 = F8;
+                if(u8) {
+                    ADDI(x3, xZR, u8);
+                    ed = x3;
+                } else
+                    ed = xZR; // storing zero: use the zero register directly, no load needed
+                ADD(x4, wback, x4); // effective address = decoded address + segment base
+                SB(ed, x4, fixedaddress);
+                SMWRITE2();
+            }
+            break;
         case 0xC7:
             INST_NAME("MOV Seg:Ed, Id");
             grab_segdata(dyn, addr, ninst, x4, seg);
@@ -165,11 +477,15 @@ uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 ed = xRAX+(nextop&7)+(rex.b<<3);
                 MOV64xw(ed, i64);
             } else {                    // mem <= i32
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 4);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 4);
                 i64 = F32S;
-                MOV64xw(x3, i64);
-                ADD(x4, ed, x4);
-                SDxw(x3, x4, fixedaddress);
+                if(i64) {
+                    MOV64xw(x3, i64);
+                    ed = x3;
+                } else
+                    ed = xZR;
+                ADD(x4, wback, x4);
+                SDxw(ed, x4, fixedaddress);
                 SMWRITE2();
             }
             break;
diff --git a/src/dynarec/rv64/dynarec_rv64_66.c b/src/dynarec/rv64/dynarec_rv64_66.c
index 7bc996e9..49a7ef65 100644
--- a/src/dynarec/rv64/dynarec_rv64_66.c
+++ b/src/dynarec/rv64/dynarec_rv64_66.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -50,14 +49,10 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         rep = opcode-0xF1;
         opcode = F8;
     }
-    // REX prefix before the 66 are ignored
-    rex.rex = 0;
-    while(opcode>=0x40 && opcode<=0x4f) {
-        rex.rex = opcode;
-        opcode = F8;
-    }
 
-    if(rex.w && opcode!=0x0f)   // rex.w cancels "66", but not for 66 0f type of prefix
+    GETREX();
+
+    if(rex.w && !(opcode==0x0f || opcode==0xf0 || opcode==0x64 || opcode==0x65))   // rex.w cancels "66", except before 0f, f0, 64 or 65, which dispatch to dedicated 66-prefixed handlers below
         return dynarec64_00(dyn, addr-1, ip, ninst, rex, rep, ok, need_epilog); // addr-1, to "put back" opcode
 
     switch(opcode) {
@@ -83,8 +78,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("ADD AX, Iw");
             SETFLAGS(X_ALL, SF_SET_PENDING);
             i32 = F16;
-            SLLI(x1 , xRAX, 48);
-            SRLI(x1, x1, 48);
+            ZEXTH(x1 , xRAX);
             MOV32w(x2, i32);
             emit_add16(dyn, ninst, x1, x2, x3, x4, x6);
             LUI(x3, 0xffff0);
@@ -113,8 +107,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("OR AX, Iw");
             SETFLAGS(X_ALL, SF_SET_PENDING);
             i32 = F16;
-            SLLI(x1, xRAX, 48);
-            SRLI(x1, x1, 48);
+            ZEXTH(x1, xRAX);
             MOV32w(x2, i32);
             emit_or16(dyn, ninst, x1, x2, x3, x4);
             LUI(x3, 0xffff0);
@@ -124,6 +117,16 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0x0F:
             addr = dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog);
             break;
+        case 0x19:
+            INST_NAME("SBB Ew, Gw");
+            READFLAGS(X_CF);
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGW(x2);
+            GETEW(x1, 0);
+            emit_sbb16(dyn, ninst, x1, x2, x4, x5, x6);
+            EWBACK;
+            break;
         case 0x1B:
             INST_NAME("SBB Gw, Ew");
             READFLAGS(X_CF);
@@ -156,8 +159,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("AND AX, Iw");
             SETFLAGS(X_ALL, SF_SET_PENDING);
             i32 = F16;
-            SLLI(x1, xRAX, 48);
-            SRLI(x1, x1, 48);
+            ZEXTH(x1, xRAX);
             MOV32w(x2, i32);
             emit_and16(dyn, ninst, x1, x2, x3, x4);
             LUI(x3, 0xffff0);
@@ -186,8 +188,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("SUB AX, Iw");
             SETFLAGS(X_ALL, SF_SET_PENDING);
             i32 = F16;
-            SLLI(x1, xRAX, 48);
-            SRLI(x1, x1, 48);
+            ZEXTH(x1, xRAX);
             MOV32w(x2, i32);
             emit_sub16(dyn, ninst, x1, x2, x3, x4, x5);
             LUI(x2, 0xffff0);
@@ -216,8 +217,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("XOR AX, Iw");
             SETFLAGS(X_ALL, SF_SET_PENDING);
             i32 = F16;
-            SLLI(x1, xRAX, 48);
-            SRLI(x1, x1, 48);
+            ZEXTH(x1, xRAX);
             MOV32w(x2, i32);
             emit_xor16(dyn, ninst, x1, x2, x3, x4, x5);
             LUI(x5, 0xffff0);
@@ -244,8 +244,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("CMP AX, Iw");
             SETFLAGS(X_ALL, SF_SET_PENDING);
             i32 = F16;
-            SLLI(x1, xRAX, 48);
-            SRLI(x1, x1, 48);
+            ZEXTH(x1, xRAX);
             if(i32) {
                 MOV32w(x2, i32);
                 emit_cmp16(dyn, ninst, x1, x2, x3, x4, x5, x6);
@@ -253,6 +252,51 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 emit_cmp16_0(dyn, ninst, x1, x3, x4);
             }
             break;
+        case 0x40:
+        case 0x41:
+        case 0x42:
+        case 0x43:
+        case 0x44:
+        case 0x45:
+        case 0x46:
+        case 0x47:
+            INST_NAME("INC Reg16 (32bits)");
+            SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING);
+            gd = xRAX + (opcode&7);
+            ZEXTH(x1, gd);
+            emit_inc16(dyn, ninst, x1, x2, x3, x4);
+            LUI(x3, 0xffff0);
+            AND(gd, gd, x3);
+            OR(gd, gd, x1);
+            ZEROUP(gd);
+            break;
+        case 0x48:
+        case 0x49:
+        case 0x4A:
+        case 0x4B:
+        case 0x4C:
+        case 0x4D:
+        case 0x4E:
+        case 0x4F:
+            INST_NAME("DEC Reg16 (32bits)");
+            SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING);
+            gd = xRAX + (opcode&7);
+            ZEXTH(x1, gd);
+            emit_dec16(dyn, ninst, x1, x2, x3, x4, x5);
+            LUI(x3, 0xffff0);
+            AND(gd, gd, x3);
+            OR(gd, gd, x1);
+            ZEROUP(gd);
+            break;
+        case 0x64:
+            addr = dynarec64_6664(dyn, addr, ip, ninst, rex, _FS, ok, need_epilog);
+            break;
+        case 0x65:
+            addr = dynarec64_6664(dyn, addr, ip, ninst, rex, _GS, ok, need_epilog);
+            break;
+        case 0x66:
+            addr = dynarec64_66(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
         case 0x69:
         case 0x6B:
             if(opcode==0x69) {
@@ -267,8 +311,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             if(opcode==0x69) i32 = F16S; else i32 = F8S;
             MOV32w(x2, i32);
             MULW(x2, x2, x1);
-            SLLI(x2, x2, 48);
-            SRLI(x2, x2, 48);
+            ZEXTH(x2, x2);
             UFLAG_RES(x2);
             gd=x2;
             GWBACK;
@@ -394,8 +437,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     // we don't use GETGW above, so we need let gd & 0xffff.
                     LUI(x1, 0xffff0);
                     AND(ed, ed, x1);
-                    SLLI(x2, gd, 48);
-                    SRLI(x2, x2, 48);
+                    ZEXTH(x2, gd);
                     OR(ed, ed, x2);
                 }
             } else {
@@ -413,8 +455,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 if(ed!=gd) {
                     LUI(x1, 0xffff0);
                     AND(gd, gd, x1);
-                    SLLI(x2, ed, 48);
-                    SRLI(x2, x2, 48);
+                    ZEXTH(x2, ed);
                     OR(gd, gd, x2);
                 }
             } else {
@@ -443,13 +484,11 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     // x2 <- rax
                     MV(x2, xRAX);
                     // rax[15:0] <- gd[15:0]
-                    SLLI(x3, gd, 48);
-                    SRLI(x3, x3, 48);
+                    ZEXTH(x3, gd);
                     AND(xRAX, xRAX, x4);
                     OR(xRAX, xRAX, x3);
                     // gd[15:0] <- x2[15:0]
-                    SLLI(x2, x2, 48);
-                    SRLI(x2, x2, 48);
+                    ZEXTH(x2, x2);
                     AND(gd, gd, x4);
                     OR(gd, gd, x2);
                 }
@@ -530,6 +569,54 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 ADD(xRDI, xRDI, x3);
             }
             break;
+
+        case 0xAF:
+            switch (rep) {
+                case 1:
+                case 2:
+                    if(rep==1) {INST_NAME("REPNZ SCASW");} else {INST_NAME("REPZ SCASW");}
+                    MAYSETFLAGS();
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    CBZ_NEXT(xRCX);
+                    GETDIR(x3, x1, rex.w?8:2);
+                    if (rex.w) {
+                        MARK;
+                        LD(x2, xRDI, 0);
+                        ADD(xRDI, xRDI, x3);
+                        ADDI(xRCX, xRCX, -1);
+                        if (rep==1) {BEQ_MARK3(xRAX, x2);} else {BNE_MARK3(xRAX, x2);}
+                        BNE_MARK(xRCX, xZR);
+                        MARK3;
+                        emit_cmp32(dyn, ninst, rex, xRAX, x2, x3, x4, x5, x6);
+                    } else {
+                        ZEXTH(x1, xRAX);
+                        MARK;
+                        LHU(x2, xRDI, 0);
+                        ADD(xRDI, xRDI, x3);
+                        ADDI(xRCX, xRCX, -1);
+                        if (rep==1) {BEQ_MARK3(x1, x2);} else {BNE_MARK3(x1, x2);}
+                        BNE_MARK(xRCX, xZR);
+                        MARK3;
+                        emit_cmp16(dyn, ninst, x1, x2, x3, x4, x5, x6);
+                    }
+                    break;
+                default:
+                    INST_NAME("SCASW");
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETDIR(x3, x1, rex.w?8:2);
+                    if (rex.w) {
+                        LD(x2, xRDI, 0);
+                        emit_cmp32(dyn, ninst, rex, xRAX, x2, x3, x4, x5, x6);
+                    } else {
+                        ZEXTH(x1, xRAX);
+                        LHU(x2, xRDI, 0);
+                        emit_cmp16(dyn, ninst, x1, x2, x3, x4, x5, x6);
+                    }
+                    ADD(xRDI, xRDI, x3);
+                    break;
+            }
+            break;
+
         case 0xB8:
         case 0xB9:
         case 0xBA:
@@ -555,7 +642,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 }
             }
             break;
-            
+
         case 0xC1:
             nextop = F8;
             switch((nextop>>3)&7) {
@@ -643,8 +730,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     UFLAG_OP12(ed, x2)
                     SRAI(ed, ed, u8&0x1f);
                     if(MODREG) {
-                        SLLI(ed, ed, 48);
-                        SRLI(ed, ed, 48);
+                        ZEXTH(ed, ed);
                     }
                     EWBACK;
                     UFLAG_RES(ed);
@@ -652,7 +738,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     break;
             }
             break;
-            
+
         case 0xC7:
             INST_NAME("MOV Ew, Iw");
             nextop = F8;
@@ -693,6 +779,25 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     UFLAG_RES(ed);
                     UFLAG_DF(x3, d_shr16);
                     break;
+                case 4:
+                case 6:
+                    if(opcode==0xD1) {
+                        INST_NAME("SHL Ew, 1");
+                        MOV32w(x4, 1);
+                    } else {
+                        INST_NAME("SHL Ew, CL");
+                        ANDI(x4, xRCX, 0x1f);
+                    }
+                    UFLAG_IF {MESSAGE(LOG_DUMP, "Need Optimization for flags\n");}
+                    SETFLAGS(X_ALL, SF_PENDING);
+                    GETEW(x1, 0);
+                    UFLAG_OP12(ed, x4)
+                    SLL(ed, ed, x4);
+                    ZEXTH(ed, ed);
+                    EWBACK;
+                    UFLAG_RES(ed);
+                    UFLAG_DF(x3, d_shl16);
+                    break;
                 case 7:
                     if(opcode==0xD1) {
                         INST_NAME("SAR Ew, 1");
@@ -704,10 +809,9 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     UFLAG_IF {MESSAGE(LOG_DUMP, "Need Optimization for flags\n");}
                     SETFLAGS(X_ALL, SF_PENDING);
                     GETSEW(x1, 0);
-                    UFLAG_OP12(ed, x4)
+                    UFLAG_OP12(ed, x4);
                     SRA(ed, ed, x4);
-                    SLLI(ed, ed, 48);
-                    SRLI(ed, ed, 48);
+                    ZEXTH(ed, ed);
                     EWBACK;
                     UFLAG_RES(ed);
                     UFLAG_DF(x3, d_sar16);
@@ -716,6 +820,10 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     DEFAULT;
             }
             break;
+
+        case 0xF0:
+            return dynarec64_66F0(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+
         case 0xF7:
             nextop = F8;
             switch((nextop>>3)&7) {
@@ -745,9 +853,8 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     INST_NAME("DIV Ew");
                     SETFLAGS(X_ALL, SF_SET);
                     GETEW(x1, 0);
-                    SLLI(x2, xRAX, 48);
+                    ZEXTH(x2, xRAX);
                     SLLI(x3, xRDX, 48);
-                    SRLI(x2, x2, 48);
                     SRLI(x3, x3, 32);
                     OR(x2, x2, x3);
                     DIVUW(x3, x2, ed);
@@ -766,9 +873,8 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     NOTEST(x1);
                     SETFLAGS(X_ALL, SF_SET);
                     GETSEW(x1, 0);
-                    SLLI(x2, xRAX, 48);
+                    ZEXTH(x2, xRAX);
                     SLLI(x3, xRDX, 48);
-                    SRLI(x2, x2, 48);
                     SRLI(x3, x3, 32);
                     OR(x2, x2, x3);
                     DIVW(x3, x2, ed);
diff --git a/src/dynarec/rv64/dynarec_rv64_660f.c b/src/dynarec/rv64/dynarec_rv64_660f.c
index 260ea32b..3f51289e 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -27,7 +26,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
     (void)ip; (void)need_epilog;
 
     uint8_t opcode = F8;
-    uint8_t nextop, u8;
+    uint8_t nextop, u8, s8;
     int32_t i32;
     uint8_t gd, ed;
     uint8_t wback, wb1, wb2, gback;
@@ -37,7 +36,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
     int v0, v1;
     int q0, q1;
     int d0, d1;
-    int64_t fixedaddress;
+    int64_t fixedaddress, gdoffset;
     int unscaled;
 
     MAYUSE(d0);
@@ -49,27 +48,27 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
     MAYUSE(j64);
 
     static const int8_t round_round[] = { RD_RNE, RD_RDN, RD_RUP, RD_RTZ };
-    
+
     switch(opcode) {
         case 0x10:
             INST_NAME("MOVUPD Gx,Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_MV_Q(x3);
             break;
         case 0x11:
             INST_NAME("MOVUPD Ex,Gx");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_MV_Q2(x3);
             if(!MODREG) SMWRITE2();
             break;
         case 0x12:
             INST_NAME("MOVLPD Gx, Eq");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             if(MODREG) {
                 // access register instead of memory is bad opcode!
                 DEFAULT;
@@ -78,33 +77,47 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             SMREAD();
             addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
             LD(x3, wback, fixedaddress);
-            SD(x3, gback, 0);
+            SD(x3, gback, gdoffset+0);
+            break;
+        case 0x13:
+            INST_NAME("MOVLPD Eq, Gx");
+            nextop = F8;
+            GETGX();
+            if(MODREG) {
+                // access register instead of memory is bad opcode!
+                DEFAULT;
+                return addr;
+            }
+            addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+            LD(x3, gback, gdoffset+0);
+            SD(x3, wback, fixedaddress);
+            SMWRITE2();
             break;
         case 0x14:
             INST_NAME("UNPCKLPD Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             // GX->q[1] = EX->q[0];
             LD(x3, wback, fixedaddress+0);
-            SD(x3, gback, 8);
+            SD(x3, gback, gdoffset+8);
             break;
         case 0x15:
             INST_NAME("UNPCKHPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             // GX->q[0] = GX->q[1];
-            LD(x3, gback, 8);
-            SD(x3, gback, 0);
+            LD(x3, gback, gdoffset+8);
+            SD(x3, gback, gdoffset+0);
             // GX->q[1] = EX->q[1];
             LD(x3, wback, fixedaddress+8);
-            SD(x3, gback, 8);
+            SD(x3, gback, gdoffset+8);
             break;
         case 0x16:
             INST_NAME("MOVHPD Gx, Eq");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             if(MODREG) {
                 // access register instead of memory is bad opcode!
                 DEFAULT;
@@ -113,56 +126,32 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             SMREAD();
             addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
             LD(x3, wback, fixedaddress);
-            SD(x3, gback, 8);
+            SD(x3, gback, gdoffset+8);
             break;
         case 0x1F:
             INST_NAME("NOP (multibyte)");
             nextop = F8;
             FAKEED;
             break;
-        
-        #define GO(GETFLAGS, NO, YES, F)            \
-            READFLAGS(F);                           \
-            GETFLAGS;                               \
-            nextop=F8;                              \
-            GETGD;                                  \
-            if(MODREG) {                            \
-                ed = xRAX+(nextop&7)+(rex.b<<3);    \
-                SLLI(x4, ed, 48);                   \
-                SRLI(x4, x4, 48);                   \
-            } else {                                \
-                SMREAD();                           \
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); \
-                LHU(x4, ed, fixedaddress);          \
-                ed = x4;                            \
-            }                                       \
-            B##NO(x1, 4+4*4);                       \
-            ADDI(x3, xZR, -1);                      \
-            SRLI(x3, x3, 48);                       \
-            AND(gd, gd, x3);                        \
-            OR(gd, gd, ed);
-
-        GOCOND(0x40, "CMOV", "Gw, Ew");
-        #undef GO
         case 0x28:
             INST_NAME("MOVAPD Gx,Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_MV_Q(x3);
             break;
         case 0x29:
             INST_NAME("MOVAPD Ex,Gx");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_MV_Q2(x3);
             if(!MODREG) SMWRITE2();
             break;
         case 0x2B:
             INST_NAME("MOVNTPD Ex, Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q2(x3);
             break;
@@ -207,15 +196,15 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 case 0x00:
                     INST_NAME("PSHUFB Gx, Ex");
                     nextop = F8;
-                    GETGX(x1);
+                    GETGX();
                     GETEX(x2, 0);
                     sse_forget_reg(dyn, ninst, x5);
 
                     ADDI(x5, xEmu, offsetof(x64emu_t, scratch));
 
                     // perserve gd
-                    LD(x3, gback, 0);
-                    LD(x4, gback, 8);
+                    LD(x3, gback, gdoffset+0);
+                    LD(x4, gback, gdoffset+8);
                     SD(x3, x5, 0);
                     SD(x4, x5, 8);
 
@@ -223,29 +212,29 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         LBU(x3, wback, fixedaddress+i);
                         ANDI(x4, x3, 128);
                         BEQZ(x4, 12);
-                        SB(xZR, gback, i);
+                        SB(xZR, gback, gdoffset+i);
                         BEQZ(xZR, 20); // continue
                         ANDI(x4, x3, 15);
                         ADD(x4, x4, x5);
                         LBU(x4, x4, 0);
-                        SB(x4, gback, i);
+                        SB(x4, gback, gdoffset+i);
                     }
                     break;
                 case 0x01:
                     INST_NAME("PHADDW Gx, Ex");
                     nextop = F8;
-                    GETGX(x1);
+                    GETGX();
                     for (int i=0; i<4; ++i) {
                         // GX->sw[i] = GX->sw[i*2+0]+GX->sw[i*2+1];
-                        LH(x3, gback, 2*(i*2+0));
-                        LH(x4, gback, 2*(i*2+1));
+                        LH(x3, gback, gdoffset+2*(i*2+0));
+                        LH(x4, gback, gdoffset+2*(i*2+1));
                         ADDW(x3, x3, x4);
-                        SH(x3, gback, 2*i);
+                        SH(x3, gback, gdoffset+2*i);
                     }
                     if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                         // GX->q[1] = GX->q[0];
-                        LD(x3, gback, 0);
-                        SD(x3, gback, 8);
+                        LD(x3, gback, gdoffset+0);
+                        SD(x3, gback, gdoffset+8);
                     } else {
                         GETEX(x2, 0);
                         for (int i=0; i<4; ++i) {
@@ -253,47 +242,150 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                             LH(x3, wback, fixedaddress+2*(i*2+0));
                             LH(x4, wback, fixedaddress+2*(i*2+1));
                             ADDW(x3, x3, x4);
-                            SH(x3, gback, 2*(4+i));
+                            SH(x3, gback, gdoffset+2*(4+i));
                         }
                     }
                     break;
                 case 0x02:
                     INST_NAME("PHADDD Gx, Ex");
                     nextop = F8;
-                    GETGX(x1);
+                    GETGX();
                     // GX->sd[0] += GX->sd[1];
-                    LW(x3, gback, 0*4);
-                    LW(x4, gback, 1*4);
+                    LW(x3, gback, gdoffset+0*4);
+                    LW(x4, gback, gdoffset+1*4);
                     ADDW(x3, x3, x4);
-                    SW(x3, gback, 0*4);
+                    SW(x3, gback, gdoffset+0*4);
                     // GX->sd[1] = GX->sd[2] + GX->sd[3];
-                    LW(x3, gback, 2*4);
-                    LW(x4, gback, 3*4);
+                    LW(x3, gback, gdoffset+2*4);
+                    LW(x4, gback, gdoffset+3*4);
                     ADDW(x3, x3, x4);
-                    SW(x3, gback, 1*4);
+                    SW(x3, gback, gdoffset+1*4);
                     if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                         // GX->q[1] = GX->q[0];
-                        LD(x3, gback, 0);
-                        SD(x3, gback, 8);
+                        LD(x3, gback, gdoffset+0);
+                        SD(x3, gback, gdoffset+8);
                     } else {
                         GETEX(x2, 0);
                         // GX->sd[2] = EX->sd[0] + EX->sd[1];
                         LW(x3, wback, fixedaddress+0*4);
                         LW(x4, wback, fixedaddress+1*4);
                         ADDW(x3, x3, x4);
-                        SW(x3, gback, 2*4);
+                        SW(x3, gback, gdoffset+2*4);
                         // GX->sd[3] = EX->sd[2] + EX->sd[3];
                         LW(x3, wback, fixedaddress+2*4);
                         LW(x4, wback, fixedaddress+3*4);
                         ADDW(x3, x3, x4);
-                        SW(x3, gback, 3*4);
+                        SW(x3, gback, gdoffset+3*4);
+                    }
+                    break;
+
+                case 0x04:
+                    INST_NAME("PADDUBSW Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    MOV64x(x5, 32767);
+                    MOV64x(x6, -32768);
+                    for(int i=0; i<8; ++i) {
+                        LBU(x3, gback, gdoffset+i*2);
+                        LB(x4, wback, fixedaddress+i*2);
+                        MUL(x9, x3, x4);
+                        LBU(x3, gback, gdoffset+i*2+1);
+                        LB(x4, wback, fixedaddress+i*2+1);
+                        MUL(x3, x3, x4);
+                        ADD(x3, x3, x9);
+                        if(rv64_zbb) {
+                            MIN(x3, x3, x5);
+                            MAX(x3, x3, x6);
+                        } else {
+                            BLT(x3, x5, 4+4);
+                            MV(x3, x5);
+                            BLT(x6, x3, 4+4);
+                            MV(x3, x6);
+                        }
+                        SH(x3, gback, gdoffset+i*2);
+                    }
+                    break;
+
+                case 0x08:
+                    INST_NAME("PSIGNB Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<16; ++i) {
+                        LB(x3, gback, gdoffset+i);
+                        LB(x4, wback, fixedaddress+i);
+                        BGE(x4, xZR, 4+4);
+                        NEG(x3, x3);
+                        BNE(x4, xZR, 4+4);
+                        MOV_U12(x3, 0);
+                        SB(x3, gback, gdoffset+i);
+                    }
+                    break;
+                case 0x09:
+                    INST_NAME("PSIGNW Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<8; ++i) {
+                        LH(x3, gback, gdoffset+i*2);
+                        LH(x4, wback, fixedaddress+i*2);
+                        BGE(x4, xZR, 4+4);
+                        NEG(x3, x3);
+                        BNE(x4, xZR, 4+4);
+                        MOV_U12(x3, 0);
+                        SH(x3, gback, gdoffset+i*2);
+                    }
+                    break;
+                case 0x0A:
+                    INST_NAME("PSIGND Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<4; ++i) {
+                        LW(x3, gback, gdoffset+i*4);
+                        LW(x4, wback, fixedaddress+i*4);
+                        BGE(x4, xZR, 4+4);
+                        NEG(x3, x3);
+                        BNE(x4, xZR, 4+4);
+                        ADDI(x3, xZR, 0);
+                        SW(x3, gback, gdoffset+i*4);
+                    }
+                    break;
+                case 0x0B:
+                    INST_NAME("PMULHRSW Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<8; ++i) {
+                        LH(x3, gback, gdoffset+i*2);
+                        LH(x4, wback, fixedaddress+i*2);
+                        MUL(x3, x3, x4);
+                        SRAI(x3, x3, 14);
+                        ADDI(x3, x3, 1);
+                        SRAI(x3, x3, 1);
+                        SH(x3, gback, gdoffset+i*2);
+                    }
+                    break;
+                case 0x10:
+                    INST_NAME("PBLENDVB Gx,Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    sse_forget_reg(dyn, ninst, 0); // forget xmm[0]
+                    for (int i=0; i<16; ++i) {
+                        LB(x3, xEmu, offsetof(x64emu_t, xmm[0])+i);
+                        BGE(x3, xZR, 12); // continue
+                        LBU(x3, wback, fixedaddress+i);
+                        SB(x3, gback, gdoffset+i);
+                        // continue
                     }
                     break;
                 case 0x17:
                     INST_NAME("PTEST Gx, Ex");
                     nextop = F8;
                     SETFLAGS(X_ALL, SF_SET);
-                    GETGX(x1);
+                    GETGX();
                     GETEX(x2, 0);
                     CLEAR_FLAGS();
                     SET_DFNONE();
@@ -302,8 +394,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         LD(x6, wback, fixedaddress+8);
 
                         IFX(X_ZF) {
-                            LD(x3, gback, 0);
-                            LD(x4, gback, 8);
+                            LD(x3, gback, gdoffset+0);
+                            LD(x4, gback, gdoffset+8);
                             AND(x3, x3, x5);
                             AND(x4, x4, x6);
                             OR(x3, x3, x4);
@@ -311,9 +403,9 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                             ORI(xFlags, xFlags, 1<<F_ZF);
                         }
                         IFX(X_CF) {
-                            LD(x3, gback, 0);
+                            LD(x3, gback, gdoffset+0);
                             NOT(x3, x3);
-                            LD(x4, gback, 8);
+                            LD(x4, gback, gdoffset+8);
                             NOT(x4, x4);
                             AND(x3, x3, x5);
                             AND(x4, x4, x6);
@@ -323,19 +415,306 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         }
                     }
                     break;
+
+                case 0x1C:
+                    INST_NAME("PABSB Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<16; ++i) { // one iteration per byte lane
+                        LB(x4, wback, fixedaddress+i); // signed load of source byte i
+                        BGE(x4, xZR, 4+4); // value >= 0: branch over the NEG (offset 8, i.e. one 4-byte instruction)
+                        NEG(x4, x4);
+                        SB(x4, gback, gdoffset+i); // always executed: stores the value or its negation
+                    }
+                    break;
+                case 0x1D:
+                    INST_NAME("PABSW Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<8; ++i) { // one iteration per 16-bit lane
+                        LH(x4, wback, fixedaddress+i*2); // signed load of source word i
+                        BGE(x4, xZR, 4+4); // value >= 0: branch over the NEG
+                        NEG(x4, x4);
+                        SH(x4, gback, gdoffset+i*2); // always executed: stores the value or its negation
+                    }
+                    break;
+                case 0x1E:
+                    INST_NAME("PABSD Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    // each 32-bit lane is negated when negative; no constant register is needed for that
+                    for(int i=0; i<4; ++i) {
+                        LW(x4, wback, fixedaddress+i*4); // signed load of source dword i
+                        BGE(x4, xZR, 4+4); // value >= 0: branch over the NEG
+                        NEG(x4, x4);
+                        SW(x4, gback, gdoffset+i*4); // always executed: stores the low 32 bits of the result
+                    }
+                    break;
+
+                case 0x2B:
+                    INST_NAME("PACKUSDW Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    MOV64x(x5, 65535); // upper saturation bound for an unsigned 16-bit result
+                    for(int i=0; i<4; ++i) { // pack the four dwords of Gx into words 0..3 of Gx
+                        LW(x3, gback, gdoffset+i*4);
+                        if(rv64_zbb) {
+                            MIN(x3, x3, x5); // clamp to <= 65535
+                            MAX(x3, x3, xZR); // clamp to >= 0
+                        } else {
+                            BGE(x3, xZR, 4+4); // non-negative: skip the zeroing
+                            MV(x3, xZR);
+                            BLT(x3, x5, 4+4); // below the bound: skip the saturation
+                            MV(x3, x5);
+                        }
+                        SH(x3, gback, gdoffset+i*2);
+                    }
+                    if(MODREG && gd==ed) { // source register is the destination: the upper half equals the half just packed
+                        LD(x3, gback, gdoffset+0);
+                        SD(x3, gback, gdoffset+8);
+                    } else for(int i=0; i<4; ++i) { // otherwise pack the four dwords of Ex into words 4..7 of Gx
+                        LW(x3, wback, fixedaddress+i*4);
+                        if(rv64_zbb) {
+                            MIN(x3, x3, x5); // clamp to <= 65535
+                            MAX(x3, x3, xZR); // clamp to >= 0
+                        } else {
+                            BGE(x3, xZR, 4+4); // non-negative: skip the zeroing
+                            MV(x3, xZR);
+                            BLT(x3, x5, 4+4); // below the bound: skip the saturation
+                            MV(x3, x5);
+                        }
+                        SH(x3, gback, gdoffset+8+i*2);
+                    }
+                    break;
+
+                case 0x30:
+                    INST_NAME("PMOVZXBW Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=7; i>=0; --i) { // high lane first; presumably so an overlapping source byte is read before its slot is overwritten
+                        LBU(x3, wback, fixedaddress+i); // zero-extended source byte i
+                        SH(x3, gback, gdoffset+i*2); // becomes 16-bit lane i
+                    }
+                    break;
+                case 0x31:
+                    INST_NAME("PMOVZXBD Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=3; i>=0; --i) { // high lane first; presumably so an overlapping source byte is read before its slot is overwritten
+                        LBU(x3, wback, fixedaddress+i); // zero-extended source byte i
+                        SW(x3, gback, gdoffset+i*4); // becomes 32-bit lane i
+                    }
+                    break;
+                case 0x32:
+                    INST_NAME("PMOVZXBQ Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=1; i>=0; --i) { // high lane first; presumably so an overlapping source byte is read before its slot is overwritten
+                        LBU(x3, wback, fixedaddress+i); // zero-extended source byte i
+                        SD(x3, gback, gdoffset+i*8); // becomes 64-bit lane i
+                    }
+                    break;
+                case 0x33:
+                    INST_NAME("PMOVZXWD Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=3; i>=0; --i) { // high lane first; presumably so an overlapping source word is read before its slot is overwritten
+                        LHU(x3, wback, fixedaddress+i*2); // zero-extended source word i
+                        SW(x3, gback, gdoffset+i*4); // becomes 32-bit lane i
+                    }
+                    break;
+                case 0x34:
+                    INST_NAME("PMOVZXWQ Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=1; i>=0; --i) { // high lane first; presumably so an overlapping source word is read before its slot is overwritten
+                        LHU(x3, wback, fixedaddress+i*2); // zero-extended source word i
+                        SD(x3, gback, gdoffset+i*8); // becomes 64-bit lane i
+                    }
+                    break;
+                case 0x35:
+                    INST_NAME("PMOVZXDQ Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=1; i>=0; --i) { // high lane first; presumably so an overlapping source dword is read before its slot is overwritten
+                        LWU(x3, wback, fixedaddress+i*4); // zero-extended source dword i
+                        SD(x3, gback, gdoffset+i*8); // becomes 64-bit lane i
+                    }
+                    break;
+
+                case 0x38:
+                    INST_NAME("PMINSB Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<16; ++i) { // one iteration per signed byte lane
+                        LB(x3, gback, gdoffset+i); // x3 = Gx lane
+                        LB(x4, wback, fixedaddress+i); // x4 = Ex lane
+                        if(rv64_zbb) MIN(x4, x3, x4); else BLT(x3, x4, 4+4); // Zbb: x4 = min, always stored; otherwise skip the store when Gx is already smaller
+                        SB(x4, gback, gdoffset+i);
+                    }
+                    break;
+                case 0x39:
+                    INST_NAME("PMINSD Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<4; ++i) { // one iteration per signed 32-bit lane
+                        LW(x3, gback, gdoffset+i*4); // x3 = Gx lane
+                        LW(x4, wback, fixedaddress+i*4); // x4 = Ex lane
+                        if(rv64_zbb) MIN(x4, x3, x4); else BLT(x3, x4, 4+4); // Zbb: x4 = min, always stored; otherwise skip the store when Gx is already smaller
+                        SW(x4, gback, gdoffset+i*4);
+                    }
+                    break;
                 case 0x3A:
                     INST_NAME("PMINUW Gx, Ex");  // SSE4 opcode!
                     nextop = F8;
-                    GETGX(x1);
+                    GETGX();
                     GETEX(x2, 0);
                     for(int i=0; i<8; ++i) {
-                        // if(GX->uw[i]>EX->uw[i]) GX->uw[i] = EX->uw[i];
-                        LHU(x3, gback, i*2);
+                        LHU(x3, gback, gdoffset+i*2);
                         LHU(x4, wback, fixedaddress+i*2);
-                        BLTU(x3, x4, 8);
-                        SH(x4, gback, i*2);
+                        if(rv64_zbb) MINU(x4, x3, x4); else BLTU(x3, x4, 4+4);
+                        SH(x4, gback, gdoffset+i*2);
                     }
                     break;
+                case 0x3B:
+                    INST_NAME("PMINUD Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<4; ++i) { // one iteration per unsigned 32-bit lane
+                        LWU(x3, gback, gdoffset+i*4); // x3 = Gx lane, zero-extended
+                        LWU(x4, wback, fixedaddress+i*4); // x4 = Ex lane, zero-extended
+                        if(rv64_zbb) MINU(x4, x3, x4); else BLTU(x3, x4, 4+4); // Zbb: x4 = min, always stored; otherwise skip the store when Gx is already smaller
+                        SW(x4, gback, gdoffset+i*4);
+                    }
+                    break;
+                case 0x3C:
+                    INST_NAME("PMAXSB Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<16; ++i) { // one iteration per signed byte lane
+                        LB(x3, gback, gdoffset+i); // x3 = Gx lane
+                        LB(x4, wback, fixedaddress+i); // x4 = Ex lane
+                        if(rv64_zbb) MAX(x4, x3, x4); else BLT(x4, x3, 4+4); // Zbb: x4 = max, always stored; otherwise skip the store when Gx is already larger
+                        SB(x4, gback, gdoffset+i);
+                    }
+                    break;
+                case 0x3D:
+                    INST_NAME("PMAXSD Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<4; ++i) { // one iteration per signed 32-bit lane
+                        LW(x3, gback, gdoffset+i*4); // x3 = Gx lane
+                        LW(x4, wback, fixedaddress+i*4); // x4 = Ex lane
+                        if(rv64_zbb) MAX(x4, x3, x4); else BLT(x4, x3, 4+4); // Zbb: x4 = max, always stored; otherwise skip the store when Gx is already larger
+                        SW(x4, gback, gdoffset+i*4);
+                    }
+                    break;
+                case 0x3E:
+                    INST_NAME("PMAXUW Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<8; ++i) { // one iteration per unsigned 16-bit lane
+                        LHU(x3, gback, gdoffset+i*2); // x3 = Gx lane, zero-extended
+                        LHU(x4, wback, fixedaddress+i*2); // x4 = Ex lane, zero-extended
+                        if(rv64_zbb) MAXU(x4, x3, x4); else BLTU(x4, x3, 4+4); // Zbb: x4 = max, always stored; otherwise skip the store when Gx is already larger
+                        SH(x4, gback, gdoffset+i*2);
+                    }
+                    break;
+                case 0x3F:
+                    INST_NAME("PMAXUD Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<4; ++i) { // one iteration per unsigned 32-bit lane
+                        LWU(x3, gback, gdoffset+i*4); // x3 = Gx lane, zero-extended
+                        LWU(x4, wback, fixedaddress+i*4); // x4 = Ex lane, zero-extended
+                        if(rv64_zbb) MAXU(x4, x3, x4); else BLTU(x4, x3, 4+4); // Zbb: x4 = max, always stored; otherwise skip the store when Gx is already larger
+                        SW(x4, gback, gdoffset+i*4);
+                    }
+                    break;
+                case 0x40:
+                    INST_NAME("PMULLD Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<4; ++i) { // one iteration per 32-bit lane
+                        LW(x3, gback, gdoffset+i*4); // x3 = Gx lane
+                        LW(x4, wback, fixedaddress+i*4); // x4 = Ex lane
+                        MUL(x3, x3, x4);
+                        SW(x3, gback, gdoffset+i*4); // only the low 32 bits of the product are written back
+                    }
+                    break;
+                case 0xDB:
+                    INST_NAME("AESIMC Gx, Ex");  // AES-NI
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    SSE_LOOP_MV_Q(x3); // presumably copies Ex into Gx 64 bits at a time through x3 - TODO confirm against the macro
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd); // x1 = index of the destination xmm register, consumed by the helper call below
+                    CALL(native_aesimc, -1); // the transform itself is delegated to the native_aesimc helper
+                    break;
+                case 0xDC:
+                    INST_NAME("AESENC Gx, Ex");  // AES-NI
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd); // x1 = index of the destination xmm register, consumed by the helper call below
+                    CALL(native_aese, -1); // step 1: helper transforms xmm[gd]
+                    GETGX(); // operands are decoded only after the call
+                    GETEX(x2, 0);
+                    SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); // step 2: Gx ^= Ex (same macro form the XORPD case uses)
+                    break;
+                case 0xDD:
+                    INST_NAME("AESENCLAST Gx, Ex");  // AES-NI
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd); // x1 = index of the destination xmm register, consumed by the helper call below
+                    CALL(native_aeselast, -1); // step 1: helper transforms xmm[gd]
+                    GETGX(); // operands are decoded only after the call
+                    GETEX(x2, 0);
+                    SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); // step 2: Gx ^= Ex (same macro form the XORPD case uses)
+                    break;
+                case 0xDE:
+                    INST_NAME("AESDEC Gx, Ex");  // AES-NI
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd); // x1 = index of the destination xmm register, consumed by the helper call below
+                    CALL(native_aesd, -1); // step 1: helper transforms xmm[gd]
+                    GETGX(); // operands are decoded only after the call
+                    GETEX(x2, 0);
+                    SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); // step 2: Gx ^= Ex (same macro form the XORPD case uses)
+                    break;
+
+                case 0xDF:
+                    INST_NAME("AESDECLAST Gx, Ex");  // AES-NI
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd); // x1 = index of the destination xmm register, consumed by the helper call below
+                    CALL(native_aesdlast, -1); // step 1: helper transforms xmm[gd]
+                    GETGX(); // operands are decoded only after the call
+                    GETEX(x2, 0);
+                    SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); // step 2: Gx ^= Ex (same macro form the XORPD case uses)
+                    break;
                 default:
                     DEFAULT;
             }
@@ -346,19 +725,20 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 case 0x0B:
                     INST_NAME("ROUNDSD Gx, Ex, Ib");
                     nextop = F8;
-                    GETEXSD(d0, 0);
+                    GETEXSD(d0, 1);
                     GETGXSD_empty(v0);
                     d1 = fpu_get_scratch(dyn);
+                    v1 = fpu_get_scratch(dyn);
                     u8 = F8;
                     FEQD(x2, d0, d0);
                     BNEZ_MARK(x2);
-                    FADDD(v0, d0, d0);
+                    if (v0!=d0) FMVD(v0, d0);
                     B_NEXT_nocond;
                     MARK; // d0 is not nan
-                    FABSD(v0, d0);
+                    FABSD(v1, d0);
                     MOV64x(x3, 1ULL << __DBL_MANT_DIG__);
                     FCVTDL(d1, x3, RD_RTZ);
-                    FLTD(x3, v0, d1);
+                    FLTD(x3, v1, d1);
                     BNEZ_MARK2(x3);
                     if (v0!=d0) FMVD(v0, d0);
                     B_NEXT_nocond;
@@ -366,17 +746,258 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     if(u8&4) {
                         u8 = sse_setround(dyn, ninst, x4, x2);
                         FCVTLD(x5, d0, RD_DYN);
-                        FCVTDL(v0, x5, RD_DYN);
+                        FCVTDL(v0, x5, RD_RTZ);
                         x87_restoreround(dyn, ninst, u8);
                     } else {
                         FCVTLD(x5, d0, round_round[u8&3]);
-                        FCVTDL(v0, x5, round_round[u8&3]);
+                        FCVTDL(v0, x5, RD_RTZ);
                     }
                     break;
-                default:
+                case 0x09:
+                    INST_NAME("ROUNDPD Gx, Ex, Ib");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 1);
+                    u8 = F8; // rounding-control immediate; it must stay intact because both lanes test it
+                    d0 = fpu_get_scratch(dyn);
+                    d1 = fpu_get_scratch(dyn);
+                    v1 = fpu_get_scratch(dyn);
+                    MOV64x(x3, 1ULL << __DBL_MANT_DIG__);
+                    FCVTDL(d1, x3, RD_RTZ); // d1 = (double)(1ULL << __DBL_MANT_DIG__), the magnitude threshold tested below
+
+                    // i = 0
+                    FLD(d0, wback, fixedaddress);
+                    FEQD(x4, d0, d0); // x4 == 0 only when d0 is NaN
+                    BNEZ(x4, 8); // not NaN: hop over the unconditional branch
+                    B_MARK_nocond; // NaN: store it unchanged
+                    // d0 is not nan
+                    FABSD(v1, d0);
+                    FLTD(x4, v1, d1); // x4 = |d0| < threshold
+                    BNEZ(x4, 8); // below the threshold: hop over the unconditional branch
+                    B_MARK_nocond; // at or above the threshold: store it unchanged
+                    if(u8&4) {
+                        int oldround = sse_setround(dyn, ninst, x4, x5); // kept out of u8 so lane 1 still sees the immediate
+                        FCVTLD(x5, d0, RD_DYN); // double -> int64 using the dynamic rounding mode
+                        FCVTDL(d0, x5, RD_RTZ); // int64 -> double
+                        x87_restoreround(dyn, ninst, oldround);
+                    } else {
+                        FCVTLD(x5, d0, round_round[u8&3]); // double -> int64 using the mode selected by imm bits 1:0
+                        FCVTDL(d0, x5, RD_RTZ); // int64 -> double
+                    }
+                    MARK;
+                    FSD(d0, gback, gdoffset+0);
+
+                    // i = 1
+                    FLD(d0, wback, fixedaddress+8);
+                    FEQD(x4, d0, d0); // x4 == 0 only when d0 is NaN
+                    BNEZ(x4, 8); // not NaN: hop over the unconditional branch
+                    B_MARK2_nocond; // NaN: store it unchanged
+                    // d0 is not nan
+                    FABSD(v1, d0);
+                    FLTD(x4, v1, d1); // x4 = |d0| < threshold
+                    BNEZ(x4, 8); // below the threshold: hop over the unconditional branch
+                    B_MARK2_nocond; // at or above the threshold: store it unchanged
+                    if(u8&4) {
+                        int oldround = sse_setround(dyn, ninst, x4, x5); // same handling as lane 0
+                        FCVTLD(x5, d0, RD_DYN); // double -> int64 using the dynamic rounding mode
+                        FCVTDL(d0, x5, RD_RTZ); // int64 -> double
+                        x87_restoreround(dyn, ninst, oldround);
+                    } else {
+                        FCVTLD(x5, d0, round_round[u8&3]); // double -> int64 using the mode selected by imm bits 1:0
+                        FCVTDL(d0, x5, RD_RTZ); // int64 -> double
+                    }
+                    MARK2;
+                    FSD(d0, gback, gdoffset+8);
+                    break;
+                case 0x0E:
+                    INST_NAME("PBLENDW Gx, Ex, Ib");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 1);
+                    u8 = F8; // blend mask: bit n set means word n is taken from Ex
+                    i32 = 0; // index of the word lane matching bit 0 of the remaining mask
+                    if (MODREG && gd==ed) break; // source register is the destination: nothing would change
+                    while (u8) // the mask is consumed from bit 0 upward; the loop ends once no set bit remains
+                        if(u8&1) {
+                            if(!(i32&1) && u8&2) { // two adjacent selected words starting on an even lane
+                                if(!(i32&3) && (u8&0xf)==0xf) { // four adjacent selected words starting on a multiple of four
+                                    // whole 64bits
+                                    LD(x3, wback, fixedaddress+8*(i32>>2));
+                                    SD(x3, gback, gdoffset+8*(i32>>2));
+                                    i32+=4;
+                                    u8>>=4;
+                                } else {
+                                    // 32bits
+                                    LWU(x3, wback, fixedaddress+4*(i32>>1));
+                                    SW(x3, gback, gdoffset+4*(i32>>1));
+                                    i32+=2;
+                                    u8>>=2;
+                                }
+                            } else {
+                                // 16 bits
+                                LHU(x3, wback, fixedaddress+2*i32);
+                                SH(x3, gback, gdoffset+2*i32);
+                                i32++;
+                                u8>>=1;
+                            }
+                        } else {
+                            // nope
+                            i32++; // lane not selected: Gx keeps its own word
+                            u8>>=1;
+                        }
+                    break;
+                case 0x0F:
+                    INST_NAME("PALIGNR Gx, Ex, Ib");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 1);
+                    u8 = F8; // byte shift count
+                    sse_forget_reg(dyn, ninst, x5); // NOTE(review): x5 is a scratch GPR id, whereas every other sse_forget_reg call here passes an xmm index - confirm intent
+                    ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); // x5 = address of the emu scratch area
+                    // preserve gd
+                    LD(x3, gback, gdoffset+0);
+                    LD(x4, gback, gdoffset+8);
+                    SD(x3, x5, 0);
+                    SD(x4, x5, 8);
+                    if(u8>31) { // shifted past both operands: the result is all zeroes
+                        SD(xZR, gback, gdoffset+0);
+                        SD(xZR, gback, gdoffset+8);
+                    } else {
+                        for (int i=0; i<16; ++i, ++u8) { // u8 walks the source byte index for destination byte i
+                            if (u8>15) {
+                                if(u8>31) { // beyond the saved Gx copy: zero fill
+                                    SB(xZR, gback, gdoffset+i);
+                                    continue;
+                                }
+                                else LBU(x3, x5, u8-16); // bytes 16..31 come from the saved copy of Gx
+                            } else {
+                                LBU(x3, wback, fixedaddress+u8); // bytes 0..15 come from Ex
+                            }
+                            SB(x3, gback, gdoffset+i);
+                        }
+                    }
+                    break;
+                case 0x16:
+                    if(rex.w) {INST_NAME("PEXTRQ Ed, Gx, Ib");} else {INST_NAME("PEXTRD Ed, Gx, Ib");}
+                    nextop = F8;
+                    GETGX();
+                    GETED(1);
+                    u8 = F8; // lane selector
+                    if(rex.w)
+                        LD(ed, gback, gdoffset+8*(u8&1)); // REX.W: 64-bit lane 0..1 of Gx
+                    else
+                        LWU(ed, gback, gdoffset+4*(u8&3)); // otherwise: 32-bit lane 0..3 of Gx, zero-extended
+                    if (wback) { // presumably non-zero only for a memory destination - TODO confirm against GETED
+                        SDxw(ed, wback, fixedaddress);
+                        SMWRITE2();
+                    }
+                    break;
+                case 0x20:
+                    INST_NAME("PINSRB Gx, ED, Ib");
+                    nextop = F8;
+                    GETGX();
+                    GETED(1);
+                    u8 = F8; // byte lane selector
+                    SB(ed, gback, gdoffset+(u8&0xF)); // parenthesised: '+' binds tighter than '&', so the mask must apply to the lane index only
+                    break;
+                case 0x21:
+                    INST_NAME("INSERTPS GX, EX, Ib");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 1);
+                    u8 = F8;
+                    if(MODREG) s8 = (u8>>6)&3; else s8 = 0; // source lane: imm bits 7:6 for a register operand, lane 0 for memory
+                    // GX->ud[(tmp8u>>4)&3] = EX->ud[tmp8s];
+                    LWU(x3, wback, fixedaddress+4*s8);
+                    SW(x3, gback, gdoffset+4*((u8>>4)&3)); // destination lane is imm bits 5:4 only; bits 7:6 must be masked off
+                    for(int i=0; i<4; ++i) { // imm bits 3:0 form the zero mask
+                        if(u8&(1<<i))
+                            // GX->ud[i] = 0;
+                            SW(xZR, gback, gdoffset+4*i);
+                    }
+                    break;
+                case 0x22:
+                    INST_NAME("PINSRD Gx, ED, Ib");
+                    nextop = F8;
+                    GETGX();
+                    GETED(1);
+                    u8 = F8; // lane selector
+                    if(rex.w) {
+                        SD(ed, gback, gdoffset+8*(u8&0x1)); // REX.W: write 64-bit lane 0..1
+                    } else {
+                        SW(ed, gback, gdoffset+4*(u8&0x3)); // otherwise: write 32-bit lane 0..3
+                    }
+                    break;
+                case 0x44:
+                    INST_NAME("PCLMULQDQ Gx, Ex, Ib");
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd); // gx
+                    if(MODREG) { // register source: pass its index in x2 and a null pointer in x3
+                        ed = (nextop&7)+(rex.b<<3);
+                        sse_forget_reg(dyn, ninst, ed);
+                        MOV32w(x2, ed);
+                        MOV32w(x3, 0);  // p = NULL
+                    } else { // memory source: pass 0 in x2 and the operand address in x3
+                        MOV32w(x2, 0);
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x3, x5, &fixedaddress, rex, NULL, 0, 1);
+                        if(ed!=x3) { // the address ended up in another register: move it into x3
+                            MV(x3, ed);
+                        }
+                    }
+                    u8 = F8;
+                    MOV32w(x4, u8); // x4 = immediate byte
+                    CALL(native_pclmul, -1); // the multiplication itself is delegated to the native_pclmul helper
+                    break;
+                case 0xDF:
+                    INST_NAME("AESKEYGENASSIST Gx, Ex, Ib");  // AES-NI
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd); // gx
+                    if(MODREG) { // register source: pass its index in x2 and a null pointer in x3
+                        ed = (nextop&7)+(rex.b<<3);
+                        sse_forget_reg(dyn, ninst, ed);
+                        MOV32w(x2, ed);
+                        MOV32w(x3, 0);  //p = NULL
+                    } else { // memory source: pass 0 in x2 and the operand address in x3
+                        MOV32w(x2, 0);
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x3, x5, &fixedaddress, rex, NULL, 0, 1); // x5 as scratch (as PCLMULQDQ does) so the zero just placed in x2 is not handed to geted as a temporary
+                        if(ed!=x3) { // the address ended up in another register: move it into x3
+                            MV(x3, ed);
+                        }
+                    }
+                    u8 = F8;
+                    MOV32w(x4, u8); // x4 = immediate byte
+                    CALL(native_aeskeygenassist, -1); // the operation itself is delegated to the native helper
+                    break;
+            default:
                     DEFAULT;
             }
             break;
+        #define GO(GETFLAGS, NO, YES, F)            \
+            READFLAGS(F);                           \
+            GETFLAGS;                               \
+            nextop=F8;                              \
+            GETGD;                                  \
+            if(MODREG) {                            \
+                ed = xRAX+(nextop&7)+(rex.b<<3);    \
+                ZEXTH(x4, ed);                      /* x4 = low 16 bits of the source register */ \
+                ed = x4;                            \
+            } else {                                \
+                SMREAD();                           \
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); \
+                LHU(x4, ed, fixedaddress);          /* x4 = zero-extended 16-bit memory operand */ \
+                ed = x4;                            \
+            }                                       \
+            B##NO(x1, 4+3*4);                       /* branch on x1 (presumably set by GETFLAGS) over the 3 instructions below */ \
+            LUI(x3, 0xffff0);                       /* x3 = mask with the low 16 bits clear */ \
+            AND(gd, gd, x3);                        /* drop the low word of the destination */ \
+            OR(gd, gd, ed);                         /* merge in the 16-bit source value */
+
+        GOCOND(0x40, "CMOV", "Gw, Ew"); // instantiates GO through GOCOND for the CMOV family starting at opcode 0x40
+        #undef GO
         case 0x50:
             INST_NAME("PMOVMSKD Gd, Ex");
             nextop = F8;
@@ -390,11 +1011,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 if (i) SLLI(x2, x2, 1);
                 OR(gd, gd, x2);
             }
-            break;   
+            break;
         case 0x51:
             INST_NAME("SQRTPD Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             if(!box64_dynarec_fastnan) {
@@ -411,42 +1032,42 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     BEQ(x3, xZR, 8);
                     FNEGD(d0, d0);
                 }
-                FSD(d0, gback, i*8);
+                FSD(d0, gback, gdoffset+i*8);
             }
             break;
         case 0x54:
             INST_NAME("ANDPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_Q(x3, x4, AND(x3, x3, x4));
             break;
         case 0x55:
             INST_NAME("ANDNPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_Q(x3, x4, NOT(x3, x3); AND(x3, x3, x4));
             break;
         case 0x56:
             INST_NAME("ORPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_Q(x3, x4, OR(x3, x3, x4));
             break;
         case 0x57:
             INST_NAME("XORPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
             break;
         case 0x58:
             INST_NAME("ADDPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_FQ(x3, x4, {
                 if(!box64_dynarec_fastnan) {
                     FEQD(x3, v0, v0);
@@ -466,7 +1087,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("MULPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_FQ(x3, x4, {
                 if(!box64_dynarec_fastnan) {
                     FEQD(x3, v0, v0);
@@ -485,24 +1106,24 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5A:
             INST_NAME("CVTPD2PS Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             // GX->f[0] = EX->d[0];
             FLD(d0, wback, fixedaddress+0);
             FCVTSD(d0, d0);
-            FSD(d0, gback, 0);
+            FSD(d0, gback, gdoffset+0);
             // GX->f[1] = EX->d[1];
             FLD(d0, wback, fixedaddress+8);
             FCVTSD(d0, d0);
-            FSD(d0, gback, 4);
+            FSD(d0, gback, gdoffset+4);
             // GX->q[1] = 0;
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+8);
             break;
         case 0x5B:
             INST_NAME("CVTPS2DQ Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             u8 = sse_setround(dyn, ninst, x6, x4);
@@ -513,7 +1134,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 SUB(x5, x5, x3);
                 BEQZ(x5, 8);
                 LUI(x3, 0x80000); // INT32_MIN
-                SW(x3, gback, 4*i);
+                SW(x3, gback, gdoffset+4*i);
             }
             x87_restoreround(dyn, ninst, u8);
             break;
@@ -521,7 +1142,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("SUBPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_FQ(x3, x4, {
                 if(!box64_dynarec_fastnan) {
                     FEQD(x3, v0, v0);
@@ -540,12 +1161,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5D:
             INST_NAME("MINPD Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
             for (int i=0; i<2; ++i) {
-                FLD(d0, gback, 8*i);
+                FLD(d0, gback, gdoffset+8*i);
                 FLD(d1, wback, fixedaddress+8*i);
                 FEQD(x3, d0, d0);
                 FEQD(x4, d1, d1);
@@ -553,14 +1174,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 BEQ(x3, xZR, 12);
                 FLTD(x3, d1, d0);
                 BEQ(x3, xZR, 8); // continue
-                FSD(d1, gback, 8*i);
+                FSD(d1, gback, gdoffset+8*i);
             }
             break;
         case 0x5E:
             INST_NAME("DIVPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_FQ(x3, x4, {
                 if(!box64_dynarec_fastnan) {
                     FEQD(x3, v0, v0);
@@ -579,12 +1200,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5F:
             INST_NAME("MAXPD Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
             for (int i=0; i<2; ++i) {
-                FLD(d0, gback, 8*i);
+                FLD(d0, gback, gdoffset+8*i);
                 FLD(d1, wback, fixedaddress+8*i);
                 FEQD(x3, d0, d0);
                 FEQD(x4, d1, d1);
@@ -592,54 +1213,54 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 BEQ(x3, xZR, 12);
                 FLTD(x3, d0, d1);
                 BEQ(x3, xZR, 8); // continue
-                FSD(d1, gback, 8*i);
+                FSD(d1, gback, gdoffset+8*i);
             }
             break;
         case 0x60:
             INST_NAME("PUNPCKLBW Gx,Ex");
             nextop = F8;
-            GETGX(x2);
+            GETGX();
             for(int i=7; i>0; --i) { // 0 is untouched
                 // GX->ub[2 * i] = GX->ub[i];
-                LBU(x3, gback, i);
-                SB(x3, gback, 2*i);
+                LBU(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset+2*i);
             }
             if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                 for(int i=0; i<8; ++i) {
                     // GX->ub[2 * i + 1] = GX->ub[2 * i];
-                    LBU(x3, gback, 2*i);
-                    SB(x3, gback, 2*i+1);
+                    LBU(x3, gback, gdoffset+2*i);
+                    SB(x3, gback, gdoffset+2*i+1);
                 }
             } else {
                 GETEX(x1, 0);
                 for(int i=0; i<8; ++i) {
                     // GX->ub[2 * i + 1] = EX->ub[i];
                     LBU(x3, wback, fixedaddress+i);
-                    SB(x3, gback, 2*i+1);
+                    SB(x3, gback, gdoffset+2*i+1);
                 }
             }
             break;
         case 0x61:
             INST_NAME("PUNPCKLWD Gx,Ex");
             nextop = F8;
-            GETGX(x2);
+            GETGX();
             for(int i=3; i>0; --i) {
                 // GX->uw[2 * i] = GX->uw[i];
-                LHU(x3, gback, i*2);
-                SH(x3, gback, 2*i*2);
+                LHU(x3, gback, gdoffset+i*2);
+                SH(x3, gback, gdoffset+2*i*2);
             }
             if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                 for(int i=0; i<4; ++i) {
                     // GX->uw[2 * i + 1] = GX->uw[2 * i];
-                    LHU(x3, gback, 2*i*2);
-                    SH(x3, gback, (2*i+1)*2);
+                    LHU(x3, gback, gdoffset+2*i*2);
+                    SH(x3, gback, gdoffset+(2*i+1)*2);
                 }
             } else {
                 GETEX(x1, 0);
                 for(int i=0; i<4; ++i) {
                     // GX->uw[2 * i + 1] = EX->uw[i];
                     LHU(x3, wback, fixedaddress+i*2);
-                    SH(x3, gback, (2*i+1)*2);
+                    SH(x3, gback, gdoffset+(2*i+1)*2);
                 }
             }
             break;
@@ -647,71 +1268,108 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("PUNPCKLDQ Gx,Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             // GX->ud[3] = EX->ud[1];
-            LWU(x3, x1, fixedaddress+1*4);
-            SW(x3, x2, 3*4);
+            LWU(x3, wback, fixedaddress+1*4);
+            SW(x3, gback, gdoffset+3*4);
             // GX->ud[2] = GX->ud[1];
-            LWU(x3, x2, 1*4);
-            SW(x3, x2, 2*4);
+            LWU(x3, gback, gdoffset+1*4);
+            SW(x3, gback, gdoffset+2*4);
             // GX->ud[1] = EX->ud[0];
-            LWU(x3, x1, fixedaddress+0*4);
-            SW(x3, x2, 1*4);
+            LWU(x3, wback, fixedaddress+0*4);
+            SW(x3, gback, gdoffset+1*4);
+            break;
+        case 0x63:
+            INST_NAME("PACKSSWB Gx, Ex");
+            nextop = F8;
+            GETGX();
+            GETEX(x2, 0);
+            MOV64x(x5, 127);
+            MOV64x(x6, -128);
+            for(int i=0; i<8; ++i) {
+                LH(x3, gback, gdoffset+i*2);
+                if(rv64_zbb) {
+                    MIN(x3, x3, x5);
+                    MAX(x3, x3, x6);
+                } else {
+                    BLT(x3, x5, 4+4);
+                    MV(x3, x5);
+                    BGE(x3, x6, 4+4);
+                    MV(x3, x6);
+                }
+                SB(x3, gback, gdoffset+i);
+            }
+            if(MODREG && gd==ed) {
+                LD(x3, gback, gdoffset+0);
+                SD(x3, gback, gdoffset+8);
+            } else for(int i=0; i<8; ++i) {
+                LH(x3, wback, fixedaddress+i*2);
+                if(rv64_zbb) {
+                    MIN(x3, x3, x5);
+                    MAX(x3, x3, x6);
+                } else {
+                    BLT(x3, x5, 4+4);
+                    MV(x3, x5);
+                    BGE(x3, x6, 4+4);
+                    MV(x3, x6);
+                }
+                SB(x3, gback, gdoffset+8+i);
+            }
             break;
         case 0x64:
             INST_NAME("PCMPGTB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<16; ++i) {
                 // GX->ub[i] = (GX->sb[i]>EX->sb[i])?0xFF:0x00;
                 LB(x3, wback, fixedaddress+i);
-                LB(x4, gback, i);
+                LB(x4, gback, gdoffset+i);
                 SLT(x3, x3, x4);
                 NEG(x3, x3);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0x65:
             INST_NAME("PCMPGTW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
                 // GX->uw[i] = (GX->sw[i]>EX->sw[i])?0xFFFF:0x0000;
                 LH(x3, wback, fixedaddress+i*2);
-                LH(x4, gback, i*2);
+                LH(x4, gback, gdoffset+i*2);
                 SLT(x3, x3, x4);
                 NEG(x3, x3);
-                SH(x3, gback, i*2);
+                SH(x3, gback, gdoffset+i*2);
             }
             break;
         case 0x66:
             INST_NAME("PCMPGTD Gx,Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_DS(x3, x4, SLT(x4, x4, x3); SLLI(x3, x4, 63); SRAI(x3, x3, 63));
             break;
         case 0x67:
             INST_NAME("PACKUSWB Gx, Ex");
             nextop = F8;
-            GETGX(x2);
+            GETGX();
             ADDI(x5, xZR, 0xFF);
             for(int i=0; i<8; ++i) {
                 // GX->ub[i] = (GX->sw[i]<0)?0:((GX->sw[i]>0xff)?0xff:GX->sw[i]);
-                LH(x3, gback, i*2);
+                LH(x3, gback, gdoffset+i*2);
                 BGE(x5, x3, 8);
                 ADDI(x3, xZR, 0xFF);
                 NOT(x4, x3);
                 SRAI(x4, x4, 63);
                 AND(x3, x3, x4);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                 // GX->q[1] = GX->q[0];
-                LD(x3, gback, 0*8);
-                SD(x3, gback, 1*8);
+                LD(x3, gback, gdoffset+0*8);
+                SD(x3, gback, gdoffset+1*8);
             } else {
                 GETEX(x1, 0);
                 for(int i=0; i<8; ++i) {
@@ -722,55 +1380,55 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     NOT(x4, x3);
                     SRAI(x4, x4, 63);
                     AND(x3, x3, x4);
-                    SB(x3, gback, 8+i);
+                    SB(x3, gback, gdoffset+8+i);
                 }
             }
             break;
         case 0x68:
             INST_NAME("PUNPCKHBW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             for(int i=0; i<8; ++i) {
                 // GX->ub[2 * i] = GX->ub[i + 8];
-                LBU(x3, gback, i+8);
-                SB(x3, gback, 2*i);
+                LBU(x3, gback, gdoffset+i+8);
+                SB(x3, gback, gdoffset+2*i);
             }
             if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                 for(int i=0; i<8; ++i) {
                     // GX->ub[2 * i + 1] = GX->ub[2 * i];
-                    LBU(x3, gback, 2*i);
-                    SB(x3, gback, 2*i+1);
+                    LBU(x3, gback, gdoffset+2*i);
+                    SB(x3, gback, gdoffset+2*i+1);
                 }
             } else {
                 GETEX(x2, 0);
                 for(int i=0; i<8; ++i) {
                     // GX->ub[2 * i + 1] = EX->ub[i + 8];
                     LBU(x3, wback, fixedaddress+i+8);
-                    SB(x3, gback, 2*i+1);
+                    SB(x3, gback, gdoffset+2*i+1);
                 }
             }
             break;
         case 0x69:
             INST_NAME("PUNPCKHWD Gx,Ex");
             nextop = F8;
-            GETGX(x2);
+            GETGX();
             for(int i=0; i<4; ++i) {
                 // GX->uw[2 * i] = GX->uw[i + 4];
-                LHU(x3, gback, (i+4)*2);
-                SH(x3, gback, 2*i*2);
+                LHU(x3, gback, gdoffset+(i+4)*2);
+                SH(x3, gback, gdoffset+2*i*2);
             }
             if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                 for(int i=0; i<4; ++i) {
                     // GX->uw[2 * i + 1] = GX->uw[2 * i];
-                    LHU(x3, gback, 2*i*2);
-                    SH(x3, gback, (2*i+1)*2);
+                    LHU(x3, gback, gdoffset+2*i*2);
+                    SH(x3, gback, gdoffset+(2*i+1)*2);
                 }
             } else {
                 GETEX(x1, 0);
                 for(int i=0; i<4; ++i) {
                     // GX->uw[2 * i + 1] = EX->uw[i + 4];
                     LHU(x3, wback, fixedaddress+(i+4)*2);
-                    SH(x3, gback, (2*i+1)*2);
+                    SH(x3, gback, gdoffset+(2*i+1)*2);
                 }
             }
             break;
@@ -778,41 +1436,41 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("PUNPCKHDQ Gx,Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             // GX->ud[0] = GX->ud[2];
-            LWU(x3, gback, 2*4);
-            SW(x3, gback, 0*4);
+            LWU(x3, gback, gdoffset+2*4);
+            SW(x3, gback, gdoffset+0*4);
             // GX->ud[1] = EX->ud[2];
             LWU(x3, wback, fixedaddress+2*4);
-            SW(x3, gback, 1*4);
+            SW(x3, gback, gdoffset+1*4);
             // GX->ud[2] = GX->ud[3];
-            LWU(x3, gback, 3*4);
-            SW(x3, gback, 2*4);
+            LWU(x3, gback, gdoffset+3*4);
+            SW(x3, gback, gdoffset+2*4);
             // GX->ud[3] = EX->ud[3];
             if (!(MODREG && (gd==ed))) {
                 LWU(x3, wback, fixedaddress+3*4);
-                SW(x3, gback, 3*4);
+                SW(x3, gback, gdoffset+3*4);
             }
             break;
         case 0x6B:
             INST_NAME("PACKSSDW Gx,Ex");
             nextop = F8;
-            GETGX(x2);
+            GETGX();
             MOV64x(x5, 32768);
             NEG(x6, x5);
             for(int i=0; i<4; ++i) {
                 // GX->sw[i] = (GX->sd[i]<-32768)?-32768:((GX->sd[i]>32767)?32767:GX->sd[i]);
-                LW(x3, gback, i*4);
+                LW(x3, gback, gdoffset+i*4);
                 BGE(x5, x3, 8);
                 ADDI(x3, x5, -1);
                 BGE(x3, x6, 8);
                 MV(x3, x6);
-                SH(x3, gback, i*2);
+                SH(x3, gback, gdoffset+i*2);
             }
             if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                 // GX->q[1] = GX->q[0];
-                LD(x3, gback, 0*8);
-                SD(x3, gback, 1*8);
+                LD(x3, gback, gdoffset+0*8);
+                SD(x3, gback, gdoffset+1*8);
             } else {
                 GETEX(x1, 0);
                 for(int i=0; i<4; ++i) {
@@ -822,32 +1480,32 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     ADDI(x3, x5, -1);
                     BGE(x3, x6, 8);
                     MV(x3, x6);
-                    SH(x3, gback, (4+i)*2);
+                    SH(x3, gback, gdoffset+(4+i)*2);
                 }
             }
             break;
         case 0x6C:
             INST_NAME("PUNPCKLQDQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             if(MODREG) {
                 v1 = sse_get_reg(dyn, ninst, x2, (nextop&7)+(rex.b<<3), 0);
-                FSD(v1, gback, 8);
+                FSD(v1, gback, gdoffset+8);
             } else {
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0);
                 LD(x3, ed, fixedaddress+0);
-                SD(x3, gback, 8);
+                SD(x3, gback, gdoffset+8);
             }
             break;
         case 0x6D:
             INST_NAME("PUNPCKHQDQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
-            LD(x3, gback, 8);
-            SD(x3, gback, 0);
+            LD(x3, gback, gdoffset+8);
+            SD(x3, gback, gdoffset+0);
             LD(x3, wback, fixedaddress+8);
-            SD(x3, gback, 8);
+            SD(x3, gback, gdoffset+8);
             break;
         case 0x6E:
             INST_NAME("MOVD Gx, Ed");
@@ -869,14 +1527,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x6F:
             INST_NAME("MOVDQA Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q(x3);
             break;
         case 0x70: // TODO: Optimize this!
             INST_NAME("PSHUFD Gx,Ex,Ib");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 1);
             u8 = F8;
             int32_t idx;
@@ -890,10 +1548,10 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             idx = (u8>>(3*2))&3;
             LWU(x6, wback, fixedaddress+idx*4);
 
-            SW(x3, gback, 0*4);
-            SW(x4, gback, 1*4);
-            SW(x5, gback, 2*4);
-            SW(x6, gback, 3*4);
+            SW(x3, gback, gdoffset+0*4);
+            SW(x4, gback, gdoffset+1*4);
+            SW(x5, gback, gdoffset+2*4);
+            SW(x6, gback, gdoffset+3*4);
             break;
         case 0x71:
             nextop = F8;
@@ -904,8 +1562,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     u8 = F8;
                     if (u8>15) {
                         // just zero dest
-                        SD(xZR, x1, fixedaddress+0);
-                        SD(xZR, x1, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress+0);
+                        SD(xZR, wback, fixedaddress+8);
                     } else if(u8) {
                         for (int i=0; i<8; ++i) {
                             // EX->uw[i] >>= u8;
@@ -935,8 +1593,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     u8 = F8;
                     if (u8>15) {
                         // just zero dest
-                        SD(xZR, x1, fixedaddress+0);
-                        SD(xZR, x1, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress+0);
+                        SD(xZR, wback, fixedaddress+8);
                     } else if(u8) {
                         for (int i=0; i<8; ++i) {
                             // EX->uw[i] <<= u8;
@@ -961,8 +1619,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     if(u8) {
                         if (u8>31) {
                             // just zero dest
-                            SD(xZR, x1, fixedaddress+0);
-                            SD(xZR, x1, fixedaddress+8);
+                            SD(xZR, wback, fixedaddress+0);
+                            SD(xZR, wback, fixedaddress+8);
                         } else if(u8) {
                             SSE_LOOP_D_S(x3, SRLI(x3, x3, u8));
                         }
@@ -984,8 +1642,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     if(u8) {
                         if (u8>31) {
                             // just zero dest
-                            SD(xZR, x1, fixedaddress+0);
-                            SD(xZR, x1, fixedaddress+8);
+                            SD(xZR, wback, fixedaddress+0);
+                            SD(xZR, wback, fixedaddress+8);
                         } else if(u8) {
                             SSE_LOOP_D_S(x3, SLLI(x3, x3, u8));
                         }
@@ -1023,24 +1681,24 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     if(!u8) break;
                     if(u8>15) {
                         // just zero dest
-                        SD(xZR, x1, fixedaddress+0);
-                        SD(xZR, x1, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress+0);
+                        SD(xZR, wback, fixedaddress+8);
                     } else {
                         u8*=8;
                         if (u8 < 64) {
-                            LD(x3, x1, fixedaddress+0);
-                            LD(x4, x1, fixedaddress+8);
+                            LD(x3, wback, fixedaddress+0);
+                            LD(x4, wback, fixedaddress+8);
                             SRLI(x3, x3, u8);
                             SLLI(x5, x4, 64-u8);
                             OR(x3, x3, x5);
-                            SD(x3, x1, fixedaddress+0);
+                            SD(x3, wback, fixedaddress+0);
                             SRLI(x4, x4, u8);
-                            SD(x4, x1, fixedaddress+8);
+                            SD(x4, wback, fixedaddress+8);
                         } else {
-                            LD(x3, x1, fixedaddress+8);
+                            LD(x3, wback, fixedaddress+8);
                             if (u8-64 > 0) { SRLI(x3, x3, u8-64); }
-                            SD(x3, x1, fixedaddress+0);
-                            SD(xZR, x1, fixedaddress+8);
+                            SD(x3, wback, fixedaddress+0);
+                            SD(xZR, wback, fixedaddress+8);
                         }
                     }
                     break;
@@ -1051,8 +1709,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     if(!u8) break;
                     if(u8>63) {
                         // just zero dest
-                        SD(xZR, x1, fixedaddress+0);
-                        SD(xZR, x1, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress+0);
+                        SD(xZR, wback, fixedaddress+8);
                     } else {
                         LD(x3, wback, fixedaddress+0);
                         LD(x4, wback, fixedaddress+8);
@@ -1069,24 +1727,24 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     if(!u8) break;
                     if(u8>15) {
                         // just zero dest
-                        SD(xZR, x1, fixedaddress+0);
-                        SD(xZR, x1, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress+0);
+                        SD(xZR, wback, fixedaddress+8);
                     } else {
                         u8*=8;
                         if (u8 < 64) {
-                            LD(x3, x1, fixedaddress+0);
-                            LD(x4, x1, fixedaddress+8);
+                            LD(x3, wback, fixedaddress+0);
+                            LD(x4, wback, fixedaddress+8);
                             SLLI(x4, x4, u8);
                             SRLI(x5, x3, 64-u8);
                             OR(x4, x4, x5);
-                            SD(x4, x1, fixedaddress+8);
+                            SD(x4, wback, fixedaddress+8);
                             SLLI(x3, x3, u8);
-                            SD(x3, x1, fixedaddress+0);
+                            SD(x3, wback, fixedaddress+0);
                         } else {
-                            LD(x3, x1, fixedaddress+0);
+                            LD(x3, wback, fixedaddress+0);
                             if (u8-64 > 0) { SLLI(x3, x3, u8-64); }
-                            SD(x3, x1, fixedaddress+8);
-                            SD(xZR, x1, fixedaddress+0);
+                            SD(x3, wback, fixedaddress+8);
+                            SD(xZR, wback, fixedaddress+0);
                         }
                     }
                     break;
@@ -1097,52 +1755,94 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x74:
             INST_NAME("PCMPEQB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 SUB(x3, x3, x4);
                 SEQZ(x3, x3);
                 NEG(x3, x3);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0x75:
             INST_NAME("PCMPEQW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_W(x3, x4, SUB(x3, x3, x4); SEQZ(x3, x3); NEG(x3, x3));
             break;
         case 0x76:
             INST_NAME("PCMPEQD Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_D(x3, x4, XOR(x3, x3, x4); SNEZ(x3, x3); ADDI(x3, x3, -1));
             break;
+        case 0x7C:
+            INST_NAME("HADDPD Gx, Ex");
+            nextop = F8;
+            GETGX();
+            d0 = fpu_get_scratch(dyn);
+            d1 = fpu_get_scratch(dyn);
+            FLD(d0, gback, gdoffset+0);
+            FLD(d1, gback, gdoffset+8);
+            if(!box64_dynarec_fastnan) {
+                FEQD(x3, d0, d0);
+                FEQD(x4, d1, d1);
+                AND(x3, x3, x4);
+            }
+            FADDD(d0, d0, d1);
+            if(!box64_dynarec_fastnan) {
+                FEQD(x4, d0, d0);
+                BEQZ(x3, 12);
+                BNEZ(x4, 8);
+                FNEGD(d0, d0);
+            }
+            FSD(d0, gback, gdoffset+0);
+            if(MODREG && gd==(nextop&7)+(rex.b<<3)) {
+                FSD(d0, gback, gdoffset+8);
+            } else {
+                GETEX(x2, 0);
+                FLD(d0, wback, fixedaddress+0);
+                FLD(d1, wback, fixedaddress+8);
+                if(!box64_dynarec_fastnan) {
+                    FEQD(x3, d0, d0);
+                    FEQD(x4, d1, d1);
+                    AND(x3, x3, x4);
+                }
+                FADDD(d0, d0, d1);
+                if(!box64_dynarec_fastnan) {
+                    FEQD(x4, d0, d0);
+                    BEQZ(x3, 12);
+                    BNEZ(x4, 8);
+                    FNEGD(d0, d0);
+                }
+                FSD(d0, gback, gdoffset+8);
+            }
+            break;
         case 0x7E:
             INST_NAME("MOVD Ed,Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             if(rex.w) {
                 if(MODREG) {
                     ed = xRAX + (nextop&7) + (rex.b<<3);
-                    LD(ed, x1, 0);
+                    LD(ed, gback, gdoffset+0);
                 } else {
-                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0);
-                    LD(x3, x1, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+                    LD(x3, gback, gdoffset+0);
                     SD(x3, ed, fixedaddress);
                     SMWRITE2();
                 }
             } else {
                 if(MODREG) {
                     ed = xRAX + (nextop&7) + (rex.b<<3);
-                    LWU(ed, x1, 0);
+                    LWU(ed, gback, gdoffset+0);
                 } else {
-                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0);
-                    LWU(x3, x1, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+                    LWU(x3, gback, gdoffset+0);
                     SW(x3, ed, fixedaddress);
                     SMWRITE2();
                 }
@@ -1151,7 +1851,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x7F:
             INST_NAME("MOVDQA Ex,Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q2(x3);
             if(!MODREG) SMWRITE2();
@@ -1165,8 +1865,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETSGW(x2);
             MULW(x2, x2, x1);
             UFLAG_RES(x2);
-            SLLI(x2, x2, 48);
-            SRLI(x2, x2, 48);
+            ZEXTH(x2, x2);
             GWBACK;
             break;
 
@@ -1188,7 +1887,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 SRAI(x1, x1, 56);
             } else {
                 SMREAD();
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 0, 0);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0);
                 LB(x1, ed, fixedaddress);
             }
             LUI(x5, 0xffff0);
@@ -1200,13 +1899,13 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xC2:
             INST_NAME("CMPPD Gx, Ex, Ib");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 1);
             u8 = F8;
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
             for(int i=0; i<2; ++i) {
-                FLD(d0, gback, 8*i);
+                FLD(d0, gback, gdoffset+8*i);
                 FLD(d1, wback, fixedaddress+8*i);
                 if ((u8&7) == 0) {                                      // Equal
                     FEQD(x3, d0, d1);
@@ -1237,7 +1936,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     }
                     case 7: break;                                      // Not NaN
                     }
-                    
+
                     // MARK2;
                     if ((u8&7) == 5 || (u8&7) == 6) {
                         MOV32w(x3, 1);
@@ -1245,16 +1944,16 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     // MARK;
                 }
                 NEG(x3, x3);
-                SD(x3, gback, 8*i);
+                SD(x3, gback, gdoffset+8*i);
             }
             break;
         case 0xC4:
             INST_NAME("PINSRW Gx,Ed,Ib");
             nextop = F8;
             GETED(1);
-            GETGX(x3);
+            GETGX();
             u8 = (F8)&7;
-            SH(ed, gback, u8*2);
+            SH(ed, gback, gdoffset+u8*2);
             break;
         case 0xC5:
             INST_NAME("PEXTRW Gd,Ex,Ib");
@@ -1267,90 +1966,90 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xC6:
             INST_NAME("SHUFPD Gx, Ex, Ib");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
+            GETEX(x2, 1);
             u8 = F8;
             if (MODREG && gd==(nextop&7)+(rex.b<<3) && u8==0) {
-                LD(x3, gback, 0);
-                SD(x3, gback, 8);
+                LD(x3, gback, gdoffset+0);
+                SD(x3, gback, gdoffset+8);
                 break;
             }
-            GETEX(x2, 1)
-            LD(x3, gback, 8*(u8&1));
+            LD(x3, gback, gdoffset+8*(u8&1));
             LD(x4, wback, fixedaddress+8*((u8>>1)&1));
-            SD(x3, gback, 0);
-            SD(x4, gback, 8);
+            SD(x3, gback, gdoffset+0);
+            SD(x4, gback, gdoffset+8);
             break;
         case 0xD1:
             INST_NAME("PSRLW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             LD(x3, wback, fixedaddress);
             ADDI(x4, xZR, 16);
             BLTU_MARK(x3, x4);
-            SD(xZR, gback, 0);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+0);
+            SD(xZR, gback, gdoffset+8);
             B_NEXT_nocond;
             MARK;
             for (int i=0; i<8; ++i) {
-                LHU(x5, gback, 2*i);
+                LHU(x5, gback, gdoffset+2*i);
                 SRLW(x5, x5, x3);
-                SH(x5, gback, 2*i);
+                SH(x5, gback, gdoffset+2*i);
             }
             break;
         case 0xD2:
             INST_NAME("PSRLD Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             LD(x3, wback, fixedaddress);
             ADDI(x4, xZR, 32);
             BLTU_MARK(x3, x4);
-            SD(xZR, gback, 0);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+0);
+            SD(xZR, gback, gdoffset+8);
             B_NEXT_nocond;
             MARK;
             for (int i=0; i<4; ++i) {
-                LWU(x5, gback, 4*i);
+                LWU(x5, gback, gdoffset+4*i);
                 SRLW(x5, x5, x3);
-                SW(x5, gback, 4*i);
+                SW(x5, gback, gdoffset+4*i);
             }
             break;
         case 0xD3:
             INST_NAME("PSRLQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             LD(x3, wback, fixedaddress);
             ADDI(x4, xZR, 64);
             BLTU_MARK(x3, x4);
-            SD(xZR, gback, 0);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+0);
+            SD(xZR, gback, gdoffset+8);
             B_NEXT_nocond;
             MARK;
             for (int i=0; i<2; ++i) {
-                LD(x5, gback, 8*i);
+                LD(x5, gback, gdoffset+8*i);
                 SRL(x5, x5, x3);
-                SD(x5, gback, 8*i);
+                SD(x5, gback, gdoffset+8*i);
             }
             break;
         case 0xD4:
             INST_NAME("PADDQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, ADD(x3, x3, x4));
             break;
         case 0xD5:
             INST_NAME("PMULLW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
-                LH(x3, gback, 2*i);
+                LH(x3, gback, gdoffset+2*i);
                 LH(x4, wback, fixedaddress+2*i);
                 MULW(x3, x3, x4);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
         case 0xD6:
@@ -1381,314 +2080,347 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xD8:
             INST_NAME("PSUBUSB Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 SUB(x3, x3, x4);
                 NOT(x4, x3);
                 SRAI(x4, x4, 63);
                 AND(x3, x3, x4);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xD9:
             INST_NAME("PSUBUSW Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_W(x3, x4, SUB(x3, x3, x4); NOT(x4, x3); SRAI(x4, x4, 63); AND(x3, x3, x4));
             break;
         case 0xDA:
             INST_NAME("PMINUB Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 BLTU(x3, x4, 8);
                 MV(x3, x4);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xDB:
             INST_NAME("PAND Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, AND(x3, x3, x4));
             break;
         case 0xDC:
             INST_NAME("PADDUSB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             ADDI(x5, xZR, 0xFF);
             for(int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 ADD(x3, x3, x4);
                 BLT(x3, x5, 8);
                 ADDI(x3, xZR, 0xFF);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xDD:
             INST_NAME("PADDUSW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
                 // tmp32s = (int32_t)GX->uw[i] + EX->uw[i];
                 // GX->uw[i] = (tmp32s>65535)?65535:tmp32s;
-                LHU(x3, gback, i*2);
+                LHU(x3, gback, gdoffset+i*2);
                 LHU(x4, wback, fixedaddress+i*2);
                 ADDW(x3, x3, x4);
                 MOV32w(x4, 65536);
                 BLT(x3, x4, 8);
                 ADDIW(x3, x4, -1);
-                SH(x3, gback, i*2);
+                SH(x3, gback, gdoffset+i*2);
             }
             break;
         case 0xDE:
             INST_NAME("PMAXUB Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 BLTU(x4, x3, 8);
                 MV(x3, x4);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xDF:
             INST_NAME("PANDN Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, NOT(x3, x3); AND(x3, x3, x4));
             break;
          case 0xE0:
             INST_NAME("PAVGB Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 ADDW(x3, x3, x4);
                 ADDIW(x3, x3, 1);
                 SRAIW(x3, x3, 1);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xE1:
             INST_NAME("PSRAW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             ADDI(x4, xZR, 16);
             LD(x3, wback, fixedaddress);
             BLTU(x3, x4, 8);
             SUBI(x3, x4, 1);
             for (int i=0; i<8; ++i) {
-                LH(x4, gback, 2*i);
+                LH(x4, gback, gdoffset+2*i);
                 SRAW(x4, x4, x3);
-                SH(x4, gback, 2*i);
+                SH(x4, gback, gdoffset+2*i);
             }
             break;
         case 0xE2:
             INST_NAME("PSRAD Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             ADDI(x4, xZR, 32);
             LD(x3, wback, fixedaddress);
             BLTU(x3, x4, 8);
             SUBI(x3, x4, 1);
             for (int i=0; i<4; ++i) {
-                LW(x4, gback, 4*i);
+                LW(x4, gback, gdoffset+4*i);
                 SRAW(x4, x4, x3);
-                SW(x4, gback, 4*i);
+                SW(x4, gback, gdoffset+4*i);
             }
             break;
         case 0xE3:
             INST_NAME("PAVGW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<8; ++i) {
-                LHU(x3, gback, 2*i);
+                LHU(x3, gback, gdoffset+2*i);
                 LHU(x4, wback, fixedaddress+2*i);
                 ADDW(x3, x3, x4);
                 ADDIW(x3, x3, 1);
                 SRAIW(x3, x3, 1);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
         case 0xE4:
             INST_NAME("PMULHUW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
-                LHU(x3, gback, 2*i);
+                LHU(x3, gback, gdoffset+2*i);
                 LHU(x4, wback, fixedaddress+2*i);
                 MULW(x3, x3, x4);
                 SRLIW(x3, x3, 16);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
         case 0xE5:
             INST_NAME("PMULHW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
-                LH(x3, gback, 2*i);
+                LH(x3, gback, gdoffset+2*i);
                 LH(x4, wback, fixedaddress+2*i);
                 MULW(x3, x3, x4);
                 SRAIW(x3, x3, 16);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
+        case 0xE6:
+            INST_NAME("CVTTPD2DQ Gx, Ex");
+            nextop = F8;
+            GETGX();
+            GETEX(x2, 0);
+            v0 = fpu_get_scratch(dyn);
+            v1 = fpu_get_scratch(dyn);
+            FLD(v0, wback, fixedaddress+0);
+            FLD(v1, wback, fixedaddress+8);
+            if(!box64_dynarec_fastround) {
+                FSFLAGSI(0);  // // reset all bits
+            }
+            FCVTWD(x3, v0, RD_RTZ);
+            if(!box64_dynarec_fastround) {
+                FRFLAGS(x5);   // get back FPSR to check the IOC bit
+                ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+                BEQ_MARK(x5, xZR);
+                MOV32w(x3, 0x80000000);
+                MARK;
+                FSFLAGSI(0);  // // reset all bits
+            }
+            FCVTWD(x4, v1, RD_RTZ);
+            if(!box64_dynarec_fastround) {
+                FRFLAGS(x5);   // get back FPSR to check the IOC bit
+                ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+                BEQ_MARK2(x5, xZR);
+                MOV32w(x4, 0x80000000);
+                MARK2;
+            }
+            SW(x3, gback, gdoffset+0);
+            SW(x4, gback, gdoffset+4);
+            SD(xZR, gback, gdoffset+8);
+            break;
         case 0xE7:
             INST_NAME("MOVNTDQ Ex, Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q2(x3);
             break;
         case 0xE8:
             INST_NAME("PSUBSB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<16; ++i) {
                 // tmp16s = (int16_t)GX->sb[i] - EX->sb[i];
                 // GX->sb[i] = (tmp16s<-128)?-128:((tmp16s>127)?127:tmp16s);
-                LB(x3, gback, i);
+                LB(x3, gback, gdoffset+i);
                 LB(x4, wback, fixedaddress+i);
                 SUBW(x3, x3, x4);
                 SLLIW(x3, x3, 16);
                 SRAIW(x3, x3, 16);
                 ADDI(x4, xZR, 0x7f);
                 BLT(x3, x4, 12);     // tmp16s>127?
-                SB(x4, gback, i);
+                SB(x4, gback, gdoffset+i);
                 J(24);               // continue
                 ADDI(x4, xZR, 0xf80);
                 BLT(x4, x3, 12);     // tmp16s<-128?
-                SB(x4, gback, i);
+                SB(x4, gback, gdoffset+i);
                 J(8);                // continue
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xE9:
             INST_NAME("PSUBSW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
                 // tmp32s = (int32_t)GX->sw[i] - EX->sw[i];
                 // GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s);
-                LH(x3, gback, 2*i);
+                LH(x3, gback, gdoffset+2*i);
                 LH(x4, wback, fixedaddress+2*i);
                 SUBW(x3, x3, x4);
                 LUI(x4, 0xFFFF8); // -32768
                 BGE(x3, x4, 12);
-                SH(x4, gback, 2*i);
+                SH(x4, gback, gdoffset+2*i);
                 J(20); // continue
                 LUI(x4, 8); // 32768
                 BLT(x3, x4, 8);
                 ADDIW(x3, x4, -1);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
         case 0xEA:
             INST_NAME("PMINSW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<8; ++i) {
-                LH(x3, gback, 2*i);
+                LH(x3, gback, gdoffset+2*i);
                 LH(x4, wback, fixedaddress+2*i);
                 BLT(x3, x4, 8);
                 MV(x3, x4);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
         case 0xEB:
             INST_NAME("POR Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, OR(x3, x3, x4));
             break;
         case 0xEC:
             INST_NAME("PADDSB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<16; ++i) {
                 // tmp16s = (int16_t)GX->sb[i] + EX->sb[i];
                 // GX->sb[i] = (tmp16s>127)?127:((tmp16s<-128)?-128:tmp16s);
-                LB(x3, gback, i);
+                LB(x3, gback, gdoffset+i);
                 LB(x4, wback, fixedaddress+i);
                 ADDW(x3, x3, x4);
                 SLLIW(x3, x3, 16);
                 SRAIW(x3, x3, 16);
                 ADDI(x4, xZR, 0x7f);
                 BLT(x3, x4, 12);     // tmp16s>127?
-                SB(x4, gback, i);
+                SB(x4, gback, gdoffset+i);
                 J(24);               // continue
                 ADDI(x4, xZR, 0xf80);
                 BLT(x4, x3, 12);     // tmp16s<-128?
-                SB(x4, gback, i);
+                SB(x4, gback, gdoffset+i);
                 J(8);                // continue
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xED:
             INST_NAME("PADDSW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
                 // tmp32s = (int32_t)GX->sw[i] + EX->sw[i];
                 // GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s);
-                LH(x3, gback, 2*i);
+                LH(x3, gback, gdoffset+2*i);
                 LH(x4, wback, fixedaddress+2*i);
                 ADDW(x3, x3, x4);
                 LUI(x4, 0xFFFF8); // -32768
                 BGE(x3, x4, 12);
-                SH(x4, gback, 2*i);
+                SH(x4, gback, gdoffset+2*i);
                 J(20); // continue
                 LUI(x4, 8); // 32768
                 BLT(x3, x4, 8);
                 ADDIW(x3, x4, -1);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
         case 0xEE:
             INST_NAME("PMAXSW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_WS(x3, x4, BGE(x3, x4, 8); MV(x3, x4));
             break;
         case 0xEF:
             INST_NAME("PXOR Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             if(MODREG && gd==(nextop&7)+(rex.b<<3))
             {
                 // just zero dest
-                SD(xZR, x1, 0);
-                SD(xZR, x1, 8);
+                SD(xZR, gback, gdoffset+0);
+                SD(xZR, gback, gdoffset+8);
             } else {
                 GETEX(x2, 0);
                 SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
@@ -1697,102 +2429,102 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xF1:
             INST_NAME("PSLLQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             ADDI(x4, xZR, 16);
             LD(x3, wback, fixedaddress+0);
             BLTU_MARK(x3, x4);
             // just zero dest
-            SD(xZR, gback, 0);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+0);
+            SD(xZR, gback, gdoffset+8);
             B_NEXT_nocond;
             MARK;
             for (int i=0; i<8; ++i) {
-                LHU(x4, gback, 2*i);
+                LHU(x4, gback, gdoffset+2*i);
                 SLLW(x4, x4, x3);
-                SH(x4, gback, 2*i);
+                SH(x4, gback, gdoffset+2*i);
             }
             break;
         case 0xF2:
             INST_NAME("PSLLQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             ADDI(x4, xZR, 32);
             LD(x3, wback, fixedaddress+0);
             BLTU_MARK(x3, x4);
             // just zero dest
-            SD(xZR, gback, 0);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+0);
+            SD(xZR, gback, gdoffset+8);
             B_NEXT_nocond;
             MARK;
             for (int i=0; i<4; ++i) {
-                LWU(x4, gback, 4*i);
+                LWU(x4, gback, gdoffset+4*i);
                 SLLW(x4, x4, x3);
-                SW(x4, gback, 4*i);
+                SW(x4, gback, gdoffset+4*i);
             }
             break;
         case 0xF3:
             INST_NAME("PSLLQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             ADDI(x4, xZR, 64);
             LD(x3, wback, fixedaddress+0);
             BLTU_MARK(x3, x4);
             // just zero dest
-            SD(xZR, gback, 0);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+0);
+            SD(xZR, gback, gdoffset+8);
             B_NEXT_nocond;
             MARK;
             for (int i=0; i<2; ++i) {
-                LD(x4, gback, 8*i);
+                LD(x4, gback, gdoffset+8*i);
                 SLL(x4, x4, x3);
-                SD(x4, gback, 8*i);
+                SD(x4, gback, gdoffset+8*i);
             }
             break;
         case 0xF4:
             INST_NAME("PMULUDQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             // GX->q[1] = (uint64_t)EX->ud[2]*GX->ud[2];
-            LWU(x3, gback, 2*4);
+            LWU(x3, gback, gdoffset+2*4);
             LWU(x4, wback, fixedaddress+2*4);
             MUL(x3, x3, x4);
-            SD(x3, gback, 8);
+            SD(x3, gback, gdoffset+8);
             // GX->q[0] = (uint64_t)EX->ud[0]*GX->ud[0];
-            LWU(x3, gback, 0*4);
+            LWU(x3, gback, gdoffset+0*4);
             LWU(x4, wback, fixedaddress+0*4);
             MUL(x3, x3, x4);
-            SD(x3, gback, 0);
+            SD(x3, gback, gdoffset+0);
             break;
         case 0xF5:
             INST_NAME("PMADDWD Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<4; ++i) {
-                // GX->sd[i] = (int32_t)(GX->sw[i*2+0])*EX->sw[i*2+0] + 
+                // GX->sd[i] = (int32_t)(GX->sw[i*2+0])*EX->sw[i*2+0] +
                 //             (int32_t)(GX->sw[i*2+1])*EX->sw[i*2+1];
-                LH(x3, gback, 2*(i*2+0));
+                LH(x3, gback, gdoffset+2*(i*2+0));
                 LH(x4, wback, fixedaddress+2*(i*2+0));
                 MULW(x5, x3, x4);
-                LH(x3, gback, 2*(i*2+1));
+                LH(x3, gback, gdoffset+2*(i*2+1));
                 LH(x4, wback, fixedaddress+2*(i*2+1));
                 MULW(x6, x3, x4);
                 ADDW(x5, x5, x6);
-                SW(x5, gback, 4*i);
+                SW(x5, gback, gdoffset+4*i);
             }
             break;
         case 0xF6:
             INST_NAME("PSADBW Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             MV(x6, xZR);
             for (int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 SUBW(x3, x3, x4);
                 SRAIW(x5, x3, 31);
@@ -1801,7 +2533,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 ANDI(x3, x3, 0xff);
                 ADDW(x6, x6, x3);
                 if (i==7 || i == 15) {
-                    SD(x6, gback, i+1-8);
+                    SD(x6, gback, gdoffset+i+1-8);
                     if (i==7) MV(x6, xZR);
                 }
             }
@@ -1809,61 +2541,61 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xF8:
             INST_NAME("PSUBB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<16; ++i) {
                 // GX->sb[i] -= EX->sb[i];
                 LB(x3, wback, fixedaddress+i);
-                LB(x4, gback, i);
+                LB(x4, gback, gdoffset+i);
                 SUB(x3, x4, x3);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xF9:
             INST_NAME("PSUBW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_W(x3, x4, SUBW(x3, x3, x4));
             break;
         case 0xFA:
             INST_NAME("PSUBD Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_D(x3, x4, SUBW(x3, x3, x4));
             break;
         case 0xFB:
             INST_NAME("PSUBQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, SUB(x3, x3, x4));
             break;
         case 0xFC:
             INST_NAME("PADDB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<16; ++i) {
                 // GX->sb[i] += EX->sb[i];
-                LB(x3, gback, i);
+                LB(x3, gback, gdoffset+i);
                 LB(x4, wback, fixedaddress+i);
                 ADDW(x3, x3, x4);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xFD:
             INST_NAME("PADDW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_W(x3, x4, ADDW(x3, x3, x4));
             break;
         case 0xFE:
             INST_NAME("PADDD Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_D(x3, x4, ADDW(x3, x3, x4));
             break;
diff --git a/src/dynarec/rv64/dynarec_rv64_6664.c b/src/dynarec/rv64/dynarec_rv64_6664.c
new file mode 100644
index 00000000..a139e3ae
--- /dev/null
+++ b/src/dynarec/rv64/dynarec_rv64_6664.c
@@ -0,0 +1,77 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+
+#include "rv64_printer.h"
+#include "dynarec_rv64_private.h"
+#include "dynarec_rv64_helper.h"
+#include "dynarec_rv64_functions.h"
+
+uintptr_t dynarec64_6664(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog)
+{   // Emits RV64 code for one opcode of the 66 + segment-override path (per the file name — TODO confirm); only 0x8B is handled, anything else goes to DEFAULT. Returns the updated addr.
+    (void)ip; (void)need_epilog;
+
+    uint8_t opcode = F8;    // opcode byte fetched through the F8 macro
+    uint8_t nextop;
+    uint8_t gd, ed;
+    int64_t j64;
+    int v0, v1;             // NOTE(review): not referenced in the visible body and not covered by MAYUSE — confirm no macro needs them
+    int64_t fixedaddress;
+    int unscaled;
+    MAYUSE(j64);
+
+    GETREX();
+
+    switch(opcode) {
+        case 0x8B:
+            INST_NAME("MOV Gd, FS:Ed");     // the name is fixed to FS although the segment actually comes from the seg argument
+            nextop=F8;
+            GETGD;
+            if(MODREG) {   // reg <= reg
+                ed = xRAX+(nextop&7)+(rex.b<<3);    // source register selected by ModRM.rm plus REX.B
+                if(rex.w) {
+                    MV(gd, ed);             // REX.W: copy the whole register
+                } else {
+                    if(ed!=gd) {            // same source and destination: nothing to emit
+                        LUI(x1, 0xffff0);   // x1 = 0xffff0<<12; with RV64 LUI sign-extension this is a mask with only the low 16 bits clear
+                        AND(gd, gd, x1);    // keep everything in gd except its low 16 bits
+                        ZEXTH(x1, ed);      // x1 = low 16 bits of ed, zero-extended (per ZEXTH's name)
+                        OR(gd, gd, x1);     // merge the new low halfword into gd
+                    }
+                }
+            } else {                    // reg <= mem
+                grab_segdata(dyn, addr, ninst, x4, seg);    // presumably leaves the base of segment seg in x4 — confirm in the helper
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                ADD(x4, ed, x4);        // x4 = ed + x4, the base used by the loads below
+                if(rex.w) {
+                    LD(gd, x4, fixedaddress);   // REX.W: 64-bit load straight into gd
+                } else {
+                    LHU(x1, x4, fixedaddress);  // 16-bit load into x1
+                    SRLI(gd, gd, 16);           // shift right then left by 16: clears the low 16 bits of gd
+                    SLLI(gd, gd, 16);
+                    OR(gd, gd, x1);             // insert the loaded halfword
+                }
+            }
+            break;
+
+
+        default:
+            DEFAULT;    // every other opcode is left to the DEFAULT macro
+    }
+    return addr;
+}
diff --git a/src/dynarec/rv64/dynarec_rv64_66f0.c b/src/dynarec/rv64/dynarec_rv64_66f0.c
new file mode 100644
index 00000000..863e535d
--- /dev/null
+++ b/src/dynarec/rv64/dynarec_rv64_66f0.c
@@ -0,0 +1,129 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+
+#include "rv64_printer.h"
+#include "dynarec_rv64_private.h"
+#include "dynarec_rv64_helper.h"
+#include "dynarec_rv64_functions.h"
+
+
+uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{   // Emits RV64 code for one opcode of the 66 + F0 (LOCK) path (per the file name — TODO confirm); only 0x81/0x83 with sub-opcode 0 (ADD) is handled, anything else goes to DEFAULT. Returns the updated addr.
+    (void)ip; (void)rep; (void)need_epilog;
+
+    uint8_t opcode = F8;    // opcode byte fetched through the F8 macro
+    uint8_t nextop;
+    uint8_t gd, ed, u8;
+    uint8_t wback, wb1, wb2, gb1, gb2;
+    int32_t i32;
+    int64_t i64, j64;
+    int64_t fixedaddress;
+    int unscaled;
+    MAYUSE(gb1);
+    MAYUSE(gb2);
+    MAYUSE(wb1);
+    MAYUSE(wb2);
+    MAYUSE(j64);
+
+    while((opcode==0xF2) || (opcode==0xF3)) {   // consume any F2/F3 prefix bytes
+        rep = opcode-0xF1;                      // 1 for F2, 2 for F3
+        opcode = F8;
+    }
+
+    GETREX();
+
+    switch(opcode) {
+        case 0x81:      // 16-bit immediate form
+        case 0x83:      // 8-bit immediate form
+            nextop = F8;
+            SMDMB();    // emitted before and after the locked operation
+            switch((nextop>>3)&7) {     // ModRM.reg selects the operation
+                case 0: // ADD (the only sub-opcode handled here)
+                    if(opcode==0x81) {
+                        INST_NAME("LOCK ADD Ew, Iw");
+                    } else {
+                        INST_NAME("LOCK ADD Ew, Ib");
+                    }
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    if(MODREG) {    // register destination: plain 16-bit add, no LR/SC loop
+                        if(opcode==0x81) i32 = F16S; else i32 = F8S;
+                        ed = xRAX+(nextop&7)+(rex.b<<3);
+                        MOV32w(x5, i32);    // x5 = immediate
+                        ZEXTH(x6, ed);      // x6 = low 16 bits of ed (per ZEXTH's name)
+                        emit_add16(dyn, ninst, x6, x5, x3, x4, x2);
+                        SRLI(ed, ed, 16);   // shift right then left by 16: clears the low 16 bits of ed
+                        SLLI(ed, ed, 16);
+                        OR(ed, ed, x6);     // write x6 back into the low halfword of ed
+                    } else {
+                        addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, LOCK_LOCK, 0, (opcode==0x81)?2:1);
+                        if(opcode==0x81) i32 = F16S; else i32 = F8S;    // immediate is read after the addressing bytes
+                        MOV32w(x5, i32);
+
+                        ANDI(x3, wback, 0b10);  // test bit 1 of the address; NOTE(review): bit 0 is never examined — confirm odd addresses are acceptable for LR_W/SC_W
+                        BNEZ_MARK(x3);          // bit 1 set: take the upper-halfword path at MARK
+
+                        // lower 16 bits
+                        MARKLOCK;
+                        LR_W(x1, wback, 1, 1);  // x1 = current 32-bit word
+                        SRLIW(x3, x1, 16);      // x3 = word with its low 16 bits cleared (shift right then left)
+                        SLLIW(x3, x3, 16);
+                        ADD(x4, x1, x5);        // add the immediate
+                        SLLIW(x4, x4, 16);      // keep only the low 16 bits of the sum (shift left then right)
+                        SRLIW(x4, x4, 16);
+                        OR(x4, x4, x3);         // untouched upper half | new lower half
+                        SC_W(x3, x4, wback, 1, 1);  // x3 receives the store-conditional status
+                        BNEZ_MARKLOCK(x3);      // non-zero status: retry from MARKLOCK
+                        IFX(X_ALL|X_PEND) {
+                            SLLIW(x1, x1, 16);  // reduce x1 to the old 16-bit value for the flag computation at MARK3
+                            SRLIW(x1, x1, 16);
+                        }
+                        B_MARK3_nocond;
+
+                        MARK;
+                        // upper 16 bits
+                        XORI(wback, wback, 0b10);   // bit 1 is set on this path, so this clears it; NOTE(review): wback is modified in place — confirm geted never hands back a guest register here
+                        MARK2;
+                        LR_W(x1, wback, 1, 1);  // x1 = current 32-bit word
+                        SLLIW(x3, x1, 16);      // x3 = low 16 bits of the word (shift left then right)
+                        SRLIW(x3, x3, 16);
+                        SRLIW(x1, x1, 16);      // x1 = old upper halfword, moved down to bits 15..0
+                        ADD(x4, x1, x5);        // add the immediate
+                        SLLIW(x4, x4, 16);      // move the 16-bit sum back to the upper half
+                        OR(x4, x4, x3);         // new upper half | untouched lower half
+                        SC_W(x3, x4, wback, 1, 1);  // x3 receives the store-conditional status
+                        BNEZ_MARK2(x3);         // non-zero status: retry from MARK2
+
+                        MARK3;
+                        // final
+                        IFX(X_ALL|X_PEND) {
+                            emit_add16(dyn, ninst, x1, x5, x3, x4, x6);  // redo the add on the old value (x1) and immediate (x5), presumably only to produce the flags — confirm in emit_add16
+                        }
+                    }
+                    break;
+                default:
+                    DEFAULT;    // other sub-opcodes are left to the DEFAULT macro
+            }
+            SMDMB();
+            break;
+
+        default:
+            DEFAULT;    // every other opcode is left to the DEFAULT macro
+    }
+
+    return addr;
+}
\ No newline at end of file
diff --git a/src/dynarec/rv64/dynarec_rv64_67.c b/src/dynarec/rv64/dynarec_rv64_67.c
new file mode 100644
index 00000000..cb7702a8
--- /dev/null
+++ b/src/dynarec/rv64/dynarec_rv64_67.c
@@ -0,0 +1,574 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+
+#include "rv64_printer.h"
+#include "dynarec_rv64_private.h"
+#include "dynarec_rv64_helper.h"
+#include "dynarec_rv64_functions.h"
+
+uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    (void)ip; (void)need_epilog;
+    // Translates one opcode for dynarec64_67 (presumably the 0x67 address-size-prefix map, given the geted32/GETED32 helpers); unhandled opcodes go to DEFAULT. Returns the updated addr.
+    uint8_t opcode = F8;
+    uint8_t nextop;
+    uint8_t gd, ed, wback, wb, wb1, wb2, gb1, gb2, eb1, eb2;
+    int64_t fixedaddress;
+    int unscaled;
+    int8_t  i8;
+    uint8_t u8;
+    int32_t i32;
+    int64_t j64, i64;
+    int cacheupd = 0;
+    int lock;
+    int v0, v1, s0;
+    MAYUSE(i32);
+    MAYUSE(j64);
+    MAYUSE(v0);
+    MAYUSE(v1);
+    MAYUSE(s0);
+    MAYUSE(lock);
+    MAYUSE(cacheupd);
+
+    if(rex.is32bits) {
+        // 32-bit mode is not handled here (should be done in a different file)
+        DEFAULT;
+        return addr;
+    }
+
+    GETREX();
+
+    rep = 0;    // the incoming rep argument is discarded and recomputed from any F2/F3 bytes below
+    while((opcode==0xF2) || (opcode==0xF3)) {
+        rep = opcode-0xF1;  // 1 for F2, 2 for F3
+        opcode = F8;
+    }
+
+    switch(opcode) {
+
+        case 0x01:
+            INST_NAME("ADD Ed, Gd");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_add32(dyn, ninst, rex, ed, gd, x3, x4, x5);
+            WBACK;
+            break;
+        case 0x02:
+            INST_NAME("ADD Gb, Eb");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETEB32(x2, 0);
+            GETGB(x1);
+            emit_add8(dyn, ninst, x1, x2, x3, x4);
+            GBBACK(x4);
+            break;
+        case 0x03:
+            INST_NAME("ADD Gd, Ed");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_add32(dyn, ninst, rex, gd, ed, x3, x4, x5);
+            break;
+
+        case 0x05:
+            INST_NAME("ADD EAX, Id");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            i64 = F32S;
+            emit_add32c(dyn, ninst, rex, xRAX, i64, x3, x4, x5, x6);
+            break;
+
+        case 0x09:
+            INST_NAME("OR Ed, Gd");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_or32(dyn, ninst, rex, ed, gd, x3, x4);
+            WBACK;
+            break;
+        case 0x0A:
+            INST_NAME("OR Gb, Eb");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETEB32(x2, 0);
+            GETGB(x1);
+            emit_or8(dyn, ninst, x1, x2, x3, x4);
+            GBBACK(x4);
+            break;
+        case 0x0B:
+            INST_NAME("OR Gd, Ed");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_or32(dyn, ninst, rex, gd, ed, x3, x4);
+            break;
+
+        case 0x0D:
+            INST_NAME("OR EAX, Id");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            i64 = F32S;
+            emit_or32c(dyn, ninst, rex, xRAX, i64, x3, x4);
+            break;
+
+        case 0x0F:
+            opcode=F8;
+            switch(opcode) {
+                case 0x2E:
+                    // no special check... (falls through: UCOMISS shares the COMISS code below)
+                case 0x2F:
+                    switch (rep) {
+                        case 0:
+                            if(opcode==0x2F) {INST_NAME("COMISS Gx, Ex");} else {INST_NAME("UCOMISS Gx, Ex");}
+                            SETFLAGS(X_ALL, SF_SET);
+                            nextop = F8;
+                            GETGXSS(s0);
+                            if(MODREG) {
+                                v0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 1);
+                            } else {
+                                v0 = fpu_get_scratch(dyn);
+                                SMREAD();
+                                addr = geted32(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, 0);
+                                FLW(v0, ed, fixedaddress);
+                            }
+                            CLEAR_FLAGS();
+                            // if isnan(s0) || isnan(v0)
+                            IFX(X_ZF | X_PF | X_CF) {
+                                FEQS(x3, s0, s0);
+                                FEQS(x2, v0, v0);
+                                AND(x2, x2, x3);
+                                BNE_MARK(x2, xZR);
+                                ORI(xFlags, xFlags, (1<<F_ZF) | (1<<F_PF) | (1<<F_CF));
+                                B_NEXT_nocond;
+                            }
+                            MARK;
+                            // else if isless(s0, v0)
+                            IFX(X_CF) {
+                                FLTS(x2, s0, v0);
+                                BEQ_MARK2(x2, xZR);
+                                ORI(xFlags, xFlags, 1<<F_CF);
+                                B_NEXT_nocond;
+                            }
+                            MARK2;
+                            // else if s0 == v0
+                            IFX(X_ZF) {
+                                FEQS(x2, s0, v0);
+                                CBZ_NEXT(x2);
+                                ORI(xFlags, xFlags, 1<<F_ZF);
+                            }
+                            break;
+                        default:
+                            DEFAULT;
+                    }
+                    break;
+                default:
+                    DEFAULT;
+            }
+            break;
+
+        case 0x11:
+            INST_NAME("ADC Ed, Gd");
+            READFLAGS(X_CF);
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_adc32(dyn, ninst, rex, ed, gd, x3, x4, x5, x6);
+            WBACK;
+            break;
+
+        case 0x13:
+            INST_NAME("ADC Gd, Ed");
+            READFLAGS(X_CF);
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_adc32(dyn, ninst, rex, gd, ed, x3, x4, x5, x6);
+            break;
+
+        case 0x15:
+            INST_NAME("ADC EAX, Id");
+            READFLAGS(X_CF);
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            i64 = F32S;
+            MOV64xw(x1, i64);
+            emit_adc32(dyn, ninst, rex, xRAX, x1, x3, x4, x5, x6);
+            break;
+
+        case 0x19:
+            INST_NAME("SBB Ed, Gd");
+            READFLAGS(X_CF);
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_sbb32(dyn, ninst, rex, ed, gd, x3, x4, x5);
+            WBACK;
+            break;
+        case 0x1A:
+            INST_NAME("SBB Gb, Eb");
+            READFLAGS(X_CF);
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETEB32(x2, 0);
+            GETGB(x1);
+            emit_sbb8(dyn, ninst, x1, x2, x3, x4, x5);
+            GBBACK(x4);
+            break;
+        case 0x1B:
+            INST_NAME("SBB Gd, Ed");
+            READFLAGS(X_CF);
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_sbb32(dyn, ninst, rex, gd, ed, x3, x4, x5);
+            break;
+
+        case 0x1D:
+            INST_NAME("SBB EAX, Id");
+            READFLAGS(X_CF);
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            i64 = F32S;
+            MOV64xw(x2, i64);
+            emit_sbb32(dyn, ninst, rex, xRAX, x2, x3, x4, x5);
+            break;
+
+        case 0x21:
+            INST_NAME("AND Ed, Gd");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_and32(dyn, ninst, rex, ed, gd, x3, x4);
+            WBACK;
+            break;
+        case 0x22:
+            INST_NAME("AND Gb, Eb");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETEB32(x2, 0);
+            GETGB(x1);
+            emit_and8(dyn, ninst, x1, x2, x3, x4);
+            GBBACK(x4);
+            break;
+        case 0x23:
+            INST_NAME("AND Gd, Ed");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_and32(dyn, ninst, rex, gd, ed, x3, x4);
+            break;
+
+        case 0x25:
+            INST_NAME("AND EAX, Id");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            i64 = F32S;
+            emit_and32c(dyn, ninst, rex, xRAX, i64, x3, x4);
+            break;
+
+        case 0x29:
+            INST_NAME("SUB Ed, Gd");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_sub32(dyn, ninst, rex, ed, gd, x3, x4, x5);
+            WBACK;
+            break;
+        case 0x2A:
+            INST_NAME("SUB Gb, Eb");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETEB32(x2, 0);
+            GETGB(x1);
+            emit_sub8(dyn, ninst, x1, x2, x3, x4, x5);
+            GBBACK(x5);
+            break;
+        case 0x2B:
+            INST_NAME("SUB Gd, Ed");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_sub32(dyn, ninst, rex, gd, ed, x3, x4, x5);
+            break;
+
+        case 0x2D:
+            INST_NAME("SUB EAX, Id");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            i64 = F32S;
+            emit_sub32c(dyn, ninst, rex, xRAX, i64, x3, x4, x5, x6);
+            break;
+
+        case 0x31:
+            INST_NAME("XOR Ed, Gd");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_xor32(dyn, ninst, rex, ed, gd, x3, x4);
+            WBACK;
+            break;
+        case 0x32:
+            INST_NAME("XOR Gb, Eb");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETEB32(x2, 0);
+            GETGB(x1);
+            emit_xor8(dyn, ninst, x1, x2, x3, x4);
+            GBBACK(x4);
+            break;
+        case 0x33:
+            INST_NAME("XOR Gd, Ed");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_xor32(dyn, ninst, rex, gd, ed, x3, x4);
+            break;
+
+        case 0x35:
+            INST_NAME("XOR EAX, Id");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            i64 = F32S;
+            emit_xor32c(dyn, ninst, rex, xRAX, i64, x3, x4);
+            break;
+
+        case 0x38:
+            INST_NAME("CMP Eb, Gb");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETEB32(x1, 0);
+            GETGB(x2);
+            emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5, x6);
+            break;
+        case 0x39:
+            INST_NAME("CMP Ed, Gd");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_cmp32(dyn, ninst, rex, ed, gd, x3, x4, x5, x6);
+            break;
+        case 0x3A:
+            INST_NAME("CMP Gb, Eb");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETEB32(x2, 0);
+            GETGB(x1);
+            emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5, x6);
+            break;
+        case 0x3B:
+            INST_NAME("CMP Gd, Ed");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED32(0);
+            emit_cmp32(dyn, ninst, rex, gd, ed, x3, x4, x5, x6);
+            break;
+        case 0x3C:
+            INST_NAME("CMP AL, Ib");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            u8 = F8;
+            ANDI(x1, xRAX, 0xff);
+            if(u8) {
+                MOV32w(x2, u8);
+                emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5, x6);
+            } else {
+                emit_cmp8_0(dyn, ninst, x1, x3, x4);
+            }
+            break;
+        case 0x3D:
+            INST_NAME("CMP EAX, Id");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            i64 = F32S;
+            if(i64) {
+                MOV64xw(x2, i64);
+                emit_cmp32(dyn, ninst, rex, xRAX, x2, x3, x4, x5, x6);
+            } else
+                emit_cmp32_0(dyn, ninst, rex, xRAX, x3, x4);
+            break;
+
+        case 0x81:
+        case 0x83:
+            nextop = F8;
+            switch((nextop>>3)&7) {  // ModRM reg field selects the ALU operation; 0x81 takes a 32-bit immediate, 0x83 a sign-extended 8-bit one
+                case 0: //ADD
+                    if(opcode==0x81) {INST_NAME("ADD Ed, Id");} else {INST_NAME("ADD Ed, Ib");}
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETED32((opcode==0x81)?4:1);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    emit_add32c(dyn, ninst, rex, ed, i64, x3, x4, x5, x6);
+                    WBACK;
+                    break;
+                case 1: //OR
+                    if(opcode==0x81) {INST_NAME("OR Ed, Id");} else {INST_NAME("OR Ed, Ib");}
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETED32((opcode==0x81)?4:1);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    emit_or32c(dyn, ninst, rex, ed, i64, x3, x4);
+                    WBACK;
+                    break;
+                case 2: //ADC
+                    if(opcode==0x81) {INST_NAME("ADC Ed, Id");} else {INST_NAME("ADC Ed, Ib");}
+                    READFLAGS(X_CF);
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETED32((opcode==0x81)?4:1);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    MOV64xw(x5, i64);
+                    emit_adc32(dyn, ninst, rex, ed, x5, x3, x4, x5, x6);  // NOTE(review): x5 is passed both as the operand and as a scratch argument -- confirm emit_adc32 tolerates the aliasing
+                    WBACK;
+                    break;
+                case 3: //SBB
+                    if(opcode==0x81) {INST_NAME("SBB Ed, Id");} else {INST_NAME("SBB Ed, Ib");}
+                    READFLAGS(X_CF);
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETED32((opcode==0x81)?4:1);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    MOV64xw(x5, i64);
+                    emit_sbb32(dyn, ninst, rex, ed, x5, x3, x4, x5);  // NOTE(review): x5 is passed both as the operand and as a scratch argument -- confirm emit_sbb32 tolerates the aliasing
+                    WBACK;
+                    break;
+                case 4: //AND
+                    if(opcode==0x81) {INST_NAME("AND Ed, Id");} else {INST_NAME("AND Ed, Ib");}
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETED32((opcode==0x81)?4:1);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    emit_and32c(dyn, ninst, rex, ed, i64, x3, x4);
+                    WBACK;
+                    break;
+                case 5: //SUB
+                    if(opcode==0x81) {INST_NAME("SUB Ed, Id");} else {INST_NAME("SUB Ed, Ib");}
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETED32((opcode==0x81)?4:1);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    emit_sub32c(dyn, ninst, rex, ed, i64, x3, x4, x5, x6);
+                    WBACK;
+                    break;
+                case 6: //XOR
+                    if(opcode==0x81) {INST_NAME("XOR Ed, Id");} else {INST_NAME("XOR Ed, Ib");}
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETED32((opcode==0x81)?4:1);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    emit_xor32c(dyn, ninst, rex, ed, i64, x3, x4);
+                    WBACK;
+                    break;
+                case 7: //CMP
+                    if(opcode==0x81) {INST_NAME("CMP Ed, Id");} else {INST_NAME("CMP Ed, Ib");}
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    GETED32((opcode==0x81)?4:1);
+                    if(opcode==0x81) i64 = F32S; else i64 = F8S;
+                    if(i64) {
+                        MOV64xw(x2, i64);
+                        emit_cmp32(dyn, ninst, rex, ed, x2, x3, x4, x5, x6);
+                    } else
+                        emit_cmp32_0(dyn, ninst, rex, ed, x3, x4);
+                    break;
+            }
+            break;
+
+        case 0x88:
+            INST_NAME("MOV Eb, Gb");
+            nextop = F8;
+            gd = ((nextop&0x38)>>3)+(rex.r<<3);
+            if(rex.rex) {
+                gb2 = 0;
+                gb1 = xRAX + gd;
+            } else {
+                gb2 = ((gd&4)>>2);  // without REX, register numbers 4-7 select the high byte of the first four registers
+                gb1 = xRAX+(gd&3);
+            }
+            gd = x4;
+            if(gb2) {
+                SRLI(x4, gb1, 8);  // bring the high byte down to bits 0-7
+                gb1 = x4;
+            }
+            if(MODREG) {
+                ed = (nextop&7) + (rex.b<<3);
+                if(rex.rex) {
+                    eb1 = xRAX+ed;
+                    eb2 = 0;
+                } else {
+                    eb1 = xRAX+(ed&3);  // Ax, Cx, Dx or Bx
+                    eb2 = ((ed&4)>>2);    // L or H
+                }
+                ANDI(gd, gb1, 0xff);
+                if(eb2) {
+                    MOV64x(x1, 0xffffffffffff00ffLL);
+                    AND(x1, eb1, x1);
+                    SLLI(gd, gd, 8);
+                    OR(eb1, x1, gd);
+                } else {
+                    ANDI(x1, eb1, ~0xff);
+                    OR(eb1, x1, gd);
+                }
+            } else {
+                addr = geted32(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0);
+                SB(gb1, ed, fixedaddress);
+                SMWRITELOCK(lock);
+            }
+            break;
+        case 0x89:
+            INST_NAME("MOV Ed, Gd");
+            nextop=F8;
+            GETGD;
+            if(MODREG) {   // reg <= reg
+                MVxw(xRAX+(nextop&7)+(rex.b<<3), gd);
+            } else {                    // mem <= reg
+                addr = geted32(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0);
+                SDxw(gd, ed, fixedaddress);
+                SMWRITELOCK(lock);
+            }
+            break;
+        case 0x8B:
+            INST_NAME("MOV Gd, Ed");
+            nextop=F8;
+            GETGD;
+            if(MODREG) {
+                MVxw(gd, xRAX+(nextop&7)+(rex.b<<3));
+            } else {
+                addr = geted32(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0);
+                SMREADLOCK(lock);
+                LDxw(gd, ed, fixedaddress);
+            }
+            break;
+        case 0x8D:
+            INST_NAME("LEA Gd, Ed");
+            nextop=F8;
+            GETGD;
+            if(MODREG) {   // reg <= reg? that's an invalid operation
+                DEFAULT;
+            } else {                    // reg <= effective address
+                addr = geted32(dyn, addr, ninst, nextop, &ed, gd, x1, &fixedaddress, rex, NULL, 0, 0);
+                if(ed!=gd) {
+                    AND(gd, ed, xMASK);  // NOTE(review): presumably keeps only the low 32 bits -- confirm the value of xMASK
+                }
+            }
+            break;
+        default:
+            DEFAULT;
+    }
+    return addr;
+}
diff --git a/src/dynarec/rv64/dynarec_rv64_d8.c b/src/dynarec/rv64/dynarec_rv64_d8.c
index beadb202..7f14468b 100644
--- a/src/dynarec/rv64/dynarec_rv64_d8.c
+++ b/src/dynarec/rv64/dynarec_rv64_d8.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -50,13 +49,73 @@ uintptr_t dynarec64_D8(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0xD0 ... 0xD7:
 
         case 0xD8 ... 0xDF:
-
+            INST_NAME("FCOMP ST0, STx");
+            v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
+            v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            LHU(x3, xEmu, offsetof(x64emu_t, sw));
+            MOV32w(x1, 0b1110100011111111); // mask off c0,c1,c2,c3
+            AND(x3, x3, x1);
+            if(ST_IS_F(0)) {
+                FEQS(x5, v1, v1);
+                FEQS(x4, v2, v2);
+                AND(x5, x5, x4);
+                BEQZ(x5, 24); // undefined/NaN
+                FEQS(x5, v1, v2);
+                BNEZ(x5, 28); // equal
+                FLTS(x3, v1, v2); // x3 = (v1<v2)?1:0
+                SLLI(x1, x3, 8);
+                J(20); // end
+                // undefined/NaN
+                LUI(x1, 1);
+                ADDI(x1, x1, 0b010100000000);
+                J(8); // end
+                // equal
+                LUI(x1, 1);
+                // end
+            } else {
+                FEQD(x5, v1, v1);
+                FEQD(x4, v2, v2);
+                AND(x5, x5, x4);
+                BEQZ(x5, 24); // undefined/NaN
+                FEQD(x5, v1, v2);
+                BNEZ(x5, 28); // equal
+                FLTD(x3, v1, v2); // x3 = (v1<v2)?1:0
+                SLLI(x1, x3, 8);
+                J(20); // end
+                // undefined/NaN
+                LUI(x1, 1);
+                ADDI(x1, x1, 0b010100000000);
+                J(8); // end
+                // equal
+                LUI(x1, 1);
+                // end
+            }
+            OR(x3, x3, x1);
+            SH(x3, xEmu, offsetof(x64emu_t, sw));
+            x87_do_pop(dyn, ninst, x3);
+            break;
         case 0xE0 ... 0xE7:
-
+            INST_NAME("FSUB ST0, STx");
+            v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
+            v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(ST_IS_F(0)) {
+                FSUBS(v1, v1, v2);
+            } else {
+                FSUBD(v1, v1, v2);
+            }
+            break;
         case 0xE8 ... 0xEF:
 
         case 0xF0 ... 0xF7:
-
+            INST_NAME("FDIV ST0, STx");
+            v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
+            v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(ST_IS_F(0)) {
+                FDIVS(v1, v1, v2);
+            } else {
+                FDIVD(v1, v1, v2);
+            }
+            break;
         case 0xF8 ... 0xFF:
             DEFAULT;
             break;
diff --git a/src/dynarec/rv64/dynarec_rv64_d9.c b/src/dynarec/rv64/dynarec_rv64_d9.c
index 9378c650..4940d6b4 100644
--- a/src/dynarec/rv64/dynarec_rv64_d9.c
+++ b/src/dynarec/rv64/dynarec_rv64_d9.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -34,13 +33,16 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
     uint8_t u8;
     int64_t fixedaddress;
     int unscaled;
-    int v1, v2;
+    int v0, v1, v2;
     int s0;
     int i1, i2, i3;
+    int64_t j64;
 
     MAYUSE(s0);
-    MAYUSE(v2);
+    MAYUSE(v0);
     MAYUSE(v1);
+    MAYUSE(v2);
+    MAYUSE(j64);
 
     switch(nextop) {
         case 0xC0:
@@ -260,7 +262,12 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             break;
         case 0xFA:
             INST_NAME("FSQRT");
-            DEFAULT;
+            v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+            if(ST_IS_F(0)) {
+                FSQRTS(v1, v1);
+            } else {
+                FSQRTD(v1, v1);
+            }
             break;
         case 0xFB:
             INST_NAME("FSINCOS");
@@ -271,7 +278,43 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             break;
         case 0xFC:
             INST_NAME("FRNDINT");
-            DEFAULT;
+            v0 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+            v1 = fpu_get_scratch(dyn);
+            v2 = fpu_get_scratch(dyn);
+            u8 = x87_setround(dyn, ninst, x1, x2);
+
+            if(ST_IS_F(0)) {
+                FEQS(x2, v0, v0);
+                BNEZ_MARK(x2);
+                B_NEXT_nocond;
+                MARK; // v0 is not nan
+                FABSS(v1, v0);
+                MOV64x(x3, 1ULL << __FLT_MANT_DIG__);
+                FCVTSL(v2, x3, RD_RTZ);
+                FLTS(x3, v1, v2);
+                BNEZ_MARK2(x3);
+                B_NEXT_nocond;
+                MARK2;
+                FCVTLS(x3, v0, RD_DYN);
+                FCVTSL(v1, x3, RD_DYN);
+                FSGNJS(v0, v1, v0);
+            } else {
+                FEQD(x2, v0, v0);
+                BNEZ_MARK(x2);
+                B_NEXT_nocond;
+                MARK; // v0 is not nan
+                FABSD(v1, v0);
+                MOV64x(x3, 1ULL << __DBL_MANT_DIG__);
+                FCVTDL(v2, x3, RD_RTZ);
+                FLTD(x3, v1, v2);
+                BNEZ_MARK2(x3);
+                B_NEXT_nocond;
+                MARK2;
+                FCVTLD(x3, v0, RD_DYN);
+                FCVTDL(v1, x3, RD_DYN);
+                FSGNJD(v0, v1, v0);
+            }
+            x87_restoreround(dyn, ninst, u8);
             break;
         case 0xFD:
             INST_NAME("FSCALE");
diff --git a/src/dynarec/rv64/dynarec_rv64_db.c b/src/dynarec/rv64/dynarec_rv64_db.c
index 95e350c0..7a5dddb0 100644
--- a/src/dynarec/rv64/dynarec_rv64_db.c
+++ b/src/dynarec/rv64/dynarec_rv64_db.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -150,7 +149,45 @@ uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0xEE:
         case 0xEF:
             INST_NAME("FUCOMI ST0, STx");
-            DEFAULT;
+            SETFLAGS(X_ALL, SF_SET);
+            SET_DFNONE();
+            v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
+            v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            IFX(F_ZF | F_PF | F_CF) {
+                if(ST_IS_F(0)) {
+                    FEQS(x5, v1, v1);
+                    FEQS(x4, v2, v2);
+                    AND(x5, x5, x4);
+                    BEQZ(x5, 24); // undefined/NaN
+                    FEQS(x5, v1, v2);
+                    BNEZ(x5, 24); // equal
+                    FLTS(x3, v1, v2); // x3 = (v1<v2)?1:0
+                    OR(xFlags, xFlags, x3); // CF is the least significant bit
+                    J(16); // end
+                    // NaN
+                    ORI(xFlags, xFlags, (1<<F_ZF) | (1<<F_PF) | (1<<F_CF));
+                    J(8); // end
+                    // equal
+                    ORI(xFlags, xFlags, 1<<F_ZF);
+                    // end
+                } else {
+                    FEQD(x5, v1, v1);
+                    FEQD(x4, v2, v2);
+                    AND(x5, x5, x4);
+                    BEQZ(x5, 24); // undefined/NaN
+                    FEQD(x5, v1, v2);
+                    BNEZ(x5, 24); // equal
+                    FLTD(x3, v1, v2); // x3 = (v1<v2)?1:0
+                    OR(xFlags, xFlags, x3); // CF is the least significant bit
+                    J(16); // end
+                    // NaN
+                    ORI(xFlags, xFlags, (1<<F_ZF) | (1<<F_PF) | (1<<F_CF));
+                    J(8); // end
+                    // equal
+                    ORI(xFlags, xFlags, 1<<F_ZF);
+                    // end
+                }
+            }
             break;
         case 0xF0:  
         case 0xF1:
@@ -191,7 +228,24 @@ uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     break;
                 case 3:
                     INST_NAME("FISTP Ed, ST0");
-                    DEFAULT;
+                    v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_D);
+                    u8 = x87_setround(dyn, ninst, x1, x2);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+                    v2 = fpu_get_scratch(dyn);
+                    if(!box64_dynarec_fastround) {
+                        FSFLAGSI(0); // reset all bits
+                    }
+                    FCVTWD(x4, v1, RD_DYN);
+                    x87_restoreround(dyn, ninst, u8);
+                    if(!box64_dynarec_fastround) {
+                        FRFLAGS(x5);   // get back FPSR to check the IOC bit
+                        ANDI(x5, x5, 1<<FR_NV);
+                        BEQ_MARK2(x5, xZR);
+                        MOV32w(x4, 0x80000000);
+                    }
+                    MARK2;
+                    SW(x4, wback, fixedaddress);
+                    x87_do_pop(dyn, ninst, x3);
                     break;
                 case 5:
                     INST_NAME("FLD tbyte");
diff --git a/src/dynarec/rv64/dynarec_rv64_dc.c b/src/dynarec/rv64/dynarec_rv64_dc.c
new file mode 100644
index 00000000..d802e2fb
--- /dev/null
+++ b/src/dynarec/rv64/dynarec_rv64_dc.c
@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "emu/x87emu_private.h"
+#include "dynarec_native.h"
+
+#include "rv64_printer.h"
+#include "dynarec_rv64_private.h"
+#include "dynarec_rv64_helper.h"
+#include "dynarec_rv64_functions.h"
+
+
+// Emits RV64 code for one x87 opcode handled by dynarec64_DC; only FCOMP/FDIV with a double memory operand are native, the rest go to DEFAULT. Returns addr (advanced by geted() for memory forms).
+uintptr_t dynarec64_DC(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    (void)ip; (void)rep; (void)need_epilog;
+
+    uint8_t nextop = F8;
+    uint8_t wback;
+    int64_t fixedaddress;
+    int unscaled;
+    int v1, v2;
+
+    MAYUSE(v2);
+    MAYUSE(v1);
+
+    switch(nextop) {
+        case 0xC0 ... 0xC7:
+            INST_NAME("FADD STx, ST0");
+            DEFAULT;
+            break;
+        case 0xC8 ... 0xCF:
+            INST_NAME("FMUL STx, ST0");
+            DEFAULT;
+            break;
+        case 0xD0 ... 0xD7:
+            INST_NAME("FCOM ST0, STx"); //yep
+            DEFAULT;
+            break;
+        case 0xD8 ... 0xDF:
+            INST_NAME("FCOMP ST0, STx");
+            DEFAULT;
+            break;
+        case 0xE0 ... 0xE7:
+            INST_NAME("FSUBR STx, ST0");
+            DEFAULT;
+            break;
+        case 0xE8 ... 0xEF:
+            INST_NAME("FSUB STx, ST0");
+            DEFAULT;
+            break;
+        case 0xF0 ... 0xF7:
+            INST_NAME("FDIVR STx, ST0");
+            DEFAULT;
+            break;
+        case 0xF8 ... 0xFF:
+            INST_NAME("FDIV STx, ST0");
+            DEFAULT;
+            break;
+        default:
+            switch((nextop>>3)&7) {
+                case 3:
+                    INST_NAME("FCOMP ST0, double[ED]");
+                    v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_D);
+                    v2 = fpu_get_scratch(dyn);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                    FLD(v2, wback, fixedaddress);
+
+                    LHU(x3, xEmu, offsetof(x64emu_t, sw));
+                    MOV32w(x1, 0b1011100011111111); // mask off c0,c1,c2,c3 (bits 8,9,10 and 14; bits 11-13 are TOP and must be kept)
+                    AND(x3, x3, x1);
+                    FEQD(x5, v1, v1);
+                    FEQD(x4, v2, v2);
+                    AND(x5, x5, x4);
+                    BEQZ(x5, 24); // undefined/NaN
+                    FEQD(x5, v1, v2);
+                    BNEZ(x5, 28); // equal
+                    FLTD(x1, v1, v2); // x1 = (v1<v2)?1:0 -- must not target x3, which holds the masked status word
+                    SLLI(x1, x1, 8); // C0 (bit 8) set when ST0 < operand
+                    J(20); // end
+                    // undefined/NaN
+                    LUI(x1, 4); // C3 (bit 14)
+                    ADDI(x1, x1, 0b010100000000); // plus C2 and C0: unordered result
+                    J(8); // end
+                    // equal
+                    LUI(x1, 4); // C3 (bit 14) only
+                    // end
+                    OR(x3, x3, x1);
+                    SH(x3, xEmu, offsetof(x64emu_t, sw));
+
+                    x87_do_pop(dyn, ninst, x3);
+                    break;
+                case 6:
+                    INST_NAME("FDIV ST0, double[ED]");
+                    v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_D);
+                    v2 = fpu_get_scratch(dyn);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                    FLD(v2, wback, fixedaddress);
+                    FDIVD(v1, v1, v2);
+                    break;
+                default:
+                    DEFAULT;
+            }
+    }
+    return addr;
+}
diff --git a/src/dynarec/rv64/dynarec_rv64_dd.c b/src/dynarec/rv64/dynarec_rv64_dd.c
new file mode 100644
index 00000000..044f9aab
--- /dev/null
+++ b/src/dynarec/rv64/dynarec_rv64_dd.c
@@ -0,0 +1,179 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "emu/x87emu_private.h"
+#include "dynarec_native.h"
+
+#include "rv64_printer.h"
+#include "dynarec_rv64_private.h"
+#include "dynarec_rv64_helper.h"
+#include "dynarec_rv64_functions.h"
+
+// Emit RV64 code for one x87 instruction of the 0xDD opcode group (per the function name): decodes the next byte, emits the matching sequence, returns the updated addr.
+uintptr_t dynarec64_DD(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    (void)ip; (void)rep; (void)need_epilog; // silence unused-parameter warnings
+
+    uint8_t nextop = F8; // byte that selects the operation (F8 is defined elsewhere; presumably fetches one byte at addr -- confirm)
+    uint8_t ed, wback; // base register for the memory operand, set through geted()
+    int64_t fixedaddress; // offset set through geted(), used together with ed/wback in loads/stores
+    int unscaled; // NOTE(review): not referenced in the visible body of this function -- confirm it is needed
+    int v1, v2;
+    int s0;
+    int64_t j64;
+
+    MAYUSE(s0); // MAYUSE is defined elsewhere; presumably marks variables that may legitimately go unused
+    MAYUSE(v2);
+    MAYUSE(v1);
+    MAYUSE(j64);
+
+    switch(nextop) { // explicit values 0xC0..0xFF are listed first; everything else goes to the memory forms in default
+        case 0xC0: // 0xC0..0xC7: nextop&7 is passed on as the STx index
+        case 0xC1:
+        case 0xC2:
+        case 0xC3:
+        case 0xC4:
+        case 0xC5:
+        case 0xC6:
+        case 0xC7:
+            INST_NAME("FFREE STx");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_purgecache(dyn, ninst, 0, x1, x2, x3); // presumably writes the cached x87 registers back before the helper call -- confirm
+            MOV32w(x1, nextop&7); // x1 = STx index (presumably the helper's argument -- confirm calling convention)
+            CALL(fpu_do_free, -1);
+            break;
+        case 0xD0: // 0xD0..0xD7: named only, then handed to DEFAULT (macro defined elsewhere)
+        case 0xD1:
+        case 0xD2:
+        case 0xD3:
+        case 0xD4:
+        case 0xD5:
+        case 0xD6:
+        case 0xD7:
+            INST_NAME("FST ST0, STx");
+            DEFAULT;
+            break;
+        case 0xD8:
+            INST_NAME("FSTP ST0, ST0");
+            x87_do_pop(dyn, ninst, x3); // source and destination are both ST0, so only the pop is emitted
+            break;
+        case 0xD9: // 0xD9..0xDF: destination index is nextop&7 (1..7)
+        case 0xDA:
+        case 0xDB:
+        case 0xDC:
+        case 0xDD:
+        case 0xDE:
+        case 0xDF:
+            INST_NAME("FSTP ST0, STx");
+            // copy the cache value for st0 to stx
+            x87_get_st_empty(dyn, ninst, x1, x2, nextop&7, X87_ST(nextop&7));
+            x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+            x87_swapreg(dyn, ninst, x1, x2, 0, nextop&7); // swap slots 0 and nextop&7, then pop below
+            x87_do_pop(dyn, ninst, x3);
+            break;
+        case 0xE0: // 0xE0..0xE7: named only, then handed to DEFAULT
+        case 0xE1:
+        case 0xE2:
+        case 0xE3:
+        case 0xE4:
+        case 0xE5:
+        case 0xE6:
+        case 0xE7:
+            INST_NAME("FUCOM ST0, STx");
+            DEFAULT;
+            break;
+        case 0xE8: // 0xE8..0xEF: named only, then handed to DEFAULT
+        case 0xE9:
+        case 0xEA:
+        case 0xEB:
+        case 0xEC:
+        case 0xED:
+        case 0xEE:
+        case 0xEF:
+            INST_NAME("FUCOMP ST0, STx");
+            DEFAULT;
+            break;
+        case 0xC8: // 0xC8..0xCF and 0xF0..0xFF: no name is set, straight to DEFAULT
+        case 0xC9:
+        case 0xCA:
+        case 0xCB:
+        case 0xCC:
+        case 0xCD:
+        case 0xCE:
+        case 0xCF:
+        case 0xF0:
+        case 0xF1:
+        case 0xF2:
+        case 0xF3:
+        case 0xF4:
+        case 0xF5:
+        case 0xF6:
+        case 0xF7:
+        case 0xF8:
+        case 0xF9:
+        case 0xFA:
+        case 0xFB:
+        case 0xFC:
+        case 0xFD:
+        case 0xFE:
+        case 0xFF:
+            DEFAULT;
+            break;
+
+        default: // nextop below 0xC0: memory-operand forms
+            switch((nextop>>3)&7) { // sub-operation is taken from bits 3..5 of nextop
+                case 0:
+                    INST_NAME("FLD double");
+                    v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D); // push first; v1 is the FP register for the new ST0
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                    FLD(v1, wback, fixedaddress); // load the double at wback+fixedaddress into v1
+                    break;
+                case 2:
+                    INST_NAME("FST double");
+                    v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_D);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                    FSD(v1, wback, fixedaddress); // store ST0 as a double, no pop
+                    break;
+                case 3:
+                    INST_NAME("FSTP double");
+                    v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_D);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                    FSD(v1, wback, fixedaddress); // same store as case 2 ...
+                    x87_do_pop(dyn, ninst, x3); // ... followed by a pop
+                    break;
+                case 7:
+                    INST_NAME("FNSTSW m2byte");
+                    fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x4, x6, &fixedaddress, rex, NULL, 0, 0);
+                    LWU(x2, xEmu, offsetof(x64emu_t, top)); // x2 = emu->top
+                    LHU(x3, xEmu, offsetof(x64emu_t, sw)); // x3 = emu->sw (16 bits)
+                    if(dyn->e.x87stack) {
+                        // update top: subtract the pending x87stack count and wrap to 0..7
+                        ADDI(x2, x2, -dyn->e.x87stack);
+                        ANDI(x2, x2, 7);
+                    }
+                    MOV32w(x5, ~0x3800); // ~0x3800 has bits 11..13 cleared
+                    AND(x3, x3, x5);    // mask out TOP
+                    SLLI(x2, x2, 11);   // shift TOP to bit 11
+                    OR(x3, x3, x2);     // inject TOP
+                    SH(x3, ed, fixedaddress);   // store whole sw flags (only to the memory operand; emu->sw is not written back here)
+                    break;
+                default: // sub-operations 1, 4, 5 and 6 are not handled here
+                    DEFAULT;
+            }
+    }
+    return addr; // addr as advanced by geted() for the memory forms
+}
diff --git a/src/dynarec/rv64/dynarec_rv64_de.c b/src/dynarec/rv64/dynarec_rv64_de.c
index 1511c6ef..a2341b40 100644
--- a/src/dynarec/rv64/dynarec_rv64_de.c
+++ b/src/dynarec/rv64/dynarec_rv64_de.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
diff --git a/src/dynarec/rv64/dynarec_rv64_df.c b/src/dynarec/rv64/dynarec_rv64_df.c
index a96a45f1..de99b02a 100644
--- a/src/dynarec/rv64/dynarec_rv64_df.c
+++ b/src/dynarec/rv64/dynarec_rv64_df.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -41,15 +40,29 @@ uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
 
     switch(nextop) {
         case 0xC0 ... 0xC7:
-        
-        case 0xE0:
-        
-        case 0xE8 ... 0xEF:
+            INST_NAME("FFREEP STx");
             DEFAULT;
             break;
-        
-        case 0xF0 ... 0xF7:
-            INST_NAME("FCOMIP ST0, STx");
+
+        case 0xE0:
+            INST_NAME("FNSTSW AX");
+            LHU(x2, xEmu, offsetof(x64emu_t, top));
+            LHU(x1, xEmu, offsetof(x64emu_t, sw));
+            MOV32w(x3, 0b1100011111111111); // mask
+            AND(x1, x1, x3);
+            SLLI(x2, x2, 11);
+            OR(x1, x1, x2); // inject top
+            SH(x1, xEmu, offsetof(x64emu_t, sw));
+            SRLI(xRAX, xRAX, 16);
+            SLLI(xRAX, xRAX, 16);
+            OR(xRAX, xRAX, x1);
+            break;
+        case 0xE8 ... 0xF7:
+            if (nextop < 0xF0) {
+                INST_NAME("FUCOMIP ST0, STx");
+            } else {
+                INST_NAME("FCOMIP ST0, STx");
+            }
             SETFLAGS(X_ALL, SF_SET);
             SET_DFNONE();
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
@@ -114,9 +127,9 @@ uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 case 1:
                     INST_NAME("FISTTP Ew, ST0");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_F);
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 0, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 1, 0);
                     if(!box64_dynarec_fastround) {
-                        FSFLAGSI(xZR); // reset all bits
+                        FSFLAGSI(0); // reset all bits
                     }
                     FCVTWD(x4, v1, RD_RTZ);
                     if(!box64_dynarec_fastround) {
@@ -136,12 +149,12 @@ uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 case 3:
                     INST_NAME("FISTP Ew, ST0");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_F);
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 0, 0);
-                    u8 = sse_setround(dyn, ninst, x2, x3);
+                    u8 = x87_setround(dyn, ninst, x1, x2);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
                     if(!box64_dynarec_fastround) {
-                        FSFLAGSI(xZR); // reset all bits
+                        FSFLAGSI(0); // reset all bits
                     }
-                    FCVTWD(x4, v1, RD_RM);
+                    FCVTWD(x4, v1, RD_DYN);
                     x87_restoreround(dyn, ninst, u8);
                     if(!box64_dynarec_fastround) {
                         FRFLAGS(x5);   // get back FPSR to check the IOC bit
@@ -157,6 +170,71 @@ uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     SH(x4, wback, fixedaddress);
                     x87_do_pop(dyn, ninst, x3);
                     break;
+                case 5:
+                    INST_NAME("FILD ST0, i64");
+                    v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+                    LD(x1, wback, fixedaddress);
+                    if (rex.is32bits) {
+                        // need to also feed the STll stuff...
+                        ADDI(x4, xEmu, offsetof(x64emu_t, fpu_ll));
+                        LWU(x5, xEmu, offsetof(x64emu_t, top));
+                        int a = 0 - dyn->e.x87stack;
+                        if(a) {
+                            ADDIW(x5, x5, a);
+                            ANDI(x5, x5, 0x7);
+                        }
+                        SLLI(x5, x5, 4); // fpu_ll is 2 i64
+                        ADD(x5, x5, x4);
+                        SD(x1, x5, 8);   // ll
+                    }
+                    FCVTDL(v1, x1, RD_RTZ);
+                    if(rex.is32bits) {
+                        FSD(v1, x5, 0);  // ref
+                    }
+                    break;
+                case 7:
+                    INST_NAME("FISTP i64, ST0");
+                    v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_D);
+                    u8 = x87_setround(dyn, ninst, x1, x2);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+
+                    if(rex.is32bits) {
+                        // need to check STll first...
+                        ADDI(x4, xEmu, offsetof(x64emu_t, fpu_ll));
+                        LWU(x5, xEmu, offsetof(x64emu_t, top));
+                        int a = 0 - dyn->e.x87stack;
+                        if(a) {
+                            ADDIW(x5, x5, a);
+                            ANDI(x5, x5, 0x7);
+                        }
+                        SLLI(x5, x5, 4); // fpu_ll is 2 i64
+                        ADD(x5, x5, x4);
+                        FMVXD(x3, v1);
+                        LD(x6, x5, 0);  // ref
+                        BNE_MARK(x6, x3);
+                        LD(x6, x5, 8);  // ll
+                        SD(x6, wback, fixedaddress);
+                        B_MARK3_nocond;
+                        MARK;
+                    }
+
+                    if(!box64_dynarec_fastround) {
+                        FSFLAGSI(0); // reset all bits
+                    }
+                    FCVTLD(x4, v1, RD_DYN);
+                    if(!box64_dynarec_fastround) {
+                        FRFLAGS(x5);   // get back FPSR to check the IOC bit
+                        ANDI(x5, x5, 1<<FR_NV);
+                        BEQ_MARK2(x5, xZR);
+                        MOV64x(x4, 0x8000000000000000LL);
+                    }
+                    MARK2;
+                    SD(x4, wback, fixedaddress);
+                    MARK3;
+                    x87_restoreround(dyn, ninst, u8);
+                    x87_do_pop(dyn, ninst, x3);
+                    break;
                 default:
                     DEFAULT;
                     break;
diff --git a/src/dynarec/rv64/dynarec_rv64_emit_logic.c b/src/dynarec/rv64/dynarec_rv64_emit_logic.c
index 6d17895f..1352868b 100644
--- a/src/dynarec/rv64/dynarec_rv64_emit_logic.c
+++ b/src/dynarec/rv64/dynarec_rv64_emit_logic.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -16,7 +15,6 @@
 #include "emu/x64run_private.h"
 #include "x64trace.h"
 #include "dynarec_native.h"
-#include "../tools/bridge_private.h"
 
 #include "rv64_printer.h"
 #include "dynarec_rv64_private.h"
@@ -165,8 +163,7 @@ void emit_xor16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     }
 
     XOR(s1, s1, s2);
-    SLLI(s1, s1, 48);
-    SRLI(s1, s1, 48);
+    ZEXTH(s1, s1);
 
     IFX(X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
@@ -197,8 +194,7 @@ void emit_or16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4) {
     }
 
     OR(s1, s1, s2);
-    SLLI(s1, s1, 48);
-    SRLI(s1, s1, 48);
+    ZEXTH(s1, s1);
     IFX(X_PEND) {
         SD(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -426,7 +422,7 @@ void emit_and32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
         MOV64xw(s3, c);
         AND(s1, s1, s3); // res = s1 & s2
     }
-    if (!rex.w) ZEROUP(s1);
+    if (!rex.w && c<0 && c>=-2048) ZEROUP(s1);
 
     IFX(X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
diff --git a/src/dynarec/rv64/dynarec_rv64_emit_math.c b/src/dynarec/rv64/dynarec_rv64_emit_math.c
index 01579ea3..5d6f7e0e 100644
--- a/src/dynarec/rv64/dynarec_rv64_emit_math.c
+++ b/src/dynarec/rv64/dynarec_rv64_emit_math.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -16,7 +15,6 @@
 #include "emu/x64run_private.h"
 #include "x64trace.h"
 #include "dynarec_native.h"
-#include "../tools/bridge_private.h"
 
 #include "rv64_printer.h"
 #include "dynarec_rv64_private.h"
@@ -37,8 +35,7 @@ void emit_add32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX(X_CF) {
         if (rex.w) {
             AND(s5, xMASK, s1);
-            AND(s4, xMASK, s2);
-            ADD(s5, s5, s4); // lo
+            if(rv64_zba) ADDUW(s5, s2, s5); else {AND(s4, xMASK, s2); ADD(s5, s5, s4);} // lo
             SRLI(s3, s1, 0x20);
             SRLI(s4, s2, 0x20);
             ADD(s4, s4, s3);
@@ -65,8 +62,12 @@ void emit_add32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
     IFX(X_AF | X_OF) {
-        NOT(s5, s1);   // s5 = ~res
-        AND(s3, s5, s3);   // s3 = ~res & (op1 | op2)
+        if(rv64_zbb) {
+            ANDN(s3, s1, s3);   // s3 = ~res & (op1 | op2)
+        } else {
+            NOT(s5, s1);   // s5 = ~res
+            AND(s3, s5, s3);   // s3 = ~res & (op1 | op2)
+        }
         OR(s3, s3, s4);   // cc = (~res & (op1 | op2)) | (op1 & op2)
         IFX(X_AF) {
             ANDI(s4, s3, 0x08); // AF: cc & 0x08
@@ -126,8 +127,7 @@ void emit_add32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
     IFX(X_CF) {
         if (rex.w) {
             AND(s5, xMASK, s1);
-            AND(s4, xMASK, s2);
-            ADD(s5, s5, s4); // lo
+            if(rv64_zba) ADDUW(s5, s2, s5); else {AND(s4, xMASK, s2); ADD(s5, s5, s4);} // lo
             SRLI(s3, s1, 0x20);
             SRLI(s4, s2, 0x20);
             ADD(s4, s4, s3);
@@ -159,8 +159,12 @@ void emit_add32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
     IFX(X_AF | X_OF) {
-        NOT(s2, s1);   // s2 = ~res
-        AND(s3, s2, s3);   // s3 = ~res & (op1 | op2)
+        if(rv64_zbb) {
+            ANDN(s3, s1, s3);   // s3 = ~res & (op1 | op2)
+        } else {
+            NOT(s2, s1);   // s2 = ~res
+            AND(s3, s2, s3);   // s3 = ~res & (op1 | op2)
+        }
         OR(s3, s3, s4);   // cc = (~res & (op1 | op2)) | (op1 & op2)
         IFX(X_AF) {
             ANDI(s4, s3, 0x08); // AF: cc & 0x08
@@ -213,8 +217,12 @@ void emit_add16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
         SW(s1, xEmu, offsetof(x64emu_t, res));
     }
     IFX(X_AF | X_OF) {
-        NOT(s5, s1);   // s5 = ~res
-        AND(s3, s5, s3);   // s3 = ~res & (op1 | op2)
+        if(rv64_zbb) {
+            ANDN(s3, s1, s3);    // s3 = ~res & (op1 | op2)
+        } else {
+            NOT(s5, s1);   // s5 = ~res
+            AND(s3, s5, s3);   // s3 = ~res & (op1 | op2)
+        }
         OR(s3, s3, s4);   // cc = (~res & (op1 | op2)) | (op1 & op2)
         IFX(X_AF) {
             ANDI(s4, s3, 0x08); // AF: cc & 0x08
@@ -237,8 +245,7 @@ void emit_add16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
         ORI(xFlags, xFlags, 1 << F_CF);
     }
 
-    SLLI(s1, s1, 48);
-    SRLI(s1, s1, 48);
+    ZEXTH(s1, s1);
 
     IFX(X_ZF) {
         BNEZ(s1, 8);
@@ -272,8 +279,12 @@ void emit_add8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     ADD(s1, s1, s2);
 
     IFX(X_AF|X_OF) {
-        NOT(s4, s1);   // s4 = ~res
-        AND(s3, s4, s3);   // s3 = ~res & (op1 | op2)
+        if(rv64_zbb) {
+            ANDN(s3, s1, s3);   // s3 = ~res & (op1 | op2)
+        } else {
+            NOT(s4, s1);   // s4 = ~res
+            AND(s3, s4, s3);   // s3 = ~res & (op1 | op2)
+        }
         OR(s3, s3, s2);   // cc = (~res & (op1 | op2)) | (op1 & op2)
         IFX(X_AF) {
             ANDI(s4, s3, 0x08); // AF: cc & 0x08
@@ -332,8 +343,12 @@ void emit_add8c(dynarec_rv64_t* dyn, int ninst, int s1, int c, int s2, int s3, i
     ADDI(s1, s1, c);
 
     IFX(X_AF|X_OF) {
-        NOT(s2, s1);   // s2 = ~res
-        AND(s3, s2, s3);   // s3 = ~res & (op1 | op2)
+        if(rv64_zbb) {
+            ANDN(s3, s1, s3);   // s3 = ~res & (op1 | op2)
+        } else {
+            NOT(s2, s1);   // s2 = ~res
+            AND(s3, s2, s3);   // s3 = ~res & (op1 | op2)
+        }
         OR(s3, s3, s4);   // cc = (~res & (op1 | op2)) | (op1 & op2)
         IFX(X_AF) {
             ANDI(s4, s3, 0x08); // AF: cc & 0x08
@@ -580,8 +595,12 @@ void emit_inc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
         SB(s1, xEmu, offsetof(x64emu_t, res));
     }
     IFX(X_AF | X_OF) {
-        NOT(s2, s1);   // s2 = ~res
-        AND(s3, s2, s3);   // s3 = ~res & (op1 | op2)
+        if(rv64_zbb) {
+            ANDN(s3, s1, s3);   // s3 = ~res & (op1 | op2)
+        } else {
+            NOT(s2, s1);   // s2 = ~res
+            AND(s3, s2, s3);   // s3 = ~res & (op1 | op2)
+        }
         OR(s3, s3, s4);   // cc = (~res & (op1 | op2)) | (op1 & op2)
         IFX(X_AF) {
             ANDI(s2, s3, 0x08); // AF: cc & 0x08
@@ -625,8 +644,9 @@ void emit_dec8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
         SET_DFNONE();
     }
     IFX(X_AF | X_OF) {
-        ORI(s3, s1, 1);      // s3 = op1 | op2
-        ANDI(s4, s1, 1);      // s4 = op1 & op2
+        NOT(s4, s1);        // s4 = ~op1
+        ORI(s3, s4, 1);      // s3 = ~op1 | op2
+        ANDI(s4, s4, 1);      // s4 = ~op1 & op2
     }
 
     ADDIW(s1, s1, -1);
@@ -635,9 +655,8 @@ void emit_dec8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
         SB(s1, xEmu, offsetof(x64emu_t, res));
     }
     IFX(X_AF | X_OF) {
-        NOT(s2, s1);   // s2 = ~res
-        AND(s3, s2, s3);   // s3 = ~res & (op1 | op2)
-        OR(s3, s3, s4);   // cc = (~res & (op1 | op2)) | (op1 & op2)
+        AND(s3, s1, s3);   // s3 = res & (~op1 | op2)
+        OR(s3, s3, s4);   // cc = (res & (~op1 | op2)) | (~op1 & op2)
         IFX(X_AF) {
             ANDI(s2, s3, 0x08); // AF: cc & 0x08
             BEQZ(s2, 8);
@@ -689,8 +708,12 @@ void emit_inc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
     IFX(X_AF | X_OF) {
-        NOT(s2, s1);   // s2 = ~res
-        AND(s3, s2, s3);   // s3 = ~res & (op1 | op2)
+        if(rv64_zbb) {
+            ANDN(s3, s1, s3);    // s3 = ~res & (op1 | op2)
+        } else {
+            NOT(s2, s1);   // s2 = ~res
+            AND(s3, s2, s3);   // s3 = ~res & (op1 | op2)
+        }
         OR(s3, s3, s5);   // cc = (~res & (op1 | op2)) | (op1 & op2)
         IFX(X_AF) {
             ANDI(s2, s3, 0x08); // AF: cc & 0x08
@@ -781,6 +804,9 @@ void emit_dec32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 // emit INC16 instruction, from s1, store result in s1 using s3 and s4 as scratch
 void emit_inc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
 {
+    IFX(X_ALL) {
+        ANDI(xFlags, xFlags, ~((1UL<<F_AF) | (1UL<<F_OF2) | (1UL<<F_ZF) | (1UL<<F_SF) | (1UL<<F_PF)));
+    }
     IFX(X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, op1));
         SET_DF(s3, d_inc16);
@@ -798,8 +824,12 @@ void emit_inc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
         SH(s1, xEmu, offsetof(x64emu_t, res));
     }
     IFX(X_AF | X_OF) {
-        NOT(s2, s1);   // s2 = ~res
-        AND(s3, s2, s3);   // s3 = ~res & (op1 | op2)
+        if(rv64_zbb) {
+            ANDN(s3, s1, s3);   // s3 = ~res & (op1 | op2)
+        } else {
+            NOT(s2, s1);   // s2 = ~res
+            AND(s3, s2, s3);   // s3 = ~res & (op1 | op2)
+        }
         OR(s3, s3, s4);   // cc = (~res & (op1 | op2)) | (op1 & op2)
         IFX(X_AF) {
             ANDI(s4, s3, 0x08); // AF: cc & 0x08
@@ -816,8 +846,7 @@ void emit_inc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
         }
     }
 
-    SLLI(s1, s1, 48);
-    SRLI(s1, s1, 48);
+    ZEXTH(s1, s1);
 
     IFX(X_ZF) {
         BNEZ(s1, 8);
@@ -909,6 +938,7 @@ void emit_sbb8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     SUBW(s1, s1, s3);
     ANDI(s1, s1, 0xff);
 
+    CLEAR_FLAGS();
     IFX(X_PEND) {
         SB(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -928,6 +958,78 @@ void emit_sbb8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     }
 }
 
+// emit ADC8 instruction, from s1, s2, store result in s1 using s3, s4 and s5 as scratch (s2 is also clobbered on the non-zbb AF/OF path)
+void emit_adc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5) {
+    IFX(X_PEND) {
+        SH(s1, xEmu, offsetof(x64emu_t, op1)); // NOTE(review): 16-bit stores for an 8-bit op, same widths as emit_adc16 -- confirm intended
+        SH(s2, xEmu, offsetof(x64emu_t, op2));
+        SET_DF(s3, d_adc8);
+    } else IFX(X_ALL) {
+        SET_DFNONE();
+    }
+    IFX(X_AF | X_OF) {
+        OR(s4, s1, s2);  // s4 = op1 | op2
+        AND(s5, s1, s2); // s5 = op1 & op2
+    }
+
+    ADD(s1, s1, s2);
+    ANDI(s3, xFlags, 1 << F_CF); // s3 = incoming carry; it is added as-is, so this relies on F_CF being bit 0 -- confirm
+    ADD(s1, s1, s3); // s1 = op1 + op2 + carry-in, not yet truncated to 8 bits
+
+    CLEAR_FLAGS(); // macro defined elsewhere; placed after the carry-in has been read from xFlags
+    IFX(X_PEND) {
+        SW(s1, xEmu, offsetof(x64emu_t, res)); // NOTE(review): 32-bit store of the un-truncated sum, same as emit_adc16 -- confirm intended
+    }
+    IFX(X_AF | X_OF) {
+        if(rv64_zbb) {
+            ANDN(s3, s1, s4);   // s3 = ~res & (op1 | op2)
+        } else {
+            NOT(s2, s1);     // s2 = ~res
+            AND(s3, s2, s4); // s3 = ~res & (op1 | op2)
+        }
+        OR(s3, s3, s5);  // cc = (~res & (op1 | op2)) | (op1 & op2)
+        IFX(X_AF) {
+            ANDI(s4, s3, 0x08); // AF: cc & 0x08
+            BEQZ(s4, 8); // branch offset 8: skips the following ORI when the bit is clear
+            ORI(xFlags, xFlags, 1 << F_AF);
+        }
+        IFX(X_OF) {
+            SRLI(s3, s3, 6); // bring cc bits 6 and 7 down to bits 0 and 1
+            SRLI(s4, s3, 1);
+            XOR(s3, s3, s4);
+            ANDI(s3, s3, 1); // OF: xor of two MSB's of cc
+            BEQZ(s3, 8);
+            ORI(xFlags, xFlags, 1 << F_OF2);
+        }
+    }
+    IFX(X_CF) {
+        SRLI(s3, s1, 8); // anything above bit 7 of the sum is treated as carry out (assumes s1/s2 were at most 0xff on entry -- confirm)
+        BEQZ(s3, 8);
+        ORI(xFlags, xFlags, 1 << F_CF);
+    }
+
+    ANDI(s1, s1, 0xff); // truncate the result to 8 bits before the ZF/SF/PF tests
+
+    IFX(X_ZF) {
+        BNEZ(s1, 8);
+        ORI(xFlags, xFlags, 1 << F_ZF);
+    }
+    IFX(X_SF) {
+        SRLI(s3, s1, 7); // s3 = bit 7 of the 8-bit result
+        BEQZ(s3, 8);
+        ORI(xFlags, xFlags, 1 << F_SF);
+    }
+    IFX(X_PF) {
+        emit_pf(dyn, ninst, s1, s3, s4);
+    }
+}
+
+// emit ADC8 instruction, from s1, const c, store result in s1 using s3, s4, s5 and s6 as scratch
+void emit_adc8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5, int s6) {
+    MOV32w(s5, c&0xff); // s5 = low 8 bits of the constant, used as the second operand
+    emit_adc8(dyn, ninst, s1, s5, s3, s4, s6); // register form does the work; s3, s4 and s6 are passed as its scratch registers
+}
+
 // emit SBB8 instruction, from s1, constant c, store result in s1 using s3, s4, s5 and s6 as scratch
 void emit_sbb8c(dynarec_rv64_t* dyn, int ninst, int s1, int c, int s3, int s4, int s5, int s6)
 {
@@ -955,6 +1057,7 @@ void emit_sbb16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     ANDI(s3, xFlags, 1 << F_CF);
     SUBW(s1, s1, s3);
 
+    CLEAR_FLAGS();
     SLLIW(s1, s1, 16);
     IFX(X_SF) {
         BGE(s1, xZR, 8);
@@ -996,6 +1099,7 @@ void emit_sbb32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     ANDI(s3, xFlags, 1 << F_CF);
     SUBxw(s1, s1, s3);
 
+    CLEAR_FLAGS();
     IFX(X_SF) {
         BGE(s1, xZR, 8);
         ORI(xFlags, xFlags, 1 << F_SF);
@@ -1091,8 +1195,7 @@ void emit_neg16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3)
     }
 
     NEG(s1, s1);
-    SLLI(s1, s1, 48);
-    SRLI(s1, s1, 48);
+    ZEXTH(s1, s1);
     IFX(X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -1121,7 +1224,8 @@ void emit_neg16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3)
         }    
     }
     IFX(X_SF) {
-        ANDI(s3, s1, 1 << F_SF);    // 1<<F_SF is sign bit, so just mask
+        SRLI(s3, s1, 15-F_SF);    // put sign bit in place
+        ANDI(s3, s3, 1 << F_SF);    // 1<<F_SF is sign bit, so just mask
         OR(xFlags, xFlags, s3);
     }
     IFX(X_PF) {
@@ -1192,7 +1296,6 @@ void emit_neg8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3)
 // emit ADC16 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
 void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5)
 {
-    CLEAR_FLAGS();
     IFX(X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, op1));
         SH(s2, xEmu, offsetof(x64emu_t, op2));
@@ -1209,12 +1312,17 @@ void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     ANDI(s3, xFlags, 1 << F_CF);
     ADD(s1, s1, s3);
 
+    CLEAR_FLAGS();
     IFX(X_PEND) {
         SW(s1, xEmu, offsetof(x64emu_t, res));
     }
     IFX(X_AF | X_OF) {
-        NOT(s2, s1);     // s2 = ~res
-        AND(s3, s2, s4); // s3 = ~res & (op1 | op2)
+        if(rv64_zbb) {
+            ANDN(s3, s1, s4);   // s3 = ~res & (op1 | op2)
+        } else {
+            NOT(s2, s1);     // s2 = ~res
+            AND(s3, s2, s4); // s3 = ~res & (op1 | op2)
+        }
         OR(s3, s3, s5);  // cc = (~res & (op1 | op2)) | (op1 & op2)
         IFX(X_AF) {
             ANDI(s4, s3, 0x08); // AF: cc & 0x08
@@ -1236,8 +1344,7 @@ void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
         ORI(xFlags, xFlags, 1 << F_CF);
     }
 
-    SLLI(s1, s1, 48);
-    SRLI(s1, s1, 48);
+    ZEXTH(s1, s1);
 
     IFX(X_ZF) {
         BNEZ(s1, 8);
@@ -1254,9 +1361,8 @@ void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
 }
 
 // emit ADC32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
-void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5)
+void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5, int s6)
 {
-    CLEAR_FLAGS();
     IFX(X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, op1));
         SDxw(s2, xEmu, offsetof(x64emu_t, op2));
@@ -1267,21 +1373,16 @@ void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX(X_CF) {
         if (rex.w) {
             AND(s5, xMASK, s1);
-            AND(s4, xMASK, s2);
-            ADD(s5, s5, s4); // lo
+            if(rv64_zba) ADDUW(s5, s2, s5); else {AND(s4, xMASK, s2); ADD(s5, s5, s4);} // lo
             SRLI(s3, s1, 0x20);
             SRLI(s4, s2, 0x20);
             ADD(s4, s4, s3);
             SRLI(s5, s5, 0x20);
             ADD(s5, s5, s4); // hi
-            SRAI(s5, s5, 0x20);
-            BEQZ(s5, 8);
-            ORI(xFlags, xFlags, 1 << F_CF);
+            SRAI(s6, s5, 0x20);
         } else {
             ADD(s5, s1, s2);
-            SRLI(s5, s5, 0x20);
-            BEQZ(s5, 8);
-            ORI(xFlags, xFlags, 1 << F_CF);
+            SRLI(s6, s5, 0x20);
         }
     }
     IFX(X_AF | X_OF) {
@@ -1293,12 +1394,21 @@ void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     ANDI(s3, xFlags, 1 << F_CF);
     ADDxw(s1, s1, s3);
 
+    CLEAR_FLAGS();
     IFX(X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
+    IFX(X_CF) {
+        BEQZ(s6, 8);
+        ORI(xFlags, xFlags, 1 << F_CF);
+    }
     IFX(X_AF | X_OF) {
-        NOT(s2, s1);     // s2 = ~res
-        AND(s3, s2, s4); // s3 = ~res & (op1 | op2)
+        if(rv64_zbb) {
+            ANDN(s3, s1, s4);   // s3 = ~res & (op1 | op2)
+        } else {
+            NOT(s2, s1);     // s2 = ~res
+            AND(s3, s2, s4); // s3 = ~res & (op1 | op2)
+        }
         OR(s3, s3, s5);  // cc = (~res & (op1 | op2)) | (op1 & op2)
         IFX(X_AF) {
             ANDI(s4, s3, 0x08); // AF: cc & 0x08
diff --git a/src/dynarec/rv64/dynarec_rv64_emit_shift.c b/src/dynarec/rv64/dynarec_rv64_emit_shift.c
index dbcc2d5f..7030c674 100644
--- a/src/dynarec/rv64/dynarec_rv64_emit_shift.c
+++ b/src/dynarec/rv64/dynarec_rv64_emit_shift.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -16,7 +15,6 @@
 #include "emu/x64run_private.h"
 #include "x64trace.h"
 #include "dynarec_native.h"
-#include "../tools/bridge_private.h"
 
 #include "rv64_printer.h"
 #include "dynarec_rv64_private.h"
@@ -327,11 +325,15 @@ void emit_rol32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     } else {
         ANDI(s4, s2, 0x1f);
     }
-    SLLxw(s3, s1, s4);
-    NEG(s4, s4);
-    ADDI(s4, s4, rex.w?64:32);
-    SRLxw(s1, s1, s4);
-    OR(s1, s3, s1);
+    if(rv64_zbb) {
+        ROLxw(s1, s1, s4);
+    } else {
+        SLLxw(s3, s1, s4);
+        NEG(s4, s4);
+        ADDI(s4, s4, rex.w?64:32);
+        SRLxw(s1, s1, s4);
+        OR(s1, s3, s1);
+    }
     IFX(X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -370,11 +372,15 @@ void emit_ror32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     } else {
         ANDI(s4, s2, 0x1f);
     }
-    SRLxw(s3, s1, s4);
-    NEG(s4, s4);
-    ADDI(s4, s4, rex.w?64:32);
-    SLLxw(s1, s1, s4);
-    OR(s1, s3, s1);
+    if(rv64_zbb) {
+        RORxw(s1, s1, s4);
+    } else {
+        SRLxw(s3, s1, s4);
+        NEG(s4, s4);
+        ADDI(s4, s4, rex.w?64:32);
+        SLLxw(s1, s1, s4);
+        OR(s1, s3, s1);
+    }
     IFX(X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -413,9 +419,13 @@ void emit_rol32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
         }
         return;
     }
-    SLLIxw(s3, s1, c);
-    SRLIxw(s1, s1, (rex.w?64:32)-c);
-    OR(s1, s3, s1);
+    if(rv64_zbb) {
+        RORIxw(s1, s1, (rex.w?64:32)-c);
+    } else {
+        SLLIxw(s3, s1, c);
+        SRLIxw(s1, s1, (rex.w?64:32)-c);
+        OR(s1, s3, s1);
+    }
     IFX(X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -454,9 +464,13 @@ void emit_ror32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
         }
         return;
     }
-    SRLIxw(s3, s1, c);
-    SLLIxw(s1, s1, (rex.w?64:32)-c);
-    OR(s1, s3, s1);
+    if(rv64_zbb) {
+        RORIxw(s1, s1, c);
+    } else {
+        SRLIxw(s3, s1, c);
+        SLLIxw(s1, s1, (rex.w?64:32)-c);
+        OR(s1, s3, s1);
+    }
     IFX(X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
diff --git a/src/dynarec/rv64/dynarec_rv64_emit_tests.c b/src/dynarec/rv64/dynarec_rv64_emit_tests.c
index 79ebe6cb..00c1fb7d 100644
--- a/src/dynarec/rv64/dynarec_rv64_emit_tests.c
+++ b/src/dynarec/rv64/dynarec_rv64_emit_tests.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -16,7 +15,6 @@
 #include "emu/x64run_private.h"
 #include "x64trace.h"
 #include "dynarec_native.h"
-#include "../tools/bridge_private.h"
 
 #include "rv64_printer.h"
 #include "dynarec_rv64_private.h"
@@ -108,8 +106,7 @@ void emit_cmp16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     // It's a cmp, we can't store the result back to s1.
     SUB(s6, s1, s2);
     IFX(X_ALL) {
-        SLLI(s6, s6, 48);
-        SRLI(s6, s6, 48);
+        ZEXTH(s6, s6);
     }
     IFX_PENDOR0 {
         SH(s6, xEmu, offsetof(x64emu_t, res));
diff --git a/src/dynarec/rv64/dynarec_rv64_f0.c b/src/dynarec/rv64/dynarec_rv64_f0.c
index 3ccaafa4..348f2905 100644
--- a/src/dynarec/rv64/dynarec_rv64_f0.c
+++ b/src/dynarec/rv64/dynarec_rv64_f0.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -47,12 +46,8 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         rep = opcode-0xF1;
         opcode = F8;
     }
-    // REX prefix before the F0 are ignored
-    rex.rex = 0;
-    while(opcode>=0x40 && opcode<=0x4f) {
-        rex.rex = opcode;
-        opcode = F8;
-    }
+
+    GETREX();
 
     // TODO: Take care of unligned memory access for all the LOCK ones.
     // https://github.com/ptitSeb/box64/pull/604
@@ -104,6 +99,101 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0x0F:
             nextop = F8;
             switch(nextop) {
+                case 0xB0:
+                    switch(rep) {
+                        case 0:
+                            INST_NAME("LOCK CMPXCHG Eb, Gb");
+                            SETFLAGS(X_ALL, SF_SET_PENDING);
+                            nextop = F8;
+                            ANDI(x6, xRAX, 0xff); // AL
+                            SMDMB();
+                            if(MODREG) { // NOTE(review): LOCK with a register destination is #UD on real x86; this path looks unreachable in practice - confirm
+                                if(rex.rex) {
+                                    wback = xRAX+(nextop&7)+(rex.b<<3);
+                                    wb2 = 0;
+                                } else {
+                                    wback = (nextop&7);
+                                    wb2 = (wback>>2)*8; // 8 for the high-byte registers (AH/CH/DH/BH), 0 otherwise
+                                    wback = xRAX+(wback&3);
+                                }
+                                if (wb2) {
+                                    MV(x2, wback);
+                                    SRLI(x2, x2, wb2);
+                                    ANDI(x2, x2, 0xff);
+                                } else {
+                                    ANDI(x2, wback, 0xff);
+                                }
+                                wb1 = 0;
+                                ed = x2; // x2 holds the Eb byte
+                                UFLAG_IF {
+                                    emit_cmp8(dyn, ninst, x6, ed, x3, x4, x5, x1);
+                                }
+                                BNE_MARK2(x6, x2);
+                                // NOTE(review): the lines below replace the whole of wback with a byte derived from x2
+                                // rather than inserting Gb into the selected byte lane - verify before relying on this path
+                                if (wb2) {
+                                    MV(wback, x2);
+                                    SRLI(wback, wback, wb2);
+                                    ANDI(wback, wback, 0xff);
+                                } else {
+                                    ANDI(wback, x2, 0xff);
+                                }
+                                GETGB(x1);
+                                MV(ed, gd);
+                                MARK2;
+                                ANDI(xRAX, xRAX, ~0xff);
+                                OR(xRAX, xRAX, x2);
+                                B_NEXT_nocond;
+                            } else {
+                                // This path is tricky and recomputes a few values (e.g. the aligned
+                                // address), mostly because only 6 scratch registers are available
+                                // and there is a lot to keep live at once.
+                                if(rex.rex) {
+                                    gb1 = xRAX+((nextop&0x38)>>3)+(rex.r<<3);
+                                    gb2 = 0;
+                                } else {
+                                    gd = (nextop&0x38)>>3;
+                                    gb2 = ((gd&4)>>2); // non-zero when Gb is a high-byte register
+                                    gb1 = xRAX+(gd&3);
+                                }
+                                addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, LOCK_LOCK, 0, 0);
+                                ANDI(x5, wback, 0b11);
+                                SLLI(x5, x5, 3);        // shamt
+                                MARKLOCK;
+                                ANDI(x2, wback, ~0b11); // align to 32bit
+                                // a single LR_W supplies both the byte to compare and the word to merge into, so the SC below cannot store stale neighbour bytes
+                                LR_W(x1, x2, 1, 1);
+                                SRL(x4, x1, x5);
+                                ANDI(x4, x4, 0xff);
+                                BNE_MARK(x6, x4); // compare AL with m8
+                                // AL == m8, r8 is loaded into m8
+                                ADDI(x2, xZR, 0xff);
+                                SLL(x2, x2, x5);
+                                NOT(x2, x2);      // x2 = mask that clears the target byte lane
+                                AND(x2, x1, x2);  // x2 = reserved word with the target byte zeroed
+                                if (gb2) {
+                                    MV(x1, gb1);
+                                    SRLI(x1, x1, 8);
+                                    ANDI(x1, x1, 0xff);
+                                } else {
+                                    ANDI(x1, gb1, 0xff);
+                                }
+                                SLL(x1, x1, x5);
+                                OR(x1, x1, x2);   // x1 = word to store: Gb placed in the target lane
+                                ANDI(x2, wback, ~0b11); // align to 32bit again
+                                SC_W(x9, x1, x2, 1, 1);
+                                BNEZ_MARKLOCK(x9);
+                                // done
+                                MARK;
+                                UFLAG_IF {emit_cmp8(dyn, ninst, x6, x4, x1, x2, x3, x5);}
+                                // load m8 into AL
+                                ANDI(xRAX, xRAX, ~0xff);
+                                OR(xRAX, xRAX, x4);
+                            }
+                            SMDMB();
+                            break;
+                        default:
+                            DEFAULT;
+                    }
+                    break;
                 case 0xB1:
                     switch (rep) {
                         case 0:
@@ -188,9 +278,16 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                             addr = geted(dyn, addr, ninst, nextop, &wback, x1, x2, &fixedaddress, rex, LOCK_LOCK, 0, 0);
                             ANDI(xFlags, xFlags, ~(1<<F_ZF));
                             if (rex.w) {
-                                // there is no atomic move on 16bytes, so faking it
+                                // there is no atomic move on 16bytes, so implement it with mutex
+                                LD(x9, xEmu, offsetof(x64emu_t, context));
+                                ADDI(x9, x9, offsetof(box64context_t, mutex_16b));
+                                ADDI(x4, xZR, 1);
+                                MARKLOCK;
+                                AMOSWAP_W(x4, x4, x9, 1, 1);
+                                // x4 == 1 if locked
+                                BNEZ_MARKLOCK(x4);
+
                                 SMDMB();
-                                // MARKLOCK;
                                 LD(x2, wback, 0);
                                 LD(x3, wback, 8);
                                 BNE_MARK(x2, xRAX);
@@ -204,6 +301,9 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                                 MV(xRDX, x3);
                                 MARK3;
                                 SMDMB();
+
+                                // unlock
+                                AMOSWAP_W(xZR, xZR, x9, 1, 1);
                             } else {
                                 SMDMB();
                                 MARKLOCK;
@@ -260,6 +360,64 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             }
             SMDMB();
             break;
+        case 0x29:
+            INST_NAME("LOCK SUB Ed, Gd");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            SMDMB();
+            if(MODREG) {
+                ed = xRAX+(nextop&7)+(rex.b<<3); // register operand: subtract in place, no memory access involved
+                emit_sub32(dyn, ninst, rex, ed, gd, x3, x4, x5);
+            } else {
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, LOCK_LOCK, 0, 0);
+                MARKLOCK;
+                LRxw(x1, wback, 1, 1);     // x1 = old memory value (load-reserved)
+                SUB(x4, x1, gd);
+                SCxw(x3, x4, wback, 1, 1); // try to store old - Gd; x3 receives the store-conditional status
+                BNEZ_MARKLOCK(x3);         // presumably loops back to MARKLOCK while the store-conditional fails
+                IFX(X_ALL|X_PEND)
+                    emit_sub32(dyn, ninst, rex, x1, gd, x3, x4, x5); // memory is already updated; this call is only needed for the flags, computed from the old value in x1
+            }
+            SMDMB();
+            break;
+        case 0x80:
+            nextop = F8;
+            SMDMB();
+            switch((nextop>>3)&7) { // the ModRM reg field selects the operation of this opcode group
+                case 1: // OR
+                    INST_NAME("LOCK OR Eb, Ib");
+                    SETFLAGS(X_ALL, SF_SET_PENDING);
+                    if(MODREG) {
+                        GETEB(x1, 1);
+                        u8 = F8;
+                        emit_or8c(dyn, ninst, x1, u8, x2, x4, x5);
+                        EBBACK(x5, 0);
+                    } else {
+                        addr = geted(dyn, addr, ninst, nextop, &wback, x5, x1, &fixedaddress, rex, LOCK_LOCK, 0, 1);
+                        u8 = F8;
+                        // LR/SC is used at 32-bit width here, so the byte is updated inside its aligned word
+                        ANDI(x2, wback, 3);
+                        SLLI(x2, x2, 3);     // offset in bits
+                        ANDI(x3, wback, ~3); // aligned addr
+                        ADDI(x1, xZR, u8);
+                        SLL(x1, x1, x2);     // Ib << offset
+                        MARKLOCK;
+                        LR_W(x4, x3, 1, 1);      // x4 = old word
+                        OR(x6, x4, x1);          // only the target byte lane can change: the other lanes are OR-ed with 0
+                        SC_W(x6, x6, x3, 1, 1);  // x6 is reused for the store-conditional status
+                        BNEZ_MARKLOCK(x6);
+                        IFX(X_ALL|X_PEND) {
+                            SRL(x1, x4, x2);     // recover the old byte from the old word
+                            ANDI(x1, x1, 0xFF);
+                            emit_or8c(dyn, ninst, x1, u8, x2, x4, x5); // redo the OR on the old byte; presumably only its flag output matters here
+                        }
+                    }
+                    break;
+                default:
+                    DEFAULT;
+            }
+            SMDMB();
+            break;
         case 0x81:
         case 0x83:
             nextop = F8;
@@ -379,7 +537,7 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                             emit_sub32c(dyn, ninst, rex, x1, i64, x3, x4, x5, x6);
                     }
                     break;
-                default: 
+                default:
                     DEFAULT;
             }
             SMDMB();
diff --git a/src/dynarec/rv64/dynarec_rv64_f20f.c b/src/dynarec/rv64/dynarec_rv64_f20f.c
index 95f526f0..ac3da811 100644
--- a/src/dynarec/rv64/dynarec_rv64_f20f.c
+++ b/src/dynarec/rv64/dynarec_rv64_f20f.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -35,7 +34,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
     int v0, v1;
     int q0;
     int d0, d1;
-    int64_t fixedaddress;
+    int64_t fixedaddress, gdoffset;
     int unscaled;
 
     MAYUSE(d0);
@@ -82,11 +81,11 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x12:
             INST_NAME("MOVDDUP Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             LD(x3, wback, fixedaddress+0);
-            SD(x3, gback, 0);
-            SD(x3, gback, 8);
+            SD(x3, gback, gdoffset+0);
+            SD(x3, gback, gdoffset+8);
             break;
         case 0x2A:
             INST_NAME("CVTSI2SD Gx, Ed");
@@ -105,7 +104,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGD;
             GETEXSD(v0, 0);
             if(!box64_dynarec_fastround) {
-                FSFLAGSI(xZR);  // // reset all bits
+                FSFLAGSI(0);  // // reset all bits
             }
             FCVTLDxw(gd, v0, RD_RTZ);
             if(!rex.w)
@@ -127,7 +126,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGD;
             GETEXSD(v0, 0);
             if(!box64_dynarec_fastround) {
-                FSFLAGSI(xZR);  // // reset all bits
+                FSFLAGSI(0);  // // reset all bits
             }
             u8 = sse_setround(dyn, ninst, x2, x3);
             FCVTLDxw(gd, v0, RD_DYN);
@@ -184,8 +183,9 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEXSD(d0, 0);
             GETGXSD_empty(d1);
             if(!box64_dynarec_fastnan) {
-                FMVDX(d1, xZR);
-                FLTD(x3, d0, d1);
+                v0 = fpu_get_scratch(dyn);  // need a scratch in case d0 == d1
+                FMVDX(v0, xZR);
+                FLTD(x3, d0, v0);
             }
             FSQRTD(d1, d0);
             if(!box64_dynarec_fastnan) {
@@ -275,7 +275,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x70: // TODO: Optimize this!
             INST_NAME("PSHUFLW Gx, Ex, Ib");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 1);
             u8 = F8;
             int32_t idx;
@@ -289,14 +289,14 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             idx = (u8>>(3*2))&3;
             LHU(x6, wback, fixedaddress+idx*2);
 
-            SH(x3, gback, 0*2);
-            SH(x4, gback, 1*2);
-            SH(x5, gback, 2*2);
-            SH(x6, gback, 3*2);
+            SH(x3, gback, gdoffset+0*2);
+            SH(x4, gback, gdoffset+1*2);
+            SH(x5, gback, gdoffset+2*2);
+            SH(x6, gback, gdoffset+3*2);
 
             if (!(MODREG && (gd==ed))) {
                 LD(x3, wback, fixedaddress+8);
-                SD(x3, gback, 8);
+                SD(x3, gback, gdoffset+8);
             }
             break;
         case 0xC2:
@@ -334,7 +334,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 }
                 case 7: break;                                      // Not NaN
                 }
-                
+
                 MARK2;
                 if ((u8&7) == 5 || (u8&7) == 6) {
                     MOV32w(x2, 1);
@@ -347,7 +347,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xE6:
             INST_NAME("CVTPD2DQ Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             u8 = sse_setround(dyn, ninst, x6, x4);
@@ -358,10 +358,17 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 SUB(x5, x5, x3);
                 BEQZ(x5, 8);
                 LUI(x3, 0x80000); // INT32_MIN
-                SW(x3, gback, 4*i);
+                SW(x3, gback, gdoffset+4*i);
             }
             x87_restoreround(dyn, ninst, u8);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+8);
+            break;
+        case 0xF0:
+            INST_NAME("LDDQU Gx,Ex");
+            nextop = F8;
+            GETGX();
+            GETEX(x2, 0);
+            SSE_LOOP_MV_Q(x3); // same emitter sequence as MOVDQU Gx,Ex in the F3 0F table; presumably copies Ex to Gx through x3
+            break;
         default:
             DEFAULT;
diff --git a/src/dynarec/rv64/dynarec_rv64_f30f.c b/src/dynarec/rv64/dynarec_rv64_f30f.c
index 489d5ca0..0c0676e0 100644
--- a/src/dynarec/rv64/dynarec_rv64_f30f.c
+++ b/src/dynarec/rv64/dynarec_rv64_f30f.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -35,7 +34,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
     int v0, v1;
     int q0, q1;
     int d0, d1;
-    int64_t fixedaddress;
+    int64_t fixedaddress, gdoffset;
     int unscaled;
     int64_t j64;
 
@@ -80,7 +79,22 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 SMWRITE2();
             }
             break;
-            
+
+        case 0x12:
+            INST_NAME("MOVSLDUP Gx, Ex");
+            nextop = F8;
+            GETGX();
+            GETEX(x2, 0);
+
+            // GX->ud[1] = GX->ud[0] = EX->ud[0];
+            // GX->ud[3] = GX->ud[2] = EX->ud[2];
+            LWU(x3, wback, fixedaddress+0);   // 32-bit accesses only: a 64-bit store at +12 would run past the 128-bit register
+            SW(x3, gback, gdoffset+0);
+            SW(x3, gback, gdoffset+4);
+            LWU(x3, wback, fixedaddress+8);   // ud[2] is still intact here even when Gx aliases Ex
+            SW(x3, gback, gdoffset+8);
+            SW(x3, gback, gdoffset+12);
+            break;
         case 0x1E:
             INST_NAME("NOP / ENDBR32 / ENDBR64");
             nextop = F8;
@@ -105,7 +119,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGD;
             GETEXSS(d0, 0);
             if(!box64_dynarec_fastround) {
-                FSFLAGSI(xZR);  // // reset all bits
+                FSFLAGSI(0);  // // reset all bits
             }
             FCVTSxw(gd, d0, RD_RTZ);
             if(!rex.w)
@@ -121,6 +135,31 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 }
             }
             break;
+        case 0x2D:
+            INST_NAME("CVTSS2SI Gd, Ex");
+            nextop = F8;
+            GETGD;
+            GETEXSS(d0, 0);
+            if(!box64_dynarec_fastround) {
+                FSFLAGSI(0);  // reset all FP flag bits so the check below only reflects this conversion
+            }
+            u8 = sse_setround(dyn, ninst, x5, x6);
+            FCVTSxw(gd, d0, RD_DYN); // convert using the dynamic rounding mode set up just above
+            x87_restoreround(dyn, ninst, u8);
+            if(!rex.w)
+                ZEROUP(gd);
+            if(!box64_dynarec_fastround) {
+                FRFLAGS(x5);   // read the FP flags back to test the invalid (NV) and overflow (OF) bits
+                ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+                CBZ_NEXT(x5);  // presumably skips to the next instruction when neither bit is set
+                if(rex.w) {
+                    MOV64x(gd, 0x8000000000000000LL); // x86 "integer indefinite" result for an unrepresentable conversion
+                } else {
+                    MOV32w(gd, 0x80000000);
+                }
+            }
+            break;
+
         case 0x51:
             INST_NAME("SQRTSS Gx, Ex");
             nextop = F8;
@@ -128,6 +167,16 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGXSS_empty(v1);
             FSQRTS(v1, v0);
             break;
+        case 0x53:
+            INST_NAME("RCPSS Gx, Ex");
+            nextop = F8;
+            GETEXSS(v0, 0);
+            GETGXSS_empty(v1);
+            q0 = fpu_get_scratch(dyn);
+            LUI(x3, 0x3F800); // 1.0f
+            FMVWX(q0, x3);    // move the 1.0f bit pattern into the FP scratch register
+            FDIVS(v1, q0, v0); // reciprocal computed as a full single-precision divide, 1.0f / Ex (not an approximation)
+            break;
         case 0x58:
             INST_NAME("ADDSS Gx, Ex");
             nextop = F8;
@@ -196,14 +245,14 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x6F:
             INST_NAME("MOVDQU Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q(x3);
             break;
         case 0x70: // TODO: Optimize this!
             INST_NAME("PSHUFHW Gx, Ex, Ib");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 1);
             u8 = F8;
             int32_t idx;
@@ -217,14 +266,14 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             idx = 4+((u8>>(3*2))&3);
             LHU(x6, wback, fixedaddress+idx*2);
 
-            SH(x3, gback, (4+0)*2);
-            SH(x4, gback, (4+1)*2);
-            SH(x5, gback, (4+2)*2);
-            SH(x6, gback, (4+3)*2);
+            SH(x3, gback, gdoffset+(4+0)*2);
+            SH(x4, gback, gdoffset+(4+1)*2);
+            SH(x5, gback, gdoffset+(4+2)*2);
+            SH(x6, gback, gdoffset+(4+3)*2);
 
             if (!(MODREG && (gd==ed))) {
                 LD(x3, wback, fixedaddress+0);
-                SD(x3, gback, 0);
+                SD(x3, gback, gdoffset+0);
             }
             break;
         case 0x7E:
@@ -246,21 +295,21 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x7F:
             INST_NAME("MOVDQU Ex,Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q2(x3);
             if(!MODREG) SMWRITE2();
             break;
-        
+
         case 0x5B:
             INST_NAME("CVTTPS2DQ Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             v0 = fpu_get_scratch(dyn);
             for(int i=0; i<4; ++i) {
                 if(!box64_dynarec_fastround) {
-                    FSFLAGSI(xZR); // reset all bits
+                    FSFLAGSI(0); // reset all bits
                 }
                 FLW(v0, wback, fixedaddress+i*4);
                 FCVTWS(x3, v0, RD_RTZ);
@@ -270,7 +319,49 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     BEQZ(x5, 8);
                     MOV32w(x3, 0x80000000);
                 }
-                SW(x3, gback, i*4);
+                SW(x3, gback, gdoffset+i*4);
+            }
+            break;
+        case 0xB8:
+            INST_NAME("POPCNT Gd, Ed");
+            SETFLAGS(X_ALL, SF_SET);
+            SET_DFNONE();
+            nextop = F8;
+            GETED(0);
+            GETGD;
+            if(!rex.w && MODREG) {
+                AND(x4, ed, xMASK); // 32-bit register source: work on a masked copy (xMASK presumably holds 0xffffffff)
+                ed = x4;
+            }
+            CLEAR_FLAGS();
+            BNE_MARK(ed, xZR);
+            ORI(xFlags, xFlags, 1<<F_ZF); // source is zero: set ZF and return a count of 0
+            MOV32w(gd, 0);
+            B_NEXT_nocond;
+            MARK;
+            if(rv64_zbb) {
+                CPOPxw(gd, ed); // Zbb provides a population-count instruction
+            } else {
+                // fallback: classic SWAR bit count (pairs, then nibbles, then bytes, then a horizontal add)
+                TABLE64(x1, 0x5555555555555555uLL);
+                SRLI(x5, ed, 1);
+                AND(x5, x5, x1);
+                SUB(x5, ed, x5);  // each 2-bit field now holds the number of bits set in it
+                TABLE64(x3, 0x3333333333333333uLL);
+                SRLI(x1, x5, 2);
+                AND(x1, x1, x3);
+                AND(x5, x5, x3);
+                ADD(x5, x5, x1);  // each 4-bit field now holds its bit count
+                TABLE64(x3, 0x0F0F0F0F0F0F0F0FuLL);
+                SRLI(x1, x5, 4);
+                ADD(x5, x5, x1);
+                AND(x5, x5, x3);  // each byte now holds its bit count
+                SRLI(x1, x5, 32);
+                ADDW(x5, x5, x1); // fold the per-byte counts together: 64 -> 32 bits
+                SRLIW(x1, x5, 16);
+                ADDW(x5, x5, x1); // 32 -> 16 bits
+                SRLIW(x1, x5, 8);
+                ADDW(x5, x5, x1); // 16 -> 8 bits
+                ANDI(gd, x5, 0x7F); // the total is at most 64, so the low 7 bits are enough
+            }
+            break;
         case 0xBC:
@@ -284,21 +375,24 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 AND(x4, ed, xMASK);
                 ed = x4;
             }
-            BNE_MARK(ed, xZR);
             ANDI(xFlags, xFlags, ~((1<<F_ZF) | (1<<F_CF)));
+            BNE_MARK(ed, xZR);
             ORI(xFlags, xFlags, 1<<F_CF);
             MOV32w(gd, rex.w?64:32);
             B_NEXT_nocond;
             MARK;
-            NEG(x2, ed);
-            AND(x2, x2, ed);
-            TABLE64(x3, 0x03f79d71b4ca8b09ULL);
-            MUL(x2, x2, x3);
-            SRLI(x2, x2, 64-6);
-            TABLE64(x1, (uintptr_t)&deBruijn64tab);
-            ADD(x1, x1, x2);
-            LBU(gd, x1, 0);
-            ANDI(xFlags, xFlags, ~((1<<F_ZF) | (1<<F_CF)));
+            if(rv64_zbb) {
+                CTZxw(gd, ed);
+            } else {
+                NEG(x2, ed);
+                AND(x2, x2, ed);
+                TABLE64(x3, 0x03f79d71b4ca8b09ULL);
+                MUL(x2, x2, x3);
+                SRLI(x2, x2, 64-6);
+                TABLE64(x1, (uintptr_t)&deBruijn64tab);
+                ADD(x1, x1, x2);
+                LBU(gd, x1, 0);
+            }
             BNE(gd, xZR, 4+4);
             ORI(xFlags, xFlags, 1<<F_ZF);
             break;
@@ -319,38 +413,42 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             ORI(xFlags, xFlags, 1<<F_CF);
             B_NEXT_nocond;
             MARK;
-            if(ed!=gd)
-                u8 = gd;
-            else
-                u8 = x1;
-            ADDI(u8, xZR, rex.w?63:31);
-            if(rex.w) {
-                MV(x2, ed);
-                SRLI(x3, x2, 32);
+            if(rv64_zbb) {
+                CLZxw(gd, ed);
+            } else {
+                if(ed!=gd)
+                    u8 = gd;
+                else
+                    u8 = x1;
+                ADDI(u8, xZR, rex.w?63:31);
+                if(rex.w) {
+                    MV(x2, ed);
+                    SRLI(x3, x2, 32);
+                    BEQZ(x3, 4+2*4);
+                    SUBI(u8, u8, 32);
+                    MV(x2, x3);
+                } else {
+                    AND(x2, ed, xMASK);
+                }
+                SRLI(x3, x2, 16);
                 BEQZ(x3, 4+2*4);
-                SUBI(u8, u8, 32);
+                SUBI(u8, u8, 16);
                 MV(x2, x3);
-            } else {
-                AND(x2, ed, xMASK);
+                SRLI(x3, x2, 8);
+                BEQZ(x3, 4+2*4);
+                SUBI(u8, u8, 8);
+                MV(x2, x3);
+                SRLI(x3, x2, 4);
+                BEQZ(x3, 4+2*4);
+                SUBI(u8, u8, 4);
+                MV(x2, x3);
+                ANDI(x2, x2, 0b1111);
+                TABLE64(x3, (uintptr_t)&lead0tab);
+                ADD(x3, x3, x2);
+                LBU(x2, x3, 0);
+                SUB(gd, u8, x2);
+                MARK2;
             }
-            SRLI(x3, x2, 16);
-            BEQZ(x3, 4+2*4);
-            SUBI(u8, u8, 16);
-            MV(x2, x3);
-            SRLI(x3, x2, 8);
-            BEQZ(x3, 4+2*4);
-            SUBI(u8, u8, 8);
-            MV(x2, x3);
-            SRLI(x3, x2, 4);
-            BEQZ(x3, 4+2*4);
-            SUBI(u8, u8, 4);
-            MV(x2, x3);
-            ANDI(x2, x2, 0b1111); 
-            TABLE64(x3, (uintptr_t)&lead0tab);
-            ADD(x3, x3, x2);
-            LBU(x2, x3, 0);
-            SUB(gd, u8, x2);
-            MARK2;
             ANDI(xFlags, xFlags, ~((1<<F_ZF) | (1<<F_CF)));
             BNE(gd, xZR, 4+4);
             ORI(xFlags, xFlags, 1<<F_ZF);
@@ -391,7 +489,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 }
                 case 7: break;                                      // Not NaN
                 }
-                
+
                 MARK2;
                 if ((u8&7) == 5 || (u8&7) == 6) {
                     MOV32w(x2, 1);
@@ -405,7 +503,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xE6:
             INST_NAME("CVTDQ2PD Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             q0 = fpu_get_scratch(dyn);
             q1 = fpu_get_scratch(dyn);
@@ -413,8 +511,8 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             LW(x4, wback, fixedaddress+4);
             FCVTDW(q0, x3, RD_RTZ);
             FCVTDW(q1, x4, RD_RTZ);
-            FSD(q0, gback, 0);
-            FSD(q1, gback, 8);
+            FSD(q0, gback, gdoffset+0);
+            FSD(q1, gback, gdoffset+8);
             break;
 
         default:
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c
index dade3016..541ac45f 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.c
+++ b/src/dynarec/rv64/dynarec_rv64_functions.c
@@ -1,7 +1,6 @@
 #define _GNU_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
-#include <pthread.h>
 #include <errno.h>
 #include <string.h>
 #include <math.h>
@@ -13,7 +12,6 @@
 #include "box64context.h"
 #include "dynarec.h"
 #include "emu/x64emu_private.h"
-#include "tools/bridge_private.h"
 #include "x64run.h"
 #include "x64emu.h"
 #include "box64stack.h"
@@ -130,7 +128,7 @@ int extcache_get_st_f(dynarec_rv64_t* dyn, int ninst, int a)
          && dyn->insts[ninst].e.extcache[i].n==a)
             return i;
     return -1;
-} 
+}
 int extcache_get_st_f_noback(dynarec_rv64_t* dyn, int ninst, int a)
 {
     for(int i=0; i<24; ++i)
@@ -138,7 +136,7 @@ int extcache_get_st_f_noback(dynarec_rv64_t* dyn, int ninst, int a)
          && dyn->insts[ninst].e.extcache[i].n==a)
             return i;
     return -1;
-} 
+}
 int extcache_get_current_st_f(dynarec_rv64_t* dyn, int a)
 {
     for(int i=0; i<24; ++i)
@@ -146,7 +144,7 @@ int extcache_get_current_st_f(dynarec_rv64_t* dyn, int a)
          && dyn->e.extcache[i].n==a)
             return i;
     return -1;
-} 
+}
 
 static void extcache_promote_double_forward(dynarec_rv64_t* dyn, int ninst, int maxinst, int a);
 static void extcache_promote_double_internal(dynarec_rv64_t* dyn, int ninst, int maxinst, int a);
@@ -155,7 +153,7 @@ static void extcache_promote_double_combined(dynarec_rv64_t* dyn, int ninst, int
     if(a == dyn->insts[ninst].e.combined1 || a == dyn->insts[ninst].e.combined2) {
         if(a == dyn->insts[ninst].e.combined1) {
             a = dyn->insts[ninst].e.combined2;
-        } else 
+        } else
             a = dyn->insts[ninst].e.combined1;
         int i = extcache_get_st_f_noback(dyn, ninst, a);
         //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_combined, ninst=%d combined%c %d i=%d (stack:%d/%d)\n", ninst, (a == dyn->insts[ninst].e.combined2)?'2':'1', a ,i, dyn->insts[ninst].e.stack_push, -dyn->insts[ninst].e.stack_pop);
@@ -328,7 +326,7 @@ void extcacheUnwind(extcache_t* cache)
 {
     if(cache->swapped) {
         // unswap
-        int a = -1; 
+        int a = -1;
         int b = -1;
         for(int j=0; j<24 && ((a==-1) || (b==-1)); ++j)
             if((cache->extcache[j].t == EXT_CACHE_ST_D || cache->extcache[j].t == EXT_CACHE_ST_F)) {
@@ -346,12 +344,21 @@ void extcacheUnwind(extcache_t* cache)
         cache->combined1 = cache->combined2 = 0;
     }
     if(cache->news) {
-        // reove the newly created extcache
+        // remove the newly created extcache
         for(int i=0; i<24; ++i)
             if(cache->news&(1<<i))
                 cache->extcache[i].v = 0;
         cache->news = 0;
     }
+    // add/change bad regs
+    for(int i=0; i<16; ++i) {
+        if(cache->olds[i].changed) {
+            cache->extcache[i].t = cache->olds[i].single?EXT_CACHE_SS:EXT_CACHE_SD;
+        } else if(cache->olds[i].purged) {
+            cache->extcache[i].n = i;
+            cache->extcache[i].t = cache->olds[i].single?EXT_CACHE_SS:EXT_CACHE_SD;
+        }
+    }
     if(cache->stack_push) {
         // unpush
         for(int j=0; j<24; ++j) {
@@ -484,15 +491,22 @@ const char* getCacheName(int t, int n)
     return buff;
 }
 
-void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name)
+void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t rex)
 {
+    static const char* fnames[] = { // printable names for FP registers 0..31, indexed with EXTREG(ii) in the dump below
+        "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7",
+        "fs0", "fs1",
+        "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7",
+        "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fs8", "fs9", "fs10", "fs11",
+        "ft8", "ft9", "ft10", "ft11"
+    };
     if(box64_dynarec_dump) {
-        printf_x64_instruction(my_context->dec, &dyn->insts[ninst].x64, name);
+        printf_x64_instruction(rex.is32bits?my_context->dec32:my_context->dec, &dyn->insts[ninst].x64, name);
         dynarec_log(LOG_NONE, "%s%p: %d emitted opcodes, inst=%d, barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X, sm=%d/%d",
             (box64_dynarec_dump>1)?"\e[32m":"",
             (void*)(dyn->native_start+dyn->insts[ninst].address),
             dyn->insts[ninst].size/4,
-            ninst,         
+            ninst,
             dyn->insts[ninst].x64.barrier,
             dyn->insts[ninst].x64.state_flags,
             dyn->f.pending,
@@ -517,12 +531,12 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name)
             dynarec_log(LOG_NONE, ", last_ip=%p", (void*)dyn->last_ip);
         for(int ii=0; ii<24; ++ii) {
             switch(dyn->insts[ninst].e.extcache[ii].t) {
-                case EXT_CACHE_ST_D: dynarec_log(LOG_NONE, " D%d:%s", EXTREG(ii), getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
-                case EXT_CACHE_ST_F: dynarec_log(LOG_NONE, " S%d:%s", EXTREG(ii), getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
-                case EXT_CACHE_MM: dynarec_log(LOG_NONE, " D%d:%s", EXTREG(ii), getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
-                case EXT_CACHE_SS: dynarec_log(LOG_NONE, " S%d:%s", EXTREG(ii), getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
-                case EXT_CACHE_SD: dynarec_log(LOG_NONE, " D%d:%s", EXTREG(ii), getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
-                case EXT_CACHE_SCR: dynarec_log(LOG_NONE, " D%d:%s", EXTREG(ii), getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
+                case EXT_CACHE_ST_D: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
+                case EXT_CACHE_ST_F: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
+                case EXT_CACHE_MM: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
+                case EXT_CACHE_SS: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
+                case EXT_CACHE_SD: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
+                case EXT_CACHE_SCR: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_NONE:
                 default:    break;
             }
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.h b/src/dynarec/rv64/dynarec_rv64_functions.h
index fc53dcd7..451336bd 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.h
+++ b/src/dynarec/rv64/dynarec_rv64_functions.h
@@ -45,7 +45,7 @@ void extcacheUnwind(extcache_t* cache);
 
 const char* getCacheName(int t, int n);
 
-void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name);
+void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t rex);
 void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode);
 void print_newinst(dynarec_native_t* dyn, int ninst);
 
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c
index 37bcec29..a005c3b9 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.c
+++ b/src/dynarec/rv64/dynarec_rv64_helper.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 #include <assert.h>
 #include <string.h>
@@ -20,7 +19,6 @@
 #include "x64trace.h"
 #include "dynarec_native.h"
 #include "../dynablock_private.h"
-#include "../tools/bridge_private.h"
 #include "custommem.h"
 
 #include "rv64_printer.h"
@@ -28,11 +26,16 @@
 #include "dynarec_rv64_functions.h"
 #include "dynarec_rv64_helper.h"
 
+static uintptr_t geted_32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, int *l, int i12);
+
 /* setup r2 to address pointed by ED, also fixaddress is an optionnal delta in the range [-absmax, +absmax], with delta&mask==0 to be added to ed for LDR/STR */
 uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int *l, int i12, int delta)
 {
     MAYUSE(dyn); MAYUSE(ninst); MAYUSE(delta);
 
+    if(rex.is32bits)
+        return geted_32(dyn, addr, ninst, nextop, ed, hint, scratch, fixaddress, l, i12);
+
     int lock = l?((l==LOCK_LOCK)?1:2):0;
     if(lock==2)
         *l = 0;
@@ -47,14 +50,19 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
         if((nextop&7)==4) {
             uint8_t sib = F8;
             int sib_reg = ((sib>>3)&7)+(rex.x<<3);
+            int sib_reg2 = (sib&0x7)+(rex.b<<3);
             if((sib&0x7)==5) {
                 int64_t tmp = F32S;
                 if (sib_reg!=4) {
                     if(tmp && ((tmp<-2048) || (tmp>maxval) || !i12)) {
                         MOV64x(scratch, tmp);
                         if((sib>>6)) {
-                            SLLI(ret, xRAX+sib_reg, (sib>>6));
-                            ADD(ret, ret, scratch);
+                            if(rv64_zba) {
+                                SHxADD(ret, xRAX+sib_reg, sib>>6, scratch);
+                            } else {
+                                SLLI(ret, xRAX+sib_reg, (sib>>6));
+                                ADD(ret, ret, scratch);
+                            }
                         } else {
                             ADD(ret, xRAX+sib_reg, scratch);
                         }
@@ -75,13 +83,17 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
             } else {
                 if (sib_reg!=4) {
                     if(sib>>6) {
-                        SLLI(scratch, xRAX+sib_reg, (sib>>6));
-                        ADD(ret, xRAX+(sib&0x7)+(rex.b<<3), scratch);
+                        if(rv64_zba) {
+                            SHxADD(ret, xRAX+sib_reg, sib>>6, xRAX+sib_reg2);
+                        } else {
+                            SLLI(scratch, xRAX+sib_reg, (sib>>6));
+                            ADD(ret, xRAX+sib_reg2, scratch);
+                        }
                     } else {
-                        ADD(ret, xRAX+(sib&0x7)+(rex.b<<3), xRAX+sib_reg);
+                        ADD(ret, xRAX+sib_reg2, xRAX+sib_reg);
                     }
                 } else {
-                    ret = xRAX+(sib&0x7)+(rex.b<<3);
+                    ret = xRAX+sib_reg2;
                 }
             }
         } else if((nextop&7)==5) {
@@ -125,6 +137,7 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
             sib = F8;
             sib_reg = ((sib>>3)&7)+(rex.x<<3);
         }
+        int sib_reg2 = (sib&0x07)+(rex.b<<3);
         if(nextop&0x80)
             i64 = F32S;
         else
@@ -134,13 +147,17 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
             if((nextop&7)==4) {
                 if (sib_reg!=4) {
                     if(sib>>6) {
-                        SLLI(scratch, xRAX+sib_reg, (sib>>6));
-                        ADD(ret, xRAX+(sib&0x07)+(rex.b<<3), scratch);
+                        if(rv64_zba) {
+                            SHxADD(ret, xRAX+sib_reg, sib>>6, xRAX+sib_reg2);
+                        } else {
+                            SLLI(scratch, xRAX+sib_reg, (sib>>6));
+                            ADD(ret, xRAX+sib_reg2, scratch);
+                        }
                     } else {
-                        ADD(ret, xRAX+(sib&0x07)+(rex.b<<3), xRAX+sib_reg);
+                        ADD(ret, xRAX+sib_reg2, xRAX+sib_reg);
                     }
                 } else {
-                    ret = xRAX+(sib&0x07)+(rex.b<<3);
+                    ret = xRAX+sib_reg2;
                 }
             } else
                 ret = xRAX+(nextop&0x07)+(rex.b<<3);
@@ -149,13 +166,17 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
                 if((nextop&7)==4) {
                     if (sib_reg!=4) {
                         if(sib>>6) {
-                            SLLI(scratch, xRAX+sib_reg, (sib>>6));
-                            ADD(scratch, xRAX+(sib&0x07)+(rex.b<<3), scratch);
+                            if(rv64_zba) {
+                                SHxADD(scratch, xRAX+sib_reg, sib>>6, xRAX+sib_reg2);
+                            } else {
+                                SLLI(scratch, xRAX+sib_reg, (sib>>6));
+                                ADD(scratch, xRAX+sib_reg2, scratch);
+                            }
                         } else {
-                            ADD(scratch, xRAX+(sib&0x07)+(rex.b<<3), xRAX+sib_reg);
+                            ADD(scratch, xRAX+sib_reg2, xRAX+sib_reg);
                         }
                     } else {
-                        scratch = xRAX+(sib&0x07)+(rex.b<<3);
+                        scratch = xRAX+sib_reg2;
                     }
                 } else
                     scratch = xRAX+(nextop&0x07)+(rex.b<<3);
@@ -164,15 +185,19 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
                 MOV64x(scratch, i64);
                 if((nextop&7)==4) {
                     if (sib_reg!=4) {
-                        ADD(scratch, scratch, xRAX+(sib&0x07)+(rex.b<<3));
+                        ADD(scratch, scratch, xRAX+sib_reg2);
                         if(sib>>6) {
-                            SLLI(ret, xRAX+sib_reg, (sib>>6));
-                            ADD(ret, scratch, ret);
+                            if(rv64_zba) {
+                                SHxADD(ret, xRAX+sib_reg, sib>>6, scratch);
+                            } else {
+                                SLLI(ret, xRAX+sib_reg, (sib>>6));
+                                ADD(ret, scratch, ret);
+                            }
                         } else {
                             ADD(ret, scratch, xRAX+sib_reg);
                         }
                     } else {
-                        PASS3(int tmp = xRAX+(sib&0x07)+(rex.b<<3));
+                        PASS3(int tmp = xRAX+sib_reg2);
                         ADD(ret, tmp, scratch);
                     }
                 } else {
@@ -186,6 +211,269 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
     return addr;
 }
 
+static uintptr_t geted_32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, int *l, int i12) // variant of geted() taken when rex.is32bits: no rex argument, register fields are used as plain 3-bit numbers
+{
+    MAYUSE(dyn); MAYUSE(ninst);
+
+    int lock = l?((l==LOCK_LOCK)?1:2):0; // 0: l is NULL, 1: l equals LOCK_LOCK, 2: any other non-NULL l
+    if(lock==2)
+        *l = 0; // default answer; switched to 1 below when isLockAddress() matches
+    uint8_t ret = x2; // register reported through *ed unless hint or a guest register replaces it
+    *fixaddress = 0;
+    if(hint>0) ret = hint;
+    int maxval = 2047;
+    if(i12>1)
+        maxval -= i12; // i12>1 lowers the largest displacement that may be left in *fixaddress
+    MAYUSE(scratch);
+    if(!(nextop&0xC0)) { // top two bits of nextop are zero (the else branch below always fetches a displacement)
+        if((nextop&7)==4) { // low three bits == 4: one more byte (sib) is fetched
+            uint8_t sib = F8;
+            int sib_reg = (sib>>3)&0x7; // register scaled by sib>>6; value 4 means "no such register" in the tests below
+            int sib_reg2 = sib&0x7; // register added unscaled
+            if(sib_reg2==5) { // value 5: a signed 32-bit constant is fetched and used instead of register sib_reg2
+                int64_t tmp = F32S;
+                if (sib_reg!=4) {
+                    if(tmp && ((tmp<-2048) || (tmp>maxval) || !i12)) { // constant cannot be left in *fixaddress: load it in scratch and add it here
+                        MOV32w(scratch, tmp);
+                        if((sib>>6)) {
+                            if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), scratch); else {SLLI(ret, xRAX+sib_reg, sib>>6); ADDW(ret, ret, scratch);}
+                        } else
+                            ADDW(ret, xRAX+sib_reg, scratch);
+                    } else { // constant is zero, or within [-2048, maxval] with i12 set: hand it back in *fixaddress
+                        if(sib>>6)
+                            SLLI(ret, xRAX+sib_reg, (sib>>6));
+                        else
+                            ret = xRAX+sib_reg; // no scaling needed: report the guest register itself
+                        *fixaddress = tmp;
+                    }
+                } else { // no scaled register: the constant alone is the address
+                    switch(lock) {
+                        case 1: addLockAddress((int32_t)tmp); break;
+                        case 2: if(isLockAddress((int32_t)tmp)) *l=1; break;
+                    }
+                    MOV32w(ret, tmp);
+                }
+            } else {
+                if (sib_reg!=4) {
+                    if((sib>>6)) {
+                        if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), xRAX+sib_reg2); else { SLLI(ret, xRAX+sib_reg, (sib>>6)); ADDW(ret, ret, xRAX+sib_reg2);}
+                    } else
+                        ADDW(ret, xRAX+sib_reg2, xRAX+sib_reg);
+                } else {
+                    ret = xRAX+sib_reg2; // single register: report it directly, nothing emitted
+                }
+            }
+        } else if((nextop&7)==5) { // low three bits == 5: the address is a fetched unsigned 32-bit constant
+            uint32_t tmp = F32;
+            MOV32w(ret, tmp);
+            switch(lock) {
+                case 1: addLockAddress(tmp); break;
+                case 2: if(isLockAddress(tmp)) *l=1; break;
+            }
+        } else {
+            ret = xRAX+(nextop&7); // address is held in the guest register selected by the low three bits
+            if(ret==hint) {
+                AND(hint, ret, xMASK);    //to clear upper part
+            }
+        }
+    } else {
+        int64_t i32; // displacement fetched below (sign-extended)
+        uint8_t sib = 0;
+        int sib_reg = 0;
+        if((nextop&7)==4) {
+            sib = F8;
+            sib_reg = (sib>>3)&7;
+        }
+        int sib_reg2 = sib&0x07;
+        if(nextop&0x80)
+            i32 = F32S; // bit 7 set: 32-bit displacement
+        else
+            i32 = F8S; // otherwise: 8-bit displacement
+        if(i32==0 || ((i32>=-2048) && (i32<=2047)  && i12)) { // zero, or fits 12 bits and i12 is set: leave the displacement in *fixaddress
+            *fixaddress = i32;
+            if((nextop&7)==4) {
+                if (sib_reg!=4) {
+                    if(sib>>6) {
+                    if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), xRAX+sib_reg2); else {SLLI(ret, xRAX+sib_reg, (sib>>6)); ADDW(ret, ret, xRAX+sib_reg2);}
+                    } else
+                        ADDW(ret, xRAX+sib_reg2, xRAX+sib_reg);
+                } else {
+                    ret = xRAX+sib_reg2;
+                }
+            } else {
+                ret = xRAX+(nextop&0x07);
+            }
+        } else {
+            if(i32>=-2048 && i32<=2047) { // fits the ADDIW immediate: build the register part, then add the displacement
+                if((nextop&7)==4) {
+                    if (sib_reg!=4) {
+                        if(sib>>6) {
+                            if(rv64_zba) SHxADDUW(scratch, xRAX+sib_reg, (sib>>6), xRAX+sib_reg2); else {SLLI(scratch, xRAX+sib_reg, sib>>6); ADDW(scratch, scratch, xRAX+sib_reg2);}
+                        } else
+                            ADDW(scratch, xRAX+sib_reg2, xRAX+sib_reg);
+                    } else {
+                        scratch = xRAX+sib_reg2; // reuse the guest register as ADDIW source, nothing emitted
+                    }
+                } else
+                    scratch = xRAX+(nextop&0x07);
+                ADDIW(ret, scratch, i32);
+            } else { // too large for an immediate: load the displacement in scratch first
+                MOV32w(scratch, i32);
+                if((nextop&7)==4) {
+                    if (sib_reg!=4) {
+                        ADDW(scratch, scratch, xRAX+sib_reg2);
+                        if(sib>>6) {
+                            if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), scratch); else {SLLI(ret, xRAX+sib_reg, (sib>>6)); ADDW(ret, ret, scratch);}
+                        } else
+                            ADDW(ret, scratch, xRAX+sib_reg);
+                    } else {
+                        PASS3(int tmp = xRAX+sib_reg2);
+                        ADDW(ret, tmp, scratch);
+                    }
+                } else {
+                    PASS3(int tmp = xRAX+(nextop&0x07));
+                    ADDW(ret, tmp, scratch);
+                }
+            }
+        }
+    }
+    *ed = ret; // tell the caller which register holds the address
+    return addr; // NOTE(review): presumably advanced by the F8/F32/F32S fetch macros; their definitions are not visible here
+}
+
+/* setup r2 to address pointed by ED, also fixaddress is an optional delta in the range [-absmax, +absmax], with delta&mask==0 to be added to ed for LDR/STR; this variant builds the address with 32-bit adds (ADDW/ADDIW/SHxADDUW) */
+uintptr_t geted32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int *l, int i12, int delta)
+{
+    MAYUSE(dyn); MAYUSE(ninst); MAYUSE(delta);
+
+    int lock = l?((l==LOCK_LOCK)?1:2):0; // 0: l is NULL, 1: l equals LOCK_LOCK, 2: any other non-NULL l
+    if(lock==2)
+        *l = 0; // default answer; switched to 1 below when isLockAddress() matches
+    uint8_t ret = x2; // register reported through *ed unless hint or a guest register replaces it
+    *fixaddress = 0;
+    if(hint>0) ret = hint;
+    int maxval = 2047;
+    if(i12>1)
+        maxval -= i12; // i12>1 lowers the largest displacement that may be left in *fixaddress
+    MAYUSE(scratch);
+    if(!(nextop&0xC0)) { // top two bits of nextop are zero (the else branch below always fetches a displacement)
+        if((nextop&7)==4) { // low three bits == 4: one more byte (sib) is fetched
+            uint8_t sib = F8;
+            int sib_reg = ((sib>>3)&0x7)+(rex.x<<3); // register scaled by sib>>6, rex.x supplies bit 3; value 4 means "no such register" below
+            int sib_reg2 = (sib&0x7)+(rex.b<<3); // register added unscaled, rex.b supplies bit 3
+            if((sib&0x7)==5) { // raw field 5 (rex.b ignored): a signed 32-bit constant is fetched and used instead of register sib_reg2
+                int64_t tmp = F32S;
+                if (sib_reg!=4) {
+                    if(tmp && ((tmp<-2048) || (tmp>maxval) || !i12)) { // constant cannot be left in *fixaddress: load it in scratch and add it here
+                        MOV64x(scratch, tmp);
+                        if((sib>>6)) {
+                            if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), scratch); else {SLLI(ret, xRAX+sib_reg, sib>>6); ADDW(ret, ret, scratch);}
+                        } else
+                            ADDW(ret, xRAX+sib_reg, scratch);
+                    } else { // constant is zero, or within [-2048, maxval] with i12 set: hand it back in *fixaddress
+                        if(sib>>6)
+                            SLLI(ret, xRAX+sib_reg, (sib>>6));
+                        else
+                            ret = xRAX+sib_reg; // no scaling needed: report the guest register itself
+                        *fixaddress = tmp;
+                    }
+                } else { // no scaled register: the constant alone is the address
+                    switch(lock) {
+                        case 1: addLockAddress(tmp); break;
+                        case 2: if(isLockAddress(tmp)) *l=1; break;
+                    }
+                    MOV64x(ret, tmp);
+                }
+            } else {
+                if (sib_reg!=4) {
+                    if((sib>>6)) {
+                        if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), xRAX+sib_reg2); else { SLLI(ret, xRAX+sib_reg, (sib>>6)); ADDW(ret, ret, xRAX+sib_reg2);}
+                    } else
+                        ADDW(ret, xRAX+sib_reg2, xRAX+sib_reg);
+                } else {
+                    ret = xRAX+sib_reg2; // single register: report it directly, nothing emitted
+                }
+            }
+        } else if((nextop&7)==5) { // low three bits == 5: fetched 32-bit constant added to xRIP after GETIP(addr+delta)
+            uint32_t tmp = F32; // NOTE(review): fetched unsigned here, so addr+delta+tmp below is not sign-extended — confirm intended
+            MOV32w(ret, tmp);
+            GETIP(addr+delta);
+            ADDW(ret, ret, xRIP);
+            switch(lock) {
+                case 1: addLockAddress(addr+delta+tmp); break;
+                case 2: if(isLockAddress(addr+delta+tmp)) *l=1; break;
+            }
+        } else {
+            ret = xRAX+(nextop&7)+(rex.b<<3); // address is held in the guest register selected by the low three bits plus rex.b
+            if(ret==hint) {
+                AND(hint, ret, xMASK);    //to clear upper part
+            }
+        }
+    } else {
+        int64_t i64; // displacement fetched below (sign-extended)
+        uint8_t sib = 0;
+        int sib_reg = 0;
+        if((nextop&7)==4) {
+            sib = F8;
+            sib_reg = ((sib>>3)&7)+(rex.x<<3);
+        }
+        int sib_reg2 = (sib&0x07)+(rex.b<<3);
+        if(nextop&0x80)
+            i64 = F32S; // bit 7 set: 32-bit displacement
+        else
+            i64 = F8S; // otherwise: 8-bit displacement
+        if(i64==0 || ((i64>=-2048) && (i64<=2047)  && i12)) { // zero, or fits 12 bits and i12 is set: leave the displacement in *fixaddress
+            *fixaddress = i64;
+            if((nextop&7)==4) {
+                if (sib_reg!=4) {
+                    if(sib>>6) {
+                    if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), xRAX+sib_reg2); else {SLLI(ret, xRAX+sib_reg, (sib>>6)); ADDW(ret, ret, xRAX+sib_reg2);}
+                    } else
+                        ADDW(ret, xRAX+sib_reg2, xRAX+sib_reg);
+                } else {
+                    ret = xRAX+sib_reg2;
+                }
+            } else {
+                ret = xRAX+(nextop&0x07)+(rex.b<<3);
+            }
+        } else {
+            if(i64>=-2048 && i64<=2047) { // fits the ADDIW immediate: build the register part, then add the displacement
+                if((nextop&7)==4) {
+                    if (sib_reg!=4) {
+                        if(sib>>6) {
+                            if(rv64_zba) SHxADDUW(scratch, xRAX+sib_reg, (sib>>6), xRAX+sib_reg2); else {SLLI(scratch, xRAX+sib_reg, sib>>6); ADDW(scratch, scratch, xRAX+sib_reg2);}
+                        } else
+                            ADDW(scratch, xRAX+sib_reg2, xRAX+sib_reg);
+                    } else {
+                        scratch = xRAX+sib_reg2; // reuse the guest register as ADDIW source, nothing emitted
+                    }
+                } else
+                    scratch = xRAX+(nextop&0x07)+(rex.b<<3);
+                ADDIW(ret, scratch, i64);
+            } else { // too large for an immediate: load the displacement in scratch first
+                MOV32w(scratch, i64);
+                if((nextop&7)==4) {
+                    if (sib_reg!=4) {
+                        ADDW(scratch, scratch, xRAX+sib_reg2);
+                        if(sib>>6) {
+                            if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), scratch); else {SLLI(ret, xRAX+sib_reg, (sib>>6)); ADDW(ret, ret, scratch);}
+                        } else
+                            ADDW(ret, scratch, xRAX+sib_reg);
+                    } else {
+                        PASS3(int tmp = xRAX+sib_reg2);
+                        ADDW(ret, tmp, scratch);
+                    }
+                } else {
+                    PASS3(int tmp = xRAX+(nextop&0x07)+(rex.b<<3));
+                    ADDW(ret, tmp, scratch);
+                }
+            }
+        }
+    }
+    *ed = ret; // tell the caller which register holds the address
+    return addr; // NOTE(review): presumably advanced by the F8/F32/F32S fetch macros; their definitions are not visible here
+}
+
 void jump_to_epilog(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst)
 {
     MAYUSE(dyn); MAYUSE(ip); MAYUSE(ninst);
@@ -233,8 +521,7 @@ void jump_to_next(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst)
         MAYUSE(tbl);
         TABLE64(x3, tbl);
         SRLI(x2, xRIP, JMPTABL_START3);
-        SLLI(x2, x2, 3);
-        ADD(x3, x3, x2);
+        if(rv64_zba) SH3ADD(x3, x2, x3); else {SLLI(x2, x2, 3); ADD(x3, x3, x2);}
         LD(x3, x3, 0); // could be LR_D(x3, x3, 1, 1); for better safety
         MOV64x(x4, JMPTABLE_MASK2<<3);    // x4 = mask
         SRLI(x2, xRIP, JMPTABL_START2-3);
@@ -256,8 +543,7 @@ void jump_to_next(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst)
             }
             AND(x2, xRIP, x4);
         }
-        SLLI(x2, x2, 3);
-        ADD(x3, x3, x2);
+        if(rv64_zba) SH3ADD(x3, x2, x3); else {SLLI(x2, x2, 3); ADD(x3, x3, x2);}
         LD(x2, x3, 0); //LR_D(x2, x3, 1, 1);
     } else {
         uintptr_t p = getJumpTableAddress64(ip);
@@ -277,12 +563,12 @@ void jump_to_next(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst)
     JALR(x2); // save LR...
 }
 
-void ret_to_epilog(dynarec_rv64_t* dyn, int ninst)
+void ret_to_epilog(dynarec_rv64_t* dyn, int ninst, rex_t rex)
 {
     MAYUSE(dyn); MAYUSE(ninst);
     MESSAGE(LOG_DUMP, "Ret to epilog\n");
-    POP1(xRIP);
-    MV(x1, xRIP);
+    POP1z(xRIP);
+    MVz(x1, xRIP);
     SMEND();
     /*if(box64_dynarec_callret) {
         // pop the actual return address from RV64 stack
@@ -297,8 +583,7 @@ void ret_to_epilog(dynarec_rv64_t* dyn, int ninst)
     uintptr_t tbl = getJumpTable64();
     MOV64x(x3, tbl);
     SRLI(x2, xRIP, JMPTABL_START3);
-    SLLI(x2, x2, 3);
-    ADD(x3, x3, x2);
+    if(rv64_zba) SH3ADD(x3, x2, x3); else {SLLI(x2, x2, 3); ADD(x3, x3, x2);}
     LD(x3, x3, 0);
     MOV64x(x4, JMPTABLE_MASK2<<3);    // x4 = mask
     SRLI(x2, xRIP, JMPTABL_START2-3);
@@ -320,25 +605,24 @@ void ret_to_epilog(dynarec_rv64_t* dyn, int ninst)
         }
         AND(x2, xRIP, x4);
     }
-    SLLI(x2, x2, 3);
-    ADD(x3, x3, x2);
+    if(rv64_zba) SH3ADD(x3, x2, x3); else {SLLI(x2, x2, 3); ADD(x3, x3, x2);}
     LD(x2, x3, 0);
     JALR(x2); // save LR
     CLEARIP();
 }
 
-void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, int n)
+void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, rex_t rex, int n)
 {
     MAYUSE(dyn); MAYUSE(ninst);
     MESSAGE(LOG_DUMP, "Retn to epilog\n");
-    POP1(xRIP);
+    POP1z(xRIP);
     if(n>0x7ff) {
         MOV64x(w1, n);
-        ADD(xRSP, xRSP, x1);
+        ADDz(xRSP, xRSP, x1);
     } else {
-        ADDI(xRSP, xRSP, n);
+        ADDIz(xRSP, xRSP, n);
     }
-    MV(x1, xRIP);
+    MVz(x1, xRIP);
     SMEND();
     /*if(box64_dynarec_callret) {
         // pop the actual return address from RV64 stack
@@ -353,8 +637,7 @@ void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, int n)
     uintptr_t tbl = getJumpTable64();
     MOV64x(x3, tbl);
     SRLI(x2, xRIP, JMPTABL_START3);
-    SLLI(x2, x2, 3);
-    ADD(x3, x3, x2);
+    if(rv64_zba) SH3ADD(x3, x2, x3); else {SLLI(x2, x2, 3); ADD(x3, x3, x2);}
     LD(x3, x3, 0);
     MOV64x(x4, JMPTABLE_MASK2<<3);    // x4 = mask
     SRLI(x2, xRIP, JMPTABL_START2-3);
@@ -376,8 +659,7 @@ void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, int n)
         }
         AND(x2, xRIP, x4);
     }
-    SLLI(x2, x2, 3);
-    ADD(x3, x3, x2);
+    if(rv64_zba) SH3ADD(x3, x2, x3); else {SLLI(x2, x2, 3); ADD(x3, x3, x2);}
     LD(x2, x3, 0);
     JALR(x2); // save LR
     CLEARIP();
@@ -388,26 +670,35 @@ void iret_to_epilog(dynarec_rv64_t* dyn, int ninst, int is64bits)
     //#warning TODO: is64bits
     MAYUSE(ninst);
     MESSAGE(LOG_DUMP, "IRet to epilog\n");
-    // POP IP
     NOTEST(x2);
-    POP1(xRIP);
-    // POP CS
-    POP1(x2);
+    if(is64bits) {
+        POP1(xRIP);
+        POP1(x2);
+        POP1(xFlags);
+    } else {
+        POP1_32(xRIP);
+        POP1_32(x2);
+        POP1_32(xFlags);
+    }
+
     SH(x2, xEmu, offsetof(x64emu_t, segs[_CS]));
-    MV(x1, xZR);
-    SD(x1, xEmu, offsetof(x64emu_t, segs_serial[_CS]));
-    SD(x1, xEmu, offsetof(x64emu_t, segs_serial[_SS]));
-    // POP EFLAGS
-    POP1(xFlags);
+    SW(xZR, xEmu, offsetof(x64emu_t, segs_serial[_CS]));
+    // clean EFLAGS
     MOV32w(x1, 0x3F7FD7);
     AND(xFlags, xFlags, x1);
     ORI(xFlags, xFlags, 0x2);
     SET_DFNONE();
     // POP RSP
-    POP1(x3);
+    if (is64bits) {
+        POP1(x3);   //rsp
+        POP1(x2);   //ss
+    } else {
+        POP1_32(x3);   //rsp
+        POP1_32(x2);   //ss
+    }
     // POP SS
-    POP1(x2);
     SH(x2, xEmu, offsetof(x64emu_t, segs[_SS]));
+    SW(xZR, xEmu, offsetof(x64emu_t, segs_serial[_SS]));
     // set new RSP
     MV(xRSP, x3);
     // Ret....
@@ -434,6 +725,7 @@ void call_c(dynarec_rv64_t* dyn, int ninst, void* fnc, int reg, int ret, int sav
         // x5..x8, x10..x17, x28..x31 those needs to be saved by caller
         STORE_REG(RAX);
         STORE_REG(RCX);
+        STORE_REG(RDX);
         STORE_REG(R12);
         STORE_REG(R13);
         STORE_REG(R14);
@@ -452,6 +744,7 @@ void call_c(dynarec_rv64_t* dyn, int ninst, void* fnc, int reg, int ret, int sav
         #define GO(A)   if(ret!=x##A) {LOAD_REG(A);}
         GO(RAX);
         GO(RCX);
+        GO(RDX);
         GO(R12);
         GO(R13);
         GO(R14);
@@ -703,16 +996,14 @@ void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, in
             for (int i=0; i<a; ++i) {
                 SUBI(s2, s2, 1);
                 ANDI(s2, s2, 7);    // (emu->top + st)&7
-                SLLI(s1, s2, 2);
-                ADD(s1, xEmu, s1);
+                if(rv64_zba) SH2ADD(s1, s2, xEmu); else {SLLI(s1, s2, 2); ADD(s1, xEmu, s1);}
                 SW(s3, s1, offsetof(x64emu_t, p_regs));
             }
         } else {
             // empty tags
             ADDI(s3, xZR, 0b11);
             for (int i=0; i<-a; ++i) {
-                SLLI(s1, s2, 2);
-                ADD(s1, xEmu, s1);
+                if(rv64_zba) SH2ADD(s1, s2, xEmu); else {SLLI(s1, s2, 2); ADD(s1, xEmu, s1);}
                 SW(s3, s1, offsetof(x64emu_t, p_regs));
                 ADDI(s2, s2, 1);
                 ANDI(s2, s2, 7);    // (emu->top + st)&7
@@ -741,8 +1032,7 @@ void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, in
                 #endif
                 ADDI(s3, s2, dyn->e.x87cache[i]);
                 ANDI(s3, s3, 7);   // (emu->top + st)&7
-                SLLI(s1, s3, 3);
-                ADD(s1, xEmu, s1);
+                if(rv64_zba) SH3ADD(s1, s3, xEmu); else {SLLI(s1, s3, 3); ADD(s1, xEmu, s1);}
                 if(next) {
                     // need to check if a ST_F need local promotion
                     if(extcache_get_st_f(dyn, ninst, dyn->e.x87cache[i])>=0) {
@@ -801,8 +1091,7 @@ static void x87_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int
         if(dyn->e.x87cache[i]!=-1) {
             ADDI(s3, s2, dyn->e.x87cache[i]);
             ANDI(s3, s3, 7);   // (emu->top + i)&7
-            SLLI(s1, s3, 3);
-            ADD(s1, xEmu, s1);
+            if(rv64_zba) SH3ADD(s1, s3, xEmu); else {SLLI(s1, s3, 3); ADD(s1, xEmu, s1);}
             if(extcache_get_st_f(dyn, ninst, dyn->e.x87cache[i])>=0) {
                 FCVTDS(SCRATCH0, dyn->e.x87reg[i]);
                 FSD(SCRATCH0, s1, offsetof(x64emu_t, x87));
@@ -834,7 +1123,7 @@ int x87_get_current_cache(dynarec_rv64_t* dyn, int ninst, int st, int t)
     for (int i=0; i<8; ++i) {
         if(dyn->e.x87cache[i]==st) {
             #if STEP == 1
-            if(t==EXT_CACHE_ST_D && (dyn->e.extcache[dyn->e.x87reg[i]].t==EXT_CACHE_ST_F))
+            if(t==EXT_CACHE_ST_D && (dyn->e.extcache[EXTIDX(dyn->e.x87reg[i])].t==EXT_CACHE_ST_F))
                 extcache_promote_double(dyn, ninst, st);
             #endif
             return i;
@@ -866,8 +1155,7 @@ int x87_get_cache(dynarec_rv64_t* dyn, int ninst, int populate, int s1, int s2,
             ADDI(s2, s2, a);
             ANDI(s2, s2, 7);
         }
-        SLLI(s2, s2, 3);
-        ADD(s1, xEmu, s2);
+        if(rv64_zba) SH3ADD(s1, s2, xEmu); else  {SLLI(s2, s2, 3); ADD(s1, xEmu, s2);}
         FLD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87));
     }
     MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st);
@@ -912,7 +1200,7 @@ void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)
         ANDI(s2, s2, 7);    // (emu->top + i)&7
     }
     ADD(s1, xEmu, s2);
-    if(dyn->e.extcache[dyn->e.x87reg[ret]].t==EXT_CACHE_ST_F) {
+    if(dyn->e.extcache[EXTIDX(dyn->e.x87reg[ret])].t==EXT_CACHE_ST_F) {
         FCVTDS(SCRATCH0, dyn->e.x87reg[ret]);
         FSD(SCRATCH0, s1, offsetof(x64emu_t, x87));
     } else {
@@ -932,23 +1220,24 @@ void x87_forget(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)
         return;
     MESSAGE(LOG_DUMP, "\tForget x87 Cache for ST%d\n", st);
     #if STEP == 1
-    if(dyn->e.extcache[dyn->e.x87reg[ret]].t==EXT_CACHE_ST_F)
+    if(dyn->e.extcache[EXTIDX(dyn->e.x87reg[ret])].t==EXT_CACHE_ST_F)
         extcache_promote_double(dyn, ninst, st);
     #endif
     // prepare offset to fpu => s1
     // Get top
     LW(s2, xEmu, offsetof(x64emu_t, top));
     // Update
-    if(st) {
-        ADDI(s2, s2, st);
+    int a = st - dyn->e.x87stack;
+    if(a) {
+        ADDI(s2, s2, a);
         ANDI(s2, s2, 7);    // (emu->top + i)&7
     }
-    ADD(s1, xEmu, s2);
+    if(rv64_zba) SH3ADD(s1, s2, xEmu); else {SLLI(s2, s2, 3); ADD(s1, xEmu, s2);}
     FSD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87));
     MESSAGE(LOG_DUMP, "\t--------x87 Cache for ST%d\n", st);
     // and forget that cache
     fpu_free_reg(dyn, dyn->e.x87reg[ret]);
-    dyn->e.extcache[dyn->e.x87reg[ret]].v = 0;
+    dyn->e.extcache[EXTIDX(dyn->e.x87reg[ret])].v = 0;
     dyn->e.x87cache[ret] = -1;
     dyn->e.x87reg[ret] = -1;
 }
@@ -963,15 +1252,16 @@ void x87_reget_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)
             // refresh the value
             MESSAGE(LOG_DUMP, "\tRefresh x87 Cache for ST%d\n", st);
             #if STEP == 1
-            if(dyn->e.extcache[dyn->e.x87reg[i]].t==EXT_CACHE_ST_F)
+            if(dyn->e.extcache[EXTIDX(dyn->e.x87reg[i])].t==EXT_CACHE_ST_F)
                 extcache_promote_double(dyn, ninst, st);
             #endif
             LW(s2, xEmu, offsetof(x64emu_t, top));
             int a = st - dyn->e.x87stack;
-            ADDI(s2, s2, a);
-            AND(s2, s2, 7);
-            SLLI(s2, s2, 3);
-            ADD(s1, xEmu, s2);
+            if(a) {
+                ADDI(s2, s2, a);
+                ANDI(s2, s2, 7);    // (emu->top + a)&7, immediate mask (AND would use register 7 as operand)
+            }
+            if(rv64_zba) SH3ADD(s1, s2, xEmu); else {SLLI(s2, s2, 3); ADD(s1, xEmu, s2);}
             FLD(dyn->e.x87reg[i], s1, offsetof(x64emu_t, x87));
             MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st);
             // ok
@@ -991,8 +1281,7 @@ void x87_reget_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)
     int a = st - dyn->e.x87stack;
     ADDI(s2, s2, a);
     ANDI(s2, s2, 7);    // (emu->top + i)&7
-    SLLI(s2, s2, 3);
-    ADD(s1, xEmu, s2);
+    if(rv64_zba) SH3ADD(s1, s2, xEmu); else {SLLI(s2, s2, 3); ADD(s1, xEmu, s2);}
     FLD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87));
     MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st);
 }
@@ -1084,6 +1373,16 @@ static int isx87Empty(dynarec_rv64_t* dyn)
     return 1;
 }
 
+// forget ext register for a MMX reg: store it to emu->mmx[a], free it and drop the cache entry; does nothing if the reg is not loaded
+void mmx_forget_reg(dynarec_rv64_t* dyn, int ninst, int a)
+{
+    if (dyn->e.mmxcache[a] == -1)
+        return;
+    FSD(dyn->e.mmxcache[a], xEmu, offsetof(x64emu_t, mmx[a]));
+    fpu_free_reg(dyn, dyn->e.mmxcache[a]);
+    dyn->e.mmxcache[a] = -1;    // mark as not loaded (same -1 value the guard above tests) now that the register is freed
+}
+
 // get neon register for a MMX reg, create the entry if needed
 int mmx_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a)
 {
@@ -1153,6 +1452,10 @@ int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
         // forget / reload if change of size
         if(dyn->e.ssecache[a].single!=single) {
             sse_forget_reg(dyn, ninst, a);
+            // update olds after the forget...
+            dyn->e.olds[a].changed = 1;
+            dyn->e.olds[a].purged = 0;
+            dyn->e.olds[a].single = 1-single;
             return sse_get_reg(dyn, ninst, s1, a, single);
         }
         return dyn->e.ssecache[a].reg;
@@ -1176,6 +1479,10 @@ int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
             // need to wipe the half high 32bits of old Double because we now have a single
             //SW(xZR, xEmu, offsetof(x64emu_t, xmm[a])+4);
         }
+        dyn->e.olds[a].changed = 1;
+        dyn->e.olds[a].purged = 0;
+        dyn->e.olds[a].reg = EXTIDX(dyn->e.ssecache[a].reg);
+        dyn->e.olds[a].single = 1-single;
         dyn->e.ssecache[a].single = single;
         dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t = single?EXT_CACHE_SS:EXT_CACHE_SD;
         return dyn->e.ssecache[a].reg;
@@ -1194,6 +1501,10 @@ void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a)
     else
         FSD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
     fpu_free_reg(dyn, dyn->e.ssecache[a].reg);
+    dyn->e.olds[a].changed = 0;
+    dyn->e.olds[a].purged = 1;
+    dyn->e.olds[a].reg = dyn->e.ssecache[a].reg;
+    dyn->e.olds[a].single = dyn->e.ssecache[a].single;
     dyn->e.ssecache[a].v = -1;
     return;
 }
@@ -1235,6 +1546,10 @@ static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1)
                 FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             if(!next) {
                 fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
+                dyn->e.olds[i].changed = 0;
+                dyn->e.olds[i].purged = 1;
+                dyn->e.olds[i].reg = dyn->e.ssecache[i].reg;
+                dyn->e.olds[i].single = dyn->e.ssecache[i].single;
                 dyn->e.ssecache[i].v = -1;
             }
         }
@@ -1286,8 +1601,8 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
         for(int i=17; i<24; ++i)
             if(dyn->e.extcache[i].v!=0) {
                 switch(dyn->e.extcache[i].t) {
-                    case EXT_CACHE_ST_F: 
-                    case EXT_CACHE_SS: 
+                    case EXT_CACHE_ST_F:
+                    case EXT_CACHE_SS:
                         FSW(EXTREG(i), xSP, p*8);
                         break;
                     default:
@@ -1328,8 +1643,8 @@ void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
         for(int i=17; i<24; ++i)
             if(dyn->e.extcache[i].v!=0) {
                 switch(dyn->e.extcache[i].t) {
-                    case EXT_CACHE_ST_F: 
-                    case EXT_CACHE_SS: 
+                    case EXT_CACHE_ST_F:
+                    case EXT_CACHE_SS:
                         FLW(EXTREG(i), xSP, p*8);
                         break;
                     default:
@@ -1387,7 +1702,7 @@ static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t *
     int j_single = 0;
     if(cache->extcache[j].t==EXT_CACHE_SS || cache->extcache[j].t==EXT_CACHE_ST_F)
         j_single =1;
-    
+
     if(!cache->extcache[i].v) {
         // a mov is enough, no need to swap
         MESSAGE(LOG_DUMP, "\t  - Moving %d <- %d\n", i, j);
@@ -1454,12 +1769,12 @@ static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int
             FLD(reg, xEmu, offsetof(x64emu_t, xmm[n]));
             break;
         case EXT_CACHE_MM:
-            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));                    
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
             FLD(reg, xEmu, offsetof(x64emu_t, mmx[i]));
             break;
         case EXT_CACHE_ST_D:
         case EXT_CACHE_ST_F:
-            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));                    
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
             if((*s3_top) == 0xffff) {
                 LW(s3, xEmu, offsetof(x64emu_t, top));
                 *s3_top = 0;
@@ -1471,18 +1786,17 @@ static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int
             }
             *s3_top += a;
             *s2_val = 0;
-            SLLI(s2, s3, 3);
-            ADD(s2, xEmu, s2);
+            if(rv64_zba) SH3ADD(s2, s3, xEmu); else {SLLI(s2, s3, 3); ADD(s2, xEmu, s2);}
             FLD(reg, s2, offsetof(x64emu_t, x87));
             if(t==EXT_CACHE_ST_F) {
                 FCVTSD(reg, reg);
             }
-            break;                    
+            break;
         case EXT_CACHE_NONE:
         case EXT_CACHE_SCR:
         default:    /* nothing done */
             MESSAGE(LOG_DUMP, "\t  - ignoring %s\n", getCacheName(t, n));
-            break; 
+            break;
     }
     cache->extcache[i].n = n;
     cache->extcache[i].t = t;
@@ -1501,12 +1815,12 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i
             FSD(reg, xEmu, offsetof(x64emu_t, xmm[n]));
             break;
         case EXT_CACHE_MM:
-            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));                    
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
             FSD(reg, xEmu, offsetof(x64emu_t, mmx[n]));
             break;
         case EXT_CACHE_ST_D:
         case EXT_CACHE_ST_F:
-            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));                    
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
             if((*s3_top)==0xffff) {
                 LW(s3, xEmu, offsetof(x64emu_t, top));
                 *s3_top = 0;
@@ -1517,19 +1831,18 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i
                 ANDI(s3, s3, 7);
             }
             *s3_top += a;
-            SLLI(s2, s3, 3);
-            ADD(s2, xEmu, s2);
+            if(rv64_zba) SH3ADD(s2, s3, xEmu); else {SLLI(s2, s3, 3); ADD(s2, xEmu, s2);}
             *s2_val = 0;
             if(t==EXT_CACHE_ST_F) {
                 FCVTDS(reg, reg);
             }
             FSD(reg, s2, offsetof(x64emu_t, x87));
-            break;                    
+            break;
         case EXT_CACHE_NONE:
         case EXT_CACHE_SCR:
         default:    /* nothing done */
             MESSAGE(LOG_DUMP, "\t  - ignoring %s\n", getCacheName(t, n));
-            break; 
+            break;
     }
     cache->extcache[i].v = 0;
 }
@@ -1678,18 +1991,18 @@ static void flagsCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1)
     int go = 0;
     switch (dyn->insts[jmp].f_entry.pending) {
         case SF_UNKNOWN: break;
-        case SF_SET: 
-            if(dyn->f.pending!=SF_SET && dyn->f.pending!=SF_SET_PENDING) 
-                go = 1; 
+        case SF_SET:
+            if(dyn->f.pending!=SF_SET && dyn->f.pending!=SF_SET_PENDING)
+                go = 1;
             break;
         case SF_SET_PENDING:
-            if(dyn->f.pending!=SF_SET 
+            if(dyn->f.pending!=SF_SET
             && dyn->f.pending!=SF_SET_PENDING
-            && dyn->f.pending!=SF_PENDING) 
-                go = 1; 
+            && dyn->f.pending!=SF_PENDING)
+                go = 1;
             break;
         case SF_PENDING:
-            if(dyn->f.pending!=SF_SET 
+            if(dyn->f.pending!=SF_SET
             && dyn->f.pending!=SF_SET_PENDING
             && dyn->f.pending!=SF_PENDING)
                 go = 1;
@@ -1702,11 +2015,11 @@ static void flagsCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1)
     if(go) {
         if(dyn->f.pending!=SF_PENDING) {
             LW(s1, xEmu, offsetof(x64emu_t, df));
-            j64 = (GETMARK3)-(dyn->native_size);
+            j64 = (GETMARKF2)-(dyn->native_size);
             BEQZ(s1, j64);
         }
         CALL_(UpdateFlags, -1, 0);
-        MARK3;
+        MARKF2;
     }
 #endif
 }
@@ -1734,7 +2047,7 @@ void rv64_move32(dynarec_rv64_t* dyn, int ninst, int reg, int32_t val, int zerou
         LUI(reg, hi20);
         src = reg;
     }
-    if (lo12 || !hi20) ADDI(reg, src, lo12);
+    if (lo12 || !hi20) ADDIW(reg, src, lo12);
     if((zeroup && ((hi20&0x80000) || (!hi20 && (lo12&0x800)))
     || (!zeroup && !(val&0x80000000) && ((hi20&0x80000) || (!hi20 && (lo12&0x800)))))) {
         ZEROUP(reg);
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index b12ee96b..0b1023b3 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -99,6 +99,25 @@
                     LD(x1, wback, fixedaddress);        \
                     ed = x1;                            \
                 }
+#define GETEDz(D) if(MODREG) {                          \
+                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
+                    wback = 0;                          \
+                } else {                                \
+                    SMREAD()                            \
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
+                    LDz(x1, wback, fixedaddress);       \
+                    ed = x1;                            \
+                }
+// GETED32 can use r1 for ed, and r2 for wback. wback is 0 if ed is xEAX..xEDI
+#define GETED32(D)  if(MODREG) {                        \
+                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
+                    wback = 0;                          \
+                } else {                                \
+                    SMREAD()                            \
+                    addr = geted32(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
+                    LDxw(x1, wback, fixedaddress);      \
+                    ed = x1;                            \
+                }
 //GETEDH can use hint for ed, and x1 or x2 for wback (depending on hint), might also use x3. wback is 0 if ed is xEAX..xEDI
 #define GETEDH(hint, D) if(MODREG) {                    \
                     ed = xRAX+(nextop&7)+(rex.b<<3);    \
@@ -109,13 +128,23 @@
                     LDxw(hint, wback, fixedaddress);    \
                     ed = hint;                          \
                 }
+//GETEDW can use hint for wback and ret for ed. wback is 0 if ed is xEAX..xEDI
+#define GETEDW(hint, ret, D)   if(MODREG) {             \
+                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
+                    MV(ret, ed);                        \
+                    wback = 0;                          \
+                } else {                                \
+                    SMREAD();                           \
+                    addr = geted(dyn, addr, ninst, nextop, &wback, (hint==x2)?x1:x2, (hint==x1)?x1:x3, &fixedaddress, rex, NULL, 0, D); \
+                    ed = ret;                           \
+                    LDxw(ed, wback, fixedaddress);      \
+                }
 // GETGW extract x64 register in gd, that is i
-#define GETGW(i) gd = xRAX+((nextop&0x38)>>3)+(rex.r<<3); SLLI(i, gd, 48); SRLI(i, i, 48); gd = i;
+#define GETGW(i) gd = xRAX+((nextop&0x38)>>3)+(rex.r<<3); ZEXTH(i, gd); gd = i;
 //GETEWW will use i for ed, and can use w for wback.
 #define GETEWW(w, i, D) if(MODREG) {        \
                     wback = xRAX+(nextop&7)+(rex.b<<3);\
-                    SLLI(i, wback, 48);     \
-                    SRLI(i, i, 48);         \
+                    ZEXTH(i, wback);        \
                     ed = i;                 \
                     wb1 = 0;                \
                 } else {                    \
@@ -130,8 +159,7 @@
 //GETSEW will use i for ed, and can use r3 for wback. This is the Signed version
 #define GETSEW(i, D) if(MODREG) {           \
                     wback = xRAX+(nextop&7)+(rex.b<<3);\
-                    SLLI(i, wback, 48);     \
-                    SRAI(i, i, 48);         \
+                    if(rv64_zbb) SEXTH(i, wback); else {SLLI(i, wback, 48); SRAI(i, i, 48);}\
                     ed = i;                 \
                     wb1 = 0;                \
                 } else {                    \
@@ -159,6 +187,7 @@
                     LDxw(x1, S, fixedaddress);          \
                     ed = x1;                            \
                 }
+#define WBACKO(O)   if(wback) {ADD(O, wback, O); SDxw(ed, O, 0); SMWRITE2();}
 
 // FAKEED like GETED, but doesn't get anything
 #define FAKEED  if(!MODREG) {   \
@@ -191,6 +220,28 @@
                     wb1 = 1;                    \
                     ed = i;                     \
                 }
+//GETEBO will use i for ed, i is also Offset, and can use r3 for wback.
+#define GETEBO(i, D) if(MODREG) {               \
+                    if(rex.rex) {               \
+                        wback = xRAX+(nextop&7)+(rex.b<<3);     \
+                        wb2 = 0;                \
+                    } else {                    \
+                        wback = (nextop&7);     \
+                        wb2 = (wback>>2)*8;     \
+                        wback = xRAX+(wback&3); \
+                    }                           \
+                    if (wb2) {MV(i, wback); SRLI(i, i, wb2); ANDI(i, i, 0xff);} else {ANDI(i, wback, 0xff);}   \
+                    wb1 = 0;                    \
+                    ed = i;                     \
+                } else {                        \
+                    SMREAD();                   \
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \
+                    ADD(x3, wback, i);          \
+                    if(wback!=x3) wback = x3;   \
+                    LBU(i, wback, fixedaddress);\
+                    wb1 = 1;                    \
+                    ed = i;                     \
+                }
 //GETSEB sign extend EB, will use i for ed, and can use r3 for wback.
 #define GETSEB(i, D) if(MODREG) {                \
                     if(rex.rex) {               \
@@ -213,6 +264,26 @@
                     wb1 = 1;                    \
                     ed = i;                     \
                 }
+// GETEB32 will use i for ed, and can use r3 for wback.
+#define GETEB32(i, D) if(MODREG) {                \
+                    if(rex.rex) {               \
+                        wback = xRAX+(nextop&7)+(rex.b<<3);     \
+                        wb2 = 0;                \
+                    } else {                    \
+                        wback = (nextop&7);     \
+                        wb2 = (wback>>2)*8;     \
+                        wback = xRAX+(wback&3); \
+                    }                           \
+                    if (wb2) {MV(i, wback); SRLI(i, i, wb2); ANDI(i, i, 0xff);} else {ANDI(i, wback, 0xff);}   \
+                    wb1 = 0;                    \
+                    ed = i;                     \
+                } else {                        \
+                    SMREAD();                   \
+                    addr = geted32(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \
+                    LBU(i, wback, fixedaddress);\
+                    wb1 = 1;                    \
+                    ed = i;                     \
+                }
 
 //GETGB will use i for gd
 #define GETGB(i) if(rex.rex) {                                \
@@ -228,7 +299,6 @@
 
 // Write gb (gd) back to original register / memory, using s1 as scratch
 #define GBBACK(s1) if(gb2) {                            \
-                    assert(gb2 == 8);                   \
                     MOV64x(s1, 0xffffffffffff00ffLL);   \
                     AND(gb1, gb1, s1);                  \
                     SLLI(s1, gd, 8);                    \
@@ -243,7 +313,6 @@
                     SB(ed, wback, fixedaddress);        \
                     SMWRITE();                          \
                 } else if(wb2) {                        \
-                    assert(wb2 == 8);                   \
                     MOV64x(s1, 0xffffffffffff00ffLL);   \
                     AND(wback, wback, s1);              \
                     if (c) {ANDI(ed, ed, 0xff);}        \
@@ -309,31 +378,49 @@
     }
 
 // Will get pointer to GX in general register a, will purge SS or SD if loaded. can use gback as load address
-#define GETGX(a)                        \
-    gd = ((nextop&0x38)>>3)+(rex.r<<3); \
-    sse_forget_reg(dyn, ninst, gd);     \
-    gback = a;                          \
-    ADDI(a, xEmu, offsetof(x64emu_t, xmm[gd]))
+#define GETGX()                             \
+    gd = ((nextop&0x38)>>3)+(rex.r<<3);     \
+    sse_forget_reg(dyn, ninst, gd);         \
+    gback = xEmu;                           \
+    gdoffset = offsetof(x64emu_t, xmm[gd])
 
 // Get Ex address in general register a, will purge SS or SD if it's reg and is loaded. May use x3. Use wback as load address!
 #define GETEX(a, D)                                                                                     \
     if(MODREG) {                                                                                        \
         ed = (nextop&7)+(rex.b<<3);                                                                     \
         sse_forget_reg(dyn, ninst, ed);                                                                 \
-        fixedaddress = 0;                                                                               \
-        ADDI(a, xEmu, offsetof(x64emu_t, xmm[ed]));                                                     \
-        wback = a;                                                                                      \
+        fixedaddress = offsetof(x64emu_t, xmm[ed]);                                                     \
+        wback = xEmu;                                                                                   \
     } else {                                                                                            \
         SMREAD();                                                                                       \
         ed=16;                                                                                          \
         addr = geted(dyn, addr, ninst, nextop, &wback, a, x3, &fixedaddress, rex, NULL, 1, D);          \
     }
 
+#define GETGM()                             \
+    gd = ((nextop&0x38)>>3);                \
+    mmx_forget_reg(dyn, ninst, gd);         \
+    gback = xEmu;                           \
+    gdoffset = offsetof(x64emu_t, mmx[gd])
+
+// Get EM, might use x3
+#define GETEM(a, D)                                                                             \
+    if(MODREG) {                                                                                \
+        ed = (nextop&7);                                                                        \
+        mmx_forget_reg(dyn, ninst, ed);                                                         \
+        fixedaddress = offsetof(x64emu_t, mmx[ed]);                                             \
+        wback = xEmu;                                                                           \
+    } else {                                                                                    \
+        SMREAD();                                                                               \
+        ed=8;                                                                                   \
+        addr = geted(dyn, addr, ninst, nextop, &wback, a, x3, &fixedaddress, rex, NULL, 1, D);  \
+    }
+
 #define SSE_LOOP_D_ITEM(GX1, EX1, F, i) \
-    LWU(GX1, gback, i*4);               \
+    LWU(GX1, gback, gdoffset+i*4);      \
     LWU(EX1, wback, fixedaddress+i*4);  \
     F;                                  \
-    SW(GX1, gback, i*4);
+    SW(GX1, gback, gdoffset+i*4);
 
 // Loop for SSE opcode that use 32bits value and write to GX.
 #define SSE_LOOP_D(GX1, EX1, F)     \
@@ -343,10 +430,10 @@
     SSE_LOOP_D_ITEM(GX1, EX1, F, 3)
 
 #define SSE_LOOP_DS_ITEM(GX1, EX1, F, i) \
-    LW(GX1, gback, i*4);                 \
+    LW(GX1, gback, gdoffset+i*4);        \
     LW(EX1, wback, fixedaddress+i*4);    \
     F;                                   \
-    SW(GX1, gback, i*4);
+    SW(GX1, gback, gdoffset+i*4);
 
 // Loop for SSE opcode that use 32bits value and write to GX.
 #define SSE_LOOP_DS(GX1, EX1, F)     \
@@ -355,20 +442,28 @@
     SSE_LOOP_DS_ITEM(GX1, EX1, F, 2) \
     SSE_LOOP_DS_ITEM(GX1, EX1, F, 3)
 
+#define MMX_LOOP_W(GX1, EX1, F)            \
+    for (int i=0; i<4; ++i) {              \
+        LHU(GX1, gback, gdoffset+i*2);     \
+        LHU(EX1, wback, fixedaddress+i*2); \
+        F;                                 \
+        SH(GX1, gback, gdoffset+i*2);      \
+    }
+
 #define SSE_LOOP_W(GX1, EX1, F)            \
     for (int i=0; i<8; ++i) {              \
-        LHU(GX1, gback, i*2);              \
+        LHU(GX1, gback, gdoffset+i*2);     \
         LHU(EX1, wback, fixedaddress+i*2); \
         F;                                 \
-        SH(GX1, gback, i*2);               \
+        SH(GX1, gback, gdoffset+i*2);      \
     }
 
 #define SSE_LOOP_WS(GX1, EX1, F)          \
     for (int i=0; i<8; ++i) {             \
-        LH(GX1, gback, i*2);              \
+        LH(GX1, gback, gdoffset+i*2);     \
         LH(EX1, wback, fixedaddress+i*2); \
         F;                                \
-        SH(GX1, gback, i*2);              \
+        SH(GX1, gback, gdoffset+i*2);     \
     }
 
 #define SSE_LOOP_D_S_ITEM(EX1, F, i)    \
@@ -384,10 +479,10 @@
     SSE_LOOP_D_S_ITEM(EX1, F, 3)
 
 #define SSE_LOOP_Q_ITEM(GX1, EX1, F, i) \
-    LD(GX1, gback, i*8);                \
+    LD(GX1, gback, gdoffset+i*8);       \
     LD(EX1, wback, fixedaddress+i*8);   \
     F;                                  \
-    SD(GX1, gback, i*8);
+    SD(GX1, gback, gdoffset+i*8);
 
 // Loop for SSE opcode that use 64bits value and write to GX.
 #define SSE_LOOP_Q(GX1, EX1, F)     \
@@ -396,10 +491,10 @@
 
 
 #define SSE_LOOP_FQ_ITEM(GX1, EX1, F, i)            \
-    FLD(v0, gback, i*8);                            \
+    FLD(v0, gback, gdoffset+i*8);                   \
     FLD(v1, wback, fixedaddress+i*8);               \
     F;                                              \
-    FSD(v0, gback, i*8);
+    FSD(v0, gback, gdoffset+i*8);
 
 #define SSE_LOOP_FQ(GX1, EX1, F)     \
     v0 = fpu_get_scratch(dyn);       \
@@ -410,7 +505,7 @@
 
 #define SSE_LOOP_MV_Q_ITEM(s, i)      \
     LD(s, wback, fixedaddress+i*8);   \
-    SD(s, gback, i*8);
+    SD(s, gback, gdoffset+i*8);
 
 // Loop for SSE opcode that moves 64bits value from wback to gback, use s as scratch.
 #define SSE_LOOP_MV_Q(s)     \
@@ -418,7 +513,7 @@
     SSE_LOOP_MV_Q_ITEM(s, 1)
 
 #define SSE_LOOP_MV_Q_ITEM2(s, i)     \
-    LD(s, gback, i*8);                \
+    LD(s, gback, gdoffset+i*8);       \
     SD(s, wback, fixedaddress+i*8);
 
 // Loop for SSE opcode that moves 64bits value from gback to wback, use s as scratch.
@@ -436,17 +531,19 @@
 // R0 will not be pushed/popd if ret is -2. Flags are not save/restored
 #define CALL_S(F, ret) call_c(dyn, ninst, F, x6, ret, 0, 0)
 
-#define MARK    dyn->insts[ninst].mark = dyn->native_size
-#define GETMARK dyn->insts[ninst].mark
-#define MARK2   dyn->insts[ninst].mark2 = dyn->native_size
-#define GETMARK2 dyn->insts[ninst].mark2
-#define MARK3   dyn->insts[ninst].mark3 = dyn->native_size
-#define GETMARK3 dyn->insts[ninst].mark3
-#define MARKF   dyn->insts[ninst].markf = dyn->native_size
-#define GETMARKF dyn->insts[ninst].markf
-#define MARKSEG dyn->insts[ninst].markseg = dyn->native_size
-#define GETMARKSEG dyn->insts[ninst].markseg
-#define MARKLOCK dyn->insts[ninst].marklock = dyn->native_size
+#define MARK        dyn->insts[ninst].mark = dyn->native_size
+#define GETMARK     dyn->insts[ninst].mark
+#define MARK2       dyn->insts[ninst].mark2 = dyn->native_size
+#define GETMARK2    dyn->insts[ninst].mark2
+#define MARK3       dyn->insts[ninst].mark3 = dyn->native_size
+#define GETMARK3    dyn->insts[ninst].mark3
+#define MARKF       dyn->insts[ninst].markf = dyn->native_size
+#define GETMARKF    dyn->insts[ninst].markf
+#define MARKF2      dyn->insts[ninst].markf2 = dyn->native_size
+#define GETMARKF2   dyn->insts[ninst].markf2
+#define MARKSEG     dyn->insts[ninst].markseg = dyn->native_size
+#define GETMARKSEG  dyn->insts[ninst].markseg
+#define MARKLOCK    dyn->insts[ninst].marklock = dyn->native_size
 #define GETMARKLOCK dyn->insts[ninst].marklock
 
 #define Bxx_gen(OP, M, reg1, reg2)      \
@@ -526,7 +623,7 @@
 #define STORE_REG(A)    SD(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
 #define LOAD_REG(A)     LD(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
 
-// Need to also store current value of some register, as they may be used by functions like setjump
+// Need to also store current value of some register, as they may be used by functions like setjmp
 #define STORE_XEMU_CALL()   \
     STORE_REG(RBX);         \
     STORE_REG(RDX);         \
@@ -606,11 +703,11 @@
 
 // Adjust the xFlags bit 5 -> bit 11, src and dst can be the same (and can be xFlags, but not s1)
 #define FLAGS_ADJUST_TO11(dst, src, s1) \
-    MOV64x(s1, ~(1<<11));               \
-    AND(dst, src, s1);                  \
-    ANDI(s1, dst, 1<<5);                \
-    SLLI(s1, s1, 11-5);                 \
-    ANDI(dst, dst, ~(1<<5));            \
+    LUI(s1, 0xFFFFF);                   \
+    ADDIW(s1, s1, 0x7DF);               \
+    AND(s1, src, s1);                   \
+    ANDI(dst, src, 1<<5);               \
+    SLLI(dst, dst, 11-5);               \
     OR(dst, dst, s1)
 
 #ifndef MAYSETFLAGS
@@ -721,8 +818,8 @@
 
 #define MODREG  ((nextop&0xC0)==0xC0)
 
-void rv64_epilog();
-void rv64_epilog_fast();
+void rv64_epilog(void);
+void rv64_epilog_fast(void);
 void* rv64_next(x64emu_t* emu, uintptr_t addr);
 
 #ifndef STEPNAME
@@ -863,6 +960,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define sse_setround    STEPNAME(sse_setround)
 #define mmx_get_reg     STEPNAME(mmx_get_reg)
 #define mmx_get_reg_empty STEPNAME(mmx_get_reg_empty)
+#define mmx_forget_reg   STEPNAME(mmx_forget_reg)
 #define sse_get_reg     STEPNAME(sse_get_reg)
 #define sse_get_reg_empty STEPNAME(sse_get_reg_empty)
 #define sse_forget_reg   STEPNAME(sse_forget_reg)
@@ -888,7 +986,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta);
 
 /* setup r2 to address pointed by */
-//uintptr_t geted32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int absmax, uint32_t mask, rex_t rex, int* l, int s, int delta);
+uintptr_t geted32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta);
 
 /* setup r2 to address pointed by */
 //uintptr_t geted16(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int absmax, uint32_t mask, int s);
@@ -898,8 +996,8 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
 void jump_to_epilog(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst);
 void jump_to_epilog_fast(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst);
 void jump_to_next(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst);
-void ret_to_epilog(dynarec_rv64_t* dyn, int ninst);
-void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, int n);
+void ret_to_epilog(dynarec_rv64_t* dyn, int ninst, rex_t rex);
+void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, rex_t rex, int n);
 void iret_to_epilog(dynarec_rv64_t* dyn, int ninst, int is64bits);
 void call_c(dynarec_rv64_t* dyn, int ninst, void* fnc, int reg, int ret, int saveflags, int save_reg);
 void call_n(dynarec_rv64_t* dyn, int ninst, void* fnc, int w);
@@ -950,10 +1048,10 @@ void emit_inc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
 void emit_dec32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5);
 void emit_dec16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
 void emit_dec8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
-void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5);
+void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5, int s6);
 //void emit_adc32c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
-//void emit_adc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
-//void emit_adc8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5);
+void emit_adc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
+void emit_adc8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5, int s6);
 void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
 //void emit_adc16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
 void emit_sbb32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5);
@@ -1047,12 +1145,20 @@ int extcache_st_coherency(dynarec_rv64_t* dyn, int ninst, int a, int b);
 #define X87_ST(A)   extcache_get_st(dyn, ninst, A)
 #endif
 
+//MMX helpers
+// get float register for a MMX reg, create the entry if needed
+int mmx_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a);
+// get float register for a MMX reg, but don't try to synch it if it needed to be created
+int mmx_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a);
+// forget float register for a MMX reg, create the entry if needed
+void mmx_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
+
 //SSE/SSE2 helpers
-// get neon register for a SSE reg, create the entry if needed
+// get float register for a SSE reg, create the entry if needed
 int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
-// get neon register for a SSE reg, but don't try to synch it if it needed to be created
+// get float register for a SSE reg, but don't try to synch it if it needed to be created
 int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
-// forget neon register for a SSE reg, create the entry if needed
+// forget float register for a SSE reg, create the entry if needed
 void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
 // purge the XMM0..XMM7 cache (before function call)
 void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1);
@@ -1085,19 +1191,19 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
 uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int seg, int* ok, int* need_epilog);
 //uintptr_t dynarec64_65(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep,int* ok, int* need_epilog);
 uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
-//uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_D8(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 //uintptr_t dynarec64_DA(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
-//uintptr_t dynarec64_DC(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
-//uintptr_t dynarec64_DD(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_DC(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_DD(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_DE(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
-//uintptr_t dynarec64_6664(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog);
-//uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_6664(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog);
+uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 
@@ -1231,4 +1337,12 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         SW(s2, xEmu, offsetof(x64emu_t, test.test));        \
     }
 
+#define GETREX()                                \
+    rex.rex = 0;                                \
+    if(!rex.is32bits)                           \
+        while(opcode>=0x40 && opcode<=0x4f) {   \
+            rex.rex = opcode;                   \
+            opcode = F8;                        \
+        }
+
 #endif //__DYNAREC_RV64_HELPER_H__
diff --git a/src/dynarec/rv64/dynarec_rv64_pass0.h b/src/dynarec/rv64/dynarec_rv64_pass0.h
index b07162eb..fbba8f22 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass0.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass0.h
@@ -22,13 +22,14 @@
 #define NEW_INST \
         ++dyn->size;                            \
         if(dyn->size+3>=dyn->cap) {             \
-                dyn->insts = (instruction_native_t*)customRealloc(dyn->insts, sizeof(instruction_native_t)*dyn->cap*2);\
+                dyn->insts = (instruction_native_t*)dynaRealloc(dyn->insts, sizeof(instruction_native_t)*dyn->cap*2);\
                 memset(&dyn->insts[dyn->cap], 0, sizeof(instruction_native_t)*dyn->cap);   \
                 dyn->cap *= 2;                  \
         }                                       \
         dyn->insts[ninst].x64.addr = ip;        \
         dyn->e.combined1 = dyn->e.combined2 = 0;\
         dyn->e.swapped = 0; dyn->e.barrier = 0; \
+        for(int i=0; i<16; ++i) dyn->e.olds[i].v = 0;\
         dyn->insts[ninst].f_entry = dyn->f;     \
         if(ninst) {dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr;}
 
@@ -40,9 +41,10 @@
 #define DEFAULT                         \
         --dyn->size;                    \
         *ok = -1;                       \
-        if(box64_dynarec_log>=LOG_INFO) {\
-        dynarec_log(LOG_NONE, "%p: Dynarec stopped because of Opcode %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X", \
-        (void*)ip, PKip(0),             \
+        if(box64_dynarec_log>=LOG_INFO || box64_dynarec_dump || box64_dynarec_missing) {\
+        dynarec_log(LOG_NONE, "%p: Dynarec stopped because of %sOpcode %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X", \
+        (void*)ip, rex.is32bits?"32bits ":"",\
+        PKip(0),                        \
         PKip(1), PKip(2), PKip(3),      \
         PKip(4), PKip(5), PKip(6),      \
         PKip(7), PKip(8), PKip(9),      \
diff --git a/src/dynarec/rv64/dynarec_rv64_pass1.h b/src/dynarec/rv64/dynarec_rv64_pass1.h
index c818c26c..34d0a468 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass1.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass1.h
@@ -5,6 +5,7 @@
 #define NEW_INST                                \
         dyn->insts[ninst].f_entry = dyn->f;     \
         dyn->e.combined1 = dyn->e.combined2 = 0;\
+        for(int i=0; i<16; ++i) dyn->e.olds[i].v = 0;\
         dyn->e.swapped = 0; dyn->e.barrier = 0
 
 #define INST_EPILOG                             \
diff --git a/src/dynarec/rv64/dynarec_rv64_pass2.h b/src/dynarec/rv64/dynarec_rv64_pass2.h
index d71f9180..1c6e4734 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass2.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass2.h
@@ -2,7 +2,7 @@
 #define FINI                                                                                            \
         if(ninst) {                                                                                     \
                 dyn->insts[ninst].address = (dyn->insts[ninst-1].address+dyn->insts[ninst-1].size);     \
-                dyn->insts_size += 1+((dyn->insts[ninst].x64.size>dyn->insts[ninst].size)?dyn->insts[ninst].x64.size:dyn->insts[ninst].size)/15; \
+                dyn->insts_size += 1+((dyn->insts[ninst].x64.size>(dyn->insts[ninst].size/4))?dyn->insts[ninst].x64.size:(dyn->insts[ninst].size/4))/15; \
         }
 
 #define MESSAGE(A, ...)  
@@ -10,7 +10,7 @@
 #define NEW_INST                                                                                        \
         if(ninst) {                                                                                     \
                 dyn->insts[ninst].address = (dyn->insts[ninst-1].address+dyn->insts[ninst-1].size);     \
-                dyn->insts_size += 1+((dyn->insts[ninst-1].x64.size>dyn->insts[ninst-1].size)?dyn->insts[ninst-1].x64.size:dyn->insts[ninst-1].size)/15; \
+                dyn->insts_size += 1+((dyn->insts[ninst-1].x64.size>(dyn->insts[ninst-1].size/4))?dyn->insts[ninst-1].x64.size:(dyn->insts[ninst-1].size/4))/15; \
         }
 #define INST_EPILOG dyn->insts[ninst].epilog = dyn->native_size; 
 #define INST_NAME(name) 
diff --git a/src/dynarec/rv64/dynarec_rv64_pass3.h b/src/dynarec/rv64/dynarec_rv64_pass3.h
index dafef0c5..459c4e13 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass3.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass3.h
@@ -1,4 +1,4 @@
-#define INIT    
+#define INIT
 #define FINI        \
     if(ninst)       \
         addInst(dyn->instsize, &dyn->insts_size, dyn->insts[ninst].x64.size, dyn->insts[ninst].size/4); \
@@ -16,8 +16,8 @@
     if(box64_dynarec_dump) print_newinst(dyn, ninst);   \
     if(ninst)                                           \
         addInst(dyn->instsize, &dyn->insts_size, dyn->insts[ninst-1].x64.size, dyn->insts[ninst-1].size/4);
-#define INST_EPILOG     
-#define INST_NAME(name) inst_name_pass3(dyn, ninst, name)
+#define INST_EPILOG
+#define INST_NAME(name) inst_name_pass3(dyn, ninst, name, rex)
 
 #define TABLE64(A, V)   {int val64offset = Table64(dyn, (V), 3); MESSAGE(LOG_DUMP, "  Table64: 0x%lx\n", (V)); AUIPC(A, SPLIT20(val64offset)); LD(A, A, SPLIT12(val64offset));}
 #define FTABLE64(A, V)  {mmx87_regs_t v = {.d = V}; int val64offset = Table64(dyn, v.q, 3); MESSAGE(LOG_DUMP, "  FTable64: %g\n", v.d); AUIPC(x1, SPLIT20(val64offset)); FLD(A, x1, SPLIT12(val64offset));}
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index 01657427..b9cbb2af 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -31,6 +31,15 @@ typedef union sse_cache_s {
         uint8_t     single:1;
     };
 } sse_cache_t;
+typedef union sse_old_s {
+    int8_t      v;
+    struct {
+        uint8_t     changed:1;
+        uint8_t     purged:1;
+        uint8_t     reg:4;
+        uint8_t     single:1;
+    };
+} sse_old_t;
 typedef struct extcache_s {
     // ext cache
     ext_cache_t         extcache[24];
@@ -43,6 +52,7 @@ typedef struct extcache_s {
     uint8_t             swapped;        // the combined reg were swapped
     uint8_t             barrier;        // is there a barrier at instruction epilog?
     uint32_t            news;           // bitmask, wich neoncache are new for this opcode
+    sse_old_t           olds[16];       // SSE regs has changed or has been removed
     // fpu cache
     int8_t              x87cache[8];    // cache status for the 8 x87 register behind the fpu stack
     int8_t              x87reg[8];      // reg used for x87cache entry
@@ -70,7 +80,7 @@ typedef struct instruction_rv64_s {
     int                 pred_sz;    // size of predecessor list
     int                 *pred;      // predecessor array
     uintptr_t           mark, mark2, mark3;
-    uintptr_t           markf;
+    uintptr_t           markf, markf2;
     uintptr_t           markseg;
     uintptr_t           marklock;
     int                 pass2choice;// value for choices that are fixed on pass2 for pass3
diff --git a/src/dynarec/rv64/rv64_emitter.h b/src/dynarec/rv64/rv64_emitter.h
index 29336895..e9fa2f6d 100644
--- a/src/dynarec/rv64/rv64_emitter.h
+++ b/src/dynarec/rv64/rv64_emitter.h
@@ -74,6 +74,7 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define x4      14
 #define x5      15
 #define x6      6
+#define x9      9
 // used to clear the upper 32bits
 #define xMASK   5
 // 32bits version of scratch
@@ -112,6 +113,7 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define MOV64x(A, B)    rv64_move64(dyn, ninst, A, B)
 #define MOV32w(A, B)    rv64_move32(dyn, ninst, A, B, 1)
 #define MOV64xw(A, B)   if(rex.w) {MOV64x(A, B);} else {MOV32w(A, B);}
+#define MOV64z(A, B)    if(rex.is32bits) {MOV32w(A, B);} else {MOV64x(A, B);}
 
 // ZERO the upper part
 #define ZEROUP(r)       AND(r, r, xMASK)
@@ -174,12 +176,16 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define ADDW(rd, rs1, rs2)          EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, 0b0111011))
 // rd = rs1 + rs2
 #define ADDxw(rd, rs1, rs2)         EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, rex.w?0b0110011:0b0111011))
+// rd = rs1 + rs2
+#define ADDz(rd, rs1, rs2)          EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, rex.is32bits?0b0111011:0b0110011))
 // rd = rs1 - rs2
 #define SUB(rd, rs1, rs2)           EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, 0b0110011))
 // rd = rs1 - rs2
 #define SUBW(rd, rs1, rs2)          EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, 0b0111011))
 // rd = rs1 - rs2
 #define SUBxw(rd, rs1, rs2)         EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, rex.w?0b0110011:0b0111011))
+// rd = rs1 - rs2
+#define SUBz(rd, rs1, rs2)          EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, rex.is32bits?0b0111011:0b0110011))
 // rd = rs1<<rs2
 #define SLL(rd, rs1, rs2)           EMIT(R_type(0b0000000, rs2, rs1, 0b001, rd, 0b0110011))
 // rd = (rs1<rs2)?1:0
@@ -202,7 +208,9 @@ f28–31  ft8–11  FP temporaries                  Caller
 // rd = rs1 (pseudo instruction)
 #define MV(rd, rs1)                 ADDI(rd, rs1, 0)
 // rd = rs1 (pseudo instruction)
-#define MVxw(rd, rs1)               if(rex.w) {MV(rd, rs1); } else {AND(rd, rs1, xMASK);}
+#define MVxw(rd, rs1)               if(rex.w) {MV(rd, rs1);} else {AND(rd, rs1, xMASK);}
+// rd = rs1 (pseudo instruction)
+#define MVz(rd, rs1)               if(rex.is32bits) {AND(rd, rs1, xMASK);} else {MV(rd, rs1);}
 // rd = !rs1
 #define NOT(rd, rs1)                XORI(rd, rs1, -1)
 // rd = -rs1
@@ -254,7 +262,12 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define SW(rs2, rs1, imm12)         EMIT(S_type(imm12, rs2, rs1, 0b010, 0b0100011))
 
 #define PUSH1(reg)                  do {SD(reg, xRSP, -8); SUBI(xRSP, xRSP, 8);} while(0)
-#define POP1(reg)                   do {LD(reg, xRSP, 0); ADDI(xRSP, xRSP, 8);}while(0)
+#define POP1(reg)                   do {LD(reg, xRSP, 0); if (reg!=xRSP) ADDI(xRSP, xRSP, 8);} while(0)
+#define PUSH1_32(reg)               do {SW(reg, xRSP, -4); SUBIW(xRSP, xRSP, 4);} while(0)
+#define POP1_32(reg)                do {LWU(reg, xRSP, 0); if (reg!=xRSP) ADDIW(xRSP, xRSP, 4);} while(0)
+
+#define POP1z(reg)                  if(rex.is32bits) {POP1_32(reg);} else {POP1(reg);}
+#define PUSH1z(reg)                 if(rex.is32bits) {PUSH1_32(reg);} else {PUSH1(reg);}
 
 #define FENCE_gen(pred, succ)       (((pred)<<24) | ((succ)<<20) | 0b0001111)
 #define FENCE()                     EMIT(FENCE_gen(3, 3))
@@ -271,10 +284,14 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define LD(rd, rs1, imm12)          EMIT(I_type(imm12, rs1, 0b011, rd, 0b0000011))
 // rd = [rs1 + imm12]
 #define LDxw(rd, rs1, imm12)        EMIT(I_type(imm12, rs1, 0b011<<(1-rex.w), rd, 0b0000011))
+// rd = [rs1 + imm12]
+#define LDz(rd, rs1, imm12)         EMIT(I_type(imm12, rs1, 0b011<<rex.is32bits, rd, 0b0000011))
 // [rs1 + imm12] = rs2
 #define SD(rs2, rs1, imm12)         EMIT(S_type(imm12, rs2, rs1, 0b011, 0b0100011))
 // [rs1 + imm12] = rs2
 #define SDxw(rs2, rs1, imm12)       EMIT(S_type(imm12, rs2, rs1, 0b010+rex.w, 0b0100011))
+// [rs1 + imm12] = rs2
+#define SDz(rs2, rs1, imm12)        EMIT(S_type(imm12, rs2, rs1, 0b010+(1-rex.is32bits), 0b0100011))
 
 // Shift Left Immediate
 #define SLLI(rd, rs1, imm6)         EMIT(I_type(imm6, rs1, 0b001, rd, 0b0010011))
@@ -285,8 +302,12 @@ f28–31  ft8–11  FP temporaries                  Caller
 
 // rd = rs1 + imm12
 #define ADDIW(rd, rs1, imm12)       EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, 0b0011011))
+// rd = rs1 - imm12 (ADDIW with the negated immediate; the argument is parenthesized so an expression is negated as a whole)
+#define SUBIW(rd, rs1, imm12)       EMIT(I_type((-(imm12))&0b111111111111, rs1, 0b000, rd, 0b0011011))
 // rd = rs1 + imm12
 #define ADDIxw(rd, rs1, imm12)      EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, rex.w?0b0010011:0b0011011))
+// rd = rs1 + imm12
+#define ADDIz(rd, rs1, imm12)       EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, rex.is32bits?0b0011011:0b0010011))
 
 #define SEXT_W(rd, rs1)             ADDIW(rd, rs1, 0)
 
@@ -359,6 +380,8 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define LR_W(rd, rs1, aq, rl)       EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010, rd, 0b0101111))
 #define SC_W(rd, rs2, rs1, aq, rl)  EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b010, rd, 0b0101111))
 
+#define AMOSWAP_W(rd, rs2, rs1, aq, rl)  EMIT(R_type(AQ_RL(0b00001, aq, rl), rs2, rs1, 0b010, rd, 0b0101111))
+
 // RV64A
 #define LR_D(rd, rs1, aq, rl)       EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b011, rd, 0b0101111))
 #define SC_D(rd, rs2, rs1, aq, rl)  EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b011, rd, 0b0101111))
@@ -366,6 +389,8 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define LRxw(rd, rs1, aq, rl)       EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010|rex.w, rd, 0b0101111))
 #define SCxw(rd, rs2, rs1, aq, rl)  EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b010|rex.w, rd, 0b0101111))
 
+#define AMOSWAP_D(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00001, aq, rl), rs2, rs1, 0b011, rd, 0b0101111))
+
 // RV32F
 // Read round mode
 #define FRRM(rd)                    CSRRS(rd, xZR, 0x002)
@@ -509,4 +534,120 @@ f28–31  ft8–11  FP temporaries                  Caller
 // Convert from Double to unsigned integer
 #define FCVTLUDxw(rd, frs1, rm)     EMIT(R_type(0b1100001, 0b00001+(rex.w?0b10:0b00), frs1, rm, rd, 0b1010011))
 
+//Zba
+// Add unsigned word (Wz(rs1) + X(rs2))
+#define ADDUW(rd, rs1, rs2)         EMIT(R_type(0b0000100, rs2, rs1, 0b000, rd, 0b0111011))
+// Zero-extend Word
+#define ZEXTW(rd, rs1)              ADDUW(rd, rs1, xZR)
+// Shift left by 1 and add (rd = X(rs2) + X(rs1)<<1)
+#define SH1ADD(rd, rs1, rs2)        EMIT(R_type(0b0010000, rs2, rs1, 0b010, rd, 0b0110011))
+// Shift unsigned word left by 1 and add (rd = X(rs2) + Wz(rs1)<<1)
+#define SH1ADDUW(rd, rs1, rs2)      EMIT(R_type(0b0010000, rs2, rs1, 0b010, rd, 0b0111011))
+// Shift left by 2 and add (rd = X(rs2) + X(rs1)<<2)
+#define SH2ADD(rd, rs1, rs2)        EMIT(R_type(0b0010000, rs2, rs1, 0b100, rd, 0b0110011))
+// Shift unsigned word left by 2 and add (rd = X(rs2) + Wz(rs1)<<2)
+#define SH2ADDUW(rd, rs1, rs2)      EMIT(R_type(0b0010000, rs2, rs1, 0b100, rd, 0b0111011))
+// Shift left by 3 and add (rd = X(rs2) + X(rs1)<<3)
+#define SH3ADD(rd, rs1, rs2)        EMIT(R_type(0b0010000, rs2, rs1, 0b110, rd, 0b0110011))
+// Shift unsigned word left by 3 and add (rd = X(rs2) + Wz(rs1)<<3)
+#define SH3ADDUW(rd, rs1, rs2)      EMIT(R_type(0b0010000, rs2, rs1, 0b110, rd, 0b0111011))
+// Shift left unsigned word (immediate)
+#define SLLIUW(rd, rs1, imm)        EMIT(R_type(0b0000100, imm, rs1, 0b001, rd, 0b0011011))
+// Shift left by 1,2 or 3 and add (rd = X(rs2) + X(rs1)<<x)
+#define SHxADD(rd, rs1, x, rs2)        EMIT(R_type(0b0010000, rs2, rs1, (x)<<1, rd, 0b0110011))
+// Shift unsigned word left by 1,2 or 3 and add (rd = X(rs2) + Wz(rs1)<<x)
+#define SHxADDUW(rd, rs1, x, rs2)      EMIT(R_type(0b0010000, rs2, rs1, (x)<<1, rd, 0b0111011))
+
+//Zbb
+// AND with reverted operand (rs1 & ~rs2)
+#define ANDN(rd, rs1, rs2)      EMIT(R_type(0b0100000, rs2, rs1, 0b111, rd, 0b0110011))
+// OR with reverted operand (rs1 | ~rs2)
+#define ORN(rd, rs1, rs2)       EMIT(R_type(0b0100000, rs2, rs1, 0b110, rd, 0b0110011))
+// Exclusive NOR (~(rs1 ^ rs2))
+#define XNOR(rd, rs1, rs2)      EMIT(R_type(0b0100000, rs2, rs1, 0b100, rd, 0b0110011))
+// Count leading zero bits
+#define CLZ(rd, rs)             EMIT(R_type(0b0110000, 0b00000, rs, 0b001, rd, 0b0010011))
+// Count leading zero bits in word
+#define CLZW(rd, rs)            EMIT(R_type(0b0110000, 0b00000, rs, 0b001, rd, 0b0011011))
+// Count leading zero bits
+#define CLZxw(rd, rs)           EMIT(R_type(0b0110000, 0b00000, rs, 0b001, rd, rex.w?0b0010011:0b0011011))
+// Count trailing zero bits
+#define CTZ(rd, rs)             EMIT(R_type(0b0110000, 0b00001, rs, 0b001, rd, 0b0010011))
+// Count trailing zero bits in word
+#define CTZW(rd, rs)            EMIT(R_type(0b0110000, 0b00001, rs, 0b001, rd, 0b0011011))
+// Count trailing zero bits
+#define CTZxw(rd, rs)           EMIT(R_type(0b0110000, 0b00001, rs, 0b001, rd, rex.w?0b0010011:0b0011011))
+// Count set bits
+#define CPOP(rd, rs)            EMIT(R_type(0b0110000, 0b00010, rs, 0b001, rd, 0b0010011))
+// Count set bits in word
+#define CPOPW(rd, rs)           EMIT(R_type(0b0110000, 0b00010, rs, 0b001, rd, 0b0011011))
+// Count set bits
+#define CPOPxw(rd, rs)          EMIT(R_type(0b0110000, 0b00010, rs, 0b001, rd, rex.w?0b0010011:0b0011011))
+// Maximum
+#define MAX(rd, rs1, rs2)       EMIT(R_type(0b0000101, rs2, rs1, 0b110, rd, 0b0110011))
+// Unsigned maximum
+#define MAXU(rd, rs1, rs2)      EMIT(R_type(0b0000101, rs2, rs1, 0b111, rd, 0b0110011))
+// Minimum
+#define MIN(rd, rs1, rs2)       EMIT(R_type(0b0000101, rs2, rs1, 0b100, rd, 0b0110011))
+// Unsigned minimum
+#define MINU(rd, rs1, rs2)      EMIT(R_type(0b0000101, rs2, rs1, 0b101, rd, 0b0110011))
+// Sign-extend byte
+#define SEXTB(rd, rs)           EMIT(R_type(0b0110000, 0b00100, rs, 0b001, rd, 0b0010011))
+// Sign-extend half-word
+#define SEXTH(rd, rs)           EMIT(R_type(0b0110000, 0b00101, rs, 0b001, rd, 0b0010011))
+// Zero-extend half-word
+#define ZEXTH_(rd, rs)          EMIT(R_type(0b0000100, 0b00000, rs, 0b100, rd, 0b0111011))
+// Zero-extend half-word
+#define ZEXTH(rd, rs)           if(rv64_zbb) ZEXTH_(rd, rs); else {SLLI(rd, rs, 48); SRLI(rd, rd, 48);}
+// Rotate left (register)
+#define ROL(rd, rs1, rs2)       EMIT(R_type(0b0110000, rs2, rs1, 0b001, rd, 0b0110011))
+// Rotate left word (register)
+#define ROLW(rd, rs1, rs2)      EMIT(R_type(0b0110000, rs2, rs1, 0b001, rd, 0b0111011))
+// Rotate left (register)
+#define ROLxw(rd, rs1, rs2)     EMIT(R_type(0b0110000, rs2, rs1, 0b001, rd, rex.w?0b0110011:0b0111011))
+// Rotate right (register)
+#define ROR(rd, rs1, rs2)       EMIT(R_type(0b0110000, rs2, rs1, 0b101, rd, 0b0110011))
+// Rotate right (immediate)
+#define RORI(rd, rs1, shamt)    EMIT(R_type(0b0110000, shamt, rs1, 0b101, rd, 0b0010011))
+// Rotate right word (immediate)
+#define RORIW(rd, rs1, shamt)   EMIT(R_type(0b0110000, shamt, rs1, 0b101, rd, 0b0011011))
+// Rotate right (immediate)
+#define RORIxw(rd, rs1, shamt)  EMIT(R_type(0b0110000, shamt, rs1, 0b101, rd, rex.w?0b0010011:0b0011011))
+// Rotate right word (register)
+#define RORW(rd, rs1, rs2)      EMIT(R_type(0b0110000, rs2, rs1, 0b101, rd, 0b0111011))
+// Rotate right (register)
+#define RORxw(rd, rs1, rs2)     EMIT(R_type(0b0110000, rs2, rs1, 0b101, rd, rex.w?0b0110011:0b0111011))
+// Bitwise OR Combine, byte granule (for all byte, if byte==0, res.byte=0, else res.byte=0xff)
+#define ORCB(rd, rs)            EMIT(I_type(0b001010000111, rs, 0b101, rd, 0b0010011))
+// Byte-reverse register
+#define REV8(rd, rs)            EMIT(I_type(0b011010111000, rs, 0b101, rd, 0b0010011))
+
+//Zbc
+// Carry-less multiply (low-part)
+#define CLMUL(rd, rs1, rs2)         EMIT(R_type(0b0000101, rs2, rs1, 0b001, rd, 0b0110011))
+// Carry-less multiply (high-part)
+#define CLMULH(rd, rs1, rs2)        EMIT(R_type(0b0000101, rs2, rs1, 0b011, rd, 0b0110011))
+// Carry-less multiply (reversed)
+#define CLMULR(rd, rs1, rs2)        EMIT(R_type(0b0000101, rs2, rs1, 0b010, rd, 0b0110011))
+
+//Zbs
+// encoding of the "imm" on RV64 uses a slightly different mask, but it will work using R_type with the high bit of imm overwriting the low bit of the op func
+// Single-bit Clear (Register)
+#define BCLR(rd, rs1, rs2)          EMIT(R_type(0b0100100, rs2, rs1, 0b001, rd, 0b0110011))
+// Single-bit Clear (Immediate)
+#define BCLI(rd, rs1, imm)          EMIT(R_type(0b0100100, imm, rs1, 0b001, rd, 0b0010011))
+// Single-bit Extract (Register)
+#define BEXT(rd, rs1, rs2)          EMIT(R_type(0b0100100, rs2, rs1, 0b101, rd, 0b0110011))
+// Single-bit Extract (Immediate)
+#define BEXTI(rd, rs1, imm)         EMIT(R_type(0b0100100, imm, rs1, 0b101, rd, 0b0010011))
+// Single-bit Invert (Register)
+#define BINV(rd, rs1, rs2)          EMIT(R_type(0b0110100, rs2, rs1, 0b001, rd, 0b0110011))
+// Single-bit Invert (Immediate)
+#define BINVI(rd, rs1, imm)         EMIT(R_type(0b0110100, imm, rs1, 0b001, rd, 0b0010011))
+// Single-bit Set (Register)
+#define BSET(rd, rs1, rs2)          EMIT(R_type(0b0010100, rs2, rs1, 0b001, rd, 0b0110011))
+// Single-bit Set (Immediate)
+#define BSETI(rd, rs1, imm)         EMIT(R_type(0b0010100, imm, rs1, 0b001, rd, 0b0010011))
+
+
 #endif //__RV64_EMITTER_H__
diff --git a/src/dynarec/rv64/rv64_epilog.S b/src/dynarec/rv64/rv64_epilog.S
index 6a299d9d..17dc117f 100644
--- a/src/dynarec/rv64/rv64_epilog.S
+++ b/src/dynarec/rv64/rv64_epilog.S
@@ -39,26 +39,27 @@ rv64_epilog:
 rv64_epilog_fast:
     ld      ra, (sp)  // save ra
     ld      x8, 8(sp) // save fp
-    ld      x18, 16(sp)
-    ld      x19, 24(sp)
-    ld      x20, 32(sp)
-    ld      x21, 40(sp)
-    ld      x22, 48(sp)
-    ld      x23, 56(sp)
-    ld      x24, 64(sp)
-    ld      x25, 72(sp)
-    ld      x26, 80(sp)
-    ld      x27, 88(sp)
-    fld     f18, (12*8)(sp)
-    fld     f19, (13*8)(sp)
-    fld     f20, (14*8)(sp)
-    fld     f21, (15*8)(sp)
-    fld     f22, (16*8)(sp)
-    fld     f23, (17*8)(sp)
-    fld     f24, (18*8)(sp)
-    fld     f25, (19*8)(sp)
-    fld     f26, (20*8)(sp)
-    fld     f27, (21*8)(sp)
-    addi    sp,  sp, (8 * 22)
+    ld      x18, (2*8)(sp)
+    ld      x19, (3*8)(sp)
+    ld      x20, (4*8)(sp)
+    ld      x21, (5*8)(sp)
+    ld      x22, (6*8)(sp)
+    ld      x23, (7*8)(sp)
+    ld      x24, (8*8)(sp)
+    ld      x25, (9*8)(sp)
+    ld      x26, (10*8)(sp)
+    ld      x27, (11*8)(sp)
+    ld      x9,  (12*8)(sp)
+    fld     f18, (13*8)(sp)
+    fld     f19, (14*8)(sp)
+    fld     f20, (15*8)(sp)
+    fld     f21, (16*8)(sp)
+    fld     f22, (17*8)(sp)
+    fld     f23, (18*8)(sp)
+    fld     f24, (19*8)(sp)
+    fld     f25, (20*8)(sp)
+    fld     f26, (21*8)(sp)
+    fld     f27, (22*8)(sp)
+    addi    sp,  sp, (8 * 24)
     //end, return...
     ret
diff --git a/src/dynarec/rv64/rv64_printer.c b/src/dynarec/rv64/rv64_printer.c
index bdc424c1..db013c32 100644
--- a/src/dynarec/rv64/rv64_printer.c
+++ b/src/dynarec/rv64/rv64_printer.c
@@ -785,6 +785,9 @@ const char* rv64_print(uint32_t data, uintptr_t addr)
                 } else if (imm116 == 0x10) { /* SRAI */
                     insn.name = "srai";
                     insn.imm&=0b111111;
+                } else if (insn.imm==0b011010111000) {
+                    insn.name = "rev8";
+                    PRINT_rd_rs1();
                 }
                 break;
             }
@@ -968,6 +971,20 @@ const char* rv64_print(uint32_t data, uintptr_t addr)
                 }
             }
             break;
+            case 0x10: {
+                switch (funct3) {
+                case 0b010:
+                    insn.name = "sh1add";
+                    break;
+                case 0b100:
+                    insn.name = "sh2add";
+                    break;
+                case 0b110:
+                    insn.name = "sh3add";
+                    break;
+                }
+            }
+            break;
             case 0x20: {
                 switch (funct3) {
                 case 0x0: /* SUB */
diff --git a/src/dynarec/rv64/rv64_prolog.S b/src/dynarec/rv64/rv64_prolog.S
index 0817bdc1..96a85d3b 100644
--- a/src/dynarec/rv64/rv64_prolog.S
+++ b/src/dynarec/rv64/rv64_prolog.S
@@ -11,29 +11,30 @@
 .global rv64_prolog
 rv64_prolog:
     //save all 18 used register
-    addi    sp,  sp, -(8 * 22)
+    addi    sp,  sp, -(8 * 24)  // 16 bytes aligned
     sd      ra, (sp)  // save ra
     sd      x8, 8(sp) // save fp
-    sd      x18, 16(sp)
-    sd      x19, 24(sp)
-    sd      x20, 32(sp)
-    sd      x21, 40(sp)
-    sd      x22, 48(sp)
-    sd      x23, 56(sp)
-    sd      x24, 64(sp)
-    sd      x25, 72(sp)
-    sd      x26, 80(sp)
-    sd      x27, 88(sp)
-    fsd     f18, (12*8)(sp)
-    fsd     f19, (13*8)(sp)
-    fsd     f20, (14*8)(sp)
-    fsd     f21, (15*8)(sp)
-    fsd     f22, (16*8)(sp)
-    fsd     f23, (17*8)(sp)
-    fsd     f24, (18*8)(sp)
-    fsd     f25, (19*8)(sp)
-    fsd     f26, (20*8)(sp)
-    fsd     f27, (21*8)(sp)
+    sd      x18, (2*8)(sp)
+    sd      x19, (3*8)(sp)
+    sd      x20, (4*8)(sp)
+    sd      x21, (5*8)(sp)
+    sd      x22, (6*8)(sp)
+    sd      x23, (7*8)(sp)
+    sd      x24, (8*8)(sp)
+    sd      x25, (9*8)(sp)
+    sd      x26, (10*8)(sp)
+    sd      x27, (11*8)(sp)
+    sd      x9,  (12*8)(sp)
+    fsd     f18, (13*8)(sp)
+    fsd     f19, (14*8)(sp)
+    fsd     f20, (15*8)(sp)
+    fsd     f21, (16*8)(sp)
+    fsd     f22, (17*8)(sp)
+    fsd     f23, (18*8)(sp)
+    fsd     f24, (19*8)(sp)
+    fsd     f25, (20*8)(sp)
+    fsd     f26, (21*8)(sp)
+    fsd     f27, (22*8)(sp)
     //setup emu -> register
     ld      x16, (a0)
     ld      x17, 8(a0)