about summary refs log tree commit diff stats
path: root/src/dynarec/rv64/dynarec_rv64_660f.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/dynarec/rv64/dynarec_rv64_660f.c')
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_660f.c 1442
1 files changed, 1087 insertions, 355 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_660f.c b/src/dynarec/rv64/dynarec_rv64_660f.c
index 260ea32b..3f51289e 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 
 #include "debug.h"
@@ -27,7 +26,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
     (void)ip; (void)need_epilog;
 
     uint8_t opcode = F8;
-    uint8_t nextop, u8;
+    uint8_t nextop, u8, s8;
     int32_t i32;
     uint8_t gd, ed;
     uint8_t wback, wb1, wb2, gback;
@@ -37,7 +36,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
     int v0, v1;
     int q0, q1;
     int d0, d1;
-    int64_t fixedaddress;
+    int64_t fixedaddress, gdoffset;
     int unscaled;
 
     MAYUSE(d0);
@@ -49,27 +48,27 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
     MAYUSE(j64);
 
     static const int8_t round_round[] = { RD_RNE, RD_RDN, RD_RUP, RD_RTZ };
-    
+
     switch(opcode) {
         case 0x10:
             INST_NAME("MOVUPD Gx,Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_MV_Q(x3);
             break;
         case 0x11:
             INST_NAME("MOVUPD Ex,Gx");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_MV_Q2(x3);
             if(!MODREG) SMWRITE2();
             break;
         case 0x12:
             INST_NAME("MOVLPD Gx, Eq");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             if(MODREG) {
                 // access register instead of memory is bad opcode!
                 DEFAULT;
@@ -78,33 +77,47 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             SMREAD();
             addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
             LD(x3, wback, fixedaddress);
-            SD(x3, gback, 0);
+            SD(x3, gback, gdoffset+0);
+            break;
+        case 0x13:
+            INST_NAME("MOVLPD Eq, Gx");
+            nextop = F8;
+            GETGX();
+            if(MODREG) {
+                // access register instead of memory is bad opcode!
+                DEFAULT;
+                return addr;
+            }
+            addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+            LD(x3, gback, gdoffset+0);
+            SD(x3, wback, fixedaddress);
+            SMWRITE2();
             break;
         case 0x14:
             INST_NAME("UNPCKLPD Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             // GX->q[1] = EX->q[0];
             LD(x3, wback, fixedaddress+0);
-            SD(x3, gback, 8);
+            SD(x3, gback, gdoffset+8);
             break;
         case 0x15:
             INST_NAME("UNPCKHPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             // GX->q[0] = GX->q[1];
-            LD(x3, gback, 8);
-            SD(x3, gback, 0);
+            LD(x3, gback, gdoffset+8);
+            SD(x3, gback, gdoffset+0);
             // GX->q[1] = EX->q[1];
             LD(x3, wback, fixedaddress+8);
-            SD(x3, gback, 8);
+            SD(x3, gback, gdoffset+8);
             break;
         case 0x16:
             INST_NAME("MOVHPD Gx, Eq");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             if(MODREG) {
                 // access register instead of memory is bad opcode!
                 DEFAULT;
@@ -113,56 +126,32 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             SMREAD();
             addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
             LD(x3, wback, fixedaddress);
-            SD(x3, gback, 8);
+            SD(x3, gback, gdoffset+8);
             break;
         case 0x1F:
             INST_NAME("NOP (multibyte)");
             nextop = F8;
             FAKEED;
             break;
-        
-        #define GO(GETFLAGS, NO, YES, F)            \
-            READFLAGS(F);                           \
-            GETFLAGS;                               \
-            nextop=F8;                              \
-            GETGD;                                  \
-            if(MODREG) {                            \
-                ed = xRAX+(nextop&7)+(rex.b<<3);    \
-                SLLI(x4, ed, 48);                   \
-                SRLI(x4, x4, 48);                   \
-            } else {                                \
-                SMREAD();                           \
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); \
-                LHU(x4, ed, fixedaddress);          \
-                ed = x4;                            \
-            }                                       \
-            B##NO(x1, 4+4*4);                       \
-            ADDI(x3, xZR, -1);                      \
-            SRLI(x3, x3, 48);                       \
-            AND(gd, gd, x3);                        \
-            OR(gd, gd, ed);
-
-        GOCOND(0x40, "CMOV", "Gw, Ew");
-        #undef GO
         case 0x28:
             INST_NAME("MOVAPD Gx,Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_MV_Q(x3);
             break;
         case 0x29:
             INST_NAME("MOVAPD Ex,Gx");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_MV_Q2(x3);
             if(!MODREG) SMWRITE2();
             break;
         case 0x2B:
             INST_NAME("MOVNTPD Ex, Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q2(x3);
             break;
@@ -207,15 +196,15 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 case 0x00:
                     INST_NAME("PSHUFB Gx, Ex");
                     nextop = F8;
-                    GETGX(x1);
+                    GETGX();
                     GETEX(x2, 0);
                     sse_forget_reg(dyn, ninst, x5);
 
                     ADDI(x5, xEmu, offsetof(x64emu_t, scratch));
 
                     // perserve gd
-                    LD(x3, gback, 0);
-                    LD(x4, gback, 8);
+                    LD(x3, gback, gdoffset+0);
+                    LD(x4, gback, gdoffset+8);
                     SD(x3, x5, 0);
                     SD(x4, x5, 8);
 
@@ -223,29 +212,29 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         LBU(x3, wback, fixedaddress+i);
                         ANDI(x4, x3, 128);
                         BEQZ(x4, 12);
-                        SB(xZR, gback, i);
+                        SB(xZR, gback, gdoffset+i);
                         BEQZ(xZR, 20); // continue
                         ANDI(x4, x3, 15);
                         ADD(x4, x4, x5);
                         LBU(x4, x4, 0);
-                        SB(x4, gback, i);
+                        SB(x4, gback, gdoffset+i);
                     }
                     break;
                 case 0x01:
                     INST_NAME("PHADDW Gx, Ex");
                     nextop = F8;
-                    GETGX(x1);
+                    GETGX();
                     for (int i=0; i<4; ++i) {
                         // GX->sw[i] = GX->sw[i*2+0]+GX->sw[i*2+1];
-                        LH(x3, gback, 2*(i*2+0));
-                        LH(x4, gback, 2*(i*2+1));
+                        LH(x3, gback, gdoffset+2*(i*2+0));
+                        LH(x4, gback, gdoffset+2*(i*2+1));
                         ADDW(x3, x3, x4);
-                        SH(x3, gback, 2*i);
+                        SH(x3, gback, gdoffset+2*i);
                     }
                     if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                         // GX->q[1] = GX->q[0];
-                        LD(x3, gback, 0);
-                        SD(x3, gback, 8);
+                        LD(x3, gback, gdoffset+0);
+                        SD(x3, gback, gdoffset+8);
                     } else {
                         GETEX(x2, 0);
                         for (int i=0; i<4; ++i) {
@@ -253,47 +242,150 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                             LH(x3, wback, fixedaddress+2*(i*2+0));
                             LH(x4, wback, fixedaddress+2*(i*2+1));
                             ADDW(x3, x3, x4);
-                            SH(x3, gback, 2*(4+i));
+                            SH(x3, gback, gdoffset+2*(4+i));
                         }
                     }
                     break;
                 case 0x02:
                     INST_NAME("PHADDD Gx, Ex");
                     nextop = F8;
-                    GETGX(x1);
+                    GETGX();
                     // GX->sd[0] += GX->sd[1];
-                    LW(x3, gback, 0*4);
-                    LW(x4, gback, 1*4);
+                    LW(x3, gback, gdoffset+0*4);
+                    LW(x4, gback, gdoffset+1*4);
                     ADDW(x3, x3, x4);
-                    SW(x3, gback, 0*4);
+                    SW(x3, gback, gdoffset+0*4);
                     // GX->sd[1] = GX->sd[2] + GX->sd[3];
-                    LW(x3, gback, 2*4);
-                    LW(x4, gback, 3*4);
+                    LW(x3, gback, gdoffset+2*4);
+                    LW(x4, gback, gdoffset+3*4);
                     ADDW(x3, x3, x4);
-                    SW(x3, gback, 1*4);
+                    SW(x3, gback, gdoffset+1*4);
                     if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                         // GX->q[1] = GX->q[0];
-                        LD(x3, gback, 0);
-                        SD(x3, gback, 8);
+                        LD(x3, gback, gdoffset+0);
+                        SD(x3, gback, gdoffset+8);
                     } else {
                         GETEX(x2, 0);
                         // GX->sd[2] = EX->sd[0] + EX->sd[1];
                         LW(x3, wback, fixedaddress+0*4);
                         LW(x4, wback, fixedaddress+1*4);
                         ADDW(x3, x3, x4);
-                        SW(x3, gback, 2*4);
+                        SW(x3, gback, gdoffset+2*4);
                         // GX->sd[3] = EX->sd[2] + EX->sd[3];
                         LW(x3, wback, fixedaddress+2*4);
                         LW(x4, wback, fixedaddress+3*4);
                         ADDW(x3, x3, x4);
-                        SW(x3, gback, 3*4);
+                        SW(x3, gback, gdoffset+3*4);
+                    }
+                    break;
+
+                case 0x04:
+                    INST_NAME("PADDUBSW Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    MOV64x(x5, 32767);
+                    MOV64x(x6, -32768);
+                    for(int i=0; i<8; ++i) {
+                        LBU(x3, gback, gdoffset+i*2);
+                        LB(x4, wback, fixedaddress+i*2);
+                        MUL(x9, x3, x4);
+                        LBU(x3, gback, gdoffset+i*2+1);
+                        LB(x4, wback, fixedaddress+i*2+1);
+                        MUL(x3, x3, x4);
+                        ADD(x3, x3, x9);
+                        if(rv64_zbb) {
+                            MIN(x3, x3, x5);
+                            MAX(x3, x3, x6);
+                        } else {
+                            BLT(x3, x5, 4+4);
+                            MV(x3, x5);
+                            BLT(x6, x3, 4+4);
+                            MV(x3, x6);
+                        }
+                        SH(x3, gback, gdoffset+i*2);
+                    }
+                    break;
+
+                case 0x08:
+                    INST_NAME("PSIGNB Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<16; ++i) {
+                        LB(x3, gback, gdoffset+i);
+                        LB(x4, wback, fixedaddress+i);
+                        BGE(x4, xZR, 4+4);
+                        NEG(x3, x3);
+                        BNE(x4, xZR, 4+4);
+                        MOV_U12(x3, 0);
+                        SB(x3, gback, gdoffset+i);
+                    }
+                    break;
+                case 0x09:
+                    INST_NAME("PSIGNW Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<8; ++i) {
+                        LH(x3, gback, gdoffset+i*2);
+                        LH(x4, wback, fixedaddress+i*2);
+                        BGE(x4, xZR, 4+4);
+                        NEG(x3, x3);
+                        BNE(x4, xZR, 4+4);
+                        MOV_U12(x3, 0);
+                        SH(x3, gback, gdoffset+i*2);
+                    }
+                    break;
+                case 0x0A:
+                    INST_NAME("PSIGND Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<4; ++i) {
+                        LW(x3, gback, gdoffset+i*4);
+                        LW(x4, wback, fixedaddress+i*4);
+                        BGE(x4, xZR, 4+4);
+                        NEG(x3, x3);
+                        BNE(x4, xZR, 4+4);
+                        ADDI(x3, xZR, 0);
+                        SW(x3, gback, gdoffset+i*4);
+                    }
+                    break;
+                case 0x0B:
+                    INST_NAME("PMULHRSW Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<8; ++i) {
+                        LH(x3, gback, gdoffset+i*2);
+                        LH(x4, wback, fixedaddress+i*2);
+                        MUL(x3, x3, x4);
+                        SRAI(x3, x3, 14);
+                        ADDI(x3, x3, 1);
+                        SRAI(x3, x3, 1);
+                        SH(x3, gback, gdoffset+i*2);
+                    }
+                    break;
+                case 0x10:
+                    INST_NAME("PBLENDVB Gx,Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    sse_forget_reg(dyn, ninst, 0); // forget xmm[0]
+                    for (int i=0; i<16; ++i) {
+                        LB(x3, xEmu, offsetof(x64emu_t, xmm[0])+i);
+                        BGE(x3, xZR, 12); // continue
+                        LBU(x3, wback, fixedaddress+i);
+                        SB(x3, gback, gdoffset+i);
+                        // continue
                     }
                     break;
                 case 0x17:
                     INST_NAME("PTEST Gx, Ex");
                     nextop = F8;
                     SETFLAGS(X_ALL, SF_SET);
-                    GETGX(x1);
+                    GETGX();
                     GETEX(x2, 0);
                     CLEAR_FLAGS();
                     SET_DFNONE();
@@ -302,8 +394,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         LD(x6, wback, fixedaddress+8);
 
                         IFX(X_ZF) {
-                            LD(x3, gback, 0);
-                            LD(x4, gback, 8);
+                            LD(x3, gback, gdoffset+0);
+                            LD(x4, gback, gdoffset+8);
                             AND(x3, x3, x5);
                             AND(x4, x4, x6);
                             OR(x3, x3, x4);
@@ -311,9 +403,9 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                             ORI(xFlags, xFlags, 1<<F_ZF);
                         }
                         IFX(X_CF) {
-                            LD(x3, gback, 0);
+                            LD(x3, gback, gdoffset+0);
                             NOT(x3, x3);
-                            LD(x4, gback, 8);
+                            LD(x4, gback, gdoffset+8);
                             NOT(x4, x4);
                             AND(x3, x3, x5);
                             AND(x4, x4, x6);
@@ -323,19 +415,306 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         }
                     }
                     break;
+
+                case 0x1C:
+                    INST_NAME("PABSB Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<16; ++i) {
+                        LB(x4, wback, fixedaddress+i);
+                        BGE(x4, xZR, 4+4);
+                        NEG(x4, x4);
+                        SB(x4, gback, gdoffset+i);
+                    }
+                    break;
+                case 0x1D:
+                    INST_NAME("PABSW Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<8; ++i) {
+                        LH(x4, wback, fixedaddress+i*2);
+                        BGE(x4, xZR, 4+4);
+                        NEG(x4, x4);
+                        SH(x4, gback, gdoffset+i*2);
+                    }
+                    break;
+                case 0x1E:
+                    INST_NAME("PABSD Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    MOV64x(x5, ~(1<<31));
+                    for(int i=0; i<4; ++i) {
+                        LW(x4, wback, fixedaddress+i*4);
+                        BGE(x4, xZR, 4+4);
+                        NEG(x4, x4);
+                        SW(x4, gback, gdoffset+i*4);
+                    }
+                    break;
+
+                case 0x2B:
+                    INST_NAME("PACKUSDW Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    MOV64x(x5, 65535);
+                    for(int i=0; i<4; ++i) {
+                        LW(x3, gback, gdoffset+i*4);
+                        if(rv64_zbb) {
+                            MIN(x3, x3, x5);
+                            MAX(x3, x3, xZR);
+                        } else {
+                            BGE(x3, xZR, 4+4);
+                            MV(x3, xZR);
+                            BLT(x3, x5, 4+4);
+                            MV(x3, x5);
+                        }
+                        SH(x3, gback, gdoffset+i*2);
+                    }
+                    if(MODREG && gd==ed) {
+                        LD(x3, gback, gdoffset+0);
+                        SD(x3, gback, gdoffset+8);
+                    } else for(int i=0; i<4; ++i) {
+                        LW(x3, wback, fixedaddress+i*4);
+                        if(rv64_zbb) {
+                            MIN(x3, x3, x5);
+                            MAX(x3, x3, xZR);
+                        } else {
+                            BGE(x3, xZR, 4+4);
+                            MV(x3, xZR);
+                            BLT(x3, x5, 4+4);
+                            MV(x3, x5);
+                        }
+                        SH(x3, gback, gdoffset+8+i*2);
+                    }
+                    break;
+
+                case 0x30:
+                    INST_NAME("PMOVZXBW Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=7; i>=0; --i) {
+                        LBU(x3, wback, fixedaddress+i);
+                        SH(x3, gback, gdoffset+i*2);
+                    }
+                    break;
+                case 0x31:
+                    INST_NAME("PMOVZXBD Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=3; i>=0; --i) {
+                        LBU(x3, wback, fixedaddress+i);
+                        SW(x3, gback, gdoffset+i*4);
+                    }
+                    break;
+                case 0x32:
+                    INST_NAME("PMOVZXBQ Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=1; i>=0; --i) {
+                        LBU(x3, wback, fixedaddress+i);
+                        SD(x3, gback, gdoffset+i*8);
+                    }
+                    break;
+                case 0x33:
+                    INST_NAME("PMOVZXWD Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=3; i>=0; --i) {
+                        LHU(x3, wback, fixedaddress+i*2);
+                        SW(x3, gback, gdoffset+i*4);
+                    }
+                    break;
+                case 0x34:
+                    INST_NAME("PMOVZXWQ Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=1; i>=0; --i) {
+                        LHU(x3, wback, fixedaddress+i*2);
+                        SD(x3, gback, gdoffset+i*8);
+                    }
+                    break;
+                case 0x35:
+                    INST_NAME("PMOVZXDQ Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=1; i>=0; --i) {
+                        LWU(x3, wback, fixedaddress+i*4);
+                        SD(x3, gback, gdoffset+i*8);
+                    }
+                    break;
+
+                case 0x38:
+                    INST_NAME("PMINSB Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<16; ++i) {
+                        LB(x3, gback, gdoffset+i);
+                        LB(x4, wback, fixedaddress+i);
+                        if(rv64_zbb) MIN(x4, x3, x4); else BLT(x3, x4, 4+4);
+                        SB(x4, gback, gdoffset+i);
+                    }
+                    break;
+                case 0x39:
+                    INST_NAME("PMINSD Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<4; ++i) {
+                        LW(x3, gback, gdoffset+i*4);
+                        LW(x4, wback, fixedaddress+i*4);
+                        if(rv64_zbb) MIN(x4, x3, x4); else BLT(x3, x4, 4+4);
+                        SW(x4, gback, gdoffset+i*4);
+                    }
+                    break;
                 case 0x3A:
                     INST_NAME("PMINUW Gx, Ex");  // SSE4 opcode!
                     nextop = F8;
-                    GETGX(x1);
+                    GETGX();
                     GETEX(x2, 0);
                     for(int i=0; i<8; ++i) {
-                        // if(GX->uw[i]>EX->uw[i]) GX->uw[i] = EX->uw[i];
-                        LHU(x3, gback, i*2);
+                        LHU(x3, gback, gdoffset+i*2);
                         LHU(x4, wback, fixedaddress+i*2);
-                        BLTU(x3, x4, 8);
-                        SH(x4, gback, i*2);
+                        if(rv64_zbb) MINU(x4, x3, x4); else BLTU(x3, x4, 4+4);
+                        SH(x4, gback, gdoffset+i*2);
                     }
                     break;
+                case 0x3B:
+                    INST_NAME("PMINUD Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<4; ++i) {
+                        LWU(x3, gback, gdoffset+i*4);
+                        LWU(x4, wback, fixedaddress+i*4);
+                        if(rv64_zbb) MINU(x4, x3, x4); else BLTU(x3, x4, 4+4);
+                        SW(x4, gback, gdoffset+i*4);
+                    }
+                    break;
+                case 0x3C:
+                    INST_NAME("PMAXSB Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<16; ++i) {
+                        LB(x3, gback, gdoffset+i);
+                        LB(x4, wback, fixedaddress+i);
+                        if(rv64_zbb) MAX(x4, x3, x4); else BLT(x4, x3, 4+4);
+                        SB(x4, gback, gdoffset+i);
+                    }
+                    break;
+                case 0x3D:
+                    INST_NAME("PMAXSD Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<4; ++i) {
+                        LW(x3, gback, gdoffset+i*4);
+                        LW(x4, wback, fixedaddress+i*4);
+                        if(rv64_zbb) MAX(x4, x3, x4); else BLT(x4, x3, 4+4);
+                        SW(x4, gback, gdoffset+i*4);
+                    }
+                    break;
+                case 0x3E:
+                    INST_NAME("PMAXUW Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<8; ++i) {
+                        LHU(x3, gback, gdoffset+i*2);
+                        LHU(x4, wback, fixedaddress+i*2);
+                        if(rv64_zbb) MAXU(x4, x3, x4); else BLTU(x4, x3, 4+4);
+                        SH(x4, gback, gdoffset+i*2);
+                    }
+                    break;
+                case 0x3F:
+                    INST_NAME("PMAXUD Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<4; ++i) {
+                        LWU(x3, gback, gdoffset+i*4);
+                        LWU(x4, wback, fixedaddress+i*4);
+                        if(rv64_zbb) MAXU(x4, x3, x4); else BLTU(x4, x3, 4+4);
+                        SW(x4, gback, gdoffset+i*4);
+                    }
+                    break;
+                case 0x40:
+                    INST_NAME("PMULLD Gx, Ex");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    for(int i=0; i<4; ++i) {
+                        LW(x3, gback, gdoffset+i*4);
+                        LW(x4, wback, fixedaddress+i*4);
+                        MUL(x3, x3, x4);
+                        SW(x3, gback, gdoffset+i*4);
+                    }
+                    break;
+                case 0xDB:
+                    INST_NAME("AESIMC Gx, Ex");  // AES-NI
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 0);
+                    SSE_LOOP_MV_Q(x3);
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd);
+                    CALL(native_aesimc, -1);
+                    break;
+                case 0xDC:
+                    INST_NAME("AESENC Gx, Ex");  // AES-NI
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd);
+                    CALL(native_aese, -1);
+                    GETGX();
+                    GETEX(x2, 0);
+                    SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
+                    break;
+                case 0xDD:
+                    INST_NAME("AESENCLAST Gx, Ex");  // AES-NI
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd);
+                    CALL(native_aeselast, -1);
+                    GETGX();
+                    GETEX(x2, 0);
+                    SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
+                    break;
+                case 0xDE:
+                    INST_NAME("AESDEC Gx, Ex");  // AES-NI
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd);
+                    CALL(native_aesd, -1);
+                    GETGX();
+                    GETEX(x2, 0);
+                    SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
+                    break;
+
+                case 0xDF:
+                    INST_NAME("AESDECLAST Gx, Ex");  // AES-NI
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd);
+                    CALL(native_aesdlast, -1);
+                    GETGX();
+                    GETEX(x2, 0);
+                    SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
+                    break;
                 default:
                     DEFAULT;
             }
@@ -346,19 +725,20 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 case 0x0B:
                     INST_NAME("ROUNDSD Gx, Ex, Ib");
                     nextop = F8;
-                    GETEXSD(d0, 0);
+                    GETEXSD(d0, 1);
                     GETGXSD_empty(v0);
                     d1 = fpu_get_scratch(dyn);
+                    v1 = fpu_get_scratch(dyn);
                     u8 = F8;
                     FEQD(x2, d0, d0);
                     BNEZ_MARK(x2);
-                    FADDD(v0, d0, d0);
+                    if (v0!=d0) FMVD(v0, d0);
                     B_NEXT_nocond;
                     MARK; // d0 is not nan
-                    FABSD(v0, d0);
+                    FABSD(v1, d0);
                     MOV64x(x3, 1ULL << __DBL_MANT_DIG__);
                     FCVTDL(d1, x3, RD_RTZ);
-                    FLTD(x3, v0, d1);
+                    FLTD(x3, v1, d1);
                     BNEZ_MARK2(x3);
                     if (v0!=d0) FMVD(v0, d0);
                     B_NEXT_nocond;
@@ -366,17 +746,258 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     if(u8&4) {
                         u8 = sse_setround(dyn, ninst, x4, x2);
                         FCVTLD(x5, d0, RD_DYN);
-                        FCVTDL(v0, x5, RD_DYN);
+                        FCVTDL(v0, x5, RD_RTZ);
                         x87_restoreround(dyn, ninst, u8);
                     } else {
                         FCVTLD(x5, d0, round_round[u8&3]);
-                        FCVTDL(v0, x5, round_round[u8&3]);
+                        FCVTDL(v0, x5, RD_RTZ);
                     }
                     break;
-                default:
+                case 0x09:
+                    INST_NAME("ROUNDPD Gx, Ex, Ib");   // round both doubles of Ex to integral values, result in Gx
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 1);
+                    u8 = F8;   // rounding-control immediate; must stay intact for both lanes
+                    d0 = fpu_get_scratch(dyn);
+                    d1 = fpu_get_scratch(dyn);
+                    v1 = fpu_get_scratch(dyn);
+                    MOV64x(x3, 1ULL << __DBL_MANT_DIG__);
+                    FCVTDL(d1, x3, RD_RTZ);   // d1 = magnitude threshold compared against |value| below
+
+                    // i = 0
+                    FLD(d0, wback, fixedaddress);
+                    FEQD(x4, d0, d0);   // x4 == 0 only when d0 is NaN
+                    BNEZ(x4, 8);
+                    B_MARK_nocond;   // NaN: store the value unchanged
+                    // d0 is not nan
+                    FABSD(v1, d0);
+                    FLTD(x4, v1, d1);
+                    BNEZ(x4, 8);
+                    B_MARK_nocond;   // |d0| >= threshold: store the value unchanged
+                    if(u8&4) {
+                        i32 = sse_setround(dyn, ninst, x4, x5);   // keep the return value out of u8: lane 1 re-tests the immediate
+                        FCVTLD(x5, d0, RD_DYN);
+                        FCVTDL(d0, x5, RD_RTZ);
+                        x87_restoreround(dyn, ninst, i32);
+                    } else {
+                        FCVTLD(x5, d0, round_round[u8&3]);
+                        FCVTDL(d0, x5, RD_RTZ);
+                    }
+                    MARK;
+                    FSD(d0, gback, gdoffset+0);
+
+                    // i = 1
+                    FLD(d0, wback, fixedaddress+8);
+                    FEQD(x4, d0, d0);   // x4 == 0 only when d0 is NaN
+                    BNEZ(x4, 8);
+                    B_MARK2_nocond;   // NaN: store the value unchanged
+                    // d0 is not nan
+                    FABSD(v1, d0);
+                    FLTD(x4, v1, d1);
+                    BNEZ(x4, 8);
+                    B_MARK2_nocond;   // |d0| >= threshold: store the value unchanged
+                    if(u8&4) {
+                        i32 = sse_setround(dyn, ninst, x4, x5);   // same as lane 0; u8 still holds the immediate here
+                        FCVTLD(x5, d0, RD_DYN);
+                        FCVTDL(d0, x5, RD_RTZ);
+                        x87_restoreround(dyn, ninst, i32);
+                    } else {
+                        FCVTLD(x5, d0, round_round[u8&3]);
+                        FCVTDL(d0, x5, RD_RTZ);
+                    }
+                    MARK2;
+                    FSD(d0, gback, gdoffset+8);
+                    break;
+                case 0x0E:
+                    INST_NAME("PBLENDW Gx, Ex, Ib");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 1);
+                    u8 = F8;
+                    i32 = 0;
+                    if (MODREG && gd==ed) break;
+                    while (u8)
+                        if(u8&1) {
+                            if(!(i32&1) && u8&2) {
+                                if(!(i32&3) && (u8&0xf)==0xf) {
+                                    // whole 64bits
+                                    LD(x3, wback, fixedaddress+8*(i32>>2));
+                                    SD(x3, gback, gdoffset+8*(i32>>2));
+                                    i32+=4;
+                                    u8>>=4;
+                                } else {
+                                    // 32bits
+                                    LWU(x3, wback, fixedaddress+4*(i32>>1));
+                                    SW(x3, gback, gdoffset+4*(i32>>1));
+                                    i32+=2;
+                                    u8>>=2;
+                                }
+                            } else {
+                                // 16 bits
+                                LHU(x3, wback, fixedaddress+2*i32);
+                                SH(x3, gback, gdoffset+2*i32);
+                                i32++;
+                                u8>>=1;
+                            }
+                        } else {
+                            // nope
+                            i32++;
+                            u8>>=1;
+                        }
+                    break;
+                case 0x0F:
+                    INST_NAME("PALIGNR Gx, Ex, Ib");   // Gx = bytes [Ib .. Ib+15] of the 32-byte concatenation Gx:Ex (Ex is the low half)
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 1);
+                    u8 = F8;
+                    sse_forget_reg(dyn, ninst, x5);   // NOTE(review): other call sites pass an SSE register index (gd/ed); x5 is the address scratch used below - confirm this argument
+                    ADDI(x5, xEmu, offsetof(x64emu_t, scratch));   // x5 -> emu->scratch, used as a 16-byte temporary
+                    // preserve gd: copy it to the scratch area, since the byte loop below overwrites it in place
+                    LD(x3, gback, gdoffset+0);
+                    LD(x4, gback, gdoffset+8);
+                    SD(x3, x5, 0);
+                    SD(x4, x5, 8);
+                    if(u8>31) {
+                        SD(xZR, gback, gdoffset+0);   // shift of 32 bytes or more: result is all zeroes
+                        SD(xZR, gback, gdoffset+8);
+                    } else {
+                        for (int i=0; i<16; ++i, ++u8) {   // u8 walks the source byte index for destination byte i
+                            if (u8>15) {
+                                if(u8>31) {
+                                    SB(xZR, gback, gdoffset+i);   // past the end of the concatenation: zero fill
+                                    continue;
+                                }
+                                else LBU(x3, x5, u8-16);   // upper half: saved copy of the original Gx
+                            } else {
+                                LBU(x3, wback, fixedaddress+u8);   // lower half: Ex
+                            }
+                            SB(x3, gback, gdoffset+i);
+                        }
+                    }
+                    break;
+                case 0x16:
+                    if(rex.w) {INST_NAME("PEXTRQ Ed, Gx, Ib");} else {INST_NAME("PEXTRD Ed, Gx, Ib");}
+                    nextop = F8;
+                    GETGX();
+                    GETED(1);
+                    u8 = F8;
+                    if(rex.w)
+                        LD(ed, gback, gdoffset+8*(u8&1));
+                    else
+                        LWU(ed, gback, gdoffset+4*(u8&3));
+                    if (wback) {
+                        SDxw(ed, wback, fixedaddress);
+                        SMWRITE2();
+                    }
+                    break;
+                case 0x20:
+                    INST_NAME("PINSRB Gx, ED, Ib");   // Gx->ub[Ib & 15] = low byte of Ed
+                    nextop = F8;
+                    GETGX();
+                    GETED(1);
+                    u8 = F8;
+                    SB(ed, gback, gdoffset+(u8&0xF));   // '+' binds tighter than '&': mask the lane index first, then add the register offset
+                    break;
+                case 0x21:
+                    INST_NAME("INSERTPS GX, EX, Ib");
+                    nextop = F8;
+                    GETGX();
+                    GETEX(x2, 1);
+                    u8 = F8;
+                    if(MODREG) s8 = (u8>>6)&3; else s8 = 0;   // source lane: Ib[7:6] for a register source, lane 0 for memory
+                    // GX->ud[(tmp8u>>4)&3] = EX->ud[tmp8s];
+                    LWU(x3, wback, fixedaddress+4*s8);
+                    SW(x3, gback, gdoffset+4*((u8>>4)&3));   // destination lane is Ib[5:4] only; drop Ib[7:6] so the store stays inside Gx
+                    for(int i=0; i<4; ++i) {   // Ib[3:0] is the zero mask, applied after the insert
+                        if(u8&(1<<i))
+                            // GX->ud[i] = 0;
+                            SW(xZR, gback, gdoffset+4*i);
+                    }
+                    break;
+                case 0x22:
+                    INST_NAME("PINSRD Gx, ED, Ib");
+                    nextop = F8;
+                    GETGX();
+                    GETED(1);
+                    u8 = F8;
+                    if(rex.w) {
+                        SD(ed, gback, gdoffset+8*(u8&0x1));
+                    } else {
+                        SW(ed, gback, gdoffset+4*(u8&0x3));
+                    }
+                    break;
+                case 0x44:
+                    INST_NAME("PCLMULQDQ Gx, Ex, Ib");
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd); // gx
+                    if(MODREG) {
+                        ed = (nextop&7)+(rex.b<<3);
+                        sse_forget_reg(dyn, ninst, ed);
+                        MOV32w(x2, ed);
+                        MOV32w(x3, 0);  // p = NULL
+                    } else {
+                        MOV32w(x2, 0);
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x3, x5, &fixedaddress, rex, NULL, 0, 1);
+                        if(ed!=x3) {
+                            MV(x3, ed);
+                        }
+                    }
+                    u8 = F8;
+                    MOV32w(x4, u8);
+                    CALL(native_pclmul, -1);
+                    break;
+                case 0xDF:
+                    INST_NAME("AESKEYGENASSIST Gx, Ex, Ib");  // AES-NI
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd); // gx
+                    if(MODREG) {
+                        ed = (nextop&7)+(rex.b<<3);
+                        sse_forget_reg(dyn, ninst, ed);
+                        MOV32w(x2, ed);   // register source: pass its index
+                        MOV32w(x3, 0);  //p = NULL
+                    } else {
+                        MOV32w(x2, 0);
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x3, x5, &fixedaddress, rex, NULL, 0, 1);   // x5 as scratch (as PCLMULQDQ does) so the x2 = 0 set above is presumably not overwritten
+                        if(ed!=x3) {
+                            MV(x3, ed);   // memory source: x3 carries the operand address
+                        }
+                    }
+                    u8 = F8;
+                    MOV32w(x4, u8);   // immediate byte goes in x4
+                    CALL(native_aeskeygenassist, -1);
+                    break;
+            default:
                     DEFAULT;
             }
             break;
+        #define GO(GETFLAGS, NO, YES, F)            \
+            READFLAGS(F);                           \
+            GETFLAGS;                               \
+            nextop=F8;                              \
+            GETGD;                                  \
+            if(MODREG) {                            \
+                ed = xRAX+(nextop&7)+(rex.b<<3);    \
+                ZEXTH(x4, ed);                      \
+                ed = x4;                            \
+            } else {                                \
+                SMREAD();                           \
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); \
+                LHU(x4, ed, fixedaddress);          \
+                ed = x4;                            \
+            }                                       \
+            B##NO(x1, 4+3*4);                       \
+            LUI(x3, 0xffff0);                       \
+            AND(gd, gd, x3);                        \
+            OR(gd, gd, ed);
+
+        GOCOND(0x40, "CMOV", "Gw, Ew");   // GO body: 16-bit source ends up in x4; B##NO (presumably the "condition not met" branch) jumps over the LUI/AND/OR merge that otherwise replaces the low 16 bits of gd - assumes each of those three emits one 4-byte instruction
+        #undef GO
         case 0x50:
             INST_NAME("PMOVMSKD Gd, Ex");
             nextop = F8;
@@ -390,11 +1011,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 if (i) SLLI(x2, x2, 1);
                 OR(gd, gd, x2);
             }
-            break;   
+            break;
         case 0x51:
             INST_NAME("SQRTPD Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             if(!box64_dynarec_fastnan) {
@@ -411,42 +1032,42 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     BEQ(x3, xZR, 8);
                     FNEGD(d0, d0);
                 }
-                FSD(d0, gback, i*8);
+                FSD(d0, gback, gdoffset+i*8);
             }
             break;
         case 0x54:
             INST_NAME("ANDPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_Q(x3, x4, AND(x3, x3, x4));
             break;
         case 0x55:
             INST_NAME("ANDNPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_Q(x3, x4, NOT(x3, x3); AND(x3, x3, x4));
             break;
         case 0x56:
             INST_NAME("ORPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_Q(x3, x4, OR(x3, x3, x4));
             break;
         case 0x57:
             INST_NAME("XORPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
             break;
         case 0x58:
             INST_NAME("ADDPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_FQ(x3, x4, {
                 if(!box64_dynarec_fastnan) {
                     FEQD(x3, v0, v0);
@@ -466,7 +1087,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("MULPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_FQ(x3, x4, {
                 if(!box64_dynarec_fastnan) {
                     FEQD(x3, v0, v0);
@@ -485,24 +1106,24 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5A:
             INST_NAME("CVTPD2PS Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             // GX->f[0] = EX->d[0];
             FLD(d0, wback, fixedaddress+0);
             FCVTSD(d0, d0);
-            FSD(d0, gback, 0);
+            FSD(d0, gback, gdoffset+0);
             // GX->f[1] = EX->d[1];
             FLD(d0, wback, fixedaddress+8);
             FCVTSD(d0, d0);
-            FSD(d0, gback, 4);
+            FSD(d0, gback, gdoffset+4);
             // GX->q[1] = 0;
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+8);
             break;
         case 0x5B:
             INST_NAME("CVTPS2DQ Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             u8 = sse_setround(dyn, ninst, x6, x4);
@@ -513,7 +1134,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 SUB(x5, x5, x3);
                 BEQZ(x5, 8);
                 LUI(x3, 0x80000); // INT32_MIN
-                SW(x3, gback, 4*i);
+                SW(x3, gback, gdoffset+4*i);
             }
             x87_restoreround(dyn, ninst, u8);
             break;
@@ -521,7 +1142,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("SUBPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_FQ(x3, x4, {
                 if(!box64_dynarec_fastnan) {
                     FEQD(x3, v0, v0);
@@ -540,12 +1161,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5D:
             INST_NAME("MINPD Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
             for (int i=0; i<2; ++i) {
-                FLD(d0, gback, 8*i);
+                FLD(d0, gback, gdoffset+8*i);
                 FLD(d1, wback, fixedaddress+8*i);
                 FEQD(x3, d0, d0);
                 FEQD(x4, d1, d1);
@@ -553,14 +1174,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 BEQ(x3, xZR, 12);
                 FLTD(x3, d1, d0);
                 BEQ(x3, xZR, 8); // continue
-                FSD(d1, gback, 8*i);
+                FSD(d1, gback, gdoffset+8*i);
             }
             break;
         case 0x5E:
             INST_NAME("DIVPD Gx, Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_FQ(x3, x4, {
                 if(!box64_dynarec_fastnan) {
                     FEQD(x3, v0, v0);
@@ -579,12 +1200,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5F:
             INST_NAME("MAXPD Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
             for (int i=0; i<2; ++i) {
-                FLD(d0, gback, 8*i);
+                FLD(d0, gback, gdoffset+8*i);
                 FLD(d1, wback, fixedaddress+8*i);
                 FEQD(x3, d0, d0);
                 FEQD(x4, d1, d1);
@@ -592,54 +1213,54 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 BEQ(x3, xZR, 12);
                 FLTD(x3, d0, d1);
                 BEQ(x3, xZR, 8); // continue
-                FSD(d1, gback, 8*i);
+                FSD(d1, gback, gdoffset+8*i);
             }
             break;
         case 0x60:
             INST_NAME("PUNPCKLBW Gx,Ex");
             nextop = F8;
-            GETGX(x2);
+            GETGX();
             for(int i=7; i>0; --i) { // 0 is untouched
                 // GX->ub[2 * i] = GX->ub[i];
-                LBU(x3, gback, i);
-                SB(x3, gback, 2*i);
+                LBU(x3, gback, gdoffset+i);
+                SB(x3, gback, gdoffset+2*i);
             }
             if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                 for(int i=0; i<8; ++i) {
                     // GX->ub[2 * i + 1] = GX->ub[2 * i];
-                    LBU(x3, gback, 2*i);
-                    SB(x3, gback, 2*i+1);
+                    LBU(x3, gback, gdoffset+2*i);
+                    SB(x3, gback, gdoffset+2*i+1);
                 }
             } else {
                 GETEX(x1, 0);
                 for(int i=0; i<8; ++i) {
                     // GX->ub[2 * i + 1] = EX->ub[i];
                     LBU(x3, wback, fixedaddress+i);
-                    SB(x3, gback, 2*i+1);
+                    SB(x3, gback, gdoffset+2*i+1);
                 }
             }
             break;
         case 0x61:
             INST_NAME("PUNPCKLWD Gx,Ex");
             nextop = F8;
-            GETGX(x2);
+            GETGX();
             for(int i=3; i>0; --i) {
                 // GX->uw[2 * i] = GX->uw[i];
-                LHU(x3, gback, i*2);
-                SH(x3, gback, 2*i*2);
+                LHU(x3, gback, gdoffset+i*2);
+                SH(x3, gback, gdoffset+2*i*2);
             }
             if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                 for(int i=0; i<4; ++i) {
                     // GX->uw[2 * i + 1] = GX->uw[2 * i];
-                    LHU(x3, gback, 2*i*2);
-                    SH(x3, gback, (2*i+1)*2);
+                    LHU(x3, gback, gdoffset+2*i*2);
+                    SH(x3, gback, gdoffset+(2*i+1)*2);
                 }
             } else {
                 GETEX(x1, 0);
                 for(int i=0; i<4; ++i) {
                     // GX->uw[2 * i + 1] = EX->uw[i];
                     LHU(x3, wback, fixedaddress+i*2);
-                    SH(x3, gback, (2*i+1)*2);
+                    SH(x3, gback, gdoffset+(2*i+1)*2);
                 }
             }
             break;
@@ -647,71 +1268,108 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("PUNPCKLDQ Gx,Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             // GX->ud[3] = EX->ud[1];
-            LWU(x3, x1, fixedaddress+1*4);
-            SW(x3, x2, 3*4);
+            LWU(x3, wback, fixedaddress+1*4);
+            SW(x3, gback, gdoffset+3*4);
             // GX->ud[2] = GX->ud[1];
-            LWU(x3, x2, 1*4);
-            SW(x3, x2, 2*4);
+            LWU(x3, gback, gdoffset+1*4);
+            SW(x3, gback, gdoffset+2*4);
             // GX->ud[1] = EX->ud[0];
-            LWU(x3, x1, fixedaddress+0*4);
-            SW(x3, x2, 1*4);
+            LWU(x3, wback, fixedaddress+0*4);
+            SW(x3, gback, gdoffset+1*4);
+            break;
+        case 0x63:
+            INST_NAME("PACKSSWB Gx, Ex");
+            nextop = F8;
+            GETGX();
+            GETEX(x2, 0);
+            MOV64x(x5, 127);
+            MOV64x(x6, -128);
+            for(int i=0; i<8; ++i) {
+                LH(x3, gback, gdoffset+i*2);
+                if(rv64_zbb) {
+                    MIN(x3, x3, x5);
+                    MAX(x3, x3, x6);
+                } else {
+                    BLT(x3, x5, 4+4);
+                    MV(x3, x5);
+                    BGE(x3, x6, 4+4);
+                    MV(x3, x6);
+                }
+                SB(x3, gback, gdoffset+i);
+            }
+            if(MODREG && gd==ed) {
+                LD(x3, gback, gdoffset+0);
+                SD(x3, gback, gdoffset+8);
+            } else for(int i=0; i<8; ++i) {
+                LH(x3, wback, fixedaddress+i*2);
+                if(rv64_zbb) {
+                    MIN(x3, x3, x5);
+                    MAX(x3, x3, x6);
+                } else {
+                    BLT(x3, x5, 4+4);
+                    MV(x3, x5);
+                    BGE(x3, x6, 4+4);
+                    MV(x3, x6);
+                }
+                SB(x3, gback, gdoffset+8+i);
+            }
             break;
         case 0x64:
             INST_NAME("PCMPGTB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<16; ++i) {
                 // GX->ub[i] = (GX->sb[i]>EX->sb[i])?0xFF:0x00;
                 LB(x3, wback, fixedaddress+i);
-                LB(x4, gback, i);
+                LB(x4, gback, gdoffset+i);
                 SLT(x3, x3, x4);
                 NEG(x3, x3);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0x65:
             INST_NAME("PCMPGTW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
                 // GX->uw[i] = (GX->sw[i]>EX->sw[i])?0xFFFF:0x0000;
                 LH(x3, wback, fixedaddress+i*2);
-                LH(x4, gback, i*2);
+                LH(x4, gback, gdoffset+i*2);
                 SLT(x3, x3, x4);
                 NEG(x3, x3);
-                SH(x3, gback, i*2);
+                SH(x3, gback, gdoffset+i*2);
             }
             break;
         case 0x66:
             INST_NAME("PCMPGTD Gx,Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             SSE_LOOP_DS(x3, x4, SLT(x4, x4, x3); SLLI(x3, x4, 63); SRAI(x3, x3, 63));
             break;
         case 0x67:
             INST_NAME("PACKUSWB Gx, Ex");
             nextop = F8;
-            GETGX(x2);
+            GETGX();
             ADDI(x5, xZR, 0xFF);
             for(int i=0; i<8; ++i) {
                 // GX->ub[i] = (GX->sw[i]<0)?0:((GX->sw[i]>0xff)?0xff:GX->sw[i]);
-                LH(x3, gback, i*2);
+                LH(x3, gback, gdoffset+i*2);
                 BGE(x5, x3, 8);
                 ADDI(x3, xZR, 0xFF);
                 NOT(x4, x3);
                 SRAI(x4, x4, 63);
                 AND(x3, x3, x4);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                 // GX->q[1] = GX->q[0];
-                LD(x3, gback, 0*8);
-                SD(x3, gback, 1*8);
+                LD(x3, gback, gdoffset+0*8);
+                SD(x3, gback, gdoffset+1*8);
             } else {
                 GETEX(x1, 0);
                 for(int i=0; i<8; ++i) {
@@ -722,55 +1380,55 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     NOT(x4, x3);
                     SRAI(x4, x4, 63);
                     AND(x3, x3, x4);
-                    SB(x3, gback, 8+i);
+                    SB(x3, gback, gdoffset+8+i);
                 }
             }
             break;
         case 0x68:
             INST_NAME("PUNPCKHBW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             for(int i=0; i<8; ++i) {
                 // GX->ub[2 * i] = GX->ub[i + 8];
-                LBU(x3, gback, i+8);
-                SB(x3, gback, 2*i);
+                LBU(x3, gback, gdoffset+i+8);
+                SB(x3, gback, gdoffset+2*i);
             }
             if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                 for(int i=0; i<8; ++i) {
                     // GX->ub[2 * i + 1] = GX->ub[2 * i];
-                    LBU(x3, gback, 2*i);
-                    SB(x3, gback, 2*i+1);
+                    LBU(x3, gback, gdoffset+2*i);
+                    SB(x3, gback, gdoffset+2*i+1);
                 }
             } else {
                 GETEX(x2, 0);
                 for(int i=0; i<8; ++i) {
                     // GX->ub[2 * i + 1] = EX->ub[i + 8];
                     LBU(x3, wback, fixedaddress+i+8);
-                    SB(x3, gback, 2*i+1);
+                    SB(x3, gback, gdoffset+2*i+1);
                 }
             }
             break;
         case 0x69:
             INST_NAME("PUNPCKHWD Gx,Ex");
             nextop = F8;
-            GETGX(x2);
+            GETGX();
             for(int i=0; i<4; ++i) {
                 // GX->uw[2 * i] = GX->uw[i + 4];
-                LHU(x3, gback, (i+4)*2);
-                SH(x3, gback, 2*i*2);
+                LHU(x3, gback, gdoffset+(i+4)*2);
+                SH(x3, gback, gdoffset+2*i*2);
             }
             if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                 for(int i=0; i<4; ++i) {
                     // GX->uw[2 * i + 1] = GX->uw[2 * i];
-                    LHU(x3, gback, 2*i*2);
-                    SH(x3, gback, (2*i+1)*2);
+                    LHU(x3, gback, gdoffset+2*i*2);
+                    SH(x3, gback, gdoffset+(2*i+1)*2);
                 }
             } else {
                 GETEX(x1, 0);
                 for(int i=0; i<4; ++i) {
                     // GX->uw[2 * i + 1] = EX->uw[i + 4];
                     LHU(x3, wback, fixedaddress+(i+4)*2);
-                    SH(x3, gback, (2*i+1)*2);
+                    SH(x3, gback, gdoffset+(2*i+1)*2);
                 }
             }
             break;
@@ -778,41 +1436,41 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("PUNPCKHDQ Gx,Ex");
             nextop = F8;
             GETEX(x1, 0);
-            GETGX(x2);
+            GETGX();
             // GX->ud[0] = GX->ud[2];
-            LWU(x3, gback, 2*4);
-            SW(x3, gback, 0*4);
+            LWU(x3, gback, gdoffset+2*4);
+            SW(x3, gback, gdoffset+0*4);
             // GX->ud[1] = EX->ud[2];
             LWU(x3, wback, fixedaddress+2*4);
-            SW(x3, gback, 1*4);
+            SW(x3, gback, gdoffset+1*4);
             // GX->ud[2] = GX->ud[3];
-            LWU(x3, gback, 3*4);
-            SW(x3, gback, 2*4);
+            LWU(x3, gback, gdoffset+3*4);
+            SW(x3, gback, gdoffset+2*4);
             // GX->ud[3] = EX->ud[3];
             if (!(MODREG && (gd==ed))) {
                 LWU(x3, wback, fixedaddress+3*4);
-                SW(x3, gback, 3*4);
+                SW(x3, gback, gdoffset+3*4);
             }
             break;
         case 0x6B:
             INST_NAME("PACKSSDW Gx,Ex");
             nextop = F8;
-            GETGX(x2);
+            GETGX();
             MOV64x(x5, 32768);
             NEG(x6, x5);
             for(int i=0; i<4; ++i) {
                 // GX->sw[i] = (GX->sd[i]<-32768)?-32768:((GX->sd[i]>32767)?32767:GX->sd[i]);
-                LW(x3, gback, i*4);
+                LW(x3, gback, gdoffset+i*4);
                 BGE(x5, x3, 8);
                 ADDI(x3, x5, -1);
                 BGE(x3, x6, 8);
                 MV(x3, x6);
-                SH(x3, gback, i*2);
+                SH(x3, gback, gdoffset+i*2);
             }
             if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
                 // GX->q[1] = GX->q[0];
-                LD(x3, gback, 0*8);
-                SD(x3, gback, 1*8);
+                LD(x3, gback, gdoffset+0*8);
+                SD(x3, gback, gdoffset+1*8);
             } else {
                 GETEX(x1, 0);
                 for(int i=0; i<4; ++i) {
@@ -822,32 +1480,32 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     ADDI(x3, x5, -1);
                     BGE(x3, x6, 8);
                     MV(x3, x6);
-                    SH(x3, gback, (4+i)*2);
+                    SH(x3, gback, gdoffset+(4+i)*2);
                 }
             }
             break;
         case 0x6C:
             INST_NAME("PUNPCKLQDQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             if(MODREG) {
                 v1 = sse_get_reg(dyn, ninst, x2, (nextop&7)+(rex.b<<3), 0);
-                FSD(v1, gback, 8);
+                FSD(v1, gback, gdoffset+8);
             } else {
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0);
                 LD(x3, ed, fixedaddress+0);
-                SD(x3, gback, 8);
+                SD(x3, gback, gdoffset+8);
             }
             break;
         case 0x6D:
             INST_NAME("PUNPCKHQDQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
-            LD(x3, gback, 8);
-            SD(x3, gback, 0);
+            LD(x3, gback, gdoffset+8);
+            SD(x3, gback, gdoffset+0);
             LD(x3, wback, fixedaddress+8);
-            SD(x3, gback, 8);
+            SD(x3, gback, gdoffset+8);
             break;
         case 0x6E:
             INST_NAME("MOVD Gx, Ed");
@@ -869,14 +1527,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x6F:
             INST_NAME("MOVDQA Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q(x3);
             break;
         case 0x70: // TODO: Optimize this!
             INST_NAME("PSHUFD Gx,Ex,Ib");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 1);
             u8 = F8;
             int32_t idx;
@@ -890,10 +1548,10 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             idx = (u8>>(3*2))&3;
             LWU(x6, wback, fixedaddress+idx*4);
 
-            SW(x3, gback, 0*4);
-            SW(x4, gback, 1*4);
-            SW(x5, gback, 2*4);
-            SW(x6, gback, 3*4);
+            SW(x3, gback, gdoffset+0*4);
+            SW(x4, gback, gdoffset+1*4);
+            SW(x5, gback, gdoffset+2*4);
+            SW(x6, gback, gdoffset+3*4);
             break;
         case 0x71:
             nextop = F8;
@@ -904,8 +1562,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     u8 = F8;
                     if (u8>15) {
                         // just zero dest
-                        SD(xZR, x1, fixedaddress+0);
-                        SD(xZR, x1, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress+0);
+                        SD(xZR, wback, fixedaddress+8);
                     } else if(u8) {
                         for (int i=0; i<8; ++i) {
                             // EX->uw[i] >>= u8;
@@ -935,8 +1593,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     u8 = F8;
                     if (u8>15) {
                         // just zero dest
-                        SD(xZR, x1, fixedaddress+0);
-                        SD(xZR, x1, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress+0);
+                        SD(xZR, wback, fixedaddress+8);
                     } else if(u8) {
                         for (int i=0; i<8; ++i) {
                             // EX->uw[i] <<= u8;
@@ -961,8 +1619,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     if(u8) {
                         if (u8>31) {
                             // just zero dest
-                            SD(xZR, x1, fixedaddress+0);
-                            SD(xZR, x1, fixedaddress+8);
+                            SD(xZR, wback, fixedaddress+0);
+                            SD(xZR, wback, fixedaddress+8);
                         } else if(u8) {
                             SSE_LOOP_D_S(x3, SRLI(x3, x3, u8));
                         }
@@ -984,8 +1642,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     if(u8) {
                         if (u8>31) {
                             // just zero dest
-                            SD(xZR, x1, fixedaddress+0);
-                            SD(xZR, x1, fixedaddress+8);
+                            SD(xZR, wback, fixedaddress+0);
+                            SD(xZR, wback, fixedaddress+8);
                         } else if(u8) {
                             SSE_LOOP_D_S(x3, SLLI(x3, x3, u8));
                         }
@@ -1023,24 +1681,24 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     if(!u8) break;
                     if(u8>15) {
                         // just zero dest
-                        SD(xZR, x1, fixedaddress+0);
-                        SD(xZR, x1, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress+0);
+                        SD(xZR, wback, fixedaddress+8);
                     } else {
                         u8*=8;
                         if (u8 < 64) {
-                            LD(x3, x1, fixedaddress+0);
-                            LD(x4, x1, fixedaddress+8);
+                            LD(x3, wback, fixedaddress+0);
+                            LD(x4, wback, fixedaddress+8);
                             SRLI(x3, x3, u8);
                             SLLI(x5, x4, 64-u8);
                             OR(x3, x3, x5);
-                            SD(x3, x1, fixedaddress+0);
+                            SD(x3, wback, fixedaddress+0);
                             SRLI(x4, x4, u8);
-                            SD(x4, x1, fixedaddress+8);
+                            SD(x4, wback, fixedaddress+8);
                         } else {
-                            LD(x3, x1, fixedaddress+8);
+                            LD(x3, wback, fixedaddress+8);
                             if (u8-64 > 0) { SRLI(x3, x3, u8-64); }
-                            SD(x3, x1, fixedaddress+0);
-                            SD(xZR, x1, fixedaddress+8);
+                            SD(x3, wback, fixedaddress+0);
+                            SD(xZR, wback, fixedaddress+8);
                         }
                     }
                     break;
@@ -1051,8 +1709,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     if(!u8) break;
                     if(u8>63) {
                         // just zero dest
-                        SD(xZR, x1, fixedaddress+0);
-                        SD(xZR, x1, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress+0);
+                        SD(xZR, wback, fixedaddress+8);
                     } else {
                         LD(x3, wback, fixedaddress+0);
                         LD(x4, wback, fixedaddress+8);
@@ -1069,24 +1727,24 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     if(!u8) break;
                     if(u8>15) {
                         // just zero dest
-                        SD(xZR, x1, fixedaddress+0);
-                        SD(xZR, x1, fixedaddress+8);
+                        SD(xZR, wback, fixedaddress+0);
+                        SD(xZR, wback, fixedaddress+8);
                     } else {
                         u8*=8;
                         if (u8 < 64) {
-                            LD(x3, x1, fixedaddress+0);
-                            LD(x4, x1, fixedaddress+8);
+                            LD(x3, wback, fixedaddress+0);
+                            LD(x4, wback, fixedaddress+8);
                             SLLI(x4, x4, u8);
                             SRLI(x5, x3, 64-u8);
                             OR(x4, x4, x5);
-                            SD(x4, x1, fixedaddress+8);
+                            SD(x4, wback, fixedaddress+8);
                             SLLI(x3, x3, u8);
-                            SD(x3, x1, fixedaddress+0);
+                            SD(x3, wback, fixedaddress+0);
                         } else {
-                            LD(x3, x1, fixedaddress+0);
+                            LD(x3, wback, fixedaddress+0);
                             if (u8-64 > 0) { SLLI(x3, x3, u8-64); }
-                            SD(x3, x1, fixedaddress+8);
-                            SD(xZR, x1, fixedaddress+0);
+                            SD(x3, wback, fixedaddress+8);
+                            SD(xZR, wback, fixedaddress+0);
                         }
                     }
                     break;
@@ -1097,52 +1755,94 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x74:
             INST_NAME("PCMPEQB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 SUB(x3, x3, x4);
                 SEQZ(x3, x3);
                 NEG(x3, x3);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0x75:
             INST_NAME("PCMPEQW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_W(x3, x4, SUB(x3, x3, x4); SEQZ(x3, x3); NEG(x3, x3));
             break;
         case 0x76:
             INST_NAME("PCMPEQD Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_D(x3, x4, XOR(x3, x3, x4); SNEZ(x3, x3); ADDI(x3, x3, -1));
             break;
+        case 0x7C:
+            INST_NAME("HADDPD Gx, Ex");
+            nextop = F8;
+            GETGX();
+            d0 = fpu_get_scratch(dyn);
+            d1 = fpu_get_scratch(dyn);
+            FLD(d0, gback, gdoffset+0);
+            FLD(d1, gback, gdoffset+8);
+            if(!box64_dynarec_fastnan) {
+                FEQD(x3, d0, d0);
+                FEQD(x4, d1, d1);
+                AND(x3, x3, x4);
+            }
+            FADDD(d0, d0, d1);
+            if(!box64_dynarec_fastnan) {
+                FEQD(x4, d0, d0);
+                BEQZ(x3, 12);
+                BNEZ(x4, 8);
+                FNEGD(d0, d0);
+            }
+            FSD(d0, gback, gdoffset+0);
+            if(MODREG && gd==(nextop&7)+(rex.b<<3)) {
+                FSD(d0, gback, gdoffset+8);
+            } else {
+                GETEX(x2, 0);
+                FLD(d0, wback, fixedaddress+0);
+                FLD(d1, wback, fixedaddress+8);
+                if(!box64_dynarec_fastnan) {
+                    FEQD(x3, d0, d0);
+                    FEQD(x4, d1, d1);
+                    AND(x3, x3, x4);
+                }
+                FADDD(d0, d0, d1);
+                if(!box64_dynarec_fastnan) {
+                    FEQD(x4, d0, d0);
+                    BEQZ(x3, 12);
+                    BNEZ(x4, 8);
+                    FNEGD(d0, d0);
+                }
+                FSD(d0, gback, gdoffset+8);
+            }
+            break;
         case 0x7E:
             INST_NAME("MOVD Ed,Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             if(rex.w) {
                 if(MODREG) {
                     ed = xRAX + (nextop&7) + (rex.b<<3);
-                    LD(ed, x1, 0);
+                    LD(ed, gback, gdoffset+0);
                 } else {
-                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0);
-                    LD(x3, x1, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+                    LD(x3, gback, gdoffset+0);
                     SD(x3, ed, fixedaddress);
                     SMWRITE2();
                 }
             } else {
                 if(MODREG) {
                     ed = xRAX + (nextop&7) + (rex.b<<3);
-                    LWU(ed, x1, 0);
+                    LWU(ed, gback, gdoffset+0);
                 } else {
-                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0);
-                    LWU(x3, x1, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+                    LWU(x3, gback, gdoffset+0);
                     SW(x3, ed, fixedaddress);
                     SMWRITE2();
                 }
@@ -1151,7 +1851,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x7F:
             INST_NAME("MOVDQA Ex,Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q2(x3);
             if(!MODREG) SMWRITE2();
@@ -1165,8 +1865,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETSGW(x2);
             MULW(x2, x2, x1);
             UFLAG_RES(x2);
-            SLLI(x2, x2, 48);
-            SRLI(x2, x2, 48);
+            ZEXTH(x2, x2);
             GWBACK;
             break;
 
@@ -1188,7 +1887,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 SRAI(x1, x1, 56);
             } else {
                 SMREAD();
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 0, 0);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0);
                 LB(x1, ed, fixedaddress);
             }
             LUI(x5, 0xffff0);
@@ -1200,13 +1899,13 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xC2:
             INST_NAME("CMPPD Gx, Ex, Ib");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 1);
             u8 = F8;
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
             for(int i=0; i<2; ++i) {
-                FLD(d0, gback, 8*i);
+                FLD(d0, gback, gdoffset+8*i);
                 FLD(d1, wback, fixedaddress+8*i);
                 if ((u8&7) == 0) {                                      // Equal
                     FEQD(x3, d0, d1);
@@ -1237,7 +1936,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     }
                     case 7: break;                                      // Not NaN
                     }
-                    
+
                     // MARK2;
                     if ((u8&7) == 5 || (u8&7) == 6) {
                         MOV32w(x3, 1);
@@ -1245,16 +1944,16 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     // MARK;
                 }
                 NEG(x3, x3);
-                SD(x3, gback, 8*i);
+                SD(x3, gback, gdoffset+8*i);
             }
             break;
         case 0xC4:
             INST_NAME("PINSRW Gx,Ed,Ib");
             nextop = F8;
             GETED(1);
-            GETGX(x3);
+            GETGX();
             u8 = (F8)&7;
-            SH(ed, gback, u8*2);
+            SH(ed, gback, gdoffset+u8*2);
             break;
         case 0xC5:
             INST_NAME("PEXTRW Gd,Ex,Ib");
@@ -1267,90 +1966,90 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xC6:
             INST_NAME("SHUFPD Gx, Ex, Ib");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
+            GETEX(x2, 1);
             u8 = F8;
             if (MODREG && gd==(nextop&7)+(rex.b<<3) && u8==0) {
-                LD(x3, gback, 0);
-                SD(x3, gback, 8);
+                LD(x3, gback, gdoffset+0);
+                SD(x3, gback, gdoffset+8);
                 break;
             }
-            GETEX(x2, 1)
-            LD(x3, gback, 8*(u8&1));
+            LD(x3, gback, gdoffset+8*(u8&1));
             LD(x4, wback, fixedaddress+8*((u8>>1)&1));
-            SD(x3, gback, 0);
-            SD(x4, gback, 8);
+            SD(x3, gback, gdoffset+0);
+            SD(x4, gback, gdoffset+8);
             break;
         case 0xD1:
             INST_NAME("PSRLW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             LD(x3, wback, fixedaddress);
             ADDI(x4, xZR, 16);
             BLTU_MARK(x3, x4);
-            SD(xZR, gback, 0);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+0);
+            SD(xZR, gback, gdoffset+8);
             B_NEXT_nocond;
             MARK;
             for (int i=0; i<8; ++i) {
-                LHU(x5, gback, 2*i);
+                LHU(x5, gback, gdoffset+2*i);
                 SRLW(x5, x5, x3);
-                SH(x5, gback, 2*i);
+                SH(x5, gback, gdoffset+2*i);
             }
             break;
         case 0xD2:
             INST_NAME("PSRLD Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             LD(x3, wback, fixedaddress);
             ADDI(x4, xZR, 32);
             BLTU_MARK(x3, x4);
-            SD(xZR, gback, 0);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+0);
+            SD(xZR, gback, gdoffset+8);
             B_NEXT_nocond;
             MARK;
             for (int i=0; i<4; ++i) {
-                LWU(x5, gback, 4*i);
+                LWU(x5, gback, gdoffset+4*i);
                 SRLW(x5, x5, x3);
-                SW(x5, gback, 4*i);
+                SW(x5, gback, gdoffset+4*i);
             }
             break;
         case 0xD3:
             INST_NAME("PSRLQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             LD(x3, wback, fixedaddress);
             ADDI(x4, xZR, 64);
             BLTU_MARK(x3, x4);
-            SD(xZR, gback, 0);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+0);
+            SD(xZR, gback, gdoffset+8);
             B_NEXT_nocond;
             MARK;
             for (int i=0; i<2; ++i) {
-                LD(x5, gback, 8*i);
+                LD(x5, gback, gdoffset+8*i);
                 SRL(x5, x5, x3);
-                SD(x5, gback, 8*i);
+                SD(x5, gback, gdoffset+8*i);
             }
             break;
         case 0xD4:
             INST_NAME("PADDQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, ADD(x3, x3, x4));
             break;
         case 0xD5:
             INST_NAME("PMULLW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
-                LH(x3, gback, 2*i);
+                LH(x3, gback, gdoffset+2*i);
                 LH(x4, wback, fixedaddress+2*i);
                 MULW(x3, x3, x4);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
         case 0xD6:
@@ -1381,314 +2080,347 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xD8:
             INST_NAME("PSUBUSB Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 SUB(x3, x3, x4);
                 NOT(x4, x3);
                 SRAI(x4, x4, 63);
                 AND(x3, x3, x4);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xD9:
             INST_NAME("PSUBUSW Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_W(x3, x4, SUB(x3, x3, x4); NOT(x4, x3); SRAI(x4, x4, 63); AND(x3, x3, x4));
             break;
         case 0xDA:
             INST_NAME("PMINUB Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 BLTU(x3, x4, 8);
                 MV(x3, x4);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xDB:
             INST_NAME("PAND Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, AND(x3, x3, x4));
             break;
         case 0xDC:
             INST_NAME("PADDUSB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             ADDI(x5, xZR, 0xFF);
             for(int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 ADD(x3, x3, x4);
                 BLT(x3, x5, 8);
                 ADDI(x3, xZR, 0xFF);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xDD:
             INST_NAME("PADDUSW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
                 // tmp32s = (int32_t)GX->uw[i] + EX->uw[i];
                 // GX->uw[i] = (tmp32s>65535)?65535:tmp32s;
-                LHU(x3, gback, i*2);
+                LHU(x3, gback, gdoffset+i*2);
                 LHU(x4, wback, fixedaddress+i*2);
                 ADDW(x3, x3, x4);
                 MOV32w(x4, 65536);
                 BLT(x3, x4, 8);
                 ADDIW(x3, x4, -1);
-                SH(x3, gback, i*2);
+                SH(x3, gback, gdoffset+i*2);
             }
             break;
         case 0xDE:
             INST_NAME("PMAXUB Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 BLTU(x4, x3, 8);
                 MV(x3, x4);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xDF:
             INST_NAME("PANDN Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, NOT(x3, x3); AND(x3, x3, x4));
             break;
          case 0xE0:
             INST_NAME("PAVGB Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 ADDW(x3, x3, x4);
                 ADDIW(x3, x3, 1);
                 SRAIW(x3, x3, 1);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xE1:
             INST_NAME("PSRAW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             ADDI(x4, xZR, 16);
             LD(x3, wback, fixedaddress);
             BLTU(x3, x4, 8);
             SUBI(x3, x4, 1);
             for (int i=0; i<8; ++i) {
-                LH(x4, gback, 2*i);
+                LH(x4, gback, gdoffset+2*i);
                 SRAW(x4, x4, x3);
-                SH(x4, gback, 2*i);
+                SH(x4, gback, gdoffset+2*i);
             }
             break;
         case 0xE2:
             INST_NAME("PSRAD Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             ADDI(x4, xZR, 32);
             LD(x3, wback, fixedaddress);
             BLTU(x3, x4, 8);
             SUBI(x3, x4, 1);
             for (int i=0; i<4; ++i) {
-                LW(x4, gback, 4*i);
+                LW(x4, gback, gdoffset+4*i);
                 SRAW(x4, x4, x3);
-                SW(x4, gback, 4*i);
+                SW(x4, gback, gdoffset+4*i);
             }
             break;
         case 0xE3:
             INST_NAME("PAVGW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<8; ++i) {
-                LHU(x3, gback, 2*i);
+                LHU(x3, gback, gdoffset+2*i);
                 LHU(x4, wback, fixedaddress+2*i);
                 ADDW(x3, x3, x4);
                 ADDIW(x3, x3, 1);
                 SRAIW(x3, x3, 1);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
         case 0xE4:
             INST_NAME("PMULHUW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
-                LHU(x3, gback, 2*i);
+                LHU(x3, gback, gdoffset+2*i);
                 LHU(x4, wback, fixedaddress+2*i);
                 MULW(x3, x3, x4);
                 SRLIW(x3, x3, 16);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
         case 0xE5:
             INST_NAME("PMULHW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
-                LH(x3, gback, 2*i);
+                LH(x3, gback, gdoffset+2*i);
                 LH(x4, wback, fixedaddress+2*i);
                 MULW(x3, x3, x4);
                 SRAIW(x3, x3, 16);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
+        case 0xE6:
+            INST_NAME("CVTTPD2DQ Gx, Ex");
+            nextop = F8;
+            GETGX();
+            GETEX(x2, 0);
+            v0 = fpu_get_scratch(dyn);
+            v1 = fpu_get_scratch(dyn);
+            FLD(v0, wback, fixedaddress+0);
+            FLD(v1, wback, fixedaddress+8);
+            if(!box64_dynarec_fastround) {
+                FSFLAGSI(0);  // reset all exception flag bits
+            }
+            FCVTWD(x3, v0, RD_RTZ);
+            if(!box64_dynarec_fastround) {
+                FRFLAGS(x5);   // read the FP flags back to check the NV/OF bits
+                ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+                BEQ_MARK(x5, xZR);
+                MOV32w(x3, 0x80000000);
+                MARK;
+                FSFLAGSI(0);  // reset all exception flag bits
+            }
+            FCVTWD(x4, v1, RD_RTZ);
+            if(!box64_dynarec_fastround) {
+                FRFLAGS(x5);   // read the FP flags back to check the NV/OF bits
+                ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+                BEQ_MARK2(x5, xZR);
+                MOV32w(x4, 0x80000000);
+                MARK2;
+            }
+            SW(x3, gback, gdoffset+0);
+            SW(x4, gback, gdoffset+4);
+            SD(xZR, gback, gdoffset+8);
+            break;
         case 0xE7:
             INST_NAME("MOVNTDQ Ex, Gx");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q2(x3);
             break;
         case 0xE8:
             INST_NAME("PSUBSB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<16; ++i) {
                 // tmp16s = (int16_t)GX->sb[i] - EX->sb[i];
                 // GX->sb[i] = (tmp16s<-128)?-128:((tmp16s>127)?127:tmp16s);
-                LB(x3, gback, i);
+                LB(x3, gback, gdoffset+i);
                 LB(x4, wback, fixedaddress+i);
                 SUBW(x3, x3, x4);
                 SLLIW(x3, x3, 16);
                 SRAIW(x3, x3, 16);
                 ADDI(x4, xZR, 0x7f);
                 BLT(x3, x4, 12);     // tmp16s>127?
-                SB(x4, gback, i);
+                SB(x4, gback, gdoffset+i);
                 J(24);               // continue
                 ADDI(x4, xZR, 0xf80);
                 BLT(x4, x3, 12);     // tmp16s<-128?
-                SB(x4, gback, i);
+                SB(x4, gback, gdoffset+i);
                 J(8);                // continue
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xE9:
             INST_NAME("PSUBSW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
                 // tmp32s = (int32_t)GX->sw[i] - EX->sw[i];
                 // GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s);
-                LH(x3, gback, 2*i);
+                LH(x3, gback, gdoffset+2*i);
                 LH(x4, wback, fixedaddress+2*i);
                 SUBW(x3, x3, x4);
                 LUI(x4, 0xFFFF8); // -32768
                 BGE(x3, x4, 12);
-                SH(x4, gback, 2*i);
+                SH(x4, gback, gdoffset+2*i);
                 J(20); // continue
                 LUI(x4, 8); // 32768
                 BLT(x3, x4, 8);
                 ADDIW(x3, x4, -1);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
         case 0xEA:
             INST_NAME("PMINSW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for (int i=0; i<8; ++i) {
-                LH(x3, gback, 2*i);
+                LH(x3, gback, gdoffset+2*i);
                 LH(x4, wback, fixedaddress+2*i);
                 BLT(x3, x4, 8);
                 MV(x3, x4);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
         case 0xEB:
             INST_NAME("POR Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, OR(x3, x3, x4));
             break;
         case 0xEC:
             INST_NAME("PADDSB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<16; ++i) {
                 // tmp16s = (int16_t)GX->sb[i] + EX->sb[i];
                 // GX->sb[i] = (tmp16s>127)?127:((tmp16s<-128)?-128:tmp16s);
-                LB(x3, gback, i);
+                LB(x3, gback, gdoffset+i);
                 LB(x4, wback, fixedaddress+i);
                 ADDW(x3, x3, x4);
                 SLLIW(x3, x3, 16);
                 SRAIW(x3, x3, 16);
                 ADDI(x4, xZR, 0x7f);
                 BLT(x3, x4, 12);     // tmp16s>127?
-                SB(x4, gback, i);
+                SB(x4, gback, gdoffset+i);
                 J(24);               // continue
                 ADDI(x4, xZR, 0xf80);
                 BLT(x4, x3, 12);     // tmp16s<-128?
-                SB(x4, gback, i);
+                SB(x4, gback, gdoffset+i);
                 J(8);                // continue
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i);
             }
             break;
         case 0xED:
             INST_NAME("PADDSW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             for(int i=0; i<8; ++i) {
                 // tmp32s = (int32_t)GX->sw[i] + EX->sw[i];
                 // GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s);
-                LH(x3, gback, 2*i);
+                LH(x3, gback, gdoffset+2*i);
                 LH(x4, wback, fixedaddress+2*i);
                 ADDW(x3, x3, x4);
                 LUI(x4, 0xFFFF8); // -32768
                 BGE(x3, x4, 12);
-                SH(x4, gback, 2*i);
+                SH(x4, gback, gdoffset+2*i);
                 J(20); // continue
                 LUI(x4, 8); // 32768
                 BLT(x3, x4, 8);
                 ADDIW(x3, x4, -1);
-                SH(x3, gback, 2*i);
+                SH(x3, gback, gdoffset+2*i);
             }
             break;
         case 0xEE:
             INST_NAME("PMAXSW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             SSE_LOOP_WS(x3, x4, BGE(x3, x4, 8); MV(x3, x4));
             break;
         case 0xEF:
             INST_NAME("PXOR Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             if(MODREG && gd==(nextop&7)+(rex.b<<3))
             {
                 // just zero dest
-                SD(xZR, x1, 0);
-                SD(xZR, x1, 8);
+                SD(xZR, gback, gdoffset+0);
+                SD(xZR, gback, gdoffset+8);
             } else {
                 GETEX(x2, 0);
                 SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4));
@@ -1697,102 +2429,102 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xF1:
-            INST_NAME("PSLLQ Gx,Ex");
+            INST_NAME("PSLLW Gx,Ex"); // 66 0F F1 is the 16-bit-lane shift: the loop below handles 8 halfwords
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             ADDI(x4, xZR, 16);
             LD(x3, wback, fixedaddress+0);
             BLTU_MARK(x3, x4);
             // just zero dest
-            SD(xZR, gback, 0);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+0); // path taken when the count in x3 is not below 16 (assuming BLTU_MARK branches to MARK on x3 < x4)
+            SD(xZR, gback, gdoffset+8);
             B_NEXT_nocond;
             MARK;
             for (int i=0; i<8; ++i) {
-                LHU(x4, gback, 2*i);
+                LHU(x4, gback, gdoffset+2*i); // 16-bit lane i of Gx
                 SLLW(x4, x4, x3);
-                SH(x4, gback, 2*i);
+                SH(x4, gback, gdoffset+2*i); // SH keeps only the low 16 bits of the shifted value
             }
             break;
         case 0xF2:
-            INST_NAME("PSLLQ Gx,Ex");
+            INST_NAME("PSLLD Gx,Ex"); // 66 0F F2 is the 32-bit-lane shift: the loop below handles 4 dwords
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             ADDI(x4, xZR, 32);
             LD(x3, wback, fixedaddress+0);
             BLTU_MARK(x3, x4);
             // just zero dest
-            SD(xZR, gback, 0);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+0); // path taken when the count in x3 is not below 32 (assuming BLTU_MARK branches to MARK on x3 < x4)
+            SD(xZR, gback, gdoffset+8);
             B_NEXT_nocond;
             MARK;
             for (int i=0; i<4; ++i) {
-                LWU(x4, gback, 4*i);
+                LWU(x4, gback, gdoffset+4*i); // 32-bit lane i of Gx
                 SLLW(x4, x4, x3);
-                SW(x4, gback, 4*i);
+                SW(x4, gback, gdoffset+4*i); // SW keeps only the low 32 bits of the shifted value
             }
             break;
         case 0xF3:
             INST_NAME("PSLLQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX(); // Gx is addressed as gback + gdoffset below (presumably both are set up by this macro)
             GETEX(x2, 0);
             ADDI(x4, xZR, 64);
             LD(x3, wback, fixedaddress+0);
             BLTU_MARK(x3, x4);
             // just zero dest
-            SD(xZR, gback, 0);
-            SD(xZR, gback, 8);
+            SD(xZR, gback, gdoffset+0); // path taken when the count in x3 is not below 64 (assuming BLTU_MARK branches to MARK on x3 < x4)
+            SD(xZR, gback, gdoffset+8);
             B_NEXT_nocond;
             MARK;
             for (int i=0; i<2; ++i) {
-                LD(x4, gback, 8*i);
+                LD(x4, gback, gdoffset+8*i); // 64-bit lane i of Gx
                 SLL(x4, x4, x3);
-                SD(x4, gback, 8*i);
+                SD(x4, gback, gdoffset+8*i); // write the shifted lane back in place
             }
             break;
         case 0xF4:
             INST_NAME("PMULUDQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX(); // Gx is addressed as gback + gdoffset below (presumably both are set up by this macro)
             GETEX(x2, 0);
             // GX->q[1] = (uint64_t)EX->ud[2]*GX->ud[2];
-            LWU(x3, gback, 2*4);
+            LWU(x3, gback, gdoffset+2*4); // GX->ud[2], loaded unsigned
             LWU(x4, wback, fixedaddress+2*4);
             MUL(x3, x3, x4);
-            SD(x3, gback, 8);
+            SD(x3, gback, gdoffset+8); // 64-bit product into GX->q[1]
             // GX->q[0] = (uint64_t)EX->ud[0]*GX->ud[0];
-            LWU(x3, gback, 0*4);
+            LWU(x3, gback, gdoffset+0*4); // GX->ud[0], loaded unsigned
             LWU(x4, wback, fixedaddress+0*4);
             MUL(x3, x3, x4);
-            SD(x3, gback, 0);
+            SD(x3, gback, gdoffset+0); // 64-bit product into GX->q[0]
             break;
         case 0xF5:
             INST_NAME("PMADDWD Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX(); // Gx is addressed as gback + gdoffset below (presumably both are set up by this macro)
             GETEX(x2, 0);
             for (int i=0; i<4; ++i) {
-                // GX->sd[i] = (int32_t)(GX->sw[i*2+0])*EX->sw[i*2+0] + 
+                // GX->sd[i] = (int32_t)(GX->sw[i*2+0])*EX->sw[i*2+0] +
                 //             (int32_t)(GX->sw[i*2+1])*EX->sw[i*2+1];
-                LH(x3, gback, 2*(i*2+0));
+                LH(x3, gback, gdoffset+2*(i*2+0)); // GX->sw[i*2+0]
                 LH(x4, wback, fixedaddress+2*(i*2+0));
                 MULW(x5, x3, x4);
-                LH(x3, gback, 2*(i*2+1));
+                LH(x3, gback, gdoffset+2*(i*2+1)); // GX->sw[i*2+1]
                 LH(x4, wback, fixedaddress+2*(i*2+1));
                 MULW(x6, x3, x4);
                 ADDW(x5, x5, x6);
-                SW(x5, gback, 4*i);
+                SW(x5, gback, gdoffset+4*i); // sum of the two products into GX->sd[i]
             }
             break;
         case 0xF6:
             INST_NAME("PSADBW Gx, Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX();
             GETEX(x2, 0);
             MV(x6, xZR);
             for (int i=0; i<16; ++i) {
-                LBU(x3, gback, i);
+                LBU(x3, gback, gdoffset+i);
                 LBU(x4, wback, fixedaddress+i);
                 SUBW(x3, x3, x4);
                 SRAIW(x5, x3, 31);
@@ -1801,7 +2533,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 ANDI(x3, x3, 0xff);
                 ADDW(x6, x6, x3);
                 if (i==7 || i == 15) {
-                    SD(x6, gback, i+1-8);
+                    SD(x6, gback, gdoffset+i+1-8);
                     if (i==7) MV(x6, xZR);
                 }
             }
@@ -1809,61 +2541,61 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xF8:
             INST_NAME("PSUBB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX(); // Gx is addressed as gback + gdoffset below (presumably both are set up by this macro)
             GETEX(x2, 0);
             for(int i=0; i<16; ++i) {
                 // GX->sb[i] -= EX->sb[i];
                 LB(x3, wback, fixedaddress+i);
-                LB(x4, gback, i);
+                LB(x4, gback, gdoffset+i); // GX->sb[i]
                 SUB(x3, x4, x3);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i); // SB stores only the low byte of the difference
             }
             break;
         case 0xF9:
             INST_NAME("PSUBW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX(); // NOTE(review): no scratch register is passed any more; SSE_LOOP_W presumably addresses Gx via gback/gdoffset — confirm
             GETEX(x2, 0);
             SSE_LOOP_W(x3, x4, SUBW(x3, x3, x4));
             break;
         case 0xFA:
             INST_NAME("PSUBD Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX(); // NOTE(review): no scratch register is passed any more; SSE_LOOP_D presumably addresses Gx via gback/gdoffset — confirm
             GETEX(x2, 0);
             SSE_LOOP_D(x3, x4, SUBW(x3, x3, x4));
             break;
         case 0xFB:
             INST_NAME("PSUBQ Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX(); // NOTE(review): no scratch register is passed any more; SSE_LOOP_Q presumably addresses Gx via gback/gdoffset — confirm
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, SUB(x3, x3, x4));
             break;
         case 0xFC:
             INST_NAME("PADDB Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX(); // Gx is addressed as gback + gdoffset below (presumably both are set up by this macro)
             GETEX(x2, 0);
             for(int i=0; i<16; ++i) {
                 // GX->sb[i] += EX->sb[i];
-                LB(x3, gback, i);
+                LB(x3, gback, gdoffset+i); // GX->sb[i]
                 LB(x4, wback, fixedaddress+i);
                 ADDW(x3, x3, x4);
-                SB(x3, gback, i);
+                SB(x3, gback, gdoffset+i); // SB stores only the low byte of the sum
             }
             break;
         case 0xFD:
             INST_NAME("PADDW Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX(); // NOTE(review): no scratch register is passed any more; SSE_LOOP_W presumably addresses Gx via gback/gdoffset — confirm
             GETEX(x2, 0);
             SSE_LOOP_W(x3, x4, ADDW(x3, x3, x4));
             break;
         case 0xFE:
             INST_NAME("PADDD Gx,Ex");
             nextop = F8;
-            GETGX(x1);
+            GETGX(); // NOTE(review): no scratch register is passed any more; SSE_LOOP_D presumably addresses Gx via gback/gdoffset — confirm
             GETEX(x2, 0);
             SSE_LOOP_D(x3, x4, ADDW(x3, x3, x4));
             break;