about summary refs log tree commit diff stats
path: root/src/dynarec/rv64/dynarec_rv64_helper.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/dynarec/rv64/dynarec_rv64_helper.h')
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.h 238
1 files changed, 176 insertions, 62 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index b12ee96b..0b1023b3 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -99,6 +99,25 @@
                     LD(x1, wback, fixedaddress);        \
                     ed = x1;                            \
                 }
+#define GETEDz(D) if(MODREG) {                          \
+                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
+                    wback = 0;                          \
+                } else {                                \
+                    SMREAD()                            \
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
+                    LDz(x1, wback, fixedaddress);       \
+                    ed = x1;                            \
+                }
+// GETED32 can use r1 for ed, and r2 for wback. wback is 0 if ed is xEAX..xEDI
+#define GETED32(D)  if(MODREG) {                        \
+                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
+                    wback = 0;                          \
+                } else {                                \
+                    SMREAD()                            \
+                    addr = geted32(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
+                    LDxw(x1, wback, fixedaddress);      \
+                    ed = x1;                            \
+                }
 //GETEDH can use hint for ed, and x1 or x2 for wback (depending on hint), might also use x3. wback is 0 if ed is xEAX..xEDI
 #define GETEDH(hint, D) if(MODREG) {                    \
                     ed = xRAX+(nextop&7)+(rex.b<<3);    \
@@ -109,13 +128,23 @@
                     LDxw(hint, wback, fixedaddress);    \
                     ed = hint;                          \
                 }
+//GETEDW will use ret for ed, and x1 or x2 for wback (depending on hint), might also use x3. wback is 0 if ed is xEAX..xEDI
+#define GETEDW(hint, ret, D)   if(MODREG) {             \
+                    ed = xRAX+(nextop&7)+(rex.b<<3);    \
+                    MV(ret, ed);                        \
+                    wback = 0;                          \
+                } else {                                \
+                    SMREAD();                           \
+                    addr = geted(dyn, addr, ninst, nextop, &wback, (hint==x2)?x1:x2, (hint==x1)?x1:x3, &fixedaddress, rex, NULL, 0, D); \
+                    ed = ret;                           \
+                    LDxw(ed, wback, fixedaddress);      \
+                }
 // GETGW extract x64 register in gd, that is i
-#define GETGW(i) gd = xRAX+((nextop&0x38)>>3)+(rex.r<<3); SLLI(i, gd, 48); SRLI(i, i, 48); gd = i;
+#define GETGW(i) gd = xRAX+((nextop&0x38)>>3)+(rex.r<<3); ZEXTH(i, gd); gd = i;
 //GETEWW will use i for ed, and can use w for wback.
 #define GETEWW(w, i, D) if(MODREG) {        \
                     wback = xRAX+(nextop&7)+(rex.b<<3);\
-                    SLLI(i, wback, 48);     \
-                    SRLI(i, i, 48);         \
+                    ZEXTH(i, wback);        \
                     ed = i;                 \
                     wb1 = 0;                \
                 } else {                    \
@@ -130,8 +159,7 @@
 //GETSEW will use i for ed, and can use r3 for wback. This is the Signed version
 #define GETSEW(i, D) if(MODREG) {           \
                     wback = xRAX+(nextop&7)+(rex.b<<3);\
-                    SLLI(i, wback, 48);     \
-                    SRAI(i, i, 48);         \
+                    if(rv64_zbb) SEXTH(i, wback); else {SLLI(i, wback, 48); SRAI(i, i, 48);}\
                     ed = i;                 \
                     wb1 = 0;                \
                 } else {                    \
@@ -159,6 +187,7 @@
                     LDxw(x1, S, fixedaddress);          \
                     ed = x1;                            \
                 }
+#define WBACKO(O)   if(wback) {ADD(O, wback, O); SDxw(ed, O, 0); SMWRITE2();}
 
 // FAKEED like GETED, but doesn't get anything
 #define FAKEED  if(!MODREG) {   \
@@ -191,6 +220,28 @@
                     wb1 = 1;                    \
                     ed = i;                     \
                 }
+//GETEBO will use i for ed; for a memory operand, i must hold the Offset on entry (it is added to the address before the load). Can use x3 for wback.
+#define GETEBO(i, D) if(MODREG) {               \
+                    if(rex.rex) {               \
+                        wback = xRAX+(nextop&7)+(rex.b<<3);     \
+                        wb2 = 0;                \
+                    } else {                    \
+                        wback = (nextop&7);     \
+                        wb2 = (wback>>2)*8;     \
+                        wback = xRAX+(wback&3); \
+                    }                           \
+                    if (wb2) {MV(i, wback); SRLI(i, i, wb2); ANDI(i, i, 0xff);} else {ANDI(i, wback, 0xff);}   \
+                    wb1 = 0;                    \
+                    ed = i;                     \
+                } else {                        \
+                    SMREAD();                   \
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \
+                    ADD(x3, wback, i);          \
+                    if(wback!=x3) wback = x3;   \
+                    LBU(i, wback, fixedaddress);\
+                    wb1 = 1;                    \
+                    ed = i;                     \
+                }
 //GETSEB sign extend EB, will use i for ed, and can use r3 for wback.
 #define GETSEB(i, D) if(MODREG) {                \
                     if(rex.rex) {               \
@@ -213,6 +264,26 @@
                     wb1 = 1;                    \
                     ed = i;                     \
                 }
+// GETEB32 will use i for ed, and can use r3 for wback.
+#define GETEB32(i, D) if(MODREG) {                \
+                    if(rex.rex) {               \
+                        wback = xRAX+(nextop&7)+(rex.b<<3);     \
+                        wb2 = 0;                \
+                    } else {                    \
+                        wback = (nextop&7);     \
+                        wb2 = (wback>>2)*8;     \
+                        wback = xRAX+(wback&3); \
+                    }                           \
+                    if (wb2) {MV(i, wback); SRLI(i, i, wb2); ANDI(i, i, 0xff);} else {ANDI(i, wback, 0xff);}   \
+                    wb1 = 0;                    \
+                    ed = i;                     \
+                } else {                        \
+                    SMREAD();                   \
+                    addr = geted32(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \
+                    LBU(i, wback, fixedaddress);\
+                    wb1 = 1;                    \
+                    ed = i;                     \
+                }
 
 //GETGB will use i for gd
 #define GETGB(i) if(rex.rex) {                                \
@@ -228,7 +299,6 @@
 
 // Write gb (gd) back to original register / memory, using s1 as scratch
 #define GBBACK(s1) if(gb2) {                            \
-                    assert(gb2 == 8);                   \
                     MOV64x(s1, 0xffffffffffff00ffLL);   \
                     AND(gb1, gb1, s1);                  \
                     SLLI(s1, gd, 8);                    \
@@ -243,7 +313,6 @@
                     SB(ed, wback, fixedaddress);        \
                     SMWRITE();                          \
                 } else if(wb2) {                        \
-                    assert(wb2 == 8);                   \
                     MOV64x(s1, 0xffffffffffff00ffLL);   \
                     AND(wback, wback, s1);              \
                     if (c) {ANDI(ed, ed, 0xff);}        \
@@ -309,31 +378,49 @@
     }
 
 // Will get pointer to GX in general register a, will purge SS or SD if loaded. can use gback as load address
-#define GETGX(a)                        \
-    gd = ((nextop&0x38)>>3)+(rex.r<<3); \
-    sse_forget_reg(dyn, ninst, gd);     \
-    gback = a;                          \
-    ADDI(a, xEmu, offsetof(x64emu_t, xmm[gd]))
+#define GETGX()                             \
+    gd = ((nextop&0x38)>>3)+(rex.r<<3);     \
+    sse_forget_reg(dyn, ninst, gd);         \
+    gback = xEmu;                           \
+    gdoffset = offsetof(x64emu_t, xmm[gd])
 
 // Get Ex address in general register a, will purge SS or SD if it's reg and is loaded. May use x3. Use wback as load address!
 #define GETEX(a, D)                                                                                     \
     if(MODREG) {                                                                                        \
         ed = (nextop&7)+(rex.b<<3);                                                                     \
         sse_forget_reg(dyn, ninst, ed);                                                                 \
-        fixedaddress = 0;                                                                               \
-        ADDI(a, xEmu, offsetof(x64emu_t, xmm[ed]));                                                     \
-        wback = a;                                                                                      \
+        fixedaddress = offsetof(x64emu_t, xmm[ed]);                                                     \
+        wback = xEmu;                                                                                   \
     } else {                                                                                            \
         SMREAD();                                                                                       \
         ed=16;                                                                                          \
         addr = geted(dyn, addr, ninst, nextop, &wback, a, x3, &fixedaddress, rex, NULL, 1, D);          \
     }
 
+#define GETGM()                             \
+    gd = ((nextop&0x38)>>3);                \
+    mmx_forget_reg(dyn, ninst, gd);         \
+    gback = xEmu;                           \
+    gdoffset = offsetof(x64emu_t, mmx[gd])
+
+// Get EM, might use x3
+#define GETEM(a, D)                                                                             \
+    if(MODREG) {                                                                                \
+        ed = (nextop&7);                                                                        \
+        mmx_forget_reg(dyn, ninst, ed);                                                         \
+        fixedaddress = offsetof(x64emu_t, mmx[ed]);                                             \
+        wback = xEmu;                                                                           \
+    } else {                                                                                    \
+        SMREAD();                                                                               \
+        ed=8;                                                                                   \
+        addr = geted(dyn, addr, ninst, nextop, &wback, a, x3, &fixedaddress, rex, NULL, 1, D);  \
+    }
+
 #define SSE_LOOP_D_ITEM(GX1, EX1, F, i) \
-    LWU(GX1, gback, i*4);               \
+    LWU(GX1, gback, gdoffset+i*4);      \
     LWU(EX1, wback, fixedaddress+i*4);  \
     F;                                  \
-    SW(GX1, gback, i*4);
+    SW(GX1, gback, gdoffset+i*4);
 
 // Loop for SSE opcode that use 32bits value and write to GX.
 #define SSE_LOOP_D(GX1, EX1, F)     \
@@ -343,10 +430,10 @@
     SSE_LOOP_D_ITEM(GX1, EX1, F, 3)
 
 #define SSE_LOOP_DS_ITEM(GX1, EX1, F, i) \
-    LW(GX1, gback, i*4);                 \
+    LW(GX1, gback, gdoffset+i*4);        \
     LW(EX1, wback, fixedaddress+i*4);    \
     F;                                   \
-    SW(GX1, gback, i*4);
+    SW(GX1, gback, gdoffset+i*4);
 
 // Loop for SSE opcode that use 32bits value and write to GX.
 #define SSE_LOOP_DS(GX1, EX1, F)     \
@@ -355,20 +442,28 @@
     SSE_LOOP_DS_ITEM(GX1, EX1, F, 2) \
     SSE_LOOP_DS_ITEM(GX1, EX1, F, 3)
 
+#define MMX_LOOP_W(GX1, EX1, F)            \
+    for (int i=0; i<4; ++i) {              \
+        LHU(GX1, gback, gdoffset+i*2);     \
+        LHU(EX1, wback, fixedaddress+i*2); \
+        F;                                 \
+        SH(GX1, gback, gdoffset+i*2);      \
+    }
+
 #define SSE_LOOP_W(GX1, EX1, F)            \
     for (int i=0; i<8; ++i) {              \
-        LHU(GX1, gback, i*2);              \
+        LHU(GX1, gback, gdoffset+i*2);     \
         LHU(EX1, wback, fixedaddress+i*2); \
         F;                                 \
-        SH(GX1, gback, i*2);               \
+        SH(GX1, gback, gdoffset+i*2);      \
     }
 
 #define SSE_LOOP_WS(GX1, EX1, F)          \
     for (int i=0; i<8; ++i) {             \
-        LH(GX1, gback, i*2);              \
+        LH(GX1, gback, gdoffset+i*2);     \
         LH(EX1, wback, fixedaddress+i*2); \
         F;                                \
-        SH(GX1, gback, i*2);              \
+        SH(GX1, gback, gdoffset+i*2);     \
     }
 
 #define SSE_LOOP_D_S_ITEM(EX1, F, i)    \
@@ -384,10 +479,10 @@
     SSE_LOOP_D_S_ITEM(EX1, F, 3)
 
 #define SSE_LOOP_Q_ITEM(GX1, EX1, F, i) \
-    LD(GX1, gback, i*8);                \
+    LD(GX1, gback, gdoffset+i*8);       \
     LD(EX1, wback, fixedaddress+i*8);   \
     F;                                  \
-    SD(GX1, gback, i*8);
+    SD(GX1, gback, gdoffset+i*8);
 
 // Loop for SSE opcode that use 64bits value and write to GX.
 #define SSE_LOOP_Q(GX1, EX1, F)     \
@@ -396,10 +491,10 @@
 
 
 #define SSE_LOOP_FQ_ITEM(GX1, EX1, F, i)            \
-    FLD(v0, gback, i*8);                            \
+    FLD(v0, gback, gdoffset+i*8);                   \
     FLD(v1, wback, fixedaddress+i*8);               \
     F;                                              \
-    FSD(v0, gback, i*8);
+    FSD(v0, gback, gdoffset+i*8);
 
 #define SSE_LOOP_FQ(GX1, EX1, F)     \
     v0 = fpu_get_scratch(dyn);       \
@@ -410,7 +505,7 @@
 
 #define SSE_LOOP_MV_Q_ITEM(s, i)      \
     LD(s, wback, fixedaddress+i*8);   \
-    SD(s, gback, i*8);
+    SD(s, gback, gdoffset+i*8);
 
 // Loop for SSE opcode that moves 64bits value from wback to gback, use s as scratch.
 #define SSE_LOOP_MV_Q(s)     \
@@ -418,7 +513,7 @@
     SSE_LOOP_MV_Q_ITEM(s, 1)
 
 #define SSE_LOOP_MV_Q_ITEM2(s, i)     \
-    LD(s, gback, i*8);                \
+    LD(s, gback, gdoffset+i*8);       \
     SD(s, wback, fixedaddress+i*8);
 
 // Loop for SSE opcode that moves 64bits value from gback to wback, use s as scratch.
@@ -436,17 +531,19 @@
 // R0 will not be pushed/popd if ret is -2. Flags are not save/restored
 #define CALL_S(F, ret) call_c(dyn, ninst, F, x6, ret, 0, 0)
 
-#define MARK    dyn->insts[ninst].mark = dyn->native_size
-#define GETMARK dyn->insts[ninst].mark
-#define MARK2   dyn->insts[ninst].mark2 = dyn->native_size
-#define GETMARK2 dyn->insts[ninst].mark2
-#define MARK3   dyn->insts[ninst].mark3 = dyn->native_size
-#define GETMARK3 dyn->insts[ninst].mark3
-#define MARKF   dyn->insts[ninst].markf = dyn->native_size
-#define GETMARKF dyn->insts[ninst].markf
-#define MARKSEG dyn->insts[ninst].markseg = dyn->native_size
-#define GETMARKSEG dyn->insts[ninst].markseg
-#define MARKLOCK dyn->insts[ninst].marklock = dyn->native_size
+#define MARK        dyn->insts[ninst].mark = dyn->native_size
+#define GETMARK     dyn->insts[ninst].mark
+#define MARK2       dyn->insts[ninst].mark2 = dyn->native_size
+#define GETMARK2    dyn->insts[ninst].mark2
+#define MARK3       dyn->insts[ninst].mark3 = dyn->native_size
+#define GETMARK3    dyn->insts[ninst].mark3
+#define MARKF       dyn->insts[ninst].markf = dyn->native_size
+#define GETMARKF    dyn->insts[ninst].markf
+#define MARKF2      dyn->insts[ninst].markf2 = dyn->native_size
+#define GETMARKF2   dyn->insts[ninst].markf2
+#define MARKSEG     dyn->insts[ninst].markseg = dyn->native_size
+#define GETMARKSEG  dyn->insts[ninst].markseg
+#define MARKLOCK    dyn->insts[ninst].marklock = dyn->native_size
 #define GETMARKLOCK dyn->insts[ninst].marklock
 
 #define Bxx_gen(OP, M, reg1, reg2)      \
@@ -526,7 +623,7 @@
 #define STORE_REG(A)    SD(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
 #define LOAD_REG(A)     LD(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
 
-// Need to also store current value of some register, as they may be used by functions like setjump
+// Also need to store the current value of some registers, as they may be used by functions like setjmp
 #define STORE_XEMU_CALL()   \
     STORE_REG(RBX);         \
     STORE_REG(RDX);         \
@@ -606,11 +703,11 @@
 
 // Adjust the xFlags bit 5 -> bit 11, src and dst can be the same (and can be xFlags, but not s1)
 #define FLAGS_ADJUST_TO11(dst, src, s1) \
-    MOV64x(s1, ~(1<<11));               \
-    AND(dst, src, s1);                  \
-    ANDI(s1, dst, 1<<5);                \
-    SLLI(s1, s1, 11-5);                 \
-    ANDI(dst, dst, ~(1<<5));            \
+    LUI(s1, 0xFFFFF);                   \
+    ADDIW(s1, s1, 0x7DF);               \
+    AND(s1, src, s1);                   \
+    ANDI(dst, src, 1<<5);               \
+    SLLI(dst, dst, 11-5);               \
     OR(dst, dst, s1)
 
 #ifndef MAYSETFLAGS
@@ -721,8 +818,8 @@
 
 #define MODREG  ((nextop&0xC0)==0xC0)
 
-void rv64_epilog();
-void rv64_epilog_fast();
+void rv64_epilog(void);
+void rv64_epilog_fast(void);
 void* rv64_next(x64emu_t* emu, uintptr_t addr);
 
 #ifndef STEPNAME
@@ -863,6 +960,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define sse_setround    STEPNAME(sse_setround)
 #define mmx_get_reg     STEPNAME(mmx_get_reg)
 #define mmx_get_reg_empty STEPNAME(mmx_get_reg_empty)
+#define mmx_forget_reg   STEPNAME(mmx_forget_reg)
 #define sse_get_reg     STEPNAME(sse_get_reg)
 #define sse_get_reg_empty STEPNAME(sse_get_reg_empty)
 #define sse_forget_reg   STEPNAME(sse_forget_reg)
@@ -888,7 +986,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta);
 
 /* setup r2 to address pointed by */
-//uintptr_t geted32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int absmax, uint32_t mask, rex_t rex, int* l, int s, int delta);
+uintptr_t geted32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta);
 
 /* setup r2 to address pointed by */
 //uintptr_t geted16(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int absmax, uint32_t mask, int s);
@@ -898,8 +996,8 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
 void jump_to_epilog(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst);
 void jump_to_epilog_fast(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst);
 void jump_to_next(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst);
-void ret_to_epilog(dynarec_rv64_t* dyn, int ninst);
-void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, int n);
+void ret_to_epilog(dynarec_rv64_t* dyn, int ninst, rex_t rex);
+void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, rex_t rex, int n);
 void iret_to_epilog(dynarec_rv64_t* dyn, int ninst, int is64bits);
 void call_c(dynarec_rv64_t* dyn, int ninst, void* fnc, int reg, int ret, int saveflags, int save_reg);
 void call_n(dynarec_rv64_t* dyn, int ninst, void* fnc, int w);
@@ -950,10 +1048,10 @@ void emit_inc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
 void emit_dec32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5);
 void emit_dec16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
 void emit_dec8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
-void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5);
+void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5, int s6);
 //void emit_adc32c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
-//void emit_adc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
-//void emit_adc8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5);
+void emit_adc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
+void emit_adc8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5, int s6);
 void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
 //void emit_adc16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
 void emit_sbb32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5);
@@ -1047,12 +1145,20 @@ int extcache_st_coherency(dynarec_rv64_t* dyn, int ninst, int a, int b);
 #define X87_ST(A)   extcache_get_st(dyn, ninst, A)
 #endif
 
+//MMX helpers
+// get float register for a MMX reg, create the entry if needed
+int mmx_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a);
+// get float register for a MMX reg, but don't try to synch it if it needed to be created
+int mmx_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a);
+// forget float register for a MMX reg, create the entry if needed
+void mmx_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
+
 //SSE/SSE2 helpers
-// get neon register for a SSE reg, create the entry if needed
+// get float register for a SSE reg, create the entry if needed
 int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
-// get neon register for a SSE reg, but don't try to synch it if it needed to be created
+// get float register for a SSE reg, but don't try to synch it if it needed to be created
 int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
-// forget neon register for a SSE reg, create the entry if needed
+// forget float register for a SSE reg, create the entry if needed
 void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
 // purge the XMM0..XMM7 cache (before function call)
 void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1);
@@ -1085,19 +1191,19 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
 uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int seg, int* ok, int* need_epilog);
 //uintptr_t dynarec64_65(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep,int* ok, int* need_epilog);
 uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
-//uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_D8(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 //uintptr_t dynarec64_DA(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
-//uintptr_t dynarec64_DC(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
-//uintptr_t dynarec64_DD(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_DC(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_DD(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_DE(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
-//uintptr_t dynarec64_6664(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog);
-//uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_6664(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog);
+uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 
@@ -1231,4 +1337,12 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         SW(s2, xEmu, offsetof(x64emu_t, test.test));        \
     }
 
+#define GETREX()                                \
+    rex.rex = 0;                                \
+    if(!rex.is32bits)                           \
+        while(opcode>=0x40 && opcode<=0x4f) {   \
+            rex.rex = opcode;                   \
+            opcode = F8;                        \
+        }
+
 #endif //__DYNAREC_RV64_HELPER_H__