about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Yang Liu <numbksco@gmail.com>, 2024-06-02 16:25:57 +0800
committer: GitHub <noreply@github.com>, 2024-06-02 10:25:57 +0200
commit: f44d60c58f418b8844be3dc4b2505ae097a71f3f (patch)
tree: c7fd30db485c7e742e6ae1fec476a19501991784
parent: dc71840a5a1e065e5f98e3a230f6714ec8d946ed (diff)
download: box64-f44d60c58f418b8844be3dc4b2505ae097a71f3f.tar.gz
box64-f44d60c58f418b8844be3dc4b2505ae097a71f3f.zip
[LA64_DYNAREC] Added more opcodes (#1549)
* [LA64_DYNAREC] Added more opcodes

* fastnan handling and fixed PALIGNR...
-rw-r--r-- CMakeLists.txt 1
-rw-r--r-- src/dynarec/la64/dynarec_la64_00.c 62
-rw-r--r-- src/dynarec/la64/dynarec_la64_0f.c 131
-rw-r--r-- src/dynarec/la64/dynarec_la64_66.c 30
-rw-r--r-- src/dynarec/la64/dynarec_la64_660f.c 70
-rw-r--r-- src/dynarec/la64/dynarec_la64_67.c 109
-rw-r--r-- src/dynarec/la64/dynarec_la64_emit_shift.c 58
-rw-r--r-- src/dynarec/la64/dynarec_la64_helper.c 165
-rw-r--r-- src/dynarec/la64/dynarec_la64_helper.h 21
-rw-r--r-- src/dynarec/la64/la64_emitter.h 8
10 files changed, 655 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7fd8f411..75f0c608 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -848,6 +848,7 @@ if(LARCH64_DYNAREC)
     "${BOX64_ROOT}/src/dynarec/la64/dynarec_la64_0f.c"
     "${BOX64_ROOT}/src/dynarec/la64/dynarec_la64_64.c"
     "${BOX64_ROOT}/src/dynarec/la64/dynarec_la64_66.c"
+    "${BOX64_ROOT}/src/dynarec/la64/dynarec_la64_67.c"
     "${BOX64_ROOT}/src/dynarec/la64/dynarec_la64_f30f.c"
     "${BOX64_ROOT}/src/dynarec/la64/dynarec_la64_660f.c"
     "${BOX64_ROOT}/src/dynarec/la64/dynarec_la64_f0.c"
diff --git a/src/dynarec/la64/dynarec_la64_00.c b/src/dynarec/la64/dynarec_la64_00.c
index 51818d83..a7207003 100644
--- a/src/dynarec/la64/dynarec_la64_00.c
+++ b/src/dynarec/la64/dynarec_la64_00.c
@@ -167,6 +167,25 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     DEFAULT;
             }
             break;
+        case 0x11:
+            INST_NAME("ADC Ed, Gd");
+            READFLAGS(X_CF);
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED(0);
+            emit_adc32(dyn, ninst, rex, ed, gd, x3, x4, x5, x6);
+            WBACK;
+            break;
+        case 0x13:
+            INST_NAME("ADC Gd, Ed");
+            READFLAGS(X_CF);
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED(0);
+            emit_adc32(dyn, ninst, rex, gd, ed, x3, x4, x5, x6);
+            break;
         case 0x18:
             INST_NAME("SBB Eb, Gb");
             READFLAGS(X_CF);
@@ -187,6 +206,15 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             emit_sbb32(dyn, ninst, rex, ed, gd, x3, x4, x5);
             WBACK;
             break;
+        case 0x1B:
+            INST_NAME("SBB Gd, Ed");
+            READFLAGS(X_CF);
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED(0);
+            emit_sbb32(dyn, ninst, rex, gd, ed, x3, x4, x5);
+            break;
         case 0x1C:
             INST_NAME("SBB AL, Ib");
             READFLAGS(X_CF);
@@ -460,6 +488,12 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0x66:
             addr = dynarec64_66(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
             break;
+        case 0x67:
+            if (rex.is32bits) {
+                DEFAULT;
+            } else
+                addr = dynarec64_67(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
         case 0x68:
             INST_NAME("PUSH Id");
             i64 = F32S;
@@ -1372,6 +1406,26 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0xC1:
             nextop = F8;
             switch ((nextop >> 3) & 7) {
+                case 0:
+                    INST_NAME("ROL Ed, Ib");
+                    u8 = geted_ib(dyn, addr, ninst, nextop) & (rex.w ? 0x3f : 0x1f);
+                    // flags are not affected if count is 0, we make it a nop if possible.
+                    if (u8) {
+                        SETFLAGS(X_OF | X_CF, SF_SUBSET_PENDING);
+                        GETED(1);
+                        F8;
+                        emit_rol32c(dyn, ninst, rex, ed, u8, x3, x4);
+                        WBACK;
+                    } else {
+                        if (MODREG && !rex.w) {
+                            GETED(1);
+                            ZEROUP(ed);
+                        } else {
+                            FAKEED;
+                        }
+                        F8;
+                    }
+                    break;
                 case 1:
                     INST_NAME("ROR Ed, Ib");
                     u8 = geted_ib(dyn, addr, ninst, nextop) & (rex.w ? 0x3f : 0x1f);
@@ -1607,6 +1661,14 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0xD1:
             nextop = F8;
             switch ((nextop >> 3) & 7) {
+                case 0:
+                    INST_NAME("ROL Ed, 1");
+                    SETFLAGS(X_OF | X_CF, SF_SUBSET_PENDING);
+                    GETED(0);
+                    emit_rol32c(dyn, ninst, rex, ed, 1, x3, x4);
+                    WBACK;
+                    if (!wback && !rex.w) ZEROUP(ed);
+                    break;
                 case 4:
                 case 6:
                     INST_NAME("SHL Ed, 1");
diff --git a/src/dynarec/la64/dynarec_la64_0f.c b/src/dynarec/la64/dynarec_la64_0f.c
index 7255c32e..e96c3437 100644
--- a/src/dynarec/la64/dynarec_la64_0f.c
+++ b/src/dynarec/la64/dynarec_la64_0f.c
@@ -311,6 +311,74 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 SPILL_EFLAGS();
             }
             break;
+        case 0x38:
+            // SSE3
+            nextop = F8;
+            switch (nextop) {
+                case 0xC8 ... 0xCD:
+                    u8 = nextop;
+                    switch (u8) {
+                        case 0xC8:
+                            INST_NAME("SHA1NEXTE Gx, Ex");
+                            break;
+                        case 0xC9:
+                            INST_NAME("SHA1MSG1 Gx, Ex");
+                            break;
+                        case 0xCA:
+                            INST_NAME("SHA1MSG2 Gx, Ex");
+                            break;
+                        case 0xCB:
+                            INST_NAME("SHA256RNDS2 Gx, Ex");
+                            break;
+                        case 0xCC:
+                            INST_NAME("SHA256MSG1 Gx, Ex");
+                            break;
+                        case 0xCD:
+                            INST_NAME("SHA256MSG2 Gx, Ex");
+                            break;
+                    }
+                    nextop = F8;
+                    if (MODREG) {
+                        ed = (nextop & 7) + (rex.b << 3);
+                        sse_reflect_reg(dyn, ninst, ed);
+                        ADDI_D(x2, xEmu, offsetof(x64emu_t, xmm[ed]));
+                    } else {
+                        SMREAD();
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 0, 0);
+                        if (ed != x2) {
+                            MV(x2, ed);
+                        }
+                    }
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    ADDI_D(x1, xEmu, offsetof(x64emu_t, xmm[gd]));
+                    sse_reflect_reg(dyn, ninst, 0);
+                    switch (u8) {
+                        case 0xC8:
+                            CALL(sha1nexte, -1);
+                            break;
+                        case 0xC9:
+                            CALL(sha1msg1, -1);
+                            break;
+                        case 0xCA:
+                            CALL(sha1msg2, -1);
+                            break;
+                        case 0xCB:
+                            CALL(sha256rnds2, -1);
+                            break;
+                        case 0xCC:
+                            CALL(sha256msg1, -1);
+                            break;
+                        case 0xCD:
+                            CALL(sha256msg2, -1);
+                            break;
+                    }
+                    break;
+                default:
+                    DEFAULT;
+            }
+            break;
+
         #define GO(GETFLAGS, NO, YES, F, I)                                                          \
             READFLAGS(F);                                                                            \
             if (la64_lbt) {                                                                          \
@@ -341,6 +409,16 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
 
         #undef GO
 
+        case 0x50:
+            INST_NAME("MOVMSPKPS Gd, Ex");
+            nextop = F8;
+            GETEX(q0, 0, 0);
+            GETGD;
+            q1 = fpu_get_scratch(dyn);
+            VMSKLTZ_W(q1, q0);
+            MOVFR2GR_S(gd, q1);
+            BSTRPICK_D(gd, gd, 31, 0);
+            break;
         case 0x51:
             INST_NAME("SQRTPS Gx, Ex");
             nextop = F8;
@@ -356,6 +434,15 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETGX_empty(q1);
             VFRSQRT_S(q1, q0);
             break;
+        case 0x53:
+            INST_NAME("RCPPS Gx, Ex");
+            nextop = F8;
+            SKIPTEST(x1);
+            GETEX(q0, 0, 0);
+            GETGX_empty(q1);
+            // TODO: use v1.1 vfrecipe when possible
+            VFRECIP_S(q1, q0);
+            break;
         case 0x54:
             INST_NAME("ANDPS Gx, Ex");
             nextop = F8;
@@ -426,6 +513,21 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETGX(v0, 1);
             VFSUB_S(v0, v0, q0);
             break;
+        case 0x5D:
+            INST_NAME("MINPS Gx, Ex");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETEX(v1, 0, 0);
+            if (!box64_dynarec_fastnan && v0 != v1) {
+                q0 = fpu_get_scratch(dyn);
+                // always copy from v1 if any operand is NaN
+                VFCMP_S(q0, v0, v1, cUN);
+                VANDN_V(v0, q0, v0);
+                VAND_V(q0, q0, v1);
+                VOR_V(v0, v0, q0);
+            }
+            VFMIN_S(v0, v0, v1);
+            break;
         case 0x5E:
             INST_NAME("DIVPS Gx, Ex");
             nextop = F8;
@@ -433,6 +535,21 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETGX(v0, 1);
             VFDIV_S(v0, v0, q0);
             break;
+        case 0x5F:
+            INST_NAME("MAXPS Gx, Ex");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETEX(v1, 0, 0);
+            if (!box64_dynarec_fastnan && v0 != v1) {
+                q0 = fpu_get_scratch(dyn);
+                // always copy from v1 if any operand is NaN
+                VFCMP_S(q0, v0, v1, cUN);
+                VANDN_V(v0, q0, v0);
+                VAND_V(q0, q0, v1);
+                VOR_V(v0, v0, q0);
+            }
+            VFMAX_S(v0, v0, v1);
+            break;
 
         #define GO(GETFLAGS, NO, YES, F, I)                                                         \
             if (box64_dynarec_test == 2) { NOTEST(x1); }                                            \
@@ -556,6 +673,20 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 }
             else
                 switch ((nextop >> 3) & 7) {
+                    case 2:
+                        INST_NAME("LDMXCSR Md");
+                        GETED(0);
+                        ST_W(ed, xEmu, offsetof(x64emu_t, mxcsr));
+                        if (box64_sse_flushto0) {
+                            // TODO
+                        }
+                        break;
+                    case 3:
+                        INST_NAME("STMXCSR Md");
+                        addr = geted(dyn, addr, ninst, nextop, &wback, x1, x2, &fixedaddress, rex, NULL, 0, 0);
+                        LD_WU(x4, xEmu, offsetof(x64emu_t, mxcsr));
+                        ST_W(x4, wback, fixedaddress);
+                        break;
                     default:
                         DEFAULT;
                 }
diff --git a/src/dynarec/la64/dynarec_la64_66.c b/src/dynarec/la64/dynarec_la64_66.c
index 8e7764d0..3d27d369 100644
--- a/src/dynarec/la64/dynarec_la64_66.c
+++ b/src/dynarec/la64/dynarec_la64_66.c
@@ -103,6 +103,15 @@ uintptr_t dynarec64_66(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             emit_or16(dyn, ninst, x1, x2, x4, x5);
             GWBACK;
             break;
+        case 0x0D:
+            INST_NAME("OR AX, Iw");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            i32 = F16;
+            BSTRPICK_D(x1, xRAX, 15, 0);
+            MOV32w(x2, i32);
+            emit_or16(dyn, ninst, x1, x2, x3, x4);
+            BSTRINS_D(xRAX, x1, 15, 0);
+            break;
         case 0x0F:
             switch (rep) {
                 case 0: addr = dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog); break;
@@ -211,6 +220,27 @@ uintptr_t dynarec64_66(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 emit_cmp16_0(dyn, ninst, x1, x3, x4);
             }
             break;
+        case 0x69:
+        case 0x6B:
+            if (opcode == 0x69) {
+                INST_NAME("IMUL Gw,Ew,Iw");
+            } else {
+                INST_NAME("IMUL Gw,Ew,Ib");
+            }
+            SETFLAGS(X_ALL, SF_PENDING);
+            nextop = F8;
+            GETSEW(x1, (opcode == 0x69) ? 2 : 1);
+            if (opcode == 0x69)
+                i32 = F16S;
+            else
+                i32 = F8S;
+            MOV32w(x2, i32);
+            MUL_W(x2, x2, x1);
+            UFLAG_RES(x2);
+            gd = x2;
+            GWBACK;
+            UFLAG_DF(x1, d_imul16);
+            break;
         case 0x81:
         case 0x83:
             nextop = F8;
diff --git a/src/dynarec/la64/dynarec_la64_660f.c b/src/dynarec/la64/dynarec_la64_660f.c
index c9057c1e..5768cc62 100644
--- a/src/dynarec/la64/dynarec_la64_660f.c
+++ b/src/dynarec/la64/dynarec_la64_660f.c
@@ -277,6 +277,76 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     DEFAULT;
             }
             break;
+        case 0x3A: // these are some more SSSE3+ opcodes
+            opcode = F8;
+            switch (opcode) {
+                case 0x0F:
+                    INST_NAME("PALIGNR Gx, Ex, Ib");
+                    nextop = F8;
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 1);
+                    u8 = F8;
+                    if (u8 > 31) {
+                        VXOR_V(q0, q0, q0);
+                    } else if (u8 > 15) {
+                        VBSRL_V(q0, q0, u8 - 16);
+                    } else if (!u8) {
+                        VOR_V(q0, q1, q1);
+                    } else {
+                        d0 = fpu_get_scratch(dyn);
+                        VBSLL_V(q0, q0, 16 - u8);
+                        VBSRL_V(d0, q1, u8);
+                        VOR_V(q0, q0, d0);
+                    }
+                    break;
+                case 0x44:
+                    INST_NAME("PCLMULQDQ Gx, Ex, Ib");
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd); // gx
+                    if (MODREG) {
+                        ed = (nextop & 7) + (rex.b << 3);
+                        sse_forget_reg(dyn, ninst, ed);
+                        MOV32w(x2, ed);
+                        MOV32w(x3, 0); // p = NULL
+                    } else {
+                        MOV32w(x2, 0);
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x3, x5, &fixedaddress, rex, NULL, 0, 1);
+                        if (ed != x3) {
+                            MV(x3, ed);
+                        }
+                    }
+                    u8 = F8;
+                    MOV32w(x4, u8);
+                    CALL(native_pclmul, -1);
+                    break;
+                case 0xDF:
+                    INST_NAME("AESKEYGENASSIST Gx, Ex, Ib"); // AES-NI
+                    nextop = F8;
+                    GETG;
+                    sse_forget_reg(dyn, ninst, gd);
+                    MOV32w(x1, gd); // gx
+                    if (MODREG) {
+                        ed = (nextop & 7) + (rex.b << 3);
+                        sse_forget_reg(dyn, ninst, ed);
+                        MOV32w(x2, ed);
+                        MOV32w(x3, 0); // p = NULL
+                    } else {
+                        MOV32w(x2, 0);
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 1);
+                        if (ed != x3) {
+                            MV(x3, ed);
+                        }
+                    }
+                    u8 = F8;
+                    MOV32w(x4, u8);
+                    CALL(native_aeskeygenassist, -1);
+                    break;
+                default:
+                    DEFAULT;
+            }
+            break;
         case 0x54:
             INST_NAME("ANDPD Gx, Ex");
             nextop = F8;
diff --git a/src/dynarec/la64/dynarec_la64_67.c b/src/dynarec/la64/dynarec_la64_67.c
new file mode 100644
index 00000000..5b81eb9d
--- /dev/null
+++ b/src/dynarec/la64/dynarec_la64_67.c
@@ -0,0 +1,109 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "la64_emitter.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+
+#include "la64_printer.h"
+#include "dynarec_la64_private.h"
+#include "dynarec_la64_helper.h"
+#include "dynarec_la64_functions.h"
+
+uintptr_t dynarec64_67(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{ /* Handles one instruction carrying the 0x67 (address-size) prefix; opcodes not listed below go through DEFAULT. Returns the updated decode address. */
+    (void)ip; (void)need_epilog;
+
+    uint8_t opcode = F8;
+    uint8_t nextop;
+    uint8_t gd, ed, wback, wb, wb1, wb2, gb1, gb2, eb1, eb2;
+    int64_t fixedaddress;
+    int unscaled;
+    int8_t  i8;
+    uint8_t u8;
+    int32_t i32;
+    int64_t j64, i64;
+    int cacheupd = 0;
+    int lock;
+    int v0, v1, s0;
+    MAYUSE(i32);
+    MAYUSE(j64);
+    MAYUSE(v0);
+    MAYUSE(v1);
+    MAYUSE(s0);
+    MAYUSE(lock);
+    MAYUSE(cacheupd);
+
+    if(rex.is32bits) {
+        // should do a different file
+        DEFAULT;
+        return addr;
+    }
+
+    GETREX();
+
+    rep = 0;
+    while((opcode==0xF2) || (opcode==0xF3)) { // swallow REP prefixes: rep ends up 1 for F2, 2 for F3 (last one wins)
+        rep = opcode-0xF1;
+        opcode = F8;
+    }
+
+    switch(opcode) {
+        case 0x89:
+            INST_NAME("MOV Ed, Gd");
+            nextop = F8;
+            GETGD;
+            if (MODREG) { // reg <= reg
+                MVxw(TO_LA64((nextop & 7) + (rex.b << 3)), gd);
+            } else { // mem <= reg
+                addr = geted32(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0);
+                SDxw(gd, ed, fixedaddress);
+                SMWRITELOCK(lock);
+            }
+            break;
+        case 0xF7:
+            nextop = F8;
+            switch ((nextop >> 3) & 7) {
+                case 4:
+                    INST_NAME("MUL EAX, Ed");
+                    SETFLAGS(X_ALL, SF_PENDING);
+                    GETED32(0);
+                    if (rex.w) {
+                        if (ed == xRDX) // operand lives in RDX: build the high half in x3 so MUL_D still sees the original operand
+                            gd = x3;
+                        else
+                            gd = xRDX;
+                        MULH_DU(gd, xRAX, ed);
+                        MUL_D(xRAX, xRAX, ed);
+                        if (gd != xRDX) MV(xRDX, gd);
+                    } else {
+                        MUL_D(xRDX, xRAX, ed); // 64 <- 32x32 -- NOTE(review): assumes the upper 32 bits of xRAX and ed are zero; confirm
+                        AND(xRAX, xRDX, xMASK);
+                        SRLI_D(xRDX, xRDX, 32); // high half of the product -> EDX; a 32-bit SRLI_W cannot encode a shift of 32
+                    }
+                    UFLAG_RES(xRAX);
+                    UFLAG_OP1(xRDX);
+                    UFLAG_DF(x2, rex.w ? d_mul64 : d_mul32);
+                    break;
+                default:
+                    DEFAULT;
+            }
+            break;
+        default:
+            DEFAULT;
+    }
+    return addr;
+}
diff --git a/src/dynarec/la64/dynarec_la64_emit_shift.c b/src/dynarec/la64/dynarec_la64_emit_shift.c
index 1fe8aba8..cff14cc8 100644
--- a/src/dynarec/la64/dynarec_la64_emit_shift.c
+++ b/src/dynarec/la64/dynarec_la64_emit_shift.c
@@ -751,3 +751,61 @@ void emit_rol32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         OR(xFlags, xFlags, s3);
     }
 }
+
+
+// emit ROL of s1 by constant c (32-bit, or 64-bit when rex.w); result is left in s1, s3 and s4 are scratch. Nothing is emitted when c == 0.
+void emit_rol32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4)
+{
+    if (!c) return;
+
+    IFX (X_PEND) {
+        MOV32w(s3, c);
+        SDxw(s3, xEmu, offsetof(x64emu_t, op2)); // keep the rotate count in the emu op2 field
+        SET_DF(s4, rex.w ? d_rol64 : d_rol32);
+    } else IFX (X_ALL) {
+        SET_DFNONE();
+    }
+    if (!c) { // NOTE(review): unreachable, c == 0 already returned at the top of the function
+        IFX (X_PEND) {
+            SDxw(s1, xEmu, offsetof(x64emu_t, res));
+        }
+        return;
+    }
+
+    if (la64_lbt) {
+        IFX (X_CF | X_OF) {
+            if (rex.w)
+                X64_ROTLI_D(s1, c);
+            else
+                X64_ROTLI_W(s1, c);
+        }
+    }
+
+    ROTRIxw(s1, s1, (rex.w ? 64 : 32) - c); // rotate left by c expressed as rotate right by (width - c)
+
+    if (!rex.w) ZEROUP(s1);
+
+    IFX (X_PEND) {
+        SDxw(s1, xEmu, offsetof(x64emu_t, res));
+    }
+
+    if (la64_lbt) return; // presumably X64_ROTLI_* above already updated the x86 flags -- confirm
+
+    IFX (X_CF | X_OF) {
+        MOV64x(s3, (1UL << F_CF | 1UL << F_OF));
+        ANDN(xFlags, xFlags, s3); // clear CF and OF before recomputing them
+    }
+    IFX (X_CF | X_OF) {
+        ANDI(s4, s1, 1 << F_CF); // s4 = result & (1 << F_CF); presumably F_CF is bit 0, i.e. the bit rotated around -- confirm
+        IFX (X_CF) OR(xFlags, xFlags, s4);
+    }
+    IFX (X_OF) {
+        // the OF flag is set to the exclusive OR of the CF bit (after the rotate) and the most-significant bit of the result.
+        if (c == 1) { // OF is only computed for a rotate by 1; otherwise it stays cleared
+            SRLIxw(s3, s1, rex.w ? 63 : 31);
+            XOR(s3, s3, s4);
+            SLLI_D(s3, s3, F_OF);
+            OR(xFlags, xFlags, s3);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/dynarec/la64/dynarec_la64_helper.c b/src/dynarec/la64/dynarec_la64_helper.c
index 85bd950c..f6b6a72e 100644
--- a/src/dynarec/la64/dynarec_la64_helper.c
+++ b/src/dynarec/la64/dynarec_la64_helper.c
@@ -324,6 +324,150 @@ static uintptr_t geted_32(dynarec_la64_t* dyn, uintptr_t addr, int ninst, uint8_
     return addr;
 }
 
+/* Decode the ModRM/SIB memory operand for the 0x67 (32-bit address size) path: emits the address computation, stores the register holding the address in *ed and returns the advanced decode address. *fixaddress receives a displacement left for the caller's load/store (0, or a small displacement when i12 is set). */
+uintptr_t geted32(dynarec_la64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta)
+{
+    MAYUSE(dyn);
+    MAYUSE(ninst);
+    MAYUSE(delta);
+
+    int lock = l ? ((l == LOCK_LOCK) ? 1 : 2) : 0; // 0: l is NULL, 1: l is the LOCK_LOCK sentinel, 2: l is an out flag (reset to 0 just below)
+    if (lock == 2)
+        *l = 0;
+    uint8_t ret = x2; // default result register, overridden by a non-zero hint
+    *fixaddress = 0;
+    if (hint > 0) ret = hint;
+    int maxval = 2047; // upper bound for a disp32 handed back through *fixaddress in the SIB/no-base case
+    if (i12 > 1)
+        maxval -= i12;
+    MAYUSE(scratch);
+    if (!(nextop & 0xC0)) { // mod == 0: no displacement, except the disp32 special cases handled below
+        if ((nextop & 7) == 4) { // r/m == 4: a SIB byte follows
+            uint8_t sib = F8;
+            int sib_reg = ((sib >> 3) & 0x7) + (rex.x << 3);
+            int sib_reg2 = (sib & 0x7) + (rex.b << 3);
+            if ((sib & 0x7) == 5) { // base field 5 with mod 0: disp32 replaces the base register
+                int64_t tmp = F32S;
+                if (sib_reg != 4) {
+                    if (tmp && ((tmp < -2048) || (tmp > maxval) || !i12)) {
+                        MOV64x(scratch, tmp);
+                        if ((sib >> 6)) {
+                            SLLI_D(ret, TO_LA64(sib_reg), sib >> 6);
+                            ADD_W(ret, ret, scratch); // NOTE(review): presumably ADD_W yields a sign-extended 32-bit sum; confirm this is intended for addresses >= 0x80000000
+                        } else
+                            ADD_W(ret, TO_LA64(sib_reg), scratch);
+                    } else {
+                        if (sib >> 6)
+                            SLLI_D(ret, TO_LA64(sib_reg), (sib >> 6));
+                        else
+                            ret = TO_LA64(sib_reg);
+                        *fixaddress = tmp;
+                    }
+                } else { // index field 4 (no index): the address is the absolute disp32
+                    switch (lock) {
+                        case 1: addLockAddress(tmp); break;
+                        case 2:
+                            if (isLockAddress(tmp)) *l = 1;
+                            break;
+                    }
+                    MOV64x(ret, tmp);
+                }
+            } else {
+                if (sib_reg != 4) {
+                    if ((sib >> 6)) {
+                        SLLI_D(ret, TO_LA64(sib_reg), (sib >> 6));
+                        ADD_W(ret, ret, TO_LA64(sib_reg2));
+                    } else
+                        ADD_W(ret, TO_LA64(sib_reg2), TO_LA64(sib_reg));
+                } else {
+                    ret = TO_LA64(sib_reg2);
+                }
+            }
+        } else if ((nextop & 7) == 5) { // mod 0, r/m 5: disp32 added to xRIP loaded with addr + delta
+            uint32_t tmp = F32;
+            MOV32w(ret, tmp);
+            GETIP(addr + delta);
+            ADD_W(ret, ret, xRIP);
+            switch (lock) {
+                case 1: addLockAddress(addr + delta + tmp); break;
+                case 2:
+                    if (isLockAddress(addr + delta + tmp)) *l = 1;
+                    break;
+            }
+        } else {
+            ret = TO_LA64((nextop & 7) + (rex.b << 3));
+            if (ret == hint) {
+                AND(hint, ret, xMASK); // to clear upper part
+            }
+        }
+    } else { // mod != 0: a displacement follows (disp32 when bit 7 of nextop is set, else disp8)
+        int64_t i64;
+        uint8_t sib = 0;
+        int sib_reg = 0;
+        if ((nextop & 7) == 4) {
+            sib = F8;
+            sib_reg = ((sib >> 3) & 7) + (rex.x << 3);
+        }
+        int sib_reg2 = (sib & 0x07) + (rex.b << 3);
+        if (nextop & 0x80)
+            i64 = F32S;
+        else
+            i64 = F8S;
+        if (i64 == 0 || ((i64 >= -2048) && (i64 <= 2047) && i12)) { // displacement is left to the caller through *fixaddress
+            *fixaddress = i64;
+            if ((nextop & 7) == 4) {
+                if (sib_reg != 4) {
+                    if (sib >> 6) {
+                        SLLI_D(ret, TO_LA64(sib_reg), (sib >> 6));
+                        ADD_W(ret, ret, TO_LA64(sib_reg2));
+                    } else
+                        ADD_W(ret, TO_LA64(sib_reg2), TO_LA64(sib_reg));
+                } else {
+                    ret = TO_LA64(sib_reg2);
+                }
+            } else {
+                ret = TO_LA64((nextop & 0x07) + (rex.b << 3));
+            }
+        } else {
+            if (i64 >= -2048 && i64 <= 2047) { // small displacement: folded in with a single ADDI_W
+                if ((nextop & 7) == 4) {
+                    if (sib_reg != 4) {
+                        if (sib >> 6) {
+                            SLLI_D(scratch, TO_LA64(sib_reg), sib >> 6);
+                            ADD_W(scratch, scratch, TO_LA64(sib_reg2));
+                        } else
+                            ADD_W(scratch, TO_LA64(sib_reg2), TO_LA64(sib_reg));
+                    } else {
+                        scratch = TO_LA64(sib_reg2);
+                    }
+                } else
+                    scratch = TO_LA64((nextop & 0x07) + (rex.b << 3));
+                ADDI_W(ret, scratch, i64);
+            } else { // large displacement: materialized in scratch, then added to base/index
+                MOV32w(scratch, i64);
+                if ((nextop & 7) == 4) {
+                    if (sib_reg != 4) {
+                        ADD_W(scratch, scratch, TO_LA64(sib_reg2));
+                        if (sib >> 6) {
+                            SLLI_D(ret, TO_LA64(sib_reg), (sib >> 6));
+                            ADD_W(ret, ret, scratch);
+                        } else
+                            ADD_W(ret, scratch, TO_LA64(sib_reg));
+                    } else {
+                        PASS3(int tmp = TO_LA64(sib_reg2));
+                        ADD_W(ret, tmp, scratch);
+                    }
+                } else {
+                    PASS3(int tmp = TO_LA64((nextop & 0x07) + (rex.b << 3)));
+                    ADD_W(ret, tmp, scratch);
+                }
+            }
+        }
+    }
+    *ed = ret;
+    return addr;
+}
+
 void jump_to_epilog(dynarec_la64_t* dyn, uintptr_t ip, int reg, int ninst)
 {
     MAYUSE(dyn);
@@ -652,6 +796,27 @@ int sse_get_reg_empty(dynarec_la64_t* dyn, int ninst, int s1, int a)
     dyn->lsx.ssecache[a].write = 1; // it will be write...
     return dyn->lsx.ssecache[a].reg;
 }
+// Drop SSE reg `a` from the cache: store it to xmm[a] in the emu state if cached as LSX_CACHE_XMMW, then fpu_free_reg it and mark the slot empty. No-op when `a` is not cached.
+void sse_forget_reg(dynarec_la64_t* dyn, int ninst, int a)
+{
+    if (dyn->lsx.ssecache[a].v != -1) {
+        const int cached = dyn->lsx.ssecache[a].reg;
+        if (dyn->lsx.lsxcache[cached].t == LSX_CACHE_XMMW) {
+            VST(cached, xEmu, offsetof(x64emu_t, xmm[a]));
+        }
+        fpu_free_reg(dyn, cached);
+        dyn->lsx.ssecache[a].v = -1;
+    }
+}
+
+// Store SSE reg `a` to xmm[a] in the emu state when it is cached as LSX_CACHE_XMMW; the cache entry itself is left untouched.
+void sse_reflect_reg(dynarec_la64_t* dyn, int ninst, int a)
+{
+    const int cached = (dyn->lsx.ssecache[a].v == -1) ? -1 : dyn->lsx.ssecache[a].reg;
+    if (cached != -1 && dyn->lsx.lsxcache[cached].t == LSX_CACHE_XMMW) {
+        VST(cached, xEmu, offsetof(x64emu_t, xmm[a]));
+    }
+}
 
 // purge the SSE cache for XMM0..XMM7 (to use before function native call)
 void sse_purge07cache(dynarec_la64_t* dyn, int ninst, int s1)
diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h
index 14ad25d2..9302d252 100644
--- a/src/dynarec/la64/dynarec_la64_helper.h
+++ b/src/dynarec/la64/dynarec_la64_helper.h
@@ -111,6 +111,17 @@
         LDz(x1, wback, fixedaddress);                                                           \
         ed = x1;                                                                                \
     }
+// GETED32: operand fetch for the 0x67 path. Memory operand: geted32 puts the address register in wback (hint x2, scratch x1) and the value is loaded into x1. Register operand: ed is the mapped GPR and wback is 0.
+#define GETED32(D)                                                                                \
+    if (!(MODREG)) {                                                                              \
+        SMREAD();                                                                                 \
+        addr = geted32(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
+        LDxw(x1, wback, fixedaddress);                                                            \
+        ed = x1;                                                                                  \
+    } else {                                                                                      \
+        wback = 0;                                                                                \
+        ed = TO_LA64((nextop & 7) + (rex.b << 3));                                                \
+    }
 // GETEDH can use hint for ed, and x1 or x2 for wback (depending on hint), might also use x3. wback is 0 if ed is xEAX..xEDI
 #define GETEDH(hint, D)                                                                                                                 \
     if (MODREG) {                                                                                                                       \
@@ -727,6 +738,7 @@ void* la64_next(x64emu_t* emu, uintptr_t addr);
 #define dynarec64_0F   STEPNAME(dynarec64_0F)
 #define dynarec64_64   STEPNAME(dynarec64_64)
 #define dynarec64_66   STEPNAME(dynarec64_66)
+#define dynarec64_67   STEPNAME(dynarec64_67)
 #define dynarec64_F30F STEPNAME(dynarec64_F30F)
 #define dynarec64_660F STEPNAME(dynarec64_660F)
 #define dynarec64_F0   STEPNAME(dynarec64_F0)
@@ -800,6 +812,7 @@ void* la64_next(x64emu_t* emu, uintptr_t addr);
 #define emit_sar32c         STEPNAME(emit_sar32c)
 #define emit_ror32c         STEPNAME(emit_ror32c)
 #define emit_rol32          STEPNAME(emit_rol32)
+#define emit_rol32c         STEPNAME(emit_rol32c)
 
 #define emit_pf STEPNAME(emit_pf)
 
@@ -809,6 +822,8 @@ void* la64_next(x64emu_t* emu, uintptr_t addr);
 #define sse_purge07cache STEPNAME(sse_purge07cache)
 #define sse_get_reg       STEPNAME(sse_get_reg)
 #define sse_get_reg_empty STEPNAME(sse_get_reg_empty)
+#define sse_forget_reg    STEPNAME(sse_forget_reg)
+#define sse_reflect_reg   STEPNAME(sse_reflect_reg)
 
 #define fpu_pushcache       STEPNAME(fpu_pushcache)
 #define fpu_popcache        STEPNAME(fpu_popcache)
@@ -895,6 +910,7 @@ void emit_sar16(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
 void emit_sar32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4);
 void emit_ror32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4);
 void emit_rol32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4);
+void emit_rol32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4);
 
 void emit_pf(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4);
 
@@ -923,6 +939,10 @@ void sse_purge07cache(dynarec_la64_t* dyn, int ninst, int s1);
 int sse_get_reg(dynarec_la64_t* dyn, int ninst, int s1, int a, int forwrite);
 // get lsx register for an SSE reg, but don't try to synch it if it needed to be created
 int sse_get_reg_empty(dynarec_la64_t* dyn, int ninst, int s1, int a);
+// forget float register for an SSE reg, create the entry if needed
+void sse_forget_reg(dynarec_la64_t* dyn, int ninst, int a);
+// Push current value to the cache
+void sse_reflect_reg(dynarec_la64_t* dyn, int ninst, int a);
 
 void CacheTransform(dynarec_la64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3);
 
@@ -940,6 +960,7 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
 uintptr_t dynarec64_F30F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_64(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int seg, int* ok, int* need_epilog);
 uintptr_t dynarec64_66(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_67(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_F0(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_F20F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
diff --git a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h
index 6201eda8..83f5719c 100644
--- a/src/dynarec/la64/la64_emitter.h
+++ b/src/dynarec/la64/la64_emitter.h
@@ -1139,6 +1139,12 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define VSIGNCOV_H(vd, vj, vk)      EMIT(type_3R(0b01110001001011101, vk, vj, vd))
 #define VSIGNCOV_W(vd, vj, vk)      EMIT(type_3R(0b01110001001011110, vk, vj, vd))
 #define VSIGNCOV_D(vd, vj, vk)      EMIT(type_3R(0b01110001001011111, vk, vj, vd))
+#define VMSKLTZ_B(vd, vj)           EMIT(type_2R(0b0111001010011100010000, vj, vd))
+#define VMSKLTZ_H(vd, vj)           EMIT(type_2R(0b0111001010011100010001, vj, vd))
+#define VMSKLTZ_W(vd, vj)           EMIT(type_2R(0b0111001010011100010010, vj, vd))
+#define VMSKLTZ_D(vd, vj)           EMIT(type_2R(0b0111001010011100010011, vj, vd))
+#define VMSKGEZ_B(vd, vj)           EMIT(type_2R(0b0111001010011100010100, vj, vd))
+#define VMSKNZ_B(vd, vj)            EMIT(type_2R(0b0111001010011100011000, vj, vd))
 #define VAND_V(vd, vj, vk)          EMIT(type_3R(0b01110001001001100, vk, vj, vd))
 #define VLDI(vd, imm13)             EMIT(type_1RI13(0b01110011111000, imm13, vd))
 #define VOR_V(vd, vj, vk)           EMIT(type_3R(0b01110001001001101, vk, vj, vd))
@@ -1694,6 +1700,8 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define XVSLT_HU(vd, vj, vk)         EMIT(type_3R(0b01110100000010001, vk, vj, vd))
 #define XVSLT_WU(vd, vj, vk)         EMIT(type_3R(0b01110100000010010, vk, vj, vd))
 #define XVSLT_DU(vd, vj, vk)         EMIT(type_3R(0b01110100000010011, vk, vj, vd))
+#define XVBSLL_V(vd, vj, imm5)       EMIT(type_2RI5(0b01110110100011100, imm5, vj, vd))
+#define XVBSRL_V(vd, vj, imm5)       EMIT(type_2RI5(0b01110110100011101, imm5, vj, vd))
 #define XVPACKEV_B(vd, vj, vk)       EMIT(type_3R(0b01110101000101100, vk, vj, vd))
 #define XVPACKEV_H(vd, vj, vk)       EMIT(type_3R(0b01110101000101101, vk, vj, vd))
 #define XVPACKEV_W(vd, vj, vk)       EMIT(type_3R(0b01110101000101110, vk, vj, vd))