about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author: Yang Liu <liuyang22@iscas.ac.cn> 2025-08-18 20:12:24 +0800
committer: GitHub <noreply@github.com> 2025-08-18 14:12:24 +0200
commit: 1c2e763ffbff668851ab0845dee3d4f2072a0e36 (patch)
tree: c1e932ceff58067f27a829168117fe3f468f0a46 /src
parent: 5f144d8ddde847fd6e99d1739940cfd3cbf3779e (diff)
download: box64-1c2e763ffbff668851ab0845dee3d4f2072a0e36.tar.gz
          box64-1c2e763ffbff668851ab0845dee3d4f2072a0e36.zip
[RV64_DYNAREC] Added a few more scalar AVX 66 0F38 opcodes (#2949)
Diffstat (limited to 'src')
-rw-r--r--src/dynarec/rv64/dynarec_rv64_660f38.c4
-rw-r--r--src/dynarec/rv64/dynarec_rv64_avx.c2
-rw-r--r--src/dynarec/rv64/dynarec_rv64_avx_0f.c22
-rw-r--r--src/dynarec/rv64/dynarec_rv64_avx_66_0f.c26
-rw-r--r--src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c402
-rw-r--r--src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c22
-rw-r--r--src/dynarec/rv64/dynarec_rv64_helper.h15
7 files changed, 451 insertions, 42 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_660f38.c b/src/dynarec/rv64/dynarec_rv64_660f38.c
index df87f2f7..b3088fd2 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f38.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f38.c
@@ -70,9 +70,9 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode,
                     for (int i = 0; i < 16; ++i) {
                         LBU(x3, wback, fixedaddress + i);
                         ANDI(x4, x3, 128);
-                        BEQZ(x4, 12);
+                        BEQZ(x4, 4 + 4 * 2);
                         SB(xZR, gback, gdoffset + i);
-                        BEQZ(xZR, 20); // continue
+                        J(4 + 4 * 4); // continue
                         ANDI(x4, x3, 15);
                         ADD(x4, x4, x5);
                         LBU(x4, x4, 0);
diff --git a/src/dynarec/rv64/dynarec_rv64_avx.c b/src/dynarec/rv64/dynarec_rv64_avx.c
index 0d4a6f2c..6209ccff 100644
--- a/src/dynarec/rv64/dynarec_rv64_avx.c
+++ b/src/dynarec/rv64/dynarec_rv64_avx.c
@@ -54,6 +54,8 @@ uintptr_t dynarec64_AVX(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int n
         addr = dynarec64_AVX_66_0F(dyn, addr, ip, ninst, vex, ok, need_epilog);
     else if ((vex.m == VEX_M_0F) && (vex.p == VEX_P_F3))
         addr = dynarec64_AVX_F3_0F(dyn, addr, ip, ninst, vex, ok, need_epilog);
+    else if ((vex.m == VEX_M_0F38) && (vex.p == VEX_P_66))
+        addr = dynarec64_AVX_66_0F38(dyn, addr, ip, ninst, vex, ok, need_epilog);
     else {
         DEFAULT;
     }
diff --git a/src/dynarec/rv64/dynarec_rv64_avx_0f.c b/src/dynarec/rv64/dynarec_rv64_avx_0f.c
index 95207210..6b4b812d 100644
--- a/src/dynarec/rv64/dynarec_rv64_avx_0f.c
+++ b/src/dynarec/rv64/dynarec_rv64_avx_0f.c
@@ -30,7 +30,7 @@ uintptr_t dynarec64_AVX_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, in
     uint8_t opcode = F8;
     uint8_t nextop, u8;
     uint8_t gd, ed, vd;
-    uint8_t wback, wb1, wb2, gback, vback, gyback;
+    uint8_t wback, wb1, wb2, gback, vback;
     uint8_t eb1, eb2, gb1, gb2;
     int32_t i32, i32_;
     int cacheupd = 0;
@@ -59,12 +59,12 @@ uintptr_t dynarec64_AVX_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, in
             if (vex.l) {
                 GETEY();
                 LD(x3, wback, fixedaddress);
-                SD(x3, gyback, gyoffset);
+                SD(x3, gback, gyoffset);
                 LD(x3, wback, fixedaddress + 8);
-                SD(x3, gyback, gyoffset + 8);
+                SD(x3, gback, gyoffset + 8);
             } else {
-                SD(xZR, gyback, gyoffset);
-                SD(xZR, gyback, gyoffset + 8);
+                SD(xZR, gback, gyoffset);
+                SD(xZR, gback, gyoffset + 8);
             }
             break;
         case 0x29:
@@ -79,9 +79,9 @@ uintptr_t dynarec64_AVX_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, in
             if (vex.l) {
                 GETEY();
                 GETGY();
-                LD(x3, gyback, gyoffset);
+                LD(x3, gback, gyoffset);
                 SD(x3, wback, fixedaddress);
-                LD(x3, gyback, gyoffset + 8);
+                LD(x3, gback, gyoffset + 8);
                 SD(x3, wback, fixedaddress + 8);
             } else if (MODREG) {
                 GETEY();
@@ -147,11 +147,11 @@ uintptr_t dynarec64_AVX_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, in
                 FLW(s1, wback, fixedaddress + 12);
                 FCVTDS(s0, s0);
                 FCVTDS(s1, s1);
-                FSD(s0, gyback, gyoffset + 0);
-                FSD(s1, gyback, gyoffset + 8);
+                FSD(s0, gback, gyoffset + 0);
+                FSD(s1, gback, gyoffset + 8);
             } else {
-                FSD(xZR, gyback, gyoffset + 0);
-                FSD(xZR, gyback, gyoffset + 8);
+                FSD(xZR, gback, gyoffset + 0);
+                FSD(xZR, gback, gyoffset + 8);
             }
             break;
         default:
diff --git a/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c b/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c
index 64d2a93f..a360cfa9 100644
--- a/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c
+++ b/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c
@@ -30,7 +30,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
     uint8_t opcode = F8;
     uint8_t nextop, u8;
     uint8_t gd, ed, vd;
-    uint8_t wback, wb1, wb2, gback, vback, gyback;
+    uint8_t wback, wb1, wb2, gback, vback;
     uint8_t eb1, eb2, gb1, gb2;
     int32_t i32, i32_;
     int cacheupd = 0;
@@ -59,12 +59,12 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             if (vex.l) {
                 GETEY();
                 LD(x3, wback, fixedaddress);
-                SD(x3, gyback, gyoffset);
+                SD(x3, gback, gyoffset);
                 LD(x3, wback, fixedaddress + 8);
-                SD(x3, gyback, gyoffset + 8);
+                SD(x3, gback, gyoffset + 8);
             } else {
-                SD(xZR, gyback, gyoffset);
-                SD(xZR, gyback, gyoffset + 8);
+                SD(xZR, gback, gyoffset);
+                SD(xZR, gback, gyoffset + 8);
             }
             break;
         case 0x6E:
@@ -75,8 +75,8 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             GETGY();
             SD(ed, gback, gdoffset);
             SD(xZR, gback, gdoffset + 8);
-            SD(xZR, gyback, gyoffset);
-            SD(xZR, gyback, gyoffset + 8);
+            SD(xZR, gback, gyoffset);
+            SD(xZR, gback, gyoffset + 8);
             break;
         case 0x6F:
             INST_NAME("VMOVDQA Gx, Ex");
@@ -91,12 +91,12 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             if (vex.l) {
                 GETEY();
                 LD(x3, wback, fixedaddress);
-                SD(x3, gyback, gyoffset);
+                SD(x3, gback, gyoffset);
                 LD(x3, wback, fixedaddress + 8);
-                SD(x3, gyback, gyoffset + 8);
+                SD(x3, gback, gyoffset + 8);
             } else {
-                SD(xZR, gyback, gyoffset);
-                SD(xZR, gyback, gyoffset + 8);
+                SD(xZR, gback, gyoffset);
+                SD(xZR, gback, gyoffset + 8);
             }
             break;
         case 0x7E:
@@ -136,9 +136,9 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             SD(x3, wback, fixedaddress + 8);
             if (vex.l) {
                 GETEY();
-                LD(x3, gyback, gyoffset);
+                LD(x3, gback, gyoffset);
                 SD(x3, wback, fixedaddress);
-                LD(x3, gyback, gyoffset + 8);
+                LD(x3, gback, gyoffset + 8);
                 SD(x3, wback, fixedaddress + 8);
             } else if (MODREG) {
                 SD(xZR, wback, fixedaddress);
diff --git a/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c b/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c
new file mode 100644
index 00000000..bb41fbdf
--- /dev/null
+++ b/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c
@@ -0,0 +1,402 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+#include "my_cpuid.h"
+#include "emu/x87emu_private.h"
+#include "emu/x64shaext.h"
+
+#include "rv64_printer.h"
+#include "dynarec_rv64_private.h"
+#include "dynarec_rv64_functions.h"
+#include "../dynarec_helper.h"
+
+uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) // emit RV64 code for one AVX.66.0F38 opcode
+{
+    (void)ip;
+    (void)need_epilog;
+
+    uint8_t opcode = F8;
+    uint8_t nextop, u8;
+    uint8_t gd, ed, vd;
+    uint8_t wback, wb1, wb2, gback, vback;
+    uint8_t eb1, eb2, gb1, gb2;
+    int32_t i32, i32_;
+    int cacheupd = 0;
+    int v0, v1, v2;
+    int q0, q1, q2;
+    int d0, d1, d2;
+    int s0;
+    uint64_t tmp64u, u64;
+    int64_t j64;
+    int64_t fixedaddress, gdoffset, vxoffset, gyoffset, vyoffset;
+    int unscaled;
+
+    rex_t rex = vex.rex;
+
+    switch (opcode) {
+        case 0x00:
+            INST_NAME("VPSHUFB Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 31 : 15);
+            GETGX();
+            GETVX();
+            GETGY();
+            GETVY();
+
+            if (gd == vex.v) {
+                ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); // Vx aliases Gx: snapshot Vx before Gx is written
+                LD(x3, vback, vxoffset + 0);
+                LD(x4, vback, vxoffset + 8);
+                SD(x3, x5, 0);
+                SD(x4, x5, 8);
+                vback = x5;
+                vxoffset = 0;
+            }
+
+            for (int i = 0; i < 16; ++i) {
+                LBU(x3, wback, fixedaddress + i);
+                ANDI(x4, x3, 128);
+                BEQZ(x4, 4 + 4 * 2); // selector bit7 clear -> skip zeroing, do the lookup
+                SB(xZR, gback, gdoffset + i);
+                J(4 + 4 * 4); // continue
+                ANDI(x4, x3, 15);
+                ADD(x4, x4, vback);
+                LBU(x4, x4, vxoffset);
+                SB(x4, gback, gdoffset + i);
+            }
+
+            if (vex.l) {
+                GETEY();
+                if (gd == vex.v) {
+                    LD(x3, xEmu, vyoffset + 0); // read Vy via xEmu: vback was redirected to scratch above
+                    LD(x4, xEmu, vyoffset + 8);
+                    SD(x3, x5, 0);
+                    SD(x4, x5, 8);
+                    vback = x5;
+                    vyoffset = 0;
+                }
+                for (int i = 0; i < 16; ++i) {
+                    LBU(x3, wback, fixedaddress + i);
+                    ANDI(x4, x3, 128);
+                    BEQZ(x4, 4 + 4 * 2);
+                    SB(xZR, gback, gyoffset + i); // upper-lane result goes to Gy, not Gx
+                    J(4 + 4 * 4); // continue
+                    ANDI(x4, x3, 15);
+                    ADD(x4, x4, vback);
+                    LBU(x4, x4, vyoffset); // lookup in the Vy (upper) lane
+                    SB(x4, gback, gyoffset + i);
+                }
+            } else {
+                SD(xZR, gback, gyoffset + 0); // VEX.128: zero the upper 128 bits
+                SD(xZR, gback, gyoffset + 8);
+            }
+            break;
+        case 0x01:
+            INST_NAME("VPHADDW Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 46 : 14);
+            GETGX();
+            GETVX();
+            GETGY();
+            GETVY();
+            if (gd == ed) {
+                ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); // Ex aliases Gx: snapshot Ex before Gx is written
+                LD(x3, wback, fixedaddress + 0);
+                LD(x4, wback, fixedaddress + 8);
+                SD(x3, x5, 0);
+                SD(x4, x5, 8);
+                wback = x5;
+                fixedaddress = 0;
+            }
+            for (int i = 0; i < 4; ++i) {
+                // GX->sw[i] = VX->sw[i*2+0]+VX->sw[i*2+1];
+                LH(x3, vback, vxoffset + 2 * (i * 2 + 0));
+                LH(x4, vback, vxoffset + 2 * (i * 2 + 1));
+                ADDW(x3, x3, x4);
+                SH(x3, gback, gdoffset + 2 * i);
+            }
+            if (MODREG && ed == vex.v) {
+                // GX->q[1] = GX->q[0];
+                LD(x3, gback, gdoffset + 0);
+                SD(x3, gback, gdoffset + 8);
+            } else {
+                for (int i = 0; i < 4; ++i) {
+                    // GX->sw[4+i] = EX->sw[i*2+0]+EX->sw[i*2+1];
+                    LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
+                    LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
+                    ADDW(x3, x3, x4);
+                    SH(x3, gback, gdoffset + 2 * (4 + i));
+                }
+            }
+            if (vex.l) {
+                GETEY();
+                if (gd == ed) {
+                    ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); // Ey aliases Gy: snapshot Ey first
+                    LD(x3, wback, fixedaddress + 0);
+                    LD(x4, wback, fixedaddress + 8);
+                    SD(x3, x5, 0);
+                    SD(x4, x5, 8);
+                    wback = x5;
+                    fixedaddress = 0;
+                }
+                for (int i = 0; i < 4; ++i) {
+                    // GY->sw[i] = VY->sw[i*2+0]+VY->sw[i*2+1];
+                    LH(x3, vback, vyoffset + 2 * (i * 2 + 0));
+                    LH(x4, vback, vyoffset + 2 * (i * 2 + 1));
+                    ADDW(x3, x3, x4);
+                    SH(x3, gback, gyoffset + 2 * i);
+                }
+                if (MODREG && ed == vex.v) {
+                    // GY->q[1] = GY->q[0];
+                    LD(x3, gback, gyoffset + 0);
+                    SD(x3, gback, gyoffset + 8);
+                } else {
+                    for (int i = 0; i < 4; ++i) {
+                        // GY->sw[4+i] = EY->sw[i*2+0]+EY->sw[i*2+1];
+                        LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
+                        LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
+                        ADDW(x3, x3, x4);
+                        SH(x3, gback, gyoffset + 2 * (4 + i));
+                    }
+                }
+            } else {
+                SD(xZR, gback, gyoffset + 0); // VEX.128: zero the upper 128 bits
+                SD(xZR, gback, gyoffset + 8);
+            }
+            break;
+        case 0x02:
+            INST_NAME("VPHADDD Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 44 : 12);
+            GETGX();
+            GETVX();
+            GETGY();
+            GETVY();
+            if (gd == ed) {
+                ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); // Ex aliases Gx: snapshot Ex before Gx is written
+                LD(x3, wback, fixedaddress + 0);
+                LD(x4, wback, fixedaddress + 8);
+                SD(x3, x5, 0);
+                SD(x4, x5, 8);
+                wback = x5;
+                fixedaddress = 0;
+            }
+            for (int i = 0; i < 2; ++i) {
+                // GX->sd[i] = VX->sd[i*2+0]+VX->sd[i*2+1];
+                LW(x3, vback, vxoffset + 4 * (i * 2 + 0));
+                LW(x4, vback, vxoffset + 4 * (i * 2 + 1));
+                ADDW(x3, x3, x4);
+                SW(x3, gback, gdoffset + 4 * i);
+            }
+            if (MODREG && ed == vex.v) {
+                // GX->q[1] = GX->q[0];
+                LD(x3, gback, gdoffset + 0);
+                SD(x3, gback, gdoffset + 8);
+            } else {
+                for (int i = 0; i < 2; ++i) {
+                    // GX->sd[2+i] = EX->sd[i*2+0]+EX->sd[i*2+1];
+                    LW(x3, wback, fixedaddress + 4 * (i * 2 + 0));
+                    LW(x4, wback, fixedaddress + 4 * (i * 2 + 1));
+                    ADDW(x3, x3, x4);
+                    SW(x3, gback, gdoffset + 4 * (2 + i));
+                }
+            }
+            if (vex.l) {
+                GETEY();
+                if (gd == ed) {
+                    ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); // Ey aliases Gy: snapshot Ey first
+                    LD(x3, wback, fixedaddress + 0);
+                    LD(x4, wback, fixedaddress + 8);
+                    SD(x3, x5, 0);
+                    SD(x4, x5, 8);
+                    wback = x5;
+                    fixedaddress = 0;
+                }
+                for (int i = 0; i < 2; ++i) {
+                    // GY->sd[i] = VY->sd[i*2+0]+VY->sd[i*2+1];
+                    LW(x3, vback, vyoffset + 4 * (i * 2 + 0));
+                    LW(x4, vback, vyoffset + 4 * (i * 2 + 1));
+                    ADDW(x3, x3, x4);
+                    SW(x3, gback, gyoffset + 4 * i);
+                }
+                if (MODREG && ed == vex.v) {
+                    // GY->q[1] = GY->q[0];
+                    LD(x3, gback, gyoffset + 0);
+                    SD(x3, gback, gyoffset + 8);
+                } else {
+                    for (int i = 0; i < 2; ++i) { // 2 dwords per source lane; i<4 would overrun the 16-byte ymm slot
+                        // GY->sd[2+i] = EY->sd[i*2+0]+EY->sd[i*2+1];
+                        LW(x3, wback, fixedaddress + 4 * (i * 2 + 0));
+                        LW(x4, wback, fixedaddress + 4 * (i * 2 + 1));
+                        ADDW(x3, x3, x4);
+                        SW(x3, gback, gyoffset + 4 * (2 + i));
+                    }
+                }
+            } else {
+                SD(xZR, gback, gyoffset + 0); // VEX.128: zero the upper 128 bits
+                SD(xZR, gback, gyoffset + 8);
+            }
+            break;
+        case 0x05:
+            INST_NAME("VPHSUBW Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 46 : 14);
+            GETGX();
+            GETVX();
+            GETGY();
+            GETVY();
+            if (gd == ed) {
+                ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); // Ex aliases Gx: snapshot Ex before Gx is written
+                LD(x3, wback, fixedaddress + 0);
+                LD(x4, wback, fixedaddress + 8);
+                SD(x3, x5, 0);
+                SD(x4, x5, 8);
+                wback = x5;
+                fixedaddress = 0;
+            }
+            for (int i = 0; i < 4; ++i) {
+                // GX->sw[i] = VX->sw[i*2+0]-VX->sw[i*2+1];
+                LH(x3, vback, vxoffset + 2 * (i * 2 + 0));
+                LH(x4, vback, vxoffset + 2 * (i * 2 + 1));
+                SUBW(x3, x3, x4);
+                SH(x3, gback, gdoffset + 2 * i);
+            }
+            if (MODREG && ed == vex.v) {
+                // GX->q[1] = GX->q[0];
+                LD(x3, gback, gdoffset + 0);
+                SD(x3, gback, gdoffset + 8);
+            } else {
+                for (int i = 0; i < 4; ++i) {
+                    // GX->sw[4+i] = EX->sw[i*2+0]-EX->sw[i*2+1];
+                    LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
+                    LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
+                    SUBW(x3, x3, x4);
+                    SH(x3, gback, gdoffset + 2 * (4 + i));
+                }
+            }
+            if (vex.l) {
+                GETEY();
+                if (gd == ed) {
+                    ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); // Ey aliases Gy: snapshot Ey first
+                    LD(x3, wback, fixedaddress + 0);
+                    LD(x4, wback, fixedaddress + 8);
+                    SD(x3, x5, 0);
+                    SD(x4, x5, 8);
+                    wback = x5;
+                    fixedaddress = 0;
+                }
+                for (int i = 0; i < 4; ++i) {
+                    // GY->sw[i] = VY->sw[i*2+0]-VY->sw[i*2+1];
+                    LH(x3, vback, vyoffset + 2 * (i * 2 + 0));
+                    LH(x4, vback, vyoffset + 2 * (i * 2 + 1));
+                    SUBW(x3, x3, x4);
+                    SH(x3, gback, gyoffset + 2 * i);
+                }
+                if (MODREG && ed == vex.v) {
+                    // GY->q[1] = GY->q[0];
+                    LD(x3, gback, gyoffset + 0);
+                    SD(x3, gback, gyoffset + 8);
+                } else {
+                    for (int i = 0; i < 4; ++i) {
+                        // GY->sw[4+i] = EY->sw[i*2+0]-EY->sw[i*2+1];
+                        LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
+                        LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
+                        SUBW(x3, x3, x4);
+                        SH(x3, gback, gyoffset + 2 * (4 + i));
+                    }
+                }
+            } else {
+                SD(xZR, gback, gyoffset + 0); // VEX.128: zero the upper 128 bits
+                SD(xZR, gback, gyoffset + 8);
+            }
+            break;
+        case 0x06:
+            INST_NAME("VPHSUBD Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 44 : 12);
+            GETGX();
+            GETVX();
+            GETGY();
+            GETVY();
+            if (gd == ed) {
+                ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); // Ex aliases Gx: snapshot Ex before Gx is written
+                LD(x3, wback, fixedaddress + 0);
+                LD(x4, wback, fixedaddress + 8);
+                SD(x3, x5, 0);
+                SD(x4, x5, 8);
+                wback = x5;
+                fixedaddress = 0;
+            }
+            for (int i = 0; i < 2; ++i) {
+                // GX->sd[i] = VX->sd[i*2+0]-VX->sd[i*2+1];
+                LW(x3, vback, vxoffset + 4 * (i * 2 + 0));
+                LW(x4, vback, vxoffset + 4 * (i * 2 + 1));
+                SUBW(x3, x3, x4);
+                SW(x3, gback, gdoffset + 4 * i);
+            }
+            if (MODREG && ed == vex.v) {
+                // GX->q[1] = GX->q[0];
+                LD(x3, gback, gdoffset + 0);
+                SD(x3, gback, gdoffset + 8);
+            } else {
+                for (int i = 0; i < 2; ++i) {
+                    // GX->sd[2+i] = EX->sd[i*2+0]-EX->sd[i*2+1];
+                    LW(x3, wback, fixedaddress + 4 * (i * 2 + 0));
+                    LW(x4, wback, fixedaddress + 4 * (i * 2 + 1));
+                    SUBW(x3, x3, x4);
+                    SW(x3, gback, gdoffset + 4 * (2 + i));
+                }
+            }
+            if (vex.l) {
+                GETEY();
+                if (gd == ed) {
+                    ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); // Ey aliases Gy: snapshot Ey first
+                    LD(x3, wback, fixedaddress + 0);
+                    LD(x4, wback, fixedaddress + 8);
+                    SD(x3, x5, 0);
+                    SD(x4, x5, 8);
+                    wback = x5;
+                    fixedaddress = 0;
+                }
+                for (int i = 0; i < 2; ++i) {
+                    // GY->sd[i] = VY->sd[i*2+0]-VY->sd[i*2+1];
+                    LW(x3, vback, vyoffset + 4 * (i * 2 + 0));
+                    LW(x4, vback, vyoffset + 4 * (i * 2 + 1));
+                    SUBW(x3, x3, x4);
+                    SW(x3, gback, gyoffset + 4 * i);
+                }
+                if (MODREG && ed == vex.v) {
+                    // GY->q[1] = GY->q[0];
+                    LD(x3, gback, gyoffset + 0);
+                    SD(x3, gback, gyoffset + 8);
+                } else {
+                    for (int i = 0; i < 2; ++i) { // 2 dwords per source lane; i<4 would overrun the 16-byte ymm slot
+                        // GY->sd[2+i] = EY->sd[i*2+0]-EY->sd[i*2+1];
+                        LW(x3, wback, fixedaddress + 4 * (i * 2 + 0));
+                        LW(x4, wback, fixedaddress + 4 * (i * 2 + 1));
+                        SUBW(x3, x3, x4);
+                        SW(x3, gback, gyoffset + 4 * (2 + i));
+                    }
+                }
+            } else {
+                SD(xZR, gback, gyoffset + 0); // VEX.128: zero the upper 128 bits
+                SD(xZR, gback, gyoffset + 8);
+            }
+            break;
+        default:
+            DEFAULT;
+    }
+    return addr;
+}
diff --git a/src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c b/src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c
index 05bad3e7..6dbe2d37 100644
--- a/src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c
+++ b/src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c
@@ -30,7 +30,7 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
     uint8_t opcode = F8;
     uint8_t nextop, u8;
     uint8_t gd, ed, vd;
-    uint8_t wback, wb1, wb2, gback, vback, gyback;
+    uint8_t wback, wb1, wb2, gback, vback;
     uint8_t eb1, eb2, gb1, gb2;
     int32_t i32, i32_;
     int cacheupd = 0;
@@ -66,8 +66,8 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 SD(xZR, gback, gdoffset + 8);
             }
             GETGY();
-            SD(xZR, gyback, gyoffset);
-            SD(xZR, gyback, gyoffset + 8);
+            SD(xZR, gback, gyoffset);
+            SD(xZR, gback, gyoffset + 8);
             break;
         case 0x11:
             INST_NAME("VMOVSS Ex, [Vx,] Gx");
@@ -102,8 +102,8 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 LD(x2, vback, vxoffset + 8);
                 SD(x2, gback, gdoffset + 8);
             }
-            SD(xZR, gyback, gyoffset);
-            SD(xZR, gyback, gyoffset + 8);
+            SD(xZR, gback, gyoffset);
+            SD(xZR, gback, gyoffset + 8);
             break;
         case 0x5D:
             INST_NAME("VMINSS Gx, Vx, Ex");
@@ -132,8 +132,8 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 LD(x2, vback, vxoffset + 8);
                 SD(x2, gback, gdoffset + 8);
             }
-            SD(xZR, gyback, gyoffset);
-            SD(xZR, gyback, gyoffset + 8);
+            SD(xZR, gback, gyoffset);
+            SD(xZR, gback, gyoffset + 8);
             break;
         case 0x5F:
             INST_NAME("VMAXSS Gx, Vx, Ex");
@@ -162,8 +162,8 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 LD(x2, vback, vxoffset + 8);
                 SD(x2, gback, gdoffset + 8);
             }
-            SD(xZR, gyback, gyoffset);
-            SD(xZR, gyback, gyoffset + 8);
+            SD(xZR, gback, gyoffset);
+            SD(xZR, gback, gyoffset + 8);
             break;
         case 0xC2:
             INST_NAME("VCMPSS Gx, Vx, Ex, Ib");
@@ -227,8 +227,8 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 LD(x2, vback, vxoffset + 8);
                 SD(x2, gback, gdoffset + 8);
             }
-            SD(xZR, gyback, gyoffset);
-            SD(xZR, gyback, gyoffset + 8);
+            SD(xZR, gback, gyoffset);
+            SD(xZR, gback, gyoffset + 8);
             break;
         default:
             DEFAULT;
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 7fdcad6b..973f21fa 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -460,7 +460,6 @@
 #define GETGY()                                 \
     gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \
     /* TODO: forget */                          \
-    gyback = xEmu;                              \
     gyoffset = offsetof(x64emu_t, ymm[gd])
 
 #define GETVX()                            \
@@ -468,6 +467,10 @@
     vback = xEmu;                          \
     vxoffset = offsetof(x64emu_t, xmm[vex.v])
 
+#define GETVY()        \
+    /* TODO: forget */ \
+    vyoffset = offsetof(x64emu_t, ymm[vex.v]);
+
 // Get Ex address in general register a, will purge SS or SD if it's reg and is loaded. May use x3. Use wback as load address!
 #define GETEX(a, D, I12)                                                                         \
     if (MODREG) {                                                                                \
@@ -1286,10 +1289,11 @@
 #define dynarec64_F20F_vector STEPNAME(dynarec64_F20F_vector)
 #define dynarec64_F30F_vector STEPNAME(dynarec64_F30F_vector)
 
-#define dynarec64_AVX       STEPNAME(dynarec64_AVX)
-#define dynarec64_AVX_0F    STEPNAME(dynarec64_AVX_0F)
-#define dynarec64_AVX_66_0F STEPNAME(dynarec64_AVX_66_0F)
-#define dynarec64_AVX_F3_0F STEPNAME(dynarec64_AVX_F3_0F)
+#define dynarec64_AVX         STEPNAME(dynarec64_AVX)
+#define dynarec64_AVX_0F      STEPNAME(dynarec64_AVX_0F)
+#define dynarec64_AVX_66_0F   STEPNAME(dynarec64_AVX_66_0F)
+#define dynarec64_AVX_66_0F38 STEPNAME(dynarec64_AVX_66_0F38)
+#define dynarec64_AVX_F3_0F   STEPNAME(dynarec64_AVX_F3_0F)
 
 #define geted               STEPNAME(geted)
 #define geted32             STEPNAME(geted32)
@@ -1740,6 +1744,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
 uintptr_t dynarec64_AVX(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 uintptr_t dynarec64_AVX_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
+uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 
 #if STEP < 2