author     Yang Liu <liuyang22@iscas.ac.cn>          2024-10-24 22:09:23 +0800
committer  GitHub <noreply@github.com>               2024-10-24 16:09:23 +0200
commit     b3bc09f898ed380811efb943b3cc23924547c332 (patch)
tree       8630527b247ab796ecf78d26d86e1398c25dbc2f /src
parent     320459c2c6324c0934b5faa640e3e656a4bf9e1e (diff)
[RV64_DYNAREC] Added 1 more 66 0F opcode for vector (#1956)
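The new opcode is 66 0F 7C, HADDPD Gx, Ex (horizontal add of packed doubles), implemented in dynarec_rv64_660f_vector.c with a slide-up/compress/add sequence, plus the usual NaN-sign fixup when fastnan is disabled. The patch also introduces a VECTOR_LOAD_VMASK convenience macro in dynarec_rv64_helper.h, converts the existing vector_loadmask call sites to it, drops a few redundant VXOR_VV zeroing instructions ahead of VMV_V_V, and adds a 0b0101 fast path to vector_loadmask.

As a point of reference for what case 0x7C emulates, a minimal scalar model of HADDPD follows; it is illustrative only, not box64 code, and the names are made up:

/* HADDPD Gx, Ex: each destination lane receives the horizontal
   sum of one source operand. */
typedef struct { double d[2]; } xmm_t;

void haddpd(xmm_t* gx, const xmm_t* ex)
{
    double lo = gx->d[0] + gx->d[1]; /* low lane:  sum of Gx's lanes */
    double hi = ex->d[0] + ex->d[1]; /* high lane: sum of Ex's lanes */
    gx->d[0] = lo;
    gx->d[1] = hi;
}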
Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_0f_vector.c      8
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_660f_vector.c   77
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_f20f_vector.c   28
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_f30f_vector.c   30
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_helper.c         5
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_helper.h         3
6 files changed, 93 insertions, 58 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_0f_vector.c b/src/dynarec/rv64/dynarec_rv64_0f_vector.c
index 268023c9..e01ed34b 100644
--- a/src/dynarec/rv64/dynarec_rv64_0f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_0f_vector.c
@@ -104,7 +104,7 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 q0 = fpu_get_scratch(dyn);
                 VSLIDEDOWN_VI(q0, v1, 1, VECTOR_UNMASKED);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v0, q0); // implies VMASK
                 } else {
                     VMV_X_S(x4, q0);
@@ -116,7 +116,7 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1); // unaligned!
                 GETGX_vector(v0, 1, VECTOR_SEW8);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
             }
             break;
@@ -140,7 +140,7 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
                 v1 = fpu_get_scratch(dyn);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 VSLIDEUP_VI(v0, v1, 8, VECTOR_UNMASKED);
             }
@@ -156,7 +156,7 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 q0 = fpu_get_scratch(dyn);
                 VSLIDE1DOWN_VX(q0, v0, xZR, VECTOR_UNMASKED);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v1, v1, q0); // implies VMASK
                 } else {
                     VMV_X_S(x4, q0);
diff --git a/src/dynarec/rv64/dynarec_rv64_660f_vector.c b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
index b27f4135..3327adf8 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
@@ -99,7 +99,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             } else {
                 q0 = fpu_get_scratch(dyn);
                 VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
-                vector_loadmask(dyn, ninst, VMASK, 0b10, x1, 1);
+                VECTOR_LOAD_VMASK(0b10, x1, 1);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
                 VLUXEI64_V(v0, q0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
@@ -117,7 +117,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 q0 = fpu_get_scratch(dyn);
                 VSLIDE1DOWN_VX(q0, v0, xZR, VECTOR_UNMASKED);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v1, q0); // implies VMASK
                 } else {
                     if (v0 != v1) { VMV_V_V(v0, v1); }
@@ -126,7 +126,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 }
             } else {
                 q0 = fpu_get_scratch(dyn);
-                vector_loadmask(dyn, ninst, VMASK, 0b10, x1, 1);
+                VECTOR_LOAD_VMASK(0b10, x1, 1);
                 VSLIDE1DOWN_VX(v0, v0, xZR, VECTOR_UNMASKED);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
@@ -197,7 +197,6 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-                    VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
                     VMV_V_V(v0, q0);
                     if (q1 & 1) VMV_V_V(d1, q1);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL2, 2);
@@ -216,7 +215,6 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-                    VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
                     VMV_V_V(v0, q0);
                     if (q1 & 1) VMV_V_V(d1, q1);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL2, 2);
@@ -236,7 +234,6 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-                    VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
                     VMV_V_V(v0, q0);
                     if (q1 & 1) VMV_V_V(d1, q1);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL2, 2);
@@ -257,7 +254,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); // no more scratches!
                     VWMULSU_VV(v0, q1, q0, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL2, 2);
-                    vector_loadmask(dyn, ninst, VMASK, 0b0101010101010101, x4, 2);
+                    VECTOR_LOAD_VMASK(0b0101010101010101, x4, 2);
                     VCOMPRESS_VM(d0, v0, VMASK);
                     VXOR_VI(VMASK, VMASK, 0x1F, VECTOR_UNMASKED);
                     VCOMPRESS_VM(d1, v0, VMASK);
@@ -273,7 +270,6 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-                    VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
                     VMV_V_V(v0, q0);
                     if (q1 & 1) VMV_V_V(d1, q1);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL2, 2);
@@ -671,7 +667,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     GETGX_vector(q0, 1, VECTOR_SEW16);
                     GETEX_vector(q1, 0, 0, VECTOR_SEW16);
                     u8 = F8;
-                    vector_loadmask(dyn, ninst, VMASK, u8, x4, 1);
+                    VECTOR_LOAD_VMASK(u8, x4, 1);
                     VADD_VI(q0, q1, 0, VECTOR_MASKED);
                     break;
                 case 0x0F:
@@ -911,7 +907,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             INST_NAME("PUNPCKLBW Gx, Ex");
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
-            vector_loadmask(dyn, ninst, VMASK, 0b1010101010101010, x1, 1);
+            VECTOR_LOAD_VMASK(0b1010101010101010, x1, 1);
             v0 = fpu_get_scratch(dyn);
             VIOTA_M(v0, VMASK, VECTOR_UNMASKED); // v0 = 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0
             GETGX_vector(q0, 1, VECTOR_SEW8);
@@ -926,7 +922,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             INST_NAME("PUNPCKLWD Gx, Ex");
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
-            vector_loadmask(dyn, ninst, VMASK, 0b10101010, x1, 1);
+            VECTOR_LOAD_VMASK(0b10101010, x1, 1);
             v0 = fpu_get_scratch(dyn);
             VIOTA_M(v0, VMASK, VECTOR_UNMASKED); // v0 = 3 3 2 2 1 1 0 0
             GETGX_vector(q0, 1, VECTOR_SEW16);
@@ -941,7 +937,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             INST_NAME("PUNPCKLDQ Gx, Ex");
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
-            vector_loadmask(dyn, ninst, VMASK, 0b1010, x1, 1);
+            VECTOR_LOAD_VMASK(0b1010, x1, 1);
             v0 = fpu_get_scratch(dyn);
             VIOTA_M(v0, VMASK, VECTOR_UNMASKED); // v0 = 1 1 0 0
             GETGX_vector(q0, 1, VECTOR_SEW32);
@@ -1021,7 +1017,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 INST_NAME("PUNPCKHBW Gx, Ex");
                 nextop = F8;
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
-                vector_loadmask(dyn, ninst, VMASK, 0b1010101010101010, x1, 1);
+                VECTOR_LOAD_VMASK(0b1010101010101010, x1, 1);
                 v0 = fpu_get_scratch(dyn);
                 VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
                 VADD_VI(v0, v0, 8, VECTOR_UNMASKED); // v0 = 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8
@@ -1029,7 +1025,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 INST_NAME("PUNPCKHWD Gx, Ex");
                 nextop = F8;
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
-                vector_loadmask(dyn, ninst, VMASK, 0b10101010, x1, 1);
+                VECTOR_LOAD_VMASK(0b10101010, x1, 1);
                 v0 = fpu_get_scratch(dyn);
                 VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
                 VADD_VI(v0, v0, 4, VECTOR_UNMASKED); // v0 = 7 7 6 6 5 5 4 4
@@ -1037,7 +1033,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 INST_NAME("PUNPCKHDQ Gx, Ex");
                 nextop = F8;
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
-                vector_loadmask(dyn, ninst, VMASK, 0b1010, x1, 1);
+                VECTOR_LOAD_VMASK(0b1010, x1, 1);
                 v0 = fpu_get_scratch(dyn);
                 VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
                 VADD_VI(v0, v0, 2, VECTOR_UNMASKED); // v0 = 3 3 2 2
@@ -1087,7 +1083,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             } else {
                 q0 = fpu_get_scratch(dyn);
                 VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
-                vector_loadmask(dyn, ninst, VMASK, 0b10, x1, 1);
+                VECTOR_LOAD_VMASK(0b10, x1, 1);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
                 VLUXEI64_V(v0, q0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
@@ -1105,7 +1101,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 q0 = fpu_get_scratch(dyn);
                 VSLIDE1DOWN_VX(q0, v0, xZR, VECTOR_UNMASKED);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v1, q0); // implies VMASK
                 } else {
                     if (v0 != v1) { VMV_V_V(v0, v1); }
@@ -1114,7 +1110,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 }
             } else {
                 q0 = fpu_get_scratch(dyn);
-                vector_loadmask(dyn, ninst, VMASK, 0b10, x1, 1);
+                VECTOR_LOAD_VMASK(0b10, x1, 1);
                 VSLIDE1DOWN_VX(v0, v0, xZR, VECTOR_UNMASKED);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
@@ -1132,7 +1128,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 SET_ELEMENT_WIDTH(x3, VECTOR_SEW32, 1);
             }
             VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
-            vector_loadmask(dyn, ninst, VMASK, 1, x4, 1);
+            VECTOR_LOAD_VMASK(1, x4, 1);
             VMERGE_VXM(v0, v0, ed);
             break;
         case 0x6F:
@@ -1389,6 +1385,37 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             VMERGE_VIM(q0, q0, 1); // implies vmask and widened it
             VRSUB_VX(q0, q0, xZR, VECTOR_UNMASKED);
             break;
+        case 0x7C:
+            INST_NAME("HADDPD Gx, Ex");
+            nextop = F8;
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
+            GETGX_vector(q0, 1, VECTOR_SEW64);
+            GETEX_vector(q1, 0, 0, VECTOR_SEW64);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
+            v1 = fpu_get_scratch(dyn);
+            d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
+            d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); // no more scratches!
+            VMV_V_V(v0, q0);
+            if (q1 & 1) VMV_V_V(d1, q1);
+            VMV_V_I(VMASK, rv64_xtheadvector ? 1 : 0b0101);
+            vector_vsetvli(dyn, ninst, x1, VECTOR_SEW64, VECTOR_LMUL2, 2);
+            VSLIDEUP_VI(v0, (q1 & 1) ? d1 : q1, 2, VECTOR_UNMASKED);
+            VCOMPRESS_VM(d0, v0, VMASK);
+            VXOR_VI(VMASK, VMASK, 0x1F, VECTOR_UNMASKED);
+            VCOMPRESS_VM(d1, v0, VMASK);
+            vector_vsetvli(dyn, ninst, x1, VECTOR_SEW64, VECTOR_LMUL1, 1);
+            if (!box64_dynarec_fastnan) {
+                VMFEQ_VV(v0, d0, d0, VECTOR_UNMASKED);
+                VMFEQ_VV(v1, d1, d1, VECTOR_UNMASKED);
+                VMAND_MM(v0, v0, v1);
+            }
+            VFADD_VV(q0, d0, d1, VECTOR_UNMASKED);
+            if (!box64_dynarec_fastnan) {
+                VMFEQ_VV(v1, q0, q0, VECTOR_UNMASKED);
+                VMANDN_MM(VMASK, v0, v1);
+                VFSGNJN_VV(q0, q0, q0, VECTOR_MASKED);
+            }
+            break;
         case 0x7E:
             return 0;
         case 0x7F:
@@ -1424,7 +1451,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 LHU(x4, ed, fixedaddress);
                 ed = x4;
             }
-            vector_loadmask(dyn, ninst, VMASK, (1 << u8), x5, 1);
+            VECTOR_LOAD_VMASK((1 << u8), x5, 1);
             v0 = fpu_get_scratch(dyn);
             VMERGE_VXM(v0, q0, ed); // uses VMASK
             VMV_V_V(q0, v0);
@@ -1470,7 +1497,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (MODREG) {
                 q1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
             } else {
-                vector_loadmask(dyn, ninst, VMASK, 1, x1, 1);
+                VECTOR_LOAD_VMASK(1, x1, 1);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
                 q1 = fpu_get_scratch(dyn);
@@ -1624,7 +1651,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
             GETGX_vector(q0, 1, VECTOR_SEW64);
-            vector_loadmask(dyn, ninst, VMASK, 1, x1, 1);
+            VECTOR_LOAD_VMASK(1, x1, 1);
             if (MODREG) {
                 q1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
             } else {
@@ -1645,7 +1672,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
             GETGX_vector(q0, 1, VECTOR_SEW64);
-            vector_loadmask(dyn, ninst, VMASK, 1, x1, 1);
+            VECTOR_LOAD_VMASK(1, x1, 1);
             if (MODREG) {
                 q1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
             } else {
@@ -1789,7 +1816,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (MODREG) {
                 q1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
             } else {
-                vector_loadmask(dyn, ninst, VMASK, 1, x1, 1);
+                VECTOR_LOAD_VMASK(1, x1, 1);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
                 q1 = fpu_get_scratch(dyn);
@@ -1857,7 +1884,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             VSRA_VI(v1, v0, 15, VECTOR_UNMASKED);
             VXOR_VV(v0, v1, v0, VECTOR_UNMASKED);
             VSUB_VV(v1, v0, v1, VECTOR_UNMASKED);
-            vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 2);
+            VECTOR_LOAD_VMASK(0xFF, x4, 2);
             VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
             VREDSUM_VS(v0, v1, v0, VECTOR_MASKED); // sum low 64
             VSLIDEDOWN_VI(d0, v1, 8, VECTOR_UNMASKED);
diff --git a/src/dynarec/rv64/dynarec_rv64_f20f_vector.c b/src/dynarec/rv64/dynarec_rv64_f20f_vector.c
index 263f3030..120e1281 100644
--- a/src/dynarec/rv64/dynarec_rv64_f20f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_f20f_vector.c
@@ -56,7 +56,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v0 = sse_get_reg_vector(dyn, ninst, x1, gd, 1, VECTOR_SEW64);
                 v1 = sse_get_reg_vector(dyn, ninst, x1, ed, 0, VECTOR_SEW64);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v0, v1); // implies VMASK
                 } else {
                     VMV_X_S(x4, v1);
@@ -68,7 +68,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd);
                 d0 = fpu_get_scratch(dyn);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(d0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
                 VMERGE_VVM(v0, v0, d0); // implies VMASK
@@ -84,7 +84,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 ed = (nextop & 7) + (rex.b << 3);
                 d0 = sse_get_reg_vector(dyn, ninst, x1, ed, 1, VECTOR_SEW64);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v0, v1); // implies VMASK
                 } else {
                     VMV_X_S(x4, v1);
@@ -115,7 +115,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (rv64_xtheadvector) {
                 v1 = fpu_get_scratch(dyn);
                 VFMV_S_F(v1, v0);
-                vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                VECTOR_LOAD_VMASK(0b01, x4, 1);
                 VMERGE_VVM(v0, v0, v1); // implies VMASK
             } else {
                 VFMV_S_F(v0, v0);
@@ -133,7 +133,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v0 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
             }
@@ -168,7 +168,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v0 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
             }
@@ -209,13 +209,13 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                 GETGX_vector(v0, 1, VECTOR_SEW64);
             }
             if (box64_dynarec_fastnan) {
-                vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                VECTOR_LOAD_VMASK(0b01, x4, 1);
                 VFMUL_VV(v0, v0, v1, VECTOR_MASKED);
             } else {
                 VFMV_F_S(v0, v0);
@@ -232,7 +232,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 if (rv64_xtheadvector) {
                     d0 = fpu_get_scratch(dyn);
                     VFMV_S_F(d0, v0);
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v0, d0); // implies VMASK
                 } else {
                     VFMV_S_F(v0, v0);
@@ -251,7 +251,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                 GETGX_vector(v0, 1, VECTOR_SEW64);
@@ -271,13 +271,13 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 if (rv64_xtheadvector) {
                     d0 = fpu_get_scratch(dyn);
                     VFMV_S_F(d0, v0);
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v0, d0); // implies VMASK
                 } else {
                     VFMV_S_F(v0, v0);
                 }
             } else {
-                vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                VECTOR_LOAD_VMASK(0b01, x4, 1);
                 VFDIV_VV(v0, v0, v1, VECTOR_MASKED);
             }
             break;
@@ -293,7 +293,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 d1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 1);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(d1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                 GETGX_vector(d0, 1, VECTOR_SEW64);
@@ -346,7 +346,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (rv64_xtheadvector) {
                 v0 = fpu_get_scratch(dyn);
                 VMV_S_X(v0, x2);
-                vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                VECTOR_LOAD_VMASK(0b01, x4, 1);
                 VMERGE_VVM(d0, d0, v0); // implies VMASK
             } else {
                 VMV_S_X(d0, x2);
diff --git a/src/dynarec/rv64/dynarec_rv64_f30f_vector.c b/src/dynarec/rv64/dynarec_rv64_f30f_vector.c
index 46c3db2d..3dda70b1 100644
--- a/src/dynarec/rv64/dynarec_rv64_f30f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_f30f_vector.c
@@ -58,7 +58,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v0 = sse_get_reg_vector(dyn, ninst, x1, gd, 1, VECTOR_SEW32);
                 v1 = sse_get_reg_vector(dyn, ninst, x1, ed, 0, VECTOR_SEW32);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+                    VECTOR_LOAD_VMASK(0b0001, x4, 1);
                     VMERGE_VVM(v0, v0, v1); // implies VMASK
                 } else {
                     VMV_X_S(x4, v1);
@@ -70,7 +70,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd);
                 d0 = fpu_get_scratch(dyn);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xF, x4, 1);
+                VECTOR_LOAD_VMASK(0xF, x4, 1);
                 VLE8_V(d0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
                 VMERGE_VVM(v0, v0, d0); // implies VMASK
@@ -86,7 +86,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 ed = (nextop & 7) + (rex.b << 3);
                 d0 = sse_get_reg_vector(dyn, ninst, x1, ed, 1, VECTOR_SEW32);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+                    VECTOR_LOAD_VMASK(0b0001, x4, 1);
                     VMERGE_VVM(v0, v0, v1); // implies VMASK
                 } else {
                     VMV_X_S(x4, v1);
@@ -119,7 +119,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (rv64_xtheadvector) {
                 v1 = fpu_get_scratch(dyn);
                 VFMV_S_F(v1, v0);
-                vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+                VECTOR_LOAD_VMASK(0b0001, x4, 1);
                 VMERGE_VVM(v0, v0, v1); // implies VMASK
             } else {
                 VFMV_S_F(v0, v0);
@@ -139,13 +139,13 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                 GETGX_vector(v0, 1, VECTOR_SEW32);
             }
             if (box64_dynarec_fastnan) {
-                vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+                VECTOR_LOAD_VMASK(0b0001, x4, 1);
                 VFMUL_VV(v0, v0, v1, VECTOR_MASKED);
             } else {
                 VFMV_F_S(v0, v0);
@@ -162,7 +162,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 if (rv64_xtheadvector) {
                     d0 = fpu_get_scratch(dyn);
                     VFMV_S_F(d0, v0);
-                    vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+                    VECTOR_LOAD_VMASK(0b0001, x4, 1);
                     VMERGE_VVM(v0, v0, d0); // implies VMASK
                 } else {
                     VFMV_S_F(v0, v0);
@@ -181,13 +181,13 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                 GETGX_vector(v0, 1, VECTOR_SEW32);
             }
             d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-            vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+            VECTOR_LOAD_VMASK(0b0001, x4, 1);
             VFWCVT_F_F_V(d0, v1, VECTOR_MASKED);
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
             if (rv64_xtheadvector) {
@@ -209,7 +209,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                 GETGX_vector(v0, 1, VECTOR_SEW32);
@@ -218,7 +218,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             d1 = fpu_get_scratch(dyn);
             q0 = fpu_get_scratch(dyn);
             q1 = fpu_get_scratch(dyn);
-            vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+            VECTOR_LOAD_VMASK(0b0001, x4, 1);
             VMV_V_V(q1, VMASK);
             VMFEQ_VV(d0, v0, v0, VECTOR_MASKED);
             VMFEQ_VV(d1, v1, v1, VECTOR_MASKED);
@@ -241,7 +241,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                 GETGX_vector(v0, 1, VECTOR_SEW32);
@@ -250,7 +250,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             d1 = fpu_get_scratch(dyn);
             q0 = fpu_get_scratch(dyn);
             q1 = fpu_get_scratch(dyn);
-            vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+            VECTOR_LOAD_VMASK(0b0001, x4, 1);
             VMV_V_V(q1, VMASK);
             VMFEQ_VV(d0, v0, v0, VECTOR_MASKED);
             VMFEQ_VV(d1, v1, v1, VECTOR_MASKED);
@@ -278,7 +278,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 d1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 1);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(d1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                 GETGX_vector(d0, 1, VECTOR_SEW32);
@@ -331,7 +331,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (rv64_xtheadvector) {
                 v0 = fpu_get_scratch(dyn);
                 VMV_S_X(v0, x2);
-                vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+                VECTOR_LOAD_VMASK(0b0001, x4, 1);
                 VMERGE_VVM(d0, d0, v0); // implies VMASK
             } else {
                 VMV_S_X(d0, x2);
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c
index eb239e5c..ea62bc69 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.c
+++ b/src/dynarec/rv64/dynarec_rv64_helper.c
@@ -2711,6 +2711,11 @@ void vector_loadmask(dynarec_rv64_t* dyn, int ninst, int vreg, uint64_t imm, int
                     ADDI(s1, xZR, 1);
                     VMV_S_X(vreg, s1);
                     return;
+                case 0b0101:
+                    vector_vsetvli(dyn, ninst, s1, VECTOR_SEW64, VECTOR_LMUL1, 1);
+                    VMV_V_I(vreg, 1);
+                    vector_vsetvli(dyn, ninst, s1, sew, vlmul, multiple);
+                    return;
                 case 0b1010:
                     vector_vsetvli(dyn, ninst, s1, VECTOR_SEW64, VECTOR_LMUL1, 1);
                     MOV64x(s1, 0x100000000ULL);
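The new 0b0101 case mirrors the existing 0b1010 one: it temporarily switches to SEW64, broadcasts a per-lane constant (here the immediate 1, so no scalar register move is needed, versus 0x100000000 for 0b1010), then restores the caller's vtype. A minimal sketch of the lane values, under the assumption (implied by the 0x100000000 constant in the existing case) that the mask bit of 32-bit element i sits at bit i*32 of the mask register:

#include <assert.h>
#include <stdint.h>

/* Assumed layout (illustrative): the mask bit of 32-bit element i lives at
   bit i*32, so each 64-bit lane carries two mask bits, at bit 0 (even
   element) and bit 32 (odd element). */
static uint64_t lane_value(unsigned mask4, int lane) /* lane is 0 or 1 */
{
    uint64_t v = 0;
    if (mask4 & (1u << (2 * lane)))     v |= 1ull;        /* even element */
    if (mask4 & (1u << (2 * lane + 1))) v |= 1ull << 32;  /* odd element  */
    return v;
}

int main(void)
{
    /* 0b0101: every lane holds 1, so a single VMV_V_I(vreg, 1) covers it */
    assert(lane_value(0b0101, 0) == 1 && lane_value(0b0101, 1) == 1);
    /* 0b1010: every lane holds 0x100000000, matching MOV64x + broadcast */
    assert(lane_value(0b1010, 0) == 0x100000000ull);
    assert(lane_value(0b1010, 1) == 0x100000000ull);
    return 0;
}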
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 12dd594b..d2d84e90 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -1828,4 +1828,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
         }                                               \
     } while (0)
 
+#define VECTOR_LOAD_VMASK(mask, s1, multiple) \
+    vector_loadmask(dyn, ninst, VMASK, mask, s1, multiple)
+
 #endif //__DYNAREC_RV64_HELPER_H__
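The helper.h addition is a thin forwarding macro, so every converted call site above expands back to the original helper. For example:

    VECTOR_LOAD_VMASK(0b01, x4, 1);
    /* expands to */
    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);

Hiding the fixed dyn/ninst/VMASK arguments keeps the opcode handlers a little shorter and makes it harder to pass the wrong mask register by accident.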