author     Yang Liu <liuyang22@iscas.ac.cn>    2024-09-29 16:38:49 +0800
committer  GitHub <noreply@github.com>         2024-09-29 10:38:49 +0200
commit     2a7eabfb1c4f883e20b3415cf8cfe87840a35e71 (patch)
tree       97c4e34235072e857467f313a113d56abf705de8 /src
parent     b4a0ae20c9c9c6644d6c5993498f5d173b335928 (diff)
download   box64-2a7eabfb1c4f883e20b3415cf8cfe87840a35e71.tar.gz
           box64-2a7eabfb1c4f883e20b3415cf8cfe87840a35e71.zip
[RV64_DYNAREC] Refined RISC-V vector emitter (#1884)
Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_0f_vector.c     |  18
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_660f_vector.c   | 416
-rw-r--r--  src/dynarec/rv64/rv64_emitter.h               | 588
3 files changed, 510 insertions, 512 deletions
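
The refinement is mechanical but wide-reaching: every vector emitter macro now takes its operands in the same order as the corresponding RVV assembly instruction (destination first, then the vector source vs2, then the scalar register or immediate, then the mask flag), where the old macros placed the immediate or scalar before the vector source. Below is a minimal sketch of an encoder written in that operand order, assuming the standard RVV 1.0 OPIVI instruction layout; it is illustrative only, not the actual rv64_emitter.h macro.

    /* Illustrative sketch, not the actual box64 emitter macro: encode
     * vslideup.vi with operands in assembly order (vd, vs2, uimm, vm),
     * assuming the standard RVV 1.0 OPIVI instruction layout. */
    #include <stdint.h>

    static inline uint32_t rvv_vslideup_vi(int vd, int vs2, int uimm, int vm)
    {
        /* funct6=001110 | vm | vs2 | uimm[4:0] | funct3=011 (OPIVI) | vd | opcode=0x57 (OP-V) */
        return (0x0Eu << 26) | ((uint32_t)(vm & 1) << 25) | ((uint32_t)(vs2 & 0x1F) << 20)
             | ((uint32_t)(uimm & 0x1F) << 15) | (0x3u << 12) | ((uint32_t)(vd & 0x1F) << 7) | 0x57u;
    }

With that ordering, a call such as VSLIDEUP_VI(v0, v1, 1, VECTOR_UNMASKED) reads the same way as the instruction it emits (vslideup.vi v0, v1, 1), which is why most hunks below are plain argument swaps.
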
diff --git a/src/dynarec/rv64/dynarec_rv64_0f_vector.c b/src/dynarec/rv64/dynarec_rv64_0f_vector.c
index 104db227..454fa02a 100644
--- a/src/dynarec/rv64/dynarec_rv64_0f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_0f_vector.c
@@ -102,7 +102,7 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 GETGX_vector(v0, 1, dyn->vector_eew);
                 GETEX_vector(v1, 0, 0, VECTOR_SEW64);
                 q0 = fpu_get_scratch(dyn);
-                VSLIDEDOWN_VI(q0, 1, v1, VECTOR_UNMASKED);
+                VSLIDEDOWN_VI(q0, v1, 1, VECTOR_UNMASKED);
                 VMV_X_S(x4, q0);
                 VMV_S_X(v0, x4);
             } else {
@@ -126,7 +126,7 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                     v1 = fpu_get_scratch(dyn);
                     VMV_V_V(v1, v0);
                 }
-                VSLIDEUP_VI(v0, 1, v1, VECTOR_UNMASKED);
+                VSLIDEUP_VI(v0, v1, 1, VECTOR_UNMASKED);
             } else {
                 INST_NAME("MOVHPS Gx, Ex");
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
@@ -137,7 +137,7 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 MOV64x(x4, 0xFF);
                 VMV_S_X(VMASK, x4);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
-                VSLIDEUP_VI(v0, 8, v1, VECTOR_UNMASKED);
+                VSLIDEUP_VI(v0, v1, 8, VECTOR_UNMASKED);
             }
             break;
         case 0x17:
@@ -149,13 +149,13 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             if (MODREG) {
                 v1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
                 q0 = fpu_get_scratch(dyn);
-                VSLIDE1DOWN_VX(q0, xZR, v0, VECTOR_UNMASKED);
+                VSLIDE1DOWN_VX(q0, v0, xZR, VECTOR_UNMASKED);
                 VMV_X_S(x4, q0);
                 VMV_S_X(v1, x4);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0);
                 q0 = fpu_get_scratch(dyn);
-                VSLIDE1DOWN_VX(q0, xZR, v0, VECTOR_UNMASKED);
+                VSLIDE1DOWN_VX(q0, v0, xZR, VECTOR_UNMASKED);
                 VMV_X_S(x4, q0);
                 SD(x4, ed, fixedaddress);
                 SMWRITE2();
@@ -205,7 +205,7 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             } else {
                 q0 = sse_get_reg_vector(dyn, ninst, x1, gd, 1, dyn->vector_eew);
                 GETEX_vector(q1, 0, 0, dyn->vector_eew);
-                VXOR_VV(q0, q0, q1, VECTOR_UNMASKED);
+                VXOR_VV(q0, q1, q0, VECTOR_UNMASKED);
             }
             break;
         case 0xC6:
@@ -220,14 +220,14 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             d1 = fpu_get_scratch(dyn);
             tmp64u0 = (((u8 >> 2) & 3) << 16) | (u8 & 3);
             VECTOR_SPLAT_IMM(q0, tmp64u0, x4);
-            VRGATHEREI16_VV(d0, q0, v0, VECTOR_UNMASKED);
+            VRGATHEREI16_VV(d0, v0, q0, VECTOR_UNMASKED);
             tmp64u1 = (((u8 >> 6) & 3) << 16) | ((u8 >> 4) & 3);
             if (tmp64u1 != tmp64u0) {
                 VECTOR_SPLAT_IMM(q0, tmp64u1, x4);
             }
-            VRGATHEREI16_VV(d1, q0, v1, VECTOR_UNMASKED);
+            VRGATHEREI16_VV(d1, v1, q0, VECTOR_UNMASKED);
             VMV_V_V(v0, d0);
-            VSLIDEUP_VI(v0, 2, d1, VECTOR_UNMASKED);
+            VSLIDEUP_VI(v0, d1, 2, VECTOR_UNMASKED);
             break;
         case 0x00 ... 0x0F:
         case 0x18:
diff --git a/src/dynarec/rv64/dynarec_rv64_660f_vector.c b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
index 81afe063..d43297d3 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
@@ -95,14 +95,14 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     v1 = fpu_get_scratch(dyn);
                     VMV_V_V(v1, v0);
                 }
-                VSLIDEUP_VI(v0, 1, v1, VECTOR_UNMASKED);
+                VSLIDEUP_VI(v0, v1, 1, VECTOR_UNMASKED);
             } else {
                 q0 = fpu_get_scratch(dyn);
                 VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
                 VMV_V_I(VMASK, 0b10);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
-                VLUXEI64_V(v0, ed, q0, VECTOR_MASKED, VECTOR_NFIELD1);
+                VLUXEI64_V(v0, q0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
             }
             break;
         case 0x15:
@@ -115,14 +115,14 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (MODREG) {
                 v1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
                 q0 == fpu_get_scratch(dyn);
-                VSLIDE1DOWN_VX(q0, xZR, v0, VECTOR_UNMASKED);
+                VSLIDE1DOWN_VX(q0, v0, xZR, VECTOR_UNMASKED);
                 VMV_X_S(x4, q0);
                 if (v0 != v1) { VMV_V_V(v0, v1); }
                 VMV_S_X(v0, x4);
             } else {
                 q0 = fpu_get_scratch(dyn);
                 VMV_V_I(VMASK, 0b10);
-                VSLIDE1DOWN_VX(v0, xZR, v0, VECTOR_UNMASKED);
+                VSLIDE1DOWN_VX(v0, v0, xZR, VECTOR_UNMASKED);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
                 VLE64_V(v0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
@@ -179,8 +179,8 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     v1 = fpu_get_scratch(dyn);
                     ADDI(x4, xZR, 0b000010001111);
                     VMV_V_X(v0, x4); // broadcast the mask
-                    VAND_VV(v0, v0, q1, VECTOR_UNMASKED);
-                    VRGATHER_VV(v1, v0, q0, VECTOR_UNMASKED); // registers cannot be overlapped!!
+                    VAND_VV(v0, q1, v0, VECTOR_UNMASKED);
+                    VRGATHER_VV(v1, q0, v0, VECTOR_UNMASKED); // registers cannot be overlapped!!
                     VMV_V_V(q0, v1);
                     break;
                 case 0x01:
@@ -196,14 +196,14 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     VMV_V_V(v0, q0);
                     if (q1 & 1) VMV_V_V(d1, q1);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL2, 2);
-                    VSLIDEUP_VI(v0, 8, (q1 & 1) ? d1 : q1, VECTOR_UNMASKED);
+                    VSLIDEUP_VI(v0, (q1 & 1) ? d1 : q1, 8, VECTOR_UNMASKED);
                     MOV64x(x4, 0b0101010101010101);
                     VMV_S_X(VMASK, x4);
-                    VCOMPRESS_VM(d0, VMASK, v0);
-                    VXOR_VI(VMASK, 0x1F, VMASK, VECTOR_UNMASKED);
-                    VCOMPRESS_VM(d1, VMASK, v0);
+                    VCOMPRESS_VM(d0, v0, VMASK);
+                    VXOR_VI(VMASK, VMASK, 0x1F, VECTOR_UNMASKED);
+                    VCOMPRESS_VM(d1, v0, VMASK);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 1);
-                    VADD_VV(q0, d0, d1, VECTOR_UNMASKED);
+                    VADD_VV(q0, d1, d0, VECTOR_UNMASKED);
                     break;
                 case 0x02:
                     INST_NAME("PHADDD Gx, Ex");
@@ -218,14 +218,14 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     VMV_V_V(v0, q0);
                     if (q1 & 1) VMV_V_V(d1, q1);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL2, 2);
-                    VSLIDEUP_VI(v0, 4, (q1 & 1) ? d1 : q1, VECTOR_UNMASKED);
+                    VSLIDEUP_VI(v0, (q1 & 1) ? d1 : q1, 4, VECTOR_UNMASKED);
                     MOV64x(x4, 0b01010101);
                     VMV_S_X(VMASK, x4);
-                    VCOMPRESS_VM(d0, VMASK, v0);
-                    VXOR_VI(VMASK, 0x1F, VMASK, VECTOR_UNMASKED);
-                    VCOMPRESS_VM(d1, VMASK, v0);
+                    VCOMPRESS_VM(d0, v0, VMASK);
+                    VXOR_VI(VMASK, VMASK, 0x1F, VECTOR_UNMASKED);
+                    VCOMPRESS_VM(d1, v0, VMASK);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 1);
-                    VADD_VV(q0, d0, d1, VECTOR_UNMASKED);
+                    VADD_VV(q0, d1, d0, VECTOR_UNMASKED);
                     break;
                 case 0x04:
                     INST_NAME("PMADDUBSW Gx, Ex");
@@ -236,15 +236,15 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); // no more scratches!
-                    VWMULSU_VV(v0, q0, q1, VECTOR_UNMASKED);
+                    VWMULSU_VV(v0, q1, q0, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL2, 2);
                     MOV64x(x4, 0b0101010101010101);
                     VMV_S_X(VMASK, x4);
-                    VCOMPRESS_VM(d0, VMASK, v0);
-                    VXOR_VI(VMASK, 0x1F, VMASK, VECTOR_UNMASKED);
-                    VCOMPRESS_VM(d1, VMASK, v0);
+                    VCOMPRESS_VM(d0, v0, VMASK);
+                    VXOR_VI(VMASK, VMASK, 0x1F, VECTOR_UNMASKED);
+                    VCOMPRESS_VM(d1, v0, VMASK);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
-                    VSADD_VV(q0, d0, d1, VECTOR_UNMASKED);
+                    VSADD_VV(q0, d1, d0, VECTOR_UNMASKED);
                     break;
                 case 0x08 ... 0x0A:
                     if (nextop == 0x08) {
@@ -272,14 +272,14 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     v0 = fpu_get_scratch(dyn);
                     v1 = fpu_get_scratch(dyn);
                     // absolute
-                    VSRA_VI(v0, i32, q1, VECTOR_UNMASKED);
-                    VXOR_VV(v1, v0, q0, VECTOR_UNMASKED);
-                    VSUB_VV(v1, v0, v1, VECTOR_UNMASKED);
+                    VSRA_VI(v0, q1, i32, VECTOR_UNMASKED);
+                    VXOR_VV(v1, q0, v0, VECTOR_UNMASKED);
+                    VSUB_VV(v1, v1, v0, VECTOR_UNMASKED);
                     // handle zeroing
-                    VMSEQ_VI(VECTOR_MASKREG, 0, q1, VECTOR_UNMASKED);
+                    VMSEQ_VI(VMASK, q1, 0, VECTOR_UNMASKED);
                     VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
-                    VADC_VIM(v0, 0x1f, v0); // implies VECTOR_MASKREG
-                    VAND_VV(q0, v1, v0, VECTOR_UNMASKED);
+                    VADC_VIM(v0, v0, 0x1f); // implies VMASK
+                    VAND_VV(q0, v0, v1, VECTOR_UNMASKED);
                     break;
                 case 0x0B:
                     INST_NAME("PMULHRSW Gx, Ex");
@@ -289,12 +289,12 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     GETEX_vector(q1, 0, 0, VECTOR_SEW16);
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     fpu_get_scratch(dyn);
-                    VWMUL_VV(v0, q0, q1, VECTOR_UNMASKED);
+                    VWMUL_VV(v0, q1, q0, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL2, 2);
-                    VSRL_VI(v0, 14, v0, VECTOR_UNMASKED);
-                    VADD_VI(v0, 1, v0, VECTOR_UNMASKED);
+                    VSRL_VI(v0, v0, 14, VECTOR_UNMASKED);
+                    VADD_VI(v0, v0, 1, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 1);
-                    VNSRL_WI(q0, 1, v0, VECTOR_UNMASKED);
+                    VNSRL_WI(q0, v0, 1, VECTOR_UNMASKED);
                     break;
                 case 0x14:
                     INST_NAME("PBLENDVPS Gx, Ex");
@@ -303,8 +303,8 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     GETGX_vector(q0, 1, VECTOR_SEW32);
                     GETEX_vector(q1, 0, 0, VECTOR_SEW32);
                     v0 = sse_get_reg_vector(dyn, ninst, x4, 0, 0, VECTOR_SEW32);
-                    VMSLT_VX(VMASK, xZR, v0, VECTOR_UNMASKED);
-                    VADD_VX(q0, xZR, q1, VECTOR_MASKED);
+                    VMSLT_VX(VMASK, v0, xZR, VECTOR_UNMASKED);
+                    VADD_VX(q0, q1, xZR, VECTOR_MASKED);
                     break;
                 case 0x17:
                     INST_NAME("PTEST Gx, Ex");
@@ -317,17 +317,17 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     SET_DFNONE();
                     v0 = fpu_get_scratch(dyn);
                     IFX (X_ZF) {
-                        VAND_VV(v0, q0, q1, VECTOR_MASKED);
-                        VMSGT_VX(VMASK, xZR, v0, VECTOR_UNMASKED);
+                        VAND_VV(v0, q1, q0, VECTOR_MASKED);
+                        VMSGT_VX(VMASK, v0, xZR, VECTOR_UNMASKED);
                         VMV_X_S(x4, VMASK);
                         ANDI(x4, x4, 0b11);
                         BNEZ(x3, 8);
                         ORI(xFlags, xFlags, 1 << F_ZF);
                     }
                     IFX (X_CF) {
-                        VXOR_VI(v0, 0x1F, q0, VECTOR_UNMASKED);
-                        VAND_VV(v0, v0, q1, VECTOR_MASKED);
-                        VMSGT_VX(VMASK, xZR, v0, VECTOR_UNMASKED);
+                        VXOR_VI(v0, q0, 0x1F, VECTOR_UNMASKED);
+                        VAND_VV(v0, q1, v0, VECTOR_MASKED);
+                        VMSGT_VX(VMASK, v0, xZR, VECTOR_UNMASKED);
                         VMV_X_S(x4, VMASK);
                         ANDI(x4, x4, 0b11);
                         BNEZ(x3, 8);
@@ -350,9 +350,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     GETEX_vector(q1, 0, 0, u8);
                     GETGX_empty_vector(q0);
                     v0 = fpu_get_scratch(dyn);
-                    VSRA_VI(v0, 0x1F, q1, VECTOR_UNMASKED);
-                    VXOR_VV(q0, q1, v0, VECTOR_UNMASKED);
-                    VSUB_VV(q0, v0, q0, VECTOR_UNMASKED);
+                    VSRA_VI(v0, q1, 0x1F, VECTOR_UNMASKED);
+                    VXOR_VV(q0, v0, q1, VECTOR_UNMASKED);
+                    VSUB_VV(q0, q0, v0, VECTOR_UNMASKED);
                     break;
                 case 0x20:
                     INST_NAME("PMOVSXBW Gx, Ex");
@@ -362,7 +362,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     GETGX_empty_vector(q0);
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 0.5);
-                    VWADD_VX(v0, xZR, q1, VECTOR_UNMASKED);
+                    VWADD_VX(v0, q1, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
                     VMV_V_V(q0, v0);
                     break;
@@ -376,9 +376,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     fpu_get_scratch(dyn);
                     v1 = fpu_get_scratch(dyn);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 0.25);
-                    VWADD_VX(v0, xZR, q1, VECTOR_UNMASKED);
+                    VWADD_VX(v0, q1, xZR, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.5);
-                    VWADD_VX(v1, xZR, v0, VECTOR_UNMASKED);
+                    VWADD_VX(v1, v0, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                     VMV_V_V(q0, v1);
                     break;
@@ -392,11 +392,11 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     fpu_get_scratch(dyn);
                     v1 = fpu_get_scratch(dyn);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 0.125);
-                    VWADD_VX(v0, xZR, q1, VECTOR_UNMASKED);
+                    VWADD_VX(v0, q1, xZR, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.25);
-                    VWADD_VX(v1, xZR, v0, VECTOR_UNMASKED);
+                    VWADD_VX(v1, v0, xZR, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 0.5);
-                    VWADD_VX(v0, xZR, v1, VECTOR_UNMASKED);
+                    VWADD_VX(v0, v1, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                     VMV_V_V(q0, v0);
                     break;
@@ -408,7 +408,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     GETGX_empty_vector(q0);
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.5);
-                    VWADD_VX(v0, xZR, q1, VECTOR_UNMASKED);
+                    VWADD_VX(v0, q1, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                     VMV_V_V(q0, v0);
                     break;
@@ -422,9 +422,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     fpu_get_scratch(dyn);
                     v1 = fpu_get_scratch(dyn);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.25);
-                    VWADD_VX(v0, xZR, q1, VECTOR_UNMASKED);
+                    VWADD_VX(v0, q1, xZR, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 0.5);
-                    VWADD_VX(v1, xZR, v0, VECTOR_UNMASKED);
+                    VWADD_VX(v1, v0, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                     VMV_V_V(q0, v1);
                     break;
@@ -436,7 +436,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     GETGX_empty_vector(q0);
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 0.5);
-                    VWADD_VX(v0, xZR, q1, VECTOR_UNMASKED);
+                    VWADD_VX(v0, q1, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                     VMV_V_V(q0, v0);
                     break;
@@ -454,9 +454,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     if (v0 != q0) VMV_V_V(v0, q0);
                     if (v1 != q1) VMV_V_V(v1, q1);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 0.5);
-                    VNSRL_WX(d0, xZR, v0, VECTOR_UNMASKED);
-                    VNSRL_WX(d1, xZR, v1, VECTOR_UNMASKED);
-                    VWMUL_VV(v0, d0, d1, VECTOR_UNMASKED);
+                    VNSRL_WX(d0, v0, xZR, VECTOR_UNMASKED);
+                    VNSRL_WX(d1, v1, xZR, VECTOR_UNMASKED);
+                    VWMUL_VV(v0, d1, d0, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW64, VECTOR_LMUL1, 1);
                     if (v0 != q0) VMV_V_V(q0, v0);
                     break;
@@ -470,14 +470,14 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     VXOR_VV(d0, d0, d0, VECTOR_UNMASKED);
                     d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-                    VMAX_VX(d0, xZR, q0, VECTOR_UNMASKED);
-                    if (q0 != q1) VMAX_VX(d1, xZR, q1, VECTOR_UNMASKED);
+                    VMAX_VX(d0, q0, xZR, VECTOR_UNMASKED);
+                    if (q0 != q1) VMAX_VX(d1, q1, xZR, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.5);
-                    VNCLIPU_WX(q0, xZR, d0, VECTOR_UNMASKED);
-                    if (q0 != q1) VNCLIPU_WX(v0, xZR, d1, VECTOR_UNMASKED);
+                    VNCLIPU_WX(q0, d0, xZR, VECTOR_UNMASKED);
+                    if (q0 != q1) VNCLIPU_WX(v0, d1, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
                     if (q0 == q1) VMV_V_V(v0, q0);
-                    VSLIDEUP_VI(q0, 4, v0, VECTOR_UNMASKED);
+                    VSLIDEUP_VI(q0, v0, 4, VECTOR_UNMASKED);
                     break;
                 case 0x30:
                     INST_NAME("PMOVZXBW Gx, Ex");
@@ -487,7 +487,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     GETGX_empty_vector(q0);
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 0.5);
-                    VWADDU_VX(v0, xZR, q1, VECTOR_UNMASKED);
+                    VWADDU_VX(v0, q1, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
                     VMV_V_V(q0, v0);
                     break;
@@ -501,9 +501,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     fpu_get_scratch(dyn);
                     v1 = fpu_get_scratch(dyn);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 0.25);
-                    VWADDU_VX(v0, xZR, q1, VECTOR_UNMASKED);
+                    VWADDU_VX(v0, q1, xZR, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.5);
-                    VWADDU_VX(v1, xZR, v0, VECTOR_UNMASKED);
+                    VWADDU_VX(v1, v0, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                     VMV_V_V(q0, v1);
                     break;
@@ -517,11 +517,11 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     fpu_get_scratch(dyn);
                     v1 = fpu_get_scratch(dyn);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 0.125);
-                    VWADDU_VX(v0, xZR, q1, VECTOR_UNMASKED);
+                    VWADDU_VX(v0, q1, xZR, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.25);
-                    VWADDU_VX(v1, xZR, v0, VECTOR_UNMASKED);
+                    VWADDU_VX(v1, v0, xZR, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 0.5);
-                    VWADDU_VX(v0, xZR, v1, VECTOR_UNMASKED);
+                    VWADDU_VX(v0, v1, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                     VMV_V_V(q0, v0);
                     break;
@@ -533,7 +533,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     GETGX_empty_vector(q0);
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.5);
-                    VWADDU_VX(v0, xZR, q1, VECTOR_UNMASKED);
+                    VWADDU_VX(v0, q1, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                     VMV_V_V(q0, v0);
                     break;
@@ -547,9 +547,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     fpu_get_scratch(dyn);
                     v1 = fpu_get_scratch(dyn);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.25);
-                    VWADDU_VX(v0, xZR, q1, VECTOR_UNMASKED);
+                    VWADDU_VX(v0, q1, xZR, VECTOR_UNMASKED);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 0.5);
-                    VWADDU_VX(v1, xZR, v0, VECTOR_UNMASKED);
+                    VWADDU_VX(v1, v0, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                     VMV_V_V(q0, v1);
                     break;
@@ -561,7 +561,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     GETGX_empty_vector(q0);
                     v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
                     vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 0.5);
-                    VWADDU_VX(v0, xZR, q1, VECTOR_UNMASKED);
+                    VWADDU_VX(v0, q1, xZR, VECTOR_UNMASKED);
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                     VMV_V_V(q0, v0);
                     break;
@@ -571,7 +571,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                     GETGX_vector(q0, 1, VECTOR_SEW32);
                     GETEX_vector(q1, 0, 0, VECTOR_SEW32);
-                    VMIN_VV(q0, q0, q1, VECTOR_UNMASKED);
+                    VMIN_VV(q0, q1, q0, VECTOR_UNMASKED);
                     break;
                 case 0x3A:
                     INST_NAME("PMINUW Gx, Ex");
@@ -579,7 +579,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
                     GETEX_vector(q1, 0, 0, VECTOR_SEW16);
                     GETGX_vector(q0, 1, VECTOR_SEW16);
-                    VMINU_VV(q0, q0, q1, VECTOR_UNMASKED);
+                    VMINU_VV(q0, q1, q0, VECTOR_UNMASKED);
                     break;
                 case 0x3B:
                     INST_NAME("PMINUD Gx, Ex");
@@ -587,7 +587,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                     GETEX_vector(q1, 0, 0, VECTOR_SEW32);
                     GETGX_vector(q0, 1, VECTOR_SEW32);
-                    VMINU_VV(q0, q0, q1, VECTOR_UNMASKED);
+                    VMINU_VV(q0, q1, q0, VECTOR_UNMASKED);
                     break;
                 case 0x3D:
                     INST_NAME("PMAXSD Gx, Ex");
@@ -595,7 +595,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                     GETGX_vector(q0, 1, VECTOR_SEW32);
                     GETEX_vector(q1, 0, 0, VECTOR_SEW32);
-                    VMAX_VV(q0, q0, q1, VECTOR_UNMASKED);
+                    VMAX_VV(q0, q1, q0, VECTOR_UNMASKED);
                     break;
                 case 0x40:
                     INST_NAME("PMULLD Gx, Ex");
@@ -603,7 +603,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                     GETEX_vector(q1, 0, 0, VECTOR_SEW32);
                     GETGX_vector(q0, 1, VECTOR_SEW32);
-                    VMUL_VV(q0, q0, q1, VECTOR_UNMASKED);
+                    VMUL_VV(q0, q1, q0, VECTOR_UNMASKED);
                     break;
                 default:
                     DEFAULT_VECTOR;
@@ -621,7 +621,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     u8 = F8;
                     ADDI(x4, xZR, u8);
                     VMV_V_X(VMASK, x4);
-                    VADD_VI(q0, 0, q1, VECTOR_MASKED);
+                    VADD_VI(q0, q1, 0, VECTOR_MASKED);
                     break;
                 case 0x0F:
                     INST_NAME("PALIGNR Gx, Ex, Ib");
@@ -641,14 +641,14 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 1);
                         }
                         VMV_V_V(d0, q0);
-                        VSLIDEDOWN_VI(q0, u8 - 16, d0, VECTOR_UNMASKED);
+                        VSLIDEDOWN_VI(q0, d0, u8 - 16, VECTOR_UNMASKED);
                     } else if (u8 == 16) {
                         // nop
                     } else if (u8 > 0) {
                         v0 = fpu_get_scratch(dyn);
                         v1 = fpu_get_scratch(dyn);
                         VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
-                        VSLIDEUP_VI(v0, 16 - u8, q0, VECTOR_UNMASKED);
+                        VSLIDEUP_VI(v0, q0, 16 - u8, VECTOR_UNMASKED);
                         if (rv64_vlen >= 256) {
                             // clear high bits before slidedown!
                             d0 = fpu_get_scratch(dyn);
@@ -658,8 +658,8 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                             VMV_V_V(d0, q1);
                             q1 = d0;
                         }
-                        VSLIDEDOWN_VI(v1, u8, q1, VECTOR_UNMASKED);
-                        VOR_VV(q0, v0, v1, VECTOR_UNMASKED);
+                        VSLIDEDOWN_VI(v1, q1, u8, VECTOR_UNMASKED);
+                        VOR_VV(q0, v1, v0, VECTOR_UNMASKED);
                     } else {
                         if (q0 != q1) VMV_V_V(q0, q1);
                     }
@@ -675,8 +675,8 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETEX_vector(q0, 0, 0, VECTOR_SEW64);
             v0 = fpu_get_scratch(dyn);
             ADDI(x4, xZR, 63);
-            VSRL_VX(v0, x4, q0, VECTOR_UNMASKED);
-            VMSNE_VX(VMASK, xZR, v0, VECTOR_UNMASKED);
+            VSRL_VX(v0, q0, x4, VECTOR_UNMASKED);
+            VMSNE_VX(VMASK, v0, xZR, VECTOR_UNMASKED);
             VMV_X_S(gd, VMASK);
             ANDI(gd, gd, 0b11);
             break;
@@ -689,7 +689,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (!box64_dynarec_fastnan) {
                 v0 = fpu_get_scratch(dyn);
                 VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
-                VMFLT_VV(VMASK, v0, q1, VECTOR_UNMASKED);
+                VMFLT_VV(VMASK, q1, v0, VECTOR_UNMASKED);
             }
             VFSQRT_V(q0, q1, VECTOR_UNMASKED);
             if (!box64_dynarec_fastnan) {
@@ -702,7 +702,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1);
             GETGX_vector(q0, 1, dyn->vector_eew);
             GETEX_vector(q1, 0, 0, dyn->vector_eew);
-            VAND_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VAND_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0x55:
             INST_NAME("ANDNPD Gx, Ex");
@@ -710,8 +710,8 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1);
             GETGX_vector(q0, 1, dyn->vector_eew);
             GETEX_vector(q1, 0, 0, dyn->vector_eew);
-            VXOR_VI(q0, 0x1F, q0, VECTOR_UNMASKED);
-            VAND_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VXOR_VI(q0, q0, 0x1F, VECTOR_UNMASKED);
+            VAND_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0x56:
             INST_NAME("ORPD Gx, Ex");
@@ -725,7 +725,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             } else {
                 GETGX_vector(q0, 1, dyn->vector_eew);
                 GETEX_vector(q1, 0, 0, dyn->vector_eew);
-                VOR_VV(q0, q0, q1, VECTOR_UNMASKED);
+                VOR_VV(q0, q1, q0, VECTOR_UNMASKED);
             }
             break;
         case 0x57:
@@ -734,7 +734,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1);
             GETGX_vector(q0, 1, dyn->vector_eew);
             GETEX_vector(q1, 0, 0, dyn->vector_eew);
-            VXOR_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VXOR_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0x58:
             INST_NAME("ADDPD Gx, Ex");
@@ -747,12 +747,12 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 VMFEQ_VV(VMASK, q0, q0, VECTOR_UNMASKED);
                 VMFEQ_VV(v0, q1, q1, VECTOR_UNMASKED);
             }
-            VFADD_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VFADD_VV(q0, q0, q1, VECTOR_UNMASKED);
             if (!box64_dynarec_fastnan) {
-                VMAND_MM(VMASK, VMASK, v0);
+                VMAND_MM(VMASK, v0, VMASK);
                 VMFEQ_VV(v0, q0, q0, VECTOR_UNMASKED);
-                VXOR_VI(v0, 0x1F, v0, VECTOR_UNMASKED);
-                VMAND_MM(VMASK, VMASK, v0);
+                VXOR_VI(v0, v0, 0x1F, VECTOR_UNMASKED);
+                VMAND_MM(VMASK, v0, VMASK);
                 VFSGNJN_VV(q0, q0, q0, VECTOR_MASKED);
             }
             break;
@@ -767,12 +767,12 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 VMFEQ_VV(VMASK, q0, q0, VECTOR_UNMASKED);
                 VMFEQ_VV(v0, q1, q1, VECTOR_UNMASKED);
             }
-            VFMUL_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VFMUL_VV(q0, q0, q1, VECTOR_UNMASKED);
             if (!box64_dynarec_fastnan) {
-                VMAND_MM(VMASK, VMASK, v0);
+                VMAND_MM(VMASK, v0, VMASK);
                 VMFEQ_VV(v0, q0, q0, VECTOR_UNMASKED);
-                VXOR_VI(v0, 0x1F, v0, VECTOR_UNMASKED);
-                VMAND_MM(VMASK, VMASK, v0);
+                VXOR_VI(v0, v0, 0x1F, VECTOR_UNMASKED);
+                VMAND_MM(VMASK, v0, VMASK);
                 VFSGNJN_VV(q0, q0, q0, VECTOR_MASKED);
             }
             break;
@@ -787,12 +787,12 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 VMFEQ_VV(VMASK, q0, q0, VECTOR_UNMASKED);
                 VMFEQ_VV(v0, q1, q1, VECTOR_UNMASKED);
             }
-            VFSUB_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VFSUB_VV(q0, q0, q1, VECTOR_UNMASKED);
             if (!box64_dynarec_fastnan) {
-                VMAND_MM(VMASK, VMASK, v0);
+                VMAND_MM(VMASK, v0, VMASK);
                 VMFEQ_VV(v0, q0, q0, VECTOR_UNMASKED);
-                VXOR_VI(v0, 0x1F, v0, VECTOR_UNMASKED);
-                VMAND_MM(VMASK, VMASK, v0);
+                VXOR_VI(v0, v0, 0x1F, VECTOR_UNMASKED);
+                VMAND_MM(VMASK, v0, VMASK);
                 VFSGNJN_VV(q0, q0, q0, VECTOR_MASKED);
             }
             break;
@@ -805,10 +805,10 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             v0 = fpu_get_scratch(dyn);
             VMFEQ_VV(VMASK, q0, q0, VECTOR_UNMASKED);
             VMFEQ_VV(v0, q1, q1, VECTOR_UNMASKED);
-            VFMIN_VV(q0, q1, q0, VECTOR_UNMASKED);
-            VMAND_MM(VMASK, VMASK, v0);
-            VXOR_VI(VMASK, 0x1F, VMASK, VECTOR_UNMASKED);
-            VADD_VX(q0, xZR, q1, VECTOR_MASKED);
+            VFMIN_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VMAND_MM(VMASK, v0, VMASK);
+            VXOR_VI(VMASK, VMASK, 0x1F, VECTOR_UNMASKED);
+            VADD_VX(q0, q1, xZR, VECTOR_MASKED);
             break;
         case 0x5E:
             INST_NAME("DIVPD Gx, Ex");
@@ -821,12 +821,12 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 VMFEQ_VV(VMASK, q0, q0, VECTOR_UNMASKED);
                 VMFEQ_VV(v0, q1, q1, VECTOR_UNMASKED);
             }
-            VFDIV_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VFDIV_VV(q0, q0, q1, VECTOR_UNMASKED);
             if (!box64_dynarec_fastnan) {
-                VMAND_MM(VMASK, VMASK, v0);
+                VMAND_MM(VMASK, v0, VMASK);
                 VMFEQ_VV(v0, q0, q0, VECTOR_UNMASKED);
-                VXOR_VI(v0, 0x1F, v0, VECTOR_UNMASKED);
-                VMAND_MM(VMASK, VMASK, v0);
+                VXOR_VI(v0, v0, 0x1F, VECTOR_UNMASKED);
+                VMAND_MM(VMASK, v0, VMASK);
                 VFSGNJN_VV(q0, q0, q0, VECTOR_MASKED);
             }
             break;
@@ -839,10 +839,10 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             v0 = fpu_get_scratch(dyn);
             VMFEQ_VV(VMASK, q0, q0, VECTOR_UNMASKED);
             VMFEQ_VV(v0, q1, q1, VECTOR_UNMASKED);
-            VFMAX_VV(q0, q1, q0, VECTOR_UNMASKED);
-            VMAND_MM(VMASK, VMASK, v0);
-            VXOR_VI(VMASK, 0x1F, VMASK, VECTOR_UNMASKED);
-            VADD_VX(q0, xZR, q1, VECTOR_MASKED);
+            VFMAX_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VMAND_MM(VMASK, v0, VMASK);
+            VXOR_VI(VMASK, VMASK, 0x1F, VECTOR_UNMASKED);
+            VADD_VX(q0, q1, xZR, VECTOR_MASKED);
             break;
         case 0x60:
             INST_NAME("PUNPCKLBW Gx, Ex");
@@ -856,9 +856,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETEX_vector(q1, 0, 0, VECTOR_SEW8);
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
-            VRGATHER_VV(d0, v0, q0, VECTOR_UNMASKED);
-            VRGATHER_VV(d1, v0, q1, VECTOR_UNMASKED);
-            VMERGE_VVM(q0, d1, d0);
+            VRGATHER_VV(d0, q0, v0, VECTOR_UNMASKED);
+            VRGATHER_VV(d1, q1, v0, VECTOR_UNMASKED);
+            VMERGE_VVM(q0, d0, d1);
             break;
         case 0x61:
             INST_NAME("PUNPCKLWD Gx, Ex");
@@ -872,9 +872,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
-            VRGATHER_VV(d0, v0, q0, VECTOR_UNMASKED);
-            VRGATHER_VV(d1, v0, q1, VECTOR_UNMASKED);
-            VMERGE_VVM(q0, d1, d0);
+            VRGATHER_VV(d0, q0, v0, VECTOR_UNMASKED);
+            VRGATHER_VV(d1, q1, v0, VECTOR_UNMASKED);
+            VMERGE_VVM(q0, d0, d1);
             break;
         case 0x62:
             INST_NAME("PUNPCKLDQ Gx, Ex");
@@ -888,9 +888,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETEX_vector(q1, 0, 0, VECTOR_SEW32);
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
-            VRGATHER_VV(d0, v0, q0, VECTOR_UNMASKED);
-            VRGATHER_VV(d1, v0, q1, VECTOR_UNMASKED);
-            VMERGE_VVM(q0, d1, d0);
+            VRGATHER_VV(d0, q0, v0, VECTOR_UNMASKED);
+            VRGATHER_VV(d1, q1, v0, VECTOR_UNMASKED);
+            VMERGE_VVM(q0, d0, d1);
             break;
         case 0x63:
             INST_NAME("PACKSSWB Gx, Ex");
@@ -903,13 +903,13 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (rv64_vlen >= 256) {
                 vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 2); // double the vl for slideup.
                 VMV_V_V(d0, q0);
-                VSLIDEUP_VI(d0, 8, q1, VECTOR_UNMASKED); // splice q0 and q1 here!
+                VSLIDEUP_VI(d0, q1, 8, VECTOR_UNMASKED); // splice q0 and q1 here!
             } else {
                 VMV_V_V(d0, q0);
                 VMV_V_V(d1, q1);
             }
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
-            VNCLIP_WI(q0, 0, d0, VECTOR_UNMASKED);
+            VNCLIP_WI(q0, d0, 0, VECTOR_UNMASKED);
             break;
         case 0x64 ... 0x66:
             if (opcode == 0x64) {
@@ -926,10 +926,10 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, u8, 1);
             GETGX_vector(q0, 1, dyn->vector_eew);
             GETEX_vector(q1, 0, 0, dyn->vector_eew);
-            VMSLT_VV(VMASK, q0, q1, VECTOR_UNMASKED);
+            VMSLT_VV(VMASK, q1, q0, VECTOR_UNMASKED);
             VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
-            VMERGE_VIM(q0, 1, q0); // implies vmask and widened it
-            VRSUB_VX(q0, xZR, q0, VECTOR_UNMASKED);
+            VMERGE_VIM(q0, q0, 1); // implies vmask and widened it
+            VRSUB_VX(q0, q0, xZR, VECTOR_UNMASKED);
             break;
         case 0x67:
             INST_NAME("PACKUSWB Gx, Ex");
@@ -943,18 +943,18 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 2); // double the vl for slideup.
                 if (q0 == q1) {
                     VMV_V_V(d0, q0);
-                    VSLIDEUP_VI(d0, 8, q1, VECTOR_UNMASKED); // splice q0 and q1 here!
-                    VMAX_VX(d0, xZR, d0, VECTOR_UNMASKED);
+                    VSLIDEUP_VI(d0, q1, 8, VECTOR_UNMASKED); // splice q0 and q1 here!
+                    VMAX_VX(d0, d0, xZR, VECTOR_UNMASKED);
                 } else {
-                    VSLIDEUP_VI(q0, 8, q1, VECTOR_UNMASKED); // splice q0 and q1 here!
-                    VMAX_VX(d0, xZR, q0, VECTOR_UNMASKED);
+                    VSLIDEUP_VI(q0, q1, 8, VECTOR_UNMASKED); // splice q0 and q1 here!
+                    VMAX_VX(d0, q0, xZR, VECTOR_UNMASKED);
                 }
             } else {
-                VMAX_VX(d0, xZR, q0, VECTOR_UNMASKED);
-                VMAX_VX(d1, xZR, q1, VECTOR_UNMASKED);
+                VMAX_VX(d0, q0, xZR, VECTOR_UNMASKED);
+                VMAX_VX(d1, q1, xZR, VECTOR_UNMASKED);
             }
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
-            VNCLIPU_WI(q0, 0, d0, VECTOR_UNMASKED);
+            VNCLIPU_WI(q0, d0, 0, VECTOR_UNMASKED);
             break;
         case 0x68 ... 0x6A:
             if (opcode == 0x68) {
@@ -965,7 +965,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 VMV_V_X(VMASK, x1); // VMASK = 0b1010101010101010
                 v0 = fpu_get_scratch(dyn);
                 VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
-                VADD_VI(v0, 8, v0, VECTOR_UNMASKED); // v0 = 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8
+                VADD_VI(v0, v0, 8, VECTOR_UNMASKED); // v0 = 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8
             } else if (opcode == 0x69) {
                 INST_NAME("PUNPCKHWD Gx, Ex");
                 nextop = F8;
@@ -974,7 +974,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 VMV_V_X(VMASK, x1); // VMASK = 0b10101010
                 v0 = fpu_get_scratch(dyn);
                 VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
-                VADD_VI(v0, 4, v0, VECTOR_UNMASKED); // v0 = 7 7 6 6 5 5 4 4
+                VADD_VI(v0, v0, 4, VECTOR_UNMASKED); // v0 = 7 7 6 6 5 5 4 4
             } else {
                 INST_NAME("PUNPCKHDQ Gx, Ex");
                 nextop = F8;
@@ -982,15 +982,15 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 VMV_V_I(VMASK, 0b1010);
                 v0 = fpu_get_scratch(dyn);
                 VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
-                VADD_VI(v0, 2, v0, VECTOR_UNMASKED); // v0 = 3 3 2 2
+                VADD_VI(v0, v0, 2, VECTOR_UNMASKED); // v0 = 3 3 2 2
             }
             GETGX_vector(q0, 1, dyn->vector_eew);
             GETEX_vector(q1, 0, 0, dyn->vector_eew);
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
-            VRGATHER_VV(d0, v0, q0, VECTOR_UNMASKED);
-            VRGATHER_VV(d1, v0, q1, VECTOR_UNMASKED);
-            VMERGE_VVM(q0, d1, d0);
+            VRGATHER_VV(d0, q0, v0, VECTOR_UNMASKED);
+            VRGATHER_VV(d1, q1, v0, VECTOR_UNMASKED);
+            VMERGE_VVM(q0, d0, d1);
             break;
         case 0x6B:
             INST_NAME("PACKSSDW Gx, Ex");
@@ -1003,13 +1003,13 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (rv64_vlen >= 256) {
                 vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 2); // double the vl for slideup.
                 VMV_V_V(d0, q0);
-                VSLIDEUP_VI(d0, 4, q1, VECTOR_UNMASKED); // splice q0 and q1 here!
+                VSLIDEUP_VI(d0, q1, 4, VECTOR_UNMASKED); // splice q0 and q1 here!
             } else {
                 VMV_V_V(d0, q0);
                 VMV_V_V(d1, q1);
             }
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
-            VNCLIP_WI(q0, 0, d0, VECTOR_UNMASKED);
+            VNCLIP_WI(q0, d0, 0, VECTOR_UNMASKED);
             break;
         case 0x6C:
             INST_NAME("PUNPCKLQDQ Gx, Ex");
@@ -1025,14 +1025,14 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     v1 = fpu_get_scratch(dyn);
                     VMV_V_V(v1, v0);
                 }
-                VSLIDEUP_VI(v0, 1, v1, VECTOR_UNMASKED);
+                VSLIDEUP_VI(v0, v1, 1, VECTOR_UNMASKED);
             } else {
                 q0 = fpu_get_scratch(dyn);
                 VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
                 VMV_V_I(VMASK, 0b10);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
-                VLUXEI64_V(v0, ed, q0, VECTOR_MASKED, VECTOR_NFIELD1);
+                VLUXEI64_V(v0, q0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
             }
             break;
         case 0x6D:
@@ -1045,14 +1045,14 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (MODREG) {
                 v1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
                 q0 == fpu_get_scratch(dyn);
-                VSLIDE1DOWN_VX(q0, xZR, v0, VECTOR_UNMASKED);
+                VSLIDE1DOWN_VX(q0, v0, xZR, VECTOR_UNMASKED);
                 VMV_X_S(x4, q0);
                 if (v0 != v1) { VMV_V_V(v0, v1); }
                 VMV_S_X(v0, x4);
             } else {
                 q0 = fpu_get_scratch(dyn);
                 VMV_V_I(VMASK, 0b10);
-                VSLIDE1DOWN_VX(v0, xZR, v0, VECTOR_UNMASKED);
+                VSLIDE1DOWN_VX(v0, v0, xZR, VECTOR_UNMASKED);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
                 VLE64_V(v0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
@@ -1070,7 +1070,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             }
             VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
             VMV_V_I(VMASK, 1);
-            VMERGE_VXM(v0, ed, v0);
+            VMERGE_VXM(v0, v0, ed);
             break;
         case 0x6F:
             INST_NAME("MOVDQA Gx, Ex");
@@ -1100,10 +1100,10 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
             if (q0 == q1) {
                 v1 = fpu_get_scratch(dyn);
-                VRGATHEREI16_VV(v1, v0, q1, VECTOR_UNMASKED);
+                VRGATHEREI16_VV(v1, q1, v0, VECTOR_UNMASKED);
                 VMV_V_V(q0, v1);
             } else {
-                VRGATHEREI16_VV(q0, v0, q1, VECTOR_UNMASKED);
+                VRGATHEREI16_VV(q0, q1, v0, VECTOR_UNMASKED);
             }
             break;
         case 0x71:
@@ -1118,7 +1118,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                         if (u8 > 15) {
                             VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
                         } else {
-                            VSRL_VI(q0, u8, q0, VECTOR_UNMASKED);
+                            VSRL_VI(q0, q0, u8, VECTOR_UNMASKED);
                         }
                         PUTEX_vector(q0, VECTOR_SEW16);
                     }
@@ -1130,7 +1130,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     u8 = F8;
                     if (u8 > 15) u8 = 15;
                     if (u8) {
-                        VSRA_VI(q0, u8, q0, VECTOR_UNMASKED);
+                        VSRA_VI(q0, q0, u8, VECTOR_UNMASKED);
                     }
                     PUTEX_vector(q0, VECTOR_SEW16);
                     break;
@@ -1143,7 +1143,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                         if (u8 > 15) {
                             VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
                         } else {
-                            VSLL_VI(q0, u8, q0, VECTOR_UNMASKED);
+                            VSLL_VI(q0, q0, u8, VECTOR_UNMASKED);
                         }
                         PUTEX_vector(q0, VECTOR_SEW16);
                     }
@@ -1164,7 +1164,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                         if (u8 > 31) {
                             VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
                         } else if (u8) {
-                            VSRL_VI(q0, u8, q0, VECTOR_UNMASKED);
+                            VSRL_VI(q0, q0, u8, VECTOR_UNMASKED);
                         }
                         PUTEX_vector(q0, VECTOR_SEW32);
                     }
@@ -1176,7 +1176,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     u8 = F8;
                     if (u8 > 31) u8 = 31;
                     if (u8) {
-                        VSRA_VI(q0, u8, q0, VECTOR_UNMASKED);
+                        VSRA_VI(q0, q0, u8, VECTOR_UNMASKED);
                     }
                     PUTEX_vector(q0, VECTOR_SEW32);
                     break;
@@ -1189,7 +1189,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                         if (u8 > 31) {
                             VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
                         } else {
-                            VSLL_VI(q0, u8, q0, VECTOR_UNMASKED);
+                            VSLL_VI(q0, q0, u8, VECTOR_UNMASKED);
                         }
                         PUTEX_vector(q0, VECTOR_SEW32);
                     }
@@ -1211,7 +1211,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                             VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
                         } else {
                             MOV64x(x4, u8);
-                            VSRL_VX(q0, x4, q0, VECTOR_UNMASKED);
+                            VSRL_VX(q0, q0, x4, VECTOR_UNMASKED);
                         }
                         PUTEX_vector(q0, VECTOR_SEW64);
                     }
@@ -1233,7 +1233,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 1);
                         }
                         VMV_V_V(d0, q0);
-                        VSLIDEDOWN_VI(q0, u8, d0, VECTOR_UNMASKED);
+                        VSLIDEDOWN_VI(q0, d0, u8, VECTOR_UNMASKED);
                     }
                     PUTEX_vector(q0, VECTOR_SEW8);
                     break;
@@ -1247,7 +1247,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                             VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
                         } else {
                             MOV64x(x4, u8);
-                            VSLL_VX(q0, x4, q0, VECTOR_UNMASKED);
+                            VSLL_VX(q0, q0, x4, VECTOR_UNMASKED);
                         }
                         PUTEX_vector(q0, VECTOR_SEW64);
                     }
@@ -1264,7 +1264,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     } else {
                         d0 = fpu_get_scratch(dyn);
                         VXOR_VV(d0, d0, d0, VECTOR_UNMASKED);
-                        VSLIDEUP_VI(d0, u8, q0, VECTOR_UNMASKED);
+                        VSLIDEUP_VI(d0, q0, u8, VECTOR_UNMASKED);
                         if (MODREG) {
                             VMV_V_V(q0, d0);
                         } else {
@@ -1290,10 +1290,10 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, u8, 1);
             GETGX_vector(q0, 1, dyn->vector_eew);
             GETEX_vector(q1, 0, 0, dyn->vector_eew);
-            VMSEQ_VV(VMASK, q0, q1, VECTOR_UNMASKED);
+            VMSEQ_VV(VMASK, q1, q0, VECTOR_UNMASKED);
             VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
-            VMERGE_VIM(q0, 1, q0); // implies vmask and widened it
-            VRSUB_VX(q0, xZR, q0, VECTOR_UNMASKED);
+            VMERGE_VIM(q0, q0, 1); // implies vmask and widened it
+            VRSUB_VX(q0, q0, xZR, VECTOR_UNMASKED);
             break;
         case 0x7E:
             return 0;
@@ -1333,7 +1333,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             ADDI(x5, xZR, 1 << u8);
             VMV_S_X(VMASK, x5);
             v0 = fpu_get_scratch(dyn);
-            VMERGE_VXM(v0, ed, q0); // uses VMASK
+            VMERGE_VXM(v0, q0, ed); // uses VMASK
             VMV_V_V(q0, v0);
             break;
         case 0xC5:
@@ -1345,7 +1345,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 GETEX_vector(q0, 0, 1, VECTOR_SEW16);
                 u8 = (F8) & 7;
                 v0 = fpu_get_scratch(dyn);
-                VSLIDEDOWN_VI(v0, u8, q0, VECTOR_UNMASKED);
+                VSLIDEDOWN_VI(v0, q0, u8, VECTOR_UNMASKED);
                 VMV_X_S(gd, v0);
                 ZEXTH(gd, gd);
             } else {
@@ -1390,7 +1390,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
             B_NEXT_nocond;
             MARK;
-            VSRL_VX(q0, x4, q0, VECTOR_UNMASKED);
+            VSRL_VX(q0, q0, x4, VECTOR_UNMASKED);
             break;
         case 0xD4:
             INST_NAME("PADDQ Gx, Ex");
@@ -1398,7 +1398,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
             GETGX_vector(q0, 1, VECTOR_SEW64);
             GETEX_vector(q1, 0, 0, VECTOR_SEW64);
-            VADD_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VADD_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0xD5:
             INST_NAME("PMULLW Gx, Ex");
@@ -1407,7 +1407,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETGX_vector(q0, 1, VECTOR_SEW16);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
             v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-            VMUL_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VMUL_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0xD6:
             INST_NAME("MOVQ Ex, Gx");
@@ -1433,8 +1433,8 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETGD;
             GETEX_vector(q0, 0, 0, VECTOR_SEW8);
             v0 = fpu_get_scratch(dyn);
-            VSRL_VI(v0, 7, q0, VECTOR_UNMASKED);
-            VMSNE_VX(VMASK, xZR, v0, VECTOR_UNMASKED);
+            VSRL_VI(v0, q0, 7, VECTOR_UNMASKED);
+            VMSNE_VX(VMASK, v0, xZR, VECTOR_UNMASKED);
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             VMV_X_S(gd, VMASK);
             ZEXTH(gd, gd);
@@ -1452,7 +1452,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, u8, 1);
             GETGX_vector(q0, 1, u8);
             GETEX_vector(q1, 0, 0, u8);
-            VSSUBU_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VSSUBU_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
         case 0xDA:
             INST_NAME("PMINUB Gx, Ex");
@@ -1460,7 +1460,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
             GETGX_vector(q0, 1, VECTOR_SEW8);
             GETEX_vector(q1, 0, 0, VECTOR_SEW8);
-            VMINU_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VMINU_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0xDB:
             INST_NAME("PAND Gx, Ex");
@@ -1468,7 +1468,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1);
             GETGX_vector(q0, 1, dyn->vector_eew);
             GETEX_vector(q1, 0, 0, dyn->vector_eew);
-            VAND_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VAND_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0xDC:
         case 0xDD:
@@ -1483,7 +1483,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, u8, 1);
             GETGX_vector(q0, 1, u8);
             GETEX_vector(q1, 0, 0, u8);
-            VSADDU_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VSADDU_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
         case 0xDE:
             INST_NAME("PMAXUB Gx, Ex");
@@ -1491,7 +1491,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
             GETGX_vector(q0, 1, VECTOR_SEW8);
             GETEX_vector(q1, 0, 0, VECTOR_SEW8);
-            VMAXU_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VMAXU_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0xDF:
             INST_NAME("PANDN Gx, Ex");
@@ -1499,8 +1499,8 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1);
             GETGX_vector(q0, 1, dyn->vector_eew);
             GETEX_vector(q1, 0, 0, dyn->vector_eew);
-            VXOR_VI(q0, 0x1F, q0, VECTOR_UNMASKED);
-            VAND_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VXOR_VI(q0, q0, 0x1F, VECTOR_UNMASKED);
+            VAND_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0xE0:
             INST_NAME("PAVGB Gx, Ex");
@@ -1509,7 +1509,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETGX_vector(q0, 1, VECTOR_SEW8);
             GETEX_vector(q1, 0, 0, VECTOR_SEW8);
             CSRRWI(xZR, 0b00 /* rnu */, 0x00A /* vxrm */);
-            VAADDU_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VAADDU_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
         case 0xE1:
             INST_NAME("PSRAW Gx, Ex");
@@ -1527,10 +1527,10 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             }
             v1 = fpu_get_scratch(dyn);
             ADDI(x4, xZR, 15);
-            VMINU_VX(v1, x4, q1, VECTOR_MASKED);
+            VMINU_VX(v1, q1, x4, VECTOR_MASKED);
             VMV_X_S(x4, v1);
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
-            VSRA_VX(q0, x4, q0, VECTOR_UNMASKED);
+            VSRA_VX(q0, q0, x4, VECTOR_UNMASKED);
             break;
         case 0xE2:
             INST_NAME("PSRAD Gx, Ex");
@@ -1548,10 +1548,10 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             }
             v1 = fpu_get_scratch(dyn);
             ADDI(x4, xZR, 31);
-            VMINU_VX(v1, x4, q1, VECTOR_MASKED);
+            VMINU_VX(v1, q1, x4, VECTOR_MASKED);
             VMV_X_S(x4, v1);
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
-            VSRA_VX(q0, x4, q0, VECTOR_UNMASKED);
+            VSRA_VX(q0, q0, x4, VECTOR_UNMASKED);
             break;
         case 0xE3:
             INST_NAME("PAVGW Gx, Ex");
@@ -1560,7 +1560,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETGX_vector(q0, 1, VECTOR_SEW16);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
             CSRRWI(xZR, 0b00 /* rnu */, 0x00A /* vxrm */);
-            VAADDU_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VAADDU_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
         case 0xE4:
             INST_NAME("PMULHUW Gx, Ex");
@@ -1568,7 +1568,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_vector(q0, 1, VECTOR_SEW16);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
-            VMULHU_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VMULHU_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
         case 0xE5:
             INST_NAME("PMULHW Gx, Ex");
@@ -1576,7 +1576,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_vector(q0, 1, VECTOR_SEW16);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
-            VMULH_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VMULH_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
         case 0xE8:
             INST_NAME("PSUBSB Gx, Ex");
@@ -1584,7 +1584,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
             GETGX_vector(q0, 1, VECTOR_SEW8);
             GETEX_vector(q1, 0, 0, VECTOR_SEW8);
-            VSSUB_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VSSUB_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
         case 0xE9:
             INST_NAME("PSUBSW Gx, Ex");
@@ -1592,7 +1592,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_vector(q0, 1, VECTOR_SEW16);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
-            VSSUB_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VSSUB_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
         case 0xEA:
             INST_NAME("PMINSW Gx, Ex");
@@ -1600,7 +1600,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_vector(q0, 1, VECTOR_SEW16);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
-            VMIN_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VMIN_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0xEB:
             INST_NAME("POR Gx, Ex");
@@ -1608,7 +1608,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1);
             GETGX_vector(q0, 1, dyn->vector_eew);
             GETEX_vector(q1, 0, 0, dyn->vector_eew);
-            VOR_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VOR_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0xEC:
             INST_NAME("PADDSB Gx, Ex");
@@ -1616,7 +1616,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
             GETGX_vector(q0, 1, VECTOR_SEW8);
             GETEX_vector(q1, 0, 0, VECTOR_SEW8);
-            VSADD_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VSADD_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
         case 0xED:
             INST_NAME("PADDSW Gx, Ex");
@@ -1624,7 +1624,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_vector(q0, 1, VECTOR_SEW16);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
-            VSADD_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VSADD_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
         case 0xEE:
             INST_NAME("PMAXSW Gx, Ex");
@@ -1632,7 +1632,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_vector(q0, 1, VECTOR_SEW16);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
-            VMAX_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VMAX_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0xEF:
             INST_NAME("PXOR Gx, Ex");
@@ -1647,7 +1647,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1);
                 q0 = sse_get_reg_vector(dyn, ninst, x1, gd, 1, dyn->vector_eew);
                 GETEX_vector(q1, 0, 0, dyn->vector_eew);
-                VXOR_VV(q0, q0, q1, VECTOR_UNMASKED);
+                VXOR_VV(q0, q1, q0, VECTOR_UNMASKED);
             }
             break;
         case 0xF1:
@@ -1685,7 +1685,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
             B_NEXT_nocond;
             MARK;
-            VSLL_VX(q0, x4, q0, VECTOR_UNMASKED);
+            VSLL_VX(q0, q0, x4, VECTOR_UNMASKED);
             break;
         case 0xF5:
             INST_NAME("PMADDWD Gx, Ex");
@@ -1695,18 +1695,18 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
             v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             v1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-            VWMUL_VV(v0, q0, q1, VECTOR_UNMASKED);
+            VWMUL_VV(v0, q1, q0, VECTOR_UNMASKED);
             d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); // warning, no more scratches!
             ADDI(x4, xZR, 6);
             VID_V(d0, VECTOR_UNMASKED);
-            VSLL_VI(d0, 1, d0, VECTOR_UNMASKED); // times 2
-            VMIN_VX(d0, x4, d0, VECTOR_UNMASKED);
-            VADD_VI(q0, 1, d0, VECTOR_UNMASKED);
+            VSLL_VI(d0, d0, 1, VECTOR_UNMASKED); // times 2
+            VMIN_VX(d0, d0, x4, VECTOR_UNMASKED);
+            VADD_VI(q0, d0, 1, VECTOR_UNMASKED);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL2, 2);
-            VRGATHEREI16_VV(v1, d0, v0, VECTOR_UNMASKED); // 6 4 2 0
-            VRGATHEREI16_VV(d0, q0, v0, VECTOR_UNMASKED); // 7 5 3 1
+            VRGATHEREI16_VV(v1, v0, d0, VECTOR_UNMASKED); // 6 4 2 0
+            VRGATHEREI16_VV(d0, v0, q0, VECTOR_UNMASKED); // 7 5 3 1
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
-            VADD_VV(q0, d0, v1, VECTOR_UNMASKED);
+            VADD_VV(q0, v1, d0, VECTOR_UNMASKED);
             break;
         case 0xF6:
             INST_NAME("PSADBW Gx, Ex");
@@ -1717,20 +1717,20 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             v1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); // no more scratches!
-            VWSUBU_VV(v0, q1, q0, VECTOR_UNMASKED);
+            VWSUBU_VV(v0, q0, q1, VECTOR_UNMASKED);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL2, 2);
-            VSRA_VI(v1, 15, v0, VECTOR_UNMASKED);
-            VXOR_VV(v0, v0, v1, VECTOR_UNMASKED);
-            VSUB_VV(v1, v1, v0, VECTOR_UNMASKED);
+            VSRA_VI(v1, v0, 15, VECTOR_UNMASKED);
+            VXOR_VV(v0, v1, v0, VECTOR_UNMASKED);
+            VSUB_VV(v1, v0, v1, VECTOR_UNMASKED);
             ADDI(x4, xZR, 0xFF);
             VXOR_VV(VMASK, VMASK, VMASK, VECTOR_UNMASKED);
             VMV_S_X(VMASK, x4);
             VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
-            VREDSUM_VS(v0, v0, v1, VECTOR_MASKED); // sum low 64
-            VSLIDEDOWN_VI(d0, 8, v1, VECTOR_UNMASKED);
+            VREDSUM_VS(v0, v1, v0, VECTOR_MASKED); // sum low 64
+            VSLIDEDOWN_VI(d0, v1, 8, VECTOR_UNMASKED);
             VXOR_VV(v1, v1, v1, VECTOR_UNMASKED);
-            VREDSUM_VS(v1, v1, d0, VECTOR_MASKED); // sum high 64
-            VSLIDEUP_VI(v0, 4, v1, VECTOR_UNMASKED);
+            VREDSUM_VS(v1, d0, v1, VECTOR_MASKED); // sum high 64
+            VSLIDEUP_VI(v0, v1, 4, VECTOR_UNMASKED);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 1);
             VMV_V_V(q0, v0);
             break;
@@ -1752,7 +1752,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, u8, 1);
             GETGX_vector(q0, 1, u8);
             GETEX_vector(q1, 0, 0, u8);
-            VSUB_VV(q0, q1, q0, VECTOR_UNMASKED);
+            VSUB_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
         case 0xFC ... 0xFE:
             nextop = F8;
@@ -1769,7 +1769,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, u8, 1);
             GETGX_vector(q0, 1, dyn->vector_eew);
             GETEX_vector(q1, 0, 0, dyn->vector_eew);
-            VADD_VV(q0, q0, q1, VECTOR_UNMASKED);
+            VADD_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         default:
             DEFAULT_VECTOR;
diff --git a/src/dynarec/rv64/rv64_emitter.h b/src/dynarec/rv64/rv64_emitter.h
index 7c927e06..e34c6487 100644
--- a/src/dynarec/rv64/rv64_emitter.h
+++ b/src/dynarec/rv64/rv64_emitter.h
@@ -1248,8 +1248,6 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define VECTOR_NFIELD7 0b110
 #define VECTOR_NFIELD8 0b111
 
-#define VECTOR_MASKREG 0 // fixed to v0
-
 //  configuration setting
 //  https://github.com/riscv/riscv-v-spec/blob/master/vcfg-format.adoc
 #define VSETIVLI(rd, zimm, zimm10) EMIT(I_type(0b110000000000 | (zimm10), zimm, 0b111, rd, 0b1010111)) // 11...............111.....1010111
@@ -1280,38 +1278,38 @@ f28–31  ft8–11  FP temporaries                  Caller
 //  Vector Indexed-Unordered Instructions (including segment part)
 //  https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#76-vector-indexed-instructions
 
-#define VLUXEI8_V(vd, rs1, vs2, vm, nf)   EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b000, vd, 0b0000111))  // ...001...........000.....0000111
-#define VLUXEI16_V(vd, rs1, vs2, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b101, vd, 0b0000111))  // ...001...........101.....0000111
-#define VLUXEI32_V(vd, rs1, vs2, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b110, vd, 0b0000111))  // ...001...........110.....0000111
-#define VLUXEI64_V(vd, rs1, vs2, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b111, vd, 0b0000111))  // ...001...........111.....0000111
-#define VSUXEI8_V(vs3, rs1, vs2, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b000, vs3, 0b0100111)) // ...001...........000.....0100111
-#define VSUXEI16_V(vs3, rs1, vs2, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b101, vs3, 0b0100111)) // ...001...........101.....0100111
-#define VSUXEI32_V(vs3, rs1, vs2, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b110, vs3, 0b0100111)) // ...001...........110.....0100111
-#define VSUXEI64_V(vs3, rs1, vs2, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b111, vs3, 0b0100111)) // ...001...........111.....0100111
+#define VLUXEI8_V(vd, vs2, rs1, vm, nf)   EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b000, vd, 0b0000111))  // ...001...........000.....0000111
+#define VLUXEI16_V(vd, vs2, rs1, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b101, vd, 0b0000111))  // ...001...........101.....0000111
+#define VLUXEI32_V(vd, vs2, rs1, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b110, vd, 0b0000111))  // ...001...........110.....0000111
+#define VLUXEI64_V(vd, vs2, rs1, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b111, vd, 0b0000111))  // ...001...........111.....0000111
+#define VSUXEI8_V(vs3, vs2, rs1, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b000, vs3, 0b0100111)) // ...001...........000.....0100111
+#define VSUXEI16_V(vs3, vs2, rs1, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b101, vs3, 0b0100111)) // ...001...........101.....0100111
+#define VSUXEI32_V(vs3, vs2, rs1, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b110, vs3, 0b0100111)) // ...001...........110.....0100111
+#define VSUXEI64_V(vs3, vs2, rs1, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0010, vs2, rs1, 0b111, vs3, 0b0100111)) // ...001...........111.....0100111
 
 //  Vector Strided Instructions (including segment part)
 //  https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#75-vector-strided-instructions
 
-#define VLSE8_V(vd, rs1, rs2, vm, nf)   EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b000, vd, 0b0000111))  // ...010...........000.....0000111
-#define VLSE16_V(vd, rs1, rs2, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b101, vd, 0b0000111))  // ...010...........101.....0000111
-#define VLSE32_V(vd, rs1, rs2, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b110, vd, 0b0000111))  // ...010...........110.....0000111
-#define VLSE64_V(vd, rs1, rs2, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b111, vd, 0b0000111))  // ...010...........111.....0000111
-#define VSSE8_V(vs3, rs1, rs2, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b000, vs3, 0b0100111)) // ...010...........000.....0100111
-#define VSSE16_V(vs3, rs1, rs2, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b101, vs3, 0b0100111)) // ...010...........101.....0100111
-#define VSSE32_V(vs3, rs1, rs2, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b110, vs3, 0b0100111)) // ...010...........110.....0100111
-#define VSSE64_V(vs3, rs1, rs2, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b111, vs3, 0b0100111)) // ...010...........111.....0100111
+#define VLSE8_V(vd, rs2, rs1, vm, nf)   EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b000, vd, 0b0000111))  // ...010...........000.....0000111
+#define VLSE16_V(vd, rs2, rs1, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b101, vd, 0b0000111))  // ...010...........101.....0000111
+#define VLSE32_V(vd, rs2, rs1, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b110, vd, 0b0000111))  // ...010...........110.....0000111
+#define VLSE64_V(vd, rs2, rs1, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b111, vd, 0b0000111))  // ...010...........111.....0000111
+#define VSSE8_V(vs3, rs2, rs1, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b000, vs3, 0b0100111)) // ...010...........000.....0100111
+#define VSSE16_V(vs3, rs2, rs1, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b101, vs3, 0b0100111)) // ...010...........101.....0100111
+#define VSSE32_V(vs3, rs2, rs1, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b110, vs3, 0b0100111)) // ...010...........110.....0100111
+#define VSSE64_V(vs3, rs2, rs1, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0100, rs2, rs1, 0b111, vs3, 0b0100111)) // ...010...........111.....0100111
 
 //  Vector Indexed-Ordered Instructions (including segment part)
 //  https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#76-vector-indexed-instructions
 
-#define VLOXEI8_V(vd, rs1, vs2, vm, nf)   EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b000, vd, 0b0000111))  // ...011...........000.....0000111
-#define VLOXEI16_V(vd, rs1, vs2, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b101, vd, 0b0000111))  // ...011...........101.....0000111
-#define VLOXEI32_V(vd, rs1, vs2, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b110, vd, 0b0000111))  // ...011...........110.....0000111
-#define VLOXEI64_V(vd, rs1, vs2, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b111, vd, 0b0000111))  // ...011...........111.....0000111
-#define VSOXEI8_V(vs3, rs1, vs2, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b000, vs3, 0b0100111)) // ...011...........000.....0100111
-#define VSOXEI16_V(vs3, rs1, vs2, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b101, vs3, 0b0100111)) // ...011...........101.....0100111
-#define VSOXEI32_V(vs3, rs1, vs2, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b110, vs3, 0b0100111)) // ...011...........110.....0100111
-#define VSOXEI64_V(vs3, rs1, vs2, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b111, vs3, 0b0100111)) // ...011...........111.....0100111
+#define VLOXEI8_V(vd, vs2, rs1, vm, nf)   EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b000, vd, 0b0000111))  // ...011...........000.....0000111
+#define VLOXEI16_V(vd, vs2, rs1, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b101, vd, 0b0000111))  // ...011...........101.....0000111
+#define VLOXEI32_V(vd, vs2, rs1, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b110, vd, 0b0000111))  // ...011...........110.....0000111
+#define VLOXEI64_V(vd, vs2, rs1, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b111, vd, 0b0000111))  // ...011...........111.....0000111
+#define VSOXEI8_V(vs3, vs2, rs1, vm, nf)  EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b000, vs3, 0b0100111)) // ...011...........000.....0100111
+#define VSOXEI16_V(vs3, vs2, rs1, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b101, vs3, 0b0100111)) // ...011...........101.....0100111
+#define VSOXEI32_V(vs3, vs2, rs1, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b110, vs3, 0b0100111)) // ...011...........110.....0100111
+#define VSOXEI64_V(vs3, vs2, rs1, vm, nf) EMIT(R_type(((nf) << 4) | (vm) | 0b0110, vs2, rs1, 0b111, vs3, 0b0100111)) // ...011...........111.....0100111
 
 //  Unit-stride Fault-Only-First Loads
 //  https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#77-unit-stride-fault-only-first-loads
@@ -1349,78 +1347,78 @@ f28–31  ft8–11  FP temporaries                  Caller
 //  https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#14-vector-floating-point-instructions
 
 //  OPFVF
-#define VFADD_VF(vd, rs1, vs2, vm)        EMIT(R_type(0b0000000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 000000...........101.....1010111
-#define VFSUB_VF(vd, rs1, vs2, vm)        EMIT(R_type(0b0000100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 000010...........101.....1010111
-#define VFMIN_VF(vd, rs1, vs2, vm)        EMIT(R_type(0b0001000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 000100...........101.....1010111
-#define VFMAX_VF(vd, rs1, vs2, vm)        EMIT(R_type(0b0001100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 000110...........101.....1010111
-#define VFSGNJ_VF(vd, rs1, vs2, vm)       EMIT(R_type(0b0010000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 001000...........101.....1010111
-#define VFSGNJN_VF(vd, rs1, vs2, vm)      EMIT(R_type(0b0010010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 001001...........101.....1010111
-#define VFSGNJX_VF(vd, rs1, vs2, vm)      EMIT(R_type(0b0010100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 001010...........101.....1010111
-#define VFSLIDE1UP_VF(vd, rs1, vs2, vm)   EMIT(R_type(0b0011100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 001110...........101.....1010111
-#define VFSLIDE1DOWN_VF(vd, rs1, vs2, vm) EMIT(R_type(0b0011110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 001111...........101.....1010111
+#define VFADD_VF(vd, vs2, rs1, vm)        EMIT(R_type(0b0000000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 000000...........101.....1010111
+#define VFSUB_VF(vd, vs2, rs1, vm)        EMIT(R_type(0b0000100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 000010...........101.....1010111
+#define VFMIN_VF(vd, vs2, rs1, vm)        EMIT(R_type(0b0001000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 000100...........101.....1010111
+#define VFMAX_VF(vd, vs2, rs1, vm)        EMIT(R_type(0b0001100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 000110...........101.....1010111
+#define VFSGNJ_VF(vd, vs2, rs1, vm)       EMIT(R_type(0b0010000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 001000...........101.....1010111
+#define VFSGNJN_VF(vd, vs2, rs1, vm)      EMIT(R_type(0b0010010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 001001...........101.....1010111
+#define VFSGNJX_VF(vd, vs2, rs1, vm)      EMIT(R_type(0b0010100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 001010...........101.....1010111
+#define VFSLIDE1UP_VF(vd, vs2, rs1, vm)   EMIT(R_type(0b0011100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 001110...........101.....1010111
+#define VFSLIDE1DOWN_VF(vd, vs2, rs1, vm) EMIT(R_type(0b0011110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 001111...........101.....1010111
 
 #define VFMV_S_F(vd, rs1) EMIT(I_type(0b010000100000, rs1, 0b101, vd, 0b1010111)) // 010000100000.....101.....1010111
 #define VFMV_V_F(vd, rs1) EMIT(I_type(0b010111100000, rs1, 0b101, vd, 0b1010111)) // 010111100000.....101.....1010111
 
-#define VFMERGE_VFM(vd, rs1, vs2) EMIT(R_type(0b0101110, vs2, rs1, 0b101, vd, 0b1010111)) // 0101110..........101.....1010111
-
-#define VMFEQ_VF(vd, rs1, vs2, vm)    EMIT(R_type(0b0110000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 011000...........101.....1010111
-#define VMFLE_VF(vd, rs1, vs2, vm)    EMIT(R_type(0b0110010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 011001...........101.....1010111
-#define VMFLT_VF(vd, rs1, vs2, vm)    EMIT(R_type(0b0110110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 011011...........101.....1010111
-#define VMFNE_VF(vd, rs1, vs2, vm)    EMIT(R_type(0b0111000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 011100...........101.....1010111
-#define VMFGT_VF(vd, rs1, vs2, vm)    EMIT(R_type(0b0111010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 011101...........101.....1010111
-#define VMFGE_VF(vd, rs1, vs2, vm)    EMIT(R_type(0b0111110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 011111...........101.....1010111
-#define VFDIV_VF(vd, rs1, vs2, vm)    EMIT(R_type(0b1000000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 100000...........101.....1010111
-#define VFRDIV_VF(vd, rs1, vs2, vm)   EMIT(R_type(0b1000010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 100001...........101.....1010111
-#define VFMUL_VF(vd, rs1, vs2, vm)    EMIT(R_type(0b1001000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 100100...........101.....1010111
-#define VFRSUB_VF(vd, rs1, vs2, vm)   EMIT(R_type(0b1001110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 100111...........101.....1010111
-#define VFMADD_VF(vd, rs1, vs2, vm)   EMIT(R_type(0b1010000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101000...........101.....1010111
-#define VFNMADD_VF(vd, rs1, vs2, vm)  EMIT(R_type(0b1010010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101001...........101.....1010111
-#define VFMSUB_VF(vd, rs1, vs2, vm)   EMIT(R_type(0b1010100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101010...........101.....1010111
-#define VFNMSUB_VF(vd, rs1, vs2, vm)  EMIT(R_type(0b1010110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101011...........101.....1010111
-#define VFMACC_VF(vd, rs1, vs2, vm)   EMIT(R_type(0b1011000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101100...........101.....1010111
-#define VFNMACC_VF(vd, rs1, vs2, vm)  EMIT(R_type(0b1011010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101101...........101.....1010111
-#define VFMSAC_VF(vd, rs1, vs2, vm)   EMIT(R_type(0b1011100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101110...........101.....1010111
-#define VFNMSAC_VF(vd, rs1, vs2, vm)  EMIT(R_type(0b1011110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101111...........101.....1010111
-#define VFWADD_VF(vd, rs1, vs2, vm)   EMIT(R_type(0b1100000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 110000...........101.....1010111
-#define VFWSUB_VF(vd, rs1, vs2, vm)   EMIT(R_type(0b1100100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 110010...........101.....1010111
-#define VFWADD_WF(vd, rs1, vs2, vm)   EMIT(R_type(0b1101000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 110100...........101.....1010111
-#define VFWSUB_WF(vd, rs1, vs2, vm)   EMIT(R_type(0b1101100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 110110...........101.....1010111
-#define VFWMUL_VF(vd, rs1, vs2, vm)   EMIT(R_type(0b1110000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 111000...........101.....1010111
-#define VFWMACC_VF(vd, rs1, vs2, vm)  EMIT(R_type(0b1111000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 111100...........101.....1010111
-#define VFWNMACC_VF(vd, rs1, vs2, vm) EMIT(R_type(0b1111010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 111101...........101.....1010111
-#define VFWMSAC_VF(vd, rs1, vs2, vm)  EMIT(R_type(0b1111100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 111110...........101.....1010111
-#define VFWNMSAC_VF(vd, rs1, vs2, vm) EMIT(R_type(0b1111110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 111111...........101.....1010111
+#define VFMERGE_VFM(vd, vs2, rs1) EMIT(R_type(0b0101110, vs2, rs1, 0b101, vd, 0b1010111)) // 0101110..........101.....1010111
+
+#define VMFEQ_VF(vd, vs2, rs1, vm)    EMIT(R_type(0b0110000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 011000...........101.....1010111
+#define VMFLE_VF(vd, vs2, rs1, vm)    EMIT(R_type(0b0110010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 011001...........101.....1010111
+#define VMFLT_VF(vd, vs2, rs1, vm)    EMIT(R_type(0b0110110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 011011...........101.....1010111
+#define VMFNE_VF(vd, vs2, rs1, vm)    EMIT(R_type(0b0111000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 011100...........101.....1010111
+#define VMFGT_VF(vd, vs2, rs1, vm)    EMIT(R_type(0b0111010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 011101...........101.....1010111
+#define VMFGE_VF(vd, vs2, rs1, vm)    EMIT(R_type(0b0111110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 011111...........101.....1010111
+#define VFDIV_VF(vd, vs2, rs1, vm)    EMIT(R_type(0b1000000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 100000...........101.....1010111
+#define VFRDIV_VF(vd, vs2, rs1, vm)   EMIT(R_type(0b1000010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 100001...........101.....1010111
+#define VFMUL_VF(vd, vs2, rs1, vm)    EMIT(R_type(0b1001000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 100100...........101.....1010111
+#define VFRSUB_VF(vd, vs2, rs1, vm)   EMIT(R_type(0b1001110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 100111...........101.....1010111
+#define VFMADD_VF(vd, vs2, rs1, vm)   EMIT(R_type(0b1010000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101000...........101.....1010111
+#define VFNMADD_VF(vd, vs2, rs1, vm)  EMIT(R_type(0b1010010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101001...........101.....1010111
+#define VFMSUB_VF(vd, vs2, rs1, vm)   EMIT(R_type(0b1010100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101010...........101.....1010111
+#define VFNMSUB_VF(vd, vs2, rs1, vm)  EMIT(R_type(0b1010110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101011...........101.....1010111
+#define VFMACC_VF(vd, vs2, rs1, vm)   EMIT(R_type(0b1011000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101100...........101.....1010111
+#define VFNMACC_VF(vd, vs2, rs1, vm)  EMIT(R_type(0b1011010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101101...........101.....1010111
+#define VFMSAC_VF(vd, vs2, rs1, vm)   EMIT(R_type(0b1011100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101110...........101.....1010111
+#define VFNMSAC_VF(vd, vs2, rs1, vm)  EMIT(R_type(0b1011110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 101111...........101.....1010111
+#define VFWADD_VF(vd, vs2, rs1, vm)   EMIT(R_type(0b1100000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 110000...........101.....1010111
+#define VFWSUB_VF(vd, vs2, rs1, vm)   EMIT(R_type(0b1100100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 110010...........101.....1010111
+#define VFWADD_WF(vd, vs2, rs1, vm)   EMIT(R_type(0b1101000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 110100...........101.....1010111
+#define VFWSUB_WF(vd, vs2, rs1, vm)   EMIT(R_type(0b1101100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 110110...........101.....1010111
+#define VFWMUL_VF(vd, vs2, rs1, vm)   EMIT(R_type(0b1110000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 111000...........101.....1010111
+#define VFWMACC_VF(vd, vs2, rs1, vm)  EMIT(R_type(0b1111000 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 111100...........101.....1010111
+#define VFWNMACC_VF(vd, vs2, rs1, vm) EMIT(R_type(0b1111010 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 111101...........101.....1010111
+#define VFWMSAC_VF(vd, vs2, rs1, vm)  EMIT(R_type(0b1111100 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 111110...........101.....1010111
+#define VFWNMSAC_VF(vd, vs2, rs1, vm) EMIT(R_type(0b1111110 | (vm), vs2, rs1, 0b101, vd, 0b1010111)) // 111111...........101.....1010111
 
 //  OPFVV
-#define VFADD_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b0000000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000000...........001.....1010111
-#define VFREDUSUM_VS(vd, vs1, vs2, vm) EMIT(R_type(0b0000010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000001...........001.....1010111
-#define VFSUB_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b0000100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000010...........001.....1010111
-#define VFREDOSUM_VS(vd, vs1, vs2, vm) EMIT(R_type(0b0000110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000011...........001.....1010111
-#define VFMIN_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b0001000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000100...........001.....1010111
-#define VFREDMIN_VS(vd, vs1, vs2, vm)  EMIT(R_type(0b0001010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000101...........001.....1010111
-#define VFMAX_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b0001100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000110...........001.....1010111
-#define VFREDMAX_VS(vd, vs1, vs2, vm)  EMIT(R_type(0b0001110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000111...........001.....1010111
-#define VFSGNJ_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b0010000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 001000...........001.....1010111
-#define VFSGNJN_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b0010010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 001001...........001.....1010111
-#define VFSGNJX_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b0010100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 001010...........001.....1010111
+#define VFADD_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b0000000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000000...........001.....1010111
+#define VFREDUSUM_VS(vd, vs2, vs1, vm) EMIT(R_type(0b0000010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000001...........001.....1010111
+#define VFSUB_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b0000100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000010...........001.....1010111
+#define VFREDOSUM_VS(vd, vs2, vs1, vm) EMIT(R_type(0b0000110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000011...........001.....1010111
+#define VFMIN_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b0001000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000100...........001.....1010111
+#define VFREDMIN_VS(vd, vs2, vs1, vm)  EMIT(R_type(0b0001010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000101...........001.....1010111
+#define VFMAX_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b0001100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000110...........001.....1010111
+#define VFREDMAX_VS(vd, vs2, vs1, vm)  EMIT(R_type(0b0001110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 000111...........001.....1010111
+#define VFSGNJ_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b0010000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 001000...........001.....1010111
+#define VFSGNJN_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b0010010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 001001...........001.....1010111
+#define VFSGNJX_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b0010100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 001010...........001.....1010111
 
 #define VFMV_F_S(rd, vs2) EMIT(R_type(0b0100001, vs2, 0b00000, 0b001, rd, 0b1010111)) // 0100001.....00000001.....1010111
 
-#define VMFEQ_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b0110000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 011000...........001.....1010111
-#define VMFLE_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b0110010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 011001...........001.....1010111
-#define VMFLT_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b0110110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 011011...........001.....1010111
-#define VMFNE_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b0111000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 011100...........001.....1010111
-#define VFDIV_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b1000000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 100000...........001.....1010111
-#define VFMUL_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b1001000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 100100...........001.....1010111
-#define VFMADD_VV(vd, vs1, vs2, vm)  EMIT(R_type(0b1010000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101000...........001.....1010111
-#define VFNMADD_VV(vd, vs1, vs2, vm) EMIT(R_type(0b1010010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101001...........001.....1010111
-#define VFMSUB_VV(vd, vs1, vs2, vm)  EMIT(R_type(0b1010100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101010...........001.....1010111
-#define VFNMSUB_VV(vd, vs1, vs2, vm) EMIT(R_type(0b1010110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101011...........001.....1010111
-#define VFMACC_VV(vd, vs1, vs2, vm)  EMIT(R_type(0b1011000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101100...........001.....1010111
-#define VFNMACC_VV(vd, vs1, vs2, vm) EMIT(R_type(0b1011010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101101...........001.....1010111
-#define VFMSAC_VV(vd, vs1, vs2, vm)  EMIT(R_type(0b1011100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101110...........001.....1010111
-#define VFNMSAC_VV(vd, vs1, vs2, vm) EMIT(R_type(0b1011110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101111...........001.....1010111
+#define VMFEQ_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b0110000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 011000...........001.....1010111
+#define VMFLE_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b0110010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 011001...........001.....1010111
+#define VMFLT_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b0110110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 011011...........001.....1010111
+#define VMFNE_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b0111000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 011100...........001.....1010111
+#define VFDIV_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b1000000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 100000...........001.....1010111
+#define VFMUL_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b1001000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 100100...........001.....1010111
+#define VFMADD_VV(vd, vs2, vs1, vm)  EMIT(R_type(0b1010000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101000...........001.....1010111
+#define VFNMADD_VV(vd, vs2, vs1, vm) EMIT(R_type(0b1010010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101001...........001.....1010111
+#define VFMSUB_VV(vd, vs2, vs1, vm)  EMIT(R_type(0b1010100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101010...........001.....1010111
+#define VFNMSUB_VV(vd, vs2, vs1, vm) EMIT(R_type(0b1010110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101011...........001.....1010111
+#define VFMACC_VV(vd, vs2, vs1, vm)  EMIT(R_type(0b1011000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101100...........001.....1010111
+#define VFNMACC_VV(vd, vs2, vs1, vm) EMIT(R_type(0b1011010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101101...........001.....1010111
+#define VFMSAC_VV(vd, vs2, vs1, vm)  EMIT(R_type(0b1011100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101110...........001.....1010111
+#define VFNMSAC_VV(vd, vs2, vs1, vm) EMIT(R_type(0b1011110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 101111...........001.....1010111
 
 #define VFCVT_XU_F_V(vd, vs2, vm)      EMIT(R_type(0b0100100 | (vm), vs2, 0b00000, 0b001, vd, 0b1010111)) // 010010......00000001.....1010111
 #define VFCVT_X_F_V(vd, vs2, vm)       EMIT(R_type(0b0100100 | (vm), vs2, 0b00001, 0b001, vd, 0b1010111)) // 010010......00001001.....1010111
@@ -1448,147 +1446,147 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define VFREC7_V(vd, vs2, vm)          EMIT(R_type(0b0100110 | (vm), vs2, 0b00101, 0b001, vd, 0b1010111)) // 010011......00101001.....1010111
 #define VFCLASS_V(vd, vs2, vm)         EMIT(R_type(0b0100110 | (vm), vs2, 0b10000, 0b001, vd, 0b1010111)) // 010011......10000001.....1010111
 
-#define VFWADD_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b1100000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 110000...........001.....1010111
-#define VFWREDUSUM_VS(vd, vs1, vs2, vm) EMIT(R_type(0b1100010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 110001...........001.....1010111
-#define VFWSUB_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b1100100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 110010...........001.....1010111
-#define VFWREDOSUM_VS(vd, vs1, vs2, vm) EMIT(R_type(0b1100110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 110011...........001.....1010111
-#define VFWADD_WV(vd, vs1, vs2, vm)     EMIT(R_type(0b1101000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 110100...........001.....1010111
-#define VFWSUB_WV(vd, vs1, vs2, vm)     EMIT(R_type(0b1101100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 110110...........001.....1010111
-#define VFWMUL_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b1110000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 111000...........001.....1010111
-#define VFWMACC_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b1111000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 111100...........001.....1010111
-#define VFWNMACC_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b1111010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 111101...........001.....1010111
-#define VFWMSAC_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b1111100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 111110...........001.....1010111
-#define VFWNMSAC_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b1111110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 111111...........001.....1010111
+#define VFWADD_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b1100000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 110000...........001.....1010111
+#define VFWREDUSUM_VS(vd, vs2, vs1, vm) EMIT(R_type(0b1100010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 110001...........001.....1010111
+#define VFWSUB_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b1100100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 110010...........001.....1010111
+#define VFWREDOSUM_VS(vd, vs2, vs1, vm) EMIT(R_type(0b1100110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 110011...........001.....1010111
+#define VFWADD_WV(vd, vs2, vs1, vm)     EMIT(R_type(0b1101000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 110100...........001.....1010111
+#define VFWSUB_WV(vd, vs2, vs1, vm)     EMIT(R_type(0b1101100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 110110...........001.....1010111
+#define VFWMUL_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b1110000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 111000...........001.....1010111
+#define VFWMACC_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b1111000 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 111100...........001.....1010111
+#define VFWNMACC_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b1111010 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 111101...........001.....1010111
+#define VFWMSAC_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b1111100 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 111110...........001.....1010111
+#define VFWNMSAC_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b1111110 | (vm), vs2, vs1, 0b001, vd, 0b1010111)) // 111111...........001.....1010111
 
 //  OPIVX
-#define VADD_VX(vd, rs1, vs2, vm)       EMIT(R_type(0b0000000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000000...........100.....1010111
-#define VSUB_VX(vd, rs1, vs2, vm)       EMIT(R_type(0b0000100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000010...........100.....1010111
-#define VRSUB_VX(vd, rs1, vs2, vm)      EMIT(R_type(0b0000110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000011...........100.....1010111
-#define VMINU_VX(vd, rs1, vs2, vm)      EMIT(R_type(0b0001000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000100...........100.....1010111
-#define VMIN_VX(vd, rs1, vs2, vm)       EMIT(R_type(0b0001010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000101...........100.....1010111
-#define VMAXU_VX(vd, rs1, vs2, vm)      EMIT(R_type(0b0001100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000110...........100.....1010111
-#define VMAX_VX(vd, rs1, vs2, vm)       EMIT(R_type(0b0001110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000111...........100.....1010111
-#define VAND_VX(vd, rs1, vs2, vm)       EMIT(R_type(0b0010010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 001001...........100.....1010111
-#define VOR_VX(vd, rs1, vs2, vm)        EMIT(R_type(0b0010100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 001010...........100.....1010111
-#define VXOR_VX(vd, rs1, vs2, vm)       EMIT(R_type(0b0010110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 001011...........100.....1010111
-#define VRGATHER_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b0011000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 001100...........100.....1010111
-#define VSLIDEUP_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b0011100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 001110...........100.....1010111
-#define VSLIDEDOWN_VX(vd, rs1, vs2, vm) EMIT(R_type(0b0011110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 001111...........100.....1010111
-
-#define VADC_VXM(vd, rs1, vs2)   EMIT(R_type(0b0100000, vs2, rs1, 0b100, vd, 0b1010111)) // 0100000..........100.....1010111
-#define VMADC_VXM(vd, rs1, vs2)  EMIT(R_type(0b0100010, vs2, rs1, 0b100, vd, 0b1010111)) // 0100010..........100.....1010111
-#define VMADC_VX(vd, rs1, vs2)   EMIT(R_type(0b0100011, vs2, rs1, 0b100, vd, 0b1010111)) // 0100011..........100.....1010111
-#define VSBC_VXM(vd, rs1, vs2)   EMIT(R_type(0b0100100, vs2, rs1, 0b100, vd, 0b1010111)) // 0100100..........100.....1010111
-#define VMSBC_VXM(vd, rs1, vs2)  EMIT(R_type(0b0100110, vs2, rs1, 0b100, vd, 0b1010111)) // 0100110..........100.....1010111
-#define VMSBC_VX(vd, rs1, vs2)   EMIT(R_type(0b0100111, vs2, rs1, 0b100, vd, 0b1010111)) // 0100111..........100.....1010111
-#define VMERGE_VXM(vd, rs1, vs2) EMIT(R_type(0b0101110, vs2, rs1, 0b100, vd, 0b1010111)) // 0101110..........100.....1010111
+#define VADD_VX(vd, vs2, rs1, vm)       EMIT(R_type(0b0000000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000000...........100.....1010111
+#define VSUB_VX(vd, vs2, rs1, vm)       EMIT(R_type(0b0000100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000010...........100.....1010111
+#define VRSUB_VX(vd, vs2, rs1, vm)      EMIT(R_type(0b0000110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000011...........100.....1010111
+#define VMINU_VX(vd, vs2, rs1, vm)      EMIT(R_type(0b0001000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000100...........100.....1010111
+#define VMIN_VX(vd, vs2, rs1, vm)       EMIT(R_type(0b0001010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000101...........100.....1010111
+#define VMAXU_VX(vd, vs2, rs1, vm)      EMIT(R_type(0b0001100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000110...........100.....1010111
+#define VMAX_VX(vd, vs2, rs1, vm)       EMIT(R_type(0b0001110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 000111...........100.....1010111
+#define VAND_VX(vd, vs2, rs1, vm)       EMIT(R_type(0b0010010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 001001...........100.....1010111
+#define VOR_VX(vd, vs2, rs1, vm)        EMIT(R_type(0b0010100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 001010...........100.....1010111
+#define VXOR_VX(vd, vs2, rs1, vm)       EMIT(R_type(0b0010110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 001011...........100.....1010111
+#define VRGATHER_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b0011000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 001100...........100.....1010111
+#define VSLIDEUP_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b0011100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 001110...........100.....1010111
+#define VSLIDEDOWN_VX(vd, vs2, rs1, vm) EMIT(R_type(0b0011110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 001111...........100.....1010111
+
+#define VADC_VXM(vd, vs2, rs1)   EMIT(R_type(0b0100000, vs2, rs1, 0b100, vd, 0b1010111)) // 0100000..........100.....1010111
+#define VMADC_VXM(vd, vs2, rs1)  EMIT(R_type(0b0100010, vs2, rs1, 0b100, vd, 0b1010111)) // 0100010..........100.....1010111
+#define VMADC_VX(vd, vs2, rs1)   EMIT(R_type(0b0100011, vs2, rs1, 0b100, vd, 0b1010111)) // 0100011..........100.....1010111
+#define VSBC_VXM(vd, vs2, rs1)   EMIT(R_type(0b0100100, vs2, rs1, 0b100, vd, 0b1010111)) // 0100100..........100.....1010111
+#define VMSBC_VXM(vd, vs2, rs1)  EMIT(R_type(0b0100110, vs2, rs1, 0b100, vd, 0b1010111)) // 0100110..........100.....1010111
+#define VMSBC_VX(vd, vs2, rs1)   EMIT(R_type(0b0100111, vs2, rs1, 0b100, vd, 0b1010111)) // 0100111..........100.....1010111
+#define VMERGE_VXM(vd, vs2, rs1) EMIT(R_type(0b0101110, vs2, rs1, 0b100, vd, 0b1010111)) // 0101110..........100.....1010111
 
 #define VMV_V_X(vd, rs1) EMIT(I_type(0b010111100000, rs1, 0b100, vd, 0b1010111)) // 010111100000.....100.....1010111
 
-#define VMSEQ_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b0110000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011000...........100.....1010111
-#define VMSNE_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b0110010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011001...........100.....1010111
-#define VMSLTU_VX(vd, rs1, vs2, vm)  EMIT(R_type(0b0110100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011010...........100.....1010111
-#define VMSLT_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b0110110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011011...........100.....1010111
-#define VMSLEU_VX(vd, rs1, vs2, vm)  EMIT(R_type(0b0111000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011100...........100.....1010111
-#define VMSLE_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b0111010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011101...........100.....1010111
-#define VMSGTU_VX(vd, rs1, vs2, vm)  EMIT(R_type(0b0111100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011110...........100.....1010111
-#define VMSGT_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b0111110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011111...........100.....1010111
-#define VSADDU_VX(vd, rs1, vs2, vm)  EMIT(R_type(0b1000000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 100000...........100.....1010111
-#define VSADD_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b1000010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 100001...........100.....1010111
-#define VSSUBU_VX(vd, rs1, vs2, vm)  EMIT(R_type(0b1000100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 100010...........100.....1010111
-#define VSSUB_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b1000110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 100011...........100.....1010111
-#define VSLL_VX(vd, rs1, vs2, vm)    EMIT(R_type(0b1001010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 100101...........100.....1010111
-#define VSMUL_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b1001110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 100111...........100.....1010111
-#define VSRL_VX(vd, rs1, vs2, vm)    EMIT(R_type(0b1010000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101000...........100.....1010111
-#define VSRA_VX(vd, rs1, vs2, vm)    EMIT(R_type(0b1010010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101001...........100.....1010111
-#define VSSRL_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b1010100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101010...........100.....1010111
-#define VSSRA_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b1010110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101011...........100.....1010111
-#define VNSRL_WX(vd, rs1, vs2, vm)   EMIT(R_type(0b1011000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101100...........100.....1010111
-#define VNSRA_WX(vd, rs1, vs2, vm)   EMIT(R_type(0b1011010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101101...........100.....1010111
-#define VNCLIPU_WX(vd, rs1, vs2, vm) EMIT(R_type(0b1011100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101110...........100.....1010111
-#define VNCLIP_WX(vd, rs1, vs2, vm)  EMIT(R_type(0b1011110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101111...........100.....1010111
+#define VMSEQ_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b0110000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011000...........100.....1010111
+#define VMSNE_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b0110010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011001...........100.....1010111
+#define VMSLTU_VX(vd, vs2, rs1, vm)  EMIT(R_type(0b0110100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011010...........100.....1010111
+#define VMSLT_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b0110110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011011...........100.....1010111
+#define VMSLEU_VX(vd, vs2, rs1, vm)  EMIT(R_type(0b0111000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011100...........100.....1010111
+#define VMSLE_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b0111010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011101...........100.....1010111
+#define VMSGTU_VX(vd, vs2, rs1, vm)  EMIT(R_type(0b0111100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011110...........100.....1010111
+#define VMSGT_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b0111110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 011111...........100.....1010111
+#define VSADDU_VX(vd, vs2, rs1, vm)  EMIT(R_type(0b1000000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 100000...........100.....1010111
+#define VSADD_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b1000010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 100001...........100.....1010111
+#define VSSUBU_VX(vd, vs2, rs1, vm)  EMIT(R_type(0b1000100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 100010...........100.....1010111
+#define VSSUB_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b1000110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 100011...........100.....1010111
+#define VSLL_VX(vd, vs2, rs1, vm)    EMIT(R_type(0b1001010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 100101...........100.....1010111
+#define VSMUL_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b1001110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 100111...........100.....1010111
+#define VSRL_VX(vd, vs2, rs1, vm)    EMIT(R_type(0b1010000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101000...........100.....1010111
+#define VSRA_VX(vd, vs2, rs1, vm)    EMIT(R_type(0b1010010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101001...........100.....1010111
+#define VSSRL_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b1010100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101010...........100.....1010111
+#define VSSRA_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b1010110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101011...........100.....1010111
+#define VNSRL_WX(vd, vs2, rs1, vm)   EMIT(R_type(0b1011000 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101100...........100.....1010111
+#define VNSRA_WX(vd, vs2, rs1, vm)   EMIT(R_type(0b1011010 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101101...........100.....1010111
+#define VNCLIPU_WX(vd, vs2, rs1, vm) EMIT(R_type(0b1011100 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101110...........100.....1010111
+#define VNCLIP_WX(vd, vs2, rs1, vm)  EMIT(R_type(0b1011110 | (vm), vs2, rs1, 0b100, vd, 0b1010111)) // 101111...........100.....1010111
 
 //  OPIVV
-#define VADD_VV(vd, vs1, vs2, vm)         EMIT(R_type(0b0000000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 000000...........000.....1010111
-#define VSUB_VV(vd, vs1, vs2, vm)         EMIT(R_type(0b0000100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 000010...........000.....1010111
-#define VMINU_VV(vd, vs1, vs2, vm)        EMIT(R_type(0b0001000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 000100...........000.....1010111
-#define VMIN_VV(vd, vs1, vs2, vm)         EMIT(R_type(0b0001010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 000101...........000.....1010111
-#define VMAXU_VV(vd, vs1, vs2, vm)        EMIT(R_type(0b0001100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 000110...........000.....1010111
-#define VMAX_VV(vd, vs1, vs2, vm)         EMIT(R_type(0b0001110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 000111...........000.....1010111
-#define VAND_VV(vd, vs1, vs2, vm)         EMIT(R_type(0b0010010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 001001...........000.....1010111
-#define VOR_VV(vd, vs1, vs2, vm)          EMIT(R_type(0b0010100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 001010...........000.....1010111
-#define VXOR_VV(vd, vs1, vs2, vm)         EMIT(R_type(0b0010110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 001011...........000.....1010111
-#define VRGATHER_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b0011000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 001100...........000.....1010111
-#define VRGATHEREI16_VV(vd, vs1, vs2, vm) EMIT(R_type(0b0011100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 001110...........000.....1010111
-
-#define VADC_VVM(vd, vs1, vs2)   EMIT(R_type(0b0100000, vs2, vs1, 0b000, vd, 0b1010111)) // 0100000..........000.....1010111
-#define VMADC_VVM(vd, vs1, vs2)  EMIT(R_type(0b0100010, vs2, vs1, 0b000, vd, 0b1010111)) // 0100010..........000.....1010111
-#define VMADC_VV(vd, vs1, vs2)   EMIT(R_type(0b0100011, vs2, vs1, 0b000, vd, 0b1010111)) // 0100011..........000.....1010111
-#define VSBC_VVM(vd, vs1, vs2)   EMIT(R_type(0b0100100, vs2, vs1, 0b000, vd, 0b1010111)) // 0100100..........000.....1010111
-#define VMSBC_VVM(vd, vs1, vs2)  EMIT(R_type(0b0100110, vs2, vs1, 0b000, vd, 0b1010111)) // 0100110..........000.....1010111
-#define VMSBC_VV(vd, vs1, vs2)   EMIT(R_type(0b0100111, vs2, vs1, 0b000, vd, 0b1010111)) // 0100111..........000.....1010111
-#define VMERGE_VVM(vd, vs1, vs2) EMIT(R_type(0b0101110, vs2, vs1, 0b000, vd, 0b1010111)) // 0101110..........000.....1010111
+#define VADD_VV(vd, vs2, vs1, vm)         EMIT(R_type(0b0000000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 000000...........000.....1010111
+#define VSUB_VV(vd, vs2, vs1, vm)         EMIT(R_type(0b0000100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 000010...........000.....1010111
+#define VMINU_VV(vd, vs2, vs1, vm)        EMIT(R_type(0b0001000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 000100...........000.....1010111
+#define VMIN_VV(vd, vs2, vs1, vm)         EMIT(R_type(0b0001010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 000101...........000.....1010111
+#define VMAXU_VV(vd, vs2, vs1, vm)        EMIT(R_type(0b0001100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 000110...........000.....1010111
+#define VMAX_VV(vd, vs2, vs1, vm)         EMIT(R_type(0b0001110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 000111...........000.....1010111
+#define VAND_VV(vd, vs2, vs1, vm)         EMIT(R_type(0b0010010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 001001...........000.....1010111
+#define VOR_VV(vd, vs2, vs1, vm)          EMIT(R_type(0b0010100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 001010...........000.....1010111
+#define VXOR_VV(vd, vs2, vs1, vm)         EMIT(R_type(0b0010110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 001011...........000.....1010111
+#define VRGATHER_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b0011000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 001100...........000.....1010111
+#define VRGATHEREI16_VV(vd, vs2, vs1, vm) EMIT(R_type(0b0011100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 001110...........000.....1010111
+
+#define VADC_VVM(vd, vs2, vs1)   EMIT(R_type(0b0100000, vs2, vs1, 0b000, vd, 0b1010111)) // 0100000..........000.....1010111
+#define VMADC_VVM(vd, vs2, vs1)  EMIT(R_type(0b0100010, vs2, vs1, 0b000, vd, 0b1010111)) // 0100010..........000.....1010111
+#define VMADC_VV(vd, vs2, vs1)   EMIT(R_type(0b0100011, vs2, vs1, 0b000, vd, 0b1010111)) // 0100011..........000.....1010111
+#define VSBC_VVM(vd, vs2, vs1)   EMIT(R_type(0b0100100, vs2, vs1, 0b000, vd, 0b1010111)) // 0100100..........000.....1010111
+#define VMSBC_VVM(vd, vs2, vs1)  EMIT(R_type(0b0100110, vs2, vs1, 0b000, vd, 0b1010111)) // 0100110..........000.....1010111
+#define VMSBC_VV(vd, vs2, vs1)   EMIT(R_type(0b0100111, vs2, vs1, 0b000, vd, 0b1010111)) // 0100111..........000.....1010111
+#define VMERGE_VVM(vd, vs2, vs1) EMIT(R_type(0b0101110, vs2, vs1, 0b000, vd, 0b1010111)) // 0101110..........000.....1010111
 
 #define VMV_V_V(vd, vs1) EMIT(I_type(0b010111100000, vs1, 0b000, vd, 0b1010111)) // 010111100000.....000.....1010111
 
-#define VMSEQ_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b0110000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 011000...........000.....1010111
-#define VMSNE_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b0110010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 011001...........000.....1010111
-#define VMSLTU_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b0110100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 011010...........000.....1010111
-#define VMSLT_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b0110110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 011011...........000.....1010111
-#define VMSLEU_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b0111000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 011100...........000.....1010111
-#define VMSLE_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b0111010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 011101...........000.....1010111
-#define VSADDU_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b1000000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 100000...........000.....1010111
-#define VSADD_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b1000010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 100001...........000.....1010111
-#define VSSUBU_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b1000100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 100010...........000.....1010111
-#define VSSUB_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b1000110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 100011...........000.....1010111
-#define VSLL_VV(vd, vs1, vs2, vm)      EMIT(R_type(0b1001010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 100101...........000.....1010111
-#define VSMUL_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b1001110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 100111...........000.....1010111
-#define VSRL_VV(vd, vs1, vs2, vm)      EMIT(R_type(0b1010000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101000...........000.....1010111
-#define VSRA_VV(vd, vs1, vs2, vm)      EMIT(R_type(0b1010010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101001...........000.....1010111
-#define VSSRL_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b1010100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101010...........000.....1010111
-#define VSSRA_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b1010110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101011...........000.....1010111
-#define VNSRL_WV(vd, vs1, vs2, vm)     EMIT(R_type(0b1011000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101100...........000.....1010111
-#define VNSRA_WV(vd, vs1, vs2, vm)     EMIT(R_type(0b1011010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101101...........000.....1010111
-#define VNCLIPU_WV(vd, vs1, vs2, vm)   EMIT(R_type(0b1011100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101110...........000.....1010111
-#define VNCLIP_WV(vd, vs1, vs2, vm)    EMIT(R_type(0b1011110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101111...........000.....1010111
-#define VWREDSUMU_VS(vd, vs1, vs2, vm) EMIT(R_type(0b1100000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 110000...........000.....1010111
-#define VWREDSUM_VS(vd, vs1, vs2, vm)  EMIT(R_type(0b1100010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 110001...........000.....1010111
+#define VMSEQ_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b0110000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 011000...........000.....1010111
+#define VMSNE_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b0110010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 011001...........000.....1010111
+#define VMSLTU_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b0110100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 011010...........000.....1010111
+#define VMSLT_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b0110110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 011011...........000.....1010111
+#define VMSLEU_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b0111000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 011100...........000.....1010111
+#define VMSLE_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b0111010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 011101...........000.....1010111
+#define VSADDU_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b1000000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 100000...........000.....1010111
+#define VSADD_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b1000010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 100001...........000.....1010111
+#define VSSUBU_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b1000100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 100010...........000.....1010111
+#define VSSUB_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b1000110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 100011...........000.....1010111
+#define VSLL_VV(vd, vs2, vs1, vm)      EMIT(R_type(0b1001010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 100101...........000.....1010111
+#define VSMUL_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b1001110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 100111...........000.....1010111
+#define VSRL_VV(vd, vs2, vs1, vm)      EMIT(R_type(0b1010000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101000...........000.....1010111
+#define VSRA_VV(vd, vs2, vs1, vm)      EMIT(R_type(0b1010010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101001...........000.....1010111
+#define VSSRL_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b1010100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101010...........000.....1010111
+#define VSSRA_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b1010110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101011...........000.....1010111
+#define VNSRL_WV(vd, vs2, vs1, vm)     EMIT(R_type(0b1011000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101100...........000.....1010111
+#define VNSRA_WV(vd, vs2, vs1, vm)     EMIT(R_type(0b1011010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101101...........000.....1010111
+#define VNCLIPU_WV(vd, vs2, vs1, vm)   EMIT(R_type(0b1011100 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101110...........000.....1010111
+#define VNCLIP_WV(vd, vs2, vs1, vm)    EMIT(R_type(0b1011110 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 101111...........000.....1010111
+#define VWREDSUMU_VS(vd, vs2, vs1, vm) EMIT(R_type(0b1100000 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 110000...........000.....1010111
+#define VWREDSUM_VS(vd, vs2, vs1, vm)  EMIT(R_type(0b1100010 | (vm), vs2, vs1, 0b000, vd, 0b1010111)) // 110001...........000.....1010111
 
 //  OPIVI
-#define VADD_VI(vd, simm5, vs2, vm)       EMIT(R_type(0b0000000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 000000...........011.....1010111
-#define VRSUB_VI(vd, simm5, vs2, vm)      EMIT(R_type(0b0000110 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 000011...........011.....1010111
-#define VAND_VI(vd, simm5, vs2, vm)       EMIT(R_type(0b0010010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 001001...........011.....1010111
-#define VOR_VI(vd, simm5, vs2, vm)        EMIT(R_type(0b0010100 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 001010...........011.....1010111
-#define VXOR_VI(vd, simm5, vs2, vm)       EMIT(R_type(0b0010110 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 001011...........011.....1010111
-#define VRGATHER_VI(vd, simm5, vs2, vm)   EMIT(R_type(0b0011000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 001100...........011.....1010111
-#define VSLIDEUP_VI(vd, simm5, vs2, vm)   EMIT(R_type(0b0011100 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 001110...........011.....1010111
-#define VSLIDEDOWN_VI(vd, simm5, vs2, vm) EMIT(R_type(0b0011110 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 001111...........011.....1010111
-
-#define VADC_VIM(vd, simm5, vs2)   EMIT(R_type(0b0100000, vs2, simm5, 0b011, vd, 0b1010111)) // 0100000..........011.....1010111
-#define VMADC_VIM(vd, simm5, vs2)  EMIT(R_type(0b0100010, vs2, simm5, 0b011, vd, 0b1010111)) // 0100010..........011.....1010111
-#define VMADC_VI(vd, simm5, vs2)   EMIT(R_type(0b0100011, vs2, simm5, 0b011, vd, 0b1010111)) // 0100011..........011.....1010111
-#define VMERGE_VIM(vd, simm5, vs2) EMIT(R_type(0b0101110, vs2, simm5, 0b011, vd, 0b1010111)) // 0101110..........011.....1010111
+#define VADD_VI(vd, vs2, simm5, vm)       EMIT(R_type(0b0000000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 000000...........011.....1010111
+#define VRSUB_VI(vd, vs2, simm5, vm)      EMIT(R_type(0b0000110 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 000011...........011.....1010111
+#define VAND_VI(vd, vs2, simm5, vm)       EMIT(R_type(0b0010010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 001001...........011.....1010111
+#define VOR_VI(vd, vs2, simm5, vm)        EMIT(R_type(0b0010100 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 001010...........011.....1010111
+#define VXOR_VI(vd, vs2, simm5, vm)       EMIT(R_type(0b0010110 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 001011...........011.....1010111
+#define VRGATHER_VI(vd, vs2, simm5, vm)   EMIT(R_type(0b0011000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 001100...........011.....1010111
+#define VSLIDEUP_VI(vd, vs2, simm5, vm)   EMIT(R_type(0b0011100 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 001110...........011.....1010111
+#define VSLIDEDOWN_VI(vd, vs2, simm5, vm) EMIT(R_type(0b0011110 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 001111...........011.....1010111
+
+#define VADC_VIM(vd, vs2, simm5)   EMIT(R_type(0b0100000, vs2, simm5, 0b011, vd, 0b1010111)) // 0100000..........011.....1010111
+#define VMADC_VIM(vd, vs2, simm5)  EMIT(R_type(0b0100010, vs2, simm5, 0b011, vd, 0b1010111)) // 0100010..........011.....1010111
+#define VMADC_VI(vd, vs2, simm5)   EMIT(R_type(0b0100011, vs2, simm5, 0b011, vd, 0b1010111)) // 0100011..........011.....1010111
+#define VMERGE_VIM(vd, vs2, simm5) EMIT(R_type(0b0101110, vs2, simm5, 0b011, vd, 0b1010111)) // 0101110..........011.....1010111
 
 #define VMV_V_I(vd, simm5) EMIT(I_type(0b010111100000, simm5, 0b011, vd, 0b1010111)) // 010111100000.....011.....1010111
 
-#define VMSEQ_VI(vd, simm5, vs2, vm)  EMIT(R_type(0b0110000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 011000...........011.....1010111
-#define VMSNE_VI(vd, simm5, vs2, vm)  EMIT(R_type(0b0110010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 011001...........011.....1010111
-#define VMSLEU_VI(vd, simm5, vs2, vm) EMIT(R_type(0b0111000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 011100...........011.....1010111
-#define VMSLE_VI(vd, simm5, vs2, vm)  EMIT(R_type(0b0111010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 011101...........011.....1010111
-#define VMSGTU_VI(vd, simm5, vs2, vm) EMIT(R_type(0b0111100 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 011110...........011.....1010111
-#define VMSGT_VI(vd, simm5, vs2, vm)  EMIT(R_type(0b0111110 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 011111...........011.....1010111
-
-#define VSADDU_VI(vd, simm5, vs2, vm)  EMIT(R_type(0b1000000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 100000...........011.....1010111
-#define VSADD_VI(vd, simm5, vs2, vm)   EMIT(R_type(0b1000010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 100001...........011.....1010111
-#define VSLL_VI(vd, simm5, vs2, vm)    EMIT(R_type(0b1001010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 100101...........011.....1010111
-#define VSRL_VI(vd, simm5, vs2, vm)    EMIT(R_type(0b1010000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101000...........011.....1010111
-#define VSRA_VI(vd, simm5, vs2, vm)    EMIT(R_type(0b1010010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101001...........011.....1010111
-#define VSSRL_VI(vd, simm5, vs2, vm)   EMIT(R_type(0b1010100 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101010...........011.....1010111
-#define VSSRA_VI(vd, simm5, vs2, vm)   EMIT(R_type(0b1010110 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101011...........011.....1010111
-#define VNSRL_WI(vd, simm5, vs2, vm)   EMIT(R_type(0b1011000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101100...........011.....1010111
-#define VNSRA_WI(vd, simm5, vs2, vm)   EMIT(R_type(0b1011010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101101...........011.....1010111
-#define VNCLIPU_WI(vd, simm5, vs2, vm) EMIT(R_type(0b1011100 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101110...........011.....1010111
-#define VNCLIP_WI(vd, simm5, vs2, vm)  EMIT(R_type(0b1011110 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101111...........011.....1010111
+#define VMSEQ_VI(vd, vs2, simm5, vm)  EMIT(R_type(0b0110000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 011000...........011.....1010111
+#define VMSNE_VI(vd, vs2, simm5, vm)  EMIT(R_type(0b0110010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 011001...........011.....1010111
+#define VMSLEU_VI(vd, vs2, simm5, vm) EMIT(R_type(0b0111000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 011100...........011.....1010111
+#define VMSLE_VI(vd, vs2, simm5, vm)  EMIT(R_type(0b0111010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 011101...........011.....1010111
+#define VMSGTU_VI(vd, vs2, simm5, vm) EMIT(R_type(0b0111100 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 011110...........011.....1010111
+#define VMSGT_VI(vd, vs2, simm5, vm)  EMIT(R_type(0b0111110 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 011111...........011.....1010111
+
+#define VSADDU_VI(vd, vs2, simm5, vm)  EMIT(R_type(0b1000000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 100000...........011.....1010111
+#define VSADD_VI(vd, vs2, simm5, vm)   EMIT(R_type(0b1000010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 100001...........011.....1010111
+#define VSLL_VI(vd, vs2, simm5, vm)    EMIT(R_type(0b1001010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 100101...........011.....1010111
+#define VSRL_VI(vd, vs2, simm5, vm)    EMIT(R_type(0b1010000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101000...........011.....1010111
+#define VSRA_VI(vd, vs2, simm5, vm)    EMIT(R_type(0b1010010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101001...........011.....1010111
+#define VSSRL_VI(vd, vs2, simm5, vm)   EMIT(R_type(0b1010100 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101010...........011.....1010111
+#define VSSRA_VI(vd, vs2, simm5, vm)   EMIT(R_type(0b1010110 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101011...........011.....1010111
+#define VNSRL_WI(vd, vs2, simm5, vm)   EMIT(R_type(0b1011000 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101100...........011.....1010111
+#define VNSRA_WI(vd, vs2, simm5, vm)   EMIT(R_type(0b1011010 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101101...........011.....1010111
+#define VNCLIPU_WI(vd, vs2, simm5, vm) EMIT(R_type(0b1011100 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101110...........011.....1010111
+#define VNCLIP_WI(vd, vs2, simm5, vm)  EMIT(R_type(0b1011110 | (vm), vs2, simm5, 0b011, vd, 0b1010111)) // 101111...........011.....1010111
 
 #define VMV1R_V(vd, vs2) EMIT(R_type(0b1001111, vs2, 0b00000, 0b011, vd, 0b1010111)) // 1001111.....00000011.....1010111
 #define VMV2R_V(vd, vs2) EMIT(R_type(0b1001111, vs2, 0b00001, 0b011, vd, 0b1010111)) // 1001111.....00001011.....1010111
@@ -1596,18 +1594,18 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define VMV8R_V(vd, vs2) EMIT(R_type(0b1001111, vs2, 0b00111, 0b011, vd, 0b1010111)) // 1001111.....00111011.....1010111
 
 //  OPMVV
-#define VREDSUM_VS(vd, vs1, vs2, vm)  EMIT(R_type(0b0000000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000000...........010.....1010111
-#define VREDAND_VS(vd, vs1, vs2, vm)  EMIT(R_type(0b0000010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000001...........010.....1010111
-#define VREDOR_VS(vd, vs1, vs2, vm)   EMIT(R_type(0b0000100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000010...........010.....1010111
-#define VREDXOR_VS(vd, vs1, vs2, vm)  EMIT(R_type(0b0000110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000011...........010.....1010111
-#define VREDMINU_VS(vd, vs1, vs2, vm) EMIT(R_type(0b0001000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000100...........010.....1010111
-#define VREDMIN_VS(vd, vs1, vs2, vm)  EMIT(R_type(0b0001010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000101...........010.....1010111
-#define VREDMAXU_VS(vd, vs1, vs2, vm) EMIT(R_type(0b0001100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000110...........010.....1010111
-#define VREDMAX_VS(vd, vs1, vs2, vm)  EMIT(R_type(0b0001110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000111...........010.....1010111
-#define VAADDU_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b0010000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 001000...........010.....1010111
-#define VAADD_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b0010010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 001001...........010.....1010111
-#define VASUBU_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b0010100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 001010...........010.....1010111
-#define VASUB_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b0010110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 001011...........010.....1010111
+#define VREDSUM_VS(vd, vs2, vs1, vm)  EMIT(R_type(0b0000000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000000...........010.....1010111
+#define VREDAND_VS(vd, vs2, vs1, vm)  EMIT(R_type(0b0000010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000001...........010.....1010111
+#define VREDOR_VS(vd, vs2, vs1, vm)   EMIT(R_type(0b0000100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000010...........010.....1010111
+#define VREDXOR_VS(vd, vs2, vs1, vm)  EMIT(R_type(0b0000110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000011...........010.....1010111
+#define VREDMINU_VS(vd, vs2, vs1, vm) EMIT(R_type(0b0001000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000100...........010.....1010111
+#define VREDMIN_VS(vd, vs2, vs1, vm)  EMIT(R_type(0b0001010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000101...........010.....1010111
+#define VREDMAXU_VS(vd, vs2, vs1, vm) EMIT(R_type(0b0001100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000110...........010.....1010111
+#define VREDMAX_VS(vd, vs2, vs1, vm)  EMIT(R_type(0b0001110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 000111...........010.....1010111
+#define VAADDU_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b0010000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 001000...........010.....1010111
+#define VAADD_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b0010010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 001001...........010.....1010111
+#define VASUBU_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b0010100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 001010...........010.....1010111
+#define VASUB_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b0010110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 001011...........010.....1010111
 
 #define VMV_X_S(rd, vs2) EMIT(R_type(0b0100001, vs2, 0b00000, 0b010, rd, 0b1010111)) // 0100001.....00000010.....1010111
 
@@ -1621,15 +1619,15 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define VZEXT_VF2(vd, vs2, vm) EMIT(R_type(0b0100100 | (vm), vs2, 0b00110, 0b010, vd, 0b1010111)) // 010010......00110010.....1010111
 #define VSEXT_VF2(vd, vs2, vm) EMIT(R_type(0b0100100 | (vm), vs2, 0b00111, 0b010, vd, 0b1010111)) // 010010......00111010.....1010111
 
-#define VCOMPRESS_VM(vd, vs1, vs2) EMIT(R_type(0b0101111, vs2, vs1, 0b010, vd, 0b1010111)) // 0101111..........010.....1010111
-#define VMANDN_MM(vd, vs1, vs2)    EMIT(R_type(0b0110001, vs2, vs1, 0b010, vd, 0b1010111)) // 0110001..........010.....1010111
-#define VMAND_MM(vd, vs1, vs2)     EMIT(R_type(0b0110011, vs2, vs1, 0b010, vd, 0b1010111)) // 0110011..........010.....1010111
-#define VMOR_MM(vd, vs1, vs2)      EMIT(R_type(0b0110101, vs2, vs1, 0b010, vd, 0b1010111)) // 0110101..........010.....1010111
-#define VMXOR_MM(vd, vs1, vs2)     EMIT(R_type(0b0110111, vs2, vs1, 0b010, vd, 0b1010111)) // 0110111..........010.....1010111
-#define VMORN_MM(vd, vs1, vs2)     EMIT(R_type(0b0111001, vs2, vs1, 0b010, vd, 0b1010111)) // 0111001..........010.....1010111
-#define VMNAND_MM(vd, vs1, vs2)    EMIT(R_type(0b0111011, vs2, vs1, 0b010, vd, 0b1010111)) // 0111011..........010.....1010111
-#define VMNOR_MM(vd, vs1, vs2)     EMIT(R_type(0b0111101, vs2, vs1, 0b010, vd, 0b1010111)) // 0111101..........010.....1010111
-#define VMXNOR_MM(vd, vs1, vs2)    EMIT(R_type(0b0111111, vs2, vs1, 0b010, vd, 0b1010111)) // 0111111..........010.....1010111
+#define VCOMPRESS_VM(vd, vs2, vs1) EMIT(R_type(0b0101111, vs2, vs1, 0b010, vd, 0b1010111)) // 0101111..........010.....1010111
+#define VMANDN_MM(vd, vs2, vs1)    EMIT(R_type(0b0110001, vs2, vs1, 0b010, vd, 0b1010111)) // 0110001..........010.....1010111
+#define VMAND_MM(vd, vs2, vs1)     EMIT(R_type(0b0110011, vs2, vs1, 0b010, vd, 0b1010111)) // 0110011..........010.....1010111
+#define VMOR_MM(vd, vs2, vs1)      EMIT(R_type(0b0110101, vs2, vs1, 0b010, vd, 0b1010111)) // 0110101..........010.....1010111
+#define VMXOR_MM(vd, vs2, vs1)     EMIT(R_type(0b0110111, vs2, vs1, 0b010, vd, 0b1010111)) // 0110111..........010.....1010111
+#define VMORN_MM(vd, vs2, vs1)     EMIT(R_type(0b0111001, vs2, vs1, 0b010, vd, 0b1010111)) // 0111001..........010.....1010111
+#define VMNAND_MM(vd, vs2, vs1)    EMIT(R_type(0b0111011, vs2, vs1, 0b010, vd, 0b1010111)) // 0111011..........010.....1010111
+#define VMNOR_MM(vd, vs2, vs1)     EMIT(R_type(0b0111101, vs2, vs1, 0b010, vd, 0b1010111)) // 0111101..........010.....1010111
+#define VMXNOR_MM(vd, vs2, vs1)    EMIT(R_type(0b0111111, vs2, vs1, 0b010, vd, 0b1010111)) // 0111111..........010.....1010111
 
 #define VMSBF_M(vd, vs2, vm)  EMIT(R_type(0b0101000 | (vm), vs2, 0b00001, 0b010, vd, 0b1010111)) // 010100......00001010.....1010111
 #define VMSOF_M(vd, vs2, vm)  EMIT(R_type(0b0101000 | (vm), vs2, 0b00010, 0b010, vd, 0b1010111)) // 010100......00010010.....1010111
@@ -1640,69 +1638,69 @@ f28–31  ft8–11  FP temporaries                  Caller
 
 #define VID_V(vd, vm) EMIT(R_type(0b0101000 | (vm), 0b00000, 0b10001, 0b010, vd, 0b1010111)) // 010100.0000010001010.....1010111
 
-#define VDIVU_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b1000000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100000...........010.....1010111
-#define VDIV_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b1000010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100001...........010.....1010111
-#define VREMU_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b1000100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100010...........010.....1010111
-#define VREM_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b1000110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100011...........010.....1010111
-#define VMULHU_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b1001000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100100...........010.....1010111
-#define VMUL_VV(vd, vs1, vs2, vm)     EMIT(R_type(0b1001010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100101...........010.....1010111
-#define VMULHSU_VV(vd, vs1, vs2, vm)  EMIT(R_type(0b1001100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100110...........010.....1010111
-#define VMULH_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b1001110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100111...........010.....1010111
-#define VMADD_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b1010010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 101001...........010.....1010111
-#define VNMSUB_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b1010110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 101011...........010.....1010111
-#define VMACC_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b1011010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 101101...........010.....1010111
-#define VNMSAC_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b1011110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 101111...........010.....1010111
-#define VWADDU_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b1100000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110000...........010.....1010111
-#define VWADD_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b1100010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110001...........010.....1010111
-#define VWSUBU_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b1100100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110010...........010.....1010111
-#define VWSUB_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b1100110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110011...........010.....1010111
-#define VWADDU_WV(vd, vs1, vs2, vm)   EMIT(R_type(0b1101000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110100...........010.....1010111
-#define VWADD_WV(vd, vs1, vs2, vm)    EMIT(R_type(0b1101010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110101...........010.....1010111
-#define VWSUBU_WV(vd, vs1, vs2, vm)   EMIT(R_type(0b1101100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110110...........010.....1010111
-#define VWSUB_WV(vd, vs1, vs2, vm)    EMIT(R_type(0b1101110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110111...........010.....1010111
-#define VWMULU_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b1110000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 111000...........010.....1010111
-#define VWMULSU_VV(vd, vs1, vs2, vm)  EMIT(R_type(0b1110100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 111010...........010.....1010111
-#define VWMUL_VV(vd, vs1, vs2, vm)    EMIT(R_type(0b1110110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 111011...........010.....1010111
-#define VWMACCU_VV(vd, vs1, vs2, vm)  EMIT(R_type(0b1111000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 111100...........010.....1010111
-#define VWMACC_VV(vd, vs1, vs2, vm)   EMIT(R_type(0b1111010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 111101...........010.....1010111
-#define VWMACCSU_VV(vd, vs1, vs2, vm) EMIT(R_type(0b1111110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 111111...........010.....1010111
+#define VDIVU_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b1000000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100000...........010.....1010111
+#define VDIV_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b1000010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100001...........010.....1010111
+#define VREMU_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b1000100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100010...........010.....1010111
+#define VREM_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b1000110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100011...........010.....1010111
+#define VMULHU_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b1001000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100100...........010.....1010111
+#define VMUL_VV(vd, vs2, vs1, vm)     EMIT(R_type(0b1001010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100101...........010.....1010111
+#define VMULHSU_VV(vd, vs2, vs1, vm)  EMIT(R_type(0b1001100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100110...........010.....1010111
+#define VMULH_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b1001110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 100111...........010.....1010111
+#define VMADD_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b1010010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 101001...........010.....1010111
+#define VNMSUB_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b1010110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 101011...........010.....1010111
+#define VMACC_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b1011010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 101101...........010.....1010111
+#define VNMSAC_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b1011110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 101111...........010.....1010111
+#define VWADDU_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b1100000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110000...........010.....1010111
+#define VWADD_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b1100010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110001...........010.....1010111
+#define VWSUBU_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b1100100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110010...........010.....1010111
+#define VWSUB_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b1100110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110011...........010.....1010111
+#define VWADDU_WV(vd, vs2, vs1, vm)   EMIT(R_type(0b1101000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110100...........010.....1010111
+#define VWADD_WV(vd, vs2, vs1, vm)    EMIT(R_type(0b1101010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110101...........010.....1010111
+#define VWSUBU_WV(vd, vs2, vs1, vm)   EMIT(R_type(0b1101100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110110...........010.....1010111
+#define VWSUB_WV(vd, vs2, vs1, vm)    EMIT(R_type(0b1101110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 110111...........010.....1010111
+#define VWMULU_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b1110000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 111000...........010.....1010111
+#define VWMULSU_VV(vd, vs2, vs1, vm)  EMIT(R_type(0b1110100 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 111010...........010.....1010111
+#define VWMUL_VV(vd, vs2, vs1, vm)    EMIT(R_type(0b1110110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 111011...........010.....1010111
+#define VWMACCU_VV(vd, vs2, vs1, vm)  EMIT(R_type(0b1111000 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 111100...........010.....1010111
+#define VWMACC_VV(vd, vs2, vs1, vm)   EMIT(R_type(0b1111010 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 111101...........010.....1010111
+#define VWMACCSU_VV(vd, vs2, vs1, vm) EMIT(R_type(0b1111110 | (vm), vs2, vs1, 0b010, vd, 0b1010111)) // 111111...........010.....1010111
 
 //  OPMVX
-#define VAADDU_VX(vd, rs1, vs2, vm)      EMIT(R_type(0b0010000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 001000...........110.....1010111
-#define VAADD_VX(vd, rs1, vs2, vm)       EMIT(R_type(0b0010010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 001001...........110.....1010111
-#define VASUBU_VX(vd, rs1, vs2, vm)      EMIT(R_type(0b0010100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 001010...........110.....1010111
-#define VASUB_VX(vd, rs1, vs2, vm)       EMIT(R_type(0b0010110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 001011...........110.....1010111
-#define VSLIDE1UP_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b0011100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 001110...........110.....1010111
-#define VSLIDE1DOWN_VX(vd, rs1, vs2, vm) EMIT(R_type(0b0011110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 001111...........110.....1010111
+#define VAADDU_VX(vd, vs2, rs1, vm)      EMIT(R_type(0b0010000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 001000...........110.....1010111
+#define VAADD_VX(vd, vs2, rs1, vm)       EMIT(R_type(0b0010010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 001001...........110.....1010111
+#define VASUBU_VX(vd, vs2, rs1, vm)      EMIT(R_type(0b0010100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 001010...........110.....1010111
+#define VASUB_VX(vd, vs2, rs1, vm)       EMIT(R_type(0b0010110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 001011...........110.....1010111
+#define VSLIDE1UP_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b0011100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 001110...........110.....1010111
+#define VSLIDE1DOWN_VX(vd, vs2, rs1, vm) EMIT(R_type(0b0011110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 001111...........110.....1010111
 
 #define VMV_S_X(vd, rs1) EMIT(I_type(0b010000100000, rs1, 0b110, vd, 0b1010111)) // 010000100000.....110.....1010111
 
-#define VDIVU_VX(vd, rs1, vs2, vm)    EMIT(R_type(0b1000000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100000...........110.....1010111
-#define VDIV_VX(vd, rs1, vs2, vm)     EMIT(R_type(0b1000010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100001...........110.....1010111
-#define VREMU_VX(vd, rs1, vs2, vm)    EMIT(R_type(0b1000100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100010...........110.....1010111
-#define VREM_VX(vd, rs1, vs2, vm)     EMIT(R_type(0b1000110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100011...........110.....1010111
-#define VMULHU_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b1001000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100100...........110.....1010111
-#define VMUL_VX(vd, rs1, vs2, vm)     EMIT(R_type(0b1001010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100101...........110.....1010111
-#define VMULHSU_VX(vd, rs1, vs2, vm)  EMIT(R_type(0b1001100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100110...........110.....1010111
-#define VMULH_VX(vd, rs1, vs2, vm)    EMIT(R_type(0b1001110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100111...........110.....1010111
-#define VMADD_VX(vd, rs1, vs2, vm)    EMIT(R_type(0b1010010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 101001...........110.....1010111
-#define VNMSUB_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b1010110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 101011...........110.....1010111
-#define VMACC_VX(vd, rs1, vs2, vm)    EMIT(R_type(0b1011010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 101101...........110.....1010111
-#define VNMSAC_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b1011110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 101111...........110.....1010111
-#define VWADDU_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b1100000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110000...........110.....1010111
-#define VWADD_VX(vd, rs1, vs2, vm)    EMIT(R_type(0b1100010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110001...........110.....1010111
-#define VWSUBU_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b1100100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110010...........110.....1010111
-#define VWSUB_VX(vd, rs1, vs2, vm)    EMIT(R_type(0b1100110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110011...........110.....1010111
-#define VWADDU_WX(vd, rs1, vs2, vm)   EMIT(R_type(0b1101000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110100...........110.....1010111
-#define VWADD_WX(vd, rs1, vs2, vm)    EMIT(R_type(0b1101010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110101...........110.....1010111
-#define VWSUBU_WX(vd, rs1, vs2, vm)   EMIT(R_type(0b1101100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110110...........110.....1010111
-#define VWSUB_WX(vd, rs1, vs2, vm)    EMIT(R_type(0b1101110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110111...........110.....1010111
-#define VWMULU_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b1110000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111000...........110.....1010111
-#define VWMULSU_VX(vd, rs1, vs2, vm)  EMIT(R_type(0b1110100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111010...........110.....1010111
-#define VWMUL_VX(vd, rs1, vs2, vm)    EMIT(R_type(0b1110110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111011...........110.....1010111
-#define VWMACCU_VX(vd, rs1, vs2, vm)  EMIT(R_type(0b1111000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111100...........110.....1010111
-#define VWMACC_VX(vd, rs1, vs2, vm)   EMIT(R_type(0b1111010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111101...........110.....1010111
-#define VWMACCUS_VX(vd, rs1, vs2, vm) EMIT(R_type(0b1111100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111110...........110.....1010111
-#define VWMACCSU_VX(vd, rs1, vs2, vm) EMIT(R_type(0b1111110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111111...........110.....1010111
+#define VDIVU_VX(vd, vs2, rs1, vm)    EMIT(R_type(0b1000000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100000...........110.....1010111
+#define VDIV_VX(vd, vs2, rs1, vm)     EMIT(R_type(0b1000010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100001...........110.....1010111
+#define VREMU_VX(vd, vs2, rs1, vm)    EMIT(R_type(0b1000100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100010...........110.....1010111
+#define VREM_VX(vd, vs2, rs1, vm)     EMIT(R_type(0b1000110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100011...........110.....1010111
+#define VMULHU_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b1001000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100100...........110.....1010111
+#define VMUL_VX(vd, vs2, rs1, vm)     EMIT(R_type(0b1001010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100101...........110.....1010111
+#define VMULHSU_VX(vd, vs2, rs1, vm)  EMIT(R_type(0b1001100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100110...........110.....1010111
+#define VMULH_VX(vd, vs2, rs1, vm)    EMIT(R_type(0b1001110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 100111...........110.....1010111
+#define VMADD_VX(vd, vs2, rs1, vm)    EMIT(R_type(0b1010010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 101001...........110.....1010111
+#define VNMSUB_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b1010110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 101011...........110.....1010111
+#define VMACC_VX(vd, vs2, rs1, vm)    EMIT(R_type(0b1011010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 101101...........110.....1010111
+#define VNMSAC_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b1011110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 101111...........110.....1010111
+#define VWADDU_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b1100000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110000...........110.....1010111
+#define VWADD_VX(vd, vs2, rs1, vm)    EMIT(R_type(0b1100010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110001...........110.....1010111
+#define VWSUBU_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b1100100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110010...........110.....1010111
+#define VWSUB_VX(vd, vs2, rs1, vm)    EMIT(R_type(0b1100110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110011...........110.....1010111
+#define VWADDU_WX(vd, vs2, rs1, vm)   EMIT(R_type(0b1101000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110100...........110.....1010111
+#define VWADD_WX(vd, vs2, rs1, vm)    EMIT(R_type(0b1101010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110101...........110.....1010111
+#define VWSUBU_WX(vd, vs2, rs1, vm)   EMIT(R_type(0b1101100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110110...........110.....1010111
+#define VWSUB_WX(vd, vs2, rs1, vm)    EMIT(R_type(0b1101110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 110111...........110.....1010111
+#define VWMULU_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b1110000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111000...........110.....1010111
+#define VWMULSU_VX(vd, vs2, rs1, vm)  EMIT(R_type(0b1110100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111010...........110.....1010111
+#define VWMUL_VX(vd, vs2, rs1, vm)    EMIT(R_type(0b1110110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111011...........110.....1010111
+#define VWMACCU_VX(vd, vs2, rs1, vm)  EMIT(R_type(0b1111000 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111100...........110.....1010111
+#define VWMACC_VX(vd, vs2, rs1, vm)   EMIT(R_type(0b1111010 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111101...........110.....1010111
+#define VWMACCUS_VX(vd, vs2, rs1, vm) EMIT(R_type(0b1111100 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111110...........110.....1010111
+#define VWMACCSU_VX(vd, vs2, rs1, vm) EMIT(R_type(0b1111110 | (vm), vs2, rs1, 0b110, vd, 0b1010111)) // 111111...........110.....1010111
 
 #endif //__RV64_EMITTER_H__
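
For reference, a minimal standalone sketch (not part of the patch) of how the reordered emitter macros now read in the same operand order as RVV assembly: vd, vs2, vs1/rs1/simm5, vm. The macro bodies are copied verbatim from the patched header; the R_type and EMIT helpers below are local stand-ins assumed to match the field layout implied by the bit-pattern comments (funct7 | vs2 | vs1 | funct3 | vd | opcode), and VECTOR_UNMASKED = 1 for the vm bit is an assumed convention.

/* Standalone sketch: encodes two RVV instructions with the reordered macros.
 * R_type/EMIT are stand-ins for the helpers defined earlier in rv64_emitter.h. */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define VECTOR_UNMASKED 1 /* vm bit: 1 = unmasked, 0 = masked by v0 (assumed convention) */

/* Standard RISC-V R-type packing: funct7[31:25] rs2[24:20] rs1[19:15] funct3[14:12] rd[11:7] opcode[6:0] */
static uint32_t R_type(uint32_t funct7, uint32_t rs2, uint32_t rs1,
                       uint32_t funct3, uint32_t rd, uint32_t opcode)
{
    return (funct7 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) | opcode;
}

#define EMIT(x) printf("0x%08" PRIx32 "\n", (uint32_t)(x))

/* Same bodies as the patched header, operands in assembly order: vd, vs2, vs1/simm5, vm */
#define VADD_VV(vd, vs2, vs1, vm)         EMIT(R_type(0b0000000 | (vm), vs2, vs1, 0b000, vd, 0b1010111))
#define VSLIDEDOWN_VI(vd, vs2, simm5, vm) EMIT(R_type(0b0011110 | (vm), vs2, simm5, 0b011, vd, 0b1010111))

int main(void)
{
    VADD_VV(2, 4, 6, VECTOR_UNMASKED);       /* vadd.vv       v2, v4, v6 -> 0x02430157 */
    VSLIDEDOWN_VI(2, 4, 1, VECTOR_UNMASKED); /* vslidedown.vi v2, v4, 1  -> 0x3e40b157 */
    return 0;
}

Running this prints the two unmasked encodings noted in the comments; the point is only that a macro call such as VSLIDEDOWN_VI(vd, vs2, simm5, vm) now transcribes directly from the assembly mnemonic "vslidedown.vi vd, vs2, simm5" without mentally swapping operands.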