about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--src/dynarec/rv64/dynarec_rv64_0f_vector.c21
-rw-r--r--src/dynarec/rv64/dynarec_rv64_660f_vector.c130
-rw-r--r--src/dynarec/rv64/dynarec_rv64_functions.c11
3 files changed, 148 insertions, 14 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_0f_vector.c b/src/dynarec/rv64/dynarec_rv64_0f_vector.c
index 91d44c56..96f7e7ee 100644
--- a/src/dynarec/rv64/dynarec_rv64_0f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_0f_vector.c
@@ -140,6 +140,27 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 VSLIDEUP_VI(v0, 8, v1, VECTOR_UNMASKED);
             }
             break;
+        case 0x17:
+            INST_NAME("MOVHPS Ex, Gx");
+            nextop = F8;
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
+            GETGX_vector(v0, 1, VECTOR_SEW64);
+            // EX->q[0] = GX->q[1];
+            if (MODREG) {
+                v1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
+                q0 = fpu_get_scratch(dyn);
+                VSLIDE1DOWN_VX(q0, xZR, v0, VECTOR_UNMASKED);
+                VMV_X_S(x4, q0);
+                VMV_S_X(v1, x4);
+            } else {
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+                q0 = fpu_get_scratch(dyn);
+                VSLIDE1DOWN_VX(q0, xZR, v0, VECTOR_UNMASKED);
+                VMV_X_S(x4, q0);
+                SD(x4, ed, fixedaddress);
+                SMWRITE2();
+            }
+            break;
         case 0x28:
             INST_NAME("MOVAPS Gx, Ex");
             nextop = F8;
diff --git a/src/dynarec/rv64/dynarec_rv64_660f_vector.c b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
index 4f79aa9e..aad39907 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
@@ -815,6 +815,29 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 VLUXEI64_V(v0, ed, q0, VECTOR_MASKED, VECTOR_NFIELD1);
             }
             break;
+        case 0x6D:
+            INST_NAME("PUNPCKHQDQ Gx, Ex");
+            nextop = F8;
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
+            // GX->q[0] = GX->q[1];
+            // GX->q[1] = EX->q[1];
+            GETGX_vector(v0, 1, VECTOR_SEW64);
+            if (MODREG) {
+                v1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
+                q0 = fpu_get_scratch(dyn);
+                VSLIDE1DOWN_VX(q0, xZR, v0, VECTOR_UNMASKED);
+                VMV_X_S(x4, q0);
+                if (v0 != v1) { VMV_V_V(v0, v1); }
+                VMV_S_X(v0, x4);
+            } else {
+                q0 = fpu_get_scratch(dyn);
+                VMV_V_I(VMASK, 0b10);
+                VSLIDE1DOWN_VX(v0, xZR, v0, VECTOR_UNMASKED);
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
+                VLE64_V(v0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
+            }
+            break;
         case 0x6E:
             INST_NAME("MOVD Gx, Ed");
             nextop = F8;
@@ -975,6 +998,38 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 VSE_V(v1, ed, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1);
             }
             break;
+        case 0xD1:
+        case 0xD2:
+            if (opcode == 0xD1) {
+                INST_NAME("PSRLW Gx, Ex");
+                u8 = VECTOR_SEW16;
+                i32 = 16;
+            } else {
+                INST_NAME("PSRLD Gx, Ex");
+                u8 = VECTOR_SEW32;
+                i32 = 32;
+            }
+            nextop = F8;
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
+            GETGX_vector(q0, 1, VECTOR_SEW64);
+            if (MODREG) {
+                q1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
+            } else {
+                VMV_V_I(VMASK, 0b01);
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
+                q1 = fpu_get_scratch(dyn);
+                VLE_V(q1, ed, VECTOR_SEW64, VECTOR_MASKED, VECTOR_NFIELD1);
+            }
+            VMV_X_S(x4, q1);
+            ADDI(x5, xZR, i32);
+            SET_ELEMENT_WIDTH(x1, u8, 1);
+            BLTU_MARK(x4, x5);
+            VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
+            B_NEXT_nocond;
+            MARK;
+            VSRL_VX(q0, x4, q0, VECTOR_UNMASKED);
+            break;
         case 0xD4:
             INST_NAME("PADDQ Gx, Ex");
             nextop = F8;
@@ -1026,12 +1081,15 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, u8, 1);
             GETGX_vector(q0, 1, u8);
             GETEX_vector(q1, 0, 0, u8);
-            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-            VWSUBU_VV(v0, q1, q0, VECTOR_UNMASKED);
-            vector_vsetvli(dyn, ninst, x1, u8 + 1, rv64_vlen == 128 ? VECTOR_LMUL2 : VECTOR_LMUL1, 2);
-            VMAX_VX(v0, xZR, v0, VECTOR_UNMASKED);
-            vector_vsetvli(dyn, ninst, x1, u8, VECTOR_LMUL1, 1);
-            VNSRL_WX(q0, xZR, v0, VECTOR_UNMASKED);
+            VSSUBU_VV(q0, q1, q0, VECTOR_UNMASKED);
+            break;
+        case 0xDA:
+            INST_NAME("PMINUB Gx, Ex");
+            nextop = F8;
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
+            GETGX_vector(q0, 1, VECTOR_SEW8);
+            GETEX_vector(q1, 0, 0, VECTOR_SEW8);
+            VMINU_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
         case 0xDB:
             INST_NAME("PAND Gx, Ex");
@@ -1041,6 +1099,21 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETEX_vector(q1, 0, 0, dyn->vector_eew);
             VAND_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
+        case 0xDC:
+        case 0xDD:
+            if (opcode == 0xDC) {
+                INST_NAME("PADDUSB Gx, Ex");
+                u8 = VECTOR_SEW8;
+            } else {
+                INST_NAME("PADDUSW Gx, Ex");
+                u8 = VECTOR_SEW16;
+            }
+            nextop = F8;
+            SET_ELEMENT_WIDTH(x1, u8, 1);
+            GETGX_vector(q0, 1, u8);
+            GETEX_vector(q1, 0, 0, u8);
+            VSADDU_VV(q0, q1, q0, VECTOR_UNMASKED);
+            break;
         case 0xDF:
             INST_NAME("PANDN Gx, Ex");
             nextop = F8;
@@ -1050,6 +1123,15 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             VXOR_VI(q0, 0x1F, q0, VECTOR_UNMASKED);
             VAND_VV(q0, q0, q1, VECTOR_UNMASKED);
             break;
+        case 0xE0:
+            INST_NAME("PAVGB Gx, Ex");
+            nextop = F8;
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
+            GETGX_vector(q0, 1, VECTOR_SEW8);
+            GETEX_vector(q1, 0, 0, VECTOR_SEW8);
+            CSRRWI(xZR, 0b00 /* rnu */, 0x00A /* vxrm */);
+            VAADDU_VV(q0, q1, q0, VECTOR_UNMASKED);
+            break;
         case 0xE1:
             INST_NAME("PSRAW Gx,Ex");
             nextop = F8;
@@ -1141,12 +1223,44 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 VXOR_VV(q0, q0, q1, VECTOR_UNMASKED);
             }
             break;
-        case 0xF9:
-            INST_NAME("PSUBW Gx, Ex");
+        case 0xF5:
+            INST_NAME("PMADDWD Gx, Ex");
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_vector(q0, 1, VECTOR_SEW16);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
+            v1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
+            VWMUL_VV(v0, q0, q1, VECTOR_UNMASKED);
+            d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); // warning, no more scratches!
+            ADDI(x4, xZR, 6);
+            VID_V(d0, VECTOR_UNMASKED);
+            VSLL_VI(d0, 1, d0, VECTOR_UNMASKED); // times 2
+            VMIN_VX(d0, x4, d0, VECTOR_UNMASKED);
+            VADD_VI(q0, 1, d0, VECTOR_UNMASKED);
+            vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL2, 2);
+            VRGATHEREI16_VV(v1, d0, v0, VECTOR_UNMASKED); // 6 4 2 0
+            VRGATHEREI16_VV(d0, q0, v0, VECTOR_UNMASKED); // 7 5 3 1
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
+            VADD_VV(q0, d0, v1, VECTOR_UNMASKED);
+            break;
+        case 0xF8:
+        case 0xF9:
+        case 0xFA:
+            if (opcode == 0xF8) {
+                INST_NAME("PSUBB Gx, Ex");
+                u8 = VECTOR_SEW8;
+            } else if (opcode == 0xF9) {
+                INST_NAME("PSUBW Gx, Ex");
+                u8 = VECTOR_SEW16;
+            } else {
+                INST_NAME("PSUBD Gx, Ex");
+                u8 = VECTOR_SEW32;
+            }
+            nextop = F8;
+            SET_ELEMENT_WIDTH(x1, u8, 1);
+            GETGX_vector(q0, 1, u8);
+            GETEX_vector(q1, 0, 0, u8);
             VSUB_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0xFC ... 0xFE:
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c
index 9ff6b4ff..619041be 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.c
+++ b/src/dynarec/rv64/dynarec_rv64_functions.c
@@ -42,7 +42,7 @@ int fpu_get_scratch(dynarec_rv64_t* dyn)
 int fpu_get_scratch_lmul(dynarec_rv64_t* dyn, int lmul)
 {
     int reg = SCRATCH0 + dyn->e.fpu_scratch;
-    int skip = (1 << lmul) - (reg % (1 << lmul));
+    int skip = (reg % (1 << lmul)) ? (1 << lmul) - (reg % (1 << lmul)) : 0;
     dyn->e.fpu_scratch += skip + 1;
     return reg + skip;
 }
@@ -92,17 +92,18 @@ int fpu_get_reg_xmm(dynarec_rv64_t* dyn, int t, int xmm)
     return EXTREG(i);
 }
 // Reset fpu regs counter
-void fpu_reset_reg_extcache(extcache_t* e)
+void fpu_reset_reg_extcache(dynarec_rv64_t* dyn, extcache_t* e)
 {
     e->fpu_reg = 0;
     for (int i=0; i<24; ++i) {
         e->fpuused[i]=0;
         e->extcache[i].v = 0;
     }
+    dyn->vector_sew = VECTOR_SEWNA;
 }
 void fpu_reset_reg(dynarec_rv64_t* dyn)
 {
-    fpu_reset_reg_extcache(&dyn->e);
+    fpu_reset_reg_extcache(dyn, &dyn->e);
 }
 
 int extcache_no_i64(dynarec_rv64_t* dyn, int ninst, int st, int a)
@@ -732,7 +733,6 @@ void fpu_reset(dynarec_rv64_t* dyn)
     mmx_reset(&dyn->e);
     sse_reset(&dyn->e);
     fpu_reset_reg(dyn);
-    dyn->vector_sew = VECTOR_SEWNA;
 }
 
 void fpu_reset_ninst(dynarec_rv64_t* dyn, int ninst)
@@ -740,8 +740,7 @@ void fpu_reset_ninst(dynarec_rv64_t* dyn, int ninst)
     x87_reset(&dyn->insts[ninst].e);
     mmx_reset(&dyn->insts[ninst].e);
     sse_reset(&dyn->insts[ninst].e);
-    fpu_reset_reg_extcache(&dyn->insts[ninst].e);
-    dyn->vector_sew = VECTOR_SEWNA;
+    fpu_reset_reg_extcache(dyn, &dyn->insts[ninst].e);
 }
 
 int fpu_is_st_freed(dynarec_rv64_t* dyn, int ninst, int st)