about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author phorcys <phorcys@126.com> 2025-08-02 20:42:05 +0800
committer GitHub <noreply@github.com> 2025-08-02 14:42:05 +0200
commit cb0b274c2704e5af3c118d30992d63c6f5dff6e8 (patch)
tree 34f9be4ccb43e85a8f2ed29f8f3ae26f7a668dcb /src
parent ebac5834410781a5c263e50d987edf65bcbb9d8c (diff)
download box64-cb0b274c2704e5af3c118d30992d63c6f5dff6e8.tar.gz
box64-cb0b274c2704e5af3c118d30992d63c6f5dff6e8.zip
[LA64_DYNAREC] Fix some la64 avx/sse ops. (#2882)
Fix 66.0F.F3 PSLLQ
Fix VEX.66.0F.7E VMOVD not zero-extending the 32-bit result
Fix VEX.66.0F.3A.06 VPERM2F128/VPERM2I128
Fix Vex.66.0F.3A.0D VBLENDPD
Fix VEX.66.0F.3A.18/38 VINSERTF128/VINSERTI128 when q0 == q1 or q0 == q2
Fix VEX.66.0F.3A.21 VINSERTPS: read the u8 immediate at the correct position
Fix VEX.66.0F.3A.40 VDPPS: fix the VREPLVEIxy emission when vex.l is set
Fix VEX.66.0F.38.0C VPERMILPS
Fix VEX.66.0F.38.2B VPACKUSDW
Fix VEX.66.0F.38.93 VGATHERQPD
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/la64/dynarec_la64_660f.c 2
-rw-r--r-- src/dynarec/la64/dynarec_la64_avx_66_0f.c 2
-rw-r--r-- src/dynarec/la64/dynarec_la64_avx_66_0f38.c 18
-rw-r--r-- src/dynarec/la64/dynarec_la64_avx_66_0f3a.c 27
-rw-r--r-- src/dynarec/la64/la64_emitter.h 24
-rw-r--r-- src/dynarec/la64/la64_printer.c 44
6 files changed, 80 insertions, 37 deletions
diff --git a/src/dynarec/la64/dynarec_la64_660f.c b/src/dynarec/la64/dynarec_la64_660f.c
index afc1f848..3e0080a3 100644
--- a/src/dynarec/la64/dynarec_la64_660f.c
+++ b/src/dynarec/la64/dynarec_la64_660f.c
@@ -2519,7 +2519,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             v1 = fpu_get_scratch(dyn);
             VREPLVEI_D(v0, q1, 0);
             VLDI(v1, (0b011 << 10) | 0x3f);
-            VSLEI_DU(v1, v0, v1);
+            VSLE_DU(v1, v0, v1);
             VSLL_D(q0, q0, v0);
             VAND_V(q0, q0, v1);
             break;
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f.c b/src/dynarec/la64/dynarec_la64_avx_66_0f.c
index eac332ff..18379800 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f.c
@@ -827,7 +827,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip,
                 if (rex.w) {
                     VPICKVE2GR_D(ed, v0, 0);
                 } else {
-                    VPICKVE2GR_W(ed, v0, 0);
+                    VPICKVE2GR_WU(ed, v0, 0);
                 }
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f38.c b/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
index 037d49a6..3017b7ae 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
@@ -187,7 +187,6 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             INST_NAME("VPERMILPS Gx, Vx, Ex");
             nextop = F8;
             GETGY_empty_VYEY_xy(v0, v1, v2, 0);
-            u8 = F8;
             d0 = fpu_get_scratch(dyn);
             VANDIxy(d0, v2, 0b11);
             VSHUFxy(W, d0, v1, v1);
@@ -457,17 +456,17 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             q0 = fpu_get_scratch(dyn);
             q1 = fpu_get_scratch(dyn);
             d0 = fpu_get_scratch(dyn);
-            VLDIxy(q0, 0b0010011111111); // broadcast 0xff as 16-bit elements to all lanes
+            VLDIxy(d0, (0b10111 <<8) | 0x00);  // Broadcast 0x0000FFFF as 32bits to all lane
             if (v1 == v2) {
-                VMAXIxy(W, v0, v1, 0);
-                VMINxy(W, v0, v1, q0);
-                VPICKEVxy(H, v0, v0, v0);
+                VMAXIxy(W, q0, v1, 0);
+                VMINxy(W, q0, q0, d0);
+                VPICKEVxy(H, v0, q0, q0);
             } else {
                 VMAXIxy(W, q1, v2, 0);
-                VMAXIxy(W, v0, v1, 0);
-                VMINxy(W, q1, q1, q0);
-                VMINxy(W, v0, v0, q0);
-                VPICKEVxy(H, v0, q1, v0);
+                VMAXIxy(W, q0, v1, 0);
+                VMINxy(W, q1, q1, d0);
+                VMINxy(W, q0, q0, d0);
+                VPICKEVxy(H, v0, q1, q0);
             }
             break;
         case 0x2C:
@@ -980,7 +979,6 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 }
                 VXOR_V(v2, v2, v2);
             }
-            XVPERMI_Q(v0, v2, XVPERMI_IMM_4_0(1, 2));
             break;
         case 0x96:
             INST_NAME("VFMADDSUB132PS/D Gx, Vx, Ex");
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c b/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
index ccfe759c..dbd1bca4 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
@@ -140,8 +140,8 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             }
             nextop = F8;
             if (!vex.l) EMIT(0);
-            u8 = F8;
             GETGY_empty_VYEY_xy(v0, v1, v2, 1);
+            u8 = F8;
             if (u8 == 0x88) {
                 XVXOR_V(v0, v0, v0);
                 break;
@@ -296,7 +296,7 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 XVPERMI_Q(d0, d1, XVPERMI_IMM_4_0(1, 2));
                 XVOR_V(v0, d0, d0);
             } else {
-                u8 = F8 & 0b11;
+                u8 = u8 & 0b11;
                 switch (u8) {
                     case 0b00:
                         VOR_V(v0, v1, v1);
@@ -411,8 +411,12 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             nextop = F8;
             GETGY_empty_VYEY_xy(q0, q1, q2, 1);
             u8 = F8;
-            XVOR_V(q0, q1, q1);
-            XVPERMI_Q(q0, q2, (u8 & 1) == 0 ? 0b00110000 : 0b00000010);
+            if(q0 != q2){
+                if(q0 != q1) XVOR_V(q0, q1, q1);
+                XVPERMI_Q(q0, q2, ((u8 & 1) == 0) ? 0x30: 0x02);
+            } else{
+                XVPERMI_Q(q0, q1, ((u8 & 1) == 0) ? 0x12 : 0x20);
+            }
             break;
         case 0x19:
         case 0x39:
@@ -470,9 +474,6 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
         case 0x21:
             INST_NAME("VINSERTPS Gx, Vx, Ex, Ib");
             nextop = F8;
-            uint8_t src_index = (u8 >> 6) & 3;
-            uint8_t dst_index = (u8 >> 4) & 3;
-            uint8_t zmask = u8 & 0xf;
             q1 = fpu_get_scratch(dyn);
             if (MODREG) {
                 GETGY_empty_VYEY_xy(v0, v1, v2, 1);
@@ -480,24 +481,24 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 if (v0 == v2) {
                     VOR_V(q1, v2, v2);
                     if (v0 != v1) VOR_V(v0, v1, v1);
-                    VEXTRINS_W(v0, q1, VEXTRINS_IMM_4_0(dst_index, src_index));
+                    VEXTRINS_W(v0, q1, VEXTRINS_IMM_4_0((u8 >> 4) & 3, (u8 >> 6) & 3));
                 } else {
                     if (v0 != v1) VOR_V(v0, v1, v1);
-                    VEXTRINS_W(v0, v2, VEXTRINS_IMM_4_0(dst_index, src_index));
+                    VEXTRINS_W(v0, v2, VEXTRINS_IMM_4_0((u8 >> 4) & 3, (u8 >> 6) & 3));
                 }
             } else {
                 GETVYx(v1, 0);
                 GETGYx_empty(v0);
-                u8 = F8;
                 if (v0 != v1) VOR_V(v0, v1, v1);
                 SMREAD();
-                addr = geted(dyn, addr, ninst, nextop, &wback, x3, x5, &fixedaddress, rex, NULL, 0, 1);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x3, x5, &fixedaddress, rex, NULL, 1, 1);
                 u8 = F8;
                 FLD_S(q1, wback, fixedaddress);
-                VEXTRINS_W(v0, q1, VEXTRINS_IMM_4_0(dst_index, 0)); // src index is zero when Ex is mem operand
+                VEXTRINS_W(v0, q1, VEXTRINS_IMM_4_0((u8 >> 4) & 3, 0)); // src index is zero when Ex is mem operand
             }
-            VXOR_V(q1, q1, q1);
+            uint8_t zmask = u8 & 0xf;
             if (zmask) {
+                VXOR_V(q1, q1, q1);
                 for (uint8_t i = 0; i < 4; i++) {
                     if (zmask & (1 << i)) {
                         VEXTRINS_W(v0, q1, VEXTRINS_IMM_4_0(i, 0));
diff --git a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h
index 7552a1f9..196126a7 100644
--- a/src/dynarec/la64/la64_emitter.h
+++ b/src/dynarec/la64/la64_emitter.h
@@ -3288,18 +3288,18 @@ LSX instruction starts with V, LASX instruction starts with XV.
         }                               \
     } while (0)
 
-#define VREPLVEIxy(width, vd, vj, imm)        \
-    do {                                      \
-        if (vex.l) {                          \
-            if (imm > 0) {                    \
-                ADDI_D(x5, xZR, imm);         \
-                XVREPLVE_##width(vd, vj, x5); \
-            } else {                          \
-                XVREPLVE0_##width(vd, vj);    \
-            }                                 \
-        } else {                              \
-            VREPLVEI_##width(vd, vj, imm);    \
-        }                                     \
+#define VREPLVEIxy(width, vd, vj, imm)         \
+    do {                                       \
+        if (vex.l) {                           \
+            if (imm > 0) {                     \
+                ADDI_D(x5, xZR, imm);          \
+                XVREPLVE_##width(vd, vj, x5);  \
+            } else {                           \
+                XVREPLVE_##width(vd, vj, xZR); \
+            }                                  \
+        } else {                               \
+            VREPLVEI_##width(vd, vj, imm);     \
+        }                                      \
     } while (0)
 
 #define VSEQxy(width, vd, vj, vk)      \
diff --git a/src/dynarec/la64/la64_printer.c b/src/dynarec/la64/la64_printer.c
index 2fe47957..3b984d32 100644
--- a/src/dynarec/la64/la64_printer.c
+++ b/src/dynarec/la64/la64_printer.c
@@ -7672,6 +7672,50 @@ const char* la64_print(uint32_t opcode, uintptr_t addr)
         snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%lx", "VBITCLRI.D", Vt[Rd], Vt[Rj], imm);
         return buff;
     }
+    if (isMask(opcode, "01110010100110100iiiiijjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%lx", "VFRSTPI.B", Vt[Rd], Vt[Rj], imm);
+        return buff;
+    }
+    if (isMask(opcode, "01110010100110101iiiiijjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%lx", "VFRSTPI.H", Vt[Rd], Vt[Rj], imm);
+        return buff;
+    }
+    if (isMask(opcode, "01110010100100000iiiiijjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%lx", "VMAXI.B", Vt[Rd], Vt[Rj], imm);
+        return buff;
+    }
+    if (isMask(opcode, "01110010100100001iiiiijjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%lx", "VMAXI.H", Vt[Rd], Vt[Rj], imm);
+        return buff;
+    }
+    if (isMask(opcode, "01110010100100010iiiiijjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%lx", "VMAXI.W", Vt[Rd], Vt[Rj], imm);
+        return buff;
+    }
+    if (isMask(opcode, "01110010100100011iiiiijjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%lx", "VMAXI.D", Vt[Rd], Vt[Rj], imm);
+        return buff;
+    }
+    if (isMask(opcode, "01110010100101000iiiiijjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%lx", "VMAXI.BU", Vt[Rd], Vt[Rj], imm);
+        return buff;
+    }
+    if (isMask(opcode, "01110010100101001iiiiijjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%lx", "VMAXI.HU", Vt[Rd], Vt[Rj], imm);
+        return buff;
+    }
+    if (isMask(opcode, "01110010100101010iiiiijjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%lx", "VMAXI.WU", Vt[Rd], Vt[Rj], imm);
+        return buff;
+    }
+    if (isMask(opcode, "01110010100101011iiiiijjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%lx", "VMAXI.DU", Vt[Rd], Vt[Rj], imm);
+        return buff;
+    }
+    if (isMask(opcode, "01110011111000iiiiiiiiiiiiiddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, 0x%lx", "VLDI", Vt[Rd], imm);
+        return buff;
+    }
     snprintf(buff, sizeof(buff), "%08X ???", __builtin_bswap32(opcode));
     return buff;
 }