author    phorcys <phorcys@126.com>    2025-07-15 18:23:13 +0800
committer GitHub <noreply@github.com>  2025-07-15 12:23:13 +0200
commit    37b1d9ea85fb37ac1e415bfd70151beaad298ff9 (patch)
tree      d12d38a9a4afadbd9b7b3b1084fce6ce335ccc09 /src
parent    98f2460c46347d59aa3a01b0c0a19bc9f7bc6ffb (diff)
[LA64_DYNAREC] Add la64 avx arith ops, part2. (#2816)
* VEX.66.0F    VPMADDWD, VPSADBW
* VEX.66.0F.38 VPH{ADD,SUB}{W,D,SW}, VPABS{B,W,D}, VPMADDUBSW, VPMULHRSW
* VEX.66.0F.3A VMPSADBW
Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/la64/dynarec_la64_avx_66_0f.c   |  19
-rw-r--r--  src/dynarec/la64/dynarec_la64_avx_66_0f38.c | 120
-rw-r--r--  src/dynarec/la64/dynarec_la64_avx_66_0f3a.c |  80
-rw-r--r--  src/dynarec/la64/la64_emitter.h             |  88
4 files changed, 307 insertions, 0 deletions
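
Note: the VPMADDWD handler added in dynarec_la64_avx_66_0f.c below maps the x86 multiply-and-horizontally-add onto LoongArch even/odd widening multiplies. A minimal scalar sketch of that decomposition, assuming the usual x86 semantics (the helper name is illustrative, not part of box64):

#include <stdint.h>

/* One 32-bit VPMADDWD lane: multiply two signed 16-bit pairs and add them,
 * mirroring the VMULWEV.W.H / VMULWOD.W.H / VADD.W sequence emitted below. */
int32_t pmaddwd_lane(const int16_t a[2], const int16_t b[2])
{
    int32_t even = (int32_t)a[0] * (int32_t)b[0]; /* VMULWEV.W.H: even elements */
    int32_t odd  = (int32_t)a[1] * (int32_t)b[1]; /* VMULWOD.W.H: odd elements */
    return even + odd;                            /* VADD.W */
}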
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f.c b/src/dynarec/la64/dynarec_la64_avx_66_0f.c
index acc0ca9c..27d4cab9 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f.c
@@ -731,6 +731,25 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip,
             GETGY_empty_VYEY_xy(v0, v1, v2, 0);
             VMULWEVxy(D_WU, v0, v1, v2);
             break;
+        case 0xF5:
+            INST_NAME("VPMADDWD Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VMULWEVxy(W_H, q0, v1, v2);
+            VMULWODxy(W_H, q1, v1, v2);
+            VADDxy(W, v0, q0, q1);
+            break;
+        case 0xF6:
+            INST_NAME("VPSADBW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            VABSDxy(BU, v0, v1, v2);
+            VHADDWxy(HU_BU, v0, v0, v0);
+            VHADDWxy(WU_HU, v0, v0, v0);
+            VHADDWxy(DU_WU, v0, v0, v0);
+            break;
         case 0xF7:
             INST_NAME("VMASKMOVDQU Gx, Ex");
             nextop = F8;
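
Note: the case 0xF6 (VPSADBW) handler above uses VABSD.BU plus three widening horizontal adds to collapse eight byte differences into each 64-bit lane. A scalar sketch of one lane, assuming the standard x86 PSADBW definition (helper name is illustrative only):

#include <stdint.h>

/* One 64-bit VPSADBW lane: sum of absolute differences of 8 unsigned bytes,
 * i.e. VABSD.BU followed by VHADDW.HU.BU, VHADDW.WU.HU and VHADDW.DU.WU. */
uint64_t psadbw_lane(const uint8_t a[8], const uint8_t b[8])
{
    uint64_t sum = 0;
    for (int i = 0; i < 8; i++)
        sum += (a[i] > b[i]) ? (uint64_t)(a[i] - b[i]) : (uint64_t)(b[i] - a[i]);
    return sum;
}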
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f38.c b/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
index 6e794734..c411dc48 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
@@ -57,6 +57,76 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
     rex_t rex = vex.rex;
 
     switch (opcode) {
+        case 0x01:
+            INST_NAME("VPHADDW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VPICKEVxy(H, q0, v2, v1);
+            VPICKODxy(H, q1, v2, v1);
+            VADDxy(H, v0, q0, q1);
+            break;
+        case 0x02:
+            INST_NAME("VPHADDD Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VPICKEVxy(W, q0, v2, v1);
+            VPICKODxy(W, q1, v2, v1);
+            VADDxy(W, v0, q0, q1);
+            break;
+        case 0x03:
+            INST_NAME("VPHADDSW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VPICKEVxy(H, q0, v2, v1);
+            VPICKODxy(H, q1, v2, v1);
+            VSADDxy(H, v0, q0, q1);
+            break;
+        case 0x04:
+            INST_NAME("VPMADDUBSW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VMULWEVxy(H_BU_B, q0, v1, v2);
+            VMULWODxy(H_BU_B, q1, v1, v2);
+            VSADDxy(H, v0, q0, q1);
+            break;
+        case 0x05:
+            INST_NAME("VPHSUBW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VPICKEVxy(H, q0, v2, v1);
+            VPICKODxy(H, q1, v2, v1);
+            VSUBxy(H, v0, q0, q1);
+            break;
+        case 0x06:
+            INST_NAME("VPHSUBD Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VPICKEVxy(W, q0, v2, v1);
+            VPICKODxy(W, q1, v2, v1);
+            VSUBxy(W, v0, q0, q1);
+            break;
+        case 0x07:
+            INST_NAME("VPHSUBSW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VPICKEVxy(H, q0, v2, v1);
+            VPICKODxy(H, q1, v2, v1);
+            VSSUBxy(H, v0, q0, q1);
+            break;
         case 0x08:
             INST_NAME("VPSIGNB Gx, Vx, Ex");
             nextop = F8;
@@ -75,6 +145,32 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             GETGY_empty_VYEY_xy(v0, v1, v2, 0);
             VSIGNCOVxy(W, v0, v2, v1);
             break;
+        case 0x0B:
+            INST_NAME("VPMULHRSW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            if (vex.l) {
+                XVMULWEV_W_H(q0, v1, v2);
+                XVMULWOD_W_H(q1, v1, v2);
+                XVSRLI_W(q0, q0, 14);
+                XVSRLI_W(q1, q1, 14);
+                XVADDI_WU(q0, q0, 1);
+                XVADDI_WU(q1, q1, 1);
+                XVSRLNI_H_W(q0, q0, 1);
+                XVSRLNI_H_W(q1, q1, 1);
+                XVILVL_H(v0, q1, q0);
+            } else {
+                VEXT2XV_W_H(q0, v1);
+                VEXT2XV_W_H(q1, v2);
+                XVMUL_W(q0, q0, q1);
+                XVSRLI_W(q0, q0, 14);
+                XVADDI_WU(q0, q0, 1);
+                XVSRLNI_H_W(q0, q0, 1);
+                XVPERMI_D(v0, q0, 0b1000);
+            }
+            break;
         case 0x18:
             INST_NAME("VBROADCASTSS Gx, Ex");
             nextop = F8;
@@ -103,6 +199,30 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             GETGY_empty_EY_xy(q0, q2, 0);
             XVREPLVE0_Q(q0, q2);
             break;
+        case 0x1C:
+            INST_NAME("VPABSB Gx, Ex");
+            nextop = F8;
+            GETGY_empty_EY_xy(v0, v1, 0);
+            q0 = fpu_get_scratch(dyn);
+            XVXOR_V(q0, q0, q0);
+            VABSDxy(B, v0, v1, q0);
+            break;
+        case 0x1D:
+            INST_NAME("VPABSW Gx, Ex");
+            nextop = F8;
+            GETGY_empty_EY_xy(v0, v1, 0);
+            q0 = fpu_get_scratch(dyn);
+            XVXOR_V(q0, q0, q0);
+            VABSDxy(H, v0, v1, q0);
+            break;
+        case 0x1E:
+            INST_NAME("VPABSD Gx, Ex");
+            nextop = F8;
+            GETGY_empty_EY_xy(v0, v1, 0);
+            q0 = fpu_get_scratch(dyn);
+            XVXOR_V(q0, q0, q0);
+            VABSDxy(W, v0, v1, q0);
+            break;
         case 0x20:
             INST_NAME("VPMOVSXBW Gx, Ex");
             nextop = F8;
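
Note: the case 0x0B (VPMULHRSW) handler above encodes the rounding high-half multiply as shift-right-14, add-1, shift-right-1 before narrowing back to 16 bits. A scalar sketch of the x86 semantics being targeted, assuming arithmetic right shift on signed int (helper name is illustrative only):

#include <stdint.h>

/* One 16-bit VPMULHRSW lane: ((a*b >> 14) + 1) >> 1, matching the
 * XVSRLI_W(14) / XVADDI_WU(1) / XVSRLNI_H_W(1) sequence above. */
int16_t pmulhrsw_lane(int16_t a, int16_t b)
{
    int32_t prod = (int32_t)a * (int32_t)b;    /* widening 16x16 -> 32 multiply */
    return (int16_t)(((prod >> 14) + 1) >> 1); /* round to nearest, keep high half */
}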
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c b/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
index 74cf1759..2207c3c7 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
@@ -107,6 +107,86 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 }
             }
             break;
+        case 0x42:
+            INST_NAME("VMPSADBW Gx, Vx, Ex, Ib");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 1);
+            u8 = F8;
+            if (vex.l) {
+                uint8_t low_blk2_offset = 4 * (u8 & 3);
+                uint8_t low_blk1_offset = 4 * ((u8 >> 2) & 1);
+                uint8_t high_blk2_offset = 4 * ((u8 >> 3) & 3);
+                uint8_t high_blk1_offset = 4 * ((u8 >> 5) & 1);
+                q0 = fpu_get_scratch(dyn);
+                q1 = fpu_get_scratch(dyn);
+                q2 = fpu_get_scratch(dyn);
+                d0 = fpu_get_scratch(dyn);
+                d1 = fpu_get_scratch(dyn);
+                if (low_blk1_offset == high_blk1_offset) {
+                    // generate hi128/low128 mask in one shot
+                    XVMEPATMSK_V(d0, 1, low_blk1_offset);
+                    XVMEPATMSK_V(d1, 1, low_blk1_offset + 4);
+                    XVSHUF_B(q0, v1, v1, d0);
+                    XVSHUF_B(q2, v1, v1, d1);
+                } else {
+                    XVMEPATMSK_V(d0, 1, low_blk1_offset);
+                    XVMEPATMSK_V(d1, 1, high_blk1_offset);
+                    XVSHUF_B(q0, v1, v1, d0);
+                    XVSHUF_B(q1, v1, v1, d1);
+                    XVPERMI_Q(q0, q1, XVPERMI_IMM_4_0(1, 2));
+                    XVMEPATMSK_V(d0, 1, low_blk1_offset + 4);
+                    XVMEPATMSK_V(d1, 1, high_blk1_offset + 4);
+                    XVSHUF_B(q2, v1, v1, d0);
+                    XVSHUF_B(q1, v1, v1, d1);
+                    XVPERMI_Q(q2, q1, XVPERMI_IMM_4_0(1, 2));
+                }
+                if (low_blk2_offset == high_blk2_offset) {
+                    // generate hi128/low128 mask in one shot
+                    XVBSRL_V(q1, v2, low_blk2_offset);
+                    XVSHUF4I_W(q1, q1, 0b00000000);
+                } else {
+                    XVBSRL_V(q1, v2, low_blk2_offset);
+                    XVBSRL_V(d1, v2, high_blk2_offset);
+                    XVPERMI_Q(q1, d1, XVPERMI_IMM_4_0(1, 2));
+                    XVSHUF4I_W(q1, q1, 0b00000000);
+                }
+                XVABSD_BU(d0, q0, q1);
+                XVABSD_BU(d1, q2, q1);
+                XVHADDW_HU_BU(d0, d0, d0);
+                XVHADDW_HU_BU(d1, d1, d1);
+                XVHADDW_WU_HU(d0, d0, d0);
+                XVHADDW_WU_HU(d1, d1, d1);
+                XVSSRANI_HU_W(d0, d0, 0);
+                XVSSRANI_HU_W(d1, d1, 0);
+                XVEXTRINS_D(v0, d0, VEXTRINS_IMM_4_0(0, 0));
+                XVEXTRINS_D(v0, d1, VEXTRINS_IMM_4_0(1, 0));
+            } else {
+                uint8_t blk2_offset = 4 * (u8 & 3);
+                uint8_t blk1_offset = 4 * ((u8 >> 2) & 1);
+                q0 = fpu_get_scratch(dyn);
+                q1 = fpu_get_scratch(dyn);
+                q2 = fpu_get_scratch(dyn);
+                d0 = fpu_get_scratch(dyn);
+                d1 = fpu_get_scratch(dyn);
+                VMEPATMSK_V(d0, 1, blk1_offset);
+                VMEPATMSK_V(d1, 1, blk1_offset + 4);
+                VSHUF_B(q0, v1, v1, d0);
+                VSHUF_B(q2, v1, v1, d1);
+                VBSRL_V(q1, v2, blk2_offset);
+                VSHUF4I_W(q1, q1, 0b00000000);
+
+                VABSD_BU(d0, q0, q1);
+                VABSD_BU(d1, q2, q1);
+                VHADDW_HU_BU(d0, d0, d0);
+                VHADDW_HU_BU(d1, d1, d1);
+                VHADDW_WU_HU(d0, d0, d0);
+                VHADDW_WU_HU(d1, d1, d1);
+                VSSRANI_HU_W(d0, d0, 0);
+                VSSRANI_HU_W(d1, d1, 0);
+                VEXTRINS_D(v0, d0, VEXTRINS_IMM_4_0(0, 0));
+                VEXTRINS_D(v0, d1, VEXTRINS_IMM_4_0(1, 0));
+            }
+            break;
         default:
             DEFAULT;
     }
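
Note: in the VMPSADBW handler above, the immediate selects a fixed 4-byte block from Ex (imm[1:0], or imm[4:3] for the high 128-bit lane) and the base of a sliding 11-byte window in Vx (imm[2], or imm[5] for the high lane); each of the eight 16-bit results is a 4-byte SAD. A scalar sketch of the 128-bit step, assuming standard MPSADBW semantics (function name is illustrative only):

#include <stdint.h>

/* 128-bit MPSADBW: eight overlapping 4-byte SADs of src1 against one
 * fixed 4-byte block of src2, as in the vex.l == 0 branch above. */
void mpsadbw_128(uint16_t dst[8], const uint8_t src1[16],
                 const uint8_t src2[16], uint8_t imm)
{
    const uint8_t* blk2 = src2 + 4 * (imm & 3);        /* blk2_offset */
    const uint8_t* win  = src1 + 4 * ((imm >> 2) & 1); /* blk1_offset */
    for (int i = 0; i < 8; i++) {
        uint16_t sad = 0;
        for (int j = 0; j < 4; j++) {
            int d = (int)win[i + j] - (int)blk2[j];
            sad += (uint16_t)(d < 0 ? -d : d);
        }
        dst[i] = sad;
    }
}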
diff --git a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h
index ea78e328..40aa62d0 100644
--- a/src/dynarec/la64/la64_emitter.h
+++ b/src/dynarec/la64/la64_emitter.h
@@ -164,6 +164,7 @@ f24-f31  fs0-fs7   Static registers                Callee
 #define type_2RI9(opc, imm9, rj, rd)   ((opc) << 19 | ((imm9) & 0x1FF) << 10 | (rj) << 5 | (rd))
 #define type_2RI10(opc, imm10, rj, rd) ((opc) << 20 | ((imm10) & 0x3FF) << 10 | (rj) << 5 | (rd))
 #define type_2RI11(opc, imm11, rj, rd) ((opc) << 21 | ((imm11) & 0x7FF) << 10 | (rj) << 5 | (rd))
+#define type_1RI5I5(opc, imm5, imm5_2, rd)   ((opc) << 15 | ((imm5) & 0x1F) << 10 | ((imm5_2) & 0x1F) << 5 | (rd))
 
 // tmp = GR[rj][31:0] + GR[rk][31:0]
 // Gr[rd] = SignExtend(tmp[31:0], GRLEN)
@@ -2239,6 +2240,7 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define XVFRSTPI_B(xd, xj, imm5)     EMIT(type_2RI5(0b01110110100110100, imm5, xj, xd))
 #define XVFRSTPI_H(xd, xj, imm5)     EMIT(type_2RI5(0b01110110100110101, imm5, xj, xd))
 #define XVLDI(xd, imm13)             EMIT(type_1RI13(0b01110111111000, imm13, xd))
+#define XVSHUF_B(xd, xj, xk, xa)     EMIT(type_4R(0b000011010110, xa, xk, xj, xd))
 
 #define XVFMADD_S(xd, xj, xk, xa)  EMIT(type_4R(0b000010100001, xa, xk, xj, xd))
 #define XVFMSUB_S(xd, xj, xk, xa)  EMIT(type_4R(0b000010100101, xa, xk, xj, xd))
@@ -2248,6 +2250,10 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define XVFMSUB_D(xd, xj, xk, xa)  EMIT(type_4R(0b000010100110, xa, xk, xj, xd))
 #define XVFNMADD_D(xd, xj, xk, xa) EMIT(type_4R(0b000010101010, xa, xk, xj, xd))
 #define XVFNMSUB_D(xd, xj, xk, xa) EMIT(type_4R(0b000010101110, xa, xk, xj, xd))
+
+#define VMEPATMSK_V(vd, mode, uimm5)     EMIT(type_1RI5I5(0b01110010100110111, uimm5, mode, vd))
+#define XVMEPATMSK_V(xd, mode, uimm5)    EMIT(type_1RI5I5(0b01110110100110111, uimm5, mode, xd))
+
 ////////////////////////////////////////////////////////////////////////////////
 // (undocumented) LBT extension instructions
 
@@ -2891,4 +2897,86 @@ LSX instruction starts with V, LASX instruction starts with XV.
             VAVGR_##width(vd, vj, vk);  \
         }                               \
     } while (0)
+
+#define VABSDxy(width, vd, vj, vk)      \
+    do {                                \
+        if (vex.l) {                    \
+            XVABSD_##width(vd, vj, vk); \
+        } else {                        \
+            VABSD_##width(vd, vj, vk);  \
+        }                               \
+    } while (0)
+
+#define VHADDWxy(width, vd, vj, vk)      \
+    do {                                 \
+        if (vex.l) {                     \
+            XVHADDW_##width(vd, vj, vk); \
+        } else {                         \
+            VHADDW_##width(vd, vj, vk);  \
+        }                                \
+    } while (0)
+
+#define VMADDxy(width, vd, vj, vk)      \
+    do {                                \
+        if (vex.l) {                    \
+            XVMADD_##width(vd, vj, vk); \
+        } else {                        \
+            VMADD_##width(vd, vj, vk);  \
+        }                               \
+    } while (0)
+
+#define VPICKEVxy(width, vd, vj, vk)      \
+    do {                                  \
+        if (vex.l) {                      \
+            XVPICKEV_##width(vd, vj, vk); \
+        } else {                          \
+            VPICKEV_##width(vd, vj, vk);  \
+        }                                 \
+    } while (0)
+
+#define VPICKODxy(width, vd, vj, vk)      \
+    do {                                  \
+        if (vex.l) {                      \
+            XVPICKOD_##width(vd, vj, vk); \
+        } else {                          \
+            VPICKOD_##width(vd, vj, vk);  \
+        }                                 \
+    } while (0)
+
+#define VPACKEVxy(width, vd, vj, vk)      \
+    do {                                  \
+        if (vex.l) {                      \
+            XVPACKEV_##width(vd, vj, vk); \
+        } else {                          \
+            VPACKEV_##width(vd, vj, vk);  \
+        }                                 \
+    } while (0)
+
+#define VPACKODxy(width, vd, vj, vk)      \
+    do {                                  \
+        if (vex.l) {                      \
+            XVPACKOD_##width(vd, vj, vk); \
+        } else {                          \
+            VPACKOD_##width(vd, vj, vk);  \
+        }                                 \
+    } while (0)
+
+#define VILVLxy(width, vd, vj, vk)      \
+    do {                                \
+        if (vex.l) {                    \
+            XVILVL_##width(vd, vj, vk); \
+        } else {                        \
+            VILVL_##width(vd, vj, vk);  \
+        }                               \
+    } while (0)
+
+#define VILVHxy(width, vd, vj, vk)      \
+    do {                                \
+        if (vex.l) {                    \
+            XVILVH_##width(vd, vj, vk); \
+        } else {                        \
+            VILVH_##width(vd, vj, vk);  \
+        }                               \
+    } while (0)
+
 #endif //__ARM64_EMITTER_H__