| | | |
|---|---|---|
| author | phorcys <phorcys@126.com> | 2025-07-15 18:23:13 +0800 |
| committer | GitHub <noreply@github.com> | 2025-07-15 12:23:13 +0200 |
| commit | 37b1d9ea85fb37ac1e415bfd70151beaad298ff9 | |
| tree | d12d38a9a4afadbd9b7b3b1084fce6ce335ccc09 /src | |
| parent | 98f2460c46347d59aa3a01b0c0a19bc9f7bc6ffb | |
[LA64_DYNAREC] Add la64 avx arith ops, part2. (#2816)
* VEX.66.0F: VPMADDWD, VPSADBW
* VEX.66.0F.38: VPH{ADD,SUB}{W,D,SW}, VPABS{B,W,D}, VPMADDUBSW, VPMULHRSW
* VEX.66.0F.3A: VMPSADBW

Diffstat (limited to 'src')
| | | |
|---|---|---|
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_avx_66_0f.c | 19 |
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_avx_66_0f38.c | 120 |
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_avx_66_0f3a.c | 80 |
| -rw-r--r-- | src/dynarec/la64/la64_emitter.h | 88 |
4 files changed, 307 insertions, 0 deletions
```diff
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f.c b/src/dynarec/la64/dynarec_la64_avx_66_0f.c
index acc0ca9c..27d4cab9 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f.c
@@ -731,6 +731,25 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip,
             GETGY_empty_VYEY_xy(v0, v1, v2, 0);
             VMULWEVxy(D_WU, v0, v1, v2);
             break;
+        case 0xF5:
+            INST_NAME("VPMADDWD Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VMULWEVxy(W_H, q0, v1, v2);
+            VMULWODxy(W_H, q1, v1, v2);
+            VADDxy(W, v0, q0, q1);
+            break;
+        case 0xF6:
+            INST_NAME("VPSADBW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            VABSDxy(BU, v0, v1, v2);
+            VHADDWxy(HU_BU, v0, v0, v0);
+            VHADDWxy(WU_HU, v0, v0, v0);
+            VHADDWxy(DU_WU, v0, v0, v0);
+            break;
         case 0xF7:
             INST_NAME("VMASKMOVDQU Gx, Ex");
             nextop = F8;
```
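Both translations above reduce to even/odd widening multiplies plus horizontal adds. As a cross-check, here is a minimal scalar model of what each instruction computes per 128-bit lane; the `*_ref` names are illustrative, not from the box64 sources:

```c
#include <stdint.h>

/* PMADDWD: even/odd signed 16-bit products summed pairwise into 32-bit
 * lanes -- mirrors the VMULWEV.W.H / VMULWOD.W.H / VADD.W sequence. */
static void pmaddwd_ref(int32_t dst[4], const int16_t a[8], const int16_t b[8])
{
    for (int i = 0; i < 4; i++) {
        int32_t even = (int32_t)a[2 * i] * b[2 * i];
        int32_t odd = (int32_t)a[2 * i + 1] * b[2 * i + 1];
        /* unsigned add so the single overflow case wraps like the hardware */
        dst[i] = (int32_t)((uint32_t)even + (uint32_t)odd);
    }
}

/* PSADBW: per 64-bit half, sum of absolute byte differences -- mirrors
 * VABSD.BU followed by the HU_BU -> WU_HU -> DU_WU VHADDW reduction. */
static void psadbw_ref(uint64_t dst[2], const uint8_t a[16], const uint8_t b[16])
{
    for (int half = 0; half < 2; half++) {
        uint64_t sum = 0;
        for (int i = 0; i < 8; i++) {
            int d = a[8 * half + i] - b[8 * half + i];
            sum += (d < 0) ? -d : d;
        }
        dst[half] = sum;
    }
}
```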
```diff
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f38.c b/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
index 6e794734..c411dc48 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
@@ -57,6 +57,76 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
     rex_t rex = vex.rex;
 
     switch (opcode) {
+        case 0x01:
+            INST_NAME("VPHADDW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VPICKEVxy(H, q0, v2, v1);
+            VPICKODxy(H, q1, v2, v1);
+            VADDxy(H, v0, q0, q1);
+            break;
+        case 0x02:
+            INST_NAME("VPHADDD Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VPICKEVxy(W, q0, v2, v1);
+            VPICKODxy(W, q1, v2, v1);
+            VADDxy(W, v0, q0, q1);
+            break;
+        case 0x03:
+            INST_NAME("VPHADDSW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VPICKEVxy(H, q0, v2, v1);
+            VPICKODxy(H, q1, v2, v1);
+            VSADDxy(H, v0, q0, q1);
+            break;
+        case 0x04:
+            INST_NAME("VPMADDUBSW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VMULWEVxy(H_BU_B, q0, v1, v2);
+            VMULWODxy(H_BU_B, q1, v1, v2);
+            VSADDxy(H, v0, q0, q1);
+            break;
+        case 0x05:
+            INST_NAME("VPHSUBW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VPICKEVxy(H, q0, v2, v1);
+            VPICKODxy(H, q1, v2, v1);
+            VSUBxy(H, v0, q0, q1);
+            break;
+        case 0x06:
+            INST_NAME("VPHSUBD Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VPICKEVxy(W, q0, v2, v1);
+            VPICKODxy(W, q1, v2, v1);
+            VSUBxy(W, v0, q0, q1);
+            break;
+        case 0x07:
+            INST_NAME("VPHSUBSW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VPICKEVxy(H, q0, v2, v1);
+            VPICKODxy(H, q1, v2, v1);
+            VSSUBxy(H, v0, q0, q1);
+            break;
         case 0x08:
             INST_NAME("VPSIGNB Gx, Vx, Ex");
             nextop = F8;
@@ -75,6 +145,32 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             GETGY_empty_VYEY_xy(v0, v1, v2, 0);
             VSIGNCOVxy(W, v0, v2, v1);
             break;
+        case 0x0B:
+            INST_NAME("VPMULHRSW Gx, Vx, Ex");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            if (vex.l) {
+                XVMULWEV_W_H(q0, v1, v2);
+                XVMULWOD_W_H(q1, v1, v2);
+                XVSRLI_W(q0, q0, 14);
+                XVSRLI_W(q1, q1, 14);
+                XVADDI_WU(q0, q0, 1);
+                XVADDI_WU(q1, q1, 1);
+                XVSRLNI_H_W(q0, q0, 1);
+                XVSRLNI_H_W(q1, q1, 1);
+                XVILVL_H(v0, q1, q0);
+            } else {
+                VEXT2XV_W_H(q0, v1);
+                VEXT2XV_W_H(q1, v2);
+                XVMUL_W(q0, q0, q1);
+                XVSRLI_W(q0, q0, 14);
+                XVADDI_WU(q0, q0, 1);
+                XVSRLNI_H_W(q0, q0, 1);
+                XVPERMI_D(v0, q0, 0b1000);
+            }
+            break;
         case 0x18:
             INST_NAME("VBROADCASTSS Gx, Ex");
             nextop = F8;
@@ -103,6 +199,30 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             GETGY_empty_EY_xy(q0, q2, 0);
             XVREPLVE0_Q(q0, q2);
             break;
+        case 0x1C:
+            INST_NAME("VPABSB Gx, Ex");
+            nextop = F8;
+            GETGY_empty_EY_xy(v0, v1, 0);
+            q0 = fpu_get_scratch(dyn);
+            XVXOR_V(q0, q0, q0);
+            VABSDxy(B, v0, v1, q0);
+            break;
+        case 0x1D:
+            INST_NAME("VPABSW Gx, Ex");
+            nextop = F8;
+            GETGY_empty_EY_xy(v0, v1, 0);
+            q0 = fpu_get_scratch(dyn);
+            XVXOR_V(q0, q0, q0);
+            VABSDxy(H, v0, v1, q0);
+            break;
+        case 0x1E:
+            INST_NAME("VPABSD Gx, Ex");
+            nextop = F8;
+            GETGY_empty_EY_xy(v0, v1, 0);
+            q0 = fpu_get_scratch(dyn);
+            XVXOR_V(q0, q0, q0);
+            VABSDxy(W, v0, v1, q0);
+            break;
         case 0x20:
             INST_NAME("VPMOVSXBW Gx, Ex");
             nextop = F8;
```
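The VPMULHRSW translation implements the SSSE3 rounding rule `((a*b >> 14) + 1) >> 1` on widened 32-bit products. A scalar sketch of one lane follows (the function name is illustrative); the vector code can use logical shifts (XVSRLI/XVSRLNI) because the final narrow keeps only the low 16 bits, where logical and arithmetic shifts of the product agree:

```c
#include <stdint.h>

/* One PMULHRSW lane: widen, shift the product down 14, add the round bit,
 * then drop the last bit -- the same steps as the XVSRLI.W / XVADDI.WU /
 * XVSRLNI.H.W sequence above, which also re-interleaves even/odd lanes. */
static int16_t pmulhrsw_ref(int16_t a, int16_t b)
{
    int32_t p = (int32_t)a * (int32_t)b;
    return (int16_t)(((p >> 14) + 1) >> 1);
}
```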
```diff
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c b/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
index 74cf1759..2207c3c7 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
@@ -107,6 +107,86 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 }
             }
             break;
+        case 0x42:
+            INST_NAME("VMPSADBW Gx, Vx, Ex, Ib");
+            nextop = F8;
+            GETGY_empty_VYEY_xy(v0, v1, v2, 1);
+            u8 = F8;
+            if (vex.l) {
+                uint8_t low_blk2_offset = 4 * (u8 & 3);
+                uint8_t low_blk1_offset = 4 * ((u8 >> 2) & 1);
+                uint8_t high_blk2_offset = 4 * ((u8 >> 3) & 3);
+                uint8_t high_blk1_offset = 4 * ((u8 >> 5) & 1);
+                q0 = fpu_get_scratch(dyn);
+                q1 = fpu_get_scratch(dyn);
+                q2 = fpu_get_scratch(dyn);
+                d0 = fpu_get_scratch(dyn);
+                d1 = fpu_get_scratch(dyn);
+                if (low_blk1_offset == high_blk1_offset) {
+                    // generate hi128/low128 mask in one shot
+                    XVMEPATMSK_V(d0, 1, low_blk1_offset);
+                    XVMEPATMSK_V(d1, 1, low_blk1_offset + 4);
+                    XVSHUF_B(q0, v1, v1, d0);
+                    XVSHUF_B(q2, v1, v1, d1);
+                } else {
+                    XVMEPATMSK_V(d0, 1, low_blk1_offset);
+                    XVMEPATMSK_V(d1, 1, high_blk1_offset);
+                    XVSHUF_B(q0, v1, v1, d0);
+                    XVSHUF_B(q1, v1, v1, d1);
+                    XVPERMI_Q(q0, q1, XVPERMI_IMM_4_0(1, 2));
+                    XVMEPATMSK_V(d0, 1, low_blk1_offset + 4);
+                    XVMEPATMSK_V(d1, 1, high_blk1_offset + 4);
+                    XVSHUF_B(q2, v1, v1, d0);
+                    XVSHUF_B(q1, v1, v1, d1);
+                    XVPERMI_Q(q2, q1, XVPERMI_IMM_4_0(1, 2));
+                }
+                if (low_blk2_offset == high_blk2_offset) {
+                    // generate hi128/low128 mask in one shot
+                    XVBSRL_V(q1, v2, low_blk2_offset);
+                    XVSHUF4I_W(q1, q1, 0b00000000);
+                } else {
+                    XVBSRL_V(q1, v2, low_blk2_offset);
+                    XVBSRL_V(d1, v2, high_blk2_offset);
+                    XVPERMI_Q(q1, d1, XVPERMI_IMM_4_0(1, 2));
+                    XVSHUF4I_W(q1, q1, 0b00000000);
+                }
+                XVABSD_BU(d0, q0, q1);
+                XVABSD_BU(d1, q2, q1);
+                XVHADDW_HU_BU(d0, d0, d0);
+                XVHADDW_HU_BU(d1, d1, d1);
+                XVHADDW_WU_HU(d0, d0, d0);
+                XVHADDW_WU_HU(d1, d1, d1);
+                XVSSRANI_HU_W(d0, d0, 0);
+                XVSSRANI_HU_W(d1, d1, 0);
+                XVEXTRINS_D(v0, d0, VEXTRINS_IMM_4_0(0, 0));
+                XVEXTRINS_D(v0, d1, VEXTRINS_IMM_4_0(1, 0));
+            } else {
+                uint8_t blk2_offset = 4 * (u8 & 3);
+                uint8_t blk1_offset = 4 * ((u8 >> 2) & 1);
+                q0 = fpu_get_scratch(dyn);
+                q1 = fpu_get_scratch(dyn);
+                q2 = fpu_get_scratch(dyn);
+                d0 = fpu_get_scratch(dyn);
+                d1 = fpu_get_scratch(dyn);
+                VMEPATMSK_V(d0, 1, blk1_offset);
+                VMEPATMSK_V(d1, 1, blk1_offset + 4);
+                VSHUF_B(q0, v1, v1, d0);
+                VSHUF_B(q2, v1, v1, d1);
+                VBSRL_V(q1, v2, blk2_offset);
+                VSHUF4I_W(q1, q1, 0b00000000);
+
+                VABSD_BU(d0, q0, q1);
+                VABSD_BU(d1, q2, q1);
+                VHADDW_HU_BU(d0, d0, d0);
+                VHADDW_HU_BU(d1, d1, d1);
+                VHADDW_WU_HU(d0, d0, d0);
+                VHADDW_WU_HU(d1, d1, d1);
+                VSSRANI_HU_W(d0, d0, 0);
+                VSSRANI_HU_W(d1, d1, 0);
+                VEXTRINS_D(v0, d0, VEXTRINS_IMM_4_0(0, 0));
+                VEXTRINS_D(v0, d1, VEXTRINS_IMM_4_0(1, 0));
+            }
+            break;
         default:
             DEFAULT;
     }
```
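VMPSADBW is the most involved case: the immediate selects a 4-byte reference block in Ex and an 11-byte window base in Vx, and each of the eight results is a 4-byte SAD at successive window offsets. The shuffles above materialize the overlapping windows, and the VHADDW/VSSRANI chain does the reduction and saturating 16-bit narrow (the sums cannot exceed 4*255, so no saturation actually occurs). A scalar model of one 128-bit lane, with an illustrative name:

```c
#include <stdint.h>

/* One 128-bit lane of MPSADBW: imm[1:0] picks the 4-byte reference block
 * in src2, imm[2] picks the 11-byte window base in src1; result i is the
 * SAD of the reference block against src1 bytes [base+i .. base+i+3].
 * The upper lane of the 256-bit form repeats this with imm[4:3]/imm[5]. */
static void mpsadbw_ref(uint16_t dst[8], const uint8_t src1[16],
                        const uint8_t src2[16], uint8_t imm)
{
    int blk2 = 4 * (imm & 3);
    int blk1 = 4 * ((imm >> 2) & 1);
    for (int i = 0; i < 8; i++) {
        unsigned sum = 0;
        for (int j = 0; j < 4; j++) {
            int d = src1[blk1 + i + j] - src2[blk2 + j];
            sum += (d < 0) ? -d : d;
        }
        dst[i] = (uint16_t)sum;
    }
}
```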
```diff
diff --git a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h
index ea78e328..40aa62d0 100644
--- a/src/dynarec/la64/la64_emitter.h
+++ b/src/dynarec/la64/la64_emitter.h
@@ -164,6 +164,7 @@ f24-f31 fs0-fs7 Static registers Callee
 #define type_2RI9(opc, imm9, rj, rd) ((opc) << 19 | ((imm9) & 0x1FF) << 10 | (rj) << 5 | (rd))
 #define type_2RI10(opc, imm10, rj, rd) ((opc) << 20 | ((imm10) & 0x3FF) << 10 | (rj) << 5 | (rd))
 #define type_2RI11(opc, imm11, rj, rd) ((opc) << 21 | ((imm11) & 0x7FF) << 10 | (rj) << 5 | (rd))
+#define type_1RI5I5(opc, imm5, imm5_2, rd) ((opc) << 15 | ((imm5) & 0x1F) << 10 | ((imm5_2) & 0x1F) << 5 | (rd))
 
 // tmp = GR[rj][31:0] + GR[rk][31:0]
 // Gr[rd] = SignExtend(tmp[31:0], GRLEN)
@@ -2239,6 +2240,7 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define XVFRSTPI_B(xd, xj, imm5) EMIT(type_2RI5(0b01110110100110100, imm5, xj, xd))
 #define XVFRSTPI_H(xd, xj, imm5) EMIT(type_2RI5(0b01110110100110101, imm5, xj, xd))
 #define XVLDI(xd, imm13) EMIT(type_1RI13(0b01110111111000, imm13, xd))
+#define XVSHUF_B(xd, xj, xk, xa) EMIT(type_4R(0b000011010110, xa, xk, xj, xd))
 
 #define XVFMADD_S(xd, xj, xk, xa) EMIT(type_4R(0b000010100001, xa, xk, xj, xd))
 #define XVFMSUB_S(xd, xj, xk, xa) EMIT(type_4R(0b000010100101, xa, xk, xj, xd))
@@ -2248,6 +2250,10 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define XVFMSUB_D(xd, xj, xk, xa) EMIT(type_4R(0b000010100110, xa, xk, xj, xd))
 #define XVFNMADD_D(xd, xj, xk, xa) EMIT(type_4R(0b000010101010, xa, xk, xj, xd))
 #define XVFNMSUB_D(xd, xj, xk, xa) EMIT(type_4R(0b000010101110, xa, xk, xj, xd))
+
+#define VMEPATMSK_V(vd, mode, uimm5) EMIT(type_1RI5I5(0b01110010100110111, uimm5, mode, vd))
+#define XVMEPATMSK_V(xd, mode, uimm5) EMIT(type_1RI5I5(0b01110110100110111, uimm5, mode, xd))
+
 ////////////////////////////////////////////////////////////////////////////////
 // (undocumented) LBT extension instructions
@@ -2891,4 +2897,86 @@ LSX instruction starts with V, LASX instruction starts with XV.
         VAVGR_##width(vd, vj, vk); \
     } \
 } while (0)
+
+#define VABSDxy(width, vd, vj, vk)      \
+    do {                                \
+        if (vex.l) {                    \
+            XVABSD_##width(vd, vj, vk); \
+        } else {                        \
+            VABSD_##width(vd, vj, vk);  \
+        }                               \
+    } while (0)
+
+#define VHADDWxy(width, vd, vj, vk)      \
+    do {                                 \
+        if (vex.l) {                     \
+            XVHADDW_##width(vd, vj, vk); \
+        } else {                         \
+            VHADDW_##width(vd, vj, vk);  \
+        }                                \
+    } while (0)
+
+#define VMADDxy(width, vd, vj, vk)      \
+    do {                                \
+        if (vex.l) {                    \
+            XVMADD_##width(vd, vj, vk); \
+        } else {                        \
+            VMADD_##width(vd, vj, vk);  \
+        }                               \
+    } while (0)
+
+#define VPICKEVxy(width, vd, vj, vk)      \
+    do {                                  \
+        if (vex.l) {                      \
+            XVPICKEV_##width(vd, vj, vk); \
+        } else {                          \
+            VPICKEV_##width(vd, vj, vk);  \
+        }                                 \
+    } while (0)
+
+#define VPICKODxy(width, vd, vj, vk)      \
+    do {                                  \
+        if (vex.l) {                      \
+            XVPICKOD_##width(vd, vj, vk); \
+        } else {                          \
+            VPICKOD_##width(vd, vj, vk);  \
+        }                                 \
+    } while (0)
+
+#define VPACKEVxy(width, vd, vj, vk)      \
+    do {                                  \
+        if (vex.l) {                      \
+            XVPACKEV_##width(vd, vj, vk); \
+        } else {                          \
+            VPACKEV_##width(vd, vj, vk);  \
+        }                                 \
+    } while (0)
+
+#define VPACKODxy(width, vd, vj, vk)      \
+    do {                                  \
+        if (vex.l) {                      \
+            XVPACKOD_##width(vd, vj, vk); \
+        } else {                          \
+            VPACKOD_##width(vd, vj, vk);  \
+        }                                 \
+    } while (0)
+
+#define VILVLxy(width, vd, vj, vk)      \
+    do {                                \
+        if (vex.l) {                    \
+            XVILVL_##width(vd, vj, vk); \
+        } else {                        \
+            VPILVL_##width(vd, vj, vk); \
+        }                               \
+    } while (0)
+
+#define VILVHxy(width, vd, vj, vk)      \
+    do {                                \
+        if (vex.l) {                    \
+            XVILVH_##width(vd, vj, vk); \
+        } else {                        \
+            VPILVH_##width(vd, vj, vk); \
+        }                               \
+    } while (0)
+
 #endif //__ARM64_EMITTER_H__
```
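The new type_1RI5I5 helper packs a 17-bit major opcode into bits [31:15], the first 5-bit immediate into [14:10], the second into [9:5], and the destination register into [4:0]; VMEPATMSK_V/XVMEPATMSK_V pass (uimm5, mode) into those two immediate slots. A quick standalone sketch of the expansion, mirroring the macro above (the lower-case function name and example operands are illustrative only):

```c
#include <stdint.h>
#include <stdio.h>

/* Standalone mirror of the type_1RI5I5 encoder added in the hunk above. */
static uint32_t type_1ri5i5(uint32_t opc, uint32_t imm5, uint32_t imm5_2, uint32_t rd)
{
    return opc << 15 | (imm5 & 0x1F) << 10 | (imm5_2 & 0x1F) << 5 | rd;
}

int main(void)
{
    /* VMEPATMSK_V(vd=2, mode=1, uimm5=4) expands to
     * type_1RI5I5(0b01110010100110111, 4, 1, 2). */
    printf("0x%08x\n", type_1ri5i5(0xE537 /* 0b01110010100110111 */, 4, 1, 2));
    return 0;
}
```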