| author | phorcys <phorcys@126.com> | 2025-07-21 18:06:55 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-07-21 12:06:55 +0200 |
| commit | ea16e05142769747aa49f73d56b09959718fe896 (patch) | |
| tree | 14e13868b856bc8cdd20a5ae11584466a716b062 /src | |
| parent | 3e74454cf49af05567f494100646db12c0d9bd1a (diff) | |
[LA64_DYNAREC] add la64 avx pack/unpack ops, part5. (#2837)
INSERT/EXTRACT/BROADCAST/GATHER ops.
VEXTRACTPS,VINSERTPS
VBROADCAST{SD,SS}, VPBROADCAST{B,W,D,Q,I128}
VPGATHER{DD,DQ,QD,QQ,DPD,DPS,QPD,QPS}

Diffstat (limited to 'src')
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_avx_66_0f38.c | 227 |
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_avx_66_0f3a.c | 53 |
| -rw-r--r-- | src/dynarec/la64/la64_emitter.h | 92 |
3 files changed, 340 insertions, 32 deletions
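
Most of the new code is the element-by-element emulation of the gather instructions, which have no single LASX equivalent: the handlers decode the VSIB byte, and for every destination element whose mask MSB is set they compute base + (index << scale), load the element, insert it into Gx, and finally clear the mask register. A minimal C sketch of the x86 gather semantics being emulated (hypothetical helper name and argument layout, not part of the commit; only the 128-bit VPGATHERDD-shaped case is shown):

```c
#include <stdint.h>
#include <string.h>

/* Reference model of a 4-element, 32-bit gather (the VPGATHERDD shape):
 * only elements whose mask MSB is set are loaded, the others keep their
 * previous value, and the mask register is cleared once the gather completes. */
static void gather_d32_ref(uint32_t dst[4], const uint8_t* base,
                           const int32_t vindex[4], uint32_t mask[4], int scale)
{
    for (int i = 0; i < 4; i++) {
        if (mask[i] >> 31) {                                          /* mask MSB selects the element   */
            int64_t off = (int64_t)vindex[i] * ((int64_t)1 << scale); /* scale = raw 2-bit SIB field    */
            uint32_t v;
            memcpy(&v, base + off, sizeof v);                         /* load from base + index*2^scale */
            dst[i] = v;                                               /* insert into the destination    */
        }
    }
    memset(mask, 0, 4 * sizeof mask[0]);                              /* mask is zeroed afterwards      */
}
```

In the emitted LoongArch code below, BEQZ(x4, 4 + 4 * 4) plays the role of the `if` above: it branches over the four following instructions (extract index, scale, load, insert) whenever the element's mask bit is clear.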
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f38.c b/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
index ceed503a..31246726 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
@@ -214,22 +214,14 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             nextop = F8;
             GETEYSS(q2, 0, 0);
             GETGYxy_empty(q0);
-            if (vex.l) {
-                XVREPLVE0_W(q0, q2);
-            } else {
-                VREPLVE_W(q0, q2, 0);
-            }
+            VREPLVE0xy(W, q0, q2);
             break;
         case 0x19:
             INST_NAME("VBROADCASTSD Gx, Ex");
             nextop = F8;
             GETEYSD(q2, 0, 0);
             GETGYxy_empty(q0);
-            if (vex.l) {
-                XVREPLVE0_D(q0, q2);
-            } else {
-                VREPLVE_D(q0, q2, 0);
-            }
+            VREPLVE0xy(D, q0, q2);
             break;
         case 0x1A:
             INST_NAME("VBROADCASTF128 Gx, Ex");
@@ -611,6 +603,68 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 VAND_Vxy(v0, v0, d1);
             }
             break;
+        case 0x58:
+            INST_NAME("VPBROADCASTD Gx, Ex");
+            nextop = F8;
+            if (MODREG) {
+                GETEYx(v1, 0, 0);
+                GETGYxy_empty(v0);
+                VREPLVE0xy(W, v0, v1);
+            } else {
+                GETGYxy_empty(v0);
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &ed, x4, x5, &fixedaddress, rex, NULL, 0, 0);
+                VLDREPLxy(W, v0, ed, 0);
+            }
+            break;
+        case 0x59:
+            INST_NAME("VPBROADCASTQ Gx, Ex");
+            nextop = F8;
+            if (MODREG) {
+                GETEYx(v1, 0, 0);
+                GETGYxy_empty(v0);
+                VREPLVE0xy(D, v0, v1);
+            } else {
+                GETGYxy_empty(v0);
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &ed, x4, x5, &fixedaddress, rex, NULL, 0, 0);
+                VLDREPLxy(D, v0, ed, 0);
+            }
+            break;
+        case 0x5A:
+            INST_NAME("VBROADCASTI128 Gx, Ex");
+            nextop = F8;
+            GETGY_empty_EY_xy(q0, q2, 0);
+            XVREPLVE0_Q(q0, q2);
+            break;
+        case 0x78:
+            INST_NAME("VPBROADCASTB Gx, Ex");
+            nextop = F8;
+            if (MODREG) {
+                GETEYx(v1, 0, 0);
+                GETGYxy_empty(v0);
+                VREPLVE0xy(B, v0, v1);
+            } else {
+                GETGYxy_empty(v0);
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &ed, x4, x5, &fixedaddress, rex, NULL, 0, 0);
+                VLDREPLxy(B, v0, ed, 0);
+            }
+            break;
+        case 0x79:
+            INST_NAME("VPBROADCASTW Gx, Ex");
+            nextop = F8;
+            if (MODREG) {
+                GETEYx(v1, 0, 0);
+                GETGYxy_empty(v0);
+                VREPLVE0xy(H, v0, v1);
+            } else {
+                GETGYxy_empty(v0);
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &ed, x4, x5, &fixedaddress, rex, NULL, 0, 0);
+                VLDREPLxy(H, v0, ed, 0);
+            }
+            break;
         case 0x8C:
             INST_NAME("VPMASKMOVD/Q Gx, Vx, Ex");
             nextop = F8;
@@ -658,6 +712,159 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 PUTEYx(v0);
             }
             break;
+        case 0x90:
+        case 0x92:
+            if (opcode == 0x90) {
+                INST_NAME("VPGATHERDD/VPGATHERDQ Gx, VSIB, Vx");
+            } else {
+                INST_NAME("VGATHERDPD/VGATHERDPS Gx, VSIB, Vx");
+            }
+            nextop = F8;
+            if (((nextop & 7) != 4) || MODREG) { UDF(); }
+            GETG;
+            u8 = F8; // SIB
+            if ((u8 & 0x7) == 0x5 && !(nextop & 0xC0)) {
+                int64_t i64 = F32S64;
+                MOV64x(x5, i64);
+                eb1 = x5;
+            } else
+                eb1 = TO_NAT((u8 & 0x7) + (rex.b << 3)); // base
+            eb2 = ((u8 >> 3) & 7) + (rex.x << 3); // index
+            if (nextop & 0x40)
+                i32 = F8S;
+            else if (nextop & 0x80)
+                i32 = F32S;
+            else
+                i32 = 0;
+            if (!i32)
+                ed = eb1;
+            else {
+                ed = x3;
+                if (i32 < -2048 || i32 >= 2048) {
+                    MOV64x(ed, i32);
+                    ADD_D(ed, ed, eb1);
+                } else {
+                    ADDI_D(ed, eb1, i32);
+                }
+            }
+            // ed is base
+            wb1 = u8 >> 6; // scale
+            GETVYxy(v2, 1);
+            GETGYxy(v0, 1);
+            d0 = fpu_get_scratch(dyn);
+            d1 = fpu_get_scratch(dyn);
+
+            if (vex.l) {
+                v1 = avx_get_reg(dyn, ninst, x6, eb2, 0, LSX_AVX_WIDTH_256);
+                if (rex.w) {
+                    XVSRLI_D(d1, v2, 63);
+                } else {
+                    XVSRLI_W(d1, v2, 31);
+                };
+                for (int i = 0; i < (rex.w ? 4 : 8); i++) {
+                    XVPICKVE2GRxw(x4, d1, i);
+                    BEQZ(x4, 4 + 4 * 4);
+                    XVPICKVE2GR_WU(x4, v1, i);
+                    SLLI_D(x4, x4, wb1);
+                    LDXxw(x6, ed, x4);
+                    XVINSGR2VRxw(v0, x6, i);
+                }
+                XVXOR_V(v2, v2, v2);
+            } else {
+                v1 = avx_get_reg(dyn, ninst, x6, eb2, 0, LSX_AVX_WIDTH_128);
+                if (rex.w) {
+                    VSRLI_D(d1, v2, 63);
+                } else {
+                    VSRLI_W(d1, v2, 31);
+                };
+                for (int i = 0; i < (rex.w ? 2 : 4); i++) {
+                    VPICKVE2GRxw(x4, d1, i);
+                    BEQZ(x4, 4 + 4 * 4);
+                    VPICKVE2GR_WU(x4, v1, i);
+                    SLLI_D(x4, x4, wb1);
+                    LDXxw(x6, ed, x4);
+                    VINSGR2VRxw(v0, x6, i);
+                }
+                VXOR_V(v2, v2, v2);
+            }
+            break;
+        case 0x91:
+        case 0x93:
+            if (opcode == 0x91) {
+                INST_NAME("VPGATHERQD/VPGATHERQQ Gx, VSIB, Vx");
+            } else {
+                INST_NAME("VGATHERQPD/VGATHERQPS Gx, VSIB, Vx");
+            }
+            nextop = F8;
+            if (((nextop & 7) != 4) || MODREG) { UDF(); }
+            GETG;
+            u8 = F8; // SIB
+            if ((u8 & 0x7) == 0x5 && !(nextop & 0xC0)) {
+                int64_t i64 = F32S64;
+                MOV64x(x5, i64);
+                eb1 = x5;
+            } else
+                eb1 = TO_NAT((u8 & 0x7) + (rex.b << 3)); // base
+            eb2 = ((u8 >> 3) & 7) + (rex.x << 3); // index
+            if (nextop & 0x40)
+                i32 = F8S;
+            else if (nextop & 0x80)
+                i32 = F32S;
+            else
+                i32 = 0;
+            if (!i32)
+                ed = eb1;
+            else {
+                ed = x3;
+                if (i32 < -2048 || i32 >= 2048) {
+                    MOV64x(ed, i32);
+                    ADD_D(ed, ed, eb1);
+                } else {
+                    ADDI_D(ed, eb1, i32);
+                }
+            }
+            // ed is base
+            wb1 = u8 >> 6; // scale
+            GETVYxy(v2, 1);
+            GETGYxy(v0, 1);
+            d0 = fpu_get_scratch(dyn);
+            d1 = fpu_get_scratch(dyn);
+
+            if (vex.l) {
+                v1 = avx_get_reg(dyn, ninst, x6, eb2, 0, LSX_AVX_WIDTH_256);
+                if (rex.w) {
+                    XVSRLI_D(d1, v2, 63);
+                } else {
+                    XVSRLI_W(d1, v2, 31);
+                };
+                for (int i = 0; i < 4; i++) {
+                    XVPICKVE2GRxw(x4, d1, i);
+                    BEQZ(x4, 4 + 4 * 4);
+                    XVPICKVE2GR_D(x4, v1, i);
+                    SLLI_D(x4, x4, wb1);
+                    LDXxw(x6, ed, x4);
+                    XVINSGR2VRxw(v0, x6, i);
+                }
+                XVXOR_V(v2, v2, v2);
+            } else {
+                v1 = avx_get_reg(dyn, ninst, x6, eb2, 0, LSX_AVX_WIDTH_128);
+                if (rex.w) {
+                    VSRLI_D(d1, v2, 63);
+                } else {
+                    VSRLI_W(d1, v2, 31);
+                };
+                for (int i = 0; i < 2; i++) {
+                    VPICKVE2GRxw(x4, d1, i);
+                    BEQZ(x4, 4 + 4 * 4);
+                    VPICKVE2GR_D(x4, v1, i);
+                    SLLI_D(x4, x4, wb1);
+                    LDXxw(x6, ed, x4);
+                    VINSGR2VRxw(v0, x6, i);
+                }
+                VXOR_V(v2, v2, v2);
+            }
+            XVPERMI_Q(v0, v2, XVPERMI_IMM_4_0(1, 2));
+            break;
         case 0xF7:
             INST_NAME("SHLX Gd, Ed, Vd");
             nextop = F8;
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c b/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
index 4a5baf93..fedc6ec8 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
@@ -301,6 +301,21 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 }
             }
             break;
+        case 0x17:
+            INST_NAME("VEXTRACTPS Ed, Gx, imm8");
+            nextop = F8;
+            GETGYx(v0, 0);
+            if (MODREG) {
+                ed = TO_NAT((nextop & 7) + (rex.b << 3));
+                u8 = F8 & 0b11;
+                VPICKVE2GR_WU(ed, v0, u8);
+            } else {
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, x5, &fixedaddress, rex, NULL, 0, 1);
+                u8 = F8 & 0b11;
+                VSTELM_W(v0, ed, 0, u8);
+                SMWRITE2();
+            }
+            break;
         case 0x18:
         case 0x38:
             if (opcode == 0x18) {
@@ -335,6 +350,44 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 }
             }
             break;
+        case 0x21:
+            INST_NAME("VINSERTPS Gx, Vx, Ex, Ib");
+            nextop = F8;
+            uint8_t src_index = (u8 >> 6) & 3;
+            uint8_t dst_index = (u8 >> 4) & 3;
+            uint8_t zmask = u8 & 0xf;
+            q1 = fpu_get_scratch(dyn);
+            if (MODREG) {
+                GETGY_empty_VYEY_xy(v0, v1, v2, 1);
+                u8 = F8;
+                if (v0 == v2) {
+                    VOR_V(q1, v2, v2);
+                    if (v0 != v1) VOR_V(v0, v1, v1);
+                    VEXTRINS_W(v0, q1, VEXTRINS_IMM_4_0(dst_index, src_index));
+                } else {
+                    if (v0 != v1) VOR_V(v0, v1, v1);
+                    VEXTRINS_W(v0, v2, VEXTRINS_IMM_4_0(dst_index, src_index));
+                }
+            } else {
+                GETVYx(v1, 0);
+                GETGYx_empty(v0);
+                u8 = F8;
+                if (v0 != v1) VOR_V(v0, v1, v1);
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &wback, x3, x5, &fixedaddress, rex, NULL, 0, 1);
+                u8 = F8;
+                FLD_S(q1, wback, fixedaddress);
+                VEXTRINS_W(v0, q1, VEXTRINS_IMM_4_0(dst_index, 0)); // src index is zero when Ex is mem operand
+            }
+            VXOR_V(q1, q1, q1);
+            if (zmask) {
+                for (uint8_t i = 0; i < 4; i++) {
+                    if (zmask & (1 << i)) {
+                        VEXTRINS_W(v0, q1, VEXTRINS_IMM_4_0(i, 0));
+                    }
+                }
+            }
+            break;
         case 0x2A:
             INST_NAME("VMOVNTDQA Gx, Ex");
             nextop = F8;
diff --git a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h
index b94c3c88..d1ba3b41 100644
--- a/src/dynarec/la64/la64_emitter.h
+++ b/src/dynarec/la64/la64_emitter.h
@@ -1943,10 +1943,19 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define VNEG_W(vd, vj) EMIT(type_2R(0b0111011010011100001110, vj, vd))
 #define VNEG_D(vd, vj) EMIT(type_2R(0b0111011010011100001111, vj, vd))
 
-#define XVLD(vd, rj, imm12) EMIT(type_2RI12(0b0010110010, imm12, rj, vd))
-#define XVST(vd, rj, imm12) EMIT(type_2RI12(0b0010110011, imm12, rj, vd))
-#define XVLDX(vd, vj, vk) EMIT(type_3R(0b00111000010010000, vk, vj, vd))
-#define XVSTX(vd, vj, vk) EMIT(type_3R(0b00111000010011000, vk, vj, vd))
+#define XVLD(vd, rj, imm12)             EMIT(type_2RI12(0b0010110010, imm12, rj, vd))
+#define XVST(vd, rj, imm12)             EMIT(type_2RI12(0b0010110011, imm12, rj, vd))
+#define XVLDX(vd, vj, vk)               EMIT(type_3R(0b00111000010010000, vk, vj, vd))
+#define XVSTX(vd, vj, vk)               EMIT(type_3R(0b00111000010011000, vk, vj, vd))
+#define XVLDREPL_D(xd, rj, offset)      EMIT(type_2RI9(0b0011001000010, (offset >> 3), rj, xd))
+#define XVLDREPL_W(xd, rj, offset)      EMIT(type_2RI10(0b001100100010, (offset >> 2), rj, xd))
+#define XVLDREPL_H(xd, rj, offset)      EMIT(type_2RI11(0b00110010010, (offset >> 1), rj, xd))
+#define XVLDREPL_B(xd, rj, offset)      EMIT(type_2RI12(0b0011001010, offset, rj, xd))
+#define XVSTELM_D(xd, rj, offset, imm2) EMIT(type_2RI10(0b001100110001, (((imm2) << 8) | (offset >> 3)), rj, xd))
+#define XVSTELM_W(xd, rj, offset, imm3) EMIT(type_2RI11(0b00110011001, (((imm3) << 8) | (offset >> 2)), rj, xd))
+#define XVSTELM_H(xd, rj, offset, imm4) EMIT(type_2RI12(0b0011001101, (((imm4) << 8) | (offset >> 1)), rj, xd))
+#define XVSTELM_B(xd, rj, offset, imm5) EMIT(type_2RI13(0b001100111, (((imm5) << 8) | offset), rj, xd))
+
 #define XVHSELI_D(vd, vj, imm5) EMIT(type_2RI5(0b01110110100111111, imm5, vj, vd))
 
 #define XVROTRI_B(vd, vj, imm3) EMIT(type_2RI3(0b0111011010100000001, imm3, vj, vd))
@@ -2896,24 +2905,6 @@ LSX instruction starts with V, LASX instruction starts with XV.
         }                     \
     } while (0)
 
-#define VSLTIxy(width, vd, vj, imm)  \
-    do {                             \
-        if (vex.l) {                 \
-            XVSLTI_##width(vd, vj, imm); \
-        } else {                     \
-            VSLTI_##width(vd, vj, imm);  \
-        }                            \
-    } while (0)
-
-#define VBITSEL_Vxy(vd, vj, vk, va)  \
-    do {                             \
-        if (vex.l) {                 \
-            XVBITSEL_V(vd, vj, vk, va); \
-        } else {                     \
-            VBITSEL_V(vd, vj, vk, va);  \
-        }                            \
-    } while (0)
-
 #define VSHUF_Bxy(vd, vj, vk, va) \
     do {                          \
         if (vex.l) {              \
@@ -2958,4 +2949,61 @@ LSX instruction starts with V, LASX instruction starts with XV.
             VANDI_B(vd, vj, imm); \
         }                         \
     } while (0)
+
+#define VLDREPLxy(width, vd, rj, imm)      \
+    do {                                   \
+        if (vex.l) {                       \
+            XVLDREPL_##width(vd, rj, imm); \
+        } else {                           \
+            VLDREPL_##width(vd, rj, imm);  \
+        }                                  \
+    } while (0)
+
+#define XVPICKVE2GRxw(rd, xj, imm)       \
+    do {                                 \
+        if (rex.w)                       \
+            XVPICKVE2GR_D(rd, xj, imm);  \
+        else                             \
+            XVPICKVE2GR_WU(rd, xj, imm); \
+    } while (0)
+
+#define XVINSGR2VRxw(xd, rj, imm)      \
+    do {                               \
+        if (rex.w)                     \
+            XVINSGR2VR_D(xd, rj, imm); \
+        else                           \
+            XVINSGR2VR_W(xd, rj, imm); \
+    } while (0)
+
+#define VPICKVE2GRxw(rd, xj, imm)       \
+    do {                                \
+        if (rex.w)                      \
+            VPICKVE2GR_D(rd, xj, imm);  \
+        else                            \
+            VPICKVE2GR_WU(rd, xj, imm); \
+    } while (0)
+
+#define VINSGR2VRxw(xd, rj, imm)      \
+    do {                              \
+        if (rex.w)                    \
+            VINSGR2VR_D(xd, rj, imm); \
+        else                          \
+            VINSGR2VR_W(xd, rj, imm); \
+    } while (0)
+
+#define XVINSVE0xw(xd, xj, imm)      \
+    do {                             \
+        if (rex.w)                   \
+            XVINSVE0_D(xd, xj, imm); \
+        else                         \
+            XVINSVE0_W(xd, xj, imm); \
+    } while (0)
+
+#define VEXTRINSxw(xd, xj, imm)      \
+    do {                             \
+        if (rex.w)                   \
+            VEXTRINS_D(xd, xj, imm); \
+        else                         \
+            VEXTRINS_W(xd, xj, imm); \
+    } while (0)
 #endif //__ARM64_EMITTER_H__