| | | |
|---|---|---|
| author | ptitSeb <sebastien.chev@gmail.com> | 2024-06-08 21:31:11 +0200 |
| committer | ptitSeb <sebastien.chev@gmail.com> | 2024-06-08 21:31:11 +0200 |
| commit | 950241d52f7fb777b8afa7a253c88f3de65530b8 (patch) | |
| tree | dc713205434c082ebceba45524e3b25c0b4fa07a /src | |
| parent | a147c12f728cd50a9b462244334cf5a720a2b435 (diff) | |
| download | box64-950241d52f7fb777b8afa7a253c88f3de65530b8.tar.gz box64-950241d52f7fb777b8afa7a253c88f3de65530b8.zip | |
[ARM64_DYNAREC] Added AVX.66.0F38 41/8E/9B, AVX.66.0F3A 06 and AVX.66.0F F6 opcodes
Diffstat (limited to 'src')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/dynarec/arm64/arm64_emitter.h | 27 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f.c | 15 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c | 71 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c | 43 |
4 files changed, 156 insertions, 0 deletions
diff --git a/src/dynarec/arm64/arm64_emitter.h b/src/dynarec/arm64/arm64_emitter.h
index 844d29dd..f0d4c4b8 100644
--- a/src/dynarec/arm64/arm64_emitter.h
+++ b/src/dynarec/arm64/arm64_emitter.h
@@ -1980,6 +1980,33 @@ int convert_bitmask(uint64_t bitmask);
 #define UMINQ_32(Vd, Vn, Vm)    EMIT(MINMAX_vector(1, 1, 0b10, Vm, 1, Vn, Vd))
 //#define UMINQ_64(Vd, Vn, Vm)  EMIT(MINMAX_vector(1, 1, 0b11, Vm, 1, Vn, Vd))

+// MIN or MAX across vector
+#define MAXMINV_vector(Q, U, size, op, Rn, Rd)  ((Q)<<30 | (U)<<29 | 0b01110<<24 | (size)<<22 | 0b11000<<17 | (op)<<16 | 0b1010<<12 | 0b10<<10 | (Rn)<<5 | (Rd))
+#define SMAXV_8(Vd, Vn)     EMIT(MAXMINV_vector(0, 0, 0b00, 0, Vn, Vd))
+#define SMINV_8(Vd, Vn)     EMIT(MAXMINV_vector(0, 0, 0b00, 1, Vn, Vd))
+#define UMAXV_8(Vd, Vn)     EMIT(MAXMINV_vector(0, 1, 0b00, 0, Vn, Vd))
+#define UMINV_8(Vd, Vn)     EMIT(MAXMINV_vector(0, 1, 0b00, 1, Vn, Vd))
+#define SMAXV_16(Vd, Vn)    EMIT(MAXMINV_vector(0, 0, 0b01, 0, Vn, Vd))
+#define SMINV_16(Vd, Vn)    EMIT(MAXMINV_vector(0, 0, 0b01, 1, Vn, Vd))
+#define UMAXV_16(Vd, Vn)    EMIT(MAXMINV_vector(0, 1, 0b01, 0, Vn, Vd))
+#define UMINV_16(Vd, Vn)    EMIT(MAXMINV_vector(0, 1, 0b01, 1, Vn, Vd))
+#define SMAXV_32(Vd, Vn)    EMIT(MAXMINV_vector(0, 0, 0b10, 0, Vn, Vd))
+#define SMINV_32(Vd, Vn)    EMIT(MAXMINV_vector(0, 0, 0b10, 1, Vn, Vd))
+#define UMAXV_32(Vd, Vn)    EMIT(MAXMINV_vector(0, 1, 0b10, 0, Vn, Vd))
+#define UMINV_32(Vd, Vn)    EMIT(MAXMINV_vector(0, 1, 0b10, 1, Vn, Vd))
+#define SMAXVQ_8(Vd, Vn)    EMIT(MAXMINV_vector(1, 0, 0b00, 0, Vn, Vd))
+#define SMINVQ_8(Vd, Vn)    EMIT(MAXMINV_vector(1, 0, 0b00, 1, Vn, Vd))
+#define UMAXVQ_8(Vd, Vn)    EMIT(MAXMINV_vector(1, 1, 0b00, 0, Vn, Vd))
+#define UMINVQ_8(Vd, Vn)    EMIT(MAXMINV_vector(1, 1, 0b00, 1, Vn, Vd))
+#define SMAXVQ_16(Vd, Vn)   EMIT(MAXMINV_vector(1, 0, 0b01, 0, Vn, Vd))
+#define SMINVQ_16(Vd, Vn)   EMIT(MAXMINV_vector(1, 0, 0b01, 1, Vn, Vd))
+#define UMAXVQ_16(Vd, Vn)   EMIT(MAXMINV_vector(1, 1, 0b01, 0, Vn, Vd))
+#define UMINVQ_16(Vd, Vn)   EMIT(MAXMINV_vector(1, 1, 0b01, 1, Vn, Vd))
+#define SMAXVQ_32(Vd, Vn)   EMIT(MAXMINV_vector(1, 0, 0b10, 0, Vn, Vd))
+#define SMINVQ_32(Vd, Vn)   EMIT(MAXMINV_vector(1, 0, 0b10, 1, Vn, Vd))
+#define UMAXVQ_32(Vd, Vn)   EMIT(MAXMINV_vector(1, 1, 0b10, 0, Vn, Vd))
+#define UMINVQ_32(Vd, Vn)   EMIT(MAXMINV_vector(1, 1, 0b10, 1, Vn, Vd))
+
 // HADD vector
 #define HADD_vector(Q, U, size, Rm, Rn, Rd)    ((Q)<<30 | (U)<<29 | 0b01110<<24 | (size)<<22 | 1<<21 | (Rm)<<16 | 1<<10 | (Rn)<<5 | (Rd))
 #define SHADD_8(Vd, Vn, Vm)     EMIT(HADD_vector(0, 0, 0b00, Vm, Vn, Vd))
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
index f36bf476..9fa9498a 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
@@ -1764,6 +1764,21 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             if(!vex.l) YMM0(gd);
             break;
+        case 0xF6:
+            INST_NAME("VPSADBW Gx, Vx, Ex");
+            nextop = F8;
+            d0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                UABDL_8(d0, v2, v1);
+                UADDLVQ_16(d0, d0);
+                VMOVeD(v0, 0, d0, 0);
+                UABDL2_8(d0, v2, v1);
+                UADDLVQ_16(d0, d0);
+                VMOVeD(v0, 1, d0, 0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
         case 0xF8:
             INST_NAME("VPSUBB Gx, Vx, Ex");
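For context on the 0xF6 translation above: x86 PSADBW/VPSADBW computes, for each 64-bit group of the sources, the sum of absolute differences of eight unsigned bytes and zero-extends that 16-bit sum into the destination quadword, which is what the UABDL_8/UABDL2_8 (widening absolute difference, low/high half) plus UADDLVQ_16 (add across vector) sequence reproduces per 128-bit lane. A minimal scalar sketch of those semantics, using a hypothetical helper name that is not part of box64:

```c
#include <stdint.h>

// Illustrative reference model of one 128-bit PSADBW/VPSADBW lane pair
// (hypothetical helper, not box64 code): for each 64-bit half, sum the
// absolute differences of 8 unsigned bytes and store the 16-bit result
// zero-extended into the corresponding destination quadword.
static void psadbw_ref_128(const uint8_t a[16], const uint8_t b[16], uint64_t dst[2])
{
    for (int half = 0; half < 2; ++half) {
        uint16_t sad = 0;
        for (int i = 0; i < 8; ++i) {
            int d = (int)a[half*8 + i] - (int)b[half*8 + i];
            sad += (uint16_t)(d < 0 ? -d : d);  // like UABDL_8 / UABDL2_8
        }
        dst[half] = sad;                        // like UADDLVQ_16 + VMOVeD insert
    }
}
```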
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index 9236fc8b..320c31ab 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -712,6 +712,27 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             }
             if(!vex.l) YMM0(gd);
             break;
+        case 0x41:
+            INST_NAME("PHMINPOSUW Gx, Ex");
+            nextop = F8;
+            GETEX(v1, 0, 0);
+            GETGX_empty(v0);
+            q0 = fpu_get_scratch(dyn, ninst);
+            q1 = fpu_get_scratch(dyn, ninst);
+            // get the min value
+            UMINVQ_16(q0, v1);      // q0.uw[0] = min value
+            VDUPQ_16(q1, q0, 0);    // vector of min value
+            VCMEQQ_16(q1, q1, v1);  // bit field of the elements that are the min value
+            UQXTN_8(q1, q1);        // same bit field, but on 8bits elements only, easier to handle
+            VMOVQDto(x1, q1, 0);    // grab the bit field as a 64bits value
+            VEORQ(v0, v0, v0);      // RAZ everything
+            RBITx(x1, x1);          // reverse, we want trailing zeros but can only count leading ones
+            CLZx(x1, x1);
+            VMOVeH(v0, 0, q0, 0);   // set up min
+            LSRw(x1, x1, 3);        // divide by 8, that's our index...
+            VMOVQHfrom(v0, 1, x1);
+            YMM0(gd);
+            break;
         case 0x45:
             INST_NAME("VPSRLVD/Q Gx, Vx, Ex");
@@ -871,6 +892,39 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             if(!vex.l) YMM0(gd);
             break;
+        case 0x8E:
+            INST_NAME("VMASKMOVD/Q Ex, Vx, Gx");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) {
+                    GETGX(v0, 0); GETVX(v2, 0);
+                    if(MODREG) {
+                        s0 = (nextop&7)+(rex.b<<3);
+                        v1 = sse_get_reg_empty(dyn, ninst, x1, s0);
+                    } else {
+                        s0 = -1;
+                        v1 = fpu_get_scratch(dyn, ninst);
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
+                        VLDR128_U12(v1, ed, fixedaddress);
+                    }
+                } else {
+                    GETGY(v0, 0, vex.v, s0, -1); v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, s0, -1);
+                    if(MODREG)
+                        v1 = ymm_get_reg_empty(dyn, ninst, x1, s0, gd, vex.v, -1);
+                    else
+                        VLDR128_U12(v1, ed, fixedaddress+16);
+                }
+                if(rex.w)
+                    VSSHRQ_64(q0, v2, 63);
+                else
+                    VSSHRQ_32(q0, v2, 31);
+                VBITQ(v1, v0, q0);
+                if(!MODREG)
+                    VSTR128_U12(v1, ed, fixedaddress+16*l);
+            }
+            // no raz of upper ymm
+            break;
         case 0x90:
         case 0x92:
@@ -959,6 +1013,23 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             YMM0(gd);
             break;
+        case 0x9B:
+            INST_NAME("VFMSUB132SS/D Gx, Vx, Ex");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETVX(v2, 0);
+            if(rex.w) {GETEXSD(v1, 0, 0);} else {GETEXSS(v1, 0, 0);}
+            q0 = fpu_get_scratch(dyn, ninst);
+            if(rex.w) {
+                FNMSUB_64(q0, v2, v1, v0);
+                VMOVeD(v0, 0, q0, 0);
+            } else {
+                FNMSUB_32(q0, v2, v1, v0);
+                VMOVeS(v0, 0, q0, 0);
+            }
+            YMM0(gd);
+            break;
+
         case 0x9D:
             INST_NAME("VFNMADD132SS/D Gx, Vx, Ex");
             nextop = F8;
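As a side note on the 0x41 case above: SSE4.1 PHMINPOSUW returns the smallest of the eight unsigned 16-bit words of the source in word 0 of the destination, the index of its first occurrence in word 1, and zeroes the remaining words; since AArch64 only offers a count-leading-zeros instruction here, the translation counts trailing zeros of the compare mask with the RBIT+CLZ pair. A rough scalar model, with an invented helper name (not box64 code):

```c
#include <stdint.h>

// Illustrative reference model of PHMINPOSUW (hypothetical helper, not box64 code):
// dst.u16[0] = minimum unsigned word of src,
// dst.u16[1] = index of its first (lowest) occurrence,
// dst.u16[2..7] = 0.
static void phminposuw_ref(const uint16_t src[8], uint16_t dst[8])
{
    uint16_t min = src[0], idx = 0;
    for (uint16_t i = 1; i < 8; ++i)
        if (src[i] < min) { min = src[i]; idx = i; }  // first match wins on ties
    for (int i = 0; i < 8; ++i) dst[i] = 0;           // like VEORQ(v0, v0, v0)
    dst[0] = min;                                     // like VMOVeH(v0, 0, q0, 0)
    dst[1] = idx;                                     // like VMOVQHfrom(v0, 1, x1)
}
```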
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index f85f9f3c..620bf31d 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -137,6 +137,49 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             }
             if(!vex.l) YMM0(gd);
             break;
+        case 0x06:
+            INST_NAME("VPERM2F128 Gx, Vx, Ex, Imm8");
+            nextop = F8;
+            if(!vex.l) UDF(0);
+            if(MODREG) {
+                s0 = (nextop&7)+(rex.b<<3);
+                v1 = sse_get_reg_empty(dyn, ninst, x1, s0);
+            } else {
+                s0 = -1;
+                v1 = fpu_get_scratch(dyn, ninst);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
+            }
+            u8 = F8;
+            GETVX(v2, 0);
+            GETGX_empty(v0);
+            if((v0==v2) && ((u8&0xf0)==0)) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                VMOVQ(q0, v2);
+                v2 = q0;
+            }
+            if((v0==v1) && ((u8&0xf0)==0x20)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                VMOVQ(q1, v1);
+                v1 = q1;
+            }
+            switch(u8&0x0f) {
+                case 0: if(v0!=v2) VMOVQ(v0, v2); break;
+                case 1: d2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, s0, -1); VMOVQ(v0, d2); break;
+                case 2: if(MODREG) { if(v0!=v1) VMOVQ(v0, v1); } else { VLDR128_U12(v0, ed, fixedaddress); } break;
+                case 3: if(MODREG) { d1 = ymm_get_reg(dyn, ninst, x1, s0, 0, gd, vex.v, -1); VMOVQ(v0, d1); } else { VLDR128_U12(v0, ed, fixedaddress+16); } break;
+                default: VEORQ(v0, v0, v0);
+            }
+            if((u8&0xf0)==0x10) { if((u8&0x0f)!=1) d2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, s0, -1); }
+            if(MODREG && ((u8&0xf0)==0x30)) { if((u8&0x0f)!=3) d1 = ymm_get_reg(dyn, ninst, x1, s0, 0, gd, vex.v, -1); }
+            v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, s0, -1);
+            switch((u8>>4)&0x0f) {
+                case 0: VMOVQ(v0, v2); break;
+                case 1: if(v0!=d2) VMOVQ(v0, d2); break;
+                case 2: if(MODREG) { if(v0!=v1) VMOVQ(v0, v1); } else { VLDR128_U12(v0, ed, fixedaddress); } break;
+                case 3: if(MODREG) { if(v0!=d1) VMOVQ(v0, d1); } else { VLDR128_U12(v0, ed, fixedaddress+16); } break;
+                default: VEORQ(v0, v0, v0);
+            }
+            break;
         case 0x08:
             INST_NAME("VROUNDPS Gx, Ex, Ib");
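For context on the imm8 handling in the two switch blocks of the 0x06 case: VPERM2F128 assembles each 128-bit half of the destination from one of the four 128-bit halves of the two sources, selected by imm8[1:0] (low half) and imm8[5:4] (high half), while imm8[3] and imm8[7] force the corresponding half to zero (the translation above routes any 4-bit selector value of 4 or more to the zeroing VEORQ default, which covers those zero bits). A compact reference sketch, with a made-up type and helper name (not box64 code):

```c
#include <stdint.h>
#include <string.h>

typedef struct { uint8_t b[16]; } lane128_t;  // one 128-bit half (made-up type)

// Illustrative reference model of VPERM2F128 dst, src1, src2, imm8
// (hypothetical helper, not box64 code).
static void vperm2f128_ref(lane128_t dst[2], const lane128_t src1[2],
                           const lane128_t src2[2], uint8_t imm8)
{
    const lane128_t zero = {{0}};
    const lane128_t *pool[4] = { &src1[0], &src1[1], &src2[0], &src2[1] };
    lane128_t tmp[2];  // compute first, then write back: mirrors the scratch
                       // copies the translation makes when Gx aliases Vx or Ex
    for (int half = 0; half < 2; ++half) {
        uint8_t sel = (uint8_t)(imm8 >> (half * 4));
        tmp[half] = (sel & 0x8) ? zero              // zeroing bit (imm8[3] / imm8[7])
                                : *pool[sel & 0x3]; // 2-bit half selector
    }
    memcpy(dst, tmp, sizeof(tmp));
}
```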