|  |  |  |
|---|---|---|
| author | ptitSeb <sebastien.chev@gmail.com> | 2024-06-08 15:21:55 +0200 |
| committer | ptitSeb <sebastien.chev@gmail.com> | 2024-06-08 15:21:55 +0200 |
| commit | 874828c2ac6ede4302b5f86a3405ba6650a9ebd4 (patch) | |
| tree | fdefbaaf1a97a9880476779992cf26d9bac02f0e /src | |
| parent | 3369e5e755dfbf291e3637730a8bdc530589cb97 (diff) | |
[ARM64_DYNAREC] Added AVX.66.0F 50/C2, AVX.66.0F3A 0D, AVX.66.0F38 16/28/29/2B/36/AC/BA and AVX.F2.0F 51 opcodes
Diffstat (limited to 'src')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f.c | 53 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c | 118 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c | 15 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c | 26 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_functions.c | 17 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_functions.h | 2 |
6 files changed, 228 insertions, 3 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
index 634dd91c..f36bf476 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
@@ -276,6 +276,26 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             FCOMI(x1, x2);
             break;
 
+        case 0x50:
+            nextop = F8;
+            INST_NAME("VMOVMSKPD Gd, Ex");
+            GETEX_Y(q0, 0, 0);
+            GETGD;
+            VMOVQDto(x1, q0, 0);
+            VMOVQDto(gd, q0, 1);
+            LSRx(gd, gd, 62);
+            BFXILx(gd, x1, 63, 1);
+            if(vex.l) {
+                GETEY(q0);
+                VMOVQDto(x1, q0, 0);
+                VMOVQDto(x2, q0, 1);
+                LSRx(x1, x1, 63);
+                LSRx(x2, x2, 63);
+                BFIx(gd, x1, 2, 1);
+                BFIx(gd, x2, 3, 1);
+            }
+            break;
+
         case 0x54:
             INST_NAME("VANDPD Gx, Vx, Ex");
             nextop = F8;
@@ -1120,6 +1140,39 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             break;
 
+        case 0xC2:
+            INST_NAME("VCMPPD Gx, Vx, Ex, Ib");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 1); u8 = F8; } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(((u8&15)==3) || ((u8&15)==7) || ((u8&15)==8) || ((u8&15)==9) || ((u8&15)==10) || ((u8&15)==12) || ((u8&15)==13) || ((u8&15)==14)) {
+                    VFMAXQD(q0, v2, v1);                    // propagate NAN
+                    VFCMEQQD(((u8&15)==7)?v0:q0, q0, q0);   // 0 if NAN, 1 if not NAN
+                }
+                switch(u8&0xf) {
+                    // the inversion of the params in the comparison is there to handle NaN the same way SSE does
+                    case 0x00: VFCMEQQD(v0, v2, v1); break;                     // Equal, not unordered
+                    case 0x01: VFCMGTQD(v0, v1, v2); break;                     // Less than
+                    case 0x02: VFCMGEQD(v0, v1, v2); break;                     // Less or equal
+                    case 0x03: VMVNQ(v0, q0); break;                            // unordered
+                    case 0x04: VFCMEQQD(v0, v2, v1); VMVNQ(v0, v0); break;      // Not Equal (or unordered on ARM, not on X86...)
+                    case 0x05: VFCMGTQD(v0, v1, v2); VMVNQ(v0, v0); break;      // Greater or equal or unordered
+                    case 0x06: VFCMGEQD(v0, v1, v2); VMVNQ(v0, v0); break;      // Greater or unordered
+                    case 0x07: break;                                           // ordered
+                    case 0x08: VFCMEQQD(v0, v2, v1); VORNQ(v0, v0, q0); break;  // Equal, or unordered
+                    case 0x09: VFCMGTQD(v0, v1, v2); VORNQ(v0, v0, q0); break;  // Less than or unordered
+                    case 0x0a: VFCMGEQD(v0, v1, v2); VORNQ(v0, v0, q0); break;  // Less or equal or unordered
+                    case 0x0b: VEORQ(v0, v0, v0); break;                        // false
+                    case 0x0c: VFCMEQQD(v0, v2, v1); VBICQ(v0, q0, v0); break;
+                    case 0x0d: VFCMGEQD(v0, v2, v1); break;
+                    case 0x0e: VFCMGTQD(v0, v2, v1); break;
+                    case 0x0f: MOVIQ_64(v0, 0xff); break;                       //true
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
         case 0xC4:
             INST_NAME("VPINSRW Gx, Vx, Ed, Ib");
             nextop = F8;
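The 0x50 case above builds the MOVMSKPD result by shifting and bit-inserting the per-lane sign bits into the destination GPR. For reference, a minimal scalar sketch of the semantics being emulated (illustrative C, not box64 code; `movmskpd_ref` and its parameters are made up for this note):

```c
#include <stdint.h>
#include <string.h>

/* Scalar model of MOVMSKPD: collect the sign bit of each 64-bit lane into
   the low bits of the destination. lanes is 2 for the xmm form and 4 when
   VEX.L selects the full ymm, matching the extra BFIx pair in the diff. */
static uint32_t movmskpd_ref(const double *src, int lanes)
{
    uint32_t mask = 0;
    for (int i = 0; i < lanes; ++i) {
        uint64_t bits;
        memcpy(&bits, &src[i], sizeof(bits)); /* reinterpret the double */
        mask |= (uint32_t)(bits >> 63) << i;  /* sign bit -> bit i */
    }
    return mask;
}
```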
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index 59573d53..9236fc8b 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -175,6 +175,52 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             } else YMM0(gd);
             break;
+        case 0x16:
+        case 0x36:
+            if(opcode==0x16) { INST_NAME("VPERMPS Gx, Vx, Ex"); } else { INST_NAME("VPERMD Gx, Vx, Ex"); }
+            nextop = F8;
+            if(!vex.l) UDF(0);
+            d0 = fpu_get_double_scratch(dyn, ninst);
+            d1 = d0+1;
+            q1 = fpu_get_scratch(dyn, ninst);
+            q0 = fpu_get_scratch(dyn, ninst);
+            s0 = MODREG?((nextop&7)+(rex.b<<3)):-1;
+            if(MODREG) {
+                v1 = sse_get_reg(dyn, ninst, x3, s0, 0);
+                VMOVQ(d0, v1);
+                v1 = ymm_get_reg(dyn, ninst, x3, s0, 0, gd, vex.v, -1);
+                VMOVQ(d1, v1);
+            } else {
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
+                VLDR128_U12(d0, ed, fixedaddress);
+                VLDR128_U12(d1, ed, fixedaddress+16);
+            }
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) {
+                    GETVX(v2, 0);
+                    GETGX_empty(v0);
+                } else {
+                    v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, s0, -1);
+                    GETGY_empty(v0, vex.v, s0, -1);
+                }
+                // transform u32 index in V2 to 4 u8 index in q0 for VTBL
+                MOVIQ_32(q0, 7);        // index and 7
+                VANDQ(q0, v2, q0);
+                SQXTN_16(q0, q0);       // index in 16bits
+                VSHL_16(q0, q0, 1);     // double the index
+                VZIP1Q_16(q0, q0, q0);  // repeat the index by pair
+                MOVIQ_32_lsl(q1, 1, 2); // q1 as 16bits is 0 / 1
+                VADDQ_16(q0, q0, q1);
+                SQXTN_8(q0, q0);        // index in 8bits
+                VSHL_8(q0, q0, 1);      // double the index
+                VZIP1Q_8(q0, q0, q0);   // repeat the index by pair
+                MOVIQ_16(q1, 1, 1);
+                VADDQ_8(q0, q0, q1);
+                // fetch the datas
+                VTBLQ2_8(v0, d0, q0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
 
         case 0x17:
             INST_NAME("VPTEST GX, EX");
             SETFLAGS(X_ALL, SF_SET);
@@ -374,6 +420,26 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             SXTL_32(q0, q1);
             break;
+        case 0x28:
+            INST_NAME("VPMULDQ Gx, Vx, Ex");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                VUZP1Q_32(q0, v2, v2);  // needs elem 0 and 2 in lower part
+                if(v2==v1) {
+                    q1 = q0;
+                } else {
+                    if(MODREG) {
+                        if(!l) q1 = fpu_get_scratch(dyn, ninst);
+                    } else
+                        q1 = q0;
+                    VUZP1Q_32(q1, v1, v1);
+                }
+                VSMULL_32(v0, q1, q0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
 
         case 0x29:
             INST_NAME("VPCMPEQQ Gx, Vx, Ex");
             nextop = F8;
@@ -384,6 +450,30 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             if(!vex.l) YMM0(gd);
             break;
+        case 0x2B:
+            INST_NAME("VPACKUSDW Gx, Ex, Vx");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            VEORQ(q0, q0, q0);
+            q1 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(v0==v1 && v2!=v1) {
+                    if(!l) q2 = fpu_get_scratch(dyn, ninst);
+                    VMOVQ(q2, v1);
+                    v1 = q2;
+                }
+                SMAXQ_32(q1, v2, q0);   // values < 0 => 0
+                UQXTN_16(v0, q1);
+                if(v2==v1) {
+                    VMOVeD(v0, 1, v0, 0);
+                } else {
+                    SMAXQ_32(q1, v1, q0);
+                    UQXTN2_16(v0, q1);
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
 
         case 0x2C:
             INST_NAME("VMASKMOVPS Gx, Vx, Ex");
             nextop = F8;
@@ -948,7 +1038,22 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             }
             YMM0(gd);
             break;
-
+        case 0xAC:
+            INST_NAME("VFNMADD213PS/D Gx, Vx, Ex");
+            nextop = F8;
+            if(MODREG) q0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_VXEX(v0, v2, v1, 0); } else { GETGY_VYEY(v0, v2, v1); }
+                if(!MODREG) q0 = v1; else VMOVQ(q0, v1);
+                if(rex.w) {
+                    VFMLSQD(q0, v0, v2);
+                } else {
+                    VFMLSQS(q0, v0, v2);
+                }
+                VMOVQ(v0, q0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
         case 0xAD:
             INST_NAME("VFMNADD213SS/D Gx, Vx, Ex");
             nextop = F8;
@@ -1025,7 +1130,16 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             }
             YMM0(gd);
             break;
-
+        case 0xBA:
+            INST_NAME("VFNMSUB231PS/D Gx, Vx, Ex");
+            nextop = F8;
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_VXEX(v0, v2, v1, 0); } else { GETGY_VYEY(v0, v2, v1); }
+                if(rex.w) VFMLSQD(v0, v1, v2); else VFMLSQS(v0, v1, v2);
+                if(rex.w) VFNEGQD(v0, v0); else VFNEGQS(v0, v0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
         case 0xBB:
             INST_NAME("VFMSUB231SS/D Gx, Vx, Ex");
             nextop = F8;
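In the 0x16/0x36 (VPERMPS/VPERMD) case above, the commented MOVIQ/VSHL/VZIP sequence turns each 32-bit lane selector into four byte offsets so `VTBLQ2_8` can gather whole dwords from the two table registers. A scalar sketch of that index expansion for one 128-bit half (illustrative only; the function name is made up for this note):

```c
#include <stdint.h>

/* Model of the selector expansion performed before VTBLQ2_8: each 32-bit
   selector, masked to 0..7, is expanded to the four byte offsets of the
   dword it selects inside the 32-byte source (the d0/d1 register pair). */
static void permd_expand_ref(const uint32_t sel[4], uint8_t byte_idx[16])
{
    for (int i = 0; i < 4; ++i) {
        uint32_t s = sel[i] & 7;                        /* index and 7 */
        for (int b = 0; b < 4; ++b)
            byte_idx[i * 4 + b] = (uint8_t)(4 * s + b); /* dword -> 4 bytes */
    }
}
```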
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index e667f562..f85f9f3c 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -249,7 +249,20 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                 }
             } else YMM0(gd);
             break;
-
+        case 0x0D:
+            INST_NAME("VPBLENDPD Gx, Vx, Ex, Ib");
+            nextop = F8;
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(q0, q2, q1, 1); u8 = F8; } else { GETGY_empty_VYEY(q0, q2, q1); }
+                switch(u8>>(l*2)&3) {
+                    case 0b00: if(q0!=q2) VMOVQ(q0, q2); break;                                         // VxVx
+                    case 0b01: if(q0!=q1) VMOVeD(q0, 0, q1, 0); if(q0!=q2) VMOVeD(q0, 1, q2, 1); break; // Ex[0]Vx[1]
+                    case 0b10: if(q0!=q2) VMOVeD(q0, 0, q2, 0); if(q0!=q1) VMOVeD(q0, 1, q1, 1); break; // Vx[0]Ex[1]
+                    case 0b11: if(q0!=q1) VMOVQ(q0, q1); break;                                         // ExEx
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
         case 0x0E:
             INST_NAME("VPBLENDW Gx, Vx, Ex, u8");
             nextop = F8;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
index d51d6d2b..de57bff9 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
@@ -183,6 +183,32 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             break;
 
+        case 0x51:
+            INST_NAME("VSQRTSD Gx, Vx, Ex");
+            nextop = F8;
+            d1 = fpu_get_scratch(dyn, ninst);
+            GETEXSD(v1, 0, 0);
+            GETGX_empty_VX(v0, v2);
+            if(!box64_dynarec_fastnan) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                q1 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FCMEQD(q0, v1, v1);     // 0 if NAN, 1 if not NAN
+            }
+            FSQRTD(d1, v1);
+            if(!box64_dynarec_fastnan) {
+                FCMEQD(q1, d1, d1);     // 0 => out is NAN
+                VBIC(q1, q0, q1);       // forget it in any input was a NAN already
+                VSHLQ_64(q1, q1, 63);   // only keep the sign bit
+                VORR(d1, d1, q1);       // NAN -> -NAN
+            }
+            if(v0!=v2) {
+                VMOVQ(v0, v2);
+            }
+            VMOVeD(v0, 0, d1, 0);
+            YMM0(gd)
+            break;
+
         case 0x58:
             INST_NAME("VADDSD Gx, Vx, Ex");
             nextop = F8;
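The non-fastnan path of the VSQRTSD case above only touches NaNs that the square root itself produces: `FCMEQD`/`VBIC` isolate the lane where a non-NaN input turned into a NaN, and `VSHLQ_64`/`VORR` then force the sign bit so the result matches the negative QNaN x86 returns for the square root of a negative value. Roughly, in scalar terms (a sketch, assuming the host `sqrt` yields a quiet NaN with the sign bit clear for negative inputs):

```c
#include <math.h>
#include <stdint.h>
#include <string.h>

/* Scalar model of the !box64_dynarec_fastnan handling in the 0x51 case:
   a NaN generated by the sqrt gets its sign bit set (NaN -> -NaN), while a
   NaN that was already present in the input is passed through unchanged. */
static double sqrtsd_ref(double x)
{
    double r = sqrt(x);
    if (!isnan(x) && isnan(r)) {        /* fresh NaN from a negative input */
        uint64_t bits;
        memcpy(&bits, &r, sizeof(bits));
        bits |= 1ULL << 63;             /* set the sign bit, as VORR does */
        memcpy(&r, &bits, sizeof(bits));
    }
    return r;
}
```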
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index 7248ef69..b05cbcc3 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -37,6 +37,23 @@ int fpu_get_scratch(dynarec_arm_t* dyn, int ninst)
     }
     return ret;
 }
+// Get 2 consicutive FPU scratch reg
+int fpu_get_double_scratch(dynarec_arm_t* dyn, int ninst)
+{
+    int ret = SCRATCH0 + dyn->n.fpu_scratch;
+    if(dyn->n.neoncache[ret].t==NEON_CACHE_YMMR || dyn->n.neoncache[ret].t==NEON_CACHE_YMMW) {
+        // should only happens in step 0...
+        dyn->scratchs |= (1<<(dyn->n.fpu_scratch));     // mark as not free
+        dyn->n.neoncache[ret].v = 0;                    // reset it
+    }
+    if(dyn->n.neoncache[ret+1].t==NEON_CACHE_YMMR || dyn->n.neoncache[ret+1].t==NEON_CACHE_YMMW) {
+        // should only happens in step 0...
+        dyn->scratchs |= (1<<(dyn->n.fpu_scratch+1));   // mark as not free
+        dyn->n.neoncache[ret+1].v = 0;                  // reset it
+    }
+    dyn->n.fpu_scratch+=2;
+    return ret;
+}
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_arm_t* dyn)
 {
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h
index 342f0f33..b6c95904 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.h
+++ b/src/dynarec/arm64/dynarec_arm64_functions.h
@@ -12,6 +12,8 @@
 // Get an FPU scratch reg
 int fpu_get_scratch(dynarec_arm_t* dyn, int ninst);
+// Get 2 consecutive FPU scratch reg
+int fpu_get_double_scratch(dynarec_arm_t* dyn, int ninst);
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_arm_t* dyn);
 // Get an x87 double reg
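The new `fpu_get_double_scratch` helper bumps the scratch counter by two so the caller gets an adjacent register pair, which is what lets the VPERMD/VPERMPS case hand `d0` and `d1 = d0 + 1` to `VTBLQ2_8` as a two-register table. A toy, self-contained model of that contract (names and the base register number are assumptions for illustration, not box64 internals):

```c
#include <stdio.h>

#define SCRATCH_BASE 24   /* assumed number of the first scratch NEON register */

static int scratch_next;  /* per-instruction bump counter, reset between ops */

static int get_scratch(void)        { return SCRATCH_BASE + scratch_next++; }
static int get_double_scratch(void) { int r = SCRATCH_BASE + scratch_next; scratch_next += 2; return r; }

int main(void)
{
    int q0 = get_scratch();         /* one scratch register */
    int d0 = get_double_scratch();  /* first of a consecutive pair */
    printf("q0=v%d, pair=v%d/v%d\n", q0, d0, d0 + 1);
    return 0;
}
```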