| author | ptitSeb <sebastien.chev@gmail.com> | 2024-06-05 19:44:02 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2024-06-05 19:44:02 +0200 |
| commit | b568cc529e9b10b6b42b2139351b4b3cb0858a28 (patch) | |
| tree | 437dada7817da906f7a89cf7f503d029e9d20f88 | |
| parent | dc8e24c7b785874eb6d7cca0df75f0fe6b597ebb (diff) | |
| download | box64-b568cc529e9b10b6b42b2139351b4b3cb0858a28.tar.gz box64-b568cc529e9b10b6b42b2139351b4b3cb0858a28.zip | |
[ARM64_DYNAREC] Added a bunch of AVX opcodes and some fixes too
| -rw-r--r-- | src/dynarec/arm64/arm64_emitter.h | 2 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_660f.c | 2 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_0f.c | 55 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f.c | 164 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c | 31 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c | 8 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c | 78 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c | 50 |
8 files changed, 301 insertions, 89 deletions
diff --git a/src/dynarec/arm64/arm64_emitter.h b/src/dynarec/arm64/arm64_emitter.h
index b8c71b6b..c7bb614d 100644
--- a/src/dynarec/arm64/arm64_emitter.h
+++ b/src/dynarec/arm64/arm64_emitter.h
@@ -1862,9 +1862,11 @@ int convert_bitmask(uint64_t bitmask);
#define MOVIQ_8(Rd, imm8) EMIT(MOVI_vector(1, 0, (((imm8)>>5)&0b111), 0b1110, ((imm8)&0b11111), Rd))
#define MOVIQ_16(Rd, imm8, lsl8) EMIT(MOVI_vector(1, 0, (((imm8)>>5)&0b111), 0b1000|((lsl8)?0b10:0), ((imm8)&0b11111), Rd))
#define MOVIQ_32(Rd, imm8) EMIT(MOVI_vector(1, 0, (((imm8)>>5)&0b111), 0b0000, ((imm8)&0b11111), Rd))
+#define MOVIQ_32_lsl(Rd, imm8, lsl8) EMIT(MOVI_vector(1, 0, (((imm8)>>5)&0b111), (lsl8<<1), ((imm8)&0b11111), Rd))
#define MOVIQ_64(Rd, imm8) EMIT(MOVI_vector(1, 1, (((imm8)>>5)&0b111), 0b1110, ((imm8)&0b11111), Rd))
#define MOVI_8(Rd, imm8) EMIT(MOVI_vector(0, 0, (((imm8)>>5)&0b111), 0b1110, ((imm8)&0b11111), Rd))
#define MOVI_16(Rd, imm8, lsl8) EMIT(MOVI_vector(0, 0, (((imm8)>>5)&0b111), 0b1000|((lsl8)?0b10:0), ((imm8)&0b11111), Rd))
+#define MOVI_32_lsl(Rd, imm8, lsl8) EMIT(MOVI_vector(0, 0, (((imm8)>>5)&0b111), (lsl8<<1), ((imm8)&0b11111), Rd))
#define MOVI_32(Rd, imm8) EMIT(MOVI_vector(0, 0, (((imm8)>>5)&0b111), 0b0000, ((imm8)&0b11111), Rd))
#define MOVI_64(Rd, imm8) EMIT(MOVI_vector(0, 1, (((imm8)>>5)&0b111), 0b1110, ((imm8)&0b11111), Rd))
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index b6450d96..caeaa465 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -1742,8 +1742,6 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
VFCVTZSQS(v0, v0);
} else {
MRS_fpsr(x5);
-BFCw(x5, FPSR_IOC, 1); // reset IOC bit
-MSR_fpsr(x5);
u8 = sse_setround(dyn, ninst, x1, x2, x3);
MOV32w(x4, 0x80000000);
d0 = fpu_get_scratch(dyn, ninst);
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
index 3b7d453c..268e1f86 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -326,8 +326,11 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
nextop = F8;
SKIPTEST(x1);
v0 = fpu_get_scratch(dyn, ninst);
+VFMOVSQ_8(v0, 0b01110000); //1.0f
for(int l=0; l<1+vex.l; ++l) {
if(!l) { GETGX_empty_EX(q0, q1, 0); } else { GETGY_empty_EY(q0, q1); }
+#if 0
+// the approximation does not work on Death Stranding. code around 0x1419c9100 fails...
if(!l) {
if(q1==q0) v1 = fpu_get_scratch(dyn, ninst);
@@ -339,6 +342,10 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
VFMULQS(v1, v0, q1);
VFRSQRTSQS(v1, v1, v0);
VFMULQS(q0, v1, v0);
+#else
+VFSQRTQS(q0, q1);
+VFDIVQS(q0, v0, q0);
+#endif
}
if(!vex.l) YMM0(gd);
break;
@@ -347,18 +354,25 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
nextop = F8;
SKIPTEST(x1);
q0 = fpu_get_scratch(dyn, ninst);
+VFMOVSQ_8(q0, 0b01110000); //1.0f
for(int l=0; l<1+vex.l; ++l) {
if(!l) {
GETGX_empty_EX(v0, v1, 0);
+#if 0
if(v0==v1) q1 = fpu_get_scratch(dyn, ninst);
+#endif
} else {
GETGY_empty_EY(v0, v1);
}
+#if 0
if(v0!=v1) q1 = v0;
VFRECPEQS(q0, v1);
VFRECPSQS(q1, q0, v1);
VFMULQS(v0, q0, q1);
+#else
+VFDIVQS(v0, q0, v1);
+#endif
}
if(!vex.l) YMM0(gd);
break;
@@ -550,27 +564,28 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
q0 = fpu_get_scratch(dyn, ninst);
for(int l=0; l<1+vex.l; ++l) {
if(!l) { GETGX_empty_VXEX(v0, v2, v1, 1); u8 = F8; } else { GETGY_empty_VYEY(v0, v2, v1); }
-switch(u8&7) {
+if(((u8&15)==3) || ((u8&15)==7) || ((u8&15)==8) || ((u8&15)==9) || ((u8&15)==10) || ((u8&15)==12) || ((u8&15)==13) || ((u8&15)==14)) {
+VFMAXQS(q0, v2, v1); // propagate NAN
+VFCMEQQS(((u8&15)==7)?v0:q0, q0, q0); // 0 if NAN, 1 if not NAN
+}
+switch(u8&0xf) {
// the inversion of the params in the comparison is there to handle NaN the same way SSE does
-case 0: VFCMEQQS(v0, v2, v1); break; // Equal
-case 1: VFCMGTQS(v0, v1, v2); break; // Less than
-case 2: VFCMGEQS(v0, v1, v2); break; // Less or equal
-case 3: VFCMEQQS(v0, (v0==v1)?v1:v2, (v0==v1)?v1:v2);
-if(v2!=v1) {
-VFCMEQQS(q0, (v0==v1)?v2:v1, (v0==v1)?v2:v1);
-VANDQ(v0, v0, q0);
-}
-VMVNQ(v0, v0);
-break; // NaN (NaN is not equal to himself)
-case 4: VFCMEQQS(v0, v2, v1); VMVNQ(v0, v0); break; // Not Equal (or unordered on ARM, not on X86...)
-case 5: VFCMGTQS(v0, v1, v2); VMVNQ(v0, v0); break; // Greater or equal or unordered
-case 6: VFCMGEQS(v0, v1, v2); VMVNQ(v0, v0); break; // Greater or unordered
-case 7: VFCMEQQS(v0, (v0==v1)?v1:v2, (v0==v1)?v1:v2);
-if(v2!=v1) {
-VFCMEQQS(q0, (v0==v1)?v2:v1, (v0==v1)?v2:v1);
-VANDQ(v0, v0, q0);
-}
-break; // not NaN
+case 0x00: VFCMEQQS(v0, v2, v1); break; // Equal, not unordered
+case 0x01: VFCMGTQS(v0, v1, v2); break; // Less than
+case 0x02: VFCMGEQS(v0, v1, v2); break; // Less or equal
+case 0x03: VMVNQ(v0, q0); break; // unordered
+case 0x04: VFCMEQQS(v0, v2, v1); VMVNQ(v0, v0); break; // Not Equal (or unordered on ARM, not on X86...)
+case 0x05: VFCMGTQS(v0, v1, v2); VMVNQ(v0, v0); break; // Greater or equal or unordered
+case 0x06: VFCMGEQS(v0, v1, v2); VMVNQ(v0, v0); break; // Greater or unordered
+case 0x07: break; // ordered
+case 0x08: VFCMEQQS(v0, v2, v1); VORNQ(v0, v0, q0); break; // Equal, or unordered
+case 0x09: VFCMGTQS(v0, v1, v2); VORNQ(v0, v0, q0); break; // Less than or unordered
+case 0x0a: VFCMGEQS(v0, v1, v2); VORNQ(v0, v0, q0); break; // Less or equal or unordered
+case 0x0b: VEORQ(v0, v0, v0); break; // false
+case 0x0c: VFCMEQQS(v0, v2, v1); VBICQ(v0, q0, v0); break;
+case 0x0d: VFCMGEQS(v0, v2, v1); break;
+case 0x0e: VFCMGTQS(v0, v2, v1); break;
+case 0x0f: MOVIQ_64(v0, 0xff); break; //true
}
}
if(!vex.l) YMM0(gd);
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
index 12dc5144..b7a3f80a 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
@@ -319,22 +319,50 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
case 0x58:
INST_NAME("VADDPD Gx, Vx, Ex");
nextop = F8;
-GETGX_empty_VXEX(v0, v2, v1, 0);
-VFADDQD(v0, v2, v1);
-if(vex.l) {
-GETGY_empty_VYEY(v0, v2, v1);
+if(!box64_dynarec_fastnan) {
+q0 = fpu_get_scratch(dyn, ninst);
+q1 = fpu_get_scratch(dyn, ninst);
+}
+for(int l=0; l<1+vex.l; ++l) {
+if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+if(!box64_dynarec_fastnan) {
+// check if any input value was NAN
+VFMAXQD(q0, v2, v1); // propagate NAN
+VFCMEQQD(q0, q0, q0); // 0 if NAN, 1 if not NAN
+}
VFADDQD(v0, v2, v1);
-} else YMM0(gd)
+if(!box64_dynarec_fastnan) {
+VFCMEQQD(q1, v0, v0); // 0 => out is NAN
+VBICQ(q1, q0, q1); // forget it if any input was a NAN already
+VSHLQ_64(q1, q1, 63); // only keep the sign bit
+VORRQ(v0, v0, q1); // NAN -> -NAN
+}
+}
+if(!vex.l) YMM0(gd)
break;
case 0x59:
INST_NAME("VMULPD Gx, Vx, Ex");
nextop = F8;
-GETGX_empty_VXEX(v0, v2, v1, 0);
-VFMULQD(v0, v2, v1);
-if(vex.l) {
-GETGY_empty_VYEY(v0, v2, v1);
+if(!box64_dynarec_fastnan) {
+q0 = fpu_get_scratch(dyn, ninst);
+q1 = fpu_get_scratch(dyn, ninst);
+}
+for(int l=0; l<1+vex.l; ++l) {
+if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+if(!box64_dynarec_fastnan) {
+// check if any input value was NAN
+VFMAXQD(q0, v2, v1); // propagate NAN
+VFCMEQQD(q0, q0, q0); // 0 if NAN, 1 if not NAN
+}
VFMULQD(v0, v2, v1);
-} else YMM0(gd)
+if(!box64_dynarec_fastnan) {
+VFCMEQQD(q1, v0, v0); // 0 => out is NAN
+VBICQ(q1, q0, q1); // forget it if any input was a NAN already
+VSHLQ_64(q1, q1, 63); // only keep the sign bit
+VORRQ(v0, v0, q1); // NAN -> -NAN
+}
+}
+if(!vex.l) YMM0(gd)
break;
case 0x5A:
INST_NAME("VCVTPD2PS Gx, Ex");
@@ -362,59 +390,39 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
case 0x5B:
INST_NAME("VCVTPS2DQ Gx, Ex");
nextop = F8;
-GETEX(v1, 0, 0);
-GETGX_empty(v0);
-if(box64_dynarec_fastround) {
-u8 = sse_setround(dyn, ninst, x1, x2, x3);
-VFRINTISQ(v0, v1);
-if(!vex.l) x87_restoreround(dyn, ninst, u8);
-VFCVTZSQS(v0, v0);
-} else {
-MRS_fpsr(x5);
-BFCw(x5, FPSR_IOC, 1); // reset IOC bit
-MSR_fpsr(x5);
-u8 = sse_setround(dyn, ninst, x1, x2, x3);
-MOV32w(x4, 0x80000000);
+u8 = sse_setround(dyn, ninst, x1, x2, x6);
+if(!box64_dynarec_fastround && !arm64_frintts) {
d0 = fpu_get_scratch(dyn, ninst);
-for(int i=0; i<4; ++i) {
-BFCw(x5, FPSR_IOC, 1); // reset IOC bit
-MSR_fpsr(x5);
-VMOVeS(d0, 0, v1, i);
-FRINTIS(d0, d0);
-VFCVTZSs(d0, d0);
-MRS_fpsr(x5); // get back FPSR to check the IOC bit
-TBZ(x5, FPSR_IOC, 4+4);
-VMOVQSfrom(d0, 0, x4);
-VMOVeS(v0, i, d0, 0);
-}
-if(!vex.l) x87_restoreround(dyn, ninst, u8);
+d1 = fpu_get_scratch(dyn, ninst);
+MOVI_32_lsl(d1, 0x80, 3);
}
-if(vex.l) {
-GETGY_empty_EY(v0, v1);
+for(int l=0; l<1+vex.l; ++l) {
+if(!l) { GETEX(v1, 0, 0); GETGX_empty(v0); } else { GETGY_empty_EY(v0, v1); }
if(box64_dynarec_fastround) {
VFRINTISQ(v0, v1);
-x87_restoreround(dyn, ninst, u8);
VFCVTZSQS(v0, v0);
} else {
-MRS_fpsr(x5);
-BFCw(x5, FPSR_IOC, 1); // reset IOC bit
-MSR_fpsr(x5);
-MOV32w(x4, 0x80000000);
-d0 = fpu_get_scratch(dyn, ninst);
-for(int i=0; i<4; ++i) {
-BFCw(x5, FPSR_IOC, 1); // reset IOC bit
-MSR_fpsr(x5);
-VMOVeS(d0, 0, v1, i);
-FRINTIS(d0, d0);
-VFCVTZSs(d0, d0);
-MRS_fpsr(x5); // get back FPSR to check the IOC bit
-TBZ(x5, FPSR_IOC, 4+4);
-VMOVQSfrom(d0, 0, x4);
-VMOVeS(v0, i, d0, 0);
+if(arm64_frintts) {
+VFRINT32XSQ(v0, v1);
+VFCVTZSQS(v0, v0);
+} else {
+if(!l) MRS_fpsr(x5);
+for(int i=0; i<4; ++i) {
+BFCx(x5, FPSR_IOC, 1); // reset IOC bits
+MSR_fpsr(x5);
+VMOVeS(d0, 0, v1, i);
+FRINTIS(d0, d0);
+VFCVTZSs(d0, d0);
+MRS_fpsr(x5); // get back FPSR to check the IOC bit
+TSTw_mask(x5, 0, 0); // mask=1
+FCSELS(d0, d0, d1, cEQ);
+VMOVeS(v0, i, d0, 0);
+}
}
-x87_restoreround(dyn, ninst, u8);
}
-} else YMM0(gd);
+}
+x87_restoreround(dyn, ninst, u8);
+if(!vex.l) YMM0(gd);
break;
case 0x5C:
INST_NAME("VSUBPD Gx, Vx, Ex");
@@ -1552,7 +1560,51 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
}
if(!vex.l) YMM0(gd);
break;
-
+case 0xF1:
+INST_NAME("VPSLLW Gx, Vx, Ex");
+nextop = F8;
+q0 = fpu_get_scratch(dyn, ninst);
+q1 = fpu_get_scratch(dyn, ninst);
+MOVI_32(q1, 16);
+for(int l=0; l<1+vex.l; ++l) {
+if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+UQXTN_32(q0, v1);
+UMIN_32(q0, q0, q1); // limit to 0 .. +16 values
+VDUPQ_16(q0, q0, 0); // only the low 8bits will be used anyway
+USHLQ_16(v0, v2, q0);
+}
+if(!vex.l) YMM0(gd);
+break;
+case 0xF2:
+INST_NAME("VPSLLD Gx, Vx, Ex");
+nextop = F8;
+q0 = fpu_get_scratch(dyn, ninst);
+q1 = fpu_get_scratch(dyn, ninst);
+MOVI_32(q1, 32);
+for(int l=0; l<1+vex.l; ++l) {
+if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+UQXTN_32(q0, v1);
+UMIN_32(q0, q0, q1); // limit to 0 .. +32 values
+VDUPQ_32(q0, q0, 0); // only the low 8bits will be used anyway
+USHLQ_32(v0, v2, q0);
+}
+if(!vex.l) YMM0(gd);
+break;
+case 0xF3:
+INST_NAME("VPSLLQ Gx, Vx, Ex");
+nextop = F8;
+q0 = fpu_get_scratch(dyn, ninst);
+q1 = fpu_get_scratch(dyn, ninst);
+MOVI_32(q1, 64);
+for(int l=0; l<1+vex.l; ++l) {
+if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+UQXTN_32(q0, v1);
+UMIN_32(q0, q0, q1); // limit to 0 .. +64 values
+VDUPQ_64(q0, q0, 0); // only the low 8bits will be used anyway
+USHLQ_64(v0, v2, q0);
+}
+if(!vex.l) YMM0(gd);
+break;
case 0xF4:
INST_NAME("VPMULUDQ Gx, Vx, Ex");
nextop = F8;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index d358597d..f3ea8f41 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -123,6 +123,31 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
if(!vex.l) YMM0(gd);
break;
+case 0x0C:
+INST_NAME("VPERMILPS Gx, Vx, Ex");
+nextop = F8;
+q1 = fpu_get_scratch(dyn, ninst);
+q0 = fpu_get_scratch(dyn, ninst);
+for(int l=0; l<1+vex.l; ++l) {
+if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+// transform u32 index in V1 to 4 u8 index in q0 for VTBL
+MOVIQ_32(q0, 3); // index and 3
+VANDQ(q0, v1, q0);
+SQXTN_16(q0, q0); // index in 16bits
+VSHL_16(q0, q0, 1); // double the index
+VZIP1Q_16(q0, q0, q0); // repeat the index by pair
+MOVIQ_32_lsl(q1, 1, 2); // q1 as 16bits is 0 / 1
+VADDQ_16(q0, q0, q1);
+SQXTN_8(q0, q0); // index in 8bits
+VSHL_8(q0, q0, 1); // double the index
+VZIP1Q_8(q0, q0, q0); // repeat the index by pair
+MOVIQ_16(q1, 1, 1);
+VADDQ_8(q0, q0, q1);
+VTBLQ1_8(v0, v2, q0);
+}
+if(!vex.l) YMM0(gd);
+break;
+
case 0x17:
INST_NAME("VPTEST GX, EX");
SETFLAGS(X_ALL, SF_SET);
@@ -148,17 +173,19 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
}
IFX(X_ZF) {
VANDQ(v2, v0, v1);
+CMEQQ_0_64(v2, v2);
UQXTN_32(v2, v2);
VMOVQDto(x2, v2, 0);
-CMPSw_U12(x2, 0);
+ADDSx_U12(xZR, x2, 1);
CSETw(x2, cEQ);
BFIw(xFlags, x2, F_ZF, 1);
}
IFX(X_CF) {
VBICQ(v2, v1, v0);
+CMEQQ_0_64(v2, v2);
UQXTN_32(v2, v2);
VMOVQDto(x2, v2, 0);
-CMPSw_U12(x2, 0);
+ADDSx_U12(xZR, x2, 1);
CSETw(x2, cEQ);
BFIw(xFlags, x2, F_CF, 1);
}
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index 71b50105..371fe25a 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -111,7 +111,9 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
GETGY_empty_EY(v0, v1);
if(v0==v1) {VMOVQ(q1, v1);}
}
-for(int i=0; i<4; ++i)
+if(u8==0x00 || u8==0x55 || u8==0xAA || u8==0xFF)
+VDUPQ_32(v0, (v0==v1)?q1:v1, u8&3);
+else for(int i=0; i<4; ++i)
VMOVeS(v0, i, (v0==v1)?q1:v1, (u8>>(i*2))&3);
}
if(!vex.l) YMM0(gd);
@@ -128,7 +130,9 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
GETGY_empty_EY(v0, v1);
if(v0==v1) {VMOVQ(q1, v1);}
}
-for(int i=0; i<2; ++i)
+if(((u8>>(l*2))&1)==((u8>>(1+l*2))&1))
+VDUPQ_64(v0, (v0==v1)?q1:v1, ((u8>>(l*2))&1));
+else for(int i=0; i<2; ++i)
VMOVeD(v0, i, (v0==v1)?q1:v1, (u8>>(i+l*2))&1);
}
if(!vex.l) YMM0(gd);
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
index fb791452..3391a293 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
@@ -39,7 +39,7 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
int q0, q1, q2;
int d0, d1, d2;
int s0;
-uint64_t tmp64u;
+uint64_t tmp64u, u64;
int64_t j64;
int64_t fixedaddress;
int unscaled;
@@ -242,8 +242,9 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
d1 = fpu_get_scratch(dyn, ninst);
GETEXSD(v1, 0, 0);
GETGX_empty_VX(v0, v2);
+// VMINSD: if any input is NaN, or Ex[0]<Gx[0], copy Ex[0] -> Gx[0]
FCMPD(v2, v1);
-FCSELD(d1, v2, v1, cLS);
+FCSELD(d1, v1, v2, cCS);
if(v0!=v2) {
VMOVQ(v0, v2);
}
@@ -256,7 +257,20 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
d1 = fpu_get_scratch(dyn, ninst);
GETEXSD(v1, 0, 0);
GETGX_empty_VX(v0, v2);
+if(!box64_dynarec_fastnan) {
+q0 = fpu_get_scratch(dyn, ninst);
+q1 = fpu_get_scratch(dyn, ninst);
+// check if any input value was NAN
+FMAXD(q0, v2, v1); // propagate NAN
+FCMEQD(q0, q0, q0); // 0 if NAN, 1 if not NAN
+}
FDIVD(d1, v2, v1);
+if(!box64_dynarec_fastnan) {
+FCMEQD(q1, d1, d1); // 0 => out is NAN
+VBIC(q1, q0, q1); // forget it if any input was a NAN already
+VSHLQ_64(q1, q1, 63); // only keep the sign bit
+VORR(d1, d1, q1); // NAN -> -NAN
+}
if(v0!=v2) {
VMOVQ(v0, v2);
}
@@ -269,8 +283,8 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
d1 = fpu_get_scratch(dyn, ninst);
GETEXSD(v1, 0, 0);
GETGX_empty_VX(v0, v2);
-FCMPD(v2, v1);
-FCSELD(d1, v2, v1, cGE);
+FCMPD(v1, v2);
+FCSELD(d1, v1, v2, cCS);
if(v0!=v2) {
VMOVQ(v0, v2);
}
@@ -278,6 +292,62 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
YMM0(gd)
break;
+case 0x70:
+INST_NAME("VPSHUFLW Gx, Ex, Ib");
+nextop = F8;
+d0 = fpu_get_scratch(dyn, ninst);
+for(int l=0; l<1+vex.l; ++l) {
+if(!l) { GETEX(v1, 0, 1); GETGX(v0, 1); u8 = F8; } else { GETGY(v0, 1, MODREG?((nextop&7)+(rex.b<<3)):-1, -1, -1); GETEY(v1); }
+if(u8==0b00000000 || u8==0b01010101 || u8==0b10101010 || u8==0b11111111) {
+if(v0==v1) {
+VMOVQ(d0, v1);
+}
+VDUP_16(v0, v1, u8&3);
+if(v0==v1)
+v1 = d0;
+} else {
+// only the low part needs to be shuffled. VTBL only handles 8bit values, so the 16bit shuffles need to be changed to 8bit
+if(!l) {
+u64 = 0;
+for (int i=0; i<4; ++i) {
+u64 |= ((uint64_t)((u8>>(i*2))&3)*2+0)<<(i*16+0);
+u64 |= ((uint64_t)((u8>>(i*2))&3)*2+1)<<(i*16+8);
+}
+MOV64x(x2, u64);
+}
+VMOVQDfrom(d0, 0, x2);
+VTBL1_8(d0, v1, d0);
+VMOVeD(v0, 0, d0, 0);
+}
+if(v0!=v1) {
+VMOVeD(v0, 1, v1, 1);
+}
+}
+if(!vex.l) YMM0(gd);
+break;
+
+case 0xC2:
+INST_NAME("CMPSD Gx, Ex, Ib");
+nextop = F8;
+GETEXSD(v1, 0, 1);
+GETGX_empty_VX(v0, v2);
+u8 = F8;
+FCMPD(v2, v1);
+if(v0!=v2) VMOVQ(v0, v2);
+switch(u8&7) {
+case 0: CSETMx(x2, cEQ); break; // Equal
+case 1: CSETMx(x2, cCC); break; // Less than
+case 2: CSETMx(x2, cLS); break; // Less or equal
+case 3: CSETMx(x2, cVS); break; // NaN
+case 4: CSETMx(x2, cNE); break; // Not Equal or unordered
+case 5: CSETMx(x2, cCS); break; // Greater or equal or unordered
+case 6: CSETMx(x2, cHI); break; // Greater or unordered, test inverted, N!=V so unordered or less than (inverted)
+case 7: CSETMx(x2, cVC); break; // not NaN
+}
+VMOVQDfrom(v0, 0, x2);
+YMM0(gd);
+break;
+
default:
DEFAULT;
}
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
index 47c6391d..11aee1d2 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
@@ -182,6 +182,17 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
}
break;
+case 0x51:
+INST_NAME("SQRTSS Gx, Ex");
+nextop = F8;
+GETEXSS(d0, 0, 0);
+GETGX_empty_VX(v0, v2);
+d1 = fpu_get_scratch(dyn, ninst);
+FSQRTS(d1, d0);
+if(v0!=v2) VMOVQ(v0, v2);
+VMOVeS(v0, 0, d1, 0);
+YMM0(gd);
+break;
case 0x52:
INST_NAME("VRSQRTSS Gx, Vx Ex");
nextop = F8;
@@ -305,7 +316,7 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
GETEXSS(v1, 0, 0);
GETGX_empty_VX(v0, v2);
FCMPS(v2, v1);
-FCSELS(d1, v2, v1, cLS);
+FCSELS(d1, v1, v2, cCS);
if(v0!=v2) {
VMOVQ(v0, v2);
}
@@ -331,8 +342,8 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
d1 = fpu_get_scratch(dyn, ninst);
GETEXSS(v1, 0, 0);
GETGX_empty_VX(v0, v2);
-FCMPS(v2, v1);
-FCSELS(d1, v2, v1, cGE);
+FCMPS(v1, v2);
+FCSELS(d1, v1, v2, cCS);
if(v0!=v2) {
VMOVQ(v0, v2);
}
@@ -403,6 +414,39 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
}
break;
+case 0xC2:
+INST_NAME("VCMPSS Gx, Vx, Ex, Ib");
+nextop = F8;
+GETEXSS(v1, 0, 1);
+GETGX_empty_VX(v0, v2);
+u8 = F8;
+if(((u8&15)==12)||((u8&15)==13)||((u8&15)==9)||((u8&15)==10))
+FCMPS(v1, v2);
+else
+FCMPS(v2, v1);
+if(v0!=v2) VMOVQ(v0, v2);
+switch(u8&7) {
+case 0x00: CSETMw(x2, cEQ); break; // Equal
+case 0x01: CSETMw(x2, cCC); break; // Less than
+case 0x02: CSETMw(x2, cLS); break; // Less or equal
+case 0x03: CSETMw(x2, cVS); break; // NaN
+case 0x04: CSETMw(x2, cNE); break; // Not Equal or unordered
+case 0x05: CSETMw(x2, cCS); break; // Greater or equal or unordered
+case 0x06: CSETMw(x2, cHI); break; // Greater or unordered
+case 0x07: CSETMw(x2, cVC); break; // not NaN
+case 0x08: CSETMw(x2, cEQ); CSETMw(x3, cVS); ORRw_REG(x2, x2, x3); break; // Equal than or ordered
+case 0x09: CSETMw(x2, cCS); break; // Less than or ordered
+case 0x0a: CSETMw(x2, cHI); break; // Less or equal or ordered
+case 0x0b: MOV32w(x2, 0); break; // false
+case 0x0c: CSETMw(x2, cNE); CSETMw(x3, cVC); ANDw_REG(x2, x2, x3); break; // Not Equal not unordered
+case 0x0d: CSETMw(x2, cCC); break; // Greater or equal not unordered
+case 0x0e: CSETMw(x2, cLS); break; // Greater not unordered
+case 0x0f: MOV32w(x2, 0xffffffff); break; // true
+}
+VMOVQSfrom(v0, 0, x2);
+YMM0(gd);
+break;
+
default:
DEFAULT;
}
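For readers skimming the recurring `!box64_dynarec_fastnan` sequence in this commit (VFMAXQD / VFCMEQQD / VBICQ / VSHLQ_64 / VORRQ in VADDPD and VMULPD, and the scalar FMAXD / FCMEQD variant in VDIVSD), here is a minimal scalar sketch of the behaviour it emulates. It assumes the usual rule that x86 SSE/AVX produces a negative default QNaN when an invalid operation is performed on non-NaN inputs, while AArch64 produces a positive default NaN; the helper name below is illustrative and not part of the box64 sources.

```c
#include <math.h>
#include <stdint.h>
#include <string.h>

/* Scalar model of the sign fix-up: if the result is a NaN that was freshly
 * generated (no input was NaN), force the sign bit so the output matches the
 * x86 default QNaN ("NAN -> -NAN" in the diff comments). */
static double x86_default_nan_fixup(double a, double b, double res)
{
    if (isnan(res) && !isnan(a) && !isnan(b)) {
        uint64_t bits;
        memcpy(&bits, &res, sizeof(bits));
        bits |= UINT64_C(1) << 63;   /* mirrors VSHLQ_64(q1, q1, 63) + VORRQ */
        memcpy(&res, &bits, sizeof(bits));
    }
    return res;
}
```

When fastnan is enabled the dynarec skips this fix-up and keeps the AArch64 result as-is, which is why the extra scratch registers are only allocated when `box64_dynarec_fastnan` is off.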