| author | ptitSeb <sebastien.chev@gmail.com> | 2025-01-22 20:37:17 +0100 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2025-01-22 20:37:17 +0100 |
| commit | f4a9f8dd79456cbb16d2048c36b48074c831c0bf (patch) | |
| tree | 58fa4036ab4372d8b50185bca6e011df0a62926c /src | |
| parent | f235c7f702a4a5873e3b7ef04e3fdf17627e6ca9 (diff) | |
| download | box64-f4a9f8dd79456cbb16d2048c36b48074c831c0bf.tar.gz, box64-f4a9f8dd79456cbb16d2048c36b48074c831c0bf.zip | |
[ARM64_DYNAREC] Added a few AVX opcodes and improved/fixed some existing SSE and AVX ones
Diffstat (limited to 'src')
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_0f.c | 68 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_660f.c | 20 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_0f.c | 124 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f.c | 16 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c | 137 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c | 60 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c | 112 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c | 84 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_f20f.c | 2 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_f30f.c | 69 |
10 files changed, 557 insertions, 135 deletions
```diff
diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c
index 98f84863..71c844d2 100644
--- a/src/dynarec/arm64/dynarec_arm64_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_0f.c
@@ -1136,15 +1136,41 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("ADDPS Gx, Ex");
             nextop = F8;
             GETEX(q0, 0, 0);
-            GETGX(v0, 1);
-            VFADDQS(v0, v0, q0);
+            GETGX(q1, 1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                v0 = fpu_get_scratch(dyn, ninst);
+                v1 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                VFMAXQS(v0, q0, q1);    // propagate NAN
+                VFCMEQQS(v0, v0, v0);   // 0 if NAN, 1 if not NAN
+            }
+            VFADDQS(q1, q1, q0);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                VFCMEQQS(v1, q1, q1);   // 0 => out is NAN
+                VBICQ(v1, v0, v1);      // forget it in any input was a NAN already
+                VSHLQ_32(v1, v1, 31);   // only keep the sign bit
+                VORRQ(q1, q1, v1);      // NAN -> -NAN
+            }
             break;
         case 0x59:
             INST_NAME("MULPS Gx, Ex");
             nextop = F8;
             GETEX(q0, 0, 0);
-            GETGX(v0, 1);
-            VFMULQS(v0, v0, q0);
+            GETGX(q1, 1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                v0 = fpu_get_scratch(dyn, ninst);
+                v1 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                VFMAXQS(v0, q0, q1);    // propagate NAN
+                VFCMEQQS(v0, v0, v0);   // 0 if NAN, 1 if not NAN
+            }
+            VFMULQS(q1, q1, q0);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                VFCMEQQS(v1, q1, q1);   // 0 => out is NAN
+                VBICQ(v1, v0, v1);      // forget it in any input was a NAN already
+                VSHLQ_32(v1, v1, 31);   // only keep the sign bit
+                VORRQ(q1, q1, v1);      // NAN -> -NAN
+            }
             break;
         case 0x5A:
             INST_NAME("CVTPS2PD Gx, Ex");
@@ -1164,8 +1190,21 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("SUBPS Gx, Ex");
             nextop = F8;
             GETEX(q0, 0, 0);
-            GETGX(v0, 1);
-            VFSUBQS(v0, v0, q0);
+            GETGX(q1, 1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                v0 = fpu_get_scratch(dyn, ninst);
+                v1 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                VFMAXQS(v0, q0, q1);    // propagate NAN
+                VFCMEQQS(v0, v0, v0);   // 0 if NAN, 1 if not NAN
+            }
+            VFSUBQS(q1, q1, q0);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                VFCMEQQS(v1, q1, q1);   // 0 => out is NAN
+                VBICQ(v1, v0, v1);      // forget it in any input was a NAN already
+                VSHLQ_32(v1, v1, 31);   // only keep the sign bit
+                VORRQ(q1, q1, v1);      // NAN -> -NAN
+            }
             break;
         case 0x5D:
             INST_NAME("MINPS Gx, Ex");
@@ -1185,8 +1224,21 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("DIVPS Gx, Ex");
             nextop = F8;
             GETEX(q0, 0, 0);
-            GETGX(v0, 1);
-            VFDIVQS(v0, v0, q0);
+            GETGX(q1, 1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                v0 = fpu_get_scratch(dyn, ninst);
+                v1 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                VFMAXQS(v0, q0, q1);    // propagate NAN
+                VFCMEQQS(v0, v0, v0);   // 0 if NAN, 1 if not NAN
+            }
+            VFDIVQS(q1, q1, q0);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                VFCMEQQS(v1, q1, q1);   // 0 => out is NAN
+                VBICQ(v1, v0, v1);      // forget it in any input was a NAN already
+                VSHLQ_32(v1, v1, 31);   // only keep the sign bit
+                VORRQ(q1, q1, v1);      // NAN -> -NAN
+            }
             break;
         case 0x5F:
             INST_NAME("MAXPS Gx, Ex");
```
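All four packed handlers above gain the same `dynarec_fastnan` fix-up, so it is worth spelling out once. When an SSE operation *generates* a NaN from non-NaN inputs (e.g. `inf - inf`), x86 produces the "indefinite" QNaN with the sign bit set (`0xFFC00000`), while the ARM64 default NaN is positive (`0x7FC00000`). The `VFMAXQS`/`VFCMEQQS` pair records which lanes already held a NaN before the operation, and the `VBICQ`/`VSHLQ_32`/`VORRQ` tail sets the sign bit only on lanes whose NaN is new. A rough per-lane C model (the helper name is ours, not box64's):

```c
#include <math.h>
#include <stdint.h>
#include <string.h>

// Hypothetical per-lane model of the fastnan=0 fix-up: force the sign bit of
// a NaN that was generated by the operation itself, so the ARM64 default NaN
// (0x7FC00000) matches the x86 indefinite QNaN (0xFFC00000). NaNs that came
// in through an operand are left alone, as on real hardware.
static float fixup_generated_nan(float a, float b, float res)
{
    if (!isnan(a) && !isnan(b) && isnan(res)) { // VFMAXQS/VFCMEQQS + VBICQ
        uint32_t u;
        memcpy(&u, &res, sizeof(u));
        u |= 0x80000000u;                       // VSHLQ_32(,31) + VORRQ
        memcpy(&res, &u, sizeof(u));
    }
    return res;                                 // NAN -> -NAN
}
```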
```diff
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index 54f98516..893ac0ce 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -475,7 +475,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             break;
         case 0x14:
-            INST_NAME("PBLENDVPS Gx,Ex");
+            INST_NAME("BLENDVPS Gx,Ex");
             nextop = F8;
             GETGX(q0, 1);
             GETEX(q1, 0, 0);
@@ -483,13 +483,11 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             v1 = fpu_get_scratch(dyn, ninst);
             if(q0!=q1) {
                 VSSHRQ_32(v1, v0, 31);  // bit[31]-> bit[31..0]
-                VBICQ(q0, q0, v1);
-                VANDQ(v1, q1, v1);
-                VORRQ(q0, q0, v1);
+                VBITQ(q0, q1, v1);
             }
             break;
         case 0x15:
-            INST_NAME("PBLENDVPD Gx,Ex");
+            INST_NAME("BLENDVPD Gx,Ex");
             nextop = F8;
             GETGX(q0, 1);
             GETEX(q1, 0, 0);
@@ -497,9 +495,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             v1 = fpu_get_scratch(dyn, ninst);
             if(q0!=q1) {
                 VSSHRQ_64(v1, v0, 63);  // bit[63]-> bit[63..0]
-                VBICQ(q0, q0, v1);
-                VANDQ(v1, q1, v1);
-                VORRQ(q0, q0, v1);
+                VBITQ(q0, q1, v1);
             }
             break;
@@ -1028,9 +1024,13 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             u8 = F8&0b1111;
             if(u8==0b0011) {
                 VMOVeD(q0, 0, q1, 0);
-            } else if(u8==0b1100) {
+                u8&=~0b0011;
+            }
+            if(u8==0b1100) {
                 VMOVeD(q0, 1, q1, 1);
-            } else for(int i=0; i<4; ++i)
+                u8&=~0b1100;
+            }
+            for(int i=0; i<4; ++i)
                 if(u8&(1<<i)) {
                     VMOVeS(q0, i, q1, i);
                 }
```
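The BLENDV change above is a pure strength reduction: the old three-instruction select (`VBICQ`/`VANDQ`/`VORRQ`) is exactly NEON's `BIT` (bitwise insert if true), so a single `VBITQ` now does the job once `VSSHRQ` has smeared each lane's sign bit across the whole lane. In C terms, both forms compute:

```c
#include <stdint.h>

// Bit-select as performed by NEON BIT: for every bit, take src where the
// mask is 1 and keep dst where it is 0. The retired VBICQ/VANDQ/VORRQ
// triple computed the same expression in three steps.
static uint64_t bit_select(uint64_t dst, uint64_t src, uint64_t mask)
{
    return (dst & ~mask) | (src & mask);
}
```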
```diff
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
index 8c333f7d..5ce00466 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -440,22 +440,50 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x58:
             INST_NAME("VADDPS Gx, Vx, Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
-            VFADDQS(v0, v2, v1);
-            if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                q1 = fpu_get_scratch(dyn, ninst);
+            }
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    // check if any input value was NAN
+                    VFMAXQS(q0, v2, v1);    // propagate NAN
+                    VFCMEQQS(q0, q0, q0);   // 0 if NAN, 1 if not NAN
+                }
                 VFADDQS(v0, v2, v1);
-            } else YMM0(gd)
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    VFCMEQQS(q1, v0, v0);   // 0 => out is NAN
+                    VBICQ(q1, q0, q1);      // forget it in any input was a NAN already
+                    VSHLQ_32(q1, q1, 31);   // only keep the sign bit
+                    VORRQ(v0, v0, q1);      // NAN -> -NAN
+                }
+            }
+            if(!vex.l) YMM0(gd)
             break;
         case 0x59:
             INST_NAME("VMULPS Gx, Vx, Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
-            VFMULQS(v0, v2, v1);
-            if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                q1 = fpu_get_scratch(dyn, ninst);
+            }
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    // check if any input value was NAN
+                    VFMAXQS(q0, v2, v1);    // propagate NAN
+                    VFCMEQQS(q0, q0, q0);   // 0 if NAN, 1 if not NAN
+                }
                 VFMULQS(v0, v2, v1);
-            } else YMM0(gd)
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    VFCMEQQS(q1, v0, v0);   // 0 => out is NAN
+                    VBICQ(q1, q0, q1);      // forget it in any input was a NAN already
+                    VSHLQ_32(q1, q1, 31);   // only keep the sign bit
+                    VORRQ(v0, v0, q1);      // NAN -> -NAN
+                }
+            }
+            if(!vex.l) YMM0(gd)
             break;
         case 0x5A:
             INST_NAME("VCVTPS2PD Gx, Ex");
@@ -480,12 +508,26 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5C:
             INST_NAME("VSUBPS Gx, Vx, Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
-            VFSUBQS(v0, v2, v1);
-            if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                q1 = fpu_get_scratch(dyn, ninst);
+            }
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    // check if any input value was NAN
+                    VFMAXQS(q0, v2, v1);    // propagate NAN
+                    VFCMEQQS(q0, q0, q0);   // 0 if NAN, 1 if not NAN
+                }
                 VFSUBQS(v0, v2, v1);
-            } else YMM0(gd)
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    VFCMEQQS(q1, v0, v0);   // 0 => out is NAN
+                    VBICQ(q1, q0, q1);      // forget it in any input was a NAN already
+                    VSHLQ_32(q1, q1, 31);   // only keep the sign bit
+                    VORRQ(v0, v0, q1);      // NAN -> -NAN
+                }
+            }
+            if(!vex.l) YMM0(gd)
             break;
         case 0x5D:
             INST_NAME("VMINPS Gx, Vx, Ex");
@@ -508,12 +550,26 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5E:
             INST_NAME("VDIVPS Gx, Vx, Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
-            VFDIVQS(v0, v2, v1);
-            if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                q1 = fpu_get_scratch(dyn, ninst);
+            }
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    // check if any input value was NAN
+                    VFMAXQS(q0, v2, v1);    // propagate NAN
+                    VFCMEQQS(q0, q0, q0);   // 0 if NAN, 1 if not NAN
+                }
                 VFDIVQS(v0, v2, v1);
-            } else YMM0(gd)
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    VFCMEQQS(q1, v0, v0);   // 0 => out is NAN
+                    VBICQ(q1, q0, q1);      // forget it in any input was a NAN already
+                    VSHLQ_32(q1, q1, 31);   // only keep the sign bit
+                    VORRQ(v0, v0, q1);      // NAN -> -NAN
+                }
+            }
+            if(!vex.l) YMM0(gd)
             break;
         case 0x5F:
             INST_NAME("VMAXPS Gx, Vx, Ex");
@@ -634,15 +690,21 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
             u8 = F8;
             if(v2==v1 && (u8&0x3)==((u8>>2)&3) && (u8&0xf)==((u8>>4)&0xf)) {
                 VDUPQ_32(v0, v2, u8&3);
-            } else if(v2==v1 && (u8==0xe0)) {   // easy special case
+            } else if(v2==v1 && (u8==0xe0)) {   // elements 3 2 0 0
                 VMOVQ(v0, v2);
                 VMOVeS(v0, 1, v0, 0);
-            } else if(v2==v1 && (u8==0xe5)) {   // easy special case
+            } else if(v2==v1 && (u8==0xe5)) {   // elements 3 2 1 1
                 VMOVQ(v0, v2);
                 VMOVeS(v0, 0, v0, 1);
-            } else if(MODREG && u8==0x88) {
+            } else if(v2==v1 && (u8==0xa0)) {   // elements 2 2 0 0
+                VTRNQ1_32(v0, v1, v2);
+            } else if(v2==v1 && (u8==0xf5)) {   // elements 3 3 1 1
+                VTRNQ2_32(v0, v1, v2);
+            } else if(v2==v1 && (u8==0xb1)) {   // elements 2 3 0 1
+                VREV64Q_32(v0, v1);
+            } else if(MODREG && u8==0x88) {     // elements 2 0 2 0
                 VUZP1Q_32(v0, v2, v1);
-            } else if(MODREG && u8==0xdd) {
+            } else if(MODREG && u8==0xdd) {     // elements 3 1 3 1
                 VUZP2Q_32(v0, v2, v1);
             } else {
                 if((v0==v1) || (v0==v2)) {
@@ -674,15 +736,21 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGY_empty_VY(v0, v2, 0, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
             if(v2==v1 && (u8&0x3)==((u8>>2)&3) && (u8&0xf)==((u8>>4)&0xf)) {
                 VDUPQ_32(v0, v2, u8&3);
-            } else if(v2==v1 && (u8==0xe0)) {
+            } else if(v2==v1 && (u8==0xe0)) {   // elements 3 2 0 0
                 VMOVQ(v0, v2);
                 VMOVeS(v0, 1, v0, 0);
-            } else if(v2==v1 && (u8==0xe5)) {
+            } else if(v2==v1 && (u8==0xe5)) {   // elements 3 2 1 1
                 VMOVQ(v0, v2);
                 VMOVeS(v0, 0, v0, 1);
-            } else if(MODREG && u8==0x88) {
+            } else if(v2==v1 && (u8==0xa0)) {   // elements 2 2 0 0
+                VTRNQ1_32(v0, v1, v2);
+            } else if(v2==v1 && (u8==0xf5)) {   // elements 3 3 1 1
+                VTRNQ2_32(v0, v1, v2);
+            } else if(v2==v1 && (u8==0xb1)) {   // elements 2 3 0 1
+                VREV64Q_32(v0, v1);
+            } else if(MODREG && u8==0x88) {     // elements 2 0 2 0
                 VUZP1Q_32(v0, v2, v1);
-            } else if(MODREG && u8==0xdd) {
+            } else if(MODREG && u8==0xdd) {     // elements 3 1 3 1
                 VUZP2Q_32(v0, v2, v1);
             } else {
                 if(s0) d0 = v0;
```
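The new `VSHUFPS` fast paths decode a few more immediate values into single NEON permutes when both sources are the same register: `0xA0` (selectors 2 2 0 0, reading lane 3 down to lane 0, as in the comments) is `TRN1`, `0xF5` (3 3 1 1) is `TRN2`, and `0xB1` (2 3 0 1) is `REV64` on 32-bit lanes. For reference, a C model of the generic SHUFPS selection these immediates are special cases of:

```c
#include <stdint.h>

// Generic SHUFPS: two selector bits per destination lane; lanes 0-1 pick
// from the first source, lanes 2-3 from the second. With s1 == s2,
// imm 0xA0 -> {0,0,2,2} (TRN1), 0xF5 -> {1,1,3,3} (TRN2) and
// 0xB1 -> {1,0,3,2} (REV64 on 32-bit lanes).
static void shufps(float dst[4], const float s1[4], const float s2[4], uint8_t imm)
{
    const float r[4] = {
        s1[(imm >> 0) & 3], s1[(imm >> 2) & 3],
        s2[(imm >> 4) & 3], s2[(imm >> 6) & 3],
    };
    for (int i = 0; i < 4; ++i) dst[i] = r[i];
}
```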
```diff
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
index 570098e4..81169c55 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
@@ -1877,7 +1877,21 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             if(!vex.l) YMM0(gd);
             break;
-
+        case 0xF7:
+            INST_NAME("VMASKMOVDQU Gx, Ex");
+            nextop = F8;
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
+            v0 = fpu_get_scratch(dyn, ninst);
+            VLDR128_U12(v0, xRDI, 0);
+            if(MODREG)
+                v1 = fpu_get_scratch(dyn, ninst);   // need to preserve the register
+            else
+                v1 = q1;
+            VSSHRQ_8(v1, q1, 7);    // get the mask
+            VBITQ(v0, q0, v1);
+            VSTR128_U12(v0, xRDI, 0);   // put back
+            break;
         case 0xF8:
             INST_NAME("VPSUBB Gx, Vx, Ex");
             nextop = F8;
```
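`MASKMOVDQU` stores only the bytes of Gx whose corresponding mask byte in Ex has its top bit set, to the buffer at RDI. The handler above emulates that with a load/`VBIT`/store round-trip, which is why `[RDI]` is read first; a side effect worth noting is that the unselected bytes are rewritten with their old value rather than skipped, unlike on real hardware. Semantically:

```c
#include <stdint.h>

// Model of (V)MASKMOVDQU: byte i of src goes to dst[i] only when the top
// bit of mask[i] is set. The dynarec realizes this as a 16-byte
// load, bit-select (VBITQ) on the sign-extended mask, then store back.
static void maskmovdqu(uint8_t *dst, const uint8_t src[16], const uint8_t mask[16])
{
    for (int i = 0; i < 16; ++i)
        if (mask[i] & 0x80)
            dst[i] = src[i];
}
```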
```diff
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index 0647dca1..dcc455e1 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -26,6 +26,8 @@
 
 static const float addsubps[4] = {-1.f, 1.f, -1.f, 1.f};
 static const double addsubpd[2] = {-1., 1.};
+static const float subaddps[4] = {1.f, -1.f, 1.f, -1.f};
+static const double subaddpd[2] = {1., -1.};
 
 uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog)
 {
@@ -204,6 +206,109 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             if(!vex.l) YMM0(gd);
             break;
+        case 0x0E:
+            INST_NAME("VTESTPS GX, EX");
+            SETFLAGS(X_ALL, SF_SET);
+            nextop = F8;
+            GETGX(v0, 0);
+            GETEX(v1, 0, 0);
+            v2 = fpu_get_scratch(dyn, ninst);
+            if(vex.l) {
+                if(!MODREG)
+                    q1 = fpu_get_scratch(dyn, ninst);
+                q2 = fpu_get_scratch(dyn, ninst);
+                GETGY(q0, 0, MODREG?((nextop&7)+(rex.b<<3)):-1, -1, -1);
+                GETEY(q1);
+            }
+            IFX(X_CF) {
+                VBICQ(v2, v1, v0);
+                VSHRQ_32(v2, v2, 31);
+                if(vex.l) {
+                    VBICQ(q2, q1, q0);
+                    VSHRQ_32(q2, q2, 31);
+                    VORRQ(v2, v2, q2);
+                }
+                CMEQQ_0_64(v2, v2);
+                UQXTN_32(v2, v2);
+                VMOVQDto(x2, v2, 0);
+                ADDSx_U12(xZR, x2, 1);
+                CSETw(x2, cEQ);
+                BFIw(xFlags, x2, F_CF, 1);
+            }
+            IFX(X_ZF) {
+                VANDQ(v2, v0, v1);
+                VSHRQ_32(v2, v2, 31);
+                if(vex.l) {
+                    VANDQ(q2, q0, q1);
+                    VSHRQ_32(q2, q2, 31);
+                    VORRQ(v2, v2, q2);
+                }
+                CMEQQ_0_64(v2, v2);
+                UQXTN_32(v2, v2);
+                VMOVQDto(x2, v2, 0);
+                ADDSx_U12(xZR, x2, 1);
+                IFNATIVE(NF_EQ) {} else {
+                    CSETw(x2, cEQ);
+                    BFIw(xFlags, x2, F_ZF, 1);
+                }
+            }
+            IFX(X_AF|X_SF|X_OF|X_PF) {
+                MOV32w(x2, (1<<F_AF) | (1<<F_OF) | (1<<F_SF) | (1<<F_PF));
+                BICw(xFlags, xFlags, x2);
+            }
+            break;
+        case 0x0F:
+            INST_NAME("VTESTPD GX, EX");
+            SETFLAGS(X_ALL, SF_SET);
+            nextop = F8;
+            GETGX(v0, 0);
+            GETEX(v1, 0, 0);
+            v2 = fpu_get_scratch(dyn, ninst);
+            if(vex.l) {
+                if(!MODREG)
+                    q1 = fpu_get_scratch(dyn, ninst);
+                q2 = fpu_get_scratch(dyn, ninst);
+                GETGY(q0, 0, MODREG?((nextop&7)+(rex.b<<3)):-1, -1, -1);
+                GETEY(q1);
+            }
+            IFX(X_CF) {
+                VBICQ(v2, v1, v0);
+                VSHRQ_64(v2, v2, 63);
+                if(vex.l) {
+                    VBICQ(q2, q1, q0);
+                    VSHRQ_64(q2, q2, 63);
+                    VORRQ(v2, v2, q2);
+                }
+                CMEQQ_0_64(v2, v2);
+                UQXTN_32(v2, v2);
+                VMOVQDto(x2, v2, 0);
+                ADDSx_U12(xZR, x2, 1);
+                CSETw(x2, cEQ);
+                BFIw(xFlags, x2, F_CF, 1);
+            }
+            IFX(X_ZF) {
+                VANDQ(v2, v0, v1);
+                VSHRQ_64(v2, v2, 63);
+                if(vex.l) {
+                    VANDQ(q2, q0, q1);
+                    VSHRQ_64(q2, q2, 63);
+                    VORRQ(v2, v2, q2);
+                }
+                CMEQQ_0_64(v2, v2);
+                UQXTN_32(v2, v2);
+                VMOVQDto(x2, v2, 0);
+                ADDSx_U12(xZR, x2, 1);
+                IFNATIVE(NF_EQ) {} else {
+                    CSETw(x2, cEQ);
+                    BFIw(xFlags, x2, F_ZF, 1);
+                }
+            }
+            IFX(X_AF|X_SF|X_OF|X_PF) {
+                MOV32w(x2, (1<<F_AF) | (1<<F_OF) | (1<<F_SF) | (1<<F_PF));
+                BICw(xFlags, xFlags, x2);
+            }
+            break;
+
         case 0x13:
             INST_NAME("VCVTPH2PS Gx, Ex");
             nextop = F8;
@@ -1015,6 +1120,10 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                 if(!l) {
                     GETGX_empty_VX(v0, v2);
                     addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);
+                    if(ed!=x3) {
+                        MOVx_REG(x3, ed);
+                        ed = x3;
+                    }
                     v1 = fpu_get_scratch(dyn, ninst);
                 } else {
                     GETGY_empty_VY(v0, v2, 0, -1, -1);
@@ -1028,12 +1137,12 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                     VMOVQDto(x4, q0, 0);
                     CBZx(x4, 4+1*4);
                     VLD1_64(v1, 0, ed);
-                    ADDx_U12(ed, ed, 4);
+                    ADDx_U12(ed, ed, 8);
                     VMOVQDto(x4, q0, 1);
                     CBZx(x4, 4+1*4);
                     VLD1_64(v1, 1, ed);
                     if(!l && vex.l)
-                        ADDx_U12(ed, ed, 4);
+                        ADDx_U12(ed, ed, 8);
                 } else {
                     VSSHRQ_32(q0, v2, 31);
                     VMOVSto(x4, q0, 0);
@@ -1503,7 +1612,29 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             }
             if(!vex.l) YMM0(gd);
             break;
-
+        case 0xB7:
+            INST_NAME("VFMSUBADD231PS/D Gx, Vx, Ex");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            TABLE64(x2, (rex.w)?((uintptr_t)&subaddpd):((uintptr_t)&subaddps));
+            VLDR128_U12(q0, x2, 0);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_VXEX(v0, v2, v1, 0); if(v0==v2 || v0==v1) q1 = fpu_get_scratch(dyn, ninst); } else { GETGY_VYEY(v0, v2, v1); }
+                if(v0!=v1 && v0!=v2) {
+                    q1 = v0;
+                }
+                if(rex.w) {
+                    VFMULQD(q1, v0, q0);
+                    VFMLAQD(q1, v1, v2);
+                } else {
+                    VFMULQS(q1, v0, q0);
+                    VFMLAQS(q1, v1, v2);
+                }
+                if(q1!=v0)
+                    VMOVQ(v0, q1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
         case 0xB8:
             INST_NAME("VFMADD231PS/D Gx, Vx, Ex");
             nextop = F8;
```
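The new `VTESTPS`/`VTESTPD` cases implement the sign-bit variant of PTEST: ZF is set when `Gx & Ex` has no sign bit set in any lane, CF when `Ex & ~Gx` has none, and AF/SF/OF/PF are cleared, which is what the final `BICw` does. (The same file also fixes what appears to be the masked-load path further down: the element pointer is first copied into `x3` so the guest register is not clobbered by the increments, and 64-bit elements now advance the pointer by 8 bytes instead of 4.) A C model of the flag computation:

```c
#include <stdint.h>

// Flag model for VTESTPS over n lanes (4 for xmm, 8 with vex.l):
// ZF <- no sign bit set in (gx & ex), CF <- no sign bit set in (ex & ~gx).
// The emitted code ORs the shifted sign bits together, then tests the
// combined vector for all-zero with CMEQ/UQXTN and a CMN against -1.
static void vtestps_flags(const uint32_t *gx, const uint32_t *ex, int n,
                          int *zf, int *cf)
{
    uint32_t z = 0, c = 0;
    for (int i = 0; i < n; ++i) {
        z |= (gx[i] & ex[i]) >> 31;
        c |= (ex[i] & ~gx[i]) >> 31;
    }
    *zf = !z;
    *cf = !c;
}
```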
```diff
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index dee4a568..a2111a4e 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -135,15 +135,21 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                 if(!l) {
                     GETGX_empty_EX(v0, v1, 1);
                     u8 = F8;
-                    if(v0==v1) {q1 = fpu_get_scratch(dyn, ninst); VMOVQ(q1, v1);}
                 } else {
                     GETGY_empty_EY(v0, v1);
-                    if(v0==v1) {VMOVQ(q1, v1);}
                 }
-                if(((u8>>(l*2))&1)==((u8>>(1+l*2))&1))
-                    VDUPQ_64(v0, (v0==v1)?q1:v1, ((u8>>(l*2))&1));
-                else for(int i=0; i<2; ++i)
-                    VMOVeD(v0, i, (v0==v1)?q1:v1, (u8>>(i+l*2))&1);
+                switch(((u8>>(l*2))&3)) {
+                    case 0b00:
+                    case 0b11:
+                        VDUPQ_64(v0, v1, ((u8>>(l*2))&1));
+                        break;
+                    case 0b10:
+                        if(v0!=v1) VMOVQ(v0, v1);
+                        break;
+                    case 0b01:
+                        VEXTQ_8(v0, v1, v1, 8); // invert 64bits values
+                        break;
+                }
             }
             if(!vex.l) YMM0(gd);
             break;
@@ -308,7 +314,7 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             } else YMM0(gd);
             break;
         case 0x0D:
-            INST_NAME("VPBLENDPD Gx, Vx, Ex, Ib");
+            INST_NAME("VBLENDPD Gx, Vx, Ex, Ib");
             nextop = F8;
             for(int l=0; l<1+vex.l; ++l) {
                 if(!l) { GETGX_empty_VXEX(q0, q2, q1, 1); u8 = F8; } else { GETGY_empty_VYEY(q0, q2, q1); }
@@ -592,28 +598,36 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             u8 = geted_ib(dyn, addr, ninst, nextop);
             q0 = fpu_get_scratch(dyn, ninst);
             // first mask
-            wb1 = 0;    // mask
-            for(int i=0; i<4; ++i)
-                if(u8&(1<<i))
-                    wb1 |= (3<<(i*2));
-            MOVI_64(q0, wb1);   // load 8bits value as a 8bytes mask
-            SXTL_16(q0, q0);    // expand 16bits to 32bits...
-            q1 = fpu_get_scratch(dyn, ninst);
-            // second mask
-            wb1 = 0;    // mask
-            for(int i=0; i<4; ++i)
-                if((u8>>4)&(1<<i))
-                    wb1 |= (3<<(i*2));
-            MOVI_64(q1, wb1);   // load 8bits value as a 8bytes mask
-            SXTL_16(q1, q1);    // expand 16bits to 32bits...
+            if((u8&0x0f)!=0x0f) {
+                wb1 = 0;    // mask
+                for(int i=0; i<4; ++i)
+                    if(u8&(1<<i))
+                        wb1 |= (3<<(i*2));
+                MOVI_64(q0, wb1);   // load 8bits value as a 8bytes mask
+                SXTL_16(q0, q0);    // expand 16bits to 32bits...
+            }
+            if((u8&0xf0)!=0xf0) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                // second mask
+                wb1 = 0;    // mask
+                for(int i=0; i<4; ++i)
+                    if((u8>>4)&(1<<i))
+                        wb1 |= (3<<(i*2));
+                MOVI_64(q1, wb1);   // load 8bits value as a 8bytes mask
+                SXTL_16(q1, q1);    // expand 16bits to 32bits...
+            }
             for(int l=0; l<1+vex.l; ++l) {
                 if(!l) { GETGX_empty_VXEX(v0, v2, v1, 1); u8 = F8; } else { GETGY_empty_VYEY(v0, v2, v1); }
                 VFMULQS(v0, v2, v1);
-                VANDQ(v0, v0, q1);  // second mask
+                if((u8&0xf0)!=0xf0) {
+                    VANDQ(v0, v0, q1);  // second mask
+                }
                 VFADDPQS(v0, v0, v0);
                 FADDPS(v0, v0);
                 VDUPQ_32(v0, v0, 0);
-                VANDQ(v0, v0, q0);  // first mask
+                if((u8&0x0f)!=0x0f) {
+                    VANDQ(v0, v0, q0);  // first mask
+                }
             }
             if(!vex.l) YMM0(gd);
             break;
```
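The `VDPPS` change is a small dead-code elimination: the imm8 high nibble selects which input lanes contribute to the dot product and the low nibble selects which destination lanes receive it, each applied with a `VANDQ` against a precomputed mask. When a nibble is `0xF` the mask is all-ones, so the `VANDQ` (and for the input side, the whole mask setup) can be skipped. The semantics being emulated:

```c
#include <stdint.h>

// DPPS model: sum the products of the lanes enabled by the high nibble of
// imm8, then broadcast that sum into the lanes enabled by the low nibble
// (other lanes become 0). A nibble of 0xF makes the corresponding mask a
// no-op, which is the case the new code skips.
static void dpps(float dst[4], const float a[4], const float b[4], uint8_t imm)
{
    float sum = 0.f;
    for (int i = 0; i < 4; ++i)
        if (imm & (0x10 << i))
            sum += a[i] * b[i];
    for (int i = 0; i < 4; ++i)
        dst[i] = (imm & (1 << i)) ? sum : 0.f;
}
```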
```diff
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
index 8ee698ab..35e30357 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
@@ -211,28 +211,53 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x58:
             INST_NAME("VADDSD Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSD(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            FADDD(d1, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXD(q1, v0, v1);  // propagate NAN
+                FCMEQD(q1, q1, q1); // 0 if NAN, 1 if not NAN
+                FADDD(q2, v1, v2);  // the high part of the vector is erased...
+                FCMEQD(q0, q2, q2); // 0 => out is NAN
+                VBIC(q0, q1, q0);   // forget it in any input was a NAN already
+                SHL_64(q0, q0, 63); // only keep the sign bit
+                VORR(q2, q2, q0);   // NAN -> -NAN
+            } else {
+                FADDD(q2, v1, v2);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
-            VMOVeD(v0, 0, d1, 0);
+            VMOVeD(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x59:
             INST_NAME("VMULSD Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSD(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            FMULD(d1, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXD(q1, v0, v1);  // propagate NAN
+                FCMEQD(q1, q1, q1); // 0 if NAN, 1 if not NAN
+                FMULD(q2, v1, v2);  // the high part of the vector is erased...
+                FCMEQD(q0, q2, q2); // 0 => out is NAN
+                VBIC(q0, q1, q0);   // forget it in any input was a NAN already
+                SHL_64(q0, q0, 63); // only keep the sign bit
+                VORR(q2, q2, q0);   // NAN -> -NAN
+            } else {
+                FMULD(q2, v1, v2);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
-            VMOVeD(v0, 0, d1, 0);
-            YMM0(gd)
+            VMOVeD(v0, 0, q2, 0);
             break;
         case 0x5A:
             INST_NAME("VCVTSD2SS Gx, Vx, Ex");
@@ -257,14 +282,27 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x5C:
             INST_NAME("VSUBSD Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
            GETEXSD(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            FSUBD(d1, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXD(q1, v0, v1);  // propagate NAN
+                FCMEQD(q1, q1, q1); // 0 if NAN, 1 if not NAN
+                FSUBD(q2, v2, v1);  // the high part of the vector is erased...
+                FCMEQD(q0, q2, q2); // 0 => out is NAN
+                VBIC(q0, q1, q0);   // forget it in any input was a NAN already
+                SHL_64(q0, q0, 63); // only keep the sign bit
+                VORR(q2, q2, q0);   // NAN -> -NAN
+            } else {
+                FSUBD(q2, v2, v1);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
-            VMOVeD(v0, 0, d1, 0);
+            VMOVeD(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x5D:
@@ -285,27 +323,27 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x5E:
             INST_NAME("VDIVSD Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSD(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
             if(!BOX64ENV(dynarec_fastnan)) {
-                q0 = fpu_get_scratch(dyn, ninst);
                 q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
                 // check if any input value was NAN
-                FMAXD(q0, v2, v1);  // propagate NAN
-                FCMEQD(q0, q0, q0); // 0 if NAN, 1 if not NAN
-            }
-            FDIVD(d1, v2, v1);
-            if(!BOX64ENV(dynarec_fastnan)) {
-                FCMEQD(q1, d1, d1); // 0 => out is NAN
-                VBIC(q1, q0, q1);   // forget it in any input was a NAN already
-                VSHLQ_64(q1, q1, 63);   // only keep the sign bit
-                VORR(d1, d1, q1);   // NAN -> -NAN
+                FMAXD(q1, v0, v1);  // propagate NAN
+                FCMEQD(q1, q1, q1); // 0 if NAN, 1 if not NAN
+                FDIVD(q2, v2, v1);  // the high part of the vector is erased...
+                FCMEQD(q0, q2, q2); // 0 => out is NAN
+                VBIC(q0, q1, q0);   // forget it in any input was a NAN already
+                SHL_64(q0, q0, 63); // only keep the sign bit
+                VORR(q2, q2, q0);   // NAN -> -NAN
+            } else {
+                FDIVD(q2, v2, v1);  // the high part of the vector is erased...
             }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
-            VMOVeD(v0, 0, d1, 0);
+            VMOVeD(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x5F:
@@ -366,6 +404,29 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             if(!vex.l) YMM0(gd);
             break;
+        case 0x7D:
+            INST_NAME("VHSUBPS Gx, Vx, Ex");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            if(MODREG || (v1==v2)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+            } else
+                q1 = v1;
+            if(vex.l)
+                q2 = fpu_get_scratch(dyn, ninst);
+            else
+                q2 = q0;
+            // q0 will contains -1 / 0 / -1 / 0
+            MOVIQ_64(q0, 0xf0);
+            VSHLQ_32(q0, q0, 31);   // keep sign bit
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                VEORQ(q1, v1, q0);
+                VEORQ(q2, v2, q0);
+                VFADDPQS(v0, q2, q1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
 
         case 0xC2:
             INST_NAME("CMPSD Gx, Ex, Ib");
@@ -395,13 +456,12 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             q0 = fpu_get_scratch(dyn, ninst);
             static float addsubps[4] = {-1.f, 1.f, -1.f, 1.f};
             MAYUSE(addsubps);
-            TABLE64(x2, (uintptr_t)&addsubps);
+            MOV64x(x2, (uintptr_t)&addsubps);
             VLDR128_U12(q0, x2, 0);
             for(int l=0; l<1+vex.l; ++l) {
-                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); if(v0==v1) q1 = fpu_get_scratch(dyn, ninst); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
                 if(v0==v1) {
-                    VFMULQS(q1, v1, q0);
-                    VFADDQS(v0, v2, q1);
+                    VFMLAQS(v0, v2, q0);
                 } else {
                     if(v0!=v2)
                         VMOVQ(v0, v2);
                     VFMLAQS(v0, v1, q0);
```
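The new `VHSUBPS` case leans on a NEON idiom: there is a pairwise add (`FADDP`) but no pairwise subtract, so the odd lanes of both sources are sign-flipped with an XOR mask first (`MOVIQ_64(q0, 0xf0)` expands to 32-bit lanes of 0 and -1; shifting left by 31 leaves just the sign bits), after which the pairwise add computes the differences. Since IEEE-754 defines `a - b` as `a + (-b)`, the trick is exact, NaNs included. What it computes:

```c
// HSUBPS model: horizontal differences of adjacent pairs, the low two
// results from the first source and the high two from the second. The
// dynarec obtains the subtraction by negating odd lanes, then doing a
// pairwise add (VFADDPQS).
static void hsubps(float dst[4], const float a[4], const float b[4])
{
    const float r[4] = { a[0] - a[1], a[2] - a[3], b[0] - b[1], b[2] - b[3] };
    for (int i = 0; i < 4; ++i) dst[i] = r[i];
}
```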
```diff
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
index eed9fb59..1dc4c55b 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
@@ -234,35 +234,53 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x58:
             INST_NAME("VADDSS Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSS(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXS(q1, v0, v1);  // propagate NAN
+                FCMEQS(q1, q1, q1); // 0 if NAN, 1 if not NAN
+                FADDS(q2, v1, v2);  // the high part of the vector is erased...
+                FCMEQS(q0, q2, q2); // 0 => out is NAN
+                VBIC(q0, q1, q0);   // forget it in any input was a NAN already
+                VSHL_32(q0, q0, 31);    // only keep the sign bit
+                VORR(q2, q2, q0);   // NAN -> -NAN
+            } else {
+                FADDS(q2, v1, v2);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
-                if(v0==v1) {
-                    VMOV(d1, v1);
-                    v1 = d1;
-                }
                 VMOVQ(v0, v2);
             }
-            FADDS(d1, v0, v1);
-            VMOVeS(v0, 0, d1, 0);
+            VMOVeS(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x59:
             INST_NAME("VMULSS Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSS(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXS(q1, v0, v1);  // propagate NAN
+                FCMEQS(q1, q1, q1); // 0 if NAN, 1 if not NAN
+                FMULS(q2, v1, v2);  // the high part of the vector is erased...
+                FCMEQS(q0, q2, q2); // 0 => out is NAN
+                VBIC(q0, q1, q0);   // forget it in any input was a NAN already
+                VSHL_32(q0, q0, 31);    // only keep the sign bit
+                VORR(q2, q2, q0);   // NAN -> -NAN
+            } else {
+                FMULS(q2, v1, v2);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
-                if(v0==v1) {
-                    VMOV(d1, v1);
-                    v1 = d1;
-                }
                 VMOVQ(v0, v2);
             }
-            FMULS(d1, v0, v1);
-            VMOVeS(v0, 0, d1, 0);
+            VMOVeS(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x5A:
@@ -314,14 +332,27 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x5C:
             INST_NAME("VSUBSS Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSS(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            FSUBS(d1, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXS(q1, v0, v1);  // propagate NAN
+                FCMEQS(q1, q1, q1); // 0 if NAN, 1 if not NAN
+                FSUBS(q2, v2, v1);  // the high part of the vector is erased...
+                FCMEQS(q0, q2, q2); // 0 => out is NAN
+                VBIC(q0, q1, q0);   // forget it in any input was a NAN already
+                VSHL_32(q0, q0, 31);    // only keep the sign bit
+                VORR(q2, q2, q0);   // NAN -> -NAN
+            } else {
+                FSUBS(q2, v2, v1);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
-            VMOVeS(v0, 0, d1, 0);
+            VMOVeS(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x5D:
@@ -341,14 +372,27 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x5E:
             INST_NAME("VDIVSS Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSS(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            FDIVS(d1, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXS(q1, v0, v1);  // propagate NAN
+                FCMEQS(q1, q1, q1); // 0 if NAN, 1 if not NAN
+                FDIVS(q2, v2, v1);  // the high part of the vector is erased...
+                FCMEQS(q0, q2, q2); // 0 => out is NAN
+                VBIC(q0, q1, q0);   // forget it in any input was a NAN already
+                VSHL_32(q0, q0, 31);    // only keep the sign bit
+                VORR(q2, q2, q0);   // NAN -> -NAN
+            } else {
+                FDIVS(q2, v2, v1);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
-            VMOVeS(v0, 0, d1, 0);
+            VMOVeS(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x5F:
```

```diff
diff --git a/src/dynarec/arm64/dynarec_arm64_f20f.c b/src/dynarec/arm64/dynarec_arm64_f20f.c
index 984ebc2e..d1ff597b 100644
--- a/src/dynarec/arm64/dynarec_arm64_f20f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f20f.c
@@ -497,7 +497,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             q0 = fpu_get_scratch(dyn, ninst);
             static float addsubps[4] = {-1.f, 1.f, -1.f, 1.f};
             MAYUSE(addsubps);
-            TABLE64(x2, (uintptr_t)&addsubps);
+            MOV64x(x2, (uintptr_t)&addsubps);   // no need to use table64, as box64 is loaded in low memory
             VLDR128_U12(q0, x2, 0);
             VFMLAQS(v0, v1, q0);
             break;
```
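The `TABLE64`-to-`MOV64x` swaps here and in the AVX ADDSUBPS case above trade a literal load next to the generated code for an immediate build of the pointer, on the stated grounds that box64 itself is mapped in low memory, so the address of a static table fits in a couple of 16-bit chunks. A sketch of that reasoning (the cost model is our assumption, not box64 code):

```c
#include <stdint.h>

// Hypothetical cost model: MOVZ/MOVK materialization needs one instruction
// per non-zero 16-bit chunk of the constant. A low-memory address (< 2^32)
// costs at most 2 instructions, competitive with a literal-table load.
static int movzk_cost(uint64_t addr)
{
    int n = 1;                          // the initial MOVZ
    for (int s = 16; s < 64; s += 16)
        if ((addr >> s) & 0xffff)
            n++;                        // one MOVK per extra chunk
    return n;
}
```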
```diff
diff --git a/src/dynarec/arm64/dynarec_arm64_f30f.c b/src/dynarec/arm64/dynarec_arm64_f30f.c
index 45a5c454..b136c59c 100644
--- a/src/dynarec/arm64/dynarec_arm64_f30f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f30f.c
@@ -246,11 +246,24 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
         case 0x58:
             INST_NAME("ADDSS Gx, Ex");
             nextop = F8;
-            GETGX(v0, 1);
-            d1 = fpu_get_scratch(dyn, ninst);
+            GETGX(d1, 1);
+            v1 = fpu_get_scratch(dyn, ninst);
             GETEXSS(d0, 0, 0);
-            FADDS(d1, v0, d0);  // the high part of the vector is erased...
-            VMOVeS(v0, 0, d1, 0);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                v0 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXS(v0, d0, d1);  // propagate NAN
+                FCMEQS(v0, v0, v0); // 0 if NAN, 1 if not NAN
+                FADDS(v1, d1, d0);  // the high part of the vector is erased...
+                FCMEQS(q0, v1, v1); // 0 => out is NAN
+                VBIC(q0, v0, q0);   // forget it in any input was a NAN already
+                VSHL_32(q0, q0, 31);    // only keep the sign bit
+                VORR(v1, v1, q0);   // NAN -> -NAN
+            } else {
+                FADDS(v1, d1, d0);  // the high part of the vector is erased...
+            }
+            VMOVeS(d1, 0, v1, 0);
             break;
         case 0x59:
             INST_NAME("MULSS Gx, Ex");
@@ -264,13 +277,13 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 // check if any input value was NAN
                 FMAXS(v0, d0, d1);  // propagate NAN
                 FCMEQS(v0, v0, v0); // 0 if NAN, 1 if not NAN
-                FMULS(v1, d1, d0);
+                FMULS(v1, d1, d0);  // the high part of the vector is erased...
                 FCMEQS(q0, v1, v1); // 0 => out is NAN
                 VBIC(q0, v0, q0);   // forget it in any input was a NAN already
-                VSHL_32(q0, q0, 31);   // only keep the sign bit
+                VSHL_32(q0, q0, 31);    // only keep the sign bit
                 VORR(v1, v1, q0);   // NAN -> -NAN
             } else {
-                FMULS(v1, d1, d0);
+                FMULS(v1, d1, d0);  // the high part of the vector is erased...
             }
             VMOVeS(d1, 0, v1, 0);
             break;
@@ -311,11 +324,24 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
         case 0x5C:
             INST_NAME("SUBSS Gx, Ex");
             nextop = F8;
-            GETGX(v0, 1);
-            d1 = fpu_get_scratch(dyn, ninst);
+            GETGX(d1, 1);
+            v1 = fpu_get_scratch(dyn, ninst);
             GETEXSS(d0, 0, 0);
-            FSUBS(d1, v0, d0);
-            VMOVeS(v0, 0, d1, 0);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                v0 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXS(v0, d0, d1);  // propagate NAN
+                FCMEQS(v0, v0, v0); // 0 if NAN, 1 if not NAN
+                FSUBS(v1, d1, d0);  // the high part of the vector is erased...
+                FCMEQS(q0, v1, v1); // 0 => out is NAN
+                VBIC(q0, v0, q0);   // forget it in any input was a NAN already
+                VSHL_32(q0, q0, 31);    // only keep the sign bit
+                VORR(v1, v1, q0);   // NAN -> -NAN
+            } else {
+                FSUBS(v1, d1, d0);  // the high part of the vector is erased...
+            }
+            VMOVeS(d1, 0, v1, 0);
             break;
         case 0x5D:
             INST_NAME("MINSS Gx, Ex");
@@ -336,11 +362,24 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
         case 0x5E:
             INST_NAME("DIVSS Gx, Ex");
             nextop = F8;
-            GETGX(v0, 1);
-            d1 = fpu_get_scratch(dyn, ninst);
+            GETGX(d1, 1);
+            v1 = fpu_get_scratch(dyn, ninst);
             GETEXSS(d0, 0, 0);
-            FDIVS(d1, v0, d0);
-            VMOVeS(v0, 0, d1, 0);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                v0 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXS(v0, d0, d1);  // propagate NAN
+                FCMEQS(v0, v0, v0); // 0 if NAN, 1 if not NAN
+                FDIVS(v1, d1, d0);  // the high part of the vector is erased...
+                FCMEQS(q0, v1, v1); // 0 => out is NAN
+                VBIC(q0, v0, q0);   // forget it in any input was a NAN already
+                VSHL_32(q0, q0, 31);    // only keep the sign bit
+                VORR(v1, v1, q0);   // NAN -> -NAN
+            } else {
+                FDIVS(v1, d1, d0);  // the high part of the vector is erased...
+            }
+            VMOVeS(d1, 0, v1, 0);
             break;
         case 0x5F:
            INST_NAME("MAXSS Gx, Ex");
```
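Finally, the scalar SSE handlers above follow the same fastnan scheme as the packed ones, with one extra wrinkle the comments keep pointing at: an ARM64 scalar op such as `FADDS` zeroes the rest of the destination vector, while x86 `ADDSS` must leave lanes 1-3 of Gx intact. Hence the result is computed into a scratch register and inserted back into lane 0 with `VMOVeS`:

```c
// ADDSS lane semantics: only lane 0 is computed; lanes 1..3 of the
// destination survive. This is why the dynarec cannot run FADDS directly
// on the Gx register (scalar ops erase the high part of the vector).
static void addss(float gx[4], const float ex[4])
{
    gx[0] = gx[0] + ex[0];  // lanes 1..3 untouched
}
```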