From 1e89ccb17e0c10161e60cc1979c8a656f73bb50f Mon Sep 17 00:00:00 2001
From: ptitSeb
Date: Fri, 4 Apr 2025 16:59:44 +0200
Subject: [ARM64_DYNAREC] A few changes to some SSE/AVX comparison and
 conversion opcodes, and more FRINTTS usage too

---
 src/dynarec/arm64/dynarec_arm64_0f.c        |  20 ++--
 src/dynarec/arm64/dynarec_arm64_660f.c      |  28 ++++---
 src/dynarec/arm64/dynarec_arm64_avx_0f.c    |   2 +-
 src/dynarec/arm64/dynarec_arm64_avx_66_0f.c |  65 ++++++++-------
 src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c | 119 +++++++++++++++++++---------
 src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c |  68 +++++++++++-----
 src/dynarec/arm64/dynarec_arm64_f20f.c      |  72 +++++++++++------
 src/dynarec/arm64/dynarec_arm64_f30f.c      |  67 +++++++++++-----
 8 files changed, 287 insertions(+), 154 deletions(-)

diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c
index b66f5bf3..2821c846 100644
--- a/src/dynarec/arm64/dynarec_arm64_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_0f.c
@@ -1189,7 +1189,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("CVTPS2PD Gx, Ex");
             nextop = F8;
             GETEX(q0, 0, 0);
-            GETGX(q1, 1);
+            GETGX_empty(q1);
             FCVTL(q1, q0);
             break;
         case 0x5B:
@@ -2433,22 +2433,24 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 case 0: VFCMEQQS(v0, v0, v1); break;   // Equal
                 case 1: VFCMGTQS(v0, v1, v0); break;   // Less than
                 case 2: VFCMGEQS(v0, v1, v0); break;   // Less or equal
-                case 3: VFCMEQQS(v0, v0, v0);
-                        if(v0!=v1) {
+                case 3: if(v0!=v1) {
                             q0 = fpu_get_scratch(dyn, ninst);
-                            VFCMEQQS(q0, v1, v1);
-                            VANDQ(v0, v0, q0);
+                            VFMAXQS(q0, v0, v1);    // propagate NAN
+                            VFCMEQQS(v0, q0, q0);
+                        } else {
+                            VFCMEQQS(v0, v0, v0);
                         }
                         VMVNQ(v0, v0);
                         break;  // NaN (NaN is not equal to himself)
                 case 4: VFCMEQQS(v0, v0, v1); VMVNQ(v0, v0); break;   // Not Equal (or unordered on ARM, not on X86...)
                 case 5: VFCMGTQS(v0, v1, v0); VMVNQ(v0, v0); break;   // Greater or equal or unordered
                 case 6: VFCMGEQS(v0, v1, v0); VMVNQ(v0, v0); break;   // Greater or unordered
-                case 7: VFCMEQQS(v0, v0, v0);
-                        if(v0!=v1) {
+                case 7: if(v0!=v1) {
                             q0 = fpu_get_scratch(dyn, ninst);
-                            VFCMEQQS(q0, v1, v1);
-                            VANDQ(v0, v0, q0);
+                            VFMAXQS(q0, v0, v1);    // propagate NAN
+                            VFCMEQQS(v0, q0, q0);
+                        } else {
+                            VFCMEQQS(v0, v0, v0);
                         }
                         break;  // not NaN
             }
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index be7acca3..de9eb515 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -1787,8 +1787,12 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 for(int i=0; i<4; ++i) {
                     BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
                     MSR_fpsr(x5);
-                    VMOVeS(d0, 0, v1, i);
-                    FRINTIS(d0, d0);
+                    if(i) {
+                        VMOVeS(d0, 0, v1, i);
+                        FRINTIS(d0, d0);
+                    } else {
+                        FRINTIS(d0, v1);
+                    }
                     VFCVTZSs(d0, d0);
                     MRS_fpsr(x5);   // get back FPSR to check the IOC bit
                     TBZ(x5, FPSR_IOC, 4+4);
@@ -2820,22 +2824,24 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 case 0: VFCMEQQD(v0, v0, v1); break;   // Equal
                 case 1: VFCMGTQD(v0, v1, v0); break;   // Less than
                 case 2: VFCMGEQD(v0, v1, v0); break;   // Less or equal
-                case 3: VFCMEQQD(v0, v0, v0);
-                        if(v0!=v1) {
+                case 3: if(v0!=v1) {
                             q0 = fpu_get_scratch(dyn, ninst);
-                            VFCMEQQD(q0, v1, v1);
-                            VANDQ(v0, v0, q0);
+                            VFMAXQD(q0, v0, v1);    // propagate NAN
+                            VFCMEQQD(v0, q0, q0);
+                        } else {
+                            VFCMEQQD(v0, v0, v0);
                         }
                         VMVNQ(v0, v0);
                         break;  // NaN (NaN is not equal to himself)
                 case 4: VFCMEQQD(v0, v0, v1); VMVNQ(v0, v0); break;   // Not Equal (or unordered on ARM, not on X86...)
                 case 5: VFCMGTQD(v0, v1, v0); VMVNQ(v0, v0); break;   // Greater or equal or unordered
                 case 6: VFCMGEQD(v0, v1, v0); VMVNQ(v0, v0); break;   // Greater or unordered
-                case 7: VFCMEQQD(v0, v0, v0);
-                        if(v0!=v1) {
+                case 7: if(v0!=v1) {
                             q0 = fpu_get_scratch(dyn, ninst);
-                            VFCMEQQD(q0, v1, v1);
-                            VANDQ(v0, v0, q0);
+                            VFMAXQD(q0, v0, v1);    // propagate NAN
+                            VFCMEQQD(v0, q0, q0);
+                        } else {
+                            VFCMEQQD(v0, v0, v0);
                         }
                         break;  // not NaN
             }
@@ -3146,8 +3152,6 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 SQXTN_32(v0, v0);   // convert int64 -> int32 with saturation in lower part, RaZ high part
             } else {
                 MRS_fpsr(x5);
-                BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                MSR_fpsr(x5);
                 ORRw_mask(x4, xZR, 1, 0);    //0x80000000
                 d0 = fpu_get_scratch(dyn, ninst);
                 for(int i=0; i<2; ++i) {
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
index e8347bb4..50231d0e 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -700,7 +700,7 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
             q0 = fpu_get_scratch(dyn, ninst);
             for(int l=0; l<1+vex.l; ++l) {
                 if(!l) { GETGX_empty_VXEX(v0, v2, v1, 1); u8 = F8; } else { GETGY_empty_VYEY(v0, v2, v1); }
-                if(((u8&15)==3) || ((u8&15)==7) || ((u8&15)==8) || ((u8&15)==9) || ((u8&15)==10) || ((u8&15)==12) || ((u8&15)==13) || ((u8&15)==14)) {
+                if(((u8&15)==3) || ((u8&15)==7) || ((u8&15)==8) || ((u8&15)==9) || ((u8&15)==10) || ((u8&15)==12)) {
                     VFMAXQS(q0, v2, v1);    // propagate NAN
                     VFCMEQQS(((u8&15)==7)?v0:q0, q0, q0);    // 0 if NAN, 1 if not NAN
                 }
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
index 3d187ebc..119e2bce 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
@@ -416,10 +416,8 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             if(vex.l) {
                 GETEY(v1);
-                if(BOX64ENV(dynarec_fastround)==2) {
-                    FCVTXN2(v0, v1);
-                } else {
-                    FCVTN2(v0, v1);
+                FCVTXN2(v0, v1);
+                if(BOX64ENV(dynarec_fastround)<2) {
                     x87_restoreround(dyn, ninst, u8);
                 }
             }
@@ -435,7 +433,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
                 MOVI_32_lsl(d1, 0x80, 3);
             }
             for(int l=0; l<1+vex.l; ++l) {
-                if(!l) { GETEX_Y(v1, 0, 0); GETGX_empty(v0); } else { GETGY_empty_EY(v0, v1); }
+                if(!l) { GETGX_empty_EX(v0, v1, 0); } else { GETGY_empty_EY(v0, v1); }
                 if(BOX64ENV(dynarec_fastround)) {
                     VFRINTISQ(v0, v1);
                     VFCVTZSQS(v0, v0);
@@ -448,8 +446,12 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
                     for(int i=0; i<4; ++i) {
                         BFCx(x5, FPSR_IOC, 1);   // reset IOC bits
                         MSR_fpsr(x5);
-                        VMOVeS(d0, 0, v1, i);
-                        FRINTIS(d0, d0);
+                        if(i) {
+                            VMOVeS(d0, 0, v1, i);
+                            FRINTIS(d0, d0);
+                        } else {
+                            FRINTIS(d0, v1);
+                        }
                         VFCVTZSs(d0, d0);
                         MRS_fpsr(x5);   // get back FPSR to check the IOC bit
                         TSTw_mask(x5, 0, 0);   // mask=(1<
@@ ... @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
                     else
                         SQXTN2_32(v0, d0);  // convert int64 -> int32 with saturation in higher part
                 } else {
-                    if(!l) {
-                        MRS_fpsr(x5);
-                        BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                        MSR_fpsr(x5);
-                        ORRw_mask(x4, xZR, 1, 0);    //0x80000000
-                        d0 = fpu_get_scratch(dyn, ninst);
-                    }
-                    for(int i=0; i<2; ++i) {
-                        BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                        MSR_fpsr(x5);
-                        if(i) {
-                            VMOVeD(d0, 0, v1, i);
-                            FCVTZSwD(x1, d0);
-                        } else {
-                            FCVTZSwD(x1, v1);
+                    if(arm64_frintts) {
+                        VFRINT32ZDQ(l?d0:v0, v1);   // handle overflow
+                        VFCVTZSQD(l?d0:v0, l?d0:v0);    // convert double -> int64
+                        if(!l)
+                            SQXTN_32(v0, v0);   // convert int64 -> int32 with saturation in lower part, RaZ high part
+                        else
+                            SQXTN2_32(v0, d0);  // convert int64 -> int32 with saturation in higher part
+                    } else {
+                        if(!l) {
+                            MRS_fpsr(x5);
+                            ORRw_mask(x4, xZR, 1, 0);    //0x80000000
+                            d0 = fpu_get_scratch(dyn, ninst);
+                        }
+                        for(int i=0; i<2; ++i) {
+                            BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
+                            MSR_fpsr(x5);
+                            if(i) {
+                                VMOVeD(d0, 0, v1, i);
+                                FCVTZSwD(x1, d0);
+                            } else {
+                                FCVTZSwD(x1, v1);
+                            }
+                            MRS_fpsr(x5);   // get back FPSR to check the IOC bit
+                            TSTw_mask(x5, 0, 0);    // mask = 1 = FPSR_IOC
+                            CSELx(x1, x1, x4, cEQ);
+                            VMOVQSfrom(v0, i+l*2, x1);
                         }
-                        MRS_fpsr(x5);   // get back FPSR to check the IOC bit
-                        TSTw_mask(x5, 0, 0);    // mask = 1 = FPSR_IOC
-                        CSELx(x1, x1, x4, cEQ);
-                        VMOVQSfrom(v0, i+l*2, x1);
+                        if(!vex.l && !l) VMOVQDfrom(v0, 1, xZR);
                     }
-                    if(!vex.l && !l) VMOVQDfrom(v0, 1, xZR);
                 }
             }
             YMM0(gd);
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
index a9429bf7..156d9243 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
@@ -123,11 +123,17 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             GETGX_empty_VX(v0, v1);
             GETED(0);
             d1 = fpu_get_scratch(dyn, ninst);
+            if(BOX64ENV(dynarec_fastround)<2) {
+                u8 = sse_setround(dyn, ninst, x3, x4, x5);
+            }
             if(rex.w) {
                 SCVTFDx(d1, ed);
             } else {
                 SCVTFDw(d1, ed);
             }
+            if(BOX64ENV(dynarec_fastround)<2) {
+                x87_restoreround(dyn, ninst, u8);
+            }
             if(v0!=v1) VMOVQ(v0, v1);
             VMOVeD(v0, 0, d1, 0);
             YMM0(gd);
@@ -138,13 +144,23 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             nextop = F8;
             GETGD;
             GETEXSD(q0, 0, 0);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);
                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
                 MSR_fpsr(x5);
            }
-            FCVTZSxwD(gd, q0);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && arm64_frintts) {
+                v0 = fpu_get_scratch(dyn, ninst);
+                if(rex.w) {
+                    FRINT64ZD(v0, q0);
+                } else {
+                    FRINT32ZD(v0, q0);
+                }
+                FCVTZSxwD(gd, v0);
+            } else {
+                FCVTZSxwD(gd, q0);
+            }
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);   // get back FPSR to check the IOC bit
                 TBZ_NEXT(x5, FPSR_IOC);
                 if(rex.w) {
@@ -159,17 +175,25 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             nextop = F8;
             GETGD;
             GETEXSD(q0, 0, 0);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);
                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
                 MSR_fpsr(x5);
             }
             u8 = sse_setround(dyn, ninst, x1, x2, x3);
             d1 = fpu_get_scratch(dyn, ninst);
-            FRINTID(d1, q0);
+            if(!BOX64ENV(dynarec_fastround) && arm64_frintts) {
+                if(rex.w) {
+                    FRINT64XD(d1, q0);
+                } else {
+                    FRINT32XD(d1, q0);
+                }
+            } else {
+                FRINTID(d1, q0);
+            }
             x87_restoreround(dyn, ninst, u8);
             FCVTZSxwD(gd, d1);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);   // get back FPSR to check the IOC bit
                 TBZ_NEXT(x5, FPSR_IOC);
                 if(rex.w) {
@@ -449,22 +473,36 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             break;

         case 0xC2:
-            INST_NAME("CMPSD Gx, Ex, Ib");
+            INST_NAME("VCMPSD Gx, Ex, Ib");
             nextop = F8;
             GETEXSD(v1, 0, 1);
             GETGX_empty_VX(v0, v2);
             u8 = F8;
-            FCMPD(v2, v1);
+            if(((u8&15)!=0x0b) && ((u8&15)!=0x0f)) {
+                if((u8&15)>7)
+                    FCMPD(v1, v2);
+                else
+                    FCMPD(v2, v1);
+            }
+            // TODO: create a test for this one, there might be an issue with cases 9, 10 and 13
             if(v0!=v2) VMOVQ(v0, v2);
-            switch(u8&7) {
-                case 0: CSETMx(x2, cEQ); break;  // Equal
-                case 1: CSETMx(x2, cCC); break;  // Less than
-                case 2: CSETMx(x2, cLS); break;  // Less or equal
-                case 3: CSETMx(x2, cVS); break;  // NaN
-                case 4: CSETMx(x2, cNE); break;  // Not Equal or unordered
-                case 5: CSETMx(x2, cCS); break;  // Greater or equal or unordered
-                case 6: CSETMx(x2, cHI); break;  // Greater or unordered, test inverted, N!=V so unordered or less than (inverted)
-                case 7: CSETMx(x2, cVC); break;  // not NaN
+            switch(u8&15) {
+                case 0x00: CSETMx(x2, cEQ); break;  // Equal
+                case 0x01: CSETMx(x2, cCC); break;  // Less than
+                case 0x02: CSETMx(x2, cLS); break;  // Less or equal
+                case 0x03: CSETMx(x2, cVS); break;  // NaN
+                case 0x04: CSETMx(x2, cNE); break;  // Not Equal or unordered
+                case 0x05: CSETMx(x2, cCS); break;  // Greater or equal or unordered
+                case 0x06: CSETMx(x2, cHI); break;  // Greater or unordered
+                case 0x07: CSETMx(x2, cVC); break;  // not NaN
+                case 0x08: CSETMx(x2, cEQ); CSETMx(x3, cVS); ORRx_REG(x2, x2, x3); break;   // Equal or unordered
+                case 0x09: CSETMx(x2, cHI); break;  // Less than or unordered
+                case 0x0a: CSETMx(x2, cCS); break;  // Less or equal or unordered
+                case 0x0b: MOV32w(x2, 0); break;    // false
+                case 0x0c: CSETMw(x2, cNE); CSETMx(x3, cVS); BICx(x2, x2, x3); break;   // Not Equal not unordered
+                case 0x0d: CSETMw(x2, cLS); break;  // Greater or equal not unordered
+                case 0x0e: CSETMw(x2, cCC); break;  // Greater not unordered
+                case 0x0f: MOV64x(x2, 0xffffffffffffffffLL); break;     // true
             }
             VMOVQDfrom(v0, 0, x2);
             YMM0(gd);
@@ -515,28 +553,35 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
                     else
                         SQXTN2_32(v0, d0);  // convert int64 -> int32 with saturation in higher part
                 } else {
-                    if(!l) {
-                        MRS_fpsr(x5);
-                        BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                        MSR_fpsr(x5);
-                        ORRw_mask(x4, xZR, 1, 0);    //0x80000000
-                    }
-                    for(int i=0; i<2; ++i) {
-                        BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                        MSR_fpsr(x5);
-                        if(i) {
-                            VMOVeD(d0, 0, v1, i);
-                            FRINTID(d0, d0);
-                        } else {
-                            FRINTID(d0, v1);
+                    if(arm64_frintts) {
+                        VFRINT32XDQ(l?d0:v0, v1);   // round, handling of overflow and Nan to 0x80000000
+                        VFCVTNSQD(l?d0:v0, l?d0:v0);    // convert double -> int64
+                        if(!l)
+                            SQXTN_32(v0, v0);   // convert int64 -> int32 with saturation in lower part, RaZ high part
+                        else
+                            SQXTN2_32(v0, d0);  // convert int64 -> int32 with saturation in higher part
+                    } else {
+                        if(!l) {
+                            MRS_fpsr(x5);
+                            ORRw_mask(x4, xZR, 1, 0);    //0x80000000
+                        }
+                        for(int i=0; i<2; ++i) {
+                            BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
+                            MSR_fpsr(x5);
+                            if(i) {
+                                VMOVeD(d0, 0, v1, i);
+                                FRINTID(d0, d0);
+                            } else {
+                                FRINTID(d0, v1);
+                            }
+                            FCVTZSwD(x1, d0);
+                            MRS_fpsr(x5);   // get back FPSR to check the IOC bit
+                            TSTw_mask(x5, 0, 0);    // mask = 1 = FPSR_IOC
+                            CSELx(x1, x1, x4, cEQ);
+                            VMOVQSfrom(v0, i+l*2, x1);
                         }
-                        FCVTZSwD(x1, d0);
-                        MRS_fpsr(x5);   // get back FPSR to check the IOC bit
-                        TSTw_mask(x5, 0, 0);    // mask = 1 = FPSR_IOC
-                        CSELx(x1, x1, x4, cEQ);
-                        VMOVQSfrom(v0, i+l*2, x1);
+                        if(!vex.l && !l) VMOVQDfrom(v0, 1, xZR);
                     }
-                    if(!vex.l && !l) VMOVQDfrom(v0, 1, xZR);
                 }
             }
             x87_restoreround(dyn, ninst, u8);
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
index 74c8f1fc..9e523b7c 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
@@ -123,11 +123,17 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             d1 = fpu_get_scratch(dyn, ninst);
             GETGX_empty_VX(v0, v1);
             GETED(0);
+            if(BOX64ENV(dynarec_fastround)<2) {
+                u8 = sse_setround(dyn, ninst, x3, x4, x5);
+            }
             if(rex.w) {
                 SCVTFSx(d1, ed);
             } else {
                 SCVTFSw(d1, ed);
             }
+            if(BOX64ENV(dynarec_fastround)<2) {
+                x87_restoreround(dyn, ninst, u8);
+            }
             if(v0!=v1) VMOVQ(v0, v1);
             VMOVeS(v0, 0, d1, 0);
             YMM0(gd);
@@ -138,13 +144,23 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             nextop = F8;
             GETGD;
             GETEXSS(d0, 0, 0);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);
                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
                 MSR_fpsr(x5);
             }
-            FCVTZSxwS(gd, d0);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && arm64_frintts) {
+                v0 = fpu_get_scratch(dyn, ninst);
+                if(rex.w) {
+                    FRINT64ZS(v0, q0);
+                } else {
+                    FRINT32ZS(v0, q0);
+                }
+                FCVTZSxwS(gd, v0);
+            } else {
+                FCVTZSxwS(gd, d0);
+            }
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);   // get back FPSR to check the IOC bit
                 TBZ_NEXT(x5, FPSR_IOC);
                 if(rex.w) {
@@ -159,17 +175,25 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             nextop = F8;
             GETGD;
             GETEXSS(q0, 0, 0);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);
                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
                 MSR_fpsr(x5);
             }
             u8 = sse_setround(dyn, ninst, x1, x2, x3);
             d1 = fpu_get_scratch(dyn, ninst);
-            FRINTIS(d1, q0);
+            if(!BOX64ENV(dynarec_fastround) && arm64_frintts) {
+                if(rex.w) {
+                    FRINT64XS(d1, q0);
+                } else {
+                    FRINT32XS(d1, q0);
+                }
+            } else {
+                FRINTIS(d1, q0);
+            }
             x87_restoreround(dyn, ninst, u8);
             FCVTZSxwS(gd, d1);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);   // get back FPSR to check the IOC bit
                 TBZ_NEXT(x5, FPSR_IOC);
                 if(rex.w) {
@@ -287,15 +311,10 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             d1 = fpu_get_scratch(dyn, ninst);
             GETEXSS(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            if(v0!=v2) {
-                if(v0==v1) {
-                    VMOV(d1, v1);
-                    v1 = d1;
-                }
-                VMOVQ(v0, v2);
-            }
             FCVT_D_S(d1, v1);
             VMOVeD(v0, 0, d1, 0);
+            if(v0!=v2)
+                VMOVeD(v0, 1, v2, 1);
             YMM0(gd);
             break;
         case 0x5B:
@@ -311,17 +330,22 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             if(BOX64ENV(dynarec_fastround)) {
                 VFCVTZSQS(v0, v1);
             } else {
-                BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                MSR_fpsr(x5);
-                for(int i=0; i<4; ++i) {
+                if(arm64_frintts) {
+                    VFRINT32ZSQ(v0, v1);
+                    VFCVTZSQS(v0, v0);
+                } else {
                     BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
                     MSR_fpsr(x5);
-                    VMOVeS(d0, 0, v1, i);
-                    VFCVTZSs(d0, d0);
-                    MRS_fpsr(x5);   // get back FPSR to check the IOC bit
-                    TBZ(x5, FPSR_IOC, 4+4);
-                    VMOVQSfrom(d0, 0, x4);
-                    VMOVeS(v0, i, d0, 0);
+                    for(int i=0; i<4; ++i) {
+                        BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
+                        MSR_fpsr(x5);
+                        VMOVeS(d0, 0, v1, i);
+                        VFCVTZSs(d0, d0);
+                        MRS_fpsr(x5);   // get back FPSR to check the IOC bit
+                        TBZ(x5, FPSR_IOC, 4+4);
+                        VMOVQSfrom(d0, 0, x4);
+                        VMOVeS(v0, i, d0, 0);
+                    }
                 }
             }
         }
diff --git a/src/dynarec/arm64/dynarec_arm64_f20f.c b/src/dynarec/arm64/dynarec_arm64_f20f.c
index 919805d6..03240803 100644
--- a/src/dynarec/arm64/dynarec_arm64_f20f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f20f.c
@@ -118,13 +118,23 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;
             GETGD;
             GETEXSD(q0, 0, 0);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);
                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
                 MSR_fpsr(x5);
             }
-            FCVTZSxwD(gd, q0);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && arm64_frintts) {
+                v0 = fpu_get_scratch(dyn, ninst);
+                if(rex.w) {
+                    FRINT64ZD(v0, q0);
+                } else {
+                    FRINT32ZD(v0, q0);
+                }
+                FCVTZSxwD(gd, v0);
+            } else {
+                FCVTZSxwD(gd, q0);
+            }
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);   // get back FPSR to check the IOC bit
                 TBZ_NEXT(x5, FPSR_IOC);
                 if(rex.w) {
@@ -139,17 +149,25 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;
             GETGD;
             GETEXSD(q0, 0, 0);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);
                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
                 MSR_fpsr(x5);
             }
             u8 = sse_setround(dyn, ninst, x1, x2, x3);
             d1 = fpu_get_scratch(dyn, ninst);
-            FRINTID(d1, q0);
+            if(!BOX64ENV(dynarec_fastround) && arm64_frintts) {
+                if(rex.w) {
+                    FRINT64XD(d1, q0);
+                } else {
+                    FRINT32XD(d1, q0);
+                }
+            } else {
+                FRINTID(d1, q0);
+            }
             x87_restoreround(dyn, ninst, u8);
             FCVTZSxwD(gd, d1);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);   // get back FPSR to check the IOC bit
                 TBZ_NEXT(x5, FPSR_IOC);
                 if(rex.w) {
@@ -549,28 +567,32 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 SQXTN_32(v0, v0);   // convert int64 -> int32 with saturation in lower part, RaZ high part
             } else {
                 u8 = sse_setround(dyn, ninst, x1, x2, x3);
-                MRS_fpsr(x5);
-                BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                MSR_fpsr(x5);
-                ORRw_mask(x4, xZR, 1, 0);    //0x80000000
-                d0 = fpu_get_scratch(dyn, ninst);
-                for(int i=0; i<2; ++i) {
-                    BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                    MSR_fpsr(x5);
-                    if(i) {
-                        VMOVeD(d0, 0, v1, i);
-                        FRINTID(d0, d0);
-                    } else {
-                        FRINTID(d0, v1);
+                if(arm64_frintts) {
+                    VFRINT32XDQ(v0, v1);    // round, handling of overflow and Nan to 0x80000000
+                    VFCVTNSQD(v0, v0);  // convert double -> int64
+                    SQXTN_32(v0, v0);   // convert int64 -> int32 with saturation in lower part, RaZ high part
+                } else {
+                    MRS_fpsr(x5);
+                    ORRw_mask(x4, xZR, 1, 0);    //0x80000000
+                    d0 = fpu_get_scratch(dyn, ninst);
+                    for(int i=0; i<2; ++i) {
+                        BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
+                        MSR_fpsr(x5);
+                        if(i) {
+                            VMOVeD(d0, 0, v1, i);
+                            FRINTID(d0, d0);
+                        } else {
+                            FRINTID(d0, v1);
+                        }
+                        FCVTZSwD(x1, d0);
+                        MRS_fpsr(x5);   // get back FPSR to check the IOC bit
+                        TBZ(x5, FPSR_IOC, 4+4);
+                        MOVw_REG(x1, x4);
+                        VMOVQSfrom(v0, i, x1);
                     }
-                    FCVTZSwD(x1, d0);
-                    MRS_fpsr(x5);   // get back FPSR to check the IOC bit
-                    TBZ(x5, FPSR_IOC, 4+4);
-                    MOVw_REG(x1, x4);
-                    VMOVQSfrom(v0, i, x1);
+                    VMOVQDfrom(v0, 1, xZR);
                 }
                 x87_restoreround(dyn, ninst, u8);
-                VMOVQDfrom(v0, 1, xZR);
             }
             break;
diff --git a/src/dynarec/arm64/dynarec_arm64_f30f.c b/src/dynarec/arm64/dynarec_arm64_f30f.c
index 28e01f26..f7e0bf3f 100644
--- a/src/dynarec/arm64/dynarec_arm64_f30f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f30f.c
@@ -116,11 +116,17 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETGX(v0, 1);
             GETED(0);
             d1 = fpu_get_scratch(dyn, ninst);
+            if(BOX64ENV(dynarec_fastround)<2) {
+                u8 = sse_setround(dyn, ninst, x3, x4, x5);
+            }
             if(rex.w) {
                 SCVTFSx(d1, ed);
             } else {
                 SCVTFSw(d1, ed);
             }
+            if(BOX64ENV(dynarec_fastround)<2) {
+                x87_restoreround(dyn, ninst, u8);
+            }
             VMOVeS(v0, 0, d1, 0);
             break;

@@ -129,13 +135,23 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;
             GETGD;
             GETEXSS(d0, 0, 0);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);
                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
                 MSR_fpsr(x5);
             }
-            FCVTZSxwS(gd, d0);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && arm64_frintts) {
+                v0 = fpu_get_scratch(dyn, ninst);
+                if(rex.w) {
+                    FRINT64ZS(v0, q0);
+                } else {
+                    FRINT32ZS(v0, q0);
+                }
+                FCVTZSxwS(gd, v0);
+            } else {
+                FCVTZSxwS(gd, d0);
+            }
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);   // get back FPSR to check the IOC bit
                 TBZ_NEXT(x5, FPSR_IOC);
                 if(rex.w) {
@@ -150,17 +166,25 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;
             GETGD;
             GETEXSS(q0, 0, 0);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);
                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
                 MSR_fpsr(x5);
             }
             u8 = sse_setround(dyn, ninst, x1, x2, x3);
             d1 = fpu_get_scratch(dyn, ninst);
-            FRINTIS(d1, q0);
+            if(!BOX64ENV(dynarec_fastround) && arm64_frintts) {
+                if(rex.w) {
+                    FRINT64XS(d1, q0);
+                } else {
+                    FRINT32XS(d1, q0);
+                }
+            } else {
+                FRINTIS(d1, q0);
+            }
             x87_restoreround(dyn, ninst, u8);
             FCVTZSxwS(gd, d1);
-            if(!BOX64ENV(dynarec_fastround)) {
+            if(!BOX64ENV(dynarec_fastround) && !arm64_frintts) {
                 MRS_fpsr(x5);   // get back FPSR to check the IOC bit
                 TBZ_NEXT(x5, FPSR_IOC);
                 if(rex.w) {
@@ -302,20 +326,23 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             if(BOX64ENV(dynarec_fastround)) {
                 VFCVTZSQS(v0, v1);
             } else {
-                MRS_fpsr(x5);
-                BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                MSR_fpsr(x5);
-                ORRw_mask(x4, xZR, 1, 0);    //0x80000000
-                d0 = fpu_get_scratch(dyn, ninst);
-                for(int i=0; i<4; ++i) {
-                    BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                    MSR_fpsr(x5);
-                    VMOVeS(d0, 0, v1, i);
-                    VFCVTZSs(d0, d0);
-                    MRS_fpsr(x5);   // get back FPSR to check the IOC bit
-                    TBZ(x5, FPSR_IOC, 4+4);
-                    VMOVQSfrom(d0, 0, x4);
-                    VMOVeS(v0, i, d0, 0);
+                if(arm64_frintts) {
+                    VFRINT32ZSQ(v0, v1);
+                    VFCVTZSQS(v0, v0);
+                } else {
+                    MRS_fpsr(x5);
+                    ORRw_mask(x4, xZR, 1, 0);    //0x80000000
+                    d0 = fpu_get_scratch(dyn, ninst);
+                    for(int i=0; i<4; ++i) {
+                        BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
+                        MSR_fpsr(x5);
+                        VMOVeS(d0, 0, v1, i);
+                        VFCVTZSs(d0, d0);
+                        MRS_fpsr(x5);   // get back FPSR to check the IOC bit
+                        TBZ(x5, FPSR_IOC, 4+4);
+                        VMOVQSfrom(d0, 0, x4);
+                        VMOVeS(v0, i, d0, 0);
+                    }
                 }
             }
             break;
-- 
cgit 1.4.1
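
A note on the ordered/unordered rework (predicates 3 and 7 of CMPPS/CMPPD above, plus the AVX variants): the old sequence needed two self-compares (one per source) plus a VANDQ to build the "neither operand is NaN" mask; the new one leans on AArch64 FMAX propagating NaN, so one VFMAX followed by a single self-compare produces the same mask. A minimal scalar model in C (helper names here are illustrative, not box64 APIs):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* AArch64 FMAX propagates NaN: if either operand is NaN, the result is NaN.
   C's fmaxf() behaves like FMAXNM (it prefers the number), so the FMAX
   behavior is modeled explicitly here. */
static float fmax_propagating(float a, float b)
{
    if (isnan(a) || isnan(b)) return NAN;
    return a > b ? a : b;
}

/* Predicate 7 (ordered): all-ones when neither input is NaN.
   Models VFMAXQS(q0, v0, v1) followed by VFCMEQQS(v0, q0, q0). */
static uint32_t cmp_ordered(float a, float b)
{
    float m = fmax_propagating(a, b);   /* NaN iff a or b is NaN */
    return (m == m) ? ~0u : 0u;         /* self-compare fails only for NaN */
}

/* Predicate 3 (unordered) is just the inversion, the trailing VMVNQ. */
static uint32_t cmp_unordered(float a, float b)
{
    return ~cmp_ordered(a, b);
}

int main(void)
{
    printf("%08x %08x\n", cmp_ordered(1.0f, 2.0f), cmp_unordered(NAN, 2.0f));
    return 0;   /* prints: ffffffff ffffffff */
}

The else branches in the patch keep the plain VFCMEQQS self-compare when source and destination are the same register, which avoids allocating the scratch entirely.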
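
On the FRINTTS side: FRINT32Z/FRINT64Z (truncating) and FRINT32X/FRINT64X (current rounding mode) are ARMv8.5-A FEAT_FRINTTS instructions that round to an integral value and force anything unrepresentable in 32 or 64 bits, NaN included, to the most negative integer, which is exactly the 0x80000000 "integer indefinite" that x86 CVTTSD2SI and friends return. That is what lets the arm64_frintts paths above drop the MRS/BFC/MSR FPSR.IOC bookkeeping and the per-lane fixup loops. A minimal sketch of the scalar case, assuming a FEAT_FRINTTS-capable host and GCC/Clang inline asm (the function name is ours; build with -march=armv8.5-a or later):

#include <stdint.h>
#include <stdio.h>

/* x86-style truncating double -> int32, with no FPSR checking needed. */
static int32_t cvttsd2si_sketch(double x)
{
    double r;
    int32_t out;
    /* FRINT32Z: round toward zero; out-of-range values and NaN become
       exactly -2^31, matching x86 semantics. */
    __asm__("frint32z %d0, %d1" : "=w"(r) : "w"(x));
    /* The value now always fits, so the plain convert cannot overflow. */
    __asm__("fcvtzs %w0, %d1" : "=r"(out) : "w"(r));
    return out;
}

int main(void)
{
    printf("%d\n", cvttsd2si_sketch(1.7e10));   /* -2147483648, like x86  */
    printf("%d\n", cvttsd2si_sketch(-12.9));    /* -12 (truncated)        */
    printf("%d\n", cvttsd2si_sketch(0.0/0.0));  /* -2147483648 (NaN case) */
    return 0;
}

The vector forms used above (VFRINT32ZDQ, VFRINT32XDQ) apply the same clamping per lane, which is why the following FCVTZS/SQXTN narrowing no longer needs a per-element IOC test.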
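
The widened VCMPSD predicate table maps all 16 AVX compare predicates onto a single FCMPD plus one conditional mask: for predicates 8..15 the compare operands are swapped (FCMPD(v1, v2) instead of FCMPD(v2, v1)) so the same NZCV condition codes can be reused, and predicates 0x0b/0x0f skip the compare entirely since their result is constant. The patch itself flags a TODO for cases 9, 10 and 13; below is a scalar C model of the intended x86 semantics, relying on C comparisons already being false on unordered operands (and != being true):

#include <math.h>
#include <stdbool.h>
#include <stdio.h>

/* Model of the 16 VCMPSD predicates as the patch maps them onto FCMPD. */
static bool vcmpsd_model(int pred, double a, double b)
{
    double x = a, y = b;
    if ((pred & 15) > 7) { x = b; y = a; }    /* the operand swap for 8..15 */
    switch (pred & 15) {
        case 0x0: return x == y;                        /* EQ (cEQ)                     */
        case 0x1: return x < y;                         /* LT (cCC)                     */
        case 0x2: return x <= y;                        /* LE (cLS)                     */
        case 0x3: return isunordered(x, y);             /* UNORD (cVS)                  */
        case 0x4: return x != y;                        /* NEQ, true on unordered (cNE) */
        case 0x5: return !(x < y);                      /* NLT: GE or unordered (cCS)   */
        case 0x6: return !(x <= y);                     /* NLE: GT or unordered (cHI)   */
        case 0x7: return !isunordered(x, y);            /* ORD (cVC)                    */
        case 0x8: return x == y || isunordered(x, y);   /* EQ or unordered              */
        case 0x9: return !(x <= y);                     /* NGE: LT(a,b) or unordered, swapped (cHI) */
        case 0xa: return !(x < y);                      /* NGT: LE(a,b) or unordered, swapped (cCS) */
        case 0xb: return false;                         /* FALSE                        */
        case 0xc: return x != y && !isunordered(x, y);  /* NEQ, ordered                 */
        case 0xd: return x <= y;                        /* GE(a,b), ordered, swapped (cLS) */
        case 0xe: return x < y;                         /* GT(a,b), ordered, swapped (cCC) */
        default:  return true;                          /* 0xf: TRUE                    */
    }
}

int main(void)
{
    printf("%d %d\n", vcmpsd_model(0x9, 1.0, 2.0),   /* 1: 1 < 2                        */
                      vcmpsd_model(0xd, NAN, 2.0));  /* 0: GE_OS is false on unordered  */
    return 0;
}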