Diffstat (limited to 'src'): 26 files changed, 582 insertions, 313 deletions
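Almost every hunk in this commit is the same mechanical change: call sites of the FPU/NEON scratch-register allocator go from fpu_get_scratch(dyn) to fpu_get_scratch(dyn, ninst), so the allocator now also receives ninst, the same per-instruction index already passed to helpers such as sse_get_reg() and x87_get_st() in the surrounding code. The header that declares fpu_get_scratch is not part of this diff, so the sketch below only illustrates the shape of the change as seen at the call sites; the struct fields, the allocation policy, and the reason the allocator wants ninst are assumptions, not box64's actual implementation.

/* minimal, self-contained sketch of the call-site change -- illustrative only */
#include <stdio.h>

typedef struct {
    int next_scratch;                   /* placeholder field, not box64's real dynarec context */
} dynarec_arm_t;

/* old shape (the '-' lines): int fpu_get_scratch(dynarec_arm_t* dyn);            */
/* new shape (the '+' lines): int fpu_get_scratch(dynarec_arm_t* dyn, int ninst); */
static int fpu_get_scratch(dynarec_arm_t* dyn, int ninst)
{
    (void)ninst;                        /* the diff only shows the new parameter being passed */
    return dyn->next_scratch++;         /* placeholder policy; the real allocator differs */
}

int main(void)
{
    dynarec_arm_t dyn = {0};
    int ninst = 0;                                 /* index of the instruction being translated */
    int d0 = fpu_get_scratch(&dyn, ninst);         /* mirrors "d0 = fpu_get_scratch(dyn, ninst);" */
    int v0 = fpu_get_scratch(&dyn, ninst);
    printf("scratch regs: %d %d\n", d0, v0);
    return 0;
}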
diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c index 616085a8..8773b054 100644 --- a/src/dynarec/arm64/dynarec_arm64_0f.c +++ b/src/dynarec/arm64/dynarec_arm64_0f.c @@ -387,7 +387,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGX(v0, 1); GETEM(q1, 0); - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); u8 = sse_setround(dyn, ninst, x1, x2, x3); SCVTFS(d0, q1); x87_restoreround(dyn, ninst, u8); @@ -423,7 +423,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); ORRw_mask(x2, xZR, 1, 0); //0x80000000 - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); for (int i=0; i<2; ++i) { BFCw(x5, FPSR_IOC, 1); // reset IOC bit if (i) { @@ -461,7 +461,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); ORRw_mask(x2, xZR, 1, 0); //0x80000000 - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); for (int i=0; i<2; ++i) { BFCw(x5, FPSR_IOC, 1); // reset IOC bit if (i) { @@ -516,7 +516,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(q0); GETEM(q1, 0); - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); MOVI_8(d0, 0b10000111); VAND(d0, d0, q1); // mask the index VTBL1_8(q0, q0, d0); @@ -540,7 +540,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(q0); GETEM(q1, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VUZP1_16(v0, q0, q1); VUZP2_16(q0, q0, q1); SQADD_16(q0, q0, v0); @@ -550,8 +550,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(q0); GETEM(q1, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UXTL_8(v0, q0); // this is unsigned, so 0 extended SXTL_8(v1, q1); // this is signed VMULQ_16(v0, v0, v1); @@ -563,7 +563,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(q0); GETEM(q1, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VUZP1_16(v0, q0, q1); VUZP2_16(q0, q0, q1); VSUB_16(q0, v0, q0); @@ -573,7 +573,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(q0); GETEM(q1, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VUZP1_32(v0, q0, q1); VUZP2_32(q0, q0, q1); VSUB_32(q0, v0, q0); @@ -583,7 +583,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(q0); GETEM(q1, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VUZP1_16(v0, q0, q1); VUZP2_16(q0, q0, q1); SQSUB_16(q0, v0, q0); @@ -593,8 +593,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(q0); GETEM(q1, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); CMGT_0_8(v0, q1); VAND(v0, v0, q0); CMLT_0_8(v1, q1); @@ -606,8 +606,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(q0); GETEM(q1, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); CMGT_0_16(v0, q1); VAND(v0, v0, q0); 
CMLT_0_16(v1, q1); @@ -619,8 +619,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(q0); GETEM(q1, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); CMGT_0_32(v0, q1); VAND(v0, v0, q0); CMLT_0_32(v1, q1); @@ -661,10 +661,10 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VEORQ(v0, v0, v0); if(arm64_sha1) { - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); VMOVeS(v1, 0, q0, 3); SHA1H(v1, v1); VMOVeS(v0, 3, v1, 0); @@ -680,7 +680,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VEXTQ_8(v0, q1, q0, 8); VEORQ(q0, q0, v0); break; @@ -696,7 +696,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin if(q0==q1) v0 = q0; else { - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VEXTQ_8(v0, q1, q1, 8); VREV64Q_32(v0, v0); } @@ -733,10 +733,10 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin GETGX(q0, 1); GETEX(q1, 0, 0); d0 = sse_get_reg(dyn, ninst, x1, 0, 0); - v0 = fpu_get_scratch(dyn); - d1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + d1 = fpu_get_scratch(dyn, ninst); if(MODREG) { - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); } else v1 = q1; VREV64Q_32(q0, q0); @@ -803,9 +803,9 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin if(arm64_sha2) { GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); - d0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); + d0 = fpu_get_scratch(dyn, ninst); VEORQ(v1, v1, v1); VMOVQ(v0, q0); SHA256SU1(v0, v1, q1); // low v0 are ok and also need to be feed again SHA256SU1 to get the high part @@ -876,7 +876,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin if(u8>15) { VEOR(q0, q0, q0); } else if(u8>7) { - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); VEOR(d0, d0, d0); VEXT_8(q0, q0, d0, u8-8); } else { @@ -891,14 +891,14 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin GETGX(q0, 1); GETEX(q1, 0, 1); u8 = F8&3; - d0 = fpu_get_scratch(dyn); - d1 = fpu_get_scratch(dyn); - v0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); + d1 = fpu_get_scratch(dyn, ninst); + v0 = fpu_get_scratch(dyn, ninst); VEXTQ_8(v0, q0, q0, 8); VREV64Q_32(v0, v0); VEORQ(d1, d1, d1); if(MODREG) { - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); } else v1 = q1; if(v1!=v0) { @@ -1024,10 +1024,10 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin SKIPTEST(x1); GETEX(q0, 0, 0); GETGX_empty(q1); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); // more precise if(q1==q0) - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); else v1 = q1; VFRSQRTEQS(v0, q0); @@ -1042,10 +1042,10 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin GETEX(q0, 0, 0); GETGX_empty(q1); if(q0 == q1) - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); else v1 = q1; - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VFRECPEQS(v0, q0); 
VFRECPSQS(v1, v0, q0); VFMULQS(q1, v0, v1); @@ -1128,7 +1128,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin // FMIN/FMAX wll not copy the value if v0[x] is NaN // but x86 will copy if either v0[x] or v1[x] is NaN, so lets force a copy if source is NaN if(!box64_dynarec_fastnan && v0!=v1) { - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VFCMEQQS(q0, v0, v0); // 0 is NaN, 1 is not NaN, so MASK for NaN VANDQ(v0, v0, q0); VBICQ(q0, v1, q0); @@ -1151,7 +1151,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin // FMIN/FMAX wll not copy the value if v0[x] is NaN // but x86 will copy if either v0[x] or v1[x] is NaN, so lets force a copy if source is NaN if(!box64_dynarec_fastnan && v0!=v1) { - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VFCMEQQS(q0, v0, v0); // 0 is NaN, 1 is not NaN, so MASK for NaN VANDQ(v0, v0, q0); VBICQ(q0, v1, q0); @@ -1185,7 +1185,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(d0); GETEM(d1, 0); - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VMOVeD(q0, 0, d0, 0); VMOVeD(q0, 1, d1, 0); SQXTN_8(d0, q0); @@ -1215,7 +1215,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin INST_NAME("PACKUSWB Gm, Em"); nextop = F8; GETGM(v0); - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VMOVeD(q0, 0, v0, 0); if(MODREG) { v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, (nextop&7)); @@ -1251,7 +1251,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin INST_NAME("PACKSSDW Gm,Em"); nextop = F8; GETGM(v0); - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VMOVeD(q0, 0, v0, 0); if(MODREG) { GETEM(v1, 0); @@ -1338,7 +1338,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin (4)|(5<<8), (6)|(7<<8) }; - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); tmp64u = swp[(u8>>(0*2))&3] | (swp[(u8>>(1*2))&3]<<16); tmp64u |= (swp[(u8>>(2*2))&3]<<32) | (swp[(u8>>(3*2))&3]<<48); MOV64x(x2, tmp64u); @@ -2239,7 +2239,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 2: VFCMGEQS(v0, v1, v0); break; // Less or equal case 3: VFCMEQQS(v0, v0, v0); if(v0!=v1) { - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VFCMEQQS(q0, v1, v1); VANDQ(v0, v0, q0); } @@ -2250,7 +2250,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 6: VFCMGEQS(v0, v1, v0); VMVNQ(v0, v0); break; // Greater or unordered case 7: VFCMEQQS(v0, v0, v0); if(v0!=v1) { - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VFCMEQQS(q0, v1, v1); VANDQ(v0, v0, q0); } @@ -2313,7 +2313,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin } else if(v0==v1 && (u8==0xe5)) { // easy special case VMOVeS(v0, 0, v0, 1); } else { - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); // first two elements from Gx for(int i=0; i<2; ++i) { VMOVeS(d0, i, v0, (u8>>(i*2))&3); @@ -2390,12 +2390,12 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin GETGM(d0); GETEM(d1, 0); if(MODREG) { - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); } else { q0 = d1; } - q1 = fpu_get_scratch(dyn); + q1 = fpu_get_scratch(dyn, ninst); VMOVBto(x1, d1, 0); MOVZw(x2, 16); SUBSw_REG(x2, x2, x1); @@ -2415,12 +2415,12 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, 
uintptr_t ip, int nin GETGM(d0); GETEM(d1, 0); if(MODREG) { - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); } else { q0 = d1; } - q1 = fpu_get_scratch(dyn); + q1 = fpu_get_scratch(dyn, ninst); VMOVBto(x1, d1, 0); MOVZw(x2, 32); SUBSw_REG(x2, x2, x1); @@ -2439,7 +2439,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(d0); GETEM(d1, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); //MOVI_64(v0, 64); not 64! MOV32w(x1, 64); VMOVQDfrom(v0, 0, x1); @@ -2465,9 +2465,9 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xD7: nextop = F8; INST_NAME("PMOVMSKB Gd, Em"); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); - q1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); + q1 = fpu_get_scratch(dyn, ninst); GETEM(q0, 0); GETGD; TABLE64(x1, (uintptr_t)&mask_shift8); @@ -2546,8 +2546,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(d0); GETEM(d1, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UQXTN_32(v0, d1); MOVI_32(v1, 15); UMIN_32(v0, v0, v1); // limit to 0 .. +15 values @@ -2560,8 +2560,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(d0); GETEM(d1, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UQXTN_32(v0, d1); MOVI_32(v1, 31); UMIN_32(v0, v0, v1); // limit to 0 .. +31 values @@ -2582,7 +2582,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(v0); GETEM(v1, 0); - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VSMULL_16(q0, v0, v1); SQSHRN_16(v0, q0, 16); break; @@ -2667,7 +2667,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(d0); GETEM(d1, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VMOVHto(x1, d1, 0); VDUPH(v0, x1); USHL_16(d0, d0, v0); @@ -2677,8 +2677,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(d0); GETEM(d1, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UQXTN_32(v0, d1); MOVI_32(v1, 32); UMIN_32(v0, v0, v1); // limit to 0 .. +32 values @@ -2690,8 +2690,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(d0); GETEM(d1, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UQXTN_32(v0, d1); MOVI_32(v1, 64); UMIN_32(v0, v0, v1); // limit to 0 .. +64 values @@ -2709,7 +2709,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(v0); GETEM(v1, 0); - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VSMULL_16(q0, v0, v1); VADDPQ_32(q0, q0, q0); //ADDP from Q to non-Q? VMOVQ(v0, q0); @@ -2719,8 +2719,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(q0); GETEM(q1, 0); - d0 = fpu_get_scratch(dyn); - d1 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); + d1 = fpu_get_scratch(dyn, ninst); VEOR(d1, d1, d1); // is it necessary? 
UABDL_8(d0, q0, q1); UADDLVQ_16(d1, d0); @@ -2731,8 +2731,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(q0); GETEM(q1, 0); - d0 = fpu_get_scratch(dyn); - d1 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); + d1 = fpu_get_scratch(dyn, ninst); VSSHR_8(d1, q1, 7); // d1 = byte slection mask VLDR64_U12(d0, xRDI, 0); VBIC(d0, d0, d1); // d0 = clear masked byte diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c index 2afbd088..fa006bcd 100644 --- a/src/dynarec/arm64/dynarec_arm64_660f.c +++ b/src/dynarec/arm64/dynarec_arm64_660f.c @@ -135,7 +135,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n } else { SMREAD(); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); ADDx_U12(ed, ed, 8); VLD1_64(v0, 1, ed); } @@ -243,7 +243,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); ORRw_mask(x2, xZR, 1, 0); //0x80000000 - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); for (int i=0; i<2; ++i) { BFCw(x5, FPSR_IOC, 1); // reset IOC bit if (i) { @@ -283,7 +283,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); ORRw_mask(x2, xZR, 1, 0); //0x80000000 - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); for (int i=0; i<2; ++i) { BFCw(x5, FPSR_IOC, 1); // reset IOC bit if (i) { @@ -322,7 +322,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); MOVIQ_8(d0, 0b10001111); VANDQ(d0, d0, q1); // mask the index VTBLQ1_8(q0, q0, d0); @@ -346,7 +346,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VUZP1Q_16(v0, q0, q1); VUZP2Q_16(q0, q0, q1); SQADDQ_16(q0, q0, v0); @@ -356,10 +356,10 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); if(q0==q1) - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); else d0 = q0; UXTL_8(v0, q0); // this is unsigned, so 0 extended @@ -381,7 +381,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VUZP2Q_16(v0, q0, q1); VUZP1Q_16(q0, q0, q1); VSUBQ_16(q0, q0, v0); @@ -391,7 +391,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VUZP2Q_32(v0, q0, q1); VUZP1Q_32(q0, q0, q1); VSUBQ_32(q0, q0, v0); @@ -401,7 +401,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VUZP2Q_16(v0, q0, q1); VUZP1Q_16(q0, q0, q1); SQSUBQ_16(q0, q0, v0); @@ -411,8 +411,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, 
int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v1 = fpu_get_scratch(dyn); - v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); + v0 = fpu_get_scratch(dyn, ninst); NEGQ_8(v0, q0); // get NEG CMLTQ_0_8(v1, q1); // calculate mask VBICQ(q0, q0, v1); // apply not mask on dest @@ -426,8 +426,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v1 = fpu_get_scratch(dyn); - v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); + v0 = fpu_get_scratch(dyn, ninst); NEGQ_16(v0, q0); // get NEG CMLTQ_0_16(v1, q1); // calculate mask VBICQ(q0, q0, v1); // apply not mask on dest @@ -441,8 +441,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v1 = fpu_get_scratch(dyn); - v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); + v0 = fpu_get_scratch(dyn, ninst); NEGQ_32(v0, q0); // get NEG CMLTQ_0_32(v1, q1); // calculate mask VBICQ(q0, q0, v1); // apply not mask on dest @@ -465,7 +465,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETGX(q0, 1); GETEX(q1, 0, 0); v0 = sse_get_reg(dyn, ninst, x1, 0, 0); - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); if(q0!=q1) { VSSHRQ_8(v1, v0, 7); // bit[7]-> bit[7..0] VBICQ(q0, q0, v1); @@ -480,7 +480,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETGX(q0, 1); GETEX(q1, 0, 0); v0 = sse_get_reg(dyn, ninst, x1, 0, 0); - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); if(q0!=q1) { VSSHRQ_32(v1, v0, 31); // bit[31]-> bit[31..0] VBICQ(q0, q0, v1); @@ -494,7 +494,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETGX(q0, 1); GETEX(q1, 0, 0); v0 = sse_get_reg(dyn, ninst, x1, 0, 0); - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); if(q0!=q1) { VSSHRQ_64(v1, v0, 63); // bit[63]-> bit[63..0] VBICQ(q0, q0, v1); @@ -509,7 +509,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n SETFLAGS(X_ALL, SF_SET); GETGX(q0, 0); GETEX(q1, 0, 0); - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); IFX(X_ZF) { VANDQ(v1, q1, q0); CMEQQ_0_64(v1, v1); @@ -612,7 +612,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n v0 = q0; } else { if(MODREG) - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); else v0 = q1; VUZP1Q_32(v0, q1, q1); @@ -638,8 +638,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETEX(q1, 0, 0); GETGX(q0, 1); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); VEORQ(v0, v0, v0); SMAXQ_32(v1, v0, q0); // values < 0 => 0 UQXTN_16(q0, v1); @@ -793,7 +793,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n if(arm64_aes) { GETEX(q1, 0, 0); GETGX(q0, 1); - v0 = fpu_get_scratch(dyn); // ARM64 internal operation differs a bit from x86_64 + v0 = fpu_get_scratch(dyn, ninst); // ARM64 internal operation differs a bit from x86_64 VEORQ(v0, q0, q1); AESE(v0, q1); AESMC(v0, v0); @@ -814,7 +814,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n if(arm64_aes) { GETEX(q1, 0, 0); GETGX(q0, 1); - v0 = fpu_get_scratch(dyn); // ARM64 internal operation differs a bit from x86_64 + v0 = fpu_get_scratch(dyn, ninst); // ARM64 internal operation differs a bit from x86_64 
VEORQ(v0, q0, q1); AESE(v0, q1); VEORQ(q0, v0, q1); @@ -834,7 +834,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n if(arm64_aes) { GETEX(q1, 0, 0); GETGX(q0, 1); - v0 = fpu_get_scratch(dyn); // ARM64 internal operation differs a bit from x86_64 + v0 = fpu_get_scratch(dyn, ninst); // ARM64 internal operation differs a bit from x86_64 VEORQ(v0, q0, q1); AESD(v0, q1); AESIMC(v0, v0); @@ -855,7 +855,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n if(arm64_aes) { GETEX(q1, 0, 0); GETGX(q0, 1); - v0 = fpu_get_scratch(dyn); // ARM64 internal operation differs a bit from x86_64 + v0 = fpu_get_scratch(dyn, ninst); // ARM64 internal operation differs a bit from x86_64 VEORQ(v0, q0, q1); AESD(v0, q1); VEORQ(q0, v0, q1); @@ -914,7 +914,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEX(q1, 0, 1); GETGX_empty(q0); u8 = F8; - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); if(u8&4) { u8 = sse_setround(dyn, ninst, x1, x2, x3); VFRINTISQ(q0, q1); @@ -929,7 +929,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEX(q1, 0, 1); GETGX_empty(q0); u8 = F8; - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); if(u8&4) { u8 = sse_setround(dyn, ninst, x1, x2, x3); VFRINTIDQ(q0, q1); @@ -944,7 +944,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETGX(q0, 1); GETEXSS(q1, 0, 1); u8 = F8; - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); if(u8&4) { u8 = sse_setround(dyn, ninst, x1, x2, x3); FRINTXS(v1, q1); @@ -960,7 +960,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETGX(q0, 1); GETEXSD(q1, 0, 1); u8 = F8; - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); if(u8&4) { u8 = sse_setround(dyn, ninst, x1, x2, x3); FRINTXD(v1, q1); @@ -1044,7 +1044,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n if(u8>31) { VEORQ(q0, q0, q0); } else if(u8>15) { - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); VEORQ(d0, d0, d0); VEXTQ_8(q0, q0, d0, u8-16); } else { @@ -1133,7 +1133,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("INSERTPS Gx, Ex, Ib"); nextop = F8; GETGX(q0, 1); - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); VMOVQ(d0, q0); if (MODREG) { q1 = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0); @@ -1172,7 +1172,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETGX(q0, 1); GETEX(q1, 0, 1); u8 = F8; - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VFMULQS(v0, q0, q1); // mask some, duplicate all, mask some for(int i=0; i<4; ++i) @@ -1258,7 +1258,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n CALL(sse42_compare_string_explicit_len, x1); q0 = sse_get_reg_empty(dyn, ninst, x2, 0); if(u8&0b1000000) { - q1 = fpu_get_scratch(dyn); + q1 = fpu_get_scratch(dyn, ninst); switch(u8&1) { case 0b00: VDUPQB(q0, x1); // load the low 8bits of the mask @@ -1299,13 +1299,13 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETGX(v0, 0); GETEX(v1, 0, 1); u8 = F8; - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); if(u8&1) { //16bits VCMEQQ_16(q0, v0, v1); // equal => mask regs XTN_8(q0, q0); // 8 bits mask, in lower 64bits // transform that a mask in x1 - q1 = fpu_get_scratch(dyn); + q1 = 
fpu_get_scratch(dyn, ninst); VSHL_8(q0, q0, 7); // keep only bit 0x80 TABLE64(x1, (uintptr_t)&mask_shift8); VLDR64_U12(q1, x1, 0); // load shift @@ -1316,8 +1316,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n //8 bits VCMEQQ_8(q0, v0, v1); // equal => mask regs // transform that a mask in x1 - q1 = fpu_get_scratch(dyn); - d0 = fpu_get_scratch(dyn); + q1 = fpu_get_scratch(dyn, ninst); + d0 = fpu_get_scratch(dyn, ninst); VSHL_8(d0, q0, 7); // keep only bit 0x80 TABLE64(x1, (uintptr_t)&mask_shift8); VLDR64_U12(q1, x1, 0); // load shift @@ -1462,7 +1462,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n CALL(sse42_compare_string_implicit_len, x1); q0 = sse_get_reg_empty(dyn, ninst, x2, 0); if(u8&0b1000000) { - q1 = fpu_get_scratch(dyn); + q1 = fpu_get_scratch(dyn, ninst); switch(u8&1) { case 0b00: VDUPQB(q0, x1); // load the low 8bits of the mask @@ -1592,8 +1592,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEX(q0, 0, 0); GETGX_empty(q1); if(!box64_dynarec_fastnan) { - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); // check if any input value was NAN VFCMEQQD(v0, q0, q0); // 0 if NAN, 1 if not NAN VFSQRTQD(q1, q0); @@ -1647,8 +1647,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEX(q0, 0, 0); GETGX(q1, 1); if(!box64_dynarec_fastnan) { - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); // check if any input value was NAN VFMAXQD(v0, q0, q1); // propagate NAN VFCMEQQD(v0, v0, v0); // 0 if NAN, 1 if not NAN @@ -1667,8 +1667,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEX(q0, 0, 0); GETGX(q1, 1); if(!box64_dynarec_fastnan) { - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); // check if any input value was NAN VFMAXQD(v0, q0, q1); // propagate NAN VFCMEQQD(v0, v0, v0); // 0 if NAN, 1 if not NAN @@ -1710,7 +1710,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n MSR_fpsr(x5); u8 = sse_setround(dyn, ninst, x1, x2, x3); MOV32w(x4, 0x80000000); - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); for(int i=0; i<4; ++i) { BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); @@ -1731,8 +1731,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEX(q0, 0, 0); GETGX(q1, 1); if(!box64_dynarec_fastnan) { - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); // check if any input value was NAN VFMAXQD(v0, q0, q1); // propagate NAN VFCMEQQD(v0, v0, v0); // 0 if NAN, 1 if not NAN @@ -1753,7 +1753,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n // FMIN/FMAX wll not copy the value if v0[x] is NaN // but x86 will copy if either v0[x] or v1[x] is NaN, so lets force a copy if source is NaN if(!box64_dynarec_fastnan && v0!=v1) { - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VFCMEQQD(q0, v0, v0); // 0 is NaN, 1 is not NaN, so MASK for NaN VANDQ(v0, v0, q0); VBICQ(q0, v1, q0); @@ -1767,8 +1767,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEX(q0, 0, 0); GETGX(q1, 1); if(!box64_dynarec_fastnan) { - v0 = fpu_get_scratch(dyn); - v1 = 
fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); // check if any input value was NAN VFMAXQD(v0, q0, q1); // propagate NAN VFCMEQQD(v0, v0, v0); // 0 if NAN, 1 if not NAN @@ -1789,7 +1789,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n // FMIN/FMAX wll not copy the value if v0[x] is NaN // but x86 will copy if either v0[x] or v1[x] is NaN, so lets force a copy if source is NaN if(!box64_dynarec_fastnan && v0!=v1) { - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VFCMEQQD(q0, v0, v0); // 0 is NaN, 1 is not NaN, so MASK for NaN VANDQ(v0, v0, q0); VBICQ(q0, v1, q0); @@ -1998,7 +1998,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n (8)|(9<<8)|(10<<16)|(11<<24), (12)|(13<<8)|(14<<16)|(15<<24) }; - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); tmp64u = swp[(u8>>(0*2))&3] | (swp[(u8>>(1*2))&3]<<32); MOV64x(x2, tmp64u); VMOVQDfrom(d0, 0, x2); @@ -2141,7 +2141,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n if(u8>15) { VEORQ(q0, q0, q0); } else { - q1 = fpu_get_scratch(dyn); + q1 = fpu_get_scratch(dyn, ninst); VEORQ(q1, q1, q1); VEXTQ_8(q0, q0, q1, u8); } @@ -2169,7 +2169,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n if(u8>15) { VEORQ(q0, q0, q0); } else if(u8>0) { - q1 = fpu_get_scratch(dyn); + q1 = fpu_get_scratch(dyn, ninst); VEORQ(q1, q1, q1); VEXTQ_8(q0, q1, q0, 16-u8); } @@ -2209,8 +2209,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETGX(q1, 1); GETEX(q0, 0, 0); if(!box64_dynarec_fastnan) { - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); // check if any input value was NAN // but need to mix low/high part VTRNQ1_64(v0, q1, q0); @@ -2231,7 +2231,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETEX(q1, 0, 0); GETGX(q0, 1); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VUZP1Q_64(v0, q0, q1); VUZP2Q_64(q0, q0, q1); VFSUBQD(q0, v0, q0); @@ -2627,7 +2627,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 2: VFCMGEQD(v0, v1, v0); break; // Less or equal case 3: VFCMEQQD(v0, v0, v0); if(v0!=v1) { - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VFCMEQQD(q0, v1, v1); VANDQ(v0, v0, q0); } @@ -2638,7 +2638,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 6: VFCMGEQD(v0, v1, v0); VMVNQ(v0, v0); break; // Greater or unordered case 7: VFCMEQQD(v0, v0, v0); if(v0!=v1) { - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VFCMEQQD(q0, v1, v1); VANDQ(v0, v0, q0); } @@ -2686,7 +2686,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n VMOVeD(v0, 1, v0, 0); } else { if(v0==v1) - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); else q0 = v0; VMOVeD(q0, 0, v0, (u8&1)); @@ -2720,8 +2720,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UQXTN_32(v0, q1); MOVI_32(v1, 16); UMIN_32(v0, v0, v1); // limit to 0 .. 
+16 values @@ -2734,7 +2734,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VFSUBQD(v0, q0, q1); VFADDQD(q0, q0, q1); VMOVeD(q0, 0, v0, 0); @@ -2744,8 +2744,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UQXTN_32(v0, q1); MOVI_32(v1, 32); UMIN_32(v0, v0, v1); // limit to 0 .. +32 values @@ -2758,8 +2758,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UQXTN_32(v0, q1); MOVI_32(v1, 64); UMIN_32(v0, v0, v1); // limit to 0 .. +64 values @@ -2798,9 +2798,9 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0xD7: nextop = F8; INST_NAME("PMOVMSKB Gd, Ex"); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); - q1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); + q1 = fpu_get_scratch(dyn, ninst); GETEX(q0, 0, 0); GETGD; TABLE64(x1, (uintptr_t)&mask_shift8); @@ -2888,8 +2888,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UQXTN_32(v0, q1); MOVI_32(v1, 15); UMIN_32(v0, v0, v1); // limit to -15 .. +15 values @@ -2902,8 +2902,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UQXTN_32(v0, q1); MOVI_32(v1, 31); UMIN_32(v0, v0, v1); // limit to 0 .. +31 values @@ -2923,8 +2923,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(v0, 1); GETEX(v1, 0, 0); - q0 = fpu_get_scratch(dyn); - q1 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); + q1 = fpu_get_scratch(dyn, ninst); VUMULL_16(q0, v0, v1); VUMULL2_16(q1, v0, v1); UQSHRN_16(v0, q0, 16); @@ -2935,8 +2935,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(v0, 1); GETEX(v1, 0, 0); - q0 = fpu_get_scratch(dyn); - q1 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); + q1 = fpu_get_scratch(dyn, ninst); VSMULL_16(q0, v0, v1); VSMULL2_16(q1, v0, v1); SQSHRN_16(v0, q0, 16); @@ -2955,7 +2955,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); ORRw_mask(x4, xZR, 1, 0); //0x80000000 - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); for(int i=0; i<2; ++i) { BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); @@ -3054,8 +3054,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UQXTN_32(v0, q1); MOVI_32(v1, 16); UMIN_32(v0, v0, v1); // limit to 0 .. 
+16 values @@ -3067,8 +3067,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UQXTN_32(v0, q1); MOVI_32(v1, 32); UMIN_32(v0, v0, v1); // limit to 0 .. +32 values @@ -3080,8 +3080,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + v1 = fpu_get_scratch(dyn, ninst); UQXTN_32(v0, q1); MOVI_32(v1, 64); UMIN_32(v0, v0, v1); // limit to 0 .. +64 values @@ -3093,10 +3093,10 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(v0, 1); GETEX(v1, 0, 0); - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); VUZP1Q_32(q0, v0, v0); //A3 A2 A1 A0 -> A3 A1 A2 A0 if(MODREG) { - q1 = fpu_get_scratch(dyn); + q1 = fpu_get_scratch(dyn, ninst); } else { q1 = v1; } @@ -3108,8 +3108,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(v0, 1); GETEX(v1, 0, 0); - q0 = fpu_get_scratch(dyn); - q1 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); + q1 = fpu_get_scratch(dyn, ninst); VSMULL_16(q0, v0, v1); VSMULL2_16(q1, v0, v1); VADDPQ_32(v0, q0, q1); @@ -3119,8 +3119,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - d0 = fpu_get_scratch(dyn); - d1 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); + d1 = fpu_get_scratch(dyn, ninst); VEOR(d1, d1, d1); // is it necessary? UABDL_8(d0, q0, q1); UADDLVQ_16(d1, d0); @@ -3134,10 +3134,10 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(q0, 1); GETEX(q1, 0, 0); - v0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); VLDR128_U12(v0, xRDI, 0); if(MODREG) - v1 = fpu_get_scratch(dyn); // need to preserve the register + v1 = fpu_get_scratch(dyn, ninst); // need to preserve the register else v1 = q1; VSSHRQ_8(v1, q1, 7); // get the mask diff --git a/src/dynarec/arm64/dynarec_arm64_6664.c b/src/dynarec/arm64/dynarec_arm64_6664.c index d5b14771..4b33fab6 100644 --- a/src/dynarec/arm64/dynarec_arm64_6664.c +++ b/src/dynarec/arm64/dynarec_arm64_6664.c @@ -61,7 +61,7 @@ uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n grab_segdata(dyn, addr, ninst, x4, seg); SMREAD(); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); VLDR64_REG(v1, ed, x4); } FCMPD(v0, v1); diff --git a/src/dynarec/arm64/dynarec_arm64_67.c b/src/dynarec/arm64/dynarec_arm64_67.c index b22c7f92..7a59de5d 100644 --- a/src/dynarec/arm64/dynarec_arm64_67.c +++ b/src/dynarec/arm64/dynarec_arm64_67.c @@ -202,7 +202,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin if(MODREG) { s0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 0); } else { - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); SMREAD(); addr = geted32(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(s0, ed, fixedaddress); diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c index 8addb9b1..856fb0c5 100644 --- 
a/src/dynarec/arm64/dynarec_arm64_avx_0f.c +++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c @@ -68,24 +68,24 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int INST_NAME("VSHUFPS Gx, Vx, Ex, Ib"); nextop = F8; GETVX(v2, 0); - GETGX_empty(v0); if(!MODREG) { - addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1); + addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1); v1 = -1; // to avoid a warning } else v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); + GETGX_empty(v0); u8 = F8; if(v2==v1 && (u8&0x3)==((u8>>2)&3) && (u8&0xf)==((u8>>4)&0xf)) { VDUPQ_32(v0, v2, u8&3); } else if(v2==v1 && (u8==0xe0)) { // easy special case VMOVQ(v0, v2); VMOVeS(v0, 1, v0, 0); - } else if(v0==v1 && (u8==0xe5)) { // easy special case + } else if(v2==v1 && (u8==0xe5)) { // easy special case VMOVQ(v0, v2); VMOVeS(v0, 0, v0, 1); } else { - d0 = fpu_get_scratch(dyn); - // first two elements from Gx + d0 = fpu_get_scratch(dyn, ninst); + // first two elements from Vx for(int i=0; i<2; ++i) { VMOVeS(d0, i, v2, (u8>>(i*2))&3); } @@ -104,7 +104,34 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int VMOVQ(v0, d0); } if(vex.l) { - DEFAULT; /* TDOD! */ + if(MODREG) + v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, vex.v, -1); + GETGY_empty_VY(v0, v2, 0, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1); + if(v2==v1 && (u8&0x3)==((u8>>2)&3) && (u8&0xf)==((u8>>4)&0xf)) { + VDUPQ_32(v0, v2, u8&3); + } else if(v2==v1 && (u8==0xe0)) { + VMOVQ(v0, v2); + VMOVeS(v0, 1, v0, 0); + } else if(v2==v1 && (u8==0xe5)) { + VMOVQ(v0, v2); + VMOVeS(v0, 0, v0, 1); + } else { + for(int i=0; i<2; ++i) { + VMOVeS(d0, i, v2, (u8>>(i*2))&3); + } + if(MODREG) { + for(int i=2; i<4; ++i) { + VMOVeS(d0, i, v1, (u8>>(i*2))&3); + } + } else { + SMREAD(); + for(int i=2; i<4; ++i) { + ADDx_U12(x2, ed, 16+((u8>>(i*2))&3)*4); + VLD1_32(d0, i, x2); + } + } + VMOVQ(v0, d0); + } } else YMM0(gd); break; diff --git a/src/dynarec/arm64/dynarec_arm64_d8.c b/src/dynarec/arm64/dynarec_arm64_d8.c index 113519a9..fc481aea 100644 --- a/src/dynarec/arm64/dynarec_arm64_d8.c +++ b/src/dynarec/arm64/dynarec_arm64_d8.c @@ -187,7 +187,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0: INST_NAME("FADD ST0, float[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(s0, ed, fixedaddress); if(ST_IS_F(0)) { @@ -200,7 +200,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 1: INST_NAME("FMUL ST0, float[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(s0, ed, fixedaddress); if(ST_IS_F(0)) { @@ -213,7 +213,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 2: INST_NAME("FCOM ST0, float[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(s0, ed, fixedaddress); if(ST_IS_F(0)) { @@ -227,7 +227,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin 
case 3: INST_NAME("FCOMP ST0, float[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(s0, ed, fixedaddress); if(ST_IS_F(0)) { @@ -242,7 +242,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 4: INST_NAME("FSUB ST0, float[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(s0, ed, fixedaddress); if(ST_IS_F(0)) { @@ -255,7 +255,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 5: INST_NAME("FSUBR ST0, float[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(s0, ed, fixedaddress); if(ST_IS_F(0)) { @@ -268,7 +268,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 6: INST_NAME("FDIV ST0, float[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(s0, ed, fixedaddress); if(ST_IS_F(0)) { @@ -281,7 +281,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 7: INST_NAME("FDIVR ST0, float[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(s0, ed, fixedaddress); if(ST_IS_F(0)) { diff --git a/src/dynarec/arm64/dynarec_arm64_d9.c b/src/dynarec/arm64/dynarec_arm64_d9.c index 6c99871d..53c3ad6e 100644 --- a/src/dynarec/arm64/dynarec_arm64_d9.c +++ b/src/dynarec/arm64/dynarec_arm64_d9.c @@ -337,7 +337,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin #else v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); v2 = x87_get_st(dyn, ninst, x1, x2, 1, NEON_CACHE_ST_D); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); FDIVD(s0, v1, v2); FRINTRRD(s0, s0, 0b00); // Nearest == TieToEven? 
FCVTZSxD(x4, s0); @@ -385,7 +385,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin #else v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); v2 = x87_get_st(dyn, ninst, x1, x2, 1, NEON_CACHE_ST_D); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); FDIVD(s0, v1, v2); FRINTZD(s0, s0); FCVTZSxD(x4, s0); @@ -496,7 +496,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin if(ST_IS_F(0)) s0 = v1; else { - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); FCVT_S_D(s0, v1); } addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); diff --git a/src/dynarec/arm64/dynarec_arm64_da.c b/src/dynarec/arm64/dynarec_arm64_da.c index b278ef02..afb5130d 100644 --- a/src/dynarec/arm64/dynarec_arm64_da.c +++ b/src/dynarec/arm64/dynarec_arm64_da.c @@ -143,7 +143,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0: INST_NAME("FIADD ST0, Ed"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(v2, ed, fixedaddress); SXTL_32(v2, v2); // i32 -> i64 @@ -153,7 +153,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 1: INST_NAME("FIMUL ST0, Ed"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(v2, ed, fixedaddress); SXTL_32(v2, v2); // i32 -> i64 @@ -163,7 +163,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 2: INST_NAME("FICOM ST0, Ed"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(v2, ed, fixedaddress); SXTL_32(v2, v2); // i32 -> i64 @@ -174,7 +174,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 3: INST_NAME("FICOMP ST0, Ed"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(v2, ed, fixedaddress); SXTL_32(v2, v2); // i32 -> i64 @@ -186,7 +186,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 4: INST_NAME("FISUB ST0, Ed"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(v2, ed, fixedaddress); SXTL_32(v2, v2); // i32 -> i64 @@ -196,7 +196,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 5: INST_NAME("FISUBR ST0, Ed"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(v2, ed, fixedaddress); SXTL_32(v2, v2); // i32 -> i64 @@ -206,7 +206,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 6: 
INST_NAME("FIDIV ST0, Ed"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(v2, ed, fixedaddress); SXTL_32(v2, v2); // i32 -> i64 @@ -216,7 +216,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 7: INST_NAME("FIDIVR ST0, Ed"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); VLD32(v2, ed, fixedaddress); SXTL_32(v2, v2); // i32 -> i64 diff --git a/src/dynarec/arm64/dynarec_arm64_db.c b/src/dynarec/arm64/dynarec_arm64_db.c index 488d18d4..9dc1d673 100644 --- a/src/dynarec/arm64/dynarec_arm64_db.c +++ b/src/dynarec/arm64/dynarec_arm64_db.c @@ -194,7 +194,7 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin INST_NAME("FISTTP Ed, ST0"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); if(arm64_frintts) { FRINT32ZD(s0, v1); FCVTZSwD(x5, s0); @@ -220,7 +220,7 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); u8 = x87_setround(dyn, ninst, x1, x2, x4); // x1 have the modified RPSCR reg addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); if(arm64_frintts) { FRINT32XD(s0, v1); FCVTZSwD(x5, s0); @@ -246,7 +246,7 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); u8 = x87_setround(dyn, ninst, x1, x2, x4); // x1 have the modified RPSCR reg addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0); - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); if(arm64_frintts) { FRINT32XD(s0, v1); FCVTZSwD(x5, s0); diff --git a/src/dynarec/arm64/dynarec_arm64_dc.c b/src/dynarec/arm64/dynarec_arm64_dc.c index a06e765b..76f43bc5 100644 --- a/src/dynarec/arm64/dynarec_arm64_dc.c +++ b/src/dynarec/arm64/dynarec_arm64_dc.c @@ -185,7 +185,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0: INST_NAME("FADD ST0, double[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); VLD64(v2, wback, fixedaddress); FADDD(v1, v1, v2); @@ -193,7 +193,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 1: INST_NAME("FMUL ST0, double[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); VLD64(v2, wback, fixedaddress); FMULD(v1, v1, v2); @@ -201,7 +201,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 2: INST_NAME("FCOM ST0, double[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - 
v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); VLD64(v2, wback, fixedaddress); FCMPD(v1, v2); @@ -210,7 +210,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 3: INST_NAME("FCOMP ST0, double[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); VLD64(v2, wback, fixedaddress); FCMPD(v1, v2); @@ -220,7 +220,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 4: INST_NAME("FSUB ST0, double[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); VLD64(v2, wback, fixedaddress); FSUBD(v1, v1, v2); @@ -228,7 +228,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 5: INST_NAME("FSUBR ST0, double[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); VLD64(v2, wback, fixedaddress); FSUBD(v1, v2, v1); @@ -236,7 +236,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 6: INST_NAME("FDIV ST0, double[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); VLD64(v2, wback, fixedaddress); FDIVD(v1, v1, v2); @@ -244,7 +244,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 7: INST_NAME("FDIVR ST0, double[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); VLD64(v2, wback, fixedaddress); FDIVD(v1, v2, v1); diff --git a/src/dynarec/arm64/dynarec_arm64_dd.c b/src/dynarec/arm64/dynarec_arm64_dd.c index eabfe5fd..afef3358 100644 --- a/src/dynarec/arm64/dynarec_arm64_dd.c +++ b/src/dynarec/arm64/dynarec_arm64_dd.c @@ -158,7 +158,7 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin if(ST_IS_I64(0)) { VST64(v1, ed, fixedaddress); } else { - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); if(arm64_frintts) { FRINT64ZD(s0, v1); FCVTZSxD(x2, s0); diff --git a/src/dynarec/arm64/dynarec_arm64_de.c b/src/dynarec/arm64/dynarec_arm64_de.c index 660d667c..7ea3b357 100644 --- a/src/dynarec/arm64/dynarec_arm64_de.c +++ b/src/dynarec/arm64/dynarec_arm64_de.c @@ -186,7 +186,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0: INST_NAME("FIADD ST0, word[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0); VLD16(v2, wback, fixedaddress); SXTL_16(v2, v2); @@ -197,7 +197,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int 
nin case 1: INST_NAME("FIMUL ST0, word[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0); VLD16(v2, wback, fixedaddress); SXTL_16(v2, v2); @@ -208,7 +208,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 2: INST_NAME("FICOM ST0, word[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0); VLD16(v2, wback, fixedaddress); SXTL_16(v2, v2); @@ -220,7 +220,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 3: INST_NAME("FICOMP ST0, word[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0); VLD16(v2, wback, fixedaddress); SXTL_16(v2, v2); @@ -233,7 +233,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 4: INST_NAME("FISUB ST0, word[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0); VLD16(v2, wback, fixedaddress); SXTL_16(v2, v2); @@ -244,7 +244,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 5: INST_NAME("FISUBR ST0, word[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0); VLD16(v2, wback, fixedaddress); SXTL_16(v2, v2); @@ -255,7 +255,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 6: INST_NAME("FIDIV ST0, word[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0); VLD16(v2, wback, fixedaddress); SXTL_16(v2, v2); @@ -266,7 +266,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 7: INST_NAME("FIDIVR ST0, word[ED]"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - v2 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn, ninst); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0); VLD16(v2, wback, fixedaddress); SXTL_16(v2, v2); diff --git a/src/dynarec/arm64/dynarec_arm64_df.c b/src/dynarec/arm64/dynarec_arm64_df.c index 21828508..79c59bfd 100644 --- a/src/dynarec/arm64/dynarec_arm64_df.c +++ b/src/dynarec/arm64/dynarec_arm64_df.c @@ -155,7 +155,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_F); addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0); ed = x1; - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); #if 0 // this version needs ARM v8.5, and doesn't handle saturation for 32bits integer not fitting 16bits FRINT32ZD(s0, v1); @@ -194,7 +194,7 @@ 
uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin u8 = x87_setround(dyn, ninst, x1, x2, x4); addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0); ed = x1; - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); #if 0 FRINT32XD(s0, v1); // no saturation instruction on Arm, so using NEON @@ -234,7 +234,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin u8 = x87_setround(dyn, ninst, x1, x2, x4); addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0); ed = x1; - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); #if 0 FRINT32XD(s0, v1); // no saturation instruction on Arm, so using NEON @@ -323,7 +323,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin } addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); ed = x1; - s0 = fpu_get_scratch(dyn); + s0 = fpu_get_scratch(dyn, ninst); if(ST_IS_I64(0)) { VST64(v1, wback, fixedaddress); } else { diff --git a/src/dynarec/arm64/dynarec_arm64_f20f.c b/src/dynarec/arm64/dynarec_arm64_f20f.c index 65607b52..8f8e549b 100644 --- a/src/dynarec/arm64/dynarec_arm64_f20f.c +++ b/src/dynarec/arm64/dynarec_arm64_f20f.c @@ -99,7 +99,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(v0, 1); GETED(0); - d1 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn, ninst); if(rex.w) { SCVTFDx(d1, ed); } else { @@ -140,7 +140,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n MSR_fpsr(x5); } u8 = sse_setround(dyn, ninst, x1, x2, x3); - d1 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn, ninst); FRINTID(d1, q0); x87_restoreround(dyn, ninst, u8); FCVTZSxwD(gd, d1); @@ -207,10 +207,10 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("SQRTSD Gx, Ex"); nextop = F8; GETGX(v0, 1); - d1 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn, ninst); GETEXSD(d0, 0, 0); if(!box64_dynarec_fastnan) { - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); FCMLTD_0(v1, d0); SHL_64(v1, v1, 63); FSQRTD(d1, d0); @@ -225,11 +225,11 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("ADDSD Gx, Ex"); nextop = F8; GETGX(d1, 1); - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); GETEXSD(d0, 0, 0); if(!box64_dynarec_fastnan) { - v0 = fpu_get_scratch(dyn); - q0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + q0 = fpu_get_scratch(dyn, ninst); // check if any input value was NAN FMAXD(v0, d0, d1); // propagate NAN FCMEQD(v0, v0, v0); // 0 if NAN, 1 if not NAN @@ -247,11 +247,11 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("MULSD Gx, Ex"); nextop = F8; GETGX(d1, 1); - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); GETEXSD(d0, 0, 0); if(!box64_dynarec_fastnan) { - v0 = fpu_get_scratch(dyn); - q0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + q0 = fpu_get_scratch(dyn, ninst); // check if any input value was NAN FMAXD(v0, d0, d1); // propagate NAN FCMEQD(v0, v0, v0); // 0 if NAN, 1 if not NAN @@ -270,7 +270,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(v0, 1); GETEXSD(d0, 0, 0); - d1 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn, ninst); FCVT_S_D(d1, d0); VMOVeS(v0, 0, d1, 
0); break; @@ -279,11 +279,11 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("SUBSD Gx, Ex"); nextop = F8; GETGX(d1, 1); - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); GETEXSD(d0, 0, 0); if(!box64_dynarec_fastnan) { - v0 = fpu_get_scratch(dyn); - q0 = fpu_get_scratch(dyn); + v0 = fpu_get_scratch(dyn, ninst); + q0 = fpu_get_scratch(dyn, ninst); // check if any input value was NAN FMAXD(v0, d0, d1); // propagate NAN FCMEQD(v0, v0, v0); // 0 if NAN, 1 if not NAN @@ -304,7 +304,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEXSD(v1, 0, 0); // MINSD: if any input is NaN, or Ex[0]<Gx[0], copy Ex[0] -> Gx[0] #if 0 - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); FMINNMD(d0, v0, v1); // NaN handling may be slightly different, is that a problem? VMOVeD(v0, 0, d0, 0); // to not erase uper part #else @@ -317,11 +317,11 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("DIVSD Gx, Ex"); nextop = F8; GETGX(v0, 1); - d1 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn, ninst); GETEXSD(v1, 0, 0); if(!box64_dynarec_fastnan) { - d0 = fpu_get_scratch(dyn); - q0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); + q0 = fpu_get_scratch(dyn, ninst); // check if any input value was NAN FMAXD(d0, v0, v1); // propagate NAN FCMEQD(d0, d0, d0); // 0 if NAN, 1 if not NAN @@ -342,7 +342,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEXSD(v1, 0, 0); // MAXSD: if any input is NaN, or Ex[0]>Gx[0], copy Ex[0] -> Gx[0] #if 0 - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); FMAXNMD(d0, v0, v1); // NaN handling may be slightly different, is that a problem? 
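For reference, the ADDSD, MULSD, SUBSD and DIVSD paths above share one sequence when box64_dynarec_fastnan is disabled: a "was any input NaN" mask is built with FMAXD and FCMEQD before the operation, and if the result is NaN even though no input was, its sign bit is forced so the freshly generated NaN matches the x86 default QNaN (sign set, 0xFFF8...) rather than the AArch64 default NaN (sign clear, 0x7FF8...). A minimal C sketch of that fix-up, with an invented helper name, assuming this reading of the emitted code:

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Sketch only, not box64 source: approximate effect of the !fastnan path
 * for a scalar double operation with inputs a, b and raw result res. */
static double x86_default_nan_fixup(double a, double b, double res)
{
    int input_nan = isnan(a) || isnan(b);       /* the FMAXD + FCMEQD mask */
    if (!input_nan && isnan(res)) {             /* NaN generated by the op itself */
        uint64_t bits;
        memcpy(&bits, &res, sizeof(bits));
        bits |= 0x8000000000000000ull;          /* set the sign: x86 QNaN indefinite */
        memcpy(&res, &bits, sizeof(bits));
    }
    return res;
}

Propagated NaNs (an input that was already NaN) are left untouched, which matches the intent of the mask in the generated code.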
VMOVeD(v0, 0, d0, 0); // to not erase uper part #else @@ -361,7 +361,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n u8 = F8; if(u8==0b00000000 || u8==0b01010101 || u8==0b10101010 || u8==0b11111111) { if(v0==v1) { - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); VMOVQ(d0, v1); } VDUP_16(v0, v1, u8&3); @@ -375,7 +375,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n u64 |= ((uint64_t)((u8>>(i*2))&3)*2+1)<<(i*16+8); } MOV64x(x2, u64); - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); VMOVQDfrom(d0, 0, x2); VTBL1_8(d0, v1, d0); VMOVeD(v0, 0, d0, 0); @@ -393,7 +393,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<4, 15, rex, NULL, 0, 0); - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); VLD128(v1, ed, fixedaddress); } VFADDPQS(v0, v0, v1); @@ -406,10 +406,10 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<4, 15, rex, NULL, 0, 0); - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); VLD128(v1, ed, fixedaddress); } - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); VUZP1Q_32(d0, v0, v1); VUZP2Q_32(v0, v0, v1); VFSUBQS(v0, d0, v0); @@ -439,7 +439,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(v0, 1); GETEX(v1, 0, 0); - q0 = fpu_get_scratch(dyn); + q0 = fpu_get_scratch(dyn, ninst); static float addsubps[4] = {-1.f, 1.f, -1.f, 1.f}; MAYUSE(addsubps); TABLE64(x2, (uintptr_t)&addsubps); @@ -472,7 +472,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); ORRw_mask(x4, xZR, 1, 0); //0x80000000 - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); for(int i=0; i<2; ++i) { BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); diff --git a/src/dynarec/arm64/dynarec_arm64_f30f.c b/src/dynarec/arm64/dynarec_arm64_f30f.c index c4ccbff4..079cd0bc 100644 --- a/src/dynarec/arm64/dynarec_arm64_f30f.c +++ b/src/dynarec/arm64/dynarec_arm64_f30f.c @@ -84,7 +84,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n } else { SMREAD(); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<4, 15, rex, NULL, 0, 0); - q1 = fpu_get_scratch(dyn); + q1 = fpu_get_scratch(dyn, ninst); VLD128(q1, ed, fixedaddress); } GETGX_empty(q0); @@ -99,7 +99,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n } else { SMREAD(); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<4, 15, rex, NULL, 0, 0); - q1 = fpu_get_scratch(dyn); + q1 = fpu_get_scratch(dyn, ninst); VLD128(q1, ed, fixedaddress); } GETGX_empty(q0); @@ -117,7 +117,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(v0, 1); GETED(0); - d1 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn, ninst); if(rex.w) { SCVTFSx(d1, ed); } else { @@ -158,7 +158,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n MSR_fpsr(x5); } u8 = sse_setround(dyn, ninst, x1, x2, x3); - d1 = fpu_get_scratch(dyn); + d1 = 
fpu_get_scratch(dyn, ninst); FRINTIS(d1, q0); x87_restoreround(dyn, ninst, u8); FCVTZSxwS(gd, d1); @@ -176,7 +176,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("SQRTSS Gx, Ex"); nextop = F8; GETGX(v0, 1); - d1 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn, ninst); GETEXSS(d0, 0, 0); FSQRTS(d1, d0); VMOVeS(v0, 0, d1, 0); @@ -186,8 +186,8 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(v0, 1); GETEXSS(v1, 0, 0); - d0 = fpu_get_scratch(dyn); - d1 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); + d1 = fpu_get_scratch(dyn, ninst); // so here: F32: Imm8 = abcd efgh that gives => aBbbbbbc defgh000 00000000 00000000 // and want 1.0f = 0x3f800000 // so 00111111 10000000 00000000 00000000 @@ -203,7 +203,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(v0, 1); GETEXSS(v1, 0, 0); - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); FMOVS_8(d0, 0b01110000); //1.0f FDIVS(d0, d0, v1); VMOVeS(v0, 0, d0, 0); @@ -213,7 +213,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("ADDSS Gx, Ex"); nextop = F8; GETGX(v0, 1); - d1 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn, ninst); GETEXSS(d0, 0, 0); FADDS(d1, v0, d0); // the high part of the vector is erased... VMOVeS(v0, 0, d1, 0); @@ -222,7 +222,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("MULSS Gx, Ex"); nextop = F8; GETGX(v0, 1); - d1 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn, ninst); GETEXSS(d0, 0, 0); FMULS(d1, v0, d0); VMOVeS(v0, 0, d1, 0); @@ -232,7 +232,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX(v0, 1); GETEXSS(v1, 0, 0); - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); FCVT_D_S(d0, v1); VMOVeD(v0, 0, d0, 0); break; @@ -248,7 +248,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); ORRw_mask(x4, xZR, 1, 0); //0x80000000 - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); for(int i=0; i<4; ++i) { BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); @@ -266,7 +266,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("SUBSS Gx, Ex"); nextop = F8; GETGX(v0, 1); - d1 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn, ninst); GETEXSS(d0, 0, 0); FSUBS(d1, v0, d0); VMOVeS(v0, 0, d1, 0); @@ -278,7 +278,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEXSS(v1, 0, 0); // MINSS: if any input is NaN, or Ex[0]<Gx[0], copy Ex[0] -> Gx[0] #if 0 - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); FMINNMS(d0, v0, v1); // NaN handling may be slightly different, is that a problem? 
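For reference, the RCPSS and RSQRTSS handling above materialises 1.0f with FMOVS_8(d0, 0b01110000). The 8-bit immediate abcdefgh expands to a : NOT(b) : bbbbb : cdefgh followed by 19 zero bits, which for 0b01110000 gives exactly the 0x3f800000 pattern the comment works out. A small stand-alone decoder, written only to illustrate the encoding:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sketch only: expand the AArch64 FMOV (scalar, immediate) 8-bit field
 * to a single-precision float: a : NOT(b) : b b b b b : cdefgh : 19 zeros. */
static float fmov_imm8_to_float(uint8_t imm8)
{
    uint32_t a      = (imm8 >> 7) & 1u;
    uint32_t b      = (imm8 >> 6) & 1u;
    uint32_t cdefgh = imm8 & 0x3Fu;
    uint32_t bits   = (a << 31) | ((b ^ 1u) << 30) | ((b ? 0x1Fu : 0u) << 25) | (cdefgh << 19);
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

int main(void)
{
    printf("imm8 0x%02x -> %f\n", 0x70u, fmov_imm8_to_float(0x70)); /* prints 1.000000 */
    return 0;
}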
VMOVeS(v0, 0, d0, 0); // to not erase uper part #else @@ -291,7 +291,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("DIVSS Gx, Ex"); nextop = F8; GETGX(v0, 1); - d1 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn, ninst); GETEXSS(d0, 0, 0); FDIVS(d1, v0, d0); VMOVeS(v0, 0, d1, 0); @@ -303,7 +303,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEXSS(v1, 0, 0); // MAXSS: if any input is NaN, or Ex[0]>Gx[0], copy Ex[0] -> Gx[0] #if 0 - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); FMAXNMS(d0, v0, v1); // NaN handling may be slightly different, is that a problem? VMOVeS(v0, 0, d0, 0); // to not erase uper part #else @@ -333,7 +333,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEX(v1, 0, 1) ; GETGX(v0, 1); u8 = F8; - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); if (u8 == 0b00000000 || u8 == 0b01010101 || u8 == 0b10101010 || u8 == 0b11111111) { VDUPQ_16(d0, v1, (u8 & 3) + 4); } else { @@ -386,7 +386,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n SETFLAGS(X_ALL, SF_SET); SET_DFNONE(x1); nextop = F8; - v1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn, ninst); GETGD; if(MODREG) { GETED(0); @@ -489,7 +489,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETEXSD(v1, 0, 0); GETGX_empty(v0); - d0 = fpu_get_scratch(dyn); + d0 = fpu_get_scratch(dyn, ninst); SXTL_32(v0, v1); SCVTQFD(v0, v0); // there is only I64 -> Double vector conversion, not from i32 break; diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c index 7c4bac22..3d23bd00 100644 --- a/src/dynarec/arm64/dynarec_arm64_functions.c +++ b/src/dynarec/arm64/dynarec_arm64_functions.c @@ -32,9 +32,15 @@ #define EMM0 8 // Get a FPU scratch reg -int fpu_get_scratch(dynarec_arm_t* dyn) +int fpu_get_scratch(dynarec_arm_t* dyn, int ninst) { - return SCRATCH0 + dyn->n.fpu_scratch++; // return an Sx + int ret = SCRATCH0 + dyn->n.fpu_scratch++; + if(dyn->n.neoncache[ret].t==NEON_CACHE_YMMR || dyn->n.neoncache[ret].t==NEON_CACHE_YMMW) { + // should only happens in step 0... + dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[ret].n); // mark as purged + dyn->n.neoncache[ret].v = 0; // reset it + } + return ret; } // Reset scratch regs counter void fpu_reset_scratch(dynarec_arm_t* dyn) @@ -42,10 +48,15 @@ void fpu_reset_scratch(dynarec_arm_t* dyn) dyn->n.fpu_scratch = 0; } // Get a x87 double reg -int fpu_get_reg_x87(dynarec_arm_t* dyn, int t, int n) +int fpu_get_reg_x87(dynarec_arm_t* dyn, int ninst, int t, int n) { int i=X870; while (dyn->n.fpuused[i]) ++i; + if(dyn->n.neoncache[i].t==NEON_CACHE_YMMR || dyn->n.neoncache[i].t==NEON_CACHE_YMMW) { + // should only happens in step 0... 
+ dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[i].n); // mark as purged + dyn->n.neoncache[i].v = 0; // reset it + } dyn->n.fpuused[i] = 1; dyn->n.neoncache[i].n = n; dyn->n.neoncache[i].t = t; @@ -61,13 +72,19 @@ void fpu_free_reg(dynarec_arm_t* dyn, int reg) dyn->n.neoncache[reg].v = 0; } // Get an MMX double reg -int fpu_get_reg_emm(dynarec_arm_t* dyn, int emm) +int fpu_get_reg_emm(dynarec_arm_t* dyn, int ninst, int emm) { - dyn->n.fpuused[EMM0 + emm] = 1; - dyn->n.neoncache[EMM0 + emm].t = NEON_CACHE_MM; - dyn->n.neoncache[EMM0 + emm].n = emm; - dyn->n.news |= (1<<(EMM0 + emm)); - return EMM0 + emm; + int ret = EMM0 + emm; + if(dyn->n.neoncache[ret].t==NEON_CACHE_YMMR || dyn->n.neoncache[ret].t==NEON_CACHE_YMMW) { + // should only happens in step 0... + dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[ret].n); // mark as purged + dyn->n.neoncache[ret].v = 0; // reset it + } + dyn->n.fpuused[ret] = 1; + dyn->n.neoncache[ret].t = NEON_CACHE_MM; + dyn->n.neoncache[ret].n = emm; + dyn->n.news |= (1<<(ret)); + return ret; } // Get an XMM quad reg int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm) @@ -84,6 +101,77 @@ int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm) dyn->n.news |= (1<<i); return i; } +static int internal_mark_ymm(dynarec_arm_t* dyn, int t, int ymm, int reg) +{ + if(dyn->n.neoncache[reg].t==NEON_CACHE_YMMR || dyn->n.neoncache[reg].t==NEON_CACHE_YMMW) { + if(dyn->n.neoncache[reg].n == ymm) { + // already there! + if(t==NEON_CACHE_YMMW) + dyn->n.neoncache[reg].t=t; + return reg; + } + return -1; + } else { + // found a slot! + dyn->n.neoncache[reg].t=t; + dyn->n.neoncache[reg].n=ymm; + return reg; + } + return -1; +} +static int is_ymm_to_keep(dynarec_arm_t* dyn, int reg, int k1, int k2, int k3) +{ + if(k1!=-1 && dyn->n.neoncache[reg].n==k1) + return 1; + if(k2!=-1 && dyn->n.neoncache[reg].n==k2) + return 1; + if(k3!=-1 && dyn->n.neoncache[reg].n==k3) + return 1; + return 0; +} +// Get an YMM quad reg, while preserving up to 3 other YMM regs +int fpu_get_reg_ymm(dynarec_arm_t* dyn, int ninst, int t, int ymm, int k1, int k2, int k3) +{ + int i = EMM0; + // first pass see if a slot is free in EMM/x87 slots + for(int j=0; j<8; ++j) { + if(!dyn->n.fpuused[i+j]) { + int ret = internal_mark_ymm(dyn, t, ymm, i+j); + if(ret>=0) return ret; + } + } + // no slot in the emm space, look for scratch space in reverse + i = SCRATCH0; + for(int j=7; j>=dyn->n.fpu_scratch; --j) { + int ret = internal_mark_ymm(dyn, t, ymm, i+j); + if(ret>=0) return ret; + } + // no free slot, needs to purge a value... First loop on the YMMR, they are easier to purge + i = EMM0; + int keep = 0; + for(int j=0; j<8; ++j) { + if(!dyn->n.fpuused[i+j]) { + // should a test be done to check if ymm is already in the purge list? + if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3) && dyn->n.neoncache[i+j].t==NEON_CACHE_YMMR) { + dyn->insts[ninst].purge_ymm |= 1<<dyn->n.neoncache[i+j].n; + dyn->n.neoncache[i+j].v = 0; + return internal_mark_ymm(dyn, t, ymm, i+j); + } + } + } + // make space in the scratch area + i = SCRATCH0; + for(int j=dyn->n.fpu_scratch; j<8; ++j) { + // should a test be done to check if ymm is already in the purge list? 
+ if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3)) { + dyn->insts[ninst].purge_ymm |= 1<<dyn->n.neoncache[i+j].n; + dyn->n.neoncache[i+j].v = 0; + return internal_mark_ymm(dyn, t, ymm, i+j); + } + } + printf_log(LOG_NONE, "BOX64 Dynarec: Error, unable to free a reg for YMM %d at inst=%d\n", ymm, ninst); + return i; +} // Reset fpu regs counter static void fpu_reset_reg_neoncache(neoncache_t* n) { @@ -343,10 +431,11 @@ int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) { return ((dyn->insts[ninst].x64.barrier&BARRIER_FLOAT))?0:(isCacheEmpty(dyn, ninst)?0:1); int ret = 0; if(!i2) { // just purge - if(dyn->insts[ninst].n.stack_next) { + if(dyn->insts[ninst].n.stack_next) return 1; - } - for(int i=0; i<24 && !ret; ++i) + if(dyn->insts[ninst].ymm_zero) + return 1; + for(int i=0; i<32 && !ret; ++i) if(dyn->insts[ninst].n.neoncache[i].v) { // there is something at ninst for i if(!( (dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F @@ -361,10 +450,12 @@ int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) { if(dyn->insts[ninst].n.stack_next != dyn->insts[i2].n.stack-dyn->insts[i2].n.stack_push) { return 1; } + if(dyn->insts[ninst].ymm_zero && (dyn->insts[ninst].ymm_zero&~dyn->insts[i2].ymm_zero)) + return 1; neoncache_t cache_i2 = dyn->insts[i2].n; neoncacheUnwind(&cache_i2); - for(int i=0; i<24; ++i) { + for(int i=0; i<32; ++i) { if(dyn->insts[ninst].n.neoncache[i].v) { // there is something at ninst for i if(!cache_i2.neoncache[i].v) { // but there is nothing at i2 for i ret = 1; @@ -374,6 +465,8 @@ int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) { } else if(dyn->insts[ninst].n.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW) {/* nothing */ } + else if(dyn->insts[ninst].n.neoncache[i].t == NEON_CACHE_YMMR && cache_i2.neoncache[i].t == NEON_CACHE_YMMW) + {/* nothing */ } else ret = 1; } @@ -648,6 +741,9 @@ static void sse_reset(neoncache_t* n) { for (int i=0; i<16; ++i) n->ssecache[i].v = -1; + for (int i=0; i<32; ++i) + if(n->neoncache[i].t==NEON_CACHE_YMMR || n->neoncache[i].t==NEON_CACHE_YMMW) + n->neoncache[i].v = 0; } void fpu_reset(dynarec_arm_t* dyn) diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h index abe827bb..7da65897 100644 --- a/src/dynarec/arm64/dynarec_arm64_functions.h +++ b/src/dynarec/arm64/dynarec_arm64_functions.h @@ -6,15 +6,17 @@ #define SCRATCH0 24 // Get an FPU scratch reg -int fpu_get_scratch(dynarec_arm_t* dyn); +int fpu_get_scratch(dynarec_arm_t* dyn, int ninst); // Reset scratch regs counter void fpu_reset_scratch(dynarec_arm_t* dyn); // Get an x87 double reg -int fpu_get_reg_x87(dynarec_arm_t* dyn, int t, int n); +int fpu_get_reg_x87(dynarec_arm_t* dyn, int ninst, int t, int n); // Get an MMX double reg -int fpu_get_reg_emm(dynarec_arm_t* dyn, int emm); +int fpu_get_reg_emm(dynarec_arm_t* dyn, int ninst, int emm); // Get an XMM quad reg int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm); +// Get an YMM upper quad reg, while keeping up to 3 other YMM reg (-1 to no keep) +int fpu_get_reg_ymm(dynarec_arm_t* dyn, int ninst, int t, int ymm, int k1, int k2, int k3); // Free a FPU/MMX/XMM reg void fpu_free_reg(dynarec_arm_t* dyn, int reg); // Reset fpu regs counter diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c index 5e406588..cdf0eeb6 100644 --- a/src/dynarec/arm64/dynarec_arm64_helper.c +++ b/src/dynarec/arm64/dynarec_arm64_helper.c @@ -958,7 +958,7 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t) 
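The ninst argument threaded through fpu_get_scratch, fpu_get_reg_x87 and fpu_get_reg_emm above serves a single purpose: when the slot being handed out currently caches a YMM upper half, that ymm is flagged in the instruction's purge_ymm bitmap and the neoncache entry is cleared, instead of being silently clobbered. fpu_get_reg_ymm itself looks for a home in this order: a free EMM/x87 slot, an unused scratch slot searched from the top down, then eviction of a read-only YMMR (no write-back needed), and finally eviction from the scratch area, always skipping the up-to-three registers passed as k1/k2/k3. A condensed, self-contained model of the shared eviction step, with simplified stand-in types rather than the real box64 structures:

#include <stdint.h>

/* Sketch only: simplified model of the YMM eviction done by the reworked
 * allocators. Sizes and field names here are invented for illustration. */
enum { CACHE_NONE, CACHE_YMMR, CACHE_YMMW };      /* read-only vs dirty upper half */

typedef struct { int t, n; } cache_entry_t;       /* kind + x86 register number    */

typedef struct {
    cache_entry_t neoncache[32];                  /* one entry per NEON register       */
    uint16_t      purge_ymm[1024];                /* per-instruction purge bitmap      */
    int           fpu_scratch;                    /* next scratch index above SCRATCH0 */
} dyn_model_t;

#define SCRATCH0 24

static void evict_ymm_if_any(dyn_model_t* dyn, int ninst, int reg)
{
    if (dyn->neoncache[reg].t == CACHE_YMMR || dyn->neoncache[reg].t == CACHE_YMMW) {
        dyn->purge_ymm[ninst] |= (uint16_t)(1u << dyn->neoncache[reg].n); /* flush at this inst */
        dyn->neoncache[reg].t = CACHE_NONE;
        dyn->neoncache[reg].n = 0;
    }
}

static int get_scratch(dyn_model_t* dyn, int ninst)
{
    int ret = SCRATCH0 + dyn->fpu_scratch++;
    evict_ymm_if_any(dyn, ninst, ret);            /* same check in the x87/EMM getters */
    return ret;
}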
++dyn->n.x87cache[i]; else if(ret==-1) { dyn->n.x87cache[i] = 0; - ret=dyn->n.x87reg[i]=fpu_get_reg_x87(dyn, t, 0); + ret=dyn->n.x87reg[i]=fpu_get_reg_x87(dyn, ninst, t, 0); dyn->n.neoncache[ret].t = X87_ST0; } } @@ -1274,7 +1274,7 @@ int x87_get_cache(dynarec_arm_t* dyn, int ninst, int populate, int s1, int s2, i ret = i; // found, setup and grab the value dyn->n.x87cache[ret] = st; - dyn->n.x87reg[ret] = fpu_get_reg_x87(dyn, NEON_CACHE_ST_D, st); + dyn->n.x87reg[ret] = fpu_get_reg_x87(dyn, ninst, NEON_CACHE_ST_D, st); if(populate) { ADDx_U12(s1, xEmu, offsetof(x64emu_t, x87)); LDRw_U12(s2, xEmu, offsetof(x64emu_t, top)); @@ -1394,7 +1394,7 @@ void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) ret = i; // found, setup and grab the value dyn->n.x87cache[ret] = st; - dyn->n.x87reg[ret] = fpu_get_reg_x87(dyn, NEON_CACHE_ST_D, st); + dyn->n.x87reg[ret] = fpu_get_reg_x87(dyn, ninst, NEON_CACHE_ST_D, st); ADDx_U12(s1, xEmu, offsetof(x64emu_t, x87)); LDRw_U12(s2, xEmu, offsetof(x64emu_t, top)); int a = st - dyn->n.x87stack; @@ -1544,7 +1544,7 @@ int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a) if(dyn->n.mmxcache[a]!=-1) return dyn->n.mmxcache[a]; ++dyn->n.mmxcount; - int ret = dyn->n.mmxcache[a] = fpu_get_reg_emm(dyn, a); + int ret = dyn->n.mmxcache[a] = fpu_get_reg_emm(dyn, ninst, a); VLDR64_U12(ret, xEmu, offsetof(x64emu_t, mmx[a])); return ret; } @@ -1556,7 +1556,7 @@ int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int if(dyn->n.mmxcache[a]!=-1) return dyn->n.mmxcache[a]; ++dyn->n.mmxcount; - int ret = dyn->n.mmxcache[a] = fpu_get_reg_emm(dyn, a); + int ret = dyn->n.mmxcache[a] = fpu_get_reg_emm(dyn, ninst, a); return ret; } // purge the MMX cache only(needs 3 scratch registers) @@ -1679,12 +1679,30 @@ static void sse_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1) MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":""); ++old; } + int s1_set = 0; for(int i=0; i<16; ++i) - if(is_avx_zero(dyn, ninst, i)) - STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i])); + if(is_avx_zero(dyn, ninst, i)) { + if(!s1_set) { + ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0])); + s1_set = 1; + } + STPx_S7_offset(xZR, xZR, s1, i*16); + } if(!next) avx_mark_zero_reset(dyn, ninst); } + for(int i=0; i<32; ++i) { + if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW) { + if (old==-1) { + MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":""); + ++old; + } + VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n])); + } + if(!next && (dyn->n.neoncache[i].t==NEON_CACHE_YMMW || dyn->n.neoncache[i].t==NEON_CACHE_YMMR)) + dyn->n.neoncache[i].v = 0; + } + // All done if(old!=-1) { MESSAGE(LOG_DUMP, "\t------ Purge SSE Cache\n"); } @@ -1697,16 +1715,30 @@ static void sse_reflectcache(dynarec_arm_t* dyn, int ninst, int s1) VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); } //AVX - if(dyn->ymm_zero) + if(dyn->ymm_zero) { + int s1_set = 0; for(int i=0; i<16; ++i) - if(is_avx_zero(dyn, ninst, i)) - STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i])); + if(is_avx_zero(dyn, ninst, i)) { + if(!s1_set) { + ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0])); + s1_set = 1; + } + STPx_S7_offset(xZR, xZR, s1, i*16); + } + } } void sse_reflect_reg(dynarec_arm_t* dyn, int ninst, int a) { - if(is_avx_zero(dyn, ninst, a)) - STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[a])); + if(is_avx_zero(dyn, ninst, a)) { + //only ymm[0] can be accessed with STP :( + if(!a) + STPx_S7_offset(xZR, 
xZR, xEmu, offsetof(x64emu_t, ymm[a])); + else { + STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a])); + STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a])+8); + } + } if(dyn->n.ssecache[a].v==-1) return; if(dyn->n.neoncache[dyn->n.ssecache[a].reg].t == NEON_CACHE_XMMW) { @@ -1716,24 +1748,82 @@ void sse_reflect_reg(dynarec_arm_t* dyn, int ninst, int a) } } +// AVX Helpers +// get neon register for a YMM upper reg, create the entry if needed +int ymm_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite, int k1, int k2, int k3) +{ + // look if already exist + for(int i=0; i<32; ++i) + if((dyn->n.neoncache[i].t==NEON_CACHE_YMMR || dyn->n.neoncache[i].t==NEON_CACHE_YMMW) && dyn->n.neoncache[i].n==a) { + if(forwrite) { + dyn->n.neoncache[i].t = NEON_CACHE_YMMW; + dyn->ymm_zero&=~(1<<a); + } + return i; + } + // nope, grab a new one + int ret = fpu_get_reg_ymm(dyn, ninst, forwrite?NEON_CACHE_YMMW:NEON_CACHE_YMMR, a, k1, k2, k3); + if(dyn->ymm_zero&(1<<a)) { + VEORQ(ret, ret, ret); + if(forwrite) + dyn->ymm_zero&=~(1<<a); + } else { + VLDR128_U12(ret, xEmu, offsetof(x64emu_t, ymm[a])); + } + return ret; +} +// get neon register for a YMM reg, but don't try to synch it if it needed to be created +int ymm_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a, int k1, int k2, int k3) +{ + // look if already exist + for(int i=0; i<32; ++i) + if((dyn->n.neoncache[i].t==NEON_CACHE_YMMR || dyn->n.neoncache[i].t==NEON_CACHE_YMMW) && dyn->n.neoncache[i].n==a) { + dyn->n.neoncache[i].t = NEON_CACHE_YMMW; + dyn->ymm_zero&=~(1<<a); + return i; + } + // nope, grab a new one + int ret = fpu_get_reg_ymm(dyn, ninst, NEON_CACHE_YMMW, a, k1, k2, k3); + if(dyn->ymm_zero&(1<<a)) + dyn->ymm_zero&=~(1<<a); + return ret; +} + + void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07) { int start = not07?8:0; // only SSE regs needs to be push back to xEmu (needs to be "write") int n=0; - for (int i=start; i<16; i++) + for (int i=start; i<16; i++) { if((dyn->n.ssecache[i].v!=-1) && (dyn->n.ssecache[i].write)) ++n; + if(is_avx_zero(dyn, ninst, i)) + ++n; + } + for(int i=0; i<32; ++i) + if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW) + ++n; if(!n) return; MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n); + int s1_set = 0; for (int i=start; i<16; ++i) { if((dyn->n.ssecache[i].v!=-1) && (dyn->n.ssecache[i].write)) { VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); } - if(is_avx_zero(dyn, ninst, i)) - STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i])); + if(is_avx_zero(dyn, ninst, i)) { + if(!s1_set) { + ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0])); + s1_set = 1; + } + STPx_S7_offset(xZR, xZR, s1, i*16); + } } + // purge the YMM values + for(int i=0; i<32; ++i) + if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW) + VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n])); MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n); } @@ -1754,6 +1844,9 @@ void fpu_popcache(dynarec_arm_t* dyn, int ninst, int s1, int not07) /*dyn->n.ssecache[i].write = 0; // OPTIM: it's sync, so not write anymore dyn->n.neoncache[dyn->n.ssecache[i].reg].t = NEON_CACHE_XMMR;*/ } + for(int i=0; i<32; ++i) + if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW) + VLDR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n])); MESSAGE(LOG_DUMP, "\t------- Pop XMM Cache (%d)\n", n); } @@ -1988,7 +2081,7 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) { -#if STEP > 1 +#if 1//STEP 
> 1 int i2 = dyn->insts[ninst].x64.jmp_insts; if(i2<0) return; @@ -2101,6 +2194,19 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int } } } + // ymm0 + s3_top = 1; + if(dyn->ymm_zero && (dyn->ymm_zero&~dyn->insts[i2].ymm_zero)) { + for(int i=0; i<16; ++i) + if(dyn->insts[i2].purge_ymm&(1<<i)) + if(is_avx_zero(dyn, ninst, i)) { + if(s3_top) { + ADDx_U12(s3, xEmu,offsetof(x64emu_t, ymm[0])); + s3_top = 0; + } + STPx_S7_offset(xZR, xZR, s3, i*16); + } + } if(stack_cnt != cache_i2.stack) { MESSAGE(LOG_DUMP, "\t - adjust stack count %d -> %d -\n", stack_cnt, cache_i2.stack); int a = stack_cnt - cache_i2.stack; @@ -2355,13 +2461,29 @@ void fpu_propagate_stack(dynarec_arm_t* dyn, int ninst) dyn->n.swapped = 0; } -void avx_purge_ymm0(dynarec_arm_t* dyn, int ninst) +void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, int s1) { - if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Purge YMM Zero mask=%04x --------\n", dyn->insts[ninst].purge_ymm0); + if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Purge YMM mask=%04x --------\n", dyn->insts[ninst].purge_ymm); + int s1_set = 0; for(int i=0; i<16; ++i) - if(dyn->insts[ninst].purge_ymm0&(1<<i) && is_avx_zero(dyn, ninst, i)) { - STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i])); - avx_unmark_zero(dyn, ninst, i); + if(dyn->insts[ninst].purge_ymm&(1<<i)) { + if(is_avx_zero(dyn, ninst, i)) { + if(!s1_set) { + ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0])); + s1_set = 1; + } + STPx_S7_offset(xZR, xZR, s1, i*16); + avx_unmark_zero(dyn, ninst, i); + } + int reg = -1; + for(int j=0; j<32; ++j) + if(dyn->n.neoncache[j].t==NEON_CACHE_YMMR && dyn->n.neoncache[j].n==i) { + // just forget the reg.... + dyn->n.neoncache[j].v = 0; + } else if(dyn->n.neoncache[j].t==NEON_CACHE_YMMW && dyn->n.neoncache[j].n==i) { + VSTR128_U12(j, xEmu, offsetof(x64emu_t, ymm[i])); + dyn->n.neoncache[j].v = 0; + } } - if(box64_dynarec_dump) dynarec_log(LOG_NONE, "---------- Purge YMM Zero\n"); + if(box64_dynarec_dump) dynarec_log(LOG_NONE, "---------- Purge YMM\n"); } \ No newline at end of file diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h index 36950291..fc06d358 100644 --- a/src/dynarec/arm64/dynarec_arm64_helper.h +++ b/src/dynarec/arm64/dynarec_arm64_helper.h @@ -457,6 +457,10 @@ #define GETVX_empty(a) \ a = sse_get_reg_empty(dyn, ninst, x1, vex.v) +#define GETGY_empty_VY(a, b, w2, k1, k2) \ + b = ymm_get_reg(dyn, ninst, x1, vex.v, w2, gd, k1, k2); \ + a = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, k1, k2) + // Get EX as a quad, (x1 is used) #define GETEX(a, w, D) \ if(MODREG) { \ @@ -464,7 +468,7 @@ } else { \ if(w) {WILLWRITE2();} else {SMREAD();} \ addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<4, 15, rex, NULL, 0, D); \ - a = fpu_get_scratch(dyn); \ + a = fpu_get_scratch(dyn, ninst); \ VLD128(a, ed, fixedaddress); \ } @@ -482,7 +486,7 @@ a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w); \ } else { \ if(w) {WILLWRITE2();} else {SMREAD();} \ - a = fpu_get_scratch(dyn); \ + a = fpu_get_scratch(dyn, ninst); \ addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, D); \ VLD64(a, ed, fixedaddress); \ } @@ -496,7 +500,7 @@ a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w); \ } else { \ if(w) {WILLWRITE2();} else {SMREAD();} \ - a = fpu_get_scratch(dyn); \ + a = fpu_get_scratch(dyn, ninst); \ addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, D); \ VLD32(a, 
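Several of the zeroing loops above (sse_purgecache, sse_reflectcache, fpu_pushcache, fpuCacheTransform and avx_purge_ymm) switch from storing through xEmu directly to computing the address of ymm[0] once with ADDx_U12 and indexing from that base. The reason is the one given in sse_reflect_reg, "only ymm[0] can be accessed with STP": the signed-offset form of STP encodes a 7-bit immediate scaled by the register size, so 64-bit stores can only reach -512..+504 bytes from the base register, whereas i*16 from a precomputed &ymm[0] always fits. A quick range check, for reference:

#include <stdint.h>

/* Sketch only: offset range of STP Xt1, Xt2, [Xn, #imm] (signed-offset form).
 * imm is a signed 7-bit value scaled by 8, i.e. multiples of 8 in -512..+504. */
static int stp64_offset_encodable(int64_t off)
{
    return (off % 8) == 0 && off >= -512 && off <= 504;
}

With the base set to &emu->ymm[0], every STPx_S7_offset(xZR, xZR, s1, i*16) uses offsets 0..240, well inside that window; sse_reflect_reg, which has no spare register for a base, falls back to two STRx_U12 stores instead.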
ed, fixedaddress); \ } @@ -510,7 +514,7 @@ a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w); \ } else { \ if(w) {WILLWRITE2();} else {SMREAD();} \ - a = fpu_get_scratch(dyn); \ + a = fpu_get_scratch(dyn, ninst); \ addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, D); \ VLD16(a, ed, fixedaddress); \ } @@ -527,7 +531,7 @@ } else { \ SMREAD(); \ addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, D); \ - a = fpu_get_scratch(dyn); \ + a = fpu_get_scratch(dyn, ninst); \ VLD64(a, ed, fixedaddress); \ } @@ -1163,6 +1167,8 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr); #define sse_forget_reg STEPNAME(sse_forget_reg) #define sse_purge07cache STEPNAME(sse_purge07cache) #define sse_reflect_reg STEPNAME(sse_reflect_reg) +#define ymm_get_reg STEPNAME(ymm_get_reg) +#define ymm_get_reg_empty STEPNAME(ymm_get_reg_empty) #define fpu_pushcache STEPNAME(fpu_pushcache) #define fpu_popcache STEPNAME(fpu_popcache) @@ -1173,7 +1179,7 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr); #define x87_purgecache STEPNAME(x87_purgecache) #define fpu_reflectcache STEPNAME(fpu_reflectcache) #define fpu_unreflectcache STEPNAME(fpu_unreflectcache) -#define avx_purge_ymm0 STEPNAME(avx_purge_ymm0) +#define avx_purge_ymm STEPNAME(avx_purge_ymm) #define CacheTransform STEPNAME(CacheTransform) @@ -1334,8 +1340,8 @@ int x87_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3); void x87_restoreround(dynarec_arm_t* dyn, int ninst, int s1); // Set rounding according to mxcsr flags, return reg to restore flags int sse_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3); -// purge ymm_zero mask according to purge_ymm0 -void avx_purge_ymm0(dynarec_arm_t* dyn, int ninst); +// purge ymm_zero mask according to purge_ymm +void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, int s1); void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3); @@ -1409,6 +1415,12 @@ void fpu_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3); void fpu_unreflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3); void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07); void fpu_popcache(dynarec_arm_t* dyn, int ninst, int s1, int not07); +// avx helpers +// get neon register for a SSE reg, create the entry if needed +int ymm_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite, int k1, int k2, int k3); +// get neon register for a SSE reg, but don't try to synch it if it needed to be created +int ymm_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a, int k1, int k2, int k3); + uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); @@ -1593,6 +1605,6 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int } \ } -#define PURGE_YMM0() avx_purge_ymm0(dyn, ninst) +#define PURGE_YMM() avx_purge_ymm(dyn, ninst, x1) #endif //__DYNAREC_ARM64_HELPER_H__ diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h index 2788ddc4..6a6647df 100644 --- a/src/dynarec/arm64/dynarec_arm64_private.h +++ b/src/dynarec/arm64/dynarec_arm64_private.h @@ -82,7 +82,7 @@ typedef struct instruction_arm64_s { uintptr_t natcall; uint16_t retn; uint16_t ymm_zero; // bitmap of ymm to zero at purge 
- uint16_t purge_ymm0; // need to purge some ymm0 because of a loop + uint16_t purge_ymm; // need to purge some ymm uint8_t barrier_maybe; uint8_t will_write; uint8_t last_write; diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c index 4f509ac1..649ba1a1 100644 --- a/src/dynarec/dynarec_native.c +++ b/src/dynarec/dynarec_native.c @@ -559,7 +559,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit if(helper.insts[i].ymm_zero || helper.insts[k].ymm_zero) { // move to pureg the reg that are present in k (jump to) but not in i (jump from) uint16_t to_purge = helper.insts[k].ymm_zero & ~helper.insts[i].ymm_zero; - helper.insts[k].purge_ymm0 |= to_purge; + helper.insts[k].purge_ymm |= to_purge; helper.insts[k].ymm_zero &= ~to_purge; } } diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c index cab03222..60c37b4b 100644 --- a/src/dynarec/dynarec_native_pass.c +++ b/src/dynarec/dynarec_native_pass.c @@ -80,8 +80,8 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int } #endif fpu_propagate_stack(dyn, ninst); - if(dyn->insts[ninst].purge_ymm0) - PURGE_YMM0(); + if(dyn->insts[ninst].purge_ymm) + PURGE_YMM(); ip = addr; if (reset_n!=-1) { dyn->last_ip = 0; diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h index def767b4..484c1324 100644 --- a/src/dynarec/la64/dynarec_la64_helper.h +++ b/src/dynarec/la64/dynarec_la64_helper.h @@ -1075,6 +1075,6 @@ uintptr_t dynarec64_F20F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int } \ } while (0) -#define PURGE_YMM0() /* TODO */ +#define PURGE_YMM() /* TODO */ #endif //__DYNAREC_LA64_HELPER_H__ \ No newline at end of file diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h index b31d3f2e..09b6698e 100644 --- a/src/dynarec/la64/dynarec_la64_private.h +++ b/src/dynarec/la64/dynarec_la64_private.h @@ -81,7 +81,7 @@ typedef struct instruction_la64_s { uintptr_t natcall; uint16_t retn; uint16_t ymm_zero; // bitmap of ymm to zero at purge - uint16_t purge_ymm0; // need to purge some ymm0 because of a loop + uint16_t purge_ymm; // need to purge some ymm uint8_t barrier_maybe; uint8_t will_write; uint8_t last_write; diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h index 2471c71c..3935ee1c 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.h +++ b/src/dynarec/rv64/dynarec_rv64_helper.h @@ -1671,4 +1671,14 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int #define PURGE_YMM0() /* TODO */ +// reg = (reg < -32768) ? -32768 : ((reg > 32767) ? 32767 : reg) +#define SAT16(reg, s) \ + LUI(s, 0xFFFF8); /* -32768 */ \ + BGE(reg, s, 4 + 2 * 4); \ + MV(reg, s); \ + J(4 + 4 * 3); \ + LUI(s, 8); /* 32768 */ \ + BLT(reg, s, 4 + 4); \ + ADDIW(reg, s, -1); + #endif //__DYNAREC_RV64_HELPER_H__ diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h index dff6f84e..1ba830d5 100644 --- a/src/dynarec/rv64/dynarec_rv64_private.h +++ b/src/dynarec/rv64/dynarec_rv64_private.h @@ -91,7 +91,7 @@ typedef struct instruction_rv64_s { uintptr_t natcall; uint16_t retn; uint16_t ymm_zero; // bitmap of ymm to zero at purge - uint16_t purge_ymm0; // need to purge some ymm0 because of a loop + uint16_t purge_ymm; // need to purge some ymm int barrier_maybe; flagcache_t f_exit; // flags status at end of intruction extcache_t e; // extcache at end of intruction (but before poping) |
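The new RV64 SAT16 macro at the end of the diff clamps a value to the signed 16-bit range with a LUI/branch ladder: load -32768 and take it if the value is below, otherwise load 32768 and replace the value with 32767 if it is not below that bound. In plain C the same computation is simply:

#include <stdint.h>

/* Sketch only: what SAT16(reg, s) leaves in reg, mirroring the macro's
 * two-sided compare-and-branch (s is just a scratch register there). */
static int32_t sat16(int32_t v)
{
    if (v < -32768) return -32768;   /* LUI s, 0xFFFF8 ; BGE skips the MV     */
    if (v >  32767) return  32767;   /* LUI s, 8 ; BLT skips the ADDIW s, -1  */
    return v;
}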