diff options
| author | ptitSeb <sebastien.chev@gmail.com> | 2025-06-23 13:00:45 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2025-06-23 13:00:45 +0200 |
| commit | c66630da497c18622205cc58cb058a1f8cba7cd1 (patch) | |
| tree | a6c9c55bff71ecfdd80b9d2bff3b15e6c8724ce0 /src | |
| parent | 0cc58732fb3f6992918b10d5da7a9937edb4a0b4 (diff) | |
| download | box64-c66630da497c18622205cc58cb058a1f8cba7cd1.tar.gz box64-c66630da497c18622205cc58cb058a1f8cba7cd1.zip | |
[DYNAREC] Refactored a bit BARRIER_FLOAT ([ARM64] only for now, todo for RV64 and LA64)
Diffstat (limited to 'src')
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_00.c | 14 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_0f.c | 9 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_66.c | 6 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_67.c | 1 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_67_32.c | 1 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_d9.c | 8 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_dd.c | 5 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_f20f.c | 1 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_functions.c | 62 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_functions.h | 3 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_helper.c | 28 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_private.h | 4 | ||||
| -rw-r--r-- | src/dynarec/dynarec_arch.h | 2 | ||||
| -rw-r--r-- | src/dynarec/dynarec_native.c | 12 | ||||
| -rw-r--r-- | src/tools/env.c | 2 |
15 files changed, 116 insertions, 42 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c index 219fa44c..6273933e 100644 --- a/src/dynarec/arm64/dynarec_arm64_00.c +++ b/src/dynarec/arm64/dynarec_arm64_00.c @@ -1072,7 +1072,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin #define GO(GETFLAGS, NO, YES, F) \ READFLAGS(F); \ i8 = F8S; \ - BARRIER(BARRIER_MAYBE); \ JUMP(addr+i8, 1); \ GETFLAGS; \ if(dyn->insts[ninst].x64.jmp_insts==-1 || \ @@ -2385,7 +2384,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin if(BOX64DRENV(dynarec_safeflags)) { READFLAGS(X_PEND); // lets play safe here too } - fpu_purgecache(dyn, ninst, 1, x1, x2, x3); // using next, even if there no next + BARRIER(BARRIER_FLOAT); i32 = F16; retn_to_epilog(dyn, ip, ninst, rex, i32); *need_epilog = 0; @@ -2397,7 +2396,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin if(BOX64DRENV(dynarec_safeflags)) { READFLAGS(X_PEND); // so instead, force the deferred flags, so it's not too slow, and flags are not lost } - fpu_purgecache(dyn, ninst, 1, x1, x2, x3); // using next, even if there no next + BARRIER(BARRIER_FLOAT); ret_to_epilog(dyn, ip, ninst, rex); *need_epilog = 0; *ok = 0; @@ -3282,7 +3281,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin addr = dynarec64_DF(dyn, addr, ip, ninst, rex, rep, ok, need_epilog); break; #define GO(Z) \ - BARRIER(BARRIER_MAYBE); \ JUMP(addr+i8, 1); \ if(dyn->insts[ninst].x64.jmp_insts==-1 || \ CHECK_CACHE()) { \ @@ -3450,7 +3448,8 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin } else { MOV64x(x2, addr); } - fpu_purgecache(dyn, ninst, 1, x1, x3, x4); + BARRIER(BARRIER_FLOAT); + //fpu_purgecache(dyn, ninst, 0, x1, x3, x4); PUSH1z(x2); if (BOX64DRENV(dynarec_callret)) { SET_HASCALLRET(); @@ -3500,7 +3499,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 
0xE9: case 0xEB: - BARRIER(BARRIER_MAYBE); + BARRIER(BARRIER_MAYBE); // there will be a barrier if there is a jump out if(opcode==0xEB && PK(0)==0xFF) { INST_NAME("JMP ib"); MESSAGE(LOG_DEBUG, "Hack for EB FF opcode"); @@ -3522,7 +3521,8 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin if(dyn->insts[ninst].x64.jmp_insts==-1) { // out of the block SET_NODF(); - fpu_purgecache(dyn, ninst, 1, x1, x2, x3); + BARRIER(BARRIER_FLOAT); + //fpu_purgecache(dyn, ninst, 0, x1, x2, x3); jump_to_next(dyn, j64, 0, ninst, rex.is32bits); } else { // inside the block diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c index 8842d43f..83f7b3b4 100644 --- a/src/dynarec/arm64/dynarec_arm64_0f.c +++ b/src/dynarec/arm64/dynarec_arm64_0f.c @@ -1673,7 +1673,6 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin j64 = (uint32_t)(addr+i32_); \ else \ j64 = addr+i32_; \ - BARRIER(BARRIER_MAYBE); \ JUMP(j64, 1); \ GETFLAGS; \ if(dyn->insts[ninst].x64.jmp_insts==-1 || \ @@ -1918,7 +1917,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0: INST_NAME("FXSAVE Ed"); MESSAGE(LOG_DUMP, "Need Optimization (FXSAVE)\n"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + BARRIER(BARRIER_FLOAT); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); if(ed!=x1) {MOVx_REG(x1, ed);} CALL(rex.is32bits?const_fpu_fxsave32:const_fpu_fxsave64, -1); @@ -1926,7 +1925,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 1: INST_NAME("FXRSTOR Ed"); MESSAGE(LOG_DUMP, "Need Optimization (FXRSTOR)\n"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + BARRIER(BARRIER_FLOAT); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); if(ed!=x1) {MOVx_REG(x1, ed);} CALL(rex.is32bits?const_fpu_fxrstor32:const_fpu_fxrstor64, -1); @@ -1986,7 +1985,7 @@ uintptr_t 
dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 4: INST_NAME("XSAVE Ed"); MESSAGE(LOG_DUMP, "Need Optimization (XSAVE)\n"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + BARRIER(BARRIER_FLOAT); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); if(ed!=x1) {MOVx_REG(x1, ed);} MOV32w(x2, rex.w?0:1); @@ -1995,7 +1994,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 5: INST_NAME("XRSTOR Ed"); MESSAGE(LOG_DUMP, "Need Optimization (XRSTOR)\n"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + BARRIER(BARRIER_FLOAT); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); if(ed!=x1) {MOVx_REG(x1, ed);} MOV32w(x2, rex.w?0:1); diff --git a/src/dynarec/arm64/dynarec_arm64_66.c b/src/dynarec/arm64/dynarec_arm64_66.c index 30b4ea0e..e99e3cef 100644 --- a/src/dynarec/arm64/dynarec_arm64_66.c +++ b/src/dynarec/arm64/dynarec_arm64_66.c @@ -1361,7 +1361,7 @@ uintptr_t dynarec64_66(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 6: INST_NAME("FNSTENV Ed"); MESSAGE(LOG_DUMP, "Need Optimization (FNSTENV16)\n"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE? + BARRIER(BARRIER_FLOAT); // maybe only x87, not SSE? 
addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); if(ed!=x1) {MOVx_REG(x1, ed);} MOV32w(x2, 1); @@ -1381,7 +1381,7 @@ uintptr_t dynarec64_66(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 4: INST_NAME("FRSTOR Ed"); MESSAGE(LOG_DUMP, "Need Optimization (FRSTOR16)\n"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + BARRIER(BARRIER_FLOAT); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); if(ed!=x1) {MOVx_REG(x1, ed);} CALL(const_native_frstor16, -1); @@ -1389,7 +1389,7 @@ uintptr_t dynarec64_66(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 6: INST_NAME("FNSAVE Ed"); MESSAGE(LOG_DUMP, "Need Optimization (FNSAVE16)\n"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + BARRIER(BARRIER_FLOAT); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); if(ed!=x1) {MOVx_REG(x1, ed);} CALL(const_native_fsave16, -1); diff --git a/src/dynarec/arm64/dynarec_arm64_67.c b/src/dynarec/arm64/dynarec_arm64_67.c index e6755147..0e64eaf1 100644 --- a/src/dynarec/arm64/dynarec_arm64_67.c +++ b/src/dynarec/arm64/dynarec_arm64_67.c @@ -1425,7 +1425,6 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin #define GO(NO, YES) \ - BARRIER(BARRIER_MAYBE); \ JUMP(addr+i8, 1); \ if(dyn->insts[ninst].x64.jmp_insts==-1 || \ CHECK_CACHE()) { \ diff --git a/src/dynarec/arm64/dynarec_arm64_67_32.c b/src/dynarec/arm64/dynarec_arm64_67_32.c index 477b41be..ec1fa1e5 100644 --- a/src/dynarec/arm64/dynarec_arm64_67_32.c +++ b/src/dynarec/arm64/dynarec_arm64_67_32.c @@ -89,7 +89,6 @@ uintptr_t dynarec64_67_32(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int break; #define GO(NO, YES) \ - BARRIER(BARRIER_MAYBE); \ JUMP(addr+i8, 1); \ if(dyn->insts[ninst].x64.jmp_insts==-1 || \ CHECK_CACHE()) { \ diff --git a/src/dynarec/arm64/dynarec_arm64_d9.c b/src/dynarec/arm64/dynarec_arm64_d9.c index b8cfe6e0..0d024778 100644 --- 
a/src/dynarec/arm64/dynarec_arm64_d9.c +++ b/src/dynarec/arm64/dynarec_arm64_d9.c @@ -356,7 +356,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 0xF6: INST_NAME("FDECSTP"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + BARRIER(BARRIER_FLOAT); LDRw_U12(x2, xEmu, offsetof(x64emu_t, top)); SUBw_U12(x2, x2, 1); ANDw_mask(x2, x2, 0, 2); //mask=7 @@ -364,7 +364,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 0xF7: INST_NAME("FINCSTP"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + BARRIER(BARRIER_FLOAT); LDRw_U12(x2, xEmu, offsetof(x64emu_t, top)); ADDw_U12(x2, x2, 1); ANDw_mask(x2, x2, 0, 2); //mask=7 @@ -525,7 +525,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 4: INST_NAME("FLDENV Ed"); MESSAGE(LOG_DUMP, "Need Optimization (FLDENV)\n"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE? + BARRIER(BARRIER_FLOAT); // maybe only x87, not SSE? addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); if(ed!=x1) { MOVx_REG(x1, ed); @@ -544,7 +544,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 6: INST_NAME("FNSTENV Ed"); MESSAGE(LOG_DUMP, "Need Optimization (FNSTENV)\n"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE? + BARRIER(BARRIER_FLOAT); // maybe only x87, not SSE? 
addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); if(ed!=x1) { MOVx_REG(x1, ed); diff --git a/src/dynarec/arm64/dynarec_arm64_dd.c b/src/dynarec/arm64/dynarec_arm64_dd.c index fe640ef3..9ce8c908 100644 --- a/src/dynarec/arm64/dynarec_arm64_dd.c +++ b/src/dynarec/arm64/dynarec_arm64_dd.c @@ -186,7 +186,7 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 4: INST_NAME("FRSTOR m108byte"); MESSAGE(LOG_DUMP, "Need Optimization (FRSTOR)\n"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + BARRIER(BARRIER_FLOAT); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); if(ed!=x1) {MOVx_REG(x1, ed);} CALL(const_native_frstor, -1); @@ -194,7 +194,7 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 6: INST_NAME("FNSAVE m108byte"); MESSAGE(LOG_DUMP, "Need Optimization (FNSAVE)\n"); - fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + BARRIER(BARRIER_FLOAT); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); if(ed!=x1) {MOVx_REG(x1, ed);} CALL(const_native_fsave, -1); @@ -202,7 +202,6 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 7: INST_NAME("FNSTSW m2byte"); - //fpu_purgecache(dyn, ninst, 0, x1, x2, x3); addr = geted(dyn, addr, ninst, nextop, &ed, x4, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0); LDRw_U12(x2, xEmu, offsetof(x64emu_t, top)); LDRH_U12(x3, xEmu, offsetof(x64emu_t, sw)); diff --git a/src/dynarec/arm64/dynarec_arm64_f20f.c b/src/dynarec/arm64/dynarec_arm64_f20f.c index e6289479..5d620da6 100644 --- a/src/dynarec/arm64/dynarec_arm64_f20f.c +++ b/src/dynarec/arm64/dynarec_arm64_f20f.c @@ -456,7 +456,6 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n j64 = (uint32_t)(addr+i32_); \ else \ j64 = addr+i32_; \ - BARRIER(BARRIER_MAYBE); \ JUMP(j64, 1); \ GETFLAGS; \ 
if(dyn->insts[ninst].x64.jmp_insts==-1 || \ diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c index 9ecd4d29..d17eee5f 100644 --- a/src/dynarec/arm64/dynarec_arm64_functions.c +++ b/src/dynarec/arm64/dynarec_arm64_functions.c @@ -1236,4 +1236,66 @@ void updateUneeded(dynarec_arm_t* dyn) if(dyn->insts[ninst].n.ymm_unneeded&(1<<i)) propagateYMMUneeded(dyn, ninst, i); } +} + +void tryEarlyFpuBarrier(dynarec_arm_t* dyn, int last_fpu_used, int ninst) +{ + // there is a barrier at ninst + // check if, up to last fpu_used, if there is some suspicious jump that would prevent the barrier to be put earlier + int usefull = 0; + for(int i=ninst-1; i>last_fpu_used; --i) + { + if(!dyn->insts[i].x64.has_next) + return; // break of chain, don't try to be smart for now + if(dyn->insts[i].x64.barrier&BARRIER_FLOAT) + return; // already done? + if(dyn->insts[i].x64.jmp && dyn->insts[i].x64.jmp_insts==-1) + usefull = 1; + if(dyn->insts[i].x64.jmp && dyn->insts[i].x64.jmp_insts!=-1) { + int i2 = dyn->insts[i].x64.jmp_insts; + if(i2<last_fpu_used || i2>ninst) { + // check if some xmm/ymm/x87 stack are used in landing point + if(i2>ninst) { + if(dyn->insts[i2].n.xmm_used || dyn->insts[i2].n.ymm_used || dyn->insts[i2].n.stack) + return; + } + // we will stop there, not trying to guess too much thing + if((usefull && (i+1)!=ninst)) { + if(BOX64ENV(dynarec_dump) || BOX64ENV(dynarec_log)>1) dynarec_log(LOG_NONE, "Putting early Float Barrier in %d for %d\n", i+1, ninst); + dyn->insts[i+1].x64.barrier|=BARRIER_FLOAT; + } + return; + } + usefull = 1; + } + for(int pred=0; pred<dyn->insts[i].pred_sz; ++pred) { + if(dyn->insts[i].pred[pred]<=last_fpu_used) { + if(usefull && ((i+1)!=ninst)) { + if(BOX64ENV(dynarec_dump) || BOX64ENV(dynarec_log)>1) dynarec_log(LOG_NONE, "Putting early Float Barrier in %d for %d\n", i+1, ninst); + dyn->insts[i+1].x64.barrier|=BARRIER_FLOAT; + } + return; + } + } + if(dyn->insts[i].pred_sz>1) + usefull = 1; + } + 
if(usefull) { + if(BOX64ENV(dynarec_dump) || BOX64ENV(dynarec_log)>1) dynarec_log(LOG_NONE, "Putting early Float Barrier in %d for %d\n", last_fpu_used, ninst); + dyn->insts[last_fpu_used+1].x64.barrier|=BARRIER_FLOAT; + } +} + +void propagateFpuBarrier(dynarec_arm_t* dyn) +{ + int last_fpu_used = -1; + for(int ninst=0; ninst<dyn->size; ++ninst) { + int fpu_used = dyn->insts[ninst].n.xmm_used || dyn->insts[ninst].n.ymm_used || dyn->insts[ninst].mmx_used || dyn->insts[ninst].x87_used; + if(fpu_used) last_fpu_used = ninst; + dyn->insts[ninst].fpu_used = fpu_used; + if(dyn->insts[ninst].fpupurge && (last_fpu_used!=-1) && (last_fpu_used!=(ninst-1))) { + tryEarlyFpuBarrier(dyn, last_fpu_used, ninst); + last_fpu_used = -1; // reset the last_fpu_used... + } + } } \ No newline at end of file diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h index c2d88150..d32dbddd 100644 --- a/src/dynarec/arm64/dynarec_arm64_functions.h +++ b/src/dynarec/arm64/dynarec_arm64_functions.h @@ -86,7 +86,8 @@ void fpu_reset_ninst(dynarec_native_t* dyn, int ninst); // is st freed int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st); +// propage FPU_BARRIER to trigger it as soon as possible (avoiding fetching an FPU reg if it's unused) +void propagateFpuBarrier(dynarec_arm_t* dyn); // propage the uneeded flags on XMM/YMM regs (done between step 0 and step 1) void updateUneeded(dynarec_arm_t* dyn); - #endif //__DYNAREC_ARM_FUNCTIONS_H__ diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c index e6ee1b70..643c2a97 100644 --- a/src/dynarec/arm64/dynarec_arm64_helper.c +++ b/src/dynarec/arm64/dynarec_arm64_helper.c @@ -1028,6 +1028,7 @@ int neoncache_st_coherency(dynarec_arm_t* dyn, int ninst, int a, int b) // the reg returned is *2 for FLOAT int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t) { + dyn->insts[ninst].x87_used = 1; if(dyn->n.mmxcount) mmx_purgecache(dyn, ninst, 0, s1); 
dyn->n.x87stack+=1; @@ -1062,6 +1063,7 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t) } void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1) { + dyn->insts[ninst].x87_used = 1; if(dyn->n.mmxcount) mmx_purgecache(dyn, ninst, 0, s1); dyn->n.x87stack+=1; @@ -1111,6 +1113,7 @@ static int internal_x87_dofree(dynarec_arm_t* dyn) } void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1) { + dyn->insts[ninst].x87_used = 1; if(dyn->n.mmxcount) mmx_purgecache(dyn, ninst, 0, s1); do { @@ -1193,17 +1196,7 @@ void x87_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int for (int i=0; i<8; ++i) if(dyn->n.x87cache[i]!=-1) { int st = dyn->n.x87cache[i]+dyn->n.stack_pop; - #if STEP == 1 - if(!next) { // don't force promotion here - // pre-apply pop, because purge happens in-between - neoncache_promote_double(dyn, ninst, st); - } - #endif - #if STEP == 3 - if(!next && neoncache_get_current_st(dyn, ninst, st)!=NEON_CACHE_ST_D) { - MESSAGE(LOG_DUMP, "Warning, incoherency with purged ST%d cache\n", st); - } - #endif + // don't force promotion here ADDw_U12(s3, s2, dyn->n.x87cache[i]); // unadjusted count, as it's relative to real top ANDw_mask(s3, s3, 0, 2); //mask=7 // (emu->top + st)&7 switch(neoncache_get_current_st(dyn, ninst, st)) { @@ -1378,6 +1371,7 @@ void x87_unreflectcount(dynarec_arm_t* dyn, int ninst, int s1, int s2) int x87_get_current_cache(dynarec_arm_t* dyn, int ninst, int st, int t) { + dyn->insts[ninst].x87_used = 1; // search in cache first for (int i=0; i<8; ++i) { if(dyn->n.x87cache[i]==st) { @@ -1398,6 +1392,7 @@ int x87_get_current_cache(dynarec_arm_t* dyn, int ninst, int st, int t) int x87_get_cache(dynarec_arm_t* dyn, int ninst, int populate, int s1, int s2, int st, int t) { + dyn->insts[ninst].x87_used = 1; if(dyn->n.mmxcount) mmx_purgecache(dyn, ninst, 0, s1); int ret = x87_get_current_cache(dyn, ninst, st, t); @@ -1431,6 +1426,7 @@ int x87_get_cache(dynarec_arm_t* dyn, int ninst, int populate, int s1, int s2, i } 
int x87_get_neoncache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) { + dyn->insts[ninst].x87_used = 1; for(int ii=0; ii<24; ++ii) if((dyn->n.neoncache[ii].t == NEON_CACHE_ST_F || dyn->n.neoncache[ii].t == NEON_CACHE_ST_D @@ -1442,10 +1438,12 @@ int x87_get_neoncache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) } int x87_get_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int t) { + dyn->insts[ninst].x87_used = 1; return dyn->n.x87reg[x87_get_cache(dyn, ninst, 1, s1, s2, a, t)]; } int x87_get_st_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int t) { + dyn->insts[ninst].x87_used = 1; return dyn->n.x87reg[x87_get_cache(dyn, ninst, 0, s1, s2, a, t)]; } @@ -1500,6 +1498,7 @@ void x87_forget(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) { + dyn->insts[ninst].x87_used = 1; if(dyn->n.mmxcount) mmx_purgecache(dyn, ninst, 0, s1); // search in cache first @@ -1550,6 +1549,7 @@ void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) void x87_free(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int st) { + dyn->insts[ninst].x87_used = 1; int ret = -1; for (int i=0; (i<8) && (ret==-1); ++i) if(dyn->n.x87cache[i] == st) @@ -1683,6 +1683,7 @@ static int isx87Empty(dynarec_arm_t* dyn) // get neon register for a MMX reg, create the entry if needed int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a) { + dyn->insts[ninst].mmx_used = 1; if(!dyn->n.x87stack && isx87Empty(dyn)) x87_purgecache(dyn, ninst, 0, s1, s2, s3); if(dyn->n.mmxcache[a]!=-1) @@ -1695,6 +1696,7 @@ int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a) // get neon register for a MMX reg, but don't try to synch it if it needed to be created int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a) { + dyn->insts[ninst].mmx_used = 1; if(!dyn->n.x87stack && isx87Empty(dyn)) x87_purgecache(dyn, ninst, 
0, s1, s2, s3); if(dyn->n.mmxcache[a]!=-1) @@ -2067,8 +2069,10 @@ void fpu_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int x87_purgecache(dyn, ninst, next, s1, s2, s3); mmx_purgecache(dyn, ninst, next, s1); sse_purgecache(dyn, ninst, next, s1); - if(!next) + if(!next) { fpu_reset_reg(dyn); + dyn->insts[ninst].fpupurge = 1; + } } static int findCacheSlot(dynarec_arm_t* dyn, int ninst, int t, int n, neoncache_t* cache) diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h index b3eeed49..2d8a236c 100644 --- a/src/dynarec/arm64/dynarec_arm64_private.h +++ b/src/dynarec/arm64/dynarec_arm64_private.h @@ -132,6 +132,10 @@ typedef struct instruction_arm64_s { unsigned df_notneeded:1; unsigned unaligned:1; // this opcode can be re-generated for unaligned special case unsigned x87precision:1; // this opcode can handle x87pc + unsigned mmx_used:1; // no fine tracking, just a global "any reg used" + unsigned x87_used:1; // no fine tracking, just a global "any reg used" + unsigned fpu_used:1; // any xmm/ymm/x87/mmx reg used + unsigned fpupurge:1; // this opcode will purge all fpu regs flagcache_t f_exit; // flags status at end of instruction neoncache_t n; // neoncache at end of instruction (but before poping) flagcache_t f_entry; // flags status before the instruction begin diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h index 5424c447..994eebc3 100644 --- a/src/dynarec/dynarec_arch.h +++ b/src/dynarec/dynarec_arch.h @@ -24,7 +24,7 @@ #define MAXBLOCK_SIZE ((1<<20)-200) #define RAZ_SPECIFIC(A, N) rasNativeState(A, N) -#define UPDATE_SPECIFICS(A) updateNativeFlags(A) +#define UPDATE_SPECIFICS(A) updateNativeFlags(A); propagateFpuBarrier(A) #define PREUPDATE_SPECIFICS(A) #define POSTUPDATE_SPECIFICS(A) updateUneeded(A) #define ARCH_SIZE(A) get_size_arch(A) diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c index 8e142b57..98aa0e61 100644 --- 
a/src/dynarec/dynarec_native.c +++ b/src/dynarec/dynarec_native.c @@ -702,6 +702,10 @@ dynablock_t* FillBlock64(uintptr_t addr, int alternate, int is32bits, int inst_m #endif { helper.insts[i].x64.need_after |= X_PEND; + if(helper.insts[i].barrier_maybe) { + helper.insts[i].x64.barrier|=BARRIER_FLOAT; + helper.insts[i].barrier_maybe = 0; + } } else { // find jump address instruction int k=-1; @@ -734,8 +738,6 @@ dynablock_t* FillBlock64(uintptr_t addr, int alternate, int is32bits, int inst_m k=i2; }*/ if(k!=-1) { - if(!helper.insts[i].barrier_maybe) - helper.insts[k].x64.barrier |= BARRIER_FULL; // special case, loop on itself with some nop in between if(k<i && !helper.insts[i].x64.has_next && is_nops(&helper, helper.insts[k].x64.addr, helper.insts[i].x64.addr-helper.insts[k].x64.addr)) { #ifndef ARCH_NOP @@ -746,6 +748,12 @@ dynablock_t* FillBlock64(uintptr_t addr, int alternate, int is32bits, int inst_m #endif } helper.insts[i].x64.jmp_insts = k; + helper.insts[i].barrier_maybe = 0; + } else { + if(helper.insts[i].barrier_maybe) { + helper.insts[i].x64.barrier|=BARRIER_FLOAT; + helper.insts[i].barrier_maybe = 0; + } } } } diff --git a/src/tools/env.c b/src/tools/env.c index 6727d830..bb89cc49 100644 --- a/src/tools/env.c +++ b/src/tools/env.c @@ -804,7 +804,7 @@ done: #define HEADER_SIGN "DynaCache" #define SET_VERSION(MAJ, MIN, REV) (((MAJ)<<24)|((MIN)<<16)|(REV)) #ifdef ARM64 -#define ARCH_VERSION SET_VERSION(0, 0, 1) +#define ARCH_VERSION SET_VERSION(0, 0, 2) #elif defined(RV64) #define ARCH_VERSION SET_VERSION(0, 0, 1) #elif defined(LA64) |