diff options
| author | ptitSeb <sebastien.chev@gmail.com> | 2022-10-24 20:19:09 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2022-10-24 20:19:09 +0200 |
| commit | f8f1859a0508ac574035a75aa0206cd25271d735 (patch) | |
| tree | 41d2118dc25cddee077f06a781eb1dd0d4ff4f71 /src | |
| parent | 92f2a44b28d21b7c64358b7d2883fb4793508a73 (diff) | |
| download | box64-f8f1859a0508ac574035a75aa0206cd25271d735.tar.gz box64-f8f1859a0508ac574035a75aa0206cd25271d735.zip | |
[DYNAREC] Better (deferred) flag handling in dynarec (gives between 5% and 10% speedup)
Diffstat (limited to 'src')
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_00.c | 18 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_0f.c | 6 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_67.c | 6 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_emit_math.c | 4 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_functions.c | 44 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_functions.h | 2 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_helper.c | 54 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_helper.h | 44 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_pass0.h | 11 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_pass3.h | 7 | ||||
| -rwxr-xr-x | src/dynarec/dynarec_native.c | 130 | ||||
| -rwxr-xr-x | src/dynarec/dynarec_native_pass.c | 9 | ||||
| -rwxr-xr-x | src/dynarec/dynarec_private.h | 8 |
13 files changed, 210 insertions, 133 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c index 59c70674..f3f2f960 100755 --- a/src/dynarec/arm64/dynarec_arm64_00.c +++ b/src/dynarec/arm64/dynarec_arm64_00.c @@ -40,6 +40,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin uint8_t wback, wb1, wb2, wb; int64_t fixedaddress; int lock; + int cacheupd; opcode = F8; MAYUSE(eb1); @@ -47,6 +48,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin MAYUSE(j64); MAYUSE(wb); MAYUSE(lock); + MAYUSE(cacheupd); switch(opcode) { case 0x00: @@ -650,11 +652,11 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin i32 = dyn->insts[ninst].epilog-(dyn->native_size); \ Bcond(NO, i32); \ if(dyn->insts[ninst].x64.jmp_insts==-1) { \ - if(!dyn->insts[ninst].x64.barrier) \ + if(!(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT)) \ fpu_purgecache(dyn, ninst, 1, x1, x2, x3); \ jump_to_next(dyn, addr+i8, 0, ninst); \ } else { \ - fpuCacheTransform(dyn, ninst, x1, x2, x3); \ + CacheTransform(dyn, ninst, cacheupd, x1, x2, x3); \ i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);\ B(i32); \ } \ @@ -1206,7 +1208,8 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 1: case 2: if(rep==1) {INST_NAME("REPNZ CMPSB");} else {INST_NAME("REPZ CMPSB");} - SETFLAGS(X_ALL, SF_MAYSET); + MAYSETFLAGS(); + SETFLAGS(X_ALL, SF_SET_PENDING); CBZx_NEXT(xRCX); TBNZ_MARK2(xFlags, F_DF); MARK; // Part with DF==0 @@ -1305,7 +1308,8 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 1: case 2: if(rep==1) {INST_NAME("REPNZ SCASB");} else {INST_NAME("REPZ SCASB");} - SETFLAGS(X_ALL, SF_MAYSET); + MAYSETFLAGS(); + SETFLAGS(X_ALL, SF_SET_PENDING); CBZx_NEXT(xRCX); UBFXw(x1, xRAX, 0, 8); TBNZ_MARK2(xFlags, F_DF); @@ -2035,11 +2039,11 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin i32 = 
dyn->insts[ninst].epilog-(dyn->native_size); \ if(Z) {CBNZx(xRCX, i32);} else {CBZx(xRCX, i32);}; \ if(dyn->insts[ninst].x64.jmp_insts==-1) { \ - if(!dyn->insts[ninst].x64.barrier) \ + if(!(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT)) \ fpu_purgecache(dyn, ninst, 1, x1, x2, x3); \ jump_to_next(dyn, addr+i8, 0, ninst); \ } else { \ - fpuCacheTransform(dyn, ninst, x1, x2, x3); \ + CacheTransform(dyn, ninst, cacheupd, x1, x2, x3); \ i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size); \ Bcond(c__, i32); \ } \ @@ -2176,7 +2180,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin jump_to_next(dyn, addr+i32, 0, ninst); } else { // inside the block - fpuCacheTransform(dyn, ninst, x1, x2, x3); + CacheTransform(dyn, ninst, CHECK_CACHE(), x1, x2, x3); tmp = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size); if(tmp==4) { NOP; diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c index 24877ed8..51c9e247 100755 --- a/src/dynarec/arm64/dynarec_arm64_0f.c +++ b/src/dynarec/arm64/dynarec_arm64_0f.c @@ -72,6 +72,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin uint8_t wback, wb2; uint8_t eb1, eb2; int32_t i32, i32_; + int cacheupd; int v0, v1; int q0, q1; int d0, d1; @@ -88,6 +89,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin MAYUSE(d1); MAYUSE(s0); MAYUSE(j64); + MAYUSE(cacheupd); #if STEP > 1 static const int8_t mask_shift8[] = { -7, -6, -5, -4, -3, -2, -1, 0 }; #endif @@ -1034,11 +1036,11 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin i32 = dyn->insts[ninst].epilog-(dyn->native_size); \ Bcond(NO, i32); \ if(dyn->insts[ninst].x64.jmp_insts==-1) { \ - if(!dyn->insts[ninst].x64.barrier) \ + if(!(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT)) \ fpu_purgecache(dyn, ninst, 1, x1, x2, x3); \ jump_to_next(dyn, addr+i32_, 0, ninst); \ } else { \ - 
fpuCacheTransform(dyn, ninst, x1, x2, x3); \ + CacheTransform(dyn, ninst, cacheupd, x1, x2, x3); \ i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size); \ B(i32); \ } \ diff --git a/src/dynarec/arm64/dynarec_arm64_67.c b/src/dynarec/arm64/dynarec_arm64_67.c index 7157a5e8..17a89479 100755 --- a/src/dynarec/arm64/dynarec_arm64_67.c +++ b/src/dynarec/arm64/dynarec_arm64_67.c @@ -44,6 +44,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin uint8_t u8; int32_t i32; int64_t j64, i64; + int cacheupd; int lock; int v0, v1, s0; MAYUSE(i32); @@ -52,6 +53,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin MAYUSE(v1); MAYUSE(s0); MAYUSE(lock); + MAYUSE(cacheupd); // REX prefix before the 67 are ignored rex.rex = 0; @@ -762,11 +764,11 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin i32 = dyn->insts[ninst].epilog-(dyn->native_size); \ Bcond(NO, i32); \ if(dyn->insts[ninst].x64.jmp_insts==-1) { \ - if(!dyn->insts[ninst].x64.barrier) \ + if(!(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT)) \ fpu_purgecache(dyn, ninst, 1, x1, x2, x3); \ jump_to_next(dyn, addr+i8, 0, ninst); \ } else { \ - fpuCacheTransform(dyn, ninst, x1, x2, x3); \ + CacheTransform(dyn, ninst, cacheupd, x1, x2, x3); \ i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);\ B(i32); \ } \ diff --git a/src/dynarec/arm64/dynarec_arm64_emit_math.c b/src/dynarec/arm64/dynarec_arm64_emit_math.c index eb55a249..37952444 100755 --- a/src/dynarec/arm64/dynarec_arm64_emit_math.c +++ b/src/dynarec/arm64/dynarec_arm64_emit_math.c @@ -77,7 +77,7 @@ void emit_add32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3 void emit_add32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4, int s5) { MAYUSE(s5); - if(s1==xRSP && (!dyn->insts || dyn->insts[ninst].x64.need_flags==X_PEND)) + if(s1==xRSP && (!dyn->insts || 
dyn->insts[ninst].x64.gen_flags==X_PEND)) { // special case when doing math on ESP and only PEND is needed: ignoring it! if(c>=0 && c<0x1000) { @@ -201,7 +201,7 @@ void emit_sub32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3 void emit_sub32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4, int s5) { MAYUSE(s5); - if(s1==xRSP && (!dyn->insts || dyn->insts[ninst].x64.need_flags==X_PEND)) + if(s1==xRSP && (!dyn->insts || dyn->insts[ninst].x64.gen_flags==X_PEND)) { // special case when doing math on RSP and only PEND is needed: ignoring it! if(c>=0 && c<0x1000) { diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c index d51f5a3a..2c6ebb36 100755 --- a/src/dynarec/arm64/dynarec_arm64_functions.c +++ b/src/dynarec/arm64/dynarec_arm64_functions.c @@ -631,7 +631,7 @@ int isCacheEmpty(dynarec_arm_t* dyn, int ninst) { } -int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) { +static int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) { int i2 = dyn->insts[ninst].x64.jmp_insts; if(i2<0) return 1; @@ -678,6 +678,48 @@ int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) { return ret; } +static int flagsCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) { + int jmp = dyn->insts[ninst].x64.jmp_insts; + if(jmp<0) + return 0; + if(dyn->insts[ninst].f_exit.dfnone) // flags are fully known, nothing we can do more + return 0; +/* if((dyn->f.pending!=SF_SET) + && (dyn->f.pending!=SF_SET_PENDING)) { + if(dyn->f.pending!=SF_PENDING) {*/ + switch (dyn->insts[jmp].f_entry.pending) { + case SF_UNKNOWN: return 0; + case SF_SET: + if(dyn->insts[ninst].f_exit.pending!=SF_SET && dyn->insts[ninst].f_exit.pending!=SF_SET_PENDING) + return 1; + else + return 0; + case SF_SET_PENDING: + if(dyn->insts[ninst].f_exit.pending!=SF_SET + && dyn->insts[ninst].f_exit.pending!=SF_SET_PENDING + && dyn->insts[ninst].f_exit.pending!=SF_PENDING) + return 1; + else + return 0; + case SF_PENDING: + 
if(dyn->insts[ninst].f_exit.pending!=SF_SET + && dyn->insts[ninst].f_exit.pending!=SF_SET_PENDING + && dyn->insts[ninst].f_exit.pending!=SF_PENDING) + return 1; + else + return (dyn->insts[jmp].f_entry.dfnone == dyn->insts[ninst].f_exit.dfnone)?0:1; + } + if(dyn->insts[jmp].f_entry.dfnone && !dyn->insts[ninst].f_exit.dfnone) + return 1; + return 0; +} +int CacheNeedsTransform(dynarec_arm_t* dyn, int ninst) { + int ret = 0; + if (fpuCacheNeedsTransform(dyn, ninst)) ret|=1; + if (flagsCacheNeedsTransform(dyn, ninst)) ret|=2; + return ret; +} + void neoncacheUnwind(neoncache_t* cache) { if(cache->swapped) { diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h index 7183fd6d..f8d5d127 100755 --- a/src/dynarec/arm64/dynarec_arm64_functions.h +++ b/src/dynarec/arm64/dynarec_arm64_functions.h @@ -72,7 +72,7 @@ void neoncache_promote_double(dynarec_arm_t* dyn, int ninst, int a); int neoncache_combine_st(dynarec_arm_t* dyn, int ninst, int a, int b); // with stack current dyn->n_stack* // FPU Cache transformation (for loops) -int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int i1); +int CacheNeedsTransform(dynarec_arm_t* dyn, int i1); // Undo the changes of a neoncache to get the status before the instruction void neoncacheUnwind(neoncache_t* cache); diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c index 961b0278..521b0604 100755 --- a/src/dynarec/arm64/dynarec_arm64_helper.c +++ b/src/dynarec/arm64/dynarec_arm64_helper.c @@ -1511,7 +1511,7 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in cache->neoncache[i].v = 0; } -void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) +static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) { #if STEP > 1 int i2 = dyn->insts[ninst].x64.jmp_insts; @@ -1642,6 +1642,58 @@ void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) 
MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); #endif } +static void flagsCacheTransform(dynarec_arm_t* dyn, int ninst, int s1) +{ +#if STEP > 1 + int j64; + int jmp = dyn->insts[ninst].x64.jmp_insts; + if(jmp<0) + return; + if(dyn->f.dfnone) // flags are fully known, nothing we can do more + return; + MESSAGE(LOG_DUMP, "\tFlags fetch ---- ninst=%d -> %d\n", ninst, jmp); + int go = 0; + switch (dyn->insts[jmp].f_entry.pending) { + case SF_UNKNOWN: break; + case SF_SET: + if(dyn->f.pending!=SF_SET && dyn->f.pending!=SF_SET_PENDING) + go = 1; + break; + case SF_SET_PENDING: + if(dyn->f.pending!=SF_SET + && dyn->f.pending!=SF_SET_PENDING + && dyn->f.pending!=SF_PENDING) + go = 1; + break; + case SF_PENDING: + if(dyn->f.pending!=SF_SET + && dyn->f.pending!=SF_SET_PENDING + && dyn->f.pending!=SF_PENDING) + go = 1; + else + go = (dyn->insts[jmp].f_entry.dfnone == dyn->f.dfnone)?0:1; + break; + } + if(dyn->insts[jmp].f_entry.dfnone && !dyn->f.dfnone) + go = 1; + if(go) { + if(dyn->f.pending!=SF_PENDING) { + LDRw_U12(s1, xEmu, offsetof(x64emu_t, df)); + j64 = (GETMARK3)-(dyn->native_size); + CBZw(s1, j64); + } + CALL_(UpdateFlags, -1, 0); + MARK3; + } +#endif +} + +void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3) { + if(cacheupd&1) + fpuCacheTransform(dyn, ninst, s1, s2, s3); + if(cacheupd&2) + flagsCacheTransform(dyn, ninst, s1); +} #ifdef HAVE_TRACE void fpu_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h index 954ac5b8..ab14a8e6 100755 --- a/src/dynarec/arm64/dynarec_arm64_helper.h +++ b/src/dynarec/arm64/dynarec_arm64_helper.h @@ -477,11 +477,11 @@ j64 = GETMARKLOCK-(dyn->native_size); \ CBNZx(reg, j64) -#define IFX(A) if((dyn->insts[ninst].x64.need_flags&(A))) -#define IFX_PENDOR0 if((dyn->insts[ninst].x64.need_flags&(X_PEND) || !dyn->insts[ninst].x64.need_flags)) -#define IFXX(A) 
if((dyn->insts[ninst].x64.need_flags==(A))) -#define IFX2X(A, B) if((dyn->insts[ninst].x64.need_flags==(A) || dyn->insts[ninst].x64.need_flags==(B) || dyn->insts[ninst].x64.need_flags==((A)|(B)))) -#define IFXN(A, B) if((dyn->insts[ninst].x64.need_flags&(A) && !(dyn->insts[ninst].x64.need_flags&(B)))) +#define IFX(A) if((dyn->insts[ninst].x64.gen_flags&(A))) +#define IFX_PENDOR0 if((dyn->insts[ninst].x64.gen_flags&(X_PEND) || !dyn->insts[ninst].x64.gen_flags)) +#define IFXX(A) if((dyn->insts[ninst].x64.gen_flags==(A))) +#define IFX2X(A, B) if((dyn->insts[ninst].x64.gen_flags==(A) || dyn->insts[ninst].x64.gen_flags==(B) || dyn->insts[ninst].x64.gen_flags==((A)|(B)))) +#define IFXN(A, B) if((dyn->insts[ninst].x64.gen_flags&(A) && !(dyn->insts[ninst].x64.gen_flags&(B)))) // Generate FCOM with s1 and s2 scratch regs (the VCMP is already done) #define FCOM(s1, s2, s3) \ @@ -598,6 +598,10 @@ #define SET_NODF() dyn->f.dfnone = 0 #define SET_DFOK() dyn->f.dfnone = 1 +#ifndef MAYSETFLAGS +#define MAYSETFLAGS() +#endif + #ifndef READFLAGS #define READFLAGS(A) \ if(((A)!=X_PEND && dyn->f.pending!=SF_SET) \ @@ -613,23 +617,21 @@ SET_DFOK(); \ } #endif -// SF_MAYSET doesn't change the flags status cache -// it also doesn't consume any needed flags + #ifndef SETFLAGS #define SETFLAGS(A, B) \ if(dyn->f.pending!=SF_SET \ && ((B)&SF_SUB) \ - && (dyn->insts[ninst].x64.need_flags&(~(A)))) \ - READFLAGS(((dyn->insts[ninst].x64.need_flags&X_PEND)?X_ALL:dyn->insts[ninst].x64.need_flags)&(~(A)));\ - if(dyn->insts[ninst].x64.need_flags) switch(B) { \ + && (dyn->insts[ninst].x64.gen_flags&(~(A)))) \ + READFLAGS(((dyn->insts[ninst].x64.gen_flags&X_PEND)?X_ALL:dyn->insts[ninst].x64.gen_flags)&(~(A)));\ + if(dyn->insts[ninst].x64.gen_flags) switch(B) { \ case SF_SUBSET: \ case SF_SET: dyn->f.pending = SF_SET; break; \ case SF_PENDING: dyn->f.pending = SF_PENDING; break; \ case SF_SUBSET_PENDING: \ case SF_SET_PENDING: \ - dyn->f.pending = 
(dyn->insts[ninst].x64.need_flags&X_PEND)?SF_SET_PENDING:SF_SET; \ + dyn->f.pending = (dyn->insts[ninst].x64.gen_flags&X_PEND)?SF_SET_PENDING:SF_SET; \ break; \ - case SF_MAYSET: break; \ } else dyn->f.pending = SF_SET #endif #ifndef JUMP @@ -641,12 +643,12 @@ #ifndef BARRIER_NEXT #define BARRIER_NEXT(A) #endif -#define UFLAG_OP1(A) if(dyn->insts[ninst].x64.need_flags) {STRxw_U12(A, xEmu, offsetof(x64emu_t, op1));} -#define UFLAG_OP2(A) if(dyn->insts[ninst].x64.need_flags) {STRxw_U12(A, xEmu, offsetof(x64emu_t, op2));} -#define UFLAG_OP12(A1, A2) if(dyn->insts[ninst].x64.need_flags) {STRxw_U12(A1, xEmu, offsetof(x64emu_t, op1));STRxw_U12(A2, 0, offsetof(x64emu_t, op2));} -#define UFLAG_RES(A) if(dyn->insts[ninst].x64.need_flags) {STRxw_U12(A, xEmu, offsetof(x64emu_t, res));} -#define UFLAG_DF(r, A) if(dyn->insts[ninst].x64.need_flags) {SET_DF(r, A)} -#define UFLAG_IF if(dyn->insts[ninst].x64.need_flags) +#define UFLAG_OP1(A) if(dyn->insts[ninst].x64.gen_flags) {STRxw_U12(A, xEmu, offsetof(x64emu_t, op1));} +#define UFLAG_OP2(A) if(dyn->insts[ninst].x64.gen_flags) {STRxw_U12(A, xEmu, offsetof(x64emu_t, op2));} +#define UFLAG_OP12(A1, A2) if(dyn->insts[ninst].x64.gen_flags) {STRxw_U12(A1, xEmu, offsetof(x64emu_t, op1));STRxw_U12(A2, 0, offsetof(x64emu_t, op2));} +#define UFLAG_RES(A) if(dyn->insts[ninst].x64.gen_flags) {STRxw_U12(A, xEmu, offsetof(x64emu_t, res));} +#define UFLAG_DF(r, A) if(dyn->insts[ninst].x64.gen_flags) {SET_DF(r, A)} +#define UFLAG_IF if(dyn->insts[ninst].x64.gen_flags) #ifndef DEFAULT #define DEFAULT *ok = -1; BARRIER(2) #endif @@ -850,7 +852,7 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr); #define fpu_reflectcache STEPNAME(fpu_reflectcache) #endif -#define fpuCacheTransform STEPNAME(fpuCacheTransform) +#define CacheTransform STEPNAME(CacheTransform) /* setup r2 to address pointed by */ uintptr_t geted(dynarec_arm_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int absmax, uint32_t mask, 
rex_t rex, int* l, int s, int delta); @@ -977,12 +979,12 @@ void x87_restoreround(dynarec_arm_t* dyn, int ninst, int s1); // Set rounding according to mxcsr flags, return reg to restore flags int sse_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3); -void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3); +void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3); #if STEP < 2 #define CHECK_CACHE() 0 #else -#define CHECK_CACHE() fpuCacheNeedsTransform(dyn, ninst) +#define CHECK_CACHE() (cacheupd = CacheNeedsTransform(dyn, ninst)) #endif #define neoncache_st_coherency STEPNAME(neoncache_st_coherency) diff --git a/src/dynarec/arm64/dynarec_arm64_pass0.h b/src/dynarec/arm64/dynarec_arm64_pass0.h index 23cbcc93..61eb1bcd 100755 --- a/src/dynarec/arm64/dynarec_arm64_pass0.h +++ b/src/dynarec/arm64/dynarec_arm64_pass0.h @@ -6,16 +6,15 @@ if(ninst) dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr #define MESSAGE(A, ...) 
+#define MAYSETFLAGS() dyn->insts[ninst].x64.may_set = 1 #define READFLAGS(A) \ dyn->insts[ninst].x64.use_flags = A; dyn->f.dfnone = 1;\ dyn->f.pending=SF_SET #define SETFLAGS(A,B) \ - dyn->insts[ninst].x64.set_flags = A; \ - if(B!=SF_MAYSET) { \ - dyn->insts[ninst].x64.state_flags = B; \ - dyn->f.pending=(B)&SF_SET_PENDING; \ - dyn->f.dfnone=((B)&SF_SET)?1:0; \ - } + dyn->insts[ninst].x64.set_flags = A; \ + dyn->insts[ninst].x64.state_flags = B; \ + dyn->f.pending=(B)&SF_SET_PENDING; \ + dyn->f.dfnone=((B)&SF_SET)?1:0; #define EMIT(A) #define JUMP(A, C) add_next(dyn, (uintptr_t)A); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C #define BARRIER(A) if(A!=BARRIER_MAYBE) {fpu_purgecache(dyn, ninst, 0, x1, x2, x3); dyn->insts[ninst].x64.barrier = A;} else dyn->insts[ninst].barrier_maybe = 1 diff --git a/src/dynarec/arm64/dynarec_arm64_pass3.h b/src/dynarec/arm64/dynarec_arm64_pass3.h index 099ef09a..d1d8fbba 100755 --- a/src/dynarec/arm64/dynarec_arm64_pass3.h +++ b/src/dynarec/arm64/dynarec_arm64_pass3.h @@ -19,7 +19,7 @@ #define INST_NAME(name) \ if(box64_dynarec_dump) {\ printf_x64_instruction(my_context->dec, &dyn->insts[ninst].x64, name); \ - dynarec_log(LOG_NONE, "%s%p: %d emited opcodes, inst=%d, barrier=%d state=%d/%d(%d), set=%X, use=%X, need=%X", \ + dynarec_log(LOG_NONE, "%s%p: %d emited opcodes, inst=%d, barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X", \ (box64_dynarec_dump>1)?"\e[32m":"", \ (void*)(dyn->native_start+dyn->insts[ninst].address), \ dyn->insts[ninst].size/4, \ @@ -28,9 +28,12 @@ dyn->insts[ninst].x64.state_flags, \ dyn->f.pending, \ dyn->f.dfnone, \ + dyn->insts[ninst].x64.may_set?"may":"set", \ dyn->insts[ninst].x64.set_flags, \ + dyn->insts[ninst].x64.gen_flags, \ dyn->insts[ninst].x64.use_flags, \ - dyn->insts[ninst].x64.need_flags); \ + dyn->insts[ninst].x64.need_before, \ + dyn->insts[ninst].x64.need_after); \ if(dyn->insts[ninst].pred_sz) { \ dynarec_log(LOG_NONE, ", pred="); \ for(int ii=0; 
ii<dyn->insts[ninst].pred_sz; ++ii)\ diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c index c2a752b1..bbd4bcb7 100755 --- a/src/dynarec/dynarec_native.c +++ b/src/dynarec/dynarec_native.c @@ -328,59 +328,51 @@ static void fillPredecessors(dynarec_native_t* dyn) } -static void updateNeed(dynarec_native_t* dyn, int ninst, uint32_t need) { - uint32_t old_need = dyn->insts[ninst].x64.need_flags; - uint32_t new_need = old_need | need; - uint32_t new_use = dyn->insts[ninst].x64.use_flags; - uint32_t old_use = dyn->insts[ninst].x64.old_use; - - if((new_need&X_PEND) && dyn->insts[ninst].x64.state_flags==SF_SUBSET) { - new_need &=~X_PEND; - new_need |= X_ALL; - } else if((new_need&X_PEND) && dyn->insts[ninst].x64.state_flags==SF_SUBSET_PENDING) { - new_need |= X_ALL&~dyn->insts[ninst].x64.set_flags; - } - - - uint32_t new_set = 0; - if(dyn->insts[ninst].x64.state_flags & SF_SET) - new_set = dyn->insts[ninst].x64.set_flags; - if(dyn->insts[ninst].x64.state_flags & SF_PENDING) - new_set |= X_PEND; - if((new_need&X_PEND) && ( - dyn->insts[ninst].x64.state_flags==SF_SET || dyn->insts[ninst].x64.state_flags==SF_SUBSET)) { - new_need &=~X_PEND; - new_need |=X_ALL; - } - - dyn->insts[ninst].x64.need_flags = new_need; - dyn->insts[ninst].x64.old_use = new_use; - - if(dyn->insts[ninst].x64.jmp_insts==-1) - new_need |= X_PEND; - - if((new_need == old_need) && (new_use == old_use)) // no changes, bye - return; - - new_need &=~new_set; // clean needed flag that were suplied - new_need |= new_use; // new need - // a Flag Barrier will change all need to "Pending", as it clear all flags optimisation - if(new_need && dyn->insts[ninst].x64.barrier&BARRIER_FLAGS) - new_need = X_PEND; - - if((new_need == (X_ALL|X_PEND)) && (dyn->insts[ninst].x64.state_flags & SF_SET)) - new_need = X_ALL; - - //update need to new need on predecessor - for(int i=0; i<dyn->insts[ninst].pred_sz; ++i) - updateNeed(dyn, dyn->insts[ninst].pred[i], new_need); -} - -static void 
resetNeed(dynarec_native_t* dyn) { - for(int i = dyn->size; i-- > 0;) { - dyn->insts[i].x64.old_use = 0; - dyn->insts[i].x64.need_flags = dyn->insts[i].x64.default_need; +// updateNeed goes backward, from last intruction to top +static int updateNeed(dynarec_arm_t* dyn, int ninst, uint8_t need) { + while (ninst>=0) { + // need pending but instruction is only a subset: remove pend and use an X_ALL instead + need |= dyn->insts[ninst].x64.need_after; + if((need&X_PEND) && (dyn->insts[ninst].x64.state_flags==SF_SUBSET)) { + need &=~X_PEND; + need |= X_ALL; + } + if((need&X_PEND) && (dyn->insts[ninst].x64.state_flags==SF_SET)) { + need &=~X_PEND; + need |= dyn->insts[ninst].x64.set_flags; // SF_SET will compute all flags, it's not SUBSET! + } + if((need&X_PEND) && dyn->insts[ninst].x64.state_flags==SF_SUBSET_PENDING) { + need |= X_ALL&~(dyn->insts[ninst].x64.set_flags); + } + dyn->insts[ninst].x64.gen_flags = need&dyn->insts[ninst].x64.set_flags; + if((need&X_PEND) && (dyn->insts[ninst].x64.state_flags&SF_PENDING)) + dyn->insts[ninst].x64.gen_flags |= X_PEND; + dyn->insts[ninst].x64.need_after = need; + need = dyn->insts[ninst].x64.need_after&~dyn->insts[ninst].x64.gen_flags; + if(dyn->insts[ninst].x64.may_set) + need |= dyn->insts[ninst].x64.gen_flags; // forward the flags + // Consume X_PEND if relevant + if((need&X_PEND) && (dyn->insts[ninst].x64.set_flags&SF_PENDING)) + need &=~X_PEND; + need |= dyn->insts[ninst].x64.use_flags; + if(dyn->insts[ninst].x64.need_before == need) + return ninst - 1; + dyn->insts[ninst].x64.need_before = need; + if(dyn->insts[ninst].x64.barrier&BARRIER_FLAGS) { + need = need?X_PEND:0; + } + int ok = 0; + for(int i=0; i<dyn->insts[ninst].pred_sz; ++i) { + if(dyn->insts[ninst].pred[i] == ninst-1) + ok = 1; + else + updateNeed(dyn, dyn->insts[ninst].pred[i], need); + } + if(!ok) + return ninst - 1; + --ninst; } + return ninst; } __thread void* current_helper = NULL; @@ -446,17 +438,13 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr) 
{ protectDB(addr, end-addr); //end is 1byte after actual end // compute hash signature uint32_t hash = X31_hash_code((void*)addr, end-addr); - // Compute flag_need, without current barriers - resetNeed(&helper); - for(int i = helper.size; i-- > 0;) - updateNeed(&helper, i, 0); // calculate barriers for(int i=0; i<helper.size; ++i) if(helper.insts[i].x64.jmp) { uintptr_t j = helper.insts[i].x64.jmp; if(j<start || j>=end) { helper.insts[i].x64.jmp_insts = -1; - helper.insts[i].x64.use_flags |= X_PEND; + helper.insts[i].x64.need_after |= X_PEND; } else { // find jump address instruction int k=-1; @@ -472,7 +460,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr) { // fill predecessors with the jump address fillPredecessors(&helper); // check for the optionnal barriers now - for(int i=helper.size-1; i>=0; --i) { + /*for(int i=helper.size-1; i>=0; --i) { if(helper.insts[i].barrier_maybe) { // out-of-block jump if(helper.insts[i].x64.jmp_insts == -1) { @@ -492,7 +480,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr) { } } } - } + }*/ // check to remove useless barrier, in case of jump when destination doesn't needs flags /*for(int i=helper.size-1; i>=0; --i) { int k; @@ -509,27 +497,9 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr) { } } }*/ - // reset need_flags and compute again, now taking barrier into account (because barrier change use_flags) - for(int i = helper.size; i-- > 0;) { - int k; - if(helper.insts[i].x64.jmp - && ((k=helper.insts[i].x64.jmp_insts)>=0) - ) { - if(helper.insts[k].x64.barrier&BARRIER_FLAGS) - // jumpto barrier - helper.insts[i].x64.use_flags |= X_PEND; - if(helper.insts[i].x64.barrier&BARRIER_FLAGS && (helper.insts[k].x64.need_flags | helper.insts[k].x64.use_flags)) - helper.insts[k].x64.barrier|=BARRIER_FLAGS; - else - helper.insts[i].x64.use_flags |= (helper.insts[k].x64.need_flags | helper.insts[k].x64.use_flags); - } - if(helper.insts[i].x64.barrier&BARRIER_FLAGS && !(helper.insts[i].x64.set_flags&SF_PENDING)) - 
// immediate barrier - helper.insts[i].x64.use_flags |= X_PEND; - } - resetNeed(&helper); - for(int i = helper.size; i-- > 0;) - updateNeed(&helper, i, 0); + int pos = helper.size; + while (pos>=0) + pos = updateNeed(&helper, pos, 0); // pass 1, float optimisations, first pass for flags native_pass1(&helper, addr); diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c index d0211197..b095889a 100755 --- a/src/dynarec/dynarec_native_pass.c +++ b/src/dynarec/dynarec_native_pass.c @@ -30,6 +30,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr) { int ok = 1; int ninst = 0; + int j64; uintptr_t ip = addr; uintptr_t init_addr = addr; rex_t rex; @@ -123,6 +124,9 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr) dyn->n.swapped = 0; NEW_INST; fpu_reset_scratch(dyn); + if((dyn->insts[ninst].x64.need_before&~X_PEND) && !dyn->insts[ninst].pred_sz) { + READFLAGS(dyn->insts[ninst].x64.need_before&~X_PEND); + } #ifdef HAVE_TRACE if(my_context->dec && box64_dynarec_trace) { if((trace_end == 0) @@ -226,10 +230,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr) BARRIER(BARRIER_FLOAT); } #if STEP == 0 - if(dyn->insts[ninst].x64.set_flags) - dyn->insts[ninst].x64.default_need |= X_PEND; - else - dyn->insts[ninst].x64.use_flags |= X_PEND; + dyn->insts[ninst].x64.need_after |= X_PEND; #endif ++ninst; fpu_purgecache(dyn, ninst, 0, x1, x2, x3); diff --git a/src/dynarec/dynarec_private.h b/src/dynarec/dynarec_private.h index 71966bdf..0ab21df4 100755 --- a/src/dynarec/dynarec_private.h +++ b/src/dynarec/dynarec_private.h @@ -24,7 +24,6 @@ #define SF_SUB 4 #define SF_SUBSET (SF_SUB|SF_SET) #define SF_SUBSET_PENDING (SF_SUBSET|SF_PENDING) -#define SF_MAYSET 8 typedef struct instruction_x64_s { uintptr_t addr; //address of the instruction @@ -37,9 +36,10 @@ typedef struct instruction_x64_s { uint8_t state_flags;// One of SF_XXX state uint8_t use_flags; // 0 or combination of X_?F uint8_t set_flags; // 0 or combination of 
X_?F - uint8_t default_need;// 0 or X_PEND basically - uint8_t need_flags; // calculated - uint8_t old_use; // calculated + uint8_t may_set; // 1 if the flags may not be set + uint8_t gen_flags; // calculated + uint8_t need_before;// calculated + uint8_t need_after; // calculated } instruction_x64_t; void printf_x64_instruction(zydis_dec_t* dec, instruction_x64_t* inst, const char* name); |