diff options
| author | ptitSeb <sebastien.chev@gmail.com> | 2024-06-05 10:44:31 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2024-06-05 10:44:31 +0200 |
| commit | ba411303e951cb51766d42a15be59e2b9d5e67ec (patch) | |
| tree | d348eb0b9c0561bde343f686ef6e308476a2e9a5 /src | |
| parent | 8848bc2e7f404c72396392b307ee6c3494392488 (diff) | |
| download | box64-ba411303e951cb51766d42a15be59e2b9d5e67ec.tar.gz box64-ba411303e951cb51766d42a15be59e2b9d5e67ec.zip | |
[DYNAREC] Improved handling of the Ymm0 attribute
Diffstat (limited to 'src')
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_functions.c | 5 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_helper.c | 23 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_helper.h | 6 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_pass0.h | 3 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_private.h | 4 | ||||
| -rw-r--r-- | src/dynarec/dynarec_native.c | 37 | ||||
| -rw-r--r-- | src/dynarec/dynarec_native_pass.c | 4 | ||||
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_pass0.h | 3 | ||||
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_private.h | 3 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_pass0.h | 3 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_private.h | 3 |
11 files changed, 69 insertions, 25 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c index 4ee331ab..9e4fd46c 100644 --- a/src/dynarec/arm64/dynarec_arm64_functions.c +++ b/src/dynarec/arm64/dynarec_arm64_functions.c @@ -540,7 +540,6 @@ void neoncacheUnwind(neoncache_t* cache) // And now, rebuild the x87cache info with neoncache cache->mmxcount = 0; cache->fpu_scratch = 0; - cache->fpu_extra_qscratch = 0; cache->fpu_reg = 0; for(int i=0; i<8; ++i) { cache->x87cache[i] = -1; @@ -694,8 +693,8 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r default: break; } } - if(dyn->ymm_zero) - dynarec_log(LOG_NONE, " ymm0=%04x", dyn->ymm_zero); + if(dyn->ymm_zero || dyn->insts[ninst].ymm0_add || dyn->insts[ninst].ymm0_sub) + dynarec_log(LOG_NONE, " ymm0=%04x(+%0x4-%04x)", dyn->ymm_zero, dyn->insts[ninst].ymm0_add ,dyn->insts[ninst].ymm0_sub); if(dyn->insts[ninst].purge_ymm) dynarec_log(LOG_NONE, " purgeYmm=%04x", dyn->insts[ninst].purge_ymm); if(dyn->n.stack || dyn->insts[ninst].n.stack_next || dyn->insts[ninst].n.x87stack) diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c index ded87eb3..e79d81f7 100644 --- a/src/dynarec/arm64/dynarec_arm64_helper.c +++ b/src/dynarec/arm64/dynarec_arm64_helper.c @@ -2002,9 +2002,9 @@ static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int { if(cache->neoncache[i].v) { int quad = 0; - if(t==NEON_CACHE_XMMR || t==NEON_CACHE_XMMW) + if(t==NEON_CACHE_XMMR || t==NEON_CACHE_XMMW || t==NEON_CACHE_YMMR || t==NEON_CACHE_YMMW) quad = 1; - if(cache->neoncache[i].t==NEON_CACHE_XMMR || cache->neoncache[i].t==NEON_CACHE_XMMW) + if(cache->neoncache[i].t==NEON_CACHE_XMMR || cache->neoncache[i].t==NEON_CACHE_XMMW || cache->neoncache[i].t==NEON_CACHE_YMMR || cache->neoncache[i].t==NEON_CACHE_YMMW) quad = 1; int j = i+1; while(cache->neoncache[j].v) @@ -2171,12 +2171,17 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int int s1_val = 0; int s2_val = 0; // unload every uneeded cache - // check SSE first, than MMX, in order, for optimisation issue + // check SSE first, than MMX, in order, to optimise successive memory write for(int i=0; i<16; ++i) { int j=findCacheSlot(dyn, ninst, NEON_CACHE_XMMW, i, &cache); if(j>=0 && findCacheSlot(dyn, ninst, NEON_CACHE_XMMW, i, &cache_i2)==-1) unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.neoncache[j].t, cache.neoncache[j].n); } + for(int i=0; i<16; ++i) { + int j=findCacheSlot(dyn, ninst, NEON_CACHE_YMMW, i, &cache); + if(j>=0 && findCacheSlot(dyn, ninst, NEON_CACHE_YMMW, i, &cache_i2)==-1) + unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.neoncache[j].t, cache.neoncache[j].n); + } for(int i=0; i<8; ++i) { int j=findCacheSlot(dyn, ninst, NEON_CACHE_MM, i, &cache); if(j>=0 && findCacheSlot(dyn, ninst, NEON_CACHE_MM, i, &cache_i2)==-1) @@ -2347,7 +2352,7 @@ void fpu_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) { x87_reflectcache(dyn, ninst, s1, s2, s3); mmx_reflectcache(dyn, ninst, s1); - sse_reflectcache(dyn, ninst, s1); + //sse_reflectcache(dyn, ninst, s1); // no need, it's pushed/unpushed during call } void fpu_unreflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) @@ -2464,7 +2469,7 @@ void fpu_reset_cache(dynarec_arm_t* dyn, int ninst, int reset_n) if(box64_dynarec_dump) if(memcmp(&dyn->n, &dyn->insts[reset_n].n, sizeof(neon_cache_t))) { MESSAGE(LOG_DEBUG, "Warning, difference in neoncache: reset="); - for(int i=0; i<24; ++i) + for(int i=0; i<32; ++i) if(dyn->insts[reset_n].n.neoncache[i].v) MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[reset_n].n.neoncache[i].t, dyn->insts[reset_n].n.neoncache[i].n)); if(dyn->insts[reset_n].n.combined1 || dyn->insts[reset_n].n.combined2) @@ -2472,7 +2477,7 @@ void fpu_reset_cache(dynarec_arm_t* dyn, int ninst, int reset_n) if(dyn->insts[reset_n].n.stack_push || dyn->insts[reset_n].n.stack_pop) MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[reset_n].n.stack_push, -dyn->insts[reset_n].n.stack_pop); MESSAGE(LOG_DEBUG, " ==> "); - for(int i=0; i<24; ++i) + for(int i=0; i<32; ++i) if(dyn->insts[ninst].n.neoncache[i].v) MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[ninst].n.neoncache[i].t, dyn->insts[ninst].n.neoncache[i].n)); if(dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2) @@ -2480,7 +2485,7 @@ void fpu_reset_cache(dynarec_arm_t* dyn, int ninst, int reset_n) if(dyn->insts[ninst].n.stack_push || dyn->insts[ninst].n.stack_pop) MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[ninst].n.stack_push, -dyn->insts[ninst].n.stack_pop); MESSAGE(LOG_DEBUG, " -> "); - for(int i=0; i<24; ++i) + for(int i=0; i<32; ++i) if(dyn->n.neoncache[i].v) MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->n.neoncache[i].t, dyn->n.neoncache[i].n)); if(dyn->n.combined1 || dyn->n.combined2) @@ -2513,12 +2518,12 @@ void fpu_propagate_stack(dynarec_arm_t* dyn, int ninst) dyn->n.swapped = 0; } -void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, int s1) +void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, uint16_t mask, int s1) { MESSAGE(LOG_NONE, "Purge YMM mask=%04x --------\n", dyn->insts[ninst].purge_ymm); int s1_set = 0; for(int i=0; i<16; ++i) - if(dyn->insts[ninst].purge_ymm&(1<<i)) { + if(mask&(1<<i)) { if(is_avx_zero_unset(dyn, ninst, i)) { if(!s1_set) { ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0])); diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h index ed593ca7..cb0e181c 100644 --- a/src/dynarec/arm64/dynarec_arm64_helper.h +++ b/src/dynarec/arm64/dynarec_arm64_helper.h @@ -562,7 +562,7 @@ // Get EY #define GETEY(ey) \ if(MODREG) \ - ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1, -1, -1, -1); \ + ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, -1, -1, -1); \ else \ VLDR128_U12(ey, ed, fixedaddress+16); \ @@ -1499,7 +1499,7 @@ void x87_restoreround(dynarec_arm_t* dyn, int ninst, int s1); // Set rounding according to mxcsr flags, return reg to restore flags int sse_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3); // purge ymm_zero mask according to purge_ymm -void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, int s1); +void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, uint16_t mask, int s1); void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3); @@ -1772,6 +1772,6 @@ uintptr_t dynarec64_AVX_F2_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip } \ } -#define PURGE_YMM() avx_purge_ymm(dyn, ninst, x1) +#define PURGE_YMM() avx_purge_ymm(dyn, ninst, dyn->insts[ninst+1].purge_ymm, x1) #endif //__DYNAREC_ARM64_HELPER_H__ diff --git a/src/dynarec/arm64/dynarec_arm64_pass0.h b/src/dynarec/arm64/dynarec_arm64_pass0.h index 6e9b8019..8b2fc6fb 100644 --- a/src/dynarec/arm64/dynarec_arm64_pass0.h +++ b/src/dynarec/arm64/dynarec_arm64_pass0.h @@ -32,6 +32,9 @@ #define INST_EPILOG \ dyn->insts[ninst].f_exit = dyn->f; \ dyn->insts[ninst].n = dyn->n; \ + dyn->insts[ninst].ymm0_add = dyn->ymm_zero&~dyn->insts[ninst].ymm_zero; \ + dyn->insts[ninst].ymm0_sub = dyn->insts[ninst].ymm_zero&~dyn->ymm_zero; \ + dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\ dyn->insts[ninst].x64.has_next = (ok>0)?1:0; #define INST_NAME(name) #define DEFAULT \ diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h index 6a6647df..03958927 100644 --- a/src/dynarec/arm64/dynarec_arm64_private.h +++ b/src/dynarec/arm64/dynarec_arm64_private.h @@ -57,7 +57,6 @@ typedef struct neoncache_s { int8_t x87stack; // cache stack counter int8_t mmxcount; // number of mmx register used (not both mmx and x87 at the same time) int8_t fpu_scratch; // scratch counter - int8_t fpu_extra_qscratch; // some opcode need an extra quad scratch register int8_t fpu_reg; // x87/sse/mmx reg counter } neoncache_t; @@ -83,6 +82,9 @@ typedef struct instruction_arm64_s { uint16_t retn; uint16_t ymm_zero; // bitmap of ymm to zero at purge uint16_t purge_ymm; // need to purge some ymm + uint16_t ymm0_add; // the ymm0 added by the opcode + uint16_t ymm0_sub; // the ymm0 removed by the opcode + uint16_t ymm0_out; // the ymmm0 at th end of the opcode uint8_t barrier_maybe; uint8_t will_write; uint8_t last_write; diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c index 649ba1a1..a9086625 100644 --- a/src/dynarec/dynarec_native.c +++ b/src/dynarec/dynarec_native.c @@ -357,7 +357,7 @@ static void fillPredecessors(dynarec_native_t* dyn) } } -// updateNeed goes backward, from last instruction to top +// updateNeed for the current block. recursive function that goes backward static int updateNeed(dynarec_native_t* dyn, int ninst, uint8_t need) { while (ninst>=0) { // need pending but instruction is only a subset: remove pend and use an X_ALL instead @@ -400,6 +400,32 @@ static int updateNeed(dynarec_native_t* dyn, int ninst, uint8_t need) { return ninst; } +// ypdate Ymm0 and Purge_ymm0. +static int updateYmm0(dynarec_native_t* dyn, int ninst, uint16_t mask) { + while (ninst<dyn->size) { + uint16_t ymm0 = mask&~dyn->insts[ninst].purge_ymm; // current ymm0 + uint16_t to_purge = dyn->insts[ninst].ymm_zero & ~ymm0; // the new to purge + uint16_t ymm0_out = (mask|dyn->insts[ninst].ymm0_add)&~dyn->insts[ninst].ymm0_sub; // ymm0 at the output + //check if need to recurse further + int ok = (ymm0==dyn->insts[ninst].ymm_zero) && (!to_purge) && (ymm0_out==dyn->insts[ninst].ymm0_out); + if(ok && dyn->insts[ninst].x64.has_next) + ok = (dyn->insts[ninst+1].ymm_zero==(ymm0_out&~dyn->insts[ninst+1].purge_ymm)); + if(ok && dyn->insts[ninst].x64.jmp && dyn->insts[ninst].x64.jmp_insts!=-1) + ok = (dyn->insts[dyn->insts[ninst].x64.jmp_insts].ymm_zero==(ymm0_out&~dyn->insts[dyn->insts[ninst].x64.jmp_insts].purge_ymm)); + if(ok) + return ninst+1; + dyn->insts[ninst].ymm_zero = ymm0; + dyn->insts[ninst].purge_ymm |= to_purge; + dyn->insts[ninst].ymm0_out = ymm0_out; + if(dyn->insts[ninst].x64.jmp && dyn->insts[ninst].x64.jmp_insts!=-1) + updateYmm0(dyn, dyn->insts[ninst].x64.jmp_insts, ymm0_out); + if(!dyn->insts[ninst].x64.has_next) + return ninst+1; + ++ninst; + } + return ninst; +} + void* current_helper = NULL; static int static_jmps[MAX_INSTS+2]; static uintptr_t static_next[MAX_INSTS+2]; @@ -556,12 +582,6 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit if(!helper.insts[i].barrier_maybe) helper.insts[k].x64.barrier |= BARRIER_FULL; helper.insts[i].x64.jmp_insts = k; - if(helper.insts[i].ymm_zero || helper.insts[k].ymm_zero) { - // move to pureg the reg that are present in k (jump to) but not in i (jump from) - uint16_t to_purge = helper.insts[k].ymm_zero & ~helper.insts[i].ymm_zero; - helper.insts[k].purge_ymm |= to_purge; - helper.insts[k].ymm_zero &= ~to_purge; - } } } } @@ -578,6 +598,9 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit int pos = helper.size; while (pos>=0) pos = updateNeed(&helper, pos, 0); + pos = 0; + while(pos<helper.size) + pos = updateYmm0(&helper, pos, helper.insts[pos].ymm_zero); // remove fpu stuff on non-executed code for(int i=1; i<helper.size-1; ++i) if(!helper.insts[i].pred_sz) { diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c index dcb4cb6f..a308e264 100644 --- a/src/dynarec/dynarec_native_pass.c +++ b/src/dynarec/dynarec_native_pass.c @@ -104,8 +104,6 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int } reset_n = -1; } - if(dyn->insts[ninst].purge_ymm) - PURGE_YMM(); #if STEP > 0 else if(ninst && (dyn->insts[ninst].pred_sz>1 || (dyn->insts[ninst].pred_sz==1 && dyn->insts[ninst].pred[0]!=ninst-1))) dyn->last_ip = 0; // reset IP if some jump are coming here @@ -310,6 +308,8 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int } if((ok>0) && dyn->insts[ninst].x64.has_callret) reset_n = -2; + if((ok>0) && reset_n==-1 && dyn->insts[ninst+1].purge_ymm) + PURGE_YMM(); ++ninst; #if STEP == 0 memset(&dyn->insts[ninst], 0, sizeof(instruction_native_t)); diff --git a/src/dynarec/la64/dynarec_la64_pass0.h b/src/dynarec/la64/dynarec_la64_pass0.h index 3990caa4..62f65853 100644 --- a/src/dynarec/la64/dynarec_la64_pass0.h +++ b/src/dynarec/la64/dynarec_la64_pass0.h @@ -36,6 +36,9 @@ #define INST_EPILOG \ dyn->insts[ninst].f_exit = dyn->f; \ dyn->insts[ninst].lsx = dyn->lsx; \ + dyn->insts[ninst].ymm0_add = dyn->ymm_zero&~dyn->insts[ninst].ymm_zero; \ + dyn->insts[ninst].ymm0_sub = dyn->insts[ninst].ymm_zero&~dyn->ymm_zero; \ + dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\ dyn->insts[ninst].x64.has_next = (ok > 0) ? 1 : 0; #define INST_NAME(name) #define DEFAULT \ diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h index 09b6698e..9086e68f 100644 --- a/src/dynarec/la64/dynarec_la64_private.h +++ b/src/dynarec/la64/dynarec_la64_private.h @@ -82,6 +82,9 @@ typedef struct instruction_la64_s { uint16_t retn; uint16_t ymm_zero; // bitmap of ymm to zero at purge uint16_t purge_ymm; // need to purge some ymm + uint16_t ymm0_add; // the ymm0 added by the opcode + uint16_t ymm0_sub; // the ymm0 removed by the opcode + uint16_t ymm0_out; // the ymmm0 at th end of the opcode uint8_t barrier_maybe; uint8_t will_write; uint8_t last_write; diff --git a/src/dynarec/rv64/dynarec_rv64_pass0.h b/src/dynarec/rv64/dynarec_rv64_pass0.h index 9c2de9ee..174bb092 100644 --- a/src/dynarec/rv64/dynarec_rv64_pass0.h +++ b/src/dynarec/rv64/dynarec_rv64_pass0.h @@ -34,6 +34,9 @@ #define INST_EPILOG \ dyn->insts[ninst].f_exit = dyn->f; \ dyn->insts[ninst].e = dyn->e; \ + dyn->insts[ninst].ymm0_add = dyn->ymm_zero&~dyn->insts[ninst].ymm_zero; \ + dyn->insts[ninst].ymm0_sub = dyn->insts[ninst].ymm_zero&~dyn->ymm_zero; \ + dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\ dyn->insts[ninst].x64.has_next = (ok>0)?1:0; #define INST_NAME(name) #define DEFAULT \ diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h index 1ba830d5..aeda741c 100644 --- a/src/dynarec/rv64/dynarec_rv64_private.h +++ b/src/dynarec/rv64/dynarec_rv64_private.h @@ -92,6 +92,9 @@ typedef struct instruction_rv64_s { uint16_t retn; uint16_t ymm_zero; // bitmap of ymm to zero at purge uint16_t purge_ymm; // need to purge some ymm + uint16_t ymm0_add; // the ymm0 added by the opcode + uint16_t ymm0_sub; // the ymm0 removed by the opcode + uint16_t ymm0_out; // the ymmm0 at th end of the opcode int barrier_maybe; flagcache_t f_exit; // flags status at end of intruction extcache_t e; // extcache at end of intruction (but before poping) |