diff options
| author | ptitSeb <sebastien.chev@gmail.com> | 2024-06-08 16:51:51 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2024-06-08 16:51:51 +0200 |
| commit | a147c12f728cd50a9b462244334cf5a720a2b435 (patch) | |
| tree | 4da1ad5915eb654e2e51df00dfce3c373bcea0c3 /src | |
| parent | 874828c2ac6ede4302b5f86a3405ba6650a9ebd4 (diff) | |
| download | box64-a147c12f728cd50a9b462244334cf5a720a2b435.tar.gz box64-a147c12f728cd50a9b462244334cf5a720a2b435.zip | |
[ARM64_DYNAREC] Another fix for YMM selection in high register pressure cases
Diffstat (limited to 'src')
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_functions.c | 17 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_helper.c | 42 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_private.h | 2 |
3 files changed, 44 insertions, 17 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c index b05cbcc3..269197b3 100644 --- a/src/dynarec/arm64/dynarec_arm64_functions.c +++ b/src/dynarec/arm64/dynarec_arm64_functions.c @@ -58,7 +58,7 @@ int fpu_get_double_scratch(dynarec_arm_t* dyn, int ninst) void fpu_reset_scratch(dynarec_arm_t* dyn) { dyn->n.fpu_scratch = 0; - dyn->ymm_used = 0; + dyn->n.ymm_used = 0; } // Get a x87 double reg int fpu_get_reg_x87(dynarec_arm_t* dyn, int ninst, int t, int n) @@ -118,14 +118,14 @@ int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm) } int internal_mark_ymm(dynarec_arm_t* dyn, int t, int ymm, int reg) { - if(dyn->n.neoncache[reg].t==NEON_CACHE_YMMR || dyn->n.neoncache[reg].t==NEON_CACHE_YMMW) { + if((dyn->n.neoncache[reg].t==NEON_CACHE_YMMR) || (dyn->n.neoncache[reg].t==NEON_CACHE_YMMW)) { if(dyn->n.neoncache[reg].n == ymm) { // already there! if(t==NEON_CACHE_YMMW) dyn->n.neoncache[reg].t=t; return reg; } - } else { + } else if(!dyn->n.neoncache[reg].v) { // found a slot! 
dyn->n.neoncache[reg].t=t; dyn->n.neoncache[reg].n=ymm; @@ -135,11 +135,13 @@ int internal_mark_ymm(dynarec_arm_t* dyn, int t, int ymm, int reg) } int is_ymm_to_keep(dynarec_arm_t* dyn, int reg, int k1, int k2, int k3) { - if(k1!=-1 && dyn->n.neoncache[reg].n==k1) + if((k1!=-1) && (dyn->n.neoncache[reg].n==k1)) + return 1; + if((k2!=-1) && (dyn->n.neoncache[reg].n==k2)) return 1; - if(k2!=-1 && dyn->n.neoncache[reg].n==k2) + if((k3!=-1) && (dyn->n.neoncache[reg].n==k3)) return 1; - if(k3!=-1 && dyn->n.neoncache[reg].n==k3) + if((dyn->n.neoncache[reg].t==NEON_CACHE_YMMR || dyn->n.neoncache[reg].t==NEON_CACHE_YMMW) && (dyn->n.ymm_used&(1<<dyn->n.neoncache[reg].n))) return 1; return 0; } @@ -537,6 +539,7 @@ void neoncacheUnwind(neoncache_t* cache) break; case NEON_CACHE_YMMR: case NEON_CACHE_YMMW: + cache->fpuused[i] = 0; // YMM does not mark the fpu reg as used break; case NEON_CACHE_ST_F: case NEON_CACHE_ST_D: @@ -672,6 +675,8 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r } dynarec_log(LOG_NONE, ")%s", (box64_dynarec_dump>1)?"\e[32m":""); } + if(dyn->insts[ninst].n.ymm_used) + dynarec_log(LOG_NONE, " ymmUsed=%04x", dyn->insts[ninst].n.ymm_used); if(dyn->ymm_zero || dyn->insts[ninst].ymm0_add || dyn->insts[ninst].ymm0_sub) dynarec_log(LOG_NONE, " ymm0=%04x(+%04x-%04x)", dyn->ymm_zero, dyn->insts[ninst].ymm0_add ,dyn->insts[ninst].ymm0_sub); if(dyn->insts[ninst].purge_ymm) diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c index 528035de..4eedadd1 100644 --- a/src/dynarec/arm64/dynarec_arm64_helper.c +++ b/src/dynarec/arm64/dynarec_arm64_helper.c @@ -1834,7 +1834,7 @@ void ymm_mark_zero(dynarec_arm_t* dyn, int ninst, int a) // look if already exist for(int i=0; i<32; ++i) if((dyn->n.neoncache[i].t==NEON_CACHE_YMMR || dyn->n.neoncache[i].t==NEON_CACHE_YMMW) && dyn->n.neoncache[i].n==a) { - if(dyn->ymm_used&(1<<a)) { + if(dyn->n.ymm_used&(1<<a)) { // special case, the reg was just added 
in the opcode and cannot be marked as 0, so just RAZ it now dyn->n.neoncache[i].t = NEON_CACHE_YMMW; VEORQ(i, i, i); @@ -2559,8 +2559,8 @@ void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, uint16_t mask, int s1) int fpu_get_reg_ymm(dynarec_arm_t* dyn, int ninst, int t, int ymm, int k1, int k2, int k3) { int i = -1; - dyn->ymm_used|=(1<<ymm); - #if STEP >1 + dyn->n.ymm_used|=(1<<ymm); + #if STEP > 1 // check the cached neoncache, it should be exact // look for it for(int ii=0; ii<32 && i==-1; ++ii) @@ -2599,28 +2599,50 @@ int fpu_get_reg_ymm(dynarec_arm_t* dyn, int ninst, int t, int ymm, int k1, int k } // no free slot, needs to purge a value... First loop on the YMMR, they are easier to purge i = EMM0; - int keep = 0; for(int j=0; j<8; ++j) { if(!dyn->n.fpuused[i+j] && !(dyn->mmx87&(1<<j))) { // should a test be done to check if ymm is already in the purge list? - if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3) && dyn->n.neoncache[i+j].t==NEON_CACHE_YMMR) { + if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3) && (dyn->n.neoncache[i+j].t==NEON_CACHE_YMMR)) { dyn->n.neoncache[i+j].v = 0; - return internal_mark_ymm(dyn, t, ymm, i+j); + int ret = internal_mark_ymm(dyn, t, ymm, i+j); + if(ret>=0) return ret; } } } - // make space in the scratch area i = SCRATCH0; + for(int j=dyn->n.fpu_scratch; j<8; ++j) + if(!(dyn->scratchs&(1<<j))) { + if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3) && (dyn->n.neoncache[i+j].t==NEON_CACHE_YMMR)) { + dyn->n.neoncache[i+j].v = 0; + int ret = internal_mark_ymm(dyn, t, ymm, i+j); + if(ret>=0) return ret; + } + } + // make space in the scratch area for(int j=dyn->n.fpu_scratch; j<8; ++j) { // should a test be done to check if ymm is already in the purge list? 
- if(!(dyn->scratchs&(1<<j)) &&!is_ymm_to_keep(dyn, i+j, k1, k2, k3)) { + if(!(dyn->scratchs&(1<<j)) && !is_ymm_to_keep(dyn, i+j, k1, k2, k3)) { // Save the reg and recycle it VSTR128_U12(i+j, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i+j].n])); dyn->n.neoncache[i+j].v = 0; - return internal_mark_ymm(dyn, t, ymm, i+j); + int ret = internal_mark_ymm(dyn, t, ymm, i+j); + if(ret>=0) return ret; } } + // last resort, go back in the EMM area... + i = EMM0; + for(int j=7; j>=0; --j) { + if(!dyn->n.fpuused[i+j] && !(dyn->mmx87&(1<<j))) { + // should a test be done to check if ymm is already in the purge list? + if((dyn->n.neoncache[i+j].t==NEON_CACHE_YMMW) && !is_ymm_to_keep(dyn, i+j, k1, k2, k3)) { + VSTR128_U12(i+j, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i+j].n])); + dyn->n.neoncache[i+j].v = 0; + int ret = internal_mark_ymm(dyn, t, ymm, i+j); + if(ret>=0) return ret; + } + } + } #endif - printf_log(LOG_NONE, "BOX64 Dynarec: Error, unable to free a reg for YMM %d at inst=%d\n", ymm, ninst); + printf_log(LOG_NONE, "BOX64 Dynarec: Error, unable to free a reg for YMM %d at inst=%d on pass %d\n", ymm, ninst, STEP); return i; } \ No newline at end of file diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h index 07b66a39..db205a3b 100644 --- a/src/dynarec/arm64/dynarec_arm64_private.h +++ b/src/dynarec/arm64/dynarec_arm64_private.h @@ -58,6 +58,7 @@ typedef struct neoncache_s { int8_t mmxcount; // number of mmx register used (not both mmx and x87 at the same time) int8_t fpu_scratch; // scratch counter int8_t fpu_reg; // x87/sse/mmx reg counter + uint16_t ymm_used; // mask of the ymm regs used in this opcode } neoncache_t; typedef struct flagcache_s { @@ -126,7 +127,6 @@ typedef struct dynarec_arm_s { int32_t forward_size; // size at the forward point int forward_ninst; // ninst at the forward point uint16_t ymm_zero; // bitmap of ymm to zero at purge - uint16_t ymm_used; // mask of the ymm regs used in this opcode 
uint8_t smwrite; // for strongmem model emulation uint8_t smread; uint8_t doublepush; |