From 5916869329fc3bf00885be33be98f9cfbe6073de Mon Sep 17 00:00:00 2001
From: ptitSeb
Date: Thu, 22 May 2025 20:07:23 +0200
Subject: [ARM64_DYNAREC] Try to avoid Load/Unload of XMM/YMM regs when possible on intrablock jump

---
 src/dynarec/arm64/dynarec_arm64_functions.c | 52 +++++++++++++++++-
 src/dynarec/arm64/dynarec_arm64_functions.h |  4 ++
 src/dynarec/arm64/dynarec_arm64_helper.c    | 83 +++++++++++++++++++++--------
 src/dynarec/arm64/dynarec_arm64_private.h   |  3 ++
 src/dynarec/dynarec_arch.h                  |  4 +-
 src/dynarec/dynarec_native.c                |  1 +
 6 files changed, 121 insertions(+), 26 deletions(-)

diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index 8a0140ba..38cc2355 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -73,6 +73,9 @@ void fpu_reset_scratch(dynarec_arm_t* dyn)
     dyn->n.ymm_write = 0;
     dyn->n.ymm_removed = 0;
     dyn->n.xmm_write = 0;
+    dyn->n.xmm_used = 0;
+    dyn->n.xmm_unneeded = 0;
+    dyn->n.ymm_unneeded = 0;
     dyn->n.xmm_removed = 0;
 }
 // Get a x87 double reg
@@ -846,8 +849,11 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
         }
         length += sprintf(buf + length, ")%s", (dyn->need_dump > 1) ? "\e[0;32m" : "");
     }
-    if (dyn->insts[ninst].n.ymm_used) {
-        length += sprintf(buf + length, " ymmUsed=%04x", dyn->insts[ninst].n.ymm_used);
+    if (dyn->insts[ninst].n.xmm_used || dyn->insts[ninst].n.xmm_unneeded) {
+        length += sprintf(buf + length, " xmmUsed=%04x/unneeded=%04x", dyn->insts[ninst].n.xmm_used, dyn->insts[ninst].n.xmm_unneeded);
+    }
+    if (dyn->insts[ninst].n.ymm_used || dyn->insts[ninst].n.ymm_unneeded) {
+        length += sprintf(buf + length, " ymmUsed=%04x/unneeded=%04x", dyn->insts[ninst].n.ymm_used, dyn->insts[ninst].n.ymm_unneeded);
     }
     if (dyn->ymm_zero || dyn->insts[ninst].ymm0_add || dyn->insts[ninst].ymm0_sub || dyn->insts[ninst].ymm0_out) {
         length += sprintf(buf + length, " ymm0=(%04x/%04x+%04x-%04x=%04x)", dyn->ymm_zero, dyn->insts[ninst].ymm0_in, dyn->insts[ninst].ymm0_add, dyn->insts[ninst].ymm0_sub, dyn->insts[ninst].ymm0_out);
@@ -1181,3 +1187,45 @@ void fpu_unwind_restore(dynarec_arm_t* dyn, int ninst, neoncache_t* cache)
 {
     memcpy(&dyn->insts[ninst].n, cache, sizeof(neoncache_t));
 }
+
+static void propagateXMMUneeded(dynarec_arm_t* dyn, int ninst, int a)
+{
+    if(!ninst) return;
+    ninst = getNominalPred(dyn, ninst);
+    while(ninst>=0) {
+        if(dyn->insts[ninst].n.xmm_used&(1<<a)) return; // used, value is needed
+        if(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT) return; // barrier, value is needed
+        if(dyn->insts[ninst].n.xmm_unneeded&(1<<a)) return; // already marked
+        if(dyn->insts[ninst].x64.jmp) return; // stop when a jump is detected, that gets too complicated
+        dyn->insts[ninst].n.xmm_unneeded |= (1<<a);
+        ninst = getNominalPred(dyn, ninst);
+    }
+}
+
+static void propagateYMMUneeded(dynarec_arm_t* dyn, int ninst, int a)
+{
+    if(!ninst) return;
+    ninst = getNominalPred(dyn, ninst);
+    while(ninst>=0) {
+        if(dyn->insts[ninst].n.ymm_used&(1<<a)) return; // used, value is needed
+        if(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT) return; // barrier, value is needed
+        if(dyn->insts[ninst].n.ymm_unneeded&(1<<a)) return; // already marked
+        if(dyn->insts[ninst].x64.jmp) return; // stop when a jump is detected, that gets too complicated
+        dyn->insts[ninst].n.ymm_unneeded |= (1<<a);
+        ninst = getNominalPred(dyn, ninst);
+    }
+}
+
+void updateUneeded(dynarec_arm_t* dyn)
+{
+    for(int ninst=0; ninst<dyn->size; ++ninst) {
+        if(dyn->insts[ninst].n.xmm_unneeded)
+            for(int i=0; i<16; ++i)
+                if(dyn->insts[ninst].n.xmm_unneeded&(1<<i))
+                    propagateXMMUneeded(dyn, ninst, i);
+        if(dyn->insts[ninst].n.ymm_unneeded)
+            for(int i=0; i<16; ++i)
+                if(dyn->insts[ninst].n.ymm_unneeded&(1<<i))
+                    propagateYMMUneeded(dyn, ninst, i);
+    }
+}
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h
--- a/src/dynarec/arm64/dynarec_arm64_functions.h
+++ b/src/dynarec/arm64/dynarec_arm64_functions.h
@@ ... @@
+void updateUneeded(dynarec_arm_t* dyn);
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ ... @@ int sse_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite)
+    dyn->n.xmm_used |= 1<<a;
     if(dyn->n.ssecache[a].v!=-1) {
         if(forwrite) {
             dyn->n.ssecache[a].write = 1;    // update only if forwrite
@@ -1739,11 +1740,13 @@ int sse_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite)
 // get neon register for a SSE reg, but don't try to synch it if it needed to be created
 int sse_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a)
 {
+    dyn->n.xmm_used |= 1<<a;
     if(dyn->n.ssecache[a].v!=-1) {
         dyn->n.ssecache[a].write = 1;
         dyn->n.neoncache[dyn->n.ssecache[a].reg].t = NEON_CACHE_XMMW;
         return dyn->n.ssecache[a].reg;
     }
+    dyn->n.xmm_unneeded |= 1<<a;
     dyn->n.ssecache[a].reg = fpu_get_reg_xmm(dyn, NEON_CACHE_XMMW, a);
     dyn->n.ssecache[a].write = 1; // it will be write...
     return dyn->n.ssecache[a].reg;
@@ -1751,6 +1754,7 @@ int sse_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a)
 // forget neon register for a SSE reg, create the entry if needed
 void sse_forget_reg(dynarec_arm_t* dyn, int ninst, int a)
 {
+    dyn->n.xmm_used |= 1<<a;
     if(dyn->n.ssecache[a].v==-1)
         return;
     if(dyn->n.neoncache[dyn->n.ssecache[a].reg].t == NEON_CACHE_XMMW) {
@@ -1786,6 +1790,7 @@ void sse_purge07cache(dynarec_arm_t* dyn, int ninst, int s1)
                 MESSAGE(LOG_DUMP, "\tPurge XMM0..7 Cache ------\n");
                 ++old;
             }
+            dyn->n.xmm_used |= (1<<i);
             if(dyn->n.neoncache[dyn->n.ssecache[i].reg].t == NEON_CACHE_XMMW) {
                 VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             }
@@ -1803,6 +1808,7 @@ static void sse_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1)
     int old = -1;
     for (int i=0; i<16; ++i)
         if(dyn->n.ssecache[i].v!=-1) {
+            if(next) dyn->n.xmm_used |= (1<<i);
            if(dyn->n.ssecache[i].write) {
                 if (old==-1) {
                     MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":"");
@@ -1843,6 +1849,8 @@ static void sse_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1)
         }
         if(!next && (dyn->n.neoncache[i].t==NEON_CACHE_YMMW || dyn->n.neoncache[i].t==NEON_CACHE_YMMR))
             dyn->n.neoncache[i].v = 0;
+        if(next && (dyn->n.neoncache[i].t==NEON_CACHE_YMMW || dyn->n.neoncache[i].t==NEON_CACHE_YMMR))
+            dyn->n.xmm_used |= (1<<dyn->n.neoncache[i].n);
     }
     // All done
     if(old!=-1) {
@@ -1852,10 +1860,14 @@ static void sse_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1)
 
 static void sse_reflectcache(dynarec_arm_t* dyn, int ninst, int s1)
 {
-    for (int i=0; i<16; ++i)
-        if(dyn->n.ssecache[i].v!=-1 && dyn->n.ssecache[i].write) {
-            VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+    for (int i=0; i<16; ++i) {
+        if(dyn->n.ssecache[i].v!=-1) {
+            dyn->n.xmm_used |= 1<<i;
+            if(dyn->n.ssecache[i].write) {
+                VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+            }
         }
+    }
     //AVX
     if(dyn->ymm_zero) {
         int s1_set = 0;
@@ -1868,13 +1880,18 @@ static void sse_reflectcache(dynarec_arm_t* dyn, int ninst, int s1)
             STPx_S7_offset(xZR, xZR, s1, i*16);
         }
     }
-    for(int i=0; i<32; ++i)
+    for(int i=0; i<32; ++i) {
         if(dyn->n.neoncache[i].t == NEON_CACHE_YMMW)
             VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n]));
+        if((dyn->n.neoncache[i].t == NEON_CACHE_YMMW) || (dyn->n.neoncache[i].t == NEON_CACHE_YMMR))
+            dyn->n.xmm_used |= 1<<dyn->n.neoncache[i].n;
+    }
 }
 
 void sse_reflect_reg(dynarec_arm_t* dyn, int ninst, int a)
 {
+    dyn->n.xmm_used |= 1<<a;
     if(dyn->n.ssecache[a].v==-1)
         return;
@@ ... @@
+    dyn->n.ymm_used |= 1<<a;
+    dyn->n.ymm_unneeded |= 1<<a;
     if(dyn->ymm_zero&(1<<a)) {
         dyn->ymm_zero&=~(1<<a);
         dyn->insts[ninst].ymm0_add |= (1<<a);
@@ ... @@
+    dyn->n.ymm_unneeded |= 1<<a;
@@ ... @@
     cache->neoncache[j].v = tmp.v;
 }
 
-static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, neoncache_t *cache, int i, int t, int n)
+static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, neoncache_t *cache, int i, int t, int n, int i2)
 {
     if(cache->neoncache[i].v) {
         int quad = 0;
@@ -2145,13 +2164,21 @@ static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int
     switch(t) {
         case NEON_CACHE_XMMR:
         case NEON_CACHE_XMMW:
-            MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
-            VLDR128_U12(i, xEmu, offsetof(x64emu_t, xmm[n]));
+            if(dyn->insts[i2].n.xmm_unneeded&(1<<n)) {
+                MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n));
+            } else {
+                MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
+                VLDR128_U12(i, xEmu, offsetof(x64emu_t, xmm[n]));
+            }
             break;
         case NEON_CACHE_YMMR:
        case NEON_CACHE_YMMW:
-            MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
-            VLDR128_U12(i, xEmu, offsetof(x64emu_t, ymm[n]));
+            if(dyn->insts[i2].n.ymm_unneeded&(1<<n)) {
+                MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n));
+            } else {
+                MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
+                VLDR128_U12(i, xEmu, offsetof(x64emu_t, ymm[n]));
+            }
             break;
     }
     cache->neoncache[i].t = t;
@@ ... @@
     cache->neoncache[i].t = t;
 }
 
-static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, neoncache_t *cache, int i, int t, int n)
+static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, neoncache_t *cache, int i, int t, int n, int i2)
 {
     switch(t) {
         case NEON_CACHE_XMMR:
@@ -2205,12 +2232,20 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in
             MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n));
             break;
         case NEON_CACHE_XMMW:
-            MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
-            VSTR128_U12(i, xEmu, offsetof(x64emu_t, xmm[n]));
+            if(dyn->insts[i2].n.xmm_unneeded&(1<<n)) {
+                MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n));
+            } else {
+                MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
+                VSTR128_U12(i, xEmu, offsetof(x64emu_t, xmm[n]));
+            }
             break;
         case NEON_CACHE_YMMW:
-            MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
-            VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[n]));
+            if(dyn->insts[i2].n.ymm_unneeded&(1<<n)) {
+                MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n));
+            } else {
+                MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
+                VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[n]));
+            }
             break;
         case NEON_CACHE_MM:
             MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
@@ ... @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
         MESSAGE(LOG_DUMP, "\t - Purge ymm0 %04x/%04x\n", dyn->ymm_zero, (dyn->insts[i2].purge_ymm|to_purge));
         for(int i=0; i<16; ++i)
-            if(is_avx_zero(dyn, ninst, i) && (dyn->insts[i2].purge_ymm|to_purge)&(1<<i))
+            if(is_avx_zero(dyn, ninst, i) && (dyn->insts[i2].purge_ymm|to_purge)&(1<<i) && !(dyn->insts[i2].n.ymm_unneeded&(1<<i)))
@@ ... @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
     for(int i=0; i<16; ++i) {
         int j=findCacheSlot(dyn, ninst, NEON_CACHE_XMMW, i, &cache);
         if(j>=0 && findCacheSlot(dyn, ninst, NEON_CACHE_XMMW, i, &cache_i2)==-1)
-            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.neoncache[j].t, cache.neoncache[j].n);
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.neoncache[j].t, cache.neoncache[j].n, i2);
     }
     for(int i=0; i<16; ++i) {
         int j=findCacheSlot(dyn, ninst, NEON_CACHE_YMMW, i, &cache);
         if(j>=0 && findCacheSlot(dyn, ninst, NEON_CACHE_YMMW, i, &cache_i2)==-1)
-            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.neoncache[j].t, cache.neoncache[j].n);
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.neoncache[j].t, cache.neoncache[j].n, i2);
     }
     for(int i=0; i<8; ++i) {
         int j=findCacheSlot(dyn, ninst, NEON_CACHE_MM, i, &cache);
         if(j>=0 && findCacheSlot(dyn, ninst, NEON_CACHE_MM, i, &cache_i2)==-1)
-            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.neoncache[j].t, cache.neoncache[j].n);
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.neoncache[j].t, cache.neoncache[j].n, i2);
     }
     for(int i=0; i<32; ++i) {
         if(cache.neoncache[i].v)
             if(findCacheSlot(dyn, ninst, cache.neoncache[i].t, cache.neoncache[i].n, &cache_i2)==-1)
-                unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache.neoncache[i].t, cache.neoncache[i].n);
+                unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache.neoncache[i].t, cache.neoncache[i].n, i2);
     }
     // and now load/swap the missing one
     for(int i=0; i<32; ++i) {
@@ -2336,7 +2373,7 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
         if(cache_i2.neoncache[i].v != cache.neoncache[i].v) {
             int j;
             if((j=findCacheSlot(dyn, ninst, cache_i2.neoncache[i].t, cache_i2.neoncache[i].n, &cache))==-1)
-                loadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache_i2.neoncache[i].t, cache_i2.neoncache[i].n);
+                loadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache_i2.neoncache[i].t, cache_i2.neoncache[i].n, i2);
             else {
                 // it's here, lets swap if needed
                 if(j!=i)
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index 60c24aa2..af6d5bcb 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -75,9 +75,12 @@ typedef struct neoncache_s {
     int8_t              fpu_reg;        // x87/sse/mmx reg counter
     uint16_t            xmm_write;      // 1bit of xmmXX removed write
     uint16_t            xmm_removed;    // 1bit if xmmXX was removed
+    uint16_t            xmm_used;       // mask of the xmm regs used in this opcode
     uint16_t            ymm_used;       // mask of the ymm regs used in this opcode
     uint16_t            ymm_write;      // 1bit of ymmXX removed write
     uint16_t            ymm_removed;    // 1bit if ymmXX was removed
+    uint16_t            xmm_unneeded;   // 1bit for xmmXX where value is not needed
+    uint16_t            ymm_unneeded;   // 1bit for ymmXX where value is not needed
     uint64_t            ymm_regs;       // 4bits (0-15) position of 16 ymmXX regs removed
 } neoncache_t;
 
diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h
index e3afb60a..152d69f7 100644
--- a/src/dynarec/dynarec_arch.h
+++ b/src/dynarec/dynarec_arch.h
@@ -26,7 +26,7 @@
 #define RAZ_SPECIFIC(A, N)      rasNativeState(A, N)
 #define UPDATE_SPECIFICS(A)     updateNativeFlags(A)
 #define PREUPDATE_SPECIFICS(A)
-
+#define POSTUPDATE_SPECIFICS(A) updateUneeded(A)
 #define ARCH_SIZE(A)            get_size_arch(A)
 #define ARCH_FILL(A, B, C)      populate_arch(A, B, C)
 #define ARCH_ADJUST(A, B, C, D) adjust_arch(A, B, C, D)
@@ -58,6 +58,7 @@ extern uint32_t arm64_crc(void* p, uint32_t len);
 #define RAZ_SPECIFIC(A, N)
 #define UPDATE_SPECIFICS(A)
 #define PREUPDATE_SPECIFICS(A)  updateNativeFlags(A)
+#define POSTUPDATE_SPECIFICS(A)
 
 #define ARCH_SIZE(A)            0
 #define ARCH_FILL(A, B, C)      NULL
@@ -91,6 +92,7 @@ extern uint32_t arm64_crc(void* p, uint32_t len);
 #define RAZ_SPECIFIC(A, N)
 #define UPDATE_SPECIFICS(A)
 #define PREUPDATE_SPECIFICS(A)  updateNativeFlags(A)
+#define POSTUPDATE_SPECIFICS(A)
 
 #define ARCH_SIZE(A)            get_size_arch(A)
 #define ARCH_FILL(A, B, C)      populate_arch(A, B, C)
diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index 9ea02962..b1117953 100644
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -775,6 +775,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
         if(helper.need_x87check==1)
             helper.need_x87check = 0;
     }
+    POSTUPDATE_SPECIFICS(&helper);
     // pass 2, instruction size
     helper.callrets = static_callrets;
     native_pass2(&helper, addr, alternate, is32bits, inst_max);
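
A note on the approach, with a small illustration. The patch gives every opcode two masks per register class: xmm_used/ymm_used for registers the opcode touches, and xmm_unneeded/ymm_unneeded for registers whose previous value the opcode recreates from scratch. After pass 1, updateUneeded() walks backward through nominal predecessors so earlier opcodes on the same straight-line path learn the value is dead, and loadCache()/unloadCache() can then skip the VLDR128/VSTR128 at an intrablock jump when the destination marks the register unneeded. The following is a minimal, self-contained sketch of that backward propagation, not box64 code: inst_t and propagate_unneeded are hypothetical stand-ins for the dynarec structures, and a plain linear walk replaces the real getNominalPred() traversal over dyn->insts.

// Standalone sketch of the backward "unneeded" propagation, under the
// simplifying assumption of a purely linear predecessor chain.
// inst_t is a hypothetical stand-in, not box64's instruction struct.
#include <stdint.h>
#include <stdio.h>

#define BARRIER_FLOAT 1

typedef struct {
    uint16_t xmm_used;     // bit n set: this opcode reads/writes xmm[n]
    uint16_t xmm_unneeded; // bit n set: xmm[n]'s value is dead here
    int      barrier;      // BARRIER_FLOAT forces regs to be in sync
    int      jmp;          // non-zero if this opcode can jump
} inst_t;

// Walk backward from ninst, marking xmm[a] unneeded on each predecessor,
// and stop as soon as something requires the value to exist: a use, a
// float barrier, an already-marked opcode, or a jump (same rules as the
// patch's propagateXMMUneeded).
static void propagate_unneeded(inst_t* insts, int ninst, int a)
{
    for (int i = ninst - 1; i >= 0; --i) {
        if (insts[i].xmm_used & (1u << a)) return;     // value is consumed
        if (insts[i].barrier & BARRIER_FLOAT) return;  // must be in memory
        if (insts[i].xmm_unneeded & (1u << a)) return; // already propagated
        if (insts[i].jmp) return;                      // CFG too complex
        insts[i].xmm_unneeded |= (1u << a);
    }
}

int main(void)
{
    // xmm2 is overwritten from scratch at opcode 3 without being read in
    // opcodes 1..2, so those opcodes need not spill/load it.
    inst_t insts[4] = {0};
    insts[0].xmm_used = 1u << 2;     // last real use of xmm2
    insts[3].xmm_unneeded = 1u << 2; // opcode 3 recreates xmm2 entirely
    propagate_unneeded(insts, 3, 2);
    for (int i = 0; i < 4; ++i)
        printf("inst %d: unneeded=%04x\n", i, insts[i].xmm_unneeded);
    return 0;
}

Run on this example, opcodes 1 and 2 end up with unneeded=0004 while opcode 0 keeps 0000 because it still uses xmm2, which is exactly why the walk stops at uses. Stopping at BARRIER_FLOAT and at any jump keeps the analysis deliberately conservative: a barrier requires the emulated register file to be in sync, and following arbitrary control flow would need a full dataflow fixpoint, which the patch's own comment ("stop when a jump is detected, that gets too complicated") explicitly avoids.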