diff options
| author | ptitSeb <sebastien.chev@gmail.com> | 2024-07-05 18:53:32 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2024-07-05 18:53:32 +0200 |
| commit | 267bffb0ec2b8afdea35cdc7f2d5983f7cd1566b (patch) | |
| tree | 2da1f1246e1773e731b74f4d10926dc6d1103ee4 /src/dynarec/arm64 | |
| parent | 054621622895a6a8b43173be87ddda9637b0897d (diff) | |
| download | box64-267bffb0ec2b8afdea35cdc7f2d5983f7cd1566b.tar.gz box64-267bffb0ec2b8afdea35cdc7f2d5983f7cd1566b.zip | |
[ARM64_DYNAREC] More improvement on YMM handling
Diffstat (limited to 'src/dynarec/arm64')
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_functions.c | 17 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_functions.h | 1 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_helper.c | 38 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_pass1.h | 2 |
4 files changed, 34 insertions, 24 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c index ea60745c..afb1ed6b 100644 --- a/src/dynarec/arm64/dynarec_arm64_functions.c +++ b/src/dynarec/arm64/dynarec_arm64_functions.c @@ -168,6 +168,10 @@ static void fpu_reset_reg_neoncache(neoncache_t* n) n->fpuused[i]=0; n->neoncache[i].v = 0; } + n->ymm_regs = 0; + n->ymm_removed = 0; + n->ymm_used = 0; + n->ymm_write = 0; } void fpu_reset_reg(dynarec_arm_t* dyn) @@ -767,7 +771,7 @@ static void sse_reset(neoncache_t* n) n->neoncache[i].v = 0; } -void fpu_reset(dynarec_arm_t* dyn) +void fpu_reset(dynarec_native_t* dyn) { x87_reset(&dyn->n); mmx_reset(&dyn->n); @@ -776,12 +780,21 @@ void fpu_reset(dynarec_arm_t* dyn) dyn->ymm_zero = 0; } -void fpu_reset_ninst(dynarec_arm_t* dyn, int ninst) +void fpu_reset_ninst(dynarec_native_t* dyn, int ninst) { x87_reset(&dyn->insts[ninst].n); mmx_reset(&dyn->insts[ninst].n); sse_reset(&dyn->insts[ninst].n); fpu_reset_reg_neoncache(&dyn->insts[ninst].n); + +} + +void arm64_fpu_reset(dynarec_native_t* dyn, int ninst, int step) +{ + if(step<2) { + dyn->insts[ninst].ymm0_in = 0; + dyn->insts[ninst].ymm0_out = 0; + } } int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st) diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h index b6c95904..0af490e4 100644 --- a/src/dynarec/arm64/dynarec_arm64_functions.h +++ b/src/dynarec/arm64/dynarec_arm64_functions.h @@ -69,6 +69,7 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode); // reset the cache void fpu_reset(dynarec_native_t* dyn); void fpu_reset_ninst(dynarec_native_t* dyn, int ninst); +void arm64_fpu_reset(dynarec_native_t* dyn, int ninst, int step); // is st freed int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st); diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c index 32de5146..136c0f8c 100644 --- a/src/dynarec/arm64/dynarec_arm64_helper.c +++ 
b/src/dynarec/arm64/dynarec_arm64_helper.c @@ -743,7 +743,6 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save if(saveflags) { STRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags)); } - fpu_pushcache(dyn, ninst, reg, 0); if(ret!=-2) { STPx_S7_preindex(xEmu, savereg, xSP, -16); // ARM64 stack needs to be 16byte aligned STPx_S7_offset(xRAX, xRCX, xEmu, offsetof(x64emu_t, regs[_AX])); // x9..x15, x16,x17,x18 those needs to be saved by caller @@ -751,6 +750,7 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save STPx_S7_offset(xRSP, xRBP, xEmu, offsetof(x64emu_t, regs[_SP])); STPx_S7_offset(xRSI, xRDI, xEmu, offsetof(x64emu_t, regs[_SI])); STPx_S7_offset(xR8, xR9, xEmu, offsetof(x64emu_t, regs[_R8])); + fpu_pushcache(dyn, ninst, savereg, 0); } TABLE64(reg, (uintptr_t)fnc); BLR(reg); @@ -772,8 +772,8 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save GO(RSI, RDI); GO(R8, R9); #undef GO + fpu_popcache(dyn, ninst, savereg, 0); // savereg will not be used } - fpu_popcache(dyn, ninst, reg, 0); if(saveflags) { LDRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags)); } @@ -1658,7 +1658,7 @@ void sse_forget_reg(dynarec_arm_t* dyn, int ninst, int a) STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a])+8); } } else for(int i=0; i<32; ++i) - if((dyn->n.neoncache[i].t == NEON_CACHE_YMMW) || (dyn->n.neoncache[i].t == NEON_CACHE_YMMR)) { + if((dyn->n.neoncache[i].t == NEON_CACHE_YMMW) || (dyn->n.neoncache[i].t == NEON_CACHE_YMMR) && (dyn->n.neoncache[i].n==a)) { if(dyn->n.neoncache[i].t == NEON_CACHE_YMMW) VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n])); fpu_free_reg(dyn, i); @@ -1853,7 +1853,7 @@ void ymm_mark_zero(dynarec_arm_t* dyn, int ninst, int a) VEORQ(i, i, i); return; } - dyn->n.neoncache[i].v = 0; // forget it! 
+ fpu_free_reg(dyn, i); } #if STEP == 0 dyn->insts[ninst].ymm0_add |= (1<<a); @@ -1869,8 +1869,6 @@ void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07) for (int i=start; i<16; i++) { if((dyn->n.ssecache[i].v!=-1) && (dyn->n.ssecache[i].write)) ++n; - if(is_avx_zero(dyn, ninst, i)) - ++n; } for(int i=0; i<32; ++i) if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW) @@ -1878,23 +1876,16 @@ void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07) if(!n) return; MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n); - int s1_set = 0; for (int i=start; i<16; ++i) { if((dyn->n.ssecache[i].v!=-1) && (dyn->n.ssecache[i].write)) { VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); } - if(is_avx_zero(dyn, ninst, i)) { - if(!s1_set) { - ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0])); - s1_set = 1; - } - STPx_S7_offset(xZR, xZR, s1, i*16); - } } - // purge the YMM values - for(int i=0; i<32; ++i) + // push the YMM values + for(int i=0; i<32; ++i) { if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW) VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n])); + } MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n); } @@ -1906,6 +1897,9 @@ void fpu_popcache(dynarec_arm_t* dyn, int ninst, int s1, int not07) for (int i=start; i<16; i++) if(dyn->n.ssecache[i].v!=-1) ++n; + for(int i=0; i<32; ++i) + if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW || dyn->n.neoncache[i].t==NEON_CACHE_YMMR) + ++n; if(!n) return; MESSAGE(LOG_DUMP, "\tPop XMM Cache (%d)------\n", n); @@ -1916,7 +1910,7 @@ void fpu_popcache(dynarec_arm_t* dyn, int ninst, int s1, int not07) dyn->n.neoncache[dyn->n.ssecache[i].reg].t = NEON_CACHE_XMMR;*/ } for(int i=0; i<32; ++i) - if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW) + if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW || dyn->n.neoncache[i].t==NEON_CACHE_YMMR) VLDR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n])); MESSAGE(LOG_DUMP, "\t------- Pop XMM Cache (%d)\n", n); } @@ -2265,11 +2259,11 @@ static void 
fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n)); SCVTFDD(i, i); cache.neoncache[i].t = NEON_CACHE_ST_D; - } else if(cache.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW) - { cache.neoncache[i].t = NEON_CACHE_XMMW; } - else if(cache.neoncache[i].t == NEON_CACHE_YMMR && cache_i2.neoncache[i].t == NEON_CACHE_YMMW) - { cache.neoncache[i].t = NEON_CACHE_YMMW; } - else if(cache.neoncache[i].t == NEON_CACHE_XMMW && cache_i2.neoncache[i].t == NEON_CACHE_XMMR) { + } else if(cache.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW) { + cache.neoncache[i].t = NEON_CACHE_XMMW; + } else if(cache.neoncache[i].t == NEON_CACHE_YMMR && cache_i2.neoncache[i].t == NEON_CACHE_YMMW) { + cache.neoncache[i].t = NEON_CACHE_YMMW; + } else if(cache.neoncache[i].t == NEON_CACHE_XMMW && cache_i2.neoncache[i].t == NEON_CACHE_XMMR) { // refresh cache... MESSAGE(LOG_DUMP, "\t - Refreh %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n)); VSTR128_U12(i, xEmu, offsetof(x64emu_t, xmm[cache.neoncache[i].n])); diff --git a/src/dynarec/arm64/dynarec_arm64_pass1.h b/src/dynarec/arm64/dynarec_arm64_pass1.h index 6cf92feb..ab1f5fc4 100644 --- a/src/dynarec/arm64/dynarec_arm64_pass1.h +++ b/src/dynarec/arm64/dynarec_arm64_pass1.h @@ -5,10 +5,12 @@ #define NEW_INST \ dyn->insts[ninst].f_entry = dyn->f; \ dyn->n.combined1 = dyn->n.combined2 = 0;\ + dyn->insts[ninst].ymm0_in = dyn->ymm_zero;\ dyn->n.swapped = 0; dyn->n.barrier = 0 #define INST_EPILOG \ dyn->insts[ninst].n = dyn->n; \ + dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\ dyn->insts[ninst].f_exit = dyn->f #define INST_NAME(name) |