diff options
Diffstat (limited to 'src/dynarec')
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_functions.c | 17 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_functions.h | 1 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_helper.c | 38 | ||||
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_pass1.h | 2 | ||||
| -rw-r--r-- | src/dynarec/dynarec_arch.h | 3 | ||||
| -rw-r--r-- | src/dynarec/dynarec_native_pass.c | 2 |
6 files changed, 39 insertions, 24 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c index ea60745c..afb1ed6b 100644 --- a/src/dynarec/arm64/dynarec_arm64_functions.c +++ b/src/dynarec/arm64/dynarec_arm64_functions.c @@ -168,6 +168,10 @@ static void fpu_reset_reg_neoncache(neoncache_t* n) n->fpuused[i]=0; n->neoncache[i].v = 0; } + n->ymm_regs = 0; + n->ymm_removed = 0; + n->ymm_used = 0; + n->ymm_write = 0; } void fpu_reset_reg(dynarec_arm_t* dyn) @@ -767,7 +771,7 @@ static void sse_reset(neoncache_t* n) n->neoncache[i].v = 0; } -void fpu_reset(dynarec_arm_t* dyn) +void fpu_reset(dynarec_native_t* dyn) { x87_reset(&dyn->n); mmx_reset(&dyn->n); @@ -776,12 +780,21 @@ void fpu_reset(dynarec_arm_t* dyn) dyn->ymm_zero = 0; } -void fpu_reset_ninst(dynarec_arm_t* dyn, int ninst) +void fpu_reset_ninst(dynarec_native_t* dyn, int ninst) { x87_reset(&dyn->insts[ninst].n); mmx_reset(&dyn->insts[ninst].n); sse_reset(&dyn->insts[ninst].n); fpu_reset_reg_neoncache(&dyn->insts[ninst].n); + +} + +void arm64_fpu_reset(dynarec_native_t* dyn, int ninst, int step) +{ + if(step<2) { + dyn->insts[ninst].ymm0_in = 0; + dyn->insts[ninst].ymm0_out = 0; + } } int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st) diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h index b6c95904..0af490e4 100644 --- a/src/dynarec/arm64/dynarec_arm64_functions.h +++ b/src/dynarec/arm64/dynarec_arm64_functions.h @@ -69,6 +69,7 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode); // reset the cache void fpu_reset(dynarec_native_t* dyn); void fpu_reset_ninst(dynarec_native_t* dyn, int ninst); +void arm64_fpu_reset(dynarec_native_t* dyn, int ninst, int step); // is st freed int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st); diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c index 32de5146..136c0f8c 100644 --- a/src/dynarec/arm64/dynarec_arm64_helper.c +++ b/src/dynarec/arm64/dynarec_arm64_helper.c @@ -743,7 +743,6 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save if(saveflags) { STRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags)); } - fpu_pushcache(dyn, ninst, reg, 0); if(ret!=-2) { STPx_S7_preindex(xEmu, savereg, xSP, -16); // ARM64 stack needs to be 16byte aligned STPx_S7_offset(xRAX, xRCX, xEmu, offsetof(x64emu_t, regs[_AX])); // x9..x15, x16,x17,x18 those needs to be saved by caller @@ -751,6 +750,7 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save STPx_S7_offset(xRSP, xRBP, xEmu, offsetof(x64emu_t, regs[_SP])); STPx_S7_offset(xRSI, xRDI, xEmu, offsetof(x64emu_t, regs[_SI])); STPx_S7_offset(xR8, xR9, xEmu, offsetof(x64emu_t, regs[_R8])); + fpu_pushcache(dyn, ninst, savereg, 0); } TABLE64(reg, (uintptr_t)fnc); BLR(reg); @@ -772,8 +772,8 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save GO(RSI, RDI); GO(R8, R9); #undef GO + fpu_popcache(dyn, ninst, savereg, 0); // savereg will not be used } - fpu_popcache(dyn, ninst, reg, 0); if(saveflags) { LDRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags)); } @@ -1658,7 +1658,7 @@ void sse_forget_reg(dynarec_arm_t* dyn, int ninst, int a) STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a])+8); } } else for(int i=0; i<32; ++i) - if((dyn->n.neoncache[i].t == NEON_CACHE_YMMW) || (dyn->n.neoncache[i].t == NEON_CACHE_YMMR)) { + if((dyn->n.neoncache[i].t == NEON_CACHE_YMMW) || (dyn->n.neoncache[i].t == NEON_CACHE_YMMR) && (dyn->n.neoncache[i].n==a)) { if(dyn->n.neoncache[i].t == NEON_CACHE_YMMW) VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n])); fpu_free_reg(dyn, i); @@ -1853,7 +1853,7 @@ void ymm_mark_zero(dynarec_arm_t* dyn, int ninst, int a) VEORQ(i, i, i); return; } - dyn->n.neoncache[i].v = 0; // forget it! + fpu_free_reg(dyn, i); } #if STEP == 0 dyn->insts[ninst].ymm0_add |= (1<<a); @@ -1869,8 +1869,6 @@ void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07) for (int i=start; i<16; i++) { if((dyn->n.ssecache[i].v!=-1) && (dyn->n.ssecache[i].write)) ++n; - if(is_avx_zero(dyn, ninst, i)) - ++n; } for(int i=0; i<32; ++i) if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW) @@ -1878,23 +1876,16 @@ void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07) if(!n) return; MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n); - int s1_set = 0; for (int i=start; i<16; ++i) { if((dyn->n.ssecache[i].v!=-1) && (dyn->n.ssecache[i].write)) { VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); } - if(is_avx_zero(dyn, ninst, i)) { - if(!s1_set) { - ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0])); - s1_set = 1; - } - STPx_S7_offset(xZR, xZR, s1, i*16); - } } - // purge the YMM values - for(int i=0; i<32; ++i) + // push the YMM values + for(int i=0; i<32; ++i) { if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW) VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n])); + } MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n); } @@ -1906,6 +1897,9 @@ void fpu_popcache(dynarec_arm_t* dyn, int ninst, int s1, int not07) for (int i=start; i<16; i++) if(dyn->n.ssecache[i].v!=-1) ++n; + for(int i=0; i<32; ++i) + if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW || dyn->n.neoncache[i].t==NEON_CACHE_YMMR) + ++n; if(!n) return; MESSAGE(LOG_DUMP, "\tPop XMM Cache (%d)------\n", n); @@ -1916,7 +1910,7 @@ void fpu_popcache(dynarec_arm_t* dyn, int ninst, int s1, int not07) dyn->n.neoncache[dyn->n.ssecache[i].reg].t = NEON_CACHE_XMMR;*/ } for(int i=0; i<32; ++i) - if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW) + if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW || dyn->n.neoncache[i].t==NEON_CACHE_YMMR) VLDR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n])); MESSAGE(LOG_DUMP, "\t------- Pop XMM Cache (%d)\n", n); } @@ -2265,11 +2259,11 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n)); SCVTFDD(i, i); cache.neoncache[i].t = NEON_CACHE_ST_D; - } else if(cache.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW) - { cache.neoncache[i].t = NEON_CACHE_XMMW; } - else if(cache.neoncache[i].t == NEON_CACHE_YMMR && cache_i2.neoncache[i].t == NEON_CACHE_YMMW) - { cache.neoncache[i].t = NEON_CACHE_YMMW; } - else if(cache.neoncache[i].t == NEON_CACHE_XMMW && cache_i2.neoncache[i].t == NEON_CACHE_XMMR) { + } else if(cache.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW) { + cache.neoncache[i].t = NEON_CACHE_XMMW; + } else if(cache.neoncache[i].t == NEON_CACHE_YMMR && cache_i2.neoncache[i].t == NEON_CACHE_YMMW) { + cache.neoncache[i].t = NEON_CACHE_YMMW; + } else if(cache.neoncache[i].t == NEON_CACHE_XMMW && cache_i2.neoncache[i].t == NEON_CACHE_XMMR) { // refresh cache... MESSAGE(LOG_DUMP, "\t - Refreh %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n)); VSTR128_U12(i, xEmu, offsetof(x64emu_t, xmm[cache.neoncache[i].n])); diff --git a/src/dynarec/arm64/dynarec_arm64_pass1.h b/src/dynarec/arm64/dynarec_arm64_pass1.h index 6cf92feb..ab1f5fc4 100644 --- a/src/dynarec/arm64/dynarec_arm64_pass1.h +++ b/src/dynarec/arm64/dynarec_arm64_pass1.h @@ -5,10 +5,12 @@ #define NEW_INST \ dyn->insts[ninst].f_entry = dyn->f; \ dyn->n.combined1 = dyn->n.combined2 = 0;\ + dyn->insts[ninst].ymm0_in = dyn->ymm_zero;\ dyn->n.swapped = 0; dyn->n.barrier = 0 #define INST_EPILOG \ dyn->insts[ninst].n = dyn->n; \ + dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\ dyn->insts[ninst].f_exit = dyn->f #define INST_NAME(name) diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h index f89125a7..351d9fcd 100644 --- a/src/dynarec/dynarec_arch.h +++ b/src/dynarec/dynarec_arch.h @@ -17,6 +17,7 @@ #include "arm64/dynarec_arm64_functions.h" // Limit here is defined by LD litteral, that is 19bits #define MAXBLOCK_SIZE ((1<<19)-200) +#define ARM_FPU_RESET() arm64_fpu_reset(dyn, ninst, STEP) #elif defined(LA64) #define instruction_native_t instruction_la64_t @@ -33,6 +34,7 @@ #include "la64/dynarec_la64_functions.h" // Limit here is unconditionnal jump, that is signed 28bits #define MAXBLOCK_SIZE ((1 << 27) - 200) +#define ARM_FPU_RESET() #elif defined(RV64) #define instruction_native_t instruction_rv64_t @@ -49,6 +51,7 @@ #include "rv64/dynarec_rv64_functions.h" // Limit here is unconditionnal jump, that is signed 21bits #define MAXBLOCK_SIZE ((1<<20)-200) +#define ARM_FPU_RESET() #else #error Unsupported platform #endif diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c index 2ebc89cc..14f80103 100644 --- a/src/dynarec/dynarec_native_pass.c +++ b/src/dynarec/dynarec_native_pass.c @@ -89,12 +89,14 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int dyn->f.dfnone = 0; dyn->f.pending = 0; fpu_reset(dyn); + ARM_FPU_RESET(); } else { fpu_reset_cache(dyn, ninst, reset_n); dyn->f = dyn->insts[reset_n].f_exit; if(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT) { MESSAGE(LOG_DEBUG, "Apply Barrier Float\n"); fpu_reset(dyn); + ARM_FPU_RESET(); } if(dyn->insts[ninst].x64.barrier&BARRIER_FLAGS) { MESSAGE(LOG_DEBUG, "Apply Barrier Flags\n"); |