diff options
| author | Yang Liu <liuyang22@iscas.ac.cn> | 2025-03-28 17:54:24 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-03-28 10:54:24 +0100 |
| commit | cfdeb3685cdfed402a01baa2fb693ef16f5ba7d7 (patch) | |
| tree | ca03cdd86adf7246e548389a1e15b0d4a1f8b01d | |
| parent | fdf01ace3fdf9a4bee9cd4aefdef335d51865d8b (diff) | |
| download | box64-cfdeb3685cdfed402a01baa2fb693ef16f5ba7d7.tar.gz box64-cfdeb3685cdfed402a01baa2fb693ef16f5ba7d7.zip | |
[LA64_DYNAREC] Added preliminary MMX support (#2476)
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_0f.c | 19 | ||||
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_functions.c | 84 | ||||
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_functions.h | 2 | ||||
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_helper.c | 121 | ||||
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_helper.h | 15 | ||||
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_private.h | 4 |
6 files changed, 200 insertions, 45 deletions
diff --git a/src/dynarec/la64/dynarec_la64_0f.c b/src/dynarec/la64/dynarec_la64_0f.c index 688ba7cc..c94ad178 100644 --- a/src/dynarec/la64/dynarec_la64_0f.c +++ b/src/dynarec/la64/dynarec_la64_0f.c @@ -697,6 +697,25 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni } VFMAX_S(v0, v0, v1); break; + case 0x6F: + INST_NAME("MOVQ Gm, Em"); + nextop = F8; + GETG; + if (MODREG) { + v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop & 7); + v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd); + FMOV_D(v0, v1); + } else { + v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd); + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_D(v0, wback, fixedaddress); + } + break; + case 0x77: + INST_NAME("EMMS"); + // empty MMX, FPU now usable + mmx_purgecache(dyn, ninst, 0, x1); + break; #define GO(GETFLAGS, NO, YES, NATNO, NATYES, F, I) \ READFLAGS_FUSION(F, x1, x2, x3, x4, x5); \ diff --git a/src/dynarec/la64/dynarec_la64_functions.c b/src/dynarec/la64/dynarec_la64_functions.c index 0bfe78b7..f0310ddc 100644 --- a/src/dynarec/la64/dynarec_la64_functions.c +++ b/src/dynarec/la64/dynarec_la64_functions.c @@ -52,7 +52,16 @@ void fpu_free_reg(dynarec_la64_t* dyn, int reg) if (dyn->lsx.lsxcache[reg].t != LSX_CACHE_ST_F && dyn->lsx.lsxcache[reg].t != LSX_CACHE_ST_D && dyn->lsx.lsxcache[reg].t != LSX_CACHE_ST_I64) dyn->lsx.lsxcache[reg].v = 0; } - +// Get an MMX double reg +int fpu_get_reg_emm(dynarec_la64_t* dyn, int emm) +{ + int ret = EMM0 + emm; + dyn->lsx.fpuused[ret] = 1; + dyn->lsx.lsxcache[ret].t = LSX_CACHE_MM; + dyn->lsx.lsxcache[ret].n = emm; + dyn->lsx.news |= (1 << (ret)); + return ret; +} // Get an XMM quad reg int fpu_get_reg_xmm(dynarec_la64_t* dyn, int t, int xmm) { @@ -188,11 +197,23 @@ void lsxcacheUnwind(lsxcache_t* cache) } } cache->x87stack -= cache->stack_push; + cache->tags >>= (cache->stack_push * 2); cache->stack -= cache->stack_push; + if (cache->pushed >= cache->stack_push) + cache->pushed -= cache->stack_push; + else + cache->pushed = 0; cache->stack_push = 0; } cache->x87stack += cache->stack_pop; cache->stack_next = cache->stack; + if (cache->stack_pop) { + if (cache->poped >= cache->stack_pop) + cache->poped -= cache->stack_pop; + else + cache->poped = 0; + cache->tags <<= (cache->stack_pop * 2); + } cache->stack_pop = 0; cache->barrier = 0; // And now, rebuild the x87cache info with lsxcache @@ -407,52 +428,52 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r // will go badly if address is unaligned static uint8_t extract_byte(uint32_t val, void* address) { - int idx = (((uintptr_t)address)&3)*8; - return (val>>idx)&0xff; + int idx = (((uintptr_t)address) & 3) * 8; + return (val >> idx) & 0xff; } static uint32_t insert_byte(uint32_t val, uint8_t b, void* address) { - int idx = (((uintptr_t)address)&3)*8; - val&=~(0xff<<idx); - val|=(((uint32_t)b)<<idx); + int idx = (((uintptr_t)address) & 3) * 8; + val &= ~(0xff << idx); + val |= (((uint32_t)b) << idx); return val; } static uint16_t extract_half(uint32_t val, void* address) { - int idx = (((uintptr_t)address)&3)*8; - return (val>>idx)&0xffff; + int idx = (((uintptr_t)address) & 3) * 8; + return (val >> idx) & 0xffff; } static uint32_t insert_half(uint32_t val, uint16_t h, void* address) { - int idx = (((uintptr_t)address)&3)*8; - val&=~(0xffff<<idx); - val|=(((uint32_t)h)<<idx); + int idx = (((uintptr_t)address) & 3) * 8; + val &= ~(0xffff << idx); + val |= (((uint32_t)h) << idx); return val; } uint8_t la64_lock_xchg_b_slow(void* addr, uint8_t val) { uint32_t ret; - uint32_t* aligned = (uint32_t*)(((uintptr_t)addr)&~3); + uint32_t* aligned = (uint32_t*)(((uintptr_t)addr) & ~3); do { ret = *aligned; - } while(la64_lock_cas_d(aligned, ret, insert_byte(ret, val, addr))); + } while (la64_lock_cas_d(aligned, ret, insert_byte(ret, val, addr))); return extract_byte(ret, addr); } int la64_lock_cas_b_slow(void* addr, uint8_t ref, uint8_t val) { - uint32_t* aligned = (uint32_t*)(((uintptr_t)addr)&~3); + uint32_t* aligned = (uint32_t*)(((uintptr_t)addr) & ~3); uint32_t tmp = *aligned; return la64_lock_cas_d(aligned, ref, insert_byte(tmp, val, addr)); } int la64_lock_cas_h_slow(void* addr, uint16_t ref, uint16_t val) { - uint32_t* aligned = (uint32_t*)(((uintptr_t)addr)&~3); + uint32_t* aligned = (uint32_t*)(((uintptr_t)addr) & ~3); uint32_t tmp = *aligned; return la64_lock_cas_d(aligned, ref, insert_half(tmp, val, addr)); } @@ -462,6 +483,36 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode) dynarec_log_prefix(0, LOG_NONE, "\t%08x\t%s\n", opcode, la64_print(opcode, (uintptr_t)dyn->block)); } +static void x87_reset(lsxcache_t* lsx) +{ + for (int i = 0; i < 8; ++i) + lsx->x87cache[i] = -1; + lsx->tags = 0; + lsx->x87stack = 0; + lsx->stack = 0; + lsx->stack_next = 0; + lsx->stack_pop = 0; + lsx->stack_push = 0; + lsx->combined1 = lsx->combined2 = 0; + lsx->swapped = 0; + lsx->barrier = 0; + lsx->pushed = 0; + lsx->poped = 0; + + for (int i = 0; i < 24; ++i) + if (lsx->lsxcache[i].t == LSX_CACHE_ST_F + || lsx->lsxcache[i].t == LSX_CACHE_ST_D + || lsx->lsxcache[i].t == LSX_CACHE_ST_I64) + lsx->lsxcache[i].v = 0; +} + +static void mmx_reset(lsxcache_t* lsx) +{ + lsx->mmxcount = 0; + for (int i = 0; i < 8; ++i) + lsx->mmxcache[i] = -1; +} + static void sse_reset(lsxcache_t* lsx) { for (int i = 0; i < 16; ++i) @@ -470,7 +521,8 @@ static void sse_reset(lsxcache_t* lsx) void fpu_reset(dynarec_la64_t* dyn) { - // TODO: x87 and mmx + x87_reset(&dyn->lsx); + mmx_reset(&dyn->lsx); sse_reset(&dyn->lsx); fpu_reset_reg(dyn); } diff --git a/src/dynarec/la64/dynarec_la64_functions.h b/src/dynarec/la64/dynarec_la64_functions.h index f8c8b58e..17b2280f 100644 --- a/src/dynarec/la64/dynarec_la64_functions.h +++ b/src/dynarec/la64/dynarec_la64_functions.h @@ -18,6 +18,8 @@ int fpu_get_reg_xmm(dynarec_la64_t* dyn, int t, int xmm); void fpu_free_reg(dynarec_la64_t* dyn, int reg); // Reset fpu regs counter void fpu_reset_reg(dynarec_la64_t* dyn); +// Get an MMX double reg +int fpu_get_reg_emm(dynarec_la64_t* dyn, int emm); // FPU Cache transformation (for loops) // Specific, need to be written by backend int fpuCacheNeedsTransform(dynarec_la64_t* dyn, int ninst); diff --git a/src/dynarec/la64/dynarec_la64_helper.c b/src/dynarec/la64/dynarec_la64_helper.c index b807949e..83ef2cc9 100644 --- a/src/dynarec/la64/dynarec_la64_helper.c +++ b/src/dynarec/la64/dynarec_la64_helper.c @@ -530,7 +530,7 @@ void jump_to_next(dynarec_la64_t* dyn, uintptr_t ip, int reg, int ninst, int is3 MAYUSE(ninst); MESSAGE(LOG_DUMP, "Jump to next\n"); - if(is32bits) + if (is32bits) ip &= 0xffffffffLL; if (reg) { @@ -584,7 +584,7 @@ void ret_to_epilog(dynarec_la64_t* dyn, int ninst, rex_t rex) SMEND(); if (BOX64DRENV(dynarec_callret)) { // pop the actual return address from RV64 stack - LD_D(xRA, xSP, 0); // native addr + LD_D(xRA, xSP, 0); // native addr LD_D(x6, xSP, 8); // x86 addr ADDI_D(xSP, xSP, 16); // pop BNE(x6, xRIP, 2 * 4); // is it the right address? @@ -629,7 +629,7 @@ void retn_to_epilog(dynarec_la64_t* dyn, int ninst, rex_t rex, int n) SMEND(); if (BOX64DRENV(dynarec_callret)) { // pop the actual return address from RV64 stack - LD_D(xRA, xSP, 0); // native addr + LD_D(xRA, xSP, 0); // native addr LD_D(x6, xSP, 8); // x86 addr ADDI_D(xSP, xSP, 16); // pop BNE(x6, xRIP, 2 * 4); // is it the right address? @@ -820,6 +820,12 @@ int sse_setround(dynarec_la64_t* dyn, int ninst, int s1, int s2) return s2; } + +void x87_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3) +{ + // TODO +} + // Restore round flag void x87_restoreround(dynarec_la64_t* dyn, int ninst, int s1) { @@ -829,6 +835,69 @@ void x87_restoreround(dynarec_la64_t* dyn, int ninst, int s1) MOVGR2FCSR(FCSR3, s1); } +// MMX helpers +static int isx87Empty(dynarec_la64_t* dyn) +{ + for (int i = 0; i < 8; ++i) + if (dyn->lsx.x87cache[i] != -1) + return 0; + return 1; +} + +// get neon register for a MMX reg, create the entry if needed +int mmx_get_reg(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int a) +{ + if (!dyn->lsx.x87stack && isx87Empty(dyn)) + x87_purgecache(dyn, ninst, 0, s1, s2, s3); + if (dyn->lsx.mmxcache[a] != -1) + return dyn->lsx.mmxcache[a]; + ++dyn->lsx.mmxcount; + int ret = dyn->lsx.mmxcache[a] = fpu_get_reg_emm(dyn, a); + FLD_D(ret, xEmu, offsetof(x64emu_t, mmx[a])); + return ret; +} +// get neon register for a MMX reg, but don't try to synch it if it needed to be created +int mmx_get_reg_empty(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int a) +{ + if (!dyn->lsx.x87stack && isx87Empty(dyn)) + x87_purgecache(dyn, ninst, 0, s1, s2, s3); + if (dyn->lsx.mmxcache[a] != -1) + return dyn->lsx.mmxcache[a]; + ++dyn->lsx.mmxcount; + int ret = dyn->lsx.mmxcache[a] = fpu_get_reg_emm(dyn, a); + return ret; +} +// purge the MMX cache only +void mmx_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1) +{ + if (!dyn->lsx.mmxcount) return; + if (!next) dyn->lsx.mmxcount = 0; + int old = -1; + for (int i = 0; i < 8; ++i) + if (dyn->lsx.mmxcache[i] != -1) { + if (old == -1) { + MESSAGE(LOG_DUMP, "\tPurge %sMMX Cache ------\n", next ? "locally " : ""); + ++old; + } + FST_D(dyn->lsx.mmxcache[i], xEmu, offsetof(x64emu_t, mmx[i])); + if (!next) { + fpu_free_reg(dyn, dyn->lsx.mmxcache[i]); + dyn->lsx.mmxcache[i] = -1; + } + } + if (old != -1) { + MESSAGE(LOG_DUMP, "\t------ Purge MMX Cache\n"); + } +} + +static void mmx_reflectcache(dynarec_la64_t* dyn, int ninst, int s1) +{ + for (int i = 0; i < 8; ++i) + if (dyn->lsx.mmxcache[i] != -1) { + FST_D(dyn->lsx.mmxcache[i], xEmu, offsetof(x64emu_t, mmx[i])); + } +} + // SSE / SSE2 helpers // get lsx register for a SSE reg, create the entry if needed int sse_get_reg(dynarec_la64_t* dyn, int ninst, int s1, int a, int forwrite) @@ -927,8 +996,8 @@ static void sse_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1) static void sse_reflectcache(dynarec_la64_t* dyn, int ninst, int s1) { - for (int i=0; i<16; ++i) - if(dyn->lsx.ssecache[i].v!=-1 && dyn->lsx.ssecache[i].write) { + for (int i = 0; i < 16; ++i) + if (dyn->lsx.ssecache[i].v != -1 && dyn->lsx.ssecache[i].write) { VST(dyn->lsx.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); } } @@ -971,9 +1040,8 @@ void fpu_popcache(dynarec_la64_t* dyn, int ninst, int s1, int not07) void fpu_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3) { - // TODO: x87_purgecache(dyn, ninst, next, s1, s2, s3); - // TODO: mmx_purgecache(dyn, ninst, next, s1); - + x87_purgecache(dyn, ninst, next, s1, s2, s3); + mmx_purgecache(dyn, ninst, next, s1); sse_purgecache(dyn, ninst, next, s1); if (!next) fpu_reset_reg(dyn); @@ -982,7 +1050,7 @@ void fpu_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, in void fpu_reflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3) { // TODO: x87_reflectcache(dyn, ninst, s1, s2, s3); - // TODO: mmx_reflectcache(dyn, ninst, s1); + mmx_reflectcache(dyn, ninst, s1); sse_reflectcache(dyn, ninst, s1); } @@ -1330,37 +1398,37 @@ static void flagsCacheTransform(dynarec_la64_t* dyn, int ninst, int s1) { int j64; int jmp = dyn->insts[ninst].x64.jmp_insts; - if(jmp<0) + if (jmp < 0) return; - if(dyn->f.dfnone) // flags are fully known, nothing we can do more + if (dyn->f.dfnone) // flags are fully known, nothing we can do more return; MESSAGE(LOG_DUMP, "\tFlags fetch ---- ninst=%d -> %d\n", ninst, jmp); - int go = (dyn->insts[jmp].f_entry.dfnone && !dyn->f.dfnone)?1:0; + int go = (dyn->insts[jmp].f_entry.dfnone && !dyn->f.dfnone) ? 1 : 0; switch (dyn->insts[jmp].f_entry.pending) { case SF_UNKNOWN: break; case SF_SET: - if(dyn->f.pending!=SF_SET && dyn->f.pending!=SF_SET_PENDING) + if (dyn->f.pending != SF_SET && dyn->f.pending != SF_SET_PENDING) go = 1; break; case SF_SET_PENDING: - if(dyn->f.pending!=SF_SET - && dyn->f.pending!=SF_SET_PENDING - && dyn->f.pending!=SF_PENDING) + if (dyn->f.pending != SF_SET + && dyn->f.pending != SF_SET_PENDING + && dyn->f.pending != SF_PENDING) go = 1; break; case SF_PENDING: - if(dyn->f.pending!=SF_SET - && dyn->f.pending!=SF_SET_PENDING - && dyn->f.pending!=SF_PENDING) + if (dyn->f.pending != SF_SET + && dyn->f.pending != SF_SET_PENDING + && dyn->f.pending != SF_PENDING) go = 1; - else if (dyn->insts[jmp].f_entry.dfnone !=dyn->f.dfnone) + else if (dyn->insts[jmp].f_entry.dfnone != dyn->f.dfnone) go = 1; break; } - if(go) { - if(dyn->f.pending!=SF_PENDING) { + if (go) { + if (dyn->f.pending != SF_PENDING) { LD_W(s1, xEmu, offsetof(x64emu_t, df)); - j64 = (GETMARKF2)-(dyn->native_size); + j64 = (GETMARKF2) - (dyn->native_size); BEQZ(s1, j64); } CALL_(UpdateFlags, -1, 0); @@ -1368,10 +1436,11 @@ static void flagsCacheTransform(dynarec_la64_t* dyn, int ninst, int s1) } } -void CacheTransform(dynarec_la64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3) { - if(cacheupd&2) +void CacheTransform(dynarec_la64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3) +{ + if (cacheupd & 2) fpuCacheTransform(dyn, ninst, s1, s2, s3); - if(cacheupd&1) + if (cacheupd & 1) flagsCacheTransform(dyn, ninst, s1); } diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h index e3146d77..4f5e9b08 100644 --- a/src/dynarec/la64/dynarec_la64_helper.h +++ b/src/dynarec/la64/dynarec_la64_helper.h @@ -952,6 +952,8 @@ void* la64_next(x64emu_t* emu, uintptr_t addr); #define x87_restoreround STEPNAME(x87_restoreround) #define sse_setround STEPNAME(sse_setround) +#define mmx_get_reg STEPNAME(mmx_get_reg) +#define mmx_get_reg_empty STEPNAME(mmx_get_reg_empty) #define x87_forget STEPNAME(x87_forget) #define sse_purge07cache STEPNAME(sse_purge07cache) #define sse_get_reg STEPNAME(sse_get_reg) @@ -964,6 +966,8 @@ void* la64_next(x64emu_t* emu, uintptr_t addr); #define fpu_reset_cache STEPNAME(fpu_reset_cache) #define fpu_propagate_stack STEPNAME(fpu_propagate_stack) #define fpu_purgecache STEPNAME(fpu_purgecache) +#define mmx_purgecache STEPNAME(mmx_purgecache) +#define x87_purgecache STEPNAME(x87_purgecache) #define fpu_reflectcache STEPNAME(fpu_reflectcache) #define fpu_unreflectcache STEPNAME(fpu_unreflectcache) @@ -1065,10 +1069,10 @@ void emit_pf(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4); // common coproc helpers // reset the cache with n void fpu_reset_cache(dynarec_la64_t* dyn, int ninst, int reset_n); -// propagate stack state void fpu_propagate_stack(dynarec_la64_t* dyn, int ninst); -// purge the FPU cache (needs 3 scratch registers) void fpu_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3); +void mmx_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1); +void x87_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3); void fpu_reflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3); void fpu_unreflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3); void fpu_pushcache(dynarec_la64_t* dyn, int ninst, int s1, int not07); @@ -1092,6 +1096,13 @@ void sse_forget_reg(dynarec_la64_t* dyn, int ninst, int a); // Push current value to the cache void sse_reflect_reg(dynarec_la64_t* dyn, int ninst, int a); +// MMX helpers +// get lsx register for a MMX reg, create the entry if needed +int mmx_get_reg(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int a); +// get lsx register for a MMX reg, but don't try to synch it if it needed to be created +int mmx_get_reg_empty(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int a); + + void CacheTransform(dynarec_la64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3); void la64_move64(dynarec_la64_t* dyn, int ninst, int reg, int64_t val); diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h index 49951c3e..7c40ca27 100644 --- a/src/dynarec/la64/dynarec_la64_private.h +++ b/src/dynarec/la64/dynarec_la64_private.h @@ -45,11 +45,13 @@ typedef struct lsxcache_s { uint8_t combined2; uint8_t swapped; // the combined reg were swapped uint8_t barrier; // is there a barrier at instruction epilog? + uint8_t pushed; // positive pushed value (to check for overflow) + uint8_t poped; // positive poped value (to check for underflow) uint32_t news; // bitmask, wich neoncache are new for this opcode // fpu cache int8_t x87cache[8]; // cache status for the 8 x87 register behind the fpu stack int8_t x87reg[8]; // reg used for x87cache entry - int8_t freed[8]; // set when FFREE is used, -1 else + int16_t tags; // similar to fpu_tags int8_t mmxcache[8]; // cache status for the 8 MMX registers sse_cache_t ssecache[16]; // cache status for the 16 SSE(2) registers int8_t fpuused[24]; // all 0..24 double reg from fpu, used by x87, sse and mmx |