| author | Yang Liu <numbksco@gmail.com> | 2024-03-28 21:17:26 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-03-28 14:17:26 +0100 |
| commit | 912622bf1f3db7f3cba32b44f4f1c106e3fb7dbf (patch) | |
| tree | 074034e80742c29628d8d9288e2f62c53e7c567a /src | |
| parent | 70fec208d83899e5da663302c1ec7c632393afd7 (diff) | |
| download | box64-912622bf1f3db7f3cba32b44f4f1c106e3fb7dbf.tar.gz box64-912622bf1f3db7f3cba32b44f4f1c106e3fb7dbf.zip | |
[LA64_DYNAREC] Added basic SSE infra and 66 0F 6E MOVD opcode (#1391)
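The headline change is the 66 0F 6E handler in dynarec_la64_660f.c: MOVD (MOVQ with REX.W) into an XMM register, emitted as VXOR_V to clear the destination followed by MOVGR2FR_W/D plus VEXTRINS_W/D to insert the source GPR into lane 0. A minimal C model of the value that sequence computes — the type and function names below are illustrative only, not box64 code:

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative 128-bit XMM model (not box64's type). */
typedef struct { uint64_t lo, hi; } xmm_t;

/* What the emitted sequence computes: VXOR_V zeroes the destination,
 * then MOVGR2FR_W/D + VEXTRINS_W/D drops the GPR value into lane 0. */
static xmm_t movd_66_0f_6e(uint64_t ed, int rex_w)
{
    xmm_t dst = { 0, 0 };               /* VXOR_V v0, v0, v0        */
    dst.lo = rex_w ? ed                 /* 64-bit: dst[63:0] = ed   */
                   : (uint32_t)ed;      /* 32-bit: dst[31:0] = ed   */
    return dst;                         /* all upper bits stay zero */
}

int main(void)
{
    xmm_t r = movd_66_0f_6e(0xdeadbeefcafef00dull, 0);
    printf("lo=%016" PRIx64 " hi=%016" PRIx64 "\n", r.lo, r.hi);
    return 0;
}
```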
Diffstat (limited to 'src')
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_660f.c | 15 |
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_functions.c | 209 |
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_functions.h | 15 |
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_helper.c | 410 |
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_helper.h | 7 |
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_private.h | 53 |
| -rw-r--r-- | src/dynarec/la64/la64_emitter.h | 37 |
7 files changed, 718 insertions, 28 deletions
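Most of the added lines build the lsxcache bookkeeping (dynarec_la64_private.h, dynarec_la64_functions.c): each tracked LSX register gets a one-byte lsx_cache_t slot packing a 4-bit content type and a 4-bit x86 register number, so a slot can be tested, copied, or cleared through its .v byte. A standalone sketch of that packing — the union is copied from the patch, the main() usage is illustrative:

```c
#include <stdint.h>
#include <stdio.h>

/* Constants copied from the patch's dynarec_la64_private.h additions. */
#define LSX_CACHE_NONE 0
#define LSX_CACHE_XMMW 5

typedef union lsx_cache_s {
    int8_t v;           /* whole-slot view: 0 means the slot is free  */
    struct {
        uint8_t t : 4;  /* content type (LSX_CACHE_*)                 */
        uint8_t n : 4;  /* x86 register number (xmm0..15, st/mm 0..7) */
    };
} lsx_cache_t;

int main(void)
{
    lsx_cache_t slot = { .v = 0 };          /* empty slot               */
    slot.t = LSX_CACHE_XMMW;                /* now caches a written XMM */
    slot.n = 3;                             /* ... namely xmm3          */
    printf("raw=0x%02x t=%d n=%d\n", (unsigned)(uint8_t)slot.v, slot.t, slot.n);
    return 0;
}
```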
diff --git a/src/dynarec/la64/dynarec_la64_660f.c b/src/dynarec/la64/dynarec_la64_660f.c index 5f7192ca..effaf5bf 100644 --- a/src/dynarec/la64/dynarec_la64_660f.c +++ b/src/dynarec/la64/dynarec_la64_660f.c @@ -55,6 +55,21 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int nextop = F8; FAKEED; break; + case 0x6E: + INST_NAME("MOVD Gx, Ed"); + nextop = F8; + GETGX_empty(v0); + v1 = fpu_get_scratch(dyn); + GETED(0); + VXOR_V(v0, v0, v0); + if (rex.w) { + MOVGR2FR_D(v1, ed); + VEXTRINS_D(v0, v1, 0); // v0[63:0] = v1[63:0] + } else { + MOVGR2FR_W(v1, ed); + VEXTRINS_W(v0, v1, 0); // v0[31:0] = v1[31:0] + } + break; default: DEFAULT; } diff --git a/src/dynarec/la64/dynarec_la64_functions.c b/src/dynarec/la64/dynarec_la64_functions.c index c73f91e4..01eaee06 100644 --- a/src/dynarec/la64/dynarec_la64_functions.c +++ b/src/dynarec/la64/dynarec_la64_functions.c @@ -26,10 +26,169 @@ #include "custommem.h" #include "bridge.h" +#define XMM0 0 +#define XMM8 16 +#define X870 8 +#define EMM0 8 + +// Get a FPU scratch reg +int fpu_get_scratch(dynarec_la64_t* dyn) +{ + return SCRATCH0 + dyn->lsx.fpu_scratch++; // return an Sx +} // Reset scratch regs counter void fpu_reset_scratch(dynarec_la64_t* dyn) { - // TODO + dyn->lsx.fpu_scratch = 0; +} + +// Free a FPU double reg +void fpu_free_reg(dynarec_la64_t* dyn, int reg) +{ + // TODO: check upper limit? + dyn->lsx.fpuused[reg] = 0; + if (dyn->lsx.lsxcache[reg].t != LSX_CACHE_ST_F && dyn->lsx.lsxcache[reg].t != LSX_CACHE_ST_D && dyn->lsx.lsxcache[reg].t != LSX_CACHE_ST_I64) + dyn->lsx.lsxcache[reg].v = 0; +} + +// Get an XMM quad reg +int fpu_get_reg_xmm(dynarec_la64_t* dyn, int t, int xmm) +{ + int i; + if (xmm > 7) { + i = XMM8 + xmm - 8; + } else { + i = XMM0 + xmm; + } + dyn->lsx.fpuused[i] = 1; + dyn->lsx.lsxcache[i].t = t; + dyn->lsx.lsxcache[i].n = xmm; + dyn->lsx.news |= (1 << i); + return i; +} + +// Reset fpu regs counter +static void fpu_reset_reg_lsxcache(lsxcache_t* lsx) +{ + lsx->fpu_reg = 0; + for (int i = 0; i < 24; ++i) { + lsx->fpuused[i] = 0; + lsx->lsxcache[i].v = 0; + } +} + +void fpu_reset_reg(dynarec_la64_t* dyn) +{ + fpu_reset_reg_lsxcache(&dyn->lsx); +} + +void lsxcacheUnwind(lsxcache_t* cache) +{ + if (cache->swapped) { + // unswap + int a = -1; + int b = -1; + for (int j = 0; j < 24 && ((a == -1) || (b == -1)); ++j) + if ((cache->lsxcache[j].t == LSX_CACHE_ST_D || cache->lsxcache[j].t == LSX_CACHE_ST_F || cache->lsxcache[j].t == LSX_CACHE_ST_I64)) { + if (cache->lsxcache[j].n == cache->combined1) + a = j; + else if (cache->lsxcache[j].n == cache->combined2) + b = j; + } + if (a != -1 && b != -1) { + int tmp = cache->lsxcache[a].n; + cache->lsxcache[a].n = cache->lsxcache[b].n; + cache->lsxcache[b].n = tmp; + } + cache->swapped = 0; + cache->combined1 = cache->combined2 = 0; + } + if (cache->news) { + // reove the newly created lsxcache + for (int i = 0; i < 24; ++i) + if (cache->news & (1 << i)) + cache->lsxcache[i].v = 0; + cache->news = 0; + } + if (cache->stack_push) { + // unpush + for (int j = 0; j < 24; ++j) { + if ((cache->lsxcache[j].t == LSX_CACHE_ST_D || cache->lsxcache[j].t == LSX_CACHE_ST_F || cache->lsxcache[j].t == LSX_CACHE_ST_I64)) { + if (cache->lsxcache[j].n < cache->stack_push) + cache->lsxcache[j].v = 0; + else + cache->lsxcache[j].n -= cache->stack_push; + } + } + cache->x87stack -= cache->stack_push; + cache->stack -= cache->stack_push; + cache->stack_push = 0; + } + cache->x87stack += cache->stack_pop; + cache->stack_next = cache->stack; + cache->stack_pop = 0; + cache->barrier 
= 0; + // And now, rebuild the x87cache info with lsxcache + cache->mmxcount = 0; + cache->fpu_scratch = 0; + cache->fpu_extra_qscratch = 0; + cache->fpu_reg = 0; + for (int i = 0; i < 8; ++i) { + cache->x87cache[i] = -1; + cache->mmxcache[i] = -1; + cache->x87reg[i] = 0; + cache->ssecache[i * 2].v = -1; + cache->ssecache[i * 2 + 1].v = -1; + } + int x87reg = 0; + for (int i = 0; i < 24; ++i) { + if (cache->lsxcache[i].v) { + cache->fpuused[i] = 1; + switch (cache->lsxcache[i].t) { + case LSX_CACHE_MM: + cache->mmxcache[cache->lsxcache[i].n] = i; + ++cache->mmxcount; + ++cache->fpu_reg; + break; + case LSX_CACHE_XMMR: + case LSX_CACHE_XMMW: + cache->ssecache[cache->lsxcache[i].n].reg = i; + cache->ssecache[cache->lsxcache[i].n].write = (cache->lsxcache[i].t == LSX_CACHE_XMMW) ? 1 : 0; + ++cache->fpu_reg; + break; + case LSX_CACHE_ST_F: + case LSX_CACHE_ST_D: + case LSX_CACHE_ST_I64: + cache->x87cache[x87reg] = cache->lsxcache[i].n; + cache->x87reg[x87reg] = i; + ++x87reg; + ++cache->fpu_reg; + break; + case LSX_CACHE_SCR: + cache->fpuused[i] = 0; + cache->lsxcache[i].v = 0; + break; + } + } else { + cache->fpuused[i] = 0; + } + } +} + +const char* getCacheName(int t, int n) +{ + static char buff[20]; + switch (t) { + case LSX_CACHE_ST_D: sprintf(buff, "ST%d", n); break; + case LSX_CACHE_ST_F: sprintf(buff, "st%d", n); break; + case LSX_CACHE_ST_I64: sprintf(buff, "STi%d", n); break; + case LSX_CACHE_MM: sprintf(buff, "MM%d", n); break; + case LSX_CACHE_XMMW: sprintf(buff, "XMM%d", n); break; + case LSX_CACHE_XMMR: sprintf(buff, "xmm%d", n); break; + case LSX_CACHE_SCR: sprintf(buff, "Scratch"); break; + case LSX_CACHE_NONE: buff[0] = '\0'; break; + } + return buff; } void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t rex) @@ -63,23 +222,23 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r dynarec_log(LOG_NONE, ", jmp=out"); if (dyn->last_ip) dynarec_log(LOG_NONE, ", last_ip=%p", (void*)dyn->last_ip); - // for (int ii = 0; ii < 24; ++ii) { - // switch (dyn->insts[ninst].n.neoncache[ii].t) { - // case NEON_CACHE_ST_D: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; - // case NEON_CACHE_ST_F: dynarec_log(LOG_NONE, " S%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; - // case NEON_CACHE_ST_I64: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; - // case NEON_CACHE_MM: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; - // case NEON_CACHE_XMMW: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; - // case NEON_CACHE_XMMR: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; - // case NEON_CACHE_SCR: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; - // case NEON_CACHE_NONE: - // default: break; - // } - // } - // if (dyn->n.stack || dyn->insts[ninst].n.stack_next || dyn->insts[ninst].n.x87stack) - // dynarec_log(LOG_NONE, " X87:%d/%d(+%d/-%d)%d", dyn->n.stack, dyn->insts[ninst].n.stack_next, dyn->insts[ninst].n.stack_push, dyn->insts[ninst].n.stack_pop, dyn->insts[ninst].n.x87stack); - // 
if (dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2) - // dynarec_log(LOG_NONE, " %s:%d/%d", dyn->insts[ninst].n.swapped ? "SWP" : "CMB", dyn->insts[ninst].n.combined1, dyn->insts[ninst].n.combined2); + for (int ii = 0; ii < 24; ++ii) { + switch (dyn->insts[ninst].lsx.lsxcache[ii].t) { + case LSX_CACHE_ST_D: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break; + case LSX_CACHE_ST_F: dynarec_log(LOG_NONE, " S%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break; + case LSX_CACHE_ST_I64: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break; + case LSX_CACHE_MM: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break; + case LSX_CACHE_XMMW: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break; + case LSX_CACHE_XMMR: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break; + case LSX_CACHE_SCR: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break; + case LSX_CACHE_NONE: + default: break; + } + } + if (dyn->lsx.stack || dyn->insts[ninst].lsx.stack_next || dyn->insts[ninst].lsx.x87stack) + dynarec_log(LOG_NONE, " X87:%d/%d(+%d/-%d)%d", dyn->lsx.stack, dyn->insts[ninst].lsx.stack_next, dyn->insts[ninst].lsx.stack_push, dyn->insts[ninst].lsx.stack_pop, dyn->insts[ninst].lsx.x87stack); + if (dyn->insts[ninst].lsx.combined1 || dyn->insts[ninst].lsx.combined2) + dynarec_log(LOG_NONE, " %s:%d/%d", dyn->insts[ninst].lsx.swapped ? "SWP" : "CMB", dyn->insts[ninst].lsx.combined1, dyn->insts[ninst].lsx.combined2); dynarec_log(LOG_NONE, "%s\n", (box64_dynarec_dump > 1) ? 
"\e[m" : ""); } } @@ -147,12 +306,22 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode) dynarec_log(LOG_NONE, "\t%08x\t%s\n", opcode, la64_print(opcode, (uintptr_t)dyn->block)); } +static void sse_reset(lsxcache_t* lsx) +{ + for (int i = 0; i < 16; ++i) + lsx->ssecache[i].v = -1; +} + void fpu_reset(dynarec_la64_t* dyn) { - // TODO + // TODO: x87 and mmx + sse_reset(&dyn->lsx); + fpu_reset_reg(dyn); } void fpu_reset_ninst(dynarec_la64_t* dyn, int ninst) { - // TODO + // TODO: x87 and mmx + sse_reset(&dyn->insts[ninst].lsx); + fpu_reset_reg_lsxcache(&dyn->insts[ninst].lsx); } \ No newline at end of file diff --git a/src/dynarec/la64/dynarec_la64_functions.h b/src/dynarec/la64/dynarec_la64_functions.h index 67608783..2badba4d 100644 --- a/src/dynarec/la64/dynarec_la64_functions.h +++ b/src/dynarec/la64/dynarec_la64_functions.h @@ -3,11 +3,26 @@ #include "../dynarec_native_functions.h" +#define SCRATCH0 24 + typedef struct x64emu_s x64emu_t; typedef struct dynarec_la64_s dynarec_la64_t; +// Get an FPU scratch reg +int fpu_get_scratch(dynarec_la64_t* dyn); // Reset scratch regs counter void fpu_reset_scratch(dynarec_la64_t* dyn); +// Get an XMM quad reg +int fpu_get_reg_xmm(dynarec_la64_t* dyn, int t, int xmm); +// Free a FPU/MMX/XMM reg +void fpu_free_reg(dynarec_la64_t* dyn, int reg); +// Reset fpu regs counter +void fpu_reset_reg(dynarec_la64_t* dyn); + +// Undo the changes of a lsxcache to get the status before the instruction +void lsxcacheUnwind(lsxcache_t* cache); + +const char* getCacheName(int t, int n); void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t rex); void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode); diff --git a/src/dynarec/la64/dynarec_la64_helper.c b/src/dynarec/la64/dynarec_la64_helper.c index cbd97fa8..9ad715e6 100644 --- a/src/dynarec/la64/dynarec_la64_helper.c +++ b/src/dynarec/la64/dynarec_la64_helper.c @@ -511,30 +511,122 @@ void x87_forget(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st) // TODO } +// get lsx register for an SSE reg, but don't try to synch it if it needed to be created +int sse_get_reg_empty(dynarec_la64_t* dyn, int ninst, int s1, int a) +{ + if (dyn->lsx.ssecache[a].v != -1) { + dyn->lsx.ssecache[a].write = 1; + dyn->lsx.lsxcache[dyn->lsx.ssecache[a].reg].t = LSX_CACHE_XMMW; + return dyn->lsx.ssecache[a].reg; + } + dyn->lsx.ssecache[a].reg = fpu_get_reg_xmm(dyn, LSX_CACHE_XMMW, a); + dyn->lsx.ssecache[a].write = 1; // it will be write... + return dyn->lsx.ssecache[a].reg; +} + // purge the SSE cache for XMM0..XMM7 (to use before function native call) void sse_purge07cache(dynarec_la64_t* dyn, int ninst, int s1) { - // TODO + int old = -1; + for (int i = 0; i < 8; ++i) + if (dyn->lsx.ssecache[i].v != -1) { + if (old == -1) { + MESSAGE(LOG_DUMP, "\tPurge XMM0..7 Cache ------\n"); + ++old; + } + if (dyn->lsx.lsxcache[dyn->lsx.ssecache[i].reg].t == LSX_CACHE_XMMW) { + VST(dyn->lsx.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + } + fpu_free_reg(dyn, dyn->lsx.ssecache[i].reg); + dyn->lsx.ssecache[i].v = -1; + } + if (old != -1) { + MESSAGE(LOG_DUMP, "\t------ Purge XMM0..7 Cache\n"); + } +} + +// purge the SSE cache only +static void sse_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1) +{ + int old = -1; + for (int i = 0; i < 16; ++i) + if (dyn->lsx.ssecache[i].v != -1) { + if (dyn->lsx.ssecache[i].write) { + if (old == -1) { + MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next ? 
"locally " : ""); + ++old; + } + VST(dyn->lsx.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + } + if (!next) { + fpu_free_reg(dyn, dyn->lsx.ssecache[i].reg); + dyn->lsx.ssecache[i].v = -1; + } + } + if (old != -1) { + MESSAGE(LOG_DUMP, "\t------ Purge SSE Cache\n"); + } +} + +static void sse_reflectcache(dynarec_la64_t* dyn, int ninst, int s1) +{ + for (int i=0; i<16; ++i) + if(dyn->lsx.ssecache[i].v!=-1 && dyn->lsx.ssecache[i].write) { + VST(dyn->lsx.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + } } void fpu_pushcache(dynarec_la64_t* dyn, int ninst, int s1, int not07) { - // TODO + int start = not07 ? 8 : 0; + // only SSE regs needs to be push back to xEmu (needs to be "write") + int n = 0; + for (int i = start; i < 16; i++) + if ((dyn->lsx.ssecache[i].v != -1) && (dyn->lsx.ssecache[i].write)) + ++n; + if (!n) + return; + MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n); + for (int i = start; i < 16; ++i) + if ((dyn->lsx.ssecache[i].v != -1) && (dyn->lsx.ssecache[i].write)) { + VST(dyn->lsx.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + } + MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n); } void fpu_popcache(dynarec_la64_t* dyn, int ninst, int s1, int not07) { - // TODO + int start = not07 ? 8 : 0; + // only SSE regs needs to be pop back from xEmu (don't need to be "write" this time) + int n = 0; + for (int i = start; i < 16; i++) + if (dyn->lsx.ssecache[i].v != -1) + ++n; + if (!n) + return; + MESSAGE(LOG_DUMP, "\tPop XMM Cache (%d)------\n", n); + for (int i = start; i < 16; ++i) + if (dyn->lsx.ssecache[i].v != -1) { + VLD(dyn->lsx.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + } + MESSAGE(LOG_DUMP, "\t------- Pop XMM Cache (%d)\n", n); } void fpu_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3) { - // TODO + // TODO: x87_purgecache(dyn, ninst, next, s1, s2, s3); + // TODO: mmx_purgecache(dyn, ninst, next, s1); + + sse_purgecache(dyn, ninst, next, s1); + if (!next) + fpu_reset_reg(dyn); } void fpu_reflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3) { - // TODO + // TODO: x87_reflectcache(dyn, ninst, s1, s2, s3); + // TODO: mmx_reflectcache(dyn, ninst, s1); + sse_reflectcache(dyn, ninst, s1); } void fpu_unreflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3) @@ -562,19 +654,321 @@ void emit_pf(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4) void fpu_reset_cache(dynarec_la64_t* dyn, int ninst, int reset_n) { - // TODO + MESSAGE(LOG_DEBUG, "Reset Caches with %d\n", reset_n); +#if STEP > 1 + // for STEP 2 & 3, just need to refrest with current, and undo the changes (push & swap) + dyn->lsx = dyn->insts[ninst].lsx; + lsxcacheUnwind(&dyn->lsx); +#ifdef HAVE_TRACE +// TODO: trace +#endif // HAVE_TRACE +#else + dyn->lsx = dyn->insts[reset_n].lsx; +#endif } // propagate ST stack state, especial stack pop that are deferred void fpu_propagate_stack(dynarec_la64_t* dyn, int ninst) { - // TODO + if (dyn->lsx.stack_pop) { + for (int j = 0; j < 24; ++j) + if ((dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_D + || dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_F + || dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_I64)) { + if (dyn->lsx.lsxcache[j].n < dyn->lsx.stack_pop) + dyn->lsx.lsxcache[j].v = 0; + else + dyn->lsx.lsxcache[j].n -= dyn->lsx.stack_pop; + } + dyn->lsx.stack_pop = 0; + } + dyn->lsx.stack = dyn->lsx.stack_next; + dyn->lsx.news = 0; + dyn->lsx.stack_push = 0; + dyn->lsx.swapped = 0; +} + + +static int findCacheSlot(dynarec_la64_t* dyn, int ninst, int t, int n, lsxcache_t* cache) +{ + lsx_cache_t f; + f.n 
= n; + f.t = t; + for (int i = 0; i < 24; ++i) { + if (cache->lsxcache[i].v == f.v) + return i; + if (cache->lsxcache[i].n == n) { + switch (cache->lsxcache[i].t) { + case LSX_CACHE_ST_F: + if (t == LSX_CACHE_ST_D) + return i; + if (t == LSX_CACHE_ST_I64) + return i; + break; + case LSX_CACHE_ST_D: + if (t == LSX_CACHE_ST_F) + return i; + if (t == LSX_CACHE_ST_I64) + return i; + break; + case LSX_CACHE_ST_I64: + if (t == LSX_CACHE_ST_F) + return i; + if (t == LSX_CACHE_ST_D) + return i; + break; + case LSX_CACHE_XMMR: + if (t == LSX_CACHE_XMMW) + return i; + break; + case LSX_CACHE_XMMW: + if (t == LSX_CACHE_XMMR) + return i; + break; + } + } + } + return -1; +} + +static void swapCache(dynarec_la64_t* dyn, int ninst, int i, int j, lsxcache_t* cache) +{ + if (i == j) + return; + int quad = 0; + if (cache->lsxcache[i].t == LSX_CACHE_XMMR || cache->lsxcache[i].t == LSX_CACHE_XMMW) + quad = 1; + if (cache->lsxcache[j].t == LSX_CACHE_XMMR || cache->lsxcache[j].t == LSX_CACHE_XMMW) + quad = 1; + + if (!cache->lsxcache[i].v) { + // a mov is enough, no need to swap + MESSAGE(LOG_DUMP, "\t - Moving %d <- %d\n", i, j); + if (quad) { + VOR_V(i, j, j); + } else { + VXOR_V(i, i, i); + VEXTRINS_D(i, j, 0); + } + cache->lsxcache[i].v = cache->lsxcache[j].v; + cache->lsxcache[j].v = 0; + return; + } + // SWAP + lsx_cache_t tmp; + MESSAGE(LOG_DUMP, "\t - Swapping %d <-> %d\n", i, j); + // There is no VSWP in Arm64 NEON to swap 2 register contents! + // so use a scratch... +#define SCRATCH 31 + if (quad) { + VOR_V(SCRATCH, i, i); + VOR_V(i, j, j); + VOR_V(j, SCRATCH, SCRATCH); + } else { + VXOR_V(SCRATCH, SCRATCH, SCRATCH); + VEXTRINS_D(SCRATCH, i, 0); + VXOR_V(i, i, i); + VEXTRINS_D(i, j, 0); + VXOR_V(j, j, j); + VEXTRINS_D(j, SCRATCH, 0); + } +#undef SCRATCH + tmp.v = cache->lsxcache[i].v; + cache->lsxcache[i].v = cache->lsxcache[j].v; + cache->lsxcache[j].v = tmp.v; } +static void loadCache(dynarec_la64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, lsxcache_t* cache, int i, int t, int n) +{ + if (cache->lsxcache[i].v) { + int quad = 0; + if (t == LSX_CACHE_XMMR || t == LSX_CACHE_XMMW) + quad = 1; + if (cache->lsxcache[i].t == LSX_CACHE_XMMR || cache->lsxcache[i].t == LSX_CACHE_XMMW) + quad = 1; + int j = i + 1; + while (cache->lsxcache[j].v) + ++j; + MESSAGE(LOG_DUMP, "\t - Moving away %d\n", i); + if (quad) { + VOR_V(j, i, i); + } else { + VXOR_V(j, j, j); + VEXTRINS_D(j, i, 0); + } + cache->lsxcache[j].v = cache->lsxcache[i].v; + } + switch (t) { + case LSX_CACHE_XMMR: + case LSX_CACHE_XMMW: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + VLD(i, xEmu, offsetof(x64emu_t, xmm[n])); + break; + case LSX_CACHE_MM: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + VLD(i, xEmu, offsetof(x64emu_t, mmx[n])); + break; + case LSX_CACHE_ST_D: + case LSX_CACHE_ST_F: + case LSX_CACHE_ST_I64: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + // TODO: x87 + break; + case LSX_CACHE_NONE: + case LSX_CACHE_SCR: + default: /* nothing done */ + MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); + break; + } + cache->lsxcache[i].n = n; + cache->lsxcache[i].t = t; +} + +static void unloadCache(dynarec_la64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, lsxcache_t* cache, int i, int t, int n) +{ + switch (t) { + case LSX_CACHE_XMMR: + MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); + break; + case LSX_CACHE_XMMW: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", 
getCacheName(t, n)); + VST(i, xEmu, offsetof(x64emu_t, xmm[n])); + break; + case LSX_CACHE_MM: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + VST(i, xEmu, offsetof(x64emu_t, mmx[n])); + break; + case LSX_CACHE_ST_D: + case LSX_CACHE_ST_F: + case LSX_CACHE_ST_I64: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + // TODO: x87 + break; + case LSX_CACHE_NONE: + case LSX_CACHE_SCR: + default: /* nothing done */ + MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); + break; + } + cache->lsxcache[i].v = 0; +} static void fpuCacheTransform(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3) { - // TODO +#if STEP > 1 + int i2 = dyn->insts[ninst].x64.jmp_insts; + if (i2 < 0) + return; + MESSAGE(LOG_DUMP, "\tCache Transform ---- ninst=%d -> %d\n", ninst, i2); + if ((!i2) || (dyn->insts[i2].x64.barrier & BARRIER_FLOAT)) { + if (dyn->lsx.stack_next) { + fpu_purgecache(dyn, ninst, 1, s1, s2, s3); + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); + return; + } + for (int i = 0; i < 24; ++i) + if (dyn->lsx.lsxcache[i].v) { // there is something at ninst for i + fpu_purgecache(dyn, ninst, 1, s1, s2, s3); + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); + return; + } + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); + return; + } + lsxcache_t cache_i2 = dyn->insts[i2].lsx; + lsxcacheUnwind(&cache_i2); + + if (!cache_i2.stack) { + int purge = 1; + for (int i = 0; i < 24 && purge; ++i) + if (cache_i2.lsxcache[i].v) + purge = 0; + if (purge) { + fpu_purgecache(dyn, ninst, 1, s1, s2, s3); + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); + return; + } + } + int stack_cnt = dyn->lsx.stack_next; + int s3_top = 0xffff; + if (stack_cnt != cache_i2.stack) { + MESSAGE(LOG_DUMP, "\t - adjust stack count %d -> %d -\n", stack_cnt, cache_i2.stack); + int a = stack_cnt - cache_i2.stack; + // TODO: x87 + s3_top = 0; + stack_cnt = cache_i2.stack; + } + lsxcache_t cache = dyn->lsx; + int s1_val = 0; + int s2_val = 0; + // unload every uneeded cache + // check SSE first, than MMX, in order, for optimisation issue + for (int i = 0; i < 16; ++i) { + int j = findCacheSlot(dyn, ninst, LSX_CACHE_XMMW, i, &cache); + if (j >= 0 && findCacheSlot(dyn, ninst, LSX_CACHE_XMMW, i, &cache_i2) == -1) + unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.lsxcache[j].t, cache.lsxcache[j].n); + } + for (int i = 0; i < 8; ++i) { + int j = findCacheSlot(dyn, ninst, LSX_CACHE_MM, i, &cache); + if (j >= 0 && findCacheSlot(dyn, ninst, LSX_CACHE_MM, i, &cache_i2) == -1) + unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.lsxcache[j].t, cache.lsxcache[j].n); + } + for (int i = 0; i < 24; ++i) { + if (cache.lsxcache[i].v) + if (findCacheSlot(dyn, ninst, cache.lsxcache[i].t, cache.lsxcache[i].n, &cache_i2) == -1) + unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache.lsxcache[i].t, cache.lsxcache[i].n); + } + // and now load/swap the missing one + for (int i = 0; i < 24; ++i) { + if (cache_i2.lsxcache[i].v) { + if (cache_i2.lsxcache[i].v != cache.lsxcache[i].v) { + int j; + if ((j = findCacheSlot(dyn, ninst, cache_i2.lsxcache[i].t, cache_i2.lsxcache[i].n, &cache)) == -1) + loadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache_i2.lsxcache[i].t, cache_i2.lsxcache[i].n); + else { + // it's here, lets swap if needed + if (j != i) + swapCache(dyn, ninst, i, j, &cache); + } + } + if (cache.lsxcache[i].t != cache_i2.lsxcache[i].t) { + if (cache.lsxcache[i].t == 
LSX_CACHE_ST_D && cache_i2.lsxcache[i].t == LSX_CACHE_ST_F) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n)); + FCVT_S_D(i, i); + cache.lsxcache[i].t = LSX_CACHE_ST_F; + } else if (cache.lsxcache[i].t == LSX_CACHE_ST_F && cache_i2.lsxcache[i].t == LSX_CACHE_ST_D) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n)); + FCVT_D_S(i, i); + cache.lsxcache[i].t = LSX_CACHE_ST_D; + } else if (cache.lsxcache[i].t == LSX_CACHE_ST_D && cache_i2.lsxcache[i].t == LSX_CACHE_ST_I64) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n)); + VFTINTRZ_L_D(i, i); + cache.lsxcache[i].t = LSX_CACHE_ST_I64; + } else if (cache.lsxcache[i].t == LSX_CACHE_ST_F && cache_i2.lsxcache[i].t == LSX_CACHE_ST_I64) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n)); + VFTINTRZL_L_S(i, i); + cache.lsxcache[i].t = LSX_CACHE_ST_D; + } else if (cache.lsxcache[i].t == LSX_CACHE_ST_I64 && cache_i2.lsxcache[i].t == LSX_CACHE_ST_F) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n)); + VFFINT_S_L(i, i, i); + cache.lsxcache[i].t = LSX_CACHE_ST_F; + } else if (cache.lsxcache[i].t == LSX_CACHE_ST_I64 && cache_i2.lsxcache[i].t == LSX_CACHE_ST_D) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n)); + VFFINT_D_L(i, i); + cache.lsxcache[i].t = LSX_CACHE_ST_D; + } else if (cache.lsxcache[i].t == LSX_CACHE_XMMR && cache_i2.lsxcache[i].t == LSX_CACHE_XMMW) { + cache.lsxcache[i].t = LSX_CACHE_XMMW; + } else if (cache.lsxcache[i].t == LSX_CACHE_XMMW && cache_i2.lsxcache[i].t == LSX_CACHE_XMMR) { + // refresh cache... + MESSAGE(LOG_DUMP, "\t - Refreh %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n)); + VST(i, xEmu, offsetof(x64emu_t, xmm[cache.lsxcache[i].n])); + cache.lsxcache[i].t = LSX_CACHE_XMMR; + } + } + } + } + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); +#endif } static void flagsCacheTransform(dynarec_la64_t* dyn, int ninst, int s1) diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h index f26dfba7..30f341e8 100644 --- a/src/dynarec/la64/dynarec_la64_helper.h +++ b/src/dynarec/la64/dynarec_la64_helper.h @@ -174,6 +174,10 @@ gd = i; \ BSTRPICK_D(gd, gb1, gb2 + 7, gb2); +#define GETGX_empty(a) \ + gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \ + a = sse_get_reg_empty(dyn, ninst, x1, gd) + // Write gb (gd) back to original register / memory, using s1 as scratch #define GBBACK(s1) BSTRINS_D(gb1, gd, gb2 + 7, gb2); @@ -523,6 +527,7 @@ void* la64_next(x64emu_t* emu, uintptr_t addr); #define x87_forget STEPNAME(x87_forget) #define sse_purge07cache STEPNAME(sse_purge07cache) +#define sse_get_reg_empty STEPNAME(sse_get_reg_empty) #define fpu_pushcache STEPNAME(fpu_pushcache) #define fpu_popcache STEPNAME(fpu_popcache) @@ -595,6 +600,8 @@ void x87_forget(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st); // SSE/SSE2 helpers // purge the XMM0..XMM7 cache (before function call) void sse_purge07cache(dynarec_la64_t* dyn, int ninst, int s1); +// get lsx register for an SSE reg, but don't try to synch it if it needed to be created +int sse_get_reg_empty(dynarec_la64_t* dyn, int ninst, int s1, int a); void CacheTransform(dynarec_la64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3); diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h index 1ecb88f6..7b76a75f 
100644 --- a/src/dynarec/la64/dynarec_la64_private.h +++ b/src/dynarec/la64/dynarec_la64_private.h @@ -9,6 +9,57 @@ typedef struct instsize_s instsize_t; #define BARRIER_MAYBE 8 +#define LSX_CACHE_NONE 0 +#define LSX_CACHE_ST_D 1 +#define LSX_CACHE_ST_F 2 +#define LSX_CACHE_ST_I64 3 +#define LSX_CACHE_MM 4 +#define LSX_CACHE_XMMW 5 +#define LSX_CACHE_XMMR 6 +#define LSX_CACHE_SCR 7 + +typedef union lsx_cache_s { + int8_t v; + struct { + uint8_t t : 4; // reg type + uint8_t n : 4; // reg number + }; +} lsx_cache_t; + +typedef union sse_cache_s { + int8_t v; + struct { + uint8_t reg : 7; + uint8_t write : 1; + }; +} sse_cache_t; + +typedef struct lsxcache_s { + // LSX cache + lsx_cache_t lsxcache[24]; + int8_t stack; + int8_t stack_next; + int8_t stack_pop; + int8_t stack_push; + uint8_t combined1; + uint8_t combined2; + uint8_t swapped; // the combined reg were swapped + uint8_t barrier; // is there a barrier at instruction epilog? + uint32_t news; // bitmask, wich neoncache are new for this opcode + // fpu cache + int8_t x87cache[8]; // cache status for the 8 x87 register behind the fpu stack + int8_t x87reg[8]; // reg used for x87cache entry + int8_t freed[8]; // set when FFREE is used, -1 else + int8_t mmxcache[8]; // cache status for the 8 MMX registers + sse_cache_t ssecache[16]; // cache status for the 16 SSE(2) registers + int8_t fpuused[24]; // all 0..24 double reg from fpu, used by x87, sse and mmx + int8_t x87stack; // cache stack counter + int8_t mmxcount; // number of mmx register used (not both mmx and x87 at the same time) + int8_t fpu_scratch; // scratch counter + int8_t fpu_extra_qscratch; // some opcode need an extra quad scratch register + int8_t fpu_reg; // x87/sse/mmx reg counter +} lsxcache_t; + typedef struct flagcache_s { int pending; // is there a pending flags here, or to check? int dfnone; // if deferred flags is already set to df_none @@ -33,6 +84,7 @@ typedef struct instruction_la64_s { uint8_t will_write; uint8_t last_write; flagcache_t f_exit; // flags status at end of instruction + lsxcache_t lsx; // lsxcache at end of instruction (but before poping) flagcache_t f_entry; // flags status before the instruction begin } instruction_la64_t; @@ -52,6 +104,7 @@ typedef struct dynarec_la64_s { uintptr_t tablestart; uintptr_t jmp_next; // address of the jump_next address flagcache_t f; + lsxcache_t lsx; uintptr_t* next; // variable array of "next" jump address int next_sz; int next_cap; diff --git a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h index c9c67407..117bb99e 100644 --- a/src/dynarec/la64/la64_emitter.h +++ b/src/dynarec/la64/la64_emitter.h @@ -947,11 +947,41 @@ LSX instruction starts with V, LASX instruction starts with XV. 
#define VFMINA_D(vd, vj, vk) EMIT(type_3R(0b01110001010000110, vk, vj, vd)) #define VFCVT_H_S(vd, vj, vk) EMIT(type_3R(0b01110001010001100, vk, vj, vd)) #define VFCVT_S_D(vd, vj, vk) EMIT(type_3R(0b01110001010001101, vk, vj, vd)) +#define VFTINT_W_S(vd, vj) EMIT(type_2R(0b0111001010011110001100, vj, vd)) +#define VFTINT_L_D(vd, vj) EMIT(type_2R(0b0111001010011110001101, vj, vd)) +#define VFTINTRM_W_S(vd, vj) EMIT(type_2R(0b0111001010011110001110, vj, vd)) +#define VFTINTRM_L_D(vd, vj) EMIT(type_2R(0b0111001010011110001111, vj, vd)) +#define VFTINTRP_W_S(vd, vj) EMIT(type_2R(0b0111001010011110010000, vj, vd)) +#define VFTINTRP_L_D(vd, vj) EMIT(type_2R(0b0111001010011110010001, vj, vd)) +#define VFTINTRZ_W_S(vd, vj) EMIT(type_2R(0b0111001010011110010010, vj, vd)) +#define VFTINTRZ_L_D(vd, vj) EMIT(type_2R(0b0111001010011110010011, vj, vd)) +#define VFTINTRNE_W_S(vd, vj) EMIT(type_2R(0b0111001010011110010100, vj, vd)) +#define VFTINTRNE_L_D(vd, vj) EMIT(type_2R(0b0111001010011110010101, vj, vd)) +#define VFTINT_WU_S(vd, vj) EMIT(type_2R(0b0111001010011110010110, vj, vd)) +#define VFTINT_LU_D(vd, vj) EMIT(type_2R(0b0111001010011110010111, vj, vd)) +#define VFTINTRZ_WU_S(vd, vj) EMIT(type_2R(0b0111001010011110011100, vj, vd)) +#define VFTINTRZ_LU_D(vd, vj) EMIT(type_2R(0b0111001010011110011101, vj, vd)) #define VFTINT_W_D(vd, vj, vk) EMIT(type_3R(0b01110001010010011, vk, vj, vd)) #define VFTINTRM_W_D(vd, vj, vk) EMIT(type_3R(0b01110001010010100, vk, vj, vd)) #define VFTINTRP_W_D(vd, vj, vk) EMIT(type_3R(0b01110001010010101, vk, vj, vd)) #define VFTINTRZ_W_D(vd, vj, vk) EMIT(type_3R(0b01110001010010110, vk, vj, vd)) #define VFTINTRNE_W_D(vd, vj, vk) EMIT(type_3R(0b01110001010010111, vk, vj, vd)) +#define VFTINTL_L_S(vd, vj) EMIT(type_2R(0b0111001010011110100000, vj, vd)) +#define VFTINTH_L_S(vd, vj) EMIT(type_2R(0b0111001010011110100001, vj, vd)) +#define VFTINTRML_L_S(vd, vj) EMIT(type_2R(0b0111001010011110100010, vj, vd)) +#define VFTINTRMH_L_S(vd, vj) EMIT(type_2R(0b0111001010011110100011, vj, vd)) +#define VFTINTRPL_L_S(vd, vj) EMIT(type_2R(0b0111001010011110100100, vj, vd)) +#define VFTINTRPH_L_S(vd, vj) EMIT(type_2R(0b0111001010011110100101, vj, vd)) +#define VFTINTRZL_L_S(vd, vj) EMIT(type_2R(0b0111001010011110100110, vj, vd)) +#define VFTINTRZH_L_S(vd, vj) EMIT(type_2R(0b0111001010011110100111, vj, vd)) +#define VFTINTRNEL_L_S(vd, vj) EMIT(type_2R(0b0111001010011110101000, vj, vd)) +#define VFTINTRNEH_L_S(vd, vj) EMIT(type_2R(0b0111001010011110101001, vj, vd)) +#define VFFINT_S_W(vd, vj) EMIT(type_2R(0b0111001010011110000000, vj, vd)) +#define VFFINT_S_WU(vd, vj) EMIT(type_2R(0b0111001010011110000001, vj, vd)) +#define VFFINT_D_L(vd, vj) EMIT(type_2R(0b0111001010011110000010, vj, vd)) +#define VFFINT_D_LU(vd, vj) EMIT(type_2R(0b0111001010011110000011, vj, vd)) +#define VFFINTL_D_W(vd, vj) EMIT(type_2R(0b0111001010011110000100, vj, vd)) +#define VFFINTH_D_W(vd, vj) EMIT(type_2R(0b0111001010011110000101, vj, vd)) #define VFFINT_S_L(vd, vj, vk) EMIT(type_3R(0b01110001010010000, vk, vj, vd)) #define VSEQ_B(vd, vj, vk) EMIT(type_3R(0b01110000000000000, vk, vj, vd)) #define VSEQ_H(vd, vj, vk) EMIT(type_3R(0b01110000000000001, vk, vj, vd)) @@ -1000,6 +1030,13 @@ LSX instruction starts with V, LASX instruction starts with XV. 
#define VSHUF_H(vd, vj, vk) EMIT(type_3R(0b01110001011110101, vk, vj, vd)) #define VSHUF_W(vd, vj, vk) EMIT(type_3R(0b01110001011110110, vk, vj, vd)) #define VSHUF_D(vd, vj, vk) EMIT(type_3R(0b01110001011110111, vk, vj, vd)) +#define VEXTRINS_D(vd, vj, imm8) EMIT(type_2RI8(0b01110011100000, imm8, vj, vd)) +#define VEXTRINS_W(vd, vj, imm8) EMIT(type_2RI8(0b01110011100001, imm8, vj, vd)) +#define VEXTRINS_H(vd, vj, imm8) EMIT(type_2RI8(0b01110011100010, imm8, vj, vd)) +#define VEXTRINS_B(vd, vj, imm8) EMIT(type_2RI8(0b01110011100011, imm8, vj, vd)) +#define VLD(vd, rj, imm12) EMIT(type_2RI12(0b0010110000, imm12, rj, vd)) +#define VST(vd, rj, imm12) EMIT(type_2RI12(0b0010110001, imm12, rj, vd)) + #define XVADD_B(vd, vj, vk) EMIT(type_3R(0b01110100000010100, vk, vj, vd)) #define XVADD_H(vd, vj, vk) EMIT(type_3R(0b01110100000010101, vk, vj, vd)) |
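For reference, the constants introduced in this commit (XMM0 = 0, X870 = EMM0 = 8, XMM8 = 16, SCRATCH0 = 24) suggest the LSX register partition sketched below; the helper is only an illustrative restatement of fpu_get_reg_xmm()'s index math, not an exported API:

```c
#include <stdio.h>

/* Apparent LSX register banks, inferred from the constants in this patch:
 *   vr0..vr7   : xmm0..xmm7
 *   vr8..vr15  : x87 / MMX bank (X870 == EMM0 == 8)
 *   vr16..vr23 : xmm8..xmm15
 *   vr24..     : per-opcode scratch (fpu_get_scratch / fpu_reset_scratch)
 */
static int xmm_to_lsx_slot(int xmm)     /* restates fpu_get_reg_xmm's mapping */
{
    return (xmm > 7) ? 16 + (xmm - 8)   /* XMM8 bank */
                     : xmm;             /* XMM0 bank */
}

int main(void)
{
    for (int x = 0; x < 16; ++x)
        printf("xmm%-2d -> vr%d\n", x, xmm_to_lsx_slot(x));
    return 0;
}
```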