diff options
| author | Yang Liu <liuyang22@iscas.ac.cn> | 2024-07-19 17:34:45 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-07-19 11:34:45 +0200 |
| commit | 3e20a60c37b77b782a81f1a6b234a3c4ddd61d2c (patch) | |
| tree | 19dbc080091a1eff074f42eebca9db8b743e17c3 | |
| parent | 984c13634980f4fdd7d7c67c3e25cb38cb360874 (diff) | |
| download | box64-3e20a60c37b77b782a81f1a6b234a3c4ddd61d2c.tar.gz box64-3e20a60c37b77b782a81f1a6b234a3c4ddd61d2c.zip | |
[RV64_DYNAREC] Added vector SEW cache (#1698)
* [RV64_DYNAREC] Added vector SEW cache
* handling reset_n
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_helper.h | 1 | ||||
| -rw-r--r-- | src/dynarec/dynarec_arch.h | 10 | ||||
| -rw-r--r-- | src/dynarec/dynarec_native_pass.c | 1 | ||||
| -rw-r--r-- | src/dynarec/la64/dynarec_la64_helper.h | 1 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_660f_vector.c | 20 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_functions.c | 27 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_functions.h | 2 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.c | 49 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.h | 21 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_pass0.h | 35 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_pass1.h | 18 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_pass2.h | 18 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_pass3.h | 12 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_private.h | 2 | ||||
| -rw-r--r-- | src/dynarec/rv64/rv64_emitter.h | 2 |
15 files changed, 143 insertions, 76 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h index dbc3c29f..7d7114ed 100644 --- a/src/dynarec/arm64/dynarec_arm64_helper.h +++ b/src/dynarec/arm64/dynarec_arm64_helper.h @@ -1107,6 +1107,7 @@ dyn->smread = dyn->smwrite = 0; \ dyn->doublepush = 0; \ dyn->doublepop = 0; +#define ARCH_RESET() #if STEP < 2 #define GETIP(A) TABLE64(0, 0) diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h index f89125a7..6a5c4977 100644 --- a/src/dynarec/dynarec_arch.h +++ b/src/dynarec/dynarec_arch.h @@ -38,11 +38,13 @@ #define instruction_native_t instruction_rv64_t #define dynarec_native_t dynarec_rv64_t -#define ADDITIONNAL_DEFINITION() \ - int fpuCacheNeedsTransform(dynarec_native_t* dyn, int ninst); +#define ADDITIONNAL_DEFINITION() \ + int fpuCacheNeedsTransform(dynarec_native_t* dyn, int ninst); \ + int sewNeedsTransform(dynarec_rv64_t* dyn, int ninst); -#define OTHER_CACHE() \ - if (fpuCacheNeedsTransform(dyn, ninst)) ret|=2; +#define OTHER_CACHE() \ + if (fpuCacheNeedsTransform(dyn, ninst)) ret |= 2; \ + if (sewNeedsTransform(dyn, ninst)) ret |= 3; #include "rv64/rv64_printer.h" #include "rv64/dynarec_rv64_private.h" diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c index a6bcd4ec..779ba9be 100644 --- a/src/dynarec/dynarec_native_pass.c +++ b/src/dynarec/dynarec_native_pass.c @@ -89,6 +89,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int dyn->f.dfnone = 0; dyn->f.pending = 0; fpu_reset(dyn); + ARCH_RESET(); } else { fpu_reset_cache(dyn, ninst, reset_n); dyn->f = dyn->insts[reset_n].f_exit; diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h index 573c115b..f2972274 100644 --- a/src/dynarec/la64/dynarec_la64_helper.h +++ b/src/dynarec/la64/dynarec_la64_helper.h @@ -710,6 +710,7 @@ #endif #define ARCH_INIT() +#define ARCH_RESET() #if STEP < 2 #define GETIP(A) TABLE64(0, 0) diff --git 
a/src/dynarec/rv64/dynarec_rv64_660f_vector.c b/src/dynarec/rv64/dynarec_rv64_660f_vector.c index deb95d26..4ca426e6 100644 --- a/src/dynarec/rv64/dynarec_rv64_660f_vector.c +++ b/src/dynarec/rv64/dynarec_rv64_660f_vector.c @@ -50,11 +50,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i case 0x28: INST_NAME("MOVAPD Gx, Ex"); nextop = F8; - // FIXME - vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW8); - GETG; if (MODREG) { + SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY); ed = (nextop & 7) + (rex.b << 3); v1 = sse_get_reg_vector(dyn, ninst, x1, ed, 0); v0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd); @@ -63,7 +61,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i SMREAD(); v0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd); addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0); - VLE8_V(v0, ed, VECTOR_UNMASKED, VECTOR_NFIELD1); + VL1RE64_V(v0, ed); } break; case 0x38: // SSSE3 opcodes @@ -72,9 +70,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i case 0x00: INST_NAME("PSHUFB Gx, Ex"); nextop = F8; - // FIXME - vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW8); - + SET_ELEMENT_WIDTH(x1, VECTOR_SEW8); GETGX_vector(q0, 1); GETEX_vector(q1, 0, 0); v0 = fpu_get_scratch(dyn); @@ -94,10 +90,8 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i case 0x6F: INST_NAME("MOVDQA Gx, Ex"); nextop = F8; - // FIXME - vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW8); - if (MODREG) { + SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY); v1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0); GETGX_empty_vector(v0); VMV_V_V(v0, v1); @@ -105,7 +99,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i GETGX_empty_vector(v0); SMREAD(); addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 0, 0); - VLE8_V(v0, ed, VECTOR_UNMASKED, VECTOR_NFIELD1); + VL1RE64_V(v0, ed); } break; 
case 0x7E: @@ -113,9 +107,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i case 0xEF: INST_NAME("PXOR Gx, Ex"); nextop = F8; - // FIXME: we should try to minimize vsetvl usage as it may hurts performance a lot. - vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW8); - + SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY); GETG; if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { // special case diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c index 6ce97b8a..2d46d5e2 100644 --- a/src/dynarec/rv64/dynarec_rv64_functions.c +++ b/src/dynarec/rv64/dynarec_rv64_functions.c @@ -18,6 +18,7 @@ #include "callback.h" #include "emu/x64run_private.h" #include "emu/x87emu_private.h" +#include "rv64_emitter.h" #include "x64trace.h" #include "signals.h" #include "dynarec_native.h" @@ -370,6 +371,20 @@ int fpuCacheNeedsTransform(dynarec_rv64_t* dyn, int ninst) { return ret; } +int sewNeedsTransform(dynarec_rv64_t* dyn, int ninst) +{ + int i2 = dyn->insts[ninst].x64.jmp_insts; + + if (dyn->insts[i2].vector_sew == VECTOR_SEWNA) + return 0; + else if (dyn->insts[i2].vector_sew == VECTOR_SEWANY && dyn->insts[ninst].vector_sew != VECTOR_SEWNA) + return 0; + else if (dyn->insts[i2].vector_sew == dyn->insts[ninst].vector_sew) + return 0; + + return 1; +} + void extcacheUnwind(extcache_t* cache) { if(cache->swapped) { @@ -592,22 +607,22 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r }; if(box64_dynarec_dump) { printf_x64_instruction(rex.is32bits?my_context->dec32:my_context->dec, &dyn->insts[ninst].x64, name); - dynarec_log(LOG_NONE, "%s%p: %d emitted opcodes, inst=%d, barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X, sm=%d/%d", - (box64_dynarec_dump>1)?"\e[32m":"", - (void*)(dyn->native_start+dyn->insts[ninst].address), - dyn->insts[ninst].size/4, + dynarec_log(LOG_NONE, "%s%p: %d emitted opcodes, inst=%d, barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X, sm=%d/%d, 
sew=%d", + (box64_dynarec_dump > 1) ? "\e[32m" : "", + (void*)(dyn->native_start + dyn->insts[ninst].address), + dyn->insts[ninst].size / 4, ninst, dyn->insts[ninst].x64.barrier, dyn->insts[ninst].x64.state_flags, dyn->f.pending, dyn->f.dfnone, - dyn->insts[ninst].x64.may_set?"may":"set", + dyn->insts[ninst].x64.may_set ? "may" : "set", dyn->insts[ninst].x64.set_flags, dyn->insts[ninst].x64.gen_flags, dyn->insts[ninst].x64.use_flags, dyn->insts[ninst].x64.need_before, dyn->insts[ninst].x64.need_after, - dyn->smread, dyn->smwrite); + dyn->smread, dyn->smwrite, dyn->insts[ninst].vector_sew); if(dyn->insts[ninst].pred_sz) { dynarec_log(LOG_NONE, ", pred="); for(int ii=0; ii<dyn->insts[ninst].pred_sz; ++ii) diff --git a/src/dynarec/rv64/dynarec_rv64_functions.h b/src/dynarec/rv64/dynarec_rv64_functions.h index e3a5171d..41d01c4b 100644 --- a/src/dynarec/rv64/dynarec_rv64_functions.h +++ b/src/dynarec/rv64/dynarec_rv64_functions.h @@ -46,6 +46,8 @@ int extcache_no_i64(dynarec_rv64_t* dyn, int ninst, int st, int a); // FPU Cache transformation (for loops) // Specific, need to be written par backend int fpuCacheNeedsTransform(dynarec_rv64_t* dyn, int ninst); +int sewNeedsTransform(dynarec_rv64_t* dyn, int ninst); + // Undo the changes of a extcache to get the status before the instruction void extcacheUnwind(extcache_t* cache); diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c index 4c132350..3229b3da 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.c +++ b/src/dynarec/rv64/dynarec_rv64_helper.c @@ -1675,7 +1675,7 @@ int sse_get_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a, int forwri dyn->e.ssecache[a].vector = 1; dyn->e.ssecache[a].single = 0; // just to be clean ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a])); - VLE8_V(ret, s1, VECTOR_UNMASKED, VECTOR_NFIELD1); + VL1RE64_V(ret, s1); return ret; } @@ -1710,7 +1710,7 @@ void sse_forget_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a) return sse_forget_reg(dyn, 
ninst, s1, a); if (dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t == EXT_CACHE_XMMW) { ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a])); - VSE8_V(dyn->e.ssecache[a].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1); + VS1R_V(dyn->e.ssecache[a].reg, s1); } fpu_free_reg(dyn, dyn->e.ssecache[a].reg); dyn->e.ssecache[a].v = -1; @@ -1729,7 +1729,7 @@ void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1) } if (dyn->e.ssecache[i].vector) { ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i])); - VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1); + VS1R_V(dyn->e.ssecache[i].reg, s1); } else if (dyn->e.ssecache[i].single) FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); else @@ -1754,7 +1754,7 @@ static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1) } if (dyn->e.ssecache[i].vector) { ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i])); - VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1); + VS1R_V(dyn->e.ssecache[i].reg, s1); } else if (dyn->e.ssecache[i].single) FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); else @@ -1784,7 +1784,7 @@ static void sse_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1) if (dyn->e.ssecache[i].v != -1) { if (dyn->e.ssecache[i].vector) { ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i])); - VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1); + VS1R_V(dyn->e.ssecache[i].reg, s1); } else if (dyn->e.ssecache[i].single) FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); else @@ -1798,7 +1798,7 @@ void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a) return; if (dyn->e.ssecache[a].vector) { ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a])); - VSE8_V(dyn->e.ssecache[a].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1); + VS1R_V(dyn->e.ssecache[a].reg, s1); } else if (dyn->e.ssecache[a].single) FSW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a])); else @@ -1821,7 +1821,7 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) 
if(dyn->e.ssecache[i].v!=-1) { if (dyn->e.ssecache[i].vector) { ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i])); - VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1); + VS1R_V(dyn->e.ssecache[i].reg, s1); } else if (dyn->e.ssecache[i].single) FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); else @@ -1867,7 +1867,7 @@ void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) if(dyn->e.ssecache[i].v!=-1) { if (dyn->e.ssecache[i].vector) { ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i])); - VLE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1); + VL1RE64_V(dyn->e.ssecache[i].reg, s1); } else if (dyn->e.ssecache[i].single) FLW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); else @@ -2047,7 +2047,7 @@ static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int case EXT_CACHE_XMMW: MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n])); - VLE8_V(i, s1, VECTOR_UNMASKED, VECTOR_NFIELD1); + VL1RE64_V(i, s1); break; case EXT_CACHE_SS: MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); @@ -2106,7 +2106,7 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i case EXT_CACHE_XMMW: MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n])); - VSE8_V(i, s1, VECTOR_UNMASKED, VECTOR_NFIELD1); + VS1R_V(i, s1); break; case EXT_CACHE_SS: MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); @@ -2197,7 +2197,6 @@ static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, in int s2_val = 0; // unload every uneeded cache // check SSE first, than MMX, in order, for optimisation issue - if (rv64_vector) vector_vsetvl_emul1(dyn, ninst, s1, VECTOR_SEW8); for (int i = 0; i < 16; ++i) { int j = findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache); if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache_i2) == -1) @@ -2339,10 +2338,24 @@ static void 
flagsCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1) #endif } +static void sewTransform(dynarec_rv64_t* dyn, int ninst, int s1) +{ +#if STEP > 1 + int j64; + int jmp = dyn->insts[ninst].x64.jmp_insts; + if (jmp < 0) return; + if (dyn->insts[jmp].vector_sew == VECTOR_SEWNA) return; + MESSAGE(LOG_DUMP, "\tSEW changed to %d ---- ninst=%d -> %d\n", dyn->insts[jmp].vector_sew, ninst, jmp); + vector_vsetvl_emul1(dyn, ninst, s1, dyn->insts[jmp].vector_sew); +#endif +} + void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3) { - if(cacheupd&2) + if (cacheupd & 3) + sewTransform(dyn, ninst, s1); + if (cacheupd & 2) fpuCacheTransform(dyn, ninst, s1, s2, s3); - if(cacheupd&1) + if (cacheupd & 1) flagsCacheTransform(dyn, ninst, s1); } @@ -2426,16 +2439,18 @@ void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n) { MESSAGE(LOG_DEBUG, "Reset Caches with %d\n",reset_n); #if STEP > 1 - // for STEP 2 & 3, just need to refrest with current, and undo the changes (push & swap) + // for STEP 2 & 3, just need to refresh with current, and undo the changes (push & swap) dyn->e = dyn->insts[ninst].e; + dyn->vector_sew = dyn->insts[ninst].vector_sew; #else dyn->e = dyn->insts[reset_n].e; + dyn->vector_sew = dyn->insts[reset_n].vector_sew; #endif extcacheUnwind(&dyn->e); #if STEP == 0 if(box64_dynarec_dump) dynarec_log(LOG_NONE, "New x87stack=%d\n", dyn->e.x87stack); #endif - #if defined(HAVE_TRACE) && (STEP>2) + #if defined(HAVE_TRACE) && (STEP > 2) if(box64_dynarec_dump) if(memcmp(&dyn->e, &dyn->insts[reset_n].e, sizeof(ext_cache_t))) { MESSAGE(LOG_DEBUG, "Warning, difference in extcache: reset="); @@ -2464,7 +2479,7 @@ void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n) MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->e.stack_push, -dyn->e.stack_pop); MESSAGE(LOG_DEBUG, "\n"); } - #endif //HAVE_TRACE +#endif // HAVE_TRACE } // propagate ST stack state, especial stack pop that are deferred @@ -2492,6 +2507,8 @@ void 
fpu_propagate_stack(dynarec_rv64_t* dyn, int ninst) // other configs are set automatically. void vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew) { + if (sew == VECTOR_SEWNA) return; + if (sew == VECTOR_SEWANY) sew = VECTOR_SEW8; /* mu: mask undisturbed * tu: tail undisturbed * sew: selected element width diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h index 17cca67d..e3c3bbf2 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.h +++ b/src/dynarec/rv64/dynarec_rv64_helper.h @@ -508,7 +508,7 @@ addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 1, D); \ a = fpu_get_scratch(dyn); \ ADDI(x2, ed, fixedaddress); \ - VLE8_V(a, x2, VECTOR_UNMASKED, VECTOR_NFIELD1); \ + VL1RE64_V(a, x2); \ } #define GETGM() \ @@ -1031,8 +1031,10 @@ #define FTABLE64(A, V) #endif -#define ARCH_INIT() - +#define ARCH_INIT() \ + dyn->vector_sew = VECTOR_SEWNA; +#define ARCH_RESET() \ + dyn->vector_sew = VECTOR_SEWNA; #if STEP < 2 #define GETIP(A) TABLE64(0, 0) @@ -1078,6 +1080,19 @@ #define MODREG ((nextop & 0xC0) == 0xC0) +#ifndef SET_ELEMENT_WIDTH +#define SET_ELEMENT_WIDTH(s1, sew) \ + do { \ + if (sew == VECTOR_SEWNA) { \ + } else if (sew == VECTOR_SEWANY && dyn->vector_sew != VECTOR_SEWNA) { \ + } else if (sew == dyn->vector_sew) { \ + } else { \ + vector_vsetvl_emul1(dyn, ninst, s1, sew); \ + } \ + dyn->vector_sew = sew; \ + } while (0) +#endif + void rv64_epilog(void); void rv64_epilog_fast(void); void* rv64_next(x64emu_t* emu, uintptr_t addr); diff --git a/src/dynarec/rv64/dynarec_rv64_pass0.h b/src/dynarec/rv64/dynarec_rv64_pass0.h index ed3c321b..8924cae0 100644 --- a/src/dynarec/rv64/dynarec_rv64_pass0.h +++ b/src/dynarec/rv64/dynarec_rv64_pass0.h @@ -20,20 +20,26 @@ #define BARRIER(A) if(A!=BARRIER_MAYBE) {fpu_purgecache(dyn, ninst, 0, x1, x2, x3); dyn->insts[ninst].x64.barrier = A;} else dyn->insts[ninst].barrier_maybe = 1 #define BARRIER_NEXT(A) dyn->insts[ninst].x64.barrier_next = A #define 
SET_HASCALLRET() dyn->insts[ninst].x64.has_callret = 1 -#define NEW_INST \ - ++dyn->size; \ - memset(&dyn->insts[ninst], 0, sizeof(instruction_native_t)); \ - dyn->insts[ninst].x64.addr = ip; \ - dyn->e.combined1 = dyn->e.combined2 = 0;\ - dyn->e.swapped = 0; dyn->e.barrier = 0; \ - for(int i=0; i<16; ++i) dyn->e.olds[i].v = 0;\ - dyn->insts[ninst].f_entry = dyn->f; \ - if(ninst) {dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr;} +#define NEW_INST \ + ++dyn->size; \ + memset(&dyn->insts[ninst], 0, sizeof(instruction_native_t)); \ + dyn->insts[ninst].x64.addr = ip; \ + dyn->e.combined1 = dyn->e.combined2 = 0; \ + dyn->e.swapped = 0; \ + dyn->e.barrier = 0; \ + for (int i = 0; i < 16; ++i) \ + dyn->e.olds[i].v = 0; \ + dyn->insts[ninst].f_entry = dyn->f; \ + if (reset_n != -1) \ + dyn->vector_sew = ninst ? dyn->insts[ninst - 1].vector_sew : VECTOR_SEWNA; \ + if (ninst) \ + dyn->insts[ninst - 1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst - 1].x64.addr; -#define INST_EPILOG \ - dyn->insts[ninst].f_exit = dyn->f; \ - dyn->insts[ninst].e = dyn->e; \ - dyn->insts[ninst].x64.has_next = (ok>0)?1:0; +#define INST_EPILOG \ + dyn->insts[ninst].f_exit = dyn->f; \ + dyn->insts[ninst].e = dyn->e; \ + dyn->insts[ninst].vector_sew = dyn->vector_sew; \ + dyn->insts[ninst].x64.has_next = (ok > 0) ? 1 : 0; #define INST_NAME(name) #define DEFAULT \ --dyn->size; \ @@ -66,3 +72,6 @@ dynarec_log(LOG_NONE, "\n"); \ } \ return 0 + +#define SET_ELEMENT_WIDTH(s1, sew) \ + dyn->vector_sew = sew; diff --git a/src/dynarec/rv64/dynarec_rv64_pass1.h b/src/dynarec/rv64/dynarec_rv64_pass1.h index b76d7e97..c7813ba0 100644 --- a/src/dynarec/rv64/dynarec_rv64_pass1.h +++ b/src/dynarec/rv64/dynarec_rv64_pass1.h @@ -1,15 +1,19 @@ -#define INIT +#define INIT #define FINI #define MESSAGE(A, ...) 
do {} while (0) #define EMIT(A) do {} while (0) -#define NEW_INST \ - dyn->insts[ninst].f_entry = dyn->f; \ - dyn->e.combined1 = dyn->e.combined2 = 0;\ - for(int i=0; i<16; ++i) dyn->e.olds[i].v = 0;\ - dyn->e.swapped = 0; dyn->e.barrier = 0 +#define NEW_INST \ + dyn->insts[ninst].f_entry = dyn->f; \ + dyn->e.combined1 = dyn->e.combined2 = 0; \ + for (int i = 0; i < 16; ++i) \ + dyn->e.olds[i].v = 0; \ + if (reset_n != -1) \ + dyn->vector_sew = ninst ? dyn->insts[ninst - 1].vector_sew : VECTOR_SEWNA; \ + dyn->e.swapped = 0; \ + dyn->e.barrier = 0 #define INST_EPILOG \ dyn->insts[ninst].e = dyn->e; \ dyn->insts[ninst].f_exit = dyn->f -#define INST_NAME(name) +#define INST_NAME(name) diff --git a/src/dynarec/rv64/dynarec_rv64_pass2.h b/src/dynarec/rv64/dynarec_rv64_pass2.h index 6761a454..37a71b9a 100644 --- a/src/dynarec/rv64/dynarec_rv64_pass2.h +++ b/src/dynarec/rv64/dynarec_rv64_pass2.h @@ -7,13 +7,15 @@ #define MESSAGE(A, ...) do {} while (0) #define EMIT(A) do {dyn->insts[ninst].size+=4; dyn->native_size+=4;}while(0) -#define NEW_INST \ - if(ninst) { \ - dyn->insts[ninst].address = (dyn->insts[ninst-1].address+dyn->insts[ninst-1].size); \ - dyn->insts_size += 1+((dyn->insts[ninst-1].x64.size>(dyn->insts[ninst-1].size/4))?dyn->insts[ninst-1].x64.size:(dyn->insts[ninst-1].size/4))/15; \ - dyn->insts[ninst].ymm0_pass2 = dyn->ymm_zero; \ - } -#define INST_EPILOG dyn->insts[ninst].epilog = dyn->native_size; -#define INST_NAME(name) +#define NEW_INST \ + if (reset_n != -1) \ + dyn->vector_sew = ninst ? dyn->insts[ninst - 1].vector_sew : VECTOR_SEWNA; \ + if (ninst) { \ + dyn->insts[ninst].address = (dyn->insts[ninst - 1].address + dyn->insts[ninst - 1].size); \ + dyn->insts_size += 1 + ((dyn->insts[ninst - 1].x64.size > (dyn->insts[ninst - 1].size / 4)) ? 
dyn->insts[ninst - 1].x64.size : (dyn->insts[ninst - 1].size / 4)) / 15; \ + dyn->insts[ninst].ymm0_pass2 = dyn->ymm_zero; \ + } +#define INST_EPILOG dyn->insts[ninst].epilog = dyn->native_size; +#define INST_NAME(name) #define TABLE64(A, V) {Table64(dyn, (V), 2); EMIT(0); EMIT(0);} #define FTABLE64(A, V) {mmx87_regs_t v = {.d = V}; Table64(dyn, v.q, 2); EMIT(0); EMIT(0);} diff --git a/src/dynarec/rv64/dynarec_rv64_pass3.h b/src/dynarec/rv64/dynarec_rv64_pass3.h index 4a32a728..1dce2bc4 100644 --- a/src/dynarec/rv64/dynarec_rv64_pass3.h +++ b/src/dynarec/rv64/dynarec_rv64_pass3.h @@ -12,11 +12,13 @@ }while(0) #define MESSAGE(A, ...) if(box64_dynarec_dump) dynarec_log(LOG_NONE, __VA_ARGS__) -#define NEW_INST \ - if(box64_dynarec_dump) print_newinst(dyn, ninst); \ - if(ninst) { \ - addInst(dyn->instsize, &dyn->insts_size, dyn->insts[ninst-1].x64.size, dyn->insts[ninst-1].size/4); \ - dyn->insts[ninst].ymm0_pass3 = dyn->ymm_zero; \ +#define NEW_INST \ + if (reset_n != -1) \ + dyn->vector_sew = ninst ? 
dyn->insts[ninst - 1].vector_sew : VECTOR_SEWNA; \ + if (box64_dynarec_dump) print_newinst(dyn, ninst); \ + if (ninst) { \ + addInst(dyn->instsize, &dyn->insts_size, dyn->insts[ninst - 1].x64.size, dyn->insts[ninst - 1].size / 4); \ + dyn->insts[ninst].ymm0_pass3 = dyn->ymm_zero; \ } #define INST_EPILOG #define INST_NAME(name) inst_name_pass3(dyn, ninst, name, rex) diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h index 94c4cf23..70f58661 100644 --- a/src/dynarec/rv64/dynarec_rv64_private.h +++ b/src/dynarec/rv64/dynarec_rv64_private.h @@ -110,6 +110,7 @@ typedef struct instruction_rv64_s { flagcache_t f_exit; // flags status at end of intruction extcache_t e; // extcache at end of intruction (but before poping) flagcache_t f_entry; // flags status before the instruction begin + uint8_t vector_sew; } instruction_rv64_t; typedef struct dynarec_rv64_s { @@ -148,6 +149,7 @@ typedef struct dynarec_rv64_s { uint16_t ymm_zero; // bitmap of ymm to zero at purge uint8_t always_test; uint8_t abort; + uint8_t vector_sew; } dynarec_rv64_t; // convert idx (0..24) to reg index (10..31 0..1) diff --git a/src/dynarec/rv64/rv64_emitter.h b/src/dynarec/rv64/rv64_emitter.h index e0fe4403..4d574684 100644 --- a/src/dynarec/rv64/rv64_emitter.h +++ b/src/dynarec/rv64/rv64_emitter.h @@ -1215,6 +1215,8 @@ f28–31 ft8–11 FP temporaries Caller #define VECTOR_SEW16 0b001 #define VECTOR_SEW32 0b010 #define VECTOR_SEW64 0b011 +#define VECTOR_SEWNA 0b111 // N/A +#define VECTOR_SEWANY 0b1000 // any sew would be ok, but not N/A. #define VECTOR_MASKED 0 #define VECTOR_UNMASKED 1 |