Diffstat (limited to 'src/dynarec')
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_00.c         |  27
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx.c        |  60
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_0f.c     | 115
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_functions.c  |  16
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_helper.c     |  64
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_helper.h     |  17
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_pass0.h      |   1
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_private.h    |  13
-rw-r--r--  src/dynarec/dynarec_native.c                 |   8
-rw-r--r--  src/dynarec/dynarec_native_functions.c       |  30
-rw-r--r--  src/dynarec/dynarec_native_functions.h       |   9
-rw-r--r--  src/dynarec/dynarec_native_pass.c            |   2
-rw-r--r--  src/dynarec/la64/dynarec_la64_helper.h       |   2
-rw-r--r--  src/dynarec/la64/dynarec_la64_pass0.h        |   1
-rw-r--r--  src/dynarec/la64/dynarec_la64_private.h      |   5
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_helper.h       |   2
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_pass0.h        |   1
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_private.h      |   5
18 files changed, 359 insertions, 19 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c
index 91e0159d..f09f0803 100644
--- a/src/dynarec/arm64/dynarec_arm64_00.c
+++ b/src/dynarec/arm64/dynarec_arm64_00.c
@@ -2206,7 +2206,19 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 STRH_U12(x1, xEmu, offsetof(x64emu_t, segs[_ES]));
                 STRw_U12(xZR, xEmu, offsetof(x64emu_t, segs_serial[_ES]));
             } else {
-                DEFAULT;
+                vex_t vex = {0};
+                vex.rex = rex;
+                u8 = nextop;
+                vex.m = u8&0b00011111;
+                vex.rex.b = (u8&0b00100000)?0:1;
+                vex.rex.x = (u8&0b01000000)?0:1;
+                vex.rex.r = (u8&0b10000000)?0:1;
+                u8 = F8;
+                vex.p = u8&0b00000011;
+                vex.l = (u8>>2)&1;
+                vex.v = ((~u8)>>3)&0b1111;
+                vex.rex.w = (u8>>7)&1;
+                addr = dynarec64_AVX(dyn, addr, ip, ninst, vex, ok, need_epilog);
             }
             break;
         case 0xC5:
@@ -2220,7 +2232,18 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 STRH_U12(x1, xEmu, offsetof(x64emu_t, segs[_DS]));
                 STRw_U12(xZR, xEmu, offsetof(x64emu_t, segs_serial[_DS]));
             } else {
-                DEFAULT;
+                vex_t vex = {0};
+                vex.rex = rex;
+                u8 = nextop;
+                vex.p = u8&0b00000011;
+                vex.l = (u8>>2)&1;
+                vex.v = ((~u8)>>3)&0b1111;
+                vex.rex.r = (u8&0b10000000)?0:1;
+                vex.rex.b = 0;
+                vex.rex.x = 0;
+                vex.rex.w = 0;
+                vex.m = VEX_M_0F;
+                addr = dynarec64_AVX(dyn, addr, ip, ninst, vex, ok, need_epilog);
             }
             break;
         case 0xC6:
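
The two hunks above wire the 3-byte (0xC4) and 2-byte (0xC5) VEX escapes into the new AVX entry point by decoding the prefix payload into a vex_t. For reference, this is a standalone sketch of the same bit layout (R/X/B and vvvv are stored inverted in the encoding); the vex_info struct and the decode_vex3/decode_vex2 helpers are illustrative stand-ins, not box64's actual vex_t or API:

    #include <stdint.h>

    // Illustrative VEX prefix fields; box64's vex_t packs these differently.
    typedef struct {
        uint8_t m;          // opcode map: 1=0F, 2=0F38, 3=0F3A
        uint8_t p;          // implied prefix: 0=none, 1=66, 2=F3, 3=F2
        uint8_t l;          // vector length: 0=128-bit, 1=256-bit
        uint8_t v;          // extra source register (vvvv, stored inverted)
        uint8_t r, x, b, w; // REX-like extension bits (R/X/B stored inverted)
    } vex_info;

    // Decode the two payload bytes of a 3-byte (0xC4) VEX prefix.
    static vex_info decode_vex3(uint8_t byte1, uint8_t byte2)
    {
        vex_info v = {0};
        v.m = byte1 & 0x1F;          // bits 4..0: opcode map select
        v.b = !(byte1 & 0x20);       // bit 5: ~B
        v.x = !(byte1 & 0x40);       // bit 6: ~X
        v.r = !(byte1 & 0x80);       // bit 7: ~R
        v.p = byte2 & 0x03;          // bits 1..0: implied SIMD prefix
        v.l = (byte2 >> 2) & 1;      // bit 2: vector length
        v.v = (~byte2 >> 3) & 0x0F;  // bits 6..3: ~vvvv register specifier
        v.w = (byte2 >> 7) & 1;      // bit 7: W
        return v;
    }

    // The 2-byte (0xC5) form only carries ~R, ~vvvv, L and pp;
    // the map is implicitly 0F and X/B/W are implicitly 0.
    static vex_info decode_vex2(uint8_t byte1)
    {
        vex_info v = {0};
        v.m = 1;                     // implicit 0F map
        v.r = !(byte1 & 0x80);
        v.v = (~byte1 >> 3) & 0x0F;
        v.l = (byte1 >> 2) & 1;
        v.p = byte1 & 0x03;
        return v;
    }
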
"emu/x64run_private.h" +#include "x64run.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" +#include "my_cpuid.h" +#include "emu/x87emu_private.h" +#include "emu/x64shaext.h" + +#include "arm64_printer.h" +#include "dynarec_arm64_private.h" +#include "dynarec_arm64_functions.h" +#include "dynarec_arm64_helper.h" + +uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; (void)need_epilog; + + uint8_t opcode = F8; + uint8_t nextop, u8; + uint8_t gd, ed; + uint8_t wback, wb1, wb2; + uint8_t eb1, eb2, gb1, gb2; + int32_t i32, i32_; + int cacheupd = 0; + int v0, v1, v2; + int q0, q1, q2; + int d0, d1, d2; + int s0; + uint64_t tmp64u; + int64_t j64; + int64_t fixedaddress; + int unscaled; + MAYUSE(wb1); + MAYUSE(wb2); + MAYUSE(eb1); + MAYUSE(eb2); + MAYUSE(gb1); + MAYUSE(gb2); + MAYUSE(q0); + MAYUSE(q1); + MAYUSE(d0); + MAYUSE(d1); + MAYUSE(s0); + MAYUSE(j64); + MAYUSE(cacheupd); + #if STEP > 1 + static const int8_t mask_shift8[] = { -7, -6, -5, -4, -3, -2, -1, 0 }; + #endif + + rex_t rex = vex.rex; + + switch(opcode) { + + case 0xC6: + INST_NAME("VSHUFPS Gx, Vx, Ex, Ib"); + nextop = F8; + GETVX(v2, 0); + GETGX_empty(v0); + if(!MODREG) { + addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1); + v1 = -1; // to avoid a warning + } else + v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); + u8 = F8; + if(v2==v1 && (u8&0x3)==((u8>>2)&3) && (u8&0xf)==((u8>>4)&0xf)) { + VDUPQ_32(v0, v2, u8&3); + } else if(v2==v1 && (u8==0xe0)) { // easy special case + VMOVQ(v0, v2); + VMOVeS(v0, 1, v0, 0); + } else if(v0==v1 && (u8==0xe5)) { // easy special case + VMOVQ(v0, v2); + VMOVeS(v0, 0, v0, 1); + } else { + d0 = fpu_get_scratch(dyn); + // first two elements from Gx + for(int i=0; i<2; ++i) { + VMOVeS(d0, i, v2, (u8>>(i*2))&3); + } + // second two from Ex + if(MODREG) { + for(int i=2; i<4; ++i) { + VMOVeS(d0, i, v1, (u8>>(i*2))&3); + } + } else { + SMREAD(); + for(int i=2; i<4; ++i) { + ADDx_U12(x2, ed, ((u8>>(i*2))&3)*4); + VLD1_32(d0, i, x2); + } + } + VMOVQ(v0, d0); + } + if(vex.l) { + DEFAULT; /* TDOD! 
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
new file mode 100644
index 00000000..8addb9b1
--- /dev/null
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -0,0 +1,115 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+#include "my_cpuid.h"
+#include "emu/x87emu_private.h"
+#include "emu/x64shaext.h"
+
+#include "arm64_printer.h"
+#include "dynarec_arm64_private.h"
+#include "dynarec_arm64_functions.h"
+#include "dynarec_arm64_helper.h"
+
+uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog)
+{
+    (void)ip; (void)need_epilog;
+
+    uint8_t opcode = F8;
+    uint8_t nextop, u8;
+    uint8_t gd, ed;
+    uint8_t wback, wb1, wb2;
+    uint8_t eb1, eb2, gb1, gb2;
+    int32_t i32, i32_;
+    int cacheupd = 0;
+    int v0, v1, v2;
+    int q0, q1, q2;
+    int d0, d1, d2;
+    int s0;
+    uint64_t tmp64u;
+    int64_t j64;
+    int64_t fixedaddress;
+    int unscaled;
+    MAYUSE(wb1);
+    MAYUSE(wb2);
+    MAYUSE(eb1);
+    MAYUSE(eb2);
+    MAYUSE(gb1);
+    MAYUSE(gb2);
+    MAYUSE(q0);
+    MAYUSE(q1);
+    MAYUSE(d0);
+    MAYUSE(d1);
+    MAYUSE(s0);
+    MAYUSE(j64);
+    MAYUSE(cacheupd);
+    #if STEP > 1
+    static const int8_t mask_shift8[] = { -7, -6, -5, -4, -3, -2, -1, 0 };
+    #endif
+
+    rex_t rex = vex.rex;
+
+    switch(opcode) {
+
+        case 0xC6:
+            INST_NAME("VSHUFPS Gx, Vx, Ex, Ib");
+            nextop = F8;
+            GETVX(v2, 0);
+            GETGX_empty(v0);
+            if(!MODREG) {
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
+                v1 = -1;    // to avoid a warning
+            } else
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);
+            u8 = F8;
+            if(v2==v1 && (u8&0x3)==((u8>>2)&3) && (u8&0xf)==((u8>>4)&0xf)) {
+                VDUPQ_32(v0, v2, u8&3);
+            } else if(v2==v1 && (u8==0xe0)) {   // easy special case
+                VMOVQ(v0, v2);
+                VMOVeS(v0, 1, v0, 0);
+            } else if(v0==v1 && (u8==0xe5)) {   // easy special case
+                VMOVQ(v0, v2);
+                VMOVeS(v0, 0, v0, 1);
+            } else {
+                d0 = fpu_get_scratch(dyn);
+                // first two elements from Vx
+                for(int i=0; i<2; ++i) {
+                    VMOVeS(d0, i, v2, (u8>>(i*2))&3);
+                }
+                // second two from Ex
+                if(MODREG) {
+                    for(int i=2; i<4; ++i) {
+                        VMOVeS(d0, i, v1, (u8>>(i*2))&3);
+                    }
+                } else {
+                    SMREAD();
+                    for(int i=2; i<4; ++i) {
+                        ADDx_U12(x2, ed, ((u8>>(i*2))&3)*4);
+                        VLD1_32(d0, i, x2);
+                    }
+                }
+                VMOVQ(v0, d0);
+            }
+            if(vex.l) {
+                DEFAULT; /* TODO! */
+            } else YMM0(gd);
+            break;
+
+        default:
+            DEFAULT;
+    }
+    return addr;
+}
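
The VSHUFPS case assembles the result lane by lane with VMOVeS, taking shortcuts when the immediate allows. As a reference for what the 128-bit selection computes, here is a scalar model (a sketch of the x86 semantics, not box64 code):

    #include <stdint.h>

    // Scalar reference for 128-bit SHUFPS/VSHUFPS: the low two result lanes
    // are selected from the first source, the high two from the second,
    // each by a 2-bit field of the immediate.
    static void shufps128(float dst[4], const float src1[4],
                          const float src2[4], uint8_t imm)
    {
        float tmp[4];
        tmp[0] = src1[(imm >> 0) & 3];
        tmp[1] = src1[(imm >> 2) & 3];
        tmp[2] = src2[(imm >> 4) & 3];
        tmp[3] = src2[(imm >> 6) & 3];
        for (int i = 0; i < 4; ++i)
            dst[i] = tmp[i];
    }

Under this model, the u8==0xe0 shortcut is selector set {0,0,2,3} (lane 1 becomes a copy of lane 0, lanes 2-3 unchanged) and 0xe5 is {1,1,2,3}, which is why a single VMOVeS after the VMOVQ suffices in both special cases.
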
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index f62ade0d..7c4bac22 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -88,7 +88,7 @@ int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm)
 static void fpu_reset_reg_neoncache(neoncache_t* n)
 {
     n->fpu_reg = 0;
-    for (int i=0; i<24; ++i) {
+    for (int i=0; i<32; ++i) {
         n->fpuused[i]=0;
         n->neoncache[i].v = 0;
     }
@@ -456,7 +456,7 @@ void neoncacheUnwind(neoncache_t* cache)
             cache->ssecache[i*2+1].v = -1;
         }
     int x87reg = 0;
-    for(int i=0; i<24; ++i) {
+    for(int i=0; i<32; ++i) {
         if(cache->neoncache[i].v) {
             cache->fpuused[i] = 1;
             switch (cache->neoncache[i].t) {
@@ -467,6 +467,8 @@ void neoncacheUnwind(neoncache_t* cache)
                     break;
                 case NEON_CACHE_XMMR:
                 case NEON_CACHE_XMMW:
+                case NEON_CACHE_YMMR:
+                case NEON_CACHE_YMMW:
                     cache->ssecache[cache->neoncache[i].n].reg = i;
                     cache->ssecache[cache->neoncache[i].n].write = (cache->neoncache[i].t==NEON_CACHE_XMMW)?1:0;
                     ++cache->fpu_reg;
@@ -543,6 +545,8 @@ const char* getCacheName(int t, int n)
         case NEON_CACHE_MM: sprintf(buff, "MM%d", n); break;
         case NEON_CACHE_XMMW: sprintf(buff, "XMM%d", n); break;
         case NEON_CACHE_XMMR: sprintf(buff, "xmm%d", n); break;
+        case NEON_CACHE_YMMW: sprintf(buff, "YMM%d", n); break;
+        case NEON_CACHE_YMMR: sprintf(buff, "ymm%d", n); break;
        case NEON_CACHE_SCR: sprintf(buff, "Scratch"); break;
        case NEON_CACHE_NONE: buff[0]='\0'; break;
     }
@@ -580,7 +584,7 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
         dynarec_log(LOG_NONE, ", jmp=out");
     if(dyn->last_ip)
         dynarec_log(LOG_NONE, ", last_ip=%p", (void*)dyn->last_ip);
-    for(int ii=0; ii<24; ++ii) {
+    for(int ii=0; ii<32; ++ii) {
         switch(dyn->insts[ninst].n.neoncache[ii].t) {
             case NEON_CACHE_ST_D: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
             case NEON_CACHE_ST_F: dynarec_log(LOG_NONE, " S%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
@@ -588,11 +592,15 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
             case NEON_CACHE_MM: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
             case NEON_CACHE_XMMW: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
             case NEON_CACHE_XMMR: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
-            case NEON_CACHE_SCR: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
+            case NEON_CACHE_YMMW: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
+            case NEON_CACHE_YMMR: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
+            //case NEON_CACHE_SCR: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
             case NEON_CACHE_NONE:
             default: break;
         }
     }
+    if(dyn->ymm_zero)
+        dynarec_log(LOG_NONE, " ymm0_mask=%04x", dyn->ymm_zero);
     if(dyn->n.stack || dyn->insts[ninst].n.stack_next || dyn->insts[ninst].n.x87stack)
         dynarec_log(LOG_NONE, " X87:%d/%d(+%d/-%d)%d", dyn->n.stack, dyn->insts[ninst].n.stack_next, dyn->insts[ninst].n.stack_push, dyn->insts[ninst].n.stack_pop, dyn->insts[ninst].n.x87stack);
     if(dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2)
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 568483dd..5e406588 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -1673,6 +1673,18 @@ static void sse_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1)
             dyn->n.ssecache[i].v = -1;
         }
     }
+    //AVX
+    if(dyn->ymm_zero) {
+        if (old==-1) {
+            MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":"");
+            ++old;
+        }
+        for(int i=0; i<16; ++i)
+            if(is_avx_zero(dyn, ninst, i))
+                STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+        if(!next)
+            avx_mark_zero_reset(dyn, ninst);
+    }
     if(old!=-1) {
         MESSAGE(LOG_DUMP, "\t------ Purge SSE Cache\n");
     }
@@ -1684,10 +1696,17 @@ static void sse_reflectcache(dynarec_arm_t* dyn, int ninst, int s1)
         if(dyn->n.ssecache[i].v!=-1 && dyn->n.ssecache[i].write) {
             VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
         }
+    //AVX
+    if(dyn->ymm_zero)
+        for(int i=0; i<16; ++i)
+            if(is_avx_zero(dyn, ninst, i))
+                STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i]));
 }
 
 void sse_reflect_reg(dynarec_arm_t* dyn, int ninst, int a)
 {
+    if(is_avx_zero(dyn, ninst, a))
+        STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[a]));
     if(dyn->n.ssecache[a].v==-1)
         return;
     if(dyn->n.neoncache[dyn->n.ssecache[a].reg].t == NEON_CACHE_XMMW) {
@@ -1708,10 +1727,13 @@ void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07)
     if(!n)
         return;
     MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n);
-    for (int i=start; i<16; ++i)
+    for (int i=start; i<16; ++i) {
         if((dyn->n.ssecache[i].v!=-1) && (dyn->n.ssecache[i].write)) {
             VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
         }
+        if(is_avx_zero(dyn, ninst, i))
+            STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+    }
     MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n);
 }
@@ -1778,6 +1800,13 @@ static int findCacheSlot(dynarec_arm_t* dyn, int ninst, int t, int n, neoncache_
             case NEON_CACHE_XMMW:
                 if(t==NEON_CACHE_XMMR)
                     return i;
+            case NEON_CACHE_YMMR:
+                if(t==NEON_CACHE_YMMW)
+                    return i;
+                break;
+            case NEON_CACHE_YMMW:
+                if(t==NEON_CACHE_YMMR)
+                    return i;
                 break;
         }
     }
@@ -1790,9 +1819,9 @@ static void swapCache(dynarec_arm_t* dyn, int ninst, int i, int j, neoncache_t *
     if (i==j)
         return;
     int quad = 0;
-    if(cache->neoncache[i].t==NEON_CACHE_XMMR || cache->neoncache[i].t==NEON_CACHE_XMMW)
+    if(cache->neoncache[i].t==NEON_CACHE_XMMR || cache->neoncache[i].t==NEON_CACHE_XMMW || cache->neoncache[i].t==NEON_CACHE_YMMR || cache->neoncache[i].t==NEON_CACHE_YMMW)
         quad =1;
-    if(cache->neoncache[j].t==NEON_CACHE_XMMR || cache->neoncache[j].t==NEON_CACHE_XMMW)
+    if(cache->neoncache[j].t==NEON_CACHE_XMMR || cache->neoncache[j].t==NEON_CACHE_XMMW || cache->neoncache[j].t==NEON_CACHE_YMMR || cache->neoncache[j].t==NEON_CACHE_YMMW)
         quad =1;
 
     if(!cache->neoncache[i].v) {
@@ -1821,7 +1850,6 @@ static void swapCache(dynarec_arm_t* dyn, int ninst, int i, int j, neoncache_t *
         VMOV(i, j);
         VMOV(j, SCRATCH);
     }
-    #undef SCRATCH
     tmp.v = cache->neoncache[i].v;
     cache->neoncache[i].v = cache->neoncache[j].v;
     cache->neoncache[j].v = tmp.v;
@@ -1852,6 +1880,11 @@ static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int
             MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
             VLDR128_U12(i, xEmu, offsetof(x64emu_t, xmm[n]));
             break;
+        case NEON_CACHE_YMMR:
+        case NEON_CACHE_YMMW:
+            MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
+            VLDR128_U12(i, xEmu, offsetof(x64emu_t, ymm[n]));
+            break;
         case NEON_CACHE_MM:
             MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
             VLDR64_U12(i, xEmu, offsetof(x64emu_t, mmx[n]));
@@ -1900,12 +1933,17 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in
 {
     switch(t) {
         case NEON_CACHE_XMMR:
+        case NEON_CACHE_YMMR:
             MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n));
             break;
         case NEON_CACHE_XMMW:
             MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
             VSTR128_U12(i, xEmu, offsetof(x64emu_t, xmm[n]));
             break;
+        case NEON_CACHE_YMMW:
+            MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
+            VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[n]));
+            break;
         case NEON_CACHE_MM:
             MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
             VSTR64_U12(i, xEmu, offsetof(x64emu_t, mmx[n]));
@@ -2047,11 +2085,18 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
                     cache.neoncache[i].t = NEON_CACHE_ST_D;
                 } else if(cache.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW)
                     { cache.neoncache[i].t = NEON_CACHE_XMMW; }
+                else if(cache.neoncache[i].t == NEON_CACHE_YMMR && cache_i2.neoncache[i].t == NEON_CACHE_YMMW)
+                    { cache.neoncache[i].t = NEON_CACHE_YMMW; }
                 else if(cache.neoncache[i].t == NEON_CACHE_XMMW && cache_i2.neoncache[i].t == NEON_CACHE_XMMR) {
                     // refresh cache...
                     MESSAGE(LOG_DUMP, "\t - Refreh %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
                     VSTR128_U12(i, xEmu, offsetof(x64emu_t, xmm[cache.neoncache[i].n]));
                     cache.neoncache[i].t = NEON_CACHE_XMMR;
+                } else if(cache.neoncache[i].t == NEON_CACHE_YMMW && cache_i2.neoncache[i].t == NEON_CACHE_YMMR) {
+                    // refresh cache...
+                    MESSAGE(LOG_DUMP, "\t - Refreh %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[cache.neoncache[i].n]));
+                    cache.neoncache[i].t = NEON_CACHE_YMMR;
                 }
             }
         }
@@ -2309,3 +2354,14 @@ void fpu_propagate_stack(dynarec_arm_t* dyn, int ninst)
     dyn->n.stack_push = 0;
     dyn->n.swapped = 0;
 }
+
+void avx_purge_ymm0(dynarec_arm_t* dyn, int ninst)
+{
+    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Purge YMM Zero mask=%04x --------\n", dyn->insts[ninst].purge_ymm0);
+    for(int i=0; i<16; ++i)
+        if(dyn->insts[ninst].purge_ymm0&(1<<i) && is_avx_zero(dyn, ninst, i)) {
+            STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+            avx_unmark_zero(dyn, ninst, i);
+        }
+    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "---------- Purge YMM Zero\n");
+}
\ No newline at end of file
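
These helper.c changes thread one idea through every purge/reflect path: a YMM register whose upper half is known to be zero never occupies a NEON register; it only owes memory a deferred 128-bit zero store (the STPx_S7_offset of two xZR into emu->ymm[i]). A toy model of that bookkeeping, with invented names, to show the intent:

    #include <stdint.h>
    #include <string.h>

    // Minimal model of the lazy-zero tracking; not box64 code.
    typedef struct {
        uint64_t ymm_hi[16][2]; // upper 128 bits of each YMM register
        uint16_t ymm_zero;      // bit i set => upper half of ymm[i] is zero,
                                //              but not yet written to memory
    } toy_state;

    // After a 128-bit AVX write to reg, the upper half must read as zero;
    // instead of storing now, just remember it in the mask.
    static void mark_upper_zero(toy_state* st, int reg)
    {
        st->ymm_zero |= (uint16_t)(1u << reg);
    }

    // At a purge point (branch, call, ...), materialize the deferred
    // zeroing, as the STPx xZR,xZR stores do in the diff above.
    static void purge_upper_zero(toy_state* st)
    {
        for (int i = 0; i < 16; ++i)
            if (st->ymm_zero & (1u << i))
                memset(st->ymm_hi[i], 0, sizeof st->ymm_hi[i]);
        st->ymm_zero = 0;
    }
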
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 96c56be8..36950291 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -449,6 +449,14 @@
     gd = ((nextop&0x38)>>3)+(rex.r<<3);         \
     a = sse_get_reg_empty(dyn, ninst, x1, gd)
 
+// Get VX as a quad (might use x1)
+#define GETVX(a, w)                             \
+    a = sse_get_reg(dyn, ninst, x1, vex.v, w)
+
+// Get an empty VX (use x1)
+#define GETVX_empty(a)                          \
+    a = sse_get_reg_empty(dyn, ninst, x1, vex.v)
+
 // Get EX as a quad, (x1 is used)
 #define GETEX(a, w, D)      \
     if(MODREG) {            \
@@ -530,6 +538,7 @@
         SMWRITE2();         \
     }
 
+#define YMM0(a) avx_mark_zero(dyn, ninst, gd);
 
 // Get Direction with size Z and based of F_DF flag, on register r ready for LDR/STR fetching
 // F_DF is 1<<10, so 1 ROR 11*2 (so F_OF)
@@ -1016,6 +1025,8 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr);
 #define dynarec64_66F0 STEPNAME(dynarec64_66F0)
 #define dynarec64_F20F STEPNAME(dynarec64_F20F)
 #define dynarec64_F30F STEPNAME(dynarec64_F30F)
+#define dynarec64_AVX STEPNAME(dynarec64_AVX)
+#define dynarec64_AVX_0F STEPNAME(dynarec64_AVX_0F)
 
 #define geted STEPNAME(geted)
 #define geted32 STEPNAME(geted32)
@@ -1162,6 +1173,7 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr);
 #define x87_purgecache STEPNAME(x87_purgecache)
 #define fpu_reflectcache STEPNAME(fpu_reflectcache)
 #define fpu_unreflectcache STEPNAME(fpu_unreflectcache)
+#define avx_purge_ymm0 STEPNAME(avx_purge_ymm0)
 
 #define CacheTransform STEPNAME(CacheTransform)
 
@@ -1322,6 +1334,8 @@ int x87_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
 void x87_restoreround(dynarec_arm_t* dyn, int ninst, int s1);
 // Set rounding according to mxcsr flags, return reg to restore flags
 int sse_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
+// purge ymm_zero mask according to purge_ymm0
+void avx_purge_ymm0(dynarec_arm_t* dyn, int ninst);
 
 void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3);
 
@@ -1420,6 +1434,8 @@ uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
 uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
+uintptr_t dynarec64_AVX(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
+uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 
 #if STEP < 2
 #define PASS2(A)
@@ -1577,5 +1593,6 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
         }                           \
     }
 
+#define PURGE_YMM0()    avx_purge_ymm0(dyn, ninst)
 
 #endif //__DYNAREC_ARM64_HELPER_H__
diff --git a/src/dynarec/arm64/dynarec_arm64_pass0.h b/src/dynarec/arm64/dynarec_arm64_pass0.h
index 7d4c0c2d..6e9b8019 100644
--- a/src/dynarec/arm64/dynarec_arm64_pass0.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass0.h
@@ -26,6 +26,7 @@
     dyn->n.combined1 = dyn->n.combined2 = 0;\
     dyn->n.swapped = 0; dyn->n.barrier = 0; \
     dyn->insts[ninst].f_entry = dyn->f;     \
+    dyn->insts[ninst].ymm_zero = dyn->ymm_zero;\
     if(ninst) {dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr;}
 
 #define INST_EPILOG \
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index b26d522d..2788ddc4 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -16,7 +16,9 @@ typedef struct instsize_s instsize_t;
 #define NEON_CACHE_MM   4
 #define NEON_CACHE_XMMW 5
 #define NEON_CACHE_XMMR 6
-#define NEON_CACHE_SCR  7
+#define NEON_CACHE_YMMW 7
+#define NEON_CACHE_YMMR 8
+#define NEON_CACHE_SCR  9
 typedef union neon_cache_s {
     int8_t v;
     struct {
@@ -33,7 +35,7 @@ typedef union sse_cache_s {
 } sse_cache_t;
 typedef struct neoncache_s {
     // Neon cache
-    neon_cache_t    neoncache[24];
+    neon_cache_t    neoncache[32];
     int8_t          stack;
     int8_t          stack_next;
     int8_t          stack_pop;
@@ -51,7 +53,7 @@ typedef struct neoncache_s {
     int16_t         tags;           // similar to fpu_tags
     int8_t          mmxcache[8];    // cache status for the 8 MMX registers
     sse_cache_t     ssecache[16];   // cache status for the 16 SSE(2) registers
-    int8_t          fpuused[24];    // all 0..24 double reg from fpu, used by x87, sse and mmx
+    int8_t          fpuused[32];    // all neon regs, used by x87, mmx, sse and avx
     int8_t          x87stack;       // cache stack counter
     int8_t          mmxcount;       // number of mmx register used (not both mmx and x87 at the same time)
     int8_t          fpu_scratch;    // scratch counter
@@ -78,7 +80,9 @@ typedef struct instruction_arm64_s {
     uintptr_t       marklock;
     int             pass2choice;    // value for choices that are fixed on pass2 for pass3
     uintptr_t       natcall;
-    int             retn;
+    uint16_t        retn;
+    uint16_t        ymm_zero;       // bitmap of ymm to zero at purge
+    uint16_t        purge_ymm0;     // need to purge some ymm0 because of a loop
     uint8_t         barrier_maybe;
     uint8_t         will_write;
     uint8_t         last_write;
@@ -118,6 +122,7 @@ typedef struct dynarec_arm_s {
     uintptr_t       forward_to;     // address of the next jump to (to check if everything is ok)
     int32_t         forward_size;   // size at the forward point
     int             forward_ninst;  // ninst at the forward point
+    uint16_t        ymm_zero;       // bitmap of ymm to zero at purge
     uint8_t         smwrite;        // for strongmem model emulation
     uint8_t         smread;
     uint8_t         doublepush;
diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index 7ac19234..4f509ac1 100644
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -553,9 +553,15 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
                     k=i2;
                 }*/
                 if(k!=-1) {
-                    if(k!=-1 && !helper.insts[i].barrier_maybe)
+                    if(!helper.insts[i].barrier_maybe)
                         helper.insts[k].x64.barrier |= BARRIER_FULL;
                     helper.insts[i].x64.jmp_insts = k;
+                    if(helper.insts[i].ymm_zero || helper.insts[k].ymm_zero) {
+                        // move to purge the regs that are present in k (jump to) but not in i (jump from)
+                        uint16_t to_purge = helper.insts[k].ymm_zero & ~helper.insts[i].ymm_zero;
+                        helper.insts[k].purge_ymm0 |= to_purge;
+                        helper.insts[k].ymm_zero &= ~to_purge;
+                    }
                 }
             }
         }
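
The FillBlock64 hunk reconciles the lazy-zero mask across jumps: if a register is assumed zero at the jump target k but not on the incoming edge from i, the assumption is dropped at k, and the straight-line path materializes the zero via PURGE_YMM0() so both paths agree. A worked example of the mask arithmetic (values are made up):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t zero_at_target = 0x0013; // ymm0, ymm1, ymm4 assumed zero at the jump target
        uint16_t zero_at_source = 0x0011; // only ymm0 and ymm4 known zero on the incoming edge

        // ymm1 is assumed at the target but not guaranteed by the edge:
        uint16_t to_purge = zero_at_target & ~zero_at_source; // == 0x0002

        zero_at_target &= ~to_purge; // stop assuming it at the target...
        printf("purge_ymm0=%04x ymm_zero=%04x\n", to_purge, zero_at_target);
        // ...and the purge pass stores the zeros for the purged set on the
        // fall-through path, where they really are zero.
        return 0;
    }
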
diff --git a/src/dynarec/dynarec_native_functions.c b/src/dynarec/dynarec_native_functions.c
index 1cb2834d..10d6f333 100644
--- a/src/dynarec/dynarec_native_functions.c
+++ b/src/dynarec/dynarec_native_functions.c
@@ -507,7 +507,7 @@ uint8_t geted_ib(dynarec_native_t* dyn, uintptr_t addr, int ninst, uint8_t nexto
 }
 #undef F8
 
-int isNativeCall(dynarec_native_t* dyn, uintptr_t addr, uintptr_t* calladdress, int* retn)
+int isNativeCall(dynarec_native_t* dyn, uintptr_t addr, uintptr_t* calladdress, uint16_t* retn)
 {
     (void)dyn;
 
@@ -533,3 +533,31 @@ int isNativeCall(dynarec_native_t* dyn, uintptr_t addr, uintptr_t* calladdress,
 #undef PK32
 #undef PK
 }
+
+// AVX
+void avx_mark_zero(dynarec_native_t* dyn, int ninst, int reg)
+{
+    dyn->ymm_zero |= (1<<reg);
+}
+
+int is_avx_zero(dynarec_native_t* dyn, int ninst, int reg)
+{
+    return (dyn->ymm_zero>>reg)&1;
+}
+int is_avx_zero_unset(dynarec_native_t* dyn, int ninst, int reg)
+{
+    if((dyn->ymm_zero>>reg)&1) {
+        dyn->ymm_zero &= ~(1<<reg);
+        return 1;
+    }
+    return 0;
+}
+void avx_mark_zero_reset(dynarec_native_t* dyn, int ninst)
+{
+    dyn->ymm_zero = 0;
+}
+
+void avx_unmark_zero(dynarec_native_t* dyn, int ninst, int reg)
+{
+    dyn->ymm_zero &= ~(1<<reg);
+}
diff --git a/src/dynarec/dynarec_native_functions.h b/src/dynarec/dynarec_native_functions.h
index 533dfeeb..3e81081b 100644
--- a/src/dynarec/dynarec_native_functions.h
+++ b/src/dynarec/dynarec_native_functions.h
@@ -65,7 +65,14 @@ uintptr_t fakeed(dynarec_native_t* dyn, uintptr_t addr, int ninst, uint8_t nexto
 uint8_t geted_ib(dynarec_native_t* dyn, uintptr_t addr, int ninst, uint8_t nextop);
 
 // Is what pointed at addr a native call? And if yes, to what function?
-int isNativeCall(dynarec_native_t* dyn, uintptr_t addr, uintptr_t* calladdress, int* retn);
+int isNativeCall(dynarec_native_t* dyn, uintptr_t addr, uintptr_t* calladdress, uint16_t* retn);
+
+// AVX utilities
+void avx_mark_zero(dynarec_native_t* dyn, int ninst, int reg);
+int is_avx_zero(dynarec_native_t* dyn, int ninst, int reg);
+int is_avx_zero_unset(dynarec_native_t* dyn, int ninst, int reg);
+void avx_mark_zero_reset(dynarec_native_t* dyn, int ninst);
+void avx_unmark_zero(dynarec_native_t* dyn, int ninst, int reg);
 
 ADDITIONNAL_DEFINITION()
 
diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index 2772a973..cab03222 100644
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -80,6 +80,8 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
         }
 #endif
         fpu_propagate_stack(dyn, ninst);
+        if(dyn->insts[ninst].purge_ymm0)
+            PURGE_YMM0();
         ip = addr;
         if (reset_n!=-1) {
             dyn->last_ip = 0;
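
The new avx_* helpers are plain accessors over the dyn->ymm_zero bitmap (ninst is accepted but currently unused), with is_avx_zero_unset as the test-and-clear variant. A self-contained illustration of their semantics, with a stub struct standing in for dynarec_native_t:

    #include <stdint.h>
    #include <assert.h>

    // Minimal stand-in for dynarec_native_t, just enough to show the bitmap.
    typedef struct { uint16_t ymm_zero; } fake_dyn;

    static void mark_zero(fake_dyn* d, int reg)  { d->ymm_zero |= (uint16_t)(1u << reg); }
    static int  is_zero(fake_dyn* d, int reg)    { return (d->ymm_zero >> reg) & 1; }
    static int  is_zero_unset(fake_dyn* d, int reg)
    {
        if ((d->ymm_zero >> reg) & 1) {
            d->ymm_zero &= (uint16_t)~(1u << reg); // clear as a side effect
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        fake_dyn d = { 0 };
        mark_zero(&d, 3);          // VEX.128 write to ymm3: upper half now zero
        assert(is_zero(&d, 3));
        assert(is_zero_unset(&d, 3));  // test-and-clear: returns 1 once...
        assert(!is_zero(&d, 3));       // ...then the bit is gone
        return 0;
    }
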
diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h
index fbc91e34..def767b4 100644
--- a/src/dynarec/la64/dynarec_la64_helper.h
+++ b/src/dynarec/la64/dynarec_la64_helper.h
@@ -1075,4 +1075,6 @@ uintptr_t dynarec64_F20F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
     }                               \
     } while (0)
 
+#define PURGE_YMM0() /* TODO */
+
 #endif //__DYNAREC_LA64_HELPER_H__
\ No newline at end of file
diff --git a/src/dynarec/la64/dynarec_la64_pass0.h b/src/dynarec/la64/dynarec_la64_pass0.h
index 99a897a6..3990caa4 100644
--- a/src/dynarec/la64/dynarec_la64_pass0.h
+++ b/src/dynarec/la64/dynarec_la64_pass0.h
@@ -30,6 +30,7 @@
     dyn->lsx.combined1 = dyn->lsx.combined2 = 0; \
     dyn->lsx.swapped = 0;                        \
     dyn->lsx.barrier = 0;                        \
+    dyn->insts[ninst].ymm_zero = dyn->ymm_zero;  \
     dyn->insts[ninst].f_entry = dyn->f;          \
     if (ninst) { dyn->insts[ninst - 1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst - 1].x64.addr; }
 
 #define INST_EPILOG \
diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h
index 7b76a75f..b31d3f2e 100644
--- a/src/dynarec/la64/dynarec_la64_private.h
+++ b/src/dynarec/la64/dynarec_la64_private.h
@@ -79,7 +79,9 @@ typedef struct instruction_la64_s {
     uintptr_t marklock;
     int pass2choice;    // value for choices that are fixed on pass2 for pass3
     uintptr_t natcall;
-    int retn;
+    uint16_t retn;
+    uint16_t ymm_zero;   // bitmap of ymm to zero at purge
+    uint16_t purge_ymm0; // need to purge some ymm0 because of a loop
     uint8_t barrier_maybe;
     uint8_t will_write;
     uint8_t last_write;
@@ -119,6 +121,7 @@ typedef struct dynarec_la64_s {
     uintptr_t forward_to;   // address of the next jump to (to check if everything is ok)
     int32_t forward_size;   // size at the forward point
     int forward_ninst;      // ninst at the forward point
+    uint16_t ymm_zero;      // bitmap of ymm to zero at purge
     uint8_t smread;         // for strongmem model emulation
     uint8_t smwrite;        // for strongmem model emulation
     uint8_t always_test;
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 6a374499..fd680474 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -1671,4 +1671,6 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
     BLT(reg, s, 4 + 4); \
     ADDIW(reg, s, -1);
 
+#define PURGE_YMM0() /* TODO */
+
 #endif //__DYNAREC_RV64_HELPER_H__
diff --git a/src/dynarec/rv64/dynarec_rv64_pass0.h b/src/dynarec/rv64/dynarec_rv64_pass0.h
index 3ee1685f..9c2de9ee 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass0.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass0.h
@@ -28,6 +28,7 @@
     dyn->e.swapped = 0; dyn->e.barrier = 0; \
     for(int i=0; i<16; ++i) dyn->e.olds[i].v = 0;\
     dyn->insts[ninst].f_entry = dyn->f; \
+    dyn->insts[ninst].ymm_zero = dyn->ymm_zero;\
     if(ninst) {dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr;}
 
 #define INST_EPILOG \
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index 3acbdfb6..dff6f84e 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -89,7 +89,9 @@ typedef struct instruction_rv64_s {
     uintptr_t marklock;
     int pass2choice;    // value for choices that are fixed on pass2 for pass3
     uintptr_t natcall;
-    int retn;
+    uint16_t retn;
+    uint16_t ymm_zero;   // bitmap of ymm to zero at purge
+    uint16_t purge_ymm0; // need to purge some ymm0 because of a loop
     int barrier_maybe;
     flagcache_t f_exit;  // flags status at end of intruction
     extcache_t e;        // extcache at end of intruction (but before poping)
@@ -129,6 +131,7 @@ typedef struct dynarec_rv64_s {
     uintptr_t forward_to;   // address of the next jump to (to check if everything is ok)
     int32_t forward_size;   // size at the forward point
     int forward_ninst;      // ninst at the forward point
+    uint16_t ymm_zero;      // bitmap of ymm to zero at purge
     uint8_t always_test;
     uint8_t abort;
 } dynarec_rv64_t;