diff options
| author | Yang Liu <liuyang22@iscas.ac.cn> | 2024-10-26 23:50:24 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-10-26 17:50:24 +0200 |
| commit | 5563103453e96b6894a8cddc7cbbe7dba234a983 (patch) | |
| tree | bb845edb96c07bd04ef9ee05e3cd727706901ae9 /src | |
| parent | a1d62eb92bd8139e7e8ce61ee18743492d429974 (diff) | |
| download | box64-5563103453e96b6894a8cddc7cbbe7dba234a983.tar.gz box64-5563103453e96b6894a8cddc7cbbe7dba234a983.zip | |
[RV64_DYNAREC] Implemented the first AVX128 opcode for scalar only (#1962)
Diffstat (limited to 'src')
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_00_3.c | 40 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_avx.c | 63 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c | 96 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_f30f.c | 2 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_functions.c | 19 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.c | 129 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.h | 25 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_private.h | 6 |
8 files changed, 368 insertions, 12 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_00_3.c b/src/dynarec/rv64/dynarec_rv64_00_3.c index 1b76badc..cd2914cd 100644 --- a/src/dynarec/rv64/dynarec_rv64_00_3.c +++ b/src/dynarec/rv64/dynarec_rv64_00_3.c @@ -300,7 +300,45 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int *need_epilog = 0; *ok = 0; break; - + case 0xC4: + nextop = F8; + if (rex.is32bits && !(MODREG)) { + DEFAULT; + } else { + vex_t vex = { 0 }; + vex.rex = rex; + u8 = nextop; + vex.m = u8 & 0b00011111; + vex.rex.b = (u8 & 0b00100000) ? 0 : 1; + vex.rex.x = (u8 & 0b01000000) ? 0 : 1; + vex.rex.r = (u8 & 0b10000000) ? 0 : 1; + u8 = F8; + vex.p = u8 & 0b00000011; + vex.l = (u8 >> 2) & 1; + vex.v = ((~u8) >> 3) & 0b1111; + vex.rex.w = (u8 >> 7) & 1; + addr = dynarec64_AVX(dyn, addr, ip, ninst, vex, ok, need_epilog); + } + break; + case 0xC5: + nextop = F8; + if (rex.is32bits && !(MODREG)) { + DEFAULT; + } else { + vex_t vex = { 0 }; + vex.rex = rex; + u8 = nextop; + vex.p = u8 & 0b00000011; + vex.l = (u8 >> 2) & 1; + vex.v = ((~u8) >> 3) & 0b1111; + vex.rex.r = (u8 & 0b10000000) ? 
0 : 1; + vex.rex.b = 0; + vex.rex.x = 0; + vex.rex.w = 0; + vex.m = VEX_M_0F; + addr = dynarec64_AVX(dyn, addr, ip, ninst, vex, ok, need_epilog); + } + break; case 0xC6: INST_NAME("MOV Eb, Ib"); nextop=F8; diff --git a/src/dynarec/rv64/dynarec_rv64_avx.c b/src/dynarec/rv64/dynarec_rv64_avx.c new file mode 100644 index 00000000..c9d80f2a --- /dev/null +++ b/src/dynarec/rv64/dynarec_rv64_avx.c @@ -0,0 +1,63 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "dynarec.h" +#include "emu/x64emu_private.h" +#include "emu/x64run_private.h" +#include "x64run.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "rv64_printer.h" +#include "dynarec_rv64_private.h" +#include "dynarec_rv64_functions.h" +#include "dynarec_rv64_helper.h" + +static const char* avx_prefix_string(uint16_t p) +{ + switch (p) { + case VEX_P_NONE: return "0"; + case VEX_P_66: return "66"; + case VEX_P_F2: return "F2"; + case VEX_P_F3: return "F3"; + default: return "??"; + } +} +static const char* avx_map_string(uint16_t m) +{ + switch (m) { + case VEX_M_NONE: return "0"; + case VEX_M_0F: return "0F"; + case VEX_M_0F38: return "0F38"; + case VEX_M_0F3A: return "0F3A"; + default: return "??"; + } +} + +uintptr_t dynarec64_AVX(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = PK(0); + rex_t rex = vex.rex; + + if ((vex.m == VEX_M_0F) && (vex.p == VEX_P_F3)) + addr = dynarec64_AVX_F3_0F(dyn, addr, ip, ninst, vex, ok, need_epilog); + else { + DEFAULT; + } + + if ((*ok == -1) && (box64_dynarec_log >= LOG_INFO || box64_dynarec_dump || box64_dynarec_missing == 1)) { + dynarec_log(LOG_NONE, "Dynarec unimplemented AVX opcode size %d prefix %s map %s opcode %02X ", 128 << vex.l, avx_prefix_string(vex.p), 
avx_map_string(vex.m), opcode); + } + return addr; +} diff --git a/src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c b/src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c new file mode 100644 index 00000000..443ef949 --- /dev/null +++ b/src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c @@ -0,0 +1,96 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "dynarec.h" +#include "emu/x64emu_private.h" +#include "emu/x64run_private.h" +#include "x64run.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" +#include "my_cpuid.h" +#include "emu/x87emu_private.h" +#include "emu/x64shaext.h" + +#include "rv64_printer.h" +#include "dynarec_rv64_private.h" +#include "dynarec_rv64_functions.h" +#include "dynarec_rv64_helper.h" + +uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + uint8_t nextop, u8; + uint8_t gd, ed, vd; + uint8_t wback, wb1, wb2, gback, vback; + uint8_t eb1, eb2, gb1, gb2; + int32_t i32, i32_; + int cacheupd = 0; + int v0, v1, v2; + int q0, q1, q2; + int d0, d1, d2; + int s0; + uint64_t tmp64u, u64; + int64_t j64; + int64_t fixedaddress, gdoffset, vxoffset; + int unscaled; + + rex_t rex = vex.rex; + + switch (opcode) { + case 0x10: + INST_NAME("VMOVSS Gx, [Vx,] Ex"); + nextop = F8; + GETG; + if (MODREG) { + if (gd == vex.v) { + v0 = sse_get_reg(dyn, ninst, x1, gd, 1); + q0 = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 1); + FMVS(v0, q0); + } else { + GETGX(); + GETVX(); + GETEX(x2, 0, 1); + if (rv64_xtheadmempair) { + ADDI(x1, vback, vxoffset); + TH_LDD(x3, x4, x1, 0); + } else { + LD(x3, vback, vxoffset); + LD(x4, vback, vxoffset + 8); + } + LWU(x5, wback, fixedaddress); + if (rv64_xtheadmempair) { + ADDI(x1, gback, gdoffset); + TH_SDD(x3, x4, 
x1, 0); + } else { + SD(x3, gback, gdoffset); + SD(x4, gback, gdoffset + 8); + } + SW(x5, gback, gdoffset); + } + } else { + v0 = sse_get_reg_empty(dyn, ninst, x1, gd, 1); + SMREAD(); + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, 0); + FLW(v0, ed, fixedaddress); + // reset upper part + SW(xZR, xEmu, offsetof(x64emu_t, xmm[gd]) + 4); + SD(xZR, xEmu, offsetof(x64emu_t, xmm[gd]) + 8); + } + YMM0(gd); + break; + default: + DEFAULT; + } + return addr; +} diff --git a/src/dynarec/rv64/dynarec_rv64_f30f.c b/src/dynarec/rv64/dynarec_rv64_f30f.c index 29a65c33..97fa1935 100644 --- a/src/dynarec/rv64/dynarec_rv64_f30f.c +++ b/src/dynarec/rv64/dynarec_rv64_f30f.c @@ -59,7 +59,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int } else { v0 = sse_get_reg_empty(dyn, ninst, x1, gd, 1); SMREAD(); - addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 8, 0); + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, 0); FLW(v0, ed, fixedaddress); // reset upper part SW(xZR, xEmu, offsetof(x64emu_t, xmm[gd]) + 4); diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c index 619041be..234f3b6d 100644 --- a/src/dynarec/rv64/dynarec_rv64_functions.c +++ b/src/dynarec/rv64/dynarec_rv64_functions.c @@ -92,11 +92,11 @@ int fpu_get_reg_xmm(dynarec_rv64_t* dyn, int t, int xmm) return EXTREG(i); } // Reset fpu regs counter -void fpu_reset_reg_extcache(dynarec_rv64_t* dyn, extcache_t* e) +static void fpu_reset_reg_extcache(dynarec_rv64_t* dyn, extcache_t* e) { e->fpu_reg = 0; - for (int i=0; i<24; ++i) { - e->fpuused[i]=0; + for (int i = 0; i < 32; ++i) { + e->fpuused[i] = 0; e->extcache[i].v = 0; } dyn->vector_sew = VECTOR_SEWNA; @@ -492,7 +492,7 @@ void extcacheUnwind(extcache_t* cache) cache->ssecache[i*2+1].v = -1; } int x87reg = 0; - for(int i=0; i<24; ++i) { + for (int i = 0; i < 32; ++i) { if(cache->extcache[i].v) { 
cache->fpuused[i] = 1; switch (cache->extcache[i].t) { @@ -515,6 +515,8 @@ void extcacheUnwind(extcache_t* cache) break; case EXT_CACHE_XMMR: case EXT_CACHE_XMMW: + case EXT_CACHE_YMMR: + case EXT_CACHE_YMMW: cache->ssecache[cache->extcache[i].n].reg = EXTREG(i); cache->ssecache[cache->extcache[i].n].vector = 1; cache->ssecache[cache->extcache[i].n].write = (cache->extcache[i].t == EXT_CACHE_XMMW) ? 1 : 0; @@ -605,6 +607,8 @@ const char* getCacheName(int t, int n) case EXT_CACHE_SCR: sprintf(buff, "Scratch"); break; case EXT_CACHE_XMMW: sprintf(buff, "XMM%d", n); break; case EXT_CACHE_XMMR: sprintf(buff, "xmm%d", n); break; + case EXT_CACHE_YMMW: sprintf(buff, "YMM%d", n); break; + case EXT_CACHE_YMMR: sprintf(buff, "ymm%d", n); break; case EXT_CACHE_NONE: buff[0]='\0'; break; } return buff; @@ -654,7 +658,7 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r dynarec_log(LOG_NONE, ", jmp=out"); if(dyn->last_ip) dynarec_log(LOG_NONE, ", last_ip=%p", (void*)dyn->last_ip); - for(int ii=0; ii<24; ++ii) { + for (int ii = 0; ii < 32; ++ii) { switch(dyn->insts[ninst].e.extcache[ii].t) { case EXT_CACHE_ST_D: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; case EXT_CACHE_ST_F: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; @@ -664,11 +668,15 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r case EXT_CACHE_SD: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; case EXT_CACHE_XMMR: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; case EXT_CACHE_XMMW: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], 
getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; + case EXT_CACHE_YMMW: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; + case EXT_CACHE_YMMR: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; case EXT_CACHE_SCR: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; case EXT_CACHE_NONE: default: break; } } + if (dyn->ymm_zero) + dynarec_log(LOG_NONE, " ymm0_mask = %04x", dyn->ymm_zero); if(dyn->e.stack || dyn->insts[ninst].e.stack_next || dyn->insts[ninst].e.x87stack) dynarec_log(LOG_NONE, " X87:%d/%d(+%d/-%d)%d", dyn->e.stack, dyn->insts[ninst].e.stack_next, dyn->insts[ninst].e.stack_push, dyn->insts[ninst].e.stack_pop, dyn->insts[ninst].e.x87stack); if(dyn->insts[ninst].e.combined1 || dyn->insts[ninst].e.combined2) @@ -733,6 +741,7 @@ void fpu_reset(dynarec_rv64_t* dyn) mmx_reset(&dyn->e); sse_reset(&dyn->e); fpu_reset_reg(dyn); + dyn->ymm_zero = 0; } void fpu_reset_ninst(dynarec_rv64_t* dyn, int ninst) diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c index be4298a8..9b2d69ee 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.c +++ b/src/dynarec/rv64/dynarec_rv64_helper.c @@ -1897,6 +1897,26 @@ static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1) dyn->e.ssecache[i].v = -1; } } + + // AVX + if (dyn->ymm_zero) { + if (old == -1) { + MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next ? 
"locally " : ""); + ++old; + } + for (int i = 0; i < 16; ++i) + if (is_avx_zero(dyn, ninst, i)) { + if (rv64_xtheadmempair) { + ADDI(s1, xEmu, offsetof(x64emu_t, ymm[i])); + TH_SDD(xZR, xZR, s1, 0); + } else { + SD(xZR, xEmu, offsetof(x64emu_t, ymm[i])); + SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]) + 8); + } + } + if (!next) + avx_mark_zero_reset(dyn, ninst); + } if(old!=-1) { MESSAGE(LOG_DUMP, "\t------ Purge SSE Cache\n"); } @@ -1915,10 +1935,32 @@ static void sse_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1) else FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); } + + // AVX + if (dyn->ymm_zero) + for (int i = 0; i < 16; ++i) + if (is_avx_zero(dyn, ninst, i)) { + if (rv64_xtheadmempair) { + ADDI(s1, xEmu, offsetof(x64emu_t, ymm[i])); + TH_SDD(xZR, xZR, s1, 0); + } else { + SD(xZR, xEmu, offsetof(x64emu_t, ymm[i])); + SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]) + 8); + } + } } void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a) { + if (is_avx_zero(dyn, ninst, a)) { + if (rv64_xtheadmempair) { + ADDI(s1, xEmu, offsetof(x64emu_t, ymm[a])); + TH_SDD(xZR, xZR, s1, 0); + } else { + SD(xZR, xEmu, offsetof(x64emu_t, ymm[a])); + SD(xZR, xEmu, offsetof(x64emu_t, ymm[a]) + 8); + } + } if (dyn->e.ssecache[a].v == -1) return; if (dyn->e.ssecache[a].vector) { @@ -1931,6 +1973,14 @@ void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a) FSD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a])); } +void ymm_mark_zero(dynarec_rv64_t* dyn, int ninst, int a) +{ +#if STEP == 0 + dyn->insts[ninst].ymm0_add |= (1 << a); +#endif + avx_mark_zero(dyn, ninst, a); +} + void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) { // for float registers, we might lost f0..f7, f10..f17 and f28..f31, that means @@ -1949,6 +1999,15 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); else FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + 
if (is_avx_zero(dyn, ninst, i)) { + if (rv64_xtheadmempair) { + ADDI(s1, xEmu, offsetof(x64emu_t, ymm[i])); + TH_SDD(xZR, xZR, s1, 0); + } else { + SD(xZR, xEmu, offsetof(x64emu_t, ymm[i])); + SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]) + 8); + } + } } MESSAGE(LOG_DUMP, "\t------- Push (float) XMM Cache (%d)\n", n); } @@ -1990,6 +2049,15 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i])); VSE_V(dyn->e.ssecache[i].reg, s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1); } + if (is_avx_zero(dyn, ninst, i)) { + if (rv64_xtheadmempair) { + ADDI(s1, xEmu, offsetof(x64emu_t, ymm[i])); + TH_SDD(xZR, xZR, s1, 0); + } else { + SD(xZR, xEmu, offsetof(x64emu_t, ymm[i])); + SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]) + 8); + } + } } MESSAGE(LOG_DUMP, "\t------- Push (vector) XMM Cache (%d)\n", n); } @@ -2100,6 +2168,13 @@ static int findCacheSlot(dynarec_rv64_t* dyn, int ninst, int t, int n, extcache_ case EXT_CACHE_XMMW: if (t == EXT_CACHE_XMMR) return i; + case EXT_CACHE_YMMR: + if (t == EXT_CACHE_YMMW) + return i; + break; + case EXT_CACHE_YMMW: + if (t == EXT_CACHE_YMMR) + return i; break; } } @@ -2111,7 +2186,10 @@ static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t * { if (i == j) return; - if (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW || cache->extcache[j].t == EXT_CACHE_XMMR || cache->extcache[j].t == EXT_CACHE_XMMW) { + if (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW + || cache->extcache[j].t == EXT_CACHE_XMMR || cache->extcache[j].t == EXT_CACHE_XMMW + || cache->extcache[i].t == EXT_CACHE_YMMR || cache->extcache[i].t == EXT_CACHE_YMMW + || cache->extcache[j].t == EXT_CACHE_YMMR || cache->extcache[j].t == EXT_CACHE_YMMW) { int reg_i = EXTREG(i); int reg_j = EXTREG(j); if (!cache->extcache[i].v) { @@ -2178,7 +2256,7 @@ static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t * static 
void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, extcache_t* cache, int i, int t, int n) { int reg = EXTREG(i); - if (cache->extcache[i].v && (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW)) { + if (cache->extcache[i].v && (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW || cache->extcache[i].t == EXT_CACHE_YMMR || cache->extcache[i].t == EXT_CACHE_YMMW)) { int j = i + 1; while (cache->extcache[j].v) ++j; MESSAGE(LOG_DUMP, "\t - Moving away %d\n", i); @@ -2208,6 +2286,13 @@ static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n])); VLE_V(reg, s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1); break; + case EXT_CACHE_YMMR: + case EXT_CACHE_YMMW: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + SET_ELEMENT_WIDTH(s1, VECTOR_SEWANY, 0); + ADDI(s1, xEmu, offsetof(x64emu_t, ymm[n])); + VLE_V(reg, s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1); + break; case EXT_CACHE_SS: MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); FLW(reg, xEmu, offsetof(x64emu_t, xmm[n])); @@ -2260,6 +2345,7 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i int reg = EXTREG(i); switch(t) { case EXT_CACHE_XMMR: + case EXT_CACHE_YMMR: MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); break; case EXT_CACHE_XMMW: @@ -2268,6 +2354,12 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n])); VSE_V(reg, s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1); break; + case EXT_CACHE_YMMW: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + SET_ELEMENT_WIDTH(s1, VECTOR_SEWANY, 0); + ADDI(s1, xEmu, offsetof(x64emu_t, ymm[n])); + VSE_V(reg, s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1); + break; case EXT_CACHE_SS: MESSAGE(LOG_DUMP, 
"\t - Unloading %s\n", getCacheName(t, n)); FSW(reg, xEmu, offsetof(x64emu_t, xmm[n])); @@ -2421,6 +2513,8 @@ static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, in cache.extcache[i].t = EXT_CACHE_ST_D; } else if (cache.extcache[i].t == EXT_CACHE_XMMR && cache_i2.extcache[i].t == EXT_CACHE_XMMW) { cache.extcache[i].t = EXT_CACHE_XMMW; + } else if (cache.extcache[i].t == EXT_CACHE_YMMR && cache_i2.extcache[i].t == EXT_CACHE_YMMW) { + cache.extcache[i].t = EXT_CACHE_YMMW; } else if (cache.extcache[i].t == EXT_CACHE_XMMW && cache_i2.extcache[i].t == EXT_CACHE_XMMR) { // refresh cache... MESSAGE(LOG_DUMP, "\t - Refreh %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n)); @@ -2428,6 +2522,13 @@ static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, in ADDI(s1, xEmu, offsetof(x64emu_t, xmm[cache.extcache[i].n])); VSE_V(EXTREG(i), s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1); cache.extcache[i].t = EXT_CACHE_XMMR; + } else if (cache.extcache[i].t == EXT_CACHE_YMMW && cache_i2.extcache[i].t == EXT_CACHE_YMMR) { + // refresh cache... 
+ MESSAGE(LOG_DUMP, "\t - Refreh %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n)); + SET_ELEMENT_WIDTH(s1, VECTOR_SEWANY, 0); + ADDI(s1, xEmu, offsetof(x64emu_t, ymm[cache.extcache[i].n])); + VSE_V(EXTREG(i), s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1); + cache.extcache[i].t = EXT_CACHE_YMMR; } } } @@ -2835,3 +2936,27 @@ void vector_loadmask(dynarec_rv64_t* dyn, int ninst, int vreg, uint64_t imm, int } #endif } + + +void avx_purge_ymm(dynarec_rv64_t* dyn, int ninst, uint16_t mask, int s1) +{ + int do_something = 0; + for (int i = 0; i < 16; ++i) + if (mask & (1 << i)) { + if (is_avx_zero_unset(dyn, ninst, i)) { + if (!do_something) { + MESSAGE(LOG_NONE, "Purge YMM mask=%04x --------\n", mask); + do_something = 1; + } + if (rv64_xtheadmempair) { + ADDI(s1, xEmu, offsetof(x64emu_t, ymm[i])); + TH_SDD(xZR, xZR, s1, 0); + } else { + SD(xZR, xEmu, offsetof(x64emu_t, ymm[i])); + SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]) + 8); + } + } + } + if (do_something) + MESSAGE(LOG_NONE, "---------- Purge YMM\n"); +} diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h index d2d84e90..d69addee 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.h +++ b/src/dynarec/rv64/dynarec_rv64_helper.h @@ -420,6 +420,9 @@ OR(wback, wback, ed); \ } + +#define YMM0(a) ymm_mark_zero(dyn, ninst, a); + // Get direction with size Z and based of F_DF flag, on register r ready for load/store fetching // using s as scratch. #define GETDIR(r, s, Z) \ @@ -480,6 +483,11 @@ gback = xEmu; \ gdoffset = offsetof(x64emu_t, xmm[gd]) +#define GETVX() \ + sse_forget_reg(dyn, ninst, x3, vex.v); \ + vback = xEmu; \ + vxoffset = offsetof(x64emu_t, xmm[vex.v]) + // Get Ex address in general register a, will purge SS or SD if it's reg and is loaded. May use x3. Use wback as load address! 
#define GETEX(a, D, I12) \ if (MODREG) { \ @@ -1148,6 +1156,9 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr); #define dynarec64_F20F_vector STEPNAME(dynarec64_F20F_vector) #define dynarec64_F30F_vector STEPNAME(dynarec64_F30F_vector) +#define dynarec64_AVX STEPNAME(dynarec64_AVX) +#define dynarec64_AVX_F3_0F STEPNAME(dynarec64_AVX_F3_0F) + #define geted STEPNAME(geted) #define geted32 STEPNAME(geted32) #define geted16 STEPNAME(geted16) @@ -1279,6 +1290,8 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr); #define sse_purge07cache STEPNAME(sse_purge07cache) #define sse_reflect_reg STEPNAME(sse_reflect_reg) +#define ymm_mark_zero STEPNAME(ymm_mark_zero) + #define sse_get_reg_empty_vector STEPNAME(sse_get_reg_empty_vector) #define sse_get_reg_vector STEPNAME(sse_get_reg_vector) #define sse_forget_reg_vector STEPNAME(sse_forget_reg_vector) @@ -1293,6 +1306,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr); #define sse_purgecache STEPNAME(sse_purgecache) #define fpu_reflectcache STEPNAME(fpu_reflectcache) #define fpu_unreflectcache STEPNAME(fpu_unreflectcache) +#define avx_purge_ymm STEPNAME(avx_purge_ymm) #define CacheTransform STEPNAME(CacheTransform) #define rv64_move64 STEPNAME(rv64_move64) @@ -1450,6 +1464,9 @@ void x87_restoreround(dynarec_rv64_t* dyn, int ninst, int s1); // Set rounding according to mxcsr flags, return reg to restore flags int sse_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2); +// purge ymm_zero mask according to purge_ymm +void avx_purge_ymm(dynarec_rv64_t* dyn, int ninst, uint16_t mask, int s1); + void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3); void rv64_move64(dynarec_rv64_t* dyn, int ninst, int reg, int64_t val); @@ -1518,6 +1535,9 @@ void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1); // Push current value to the cache void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a); +// mark an ymm upper part has zero (forgetting upper part if needed) +void 
ymm_mark_zero(dynarec_rv64_t* dyn, int ninst, int a); + // common coproc helpers // reset the cache with n void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n); @@ -1571,6 +1591,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_AVX(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); +uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); + #if STEP < 2 #define PASS2(A) #else @@ -1754,7 +1777,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i #define FCOMIS(v1, v2, s1, s2, s3, s4, s5) FCOMI(S, v1, v2, s1, s2, s3, s4, s5) #define FCOMID(v1, v2, s1, s2, s3, s4, s5) FCOMI(D, v1, v2, s1, s2, s3, s4, s5) -#define PURGE_YMM() /* TODO */ +#define PURGE_YMM() avx_purge_ymm(dyn, ninst, dyn->insts[ninst + 1].purge_ymm, x1) // reg = (reg < -32768) ? -32768 : ((reg > 32767) ? 
32767 : reg) #define SAT16(reg, s) \ diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h index 4f552e6c..b591ecee 100644 --- a/src/dynarec/rv64/dynarec_rv64_private.h +++ b/src/dynarec/rv64/dynarec_rv64_private.h @@ -20,6 +20,8 @@ typedef struct instsize_s instsize_t; #define EXT_CACHE_SCR 7 #define EXT_CACHE_XMMW 8 #define EXT_CACHE_XMMR 9 +#define EXT_CACHE_YMMW 10 +#define EXT_CACHE_YMMR 11 #define EXT_CACHE_OLD_SD 0 #define EXT_CACHE_OLD_SS 1 @@ -56,7 +58,7 @@ typedef union sse_old_s { typedef struct extcache_s { // ext cache - ext_cache_t extcache[24]; + ext_cache_t extcache[32]; int8_t stack; int8_t stack_next; int8_t stack_pop; @@ -75,7 +77,7 @@ typedef struct extcache_s { int16_t tags; // similar to fpu_tags int8_t mmxcache[8]; // cache status for the 8 MMX registers sse_cache_t ssecache[16]; // cache status for the 16 SSE(2) registers - int8_t fpuused[24]; // all 10..31 & 0..1 double reg from fpu, used by x87, sse and mmx + int8_t fpuused[32]; // all double reg from fpu, used by x87, mmx, sse and avx int8_t x87stack; // cache stack counter int8_t mmxcount; // number of mmx register used (not both mmx and x87 at the same time) int8_t fpu_scratch; // scratch counter |