| | |
|---|---|
| author | Yang Liu <liuyang22@iscas.ac.cn>, 2024-07-04 18:39:05 +0800 |
| committer | GitHub <noreply@github.com>, 2024-07-04 12:39:05 +0200 |
| commit | eb695d5553e82dce46d67019f48a3354ec611394 (patch) |
| tree | 6c7996715b075aacadacc39f3ca76095c5cf05d9 /src |
| parent | b5946f37526606e05b28fc29af0b5eb1a2497414 (diff) |
[RV64_DYNAREC] Added preliminary RVV infra and PXOR opcode for demonstration (#1632)
* [RV64_DYNAREC] Added preliminary RVV infra and PXOR opcode for demonstration
* keep sse_cache_s uint8_t as suggested
* use xor to do the wrap
* revert
* better fallback
Diffstat (limited to 'src')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_0f.c | 10 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_66.c | 10 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_660f.c | 24 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_660f_vector.c | 70 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_functions.c | 34 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.c | 288 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.h | 40 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_private.h | 15 |
| -rw-r--r-- | src/dynarec/rv64/rv64_emitter.h | 17 |
| -rw-r--r-- | src/rv64detect.c | 3 |
10 files changed, 418 insertions, 93 deletions
```diff
diff --git a/src/dynarec/rv64/dynarec_rv64_0f.c b/src/dynarec/rv64/dynarec_rv64_0f.c
index 7eeac836..26e64d3d 100644
--- a/src/dynarec/rv64/dynarec_rv64_0f.c
+++ b/src/dynarec/rv64/dynarec_rv64_0f.c
@@ -786,7 +786,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             if (MODREG) {
                 ed = (nextop & 7) + (rex.b << 3);
-                sse_reflect_reg(dyn, ninst, ed);
+                sse_reflect_reg(dyn, ninst, x6, ed);
                 ADDI(x2, xEmu, offsetof(x64emu_t, xmm[ed]));
             } else {
                 SMREAD();
@@ -796,9 +796,9 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 }
             }
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             ADDI(x1, xEmu, offsetof(x64emu_t, xmm[gd]));
-            sse_reflect_reg(dyn, ninst, 0);
+            sse_reflect_reg(dyn, ninst, x6, 0);
             switch (u8) {
                 case 0xC8:
                     CALL(sha1nexte, -1);
@@ -878,7 +878,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             if (MODREG) {
                 ed = (nextop & 7) + (rex.b << 3);
-                sse_reflect_reg(dyn, ninst, ed);
+                sse_reflect_reg(dyn, ninst, x6, ed);
                 ADDI(x2, xEmu, offsetof(x64emu_t, xmm[ed]));
             } else {
                 SMREAD();
@@ -887,7 +887,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             }
             u8 = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             ADDI(x1, xEmu, offsetof(x64emu_t, xmm[gd]));
             MOV32w(x3, u8);
             CALL(sha1rnds4, -1);
```

```diff
diff --git a/src/dynarec/rv64/dynarec_rv64_66.c b/src/dynarec/rv64/dynarec_rv64_66.c
index 0f0f31d0..b0e45569 100644
--- a/src/dynarec/rv64/dynarec_rv64_66.c
+++ b/src/dynarec/rv64/dynarec_rv64_66.c
@@ -38,6 +38,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
     int64_t fixedaddress;
     int unscaled;
     int lock;
+    uintptr_t retaddr = 0;
     MAYUSE(u8);
     MAYUSE(u16);
     MAYUSE(u64);
@@ -125,7 +126,14 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             break;
         case 0x0F:
             switch(rep) {
-                case 0: addr = dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog); break;
+                case 0: {
+                    if (rv64_vector) {
+                        retaddr = dynarec64_660F_vector(dyn, addr, ip, ninst, rex, ok, need_epilog);
+                        addr = retaddr ? retaddr : dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog);
+                    } else
+                        addr = dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog);
+                    break;
+                }
                 case 1: addr = dynarec64_66F20F(dyn, addr, ip, ninst, rex, ok, need_epilog); break;
                 case 2: addr = dynarec64_66F30F(dyn, addr, ip, ninst, rex, ok, need_epilog); break;
            }
```
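The `case 0x0F` change above is the heart of the fallback design: when RVV is available, the vector translator gets the first shot at every `66 0F` opcode, and a return value of 0 means "not implemented here", in which case the scalar translator re-decodes the same opcode from the unchanged `addr`. A minimal sketch of the pattern, with hypothetical names standing in for the box64 functions:

```c
#include <stdint.h>

// Sketch of the vector-first dispatch used in dynarec64_66 above.
// try_vector() stands in for dynarec64_660F_vector(): it returns the address
// past the translated opcode, or 0 when the opcode has no vector handler yet.
// Names and signatures are illustrative, not box64's real API.
typedef uintptr_t (*translate_fn)(uintptr_t addr);

static uintptr_t dispatch_660f(uintptr_t addr, int has_vector,
                               translate_fn try_vector, translate_fn scalar)
{
    if (has_vector) {
        uintptr_t next = try_vector(addr); // may consume opcode bytes locally
        if (next)
            return next;
        // On 0 the caller still holds the original addr, so the scalar
        // translator re-decodes the opcode from scratch.
    }
    return scalar(addr);
}
```

Note that `addr` is passed by value, so whatever the vector translator fetched with `F8` before bailing out is simply re-fetched by the scalar path.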
```diff
diff --git a/src/dynarec/rv64/dynarec_rv64_660f.c b/src/dynarec/rv64/dynarec_rv64_660f.c
index ae8a7522..daf89b2a 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f.c
@@ -511,7 +511,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            sse_forget_reg(dyn, ninst, 0); // forget xmm[0]
+            sse_forget_reg(dyn, ninst, x6, 0); // forget xmm[0]
             for (int i = 0; i < 16; ++i) {
                 LB(x3, xEmu, offsetof(x64emu_t, xmm[0]) + i);
                 BGE(x3, xZR, 12); // continue
@@ -920,11 +920,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             SETFLAGS(X_ALL, SF_SET_DF);
             nextop = F8;
             GETG;
-            sse_reflect_reg(dyn, ninst, gd);
+            sse_reflect_reg(dyn, ninst, x6, gd);
             ADDI(x3, xEmu, offsetof(x64emu_t, xmm[gd]));
             if (MODREG) {
                 ed = (nextop & 7) + (rex.b << 3);
-                sse_reflect_reg(dyn, ninst, ed);
+                sse_reflect_reg(dyn, ninst, x6, ed);
                 ADDI(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &wback, x1, x2, &fixedaddress, rex, NULL, 0, 1);
@@ -957,7 +957,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q(x3);
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd);
             CALL(native_aesimc, -1);
             break;
@@ -965,7 +965,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("AESENC Gx, Ex"); // AES-NI
             nextop = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd);
             CALL(native_aese, -1);
             GETGX();
@@ -976,7 +976,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("AESENCLAST Gx, Ex"); // AES-NI
             nextop = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd);
             CALL(native_aeselast, -1);
             GETGX();
@@ -987,7 +987,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("AESDEC Gx, Ex"); // AES-NI
             nextop = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd);
             CALL(native_aesd, -1);
             GETGX();
@@ -999,7 +999,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("AESDECLAST Gx, Ex"); // AES-NI
             nextop = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd);
             CALL(native_aesdlast, -1);
             GETGX();
@@ -1333,11 +1333,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("PCLMULQDQ Gx, Ex, Ib");
             nextop = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd); // gx
             if (MODREG) {
                 ed = (nextop & 7) + (rex.b << 3);
-                sse_forget_reg(dyn, ninst, ed);
+                sse_forget_reg(dyn, ninst, x6, ed);
                 MOV32w(x2, ed);
                 MOV32w(x3, 0); // p = NULL
             } else {
@@ -1355,11 +1355,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("AESKEYGENASSIST Gx, Ex, Ib"); // AES-NI
             nextop = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd); // gx
             if (MODREG) {
                 ed = (nextop & 7) + (rex.b << 3);
-                sse_forget_reg(dyn, ninst, ed);
+                sse_forget_reg(dyn, ninst, x6, ed);
                 MOV32w(x2, ed);
                 MOV32w(x3, 0); // p = NULL
             } else {
```
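All the `sse_forget_reg`/`sse_reflect_reg` call sites above gain a scratch GPR argument (`x6` here, `x3` or `x1` in the helper macros below). The reason is the new vector spill path: scalar spills like `FSD` encode an immediate offset from `xEmu`, but RVV unit-stride loads and stores address memory only through a base register, so writing a vector-cached XMM back to `emu->xmm[]` takes an `ADDI` to compute the address followed by a `VSE8.V`, and that `ADDI` needs a free GPR that the caller must now provide.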
```diff
diff --git a/src/dynarec/rv64/dynarec_rv64_660f_vector.c b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
new file mode 100644
index 00000000..bcda4100
--- /dev/null
+++ b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
@@ -0,0 +1,70 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+#include "bitutils.h"
+
+#include "rv64_printer.h"
+#include "dynarec_rv64_private.h"
+#include "dynarec_rv64_functions.h"
+#include "dynarec_rv64_helper.h"
+
+uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)need_epilog;
+
+    uint8_t opcode = F8;
+    uint8_t nextop, u8, s8;
+    int32_t i32;
+    uint8_t gd, ed;
+    uint8_t wback, wb1, wb2, gback;
+    uint8_t eb1, eb2;
+    int64_t j64;
+    uint64_t tmp64u, tmp64u2;
+    int v0, v1;
+    int q0, q1;
+    int d0, d1, d2;
+    int64_t fixedaddress, gdoffset;
+    int unscaled;
+    MAYUSE(d0);
+    MAYUSE(d1);
+    MAYUSE(q0);
+    MAYUSE(q1);
+    MAYUSE(eb1);
+    MAYUSE(eb2);
+    MAYUSE(j64);
+
+    switch (opcode) {
+        case 0xEF:
+            INST_NAME("PXOR Gx, Ex");
+            nextop = F8;
+            // FIXME: we should try to minimize vsetvl usage as it may hurts performance a lot.
+            vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW8);
+            GETG;
+            if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
+                // special case
+                q0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd);
+                VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
+            } else {
+                q0 = sse_get_reg_vector(dyn, ninst, x1, gd, 1);
+                GETEX_vector(q1, 0, 0);
+                VXOR_VV(q0, q0, q1, VECTOR_UNMASKED);
+            }
+            break;
+        default:
+            // fallback to the scalar version
+            return 0;
+    }
+    return addr;
+}
```
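The translated instruction is a 128-bit bitwise XOR. The plain-C model below (not box64 code, just the reference semantics) shows what the emitted `VXOR.VV` must compute, and why the "special case" works: `x ^ x == 0`, so `PXOR Gx, Gx` zeroes the register without even reading its old value, which is why it uses `sse_get_reg_empty_vector` and skips the load.

```c
#include <stdint.h>

// Reference semantics of PXOR Gx, Ex: dst ^= src over all 128 bits.
typedef struct { uint64_t q[2]; } xmm_t;

static void pxor(xmm_t* dst, const xmm_t* src)
{
    dst->q[0] ^= src->q[0];
    dst->q[1] ^= src->q[1];
}
```

Because XOR is bitwise, the element width chosen by `vector_vsetvl_emul1` does not affect the result; SEW=8 with vl=16 is simply a convenient way to cover the full 128 bits.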
```diff
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c
index d366de48..6ce97b8a 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.c
+++ b/src/dynarec/rv64/dynarec_rv64_functions.c
@@ -402,13 +402,16 @@ void extcacheUnwind(extcache_t* cache)
     }
     // add/change bad regs
     for(int i=0; i<16; ++i) {
-        if(cache->olds[i].changed) {
-            cache->extcache[i].t = cache->olds[i].single?EXT_CACHE_SS:EXT_CACHE_SD;
-        } else if(cache->olds[i].purged) {
-            cache->extcache[i].n = i;
-            cache->extcache[i].t = cache->olds[i].single?EXT_CACHE_SS:EXT_CACHE_SD;
+        if (cache->extcache[i].t == EXT_CACHE_SS || cache->extcache[i].t == EXT_CACHE_SD) {
+            if (cache->olds[i].changed) {
+                cache->extcache[i].t = cache->olds[i].single ? EXT_CACHE_SS : EXT_CACHE_SD;
+            } else if (cache->olds[i].purged) {
+                cache->extcache[i].n = i;
+                cache->extcache[i].t = cache->olds[i].single ? EXT_CACHE_SS : EXT_CACHE_SD;
+            }
         }
     }
+
     if(cache->stack_push) {
         // unpush
         for(int j=0; j<24; ++j) {
@@ -465,14 +468,23 @@ void extcacheUnwind(extcache_t* cache)
                 break;
             case EXT_CACHE_SS:
                 cache->ssecache[cache->extcache[i].n].reg = EXTREG(i);
+                cache->ssecache[cache->extcache[i].n].vector = 0;
                 cache->ssecache[cache->extcache[i].n].single = 1;
                 ++cache->fpu_reg;
                 break;
            case EXT_CACHE_SD:
                 cache->ssecache[cache->extcache[i].n].reg = EXTREG(i);
+                cache->ssecache[cache->extcache[i].n].vector = 0;
                 cache->ssecache[cache->extcache[i].n].single = 0;
                 ++cache->fpu_reg;
                 break;
+            case EXT_CACHE_XMMR:
+            case EXT_CACHE_XMMW:
+                cache->ssecache[cache->extcache[i].n].reg = i;
+                cache->ssecache[cache->extcache[i].n].vector = 1;
+                cache->ssecache[cache->extcache[i].n].write = (cache->extcache[i].t == EXT_CACHE_XMMW) ? 1 : 0;
+                ++cache->fpu_reg;
+                break;
             case EXT_CACHE_ST_F:
             case EXT_CACHE_ST_D:
             case EXT_CACHE_ST_I64:
@@ -556,6 +568,8 @@ const char* getCacheName(int t, int n)
         case EXT_CACHE_SS: sprintf(buff, "SS%d", n); break;
         case EXT_CACHE_SD: sprintf(buff, "SD%d", n); break;
         case EXT_CACHE_SCR: sprintf(buff, "Scratch"); break;
+        case EXT_CACHE_XMMW: sprintf(buff, "XMM%d", n); break;
+        case EXT_CACHE_XMMR: sprintf(buff, "xmm%d", n); break;
         case EXT_CACHE_NONE: buff[0]='\0'; break;
     }
     return buff;
@@ -570,6 +584,12 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
         "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fs8", "fs9", "fs10", "fs11",
         "ft8", "ft9", "ft10", "ft11"
     };
+    static const char* vnames[] = {
+        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+        "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+    };
     if(box64_dynarec_dump) {
         printf_x64_instruction(rex.is32bits?my_context->dec32:my_context->dec, &dyn->insts[ninst].x64, name);
         dynarec_log(LOG_NONE, "%s%p: %d emitted opcodes, inst=%d, barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X, sm=%d/%d",
@@ -607,6 +627,8 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
                 case EXT_CACHE_MM: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_SS: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_SD: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
+                case EXT_CACHE_XMMR: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
+                case EXT_CACHE_XMMW: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_SCR: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_NONE:
                 default: break;
@@ -689,4 +711,4 @@ void fpu_reset_ninst(dynarec_rv64_t* dyn, int ninst)
 int fpu_is_st_freed(dynarec_rv64_t* dyn, int ninst, int st)
 {
     return (dyn->e.tags&(0b11<<(st*2)))?1:0;
-}
\ No newline at end of file
+}
```
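The new type guard in `extcacheUnwind` matters because `olds[]` records only scalar single/double transitions; without the guard, the rollback would retag a vector-held slot (`EXT_CACHE_XMMR`/`EXT_CACHE_XMMW`) as `SS`/`SD` and corrupt the cache state. A toy model of the fixed logic, with hypothetical names:

```c
// Minimal model (not box64 code) of the guarded unwind above: vector slots
// carry no scalar "olds" history, so the rollback must leave them untouched.
enum { EXT_SS = 5, EXT_SD = 6, EXT_XMMW = 8, EXT_XMMR = 9 };

typedef struct { int t; int n; } slot_t;               // cached-register slot
typedef struct { int changed, purged, single; } old_t; // scalar history only

static void unwind_slot(slot_t* s, const old_t* o, int i)
{
    if (s->t != EXT_SS && s->t != EXT_SD)
        return; // EXT_XMMW / EXT_XMMR: skip, nothing to roll back
    if (o->changed) {
        s->t = o->single ? EXT_SS : EXT_SD;
    } else if (o->purged) {
        s->n = i;
        s->t = o->single ? EXT_SS : EXT_SD;
    }
}
```

In the debug dump, `getCacheName` distinguishes the two vector states by case: `XMM%d` for a dirty (write) mapping, `xmm%d` for a read-only one.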
```diff
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c
index cefdab81..06abe224 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.c
+++ b/src/dynarec/rv64/dynarec_rv64_helper.c
@@ -1570,10 +1570,15 @@ static void mmx_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1)
 // get ext register for a SSE reg, create the entry if needed
 int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
 {
-    if(dyn->e.ssecache[a].v!=-1) {
+    if (dyn->e.ssecache[a].v != -1) {
+        if (dyn->e.ssecache[a].vector == 1) {
+            // it's in the fpu, forget it first...
+            sse_forget_reg_vector(dyn, ninst, s1, a);
+            return sse_get_reg(dyn, ninst, s1, a, single);
+        }
         // forget / reload if change of size
         if(dyn->e.ssecache[a].single!=single) {
-            sse_forget_reg(dyn, ninst, a);
+            sse_forget_reg(dyn, ninst, s1, a);
             // update olds after the forget...
             dyn->e.olds[a].changed = 1;
             dyn->e.olds[a].purged = 0;
@@ -1585,41 +1590,52 @@ int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
     dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, single?EXT_CACHE_SS:EXT_CACHE_SD, a);
     int ret = dyn->e.ssecache[a].reg;
     dyn->e.ssecache[a].single = single;
+    dyn->e.ssecache[a].vector = 0;
     if(dyn->e.ssecache[a].single)
         FLW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
     else
         FLD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
     return ret;
 }
+
 // get ext register for a SSE reg, but don't try to synch it if it needed to be created
 int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
 {
-    if(dyn->e.ssecache[a].v!=-1) {
-        if(dyn->e.ssecache[a].single!=single) {
+    if (dyn->e.ssecache[a].v != -1) {
+        if (dyn->e.ssecache[a].vector == 1) {
+            // it's in the fpu, forget it first...
+            sse_forget_reg_vector(dyn, ninst, s1, a);
+            return sse_get_reg_empty(dyn, ninst, s1, a, single);
+        }
+
+        if (dyn->e.ssecache[a].single != single) {
             if (single) {
-                // writing back the double
+                // writing back the double, to clear upper 32 bit.
                 FSD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
-                // need to wipe the half high 32bits of old Double because we now have a single
-                //SW(xZR, xEmu, offsetof(x64emu_t, xmm[a])+4);
             }
             dyn->e.olds[a].changed = 1;
             dyn->e.olds[a].purged = 0;
             dyn->e.olds[a].reg = EXTIDX(dyn->e.ssecache[a].reg);
             dyn->e.olds[a].single = 1-single;
             dyn->e.ssecache[a].single = single;
+            dyn->e.ssecache[a].vector = 0;
             dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t = single?EXT_CACHE_SS:EXT_CACHE_SD;
         }
         return dyn->e.ssecache[a].reg;
     }
     dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, single?EXT_CACHE_SS:EXT_CACHE_SD, a);
     dyn->e.ssecache[a].single = single;
+    dyn->e.ssecache[a].vector = 0;
     return dyn->e.ssecache[a].reg;
 }
+
 // forget ext register for a SSE reg, does nothing if the regs is not loaded
-void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a)
+void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a)
 {
-    if(dyn->e.ssecache[a].v==-1)
+    if (dyn->e.ssecache[a].v == -1)
         return;
+    if (dyn->e.ssecache[a].vector == 1)
+        return sse_forget_reg_vector(dyn, ninst, s1, a);
     if(dyn->e.ssecache[a].single)
         FSW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
     else
```
```diff
@@ -1632,24 +1648,93 @@ void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a)
     dyn->e.ssecache[a].v = -1;
     return;
 }
+
+// get rvv register for a SSE reg, create the entry if needed
+int sse_get_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a, int forwrite)
+{
+    if (dyn->e.ssecache[a].v != -1) {
+        if (dyn->e.ssecache[a].vector == 0) {
+            // it's in the fpu, forget it first...
+            sse_forget_reg(dyn, ninst, s1, a);
+            return sse_get_reg_vector(dyn, ninst, s1, a, forwrite);
+        }
+
+        if (forwrite) {
+            dyn->e.ssecache[a].write = 1; // update only if forwrite
+            dyn->e.ssecache[a].single = 0; // just to be clean
+            dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t = EXT_CACHE_XMMW;
+        }
+        return dyn->e.ssecache[a].reg;
+    }
+    dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, forwrite ? EXT_CACHE_XMMW : EXT_CACHE_XMMR, a);
+    int ret = dyn->e.ssecache[a].reg;
+    dyn->e.ssecache[a].write = forwrite;
+    dyn->e.ssecache[a].vector = 1;
+    dyn->e.ssecache[a].single = 0; // just to be clean
+    ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a]));
+    VLE8_V(ret, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+    return ret;
+}
+
+// get rvv register for an SSE reg, but don't try to synch it if it needed to be created
+int sse_get_reg_empty_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a)
+{
+    if (dyn->e.ssecache[a].v != -1) {
+        if (dyn->e.ssecache[a].vector == 0) {
+            // it's in the fpu, forget it first...
+            sse_forget_reg(dyn, ninst, s1, a);
+            return sse_get_reg_empty_vector(dyn, ninst, s1, a);
+        }
+        dyn->e.ssecache[a].vector = 1;
+        dyn->e.ssecache[a].write = 1;
+        dyn->e.ssecache[a].single = 0; // just to be clean
+        dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t = EXT_CACHE_XMMW;
+        return dyn->e.ssecache[a].reg;
+    }
+    dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, EXT_CACHE_XMMW, a);
+    dyn->e.ssecache[a].vector = 1;
+    dyn->e.ssecache[a].single = 0; // just to be clean
+    dyn->e.ssecache[a].write = 1; // it will be write...
+    return dyn->e.ssecache[a].reg;
+}
+
+// forget rvv register for a SSE reg, does nothing if the regs is not loaded
+void sse_forget_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a)
+{
+    if (dyn->e.ssecache[a].v == -1)
+        return;
+    if (dyn->e.ssecache[a].vector == 0)
+        return sse_forget_reg(dyn, ninst, s1, a);
+    if (dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t == EXT_CACHE_XMMW) {
+        ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a]));
+        VSE8_V(dyn->e.ssecache[a].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+    }
+    fpu_free_reg(dyn, dyn->e.ssecache[a].reg);
+    dyn->e.ssecache[a].v = -1;
+    return;
+}
+
 // purge the SSE cache for XMM0..XMM7 (to use before function native call)
 void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1)
 {
     int old = -1;
-    for (int i=0; i<8; ++i)
-        if(dyn->e.ssecache[i].v!=-1) {
-            if (old==-1) {
+    for (int i = 0; i < 8; ++i)
+        if (dyn->e.ssecache[i].v != -1) {
+            if (old == -1) {
                 MESSAGE(LOG_DUMP, "\tPurge XMM0..7 Cache ------\n");
                 ++old;
             }
-            if(dyn->e.ssecache[i].single)
+            if (dyn->e.ssecache[i].vector) {
+                ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
+                VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            } else if (dyn->e.ssecache[i].single)
                 FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
                 FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
             dyn->e.ssecache[i].v = -1;
         }
-    if(old!=-1) {
+    if (old != -1) {
         MESSAGE(LOG_DUMP, "\t------ Purge XMM0..7 Cache\n");
     }
 }
```
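The cross-checks at the top of each getter enforce a single-owner rule: an XMM slot is cached either in a scalar FPR (`SS`/`SD`) or in a vector register (`XMMR`/`XMMW`), never both, and switching domains always round-trips through `emu->xmm[]`. A toy model of that invariant, with hypothetical names:

```c
#include <assert.h>

// Toy model (not box64 code) of the ownership hand-off above: the old owner
// spills to emu->xmm[] ("forget") before the new owner loads from it.
typedef enum { NONE, SCALAR, VECTOR } owner_t;

typedef struct {
    owner_t owner;
    int synced; // emu->xmm[] holds the current value
} slot_t;

static void acquire(slot_t* s, owner_t want)
{
    if (s->owner != NONE && s->owner != want) {
        s->synced = 1;   // "forget": spill the current owner to memory
        s->owner = NONE;
    }
    if (s->owner == NONE)
        s->owner = want; // reload from memory into the new domain
}

int main(void)
{
    slot_t s = { SCALAR, 0 };
    acquire(&s, VECTOR); // forces the scalar copy to be spilled first
    assert(s.owner == VECTOR && s.synced);
    return 0;
}
```

Note the read/write split: a slot acquired read-only (`EXT_CACHE_XMMR`) is never stored back on purge, which is what `unloadCache` exploits further down by simply "ignoring" clean entries.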
```diff
@@ -1664,17 +1749,25 @@ static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1)
                 MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":"");
                 ++old;
             }
-            if(dyn->e.ssecache[i].single)
+            if (dyn->e.ssecache[i].vector) {
+                ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
+                VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            } else if (dyn->e.ssecache[i].single)
                 FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
                 FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             if(!next) {
-                fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
-                dyn->e.olds[i].changed = 0;
-                dyn->e.olds[i].purged = 1;
-                dyn->e.olds[i].reg = dyn->e.ssecache[i].reg;
-                dyn->e.olds[i].single = dyn->e.ssecache[i].single;
-                dyn->e.ssecache[i].v = -1;
+                if (dyn->e.ssecache[i].vector) {
+                    fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
+                    dyn->e.ssecache[i].v = -1;
+                } else {
+                    fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
+                    dyn->e.olds[i].changed = 0;
+                    dyn->e.olds[i].purged = 1;
+                    dyn->e.olds[i].reg = dyn->e.ssecache[i].reg;
+                    dyn->e.olds[i].single = dyn->e.ssecache[i].single;
+                    dyn->e.ssecache[i].v = -1;
+                }
             }
         }
     if(old!=-1) {
@@ -1684,20 +1777,26 @@ static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1)
 
 static void sse_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1)
 {
-    for (int i=0; i<16; ++i)
-        if(dyn->e.ssecache[i].v!=-1) {
-            if(dyn->e.ssecache[i].single)
+    for (int i = 0; i < 16; ++i)
+        if (dyn->e.ssecache[i].v != -1) {
+            if (dyn->e.ssecache[i].vector) {
+                ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
+                VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            } else if (dyn->e.ssecache[i].single)
                 FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
                 FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
         }
 }
 
-void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int a)
+void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a)
 {
     if (dyn->e.ssecache[a].v == -1)
         return;
-    if (dyn->e.ssecache[a].single)
+    if (dyn->e.ssecache[a].vector) {
+        ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a]));
+        VSE8_V(dyn->e.ssecache[a].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+    } else if (dyn->e.ssecache[a].single)
         FSW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
     else
         FSD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
@@ -1717,7 +1816,10 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
     MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n);
     for (int i=start; i<8; ++i)
         if(dyn->e.ssecache[i].v!=-1) {
-            if(dyn->e.ssecache[i].single)
+            if (dyn->e.ssecache[i].vector) {
+                ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
+                VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            } else if (dyn->e.ssecache[i].single)
                 FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
                 FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
@@ -1760,7 +1862,10 @@ void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
     MESSAGE(LOG_DUMP, "\tPop XMM Cache (%d)------\n", n);
     for (int i=start; i<8; ++i)
         if(dyn->e.ssecache[i].v!=-1) {
-            if(dyn->e.ssecache[i].single)
+            if (dyn->e.ssecache[i].vector) {
+                ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
+                VLE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            } else if (dyn->e.ssecache[i].single)
                 FLW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
                 FLD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
@@ -1829,6 +1934,14 @@ static int findCacheSlot(dynarec_rv64_t* dyn, int ninst, int t, int n, extcache_
                     if (t == EXT_CACHE_ST_D)
                         return i;
                     break;
+                case EXT_CACHE_XMMR:
+                    if (t == EXT_CACHE_XMMW)
+                        return i;
+                    break;
+                case EXT_CACHE_XMMW:
+                    if (t == EXT_CACHE_XMMR)
+                        return i;
+                    break;
             }
         }
     }
```
```diff
@@ -1837,16 +1950,33 @@ static int findCacheSlot(dynarec_rv64_t* dyn, int ninst, int t, int n, extcache_
 static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t *cache)
 {
-    if (i==j)
+    if (i == j)
         return;
+
+    if (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW || cache->extcache[j].t == EXT_CACHE_XMMR || cache->extcache[j].t == EXT_CACHE_XMMW) {
+        if (!cache->extcache[i].v) {
+            // a mov is enough, no need to swap
+            MESSAGE(LOG_DUMP, "\t - Moving %d <- %d\n", i, j);
+            VOR_VV(i, j, j, VECTOR_UNMASKED);
+            cache->extcache[i].v = cache->extcache[j].v;
+            cache->extcache[j].v = 0;
+            return;
+        }
+        // SWAP
+        ext_cache_t tmp;
+        MESSAGE(LOG_DUMP, "\t - Swapping %d <-> %d\n", i, j);
+        VXOR_VV(i, i, j, VECTOR_UNMASKED);
+        VXOR_VV(j, i, j, VECTOR_UNMASKED);
+        VXOR_VV(i, i, j, VECTOR_UNMASKED);
+        tmp.v = cache->extcache[i].v;
+        cache->extcache[i].v = cache->extcache[j].v;
+        cache->extcache[j].v = tmp.v;
+        return;
+    }
+
     int reg_i = EXTREG(i);
     int reg_j = EXTREG(j);
-    int i_single = 0;
-    if(cache->extcache[i].t==EXT_CACHE_SS || cache->extcache[i].t==EXT_CACHE_ST_F)
-        i_single =1;
-    int j_single = 0;
-    if(cache->extcache[j].t==EXT_CACHE_SS || cache->extcache[j].t==EXT_CACHE_ST_F)
-        j_single =1;
+    int i_single = cache->extcache[i].t == EXT_CACHE_SS || cache->extcache[i].t == EXT_CACHE_ST_F;
+    int j_single = cache->extcache[j].t == EXT_CACHE_SS || cache->extcache[j].t == EXT_CACHE_ST_F;
 
     if(!cache->extcache[i].v) {
         // a mov is enough, no need to swap
@@ -1887,17 +2017,22 @@ static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t *
 static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, extcache_t* cache, int i, int t, int n)
 {
     int reg = EXTREG(i);
-    if(cache->extcache[i].v) {
+    if (cache->extcache[i].v && (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW)) {
+        int j = i + 1;
+        while (cache->extcache[j].v) ++j;
+        MESSAGE(LOG_DUMP, "\t - Moving away %d\n", i);
+        VOR_VV(j, i, i, VECTOR_UNMASKED);
+        cache->extcache[j].v = cache->extcache[i].v;
+    } else if (cache->extcache[i].v) {
         int single = 0;
-        if(t==EXT_CACHE_SS || t==EXT_CACHE_ST_F)
+        if (t == EXT_CACHE_SS || t == EXT_CACHE_ST_F)
             single = 1;
-        if(cache->extcache[i].t==EXT_CACHE_SS || cache->extcache[i].t==EXT_CACHE_ST_F)
+        if (cache->extcache[i].t == EXT_CACHE_SS || cache->extcache[i].t == EXT_CACHE_ST_F)
             single = 1;
-        int j = i+1;
-        while(cache->extcache[j].v)
-            ++j;
+        int j = i + 1;
+        while (cache->extcache[j].v) ++j;
         MESSAGE(LOG_DUMP, "\t - Moving away %d\n", i);
-        if(single) {
+        if (single) {
             FMVS(EXTREG(j), reg);
         } else {
             FMVD(EXTREG(j), reg);
@@ -1905,6 +2040,12 @@ static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int
         cache->extcache[j].v = cache->extcache[i].v;
     }
     switch(t) {
+        case EXT_CACHE_XMMR:
+        case EXT_CACHE_XMMW:
+            MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
+            ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n]));
+            VLE8_V(i, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            break;
         case EXT_CACHE_SS:
             MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
             FLW(reg, xEmu, offsetof(x64emu_t, xmm[n]));
@@ -1956,6 +2097,14 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i
 {
     int reg = EXTREG(i);
     switch(t) {
+        case EXT_CACHE_XMMR:
+            MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n));
+            break;
+        case EXT_CACHE_XMMW:
+            MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
+            ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n]));
+            VSE8_V(i, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            break;
         case EXT_CACHE_SS:
             MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
             FSW(reg, xEmu, offsetof(x64emu_t, xmm[n]));
```
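`swapCache` exchanges two live vector registers with the classic triple-XOR, which needs no scratch vector register (none is guaranteed free at that point). The trick is only valid for distinct storage, which the early `if (i == j) return;` guarantees; with `i == j`, `x ^ x` would zero both. The same sequence on plain integers, as a self-contained check:

```c
#include <assert.h>
#include <stdint.h>

// The triple-XOR swap used by swapCache above, modeled on scalars:
// VXOR_VV(i,i,j); VXOR_VV(j,i,j); VXOR_VV(i,i,j); exchanges vi and vj.
static void xor_swap(uint64_t* a, uint64_t* b)
{
    *a ^= *b;
    *b ^= *a; // b now holds the original a
    *a ^= *b; // a now holds the original b
}

int main(void)
{
    uint64_t a = 0x1122334455667788u, b = 0xCAFEBABEDEADBEEFu;
    xor_swap(&a, &b);
    assert(a == 0xCAFEBABEDEADBEEFu && b == 0x1122334455667788u);
    return 0;
}
```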
```diff
@@ -2045,43 +2194,47 @@ static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, in
     int s2_val = 0;
     // unload every uneeded cache
     // check SSE first, than MMX, in order, for optimisation issue
-    for(int i=0; i<16; ++i) {
-        int j=findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache);
-        if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache_i2)==-1)
+    if (rv64_vector) vector_vsetvl_emul1(dyn, ninst, s1, VECTOR_SEW8);
+    for (int i = 0; i < 16; ++i) {
+        int j = findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache);
+        if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache_i2) == -1)
             unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
-        j=findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache);
-        if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache_i2)==-1)
+        j = findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache);
+        if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache_i2) == -1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
+        j = findCacheSlot(dyn, ninst, EXT_CACHE_XMMW, i, &cache);
+        if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_XMMW, i, &cache_i2) == -1)
             unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
     }
-    for(int i=0; i<8; ++i) {
-        int j=findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache);
-        if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache_i2)==-1)
+    for (int i = 0; i < 8; ++i) {
+        int j = findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache);
+        if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache_i2) == -1)
             unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
     }
-    for(int i=0; i<24; ++i) {
+    for (int i = 0; i < 24; ++i) {
         if(cache.extcache[i].v)
-            if(findCacheSlot(dyn, ninst, cache.extcache[i].t, cache.extcache[i].n, &cache_i2)==-1)
+            if (findCacheSlot(dyn, ninst, cache.extcache[i].t, cache.extcache[i].n, &cache_i2) == -1)
                 unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache.extcache[i].t, cache.extcache[i].n);
     }
     // and now load/swap the missing one
-    for(int i=0; i<24; ++i) {
-        if(cache_i2.extcache[i].v) {
-            if(cache_i2.extcache[i].v != cache.extcache[i].v) {
+    for (int i = 0; i < 24; ++i) {
+        if (cache_i2.extcache[i].v) {
+            if (cache_i2.extcache[i].v != cache.extcache[i].v) {
                 int j;
-                if((j=findCacheSlot(dyn, ninst, cache_i2.extcache[i].t, cache_i2.extcache[i].n, &cache))==-1)
+                if ((j = findCacheSlot(dyn, ninst, cache_i2.extcache[i].t, cache_i2.extcache[i].n, &cache)) == -1)
                     loadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache_i2.extcache[i].t, cache_i2.extcache[i].n);
                 else {
                     // it's here, lets swap if needed
-                    if(j!=i)
+                    if (j != i)
                         swapCache(dyn, ninst, i, j, &cache);
                 }
             }
-            if(cache.extcache[i].t != cache_i2.extcache[i].t) {
-                if(cache.extcache[i].t == EXT_CACHE_ST_D && cache_i2.extcache[i].t == EXT_CACHE_ST_F) {
+            if (cache.extcache[i].t != cache_i2.extcache[i].t) {
+                if (cache.extcache[i].t == EXT_CACHE_ST_D && cache_i2.extcache[i].t == EXT_CACHE_ST_F) {
                     MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n));
                     FCVTSD(EXTREG(i), EXTREG(i));
                     cache.extcache[i].t = EXT_CACHE_ST_F;
-                } else if(cache.extcache[i].t == EXT_CACHE_ST_F && cache_i2.extcache[i].t == EXT_CACHE_ST_D) {
+                } else if (cache.extcache[i].t == EXT_CACHE_ST_F && cache_i2.extcache[i].t == EXT_CACHE_ST_D) {
                     MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n));
                     FCVTDS(EXTREG(i), EXTREG(i));
                     cache.extcache[i].t = EXT_CACHE_ST_D;
```
```diff
@@ -2331,3 +2484,18 @@ void fpu_propagate_stack(dynarec_rv64_t* dyn, int ninst)
     dyn->e.stack_push = 0;
     dyn->e.swapped = 0;
 }
+
+// Use vector extension as like SIMD for now, this function sets the specified element width,
+// other configs are set automatically.
+void vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew)
+{
+    /* mu: mask undisturbed
+     * tu: tail undisturbed
+     * sew: selected element width
+     * lmul: vector register group multiplier
+     *
+     * mu tu sew lmul=1 */
+    uint32_t vtypei = (0b0 << 7) | (0b0 << 6) | (sew << 3) | 0b000;
+    ADDI(s1, xZR, 16 >> sew);
+    VSETVLI(xZR, s1, vtypei);
+}
```
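`vector_vsetvl_emul1` pins LMUL to 1, keeps tail and mask undisturbed, and requests exactly as many elements as fit one XMM register: AVL = `16 >> sew`. Assuming VLEN >= 128 (see the detection TODO at the end of this commit), the granted `vl` always equals the request, so every subsequent vector op acts on exactly 128 bits regardless of element width. A self-contained check of that arithmetic:

```c
#include <assert.h>
#include <stdint.h>

// What vector_vsetvl_emul1() requests per element width: AVL = 16 >> sew
// elements, so the active vector length is always one full XMM register.
// Plain-C sanity check, not box64 code.
enum { SEW8 = 0, SEW16 = 1, SEW32 = 2, SEW64 = 3 };

static uint32_t vtypei(int sew) { return (uint32_t)(sew << 3); } // vma=0, vta=0, lmul=1

int main(void)
{
    for (int sew = SEW8; sew <= SEW64; ++sew) {
        int avl = 16 >> sew;         // 16, 8, 4, 2 elements
        int bits = avl * (8 << sew); // element count x element width
        assert(bits == 128);         // always exactly one XMM register
        (void)vtypei(sew);
    }
    return 0;
}
```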
```diff
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 9168c5e9..529b47df 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -473,10 +473,10 @@
             FLD(a, ed, fixedaddress);                    \
     }
 
-// Will get pointer to GX in general register a, will purge SS or SD if loaded. can use gback as load address
+// Will get pointer to GX in general register a, will purge SS or SD if loaded. May use x3. can use gback as load address
 #define GETGX()                                          \
     gd = ((nextop & 0x38) >> 3) + (rex.r << 3);          \
-    sse_forget_reg(dyn, ninst, gd);                      \
+    sse_forget_reg(dyn, ninst, x3, gd);                  \
     gback = xEmu;                                        \
     gdoffset = offsetof(x64emu_t, xmm[gd])
 
@@ -484,7 +484,7 @@
 #define GETEX(a, D)                                      \
     if (MODREG) {                                        \
         ed = (nextop & 7) + (rex.b << 3);                \
-        sse_forget_reg(dyn, ninst, ed);                  \
+        sse_forget_reg(dyn, ninst, x3, ed);              \
         fixedaddress = offsetof(x64emu_t, xmm[ed]);      \
         wback = xEmu;                                    \
     } else {                                             \
@@ -494,6 +494,18 @@
         fixedaddress = 0; /* TODO: optimize this! */     \
     }
 
+// Get EX as a quad, (x1 is used)
+#define GETEX_vector(a, w, D)                                                               \
+    if (MODREG) {                                                                           \
+        a = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w);             \
+    } else {                                                                                \
+        SMREAD();                                                                           \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 1, D); \
+        a = fpu_get_scratch(dyn);                                                           \
+        ADDI(x2, ed, fixedaddress);                                                         \
+        VLE8_V(a, x2, VECTOR_UNMASKED, VECTOR_NFIELD1);                                     \
+    }
+
 #define GETGM()                                          \
     gd = ((nextop & 0x38) >> 3);                         \
     mmx_forget_reg(dyn, ninst, gd);                      \
@@ -1093,6 +1105,8 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define dynarec64_F20F STEPNAME(dynarec64_F20F)
 #define dynarec64_F30F STEPNAME(dynarec64_F30F)
 
+#define dynarec64_660F_vector STEPNAME(dynarec64_660F_vector)
+
 #define geted STEPNAME(geted)
 #define geted32 STEPNAME(geted32)
 #define geted16 STEPNAME(geted16)
@@ -1223,6 +1237,10 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define sse_purge07cache STEPNAME(sse_purge07cache)
 #define sse_reflect_reg STEPNAME(sse_reflect_reg)
 
+#define sse_get_reg_empty_vector STEPNAME(sse_get_reg_empty_vector)
+#define sse_get_reg_vector STEPNAME(sse_get_reg_vector)
+#define sse_forget_reg_vector STEPNAME(sse_forget_reg_vector)
+
 #define fpu_pushcache STEPNAME(fpu_pushcache)
 #define fpu_popcache STEPNAME(fpu_popcache)
 #define fpu_reset_cache STEPNAME(fpu_reset_cache)
@@ -1238,6 +1256,8 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define rv64_move64 STEPNAME(rv64_move64)
 #define rv64_move32 STEPNAME(rv64_move32)
 
+#define vector_vsetvl_emul1 STEPNAME(vector_vsetvl_emul1)
+
 /* setup r2 to address pointed by */
 uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta);
@@ -1392,6 +1412,8 @@ void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2
 void rv64_move64(dynarec_rv64_t* dyn, int ninst, int reg, int64_t val);
 void rv64_move32(dynarec_rv64_t* dyn, int ninst, int reg, int32_t val, int zeroup);
 
+void vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew);
+
 #if STEP < 2
 #define CHECK_CACHE() 0
 #else
@@ -1435,14 +1457,20 @@ void mmx_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
 // SSE/SSE2 helpers
 // get float register for a SSE reg, create the entry if needed
 int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
+// get rvv register for a SSE reg, create the entry if needed
+int sse_get_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a, int forwrite);
 // get float register for a SSE reg, but don't try to synch it if it needed to be created
 int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
+// get rvv register for an SSE reg, but don't try to synch it if it needed to be created
+int sse_get_reg_empty_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a);
 // forget float register for a SSE reg, create the entry if needed
-void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
+void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a);
+// forget rvv register for a SSE reg, does nothing if the regs is not loaded
+void sse_forget_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a);
 // purge the XMM0..XMM7 cache (before function call)
 void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1);
 // Push current value to the cache
-void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int a);
+void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a);
 
 // common coproc helpers
 // reset the cache with n
@@ -1489,6 +1517,8 @@ uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
 uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 
+uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
+
 #if STEP < 2
 #define PASS2(A)
 #else
```

```diff
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index 8d06cc68..530a0f05 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -18,6 +18,9 @@ typedef struct instsize_s instsize_t;
 #define EXT_CACHE_SS 5
 #define EXT_CACHE_SD 6
 #define EXT_CACHE_SCR 7
+#define EXT_CACHE_XMMW 8
+#define EXT_CACHE_XMMR 9
+
 typedef union ext_cache_s {
     int8_t v;
     struct {
@@ -25,13 +28,18 @@ typedef union ext_cache_s {
         uint8_t n:4; // reg number
     };
 } ext_cache_t;
+
 typedef union sse_cache_s {
-    int8_t v;
+    int16_t v;
     struct {
-        uint8_t reg:7;
-        uint8_t single:1;
+        uint16_t reg : 7;
+        uint16_t vector : 1;
+        uint16_t single : 1;
+        uint16_t write : 1;
+        uint16_t unused : 7;
    };
 } sse_cache_t;
+
 typedef union sse_old_s {
     int8_t v;
     struct {
@@ -41,6 +49,7 @@ typedef union sse_old_s {
         uint8_t single:1;
     };
 } sse_old_t;
+
 typedef struct extcache_s {
     // ext cache
     ext_cache_t extcache[24];
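The widened `sse_cache_s` is why so many call sites changed: the old 8-bit descriptor had no room for the `vector` and `write` flags, so `v` grows to `int16_t` and the fields to `uint16_t` bitfields, while `v == -1` (all bits set) must still mean "slot empty". A self-contained reproduction of the layout, with the caveat that bitfield packing is implementation-defined (box64 already relies on the same pattern for `ext_cache_t` on its supported compilers):

```c
#include <stdint.h>

// Reproduction of the new descriptor: reg(7) + vector(1) + single(1) +
// write(1) + unused(7) = 16 bits; setting v = -1 sets every field to ones.
typedef union sse_cache_s {
    int16_t v;
    struct {
        uint16_t reg : 7;
        uint16_t vector : 1;
        uint16_t single : 1;
        uint16_t write : 1;
        uint16_t unused : 7;
    };
} sse_cache_t;

int main(void)
{
    _Static_assert(sizeof(sse_cache_t) == 2, "descriptor must stay 16-bit");
    sse_cache_t c = { .v = -1 }; // the "empty slot" sentinel
    return (c.reg == 0x7f && c.vector && c.single && c.write) ? 0 : 1;
}
```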
```diff
diff --git a/src/dynarec/rv64/rv64_emitter.h b/src/dynarec/rv64/rv64_emitter.h
index 1fc65a01..c873daad 100644
--- a/src/dynarec/rv64/rv64_emitter.h
+++ b/src/dynarec/rv64/rv64_emitter.h
@@ -1206,6 +1206,23 @@ f28–31 ft8–11 FP temporaries Caller
 
 // Vector extension emitter
 
+#define VECTOR_SEW8  0b000
+#define VECTOR_SEW16 0b001
+#define VECTOR_SEW32 0b010
+#define VECTOR_SEW64 0b011
+
+#define VECTOR_MASKED   0
+#define VECTOR_UNMASKED 1
+
+#define VECTOR_NFIELD1 0b000
+#define VECTOR_NFIELD2 0b001
+#define VECTOR_NFIELD3 0b010
+#define VECTOR_NFIELD4 0b011
+#define VECTOR_NFIELD5 0b100
+#define VECTOR_NFIELD6 0b101
+#define VECTOR_NFIELD7 0b110
+#define VECTOR_NFIELD8 0b111
+
 // configuration setting
 // https://github.com/riscv/riscv-v-spec/blob/master/vcfg-format.adoc
 #define VSETIVLI(rd, zimm, zimm10) EMIT(I_type(0b110000000000 | (zimm10), zimm, 0b111, rd, 0b1010111)) // 11...............111.....1010111
```

```diff
diff --git a/src/rv64detect.c b/src/rv64detect.c
index 698200cc..27ea2e51 100644
--- a/src/rv64detect.c
+++ b/src/rv64detect.c
@@ -69,9 +69,10 @@ void RV64_Detect_Function()
     rv64_zbs = Check(my_block);
 
     // Test Vector v1.0 with CSRR zero, vcsr
+    block = (uint32_t*)my_block;
     CSRRS(xZR, xZR, 0x00f);
     BR(xRA);
-    rv64_vector = Check(my_block);
+    rv64_vector = Check(my_block); // TODO: also check vlen >= 128
 
     // THead vendor extensions
     if (!rv64_zba) {
```
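The detection TODO matters because the whole SSE mapping assumes one 128-bit XMM fits in a single vector register at LMUL=1, which requires VLEN >= 128. One possible way to verify that (a sketch only; box64's detector instead `Check()`-executes emitted probe code under a SIGILL handler) is reading the V spec's read-only `vlenb` CSR, number 0xC22, which reports VLEN in bytes:

```c
#include <stdint.h>

// Hypothetical follow-up for the TODO above: vlenb (CSR 0xc22) is VLEN/8,
// so VLEN >= 128 bits means vlenb >= 16. Only safe to execute once the V
// extension is known to be present.
static inline uint64_t read_vlenb(void)
{
    uint64_t vlenb;
    __asm__ volatile("csrr %0, 0xc22" : "=r"(vlenb));
    return vlenb;
}

static inline int vector_usable_for_sse(void)
{
    return read_vlenb() * 8 >= 128; // one full XMM per vector register
}
```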