author     ptitSeb <sebastien.chev@gmail.com>    2023-03-21 20:20:35 +0000
committer  ptitSeb <sebastien.chev@gmail.com>    2023-03-21 20:20:35 +0000
commit     900f38779da63d30625c6951291ce0e39ff3c598 (patch)
tree       d68903d1156ce7566efda38fd00d03ae9e9ae0ec
parent     d369ac8bf5d0c5de5e8222f5ef416df943298050 (diff)
[RV64_DYNAREC] Added x87/SSE/mmx infrastructure, and a few x87 D9 opcodes
-rwxr-xr-x  CMakeLists.txt                             |    2
-rwxr-xr-x  src/dynarec/dynarec_arch.h                 |    6
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_00.c         |    4
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_d9.c         |  383
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_functions.c  |  414
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_functions.h  |   34
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_helper.c     | 1087
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_helper.h     |   77
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_pass2.h      |    1
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_pass3.h      |    1
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_private.h    |   53
-rw-r--r--  src/dynarec/rv64/rv64_emitter.h            |   61

12 files changed, 2082 insertions(+), 41 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt index 866acaca..619e25b0 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -615,7 +615,7 @@ if(RV64_DYNAREC) "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_66.c" #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_67.c" #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_d8.c" - #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_d9.c" + "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_d9.c" #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_da.c" #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_db.c" #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_dc.c" diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h index d88f591b..51e74d4e 100755 --- a/src/dynarec/dynarec_arch.h +++ b/src/dynarec/dynarec_arch.h @@ -32,9 +32,11 @@ #define instruction_native_t instruction_rv64_t #define dynarec_native_t dynarec_rv64_t -#define ADDITIONNAL_DEFINITION() +#define ADDITIONNAL_DEFINITION() \ + int fpuCacheNeedsTransform(dynarec_native_t* dyn, int ninst); -#define OTHER_CACHE() +#define OTHER_CACHE() \ + if (fpuCacheNeedsTransform(dyn, ninst)) ret|=2; #include "rv64/rv64_printer.h" #include "rv64/dynarec_rv64_private.h" diff --git a/src/dynarec/rv64/dynarec_rv64_00.c b/src/dynarec/rv64/dynarec_rv64_00.c index d91a3216..b6153b06 100644 --- a/src/dynarec/rv64/dynarec_rv64_00.c +++ b/src/dynarec/rv64/dynarec_rv64_00.c @@ -1003,6 +1003,10 @@ uintptr_t dynarec64_00(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni } break; + case 0xD9: + addr = dynarec64_D9(dyn, addr, ip, ninst, rex, rep, ok, need_epilog); + break; + case 0xE8: INST_NAME("CALL Id"); i32 = F32S; diff --git a/src/dynarec/rv64/dynarec_rv64_d9.c b/src/dynarec/rv64/dynarec_rv64_d9.c new file mode 100644 index 00000000..bff399e6 --- /dev/null +++ b/src/dynarec/rv64/dynarec_rv64_d9.c @@ -0,0 +1,383 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <pthread.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "dynarec.h" +#include "emu/x64emu_private.h" +#include "emu/x64run_private.h" +#include "x64run.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "rv64_printer.h" +#include "dynarec_rv64_private.h" +#include "dynarec_rv64_helper.h" +#include "dynarec_rv64_functions.h" + + +uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) +{ + (void)ip; (void)rep; (void)need_epilog; + + uint8_t nextop = F8; + uint8_t ed; + uint8_t wback, wb1; + uint8_t u8; + int64_t fixedaddress; + int unscaled; + int v1, v2; + int s0; + int i1, i2, i3; + + MAYUSE(s0); + MAYUSE(v2); + MAYUSE(v1); + + switch(nextop) { + case 0xC0: + case 0xC1: + case 0xC2: + case 0xC3: + case 0xC4: + case 0xC5: + case 0xC6: + case 0xC7: + INST_NAME("FLD STx"); + v2 = x87_do_push(dyn, ninst, x1, X87_ST(nextop&7)); + v1 = x87_get_st(dyn, ninst, x1, x2, (nextop&7)+1, X87_COMBINE(0, (nextop&7)+1)); + if(ST_IS_F(0)) { + FMVS(v2, v1); + } else { + FMVD(v2, v1); + } + break; + + case 0xC8: + INST_NAME("FXCH ST0"); + break; + case 0xC9: + case 0xCA: + case 0xCB: + case 0xCC: + case 0xCD: + case 0xCE: + case 0xCF: + INST_NAME("FXCH STx"); + // swap the cache value, not the double value itself :p + x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_ST(nextop&7)); + x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + x87_swapreg(dyn, ninst, x1, x2, 0, nextop&7); + // should set C1 to 0 + break; + + 
case 0xD0: + INST_NAME("FNOP"); + break; + + case 0xD8: + INST_NAME("FSTPNCE ST0, ST0"); + x87_do_pop(dyn, ninst, x3); + break; + case 0xD9: + case 0xDA: + case 0xDB: + case 0xDC: + case 0xDD: + case 0xDE: + case 0xDF: + INST_NAME("FSTPNCE ST0, STx"); + // copy the cache value for st0 to stx + x87_get_st_empty(dyn, ninst, x1, x2, nextop&7, X87_ST(nextop&7)); + x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + x87_swapreg(dyn, ninst, x1, x2, 0, nextop&7); + x87_do_pop(dyn, ninst, x3); + break; + case 0xE0: + INST_NAME("FCHS"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + if(ST_IS_F(0)) { + FNEGS(v1, v1); + } else { + FNEGD(v1, v1); + } + break; + case 0xE1: + INST_NAME("FABS"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + if(ST_IS_F(0)) { + FABSS(v1, v1); + } else { + FABSD(v1, v1); + } + break; + + case 0xE4: + INST_NAME("FTST"); + DEFAULT + break; + case 0xE5: + INST_NAME("FXAM"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_refresh(dyn, ninst, x1, x2, 0); + CALL(fpu_fxam, -1); // should be possible inline, but is it worth it? + break; + + case 0xE8: + INST_NAME("FLD1"); + v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_F); + if(ST_IS_F(0)) { + MOV32w(x1, 0x3f800000); + FMVWX(v1, x1); + } else { + MOV64x(x1, 0x3FF0000000000000); + FMVDX(v1, x1); + } + break; + case 0xE9: + INST_NAME("FLDL2T"); + v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D); + FTABLE64(v1, L2T); + break; + case 0xEA: + INST_NAME("FLDL2E"); + v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D); + FTABLE64(v1, L2E); + break; + case 0xEB: + INST_NAME("FLDPI"); + v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D); + FTABLE64(v1, PI); + break; + case 0xEC: + INST_NAME("FLDLG2"); + v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D); + FTABLE64(v1, LG2); + break; + case 0xED: + INST_NAME("FLDLN2"); + v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D); + FTABLE64(v1, LN2); + break; + case 0xEE: + INST_NAME("FLDZ"); + v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_F); + if(ST_IS_F(0)) { + FMVWX(v1, xZR); + } else { + FMVDX(v1, xZR); + } + break; + + case 0xF0: + INST_NAME("F2XM1"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + CALL(native_f2xm1, -1); + break; + case 0xF1: + INST_NAME("FYL2X"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + x87_forget(dyn, ninst, x1, x2, 1); + CALL(native_fyl2x, -1); + x87_do_pop(dyn, ninst, x3); + break; + case 0xF2: + INST_NAME("FPTAN"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + CALL(native_ftan, -1); + v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_F); + if(ST_IS_F(0)) { + MOV32w(x1, 0x3f800000); + FMVWX(v1, x1); + } else { + MOV64x(x1, 0x3FF0000000000000); + FMVDX(v1, x1); + } + break; + case 0xF3: + INST_NAME("FPATAN"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + x87_forget(dyn, ninst, x1, x2, 1); + CALL(native_fpatan, -1); + x87_do_pop(dyn, ninst, x3); + break; + case 0xF4: + INST_NAME("FXTRACT"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_do_push_empty(dyn, ninst, 0); + x87_forget(dyn, ninst, x1, x2, 1); + CALL(native_fxtract, -1); + break; + case 0xF5: + INST_NAME("FPREM1"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + x87_forget(dyn, ninst, x1, x2, 1); + CALL(native_fprem1, -1); + break; + case 0xF6: + INST_NAME("FDECSTP"); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + LW(x2, xEmu, offsetof(x64emu_t, top)); + ADDI(x2, x2, -1); + ANDI(x2, x2, 7); + SW(x2, xEmu, offsetof(x64emu_t, top)); + 
break; + case 0xF7: + INST_NAME("FINCSTP"); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + LW(x2, xEmu, offsetof(x64emu_t, top)); + ADDI(x2, x2, 1); + ANDI(x2, x2, 7); + SW(x2, xEmu, offsetof(x64emu_t, top)); + break; + case 0xF8: + INST_NAME("FPREM"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + x87_forget(dyn, ninst, x1, x2, 1); + CALL(native_fprem, -1); + break; + case 0xF9: + INST_NAME("FYL2XP1"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + x87_forget(dyn, ninst, x1, x2, 1); + CALL(native_fyl2xp1, -1); + x87_do_pop(dyn, ninst, x3); + break; + case 0xFA: + INST_NAME("FSQRT"); + DEFAULT; + break; + case 0xFB: + INST_NAME("FSINCOS"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_do_push_empty(dyn, ninst, 0); + x87_forget(dyn, ninst, x1, x2, 1); + CALL(native_fsincos, -1); + break; + case 0xFC: + INST_NAME("FRNDINT"); + DEFAULT; + break; + case 0xFD: + INST_NAME("FSCALE"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + x87_forget(dyn, ninst, x1, x2, 1); + CALL(native_fscale, -1); + break; + case 0xFE: + INST_NAME("FSIN"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + CALL(native_fsin, -1); + break; + case 0xFF: + INST_NAME("FCOS"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + CALL(native_fcos, -1); + break; + + + case 0xD1: + case 0xD4: + case 0xD5: + case 0xD6: + case 0xD7: + case 0xE2: + case 0xE3: + case 0xE6: + case 0xE7: + case 0xEF: + DEFAULT; + break; + + default: + switch((nextop>>3)&7) { + case 0: + INST_NAME("FLD ST0, float[ED]"); + v1 = x87_do_push(dyn, ninst, x1, box64_dynarec_x87double?EXT_CACHE_ST_D:EXT_CACHE_ST_F); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLW(v1, ed, fixedaddress); + if(!ST_IS_F(0)) { + FCVTDS(v1, v1); + } + break; + case 2: + INST_NAME("FST float[ED], ST0"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_F); + if(ST_IS_F(0)) + s0 = v1; + else { + s0 = fpu_get_scratch(dyn); + FCVTSD(s0, v1); + } + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FSW(s0, ed, fixedaddress); + break; + case 3: + INST_NAME("FSTP float[ED], ST0"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_F); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + if(!ST_IS_F(0)) { + FCVTSD(v1, v1); + } + FSW(v1, ed, fixedaddress); + x87_do_pop(dyn, ninst, x3); + break; + case 4: + INST_NAME("FLDENV Ed"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE? + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0); + if(ed!=x1) { + MV(x1, ed); + } + MOV32w(x2, 0); + CALL(fpu_loadenv, -1); + break; + case 5: + INST_NAME("FLDCW Ew"); + GETEW(x1, 0); + SH(x1, xEmu, offsetof(x64emu_t, cw)); // hopefully cw is not too far for an imm8 + break; + case 6: + INST_NAME("FNSTENV Ed"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE? 
+ addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0); + if(ed!=x1) { + MV(x1, ed); + } + MOV32w(x2, 0); + CALL(fpu_savenv, -1); + break; + case 7: + INST_NAME("FNSTCW Ew"); + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x1, &fixedaddress, rex, NULL, 0, 0); + ed = x1; + wb1 = 1; + LH(x1, xEmu, offsetof(x64emu_t, cw)); + EWBACK; + break; + default: + DEFAULT; + } + } + return addr; +} diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c index 2c3de1b5..8994e0e5 100644 --- a/src/dynarec/rv64/dynarec_rv64_functions.c +++ b/src/dynarec/rv64/dynarec_rv64_functions.c @@ -29,12 +29,406 @@ #include "bridge.h" #include "rv64_lock.h" +#define XMM0 0 +#define X870 XMM0+16 +#define EMM0 XMM0+16 +#define SCRATCH0 0 + +// Get a FPU scratch reg +int fpu_get_scratch(dynarec_rv64_t* dyn) +{ + return SCRATCH0 + dyn->e.fpu_scratch++; // return an Sx +} +// Reset scratch regs counter void fpu_reset_scratch(dynarec_rv64_t* dyn) { - //TODO + dyn->e.fpu_scratch = 0; +} +// Get a x87 double reg +int fpu_get_reg_x87(dynarec_rv64_t* dyn, int t, int n) +{ + int i=X870; + while (dyn->e.fpuused[i]) ++i; + dyn->e.fpuused[i] = 1; + dyn->e.extcache[i].n = n; + dyn->e.extcache[i].t = t; + dyn->e.news |= (1<<i); + return EXTREG(i); // return a Dx +} +// Free a FPU double reg +void fpu_free_reg(dynarec_rv64_t* dyn, int reg) +{ + int idx = EXTIDX(reg); + // TODO: check upper limit? + dyn->e.fpuused[idx] = 0; + if(dyn->e.extcache[idx].t!=EXT_CACHE_ST_F && dyn->e.extcache[idx].t!=EXT_CACHE_ST_D) + dyn->e.extcache[idx].v = 0; +} +// Get an MMX double reg +int fpu_get_reg_emm(dynarec_rv64_t* dyn, int emm) +{ + dyn->e.fpuused[EMM0 + emm] = 1; + dyn->e.extcache[EMM0 + emm].t = EXT_CACHE_MM; + dyn->e.extcache[EMM0 + emm].n = emm; + dyn->e.news |= (1<<(EMM0 + emm)); + return EXTREG(EMM0 + emm); +} +// Get an XMM quad reg +int fpu_get_reg_xmm(dynarec_rv64_t* dyn, int t, int xmm) +{ + int i = XMM0+xmm; + dyn->e.fpuused[i] = 1; + dyn->e.extcache[i].t = t; + dyn->e.extcache[i].n = xmm; + dyn->e.news |= (1<<i); + return EXTREG(i); +} +// Reset fpu regs counter +void fpu_reset_reg(dynarec_rv64_t* dyn) +{ + dyn->e.fpu_reg = 0; + for (int i=0; i<24; ++i) { + dyn->e.fpuused[i]=0; + dyn->e.extcache[i].v = 0; + } +} + +int extcache_get_st(dynarec_rv64_t* dyn, int ninst, int a) +{ + if (dyn->insts[ninst].e.swapped) { + if(dyn->insts[ninst].e.combined1 == a) + a = dyn->insts[ninst].e.combined2; + else if(dyn->insts[ninst].e.combined2 == a) + a = dyn->insts[ninst].e.combined1; + } + for(int i=0; i<24; ++i) + if((dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_F + || dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_D) + && dyn->insts[ninst].e.extcache[i].n==a) + return dyn->insts[ninst].e.extcache[i].t; + // not in the cache yet, so will be fetched... + return EXT_CACHE_ST_D; +} + +int extcache_get_current_st(dynarec_rv64_t* dyn, int ninst, int a) +{ + (void)ninst; + if(!dyn->insts) + return EXT_CACHE_ST_D; + for(int i=0; i<24; ++i) + if((dyn->e.extcache[i].t==EXT_CACHE_ST_F + || dyn->e.extcache[i].t==EXT_CACHE_ST_D) + && dyn->e.extcache[i].n==a) + return dyn->e.extcache[i].t; + // not in the cache yet, so will be fetched... 
+ return EXT_CACHE_ST_D; +} + +int extcache_get_st_f(dynarec_rv64_t* dyn, int ninst, int a) +{ + for(int i=0; i<24; ++i) + if(dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_F + && dyn->insts[ninst].e.extcache[i].n==a) + return i; + return -1; +} +int extcache_get_st_f_noback(dynarec_rv64_t* dyn, int ninst, int a) +{ + for(int i=0; i<24; ++i) + if(dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_F + && dyn->insts[ninst].e.extcache[i].n==a) + return i; + return -1; +} +int extcache_get_current_st_f(dynarec_rv64_t* dyn, int a) +{ + for(int i=0; i<24; ++i) + if(dyn->e.extcache[i].t==EXT_CACHE_ST_F + && dyn->e.extcache[i].n==a) + return i; + return -1; +} + +static void extcache_promote_double_forward(dynarec_rv64_t* dyn, int ninst, int maxinst, int a); +static void extcache_promote_double_internal(dynarec_rv64_t* dyn, int ninst, int maxinst, int a); +static void extcache_promote_double_combined(dynarec_rv64_t* dyn, int ninst, int maxinst, int a) +{ + if(a == dyn->insts[ninst].e.combined1 || a == dyn->insts[ninst].e.combined2) { + if(a == dyn->insts[ninst].e.combined1) { + a = dyn->insts[ninst].e.combined2; + } else + a = dyn->insts[ninst].e.combined1; + int i = extcache_get_st_f_noback(dyn, ninst, a); + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_combined, ninst=%d combined%c %d i=%d (stack:%d/%d)\n", ninst, (a == dyn->insts[ninst].e.combined2)?'2':'1', a ,i, dyn->insts[ninst].e.stack_push, -dyn->insts[ninst].e.stack_pop); + if(i>=0) { + dyn->insts[ninst].e.extcache[i].t = EXT_CACHE_ST_D; + if(!dyn->insts[ninst].e.barrier) + extcache_promote_double_internal(dyn, ninst-1, maxinst, a-dyn->insts[ninst].e.stack_push); + // go forward is combined is not pop'd + if(a-dyn->insts[ninst].e.stack_pop>=0) + if(!dyn->insts[ninst+1].e.barrier) + extcache_promote_double_forward(dyn, ninst+1, maxinst, a-dyn->insts[ninst].e.stack_pop); + } + } +} +static void extcache_promote_double_internal(dynarec_rv64_t* dyn, int ninst, int maxinst, int a) +{ + if(dyn->insts[ninst+1].e.barrier) + return; + while(ninst>=0) { + a+=dyn->insts[ninst].e.stack_pop; // adjust Stack depth: add pop'd ST (going backward) + int i = extcache_get_st_f(dyn, ninst, a); + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_internal, ninst=%d, a=%d st=%d:%d, i=%d\n", ninst, a, dyn->insts[ninst].e.stack, dyn->insts[ninst].e.stack_next, i); + if(i<0) return; + dyn->insts[ninst].e.extcache[i].t = EXT_CACHE_ST_D; + // check combined propagation too + if(dyn->insts[ninst].e.combined1 || dyn->insts[ninst].e.combined2) { + if(dyn->insts[ninst].e.swapped) { + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_internal, ninst=%d swapped %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack); + if (a==dyn->insts[ninst].e.combined1) + a = dyn->insts[ninst].e.combined2; + else if (a==dyn->insts[ninst].e.combined2) + a = dyn->insts[ninst].e.combined1; + } else { + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_internal, ninst=%d combined %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack); + extcache_promote_double_combined(dyn, ninst, maxinst, a); + } + } + a-=dyn->insts[ninst].e.stack_push; // // adjust Stack depth: remove push'd ST (going backward) + --ninst; + if(ninst<0 || a<0 || dyn->insts[ninst].e.barrier) + return; + } +} + +static void extcache_promote_double_forward(dynarec_rv64_t* dyn, int ninst, int 
maxinst, int a) +{ + while((ninst!=-1) && (ninst<maxinst) && (a>=0)) { + a+=dyn->insts[ninst].e.stack_push; // // adjust Stack depth: add push'd ST (going forward) + if((dyn->insts[ninst].e.combined1 || dyn->insts[ninst].e.combined2) && dyn->insts[ninst].e.swapped) { + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_forward, ninst=%d swapped %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack); + if (a==dyn->insts[ninst].e.combined1) + a = dyn->insts[ninst].e.combined2; + else if (a==dyn->insts[ninst].e.combined2) + a = dyn->insts[ninst].e.combined1; + } + int i = extcache_get_st_f_noback(dyn, ninst, a); + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_forward, ninst=%d, a=%d st=%d:%d(%d/%d), i=%d\n", ninst, a, dyn->insts[ninst].e.stack, dyn->insts[ninst].e.stack_next, dyn->insts[ninst].e.stack_push, -dyn->insts[ninst].e.stack_pop, i); + if(i<0) return; + dyn->insts[ninst].e.extcache[i].t = EXT_CACHE_ST_D; + // check combined propagation too + if((dyn->insts[ninst].e.combined1 || dyn->insts[ninst].e.combined2) && !dyn->insts[ninst].e.swapped) { + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_forward, ninst=%d combined %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack); + extcache_promote_double_combined(dyn, ninst, maxinst, a); + } + a-=dyn->insts[ninst].e.stack_pop; // adjust Stack depth: remove pop'd ST (going forward) + if(dyn->insts[ninst].x64.has_next && !dyn->insts[ninst].e.barrier) + ++ninst; + else + ninst=-1; + } + if(ninst==maxinst) + extcache_promote_double(dyn, ninst, a); +} + +void extcache_promote_double(dynarec_rv64_t* dyn, int ninst, int a) +{ + int i = extcache_get_current_st_f(dyn, a); + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double, ninst=%d a=%d st=%d i=%d\n", ninst, a, dyn->e.stack, i); + if(i<0) return; + dyn->e.extcache[i].t = EXT_CACHE_ST_D; + dyn->insts[ninst].e.extcache[i].t = EXT_CACHE_ST_D; + // check combined propagation too + if(dyn->e.combined1 || dyn->e.combined2) { + if(dyn->e.swapped) { + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double, ninst=%d swapped! %d/%d vs %d\n", ninst, dyn->e.combined1 ,dyn->e.combined2, a); + if(dyn->e.combined1 == a) + a = dyn->e.combined2; + else if(dyn->e.combined2 == a) + a = dyn->e.combined1; + } else { + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double, ninst=%d combined! 
%d/%d vs %d\n", ninst, dyn->e.combined1 ,dyn->e.combined2, a); + if(dyn->e.combined1 == a) + extcache_promote_double(dyn, ninst, dyn->e.combined2); + else if(dyn->e.combined2 == a) + extcache_promote_double(dyn, ninst, dyn->e.combined1); + } + } + a-=dyn->insts[ninst].e.stack_push; // // adjust Stack depth: remove push'd ST (going backward) + if(!ninst || a<0) return; + extcache_promote_double_internal(dyn, ninst-1, ninst, a); +} + +int extcache_combine_st(dynarec_rv64_t* dyn, int ninst, int a, int b) +{ + dyn->e.combined1=a; + dyn->e.combined2=b; + if( extcache_get_current_st(dyn, ninst, a)==EXT_CACHE_ST_F + && extcache_get_current_st(dyn, ninst, b)==EXT_CACHE_ST_F ) + return EXT_CACHE_ST_F; + return EXT_CACHE_ST_D; +} + +static int isCacheEmpty(dynarec_native_t* dyn, int ninst) { + if(dyn->insts[ninst].e.stack_next) { + return 0; + } + for(int i=0; i<24; ++i) + if(dyn->insts[ninst].e.extcache[i].v) { // there is something at ninst for i + if(!( + (dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_F || dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_D) + && dyn->insts[ninst].e.extcache[i].n<dyn->insts[ninst].e.stack_pop)) + return 0; + } + return 1; + +} + +int fpuCacheNeedsTransform(dynarec_rv64_t* dyn, int ninst) { + int i2 = dyn->insts[ninst].x64.jmp_insts; + if(i2<0) + return 1; + if((dyn->insts[i2].x64.barrier&BARRIER_FLOAT)) + // if the barrier as already been apply, no transform needed + return ((dyn->insts[ninst].x64.barrier&BARRIER_FLOAT))?0:(isCacheEmpty(dyn, ninst)?0:1); + int ret = 0; + if(!i2) { // just purge + if(dyn->insts[ninst].e.stack_next) { + return 1; + } + for(int i=0; i<24 && !ret; ++i) + if(dyn->insts[ninst].e.extcache[i].v) { // there is something at ninst for i + if(!( + (dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_F || dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_D) + && dyn->insts[ninst].e.extcache[i].n<dyn->insts[ninst].e.stack_pop)) + ret = 1; + } + return ret; + } + // Check if ninst can be compatible to i2 + if(dyn->insts[ninst].e.stack_next != dyn->insts[i2].e.stack-dyn->insts[i2].e.stack_push) { + return 1; + } + extcache_t cache_i2 = dyn->insts[i2].e; + extcacheUnwind(&cache_i2); + + for(int i=0; i<24; ++i) { + if(dyn->insts[ninst].e.extcache[i].v) { // there is something at ninst for i + if(!cache_i2.extcache[i].v) { // but there is nothing at i2 for i + ret = 1; + } else if(dyn->insts[ninst].e.extcache[i].v!=cache_i2.extcache[i].v) { // there is something different + if(dyn->insts[ninst].e.extcache[i].n!=cache_i2.extcache[i].n) { // not the same x64 reg + ret = 1; + } + else if(dyn->insts[ninst].e.extcache[i].t == EXT_CACHE_SS && cache_i2.extcache[i].t == EXT_CACHE_SD) + {/* nothing */ } + else + ret = 1; + } + } else if(cache_i2.extcache[i].v) + ret = 1; + } + return ret; +} + +void extcacheUnwind(extcache_t* cache) +{ + if(cache->swapped) { + // unswap + int a = -1; + int b = -1; + for(int j=0; j<24 && ((a==-1) || (b==-1)); ++j) + if((cache->extcache[j].t == EXT_CACHE_ST_D || cache->extcache[j].t == EXT_CACHE_ST_F)) { + if(cache->extcache[j].n == cache->combined1) + a = j; + else if(cache->extcache[j].n == cache->combined2) + b = j; + } + if(a!=-1 && b!=-1) { + int tmp = cache->extcache[a].n; + cache->extcache[a].n = cache->extcache[b].n; + cache->extcache[b].n = tmp; + } + cache->swapped = 0; + cache->combined1 = cache->combined2 = 0; + } + if(cache->news) { + // reove the newly created extcache + for(int i=0; i<24; ++i) + if(cache->news&(1<<i)) + cache->extcache[i].v = 0; + cache->news = 0; + } + if(cache->stack_push) { + // unpush + for(int 
j=0; j<24; ++j) { + if((cache->extcache[j].t == EXT_CACHE_ST_D || cache->extcache[j].t == EXT_CACHE_ST_F)) { + if(cache->extcache[j].n<cache->stack_push) + cache->extcache[j].v = 0; + else + cache->extcache[j].n-=cache->stack_push; + } + } + cache->x87stack-=cache->stack_push; + cache->stack-=cache->stack_push; + cache->stack_push = 0; + } + cache->x87stack+=cache->stack_pop; + cache->stack_next = cache->stack; + cache->stack_pop = 0; + cache->barrier = 0; + // And now, rebuild the x87cache info with extcache + cache->mmxcount = 0; + cache->fpu_scratch = 0; + cache->fpu_extra_qscratch = 0; + cache->fpu_reg = 0; + for(int i=0; i<8; ++i) { + cache->x87cache[i] = -1; + cache->mmxcache[i] = -1; + cache->x87reg[i] = 0; + cache->ssecache[i*2].v = -1; + cache->ssecache[i*2+1].v = -1; + } + int x87reg = 0; + for(int i=0; i<24; ++i) { + if(cache->extcache[i].v) { + cache->fpuused[i] = 1; + switch (cache->extcache[i].t) { + case EXT_CACHE_MM: + cache->mmxcache[cache->extcache[i].n] = i; + ++cache->mmxcount; + ++cache->fpu_reg; + break; + case EXT_CACHE_SS: + cache->ssecache[cache->extcache[i].n].reg = i; + cache->ssecache[cache->extcache[i].n].single = 1; + ++cache->fpu_reg; + break; + case EXT_CACHE_SD: + cache->ssecache[cache->extcache[i].n].reg = i; + cache->ssecache[cache->extcache[i].n].single = 0; + ++cache->fpu_reg; + break; + case EXT_CACHE_ST_F: + case EXT_CACHE_ST_D: + cache->x87cache[x87reg] = cache->extcache[i].n; + cache->x87reg[x87reg] = i; + ++x87reg; + ++cache->fpu_reg; + break; + case EXT_CACHE_SCR: + cache->fpuused[i] = 0; + cache->extcache[i].v = 0; + break; + } + } else { + cache->fpuused[i] = 0; + } + } } + uint8_t extract_byte(uint32_t val, void* address){ int idx = (((uintptr_t)address)&3)*8; return (val>>idx)&0xff; @@ -80,4 +474,20 @@ int rv64_lock_cas_h(void* addr, uint16_t ref, uint16_t val) uint32_t* aligned = (uint32_t*)(((uintptr_t)addr)&~3); uint32_t tmp = *aligned; return rv64_lock_cas_d(aligned, tmp, insert_half(tmp, val, addr)); -} \ No newline at end of file +} + + +const char* getCacheName(int t, int n) +{ + static char buff[20]; + switch(t) { + case EXT_CACHE_ST_D: sprintf(buff, "ST%d", n); break; + case EXT_CACHE_ST_F: sprintf(buff, "st%d", n); break; + case EXT_CACHE_MM: sprintf(buff, "MM%d", n); break; + case EXT_CACHE_SS: sprintf(buff, "SS%d", n); break; + case EXT_CACHE_SD: sprintf(buff, "SD%d", n); break; + case EXT_CACHE_SCR: sprintf(buff, "Scratch"); break; + case EXT_CACHE_NONE: buff[0]='\0'; break; + } + return buff; +} diff --git a/src/dynarec/rv64/dynarec_rv64_functions.h b/src/dynarec/rv64/dynarec_rv64_functions.h index af55ad81..63640b0b 100644 --- a/src/dynarec/rv64/dynarec_rv64_functions.h +++ b/src/dynarec/rv64/dynarec_rv64_functions.h @@ -6,7 +6,41 @@ typedef struct x64emu_s x64emu_t; typedef struct dynarec_rv64_s dynarec_rv64_t; +// Get an FPU scratch reg +int fpu_get_scratch(dynarec_rv64_t* dyn); // Reset scratch regs counter void fpu_reset_scratch(dynarec_rv64_t* dyn); +// Get an x87 double reg +int fpu_get_reg_x87(dynarec_rv64_t* dyn, int t, int n); +// Get an MMX double reg +int fpu_get_reg_emm(dynarec_rv64_t* dyn, int emm); +// Get an XMM quad reg +int fpu_get_reg_xmm(dynarec_rv64_t* dyn, int t, int xmm); +// Free a FPU/MMX/XMM reg +void fpu_free_reg(dynarec_rv64_t* dyn, int reg); +// Reset fpu regs counter +void fpu_reset_reg(dynarec_rv64_t* dyn); + +// ---- Neon cache functions +// Get type for STx +int extcache_get_st(dynarec_rv64_t* dyn, int ninst, int a); +// Get if STx is FLOAT or DOUBLE +int extcache_get_st_f(dynarec_rv64_t* dyn, int 
ninst, int a); +// Get actual type for STx +int extcache_get_current_st(dynarec_rv64_t* dyn, int ninst, int a); +// Get actual STx is FLOAT or DOUBLE +int extcache_get_current_st_f(dynarec_rv64_t* dyn, int a); +// Back-propagate a change float->double +void extcache_promote_double(dynarec_rv64_t* dyn, int ninst, int a); +// Combine and propagate if needed (pass 1 only) +int extcache_combine_st(dynarec_rv64_t* dyn, int ninst, int a, int b); // with stack current dyn->n_stack* + +// FPU Cache transformation (for loops) // Specific, need to be writen par backend +int fpuCacheNeedsTransform(dynarec_rv64_t* dyn, int ninst); + +// Undo the changes of a extcache to get the status before the instruction +void extcacheUnwind(extcache_t* cache); + +const char* getCacheName(int t, int n); #endif //__DYNAREC_RV64_FUNCTIONS_H__ \ No newline at end of file diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c index 5b57d4dc..b049a93b 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.c +++ b/src/dynarec/rv64/dynarec_rv64_helper.c @@ -4,6 +4,7 @@ #include <pthread.h> #include <errno.h> #include <assert.h> +#include <string.h> #include "bitutils.h" #include "debug.h" @@ -523,55 +524,1027 @@ void grab_segdata(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, int reg, int s MESSAGE(LOG_DUMP, "----%s Offset\n", (segment==_FS)?"FS":"GS"); } -void fpu_reset(dynarec_rv64_t* dyn) +// x87 stuffs +static void x87_reset(dynarec_rv64_t* dyn) { - //TODO + for (int i=0; i<8; ++i) + dyn->e.x87cache[i] = -1; + dyn->e.x87stack = 0; + dyn->e.stack = 0; + dyn->e.stack_next = 0; + dyn->e.stack_pop = 0; + dyn->e.stack_push = 0; + dyn->e.combined1 = dyn->e.combined2 = 0; + dyn->e.swapped = 0; + dyn->e.barrier = 0; + for(int i=0; i<24; ++i) + if(dyn->e.extcache[i].t == EXT_CACHE_ST_F || dyn->e.extcache[i].t == EXT_CACHE_ST_D) + dyn->e.extcache[i].v = 0; } -void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n) +void x87_stackcount(dynarec_rv64_t* dyn, int ninst, int scratch) { - //TODO + MAYUSE(scratch); + if(!dyn->e.x87stack) + return; + if(dyn->e.mmxcount) + mmx_purgecache(dyn, ninst, 0, scratch); + MESSAGE(LOG_DUMP, "\tSynch x87 Stackcount (%d)\n", dyn->e.x87stack); + int a = dyn->e.x87stack; + // Add x87stack to emu fpu_stack + LW(scratch, xEmu, offsetof(x64emu_t, fpu_stack)); + ADDI(scratch, scratch, a); + SW(scratch, xEmu, offsetof(x64emu_t, fpu_stack)); + // Sub x87stack to top, with and 7 + LW(scratch, xEmu, offsetof(x64emu_t, top)); + ADDI(scratch, scratch, -a); + ANDI(scratch, scratch, 7); + SW(scratch, xEmu, offsetof(x64emu_t, top)); + // reset x87stack, but not the stack count of extcache + dyn->e.x87stack = 0; + dyn->e.stack_next -= dyn->e.stack; + dyn->e.stack = 0; + MESSAGE(LOG_DUMP, "\t------x87 Stackcount\n"); } -void fpu_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, int s3) +int extcache_st_coherency(dynarec_rv64_t* dyn, int ninst, int a, int b) { - //TODO + int i1 = extcache_get_st(dyn, ninst, a); + int i2 = extcache_get_st(dyn, ninst, b); + if(i1!=i2) { + MESSAGE(LOG_DUMP, "Warning, ST cache incoherent between ST%d(%d) and ST%d(%d)\n", a, i1, b, i2); + } + + return i1; } -// propagate ST stack state, especial stack pop that are defered -void fpu_propagate_stack(dynarec_rv64_t* dyn, int ninst) +// On step 1, Float/Double for ST is actualy computed and back-propagated +// On step 2-3, the value is just read for inst[...].n.neocache[..] 
+// the reg returned is *2 for FLOAT +int x87_do_push(dynarec_rv64_t* dyn, int ninst, int s1, int t) +{ + if(dyn->e.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + dyn->e.x87stack+=1; + dyn->e.stack+=1; + dyn->e.stack_next+=1; + dyn->e.stack_push+=1; + // move all regs in cache, and find a free one + for(int j=0; j<24; ++j) + if((dyn->e.extcache[j].t == EXT_CACHE_ST_D) || (dyn->e.extcache[j].t == EXT_CACHE_ST_F)) + ++dyn->e.extcache[j].n; + int ret = -1; + for(int i=0; i<8; ++i) + if(dyn->e.x87cache[i]!=-1) + ++dyn->e.x87cache[i]; + else if(ret==-1) { + dyn->e.x87cache[i] = 0; + ret=dyn->e.x87reg[i]=fpu_get_reg_x87(dyn, t, 0); + #if STEP == 1 + // need to check if reg is compatible with float + if((ret>15) && (t == EXT_CACHE_ST_F)) + dyn->e.extcache[ret].t = EXT_CACHE_ST_D; + #else + dyn->e.extcache[ret].t = X87_ST0; + #endif + } + return ret; +} +void x87_do_push_empty(dynarec_rv64_t* dyn, int ninst, int s1) { - //TODO + if(dyn->e.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + dyn->e.x87stack+=1; + dyn->e.stack+=1; + dyn->e.stack_next+=1; + dyn->e.stack_push+=1; + // move all regs in cache + for(int j=0; j<24; ++j) + if((dyn->e.extcache[j].t == EXT_CACHE_ST_D) || (dyn->e.extcache[j].t == EXT_CACHE_ST_F)) + ++dyn->e.extcache[j].n; + for(int i=0; i<8; ++i) + if(dyn->e.x87cache[i]!=-1) + ++dyn->e.x87cache[i]; + if(s1) + x87_stackcount(dyn, ninst, s1); +} +void x87_do_pop(dynarec_rv64_t* dyn, int ninst, int s1) +{ + if(dyn->e.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + dyn->e.x87stack-=1; + dyn->e.stack_next-=1; + dyn->e.stack_pop+=1; + // move all regs in cache, poping ST0 + for(int i=0; i<8; ++i) + if(dyn->e.x87cache[i]!=-1) { + --dyn->e.x87cache[i]; + if(dyn->e.x87cache[i]==-1) { + fpu_free_reg(dyn, dyn->e.x87reg[i]); + dyn->e.x87reg[i] = -1; + } + } } +void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, int s3) +{ + int ret = 0; + for (int i=0; i<8 && !ret; ++i) + if(dyn->e.x87cache[i] != -1) + ret = 1; + if(!ret && !dyn->e.x87stack) // nothing to do + return; + MESSAGE(LOG_DUMP, "\tPurge %sx87 Cache and Synch Stackcount (%+d)---\n", next?"locally ":"", dyn->e.x87stack); + int a = dyn->e.x87stack; + if(a!=0) { + // reset x87stack + if(!next) + dyn->e.x87stack = 0; + // Add x87stack to emu fpu_stack + LW(s2, xEmu, offsetof(x64emu_t, fpu_stack)); + ADDI(s2, s2, a); + SW(s2, xEmu, offsetof(x64emu_t, fpu_stack)); + // Sub x87stack to top, with and 7 + LW(s2, xEmu, offsetof(x64emu_t, top)); + // update tags (and top at the same time) + if(a>0) { + // new tag to fulls + ADDI(s3, xZR, 0); + for (int i=0; i<a; ++i) { + ADDI(s2, s2, -1); + ANDI(s2, s2, 7); // (emu->top + st)&7 + SLLI(s1, s2, 2); + ADD(s1, xEmu, s1); + SW(s3, s1, offsetof(x64emu_t, p_regs)); + } + } else { + // empty tags + ADDI(s3, xZR, 0b11); + for (int i=0; i<-a; ++i) { + SLLI(s1, s2, 2); + ADD(s1, xEmu, s1); + SW(s3, s1, offsetof(x64emu_t, p_regs)); + ADDI(s2, s2, 1); + ANDI(s2, s2, 7); // (emu->top + st)&7 + } + } + SW(s2, xEmu, offsetof(x64emu_t, top)); + } else { + LW(s2, xEmu, offsetof(x64emu_t, top)); + } + if(ret!=0) { + // --- set values + // Get top + // loop all cache entries + for (int i=0; i<8; ++i) + if(dyn->e.x87cache[i]!=-1) { + #if STEP == 1 + if(!next) { // don't force promotion here + // pre-apply pop, because purge happens in-between + extcache_promote_double(dyn, ninst, dyn->e.x87cache[i]+dyn->e.stack_pop); + } + #endif + #if STEP == 3 + if(!next && extcache_get_st_f(dyn, ninst, dyn->e.x87cache[i])>=0) { + MESSAGE(LOG_DUMP, "Warning, incoherency with purged ST%d cache\n", 
dyn->e.x87cache[i]); + } + #endif + ADDI(s3, s2, dyn->e.x87cache[i]); + ANDI(s3, s3, 7); // (emu->top + st)&7 + SLLI(s1, s3, 3); + ADD(s1, xEmu, s1); + if(next) { + // need to check if a ST_F need local promotion + if(extcache_get_st_f(dyn, ninst, dyn->e.x87cache[i])>=0) { + FCVTDS(0, dyn->e.x87reg[i]); + FSD(0, s1, offsetof(x64emu_t, x87)); // save the value + } else { + FSD(dyn->e.x87reg[i], s1, offsetof(x64emu_t, x87)); // save the value + } + } else { + FSD(dyn->e.x87reg[i], s1, offsetof(x64emu_t, x87)); + fpu_free_reg(dyn, dyn->e.x87reg[i]); + dyn->e.x87reg[i] = -1; + dyn->e.x87cache[i] = -1; + //dyn->e.stack_pop+=1; //no pop, but the purge because of barrier will have the n.barrier flags set + } + } + } + if(!next) { + dyn->e.stack_next = 0; + #if STEP < 2 + // refresh the cached valued, in case it's a purge outside a instruction + dyn->insts[ninst].e.barrier = 1; + #endif + } + MESSAGE(LOG_DUMP, "\t---Purge x87 Cache and Synch Stackcount\n"); +} + +#ifdef HAVE_TRACE +static void x87_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3) +{ + x87_stackcount(dyn, ninst, s1); + int ret = 0; + for (int i=0; (i<8) && (!ret); ++i) + if(dyn->e.x87cache[i] != -1) + ret = 1; + if(!ret) // nothing to do + return; + // prepare offset to fpu => s1 + // Get top + LW(s2, xEmu, offsetof(x64emu_t, top)); + // loop all cache entries + for (int i=0; i<8; ++i) + if(dyn->e.x87cache[i]!=-1) { + ADDI(s3, s2, dyn->e.x87cache[i]); + ANDI(s3, s3, 7); // (emu->top + i)&7 + SLLI(s1, s3, 3); + ADD(s1, xEmu, s1); + FSD(dyn->e.x87reg[i], s1, offsetof(x64emu_t, x87)); + } +} +#endif + +int x87_get_current_cache(dynarec_rv64_t* dyn, int ninst, int st, int t) +{ + // search in cache first + for (int i=0; i<8; ++i) { + if(dyn->e.x87cache[i]==st) { + #if STEP == 1 + if(t==EXT_CACHE_ST_D && (dyn->e.extcache[dyn->e.x87reg[i]].t==EXT_CACHE_ST_F)) + extcache_promote_double(dyn, ninst, st); + #endif + return i; + } + assert(dyn->e.x87cache[i]<8); + } + return -1; +} + +int x87_get_cache(dynarec_rv64_t* dyn, int ninst, int populate, int s1, int s2, int st, int t) +{ + if(dyn->e.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + int ret = x87_get_current_cache(dyn, ninst, st, t); + if(ret!=-1) + return ret; + MESSAGE(LOG_DUMP, "\tCreate %sx87 Cache for ST%d\n", populate?"and populate ":"", st); + // get a free spot + for (int i=0; (i<8) && (ret==-1); ++i) + if(dyn->e.x87cache[i]==-1) + ret = i; + // found, setup and grab the value + dyn->e.x87cache[ret] = st; + dyn->e.x87reg[ret] = fpu_get_reg_x87(dyn, EXT_CACHE_ST_D, st); + if(populate) { + LW(s2, xEmu, offsetof(x64emu_t, top)); + int a = st - dyn->e.x87stack; + if(a) { + ADDI(s2, s2, a); + ANDI(s2, s2, 7); + } + ADD(s1, xEmu, s2); + FLD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87)); + } + MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); + + return ret; +} +int x87_get_extcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st) +{ + for(int ii=0; ii<24; ++ii) + if((dyn->e.extcache[ii].t == EXT_CACHE_ST_F || dyn->e.extcache[ii].t == EXT_CACHE_ST_D) + && dyn->e.extcache[ii].n==st) + return ii; + assert(0); + return -1; +} +int x87_get_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t) +{ + return dyn->e.x87reg[x87_get_cache(dyn, ninst, 1, s1, s2, a, t)]; +} +int x87_get_st_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t) +{ + return dyn->e.x87reg[x87_get_cache(dyn, ninst, 0, s1, s2, a, t)]; +} + + +void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st) +{ + x87_stackcount(dyn, ninst, s1); + int ret 
= -1; + for (int i=0; (i<8) && (ret==-1); ++i) + if(dyn->e.x87cache[i] == st) + ret = i; + if(ret==-1) // nothing to do + return; + MESSAGE(LOG_DUMP, "\tRefresh x87 Cache for ST%d\n", st); + // prepare offset to fpu => s1 + // Get top + LW(s2, xEmu, offsetof(x64emu_t, top)); + // Update + if(st) { + ADDI(s2, s2, st); + ANDI(s2, s2, 7); // (emu->top + i)&7 + } + ADD(s1, xEmu, s2); + if(dyn->e.extcache[dyn->e.x87reg[ret]].t==EXT_CACHE_ST_F) { + FCVTDS(0, dyn->e.x87reg[ret]); + FSD(31, s1, offsetof(x64emu_t, x87)); + } else { + FSD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87)); + } + MESSAGE(LOG_DUMP, "\t--------x87 Cache for ST%d\n", st); +} + +void x87_forget(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st) +{ + x87_stackcount(dyn, ninst, s1); + int ret = -1; + for (int i=0; (i<8) && (ret==-1); ++i) + if(dyn->e.x87cache[i] == st) + ret = i; + if(ret==-1) // nothing to do + return; + MESSAGE(LOG_DUMP, "\tForget x87 Cache for ST%d\n", st); + #if STEP == 1 + if(dyn->e.extcache[dyn->e.x87reg[ret]].t==EXT_CACHE_ST_F) + extcache_promote_double(dyn, ninst, st); + #endif + // prepare offset to fpu => s1 + // Get top + LW(s2, xEmu, offsetof(x64emu_t, top)); + // Update + if(st) { + ADDI(s2, s2, st); + ANDI(s2, s2, 7); // (emu->top + i)&7 + } + ADD(s1, xEmu, s2); + FSD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87)); + MESSAGE(LOG_DUMP, "\t--------x87 Cache for ST%d\n", st); + // and forget that cache + fpu_free_reg(dyn, dyn->e.x87reg[ret]); + dyn->e.extcache[dyn->e.x87reg[ret]].v = 0; + dyn->e.x87cache[ret] = -1; + dyn->e.x87reg[ret] = -1; +} + +void x87_reget_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st) +{ + if(dyn->e.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + // search in cache first + for (int i=0; i<8; ++i) + if(dyn->e.x87cache[i]==st) { + // refresh the value + MESSAGE(LOG_DUMP, "\tRefresh x87 Cache for ST%d\n", st); + #if STEP == 1 + if(dyn->e.extcache[dyn->e.x87reg[i]].t==EXT_CACHE_ST_F) + extcache_promote_double(dyn, ninst, st); + #endif + LW(s2, xEmu, offsetof(x64emu_t, top)); + int a = st - dyn->e.x87stack; + ADDI(s2, s2, a); + AND(s2, s2, 7); + SLLI(s2, s2, 3); + ADD(s1, xEmu, s2); + FLD(dyn->e.x87reg[i], s1, offsetof(x64emu_t, x87)); + MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); + // ok + return; + } + // Was not in the cache? creating it.... 
+ MESSAGE(LOG_DUMP, "\tCreate x87 Cache for ST%d\n", st); + // get a free spot + int ret = -1; + for (int i=0; (i<8) && (ret==-1); ++i) + if(dyn->e.x87cache[i]==-1) + ret = i; + // found, setup and grab the value + dyn->e.x87cache[ret] = st; + dyn->e.x87reg[ret] = fpu_get_reg_x87(dyn, EXT_CACHE_ST_D, st); + LW(s2, xEmu, offsetof(x64emu_t, top)); + int a = st - dyn->e.x87stack; + ADDI(s2, s2, a); + ANDI(s2, s2, 7); // (emu->top + i)&7 + SLLI(s2, s2, 3); + ADD(s1, xEmu, s2); + FLD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87)); + MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); +} + +void x87_swapreg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int b) +{ + int i1, i2, i3; + i1 = x87_get_cache(dyn, ninst, 1, s1, s2, b, X87_ST(b)); + i2 = x87_get_cache(dyn, ninst, 1, s1, s2, a, X87_ST(a)); + i3 = dyn->e.x87cache[i1]; + dyn->e.x87cache[i1] = dyn->e.x87cache[i2]; + dyn->e.x87cache[i2] = i3; + // swap those too + int j1, j2, j3; + j1 = x87_get_extcache(dyn, ninst, s1, s2, b); + j2 = x87_get_extcache(dyn, ninst, s1, s2, a); + j3 = dyn->e.extcache[j1].n; + dyn->e.extcache[j1].n = dyn->e.extcache[j2].n; + dyn->e.extcache[j2].n = j3; + // mark as swapped + dyn->e.swapped = 1; + dyn->e.combined1= a; dyn->e.combined2=b; +} + +// Set rounding according to cw flags, return reg to restore flags +int x87_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3) +{ + MAYUSE(dyn); MAYUSE(ninst); + MAYUSE(s1); MAYUSE(s2); + LW(s1, xEmu, offsetof(x64emu_t, cw)); + SRLI(s1, s1, 10); + ANDI(s1, s1, 0b11); + // MMX/x87 Round mode: 0..3: Nearest, Down, Up, Chop + // RV64: 0..7: Nearest, Toward Zero (Chop), Down, Up, Nearest tie to Max, invalid, invalid, dynamic (invalid here) + // 0->0, 1->2, 2->3, 3->1 + SLLI(s1, s1, 1); + ADDI(s2, xZR, 3); + BGE(s1, s2, 4+8); + ADDI(s1, s1, -4); + XORI(s3, s1, 0b11); + // transform done (is there a faster way?) + FSRM(s3); // exange RM with current + return s3; +} + +// Set rounding according to mxcsr flags, return reg to restore flags +int sse_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3) +{ + MAYUSE(dyn); MAYUSE(ninst); + MAYUSE(s1); MAYUSE(s2); + LW(s1, xEmu, offsetof(x64emu_t, mxcsr)); + SRLI(s1, s1, 13); + ANDI(s1, s1, 0b11); + // MMX/x87 Round mode: 0..3: Nearest, Down, Up, Chop + // RV64: 0..7: Nearest, Toward Zero (Chop), Down, Up, Nearest tie to Max, invalid, invalid, dynamic (invalid here) + // 0->0, 1->2, 2->3, 3->1 + SLLI(s1, s1, 1); + ADDI(s2, xZR, 3); + BGE(s1, s2, 4+8); + ADDI(s1, s1, -4); + XORI(s3, s1, 0b11); + // transform done (is there a faster way?) 
+ FSRM(s3); // exange RM with current + return s3; +} + +// Restore round flag, destroy s1 doing so +void x87_restoreround(dynarec_rv64_t* dyn, int ninst, int s1) +{ + MAYUSE(dyn); MAYUSE(ninst); + MAYUSE(s1); + FSRM(s1); // put back fpscr +} + +// MMX helpers +static void mmx_reset(dynarec_rv64_t* dyn) +{ + dyn->e.mmxcount = 0; + for (int i=0; i<8; ++i) + dyn->e.mmxcache[i] = -1; +} +static int isx87Empty(dynarec_rv64_t* dyn) +{ + for (int i=0; i<8; ++i) + if(dyn->e.x87cache[i] != -1) + return 0; + return 1; +} + +// get neon register for a MMX reg, create the entry if needed +int mmx_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a) +{ + if(!dyn->e.x87stack && isx87Empty(dyn)) + x87_purgecache(dyn, ninst, 0, s1, s2, s3); + if(dyn->e.mmxcache[a]!=-1) + return dyn->e.mmxcache[a]; + ++dyn->e.mmxcount; + int ret = dyn->e.mmxcache[a] = fpu_get_reg_emm(dyn, a); + FLD(ret, xEmu, offsetof(x64emu_t, mmx[a])); + return ret; +} +// get neon register for a MMX reg, but don't try to synch it if it needed to be created +int mmx_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a) +{ + if(!dyn->e.x87stack && isx87Empty(dyn)) + x87_purgecache(dyn, ninst, 0, s1, s2, s3); + if(dyn->e.mmxcache[a]!=-1) + return dyn->e.mmxcache[a]; + ++dyn->e.mmxcount; + int ret = dyn->e.mmxcache[a] = fpu_get_reg_emm(dyn, a); + return ret; +} +// purge the MMX cache only(needs 3 scratch registers) void mmx_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1) { - // TODO + if(!dyn->e.mmxcount) + return; + if(!next) + dyn->e.mmxcount = 0; + int old = -1; + for (int i=0; i<8; ++i) + if(dyn->e.mmxcache[i]!=-1) { + if (old==-1) { + MESSAGE(LOG_DUMP, "\tPurge %sMMX Cache ------\n", next?"locally ":""); + ++old; + } + FSD(dyn->e.mmxcache[i], xEmu, offsetof(x64emu_t, mmx[i])); + if(!next) { + fpu_free_reg(dyn, dyn->e.mmxcache[i]); + dyn->e.mmxcache[i] = -1; + } + } + if(old!=-1) { + MESSAGE(LOG_DUMP, "\t------ Purge MMX Cache\n"); + } +} +#ifdef HAVE_TRACE +static void mmx_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1) +{ + for (int i=0; i<8; ++i) + if(dyn->e.mmxcache[i]!=-1) { + FLD(dyn->e.mmxcache[i], xEmu, offsetof(x64emu_t, mmx[i])); + } } +#endif -void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, int s3) +// SSE / SSE2 helpers +static void sse_reset(dynarec_rv64_t* dyn) { - //TODO + for (int i=0; i<16; ++i) + dyn->e.ssecache[i].v = -1; +} +// get ext register for a SSE reg, create the entry if needed +int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single) +{ + if(dyn->e.ssecache[a].v!=-1) { + // forget / reload if change of size + if(dyn->e.ssecache[a].single!=single) { + sse_forget_reg(dyn, ninst, a); + return sse_get_reg(dyn, ninst, s1, a, single); + } + return dyn->e.ssecache[a].reg; + } + dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, single?EXT_CACHE_SS:EXT_CACHE_SD, a); + int ret = dyn->e.ssecache[a].reg; + dyn->e.ssecache[a].single = single; + if(dyn->e.ssecache[a].single) + FLW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a])); + else + FLD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a])); + return ret; +} +// get ext register for a SSE reg, but don't try to synch it if it needed to be created +int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single) +{ + if(dyn->e.ssecache[a].v!=-1) { + dyn->e.ssecache[a].single = single; + dyn->e.extcache[dyn->e.ssecache[a].reg].t = single?EXT_CACHE_SS:EXT_CACHE_SD; + return dyn->e.ssecache[a].reg; + } + dyn->e.ssecache[a].reg = 
fpu_get_reg_xmm(dyn, single?EXT_CACHE_SS:EXT_CACHE_SD, a); + dyn->e.ssecache[a].single = 1; // it will be write... + return dyn->e.ssecache[a].reg; +} +// forget ext register for a SSE reg, create the entry if needed +void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a) +{ + if(dyn->e.ssecache[a].v==-1) + return; + if(dyn->e.ssecache[a].single) + FSW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a])); + else + FSD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a])); + fpu_free_reg(dyn, dyn->e.ssecache[a].reg); + dyn->e.ssecache[a].v = -1; + return; +} +// purge the SSE cache for XMM0..XMM7 (to use before function native call) +void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1) +{ + int old = -1; + for (int i=0; i<8; ++i) + if(dyn->e.ssecache[i].v!=-1) { + if (old==-1) { + MESSAGE(LOG_DUMP, "\tPurge XMM0..7 Cache ------\n"); + ++old; + } + if(dyn->e.ssecache[i].single) + FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + else + FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + fpu_free_reg(dyn, dyn->e.ssecache[i].reg); + dyn->e.ssecache[i].v = -1; + } + if(old!=-1) { + MESSAGE(LOG_DUMP, "\t------ Purge XMM0..7 Cache\n"); + } } +// purge the SSE cache only +static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1) +{ + int old = -1; + for (int i=0; i<16; ++i) + if(dyn->e.ssecache[i].v!=-1) { + if (old==-1) { + MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":""); + ++old; + } + if(dyn->e.ssecache[i].single) + FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + else + FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + if(!next) { + fpu_free_reg(dyn, dyn->e.ssecache[i].reg); + dyn->e.ssecache[i].v = -1; + } + } + if(old!=-1) { + MESSAGE(LOG_DUMP, "\t------ Purge SSE Cache\n"); + } +} #ifdef HAVE_TRACE -void fpu_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3) +static void sse_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1) { - //TODO + for (int i=0; i<16; ++i) + if(dyn->e.ssecache[i].v!=-1) { + if(dyn->e.ssecache[i].single) + FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + else + FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + } } #endif void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) { - //TODO + int start = not07?8:0; + // only SSE regs needs to be push back to xEmu (needs to be "write") + int n=0; + for (int i=start; i<16; i++) + if(dyn->e.ssecache[i].v!=-1) + ++n; + if(!n) + return; + MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n); + for (int i=start; i<16; ++i) + if(dyn->e.ssecache[i].v!=-1) { + if(dyn->e.ssecache[i].single) + FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + else + FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + } + MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n); } void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) { - //TODO + int start = not07?8:0; + // only SSE regs needs to be pop back from xEmu (don't need to be "write" this time) + int n=0; + for (int i=start; i<16; i++) + if(dyn->e.ssecache[i].v!=-1) + ++n; + if(!n) + return; + MESSAGE(LOG_DUMP, "\tPop XMM Cache (%d)------\n", n); + for (int i=start; i<16; ++i) + if(dyn->e.ssecache[i].v!=-1) { + if(dyn->e.ssecache[i].single) + FLW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + else + FLD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + } + MESSAGE(LOG_DUMP, "\t------- Pop XMM Cache (%d)\n", n); +} + +void 
fpu_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, int s3) +{ + x87_purgecache(dyn, ninst, next, s1, s2, s3); + mmx_purgecache(dyn, ninst, next, s1); + sse_purgecache(dyn, ninst, next, s1); + if(!next) + fpu_reset_reg(dyn); +} + +static int findCacheSlot(dynarec_rv64_t* dyn, int ninst, int t, int n, extcache_t* cache) +{ + ext_cache_t f; + f.n = n; f.t = t; + for(int i=0; i<24; ++i) { + if(cache->extcache[i].v == f.v) + return i; + if(cache->extcache[i].n == n) { + switch(cache->extcache[i].t) { + case EXT_CACHE_ST_F: + if (t==EXT_CACHE_ST_D) + return i; + break; + case EXT_CACHE_ST_D: + if (t==EXT_CACHE_ST_F) + return i; + break; + } + } + } + return -1; +} + +static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t *cache) +{ + if (i==j) + return; + int reg_i = EXTREG(i); + int reg_j = EXTREG(j); + int i_single = 0; + if(cache->extcache[i].t==EXT_CACHE_SS || cache->extcache[i].t==EXT_CACHE_ST_F) + i_single =1; + int j_single = 0; + if(cache->extcache[j].t==EXT_CACHE_SS || cache->extcache[j].t==EXT_CACHE_ST_F) + j_single =1; + + if(!cache->extcache[i].v) { + // a mov is enough, no need to swap + MESSAGE(LOG_DUMP, "\t - Moving %d <- %d\n", i, j); + if(j_single) { + FMVS(reg_i, reg_j); + } else { + FMVD(reg_i, reg_j); + } + cache->extcache[i].v = cache->extcache[j].v; + cache->extcache[j].v = 0; + return; + } + // SWAP + ext_cache_t tmp; + MESSAGE(LOG_DUMP, "\t - Swaping %d <-> %d\n", i, j); + // There is no VSWP in Arm64 NEON to swap 2 register contents! + // so use a scratch... + #define SCRATCH 0 + if(i_single) + FMVS(SCRATCH, reg_i); + else + FMVD(SCRATCH, reg_i); + if(j_single) + FMVS(reg_i, reg_j); + else + FMVD(reg_i, reg_j); + if(i_single) + FMVS(reg_j, SCRATCH); + else + FMVD(reg_j, SCRATCH); + #undef SCRATCH + tmp.v = cache->extcache[i].v; + cache->extcache[i].v = cache->extcache[j].v; + cache->extcache[j].v = tmp.v; +} + +static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, extcache_t *cache, int i, int t, int n) +{ + int reg = EXTREG(i); + if(cache->extcache[i].v) { + int single = 0; + if(t==EXT_CACHE_SS || t==EXT_CACHE_ST_F) + single = 1; + if(cache->extcache[i].t==EXT_CACHE_SS || cache->extcache[i].t==EXT_CACHE_ST_F) + single = 1; + int j = i+1; + while(cache->extcache[j].v) + ++j; + MESSAGE(LOG_DUMP, "\t - Moving away %d\n", i); + if(single) { + FMVS(EXTREG(j), reg); + } else { + FMVD(EXTREG(j), reg); + } + cache->extcache[j].v = cache->extcache[i].v; + } + switch(t) { + case EXT_CACHE_SS: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + FLW(reg, xEmu, offsetof(x64emu_t, xmm[n])); + break; + case EXT_CACHE_SD: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + FLD(reg, xEmu, offsetof(x64emu_t, xmm[n])); + break; + case EXT_CACHE_MM: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + FLD(reg, xEmu, offsetof(x64emu_t, mmx[i])); + break; + case EXT_CACHE_ST_D: + case EXT_CACHE_ST_F: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + if((*s3_top) == 0xffff) { + LW(s3, xEmu, offsetof(x64emu_t, top)); + *s3_top = 0; + } + int a = n - (*s3_top) - stack_cnt; + if(a) { + ADDI(s3, s3, a); + ANDI(s3, s3, 7); // (emu->top + i)&7 + } + *s3_top += a; + *s2_val = 0; + SLLI(s2, s3, 3); + ADD(s2, xEmu, s2); + FLD(reg, s2, offsetof(x64emu_t, x87)); + if(t==EXT_CACHE_ST_F) { + FCVTSD(reg, reg); + } + break; + case EXT_CACHE_NONE: + case EXT_CACHE_SCR: + default: /* nothing done */ + MESSAGE(LOG_DUMP, "\t - ignoring 
%s\n", getCacheName(t, n)); + break; + } + cache->extcache[i].n = n; + cache->extcache[i].t = t; +} + +static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, extcache_t *cache, int i, int t, int n) +{ + int reg = EXTREG(i); + switch(t) { + case EXT_CACHE_SS: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + FSW(reg, xEmu, offsetof(x64emu_t, xmm[n])); + break; + case EXT_CACHE_SD: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + FSD(reg, xEmu, offsetof(x64emu_t, xmm[n])); + break; + case EXT_CACHE_MM: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + FSD(reg, xEmu, offsetof(x64emu_t, mmx[n])); + break; + case EXT_CACHE_ST_D: + case EXT_CACHE_ST_F: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + if((*s3_top)==0xffff) { + LW(s3, xEmu, offsetof(x64emu_t, top)); + *s3_top = 0; + } + int a = n - (*s3_top) - stack_cnt; + if(a) { + ADDI(s3, s3, a); + ANDI(s3, s3, 7); + } + *s3_top += a; + SLLI(s2, s3, 3); + ADD(s2, xEmu, s2); + *s2_val = 0; + if(t==EXT_CACHE_ST_F) { + FCVTDS(reg, reg); + } + FSD(reg, s2, offsetof(x64emu_t, x87)); + break; + case EXT_CACHE_NONE: + case EXT_CACHE_SCR: + default: /* nothing done */ + MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); + break; + } + cache->extcache[i].v = 0; } static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3) { - //TODO +#if STEP > 1 + int i2 = dyn->insts[ninst].x64.jmp_insts; + if(i2<0) + return; + MESSAGE(LOG_DUMP, "\tCache Transform ---- ninst=%d -> %d\n", ninst, i2); + if((!i2) || (dyn->insts[i2].x64.barrier&BARRIER_FLOAT)) { + if(dyn->e.stack_next) { + fpu_purgecache(dyn, ninst, 1, s1, s2, s3); + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); + return; + } + for(int i=0; i<24; ++i) + if(dyn->e.extcache[i].v) { // there is something at ninst for i + fpu_purgecache(dyn, ninst, 1, s1, s2, s3); + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); + return; + } + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); + return; + } + extcache_t cache_i2 = dyn->insts[i2].e; + extcacheUnwind(&cache_i2); + + if(!cache_i2.stack) { + int purge = 1; + for (int i=0; i<24 && purge; ++i) + if(cache_i2.extcache[i].v) + purge = 0; + if(purge) { + fpu_purgecache(dyn, ninst, 1, s1, s2, s3); + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); + return; + } + } + int stack_cnt = dyn->e.stack_next; + int s3_top = 0xffff; + if(stack_cnt != cache_i2.stack) { + MESSAGE(LOG_DUMP, "\t - adjust stack count %d -> %d -\n", stack_cnt, cache_i2.stack); + int a = stack_cnt - cache_i2.stack; + // Add x87stack to emu fpu_stack + LWU(s3, xEmu, offsetof(x64emu_t, fpu_stack)); + ADDI(s3, s3, a); + SW(s3, xEmu, offsetof(x64emu_t, fpu_stack)); + // Sub x87stack to top, with and 7 + LWU(s3, xEmu, offsetof(x64emu_t, top)); + // update tags (and top at the same time) + if(a>0) { + // new tag to fulls + ADDI(s2, xZR, 0); + ADDI(s1, xEmu, offsetof(x64emu_t, p_regs)); + SLLI(s3, s3, 2); + for (int i=0; i<a; ++i) { + ADDI(s3, s3, -1<<2); + ANDI(s3, s3, 7<<2); + ADD(s3, s1, s3); + SW(s2, s3, 0); // that slot is full + SUB(s3, s3, s1); + } + SRLI(s3, s3, 2); + } else { + // empty tags + ADDI(s2, xZR, 0b11); + ADDI(s1, xEmu, offsetof(x64emu_t, p_regs)); + SLLI(s3, s3, 2); + for (int i=0; i<-a; ++i) { + ADD(s3, s1, s3); + SW(s2, s3, 0); // empty slot before leaving it + SUB(s3, s3, s1); + ADDI(s3, s3, 1<<2); + ANDI(s3, s3, 7<<2); // (emu->top + st)&7 + } + SRLI(s3, s3, 2); + } + SW(s3, xEmu, offsetof(x64emu_t, top)); + 
+
 static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3)
 {
-    //TODO
+#if STEP > 1
+    int i2 = dyn->insts[ninst].x64.jmp_insts;
+    if(i2<0)
+        return;
+    MESSAGE(LOG_DUMP, "\tCache Transform ---- ninst=%d -> %d\n", ninst, i2);
+    if((!i2) || (dyn->insts[i2].x64.barrier&BARRIER_FLOAT)) {
+        if(dyn->e.stack_next) {
+            fpu_purgecache(dyn, ninst, 1, s1, s2, s3);
+            MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+            return;
+        }
+        for(int i=0; i<24; ++i)
+            if(dyn->e.extcache[i].v) {  // there is something at ninst for i
+                fpu_purgecache(dyn, ninst, 1, s1, s2, s3);
+                MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+                return;
+            }
+        MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+        return;
+    }
+    extcache_t cache_i2 = dyn->insts[i2].e;
+    extcacheUnwind(&cache_i2);
+
+    if(!cache_i2.stack) {
+        int purge = 1;
+        for (int i=0; i<24 && purge; ++i)
+            if(cache_i2.extcache[i].v)
+                purge = 0;
+        if(purge) {
+            fpu_purgecache(dyn, ninst, 1, s1, s2, s3);
+            MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+            return;
+        }
+    }
+    int stack_cnt = dyn->e.stack_next;
+    int s3_top = 0xffff;
+    if(stack_cnt != cache_i2.stack) {
+        MESSAGE(LOG_DUMP, "\t - adjust stack count %d -> %d -\n", stack_cnt, cache_i2.stack);
+        int a = stack_cnt - cache_i2.stack;
+        // Add x87stack to emu fpu_stack
+        LWU(s3, xEmu, offsetof(x64emu_t, fpu_stack));
+        ADDI(s3, s3, a);
+        SW(s3, xEmu, offsetof(x64emu_t, fpu_stack));
+        // Subtract x87stack from top, masking with 7
+        LWU(s3, xEmu, offsetof(x64emu_t, top));
+        // update tags (and top at the same time)
+        if(a>0) {
+            // mark the new tags as full
+            ADDI(s2, xZR, 0);
+            ADDI(s1, xEmu, offsetof(x64emu_t, p_regs));
+            SLLI(s3, s3, 2);
+            for (int i=0; i<a; ++i) {
+                ADDI(s3, s3, -1<<2);
+                ANDI(s3, s3, 7<<2);
+                ADD(s3, s1, s3);
+                SW(s2, s3, 0);  // that slot is full
+                SUB(s3, s3, s1);
+            }
+            SRLI(s3, s3, 2);
+        } else {
+            // mark the tags as empty
+            ADDI(s2, xZR, 0b11);
+            ADDI(s1, xEmu, offsetof(x64emu_t, p_regs));
+            SLLI(s3, s3, 2);
+            for (int i=0; i<-a; ++i) {
+                ADD(s3, s1, s3);
+                SW(s2, s3, 0);  // empty the slot before leaving it
+                SUB(s3, s3, s1);
+                ADDI(s3, s3, 1<<2);
+                ANDI(s3, s3, 7<<2); // (emu->top + st)&7
+            }
+            SRLI(s3, s3, 2);
+        }
+        SW(s3, xEmu, offsetof(x64emu_t, top));
+        s3_top = 0;
+        stack_cnt = cache_i2.stack;
+    }
+    extcache_t cache = dyn->e;
+    int s1_val = 0;
+    int s2_val = 0;
+    // unload every unneeded cache entry
+    // check SSE first, then MMX, in order, for optimisation reasons
+    for(int i=0; i<16; ++i) {
+        int j=findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache);
+        if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache_i2)==-1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
+        j=findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache);
+        if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache_i2)==-1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
+    }
+    for(int i=0; i<8; ++i) {
+        int j=findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache);
+        if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache_i2)==-1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
+    }
+    for(int i=0; i<24; ++i) {
+        if(cache.extcache[i].v)
+            if(findCacheSlot(dyn, ninst, cache.extcache[i].t, cache.extcache[i].n, &cache_i2)==-1)
+                unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache.extcache[i].t, cache.extcache[i].n);
+    }
+    // and now load/swap the missing ones
+    for(int i=0; i<24; ++i) {
+        if(cache_i2.extcache[i].v) {
+            if(cache_i2.extcache[i].v != cache.extcache[i].v) {
+                int j;
+                if((j=findCacheSlot(dyn, ninst, cache_i2.extcache[i].t, cache_i2.extcache[i].n, &cache))==-1)
+                    loadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache_i2.extcache[i].t, cache_i2.extcache[i].n);
+                else {
+                    // it's here, let's swap if needed
+                    if(j!=i)
+                        swapCache(dyn, ninst, i, j, &cache);
+                }
+            }
+            if(cache.extcache[i].t != cache_i2.extcache[i].t) {
+                if(cache.extcache[i].t == EXT_CACHE_ST_D && cache_i2.extcache[i].t == EXT_CACHE_ST_F) {
+                    MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n));
+                    FCVTSD(EXTREG(i), EXTREG(i));
+                    cache.extcache[i].t = EXT_CACHE_ST_F;
+                } else if(cache.extcache[i].t == EXT_CACHE_ST_F && cache_i2.extcache[i].t == EXT_CACHE_ST_D) {
+                    MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n));
+                    FCVTDS(EXTREG(i), EXTREG(i));
+                    cache.extcache[i].t = EXT_CACHE_ST_D;
+                }
+            }
+        }
+    }
+    MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+#endif
 }
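fpuCacheTransform() is the jump-site reconciliation: it rewrites the register state this block ends with into the state the target block was compiled to expect, spilling first so registers are free, then loading or swapping into place, and finally converting ST entries whose precision differs (the FCVTSD/FCVTDS arms above). A stripped-down, runnable model of that control flow, with the ST precision matching and all emitter details left out:

    #include <stdio.h>

    typedef struct { int t, n, v; } slot_t;   // simplified ext cache entry

    static int find_slot(const slot_t c[24], slot_t f)
    {
        for (int i = 0; i < 24; ++i)
            if (c[i].v && c[i].t == f.t && c[i].n == f.n)
                return i;
        return -1;
    }

    // reconcile 'cur' (state at the jump) with 'want' (state at the target)
    static void transform(slot_t cur[24], const slot_t want[24])
    {
        for (int i = 0; i < 24; ++i)          // phase 1: spill unwanted entries
            if (cur[i].v && find_slot(want, cur[i]) == -1) {
                printf("unload slot %d\n", i);
                cur[i].v = 0;
            }
        for (int i = 0; i < 24; ++i) {        // phase 2: load or swap into place
            if (!want[i].v) continue;
            int j = find_slot(cur, want[i]);
            if (j == -1) {
                printf("load slot %d\n", i);
                cur[i] = want[i];
            } else if (j != i) {
                printf("swap %d <-> %d\n", i, j);
                slot_t t = cur[i]; cur[i] = cur[j]; cur[j] = t;
            }
            // phase 3 in the real code: FCVTSD/FCVTDS when only the ST width differs
        }
    }

    int main(void)
    {
        slot_t cur[24] = {0}, want[24] = {0};
        cur[0]  = (slot_t){4, 3, 1};          // xmm3 as single, in the wrong slot
        want[2] = (slot_t){4, 3, 1};          // target expects it in slot 2
        transform(cur, want);                 // prints: swap 2 <-> 0
        return 0;
    }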
 
 static void flagsCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1)
 {
@@ -669,6 +1642,25 @@ void rv64_move64(dynarec_rv64_t* dyn, int ninst, int reg, int64_t val)
     }
 }
 
+#ifdef HAVE_TRACE
+void fpu_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3)
+{
+    x87_reflectcache(dyn, ninst, s1, s2, s3);
+    if(trace_emm)
+        mmx_reflectcache(dyn, ninst, s1);
+    if(trace_xmm)
+        sse_reflectcache(dyn, ninst, s1);
+}
+#endif
+
+void fpu_reset(dynarec_rv64_t* dyn)
+{
+    x87_reset(dyn);
+    mmx_reset(dyn);
+    sse_reset(dyn);
+    fpu_reset_reg(dyn);
+}
+
 void emit_pf(dynarec_rv64_t* dyn, int ninst, int s1, int s3, int s4)
 {
     MAYUSE(dyn); MAYUSE(ninst);
@@ -685,3 +1677,64 @@ void emit_pf(dynarec_rv64_t* dyn, int ninst, int s1, int s3, int s4)
     BEQZ(s4, 8);
     ORI(xFlags, xFlags, 1 << F_PF);
 }
+
+void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n)
+{
+    MESSAGE(LOG_DEBUG, "Reset Caches with %d\n", reset_n);
+    #if STEP > 1
+    // for STEP 2 & 3, just need to refresh with the current state, and undo the changes (push & swap)
+    dyn->e = dyn->insts[ninst].e;
+    extcacheUnwind(&dyn->e);
+    #ifdef HAVE_TRACE
+    if(box64_dynarec_dump)
+        if(memcmp(&dyn->e, &dyn->insts[reset_n].e, sizeof(ext_cache_t))) {
+            MESSAGE(LOG_DEBUG, "Warning, difference in extcache: reset=");
+            for(int i=0; i<24; ++i)
+                if(dyn->insts[reset_n].e.extcache[i].v)
+                    MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[reset_n].e.extcache[i].t, dyn->insts[reset_n].e.extcache[i].n));
+            if(dyn->insts[reset_n].e.combined1 || dyn->insts[reset_n].e.combined2)
+                MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->insts[reset_n].e.swapped?"SWP":"CMB", dyn->insts[reset_n].e.combined1, dyn->insts[reset_n].e.combined2);
+            if(dyn->insts[reset_n].e.stack_push || dyn->insts[reset_n].e.stack_pop)
+                MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[reset_n].e.stack_push, -dyn->insts[reset_n].e.stack_pop);
+            MESSAGE(LOG_DEBUG, " ==> ");
+            for(int i=0; i<24; ++i)
+                if(dyn->insts[ninst].e.extcache[i].v)
+                    MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[ninst].e.extcache[i].t, dyn->insts[ninst].e.extcache[i].n));
+            if(dyn->insts[ninst].e.combined1 || dyn->insts[ninst].e.combined2)
+                MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->insts[ninst].e.swapped?"SWP":"CMB", dyn->insts[ninst].e.combined1, dyn->insts[ninst].e.combined2);
+            if(dyn->insts[ninst].e.stack_push || dyn->insts[ninst].e.stack_pop)
+                MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[ninst].e.stack_push, -dyn->insts[ninst].e.stack_pop);
+            MESSAGE(LOG_DEBUG, " -> ");
+            for(int i=0; i<24; ++i)
+                if(dyn->e.extcache[i].v)
+                    MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->e.extcache[i].t, dyn->e.extcache[i].n));
+            if(dyn->e.combined1 || dyn->e.combined2)
+                MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->e.swapped?"SWP":"CMB", dyn->e.combined1, dyn->e.combined2);
+            if(dyn->e.stack_push || dyn->e.stack_pop)
+                MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->e.stack_push, -dyn->e.stack_pop);
+            MESSAGE(LOG_DEBUG, "\n");
+        }
+    #endif //HAVE_TRACE
+    #else
+    dyn->e = dyn->insts[reset_n].e;
+    #endif
+}
+
+// propagate the ST stack state, especially the stack pops that are deferred
+void fpu_propagate_stack(dynarec_rv64_t* dyn, int ninst)
+{
+    if(dyn->e.stack_pop) {
+        for(int j=0; j<24; ++j)
+            if((dyn->e.extcache[j].t == EXT_CACHE_ST_D || dyn->e.extcache[j].t == EXT_CACHE_ST_F)) {
+                if(dyn->e.extcache[j].n<dyn->e.stack_pop)
+                    dyn->e.extcache[j].v = 0;
+                else
+                    dyn->e.extcache[j].n-=dyn->e.stack_pop;
+            }
+        dyn->e.stack_pop = 0;
+    }
+    dyn->e.stack = dyn->e.stack_next;
+    dyn->e.news = 0;
+    dyn->e.stack_push = 0;
+    dyn->e.swapped = 0;
+}
\ No newline at end of file
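fpu_propagate_stack() applies pops lazily: a deferred pop drops the ST entries below the pop count and renumbers the rest, which is what lets push/pop-heavy x87 sequences stay register-only between instructions. A toy model of the renumbering, under the same EXT_CACHE_ST_* convention:

    #include <assert.h>

    #define ST_D 1
    #define ST_F 2

    typedef struct { int t, n, v; } slot_t;

    // apply 'pop' deferred x87 pops to a 24-entry cache
    static void propagate(slot_t c[24], int pop)
    {
        for (int j = 0; j < 24; ++j)
            if (c[j].v && (c[j].t == ST_D || c[j].t == ST_F)) {
                if (c[j].n < pop)
                    c[j].v = 0;         // popped off the x87 stack
                else
                    c[j].n -= pop;      // ST(n) becomes ST(n - pop)
            }
    }

    int main(void)
    {
        slot_t c[24] = {0};
        c[0] = (slot_t){ST_D, 0, 1};    // ST0
        c[1] = (slot_t){ST_D, 1, 1};    // ST1
        propagate(c, 1);                // one deferred pop
        assert(c[0].v == 0);            // old ST0 is gone
        assert(c[1].v == 1 && c[1].n == 0); // old ST1 is the new ST0
        return 0;
    }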
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index d1c5dc2e..e8e2cf6c 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -469,6 +469,9 @@
 #ifndef TABLE64
 #define TABLE64(A, V)
 #endif
+#ifndef FTABLE64
+#define FTABLE64(A, V)
+#endif
 
 #define ARCH_INIT()
 
@@ -638,7 +641,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define x87_do_pop STEPNAME(x87_do_pop)
 #define x87_get_current_cache STEPNAME(x87_get_current_cache)
 #define x87_get_cache STEPNAME(x87_get_cache)
-#define x87_get_neoncache STEPNAME(x87_get_neoncache)
+#define x87_get_extcache STEPNAME(x87_get_extcache)
 #define x87_get_st STEPNAME(x87_get_st)
 #define x87_get_st_empty STEPNAME(x87_get_st)
 #define x87_refresh STEPNAME(x87_refresh)
@@ -654,6 +657,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define sse_get_reg STEPNAME(sse_get_reg)
 #define sse_get_reg_empty STEPNAME(sse_get_reg_empty)
 #define sse_forget_reg STEPNAME(sse_forget_reg)
+#define sse_purge07cache STEPNAME(sse_purge07cache)
 
 #define fpu_pushcache STEPNAME(fpu_pushcache)
 #define fpu_popcache STEPNAME(fpu_popcache)
@@ -663,6 +667,7 @@
 #define fpu_purgecache STEPNAME(fpu_purgecache)
 #define mmx_purgecache STEPNAME(mmx_purgecache)
 #define x87_purgecache STEPNAME(x87_purgecache)
+#define sse_purgecache STEPNAME(sse_purgecache)
 #ifdef HAVE_TRACE
 #define fpu_reflectcache STEPNAME(fpu_reflectcache)
 #endif
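Each helper gets one of these #define name STEPNAME(name) aliases because every dynarec translation unit is compiled once per pass (STEP 0 through 3), and each pass needs its own copy of every function. A minimal sketch of the idiom; the two-level paste (so STEP expands to its value before ##) is an assumption here, the real definition lives earlier in dynarec_rv64_helper.h:

    #include <stdio.h>

    #define STEP 2                        // in box64 this is set per compilation pass
    #define STEPNAME3(x, y) x##y
    #define STEPNAME2(x, y) STEPNAME3(x, y)
    #define STEPNAME(x)     STEPNAME2(x, STEP)

    #define fpu_purgecache STEPNAME(fpu_purgecache)

    void fpu_purgecache(void)             // actually defines fpu_purgecache2()
    {
        printf("purge, pass %d\n", STEP);
    }

    int main(void)
    {
        fpu_purgecache();                 // each pass links its own distinct symbol
        return 0;
    }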
@@ -766,37 +771,37 @@ void emit_pf(dynarec_rv64_t* dyn, int ninst, int s1, int s3, int s4);
 
 // x87 helper
 // cache of the local stack counter, to avoid an update at every call
-//void x87_stackcount(dynarec_rv64_t* dyn, int ninst, int scratch);
+void x87_stackcount(dynarec_rv64_t* dyn, int ninst, int scratch);
 // fpu push. Return the Dd value to be used
-//int x87_do_push(dynarec_rv64_t* dyn, int ninst, int s1, int t);
+int x87_do_push(dynarec_rv64_t* dyn, int ninst, int s1, int t);
 // fpu push. Do not allocate a cache register. Needs a scratch register to do x87stack synch (or 0 to not do it)
-//void x87_do_push_empty(dynarec_rv64_t* dyn, int ninst, int s1);
+void x87_do_push_empty(dynarec_rv64_t* dyn, int ninst, int s1);
 // fpu pop. All previous returned Dd should be considered invalid
-//void x87_do_pop(dynarec_rv64_t* dyn, int ninst, int s1);
+void x87_do_pop(dynarec_rv64_t* dyn, int ninst, int s1);
 // get cache index for a x87 reg, return -1 if cache doesn't exist
-//int x87_get_current_cache(dynarec_rv64_t* dyn, int ninst, int st, int t);
+int x87_get_current_cache(dynarec_rv64_t* dyn, int ninst, int st, int t);
 // get cache index for a x87 reg, create the entry if needed
-//int x87_get_cache(dynarec_rv64_t* dyn, int ninst, int populate, int s1, int s2, int a, int t);
-// get neoncache index for a x87 reg
-//int x87_get_neoncache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a);
+int x87_get_cache(dynarec_rv64_t* dyn, int ninst, int populate, int s1, int s2, int a, int t);
+// get extcache index for a x87 reg
+int x87_get_extcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a);
 // get vfpu register for a x87 reg, create the entry if needed
-//int x87_get_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t);
+int x87_get_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t);
 // get vfpu register for a x87 reg, create the entry if needed. Do not fetch the STx if not already in cache
-//int x87_get_st_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t);
+int x87_get_st_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t);
 // refresh a value from the cache ->emu (nothing done if value is not cached)
-//void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
+void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
 // refresh a value from the cache ->emu and then forget the cache (nothing done if value is not cached)
-//void x87_forget(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
+void x87_forget(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
 // refresh the cache value from emu
-//void x87_reget_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
+void x87_reget_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
 // swap 2 x87 regs
-//void x87_swapreg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int b);
+void x87_swapreg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int b);
 // Set rounding according to cw flags, return reg to restore flags
-//int x87_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3);
+int x87_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3);
 // Restore round flag
-//void x87_restoreround(dynarec_rv64_t* dyn, int ninst, int s1);
+void x87_restoreround(dynarec_rv64_t* dyn, int ninst, int s1);
 // Set rounding according to mxcsr flags, return reg to restore flags
-//int sse_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3);
+int sse_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3);
 
 void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3);
 
@@ -808,6 +813,39 @@ void rv64_move32(dynarec_rv64_t* dyn, int ninst, int reg, int32_t val, int zerou
 #else
 #define CHECK_CACHE()   (cacheupd = CacheNeedsTransform(dyn, ninst))
 #endif
+#define extcache_st_coherency STEPNAME(extcache_st_coherency)
+int extcache_st_coherency(dynarec_rv64_t* dyn, int ninst, int a, int b);
+
+#if STEP == 0
+#define ST_IS_F(A) 0
+#define X87_COMBINE(A, B) EXT_CACHE_ST_D
+#define X87_ST0 EXT_CACHE_ST_D
+#define X87_ST(A) EXT_CACHE_ST_D
+#elif STEP == 1
+#define ST_IS_F(A) (extcache_get_current_st(dyn, ninst, A)==EXT_CACHE_ST_F)
+#define X87_COMBINE(A, B) extcache_combine_st(dyn, ninst, A, B)
+#define X87_ST0 extcache_get_current_st(dyn, ninst, 0)
+#define X87_ST(A) extcache_get_current_st(dyn, ninst, A)
+#else
+#define ST_IS_F(A) (extcache_get_st(dyn, ninst, A)==EXT_CACHE_ST_F)
+#if STEP == 3
+#define X87_COMBINE(A, B) extcache_st_coherency(dyn, ninst, A, B)
+#else
+#define X87_COMBINE(A, B) extcache_get_st(dyn, ninst, A)
+#endif
+#define X87_ST0 extcache_get_st(dyn, ninst, 0)
+#define X87_ST(A) extcache_get_st(dyn, ninst, A)
+#endif
+
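The X87_* macros encode the pass split: STEP 0 pessimistically treats every ST as double, STEP 1 discovers and records each ST register's width, and STEPs 2/3 replay what pass 1 recorded (with STEP 3 additionally checking that two stack operands agree). The policy behind extcache_combine_st() is not shown in this hunk; a loudly-labeled assumption is that an operation over two ST entries can stay single precision only when both operands already are, roughly:

    #include <assert.h>

    #define EXT_CACHE_ST_D 1
    #define EXT_CACHE_ST_F 2

    // assumed model of the pass-1 combine decision, not the actual box64 code:
    // widen to double unless both operands are tracked as single
    static int combine_st(int t_a, int t_b)
    {
        return (t_a == EXT_CACHE_ST_F && t_b == EXT_CACHE_ST_F)
                   ? EXT_CACHE_ST_F
                   : EXT_CACHE_ST_D;
    }

    int main(void)
    {
        assert(combine_st(EXT_CACHE_ST_F, EXT_CACHE_ST_F) == EXT_CACHE_ST_F);
        assert(combine_st(EXT_CACHE_ST_F, EXT_CACHE_ST_D) == EXT_CACHE_ST_D);
        return 0;
    }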
+//SSE/SSE2 helpers
+// get an fpu register for a SSE reg, create the entry if needed
+int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
+// get an fpu register for a SSE reg, but don't try to synch it if it needs to be created
+int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
+// forget the fpu register for a SSE reg, create the entry if needed
+void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
+// purge the XMM0..XMM7 cache (before a function call)
+void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1);
 
 // common coproc helpers
 // reset the cache
@@ -828,6 +866,7 @@ void fpu_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3);
 void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07);
 void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07);
 
+
 uintptr_t dynarec64_00(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int seg, int* ok, int* need_epilog);
@@ -835,7 +874,7 @@ uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
 uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 //uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 //uintptr_t dynarec64_D8(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
-//uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 //uintptr_t dynarec64_DA(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 //uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 //uintptr_t dynarec64_DC(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
diff --git a/src/dynarec/rv64/dynarec_rv64_pass2.h b/src/dynarec/rv64/dynarec_rv64_pass2.h
index 408c3e97..176d512d 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass2.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass2.h
@@ -15,3 +15,4 @@
 #define INST_EPILOG dyn->insts[ninst].epilog = dyn->native_size;
 #define INST_NAME(name)
 #define TABLE64(A, V) {Table64(dyn, (V)); EMIT(0); EMIT(0);}
+#define FTABLE64(A, V) {mmx87_regs_t v = {.d = V}; Table64(dyn, v.q); EMIT(0); EMIT(0);}
\ No newline at end of file
diff --git a/src/dynarec/rv64/dynarec_rv64_pass3.h b/src/dynarec/rv64/dynarec_rv64_pass3.h
index e6aa268f..dac190cd 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass3.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass3.h
@@ -57,3 +57,4 @@
 }
 #define TABLE64(A, V) {int val64offset = Table64(dyn, (V)); MESSAGE(LOG_DUMP, " Table64: 0x%lx\n", (V)); AUIPC(A, SPLIT20(val64offset)); LD(A, A, SPLIT12(val64offset));}
+#define FTABLE64(A, V) {mmx87_regs_t v = {.d = V}; int val64offset = Table64(dyn, v.q); MESSAGE(LOG_DUMP, " FTable64: %g\n", v.d); AUIPC(x1, SPLIT20(val64offset)); FLD(A, x1, SPLIT12(val64offset));}
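FTABLE64 is the floating-point twin of TABLE64: the double's bit pattern goes through the same 64-bit constant table (via the mmx87_regs_t union), and pass 3 materializes it with an AUIPC plus an FLD relative to it. For that pair to reach the constant, the 20-bit upper immediate must absorb the sign of the low 12 bits; the check below assumes SPLIT20/SPLIT12 carry the usual hi/lo semantics:

    #include <assert.h>
    #include <stdint.h>

    // assumed semantics of the SPLIT20/SPLIT12 helpers used by TABLE64/FTABLE64
    static int32_t split20(int32_t off) { return (off + 0x800) >> 12; } // AUIPC immediate
    static int32_t split12(int32_t off)                                // signed low 12 bits
    {
        int32_t lo = off & 0xFFF;
        return lo >= 0x800 ? lo - 0x1000 : lo;
    }

    int main(void)
    {
        for (int32_t off = -100000; off <= 100000; off += 7)
            // AUIPC contributes split20(off) * 4096; the FLD adds the signed low part back
            assert(split20(off) * 4096 + split12(off) == off);
        return 0;
    }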
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index ac403464..c00325e1 100755
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -10,6 +10,52 @@ typedef struct instsize_s instsize_t;
 
 #define BARRIER_MAYBE 8
 
+#define EXT_CACHE_NONE 0
+#define EXT_CACHE_ST_D 1
+#define EXT_CACHE_ST_F 2
+#define EXT_CACHE_MM 3
+#define EXT_CACHE_SS 4
+#define EXT_CACHE_SD 5
+#define EXT_CACHE_SCR 6
+typedef union ext_cache_s {
+    int8_t v;
+    struct {
+        uint8_t t:4;    // reg type
+        uint8_t n:4;    // reg number
+    };
+} ext_cache_t;
+typedef union sse_cache_s {
+    int8_t v;
+    struct {
+        uint8_t reg:7;
+        uint8_t single:1;
+    };
+} sse_cache_t;
+typedef struct extcache_s {
+    // ext cache
+    ext_cache_t extcache[24];
+    int8_t stack;
+    int8_t stack_next;
+    int8_t stack_pop;
+    int8_t stack_push;
+    uint8_t combined1;
+    uint8_t combined2;
+    uint8_t swapped;    // the combined regs were swapped
+    uint8_t barrier;    // is there a barrier at the instruction epilog?
+    uint32_t news;      // bitmask of which ext cache entries are new for this opcode
+    // fpu cache
+    int8_t x87cache[8];     // cache status for the 8 x87 registers behind the fpu stack
+    int8_t x87reg[8];       // reg used for x87cache entry
+    int8_t mmxcache[8];     // cache status for the 8 MMX registers
+    sse_cache_t ssecache[16];   // cache status for the 16 SSE(2) registers
+    int8_t fpuused[24];     // all f10..f31 & f0..f1 double regs from the fpu, used by x87, sse and mmx
+    int8_t x87stack;        // cache stack counter
+    int8_t mmxcount;        // number of mmx registers used (not both mmx and x87 at the same time)
+    int8_t fpu_scratch;     // scratch counter
+    int8_t fpu_extra_qscratch; // some opcodes need an extra quad scratch register
+    int8_t fpu_reg;         // x87/sse/mmx reg counter
+} extcache_t;
+
 typedef struct flagcache_s {
     int pending;    // is there a pending flag here, or to check?
     int dfnone;     // if deferred flags are already set to df_none
@@ -32,6 +78,7 @@ typedef struct instruction_rv64_s {
     int retn;
     int barrier_maybe;
     flagcache_t f_exit;     // flags status at end of instruction
+    extcache_t e;           // extcache at end of instruction (but before popping)
     flagcache_t f_entry;    // flags status before the instruction begins
 } instruction_rv64_t;
 
@@ -50,6 +97,7 @@ typedef struct dynarec_rv64_s {
     int table64cap;
    uintptr_t tablestart;
     flagcache_t f;
+    extcache_t e;   // cache for the f10..f31 and f0..f1 fpu double regs, plus the x87 stack delta
     uintptr_t* next;    // variable array of "next" jump address
     int next_sz;
     int next_cap;
@@ -65,6 +113,11 @@ typedef struct dynarec_rv64_s {
     int forward_ninst;  // ninst at the forward point
 } dynarec_rv64_t;
 
+// convert a slot idx (0..23) to an fpu reg index (10..31, 0..1)
+#define EXTREG(A) (((A)+10)&31)
+// convert an fpu reg index (10..31, 0..1) back to a slot idx (0..23)
+#define EXTIDX(A) (((A)-10)&31)
+
 void add_next(dynarec_rv64_t *dyn, uintptr_t addr);
 uintptr_t get_closest_next(dynarec_rv64_t *dyn, uintptr_t addr);
 int is_nops(dynarec_rv64_t *dyn, uintptr_t addr, int n);
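EXTREG/EXTIDX pin the 24 cache slots onto FPU registers f10..f31 plus f0/f1 with plain mod-32 arithmetic: the mapping wraps past f31 and the two macros round-trip over the valid range, which a quick check confirms:

    #include <assert.h>

    #define EXTREG(A) (((A) + 10) & 31)  // slot 0..23 -> f10..f31, then f0, f1
    #define EXTIDX(A) (((A) - 10) & 31)  // register number back to slot index

    int main(void)
    {
        assert(EXTREG(0)  == 10);   // first slot is f10
        assert(EXTREG(21) == 31);   // ...up to f31
        assert(EXTREG(22) == 0);    // then it wraps to f0
        assert(EXTREG(23) == 1);    // and f1
        for (int i = 0; i < 24; ++i)
            assert(EXTIDX(EXTREG(i)) == i); // the two macros are inverses
        return 0;
    }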
diff --git a/src/dynarec/rv64/rv64_emitter.h b/src/dynarec/rv64/rv64_emitter.h
index ca8b0891..ab12fa55 100644
--- a/src/dynarec/rv64/rv64_emitter.h
+++ b/src/dynarec/rv64/rv64_emitter.h
@@ -305,6 +305,13 @@ f28–31 ft8–11 FP temporaries Caller
 // Shift Right Arithmetic Immediate
 #define SRAIxw(rd, rs1, imm) if (rex.w) { SRAI(rd, rs1, imm); } else { SRAIW(rd, rs1, imm); }
 
+#define CSRRW(rd, rs1, csr)  EMIT(I_type(csr, rs1, 0b001, rd, 0b1110011))
+#define CSRRS(rd, rs1, csr)  EMIT(I_type(csr, rs1, 0b010, rd, 0b1110011))
+#define CSRRC(rd, rs1, csr)  EMIT(I_type(csr, rs1, 0b011, rd, 0b1110011))
+#define CSRRWI(rd, imm, csr) EMIT(I_type(csr, imm, 0b101, rd, 0b1110011))
+#define CSRRSI(rd, imm, csr) EMIT(I_type(csr, imm, 0b110, rd, 0b1110011))
+#define CSRRCI(rd, imm, csr) EMIT(I_type(csr, imm, 0b111, rd, 0b1110011))
+
 // RV32M
 // rd =(lower) rs1 * rs2 (both signed)
 #define MUL(rd, rs1, rs2) EMIT(R_type(0b0000001, rs2, rs1, 0b000, rd, 0b0110011))
@@ -350,4 +357,58 @@ f28–31 ft8–11 FP temporaries Caller
 #define LRxw(rd, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010|rex.w, rd, 0b0101111))
 #define SCxw(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b010|rex.w, rd, 0b0101111))
 
+// RV32F
+// Read round mode
+#define FRRM(rd) CSRRS(rd, xZR, 0x002)
+// Swap round mode: write 0b111 to frm, old value to rd
+#define FSRM(rd) CSRRWI(rd, 0b111, 0x002)
+// load single precision from rs1+imm12 to frd
+#define FLW(frd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b010, frd, 0b0000111))
+// store single precision frs2 to rs1+imm12
+#define FSW(frs2, rs1, imm12) EMIT(S_type(imm12, frs2, rs1, 0b010, 0b0100111))
+// store rs1 with rs2 sign bit to rd
+#define FSGNJS(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b000, rd, 0b1010011))
+// move rs1 to rd
+#define FMVS(rd, rs1) FSGNJS(rd, rs1, rs1)
+// store rs1 with opposite rs2 sign bit to rd
+#define FSGNJNS(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b001, rd, 0b1010011))
+// -rs1 => rd
+#define FNEGS(rd, rs1) FSGNJNS(rd, rs1, rs1)
+// store rs1 with rs1^rs2 sign bit to rd
+#define FSGNJXS(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b010, rd, 0b1010011))
+// |rs1| => rd
+#define FABSS(rd, rs1) FSGNJXS(rd, rs1, rs1)
+// Move from Single
+#define FMVXW(rd, frs1) EMIT(R_type(0b1110000, 0b00000, frs1, 0b000, rd, 0b1010011))
+// Move to Single
+#define FMVWX(frd, rs1) EMIT(R_type(0b1111000, 0b00000, rs1, 0b000, frd, 0b1010011))
+
+// RV32D
+// load double precision from rs1+imm12 to frd
+#define FLD(frd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b011, frd, 0b0000111))
+// store double precision frs2 to rs1+imm12
+#define FSD(frs2, rs1, imm12) EMIT(S_type(imm12, frs2, rs1, 0b011, 0b0100111))
+// Convert Double frs1 to Single frd
+#define FCVTSD(frd, frs1) EMIT(R_type(0b0100000, 0b00001, frs1, 0b000, frd, 0b1010011))
+// Convert Single frs1 to Double frd
+#define FCVTDS(frd, frs1) EMIT(R_type(0b0100001, 0b00000, frs1, 0b000, frd, 0b1010011))
+// store rs1 with rs2 sign bit to rd
+#define FSGNJD(rd, rs1, rs2) EMIT(R_type(0b0010001, rs2, rs1, 0b000, rd, 0b1010011))
+// move rs1 to rd
+#define FMVD(rd, rs1) FSGNJD(rd, rs1, rs1)
+// store rs1 with opposite rs2 sign bit to rd
+#define FSGNJND(rd, rs1, rs2) EMIT(R_type(0b0010001, rs2, rs1, 0b001, rd, 0b1010011))
+// -rs1 => rd
+#define FNEGD(rd, rs1) FSGNJND(rd, rs1, rs1)
+// store rs1 with rs1^rs2 sign bit to rd
+#define FSGNJXD(rd, rs1, rs2) EMIT(R_type(0b0010001, rs2, rs1, 0b010, rd, 0b1010011))
+// |rs1| => rd
+#define FABSD(rd, rs1) FSGNJXD(rd, rs1, rs1)
+
+//RV64D
+// Move from Double
+#define FMVXD(rd, frs1) EMIT(R_type(0b1110001, 0b00000, frs1, 0b000, rd, 0b1010011))
+// Move to Double
+#define FMVDX(frd, rs1) EMIT(R_type(0b1111001, 0b00000, rs1, 0b000, frd, 0b1010011))
+
 #endif //__RV64_EMITTER_H__
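The move/negate/abs macros at the end are the standard RISC-V sign-injection idioms rather than dedicated instructions: FSGNJ(rd, rs, rs) copies a register, FSGNJN flips the sign bit, and FSGNJX clears it because x XOR x = 0. The same semantics at the bit level, as a plain C reference model:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    // FSGNJ.D semantics: rs1's magnitude with a sign bit derived from rs2
    static double fsgnj(double rs1, double rs2, int negate, int xor_signs)
    {
        uint64_t a, b;
        memcpy(&a, &rs1, 8);
        memcpy(&b, &rs2, 8);
        uint64_t sign = b & (1ULL << 63);
        if (negate)    sign ^= 1ULL << 63;          // FSGNJN
        if (xor_signs) sign ^= a & (1ULL << 63);    // FSGNJX
        a = (a & ~(1ULL << 63)) | sign;
        memcpy(&rs1, &a, 8);
        return rs1;
    }

    int main(void)
    {
        double x = -2.5;
        assert(fsgnj(x, x, 0, 0) == -2.5);  // FMVD:  same sign     -> move
        assert(fsgnj(x, x, 1, 0) ==  2.5);  // FNEGD: flipped sign  -> negate
        assert(fsgnj(x, x, 0, 1) ==  2.5);  // FABSD: sign^sign = 0 -> |x|
        return 0;
    }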