| author | ptitSeb <sebastien.chev@gmail.com> | 2024-05-21 13:34:31 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2024-05-21 13:34:31 +0200 |
| commit | 706ca3649e830bd52529096092496e358ee53085 (patch) | |
| tree | fd1594a45fb4823679e986d4831e366f22ce66b6 | |
| parent | bb537e4dd5fe07b08c7d2fe01fe7798869aa959e (diff) | |
[RV64_DYNAREC] Fixed a bunch of x87 opcodes
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_d9.c | 94 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_db.c | 46 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_df.c | 2 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.c | 40 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.h | 43 |

5 files changed, 179 insertions(+), 46 deletions(-)
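The largest hunk below replaces the `fpu_fxam` helper call with an inline classification of ST0. As a reading aid, here is a minimal C sketch of the semantics that inline code encodes, assuming the standard IEEE-754 double layout and the x87 status-word bit positions (C0 = bit 8, C1 = bit 9, C2 = bit 10, C3 = bit 14); the function and its parameters are illustrative, not box64 API:

```c
#include <stdint.h>
#include <string.h>

/* FXAM sets C3,C2,C0 to the class of ST0 and C1 to its sign:
 * empty 101, zero 100, denormal 110, normal 010, infinity 011, NaN 001. */
static uint16_t fxam_sw(double st0, int st0_empty, uint16_t sw)
{
    uint64_t bits;
    memcpy(&bits, &st0, sizeof(bits));            /* reinterpret the double */
    uint64_t exp  = (bits >> 52) & 0x7ff;         /* same as SRLI by 20+32  */
    uint64_t mant = bits & 0x000fffffffffffffull;
    uint16_t c;
    if (st0_empty)
        c = (1u << 14) | (1u << 8);                        /* 101 empty    */
    else if (exp == 0)
        c = mant ? (1u << 14) | (1u << 10)                 /* 110 denormal */
                 : (1u << 14);                             /* 100 zero     */
    else if (exp == 0x7ff)
        c = mant ? (1u << 8)                               /* 001 NaN      */
                 : (1u << 10) | (1u << 8);                 /* 011 infinity */
    else
        c = 1u << 10;                                      /* 010 normal   */
    c |= (uint16_t)((bits >> 63) << 9);                    /* C1 = sign    */
    return (uint16_t)((sw & ~0x4700u) | c);  /* clear C3..C0, set new bits */
}
```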
diff --git a/src/dynarec/rv64/dynarec_rv64_d9.c b/src/dynarec/rv64/dynarec_rv64_d9.c
index d5f2ad6d..ed75f0b5 100644
--- a/src/dynarec/rv64/dynarec_rv64_d9.c
+++ b/src/dynarec/rv64/dynarec_rv64_d9.c
@@ -128,9 +128,79 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             break;
         case 0xE5:
            INST_NAME("FXAM");
+            #if 1
+            i1 = x87_get_current_cache(dyn, ninst, 0, EXT_CACHE_ST_D);
+            // value put in x14
+            if(i1==-1) {
+                if(fpu_is_st_freed(dyn, ninst, 0)) {
+                    MOV32w(x4, 0b100000100000000);
+                    B_MARK3_nocond;
+                } else {
+                    // not in cache, so check Empty status and load it
+                    i2 = -dyn->e.x87stack;
+                    LWU(x3, xEmu, offsetof(x64emu_t, fpu_stack));
+                    if(i2) {
+                        ADDI(x3, x3, i2);
+                    }
+                    MOV32w(x4, 0b100000100000000);
+                    BGE_MARK3(xZR, x3);
+                    // x5 will be the actual top
+                    LWU(x5, xEmu, offsetof(x64emu_t, top));
+                    if(i2) {
+                        ADDI(x5, x5, i2);
+                        ANDI(x5, x5, 7); // (emu->top + i)&7
+                    }
+                    // load tag
+                    LHU(x3, xEmu, offsetof(x64emu_t, fpu_tags));
+                    MOV32w(x4, 0b100000100000000);
+                    ANDI(x2, x3, 0b11);
+                    BNEZ_MARK3(x2); // empty: C3,C2,C0 = 101
+                    // load x2 with ST0 anyway, for sign extraction
+                    if(rv64_zba) SH3ADD(x1, x2, xEmu); else {SLLI(x2, x2, 3); ADD(x1, xEmu, x2);}
+                    LD(x2, x1, offsetof(x64emu_t, x87));
+                }
+            } else {
+                // simply move from cache reg to x2
+                v1 = dyn->e.x87reg[i1];
+                FMVXD(x2, v1);
+            }
+            // get exponent in x1
+            SRLI(x1, x2, 20+32);
+            ANDI(x1, x1, 0x7ff); // 0x7ff
+            BNEZ_MARK(x1); // not zero or denormal
+            MOV64x(x3, 0x7fffffffffffffff);
+            AND(x1, x2, x3);
+            MOV32w(x4, 0b100000000000000); // Zero: C3,C2,C0 = 100
+            BEQZ_MARK3(x1);
+            MOV32w(x4, 0b100010000000000); // Denormal: C3,C2,C0 = 110
+            B_MARK3_nocond;
+            MARK;
+            ADDI(x3, xZR, 0x7ff); // infinite/NaN?
+            MOV32w(x4, 0b000010000000000); // normal: C3,C2,C0 = 010
+            BNE_MARK3(x1, x3);
+            SLLI(x3, x2, 12);
+            SRLI(x3, x3, 12); // and 0x000fffffffffffff
+            MOV32w(x4, 0b000010100000000); // infinity: C3,C2,C0 = 011
+            BEQZ_MARK3(x3);
+            MOV32w(x4, 0b000000100000000); // NaN: C3,C2,C0 = 001
+            MARK3;
+            // Extract sign & update SW
+            SRLI(x1, x2, 63);
+            ANDI(x4, x4, ~(1<<9));
+            SLLI(x1, x1, 9);
+            OR(x4, x4, x1); // C1
+            LHU(x1, xEmu, offsetof(x64emu_t, sw));
+            MOV32w(x2, ~0b0100011100000000);
+            AND(x1, x1, x2);
+            OR(x4, x4, x1);
+            SH(x4, xEmu, offsetof(x64emu_t, sw));
+            #else
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             x87_refresh(dyn, ninst, x1, x2, 0);
+            s0 = x87_stackcount(dyn, ninst, x1);
             CALL(fpu_fxam, -1); // should be possible inline, but is it worth it?
+            x87_unstackcount(dyn, ninst, x1, s0);
+            #endif
             break;
         case 0xE8:
@@ -183,21 +253,27 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("F2XM1");
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             x87_forget(dyn, ninst, x1, x2, 0);
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_f2xm1, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             break;
         case 0xF1:
             INST_NAME("FYL2X");
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             x87_forget(dyn, ninst, x1, x2, 0);
             x87_forget(dyn, ninst, x1, x2, 1);
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_fyl2x, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             X87_POP_OR_FAIL(dyn, ninst, x3);
             break;
         case 0xF2:
             INST_NAME("FPTAN");
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             x87_forget(dyn, ninst, x1, x2, 0);
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_ftan, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, EXT_CACHE_ST_F);
             if(ST_IS_F(0)) {
                 MOV32w(x1, 0x3f800000);
@@ -212,7 +288,9 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             x87_forget(dyn, ninst, x1, x2, 0);
             x87_forget(dyn, ninst, x1, x2, 1);
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_fpatan, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             X87_POP_OR_FAIL(dyn, ninst, x3);
             break;
         case 0xF4:
@@ -220,14 +298,18 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, 0);
             x87_forget(dyn, ninst, x1, x2, 1);
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_fxtract, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             break;
         case 0xF5:
             INST_NAME("FPREM1");
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             x87_forget(dyn, ninst, x1, x2, 0);
             x87_forget(dyn, ninst, x1, x2, 1);
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_fprem1, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             break;
         case 0xF6:
             INST_NAME("FDECSTP");
@@ -250,14 +332,18 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             x87_forget(dyn, ninst, x1, x2, 0);
             x87_forget(dyn, ninst, x1, x2, 1);
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_fprem, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             break;
         case 0xF9:
             INST_NAME("FYL2XP1");
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             x87_forget(dyn, ninst, x1, x2, 0);
             x87_forget(dyn, ninst, x1, x2, 1);
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_fyl2xp1, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             X87_POP_OR_FAIL(dyn, ninst, x3);
             break;
         case 0xFA:
@@ -274,7 +360,9 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, 0);
             x87_forget(dyn, ninst, x1, x2, 1);
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_fsincos, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             break;
         case 0xFC:
             INST_NAME("FRNDINT");
@@ -321,19 +409,25 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             x87_forget(dyn, ninst, x1, x2, 0);
             x87_forget(dyn, ninst, x1, x2, 1);
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_fscale, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             break;
         case 0xFE:
             INST_NAME("FSIN");
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             x87_forget(dyn, ninst, x1, x2, 0);
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_fsin, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             break;
         case 0xFF:
             INST_NAME("FCOS");
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             x87_forget(dyn, ninst, x1, x2, 0);
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_fcos, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             break;
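Every `CALL(native_*)` above is now bracketed by an `x87_stackcount`/`x87_unstackcount` pair (defined in the `dynarec_rv64_helper.c` hunk further down). The dynarec batches pending x87 pushes and pops in `dyn->e.x87stack` rather than updating `emu->top`/`emu->fpu_stack` on every instruction, and a native helper called at runtime only sees the emu fields, so the pending delta has to be flushed before the call and re-applied afterwards. A self-contained C model of that contract, with illustrative names and deliberately simplified bookkeeping (the real helpers emit RISC-V loads/stores and also track `dyn->e.stack`/`stack_next`):

```c
#include <stdio.h>

typedef struct { int top, fpu_stack; } emu_t;   /* stand-in for x64emu_t */
typedef struct { int x87stack; } dynstate_t;    /* stand-in for dyn->e   */

/* Flush the deferred push/pop delta into the emu state; return it. */
static int stackcount(dynstate_t* d, emu_t* e)
{
    int n = d->x87stack;
    e->fpu_stack += n;
    e->top = (e->top - n) & 7;   /* top moves down on push, mod-8 ring */
    d->x87stack = 0;
    return n;
}

/* Re-defer the delta after the native call, reversing stackcount. */
static void unstackcount(dynstate_t* d, emu_t* e, int n)
{
    e->fpu_stack -= n;
    e->top = (e->top + n) & 7;
    d->x87stack = n;
}

int main(void)
{
    emu_t e = { .top = 0, .fpu_stack = 0 };
    dynstate_t d = { .x87stack = 2 };   /* two pushes not yet written back  */
    int s0 = stackcount(&d, &e);        /* emu coherent: top=6, fpu_stack=2 */
    /* ... the native helper (e.g. native_fsin) would run here ... */
    unstackcount(&d, &e, s0);           /* deferred state restored */
    printf("top=%d fpu_stack=%d pending=%d\n", e.top, e.fpu_stack, d.x87stack);
}
```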
diff --git a/src/dynarec/rv64/dynarec_rv64_db.c b/src/dynarec/rv64/dynarec_rv64_db.c
index a647ee11..80e99666 100644
--- a/src/dynarec/rv64/dynarec_rv64_db.c
+++ b/src/dynarec/rv64/dynarec_rv64_db.c
@@ -150,44 +150,14 @@ uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0xEF:
             INST_NAME("FUCOMI ST0, STx");
             SETFLAGS(X_ALL, SF_SET);
-            SET_DFNONE();
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
-            IFX(X_ZF | X_PF | X_CF) {
-                if(ST_IS_F(0)) {
-                    FEQS(x5, v1, v1);
-                    FEQS(x4, v2, v2);
-                    AND(x5, x5, x4);
-                    BEQZ(x5, 24); // undefined/NaN
-                    FEQS(x5, v1, v2);
-                    BNEZ(x5, 24); // equal
-                    FLTS(x3, v1, v2); // x3 = (v1<v2)?1:0
-                    OR(xFlags, xFlags, x3); // CF is the least significant bit
-                    J(16); // end
-                    // NaN
-                    ORI(xFlags, xFlags, (1<<F_ZF) | (1<<F_PF) | (1<<F_CF));
-                    J(8); // end
-                    // equal
-                    ORI(xFlags, xFlags, 1<<F_ZF);
-                    // end
-                } else {
-                    FEQD(x5, v1, v1);
-                    FEQD(x4, v2, v2);
-                    AND(x5, x5, x4);
-                    BEQZ(x5, 24); // undefined/NaN
-                    FEQD(x5, v1, v2);
-                    BNEZ(x5, 24); // equal
-                    FLTD(x3, v1, v2); // x3 = (v1<v2)?1:0
-                    OR(xFlags, xFlags, x3); // CF is the least significant bit
-                    J(16); // end
-                    // NaN
-                    ORI(xFlags, xFlags, (1<<F_ZF) | (1<<F_PF) | (1<<F_CF));
-                    J(8); // end
-                    // equal
-                    ORI(xFlags, xFlags, 1<<F_ZF);
-                    // end
-                }
+            if (ST_IS_F(0)) {
+                FCOMIS(v1, v2, x1, x2, x3, x4, x5);
+            } else {
+                FCOMID(v1, v2, x1, x2, x3, x4, x5);
             }
+
             break;
         case 0xF0:
         case 0xF1:
@@ -202,9 +172,9 @@ uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
             v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
             if (ST_IS_F(0)) {
-                FCOMS(v1, v2, x1, x2, x3, x4, x5);
+                FCOMIS(v1, v2, x1, x2, x3, x4, x5);
             } else {
-                FCOMS(v1, v2, x1, x2, x3, x4, x5);
+                FCOMID(v1, v2, x1, x2, x3, x4, x5);
             }
             break;
@@ -312,7 +282,9 @@ uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             if(ed!=x1) {
                 MV(x1, ed);
             }
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(native_fstp, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
         }
         X87_POP_OR_FAIL(dyn, ninst, x3);
         break;
diff --git a/src/dynarec/rv64/dynarec_rv64_df.c b/src/dynarec/rv64/dynarec_rv64_df.c
index 2a2884c5..c0bc6ae8 100644
--- a/src/dynarec/rv64/dynarec_rv64_df.c
+++ b/src/dynarec/rv64/dynarec_rv64_df.c
@@ -212,7 +212,9 @@ uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             x87_forget(dyn, ninst, x1, x2, 0);
             addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
             if (ed != x1) { MV(x1, ed); }
+            s0 = x87_stackcount(dyn, ninst, x3);
             CALL(fpu_fbst, -1);
+            x87_unstackcount(dyn, ninst, x3, s0);
             X87_POP_OR_FAIL(dyn, ninst, x3);
             break;
         case 7:
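The FUCOMI/FCOMI cases now share the `FCOMIS`/`FCOMID` macros (added in `dynarec_rv64_helper.h` below) instead of two hand-unrolled branch ladders; this also fixes FCOMI, which previously emitted the status-word compare `FCOMS` for both the float and the double paths. For reference, the x86 flag mapping those macros implement, as a small C sketch (illustrative, not box64 code):

```c
#include <stdint.h>
#include <math.h>

/* FCOMI/FUCOMI: ZF,PF,CF <- 111 unordered, 100 equal, 001 less,
 * 000 greater; OF/SF/AF are cleared (the macros clear them too). */
static uint32_t fcomi_eflags(double a, double b, uint32_t eflags)
{
    const uint32_t CF = 1u << 0, PF = 1u << 2, ZF = 1u << 6;
    eflags &= ~(CF | PF | ZF);
    if (isnan(a) || isnan(b))
        eflags |= ZF | PF | CF;   /* unordered, matches the 0b01000101 mask */
    else if (a == b)
        eflags |= ZF;             /* matches ADDI(s1, xZR, 0b01000000) */
    else if (a < b)
        eflags |= CF;
    return eflags;                /* greater: nothing set */
}
```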
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c
index 7aa51a74..c87cd4f7 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.c
+++ b/src/dynarec/rv64/dynarec_rv64_helper.c
@@ -827,11 +827,11 @@ void grab_segdata(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, int reg, int s
     MESSAGE(LOG_DUMP, "----%s Offset\n", (segment==_FS)?"FS":"GS");
 }

-void x87_stackcount(dynarec_rv64_t* dyn, int ninst, int scratch)
+int x87_stackcount(dynarec_rv64_t* dyn, int ninst, int scratch)
 {
     MAYUSE(scratch);
     if(!dyn->e.x87stack)
-        return;
+        return 0;
     if(dyn->e.mmxcount)
         mmx_purgecache(dyn, ninst, 0, scratch);
     MESSAGE(LOG_DUMP, "\tSynch x87 Stackcount (%d)\n", dyn->e.x87stack);
@@ -848,10 +848,35 @@ void x87_stackcount(dynarec_rv64_t* dyn, int ninst, int scratch)
     // reset x87stack, but not the stack count of extcache
     dyn->e.x87stack = 0;
     dyn->e.stack_next -= dyn->e.stack;
+    int ret = dyn->e.stack;
     dyn->e.stack = 0;
     MESSAGE(LOG_DUMP, "\t------x87 Stackcount\n");
+    return ret;
+}
+void x87_unstackcount(dynarec_rv64_t* dyn, int ninst, int scratch, int count)
+{
+    MAYUSE(scratch);
+    if(!count)
+        return;
+    if(dyn->e.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, scratch);
+    MESSAGE(LOG_DUMP, "\tSynch x87 Unstackcount (%d)\n", count);
+    int a = -count;
+    // Add x87stack to emu fpu_stack
+    LW(scratch, xEmu, offsetof(x64emu_t, fpu_stack));
+    ADDI(scratch, scratch, a);
+    SW(scratch, xEmu, offsetof(x64emu_t, fpu_stack));
+    // Subtract x87stack from top, AND with 7
+    LW(scratch, xEmu, offsetof(x64emu_t, top));
+    SUBI(scratch, scratch, a);
+    ANDI(scratch, scratch, 7);
+    SW(scratch, xEmu, offsetof(x64emu_t, top));
+    // reset x87stack, but not the stack count of extcache
+    dyn->e.x87stack = count;
+    dyn->e.stack = count;
+    dyn->e.stack_next += dyn->e.stack;
+    MESSAGE(LOG_DUMP, "\t------x87 Unstackcount\n");
 }
-
 int extcache_st_coherency(dynarec_rv64_t* dyn, int ninst, int a, int b)
 {
     int i1 = extcache_get_st(dyn, ninst, a);
@@ -1217,7 +1242,6 @@ int x87_get_st_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int

 void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)
 {
-    x87_stackcount(dyn, ninst, s1);
     int ret = -1;
     for (int i=0; (i<8) && (ret==-1); ++i)
         if(dyn->e.x87cache[i] == st)
@@ -1230,11 +1254,12 @@ void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)
     // Get top
     LW(s2, xEmu, offsetof(x64emu_t, top));
     // Update
-    if(st) {
-        ADDI(s2, s2, st);
+    int a = st - dyn->e.x87stack;
+    if(a) {
+        ADDI(s2, s2, a);
         ANDI(s2, s2, 7); // (emu->top + i)&7
     }
-    ADD(s1, xEmu, s2);
+    if(rv64_zba) SH3ADD(s1, s2, xEmu); else {SLLI(s2, s2, 3); ADD(s1, xEmu, s2);}
     if (dyn->e.extcache[EXTIDX(reg)].t == EXT_CACHE_ST_F) {
         FCVTDS(SCRATCH0, reg);
         FSD(SCRATCH0, s1, offsetof(x64emu_t, x87));
@@ -1250,7 +1275,6 @@ void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)

 void x87_forget(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)
 {
-    x87_stackcount(dyn, ninst, s1);
     int ret = -1;
     for (int i=0; (i<8) && (ret==-1); ++i)
         if(dyn->e.x87cache[i] == st)
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 1731de5a..6d7f63b1 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -717,8 +717,12 @@
 #define BEQ_MARK3(reg1, reg2) Bxx_gen(EQ, MARK3, reg1, reg2)
 // Branch to MARK3 if reg1!=reg2 (use j64)
 #define BNE_MARK3(reg1, reg2) Bxx_gen(NE, MARK3, reg1, reg2)
+// Branch to MARK3 if reg1>=reg2 (use j64)
+#define BGE_MARK3(reg1, reg2) Bxx_gen(GE, MARK3, reg1, reg2)
 // Branch to MARK3 if reg1!=0 (use j64)
 #define BNEZ_MARK3(reg) BNE_MARK3(reg, xZR)
+// Branch to MARK3 if reg1==0 (use j64)
+#define BEQZ_MARK3(reg) BEQ_MARK3(reg, xZR)
 // Branch to MARK3 instruction unconditional (use j64)
 #define B_MARK3_nocond Bxx_gen(__, MARK3, 0, 0)
 // Branch to MARKLOCK if reg1!=reg2 (use j64)
@@ -1199,6 +1203,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define x87_forget STEPNAME(x87_forget)
 #define x87_reget_st STEPNAME(x87_reget_st)
 #define x87_stackcount STEPNAME(x87_stackcount)
+#define x87_unstackcount STEPNAME(x87_unstackcount)
 #define x87_swapreg STEPNAME(x87_swapreg)
 #define x87_setround STEPNAME(x87_setround)
 #define x87_restoreround STEPNAME(x87_restoreround)
@@ -1340,7 +1345,9 @@ void emit_pf(dynarec_rv64_t* dyn, int ninst, int s1, int s3, int s4);

 // x87 helper
 // cache of the local stack counter, to avoid update at every call
-void x87_stackcount(dynarec_rv64_t* dyn, int ninst, int scratch);
+int x87_stackcount(dynarec_rv64_t* dyn, int ninst, int scratch);
+// restore local stack counter
+void x87_unstackcount(dynarec_rv64_t* dyn, int ninst, int scratch, int count);
 // fpu push. Return the Dd value to be used
 int x87_do_push(dynarec_rv64_t* dyn, int ninst, int s1, int t);
 // fpu push. Do not allocate a cache register. Needs a scratch register to do x87stack synch (or 0 to not do it)
@@ -1625,6 +1632,40 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
 #define FCOMS(v1, v2, s1, s2, s3, s4, s5) FCOM(S, v1, v2, s1, s2, s3, s4, s5)
 #define FCOMD(v1, v2, s1, s2, s3, s4, s5) FCOM(D, v1, v2, s1, s2, s3, s4, s5)

+#define FCOMI(w, v1, v2, s1, s2, s3, s4, s5)               \
+    IFX(X_OF|X_AF|X_SF|X_PEND) {                           \
+        MOV64x(s2, ~((1<<F_OF2)|(1<<F_AF)|(1<<F_SF)));     \
+        AND(xFlags, xFlags, s2);                           \
+    }                                                      \
+    IFX(X_CF|X_PF|X_ZF|X_PEND) {                           \
+        MOV32w(s2, 0b01000101);                            \
+        if(rv64_zbb) {                                     \
+            ANDN(xFlags, xFlags, s2);                      \
+        } else {                                           \
+            NOT(s3, s2);                                   \
+            AND(xFlags, xFlags, s3);                       \
+        }                                                  \
+        FEQ##w(s5, v1, v1);                                \
+        FEQ##w(s4, v2, v2);                                \
+        AND(s5, s5, s4);                                   \
+        BEQZ(s5, 5*4); /* undefined/NaN */                 \
+        FEQ##w(s5, v1, v2);                                \
+        BNEZ(s5, 5*4); /* equal */                         \
+        FLT##w(s1, v1, v2); /* s1 = (v1<v2)?1:0 */         \
+        J(4*4); /* end */                                  \
+        /* undefined/NaN */                                \
+        MV(s1, s2);                                        \
+        J(2*4); /* end */                                  \
+        /* equal */                                        \
+        ADDI(s1, xZR, 0b01000000);                         \
+        /* end */                                          \
+        OR(xFlags, xFlags, s1);                            \
+    }                                                      \
+    SET_DFNONE()
+
+#define FCOMIS(v1, v2, s1, s2, s3, s4, s5) FCOMI(S, v1, v2, s1, s2, s3, s4, s5)
+#define FCOMID(v1, v2, s1, s2, s3, s4, s5) FCOMI(D, v1, v2, s1, s2, s3, s4, s5)
+
 // reg = (reg < -32768) ? -32768 : ((reg > 32767) ? 32767 : reg)
 #define SAT16(reg, s) \
     LUI(s, 0xFFFF8); /* -32768 */ \
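Both the FXAM inline and the `x87_refresh` hunk take the Zba `SH3ADD` fast path when available. `SH3ADD rd, rs1, rs2` computes `rs2 + (rs1 << 3)`, folding the ×8 scaling of an index into the 8-byte-per-entry x87 register array into a single instruction. A tiny equivalence sketch (illustrative helper, not box64 code):

```c
#include <stdint.h>

/* With Zba:    SH3ADD(s1, s2, xEmu)                 -> one instruction.
 * Without Zba: SLLI(s2, s2, 3); ADD(s1, xEmu, s2)   -> two instructions. */
static inline uintptr_t sh3add(uintptr_t index, uintptr_t base)
{
    return base + (index << 3);  /* e.g. &emu->x87[index], sizeof(double)==8 */
}
```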