Diffstat (limited to 'src/dynarec')
-rw-r--r--  src/dynarec/dynarec_arch.h                 |   2
-rw-r--r--  src/dynarec/la64/dynarec_la64_00.c         |  29
-rw-r--r--  src/dynarec/la64/dynarec_la64_d8.c         | 276
-rw-r--r--  src/dynarec/la64/dynarec_la64_d9.c         | 516
-rw-r--r--  src/dynarec/la64/dynarec_la64_da.c         | 210
-rw-r--r--  src/dynarec/la64/dynarec_la64_db.c         | 256
-rw-r--r--  src/dynarec/la64/dynarec_la64_dc.c         | 227
-rw-r--r--  src/dynarec/la64/dynarec_la64_dd.c         | 192
-rw-r--r--  src/dynarec/la64/dynarec_la64_de.c         | 158
-rw-r--r--  src/dynarec/la64/dynarec_la64_df.c         | 294
-rw-r--r--  src/dynarec/la64/dynarec_la64_functions.c  | 237
-rw-r--r--  src/dynarec/la64/dynarec_la64_functions.h  |  23
-rw-r--r--  src/dynarec/la64/dynarec_la64_helper.c     | 739
-rw-r--r--  src/dynarec/la64/dynarec_la64_helper.h     | 226
-rw-r--r--  src/dynarec/la64/dynarec_la64_pass0.h      |   4
-rw-r--r--  src/dynarec/la64/dynarec_la64_pass1.h      |   7
-rw-r--r--  src/dynarec/la64/dynarec_la64_pass2.h      |   7
-rw-r--r--  src/dynarec/la64/dynarec_la64_pass3.h      |   8
-rw-r--r--  src/dynarec/la64/dynarec_la64_private.h    |   1
-rw-r--r--  src/dynarec/la64/la64_mapping.h            |   4
20 files changed, 3384 insertions(+), 32 deletions(-)
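A note on the shared structure before the patch body: every dynarec64_D8 through dynarec64_DF handler added below decodes the ModRM byte that follows the x87 escape opcode the same way. When mod == 3 (the MODREG case), the whole byte selects a register-register operation and its low three bits select STx; otherwise the reg field (bits 5:3) selects the memory-operand form. A minimal, hypothetical C sketch of that dispatch follows; the names are illustrative, not taken from the patch:

    #include <stdint.h>

    static int modrm_is_reg(uint8_t modrm) { return (modrm >> 6) == 3; }

    /* Sketch of the D8 decode shape; the real handlers emit LA64 code
     * in each case instead of executing the operation directly. */
    void decode_d8(uint8_t nextop)
    {
        if (modrm_is_reg(nextop)) {
            int stx = nextop & 7;          /* which x87 stack slot */
            switch (nextop & 0xF8) {       /* high bits pick the op */
                case 0xC0: /* FADD ST0, STx */ break;
                case 0xC8: /* FMUL ST0, STx */ break;
                /* ... 0xD0 FCOM, 0xD8 FCOMP, 0xE0 FSUB, 0xE8 FSUBR,
                       0xF0 FDIV, 0xF8 FDIVR ... */
            }
            (void)stx;
        } else {
            switch ((nextop >> 3) & 7) {   /* reg field picks the op */
                case 0: /* FADD ST0, float[mem] */ break;
                /* ... 1 FMUL, 2 FCOM, 3 FCOMP, 4 FSUB, 5 FSUBR,
                       6 FDIV, 7 FDIVR ... */
            }
        }
    }

The same shape repeats across D8 through DF; only the operation tables and operand widths differ.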
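Similarly, the FNSTSW handlers added in dynarec_la64_dd.c (DD /7, store to memory) and dynarec_la64_df.c (DF E0, store to AX) both rebuild the x87 status word by masking out the TOP field and injecting the current stack top. A hedged sketch of what the emitted AND/SLLI/OR sequence computes, assuming the standard x87 layout with TOP at bits 13:11:

    #include <stdint.h>

    /* Sketch only: equivalent of the inline status-word update. */
    uint16_t x87_compose_sw(uint16_t sw, unsigned top)
    {
        sw &= (uint16_t)~0x3800u;            /* clear TOP (bits 13:11) */
        sw |= (uint16_t)((top & 7u) << 11);  /* inject current top-of-stack */
        return sw;                           /* C3..C0, exception bits kept */
    }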
diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h
index 6df0b53c..b57d7771 100644
--- a/src/dynarec/dynarec_arch.h
+++ b/src/dynarec/dynarec_arch.h
@@ -68,8 +68,6 @@ extern void* create_updateflags();
 #define STOP_NATIVE_FLAGS(A, B) {}
 #define ARCH_UNALIGNED(A, B) 0
 
-// NYI
-#define NATIVE_RESTORE_X87PC()
 
 #elif defined(RV64)
 #define instruction_native_t instruction_rv64_t
diff --git a/src/dynarec/la64/dynarec_la64_00.c b/src/dynarec/la64/dynarec_la64_00.c
index 35dd0005..74a543b5 100644
--- a/src/dynarec/la64/dynarec_la64_00.c
+++ b/src/dynarec/la64/dynarec_la64_00.c
@@ -1234,6 +1234,9 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 BSTRPICK_D(xRDX, xRDX, 31, 0);
             }
             break;
+        case 0x9B:
+            INST_NAME("FWAIT");
+            break;
         case 0x9C:
             INST_NAME("PUSHF");
             READFLAGS(X_ALL);
@@ -2294,6 +2297,32 @@
             }
             break;
 
+        case 0xD8:
+            addr = dynarec64_D8(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xD9:
+            addr = dynarec64_D9(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xDA:
+            addr = dynarec64_DA(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xDB:
+            addr = dynarec64_DB(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xDC:
+            addr = dynarec64_DC(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xDD:
+            addr = dynarec64_DD(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+
+        case 0xDE:
+            addr = dynarec64_DE(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xDF:
+            addr = dynarec64_DF(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+
 #define GO(Z) \
     BARRIER(BARRIER_MAYBE); \
     JUMP(addr + i8, 1); \
diff --git a/src/dynarec/la64/dynarec_la64_d8.c b/src/dynarec/la64/dynarec_la64_d8.c
new file mode 100644
index 00000000..e2051185
--- /dev/null
+++ b/src/dynarec/la64/dynarec_la64_d8.c
@@ -0,0 +1,276 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "emu/x87emu_private.h"
+#include "dynarec_native.h"
+
+#include "la64_printer.h"
+#include "dynarec_la64_private.h"
+#include "../dynarec_helper.h"
+#include "dynarec_la64_functions.h"
+
+
+uintptr_t dynarec64_D8(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)rep;
+    (void)need_epilog;
+
+    uint8_t nextop = F8;
+    uint8_t ed;
+    uint8_t wback, wb1;
+    uint8_t u8;
+    int64_t fixedaddress;
+    int unscaled;
+    int v1, v2;
+    int s0;
+    int i1, i2, i3;
+
+    MAYUSE(s0);
+    MAYUSE(v2);
+    MAYUSE(v1);
+
+    if (MODREG)
+        switch (nextop) {
+            case 0xC0 ... 0xC7:
+                INST_NAME("FADD ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                if (ST_IS_F(0)) {
+                    FADD_S(v1, v1, v2);
+                } else {
+                    FADD_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xC8 ...
0xCF: + INST_NAME("FMUL ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + if (ST_IS_F(0)) { + FMUL_S(v1, v1, v2); + } else { + FMUL_D(v1, v1, v2); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 0xD0 ... 0xD7: + INST_NAME("FCOM ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FCOMS(v1, v2, x1, x2, x3); + } else { + FCOMD(v1, v2, x1, x2, x3); + } + break; + case 0xD8 ... 0xDF: + INST_NAME("FCOMP ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FCOMS(v1, v2, x1, x2, x3); + } else { + FCOMD(v1, v2, x1, x2, x3); + } + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xE0 ... 0xE7: + INST_NAME("FSUB ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + if (ST_IS_F(0)) { + FSUB_S(v1, v1, v2); + } else { + FSUB_D(v1, v1, v2); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 0xE8 ... 0xEF: + INST_NAME("FSUBR ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + if (ST_IS_F(0)) { + FSUB_S(v1, v2, v1); + } else { + FSUB_D(v1, v2, v1); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 0xF0 ... 0xF7: + INST_NAME("FDIV ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + if (ST_IS_F(0)) { + FDIV_S(v1, v1, v2); + } else { + FDIV_D(v1, v1, v2); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 0xF8 ... 
0xFF: + INST_NAME("FDIVR ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + if (ST_IS_F(0)) { + FDIV_S(v1, v2, v1); + } else { + FDIV_D(v1, v2, v1); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + default: + DEFAULT; + break; + } + else + switch ((nextop >> 3) & 7) { + case 0: + INST_NAME("FADD ST0, float[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + s0 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(s0, ed, fixedaddress); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3); + if (ST_IS_F(0)) { + FADD_S(v1, v1, s0); + } else { + FCVT_D_S(s0, s0); + FADD_D(v1, v1, s0); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 1: + INST_NAME("FMUL ST0, float[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + s0 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(s0, ed, fixedaddress); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3); + if (ST_IS_F(0)) { + FMUL_S(v1, v1, s0); + } else { + FCVT_D_S(s0, s0); + FMUL_D(v1, v1, s0); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 2: + INST_NAME("FCOM ST0, float[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + s0 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(s0, ed, fixedaddress); + if (ST_IS_F(0)) { + FCOMS(v1, s0, x1, x6, x3); + } else { + FCVT_D_S(s0, s0); + FCOMD(v1, s0, x1, x6, x3); + } + break; + case 3: + INST_NAME("FCOMP ST0, float[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + s0 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(s0, ed, fixedaddress); + if (ST_IS_F(0)) { + FCOMS(v1, s0, x1, x6, x3); + } else { + FCVT_D_S(s0, s0); + FCOMD(v1, s0, x1, x6, x3); + } + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 4: + INST_NAME("FSUB ST0, float[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + s0 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(s0, ed, fixedaddress); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3); + if (ST_IS_F(0)) { + FSUB_S(v1, v1, s0); + } else { + FCVT_D_S(s0, s0); + FSUB_D(v1, v1, s0); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 5: + INST_NAME("FSUBR ST0, float[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + s0 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(s0, ed, fixedaddress); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3); + if (ST_IS_F(0)) { + FSUB_S(v1, s0, v1); + } else { + FCVT_D_S(s0, s0); + FSUB_D(v1, s0, v1); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 6: + INST_NAME("FDIV ST0, float[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + s0 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, 
&ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(s0, ed, fixedaddress); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3); + if (ST_IS_F(0)) { + FDIV_S(v1, v1, s0); + } else { + FCVT_D_S(s0, s0); + FDIV_D(v1, v1, s0); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 7: + INST_NAME("FDIVR ST0, float[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + s0 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(s0, ed, fixedaddress); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3); + if (ST_IS_F(0)) { + FDIV_S(v1, s0, v1); + } else { + FCVT_D_S(s0, s0); + FDIV_D(v1, s0, v1); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + } + return addr; +} diff --git a/src/dynarec/la64/dynarec_la64_d9.c b/src/dynarec/la64/dynarec_la64_d9.c new file mode 100644 index 00000000..29285b3e --- /dev/null +++ b/src/dynarec/la64/dynarec_la64_d9.c @@ -0,0 +1,516 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "la64_emitter.h" +#include "la64_mapping.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "la64_printer.h" +#include "dynarec_la64_private.h" +#include "../dynarec_helper.h" +#include "dynarec_la64_functions.h" + + +uintptr_t dynarec64_D9(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) +{ + (void)ip; + (void)rep; + (void)need_epilog; + + uint8_t nextop = F8; + uint8_t ed; + uint8_t wback, wb1; + uint8_t u8; + int64_t fixedaddress; + int unscaled; + int v0, v1, v2; + int s0; + int i1, i2, i3; + int64_t j64; + + MAYUSE(s0); + MAYUSE(v0); + MAYUSE(v1); + MAYUSE(v2); + MAYUSE(j64); + + if (MODREG) + switch (nextop) { + case 0xC0 ... 0xC7: + INST_NAME("FLD STx"); + X87_PUSH_OR_FAIL(v2, dyn, ninst, x1, X87_ST(nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, (nextop & 7) + 1, X87_COMBINE(0, (nextop & 7) + 1)); + if (ST_IS_F(0)) { + FMOV_S(v2, v1); + } else { + FMOV_D(v2, v1); + } + break; + + case 0xC8: + INST_NAME("FXCH ST0"); + break; + case 0xC9 ... 0xCF: + INST_NAME("FXCH STx"); + // swap the cache value, not the double value itself :p + x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_ST(nextop & 7)); + x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + x87_swapreg(dyn, ninst, x1, x2, 0, nextop & 7); + // should set C1 to 0 + break; + + case 0xD0: + INST_NAME("FNOP"); + break; + + case 0xD8: + INST_NAME("FSTPNCE ST0, ST0"); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xD9 ... 
0xDF: + INST_NAME("FSTPNCE ST0, STx"); + // copy the cache value for st0 to stx + x87_get_st_empty(dyn, ninst, x1, x2, nextop & 7, X87_ST(nextop & 7)); + x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + x87_swapreg(dyn, ninst, x1, x2, 0, nextop & 7); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xE0: + INST_NAME("FCHS"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + if (ST_IS_F(0)) { + FNEG_S(v1, v1); + } else { + FNEG_D(v1, v1); + } + break; + case 0xE1: + INST_NAME("FABS"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + if (ST_IS_F(0)) { + FABS_S(v1, v1); + } else { + FABS_D(v1, v1); + } + break; + + case 0xE4: + INST_NAME("FTST"); + DEFAULT; + break; + case 0xE5: + INST_NAME("FXAM"); +#if 1 + i1 = x87_get_current_cache(dyn, ninst, 0, LSX_CACHE_ST_D); + // value put in x4 + if (i1 == -1) { + if (fpu_is_st_freed(dyn, ninst, 0)) { + MOV32w(x4, 0b100000100000000); + B_MARK3_nocond; + } else { + // not in cache, so check Empty status and load it + i2 = -dyn->lsx.x87stack; + LD_WU(x3, xEmu, offsetof(x64emu_t, fpu_stack)); + if (i2) { + ADDI_D(x3, x3, -i2); + } + MOV32w(x4, 0b100000100000000); // empty: C3,C2,C0 = 101 + BGE_MARK3(xZR, x3); + // x5 will be the actual top + LD_WU(x5, xEmu, offsetof(x64emu_t, top)); + if (i2) { + ADDI_D(x5, x5, i2); + ANDI(x5, x5, 7); // (emu->top + i)&7 + } + // load x2 with ST0 anyway, for sign extraction + SLLI_D(x5, x5, 3); + ADD_D(x1, xEmu, x5); + LD_D(x2, x1, offsetof(x64emu_t, x87)); + // load tag + if (i2 >= 0) { + LD_HU(x3, xEmu, offsetof(x64emu_t, fpu_tags)); + if (i2 > 0) { + BSTRINS_D(x3, xZR, 15, 0); + SRLI_D(x3, x3, i2 * 2); + } + ANDI(x3, x3, 0b11); + BNEZ_MARK3(x3); // empty: C3,C2,C0 = 101 + } + } + } else { + // simply move from cache reg to x2 + v1 = dyn->lsx.x87reg[i1]; + MOVFR2GR_D(x2, v1); + } + // get exponant in x1 + SRLI_D(x1, x2, 20 + 32); + ANDI(x1, x1, 0x7ff); // 0x7ff + BNEZ_MARK(x1); // not zero or denormal + MOV64x(x3, 0x7fffffffffffffff); + AND(x1, x2, x3); + MOV32w(x4, 0b100000000000000); // Zero: C3,C2,C0 = 100 + BEQZ_MARK3(x1); + MOV32w(x4, 0b100010000000000); // Denormal: C3,C2,C0 = 110 + B_MARK3_nocond; + MARK; + ADDI_D(x3, xZR, 0x7ff); // infinite/NaN? + MOV32w(x4, 0b000010000000000); // normal: C3,C2,C0 = 010 + BNE_MARK3(x1, x3); + SLLI_D(x3, x2, 12); + SRLI_D(x3, x3, 12); // and 0x000fffffffffffff + MOV32w(x4, 0b000010100000000); // infinity: C3,C2,C0 = 011 + BEQZ_MARK3(x3); + MOV32w(x4, 0b000000100000000); // NaN: C3,C2,C0 = 001 + MARK3; + // Extract sign & Update SW + SRLI_D(x1, x2, 63); + SLLI_D(x1, x1, 9); + OR(x4, x4, x1); // C1 + LD_HU(x1, xEmu, offsetof(x64emu_t, sw)); + MOV32w(x2, ~0b0100011100000000); + AND(x1, x1, x2); + OR(x4, x4, x1); + ST_H(x4, xEmu, offsetof(x64emu_t, sw)); +#else + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_refresh(dyn, ninst, x1, x2, 0); + s0 = x87_stackcount(dyn, ninst, x1); + CALL(fpu_fxam, -1, 0, 0); // should be possible inline, but is it worth it? 
+ x87_unstackcount(dyn, ninst, x1, s0); +#endif + break; + + case 0xE8: + INST_NAME("FLD1"); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_F); + if (ST_IS_F(0)) { + MOV32w(x1, 0x3f800000); + MOVGR2FR_W(v1, x1); + } else { + MOV64x(x1, 0x3FF0000000000000); + MOVGR2FR_D(v1, x1); + } + break; + case 0xE9: + INST_NAME("FLDL2T"); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D); + FTABLE64(v1, L2T); + break; + case 0xEA: + INST_NAME("FLDL2E"); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D); + FTABLE64(v1, L2E); + break; + case 0xEB: + INST_NAME("FLDPI"); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D); + FTABLE64(v1, PI); + break; + case 0xEC: + INST_NAME("FLDLG2"); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D); + FTABLE64(v1, LG2); + break; + case 0xED: + INST_NAME("FLDLN2"); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D); + FTABLE64(v1, LN2); + break; + case 0xEE: + INST_NAME("FLDZ"); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_F); + if (ST_IS_F(0)) + MOVGR2FR_W(v1, xZR); + else + MOVGR2FR_D(v1, xZR); + break; + + case 0xF0: + INST_NAME("F2XM1"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + s0 = x87_stackcount(dyn, ninst, x3); + CALL(const_native_f2xm1, -1, 0, 0); + x87_unstackcount(dyn, ninst, x3, s0); + break; + case 0xF1: + INST_NAME("FYL2X"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + x87_forget(dyn, ninst, x1, x2, 1); + s0 = x87_stackcount(dyn, ninst, x3); + CALL(const_native_fyl2x, -1, 0, 0); + x87_unstackcount(dyn, ninst, x3, s0); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xF2: + INST_NAME("FPTAN"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + s0 = x87_stackcount(dyn, ninst, x3); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + CALL_(const_native_ftan, -1, BOX64ENV(dynarec_fastround) ? 0 : u8, 0, 0); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + x87_unstackcount(dyn, ninst, x3, s0); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_F); + if (ST_IS_F(0)) { + MOV32w(x1, 0x3f800000); + MOVGR2FR_W(v1, x1); + } else { + MOV64x(x1, 0x3FF0000000000000); + MOVGR2FR_D(v1, x1); + } + break; + case 0xF3: + INST_NAME("FPATAN"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + x87_forget(dyn, ninst, x1, x2, 1); + s0 = x87_stackcount(dyn, ninst, x3); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + CALL_(const_native_fpatan, -1, BOX64ENV(dynarec_fastround) ? 
0 : u8, 0, 0); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + x87_unstackcount(dyn, ninst, x3, s0); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xF4: + INST_NAME("FXTRACT"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, x3); + x87_forget(dyn, ninst, x1, x2, 1); + s0 = x87_stackcount(dyn, ninst, x3); + CALL(const_native_fxtract, -1, 0, 0); + x87_unstackcount(dyn, ninst, x3, s0); + break; + case 0xF5: + INST_NAME("FPREM1"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + x87_forget(dyn, ninst, x1, x2, 1); + s0 = x87_stackcount(dyn, ninst, x3); + CALL(const_native_fprem1, -1, 0, 0); + x87_unstackcount(dyn, ninst, x3, s0); + break; + case 0xF6: + INST_NAME("FDECSTP"); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + LD_W(x2, xEmu, offsetof(x64emu_t, top)); + ADDI_D(x2, x2, -1); + ANDI(x2, x2, 7); + ST_W(x2, xEmu, offsetof(x64emu_t, top)); + break; + case 0xF7: + INST_NAME("FINCSTP"); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + LD_W(x2, xEmu, offsetof(x64emu_t, top)); + ADDI_D(x2, x2, 1); + ANDI(x2, x2, 7); + ST_W(x2, xEmu, offsetof(x64emu_t, top)); + break; + case 0xF8: + INST_NAME("FPREM"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + x87_forget(dyn, ninst, x1, x2, 1); + s0 = x87_stackcount(dyn, ninst, x3); + CALL(const_native_fprem, -1, 0, 0); + x87_unstackcount(dyn, ninst, x3, s0); + break; + case 0xF9: + INST_NAME("FYL2XP1"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + x87_forget(dyn, ninst, x1, x2, 1); + s0 = x87_stackcount(dyn, ninst, x3); + CALL(const_native_fyl2xp1, -1, 0, 0); + x87_unstackcount(dyn, ninst, x3, s0); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xFA: + INST_NAME("FSQRT"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + if (ST_IS_F(0)) { + FSQRT_S(v1, v1); + } else { + FSQRT_D(v1, v1); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 0xFB: + INST_NAME("FSINCOS"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, x3); + x87_forget(dyn, ninst, x1, x2, 1); + s0 = x87_stackcount(dyn, ninst, x3); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + CALL_(const_native_fsincos, -1, BOX64ENV(dynarec_fastround) ? 
0 : u8, 0, 0); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + x87_unstackcount(dyn, ninst, x3, s0); + break; + case 0xFC: + INST_NAME("FRNDINT"); + v0 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + v1 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn); + u8 = x87_setround(dyn, ninst, x1, x2); + + if (ST_IS_F(0)) { + FCMP_S(fcc0, v0, v0, cEQ); + BCNEZ_MARK(fcc0); + B_NEXT_nocond; + MARK; // v0 is not nan + FABS_S(v1, v0); + MOV64x(x3, 1ULL << __FLT_MANT_DIG__); + MOVGR2FR_W(v2, x3); + FFINT_S_L(v2, v2); + FCMP_S(fcc1, v1, v2, cLT); + BCNEZ_MARK2(fcc1); + B_NEXT_nocond; + MARK2; + FTINT_L_S(v1, v0); + FFINT_S_L(v1, v1); + FCOPYSIGN_S(v0, v1, v0); + } else { + FCMP_D(fcc0, v0, v0, cEQ); + BCNEZ_MARK(fcc0); + B_NEXT_nocond; + MARK; // v0 is not nan + FABS_D(v1, v0); + MOV64x(x3, 1ULL << __DBL_MANT_DIG__); + MOVGR2FR_D(v2, x3); + FFINT_D_L(v2, v2); + FCMP_D(fcc1, v1, v2, cLT); + BCNEZ_MARK2(fcc1); + B_NEXT_nocond; + MARK2; + FTINT_L_D(v1, v0); + FFINT_D_L(v1, v1); + FCOPYSIGN_D(v0, v1, v0); + } + x87_restoreround(dyn, ninst, u8); + break; + case 0xFD: + INST_NAME("FSCALE"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + x87_forget(dyn, ninst, x1, x2, 1); + s0 = x87_stackcount(dyn, ninst, x3); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + CALL_(const_native_fscale, -1, BOX64ENV(dynarec_fastround) ? 0 : u8, 0, 0); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + x87_unstackcount(dyn, ninst, x3, s0); + break; + case 0xFE: + INST_NAME("FSIN"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + s0 = x87_stackcount(dyn, ninst, x3); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + CALL_(const_native_fsin, -1, BOX64ENV(dynarec_fastround) ? 0 : u8, 0, 0); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + x87_unstackcount(dyn, ninst, x3, s0); + break; + case 0xFF: + INST_NAME("FCOS"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_forget(dyn, ninst, x1, x2, 0); + s0 = x87_stackcount(dyn, ninst, x3); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + CALL_(const_native_fcos, -1, BOX64ENV(dynarec_fastround) ? 0 : u8, 0, 0); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + x87_unstackcount(dyn, ninst, x3, s0); + break; + default: + DEFAULT; + break; + } + else + switch ((nextop >> 3) & 7) { + case 0: + INST_NAME("FLD ST0, float[ED]"); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, (BOX64ENV(dynarec_x87double) == 1) ? 
LSX_CACHE_ST_D : LSX_CACHE_ST_F); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(v1, ed, fixedaddress); + if (!ST_IS_F(0)) { + FCVT_D_S(v1, v1); + } + break; + case 2: + INST_NAME("FST float[ED], ST0"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_F); + if (ST_IS_F(0)) + s0 = v1; + else { + s0 = fpu_get_scratch(dyn); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2); + FCVT_S_D(s0, v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + } + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FST_S(s0, ed, fixedaddress); + break; + case 3: + INST_NAME("FSTP float[ED], ST0"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_F); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + if (!ST_IS_F(0)) { + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3); + FCVT_S_D(v1, v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + } + FST_S(v1, ed, fixedaddress); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 4: + INST_NAME("FLDENV Ed"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE? + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0); + MOV32w(x2, 0); + CALL(const_fpu_loadenv, -1, ed, x2); + NATIVE_RESTORE_X87PC(); + break; + case 5: + INST_NAME("FLDCW Ew"); + GETEW(x1, 0); + ST_H(x1, xEmu, offsetof(x64emu_t, cw)); // hopefully cw is not too far for an imm8 + if (dyn->need_x87check) { + SRLI_D(x87pc, x1, 8); + ANDI(x87pc, x87pc, 0b11); + } + break; + case 6: + INST_NAME("FNSTENV Ed"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE? + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0); + MOV32w(x2, 0); + CALL(const_fpu_savenv, -1, ed, x2); + break; + case 7: + INST_NAME("FNSTCW Ew"); + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x1, &fixedaddress, rex, NULL, 0, 0); + ed = x1; + wb1 = 1; + LD_H(x1, xEmu, offsetof(x64emu_t, cw)); + EWBACK; + break; + default: + DEFAULT; + } + return addr; +} diff --git a/src/dynarec/la64/dynarec_la64_da.c b/src/dynarec/la64/dynarec_la64_da.c new file mode 100644 index 00000000..9fc06fff --- /dev/null +++ b/src/dynarec/la64/dynarec_la64_da.c @@ -0,0 +1,210 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "la64_emitter.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "la64_printer.h" +#include "dynarec_la64_private.h" +#include "../dynarec_helper.h" +#include "dynarec_la64_functions.h" + + +uintptr_t dynarec64_DA(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) +{ + uint8_t nextop = F8; + int64_t j64; + uint8_t ed; + uint8_t u8; + uint8_t wback; + int v1, v2; + int d0; + int s0; + int64_t fixedaddress; + int unscaled; + + MAYUSE(s0); + MAYUSE(d0); + MAYUSE(v2); + MAYUSE(v1); + MAYUSE(ed); + MAYUSE(j64); + + if (MODREG) + switch (nextop) { + case 0xC0 ... 
0xC7: + INST_NAME("FCMOVB ST0, STx"); + READFLAGS(X_CF); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + RESTORE_EFLAGS(x5); + ANDI(x1, xFlags, 1 << F_CF); + CBZ_NEXT(x1); + if (ST_IS_F(0)) + FMOV_S(v1, v2); + else + FMOV_D(v1, v2); + break; + case 0xC8 ... 0xCF: + INST_NAME("FCMOVE ST0, STx"); + READFLAGS(X_ZF); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + RESTORE_EFLAGS(x5); + ANDI(x1, xFlags, 1 << F_ZF); + CBZ_NEXT(x1); + if (ST_IS_F(0)) + FMOV_S(v1, v2); + else + FMOV_D(v1, v2); + break; + case 0xD0 ... 0xD7: + INST_NAME("FCMOVBE ST0, STx"); + READFLAGS(X_CF | X_ZF); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + RESTORE_EFLAGS(x5); + ANDI(x1, xFlags, (1 << F_CF) | (1 << F_ZF)); + CBZ_NEXT(x1); + if (ST_IS_F(0)) + FMOV_S(v1, v2); + else + FMOV_D(v1, v2); + break; + case 0xD8 ... 0xDF: + INST_NAME("FCMOVU ST0, STx"); + READFLAGS(X_PF); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + RESTORE_EFLAGS(x5); + ANDI(x1, xFlags, (1 << F_PF)); + CBZ_NEXT(x1); + if (ST_IS_F(0)) + FMOV_S(v1, v2); + else + FMOV_D(v1, v2); + break; + case 0xE9: + INST_NAME("FUCOMPP ST0, ST1"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, 1, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FCOMS(v1, v2, x1, x2, x3); + } else { + FCOMD(v1, v2, x1, x2, x3); + } + X87_POP_OR_FAIL(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + default: + DEFAULT; + break; + } + else + switch ((nextop >> 3) & 7) { + case 0: + INST_NAME("FIADD ST0, Ed"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(v2, ed, fixedaddress); + FFINT_D_W(v2, v2); // i32 -> double + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + FADD_D(v1, v1, v2); + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 1: + INST_NAME("FIMUL ST0, Ed"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(v2, ed, fixedaddress); + FFINT_D_W(v2, v2); // i32 -> double + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + FMUL_D(v1, v1, v2); + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 2: + INST_NAME("FICOM ST0, Ed"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(v2, ed, fixedaddress); + FFINT_D_W(v2, v2); // i32 -> double + FCOMD(v1, v2, x1, x2, x3); + break; + case 3: + INST_NAME("FICOMP ST0, Ed"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(v2, ed, fixedaddress); + FFINT_D_W(v2, v2); // i32 -> double + FCOMD(v1, v2, x1, x2, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 4: + 
INST_NAME("FISUB ST0, Ed"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(v2, ed, fixedaddress); + FFINT_D_W(v2, v2); // i32 -> double + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + FSUB_D(v1, v1, v2); + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 5: + INST_NAME("FISUBR ST0, Ed"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(v2, ed, fixedaddress); + FFINT_D_W(v2, v2); // i32 -> double + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + FSUB_D(v1, v2, v1); + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 6: + INST_NAME("FIDIV ST0, Ed"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(v2, ed, fixedaddress); + FFINT_D_W(v2, v2); // i32 -> double + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + FDIV_D(v1, v1, v2); + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 7: + INST_NAME("FIDIVR ST0, Ed"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(v2, ed, fixedaddress); + FFINT_D_W(v2, v2); // i32 -> double + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + FDIV_D(v1, v2, v1); + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + } + return addr; +} diff --git a/src/dynarec/la64/dynarec_la64_db.c b/src/dynarec/la64/dynarec_la64_db.c new file mode 100644 index 00000000..7b81af4a --- /dev/null +++ b/src/dynarec/la64/dynarec_la64_db.c @@ -0,0 +1,256 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "la64_emitter.h" +#include "la64_mapping.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "la64_printer.h" +#include "dynarec_la64_private.h" +#include "../dynarec_helper.h" +#include "dynarec_la64_functions.h" + + +uintptr_t dynarec64_DB(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) +{ + (void)ip; + (void)rep; + (void)need_epilog; + + uint8_t nextop = F8; + uint8_t ed; + uint8_t wback; + uint8_t u8; + int64_t fixedaddress; + int unscaled; + int v1, v2; + int s0; + int64_t j64; + + MAYUSE(s0); + MAYUSE(v2); + MAYUSE(v1); + MAYUSE(j64); + + if (MODREG) + switch (nextop) { + case 0xC0 ... 0xC7: + INST_NAME("FCMOVNB ST0, STx"); + READFLAGS(X_CF); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + RESTORE_EFLAGS(x5); + ANDI(x1, xFlags, 1 << F_CF); + CBNZ_NEXT(x1); + if (ST_IS_F(0)) { + FMOV_S(v1, v2); + } else { + FMOV_D(v1, v2); // F_CF==0 + } + break; + case 0xC8 ... 
0xCF: + INST_NAME("FCMOVNE ST0, STx"); + READFLAGS(X_ZF); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + RESTORE_EFLAGS(x5); + ANDI(x1, xFlags, 1 << F_ZF); + CBNZ_NEXT(x1); + if (ST_IS_F(0)) { + FMOV_S(v1, v2); + } else { + FMOV_D(v1, v2); // F_ZF==0 + } + break; + case 0xD0 ... 0xD7: + INST_NAME("FCMOVNBE ST0, STx"); + READFLAGS(X_CF | X_ZF); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + RESTORE_EFLAGS(x5); + ANDI(x1, xFlags, (1 << F_CF) | (1 << F_ZF)); + CBNZ_NEXT(x1); + if (ST_IS_F(0)) { + FMOV_S(v1, v2); + } else { + FMOV_D(v1, v2); // F_CF==0 & F_ZF==0 + } + break; + case 0xD8 ... 0xDF: + INST_NAME("FCMOVNU ST0, STx"); + READFLAGS(X_PF); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + RESTORE_EFLAGS(x5); + ANDI(x1, xFlags, 1 << F_PF); + CBNZ_NEXT(x1); + if (ST_IS_F(0)) { + FMOV_S(v1, v2); + } else { + FMOV_D(v1, v2); // F_PF==0 + } + break; + case 0xE1: + INST_NAME("FDISI8087_NOP"); // so.. NOP? + break; + case 0xE2: + INST_NAME("FNCLEX"); + LD_H(x2, xEmu, offsetof(x64emu_t, sw)); + BSTRINS_D(x2, x2, 7, 0); // IE .. PE, SF, ES + BSTRINS_D(x2, x2, 15, 15); // B + ST_H(x2, xEmu, offsetof(x64emu_t, sw)); + break; + case 0xE3: + INST_NAME("FNINIT"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_purgecache(dyn, ninst, 0, x1, x2, x3); + CALL(const_reset_fpu, -1, 0, 0); + NATIVE_RESTORE_X87PC(); + break; + case 0xE8 ... 0xEF: + INST_NAME("FUCOMI ST0, STx"); + SETFLAGS(X_ALL, SF_SET, NAT_FLAGS_NOFUSION); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FCOMIS(v1, v2, x1, x2); + } else { + FCOMID(v1, v2, x1, x2); + } + + break; + case 0xF0 ... 
0xF7: + INST_NAME("FCOMI ST0, STx"); + SETFLAGS(X_ALL, SF_SET, NAT_FLAGS_NOFUSION); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FCOMIS(v1, v2, x1, x2); + } else { + FCOMID(v1, v2, x1, x2); + } + break; + default: + DEFAULT; + break; + } + else + switch ((nextop >> 3) & 7) { + case 0: + INST_NAME("FILD ST0, Ed"); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_S(v1, ed, fixedaddress); + FFINT_D_W(v1, v1); // i32 -> double + break; + case 1: + INST_NAME("FISTTP Ed, ST0"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 1, 0); + if (!BOX64ENV(dynarec_fastround)) { + MOVGR2FCSR(FCSR2, xZR); // reset all bits + } + FTINTRZ_W_D(v2, v1); + if (!BOX64ENV(dynarec_fastround)) { + MOVFCSR2GR(x5, FCSR2); // get back FPSR to check + BSTRPICK_D(x5, x5, FR_V, FR_V); + BEQZ_MARK(x5); + MOV32w(x4, 0x80000000); + MOVGR2FR_W(v2, x4); + MARK; + } + FST_S(v2, wback, fixedaddress); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 2: + INST_NAME("FIST Ed, ST0"); + DEFAULT; + break; + case 3: + INST_NAME("FISTP Ed, ST0"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + u8 = x87_setround(dyn, ninst, x1, x5); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0); + v2 = fpu_get_scratch(dyn); + if (!BOX64ENV(dynarec_fastround)) { + MOVGR2FCSR(FCSR2, xZR); // reset all bits + } + FTINT_W_D(v2, v1); + if (!BOX64ENV(dynarec_fastround)) { + MOVFCSR2GR(x5, FCSR2); // get back FPSR to check + BSTRPICK_D(x5, x5, FR_V, FR_V); + BEQZ_MARK(x5); + MOV32w(x4, 0x80000000); + MOVGR2FR_W(v2, x4); + MARK; + } + FST_S(v2, wback, fixedaddress); + x87_restoreround(dyn, ninst, u8); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 5: + INST_NAME("FLD tbyte"); + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 8, 0); + if ((PK(0) == 0xDB && ((PK(1) >> 3) & 7) == 7) || (!rex.is32bits && PK(0) >= 0x40 && PK(0) <= 0x4f && PK(1) == 0xDB && ((PK(2) >> 3) & 7) == 7)) { + NOTEST(x5); + // the FLD is immediatly followed by an FSTP + LD_D(x5, ed, fixedaddress + 0); + LD_H(x6, ed, fixedaddress + 8); + // no persistant scratch register, so unrool both instruction here... 
+ MESSAGE(LOG_DUMP, "\tHack: FSTP tbyte\n"); + nextop = F8; // 0xDB or rex + if (!rex.is32bits && nextop >= 0x40 && nextop <= 0x4f) { + rex.rex = nextop; + nextop = F8; // 0xDB + } else + rex.rex = 0; + nextop = F8; // modrm + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 8, 0); + ST_D(x5, ed, fixedaddress + 0); + ST_H(x6, ed, fixedaddress + 8); + } else { + if (BOX64ENV(x87_no80bits)) { + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D); + FLD_D(v1, ed, fixedaddress); + } else { + ADDI_D(x1, ed, fixedaddress); + X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, x3); + x87_reflectcount(dyn, ninst, x3, x4); + CALL(const_native_fld, -1, x1, 0); + x87_unreflectcount(dyn, ninst, x3, x4); + } + } + break; + case 7: + INST_NAME("FSTP tbyte"); + if (BOX64ENV(x87_no80bits)) { + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FST_D(v1, wback, fixedaddress); + } else { + x87_forget(dyn, ninst, x1, x3, 0); + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0); + x87_reflectcount(dyn, ninst, x3, x4); + CALL(const_native_fstp, -1, ed, 0); + x87_unreflectcount(dyn, ninst, x3, x4); + } + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + default: + DEFAULT; + } + return addr; +} diff --git a/src/dynarec/la64/dynarec_la64_dc.c b/src/dynarec/la64/dynarec_la64_dc.c new file mode 100644 index 00000000..4d639822 --- /dev/null +++ b/src/dynarec/la64/dynarec_la64_dc.c @@ -0,0 +1,227 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "la64_printer.h" +#include "dynarec_la64_private.h" +#include "../dynarec_helper.h" +#include "dynarec_la64_functions.h" + + +uintptr_t dynarec64_DC(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) +{ + (void)ip; + (void)rep; + (void)need_epilog; + + uint8_t nextop = F8; + uint8_t wback; + uint8_t u8; + int64_t fixedaddress; + int unscaled; + int v1, v2; + + MAYUSE(v2); + MAYUSE(v1); + + if (MODREG) + switch (nextop) { + case 0xC0 ... 0xC7: + INST_NAME("FADD STx, ST0"); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + if (ST_IS_F(0)) { + FADD_S(v1, v1, v2); + } else { + FADD_D(v1, v1, v2); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 0xC8 ... 0xCF: + INST_NAME("FMUL STx, ST0"); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + if (ST_IS_F(0)) { + FMUL_S(v1, v1, v2); + } else { + FMUL_D(v1, v1, v2); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 0xD0 ... 
0xD7: + INST_NAME("FCOM ST0, STx"); // yep + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FCOMS(v1, v2, x1, x2, x3); + } else { + FCOMD(v1, v2, x1, x2, x3); + } + break; + case 0xD8 ... 0xDF: + INST_NAME("FCOMP ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FCOMS(v1, v2, x1, x2, x3); + } else { + FCOMD(v1, v2, x1, x2, x3); + } + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xE0 ... 0xE7: + INST_NAME("FSUBR STx, ST0"); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + if (ST_IS_F(0)) { + FSUB_S(v1, v2, v1); + } else { + FSUB_D(v1, v2, v1); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 0xE8 ... 0xEF: + INST_NAME("FSUB STx, ST0"); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + if (ST_IS_F(0)) { + FSUB_S(v1, v1, v2); + } else { + FSUB_D(v1, v1, v2); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 0xF0 ... 0xF7: + INST_NAME("FDIVR STx, ST0"); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + if (ST_IS_F(0)) { + FDIV_S(v1, v2, v1); + } else { + FDIV_D(v1, v2, v1); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 0xF8 ... 
0xFF: + INST_NAME("FDIV STx, ST0"); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + if (ST_IS_F(0)) { + FDIV_S(v1, v1, v2); + } else { + FDIV_D(v1, v1, v2); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + default: + DEFAULT; + break; + } + else + switch ((nextop >> 3) & 7) { + case 0: + INST_NAME("FADD ST0, double[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_D(v2, wback, fixedaddress); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + FADD_D(v1, v1, v2); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 1: + INST_NAME("FMUL ST0, double[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_D(v2, wback, fixedaddress); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + FMUL_D(v1, v1, v2); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 2: + INST_NAME("FCOM ST0, double[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_D(v2, wback, fixedaddress); + FCOMD(v1, v2, x1, x6, x3); + break; + case 3: + INST_NAME("FCOMP ST0, double[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_D(v2, wback, fixedaddress); + FCOMD(v1, v2, x1, x6, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 4: + INST_NAME("FSUB ST0, double[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_D(v2, wback, fixedaddress); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + FSUB_D(v1, v1, v2); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 5: + INST_NAME("FSUBR ST0, double[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_D(v2, wback, fixedaddress); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + FSUB_D(v1, v2, v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 6: + INST_NAME("FDIV ST0, double[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_D(v2, wback, fixedaddress); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + FDIV_D(v1, v1, v2); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + case 7: + INST_NAME("FDIVR ST0, double[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_D(v2, 
wback, fixedaddress); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + FDIV_D(v1, v2, v1); + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + break; + } + return addr; +} diff --git a/src/dynarec/la64/dynarec_la64_dd.c b/src/dynarec/la64/dynarec_la64_dd.c new file mode 100644 index 00000000..ea1101b6 --- /dev/null +++ b/src/dynarec/la64/dynarec_la64_dd.c @@ -0,0 +1,192 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "la64_emitter.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "la64_printer.h" +#include "dynarec_la64_private.h" +#include "../dynarec_helper.h" +#include "dynarec_la64_functions.h" + + +uintptr_t dynarec64_DD(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) +{ + (void)ip; + (void)rep; + (void)need_epilog; + + uint8_t nextop = F8; + uint8_t ed, wback; + int64_t fixedaddress; + int unscaled; + int v1, v2; + int s0; + int64_t j64; + + MAYUSE(s0); + MAYUSE(v2); + MAYUSE(v1); + MAYUSE(j64); + + if (MODREG) + switch (nextop) { + case 0xC0 ... 0xC7: +#if 1 + if ((nextop & 7) == 0 && PK(0) == 0xD9 && PK(1) == 0xF7) { + MESSAGE(LOG_DUMP, "Hack for FFREE ST0 / FINCSTP\n"); + x87_do_pop(dyn, ninst, x1); + addr += 2; + SKIPTEST(x1); + } else + x87_free(dyn, ninst, x1, x2, x3, nextop & 7); +#else + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_purgecache(dyn, ninst, 0, x1, x2, x3); + MOV32w(x1, nextop & 7); + CALL(fpu_do_free, -1, x1, 0); +#endif + break; + case 0xD0 ... 0xD7: + INST_NAME("FST ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FMOV_S(v2, v1); + } else { + FMOV_D(v2, v1); + } + break; + case 0xD8: + INST_NAME("FSTP ST0, ST0"); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xD9 ... 0xDF: + INST_NAME("FSTP ST0, STx"); + // copy the cache value for st0 to stx + x87_get_st_empty(dyn, ninst, x1, x2, nextop & 7, X87_ST(nextop & 7)); + x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + x87_swapreg(dyn, ninst, x1, x2, 0, nextop & 7); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xE0 ... 0xE7: + INST_NAME("FUCOM ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FCOMS(v1, v2, x1, x2, x3); + } else { + FCOMD(v1, v2, x1, x2, x3); + } + break; + case 0xE8 ... 
0xEF: + INST_NAME("FUCOMP ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FCOMS(v1, v2, x1, x2, x3); + } else { + FCOMD(v1, v2, x1, x2, x3); + } + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + default: + DEFAULT; + break; + } + else + switch ((nextop >> 3) & 7) { + case 0: + INST_NAME("FLD double"); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD_D(v1, wback, fixedaddress); + break; + case 1: + INST_NAME("FISTTP i64, ST0"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_I64); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 1, 0); + if (ST_IS_I64(0)) { + FST_D(v1, wback, fixedaddress); + } else { + if (!BOX64ENV(dynarec_fastround)) { + MOVGR2FCSR(FCSR2, xZR); // reset all bits + } + FTINTRZ_L_D(v2, v1); + if (!BOX64ENV(dynarec_fastround)) { + MOVFCSR2GR(x5, FCSR2); // get back FPSR to check + MOV32w(x3, (1 << FR_V)); + AND(x5, x5, x3); + BEQZ_MARK(x5); + MOV64x(x4, 0x8000000000000000); + MOVGR2FR_D(v2, x4); + MARK; + } + FST_D(v2, wback, fixedaddress); + } + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 2: + INST_NAME("FST double"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FST_D(v1, wback, fixedaddress); + break; + case 3: + INST_NAME("FSTP double"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FST_D(v1, wback, fixedaddress); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 4: + INST_NAME("FRSTOR m108byte"); + MESSAGE(LOG_DUMP, "Need Optimization (FRSTOR)\n"); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + addr = geted(dyn, addr, ninst, nextop, &ed, x4, x6, &fixedaddress, rex, NULL, 0, 0); + CALL(const_native_frstor, -1, ed, 0); + break; + case 6: + INST_NAME("FNSAVE m108byte"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + addr = geted(dyn, addr, ninst, nextop, &ed, x4, x6, &fixedaddress, rex, NULL, 0, 0); + CALL(const_native_fsave, -1, ed, 0); + NATIVE_RESTORE_X87PC(); + break; + case 7: + INST_NAME("FNSTSW m2byte"); + // fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + addr = geted(dyn, addr, ninst, nextop, &ed, x4, x6, &fixedaddress, rex, NULL, 0, 0); + LD_WU(x2, xEmu, offsetof(x64emu_t, top)); + LD_HU(x3, xEmu, offsetof(x64emu_t, sw)); + if (dyn->lsx.x87stack) { + // update top + ADDI_D(x2, x2, -dyn->lsx.x87stack); + ANDI(x2, x2, 7); + } + MOV32w(x5, ~0x3800); + AND(x3, x3, x5); // mask out TOP + SLLI_D(x2, x2, 11); // shift TOP to bit 11 + OR(x3, x3, x2); // inject TOP + ST_H(x3, xEmu, offsetof(x64emu_t, sw)); + ST_H(x3, ed, fixedaddress); // store whole sw flags + break; + default: + DEFAULT; + } + return addr; +} diff --git a/src/dynarec/la64/dynarec_la64_de.c b/src/dynarec/la64/dynarec_la64_de.c new file mode 100644 index 00000000..cb73d76b --- /dev/null +++ b/src/dynarec/la64/dynarec_la64_de.c @@ -0,0 +1,158 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" 
+#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "la64_printer.h" +#include "dynarec_la64_private.h" +#include "../dynarec_helper.h" +#include "dynarec_la64_functions.h" + + +uintptr_t dynarec64_DE(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) +{ + (void)ip; + (void)rep; + (void)need_epilog; + + uint8_t nextop = F8; + uint8_t wback; + uint8_t u8; + int64_t fixedaddress; + int v1, v2; + + MAYUSE(v2); + MAYUSE(v1); + + if (MODREG) + switch (nextop) { + case 0xC0 ... 0xC7: + INST_NAME("FADDP STx, ST0"); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + if (ST_IS_F(0)) { + FADD_S(v1, v1, v2); + } else { + FADD_D(v1, v1, v2); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xC8 ... 0xCF: + INST_NAME("FMULP STx, ST0"); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + if (ST_IS_F(0)) { + FMUL_S(v1, v1, v2); + } else { + FMUL_D(v1, v1, v2); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xD0 ... 0xD7: + INST_NAME("FCOMP ST0, STx"); // yep + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FCOMS(v1, v2, x1, x2, x3); + } else { + FCOMD(v1, v2, x1, x2, x3); + } + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xD9: + INST_NAME("FCOMPP ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FCOMS(v1, v2, x1, x2, x3); + } else { + FCOMD(v1, v2, x1, x2, x3); + } + X87_POP_OR_FAIL(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xE0 ... 0xE7: + INST_NAME("FSUBRP STx, ST0"); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + if (ST_IS_F(0)) { + FSUB_S(v1, v2, v1); + } else { + FSUB_D(v1, v2, v1); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xE8 ... 0xEF: + INST_NAME("FSUBP STx, ST0"); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + if (ST_IS_F(0)) { + FSUB_S(v1, v1, v2); + } else { + FSUB_D(v1, v1, v2); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xF0 ... 
0xF7: + INST_NAME("FDIVRP STx, ST0"); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + if (ST_IS_F(0)) { + FDIV_S(v1, v2, v1); + } else { + FDIV_D(v1, v2, v1); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 0xF8 ... 0xFF: + INST_NAME("FDIVP STx, ST0"); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5); + if (ST_IS_F(0)) { + FDIV_S(v1, v1, v2); + } else { + FDIV_D(v1, v1, v2); + } + X87_CHECK_PRECISION(v1); + if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + default: + DEFAULT; + break; + } + else + switch ((nextop >> 3) & 7) { + default: + DEFAULT; + } + return addr; +} diff --git a/src/dynarec/la64/dynarec_la64_df.c b/src/dynarec/la64/dynarec_la64_df.c new file mode 100644 index 00000000..07994fb5 --- /dev/null +++ b/src/dynarec/la64/dynarec_la64_df.c @@ -0,0 +1,294 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "la64_emitter.h" +#include "la64_mapping.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "la64_printer.h" +#include "dynarec_la64_private.h" +#include "../dynarec_helper.h" +#include "dynarec_la64_functions.h" + +uintptr_t dynarec64_DF(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) +{ + (void)ip; + (void)rep; + (void)need_epilog; + + uint8_t nextop = F8; + uint8_t ed, wback, u8; + int v1, v2; + int s0; + int64_t j64; + int64_t fixedaddress; + + MAYUSE(s0); + MAYUSE(v2); + MAYUSE(v1); + MAYUSE(j64); + + if (MODREG) + switch (nextop) { + case 0xC0 ... 0xC7: + INST_NAME("FFREEP STx"); + // not handling Tag... + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + + case 0xE0: + INST_NAME("FNSTSW AX"); + LD_WU(x2, xEmu, offsetof(x64emu_t, top)); + if (dyn->lsx.x87stack) { + ADDI_D(x2, x2, -dyn->lsx.x87stack); + ANDI(x2, x2, 0x7); + } + LD_HU(x1, xEmu, offsetof(x64emu_t, sw)); + MOV32w(x3, 0b1100011111111111); // mask + AND(x1, x1, x3); + SLLI_D(x2, x2, 11); + OR(x1, x1, x2); // inject top + ST_H(x1, xEmu, offsetof(x64emu_t, sw)); + SRLI_D(xRAX, xRAX, 16); + SLLI_D(xRAX, xRAX, 16); + OR(xRAX, xRAX, x1); + break; + case 0xE8 ... 
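/* FNSTSW AX above rebuilds the x87 status word: the mask 0b1100011111111111 (0xC7FF) clears only the TOP field (bits 11..13) while keeping C0..C3 and the busy bit, then the compile-time-adjusted top is OR-ed back in. As a C model (illustrative only):
       uint16_t sw = (emu_sw & 0xC7FF) | (uint16_t)((top & 7) << 11);
       rax = (rax & ~0xFFFFull) | sw;   // the SRLI/SLLI pair above clears AX only
*/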
0xF7: + if (nextop < 0xF0) { + INST_NAME("FUCOMIP ST0, STx"); + } else { + INST_NAME("FCOMIP ST0, STx"); + } + SETFLAGS(X_ALL, SF_SET, NAT_FLAGS_NOFUSION); + SET_DFNONE(); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7)); + if (ST_IS_F(0)) { + FCOMIS(v1, v2, x1, x2); + } else { + FCOMID(v1, v2, x1, x2); + } + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + default: + DEFAULT; + break; + } + else + switch ((nextop >> 3) & 7) { + case 0: + INST_NAME("FILD ST0, Ew"); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_F); + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 1, 0); + LD_H(x1, wback, fixedaddress); + MOVGR2FR_D(v1, x1); + if (ST_IS_F(0)) { + FFINT_S_L(v1, v1); + } else { + FFINT_D_L(v1, v1); + } + break; + case 1: + INST_NAME("FISTTP Ew, ST0"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_F); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 1, 0); + if (!BOX64ENV(dynarec_fastround)) { + MOVGR2FCSR(FCSR2, xZR); // reset all bits + } + if (ST_IS_F(0)) { + FTINTRZ_W_S(v2, v1); + MOVFR2GR_S(x4, v2); + } else { + FTINTRZ_W_D(v2, v1); + MOVFR2GR_S(x4, v2); + } + if (!BOX64ENV(dynarec_fastround)) { + MOVFCSR2GR(x5, FCSR2); // get back FPSR to check + BSTRPICK_D(x5, x5, FR_V, FR_V); + BNEZ_MARK(x5); + SLLI_W(x5, x4, 16); + SRAI_W(x5, x5, 16); + BEQ_MARK2(x5, x4); + MARK; + MOV32w(x4, 0x8000); + } + MARK2; + ST_H(x4, wback, fixedaddress); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 2: + INST_NAME("FIST Ew, ST0"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_F); + v2 = fpu_get_scratch(dyn); + u8 = x87_setround(dyn, ninst, x1, x5); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0); + if (!BOX64ENV(dynarec_fastround)) { + MOVGR2FCSR(FCSR2, xZR); // reset all bits + } + if (ST_IS_F(0)) { + FTINT_W_S(v2, v1); + MOVFR2GR_S(x4, v2); + } else { + FTINT_W_D(v2, v1); + MOVFR2GR_S(x4, v2); + } + x87_restoreround(dyn, ninst, u8); + if (!BOX64ENV(dynarec_fastround)) { + MOVFCSR2GR(x5, FCSR2); // get back FPSR to check + BSTRPICK_D(x5, x5, FR_V, FR_V); + BNEZ_MARK(x5); + SLLI_W(x5, x4, 16); + SRAI_W(x5, x5, 16); + BEQ_MARK2(x5, x4); + MARK; + MOV32w(x4, 0x8000); + } + MARK2; + ST_H(x4, wback, fixedaddress); + break; + case 3: + INST_NAME("FISTP Ew, ST0"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_F); + v2 = fpu_get_scratch(dyn); + u8 = x87_setround(dyn, ninst, x1, x5); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0); + if (!BOX64ENV(dynarec_fastround)) { + MOVGR2FCSR(FCSR2, xZR); // reset all bits + } + if (ST_IS_F(0)) { + FTINT_W_S(v2, v1); + MOVFR2GR_S(x4, v2); + } else { + FTINT_W_D(v2, v1); + MOVFR2GR_S(x4, v2); + } + x87_restoreround(dyn, ninst, u8); + if (!BOX64ENV(dynarec_fastround)) { + MOVFCSR2GR(x5, FCSR2); // get back FPSR to check + BSTRPICK_D(x5, x5, FR_V, FR_V); + BNEZ_MARK(x5); + SLLI_W(x5, x4, 16); + SRAI_W(x5, x5, 16); + BEQ_MARK2(x5, x4); + MARK; + MOV32w(x4, 0x8000); + } + MARK2; + ST_H(x4, wback, fixedaddress); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 4: + INST_NAME("FBLD ST0, tbytes"); + X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, x1); + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0); + s0 = x87_stackcount(dyn, ninst, x3); + CALL(const_fpu_fbld, -1, ed, 0); + x87_unstackcount(dyn, ninst, x3, s0); + break; + case 5: + INST_NAME("FILD ST0, 
i64"); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_I64); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0); + + if (ST_IS_I64(0)) { + FLD_D(v1, wback, fixedaddress); + } else { + LD_D(x1, wback, fixedaddress); + if (rex.is32bits) { + // need to also feed the STll stuff... + ADDI_D(x4, xEmu, offsetof(x64emu_t, fpu_ll)); + LD_WU(x5, xEmu, offsetof(x64emu_t, top)); + int a = 0 - dyn->lsx.x87stack; + if (a) { + ADDI_W(x5, x5, a); + ANDI(x5, x5, 0x7); + } + SLLI_D(x5, x5, 4); // fpu_ll is 2 i64 + ADD_D(x5, x5, x4); + ST_D(x1, x5, 8); // ll + } + MOVGR2FR_D(v1, x1); + FFINT_D_L(v1, v1); + if (rex.is32bits) { + FST_D(v1, x5, 0); // ref + } + } + break; + case 6: + INST_NAME("FBSTP tbytes, ST0"); + x87_forget(dyn, ninst, x1, x2, 0); + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0); + s0 = x87_stackcount(dyn, ninst, x3); + CALL(const_fpu_fbst, -1, ed, 0); + x87_unstackcount(dyn, ninst, x3, s0); + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + case 7: + INST_NAME("FISTP i64, ST0"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_I64); + v2 = fpu_get_scratch(dyn); + if (!ST_IS_I64(0)) { + u8 = x87_setround(dyn, ninst, x1, x7); + } + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0); + + if (ST_IS_I64(0)) { + FST_D(v1, wback, fixedaddress); + } else { + if (rex.is32bits) { + // need to check STll first... + ADDI_D(x4, xEmu, offsetof(x64emu_t, fpu_ll)); + LD_WU(x5, xEmu, offsetof(x64emu_t, top)); + int a = 0 - dyn->lsx.x87stack; + if (a) { + ADDI_W(x5, x5, a); + ANDI(x5, x5, 0x7); + } + SLLI_D(x5, x5, 4); // fpu_ll is 2 i64 + ADD_D(x5, x5, x4); + MOVFR2GR_D(x3, v1); + LD_D(x6, x5, 0); // ref + BNE_MARK(x6, x3); + LD_D(x6, x5, 8); // ll + ST_D(x6, wback, fixedaddress); + B_MARK3_nocond; + MARK; + } + + if (!BOX64ENV(dynarec_fastround)) { + MOVGR2FCSR(FCSR2, xZR); // reset all bits + } + FTINT_L_D(v2, v1); + if (!BOX64ENV(dynarec_fastround)) { + MOVFCSR2GR(x5, FCSR2); // get back FPSR to check + BSTRPICK_D(x5, x5, FR_V, FR_V); + BEQ_MARK2(x5, xZR); + MOV64x(x4, 0x8000000000000000LL); + MOVGR2FR_D(v2, x4); + } + MARK2; + FST_D(v2, wback, fixedaddress); + MARK3; + x87_restoreround(dyn, ninst, u8); + } + X87_POP_OR_FAIL(dyn, ninst, x3); + break; + default: + DEFAULT; + break; + } + return addr; +} diff --git a/src/dynarec/la64/dynarec_la64_functions.c b/src/dynarec/la64/dynarec_la64_functions.c index f2674483..a38b2a47 100644 --- a/src/dynarec/la64/dynarec_la64_functions.c +++ b/src/dynarec/la64/dynarec_la64_functions.c @@ -29,8 +29,8 @@ #include "elfloader.h" #define XMM0 0 -#define X870 16 -#define EMM0 16 +#define X870 XMM0 + 16 +#define EMM0 XMM0 + 16 // Get a FPU scratch reg int fpu_get_scratch(dynarec_la64_t* dyn) @@ -42,7 +42,18 @@ void fpu_reset_scratch(dynarec_la64_t* dyn) { dyn->lsx.fpu_scratch = 0; } - +// Get a x87 double reg +int fpu_get_reg_x87(dynarec_la64_t* dyn, int t, int n) +{ + int i = X870; + while (dyn->lsx.fpuused[i]) + ++i; + dyn->lsx.fpuused[i] = 1; + dyn->lsx.lsxcache[i].n = n; + dyn->lsx.lsxcache[i].t = t; + dyn->lsx.news |= (1 << i); + return i; // return a Dx +} // Free a FPU double reg void fpu_free_reg(dynarec_la64_t* dyn, int reg) { @@ -101,6 +112,220 @@ void fpu_reset_reg(dynarec_la64_t* dyn) fpu_reset_reg_lsxcache(&dyn->lsx); } + +int lsxcache_no_i64(dynarec_la64_t* dyn, int ninst, int st, int a) +{ + if (a == LSX_CACHE_ST_I64) { + lsxcache_promote_double(dyn, ninst, st); + return LSX_CACHE_ST_D; + } + return a; +} + +int 
lsxcache_get_st(dynarec_la64_t* dyn, int ninst, int a) +{ + if (dyn->insts[ninst].lsx.swapped) { + if (dyn->insts[ninst].lsx.combined1 == a) + a = dyn->insts[ninst].lsx.combined2; + else if (dyn->insts[ninst].lsx.combined2 == a) + a = dyn->insts[ninst].lsx.combined1; + } + for (int i = 0; i < 24; ++i) + if ((dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_F + || dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_D + || dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_I64) + && dyn->insts[ninst].lsx.lsxcache[i].n == a) + return dyn->insts[ninst].lsx.lsxcache[i].t; + // not in the cache yet, so will be fetched... + return LSX_CACHE_ST_D; +} + +int lsxcache_get_current_st(dynarec_la64_t* dyn, int ninst, int a) +{ + (void)ninst; + if (!dyn->insts) + return LSX_CACHE_ST_D; + for (int i = 0; i < 24; ++i) + if ((dyn->lsx.lsxcache[i].t == LSX_CACHE_ST_F + || dyn->lsx.lsxcache[i].t == LSX_CACHE_ST_D + || dyn->lsx.lsxcache[i].t == LSX_CACHE_ST_I64) + && dyn->lsx.lsxcache[i].n == a) + return dyn->lsx.lsxcache[i].t; + // not in the cache yet, so will be fetched... + return LSX_CACHE_ST_D; +} + +int lsxcache_get_st_f(dynarec_la64_t* dyn, int ninst, int a) +{ + for (int i = 0; i < 24; ++i) + if (dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_F + && dyn->insts[ninst].lsx.lsxcache[i].n == a) + return i; + return -1; +} + +int lsxcache_get_st_f_i64(dynarec_la64_t* dyn, int ninst, int a) +{ + for (int i = 0; i < 24; ++i) + if ((dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_I64 || dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_F) + && dyn->insts[ninst].lsx.lsxcache[i].n == a) + return i; + return -1; +} + +int lsxcache_get_st_f_noback(dynarec_la64_t* dyn, int ninst, int a) +{ + for (int i = 0; i < 24; ++i) + if (dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_F + && dyn->insts[ninst].lsx.lsxcache[i].n == a) + return i; + return -1; +} + +int lsxcache_get_st_f_i64_noback(dynarec_la64_t* dyn, int ninst, int a) +{ + for (int i = 0; i < 24; ++i) + if ((dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_I64 || dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_F) + && dyn->insts[ninst].lsx.lsxcache[i].n == a) + return i; + return -1; +} + +int lsxcache_get_current_st_f(dynarec_la64_t* dyn, int a) +{ + for (int i = 0; i < 24; ++i) + if (dyn->lsx.lsxcache[i].t == LSX_CACHE_ST_F + && dyn->lsx.lsxcache[i].n == a) + return i; + return -1; +} + +int lsxcache_get_current_st_f_i64(dynarec_la64_t* dyn, int a) +{ + for (int i = 0; i < 24; ++i) + if ((dyn->lsx.lsxcache[i].t == LSX_CACHE_ST_I64 || dyn->lsx.lsxcache[i].t == LSX_CACHE_ST_F) + && dyn->lsx.lsxcache[i].n == a) + return i; + return -1; +} + +static void lsxcache_promote_double_forward(dynarec_la64_t* dyn, int ninst, int maxinst, int a); +static void lsxcache_promote_double_internal(dynarec_la64_t* dyn, int ninst, int maxinst, int a); +static void lsxcache_promote_double_combined(dynarec_la64_t* dyn, int ninst, int maxinst, int a) +{ + if (a == dyn->insts[ninst].lsx.combined1 || a == dyn->insts[ninst].lsx.combined2) { + if (a == dyn->insts[ninst].lsx.combined1) { + a = dyn->insts[ninst].lsx.combined2; + } else + a = dyn->insts[ninst].lsx.combined1; + int i = lsxcache_get_st_f_i64_noback(dyn, ninst, a); + if (i >= 0) { + dyn->insts[ninst].lsx.lsxcache[i].t = LSX_CACHE_ST_D; + if (dyn->insts[ninst].x87precision) dyn->need_x87check = 2; + if (!dyn->insts[ninst].lsx.barrier) + lsxcache_promote_double_internal(dyn, ninst - 1, maxinst, a - dyn->insts[ninst].lsx.stack_push); + // go forward if combined is not pop'd + if (a - 
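/* Why promotion: an ST slot may have been cached as a 32-bit float (or as a raw i64 from FILD) by the opcode that produced it; the first time any instruction needs that slot as a double, every def and use of it across the trace must switch encodings together, hence the backward (internal) and forward walks here. Mixing encodings for one slot would change results, e.g.:
       float  f = 16777217.0f;   // rounds to 16777216 (2^24 limit of a float)
       double d = 16777217.0;    // exact
*/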
dyn->insts[ninst].lsx.stack_pop >= 0) + if (!dyn->insts[ninst + 1].lsx.barrier) + lsxcache_promote_double_forward(dyn, ninst + 1, maxinst, a - dyn->insts[ninst].lsx.stack_pop); + } + } +} +static void lsxcache_promote_double_internal(dynarec_la64_t* dyn, int ninst, int maxinst, int a) +{ + if (dyn->insts[ninst + 1].lsx.barrier) + return; + while (ninst >= 0) { + a += dyn->insts[ninst].lsx.stack_pop; // adjust Stack depth: add pop'd ST (going backward) + int i = lsxcache_get_st_f_i64(dyn, ninst, a); + if (i < 0) return; + dyn->insts[ninst].lsx.lsxcache[i].t = LSX_CACHE_ST_D; + if (dyn->insts[ninst].x87precision) dyn->need_x87check = 2; + // check combined propagation too + if (dyn->insts[ninst].lsx.combined1 || dyn->insts[ninst].lsx.combined2) { + if (dyn->insts[ninst].lsx.swapped) { + // if(dyn->need_dump) dynarec_log(LOG_NONE, "lsxcache_promote_double_internal, ninst=%d swapped %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack); + if (a == dyn->insts[ninst].lsx.combined1) + a = dyn->insts[ninst].lsx.combined2; + else if (a == dyn->insts[ninst].lsx.combined2) + a = dyn->insts[ninst].lsx.combined1; + } else { + lsxcache_promote_double_combined(dyn, ninst, maxinst, a); + } + } + a -= dyn->insts[ninst].lsx.stack_push; // // adjust Stack depth: remove push'd ST (going backward) + --ninst; + if (ninst < 0 || a < 0 || dyn->insts[ninst].lsx.barrier) + return; + } +} + +static void lsxcache_promote_double_forward(dynarec_la64_t* dyn, int ninst, int maxinst, int a) +{ + while ((ninst != -1) && (ninst < maxinst) && (a >= 0)) { + a += dyn->insts[ninst].lsx.stack_push; // // adjust Stack depth: add push'd ST (going forward) + if ((dyn->insts[ninst].lsx.combined1 || dyn->insts[ninst].lsx.combined2) && dyn->insts[ninst].lsx.swapped) { + // if(dyn->need_dump) dynarec_log(LOG_NONE, "lsxcache_promote_double_forward, ninst=%d swapped %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack); + if (a == dyn->insts[ninst].lsx.combined1) + a = dyn->insts[ninst].lsx.combined2; + else if (a == dyn->insts[ninst].lsx.combined2) + a = dyn->insts[ninst].lsx.combined1; + } + int i = lsxcache_get_st_f_i64_noback(dyn, ninst, a); + if (i < 0) return; + dyn->insts[ninst].lsx.lsxcache[i].t = LSX_CACHE_ST_D; + if (dyn->insts[ninst].x87precision) dyn->need_x87check = 2; + // check combined propagation too + if ((dyn->insts[ninst].lsx.combined1 || dyn->insts[ninst].lsx.combined2) && !dyn->insts[ninst].lsx.swapped) { + // if(dyn->need_dump) dynarec_log(LOG_NONE, "lsxcache_promote_double_forward, ninst=%d combined %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack); + lsxcache_promote_double_combined(dyn, ninst, maxinst, a); + } + a -= dyn->insts[ninst].lsx.stack_pop; // adjust Stack depth: remove pop'd ST (going forward) + if (dyn->insts[ninst].x64.has_next && !dyn->insts[ninst].lsx.barrier) + ++ninst; + else + ninst = -1; + } + if (ninst == maxinst) + lsxcache_promote_double(dyn, ninst, a); +} + +void lsxcache_promote_double(dynarec_la64_t* dyn, int ninst, int a) +{ + int i = lsxcache_get_current_st_f_i64(dyn, a); + if (i < 0) return; + dyn->lsx.lsxcache[i].t = LSX_CACHE_ST_D; + dyn->insts[ninst].lsx.lsxcache[i].t = LSX_CACHE_ST_D; + if (dyn->insts[ninst].x87precision) dyn->need_x87check = 2; + // check combined propagation too + if (dyn->lsx.combined1 || dyn->lsx.combined2) { + if (dyn->lsx.swapped) { + if 
(dyn->lsx.combined1 == a) + a = dyn->lsx.combined2; + else if (dyn->lsx.combined2 == a) + a = dyn->lsx.combined1; + } else { + if (dyn->lsx.combined1 == a) + lsxcache_promote_double(dyn, ninst, dyn->lsx.combined2); + else if (dyn->lsx.combined2 == a) + lsxcache_promote_double(dyn, ninst, dyn->lsx.combined1); + } + } + a -= dyn->insts[ninst].lsx.stack_push; // // adjust Stack depth: remove push'd ST (going backward) + if (!ninst || a < 0) return; + lsxcache_promote_double_internal(dyn, ninst - 1, ninst, a); +} + +int lsxcache_combine_st(dynarec_la64_t* dyn, int ninst, int a, int b) +{ + dyn->lsx.combined1 = a; + dyn->lsx.combined2 = b; + if (lsxcache_get_current_st(dyn, ninst, a) == LSX_CACHE_ST_F + && lsxcache_get_current_st(dyn, ninst, b) == LSX_CACHE_ST_F) + return LSX_CACHE_ST_F; + return LSX_CACHE_ST_D; +} + static int isCacheEmpty(dynarec_native_t* dyn, int ninst) { if (dyn->insts[ninst].lsx.stack_next) { @@ -549,6 +774,12 @@ void fpu_reset(dynarec_la64_t* dyn) fpu_reset_reg(dyn); } +int fpu_is_st_freed(dynarec_la64_t* dyn, int ninst, int st) +{ + return (dyn->lsx.tags & (0b11 << (st * 2))) ? 1 : 0; +} + + void fpu_reset_ninst(dynarec_la64_t* dyn, int ninst) { // TODO: x87 and mmx diff --git a/src/dynarec/la64/dynarec_la64_functions.h b/src/dynarec/la64/dynarec_la64_functions.h index 4b96b497..18c8b0d3 100644 --- a/src/dynarec/la64/dynarec_la64_functions.h +++ b/src/dynarec/la64/dynarec_la64_functions.h @@ -12,6 +12,8 @@ typedef struct dynarec_la64_s dynarec_la64_t; int fpu_get_scratch(dynarec_la64_t* dyn); // Reset scratch regs counter void fpu_reset_scratch(dynarec_la64_t* dyn); +// Get an x87 double reg +int fpu_get_reg_x87(dynarec_la64_t* dyn, int t, int n); // Get an XMM quad reg int fpu_get_reg_xmm(dynarec_la64_t* dyn, int t, int xmm); // Get an YMM quad reg @@ -23,6 +25,25 @@ void fpu_reset_reg(dynarec_la64_t* dyn); // Get an MMX double reg int fpu_get_reg_emm(dynarec_la64_t* dyn, int emm); +// Get type for STx +int lsxcache_get_st(dynarec_la64_t* dyn, int ninst, int a); +// Get if STx is FLOAT or DOUBLE +int lsxcache_get_st_f(dynarec_la64_t* dyn, int ninst, int a); +// Get if STx is FLOAT or I64 +int lsxcache_get_st_f_i64(dynarec_la64_t* dyn, int ninst, int a); +// Get actual type for STx +int lsxcache_get_current_st(dynarec_la64_t* dyn, int ninst, int a); +// Get actual STx is FLOAT or DOUBLE +int lsxcache_get_current_st_f(dynarec_la64_t* dyn, int a); +// Get actual STx is FLOAT or I64 +int lsxcache_get_current_st_f_i64(dynarec_la64_t* dyn, int a); +// Back-propagate a change float->double +void lsxcache_promote_double(dynarec_la64_t* dyn, int ninst, int a); +// Combine and propagate if needed (pass 1 only) +int lsxcache_combine_st(dynarec_la64_t* dyn, int ninst, int a, int b); // with stack current dyn->n_stack* +// Do not allow i64 type +int lsxcache_no_i64(dynarec_la64_t* dyn, int ninst, int st, int a); + // FPU Cache transformation (for loops) // Specific, need to be written by backend int fpuCacheNeedsTransform(dynarec_la64_t* dyn, int ninst); @@ -39,6 +60,8 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode); // reset the cache void fpu_reset(dynarec_native_t* dyn); void fpu_reset_ninst(dynarec_native_t* dyn, int ninst); +// is st freed +int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st); void updateNativeFlags(dynarec_la64_t* dyn); void get_free_scratch(dynarec_la64_t* dyn, int ninst, uint8_t* tmp1, uint8_t* tmp2, uint8_t* tmp3, uint8_t s1, uint8_t s2, uint8_t s3, uint8_t s4, uint8_t s5); diff --git 
a/src/dynarec/la64/dynarec_la64_helper.c b/src/dynarec/la64/dynarec_la64_helper.c index d283c09f..5a774e46 100644 --- a/src/dynarec/la64/dynarec_la64_helper.c +++ b/src/dynarec/la64/dynarec_la64_helper.c @@ -695,7 +695,7 @@ void call_c(dynarec_la64_t* dyn, int ninst, la64_consts_t fnc, int reg, int ret, { MAYUSE(fnc); if (savereg == 0) - savereg = x6; + savereg = x87pc; if (saveflags) { RESTORE_EFLAGS(reg); ST_D(xFlags, xEmu, offsetof(x64emu_t, eflags)); @@ -753,6 +753,8 @@ void call_c(dynarec_la64_t* dyn, int ninst, la64_consts_t fnc, int reg, int ret, LD_D(xFlags, xEmu, offsetof(x64emu_t, eflags)); SPILL_EFLAGS(); } + if (savereg != x87pc && dyn->need_x87check) + NATIVE_RESTORE_X87PC(); // SET_NODF(); dyn->last_ip = 0; } @@ -783,10 +785,240 @@ void grab_segdata(dynarec_la64_t* dyn, uintptr_t addr, int ninst, int reg, int s MESSAGE(LOG_DUMP, "----%s Offset\n", (segment == _FS) ? "FS" : "GS"); } +int x87_stackcount(dynarec_la64_t* dyn, int ninst, int scratch) +{ + MAYUSE(scratch); + if (!dyn->lsx.x87stack) + return 0; + if (dyn->lsx.mmxcount) + mmx_purgecache(dyn, ninst, 0, scratch); + MESSAGE(LOG_DUMP, "\tSynch x87 Stackcount (%d)\n", dyn->lsx.x87stack); + int a = dyn->lsx.x87stack; + // Add x87stack to emu fpu_stack + LD_W(scratch, xEmu, offsetof(x64emu_t, fpu_stack)); + ADDI_D(scratch, scratch, a); + ST_W(scratch, xEmu, offsetof(x64emu_t, fpu_stack)); + // Sub x87stack to top, with and 7 + LD_W(scratch, xEmu, offsetof(x64emu_t, top)); + ADDI_D(scratch, scratch, -a); + ANDI(scratch, scratch, 7); + ST_W(scratch, xEmu, offsetof(x64emu_t, top)); + // reset x87stack, but not the stack count of extcache + dyn->lsx.x87stack = 0; + dyn->lsx.stack_next -= dyn->lsx.stack; + int ret = dyn->lsx.stack; + dyn->lsx.stack = 0; + MESSAGE(LOG_DUMP, "\t------x87 Stackcount\n"); + return ret; +} +void x87_unstackcount(dynarec_la64_t* dyn, int ninst, int scratch, int count) +{ + MAYUSE(scratch); + if (!count) + return; + if (dyn->lsx.mmxcount) + mmx_purgecache(dyn, ninst, 0, scratch); + MESSAGE(LOG_DUMP, "\tSynch x87 Unstackcount (%d)\n", count); + int a = -count; + // Add x87stack to emu fpu_stack + LD_W(scratch, xEmu, offsetof(x64emu_t, fpu_stack)); + ADDI_D(scratch, scratch, a); + ST_W(scratch, xEmu, offsetof(x64emu_t, fpu_stack)); + // Sub x87stack to top, with and 7 + LD_W(scratch, xEmu, offsetof(x64emu_t, top)); + ADDI_D(scratch, scratch, -a); + ANDI(scratch, scratch, 7); + ST_W(scratch, xEmu, offsetof(x64emu_t, top)); + // reset x87stack, but not the stack count of extcache + dyn->lsx.x87stack = count; + dyn->lsx.stack = count; + dyn->lsx.stack_next += dyn->lsx.stack; + MESSAGE(LOG_DUMP, "\t------x87 Unstackcount\n"); +} void x87_forget(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st) { - // TODO + int ret = -1; + for (int i = 0; (i < 8) && (ret == -1); ++i) + if (dyn->lsx.x87cache[i] == st) + ret = i; + if (ret == -1) // nothing to do + return; + MESSAGE(LOG_DUMP, "\tForget x87 Cache for ST%d\n", st); + const int reg = dyn->lsx.x87reg[ret]; +#if STEP == 1 + if (dyn->lsx.lsxcache[dyn->lsx.x87reg[ret]].t == LSX_CACHE_ST_F + || dyn->lsx.lsxcache[dyn->lsx.x87reg[ret]].t == LSX_CACHE_ST_I64) + lsxcache_promote_double(dyn, ninst, st); +#endif + // prepare offset to fpu => s1 + // Get top + LD_W(s2, xEmu, offsetof(x64emu_t, top)); + // Update + int a = st - dyn->lsx.x87stack; + if (a) { + ADDI_D(s2, s2, a); + ANDI(s2, s2, 7); // (emu->top + i)&7 + } + SLLI_D(s2, s2, 3); + ADD_D(s1, xEmu, s2); + if (dyn->lsx.lsxcache[reg].t == LSX_CACHE_ST_F) { + FCVT_D_S(SCRATCH0, reg); + FST_D(SCRATCH0, s1, 
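/* x87_stackcount()/x87_unstackcount() above bracket calls into C helpers: the dynarec folds x87 pushes/pops into a compile-time delta (lsx.x87stack) and only materializes it when emulated state can be observed, roughly:
       emu->fpu_stack += delta;
       emu->top = (emu->top - delta) & 7;   // 8-slot ring, hence the ANDI(.., 7)
   x87_unstackcount() then re-arms the same delta so subsequent opcodes keep folding stack motion into immediate offsets instead of load/modify/store sequences. */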
offsetof(x64emu_t, x87)); + } else if (dyn->lsx.lsxcache[reg].t == LSX_CACHE_ST_I64) { + FFINT_D_L(SCRATCH0, reg); + FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87)); + } else { + FST_D(reg, s1, offsetof(x64emu_t, x87)); + } + MESSAGE(LOG_DUMP, "\t--------x87 Cache for ST%d\n", st); + // and forget that cache + fpu_free_reg(dyn, dyn->lsx.x87reg[ret]); + dyn->lsx.lsxcache[reg].v = 0; + dyn->lsx.x87cache[ret] = -1; + dyn->lsx.x87reg[ret] = -1; +} + + +void x87_reget_st(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st) +{ + if (dyn->lsx.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + // search in cache first + for (int i = 0; i < 8; ++i) + if (dyn->lsx.x87cache[i] == st) { + // refresh the value + MESSAGE(LOG_DUMP, "\tRefresh x87 Cache for ST%d\n", st); +#if STEP == 1 + if (dyn->lsx.lsxcache[dyn->lsx.x87reg[i]].t == LSX_CACHE_ST_F + || dyn->lsx.lsxcache[dyn->lsx.x87reg[i]].t == LSX_CACHE_ST_I64) + lsxcache_promote_double(dyn, ninst, st); +#endif + LD_W(s2, xEmu, offsetof(x64emu_t, top)); + int a = st - dyn->lsx.x87stack; + if (a) { + ADDI_D(s2, s2, a); + AND(s2, s2, 7); + } + SLLI_D(s2, s2, 3); + ADD_D(s1, xEmu, s2); + FLD_D(dyn->lsx.x87reg[i], s1, offsetof(x64emu_t, x87)); + MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); + // ok + return; + } + // Was not in the cache? creating it.... + MESSAGE(LOG_DUMP, "\tCreate x87 Cache for ST%d\n", st); + // get a free spot + int ret = -1; + for (int i = 0; (i < 8) && (ret == -1); ++i) + if (dyn->lsx.x87cache[i] == -1) + ret = i; + // found, setup and grab the value + dyn->lsx.x87cache[ret] = st; + dyn->lsx.x87reg[ret] = fpu_get_reg_x87(dyn, LSX_CACHE_ST_D, st); + LD_W(s2, xEmu, offsetof(x64emu_t, top)); + int a = st - dyn->lsx.x87stack; + ADDI_D(s2, s2, a); + ANDI(s2, s2, 7); // (emu->top + i)&7 + SLLI_D(s2, s2, 3); + ADD_D(s1, xEmu, s2); + FLD_D(dyn->lsx.x87reg[ret], s1, offsetof(x64emu_t, x87)); + MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); +} + +void x87_free(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int st) +{ + int ret = -1; + for (int i = 0; (i < 8) && (ret == -1); ++i) + if (dyn->lsx.x87cache[i] == st) + ret = i; + MESSAGE(LOG_DUMP, "\tFFREE%s x87 Cache for ST%d\n", (ret != -1) ? 
" (and Forget)" : "", st); + if (ret != -1) { + const int reg = dyn->lsx.x87reg[ret]; +#if STEP == 1 + if (dyn->lsx.lsxcache[reg].t == LSX_CACHE_ST_F || dyn->lsx.lsxcache[reg].t == LSX_CACHE_ST_I64) + lsxcache_promote_double(dyn, ninst, st); +#endif + // Get top + LD_W(s2, xEmu, offsetof(x64emu_t, top)); + // Update + int ast = st - dyn->lsx.x87stack; + if (ast) { + ADDI_D(s2, s2, ast); + ANDI(s2, s2, 7); // (emu->top + i)&7 + } + SLLI_D(s2, s2, 3); + ADD_D(s1, xEmu, s2); + if (dyn->lsx.lsxcache[reg].t == LSX_CACHE_ST_F) { + FCVT_D_S(SCRATCH0, reg); + FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87)); + } else if (dyn->lsx.lsxcache[reg].t == LSX_CACHE_ST_I64) { + FFINT_D_L(SCRATCH0, reg); + FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87)); + } else { + FST_D(reg, s1, offsetof(x64emu_t, x87)); + } + // and forget that cache + fpu_free_reg(dyn, reg); + dyn->lsx.lsxcache[reg].v = 0; + dyn->lsx.x87cache[ret] = -1; + dyn->lsx.x87reg[ret] = -1; + } else { + // Get top + LD_W(s2, xEmu, offsetof(x64emu_t, top)); + // Update + int ast = st - dyn->lsx.x87stack; + if (ast) { + ADDI_D(s2, s2, ast); + ANDI(s2, s2, 7); // (emu->top + i)&7 + } + } + // add mark in the freed array + dyn->lsx.tags |= 0b11 << (st * 2); + MESSAGE(LOG_DUMP, "\t--------x87 FFREE for ST%d\n", st); +} + +void x87_swapreg(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a, int b) +{ + int i1, i2, i3; + i1 = x87_get_cache(dyn, ninst, 1, s1, s2, b, X87_ST(b)); + i2 = x87_get_cache(dyn, ninst, 1, s1, s2, a, X87_ST(a)); + i3 = dyn->lsx.x87cache[i1]; + dyn->lsx.x87cache[i1] = dyn->lsx.x87cache[i2]; + dyn->lsx.x87cache[i2] = i3; + // swap those too + int j1, j2, j3; + j1 = x87_get_lsxcache(dyn, ninst, s1, s2, b); + j2 = x87_get_lsxcache(dyn, ninst, s1, s2, a); + j3 = dyn->lsx.lsxcache[j1].n; + dyn->lsx.lsxcache[j1].n = dyn->lsx.lsxcache[j2].n; + dyn->lsx.lsxcache[j2].n = j3; + // mark as swapped + dyn->lsx.swapped = 1; + dyn->lsx.combined1 = a; + dyn->lsx.combined2 = b; +} + +// Set rounding according to cw flags, return reg to restore flags +int x87_setround(dynarec_la64_t* dyn, int ninst, int s1, int s2) +{ + MAYUSE(dyn); + MAYUSE(ninst); + MAYUSE(s1); + MAYUSE(s2); + LD_W(s1, xEmu, offsetof(x64emu_t, cw)); + BSTRPICK_W(s1, s1, 11, 10); + // MMX/x87 Round mode: 0..3: Nearest, Down, Up, Chop + // LA64: 0..3: Nearest, TowardZero, TowardsPositive, TowardsNegative + // 0->0, 1->3, 2->2, 3->1 + SUB_W(s1, xZR, s1); + ANDI(s1, s1, 3); + // done + SLLI_D(s1, s1, 8); + MOVFCSR2GR(s2, FCSR3); + MOVGR2FCSR(FCSR3, s1); // exchange RM with current + return s2; } // Set rounding according to mxcsr flags, return reg to restore flags @@ -810,10 +1042,394 @@ int sse_setround(dynarec_la64_t* dyn, int ninst, int s1, int s2) return s2; } +int lsxcache_st_coherency(dynarec_la64_t* dyn, int ninst, int a, int b) +{ + int i1 = lsxcache_get_st(dyn, ninst, a); + int i2 = lsxcache_get_st(dyn, ninst, b); + if (i1 != i2) { + MESSAGE(LOG_DUMP, "Warning, ST cache incoherent between ST%d(%d) and ST%d(%d)\n", a, i1, b, i2); + } + + return i1; +} + +// On step 1, Float/Double for ST is actually computed and back-propagated +// On step 2-3, the value is just read for inst[...].n.neocache[..] 
+// the reg returned is *2 for FLOAT +int x87_do_push(dynarec_la64_t* dyn, int ninst, int s1, int t) +{ + if (dyn->lsx.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + dyn->lsx.x87stack += 1; + dyn->lsx.stack += 1; + dyn->lsx.stack_next += 1; + dyn->lsx.stack_push += 1; + ++dyn->lsx.pushed; + if (dyn->lsx.poped) + --dyn->lsx.poped; + // move all regs in cache, and find a free one + for (int j = 0; j < 24; ++j) + if ((dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_D) + || (dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_F) + || (dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_I64)) + ++dyn->lsx.lsxcache[j].n; + int ret = -1; + dyn->lsx.tags <<= 2; + for (int i = 0; i < 8; ++i) + if (dyn->lsx.x87cache[i] != -1) + ++dyn->lsx.x87cache[i]; + else if (ret == -1) { + dyn->lsx.x87cache[i] = 0; + ret = dyn->lsx.x87reg[i] = fpu_get_reg_x87(dyn, t, 0); + dyn->lsx.lsxcache[ret].t = X87_ST0; + } + if (ret == -1) { + MESSAGE(LOG_DUMP, "Incoherent x87 stack cache, aborting\n"); + dyn->abort = 1; + } + return ret; +} +void x87_do_push_empty(dynarec_la64_t* dyn, int ninst, int s1) +{ + if (dyn->lsx.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + dyn->lsx.x87stack += 1; + dyn->lsx.stack += 1; + dyn->lsx.stack_next += 1; + dyn->lsx.stack_push += 1; + ++dyn->lsx.pushed; + if (dyn->lsx.poped) + --dyn->lsx.poped; + // move all regs in cache + for (int j = 0; j < 24; ++j) + if ((dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_D) + || (dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_F) + || (dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_I64)) + ++dyn->lsx.lsxcache[j].n; + int ret = -1; + dyn->lsx.tags <<= 2; + for (int i = 0; i < 8; ++i) + if (dyn->lsx.x87cache[i] != -1) + ++dyn->lsx.x87cache[i]; + else if (ret == -1) + ret = i; + if (ret == -1) { + MESSAGE(LOG_DUMP, "Incoherent x87 stack cache, aborting\n"); + dyn->abort = 1; + } +} +static void internal_x87_dopop(dynarec_la64_t* dyn) +{ + for (int i = 0; i < 8; ++i) + if (dyn->lsx.x87cache[i] != -1) { + --dyn->lsx.x87cache[i]; + if (dyn->lsx.x87cache[i] == -1) { + fpu_free_reg(dyn, dyn->lsx.x87reg[i]); + dyn->lsx.x87reg[i] = -1; + } + } +} +static int internal_x87_dofree(dynarec_la64_t* dyn) +{ + if (dyn->lsx.tags & 0b11) { + MESSAGE(LOG_DUMP, "\t--------x87 FREED ST0, poping 1 more\n"); + return 1; + } + return 0; +} +void x87_do_pop(dynarec_la64_t* dyn, int ninst, int s1) +{ + if (dyn->lsx.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + do { + dyn->lsx.x87stack -= 1; + dyn->lsx.stack_next -= 1; + dyn->lsx.stack_pop += 1; + dyn->lsx.tags >>= 2; + ++dyn->lsx.poped; + if (dyn->lsx.pushed) + --dyn->lsx.pushed; + // move all regs in cache, poping ST0 + internal_x87_dopop(dyn); + } while (internal_x87_dofree(dyn)); +} + void x87_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3) { - // TODO + int ret = 0; + for (int i = 0; i < 8 && !ret; ++i) + if (dyn->lsx.x87cache[i] != -1) + ret = 1; + if (!ret && !dyn->lsx.x87stack) // nothing to do + return; + MESSAGE(LOG_DUMP, "\tPurge %sx87 Cache and Synch Stackcount (%+d)---\n", next ? 
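/* On a purge, the deferred counter is flushed the same way x87_stackcount() does it, and the 16-bit tag word advances two bits per net push (each of the 8 slots owns a 2-bit tag; 00 = valid, 11 = empty), so the newly occupied low slots read as valid:
       emu->fpu_tags = (uint16_t)(emu->fpu_tags << 2 * net_pushes);
       emu->fpu_tags |= lsx.tags;   // slots released via FFREE become 0b11
   (names illustrative; the second line matches the "check if free is used" block below). */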
"locally " : "", dyn->lsx.x87stack); + int a = dyn->lsx.x87stack; + if (a != 0) { + // reset x87stack + if (!next) + dyn->lsx.x87stack = 0; + // Add x87stack to emu fpu_stack + LD_W(s2, xEmu, offsetof(x64emu_t, fpu_stack)); + ADDI_D(s2, s2, a); + ST_W(s2, xEmu, offsetof(x64emu_t, fpu_stack)); + // Sub x87stack to top, with and 7 + LD_W(s2, xEmu, offsetof(x64emu_t, top)); + // update tags (and top at the same time) + ADDI_D(s2, s2, -a); + ANDI(s2, s2, 7); + ST_W(s2, xEmu, offsetof(x64emu_t, top)); + // update tags (and top at the same time) + LD_HU(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + if (a > 0) { + SLLI_D(s1, s1, a * 2); + } else { + BSTRINS_D(s1, xZR, 15, 0); + SRLI_D(s1, s1, -a * 2); + } + ST_H(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + } else { + LD_W(s2, xEmu, offsetof(x64emu_t, top)); + } + // check if free is used + if (dyn->lsx.tags) { + LD_H(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + MOV32w(s3, dyn->lsx.tags); + OR(s1, s1, s3); + ST_H(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + } + if (ret != 0) { + // --- set values + // Get top + // loop all cache entries + for (int i = 0; i < 8; ++i) + if (dyn->lsx.x87cache[i] != -1) { + int st = dyn->lsx.x87cache[i] + dyn->lsx.stack_pop; +#if STEP == 1 + if (!next) { // don't force promotion here + // pre-apply pop, because purge happens in-between + lsxcache_promote_double(dyn, ninst, st); + } +#endif +#if STEP == 3 + if (!next && lsxcache_get_current_st(dyn, ninst, st) != LSX_CACHE_ST_D) { + MESSAGE(LOG_DUMP, "Warning, incoherency with purged ST%d cache\n", st); + } +#endif + ADDI_D(s3, s2, dyn->lsx.x87cache[i]); // unadjusted count, as it's relative to real top + ANDI(s3, s3, 7); // (emu->top + st)&7 + SLLI_D(s1, s3, 3); + ADD_D(s1, xEmu, s1); + switch (lsxcache_get_current_st(dyn, ninst, st)) { + case LSX_CACHE_ST_D: + FST_D(dyn->lsx.x87reg[i], s1, offsetof(x64emu_t, x87)); // save the value + break; + case LSX_CACHE_ST_F: + FCVT_D_S(SCRATCH0, dyn->lsx.x87reg[i]); + FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87)); // save the value + break; + case LSX_CACHE_ST_I64: + FFINT_D_L(SCRATCH0, dyn->lsx.x87reg[i]); + FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87)); // save the value + break; + } + if (!next) { + fpu_free_reg(dyn, dyn->lsx.x87reg[i]); + dyn->lsx.x87reg[i] = -1; + dyn->lsx.x87cache[i] = -1; + // dyn->lsx.stack_pop+=1; //no pop, but the purge because of barrier will have the n.barrier flags set + } + } + } + if (!next) { + dyn->lsx.stack_next = 0; + dyn->lsx.tags = 0; +#if STEP < 2 + // refresh the cached valued, in case it's a purge outside a instruction + dyn->insts[ninst].lsx.barrier = 1; + dyn->lsx.pushed = 0; + dyn->lsx.poped = 0; + +#endif + } + MESSAGE(LOG_DUMP, "\t---Purge x87 Cache and Synch Stackcount\n"); +} + +void x87_reflectcount(dynarec_la64_t* dyn, int ninst, int s1, int s2) +{ + // Synch top and stack count + int a = dyn->lsx.x87stack; + if (a) { + MESSAGE(LOG_DUMP, "\tSync x87 Count of %d-----\n", a); + // Add x87stack to emu fpu_stack + LD_W(s2, xEmu, offsetof(x64emu_t, fpu_stack)); + ADDI_D(s2, s2, a); + ST_W(s2, xEmu, offsetof(x64emu_t, fpu_stack)); + // Sub x87stack to top, with and 7 + LD_W(s2, xEmu, offsetof(x64emu_t, top)); + ADDI_D(s2, s2, -a); + ANDI(s2, s2, 7); + ST_W(s2, xEmu, offsetof(x64emu_t, top)); + // update tags + LD_H(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + if (a > 0) { + SLLI_D(s1, s1, a * 2); + } else { + MOV32w(s2, 0xffff0000); + OR(s1, s1, s2); + SRLI_D(s1, s1, -a * 2); + } + ST_H(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + } +} + +static void x87_reflectcache(dynarec_la64_t* dyn, int ninst, 
int s1, int s2, int s3) +{ + // Sync top and stack count + int a = dyn->lsx.x87stack; + if (a) { + // Add x87stack to emu fpu_stack + LD_W(s2, xEmu, offsetof(x64emu_t, fpu_stack)); + ADDI_D(s2, s2, a); + ST_W(s2, xEmu, offsetof(x64emu_t, fpu_stack)); + // Sub x87stack to top, with and 7 + LD_W(s2, xEmu, offsetof(x64emu_t, top)); + ADDI_D(s2, s2, -a); + ANDI(s2, s2, 7); + ST_W(s2, xEmu, offsetof(x64emu_t, top)); + // update tags (and top at the same time) + LD_H(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + if (a > 0) { + SLLI_D(s1, s1, a * 2); + } else { + MOV32w(s3, 0xffff0000); + OR(s1, s1, s3); + SRLI_D(s1, s1, -a * 2); + } + ST_H(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + } + int ret = 0; + for (int i = 0; (i < 8) && (!ret); ++i) + if (dyn->lsx.x87cache[i] != -1) + ret = 1; + if (!ret) // nothing to do + return; + // prepare offset to fpu => s1 + // Get top + if (!a) { + LD_W(s2, xEmu, offsetof(x64emu_t, top)); + } + // loop all cache entries + for (int i = 0; i < 8; ++i) + if (dyn->lsx.x87cache[i] != -1) { + ADDI_D(s3, s2, dyn->lsx.x87cache[i]); + ANDI(s3, s3, 7); // (emu->top + i)&7 + SLLI_D(s1, s3, 3); + ADD_D(s1, xEmu, s1); + if (lsxcache_get_current_st_f(dyn, dyn->lsx.x87cache[i]) >= 0) { + FCVT_D_S(SCRATCH0, dyn->lsx.x87reg[i]); + FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87)); + } else + FST_D(dyn->lsx.x87reg[i], s1, offsetof(x64emu_t, x87)); + } +} + + +void x87_unreflectcount(dynarec_la64_t* dyn, int ninst, int s1, int s2) +{ + // revert top and stack count + int a = dyn->lsx.x87stack; + if (a) { + // Sub x87stack to emu fpu_stack + LD_W(s2, xEmu, offsetof(x64emu_t, fpu_stack)); + ADDI_D(s2, s2, -a); + ST_W(s2, xEmu, offsetof(x64emu_t, fpu_stack)); + // Add x87stack to top, with and 7 + LD_W(s2, xEmu, offsetof(x64emu_t, top)); + ADDI_D(s2, s2, a); + ANDI(s2, s2, 7); + ST_W(s2, xEmu, offsetof(x64emu_t, top)); + // update tags + LD_H(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + if (a > 0) { + MOV32w(s2, 0xffff0000); + OR(s1, s1, s2); + SRLI_D(s1, s1, a * 2); + } else { + SLLI_D(s1, s1, -a * 2); + } + ST_H(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + } +} + +int x87_get_current_cache(dynarec_la64_t* dyn, int ninst, int st, int t) +{ + // search in cache first + for (int i = 0; i < 8; ++i) { + if (dyn->lsx.x87cache[i] == st) { +#if STEP == 1 + if (t == LSX_CACHE_ST_D && (dyn->lsx.lsxcache[dyn->lsx.x87reg[i]].t == LSX_CACHE_ST_F || dyn->lsx.lsxcache[dyn->lsx.x87reg[i]].t == LSX_CACHE_ST_I64)) + lsxcache_promote_double(dyn, ninst, st); + else if (t == LSX_CACHE_ST_I64 && (dyn->lsx.lsxcache[dyn->lsx.x87reg[i]].t == LSX_CACHE_ST_F)) + lsxcache_promote_double(dyn, ninst, st); + else if (t == LSX_CACHE_ST_F && (dyn->lsx.lsxcache[dyn->lsx.x87reg[i]].t == LSX_CACHE_ST_I64)) + lsxcache_promote_double(dyn, ninst, st); +#endif + return i; + } + assert(dyn->lsx.x87cache[i] < 8); + } + return -1; +} + +int x87_get_cache(dynarec_la64_t* dyn, int ninst, int populate, int s1, int s2, int st, int t) +{ + if (dyn->lsx.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + int ret = x87_get_current_cache(dyn, ninst, st, t); + if (ret != -1) + return ret; + MESSAGE(LOG_DUMP, "\tCreate %sx87 Cache for ST%d\n", populate ? 
"and populate " : "", st); + // get a free spot + for (int i = 0; (i < 8) && (ret == -1); ++i) + if (dyn->lsx.x87cache[i] == -1) + ret = i; + // found, setup and grab the value + dyn->lsx.x87cache[ret] = st; + dyn->lsx.x87reg[ret] = fpu_get_reg_x87(dyn, LSX_CACHE_ST_D, st); + if (populate) { + LD_W(s2, xEmu, offsetof(x64emu_t, top)); + int a = st - dyn->lsx.x87stack; + if (a) { + ADDI_D(s2, s2, a); + ANDI(s2, s2, 7); + } + SLLI_D(s2, s2, 3); + ADD_D(s1, xEmu, s2); + FLD_D(dyn->lsx.x87reg[ret], s1, offsetof(x64emu_t, x87)); + } + MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); + + return ret; +} +int x87_get_lsxcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st) +{ + for (int ii = 0; ii < 24; ++ii) + if ((dyn->lsx.lsxcache[ii].t == LSX_CACHE_ST_F + || dyn->lsx.lsxcache[ii].t == LSX_CACHE_ST_D + || dyn->lsx.lsxcache[ii].t == LSX_CACHE_ST_I64) + && dyn->lsx.lsxcache[ii].n == st) + return ii; + assert(0); + return -1; +} +int x87_get_st(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a, int t) +{ + return dyn->lsx.x87reg[x87_get_cache(dyn, ninst, 1, s1, s2, a, t)]; +} +int x87_get_st_empty(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a, int t) +{ + return dyn->lsx.x87reg[x87_get_cache(dyn, ninst, 0, s1, s2, a, t)]; } // Restore round flag @@ -1259,7 +1875,7 @@ void fpu_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, in void fpu_reflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3) { - // TODO: x87_reflectcache(dyn, ninst, s1, s2, s3); + x87_reflectcache(dyn, ninst, s1, s2, s3); mmx_reflectcache(dyn, ninst, s1); sse_reflectcache(dyn, ninst, s1); avx_reflectcache(dyn, ninst, s1); @@ -1267,7 +1883,8 @@ void fpu_reflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3) void fpu_unreflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3) { - // TODO + // need to undo the top and stack tracking that must not be reflected permanently yet + x87_unreflectcount(dyn, ninst, s1, s2); } void emit_pf(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4) @@ -1294,13 +1911,43 @@ void fpu_reset_cache(dynarec_la64_t* dyn, int ninst, int reset_n) #if STEP > 1 // for STEP 2 & 3, just need to refrest with current, and undo the changes (push & swap) dyn->lsx = dyn->insts[ninst].lsx; - lsxcacheUnwind(&dyn->lsx); -#ifdef HAVE_TRACE -// TODO: trace -#endif // HAVE_TRACE #else dyn->lsx = dyn->insts[reset_n].lsx; #endif + lsxcacheUnwind(&dyn->lsx); +#if STEP == 0 + if (dyn->need_dump) dynarec_log(LOG_NONE, "New x87stack=%d\n", dyn->lsx.x87stack); +#endif +#if defined(HAVE_TRACE) && (STEP > 2) + if (dyn->need_dump) + if (memcmp(&dyn->lsx, &dyn->insts[reset_n].lsx, sizeof(lsx_cache_t))) { + MESSAGE(LOG_DEBUG, "Warning, difference in lsxcache: reset="); + for (int i = 0; i < 24; ++i) + if (dyn->insts[reset_n].lsx.lsxcache[i].v) + MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[reset_n].lsx.lsxcache[i].t, dyn->insts[reset_n].lsx.lsxcache[i].n)); + if (dyn->insts[reset_n].lsx.combined1 || dyn->insts[reset_n].lsx.combined2) + MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->insts[reset_n].lsx.swapped ? 
"SWP" : "CMB", dyn->insts[reset_n].lsx.combined1, dyn->insts[reset_n].lsx.combined2); + if (dyn->insts[reset_n].lsx.stack_push || dyn->insts[reset_n].lsx.stack_pop) + MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[reset_n].lsx.stack_push, -dyn->insts[reset_n].lsx.stack_pop); + MESSAGE(LOG_DEBUG, " ==> "); + for (int i = 0; i < 24; ++i) + if (dyn->insts[ninst].lsx.lsxcache[i].v) + MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[ninst].lsx.lsxcache[i].t, dyn->insts[ninst].lsx.lsxcache[i].n)); + if (dyn->insts[ninst].lsx.combined1 || dyn->insts[ninst].lsx.combined2) + MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->insts[ninst].lsx.swapped ? "SWP" : "CMB", dyn->insts[ninst].lsx.combined1, dyn->insts[ninst].lsx.combined2); + if (dyn->insts[ninst].lsx.stack_push || dyn->insts[ninst].lsx.stack_pop) + MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[ninst].lsx.stack_push, -dyn->insts[ninst].lsx.stack_pop); + MESSAGE(LOG_DEBUG, " -> "); + for (int i = 0; i < 24; ++i) + if (dyn->lsx.lsxcache[i].v) + MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->lsx.lsxcache[i].t, dyn->lsx.lsxcache[i].n)); + if (dyn->lsx.combined1 || dyn->lsx.combined2) + MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->lsx.swapped ? "SWP" : "CMB", dyn->lsx.combined1, dyn->lsx.combined2); + if (dyn->lsx.stack_push || dyn->lsx.stack_pop) + MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->lsx.stack_push, -dyn->lsx.stack_pop); + MESSAGE(LOG_DEBUG, "\n"); + } +#endif // HAVE_TRACE } // propagate ST stack state, especial stack pop that are deferred @@ -1491,7 +2138,26 @@ static void loadCache(dynarec_la64_t* dyn, int ninst, int stack_cnt, int s1, int case LSX_CACHE_ST_F: case LSX_CACHE_ST_I64: MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); - // TODO: x87 + if ((*s3_top) == 0xffff) { + LD_W(s3, xEmu, offsetof(x64emu_t, top)); + *s3_top = 0; + } + int a = n - (*s3_top) - stack_cnt; + if (a) { + ADDI_D(s3, s3, a); + ANDI(s3, s3, 7); // (emu->top + i)&7 + } + *s3_top += a; + *s2_val = 0; + SLLI_D(s2, s3, 3); + ADD_D(s2, xEmu, s2); + FLD_D(i, s2, offsetof(x64emu_t, x87)); + if (t == LSX_CACHE_ST_F) { + FCVT_S_D(i, i); + } + if (t == LSX_CACHE_ST_I64) { + FTINTRZ_L_D(i, i); + } break; case LSX_CACHE_NONE: case LSX_CACHE_SCR: @@ -1528,7 +2194,26 @@ static void unloadCache(dynarec_la64_t* dyn, int ninst, int stack_cnt, int s1, i case LSX_CACHE_ST_F: case LSX_CACHE_ST_I64: MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); - // TODO: x87 + if ((*s3_top) == 0xffff) { + LD_W(s3, xEmu, offsetof(x64emu_t, top)); + *s3_top = 0; + } + int a = n - (*s3_top) - stack_cnt; + if (a) { + ADDI_D(s3, s3, a); + ANDI(s3, s3, 7); + } + *s3_top += a; + SLLI_D(s2, s3, 3); + ADD_D(s2, xEmu, s2); + *s2_val = 0; + if (t == LSX_CACHE_ST_F) { + FCVT_D_S(i, i); + } + if (t == LSX_CACHE_ST_I64) { + FFINT_D_L(i, i); + } + FST_D(i, s2, offsetof(x64emu_t, x87)); break; case LSX_CACHE_NONE: case LSX_CACHE_SCR: @@ -1576,13 +2261,6 @@ static void fpuCacheTransform(dynarec_la64_t* dyn, int ninst, int s1, int s2, in } int stack_cnt = dyn->lsx.stack_next; int s3_top = 0xffff; - if (stack_cnt != cache_i2.stack) { - MESSAGE(LOG_DUMP, "\t - adjust stack count %d -> %d -\n", stack_cnt, cache_i2.stack); - int a = stack_cnt - cache_i2.stack; - // TODO: x87 - s3_top = 0; - stack_cnt = cache_i2.stack; - } lsxcache_t cache = dyn->lsx; int s1_val = 0; int s2_val = 0; @@ -1666,6 +2344,31 @@ static void fpuCacheTransform(dynarec_la64_t* dyn, int ninst, int s1, int s2, in } } } + if (stack_cnt != cache_i2.stack) { + MESSAGE(LOG_DUMP, "\t - adjust stack count %d -> %d -\n", stack_cnt, 
cache_i2.stack); + int a = stack_cnt - cache_i2.stack; + // Add x87stack to emu fpu_stack + LD_WU(s3, xEmu, offsetof(x64emu_t, fpu_stack)); + ADDI_D(s3, s3, a); + ST_W(s3, xEmu, offsetof(x64emu_t, fpu_stack)); + // Sub x87stack to top, with and 7 + LD_WU(s3, xEmu, offsetof(x64emu_t, top)); + ADDI_D(s3, s3, -a); + ANDI(s3, s3, 7); + ST_W(s3, xEmu, offsetof(x64emu_t, top)); + // update tags + LD_H(s2, xEmu, offsetof(x64emu_t, fpu_tags)); + if (a > 0) { + SLLI_D(s2, s2, a * 2); + } else { + MOV32w(s3, 0xffff0000); + OR(s2, s2, s3); + SRLI_D(s2, s2, -a * 2); + } + ST_H(s2, xEmu, offsetof(x64emu_t, fpu_tags)); + s3_top = 0; + stack_cnt = cache_i2.stack; + } MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); } diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h index 9cb25114..440cfba5 100644 --- a/src/dynarec/la64/dynarec_la64_helper.h +++ b/src/dynarec/la64/dynarec_la64_helper.h @@ -794,6 +794,10 @@ #define BLTU_MARK(reg1, reg2) Bxx_gen(LTU, MARK, reg1, reg2) // Branch to MARK if reg1>=reg2 (use j64) #define BGE_MARK(reg1, reg2) Bxx_gen(GE, MARK, reg1, reg2) +// Branch to MARK2 if reg1>=0 (use j64) +#define BGE_MARK2(reg, reg2) Bxx_gen(GE, MARK2, reg, reg2) +// Branch to MARK3 if reg1>=0 (use j64) +#define BGE_MARK3(reg, reg2) Bxx_gen(GE, MARK3, reg, reg2) // Branch to MARK instruction unconditionnal (use j64) #define B_MARK_nocond Bxx_gen(__, MARK, 0, 0) @@ -849,6 +853,23 @@ #define IFX2X(A, B) if ((dyn->insts[ninst].x64.gen_flags == (A) || dyn->insts[ninst].x64.gen_flags == (B) || dyn->insts[ninst].x64.gen_flags == ((A) | (B)))) #define IFXN(A, B) if ((dyn->insts[ninst].x64.gen_flags & (A) && !(dyn->insts[ninst].x64.gen_flags & (B)))) +#ifndef NATIVE_RESTORE_X87PC +#define NATIVE_RESTORE_X87PC() \ + if (dyn->need_x87check) { \ + LD_D(x87pc, xEmu, offsetof(x64emu_t, cw)); \ + SRLI_D(x87pc, x87pc, 8); \ + ANDI(x87pc, x87pc, 0b11); \ + } +#endif +#ifndef X87_CHECK_PRECISION +#define X87_CHECK_PRECISION(A) \ + if (!ST_IS_F(0) && dyn->need_x87check) { \ + BNEZ(x87pc, 4 + 8); \ + FCVT_S_D(A, A); \ + FCVT_D_S(A, A); \ + } +#endif + #define STORE_REG(A) ST_D(x##A, xEmu, offsetof(x64emu_t, regs[_##A])) #define LOAD_REG(A) LD_D(x##A, xEmu, offsetof(x64emu_t, regs[_##A])) @@ -951,6 +972,37 @@ } \ } + +#if STEP == 0 +#define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t) var = x87_do_push(dyn, ninst, scratch, t) +#define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch) x87_do_push_empty(dyn, ninst, scratch) +#define X87_POP_OR_FAIL(dyn, ninst, scratch) x87_do_pop(dyn, ninst, scratch) +#else +#define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t) \ + if ((dyn->lsx.x87stack == 8) || (dyn->lsx.pushed == 8)) { \ + if (dyn->need_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->lsx.x87stack, dyn->lsx.pushed, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ + var = x87_do_push(dyn, ninst, scratch, t); + +#define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch) \ + if ((dyn->lsx.x87stack == 8) || (dyn->lsx.pushed == 8)) { \ + if (dyn->need_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->lsx.x87stack, dyn->lsx.pushed, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ + x87_do_push_empty(dyn, ninst, scratch); + +#define X87_POP_OR_FAIL(dyn, ninst, scratch) \ + if ((dyn->lsx.x87stack == -8) || (dyn->lsx.poped == 8)) { \ + if (dyn->need_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Pop, stack=%d/%d on inst %d\n", dyn->lsx.x87stack, dyn->lsx.poped, ninst); \ + dyn->abort = 1; \ + return 
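/* X87_CHECK_PRECISION above implements the x87 precision-control field: when the guest cw selects 24-bit precision (PC == 0), each double result is squeezed through a float and back, i.e. (illustrative model):
       double apply_pc(double d, unsigned pc) { return pc ? d : (double)(float)d; }
   NATIVE_RESTORE_X87PC keeps cw bits 8..9 cached in the otherwise-reserved r21 (x87pc, see la64_mapping.h below), so the hot path costs a single BNEZ. */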
addr; \ + } \ + x87_do_pop(dyn, ninst, scratch); +#endif + #ifndef MAYSETFLAGS #define MAYSETFLAGS() \ do { \ @@ -1053,6 +1105,9 @@ #ifndef TABLE64 #define TABLE64(A, V) #endif +#ifndef FTABLE64 +#define FTABLE64(A, V) +#endif #ifndef TABLE64C #define TABLE64C(A, V) #endif @@ -1148,6 +1203,16 @@ #define dynarec64_AVX_F3_0F STEPNAME(dynarec64_AVX_F3_0F) #define dynarec64_AVX_F3_0F38 STEPNAME(dynarec64_AVX_F3_0F38) +#define dynarec64_D8 STEPNAME(dynarec64_D8) +#define dynarec64_D9 STEPNAME(dynarec64_D9) +#define dynarec64_DA STEPNAME(dynarec64_DA) +#define dynarec64_DB STEPNAME(dynarec64_DB) +#define dynarec64_DC STEPNAME(dynarec64_DC) +#define dynarec64_DD STEPNAME(dynarec64_DD) +#define dynarec64_DE STEPNAME(dynarec64_DE) +#define dynarec64_DF STEPNAME(dynarec64_DF) +#define dynarec64_F0 STEPNAME(dynarec64_F0) + #define geted STEPNAME(geted) #define geted32 STEPNAME(geted32) #define jump_to_epilog STEPNAME(jump_to_epilog) @@ -1234,11 +1299,30 @@ #define emit_pf STEPNAME(emit_pf) -#define x87_restoreround STEPNAME(x87_restoreround) +#define x87_do_push STEPNAME(x87_do_push) +#define x87_do_push_empty STEPNAME(x87_do_push_empty) +#define x87_do_pop STEPNAME(x87_do_pop) +#define x87_get_current_cache STEPNAME(x87_get_current_cache) +#define x87_get_cache STEPNAME(x87_get_cache) +#define x87_get_lsxcache STEPNAME(x87_get_lsxcache) +#define x87_get_st STEPNAME(x87_get_st) +#define x87_get_st_empty STEPNAME(x87_get_st_empty) +#define x87_free STEPNAME(x87_free) +#define x87_refresh STEPNAME(x87_refresh) +#define x87_forget STEPNAME(x87_forget) +#define x87_reget_st STEPNAME(x87_reget_st) +#define x87_stackcount STEPNAME(x87_stackcount) +#define x87_unstackcount STEPNAME(x87_unstackcount) +#define x87_swapreg STEPNAME(x87_swapreg) +#define x87_setround STEPNAME(x87_setround) +#define x87_restoreround STEPNAME(x87_restoreround) +#define x87_reflectcount STEPNAME(x87_reflectcount) +#define x87_unreflectcount STEPNAME(x87_unreflectcount) +#define x87_purgecache STEPNAME(x87_purgecache) + #define sse_setround STEPNAME(sse_setround) #define mmx_get_reg STEPNAME(mmx_get_reg) #define mmx_get_reg_empty STEPNAME(mmx_get_reg_empty) -#define x87_forget STEPNAME(x87_forget) #define sse_purge07cache STEPNAME(sse_purge07cache) #define sse_get_reg STEPNAME(sse_get_reg) #define sse_get_reg_empty STEPNAME(sse_get_reg_empty) @@ -1259,7 +1343,6 @@ #define fpu_propagate_stack STEPNAME(fpu_propagate_stack) #define fpu_purgecache STEPNAME(fpu_purgecache) #define mmx_purgecache STEPNAME(mmx_purgecache) -#define x87_purgecache STEPNAME(x87_purgecache) #define fpu_reflectcache STEPNAME(fpu_reflectcache) #define fpu_unreflectcache STEPNAME(fpu_unreflectcache) @@ -1359,22 +1442,58 @@ void emit_rol32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, void emit_pf(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4); // common coproc helpers + +// x87 helpers +// cache of the local stack counter, to avoid an update at every call +int x87_stackcount(dynarec_la64_t* dyn, int ninst, int scratch); +// restore the local stack counter +void x87_unstackcount(dynarec_la64_t* dyn, int ninst, int scratch, int count); +// fpu push. Return the FPU register to be used +int x87_do_push(dynarec_la64_t* dyn, int ninst, int s1, int t); +// fpu push. Do not allocate a cache register. Needs a scratch register to do the x87stack synch (or 0 to not do it) +void x87_do_push_empty(dynarec_la64_t* dyn, int ninst, int s1); +// fpu pop. 
All previously returned FPU registers should be considered invalid +void x87_do_pop(dynarec_la64_t* dyn, int ninst, int s1); +// get cache index for a x87 reg, return -1 if cache doesn't exist +int x87_get_current_cache(dynarec_la64_t* dyn, int ninst, int st, int t); +// get cache index for a x87 reg, create the entry if needed +int x87_get_cache(dynarec_la64_t* dyn, int ninst, int populate, int s1, int s2, int a, int t); +// get lsxcache index for a x87 reg +int x87_get_lsxcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a); +// get the LSX register for a x87 reg, create the entry if needed +int x87_get_st(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a, int t); +// get the LSX register for a x87 reg, create the entry if needed. Do not fetch the STx if not already in cache +int x87_get_st_empty(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a, int t); +// Free st, using the FFREE opcode (so it's freed but stack is not moved) +void x87_free(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int st); +// refresh a value from the cache ->emu (nothing done if value is not cached) +void x87_refresh(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st); +// refresh a value from the cache ->emu and then forget the cache (nothing done if value is not cached) +void x87_forget(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st); +// refresh the cache value from emu +void x87_reget_st(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st); +// swap 2 x87 regs +void x87_swapreg(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a, int b); +// Set rounding according to cw flags, return reg to restore flags +int x87_setround(dynarec_la64_t* dyn, int ninst, int s1, int s2); +// Restore round flag +void x87_restoreround(dynarec_la64_t* dyn, int ninst, int s1); +// Reflect the pending x87 stack counter into emu (top/fpu_stack/tags), and undo that reflection +void x87_reflectcount(dynarec_la64_t* dyn, int ninst, int s1, int s2); +void x87_unreflectcount(dynarec_la64_t* dyn, int ninst, int s1, int s2); +void x87_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3); + // reset the cache with n void fpu_reset_cache(dynarec_la64_t* dyn, int ninst, int reset_n); void fpu_propagate_stack(dynarec_la64_t* dyn, int ninst); void fpu_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3); void mmx_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1); -void x87_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3); void fpu_reflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3); void fpu_unreflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3); void fpu_pushcache(dynarec_la64_t* dyn, int ninst, int s1, int not07); void fpu_popcache(dynarec_la64_t* dyn, int ninst, int s1, int not07); -// Restore round flag -void x87_restoreround(dynarec_la64_t* dyn, int ninst, int s1); // Set rounding according to mxcsr flags, return reg to restore flags int sse_setround(dynarec_la64_t* dyn, int ninst, int s1, int s2); -// refresh a value from the cache ->emu and then forget the cache (nothing done if value is not cached) -void x87_forget(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st); // SSE/SSE2 helpers // purge the XMM0..XMM7 cache (before function call) @@ -1417,6 +1536,34 @@ void la64_move32(dynarec_la64_t* dyn, int ninst, int reg, int32_t val, int zerou #define CHECK_CACHE() (cacheupd = CacheNeedsTransform(dyn, ninst)) #endif +#define lsxcache_st_coherency STEPNAME(lsxcache_st_coherency) +int lsxcache_st_coherency(dynarec_la64_t* 
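/* Everything STEPNAME-wrapped above is compiled once per dynarec pass: the same source is re-included with STEP = 0..3 and the macro suffixes each symbol (x87_do_push -> x87_do_push0 .. x87_do_push3), letting the earlier passes analyze and size the block while the last one emits code. The usual token-paste idiom (a sketch, assuming the common three-level concat):
       #define STEPNAME3(N, M) N##M
       #define STEPNAME2(N, M) STEPNAME3(N, M)
       #define STEPNAME(N) STEPNAME2(N, STEP)
*/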
dyn, int ninst, int a, int b); + +#if STEP == 0 +#define ST_IS_F(A) 0 +#define ST_IS_I64(A) 0 +#define X87_COMBINE(A, B) LSX_CACHE_ST_D +#define X87_ST0 LSX_CACHE_ST_D +#define X87_ST(A) LSX_CACHE_ST_D +#elif STEP == 1 +#define ST_IS_F(A) (lsxcache_get_current_st(dyn, ninst, A) == LSX_CACHE_ST_F) +#define ST_IS_I64(A) (lsxcache_get_current_st(dyn, ninst, A) == LSX_CACHE_ST_I64) +#define X87_COMBINE(A, B) lsxcache_combine_st(dyn, ninst, A, B) +#define X87_ST0 lsxcache_no_i64(dyn, ninst, 0, lsxcache_get_current_st(dyn, ninst, 0)) +#define X87_ST(A) lsxcache_no_i64(dyn, ninst, A, lsxcache_get_current_st(dyn, ninst, A)) +#else +#define ST_IS_F(A) (lsxcache_get_st(dyn, ninst, A) == LSX_CACHE_ST_F) +#define ST_IS_I64(A) (lsxcache_get_st(dyn, ninst, A) == LSX_CACHE_ST_I64) +#if STEP == 3 +#define X87_COMBINE(A, B) lsxcache_st_coherency(dyn, ninst, A, B) +#else +#define X87_COMBINE(A, B) lsxcache_get_st(dyn, ninst, A) +#endif +#define X87_ST0 lsxcache_get_st(dyn, ninst, 0) +#define X87_ST(A) lsxcache_get_st(dyn, ninst, A) +#endif + + uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); uintptr_t dynarec64_F30F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); @@ -1441,6 +1588,14 @@ uintptr_t dynarec64_AVX_F2_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i uintptr_t dynarec64_AVX_F2_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); uintptr_t dynarec64_AVX_F3_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); uintptr_t dynarec64_AVX_F3_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); +uintptr_t dynarec64_D8(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); +uintptr_t dynarec64_D9(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); +uintptr_t dynarec64_DA(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); +uintptr_t dynarec64_DB(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); +uintptr_t dynarec64_DC(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); +uintptr_t dynarec64_DD(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); +uintptr_t dynarec64_DE(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); +uintptr_t dynarec64_DF(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); #if STEP < 3 @@ -1570,6 +1725,61 @@ uintptr_t dynarec64_AVX_F3_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i opcode = F8; \ } + +#define FCOM(w, v1, v2, s1, s2, s3) \ + LD_HU(s3, xEmu, offsetof(x64emu_t, sw)); \ + MOV32w(s1, 0b1011100011111111); /* mask off c0,c1,c2,c3 */ \ + AND(s3, s3, s1); \ + FCMP_##w(fcc0, v1, v2, cOR); \ + BCEQZ(fcc0, 28); /* undefined/NaN */ \ + FCMP_##w(fcc1, v1, v2, cEQ); \ + BCNEZ(fcc1, 32); /* equal */ \ + FCMP_##w(fcc2, v1, v2, cLT); /* x2 = (v1<v2)?1:0 */ \ + MOVCF2GR(s2, fcc2); \ + SLLI_D(s1, s2, 8); \ + B(20); 
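/* The FCOM macro above reproduces the hardware encoding of compare results in C3/C2/C0 (status-word bits 14, 10, 8):
       ST0 >  src : C3=0 C2=0 C0=0   (s1 stays 0)
       ST0 <  src : C3=0 C2=0 C0=1   -> MOVCF2GR + SLLI_D(s1, s2, 8)
       ST0 == src : C3=1 C2=0 C0=0   -> LU12I_W(s1, 4) == 0x4000
       unordered  : C3=1 C2=1 C0=1   -> 0x4000 + 0b010100000000 == 0x4500
   and the 0b1011100011111111 mask first clears C0..C3 in the saved sw. The FCOMI variant below maps the same three outcomes onto EFLAGS instead: 1 = CF, 0b01000000 = ZF, 0b01000101 = ZF|PF|CF for unordered. */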
+    /* undefined/NaN */                                         \
+    LU12I_W(s1, 4);                                             \
+    ADDI_D(s1, s1, 0b010100000000);                             \
+    B(8); /* end */                                             \
+    /* equal */                                                 \
+    LU12I_W(s1, 4);                                             \
+    /* end */                                                   \
+    OR(s3, s3, s1);                                             \
+    ST_H(s3, xEmu, offsetof(x64emu_t, sw));
+
+#define FCOMS(v1, v2, s1, s2, s3) FCOM(S, v1, v2, s1, s2, s3)
+#define FCOMD(v1, v2, s1, s2, s3) FCOM(D, v1, v2, s1, s2, s3)
+
+#define FCOMI(w, v1, v2, s1, s2)                                \
+    IFX (X_OF | X_AF | X_SF | X_PEND) {                         \
+        MOV64x(s2, ((1 << F_OF) | (1 << F_AF) | (1 << F_SF)));  \
+        ANDN(xFlags, xFlags, s2);                               \
+    }                                                           \
+    IFX (X_CF | X_PF | X_ZF | X_PEND) {                         \
+        MOV32w(s2, 0b01000101);                                 \
+        ANDN(xFlags, xFlags, s2);                               \
+        FCMP_##w(fcc0, v1, v2, cOR);                            \
+        BCEQZ(fcc0, 24); /* undefined/NaN */                    \
+        FCMP_##w(fcc1, v1, v2, cEQ);                            \
+        BCNEZ(fcc1, 24); /* equal */                            \
+        FCMP_##w(fcc2, v1, v2, cLT); /* s1 = (v1<v2)?1:0 */     \
+        MOVCF2GR(s1, fcc2);                                     \
+        B(4 * 4); /* end */                                     \
+        /* undefined/NaN */                                     \
+        MV(s1, s2);                                             \
+        B(2 * 4); /* end */                                     \
+        /* equal */                                             \
+        ADDI_D(s1, xZR, 0b01000000);                            \
+        /* end */                                               \
+        OR(xFlags, xFlags, s1);                                 \
+    }                                                           \
+    SPILL_EFLAGS();                                             \
+    SET_DFNONE()
+
+#define FCOMIS(v1, v2, s1, s2) FCOMI(S, v1, v2, s1, s2)
+#define FCOMID(v1, v2, s1, s2) FCOMI(D, v1, v2, s1, s2)
+
 // Restore xFlags from LBT.eflags
 #define RESTORE_EFLAGS(s)                                       \
     do {                                                        \
diff --git a/src/dynarec/la64/dynarec_la64_pass0.h b/src/dynarec/la64/dynarec_la64_pass0.h
index c235bbcd..31cbadcf 100644
--- a/src/dynarec/la64/dynarec_la64_pass0.h
+++ b/src/dynarec/la64/dynarec_la64_pass0.h
@@ -68,3 +68,7 @@
         PrintFunctionAddr(ip, " => ");          \
         dynarec_log_prefix(0, LOG_NONE, "\n");  \
     }
+
+
+#define NATIVE_RESTORE_X87PC()
+#define X87_CHECK_PRECISION(A)
\ No newline at end of file
diff --git a/src/dynarec/la64/dynarec_la64_pass1.h b/src/dynarec/la64/dynarec_la64_pass1.h
index 20366bd0..b0dde230 100644
--- a/src/dynarec/la64/dynarec_la64_pass1.h
+++ b/src/dynarec/la64/dynarec_la64_pass1.h
@@ -13,3 +13,10 @@ dyn->insts[ninst].f_exit = dyn->f
 
 #define INST_NAME(name)
+
+#define NATIVE_RESTORE_X87PC()
+#define X87_CHECK_PRECISION(A)        \
+    do {                              \
+        if (dyn->need_x87check)       \
+            dyn->need_x87check = 2;   \
+    } while (0)
diff --git a/src/dynarec/la64/dynarec_la64_pass2.h b/src/dynarec/la64/dynarec_la64_pass2.h
index eb722e42..26ce8fab 100644
--- a/src/dynarec/la64/dynarec_la64_pass2.h
+++ b/src/dynarec/la64/dynarec_la64_pass2.h
@@ -33,6 +33,13 @@
         EMIT(0);                      \
         EMIT(0);                      \
     } while (0)
+#define FTABLE64(A, V)                \
+    do {                              \
+        mmx87_regs_t v = { .d = V };  \
+        Table64(dyn, v.q, 2);         \
+        EMIT(0);                      \
+        EMIT(0);                      \
+    } while (0)
 #define TABLE64C(A, V)                \
     do {                              \
         if (dyn->need_reloc && !isTable64(dyn, getConst(V))) \
diff --git a/src/dynarec/la64/dynarec_la64_pass3.h b/src/dynarec/la64/dynarec_la64_pass3.h
index 8188e761..f0eb1419 100644
--- a/src/dynarec/la64/dynarec_la64_pass3.h
+++ b/src/dynarec/la64/dynarec_la64_pass3.h
@@ -40,6 +40,14 @@
         PCADDU12I(A, SPLIT20(val64offset));           \
         LD_D(A, A, SPLIT12(val64offset));             \
     } while (0)
+#define FTABLE64(A, V)                                \
+    do {                                              \
+        mmx87_regs_t v = { .d = V };                  \
+        int val64offset = Table64(dyn, v.q, 3);       \
+        MESSAGE(LOG_DUMP, "  FTable64: %g\n", v.d);   \
+        PCADDU12I(x1, SPLIT20(val64offset));          \
+        FLD_D(A, x1, SPLIT12(val64offset));           \
+    } while (0)
 #define TABLE64C(A, V)                                \
     do {                                              \
         if (dyn->need_reloc && !isTable64(dyn, getConst(V))) \
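A cross-check of what the FCOM macro above encodes: the x87 compare result lives in status-word bits C0 (bit 8), C2 (bit 10) and C3 (bit 14), which is where the 0b1011100011111111 clearing mask, the LU12I_W(s1, 4) value (0x4000, i.e. C3 for "equal") and the 0x4000+0x500 "unordered" pattern come from. The plain-C sketch below mirrors that mapping; fcom_sw is an illustrative name, not a box64 function.

    #include <stdint.h>
    #include <math.h>

    /* Illustrative reference for FCOM: C0=bit 8, C2=bit 10, C3=bit 14. */
    static uint16_t fcom_sw(uint16_t sw, double st0, double src)
    {
        sw &= 0xB8FF;                  /* clear C0,C1,C2,C3 (0b1011100011111111) */
        if (isnan(st0) || isnan(src))
            sw |= 0x4500;              /* unordered: C3|C2|C0 */
        else if (st0 == src)
            sw |= 0x4000;              /* equal: C3 */
        else if (st0 < src)
            sw |= 0x0100;              /* below: C0 */
        return sw;
    }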
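FCOMI maps the same compare onto EFLAGS instead: ZF is bit 6, PF bit 2 and CF bit 0, hence the 0b01000101 mask the macro clears (and reuses as the unordered result) and the 0b01000000 "equal" pattern. Again as an illustrative plain-C sketch; fcomi_eflags is a hypothetical name.

    #include <stdint.h>
    #include <math.h>

    /* Illustrative reference for FCOMI: CF=bit 0, PF=bit 2, ZF=bit 6. */
    static uint32_t fcomi_eflags(uint32_t eflags, double a, double b)
    {
        eflags &= ~0x45u;       /* clear ZF|PF|CF (0b01000101) */
        if (isnan(a) || isnan(b))
            eflags |= 0x45;     /* unordered: ZF|PF|CF */
        else if (a == b)
            eflags |= 0x40;     /* equal: ZF (0b01000000) */
        else if (a < b)
            eflags |= 0x01;     /* below: CF */
        return eflags;
    }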
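The two FTABLE64 definitions work as a pair: the pass-2 version emits two EMIT(0) placeholders so the sized block matches the PCADDU12I/FLD_D pair that pass 3 emits, and both push the double literal through mmx87_regs_t so its raw bits can be stored in the 64-bit constant table. A standalone sketch of that reinterpretation, with f64bits_t as a made-up stand-in for the union:

    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    typedef union {
        double   d;
        uint64_t q;
    } f64bits_t;   /* stand-in for the mmx87_regs_t usage in FTABLE64 */

    int main(void)
    {
        f64bits_t v = { .d = 1.0 };
        /* prints 3ff0000000000000: the bit pattern stored in the table */
        printf("%.17g -> %016" PRIx64 "\n", v.d, v.q);
        return 0;
    }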
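More generally, the pass0 to pass3 headers above exist because the same opcode-emitting source is compiled once per pass: early passes only analyze and size the block (X87_CHECK_PRECISION merely raises dyn->need_x87check in pass 1 and is a no-op in pass 0), and only the final pass writes real instructions. A minimal sketch of that multi-pass pattern, with pass_ctx_t and emit as simplified stand-ins for the real dynarec state:

    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
        uint32_t* block;  /* NULL during the sizing passes */
        size_t    size;   /* instruction count, recomputed each pass */
    } pass_ctx_t;

    static void emit(pass_ctx_t* ctx, uint32_t ins)
    {
        if (ctx->block)                 /* final pass: actually store the opcode */
            ctx->block[ctx->size] = ins;
        ctx->size++;                    /* every pass: advance the cursor */
    }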
diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h
index 120fc14e..65beba34 100644
--- a/src/dynarec/la64/dynarec_la64_private.h
+++ b/src/dynarec/la64/dynarec_la64_private.h
@@ -114,6 +114,7 @@ typedef struct instruction_la64_s {
     uint8_t nat_flags_needsign:1;
     uint8_t nat_flags_op1;
     uint8_t nat_flags_op2;
+    uint8_t x87precision:1; // this opcode can handle x87pc
     flagcache_t f_exit;     // flags status at end of instruction
     lsxcache_t lsx;         // lsxcache at end of instruction (but before popping)
     flagcache_t f_entry;    // flags status before the instruction begins
diff --git a/src/dynarec/la64/la64_mapping.h b/src/dynarec/la64/la64_mapping.h
index 3bb6c1d4..6446f55f 100644
--- a/src/dynarec/la64/la64_mapping.h
+++ b/src/dynarec/la64/la64_mapping.h
@@ -26,7 +26,7 @@ r17 t5 x4 Temporary Scratch
 r18  t6  x5       Temporary Scratch               Caller
 r19  t7  x6       Temporary Scratch               Caller
 r20  t8  x7       Temporary Scratch               Caller
-r21  rx  -        Reserved                        N/A    -
+r21  rx  -        Reserved X87 Precision Control         -
 r22  fp  SavedSP  Saved register/frame pointer           Callee
 r23  s0  R10      Saved register                  Callee
 r24  s1  R11      Saved register                  Callee
@@ -74,6 +74,8 @@ r31 s8 xEmu Saved register The Emu struct
 #define x6 19
 #define x7 20
 
+#define x87pc 21
+
 // emu is $r31
 #define xEmu 31
 // LA64 RA
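The mapping change reserves r21 (the LoongArch platform register $rx) to hold the x87 precision control, and the new x87precision bit marks opcodes that can honor it. Precision control is the PC field in bits 8 and 9 of the x87 control word (00 = single, 10 = double, 11 = extended); a hedged sketch of the rounding it implies, with x87_apply_pc as an illustrative helper rather than a box64 function:

    #include <stdint.h>

    /* Illustrative only: round per FCW.PC; extended precision (PC=11)
       is approximated by double, as a double-based x87 emulation would. */
    static double x87_apply_pc(double result, uint16_t fcw)
    {
        switch ((fcw >> 8) & 3) {
            case 0:  return (float)result;  /* PC=00: single precision */
            case 2:  return result;         /* PC=10: double precision */
            default: return result;         /* PC=11 (and reserved 01) */
        }
    }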