From 5d5c3aec3cb11b2017c108337443e4f825e3b35e Mon Sep 17 00:00:00 2001 From: Yang Liu Date: Tue, 5 Mar 2024 16:49:19 +0800 Subject: [LA64_DYNAREC] Made eflags synchronization lazy (#1329) * [LA64_DYNAREC] Made eflags synchronization lazy * A smol optim * Fixed CLEAR_FLAGS --- src/dynarec/la64/dynarec_la64_00.c | 11 ++++--- src/dynarec/la64/dynarec_la64_0f.c | 19 +++++------- src/dynarec/la64/dynarec_la64_emit_logic.c | 40 +++++++++++++------------ src/dynarec/la64/dynarec_la64_emit_math.c | 28 +++++------------ src/dynarec/la64/dynarec_la64_emit_shift.c | 29 +++++++++--------- src/dynarec/la64/dynarec_la64_emit_tests.c | 48 ++++++++++++++---------------- src/dynarec/la64/dynarec_la64_helper.c | 2 ++ src/dynarec/la64/dynarec_la64_helper.h | 23 +++++++++++++- src/dynarec/la64/la64_epilog.S | 9 ++++++ src/dynarec/la64/la64_prolog.S | 14 +++++++-- 10 files changed, 122 insertions(+), 101 deletions(-) (limited to 'src') diff --git a/src/dynarec/la64/dynarec_la64_00.c b/src/dynarec/la64/dynarec_la64_00.c index 0b807e01..986e4a82 100644 --- a/src/dynarec/la64/dynarec_la64_00.c +++ b/src/dynarec/la64/dynarec_la64_00.c @@ -295,8 +295,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni i8 = F8S; \ BARRIER(BARRIER_MAYBE); \ JUMP(addr + i8, 1); \ - if (la64_lbt && (opcode - 0x70) >= 0xC) { \ - X64_SET_EFLAGS(xFlags, F); \ + if (la64_lbt) { \ X64_SETJ(x1, I); \ } else { \ GETFLAGS; \ @@ -304,7 +303,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni if (dyn->insts[ninst].x64.jmp_insts == -1 || CHECK_CACHE()) { \ /* out of block */ \ i32 = dyn->insts[ninst].epilog - (dyn->native_size); \ - if (la64_lbt && (opcode - 0x70) >= 0xC) \ + if (la64_lbt) \ BEQZ_safe(x1, i32); \ else \ B##NO##_safe(x1, i32); \ @@ -320,7 +319,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni } else { \ /* inside the block */ \ i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address - 
(dyn->native_size); \ - if (la64_lbt && (opcode - 0x70) >= 0xC) \ + if (la64_lbt) \ BNEZ_safe(x1, i32); \ else \ B##YES##_safe(x1, i32); \ @@ -727,7 +726,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni READFLAGS(X_ZF); i8 = F8S; ADDI_D(xRCX, xRCX, -1); - ANDI(x1, xFlags, 1 << F_ZF); + if (la64_lbt) X64_GET_EFLAGS(x1, X_ZF); else ANDI(x1, xFlags, 1 << F_ZF); CBNZ_NEXT(x1); GO(0); break; @@ -736,7 +735,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni READFLAGS(X_ZF); i8 = F8S; ADDI_D(xRCX, xRCX, -1); - ANDI(x1, xFlags, 1 << F_ZF); + if (la64_lbt) X64_GET_EFLAGS(x1, X_ZF); else ANDI(x1, xFlags, 1 << F_ZF); CBZ_NEXT(x1); GO(0); break; diff --git a/src/dynarec/la64/dynarec_la64_0f.c b/src/dynarec/la64/dynarec_la64_0f.c index 1560f72a..d3e7821e 100644 --- a/src/dynarec/la64/dynarec_la64_0f.c +++ b/src/dynarec/la64/dynarec_la64_0f.c @@ -109,13 +109,12 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni FAKEED; break; - #define GO(GETFLAGS, NO, YES, F, I) \ + #define GO(GETFLAGS, NO, YES, F, I) \ READFLAGS(F); \ i32_ = F32S; \ BARRIER(BARRIER_MAYBE); \ JUMP(addr + i32_, 1); \ - if (la64_lbt && (opcode - 0x80) >= 0xC) { \ - X64_SET_EFLAGS(xFlags, F); \ + if (la64_lbt) { \ X64_SETJ(x1, I); \ } else { \ GETFLAGS; \ @@ -123,7 +122,7 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni if (dyn->insts[ninst].x64.jmp_insts == -1 || CHECK_CACHE()) { \ /* out of the block */ \ i32 = dyn->insts[ninst].epilog - (dyn->native_size); \ - if (la64_lbt && (opcode - 0x80) >= 0xC) \ + if (la64_lbt) \ BEQZ_safe(x1, i32); \ else \ B##NO##_safe(x1, i32); \ @@ -139,7 +138,7 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni } else { \ /* inside the block */ \ i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address - (dyn->native_size); \ - if (la64_lbt && (opcode - 0x80) >= 0xC) \ + if (la64_lbt) \ BNEZ_safe(x1, i32); \ else 
\ B##YES##_safe(x1, i32); \ @@ -152,17 +151,13 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni #define GO(GETFLAGS, NO, YES, F, I) \ READFLAGS(F); \ - if (la64_lbt && (opcode - 0x90) >= 0xC) { \ - X64_SET_EFLAGS(xFlags, F); \ - X64_SETJ(x1, I); \ + if (la64_lbt) { \ + X64_SETJ(x3, I); \ } else { \ GETFLAGS; \ + S##YES(x3, x1); \ } \ nextop = F8; \ - if (la64_lbt && (opcode - 0x90) >= 0xC) \ - SNEZ(x3, x1); \ - else \ - S##YES(x3, x1); \ if (MODREG) { \ if (rex.rex) { \ eb1 = TO_LA64((nextop & 7) + (rex.b << 3)); \ diff --git a/src/dynarec/la64/dynarec_la64_emit_logic.c b/src/dynarec/la64/dynarec_la64_emit_logic.c index ac1e1020..423b885e 100644 --- a/src/dynarec/la64/dynarec_la64_emit_logic.c +++ b/src/dynarec/la64/dynarec_la64_emit_logic.c @@ -25,7 +25,6 @@ // emit XOR32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch void emit_xor32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4) { - CLEAR_FLAGS(s3); IFX(X_PEND) { SET_DF(s4, rex.w ? d_xor64 : d_xor32); } else IFX(X_ALL) { @@ -34,9 +33,10 @@ void emit_xor32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s if (la64_lbt) { IFX(X_ALL) { - if (rex.w) X64_XOR_D(s1, s2); else X64_XOR_W(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); + if (rex.w) + X64_XOR_D(s1, s2); + else + X64_XOR_W(s1, s2); } XOR(s1, s1, s2); if (!rex.w && s1 != s2) ZEROUP(s1); @@ -46,6 +46,7 @@ void emit_xor32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s return; } + CLEAR_FLAGS(s3); XOR(s1, s1, s2); // test sign bit before zeroup. 
@@ -74,7 +75,6 @@ void emit_xor32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s // emit AND8 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch void emit_and8c(dynarec_la64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4) { - CLEAR_FLAGS(s3); IFX(X_PEND) { SET_DF(s3, d_and8); } else IFX(X_ALL) { @@ -85,8 +85,6 @@ void emit_and8c(dynarec_la64_t* dyn, int ninst, int s1, int32_t c, int s3, int s IFXA(X_ALL, la64_lbt) { MOV32w(s3, c); X64_AND_B(s1, s3); - X64_GET_EFLAGS(s4, X_ALL); - OR(xFlags, xFlags, s4); } ANDI(s1, s1, c&0xff); @@ -97,6 +95,7 @@ void emit_and8c(dynarec_la64_t* dyn, int ninst, int s1, int32_t c, int s3, int s if (la64_lbt) return; + CLEAR_FLAGS(s3); IFX(X_SF) { SRLI_D(s3, s1, 7); BEQZ(s3, 8); @@ -114,7 +113,6 @@ void emit_and8c(dynarec_la64_t* dyn, int ninst, int s1, int32_t c, int s3, int s // emit AND32 instruction, from s1, c, store result in s1 using s3 and s4 as scratch void emit_and32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4) { - CLEAR_FLAGS(s3); IFX(X_PEND) { SET_DF(s3, rex.w ? d_tst64 : d_tst32); } else IFX(X_ALL) { @@ -123,9 +121,10 @@ void emit_and32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i IFXA(X_ALL, la64_lbt) { MOV64xw(s3, c); - if (rex.w) X64_AND_D(s1, s3); else X64_AND_W(s1, s3); - X64_GET_EFLAGS(s4, X_ALL); - OR(xFlags, xFlags, s4); + if (rex.w) + X64_AND_D(s1, s3); + else + X64_AND_W(s1, s3); } if (c >= 0 && c <= 4095) { @@ -141,6 +140,7 @@ void emit_and32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i if (la64_lbt) return; + CLEAR_FLAGS(s3); IFX(X_SF) { SRLI_D(s3, s1, rex.w ? 
63 : 31); BEQZ(s3, 8); @@ -159,7 +159,6 @@ void emit_and32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i // emit OR32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch void emit_or32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4) { - CLEAR_FLAGS(s3); IFX(X_PEND) { SET_DF(s4, rex.w?d_or64:d_or32); } else IFX(X_ALL) { @@ -167,9 +166,10 @@ void emit_or32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3 } IFXA(X_ALL, la64_lbt) { - if (rex.w) X64_OR_D(s1, s2); else X64_OR_W(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); + if (rex.w) + X64_OR_D(s1, s2); + else + X64_OR_W(s1, s2); } OR(s1, s1, s2); @@ -181,6 +181,7 @@ void emit_or32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3 if(la64_lbt) return; + CLEAR_FLAGS(s3); // test sign bit before zeroup. IFX(X_SF) { if (!rex.w) SEXT_W(s1, s1); @@ -199,7 +200,6 @@ void emit_or32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3 // emit OR32 instruction, from s1, c, store result in s1 using s3 and s4 as scratch void emit_or32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4) { - CLEAR_FLAGS(s3); IFX(X_PEND) { SET_DF(s4, rex.w ? d_or64 : d_or32); } else IFX(X_ALL) { @@ -208,9 +208,10 @@ void emit_or32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, in IFXA(X_ALL, la64_lbt) { MOV64xw(s3, c); - if (rex.w) X64_OR_D(s1, s3); else X64_OR_W(s1, s3); - X64_GET_EFLAGS(s4, X_ALL); - OR(xFlags, xFlags, s4); + if (rex.w) + X64_OR_D(s1, s3); + else + X64_OR_W(s1, s3); } if (c >= 0 && c <= 4095) { @@ -226,6 +227,7 @@ void emit_or32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, in if (la64_lbt) return; + CLEAR_FLAGS(s3); // test sign bit before zeroup. 
IFX(X_SF) { if (!rex.w) SEXT_W(s1, s1); diff --git a/src/dynarec/la64/dynarec_la64_emit_math.c b/src/dynarec/la64/dynarec_la64_emit_math.c index 30dc3313..1cd1fc16 100644 --- a/src/dynarec/la64/dynarec_la64_emit_math.c +++ b/src/dynarec/la64/dynarec_la64_emit_math.c @@ -24,7 +24,6 @@ // emit ADD32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch void emit_add32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5) { - CLEAR_FLAGS(s3); IFX(X_PEND) { if (rex.w) { ST_D(s1, xEmu, offsetof(x64emu_t, op1)); @@ -41,8 +40,6 @@ void emit_add32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s if (la64_lbt) { IFX(X_ALL) { X64_ADD_WU(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); } ADDxw(s1, s1, s2); if (!rex.w) ZEROUP(s1); @@ -52,6 +49,7 @@ void emit_add32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s return; } + CLEAR_FLAGS(s3); IFX(X_CF) { if (rex.w) { @@ -127,7 +125,6 @@ void emit_add32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s // emit ADD32 instruction, from s1, constant c, store result in s1 using s3 and s4 as scratch void emit_add32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s2, int s3, int s4, int s5) { - CLEAR_FLAGS(s3); if (s1 == xRSP && (!dyn->insts || dyn->insts[ninst].x64.gen_flags == X_PEND)) { // special case when doing math on ESP and only PEND is needed: ignoring it! 
if (c >= -2048 && c < 2048) { @@ -156,8 +153,6 @@ void emit_add32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i if (la64_lbt) { IFX(X_ALL) { X64_ADD_WU(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); } ADDxw(s1, s1, s2); if (!rex.w) ZEROUP(s1); @@ -167,6 +162,7 @@ void emit_add32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i return; } + CLEAR_FLAGS(s3); IFX(X_CF) { if (rex.w) { @@ -251,7 +247,6 @@ void emit_add32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i // emit ADD8 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch void emit_add8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4) { - CLEAR_FLAGS(s3); IFX(X_PEND) { ST_B(s1, xEmu, offsetof(x64emu_t, op1)); ST_B(s2, xEmu, offsetof(x64emu_t, op2)); @@ -263,8 +258,6 @@ void emit_add8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4) if (la64_lbt) { IFX(X_ALL) { X64_ADD_B(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); } ADD_D(s1, s1, s2); ANDI(s1, s1, 0xff); @@ -274,6 +267,7 @@ void emit_add8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4) return; } + CLEAR_FLAGS(s3); IFX(X_AF | X_OF) { OR(s3, s1, s2); // s3 = op1 | op2 @@ -332,7 +326,6 @@ void emit_add8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4) // emit ADD8 instruction, from s1, const c, store result in s1 using s3 and s4 as scratch void emit_add8c(dynarec_la64_t* dyn, int ninst, int s1, int c, int s2, int s3, int s4) { - CLEAR_FLAGS(s3); IFX(X_PEND) { MOV32w(s4, c & 0xff); ST_B(s1, xEmu, offsetof(x64emu_t, op1)); @@ -346,8 +339,6 @@ void emit_add8c(dynarec_la64_t* dyn, int ninst, int s1, int c, int s2, int s3, i IFX(X_ALL) { IFX(X_PEND) {} else { MOV32w(s4, c & 0xff); } X64_ADD_B(s1, s4); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); } ADDI_D(s1, s1, c & 0xff); ANDI(s1, s1, 0xff); @@ -357,6 +348,7 @@ void emit_add8c(dynarec_la64_t* dyn, int ninst, int s1, int c, int s2, int s3, i 
return; } + CLEAR_FLAGS(s3); IFX(X_AF | X_OF) { IFX(X_PEND) {} else { MOV32w(s4, c & 0xff); } @@ -416,7 +408,6 @@ void emit_add8c(dynarec_la64_t* dyn, int ninst, int s1, int c, int s2, int s3, i // emit SUB8 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch void emit_sub8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5) { - CLEAR_FLAGS(s3); IFX(X_PEND) { ST_B(s1, xEmu, offsetof(x64emu_t, op1)); ST_B(s2, xEmu, offsetof(x64emu_t, op2)); @@ -428,8 +419,6 @@ void emit_sub8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i if (la64_lbt) { IFX(X_ALL) { X64_SUB_B(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); } SUB_D(s1, s1, s2); ANDI(s1, s1, 0xff); @@ -439,6 +428,7 @@ void emit_sub8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i return; } + CLEAR_FLAGS(s3); IFX(X_AF | X_CF | X_OF) { // for later flag calculation NOR(s5, xZR, s1); @@ -474,7 +464,6 @@ void emit_sub8c(dynarec_la64_t* dyn, int ninst, int s1, int c, int s2, int s3, i // emit SUB32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch void emit_sub32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5) { - CLEAR_FLAGS(s3); IFX(X_PEND) { SDxw(s1, xEmu, offsetof(x64emu_t, op1)); SDxw(s2, xEmu, offsetof(x64emu_t, op2)); @@ -486,8 +475,6 @@ void emit_sub32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s if (la64_lbt) { IFX(X_ALL) { X64_SUB_WU(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); } SUBxw(s1, s1, s2); if (!rex.w) ZEROUP(s1); @@ -497,6 +484,7 @@ void emit_sub32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s return; } + CLEAR_FLAGS(s3); IFX(X_AF | X_CF | X_OF) { // for later flag calculation NOR(s5, xZR, s1); @@ -526,7 +514,6 @@ void emit_sub32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s // emit SUB32 instruction, from s1, constant c, store result in s1 using s2, s3, s4 and s5 as scratch void 
emit_sub32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s2, int s3, int s4, int s5) { - CLEAR_FLAGS(s3); if(s1==xRSP && (!dyn->insts || dyn->insts[ninst].x64.gen_flags==X_PEND)) { // special case when doing math on RSP and only PEND is needed: ignoring it! @@ -548,12 +535,11 @@ void emit_sub32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i SET_DFNONE(); } + CLEAR_FLAGS(s3); if (la64_lbt) { IFX(X_PEND) {} else {MOV64xw(s2, c);} IFX(X_ALL) { X64_SUB_WU(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); } SUBxw(s1, s1, s2); if (!rex.w) ZEROUP(s1); diff --git a/src/dynarec/la64/dynarec_la64_emit_shift.c b/src/dynarec/la64/dynarec_la64_emit_shift.c index ae1712d9..fe461cdb 100644 --- a/src/dynarec/la64/dynarec_la64_emit_shift.c +++ b/src/dynarec/la64/dynarec_la64_emit_shift.c @@ -25,7 +25,6 @@ void emit_shl32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5) { // s2 is not 0 here and is 1..1f/3f - CLEAR_FLAGS(s3); IFX(X_PEND) { SDxw(s1, xEmu, offsetof(x64emu_t, op1)); SDxw(s2, xEmu, offsetof(x64emu_t, op2)); @@ -36,9 +35,10 @@ void emit_shl32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s if (la64_lbt) { IFX(X_ALL) { - if (rex.w) X64_SLL_D(s1, s2); else X64_SLL_W(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); + if (rex.w) + X64_SLL_D(s1, s2); + else + X64_SLL_W(s1, s2); } SLL_D(s1, s1, s2); IFX(X_PEND) { @@ -47,6 +47,7 @@ void emit_shl32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s return; } + CLEAR_FLAGS(s3); IFX(X_CF | X_OF) { ADDI_D(s5, s2, rex.w?-64:-32); SUB_D(s5, xZR, s5); @@ -90,8 +91,6 @@ void emit_shl32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s // emit SHR32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch void emit_shr32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4) { - CLEAR_FLAGS(s3); - IFX(X_PEND) { if (c) { MOV64x(s3, c); @@ 
-113,9 +112,10 @@ void emit_shr32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, if (la64_lbt) { IFX(X_PEND) {} else { MOV64x(s3, c); } IFX(X_ALL) { - if (rex.w) X64_SRL_D(s1, s3); else X64_SRL_W(s1, s3); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); + if (rex.w) + X64_SRL_D(s1, s3); + else + X64_SRL_W(s1, s3); } SRLIxw(s1, s1, c); @@ -126,6 +126,7 @@ void emit_shr32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, return; } + CLEAR_FLAGS(s3); IFX(X_CF) { if (c > 1) { SRAI_D(s3, s1, c - 1); @@ -171,8 +172,6 @@ void emit_shr32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, // emit SAR32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch void emit_sar32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4) { - CLEAR_FLAGS(s3); - IFX(X_PEND) { if (c) { MOV64x(s3, c); @@ -194,9 +193,10 @@ void emit_sar32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, if (la64_lbt) { IFX(X_PEND) {} else { MOV64x(s3, c); } IFX(X_ALL) { - if (rex.w) X64_SRA_D(s1, s3); else X64_SRA_W(s1, s3); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); + if (rex.w) + X64_SRA_D(s1, s3); + else + X64_SRA_W(s1, s3); } SRAIxw(s1, s1, c); @@ -207,6 +207,7 @@ void emit_sar32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, return; } + CLEAR_FLAGS(s3); IFX(X_CF) { if (c > 1) { SRAI_D(s3, s1, c - 1); diff --git a/src/dynarec/la64/dynarec_la64_emit_tests.c b/src/dynarec/la64/dynarec_la64_emit_tests.c index 20387ed6..916fdd38 100644 --- a/src/dynarec/la64/dynarec_la64_emit_tests.c +++ b/src/dynarec/la64/dynarec_la64_emit_tests.c @@ -25,7 +25,6 @@ // emit CMP8 instruction, from cmp s1, s2, using s3, s4, s5 and s6 as scratch void emit_cmp8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5, int s6) { - CLEAR_FLAGS(s3); IFX_PENDOR0 { ST_B(s1, xEmu, offsetof(x64emu_t, op1)); ST_B(s2, xEmu, offsetof(x64emu_t, op2)); @@ -37,8 +36,6 @@ void 
emit_cmp8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i if (la64_lbt) { IFX(X_ALL) { X64_SUB_B(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); } IFX_PENDOR0 { @@ -48,6 +45,7 @@ void emit_cmp8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i return; } + CLEAR_FLAGS(s3); IFX(X_AF | X_CF | X_OF) { // for later flag calculation NOR(s5, xZR, s1); @@ -77,7 +75,6 @@ void emit_cmp8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i // emit CMP8 instruction, from cmp s1 , 0, using s3 and s4 as scratch void emit_cmp8_0(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4) { - CLEAR_FLAGS(s3); IFX_PENDOR0 { ST_B(s1, xEmu, offsetof(x64emu_t, op1)); ST_B(xZR, xEmu, offsetof(x64emu_t, op2)); @@ -90,12 +87,11 @@ void emit_cmp8_0(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4) if (la64_lbt) { IFX(X_ALL) { X64_SUB_B(s1, xZR); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); } return; } + CLEAR_FLAGS(s3); IFX(X_SF) { SRLI_D(s3, s1, 7); BEQZ(s3, 8); @@ -113,7 +109,6 @@ void emit_cmp8_0(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4) // emit CMP32 instruction, from cmp s1, s2, using s3 and s4 as scratch void emit_cmp32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5, int s6) { - CLEAR_FLAGS(s3); IFX_PENDOR0 { SDxw(s1, xEmu, offsetof(x64emu_t, op1)); SDxw(s2, xEmu, offsetof(x64emu_t, op2)); @@ -124,9 +119,10 @@ void emit_cmp32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s if (la64_lbt) { IFX(X_ALL) { - if (rex.w) X64_SUB_D(s1, s2); else X64_SUB_W(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); + if (rex.w) + X64_SUB_D(s1, s2); + else + X64_SUB_W(s1, s2); } IFX_PENDOR0 { @@ -136,6 +132,7 @@ void emit_cmp32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s return; } + CLEAR_FLAGS(s3); IFX(X_AF | X_CF | X_OF) { // for later flag calculation NOR(s5, xZR, s1); @@ -166,7 +163,6 @@ void emit_cmp32(dynarec_la64_t* dyn, 
int ninst, rex_t rex, int s1, int s2, int s // emit CMP32 instruction, from cmp s1, 0, using s3 and s4 as scratch void emit_cmp32_0(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s3, int s4) { - CLEAR_FLAGS(s3); IFX_PENDOR0 { ST_D(s1, xEmu, offsetof(x64emu_t, op1)); ST_D(xZR, xEmu, offsetof(x64emu_t, op2)); @@ -178,13 +174,15 @@ void emit_cmp32_0(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s3, int if (la64_lbt) { IFX(X_ALL) { - if (rex.w) X64_SUB_D(s1, xZR); else X64_SUB_W(s1, xZR); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); + if (rex.w) + X64_SUB_D(s1, xZR); + else + X64_SUB_W(s1, xZR); } return; } + CLEAR_FLAGS(s3); IFX(X_SF) { if (rex.w) { BGE(s1, xZR, 8); @@ -207,7 +205,6 @@ void emit_cmp32_0(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s3, int // emit TEST8 instruction, from test s1, s2, using s3, s4 and s5 as scratch void emit_test8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5) { - CLEAR_FLAGS(s3); IFX_PENDOR0 { SET_DF(s3, d_tst8); } else { @@ -217,8 +214,6 @@ void emit_test8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, if (la64_lbt) { IFX(X_ALL) { X64_AND_B(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); } IFX_PENDOR0 { @@ -228,6 +223,7 @@ void emit_test8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, return; } + CLEAR_FLAGS(s3); AND(s3, s1, s2); // res = s1 & s2 IFX_PENDOR0 { @@ -250,7 +246,6 @@ void emit_test8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, // emit TEST32 instruction, from test s1, s2, using s3 and s4 as scratch void emit_test32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5) { - CLEAR_FLAGS(s3); IFX_PENDOR0 { SET_DF(s3, rex.w?d_tst64:d_tst32); } else { @@ -259,9 +254,10 @@ void emit_test32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int if (la64_lbt) { IFX(X_ALL) { - if (rex.w) X64_AND_D(s1, s2); else X64_AND_W(s1, s2); - X64_GET_EFLAGS(s3, X_ALL); - 
OR(xFlags, xFlags, s3); + if (rex.w) + X64_AND_D(s1, s2); + else + X64_AND_W(s1, s2); } IFX_PENDOR0 { @@ -271,6 +267,7 @@ void emit_test32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int return; } + CLEAR_FLAGS(s3); AND(s3, s1, s2); // res = s1 & s2 IFX_PENDOR0 { @@ -296,7 +293,6 @@ void emit_test32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int // emit TEST32 instruction, from test s1, s2, using s3 and s4 as scratch void emit_test32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4, int s5) { - CLEAR_FLAGS(s3); IFX_PENDOR0 { SET_DF(s3, rex.w ? d_tst64 : d_tst32); } else { @@ -307,9 +303,10 @@ void emit_test32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, if (la64_lbt) { IFX(X_ALL) { MOV64xw(s3, c); - if (rex.w) X64_AND_D(s1, s3); else X64_AND_W(s1, s3); - X64_GET_EFLAGS(s3, X_ALL); - OR(xFlags, xFlags, s3); + if (rex.w) + X64_AND_D(s1, s3); + else + X64_AND_W(s1, s3); } IFX_PENDOR0 { @@ -324,6 +321,7 @@ void emit_test32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, return; } + CLEAR_FLAGS(s3); if (c >= 0 && c <= 4095) { ANDI(s3, s1, c); } else { diff --git a/src/dynarec/la64/dynarec_la64_helper.c b/src/dynarec/la64/dynarec_la64_helper.c index 03934883..cbd97fa8 100644 --- a/src/dynarec/la64/dynarec_la64_helper.c +++ b/src/dynarec/la64/dynarec_la64_helper.c @@ -451,6 +451,7 @@ void call_c(dynarec_la64_t* dyn, int ninst, void* fnc, int reg, int ret, int sav if (savereg == 0) savereg = x6; if (saveflags) { + RESTORE_EFLAGS(reg); ST_D(xFlags, xEmu, offsetof(x64emu_t, eflags)); } fpu_pushcache(dyn, ninst, reg, 0); @@ -499,6 +500,7 @@ void call_c(dynarec_la64_t* dyn, int ninst, void* fnc, int reg, int ret, int sav fpu_popcache(dyn, ninst, reg, 0); if (saveflags) { LD_D(xFlags, xEmu, offsetof(x64emu_t, eflags)); + SPILL_EFLAGS(); } SET_NODF(); dyn->last_ip = 0; diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h index 7a174edd..1b9490a3 
100644 --- a/src/dynarec/la64/dynarec_la64_helper.h +++ b/src/dynarec/la64/dynarec_la64_helper.h @@ -269,8 +269,11 @@ #define SET_NODF() dyn->f.dfnone = 0 #define SET_DFOK() dyn->f.dfnone = 1 +#define CLEAR_FLAGS_(s) \ + MOV64x(s, (1UL << F_AF) | (1UL << F_CF) | (1UL << F_OF) | (1UL << F_ZF) | (1UL << F_SF) | (1UL << F_PF)); ANDN(xFlags, xFlags, s); + #define CLEAR_FLAGS(s) \ - IFX(X_ALL) { MOV64x(s, (1UL << F_AF) | (1UL << F_CF) | (1UL << F_OF) | (1UL << F_ZF) | (1UL << F_SF) | (1UL << F_PF)); ANDN(xFlags, xFlags, s); } + IFX(X_ALL) { CLEAR_FLAGS_(s) } #define CALC_SUB_FLAGS(op1_, op2, res, scratch1, scratch2, width) \ IFX(X_AF | X_CF | X_OF) \ @@ -664,4 +667,22 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int opcode = F8; \ } +// Restore xFlags from LBT.eflags; s is a scratch register and is overwritten +#define RESTORE_EFLAGS(s) \ + do { \ + if (la64_lbt) { \ + CLEAR_FLAGS_(s); \ + X64_GET_EFLAGS(s, X_ALL); \ + OR(xFlags, xFlags, s); \ + } \ + } while (0) + +// Spill xFlags to LBT.eflags +#define SPILL_EFLAGS() \ + do { \ + if (la64_lbt) { \ + X64_SET_EFLAGS(xFlags, X_ALL); \ + } \ + } while (0) + #endif //__DYNAREC_LA64_HELPER_H__ \ No newline at end of file diff --git a/src/dynarec/la64/la64_epilog.S b/src/dynarec/la64/la64_epilog.S index 14f5dc4e..0b1feb7c 100644 --- a/src/dynarec/la64/la64_epilog.S +++ b/src/dynarec/la64/la64_epilog.S @@ -27,6 +27,15 @@ la64_epilog: st.d $r28, $r4, (8 * 13) st.d $r29, $r4, (8 * 14) st.d $r30, $r4, (8 * 15) + // restore xFlags from LBT.eflags + la.global $r12, la64_lbt + ldptr.d $r12, $r12, 0 + beqz $r12, 1f + ori $r13, $r0, 0b100011010101 + andn $r31, $r31, $r13 + x86mfflag $r13, 0b111111 + or $r31, $r31, $r13 +1: st.d $r31, $r4, (8 * 16) // xFlags st.d $r20, $r4, (8 * 17) // put back reg value in emu, including EIP (so $r20 must be EIP now) // fallback to epilog_fast now, just restoring saved regs diff --git a/src/dynarec/la64/la64_prolog.S b/src/dynarec/la64/la64_prolog.S index c1d1ed80..d0faa1e2 100644 ---
a/src/dynarec/la64/la64_prolog.S +++ b/src/dynarec/la64/la64_prolog.S @@ -6,6 +6,8 @@ .text .align 4 +.extern la64_lbt + .global la64_prolog la64_prolog: //save all 18 used register @@ -31,7 +33,7 @@ la64_prolog: fst.d $f29, $sp, (8 * 16) fst.d $f30, $sp, (8 * 17) fst.d $f31, $sp, (8 * 18) - //setup emu -> register + // setup emu -> register ld.d $r12, $r4, (8 * 0) ld.d $r13, $r4, (8 * 1) ld.d $r14, $r4, (8 * 2) @@ -48,8 +50,14 @@ la64_prolog: ld.d $r28, $r4, (8 * 13) ld.d $r29, $r4, (8 * 14) ld.d $r30, $r4, (8 * 15) - ld.d $r31, $r4, (8 * 16) //xFlags - ld.d $r20, $r4, (8 * 17) //xRIP + ld.d $r31, $r4, (8 * 16) // xFlags + ld.d $r20, $r4, (8 * 17) // xRIP + // spill xFlags to LBT.eflags + la.global $a6, la64_lbt + ldptr.d $a6, $a6, 0 + beqz $a6, 1f + x86mtflag $r31, 0b111111 +1: // push sentinel onto the stack st.d $r0, $sp, -16 st.d $r0, $sp, -8 -- cgit 1.4.1