author     Yang Liu <numbksco@gmail.com>   2024-03-05 16:49:19 +0800
committer  GitHub <noreply@github.com>     2024-03-05 09:49:19 +0100
commit     5d5c3aec3cb11b2017c108337443e4f825e3b35e (patch)
tree       ff8355db040b0d8ccc061027b32cb50a7d23616b /src
parent     0ba63ea272b8bc4dff24377d238d082df83f520c (diff)
[LA64_DYNAREC] Made eflags synchronization lazy (#1329)
* [LA64_DYNAREC] Made eflags synchronization lazy

* A smol optim

* Fixed CLEAR_FLAGS
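
In short: when the LoongArch LBT extension is available, the flag-producing emitters used to copy LBT's eflags back into xFlags after every operation (X64_GET_EFLAGS followed by an OR into xFlags). This patch drops that eager copy: the live flags stay in LBT.eflags, conditional jumps and setcc read them directly (X64_SETJ / X64_GET_EFLAGS), and xFlags is only rebuilt when it is actually consumed (RESTORE_EFLAGS around calls into C code, an equivalent x86mfflag sequence in the epilog) and re-seeded with SPILL_EFLAGS / x86mtflag. A minimal sketch of the new emitter pattern, using only names from this diff (simplified, not the full emit_add32):

    if (la64_lbt) {
        IFX(X_ALL) {
            X64_ADD_WU(s1, s2);   // LBT computes the x86 flags as a side effect
            // eager sync removed by this patch:
            //   X64_GET_EFLAGS(s3, X_ALL);
            //   OR(xFlags, xFlags, s3);
        }
        ADDxw(s1, s1, s2);        // data result
        if (!rex.w) ZEROUP(s1);
        return;                   // xFlags is deliberately left stale here
    }
    CLEAR_FLAGS(s3);              // non-LBT path only, hence moved below the LBT fast path
    // ... manual flag computation, unchanged ...
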
Diffstat (limited to 'src')
 -rw-r--r--  src/dynarec/la64/dynarec_la64_00.c          | 11
 -rw-r--r--  src/dynarec/la64/dynarec_la64_0f.c          | 19
 -rw-r--r--  src/dynarec/la64/dynarec_la64_emit_logic.c  | 40
 -rw-r--r--  src/dynarec/la64/dynarec_la64_emit_math.c   | 28
 -rw-r--r--  src/dynarec/la64/dynarec_la64_emit_shift.c  | 29
 -rw-r--r--  src/dynarec/la64/dynarec_la64_emit_tests.c  | 48
 -rw-r--r--  src/dynarec/la64/dynarec_la64_helper.c      |  2
 -rw-r--r--  src/dynarec/la64/dynarec_la64_helper.h      | 23
 -rw-r--r--  src/dynarec/la64/la64_epilog.S              |  9
 -rw-r--r--  src/dynarec/la64/la64_prolog.S              | 14
 10 files changed, 122 insertions, 101 deletions
diff --git a/src/dynarec/la64/dynarec_la64_00.c b/src/dynarec/la64/dynarec_la64_00.c
index 0b807e01..986e4a82 100644
--- a/src/dynarec/la64/dynarec_la64_00.c
+++ b/src/dynarec/la64/dynarec_la64_00.c
@@ -295,8 +295,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             i8 = F8S;                                                                               \
             BARRIER(BARRIER_MAYBE);                                                                 \
             JUMP(addr + i8, 1);                                                                     \
-            if (la64_lbt && (opcode - 0x70) >= 0xC) {                                               \
-                X64_SET_EFLAGS(xFlags, F);                                                          \
+            if (la64_lbt) {                                                                         \
                 X64_SETJ(x1, I);                                                                    \
             } else {                                                                                \
                 GETFLAGS;                                                                           \
@@ -304,7 +303,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             if (dyn->insts[ninst].x64.jmp_insts == -1 || CHECK_CACHE()) {                           \
                 /* out of block */                                                                  \
                 i32 = dyn->insts[ninst].epilog - (dyn->native_size);                                \
-                if (la64_lbt && (opcode - 0x70) >= 0xC)                                             \
+                if (la64_lbt)                                                                       \
                     BEQZ_safe(x1, i32);                                                             \
                 else                                                                                \
                     B##NO##_safe(x1, i32);                                                          \
@@ -320,7 +319,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             } else {                                                                                \
                 /* inside the block */                                                              \
                 i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address - (dyn->native_size);     \
-                if (la64_lbt && (opcode - 0x70) >= 0xC)                                             \
+                if (la64_lbt)                                                                       \
                     BNEZ_safe(x1, i32);                                                             \
                 else                                                                                \
                     B##YES##_safe(x1, i32);                                                         \
@@ -727,7 +726,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             READFLAGS(X_ZF);
             i8 = F8S;
             ADDI_D(xRCX, xRCX, -1);
-            ANDI(x1, xFlags, 1 << F_ZF);
+            if (la64_lbt) X64_GET_EFLAGS(x1, X_ZF); else ANDI(x1, xFlags, 1 << F_ZF);
             CBNZ_NEXT(x1);
             GO(0);
             break;
@@ -736,7 +735,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             READFLAGS(X_ZF);
             i8 = F8S;
             ADDI_D(xRCX, xRCX, -1);
-            ANDI(x1, xFlags, 1 << F_ZF);
+            if (la64_lbt) X64_GET_EFLAGS(x1, X_ZF); else ANDI(x1, xFlags, 1 << F_ZF);
             CBZ_NEXT(x1);
             GO(0);
             break;
diff --git a/src/dynarec/la64/dynarec_la64_0f.c b/src/dynarec/la64/dynarec_la64_0f.c
index 1560f72a..d3e7821e 100644
--- a/src/dynarec/la64/dynarec_la64_0f.c
+++ b/src/dynarec/la64/dynarec_la64_0f.c
@@ -109,13 +109,12 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             FAKEED;
             break;
 
-        #define GO(GETFLAGS, NO, YES, F, I)                                                            \
+        #define GO(GETFLAGS, NO, YES, F, I)                                                         \
             READFLAGS(F);                                                                           \
             i32_ = F32S;                                                                            \
             BARRIER(BARRIER_MAYBE);                                                                 \
             JUMP(addr + i32_, 1);                                                                   \
-            if (la64_lbt && (opcode - 0x80) >= 0xC) {                                               \
-                X64_SET_EFLAGS(xFlags, F);                                                          \
+            if (la64_lbt) {                                                                         \
                 X64_SETJ(x1, I);                                                                    \
             } else {                                                                                \
                 GETFLAGS;                                                                           \
@@ -123,7 +122,7 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             if (dyn->insts[ninst].x64.jmp_insts == -1 || CHECK_CACHE()) {                           \
                 /* out of the block */                                                              \
                 i32 = dyn->insts[ninst].epilog - (dyn->native_size);                                \
-                if (la64_lbt && (opcode - 0x80) >= 0xC)                                             \
+                if (la64_lbt)                                                                       \
                     BEQZ_safe(x1, i32);                                                             \
                 else                                                                                \
                     B##NO##_safe(x1, i32);                                                          \
@@ -139,7 +138,7 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             } else {                                                                                \
                 /* inside the block */                                                              \
                 i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address - (dyn->native_size);     \
-                if (la64_lbt && (opcode - 0x80) >= 0xC)                                             \
+                if (la64_lbt)                                                                       \
                     BNEZ_safe(x1, i32);                                                             \
                 else                                                                                \
                     B##YES##_safe(x1, i32);                                                         \
@@ -152,17 +151,13 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
 
         #define GO(GETFLAGS, NO, YES, F, I)                                                          \
             READFLAGS(F);                                                                            \
-            if (la64_lbt && (opcode - 0x90) >= 0xC) {                                                \
-                X64_SET_EFLAGS(xFlags, F);                                                           \
-                X64_SETJ(x1, I);                                                                     \
+            if (la64_lbt) {                                                                          \
+                X64_SETJ(x3, I);                                                                     \
             } else {                                                                                 \
                 GETFLAGS;                                                                            \
+                S##YES(x3, x1);                                                                      \
             }                                                                                        \
             nextop = F8;                                                                             \
-            if (la64_lbt && (opcode - 0x90) >= 0xC)                                                  \
-                SNEZ(x3, x1);                                                                        \
-            else                                                                                     \
-                S##YES(x3, x1);                                                                      \
             if (MODREG) {                                                                            \
                 if (rex.rex) {                                                                       \
                     eb1 = TO_LA64((nextop & 7) + (rex.b << 3));                                      \
diff --git a/src/dynarec/la64/dynarec_la64_emit_logic.c b/src/dynarec/la64/dynarec_la64_emit_logic.c
index ac1e1020..423b885e 100644
--- a/src/dynarec/la64/dynarec_la64_emit_logic.c
+++ b/src/dynarec/la64/dynarec_la64_emit_logic.c
@@ -25,7 +25,6 @@
 // emit XOR32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
 void emit_xor32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4)
 {
-    CLEAR_FLAGS(s3);
     IFX(X_PEND) {
         SET_DF(s4, rex.w ? d_xor64 : d_xor32);
     } else IFX(X_ALL) {
@@ -34,9 +33,10 @@ void emit_xor32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 
     if (la64_lbt) {
         IFX(X_ALL) {
-            if (rex.w) X64_XOR_D(s1, s2); else X64_XOR_W(s1, s2);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
+            if (rex.w)
+                X64_XOR_D(s1, s2);
+            else
+                X64_XOR_W(s1, s2);
         }
         XOR(s1, s1, s2);
         if (!rex.w && s1 != s2) ZEROUP(s1);
@@ -46,6 +46,7 @@ void emit_xor32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         return;
     }
 
+    CLEAR_FLAGS(s3);
     XOR(s1, s1, s2);
 
     // test sign bit before zeroup.
@@ -74,7 +75,6 @@ void emit_xor32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 // emit AND8 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
 void emit_and8c(dynarec_la64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4)
 {
-    CLEAR_FLAGS(s3);
     IFX(X_PEND) {
         SET_DF(s3, d_and8);
     } else IFX(X_ALL) {
@@ -85,8 +85,6 @@ void emit_and8c(dynarec_la64_t* dyn, int ninst, int s1, int32_t c, int s3, int s
     IFXA(X_ALL, la64_lbt) {
         MOV32w(s3, c);
         X64_AND_B(s1, s3);
-        X64_GET_EFLAGS(s4, X_ALL);
-        OR(xFlags, xFlags, s4);
     }
 
     ANDI(s1, s1, c&0xff);
@@ -97,6 +95,7 @@ void emit_and8c(dynarec_la64_t* dyn, int ninst, int s1, int32_t c, int s3, int s
 
     if (la64_lbt) return;
 
+    CLEAR_FLAGS(s3);
     IFX(X_SF) {
         SRLI_D(s3, s1, 7);
         BEQZ(s3, 8);
@@ -114,7 +113,6 @@ void emit_and8c(dynarec_la64_t* dyn, int ninst, int s1, int32_t c, int s3, int s
 // emit AND32 instruction, from s1, c, store result in s1 using s3 and s4 as scratch
 void emit_and32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4)
 {
-    CLEAR_FLAGS(s3);
     IFX(X_PEND) {
         SET_DF(s3, rex.w ? d_tst64 : d_tst32);
     } else IFX(X_ALL) {
@@ -123,9 +121,10 @@ void emit_and32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
 
     IFXA(X_ALL, la64_lbt) {
         MOV64xw(s3, c);
-        if (rex.w) X64_AND_D(s1, s3); else X64_AND_W(s1, s3);
-        X64_GET_EFLAGS(s4, X_ALL);
-        OR(xFlags, xFlags, s4);
+        if (rex.w)
+            X64_AND_D(s1, s3);
+        else
+            X64_AND_W(s1, s3);
     }
 
     if (c >= 0 && c <= 4095) {
@@ -141,6 +140,7 @@ void emit_and32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
 
     if (la64_lbt) return;
 
+    CLEAR_FLAGS(s3);
     IFX(X_SF) {
         SRLI_D(s3, s1, rex.w ? 63 : 31);
         BEQZ(s3, 8);
@@ -159,7 +159,6 @@ void emit_and32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
 // emit OR32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
 void emit_or32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4)
 {
-    CLEAR_FLAGS(s3);
     IFX(X_PEND) {
         SET_DF(s4, rex.w?d_or64:d_or32);
     } else IFX(X_ALL) {
@@ -167,9 +166,10 @@ void emit_or32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
     }
 
     IFXA(X_ALL, la64_lbt) {
-        if (rex.w) X64_OR_D(s1, s2); else X64_OR_W(s1, s2);
-        X64_GET_EFLAGS(s3, X_ALL);
-        OR(xFlags, xFlags, s3);
+        if (rex.w)
+            X64_OR_D(s1, s2);
+        else
+            X64_OR_W(s1, s2);
     }
 
     OR(s1, s1, s2);
@@ -181,6 +181,7 @@ void emit_or32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
 
     if(la64_lbt) return;
 
+    CLEAR_FLAGS(s3);
     // test sign bit before zeroup.
     IFX(X_SF) {
         if (!rex.w) SEXT_W(s1, s1);
@@ -199,7 +200,6 @@ void emit_or32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
 // emit OR32 instruction, from s1, c, store result in s1 using s3 and s4 as scratch
 void emit_or32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4)
 {
-    CLEAR_FLAGS(s3);
     IFX(X_PEND) {
         SET_DF(s4, rex.w ? d_or64 : d_or32);
     } else IFX(X_ALL) {
@@ -208,9 +208,10 @@ void emit_or32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, in
 
     IFXA(X_ALL, la64_lbt) {
         MOV64xw(s3, c);
-        if (rex.w) X64_OR_D(s1, s3); else X64_OR_W(s1, s3);
-        X64_GET_EFLAGS(s4, X_ALL);
-        OR(xFlags, xFlags, s4);
+        if (rex.w)
+            X64_OR_D(s1, s3);
+        else
+            X64_OR_W(s1, s3);
     }
 
     if (c >= 0 && c <= 4095) {
@@ -226,6 +227,7 @@ void emit_or32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, in
 
     if (la64_lbt) return;
 
+    CLEAR_FLAGS(s3);
     // test sign bit before zeroup.
     IFX(X_SF) {
         if (!rex.w) SEXT_W(s1, s1);
diff --git a/src/dynarec/la64/dynarec_la64_emit_math.c b/src/dynarec/la64/dynarec_la64_emit_math.c
index 30dc3313..1cd1fc16 100644
--- a/src/dynarec/la64/dynarec_la64_emit_math.c
+++ b/src/dynarec/la64/dynarec_la64_emit_math.c
@@ -24,7 +24,6 @@
 // emit ADD32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
 void emit_add32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5)
 {
-    CLEAR_FLAGS(s3);
     IFX(X_PEND) {
         if (rex.w) {
             ST_D(s1, xEmu, offsetof(x64emu_t, op1));
@@ -41,8 +40,6 @@ void emit_add32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     if (la64_lbt) {
         IFX(X_ALL) {
             X64_ADD_WU(s1, s2);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
         }
         ADDxw(s1, s1, s2);
         if (!rex.w) ZEROUP(s1);
@@ -52,6 +49,7 @@ void emit_add32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_CF)
     {
         if (rex.w) {
@@ -127,7 +125,6 @@ void emit_add32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 // emit ADD32 instruction, from s1, constant c, store result in s1 using s3 and s4 as scratch
 void emit_add32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s2, int s3, int s4, int s5)
 {
-    CLEAR_FLAGS(s3);
     if (s1 == xRSP && (!dyn->insts || dyn->insts[ninst].x64.gen_flags == X_PEND)) {
         // special case when doing math on ESP and only PEND is needed: ignoring it!
         if (c >= -2048 && c < 2048) {
@@ -156,8 +153,6 @@ void emit_add32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
     if (la64_lbt) {
         IFX(X_ALL) {
             X64_ADD_WU(s1, s2);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
         }
         ADDxw(s1, s1, s2);
         if (!rex.w) ZEROUP(s1);
@@ -167,6 +162,7 @@ void emit_add32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_CF)
     {
         if (rex.w) {
@@ -251,7 +247,6 @@ void emit_add32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
 // emit ADD8 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
 void emit_add8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
 {
-    CLEAR_FLAGS(s3);
     IFX(X_PEND) {
         ST_B(s1, xEmu, offsetof(x64emu_t, op1));
         ST_B(s2, xEmu, offsetof(x64emu_t, op2));
@@ -263,8 +258,6 @@ void emit_add8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     if (la64_lbt) {
         IFX(X_ALL) {
             X64_ADD_B(s1, s2);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
         }
         ADD_D(s1, s1, s2);
         ANDI(s1, s1, 0xff);
@@ -274,6 +267,7 @@ void emit_add8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_AF | X_OF)
     {
         OR(s3, s1, s2);  // s3 = op1 | op2
@@ -332,7 +326,6 @@ void emit_add8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
 // emit ADD8 instruction, from s1, const c, store result in s1 using s3 and s4 as scratch
 void emit_add8c(dynarec_la64_t* dyn, int ninst, int s1, int c, int s2, int s3, int s4)
 {
-    CLEAR_FLAGS(s3);
     IFX(X_PEND) {
         MOV32w(s4, c & 0xff);
         ST_B(s1, xEmu, offsetof(x64emu_t, op1));
@@ -346,8 +339,6 @@ void emit_add8c(dynarec_la64_t* dyn, int ninst, int s1, int c, int s2, int s3, i
         IFX(X_ALL) {
             IFX(X_PEND) {} else { MOV32w(s4, c & 0xff); }
             X64_ADD_B(s1, s4);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
         }
         ADDI_D(s1, s1, c & 0xff);
         ANDI(s1, s1, 0xff);
@@ -357,6 +348,7 @@ void emit_add8c(dynarec_la64_t* dyn, int ninst, int s1, int c, int s2, int s3, i
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_AF | X_OF)
     {
         IFX(X_PEND) {} else { MOV32w(s4, c & 0xff); }
@@ -416,7 +408,6 @@ void emit_add8c(dynarec_la64_t* dyn, int ninst, int s1, int c, int s2, int s3, i
 // emit SUB8 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
 void emit_sub8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5)
 {
-    CLEAR_FLAGS(s3);
     IFX(X_PEND) {
         ST_B(s1, xEmu, offsetof(x64emu_t, op1));
         ST_B(s2, xEmu, offsetof(x64emu_t, op2));
@@ -428,8 +419,6 @@ void emit_sub8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     if (la64_lbt) {
         IFX(X_ALL) {
             X64_SUB_B(s1, s2);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
         }
         SUB_D(s1, s1, s2);
         ANDI(s1, s1, 0xff);
@@ -439,6 +428,7 @@ void emit_sub8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_AF | X_CF | X_OF) {
         // for later flag calculation
         NOR(s5, xZR, s1);
@@ -474,7 +464,6 @@ void emit_sub8c(dynarec_la64_t* dyn, int ninst, int s1, int c, int s2, int s3, i
 // emit SUB32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
 void emit_sub32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5)
 {
-    CLEAR_FLAGS(s3);
     IFX(X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, op1));
         SDxw(s2, xEmu, offsetof(x64emu_t, op2));
@@ -486,8 +475,6 @@ void emit_sub32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     if (la64_lbt) {
         IFX(X_ALL) {
             X64_SUB_WU(s1, s2);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
         }
         SUBxw(s1, s1, s2);
         if (!rex.w) ZEROUP(s1);
@@ -497,6 +484,7 @@ void emit_sub32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_AF | X_CF | X_OF) {
         // for later flag calculation
         NOR(s5, xZR, s1);
@@ -526,7 +514,6 @@ void emit_sub32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 // emit SUB32 instruction, from s1, constant c, store result in s1 using s2, s3, s4 and s5 as scratch
 void emit_sub32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s2, int s3, int s4, int s5)
 {
-    CLEAR_FLAGS(s3);
     if(s1==xRSP && (!dyn->insts || dyn->insts[ninst].x64.gen_flags==X_PEND))
     {
         // special case when doing math on RSP and only PEND is needed: ignoring it!
@@ -548,12 +535,11 @@ void emit_sub32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
         SET_DFNONE();
     }
 
+    CLEAR_FLAGS(s3);
     if (la64_lbt) {
         IFX(X_PEND) {} else {MOV64xw(s2, c);}
         IFX(X_ALL) {
             X64_SUB_WU(s1, s2);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
         }
         SUBxw(s1, s1, s2);
         if (!rex.w) ZEROUP(s1);
diff --git a/src/dynarec/la64/dynarec_la64_emit_shift.c b/src/dynarec/la64/dynarec_la64_emit_shift.c
index ae1712d9..fe461cdb 100644
--- a/src/dynarec/la64/dynarec_la64_emit_shift.c
+++ b/src/dynarec/la64/dynarec_la64_emit_shift.c
@@ -25,7 +25,6 @@
 void emit_shl32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5)
 {
     // s2 is not 0 here and is 1..1f/3f
-    CLEAR_FLAGS(s3);
     IFX(X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, op1));
         SDxw(s2, xEmu, offsetof(x64emu_t, op2));
@@ -36,9 +35,10 @@ void emit_shl32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 
     if (la64_lbt) {
         IFX(X_ALL) {
-            if (rex.w) X64_SLL_D(s1, s2); else X64_SLL_W(s1, s2);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
+            if (rex.w)
+                X64_SLL_D(s1, s2);
+            else
+                X64_SLL_W(s1, s2);
         }
         SLL_D(s1, s1, s2);
         IFX(X_PEND) {
@@ -47,6 +47,7 @@ void emit_shl32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_CF | X_OF) {
         ADDI_D(s5, s2, rex.w?-64:-32);
         SUB_D(s5, xZR, s5);
@@ -90,8 +91,6 @@ void emit_shl32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 // emit SHR32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
 void emit_shr32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4)
 {
-    CLEAR_FLAGS(s3);
-
     IFX(X_PEND) {
         if (c) {
             MOV64x(s3, c);
@@ -113,9 +112,10 @@ void emit_shr32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
     if (la64_lbt) {
         IFX(X_PEND) {} else { MOV64x(s3, c); }
         IFX(X_ALL) {
-            if (rex.w) X64_SRL_D(s1, s3); else X64_SRL_W(s1, s3);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
+            if (rex.w)
+                X64_SRL_D(s1, s3);
+            else
+                X64_SRL_W(s1, s3);
         }
 
         SRLIxw(s1, s1, c);
@@ -126,6 +126,7 @@ void emit_shr32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_CF) {
         if (c > 1) {
             SRAI_D(s3, s1, c - 1);
@@ -171,8 +172,6 @@ void emit_shr32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
 // emit SAR32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
 void emit_sar32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4)
 {
-    CLEAR_FLAGS(s3);
-
     IFX(X_PEND) {
         if (c) {
             MOV64x(s3, c);
@@ -194,9 +193,10 @@ void emit_sar32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
     if (la64_lbt) {
         IFX(X_PEND) {} else { MOV64x(s3, c); }
         IFX(X_ALL) {
-            if (rex.w) X64_SRA_D(s1, s3); else X64_SRA_W(s1, s3);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
+            if (rex.w)
+                X64_SRA_D(s1, s3);
+            else
+                X64_SRA_W(s1, s3);
         }
 
         SRAIxw(s1, s1, c);
@@ -207,6 +207,7 @@ void emit_sar32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_CF) {
         if (c > 1) {
             SRAI_D(s3, s1, c - 1);
diff --git a/src/dynarec/la64/dynarec_la64_emit_tests.c b/src/dynarec/la64/dynarec_la64_emit_tests.c
index 20387ed6..916fdd38 100644
--- a/src/dynarec/la64/dynarec_la64_emit_tests.c
+++ b/src/dynarec/la64/dynarec_la64_emit_tests.c
@@ -25,7 +25,6 @@
 // emit CMP8 instruction, from cmp s1, s2, using s3, s4, s5 and s6 as scratch
 void emit_cmp8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5, int s6)
 {
-    CLEAR_FLAGS(s3);
     IFX_PENDOR0 {
         ST_B(s1, xEmu, offsetof(x64emu_t, op1));
         ST_B(s2, xEmu, offsetof(x64emu_t, op2));
@@ -37,8 +36,6 @@ void emit_cmp8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     if (la64_lbt) {
         IFX(X_ALL) {
             X64_SUB_B(s1, s2);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
         }
 
         IFX_PENDOR0 {
@@ -48,6 +45,7 @@ void emit_cmp8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_AF | X_CF | X_OF) {
         // for later flag calculation
         NOR(s5, xZR, s1);
@@ -77,7 +75,6 @@ void emit_cmp8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
 // emit CMP8 instruction, from cmp s1 , 0, using s3 and s4 as scratch
 void emit_cmp8_0(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4)
 {
-    CLEAR_FLAGS(s3);
     IFX_PENDOR0 {
         ST_B(s1, xEmu, offsetof(x64emu_t, op1));
         ST_B(xZR, xEmu, offsetof(x64emu_t, op2));
@@ -90,12 +87,11 @@ void emit_cmp8_0(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4)
     if (la64_lbt) {
         IFX(X_ALL) {
             X64_SUB_B(s1, xZR);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
         }
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_SF) {
         SRLI_D(s3, s1, 7);
         BEQZ(s3, 8);
@@ -113,7 +109,6 @@ void emit_cmp8_0(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4)
 // emit CMP32 instruction, from cmp s1, s2, using s3 and s4 as scratch
 void emit_cmp32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5, int s6)
 {
-    CLEAR_FLAGS(s3);
     IFX_PENDOR0 {
         SDxw(s1, xEmu, offsetof(x64emu_t, op1));
         SDxw(s2, xEmu, offsetof(x64emu_t, op2));
@@ -124,9 +119,10 @@ void emit_cmp32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 
     if (la64_lbt) {
         IFX(X_ALL) {
-            if (rex.w) X64_SUB_D(s1, s2); else X64_SUB_W(s1, s2);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
+            if (rex.w)
+                X64_SUB_D(s1, s2);
+            else
+                X64_SUB_W(s1, s2);
         }
 
         IFX_PENDOR0 {
@@ -136,6 +132,7 @@ void emit_cmp32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_AF | X_CF | X_OF) {
         // for later flag calculation
         NOR(s5, xZR, s1);
@@ -166,7 +163,6 @@ void emit_cmp32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 // emit CMP32 instruction, from cmp s1, 0, using s3 and s4 as scratch
 void emit_cmp32_0(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s3, int s4)
 {
-    CLEAR_FLAGS(s3);
     IFX_PENDOR0 {
         ST_D(s1, xEmu, offsetof(x64emu_t, op1));
         ST_D(xZR, xEmu, offsetof(x64emu_t, op2));
@@ -178,13 +174,15 @@ void emit_cmp32_0(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s3, int
 
     if (la64_lbt) {
         IFX(X_ALL) {
-            if (rex.w) X64_SUB_D(s1, xZR); else X64_SUB_W(s1, xZR);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
+            if (rex.w)
+                X64_SUB_D(s1, xZR);
+            else
+                X64_SUB_W(s1, xZR);
         }
         return;
     }
 
+    CLEAR_FLAGS(s3);
     IFX(X_SF) {
         if (rex.w) {
             BGE(s1, xZR, 8);
@@ -207,7 +205,6 @@ void emit_cmp32_0(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s3, int
 // emit TEST8 instruction, from test s1, s2, using s3, s4 and s5 as scratch
 void emit_test8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5)
 {
-    CLEAR_FLAGS(s3);
     IFX_PENDOR0 {
         SET_DF(s3, d_tst8);
     } else {
@@ -217,8 +214,6 @@ void emit_test8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     if (la64_lbt) {
         IFX(X_ALL) {
             X64_AND_B(s1, s2);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
         }
 
         IFX_PENDOR0 {
@@ -228,6 +223,7 @@ void emit_test8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
         return;
     }
 
+    CLEAR_FLAGS(s3);
     AND(s3, s1, s2); // res = s1 & s2
 
     IFX_PENDOR0 {
@@ -250,7 +246,6 @@ void emit_test8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
 // emit TEST32 instruction, from test s1, s2, using s3 and s4 as scratch
 void emit_test32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5)
 {
-    CLEAR_FLAGS(s3);
     IFX_PENDOR0 {
         SET_DF(s3, rex.w?d_tst64:d_tst32);
     } else {
@@ -259,9 +254,10 @@ void emit_test32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int
 
     if (la64_lbt) {
         IFX(X_ALL) {
-            if (rex.w) X64_AND_D(s1, s2); else X64_AND_W(s1, s2);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
+            if (rex.w)
+                X64_AND_D(s1, s2);
+            else
+                X64_AND_W(s1, s2);
         }
 
         IFX_PENDOR0 {
@@ -271,6 +267,7 @@ void emit_test32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int
         return;
     }
 
+    CLEAR_FLAGS(s3);
     AND(s3, s1, s2); // res = s1 & s2
 
     IFX_PENDOR0 {
@@ -296,7 +293,6 @@ void emit_test32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int
 // emit TEST32 instruction, from test s1, s2, using s3 and s4 as scratch
 void emit_test32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4, int s5)
 {
-    CLEAR_FLAGS(s3);
     IFX_PENDOR0 {
         SET_DF(s3, rex.w ? d_tst64 : d_tst32);
     } else {
@@ -307,9 +303,10 @@ void emit_test32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c,
     if (la64_lbt) {
         IFX(X_ALL) {
             MOV64xw(s3, c);
-            if (rex.w) X64_AND_D(s1, s3); else X64_AND_W(s1, s3);
-            X64_GET_EFLAGS(s3, X_ALL);
-            OR(xFlags, xFlags, s3);
+            if (rex.w)
+                X64_AND_D(s1, s3);
+            else
+                X64_AND_W(s1, s3);
         }
 
         IFX_PENDOR0 {
@@ -324,6 +321,7 @@ void emit_test32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c,
         return;
     }
 
+    CLEAR_FLAGS(s3);
     if (c >= 0 && c <= 4095) {
         ANDI(s3, s1, c);
     } else {
diff --git a/src/dynarec/la64/dynarec_la64_helper.c b/src/dynarec/la64/dynarec_la64_helper.c
index 03934883..cbd97fa8 100644
--- a/src/dynarec/la64/dynarec_la64_helper.c
+++ b/src/dynarec/la64/dynarec_la64_helper.c
@@ -451,6 +451,7 @@ void call_c(dynarec_la64_t* dyn, int ninst, void* fnc, int reg, int ret, int sav
     if (savereg == 0)
         savereg = x6;
     if (saveflags) {
+        RESTORE_EFLAGS(reg);
         ST_D(xFlags, xEmu, offsetof(x64emu_t, eflags));
     }
     fpu_pushcache(dyn, ninst, reg, 0);
@@ -499,6 +500,7 @@ void call_c(dynarec_la64_t* dyn, int ninst, void* fnc, int reg, int ret, int sav
     fpu_popcache(dyn, ninst, reg, 0);
     if (saveflags) {
         LD_D(xFlags, xEmu, offsetof(x64emu_t, eflags));
+        SPILL_EFLAGS();
     }
     SET_NODF();
     dyn->last_ip = 0;
diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h
index 7a174edd..1b9490a3 100644
--- a/src/dynarec/la64/dynarec_la64_helper.h
+++ b/src/dynarec/la64/dynarec_la64_helper.h
@@ -269,8 +269,11 @@
 #define SET_NODF() dyn->f.dfnone = 0
 #define SET_DFOK() dyn->f.dfnone = 1
 
+#define CLEAR_FLAGS_(s) \
+    MOV64x(s, (1UL << F_AF) | (1UL << F_CF) | (1UL << F_OF) | (1UL << F_ZF) | (1UL << F_SF) | (1UL << F_PF)); ANDN(xFlags, xFlags, s);
+
 #define CLEAR_FLAGS(s) \
-    IFX(X_ALL) { MOV64x(s, (1UL << F_AF) | (1UL << F_CF) | (1UL << F_OF) | (1UL << F_ZF) | (1UL << F_SF) | (1UL << F_PF)); ANDN(xFlags, xFlags, s); }
+    IFX(X_ALL) { CLEAR_FLAGS_(s) }
 
 #define CALC_SUB_FLAGS(op1_, op2, res, scratch1, scratch2, width)     \
     IFX(X_AF | X_CF | X_OF)                                           \
@@ -664,4 +667,22 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             opcode = F8;                           \
         }
 
+// Restore xFlags from LBT.eflags
+#define RESTORE_EFLAGS(s)               \
+    do {                                \
+        if (la64_lbt) {                 \
+            CLEAR_FLAGS_(reg);          \
+            X64_GET_EFLAGS(reg, X_ALL); \
+            OR(xFlags, xFlags, reg);    \
+        }                               \
+    } while (0)
+
+// Spill xFlags to LBT.eflags
+#define SPILL_EFLAGS()                     \
+    do {                                   \
+        if (la64_lbt) {                    \
+            X64_SET_EFLAGS(xFlags, X_ALL); \
+        }                                  \
+    } while (0)
+
 #endif //__DYNAREC_LA64_HELPER_H__
\ No newline at end of file
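
The lazy scheme rests on the two helpers defined above: RESTORE_EFLAGS materializes LBT.eflags into xFlags when box64 actually needs the register, and SPILL_EFLAGS pushes xFlags back into LBT.eflags afterwards. A condensed sketch of how the patched call_c() (see dynarec_la64_helper.c above) brackets a call into C code with them, again using only names from this diff:

    if (saveflags) {
        RESTORE_EFLAGS(reg);                             // pull live flags out of LBT into xFlags
        ST_D(xFlags, xEmu, offsetof(x64emu_t, eflags));  // save them where the C code expects them
    }
    // ... push caches, call the C function, pop caches ...
    if (saveflags) {
        LD_D(xFlags, xEmu, offsetof(x64emu_t, eflags));  // reload (the C code may have changed them)
        SPILL_EFLAGS();                                  // re-seed LBT.eflags with the result
    }
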
diff --git a/src/dynarec/la64/la64_epilog.S b/src/dynarec/la64/la64_epilog.S
index 14f5dc4e..0b1feb7c 100644
--- a/src/dynarec/la64/la64_epilog.S
+++ b/src/dynarec/la64/la64_epilog.S
@@ -27,6 +27,15 @@ la64_epilog:
     st.d   $r28, $r4, (8 * 13)
     st.d   $r29, $r4, (8 * 14)
     st.d   $r30, $r4, (8 * 15)
+    // restore xFlags from LBT.eflags
+    la.global $r12, la64_lbt
+    ldptr.d   $r12, $r12, 0
+    beqz      $r12, 1f
+    ori       $r13, $r0, 0b100011010101
+    andn      $r31, $r31, $r13
+    x86mfflag $r13, 0b111111
+    or        $r31, $r31, $r13
+1:
     st.d   $r31, $r4, (8 * 16) // xFlags
     st.d   $r20, $r4, (8 * 17) // put back reg value in emu, including EIP (so $r20 must be EIP now)
     // fallback to epilog_fast now, just restoring saved regs
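
Side note on the epilog change above: the mask 0b100011010101 covers exactly the six x86 status-flag bits (CF=bit 0, PF=bit 2, AF=bit 4, ZF=bit 6, SF=bit 7, OF=bit 11). Those bits of xFlags ($r31) are cleared and then refilled from LBT via x86mfflag before the register is written back into the emu structure, since with lazy synchronization the up-to-date flags live in LBT; the prolog change below does the reverse with x86mtflag.
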
diff --git a/src/dynarec/la64/la64_prolog.S b/src/dynarec/la64/la64_prolog.S
index c1d1ed80..d0faa1e2 100644
--- a/src/dynarec/la64/la64_prolog.S
+++ b/src/dynarec/la64/la64_prolog.S
@@ -6,6 +6,8 @@
 .text
 .align 4
 
+.extern la64_lbt
+
 .global la64_prolog
 la64_prolog:
     //save all 18 used register
@@ -31,7 +33,7 @@ la64_prolog:
     fst.d  $f29, $sp, (8 * 16)
     fst.d  $f30, $sp, (8 * 17)
     fst.d  $f31, $sp, (8 * 18)
-    //setup emu -> register
+    // setup emu -> register
     ld.d   $r12, $r4, (8 * 0)
     ld.d   $r13, $r4, (8 * 1)
     ld.d   $r14, $r4, (8 * 2)
@@ -48,8 +50,14 @@ la64_prolog:
     ld.d   $r28, $r4, (8 * 13)
     ld.d   $r29, $r4, (8 * 14)
     ld.d   $r30, $r4, (8 * 15)
-    ld.d   $r31, $r4, (8 * 16)  //xFlags
-    ld.d   $r20, $r4, (8 * 17)  //xRIP
+    ld.d   $r31, $r4, (8 * 16)  // xFlags
+    ld.d   $r20, $r4, (8 * 17)  // xRIP
+    // spill xFlags to LBT.eflags
+    la.global $a6, la64_lbt
+    ldptr.d   $a6, $a6, 0
+    beqz      $a6, 1f
+    x86mtflag $r31, 0b111111
+1:
     // push sentinel onto the stack
     st.d   $r0, $sp, -16
     st.d   $r0,  $sp, -8