about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Yang Liu <liuyang22@iscas.ac.cn> 2025-05-23 00:57:09 +0800
committer GitHub <noreply@github.com> 2025-05-22 18:57:09 +0200
commit c9f3b990b41c73cf1f893e52e37db6aa3e7afff0 (patch)
tree 97f22bdde5dead412e675e46cc0a53bb689d54d1 /src
parent 6592b63800a97e851e5fd4d4a05b06663f1d286d (diff)
download box64-c9f3b990b41c73cf1f893e52e37db6aa3e7afff0.tar.gz
box64-c9f3b990b41c73cf1f893e52e37db6aa3e7afff0.zip
[RV64_DYNAREC] Enable nativeflags optimization for more patterns (#2659)
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_00_1.c | 3
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_00_2.c | 6
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_00_3.c | 9
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_0f.c | 1
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_66.c | 4
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_67.c | 7
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_emit_logic.c | 42
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_emit_math.c | 61
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_emit_shift.c | 89
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_emit_tests.c | 47
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_functions.c | 31
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.c | 26
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.h | 25
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_pass0.h | 23
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_private.h | 2
-rw-r--r-- src/dynarec/rv64/rv64_mapping.h | 1
16 files changed, 201 insertions, 176 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_00_1.c b/src/dynarec/rv64/dynarec_rv64_00_1.c
index 03ecffa1..3e4fd754 100644
--- a/src/dynarec/rv64/dynarec_rv64_00_1.c
+++ b/src/dynarec/rv64/dynarec_rv64_00_1.c
@@ -86,6 +86,7 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x56:
         case 0x57:
             INST_NAME("PUSH reg");
+            SCRATCH_USAGE(0);
             gd = TO_NAT((opcode & 0x07) + (rex.b << 3));
             PUSH1z(gd);
             break;
@@ -98,6 +99,7 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5E:
         case 0x5F:
             INST_NAME("POP reg");
+            SCRATCH_USAGE(0);
             gd = TO_NAT((opcode & 0x07) + (rex.b << 3));
             POP1z(gd);
             break;
@@ -151,6 +153,7 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 INST_NAME("MOVSXD Gd, Ed");
                 nextop = F8;
                 GETGD;
+                SCRATCH_USAGE(0);
                 if (rex.w) {
                     if (MODREG) { // reg <= reg
                         ADDIW(gd, TO_NAT((nextop & 7) + (rex.b << 3)), 0);
diff --git a/src/dynarec/rv64/dynarec_rv64_00_2.c b/src/dynarec/rv64/dynarec_rv64_00_2.c
index ffc4d43a..7551cce4 100644
--- a/src/dynarec/rv64/dynarec_rv64_00_2.c
+++ b/src/dynarec/rv64/dynarec_rv64_00_2.c
@@ -400,6 +400,7 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGD;
             if (MODREG) { // reg <= reg
+                SCRATCH_USAGE(0);
                 MVxw(TO_NAT((nextop & 7) + (rex.b << 3)), gd);
             } else { // mem <= reg
                 IF_UNALIGNED(ip) {
@@ -413,6 +414,7 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         }
                     }
                 } else {
+                    SCRATCH_USAGE(0);
                     addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0);
                     SDxw(gd, ed, fixedaddress);
                 }
@@ -464,6 +466,7 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("MOV Gd, Ed");
             nextop = F8;
             GETGD;
+            SCRATCH_USAGE(0);
             if (MODREG) {
                 MVxw(gd, TO_NAT((nextop & 7) + (rex.b << 3)));
             } else {
@@ -476,6 +479,7 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("MOV Ed, Seg");
             nextop = F8;
             if (MODREG) {
+                SCRATCH_USAGE(0);
                 LHU(TO_NAT((nextop & 7) + (rex.b << 3)), xEmu, offsetof(x64emu_t, segs[(nextop & 0x38) >> 3]));
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
@@ -1042,6 +1046,7 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xB3:
             INST_NAME("MOV xL, Ib");
             u8 = F8;
+            SCRATCH_USAGE(0);
             if (rex.rex)
                 gb1 = TO_NAT((opcode & 7) + (rex.b << 3));
             else
@@ -1078,6 +1083,7 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xBF:
             INST_NAME("MOV Reg, Id");
             gd = TO_NAT((opcode & 7) + (rex.b << 3));
+            SCRATCH_USAGE(0);
             if (rex.w) {
                 u64 = F64;
                 MOV64x(gd, u64);
diff --git a/src/dynarec/rv64/dynarec_rv64_00_3.c b/src/dynarec/rv64/dynarec_rv64_00_3.c
index c10742db..868e55bb 100644
--- a/src/dynarec/rv64/dynarec_rv64_00_3.c
+++ b/src/dynarec/rv64/dynarec_rv64_00_3.c
@@ -373,9 +373,11 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     ORI(eb1, eb1, u8);
                 }
             } else { // mem <= u8
+                SCRATCH_USAGE(0);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, &lock, 1, 1);
                 u8 = F8;
                 if (u8) {
+                    SCRATCH_USAGE(1);
                     ADDI(x3, xZR, u8);
                     ed = x3;
                 } else
@@ -388,6 +390,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("MOV Ed, Id");
             nextop = F8;
             if (MODREG) { // reg <= i32
+                SCRATCH_USAGE(0);
                 i64 = F32S;
                 ed = TO_NAT((nextop & 7) + (rex.b << 3));
                 MOV64xw(ed, i64);
@@ -409,9 +412,11 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         }
                     }
                 } else {
+                    SCRATCH_USAGE(0);
                     addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, &lock, 1, 4);
                     i64 = F32S;
                     if (i64) {
+                        SCRATCH_USAGE(1);
                         MOV64x(x3, i64);
                         ed = x3;
                     } else
@@ -802,7 +807,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                             SLLW(ed, ed, xRCX);
                         if (dyn->insts[ninst].nat_flags_fusion) {
                             if (!rex.w) ZEROUP(ed);
-                            NAT_FLAGS_OPS(ed, xZR);
+                            NAT_FLAGS_OPS(ed, xZR, x5, xZR);
                         } else if (!rex.w && MODREG) {
                             ZEROUP(ed);
                         }
@@ -827,7 +832,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                             SRLW(ed, ed, xRCX);
                         if (dyn->insts[ninst].nat_flags_fusion) {
                             if (!rex.w) ZEROUP(ed);
-                            NAT_FLAGS_OPS(ed, xZR);
+                            NAT_FLAGS_OPS(ed, xZR, x5, xZR);
                         } else if (!rex.w && MODREG) {
                             ZEROUP(ed);
                         }
diff --git a/src/dynarec/rv64/dynarec_rv64_0f.c b/src/dynarec/rv64/dynarec_rv64_0f.c
index 82f8f632..b2f8c720 100644
--- a/src/dynarec/rv64/dynarec_rv64_0f.c
+++ b/src/dynarec/rv64/dynarec_rv64_0f.c
@@ -2077,6 +2077,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("MOVZX Gd, Eb");
             nextop = F8;
             GETGD;
+            SCRATCH_USAGE(0);
             if (MODREG) {
                 if (rex.rex) {
                     eb1 = TO_NAT((nextop & 7) + (rex.b << 3));
diff --git a/src/dynarec/rv64/dynarec_rv64_66.c b/src/dynarec/rv64/dynarec_rv64_66.c
index f7134d6e..68ed8f84 100644
--- a/src/dynarec/rv64/dynarec_rv64_66.c
+++ b/src/dynarec/rv64/dynarec_rv64_66.c
@@ -403,6 +403,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0x56:
         case 0x57:
             INST_NAME("PUSH reg");
+            SCRATCH_USAGE(0);
             gd = TO_NAT((opcode & 0x07) + (rex.b << 3));
             PUSH1_16(gd);
             break;
@@ -683,6 +684,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     INSHz(ed, gd, x2, x3, 1, 1);
                 }
             } else {
+                SCRATCH_USAGE(0);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0);
                 SH(gd, ed, fixedaddress);
                 SMWRITELOCK(lock);
@@ -1176,7 +1178,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                             }
                             wb1 = 0;
                             EWBACK;
-                            if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(ed, xZR);
+                            if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(ed, xZR, x5, xZR);
                         } else {
                             GETEW(x1, 0);
                             u8 = (F8) & 0x1f;
diff --git a/src/dynarec/rv64/dynarec_rv64_67.c b/src/dynarec/rv64/dynarec_rv64_67.c
index 8fd60088..aeb6535d 100644
--- a/src/dynarec/rv64/dynarec_rv64_67.c
+++ b/src/dynarec/rv64/dynarec_rv64_67.c
@@ -206,6 +206,7 @@ uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     INST_NAME("MOVXZ Gd, Eb");
                     nextop = F8;
                     GETGD;
+                    SCRATCH_USAGE(0);
                     if (MODREG) {
                         if (rex.rex) {
                             eb1 = TO_NAT((nextop & 7) + (rex.b << 3));
@@ -231,6 +232,7 @@ uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     INST_NAME("MOVZX Gd, Ew");
                     nextop = F8;
                     GETGD;
+                    SCRATCH_USAGE(0);
                     if (MODREG) {
                         ed = TO_NAT((nextop & 7) + (rex.b << 3));
                         ZEXTH(gd, ed);
@@ -482,6 +484,7 @@ uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("MOVSXD Gd, Ed");
             nextop = F8;
             GETGD;
+            SCRATCH_USAGE(0);
             if (rex.w) {
                 if (MODREG) { // reg <= reg
                     ADDIW(gd, TO_NAT((nextop & 7) + (rex.b << 3)), 0);
@@ -516,6 +519,7 @@ uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                             OR(ed, ed, x2);
                         }
                     } else {
+                        SCRATCH_USAGE(0);
                         addr = geted32(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0);
                         SH(gd, ed, fixedaddress);
                         SMWRITELOCK(lock);
@@ -706,6 +710,7 @@ uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("MOV Ed, Gd");
             nextop = F8;
             GETGD;
+            SCRATCH_USAGE(0);
             if (MODREG) { // reg <= reg
                 MVxw(TO_NAT((nextop & 7) + (rex.b << 3)), gd);
             } else { // mem <= reg
@@ -718,6 +723,7 @@ uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("MOV Gd, Ed");
             nextop = F8;
             GETGD;
+            SCRATCH_USAGE(0);
             if (MODREG) {
                 MVxw(gd, TO_NAT((nextop & 7) + (rex.b << 3)));
             } else {
@@ -770,6 +776,7 @@ uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("MOV Ed, Id");
             nextop = F8;
             if (MODREG) { // reg <= i32
+                SCRATCH_USAGE(0);
                 i64 = F32S;
                 ed = TO_NAT((nextop & 7) + (rex.b << 3));
                 MOV64xw(ed, i64);
diff --git a/src/dynarec/rv64/dynarec_rv64_emit_logic.c b/src/dynarec/rv64/dynarec_rv64_emit_logic.c
index 32edad65..ba4e991b 100644
--- a/src/dynarec/rv64/dynarec_rv64_emit_logic.c
+++ b/src/dynarec/rv64/dynarec_rv64_emit_logic.c
@@ -31,8 +31,6 @@ void emit_xor8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
 
     XOR(s1, s1, s2);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_SF) {
         SRLI(s3, s1, 7);
         SET_FLAGS_NEZ(s3, F_SF, s4);
@@ -46,6 +44,7 @@ void emit_xor8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit XOR8 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
@@ -59,8 +58,6 @@ void emit_xor8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s
     }
     XORI(s1, s1, c & 0xff);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_SF) {
         SRLI(s3, s1, 7);
         SET_FLAGS_NEZ(s3, F_SF, s4);
@@ -74,6 +71,7 @@ void emit_xor8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit XOR32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -88,8 +86,6 @@ void emit_xor32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 
     XOR(s1, s1, s2);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     // test sign bit before zeroup.
     IFX (X_SF) {
         if (!rex.w) SEXT_W(s1, s1);
@@ -108,6 +104,7 @@ void emit_xor32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit XOR32 instruction, from s1, c, store result in s1 using s3 and s4 as scratch
@@ -127,8 +124,6 @@ void emit_xor32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
         XOR(s1, s1, s3);
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     // test sign bit before zeroup.
     IFX (X_SF) {
         if (!rex.w) SEXT_W(s1, s1);
@@ -146,6 +141,7 @@ void emit_xor32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit XOR16 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch, s4 can be same as s2 (and so s2 destroyed)
@@ -160,8 +156,6 @@ void emit_xor16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
 
     XOR(s1, s1, s2);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -178,6 +172,7 @@ void emit_xor16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit OR16 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch, s4 can be same as s2 (and so s2 destroyed)
@@ -192,8 +187,6 @@ void emit_or16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
 
     OR(s1, s1, s2);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SD(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -209,6 +202,7 @@ void emit_or16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit OR32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -223,8 +217,6 @@ void emit_or32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
 
     OR(s1, s1, s2);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_SF) {
         if (!rex.w) SEXT_W(s1, s1);
         SET_FLAGS_LTZ(s1, F_SF, s3, s4);
@@ -243,6 +235,7 @@ void emit_or32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit OR32 instruction, from s1, c, store result in s1 using s3 and s4 as scratch
@@ -262,8 +255,6 @@ void emit_or32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, in
         OR(s1, s1, s3);
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_SF) {
         if (!rex.w) SEXT_W(s1, s1);
         SET_FLAGS_LTZ(s1, F_SF, s3, s4);
@@ -282,6 +273,7 @@ void emit_or32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, in
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 
@@ -297,8 +289,6 @@ void emit_and8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
 
     AND(s1, s1, s2);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SB(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -312,6 +302,7 @@ void emit_and8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 
@@ -327,8 +318,6 @@ void emit_and8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s
 
     ANDI(s1, s1, c & 0xff);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SD(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -342,6 +331,7 @@ void emit_and8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 void emit_and16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
@@ -355,8 +345,6 @@ void emit_and16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
 
     AND(s1, s1, s2); // res = s1 & s2
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -370,6 +358,7 @@ void emit_and16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit AND32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -385,8 +374,6 @@ void emit_and32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     AND(s1, s1, s2); // res = s1 & s2
     if (!rex.w) ZEROUP(s1);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -400,6 +387,7 @@ void emit_and32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit AND32 instruction, from s1, c, store result in s1 using s3 and s4 as scratch
@@ -420,8 +408,6 @@ void emit_and32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
     }
     if (!rex.w && c < 0) ZEROUP(s1);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -435,6 +421,7 @@ void emit_and32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit OR8 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch, s4 can be same as s2 (and so s2 destroyed)
@@ -449,8 +436,6 @@ void emit_or8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
 
     OR(s1, s1, s2);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SB(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -464,6 +449,7 @@ void emit_or8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit OR8 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
diff --git a/src/dynarec/rv64/dynarec_rv64_emit_math.c b/src/dynarec/rv64/dynarec_rv64_emit_math.c
index 131564a3..9c2291a9 100644
--- a/src/dynarec/rv64/dynarec_rv64_emit_math.c
+++ b/src/dynarec/rv64/dynarec_rv64_emit_math.c
@@ -60,8 +60,6 @@ void emit_add32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 
     ADDxw(s1, s1, s2);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -97,6 +95,7 @@ void emit_add32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX (X_ZF) {
         SET_FLAGS_EQZ(s1, F_ZF, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ADD32 instruction, from s1, constant c, store result in s1 using s3 and s4 as scratch
@@ -164,8 +163,6 @@ void emit_add32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
         ADDxw(s1, s1, s2);
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -201,6 +198,7 @@ void emit_add32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
     IFX (X_ZF) {
         SET_FLAGS_EQZ(s1, F_ZF, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ADD16 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -220,8 +218,6 @@ void emit_add16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     }
     ADD(s1, s1, s2);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SW(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -263,6 +259,7 @@ void emit_add16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ADD8 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -282,8 +279,6 @@ void emit_add8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     }
     ADD(s1, s1, s2);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_AF | X_OF) {
         if (rv64_zbb) {
             ANDN(s3, s3, s1); // s3 = ~res & (op1 | op2)
@@ -322,6 +317,7 @@ void emit_add8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ADD8 instruction, from s1, const c, store result in s1 using s3 and s4 as scratch
@@ -346,8 +342,6 @@ void emit_add8c(dynarec_rv64_t* dyn, int ninst, int s1, int c, int s2, int s3, i
     }
     ADDI(s1, s1, c);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_AF | X_OF) {
         if (rv64_zbb) {
             ANDN(s3, s3, s1); // s3 = ~res & (op1 | op2)
@@ -386,6 +380,7 @@ void emit_add8c(dynarec_rv64_t* dyn, int ninst, int s1, int c, int s2, int s3, i
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SUB8 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -408,8 +403,6 @@ void emit_sub8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     SUB(s1, s1, s2);
     ANDI(s1, s1, 0xff);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_SF) {
         SRLI(s3, s1, 7);
         SET_FLAGS_NEZ(s3, F_SF, s4);
@@ -424,6 +417,7 @@ void emit_sub8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SUB8 instruction, from s1, constant c, store result in s1 using s3 and s4 as scratch
@@ -451,7 +445,6 @@ void emit_sub16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     }
 
     SUBW(s1, s1, s2);
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
@@ -469,6 +462,7 @@ void emit_sub16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 
@@ -490,7 +484,6 @@ void emit_sub32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     }
 
     SUBxw(s1, s1, s2);
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
@@ -508,6 +501,7 @@ void emit_sub32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 
@@ -551,7 +545,6 @@ void emit_sub32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
         }
         SUBxw(s1, s1, s2);
     }
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_AF | X_CF | X_OF) {
         IFX (X_PEND) {
@@ -575,6 +568,7 @@ void emit_sub32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit INC8 instruction, from s1, store result in s1 using s2, s3 and s4 as scratch
@@ -595,7 +589,6 @@ void emit_inc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     }
 
     ADDIW(s1, s1, 1);
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SB(s1, xEmu, offsetof(x64emu_t, res));
@@ -631,6 +624,7 @@ void emit_inc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     IFX (X_ZF) {
         SET_FLAGS_EQZ(s1, F_ZF, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 
@@ -653,7 +647,6 @@ void emit_dec8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     }
 
     ADDIW(s1, s1, -1);
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SB(s1, xEmu, offsetof(x64emu_t, res));
@@ -684,6 +677,7 @@ void emit_dec8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     IFX (X_ZF) {
         SET_FLAGS_EQZ(s1, F_ZF, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit INC32 instruction, from s1, store result in s1 using s3 and s4 as scratch
@@ -704,7 +698,6 @@ void emit_inc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     }
 
     ADDIxw(s1, s1, 1);
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
@@ -741,6 +734,7 @@ void emit_inc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX (X_ZF) {
         SET_FLAGS_EQZ(s1, F_ZF, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit DEC32 instruction, from s1, store result in s1 using s3 and s4 as scratch
@@ -762,7 +756,6 @@ void emit_dec32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     }
 
     ADDIxw(s1, s1, -1);
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
@@ -794,6 +787,7 @@ void emit_dec32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX (X_ZF) {
         SET_FLAGS_EQZ(s1, F_ZF, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit INC16 instruction, from s1, store result in s1 using s3 and s4 as scratch
@@ -814,7 +808,6 @@ void emit_inc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     }
 
     ADDI(s1, s1, 1);
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
@@ -852,6 +845,7 @@ void emit_inc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit DEC16 instruction, from s1, store result in s1 using s3 and s4 as scratch
@@ -874,8 +868,6 @@ void emit_dec16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
 
     ADDIW(s1, s1, -1);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -905,6 +897,7 @@ void emit_dec16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     IFX (X_ZF) {
         SET_FLAGS_EQZ(s1, F_ZF, s5);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SBB8 instruction, from s1, s2, store result in s1 using s3, s4 and s5 as scratch
@@ -928,8 +921,6 @@ void emit_sbb8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     SUBW(s1, s1, s3);
     ANDI(s1, s1, 0xff);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     CLEAR_FLAGS();
     IFX (X_PEND) {
         SB(s1, xEmu, offsetof(x64emu_t, res));
@@ -946,6 +937,7 @@ void emit_sbb8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ADC8 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -967,8 +959,6 @@ void emit_adc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     ANDI(s3, xFlags, 1 << F_CF);
     ADD(s1, s1, s3);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     CLEAR_FLAGS();
     IFX (X_PEND) {
         SW(s1, xEmu, offsetof(x64emu_t, res));
@@ -1010,6 +1000,7 @@ void emit_adc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ADC8 instruction, from s1, const c, store result in s1 using s3, s4, s5 and s6 as scratch
@@ -1046,8 +1037,6 @@ void emit_sbb16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     ANDI(s3, xFlags, 1 << F_CF);
     SUBW(s1, s1, s3);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     CLEAR_FLAGS();
     SLLIW(s1, s1, 16);
     IFX (X_SF) {
@@ -1066,6 +1055,7 @@ void emit_sbb16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SBB32 instruction, from s1, s2, store result in s1 using s3, s4 and s5 as scratch
@@ -1088,8 +1078,6 @@ void emit_sbb32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     ANDI(s3, xFlags, 1 << F_CF);
     SUBxw(s1, s1, s3);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     CLEAR_FLAGS();
     IFX (X_SF) {
         SET_FLAGS_LTZ(s1, F_SF, s4, s5);
@@ -1109,6 +1097,7 @@ void emit_sbb32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit NEG32 instruction, from s1, store result in s1 using s2 and s3 as scratch
@@ -1126,7 +1115,6 @@ void emit_neg32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     }
 
     NEGxw(s1, s1);
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
@@ -1164,6 +1152,7 @@ void emit_neg32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX (X_ZF) {
         SET_FLAGS_EQZ(s1, F_ZF, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit NEG16 instruction, from s1, store result in s1 using s2 and s3 as scratch
@@ -1182,7 +1171,6 @@ void emit_neg16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
 
     NEG(s1, s1);
     ZEXTH(s1, s1);
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
@@ -1219,6 +1207,7 @@ void emit_neg16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     IFX (X_ZF) {
         SET_FLAGS_EQZ(s1, F_ZF, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit NEG8 instruction, from s1, store result in s1 using s2 and s3 as scratch
@@ -1237,7 +1226,6 @@ void emit_neg8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
 
     NEG(s1, s1);
     ANDI(s1, s1, 0xff);
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SB(s1, xEmu, offsetof(x64emu_t, res));
@@ -1273,6 +1261,7 @@ void emit_neg8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     IFX (X_ZF) {
         SET_FLAGS_EQZ(s1, F_ZF, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ADC16 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -1294,8 +1283,6 @@ void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     ANDI(s3, xFlags, 1 << F_CF);
     ADD(s1, s1, s3);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     CLEAR_FLAGS();
     IFX (X_PEND) {
         SW(s1, xEmu, offsetof(x64emu_t, res));
@@ -1337,6 +1324,7 @@ void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ADC32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -1384,8 +1372,6 @@ void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     ANDI(s3, xFlags, 1 << F_CF);
     ADDxw(s1, s1, s3);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     CLEAR_FLAGS();
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
@@ -1425,4 +1411,5 @@ void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX (X_ZF) {
         SET_FLAGS_EQZ(s1, F_ZF, s5);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
diff --git a/src/dynarec/rv64/dynarec_rv64_emit_shift.c b/src/dynarec/rv64/dynarec_rv64_emit_shift.c
index ed3e4adf..1bba5289 100644
--- a/src/dynarec/rv64/dynarec_rv64_emit_shift.c
+++ b/src/dynarec/rv64/dynarec_rv64_emit_shift.c
@@ -49,7 +49,6 @@ void emit_shl8c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
             SET_FLAGS_LTZ(s1, F_SF, s3, s4);
         }
         SRLI(s1, s1, 56);
-        if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
         IFX (X_PEND) {
             SB(s1, xEmu, offsetof(x64emu_t, res));
@@ -69,6 +68,7 @@ void emit_shl8c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
         IFX (X_PF) {
             emit_pf(dyn, ninst, s1, s3, s4);
         }
+        if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
     } else {
         IFX (X_CF) {
             if (c == 8) {
@@ -77,7 +77,6 @@ void emit_shl8c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
             }
         }
         MV(s1, xZR);
-        if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(xZR, xZR);
 
         IFX (X_PEND) {
             SB(s1, xEmu, offsetof(x64emu_t, res));
@@ -93,6 +92,7 @@ void emit_shl8c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
                 ORI(xFlags, xFlags, 1 << F_PF);
             }
         }
+        if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(xZR, xZR, xZR, xZR);
     }
 }
 
@@ -133,7 +133,6 @@ void emit_shr8c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
     SRLI(s1, s1, c);
     ANDI(s1, s1, 0xff);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     // SF should be unset
     IFX (X_PEND) {
@@ -145,6 +144,7 @@ void emit_shr8c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SAR8 instruction, from s1 , constant c, store result in s1 using s3, s4 and s5 as scratch
@@ -181,8 +181,6 @@ void emit_sar8c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
     SRLI(s1, s1, c);
     ANDI(s1, s1, 0xff);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SB(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -192,6 +190,7 @@ void emit_sar8c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SHL8 instruction, from s1 , shift s2, store result in s1 using s3, s4 and s5 as scratch
@@ -223,8 +222,6 @@ void emit_shl8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     }
     SRLI(s1, s1, 56);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SB(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -243,6 +240,7 @@ void emit_shl8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SHR8 instruction, from s1 , shift s2 (!0 and and'd already), store result in s1 using s3 and s4 as scratch
@@ -279,8 +277,6 @@ void emit_shr8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     SRL(s1, s1, s2);
     ANDI(s1, s1, 0xff);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     // SF should be unset
     IFX (X_PEND) {
         SB(s1, xEmu, offsetof(x64emu_t, res));
@@ -291,6 +287,7 @@ void emit_shr8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SAR8 instruction, from s1 , shift s2 (!0 and and'd already), store result in s1 using s3, s4 and s5 as scratch
@@ -323,8 +320,6 @@ void emit_sar8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     SRL(s1, s1, s2);
     ANDI(s1, s1, 0xff);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SB(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -334,6 +329,7 @@ void emit_sar8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SHL16 instruction, from s1 , constant c, store result in s1 using s3, s4 and s5 as scratch
@@ -367,8 +363,6 @@ void emit_shl16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
         }
         SRLI(s1, s1, 48);
 
-        if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
         IFX (X_PEND) {
             SH(s1, xEmu, offsetof(x64emu_t, res));
         }
@@ -387,6 +381,7 @@ void emit_shl16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
         IFX (X_PF) {
             emit_pf(dyn, ninst, s1, s3, s4);
         }
+        if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
     } else {
         IFX (X_CF) {
             if (c == 16) {
@@ -395,7 +390,6 @@ void emit_shl16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
             }
         }
         MV(s1, xZR);
-        if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(xZR, xZR);
 
         IFX (X_PEND) {
             SH(s1, xEmu, offsetof(x64emu_t, res));
@@ -411,6 +405,7 @@ void emit_shl16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
                 ORI(xFlags, xFlags, 1 << F_PF);
             }
         }
+        if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(xZR, xZR, xZR, xZR);
     }
 }
 
@@ -450,8 +445,6 @@ void emit_shr16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
 
     SRLI(s1, s1, c);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     // SF should be unset
 
     IFX (X_PEND) {
@@ -463,6 +456,7 @@ void emit_shr16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SAR16 instruction, from s1 , constant c, store result in s1 using s3, s4 and s5 as scratch
@@ -499,8 +493,6 @@ void emit_sar16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
     SRLI(s1, s1, c);
     ZEXTH(s1, s1);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -510,6 +502,7 @@ void emit_sar16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 
@@ -542,8 +535,6 @@ void emit_shl16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     }
     SRLI(s1, s1, 48);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -562,6 +553,7 @@ void emit_shl16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SHR16 instruction, from s1 , shift s2 (!0 and and'd already), store result in s1 using s3 and s4 as scratch
@@ -598,8 +590,6 @@ void emit_shr16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     SRL(s1, s1, s2);
     ZEXTH(s1, s1);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     // SF should be unset
     IFX (X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
@@ -610,6 +600,7 @@ void emit_shr16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SAR16 instruction, from s1 , shift s2 (!0 and and'd already), store result in s1 using s3, s4 and s5 as scratch
@@ -642,8 +633,6 @@ void emit_sar16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     SRL(s1, s1, s2);
     ZEXTH(s1, s1);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -653,6 +642,7 @@ void emit_sar16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SHL32 instruction, from s1 , shift s2, store result in s1 using s3, s4 and s5 as scratch
@@ -684,8 +674,6 @@ void emit_shl32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         SLLW(s1, s1, s2);
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_SF) {
         SET_FLAGS_LTZ(s1, F_SF, s3, s4);
     }
@@ -710,6 +698,7 @@ void emit_shl32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 // emit SHL32 instruction, from s1 , constant c, store result in s1 using s3, s4 and s5 as scratch
 void emit_shl32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4, int s5)
@@ -743,8 +732,6 @@ void emit_shl32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
         SLLIW(s1, s1, c);
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_SF) {
         SET_FLAGS_LTZ(s1, F_SF, s3, s4);
     }
@@ -769,6 +756,7 @@ void emit_shl32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SHR32 instruction, from s1 , shift s2 (!0 and and'd already), store result in s1 using s3 and s4 as scratch
@@ -804,8 +792,6 @@ void emit_shr32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
 
     SRL(s1, s1, s2);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_SF) {
         SET_FLAGS_LTZ(s1, F_SF, s3, s4);
     }
@@ -821,6 +807,7 @@ void emit_shr32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SHR32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
@@ -870,8 +857,6 @@ void emit_shr32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
         SRLIW(s1, s1, c);
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_SF) {
         SET_FLAGS_LTZ(s1, F_SF, s3, s4);
     }
@@ -887,6 +872,7 @@ void emit_shr32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SAR32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
@@ -929,8 +915,6 @@ void emit_sar32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
         SRAIW(s1, s1, c);
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     // SRAIW sign-extends, so test sign bit before clearing upper bits
     IFX (X_SF) {
         SET_FLAGS_LTZ(s1, F_SF, s3, s4);
@@ -947,6 +931,7 @@ void emit_sar32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ROL32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -979,7 +964,6 @@ void emit_rol32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         SRLxw(s1, s1, s4);
         OR(s1, s3, s1);
     }
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
@@ -1001,6 +985,7 @@ void emit_rol32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         OR(xFlags, xFlags, s3);
         MARK;
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ROR32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -1033,7 +1018,6 @@ void emit_ror32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         SLLxw(s1, s1, s4);
         OR(s1, s3, s1);
     }
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
@@ -1057,6 +1041,7 @@ void emit_ror32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
         OR(xFlags, xFlags, s3);
         MARK;
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ROL16 instruction, from s1, constant c, store result in s1 using s3 and s4 as scratch
@@ -1073,8 +1058,6 @@ void emit_rol16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
         ZEXTH(s1, s1);
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_CF | X_OF) {
         ANDI(xFlags, xFlags, ~(1UL << F_CF | 1UL << F_OF2));
         ANDI(s4, s1, 1 << F_CF);
@@ -1088,6 +1071,7 @@ void emit_rol16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
             OR(xFlags, xFlags, s3);
         }
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ROL32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
@@ -1121,8 +1105,6 @@ void emit_rol32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
 
     if (!rex.w) ZEROUP(s1);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -1139,6 +1121,7 @@ void emit_rol32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
             OR(xFlags, xFlags, s3);
         }
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ROR16 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
@@ -1155,8 +1138,6 @@ void emit_ror16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
         ZEXTH(s1, s1);
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_CF | X_OF) {
         ANDI(xFlags, xFlags, ~(1UL << F_CF | 1UL << F_OF2));
     }
@@ -1176,6 +1157,7 @@ void emit_ror16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
             OR(xFlags, xFlags, s3);
         }
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit ROR32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
@@ -1209,8 +1191,6 @@ void emit_ror32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
 
     if (!rex.w) ZEROUP(s1);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -1229,6 +1209,7 @@ void emit_ror32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
             OR(xFlags, xFlags, s3);
         }
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit SHRD32 instruction, from s1, fill s2 , constant c, store result in s1 using s3 and s4 as scratch
@@ -1273,8 +1254,6 @@ void emit_shrd32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, uin
     SLLIxw(s1, s2, (rex.w ? 64 : 32) - c);
     OR(s1, s1, s3);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     if (!rex.w) {
         ZEROUP(s1);
     }
@@ -1300,6 +1279,7 @@ void emit_shrd32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, uin
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 void emit_shrd16c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, uint32_t c, int s3, int s4, int s5)
@@ -1352,7 +1332,6 @@ void emit_shrd16c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, uin
         OR(s1, s1, s5);
     }
     ZEXTH(s1, s1);
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
@@ -1377,6 +1356,7 @@ void emit_shrd16c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, uin
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 void emit_shld32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, uint32_t c, int s3, int s4, int s5)
@@ -1423,7 +1403,6 @@ void emit_shld32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, uin
     if (!rex.w) {
         ZEROUP(s1);
     }
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
@@ -1447,6 +1426,7 @@ void emit_shld32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, uin
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 
@@ -1477,8 +1457,6 @@ void emit_shrd32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int
     SLLxw(s4, s2, s4);
     OR(s1, s4, s3);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_PEND) {
         SDxw(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -1504,6 +1482,7 @@ void emit_shrd32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 void emit_shld32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s5, int s3, int s4, int s6)
@@ -1539,7 +1518,6 @@ void emit_shld32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int
     if (!rex.w) {
         ZEROUP(s1);
     }
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_OF) {
         ADDI(s5, s5, -1);
@@ -1560,6 +1538,7 @@ void emit_shld32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 void emit_shld16c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, uint32_t c, int s3, int s4, int s5)
@@ -1611,7 +1590,6 @@ void emit_shld16c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, uin
         OR(s1, s1, s3);
     }
     ZEXTH(s1, s1);
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
 
     IFX (X_PEND) {
         SH(s1, xEmu, offsetof(x64emu_t, res));
@@ -1636,6 +1614,7 @@ void emit_shld16c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, uin
     IFX (X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit RCL16 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
@@ -1657,8 +1636,6 @@ void emit_rcl16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
     SRLI(s1, s1, 17 - c);
     OR(s1, s1, s3);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_CF | X_OF) {
         ANDI(xFlags, xFlags, ~(1UL << F_CF | 1UL << F_OF2));
         SRLI(s4, s4, 63);
@@ -1673,6 +1650,7 @@ void emit_rcl16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
             OR(xFlags, xFlags, s3);
         }
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit RCR16 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
@@ -1699,8 +1677,6 @@ void emit_rcr16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
     SLLI(s4, s1, 47);
     ZEXTH(s1, s1);
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_CF | X_OF) {
         ANDI(xFlags, xFlags, ~(1UL << F_CF | 1UL << F_OF2));
     }
@@ -1720,4 +1696,5 @@ void emit_rcr16c(dynarec_rv64_t* dyn, int ninst, int s1, uint32_t c, int s3, int
             OR(xFlags, xFlags, s3);
         }
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
diff --git a/src/dynarec/rv64/dynarec_rv64_emit_tests.c b/src/dynarec/rv64/dynarec_rv64_emit_tests.c
index acdb9275..23404365 100644
--- a/src/dynarec/rv64/dynarec_rv64_emit_tests.c
+++ b/src/dynarec/rv64/dynarec_rv64_emit_tests.c
@@ -65,9 +65,9 @@ void emit_cmp8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
             SRAI(s3, s3, 56);
             SLLI(s4, s2, 56);
             SRAI(s4, s4, 56);
-            NAT_FLAGS_OPS(s3, s4);
+            NAT_FLAGS_OPS(s3, s4, s5, xZR);
         } else {
-            NAT_FLAGS_OPS(s1, s2);
+            NAT_FLAGS_OPS(s1, s2, s3, xZR);
         }
     }
 }
@@ -101,9 +101,9 @@ void emit_cmp8_0(dynarec_rv64_t* dyn, int ninst, int s1, int s3, int s4)
         if (dyn->insts[ninst].nat_flags_needsign) {
             SLLI(s3, s1, 56);
             SRAI(s3, s3, 56);
-            NAT_FLAGS_OPS(s3, xZR);
+            NAT_FLAGS_OPS(s3, xZR, s4, xZR);
         } else {
-            NAT_FLAGS_OPS(s1, xZR);
+            NAT_FLAGS_OPS(s1, xZR, s3, xZR);
         }
     }
 }
@@ -154,9 +154,9 @@ void emit_cmp16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
             SRAI(s3, s3, 48);
             SLLI(s4, s2, 48);
             SRAI(s4, s4, 48);
-            NAT_FLAGS_OPS(s3, s4);
+            NAT_FLAGS_OPS(s3, s4, s5, xZR);
         } else {
-            NAT_FLAGS_OPS(s1, s2);
+            NAT_FLAGS_OPS(s1, s2, s3, xZR);
         }
     }
 }
@@ -190,9 +190,9 @@ void emit_cmp16_0(dynarec_rv64_t* dyn, int ninst, int s1, int s3, int s4)
         if (dyn->insts[ninst].nat_flags_needsign) {
             SLLI(s3, s1, 48);
             SRAI(s3, s3, 48);
-            NAT_FLAGS_OPS(s3, xZR);
+            NAT_FLAGS_OPS(s3, xZR, s4, xZR);
         } else {
-            NAT_FLAGS_OPS(s1, xZR);
+            NAT_FLAGS_OPS(s1, xZR, s3, xZR);
         }
     }
 }
@@ -237,9 +237,9 @@ void emit_cmp32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
     NAT_FLAGS_ENABLE_CARRY();
     NAT_FLAGS_ENABLE_SIGN();
     if (dyn->insts[ninst].nat_flags_fusion) {
-        if (rex.w)
-            NAT_FLAGS_OPS(s1, s2);
-        else {
+        if (rex.w) {
+            NAT_FLAGS_OPS(s1, s2, s3, s4);
+        } else {
             if (dyn->insts[ninst].nat_flags_needsign) {
                 SEXT_W(s3, s1);
                 SEXT_W(s4, s2);
@@ -247,7 +247,7 @@ void emit_cmp32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
                 ZEXTW2(s3, s1);
                 ZEXTW2(s4, s2);
             }
-            NAT_FLAGS_OPS(s3, s4);
+            NAT_FLAGS_OPS(s3, s4, s5, xZR);
         }
     }
 }
@@ -290,16 +290,16 @@ void emit_cmp32_0(dynarec_rv64_t* dyn, int ninst, rex_t rex, uint8_t nextop, int
     NAT_FLAGS_ENABLE_SIGN();
     if (dyn->insts[ninst].nat_flags_fusion) {
         if (rex.w)
-            NAT_FLAGS_OPS(s1, xZR);
+            NAT_FLAGS_OPS(s1, xZR, s3, xZR);
         else {
             if (dyn->insts[ninst].nat_flags_needsign) {
                 SEXT_W(s3, s1);
-                NAT_FLAGS_OPS(s3, xZR);
+                NAT_FLAGS_OPS(s3, xZR, s4, xZR);
             } else if (res == s5) { // zero-up'd case
-                NAT_FLAGS_OPS(s5, xZR);
+                NAT_FLAGS_OPS(s5, xZR, s4, xZR);
             } else {
                 ZEXTW2(s3, s1);
-                NAT_FLAGS_OPS(s3, xZR);
+                NAT_FLAGS_OPS(s3, xZR, s4, xZR);
             }
         }
     }
@@ -321,8 +321,6 @@ void emit_test8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
         SD(s1, xEmu, offsetof(x64emu_t, res));
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR);
-
     IFX (X_SF) {
         SRLI(s4, s1, 7);
         SET_FLAGS_NEZ(s4, F_SF, s5);
@@ -339,6 +337,7 @@ void emit_test8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
         SLLI(s1, s1, 56);
         SRAI(s1, s1, 56);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s1, xZR, s3, xZR);
 }
 
 // emit TEST8 instruction, from test s1, c, using s3, s4 and s5 as scratch
@@ -357,8 +356,6 @@ void emit_test8c(dynarec_rv64_t* dyn, int ninst, int s1, uint8_t c, int s3, int
         SD(s3, xEmu, offsetof(x64emu_t, res));
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s3, xZR);
-
     IFX (X_SF) {
         SRLI(s4, s3, 7);
         SET_FLAGS_NEZ(s4, F_SF, s5);
@@ -375,6 +372,7 @@ void emit_test8c(dynarec_rv64_t* dyn, int ninst, int s1, uint8_t c, int s3, int
         SLLI(s3, s3, 56);
         SRAI(s3, s3, 56);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s3, xZR, xZR, xZR);
 }
 
 // emit TEST16 instruction, from test s1, s2, using s3, s4 and s5 as scratch
@@ -393,8 +391,6 @@ void emit_test16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
         SH(s3, xEmu, offsetof(x64emu_t, res));
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s3, xZR);
-
     IFX (X_SF) {
         SRLI(s4, s3, 15);
         SET_FLAGS_NEZ(s4, F_SF, s5);
@@ -411,6 +407,7 @@ void emit_test16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4,
         SLLI(s3, s3, 48);
         SRAI(s3, s3, 48);
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s3, xZR, xZR, xZR);
 }
 
 // emit TEST32 instruction, from test s1, s2, using s3 and s4 as scratch
@@ -433,8 +430,6 @@ void emit_test32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int
         if (!rex.w) ZEROUP(s3);
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s3, xZR);
-
     IFX (X_SF) {
         SRLI(s4, s3, rex.w ? 63 : 31);
         SET_FLAGS_NEZ(s4, F_SF, s5);
@@ -456,6 +451,7 @@ void emit_test32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int
                 ZEROUP(s3);
         }
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s3, xZR, xZR, xZR);
 }
 
 // emit TEST32 instruction, from test s1, s2, using s3 and s4 as scratch
@@ -483,8 +479,6 @@ void emit_test32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c,
         SDxw(s3, xEmu, offsetof(x64emu_t, res));
     }
 
-    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s3, xZR);
-
     IFX (X_SF) {
         SRLI(s4, s3, rex.w ? 63 : 31);
         SET_FLAGS_NEZ(s4, F_SF, s5);
@@ -506,4 +500,5 @@ void emit_test32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c,
                 ZEROUP(s3);
         }
     }
+    if (dyn->insts[ninst].nat_flags_fusion) NAT_FLAGS_OPS(s3, xZR, xZR, xZR);
 }
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c
index f6f9a683..b5149486 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.c
+++ b/src/dynarec/rv64/dynarec_rv64_functions.c
@@ -705,7 +705,7 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
     if (!dyn->need_dump && !BOX64ENV(dynarec_gdbjit) && !BOX64ENV(dynarec_perf_map)) return;
 
     static char buf[256];
-    int length = sprintf(buf, "barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X, fuse=%d, sm=%d(%d/%d), sew@entry=%d, sew@exit=%d",
+    int length = sprintf(buf, "barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X, fuse=%d/%d, sm=%d(%d/%d), sew@entry=%d, sew@exit=%d",
         dyn->insts[ninst].x64.barrier,
         dyn->insts[ninst].x64.state_flags,
         dyn->f.pending,
@@ -717,6 +717,7 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
         dyn->insts[ninst].x64.need_before,
         dyn->insts[ninst].x64.need_after,
         dyn->insts[ninst].nat_flags_fusion,
+        dyn->insts[ninst].no_scratch_usage,
         dyn->smwrite, dyn->insts[ninst].will_write, dyn->insts[ninst].last_write,
         dyn->insts[ninst].vector_sew_entry, dyn->insts[ninst].vector_sew_exit);
     if (dyn->insts[ninst].pred_sz) {
@@ -858,15 +859,27 @@ void updateNativeFlags(dynarec_rv64_t* dyn)
         return;
     for (int i = 1; i < dyn->size; ++i)
         if (dyn->insts[i].nat_flags_fusion) {
-            if (dyn->insts[i].pred_sz == 1 && dyn->insts[i].pred[0] == i - 1
-                && (dyn->insts[i].x64.use_flags & dyn->insts[i - 1].x64.set_flags) == dyn->insts[i].x64.use_flags) {
-                dyn->insts[i - 1].nat_flags_fusion = 1;
-                if (dyn->insts[i].x64.use_flags & X_SF) {
-                    dyn->insts[i - 1].nat_flags_needsign = 1;
+            int j = i - 1;
+            int found = 0;
+            if (dyn->insts[i].pred_sz == 1 && dyn->insts[i].pred[0] == j) {
+                while (j >= 0) {
+                    if (dyn->insts[j].x64.set_flags && (dyn->insts[i].x64.use_flags & dyn->insts[j].x64.set_flags) == dyn->insts[i].x64.use_flags) {
+                        dyn->insts[j].nat_flags_fusion = 1;
+                        if (dyn->insts[i].x64.use_flags & X_SF) {
+                            dyn->insts[j].nat_flags_needsign = 1;
+                        }
+                        dyn->insts[i].x64.use_flags = 0;
+                        dyn->insts[j].nat_next_inst = i;
+                        found = 1;
+                        break;
+                    } else if (j && dyn->insts[j].pred_sz == 1 && dyn->insts[j].pred[0] == j - 1
+                        && dyn->insts[j].no_scratch_usage && !dyn->insts[j].x64.set_flags && !dyn->insts[j].x64.use_flags) {
+                        j -= 1;
+                    } else
+                        break;
                 }
-                dyn->insts[i].x64.use_flags = 0;
-            } else
-                dyn->insts[i].nat_flags_fusion = 0;
+            }
+            if (!found) dyn->insts[i].nat_flags_fusion = 0;
         }
 }
 
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c
index 73b0ea73..ce95d9e4 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.c
+++ b/src/dynarec/rv64/dynarec_rv64_helper.c
@@ -62,9 +62,11 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
                     if (tmp && ((tmp < -2048) || (tmp > maxval) || !i12)) {
                         MOV64x(scratch, tmp);
                         ADDSL(ret, scratch, TO_NAT(sib_reg), sib >> 6, ret);
+                        SCRATCH_USAGE(1);
                     } else {
                         if (sib >> 6) {
                             SLLI(ret, TO_NAT(sib_reg), (sib >> 6));
+                            SCRATCH_USAGE(1);
                         } else
                             ret = TO_NAT(sib_reg);
                         *fixaddress = tmp;
@@ -77,10 +79,12 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
                             break;
                     }
                     MOV64x(ret, tmp);
+                    SCRATCH_USAGE(1);
                 }
             } else {
                 if (sib_reg != 4) {
                     ADDSL(ret, TO_NAT(sib_reg2), TO_NAT(sib_reg), sib >> 6, scratch);
+                    SCRATCH_USAGE(1);
                 } else {
                     ret = TO_NAT(sib_reg2);
                 }
@@ -95,13 +99,17 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
                 GETIP(addr + delta, scratch);
                 ret = xRIP;
                 *fixaddress = tmp;
+                SCRATCH_USAGE(1);
             } else if (adj && (tmp + adj >= -2048) && (tmp + adj <= maxval)) {
                 ADDI(ret, xRIP, tmp + adj);
+                SCRATCH_USAGE(1);
             } else if ((tmp >= -2048) && (tmp <= maxval)) {
                 GETIP(addr + delta, scratch);
                 ADDI(ret, xRIP, tmp);
+                SCRATCH_USAGE(1);
             } else if (tmp + addr + delta < 0x100000000LL) {
                 MOV64x(ret, tmp + addr + delta);
+                SCRATCH_USAGE(1);
             } else {
                 if (adj) {
                     MOV64x(ret, tmp + adj);
@@ -110,6 +118,7 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
                     GETIP(addr + delta, scratch);
                 }
                 ADD(ret, ret, xRIP);
+                SCRATCH_USAGE(1);
             }
             switch (lock) {
                 case 1: addLockAddress(addr + delta + tmp); break;
@@ -138,6 +147,7 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
             if ((nextop & 7) == 4) {
                 if (sib_reg != 4) {
                     ADDSL(ret, TO_NAT(sib_reg2), TO_NAT(sib_reg), sib >> 6, scratch);
+                    SCRATCH_USAGE(1);
                 } else {
                     ret = TO_NAT(sib_reg2);
                 }
@@ -154,19 +164,23 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
                 } else
                     scratch = TO_NAT((nextop & 0x07) + (rex.b << 3));
                 ADDI(ret, scratch, i64);
+                SCRATCH_USAGE(1);
             } else {
                 MOV64x(scratch, i64);
                 if ((nextop & 7) == 4) {
                     if (sib_reg != 4) {
                         ADD(scratch, scratch, TO_NAT(sib_reg2));
                         ADDSL(ret, scratch, TO_NAT(sib_reg), sib >> 6, ret);
+                        SCRATCH_USAGE(1);
                     } else {
                         PASS3(int tmp = TO_NAT(sib_reg2));
                         ADD(ret, tmp, scratch);
+                        SCRATCH_USAGE(1);
                     }
                 } else {
                     PASS3(int tmp = TO_NAT((nextop & 0x07) + (rex.b << 3)));
                     ADD(ret, tmp, scratch);
+                    SCRATCH_USAGE(1);
                 }
             }
         }
@@ -208,10 +222,12 @@ static uintptr_t geted_32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_
                             ADDW(ret, TO_NAT(sib_reg), scratch);
                         }
                         ZEROUP(ret);
+                        SCRATCH_USAGE(1);
                     } else {
-                        if (sib >> 6)
+                        if (sib >> 6) {
                             SLLI(ret, TO_NAT(sib_reg), (sib >> 6));
-                        else
+                            SCRATCH_USAGE(1);
+                        } else
                             ret = TO_NAT(sib_reg);
                         *fixaddress = tmp;
                     }
@@ -223,6 +239,7 @@ static uintptr_t geted_32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_
                             break;
                     }
                     MOV32w(ret, tmp);
+                    SCRATCH_USAGE(1);
                 }
             } else {
                 if (sib_reg != 4) {
@@ -233,6 +250,7 @@ static uintptr_t geted_32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_
                         ADDW(ret, TO_NAT(sib_reg2), TO_NAT(sib_reg));
                     }
                     ZEROUP(ret);
+                    SCRATCH_USAGE(1);
                 } else {
                     ret = TO_NAT(sib_reg2);
                 }
@@ -240,6 +258,7 @@ static uintptr_t geted_32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_
         } else if ((nextop & 7) == 5) {
             uint32_t tmp = F32;
             MOV32w(ret, tmp);
+            SCRATCH_USAGE(1);
             switch (lock) {
                 case 1: addLockAddress(tmp); break;
                 case 2:
@@ -276,6 +295,7 @@ static uintptr_t geted_32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_
                         ADDW(ret, TO_NAT(sib_reg2), TO_NAT(sib_reg));
                     }
                     ZEROUP(ret);
+                    SCRATCH_USAGE(1);
                 } else {
                     ret = TO_NAT(sib_reg2);
                 }
@@ -298,6 +318,7 @@ static uintptr_t geted_32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_
                     scratch = TO_NAT(nextop & 0x07);
                 ADDIW(ret, scratch, i32);
                 ZEROUP(ret);
+                SCRATCH_USAGE(1);
             } else {
                 // no need to zero up, as we did it below
                 rv64_move32(dyn, ninst, scratch, i32, 0);
@@ -318,6 +339,7 @@ static uintptr_t geted_32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_
                     ADDW(ret, tmp, scratch);
                 }
                 ZEROUP(ret);
+                SCRATCH_USAGE(1);
             }
         }
     }
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 5eb8ac69..37e8518c 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -1066,10 +1066,18 @@
     READFLAGS(A)
 #endif
 
-#define NAT_FLAGS_OPS(op1, op2)                    \
-    do {                                           \
-        dyn->insts[ninst + 1].nat_flags_op1 = op1; \
-        dyn->insts[ninst + 1].nat_flags_op2 = op2; \
+#define NAT_FLAGS_OPS(op1, op2, s1, s2) /* s1/s2 receive GPR copies */      \
+    do { /* store op1/op2 on the inst indexed by nat_next_inst */           \
+        dyn->insts[dyn->insts[ninst].nat_next_inst].nat_flags_op1 = op1;    \
+        dyn->insts[dyn->insts[ninst].nat_next_inst].nat_flags_op2 = op2;    \
+        if (dyn->insts[ninst + 1].no_scratch_usage && IS_GPR(op1)) {        \
+            MV(s1, op1); /* op1 is a mapped x86 reg: record a copy */       \
+            dyn->insts[dyn->insts[ninst].nat_next_inst].nat_flags_op1 = s1; \
+        } /* NOTE(review): presumably op1 may change before use */          \
+        if (dyn->insts[ninst + 1].no_scratch_usage && IS_GPR(op2)) {        \
+            MV(s2, op2); /* op2 is a mapped x86 reg: record a copy */       \
+            dyn->insts[dyn->insts[ninst].nat_next_inst].nat_flags_op2 = s2; \
+        }                                                                   \
     } while (0)
 
 #define NAT_FLAGS_ENABLE_CARRY() dyn->insts[ninst].nat_flags_carry = 1
@@ -1951,7 +1959,7 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
         }                                                                            \
         if (dyn->insts[ninst].nat_flags_fusion) {                                    \
             ANDI(s1, dst, 0xff);                                                     \
-            NAT_FLAGS_OPS(s1, xZR);                                                  \
+            NAT_FLAGS_OPS(s1, xZR, xZR, xZR);                                        \
         }                                                                            \
         break;                                                                       \
     }
@@ -1974,7 +1982,7 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
         }                                                                            \
         if (dyn->insts[ninst].nat_flags_fusion) {                                    \
             ZEXTH(s1, dst);                                                          \
-            NAT_FLAGS_OPS(s1, xZR);                                                  \
+            NAT_FLAGS_OPS(s1, xZR, xZR, xZR);                                        \
         }                                                                            \
         break;                                                                       \
     }
@@ -1994,4 +2002,9 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
 #define VECTOR_LOAD_VMASK(mask, s1, multiple) \
     vector_loadmask(dyn, ninst, VMASK, mask, s1, multiple)
 
+#define SCRATCH_USAGE(usage) /* usage != 0: opcode uses scratch registers */ \
+    do {                                                                     \
+        dyn->insts[ninst].no_scratch_usage = !(usage); /* stored inverted */ \
+    } while (0)
+
 #endif //__DYNAREC_RV64_HELPER_H__
diff --git a/src/dynarec/rv64/dynarec_rv64_pass0.h b/src/dynarec/rv64/dynarec_rv64_pass0.h
index 416e8bb0..a5cefa8c 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass0.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass0.h
@@ -14,15 +14,20 @@
     dyn->f.dfnone = 1;                   \
     dyn->f.pending = SF_SET
 
-#define READFLAGS_FUSION(A, s1, s2, s3, s4, s5)                                                                 \
-    if (BOX64ENV(dynarec_nativeflags) && ninst > 0 && !dyn->insts[ninst - 1].nat_flags_nofusion) {              \
-        if ((A) == (X_ZF))                                                                                      \
-            dyn->insts[ninst].nat_flags_fusion = 1;                                                             \
-        else if (dyn->insts[ninst - 1].nat_flags_carry && ((A) == (X_CF) || (A) == (X_CF | X_ZF)))              \
-            dyn->insts[ninst].nat_flags_fusion = 1;                                                             \
-        else if (dyn->insts[ninst - 1].nat_flags_sign && ((A) == (X_SF | X_OF) || (A) == (X_SF | X_OF | X_ZF))) \
-            dyn->insts[ninst].nat_flags_fusion = 1;                                                             \
-    }                                                                                                           \
+#define READFLAGS_FUSION(A, s1, s2, s3, s4, s5) /* pass0: flag ninst as a native-flags fusion candidate */     \
+    if (BOX64ENV(dynarec_nativeflags) && ninst > 0) {                                                          \
+        int prev = ninst - 1;                                                                                  \
+        while (prev && dyn->insts[prev].no_scratch_usage) /* walk back over scratch-free insts; stop at 0 */   \
+            prev -= 1;                                                                                         \
+        if (!dyn->insts[prev].nat_flags_nofusion) {                                                            \
+            if ((A) == (X_ZF)) /* ZF-only read: no carry/sign requirement on prev */                           \
+                dyn->insts[ninst].nat_flags_fusion = 1;                                                        \
+            else if (dyn->insts[prev].nat_flags_carry && ((A) == (X_CF) || (A) == (X_CF | X_ZF)))              \
+                dyn->insts[ninst].nat_flags_fusion = 1;                                                        \
+            else if (dyn->insts[prev].nat_flags_sign && ((A) == (X_SF | X_OF) || (A) == (X_SF | X_OF | X_ZF))) \
+                dyn->insts[ninst].nat_flags_fusion = 1;                                                        \
+        }                                                                                                      \
+    }                                                                                                          \
     READFLAGS(A);
 
 #define SETFLAGS(A, B, FUSION)                                           \
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index 16ea574f..e32aa69e 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -138,8 +138,10 @@ typedef struct instruction_rv64_s {
     uint8_t             nat_flags_needsign:1;
     uint8_t             unaligned:1;    // this opcode can be re-generated for unaligned special case
     uint8_t             x87precision:1; // this opcode can handle x87pc
+    uint8_t             no_scratch_usage:1; // this opcode does not use scratch register
     uint8_t             nat_flags_op1;
     uint8_t             nat_flags_op2;
+    uint16_t            nat_next_inst;
     flagcache_t         f_exit;     // flags status at end of instruction
     extcache_t          e;          // extcache at end of instruction (but before poping)
     flagcache_t         f_entry;    // flags status before the instruction begin
diff --git a/src/dynarec/rv64/rv64_mapping.h b/src/dynarec/rv64/rv64_mapping.h
index 53e71f3c..ac7ec914 100644
--- a/src/dynarec/rv64/rv64_mapping.h
+++ b/src/dynarec/rv64/rv64_mapping.h
@@ -65,6 +65,7 @@ x31     t6     x6           Temporary                       Scratch
 
 // convert a x86 register to native according to the register mapping
 #define TO_NAT(A) (((uint8_t[]) { 16, 13, 12, 24, 9, 8, 11, 10, 14, 15, 26, 27, 18, 19, 20, 21 })[(A)])
+#define IS_GPR(A) (((uint8_t[]) { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0 })[(A)]) // 1 if native register A (0..31) is one of the TO_NAT targets, else 0
 
 #define x1   6
 #define x2   7