about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Yang Liu <liuyang22@iscas.ac.cn> 2024-11-19 16:39:57 +0800
committer GitHub <noreply@github.com> 2024-11-19 09:39:57 +0100
commit 127d273ada8bd26c05a4778db5e0f10b9e627621 (patch)
tree 32e6e4f157ca5fba1d80690fabcb44c127d39539 /src
parent 469d4f81eb1ec9bb6b5919de15e266f1bbc9a388 (diff)
download box64-127d273ada8bd26c05a4778db5e0f10b9e627621.tar.gz
box64-127d273ada8bd26c05a4778db5e0f10b9e627621.zip
[DYNAREC] Reworked strong memory emulation (#2043)
* [ARM64_DYNAREC] Reworked strong memory emulation

* Simplify

* [RV64,LA64_DYNAREC] Reworked strong memory emulation

* forgot this

* more tweaks
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_00.c 20
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_66.c 5
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_660f.c 1
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_67.c 10
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c 2
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c 1
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c 1
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_f20f.c 1
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.h 297
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_private.h 1
-rw-r--r-- src/dynarec/dynarec_native_pass.c 7
-rw-r--r-- src/dynarec/la64/dynarec_la64_helper.h 219
-rw-r--r-- src/dynarec/la64/dynarec_la64_private.h 1
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.h 200
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_private.h 3
-rw-r--r-- src/dynarec/rv64/rv64_emitter.h 2
16 files changed, 571 insertions, 200 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c
index 807c93cd..facfd3df 100644
--- a/src/dynarec/arm64/dynarec_arm64_00.c
+++ b/src/dynarec/arm64/dynarec_arm64_00.c
@@ -646,7 +646,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 SKIPTEST(x1);
                 dyn->doublepush = 0;
             } else {
-                WILLWRITE();
                 gd = xRAX+(opcode&0x07)+(rex.b<<3);
                 u32 = PK(0);
                 i32 = 1;
@@ -730,7 +729,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
         case 0x60:
             if(rex.is32bits) {
                 INST_NAME("PUSHAD");
-                WILLWRITE();
                 MOVw_REG(x1, xRSP);
                 PUSH2_32(xRAX, xRCX);
                 PUSH2_32(xRDX, xRBX);
@@ -814,7 +812,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 LDRSW_U12(x1, x3, 0);
                 PUSH1z(x1);
             } else {
-                WILLWRITE();
                 MOV64z(x3, i64);
                 PUSH1z(x3);
                 SMWRITE();
@@ -883,7 +880,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("PUSH Ib");
             i64 = F8S;
             MOV64z(x3, i64);
-            WILLWRITE();
             PUSH1z(x3);
             SMWRITE();
             break;
@@ -1292,7 +1288,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 BFIx(eb1, gd, eb2*8, 8);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff, 0, rex, &lock, 0, 0);
-                WILLWRITELOCK(lock);
                 STB(gd, ed, fixedaddress);
                 SMWRITELOCK(lock);
             }
@@ -1304,8 +1299,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             if(MODREG) {   // reg <= reg
                 MOVxw_REG(xRAX+(nextop&7)+(rex.b<<3), gd);
             } else {                    // mem <= reg
-                addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<(2+rex.w), (1<<(2+rex.w))-1, rex, &lock, 0, 0);
-                WILLWRITELOCK(lock);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff << (2 + rex.w), (1 << (2 + rex.w)) - 1, rex, &lock, 0, 0);
                 STxw(gd, ed, fixedaddress);
                 SMWRITELOCK(lock);
             }
@@ -1521,8 +1515,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             else
                 u64 = F64;
             MOV64z(x1, u64);
-            lock=isLockAddress(u64);
-            WILLWRITELOCK(lock);
+            lock = isLockAddress(u64);
             STRB_U12(xRAX, x1, 0);
             SMWRITELOCK(lock);
             break;
@@ -1533,8 +1526,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             else
                 u64 = F64;
             MOV64z(x1, u64);
-            lock=isLockAddress(u64);
-            WILLWRITELOCK(lock);
+            lock = isLockAddress(u64);
             STRxw_U12(xRAX, x1, 0);
             SMWRITELOCK(lock);
             break;
@@ -1693,7 +1685,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             emit_test32c(dyn, ninst, rex, xRAX, i64, x3, x4, x5);
             break;
         case 0xAA:
-            WILLWRITE();
             if(rep) {
                 INST_NAME("REP STOSB");
                 CBZx_NEXT(xRCX);
@@ -1718,7 +1709,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             SMWRITE();
             break;
         case 0xAB:
-            WILLWRITE();
             if(rep) {
                 INST_NAME("REP STOSD");
                 CBZx_NEXT(xRCX);
@@ -2248,7 +2238,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     ed = x3;
                 } else
                     ed = xZR;
-                WILLWRITELOCK(lock);
                 STB(ed, wback, fixedaddress);
                 SMWRITELOCK(lock);
             }
@@ -2268,7 +2257,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     ed = x3;
                 } else
                     ed = xZR;
-                WILLWRITELOCK(lock);
                 STxw(ed, wback, fixedaddress);
                 SMWRITELOCK(lock);
             }
@@ -2363,7 +2351,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         call_n(dyn, ninst, *(void**)(addr+8), tmp);
                         addr+=8+8;
                     } else {
-                        WILLWRITE2();
                         GETIP(ip+1); // read the 0xCC
                         STORE_XEMU_CALL(xRIP);
                         ADDx_U12(x1, xEmu, (uint32_t)offsetof(x64emu_t, ip)); // setup addr as &emu->ip
@@ -3037,7 +3024,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     } else {
                         TABLE64(x2, addr);
                     }
-                    WILLWRITE2();
                     PUSH1(x2);
                     MESSAGE(LOG_DUMP, "Native Call to %s (retn=%d)\n", getBridgeName((void*)(dyn->insts[ninst].natcall-1))?:GetNativeName(GetNativeFnc(dyn->insts[ninst].natcall-1)), dyn->insts[ninst].retn);
                     SKIPTEST(x1);    // disable test as this hack dos 2 instructions for 1
diff --git a/src/dynarec/arm64/dynarec_arm64_66.c b/src/dynarec/arm64/dynarec_arm64_66.c
index 946716ed..0d2b69c6 100644
--- a/src/dynarec/arm64/dynarec_arm64_66.c
+++ b/src/dynarec/arm64/dynarec_arm64_66.c
@@ -711,7 +711,6 @@ uintptr_t dynarec64_66(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
         case 0x9C:

             INST_NAME("PUSHF");

             READFLAGS(X_ALL);

-            WILLWRITE();

             PUSH1_16(xFlags);

             SMWRITE();

             break;

@@ -752,8 +751,7 @@ uintptr_t dynarec64_66(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             else

                 u64 = F64;

             MOV64z(x1, u64);

-            if(isLockAddress(u64)) lock=1; else lock = 0;

-            WILLWRITELOCK(lock);

+            lock = isLockAddress(u64);

             STRH_U12(xRAX, x1, 0);

             SMWRITELOCK(lock);

             break;

@@ -865,7 +863,6 @@ uintptr_t dynarec64_66(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             break;

 

         case 0xAB:

-            WILLWRITE();

             if(rep) {

                 INST_NAME("REP STOSW");

                 CBZx_NEXT(xRCX);

diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index 6bd25c4a..d2deeb3b 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -2869,7 +2869,6 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 v1 = sse_get_reg_empty(dyn, ninst, x1, (nextop&7) + (rex.b<<3));

                 FMOVD(v1, v0);

             } else {

-                WILLWRITE2();

                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);

                 VST64(v0, ed, fixedaddress);

                 SMWRITE2();

diff --git a/src/dynarec/arm64/dynarec_arm64_67.c b/src/dynarec/arm64/dynarec_arm64_67.c
index bce01f5c..f570f0a1 100644
--- a/src/dynarec/arm64/dynarec_arm64_67.c
+++ b/src/dynarec/arm64/dynarec_arm64_67.c
@@ -780,7 +780,6 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                                 v1 = sse_get_reg_empty(dyn, ninst, x1, (nextop&7) + (rex.b<<3));

                                 FMOVD(v1, v0);

                             } else {

-                                WILLWRITE2();

                                 addr = geted32(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);

                                 VST64(v0, ed, fixedaddress);

                                 SMWRITE2();

@@ -889,8 +888,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                             BFIx(ed, gd, 0, 16);

                         }

                     } else {

-                        addr = geted32(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, &lock, 0, 0);

-                        WILLWRITELOCK(lock);

+                        addr = geted32(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff << 1, 1, rex, &lock, 0, 0);

                         STH(gd, ed, fixedaddress);

                         SMWRITELOCK(lock);

                     }

@@ -1102,7 +1100,6 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 BFIx(eb1, gd, eb2*8, 8);

             } else {

                 addr = geted32(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff, 0, rex, &lock, 0, 0);

-                WILLWRITELOCK(lock);

                 STB(gd, ed, fixedaddress);

                 SMWRITELOCK(lock);

             }

@@ -1114,8 +1111,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             if(MODREG) {   // reg <= reg

                 MOVxw_REG(xRAX+(nextop&7)+(rex.b<<3), gd);

             } else {                    // mem <= reg

-                addr = geted32(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<(2+rex.w), (1<<(2+rex.w))-1, rex, &lock, 0, 0);

-                WILLWRITELOCK(lock);

+                addr = geted32(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff << (2 + rex.w), (1 << (2 + rex.w)) - 1, rex, &lock, 0, 0);

                 STxw(gd, ed, fixedaddress);

                 SMWRITELOCK(lock);

             }

@@ -1348,7 +1344,6 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     ed = x3;

                 } else

                     ed = xZR;

-                WILLWRITELOCK(lock);

                 STB(ed, wback, fixedaddress);

                 SMWRITELOCK(lock);

             }

@@ -1364,7 +1359,6 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 addr = geted32(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<(2+rex.w), (1<<(2+rex.w))-1, rex, &lock, 0, 4);

                 i64 = F32S;

                 MOV64xw(x3, i64);

-                WILLWRITELOCK(lock);

                 STxw(x3, ed, fixedaddress);

                 SMWRITELOCK(lock);

             }

diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index 15a938bd..2c5947cf 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -558,7 +558,6 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             if(MODREG) {
                 v1 = sse_get_reg(dyn, ninst, x3, (nextop&7)+(rex.b<<3), 1);
             } else {
-                WILLWRITE2();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
                 unscaled = 0;
                 v1 = fpu_get_scratch(dyn, ninst);
@@ -603,7 +602,6 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             if(MODREG) {
                 v1 = sse_get_reg(dyn, ninst, x3, (nextop&7)+(rex.b<<3), 1);
             } else {
-                WILLWRITE2();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
                 unscaled = 0;
                 v1 = fpu_get_scratch(dyn, ninst);
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index 69a58b58..853f3207 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -490,7 +490,6 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             if(MODREG) {
                 v1 = sse_get_reg_empty(dyn, ninst, x3, (nextop&7)+(rex.b<<3));
             } else {
-                WILLWRITE2();
                 v1 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, &unscaled, 0xfff<<(3+vex.l), vex.l?15:7, rex, NULL, 0, 1);
             }
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
index 992f4543..65dfb240 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
@@ -88,7 +88,6 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
                 if(v1!=v2) VMOVeD(v1, 1, v2, 1);
                 YMM0((nextop&7)+(rex.b<<3));
             } else {
-                WILLWRITE2();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VST64(v0, ed, fixedaddress);
                 SMWRITE2();
diff --git a/src/dynarec/arm64/dynarec_arm64_f20f.c b/src/dynarec/arm64/dynarec_arm64_f20f.c
index 8c0cb3b9..6563aea1 100644
--- a/src/dynarec/arm64/dynarec_arm64_f20f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f20f.c
@@ -71,7 +71,6 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 d0 = sse_get_reg(dyn, ninst, x1, ed, 1);

                 VMOVeD(d0, 0, v0, 0);

             } else {

-                WILLWRITE2();

                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);

                 VST64(v0, ed, fixedaddress);

                 SMWRITE2();

diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 5e297221..e76aac02 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -36,64 +36,176 @@
 #define FEMIT(A)    EMIT(A)
 #endif
 
-// Strong mem emulation helpers
-#define SMREAD_VAL  4
-#define SMWRITE2_MIN 1
-#define SMFIRST_MIN 1
-#define SMSEQ_MIN 2
-#define SMSEQ_MAX 3
+/* Box64 Strong Memory Model Emulation
+ *
+ * Definition of a SEQ:
+ * A SEQ is a sequence of opcodes that writes to guest memory, terminated by JMP, RET, CALL, etc.
+ *
+ * Memory barriers are added in the following cases to emulate the strong memory model:
+ * 1. End of a SEQ:
+ *    - Scalar operations (a1)
+ *    - SIMD operations (a2)
+ * 2. Start of a SEQ:
+ *    - Scalar operations (b1)
+ *    - SIMD operations (b2)
+ * 3. Right before the last guest memory store in a SEQ:
+ *    - Scalar operations (c1)
+ *    - SIMD operations (c2)
+ * 4. After every third guest memory store in a SEQ (d)
+ *
+ * STRONGMEM levels:
+ * LEVEL1: Includes a1, b1
+ * LEVEL2: Includes LEVEL1, plus a2, b2, c1, c2
+ * LEVEL3: Includes LEVEL2, plus d
+ */
+
+#define STRONGMEM_SIMD_WRITE 2 // The minimum level at which SIMD memory writes are tracked
+#define STRONGMEM_LAST_WRITE 2 // The minimum level at which a barrier is put before the last guest memory store
+#define STRONGMEM_SEQ_WRITE  3 // The minimum level at which a barrier is put at every third memory store
+
 #if STEP == 1
-// pass 1 has the jump point available
-#define SMWRITE()   dyn->insts[ninst].will_write = 1; dyn->smwrite = 1
+
+#define SMWRITE()                                                \
+    do {                                                         \
+        /* Mark that current sequence writes to guest memory. */ \
+        /* This will be used in SMEND for last_write. */         \
+        dyn->smwrite = 1;                                        \
+        /* Mark that current opcode writes to guest memory. */   \
+        dyn->insts[ninst].will_write = 1;                        \
+    } while (0)
+
+#define SMWRITELOCK(lock)              \
+    do {                               \
+        dyn->insts[ninst].lock = lock; \
+        SMWRITE();                     \
+    } while (0)
+
+#define SMWRITE2()                                             \
+    do {                                                       \
+        if (box64_dynarec_strongmem >= STRONGMEM_SIMD_WRITE) { \
+            dyn->smwrite = 1;                                  \
+            dyn->insts[ninst].will_write = 2;                  \
+        }                                                      \
+    } while (0)
+
 #define SMREAD()
 #define SMREADLOCK(lock)
-#define SMMIGHTREAD()
-#define WILLWRITE2()   if(box64_dynarec_strongmem>SMWRITE2_MIN) {WILLWRITE();}
-#define SMWRITE2()   if(box64_dynarec_strongmem>SMWRITE2_MIN) {SMWRITE();}
-#define SMWRITELOCK(lock)   SMWRITE()
-#define WILLWRITELOCK(lock)
 #define WILLWRITE()
-#define SMMIGHTWRITE()   if(!MODREG) {SMWRITE();}
-#define SMSTART() dyn->smwrite = 0; dyn->smread = 0;
-#define SMEND() if(dyn->smwrite && (box64_dynarec_strongmem>SMFIRST_MIN)) {int i = ninst; while(i>=0 && !dyn->insts[i].will_write) --i; if(i>=0) {dyn->insts[i].last_write = 1;}} dyn->smwrite = 0
+#define WILLWRITELOCK(lock)
+
+#define SMSTART()                                                  \
+    do {                                                           \
+        /* Clear current state at the start of a potential SEQ. */ \
+        dyn->smwrite = 0;                                          \
+    } while (0)
+
+#define SMEND()                                                                                \
+    do {                                                                                       \
+        /* If there is any guest memory write, which is a SEQ, then compute the last_write. */ \
+        if (dyn->smwrite && (box64_dynarec_strongmem >= STRONGMEM_LAST_WRITE)) {               \
+            int i = ninst;                                                                     \
+            while (i >= 0 && !dyn->insts[i].will_write)                                        \
+                --i;                                                                           \
+            if (i >= 0) { dyn->insts[i].last_write = 1; }                                      \
+        }                                                                                      \
+        dyn->smwrite = 0;                                                                      \
+    } while (0)
+
 #define SMDMB()
+
 #else
-// Sequence of Write will trigger a DMB on "last" write if strongmem is >= 1
-// Block will trigget at 1st and last if strongmem is >= SMFIRST_MIN
-// Read will contribute to trigger a DMB on "first" read if strongmem is >= SMREAD_MIN
-// Opcode will read
-#define SMREAD()    if(dyn->insts[ninst].will_write) {WILLWRITE();} else if(box64_dynarec_strongmem==SMREAD_VAL && !dyn->smread) {DSB_SY(); dyn->smread = 1;}
-// Opcode will read with option forced lock
-#define SMREADLOCK(lock)    if((lock)) {SMWRITELOCK(lock);} else {SMREAD();}
-// Opcode might read (depend on nextop)
-#define SMMIGHTREAD()   if(!MODREG) {SMREAD();}
-// Opcode has wrote
-#define SMWRITE()   if((box64_dynarec_strongmem>=SMFIRST_MIN) && dyn->smwrite==0 && (box64_dynarec_strongmem!=SMREAD_VAL)) {SMDMB();} if(box64_dynarec_strongmem>SMSEQ_MIN && (box64_dynarec_strongmem!=SMREAD_VAL)) {if(++dyn->smwrite>=SMSEQ_MAX) {SMDMB(); dyn->smwrite=1;}} else dyn->smwrite=1
-// Opcode has wrote (strongmem>1 only)
-#define WILLWRITE2()   if(box64_dynarec_strongmem>SMWRITE2_MIN) {WILLWRITE();}
-#define SMWRITE2()   if(box64_dynarec_strongmem>SMWRITE2_MIN) {SMWRITE();}
-// Opcode has wrote with option forced lock
-#define SMWRITELOCK(lock)   if(lock) {SMDMB(); dyn->smwrite=1;} else {SMWRITE();}
-// Opcode has wrote with option forced lock
-#define WILLWRITELOCK(lock)   if(lock) {DMB_ISH();} else {WILLWRITE();}
-// Opcode might have wrote (depend on nextop)
-#define SMMIGHTWRITE()   if(!MODREG) {SMWRITE();}
-// Opcode will write (without reading)
-#define WILLWRITE() if((box64_dynarec_strongmem>=SMFIRST_MIN) && dyn->smwrite==0 && (box64_dynarec_strongmem!=SMREAD_VAL)) {SMDMB();} else if(box64_dynarec_strongmem>=SMFIRST_MIN && dyn->insts[ninst].last_write && (box64_dynarec_strongmem!=SMREAD_VAL)) {SMDMB();} dyn->smwrite=1
-// Start of sequence
-#define SMSTART()   SMEND()
-// End of sequence
-#define SMEND()     if(dyn->smwrite && box64_dynarec_strongmem && (box64_dynarec_strongmem!=SMREAD_VAL)) {DMB_ISH();} dyn->smwrite=0; dyn->smread=0
-// Force a Data memory barrier (for LOCK: prefix)
-#define SMDMB()                                                  \
-    if (box64_dynarec_strongmem && !box64_dynarec_weakbarrier) { \
-        DSB_ISH();                                               \
-    } else {                                                     \
-        DMB_ISH();                                               \
-    }                                                            \
-    dyn->smwrite = 0;                                            \
-    dyn->smread = 0
 
+// An opcode writes guest memory; this needs to be put after the STORE instruction manually.
+#define SMWRITE()                                                     \
+    do {                                                              \
+        /* Put a barrier at every third memory write. */              \
+        if (box64_dynarec_strongmem >= STRONGMEM_SEQ_WRITE) {         \
+            if (++dyn->smwrite >= 3 /* Every third memory write */) { \
+                DMB_ISH();                                            \
+                dyn->smwrite = 1;                                     \
+            }                                                         \
+        } else {                                                      \
+            /* Mark that current sequence writes to guest memory. */  \
+            dyn->smwrite = 1;                                         \
+        }                                                             \
+    } while (0)
+
+// Similar to SMWRITE, but checks lock.
+#define SMWRITELOCK(lock) \
+    do {                  \
+        if (lock) {       \
+            DMB_ISH();    \
+        } else {          \
+            SMWRITE();    \
+        }                 \
+    } while (0)
+
+// Similar to SMWRITE, but for SIMD instructions.
+#define SMWRITE2()                                           \
+    do {                                                     \
+        if (box64_dynarec_strongmem >= STRONGMEM_SIMD_WRITE) \
+            SMWRITE();                                       \
+    } while (0)
+
+// An opcode reads guest memory; this needs to be put before the LOAD instruction manually.
+#define SMREAD()
+
+// Similar to SMREAD, but checks lock.
+#define SMREADLOCK(lock) \
+    do {                 \
+        if (lock) {      \
+            DMB_ISH();   \
+        } else {         \
+            SMREAD();    \
+        }                \
+    } while (0)
+
+// An opcode will write memory, this will be put before the STORE instruction automatically.
+#define WILLWRITE()                                                                                   \
+    do {                                                                                              \
+        if (box64_dynarec_strongmem >= dyn->insts[ninst].will_write && dyn->smwrite == 0) {           \
+            /* Will write but never written, this is the start of a SEQ, put a barrier. */            \
+            DMB_ISH();                                                                                \
+        } else if (box64_dynarec_strongmem >= STRONGMEM_LAST_WRITE && dyn->insts[ninst].last_write) { \
+            /* Last write, put a barrier */                                                           \
+            DMB_ISH();                                                                                \
+        }                                                                                             \
+    } while (0)
+
+// Similar to WILLWRITE, but checks lock.
+#define WILLWRITELOCK(lock) \
+    do {                    \
+        if (lock) {         \
+            DMB_ISH();      \
+        } else {            \
+            WILLWRITE();    \
+        }                   \
+    } while (0)
+
+// Used to clear the state at the start of a SEQ
+#define SMSTART()         \
+    do {                  \
+        dyn->smwrite = 0; \
+    } while (0)
+
+// Will be put at the end of the SEQ
+#define SMEND()                                             \
+    do {                                                    \
+        if (box64_dynarec_strongmem) {                      \
+            /* Check if there is any guest memory write. */ \
+            int i = ninst;                                  \
+            while (i >= 0 && !dyn->insts[i].will_write)     \
+                --i;                                        \
+            if (i >= 0) {                                   \
+                /* It's a SEQ, put a barrier here. */       \
+                DMB_ISH();                                  \
+            }                                               \
+        }                                                   \
+        dyn->smwrite = 0;                                   \
+    } while (0)
+
+// The barrier.
+#define SMDMB() DMB_ISH()
 #endif
 
 //LOCK_* define
@@ -619,36 +731,35 @@
     vy = ymm_get_reg_empty(dyn, ninst, x1, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1, -1)
 
 // Get EX as a quad, (x3 is used)
-#define GETEX_Y(a, w, D)                                                                                \
-    if(MODREG) {                                                                                        \
-        a = sse_get_reg(dyn, ninst, x3, (nextop&7)+(rex.b<<3), w);                                      \
-    } else {                                                                                            \
-        if(w) {WILLWRITE2();} else {SMREAD();}                                                          \
-        addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, D);  \
-        unscaled = 0;                                                                                   \
-        a = fpu_get_scratch(dyn, ninst);                                                                \
-        VLDR128_U12(a, ed, fixedaddress);                                                               \
+#define GETEX_Y(a, w, D)                                                                                       \
+    if (MODREG) {                                                                                              \
+        a = sse_get_reg(dyn, ninst, x3, (nextop & 7) + (rex.b << 3), w);                                       \
+    } else {                                                                                                   \
+        SMREAD();                                                                                              \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe << 4, 15, rex, NULL, 0, D); \
+        unscaled = 0;                                                                                          \
+        a = fpu_get_scratch(dyn, ninst);                                                                       \
+        VLDR128_U12(a, ed, fixedaddress);                                                                      \
     }
 // Get EX as a quad, (x3 is used)
 #define GETEX_empty_Y(a, D)                                                                             \
     if(MODREG) {                                                                                        \
         a = sse_get_reg_empty(dyn, ninst, x3, (nextop&7)+(rex.b<<3));                                   \
     } else {                                                                                            \
-        WILLWRITE2();                                                                                   \
         a = fpu_get_scratch(dyn, ninst);                                                                \
         addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, D);  \
         unscaled = 0;                                                                                   \
     }
 
 // Get EX as a quad, (x1 is used)
-#define GETEX(a, w, D)                                                                                  \
-    if(MODREG) {                                                                                        \
-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \
-    } else {                                                                                            \
-        if(w) {WILLWRITE2();} else {SMREAD();}                                                          \
-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<4, 15, rex, NULL, 0, D);  \
-        a = fpu_get_scratch(dyn, ninst);                                                                \
-        VLD128(a, ed, fixedaddress);                                                                    \
+#define GETEX(a, w, D)                                                                                              \
+    if (MODREG) {                                                                                                   \
+        a = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w);                                            \
+    } else {                                                                                                        \
+        SMREAD();                                                                                                   \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff << 4, 15, rex, NULL, 0, D); \
+        a = fpu_get_scratch(dyn, ninst);                                                                            \
+        VLD128(a, ed, fixedaddress);                                                                                \
     }
 
 // Put Back EX if it was a memory and not an emm register
@@ -660,42 +771,42 @@
 
 
 // Get Ex as a double, not a quad (warning, x1 get used)
-#define GETEXSD(a, w, D)                                                                                \
-    if(MODREG) {                                                                                        \
-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \
-    } else {                                                                                            \
-        if(w) {WILLWRITE2();} else {SMREAD();}                                                          \
-        a = fpu_get_scratch(dyn, ninst);                                                                \
-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, D);   \
-        VLD64(a, ed, fixedaddress);                                                                     \
+#define GETEXSD(a, w, D)                                                                                           \
+    if (MODREG) { /* register operand: w is only forwarded to sse_get_reg */                                       \
+        a = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w);                                           \
+    } else { /* memory operand: SMREAD() is issued unconditionally, w plays no part here */                        \
+        SMREAD();                                                                                                  \
+        a = fpu_get_scratch(dyn, ninst);                                                                           \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff << 3, 7, rex, NULL, 0, D); \
+        VLD64(a, ed, fixedaddress);                                                                                \
     }
 
 // Get Ex as 64bits, not a quad (warning, x1 get used)
 #define GETEX64(a, w, D)    GETEXSD(a, w, D)
 
 // Get Ex as a single, not a quad (warning, x1 get used)
-#define GETEXSS(a, w, D)                                                                                \
-    if(MODREG) {                                                                                        \
-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \
-    } else {                                                                                            \
-        if(w) {WILLWRITE2();} else {SMREAD();}                                                          \
-        a = fpu_get_scratch(dyn, ninst);                                                                \
-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, D);   \
-        VLD32(a, ed, fixedaddress);                                                                     \
+#define GETEXSS(a, w, D)                                                                                           \
+    if (MODREG) { /* register operand: w is only forwarded to sse_get_reg */                                       \
+        a = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w);                                           \
+    } else { /* memory operand: SMREAD() is issued unconditionally, w plays no part here */                        \
+        SMREAD();                                                                                                  \
+        a = fpu_get_scratch(dyn, ninst);                                                                           \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff << 2, 3, rex, NULL, 0, D); \
+        VLD32(a, ed, fixedaddress);                                                                                \
     }
 
 // Get Ex as 32bits, not a quad (warning, x1 get used)
 #define GETEX32(a, w, D)    GETEXSS(a, w, D)
 
 // Get Ex as 16bits, not a quad (warning, x1 get used)
-#define GETEX16(a, w, D)                                                                                \
-    if(MODREG) {                                                                                        \
-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \
-    } else {                                                                                            \
-        if(w) {WILLWRITE2();} else {SMREAD();}                                                          \
-        a = fpu_get_scratch(dyn, ninst);                                                                \
-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, D);   \
-        VLD16(a, ed, fixedaddress);                                                                     \
+#define GETEX16(a, w, D)                                                                                           \
+    if (MODREG) { /* register operand: w is only forwarded to sse_get_reg */                                       \
+        a = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w);                                           \
+    } else { /* memory operand: SMREAD() is issued unconditionally, w plays no part here */                        \
+        SMREAD();                                                                                                  \
+        a = fpu_get_scratch(dyn, ninst);                                                                           \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff << 1, 1, rex, NULL, 0, D); \
+        VLD16(a, ed, fixedaddress);                                                                                \
     }
 
 // Get GM, might use x1, x2 and x3
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index 3b8cb1f3..295574b8 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -108,6 +108,7 @@ typedef struct instruction_arm64_s {
     uint8_t             barrier_maybe;
     uint8_t             will_write;
     uint8_t             last_write;
+    uint8_t             lock;
     uint8_t             set_nat_flags;  // 0 or combinaison of native flags define
     uint8_t             use_nat_flags;  // 0 or combinaison of native flags define
     uint8_t             use_nat_flags_before;  // 0 or combinaison of native flags define
diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index dee5d496..5df1dce2 100644
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -116,6 +116,13 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
             GOTEST(x1, x2);
         }
         if(dyn->insts[ninst].pred_sz>1) {SMSTART();}
+        #if STEP > 1
+        if (dyn->insts[ninst].lock) {
+            WILLWRITELOCK(dyn->insts[ninst].lock);
+        } else if (dyn->insts[ninst].will_write) {
+            WILLWRITE();
+        }
+        #endif
         if((dyn->insts[ninst].x64.need_before&~X_PEND) && !dyn->insts[ninst].pred_sz) {
             READFLAGS(dyn->insts[ninst].x64.need_before&~X_PEND);
         }
diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h
index 21f3c2af..686a2a09 100644
--- a/src/dynarec/la64/dynarec_la64_helper.h
+++ b/src/dynarec/la64/dynarec_la64_helper.h
@@ -32,50 +32,177 @@
 #define PK64(a) *(uint64_t*)(addr + a)
 #define PKip(a) *(uint8_t*)(ip + a)
 
-// Strong mem emulation helpers
-#define SMREAD_MIN  2
-#define SMWRITE_MIN 1
-// Sequence of Read will trigger a DMB on "first" read if strongmem is >= SMREAD_MIN
-// Sequence of Write will trigger a DMB on "last" write if strongmem is >= 1
-// All Write operation that might use a lock all have a memory barrier if strongmem is >= SMWRITE_MIN
-// Opcode will read
-#define SMREAD()                                                        \
-    if ((dyn->smread == 0) && (box64_dynarec_strongmem > SMREAD_MIN)) { \
-        SMDMB();                                                        \
-    } else                                                              \
-        dyn->smread = 1
-// Opcode will read with option forced lock
+/* Box64 Strong Memory Model Emulation
+ *
+ * Definition of a SEQ:
+ * A SEQ is a sequence of opcodes that writes to guest memory, terminated by JMP, RET, CALL, etc.
+ *
+ * Memory barriers are added in the following cases to emulate the strong memory model:
+ * 1. End of a SEQ:
+ *    - Scalar operations (a1)
+ *    - SIMD operations (a2)
+ * 2. Start of a SEQ:
+ *    - Scalar operations (b1)
+ *    - SIMD operations (b2)
+ * 3. Right before the last guest memory store in a SEQ:
+ *    - Scalar operations (c1)
+ *    - SIMD operations (c2)
+ * 4. After every third guest memory store in a SEQ (d)
+ *
+ * STRONGMEM levels:
+ * LEVEL1: Includes a1, b1
+ * LEVEL2: Includes LEVEL1, plus a2, b2, c1, c2
+ * LEVEL3: Includes LEVEL2, plus d
+ */
+
+#define STRONGMEM_SIMD_WRITE 2 // Minimum strongmem level at which SIMD memory writes are tracked
+#define STRONGMEM_LAST_WRITE 2 // Minimum strongmem level at which a barrier is put before the last guest memory store
+#define STRONGMEM_SEQ_WRITE  3 // Minimum strongmem level at which a barrier is put at every third memory store
+
+#if STEP == 1
+
+#define SMWRITE() /* STEP == 1: bookkeeping only, no code emitted */ \
+    do {                                                         \
+        /* Mark that current sequence writes to guest memory. */ \
+        /* This will be used in SMEND for last_write. */         \
+        dyn->smwrite = 1;                                        \
+        /* Mark that current opcode writes to guest memory. */   \
+        dyn->insts[ninst].will_write = 1;                        \
+    } while (0)
+
+#define SMWRITELOCK(islock) /* not named `lock`: that would also macro-replace the `.lock` member below */ \
+    do {                                                                                                    \
+        dyn->insts[ninst].lock = (islock); /* STEP == 1: per-opcode lock state, read back by native_pass */ \
+        SMWRITE();                                                                                          \
+    } while (0)
+
+#define SMWRITE2() /* STEP == 1: like SMWRITE, but recorded only from STRONGMEM_SIMD_WRITE up */ \
+    do {                                                       \
+        if (box64_dynarec_strongmem >= STRONGMEM_SIMD_WRITE) { \
+            dyn->smwrite = 1;                                  \
+            dyn->insts[ninst].will_write = 2; /* 2, where SMWRITE stores 1; WILLWRITE compares it to the level */ \
+        }                                                      \
+    } while (0)
+
+#define SMREAD()            // expands to nothing when STEP == 1
+#define SMREADLOCK(lock)    // expands to nothing when STEP == 1
+#define WILLWRITE()         // expands to nothing when STEP == 1
+#define WILLWRITELOCK(lock) // expands to nothing when STEP == 1
+
+#define SMSTART()                                                  \
+    do {                                                           \
+        /* Clear the write flag at the start of a potential SEQ. */ \
+        dyn->smwrite = 0;                                          \
+    } while (0)
+
+#define SMEND() /* STEP == 1: flags the last writing opcode, emits nothing */                  \
+    do {                                                                                       \
+        /* If the sequence wrote guest memory (i.e. it is a SEQ), compute its last_write. */   \
+        if (dyn->smwrite && (box64_dynarec_strongmem >= STRONGMEM_LAST_WRITE)) {               \
+            int i = ninst; /* walk back to the nearest opcode with will_write set */           \
+            while (i >= 0 && !dyn->insts[i].will_write)                                        \
+                --i;                                                                           \
+            if (i >= 0) { dyn->insts[i].last_write = 1; }                                      \
+        }                                                                                      \
+        dyn->smwrite = 0; /* the sequence is over */                                           \
+    } while (0)
+
+#define SMDMB() // expands to nothing when STEP == 1
+
+#else
+
+// An opcode writes guest memory; this needs to be put after the STORE instruction manually.
+#define SMWRITE()                                                     \
+    do {                                                              \
+        /* Put a barrier at every third memory write. */              \
+        if (box64_dynarec_strongmem >= STRONGMEM_SEQ_WRITE) {         \
+            if (++dyn->smwrite >= 3 /* third write; NOTE(review): then every 2nd, as the count restarts at 1 */) { \
+                DBAR(0);                                              \
+                dyn->smwrite = 1; /* 1, not 0: WILLWRITE reads 0 as "nothing written yet" */ \
+            }                                                         \
+        } else {                                                      \
+            /* Mark that current sequence writes to guest memory. */  \
+            dyn->smwrite = 1;                                         \
+        }                                                             \
+    } while (0)
+
+// Similar to SMWRITE, but a non-zero lock gets an unconditional barrier instead of the SMWRITE bookkeeping.
+#define SMWRITELOCK(lock) \
+    do {                  \
+        if (lock) {       \
+            DBAR(0);      \
+        } else {          \
+            SMWRITE();    \
+        }                 \
+    } while (0)
+
+// Similar to SMWRITE, but for SIMD instructions: a no-op below STRONGMEM_SIMD_WRITE.
+#define SMWRITE2()                                           \
+    do {                                                     \
+        if (box64_dynarec_strongmem >= STRONGMEM_SIMD_WRITE) \
+            SMWRITE();                                       \
+    } while (0)
+
+// An opcode reads guest memory; this needs to be put before the LOAD instruction manually. Currently expands to nothing.
+#define SMREAD()
+
+// Similar to SMREAD, but checks lock.
 #define SMREADLOCK(lock) \
-    if ((lock) || ((dyn->smread == 0) && (box64_dynarec_strongmem > SMREAD_MIN))) { SMDMB(); }
-// Opcode might read (depend on nextop)
-#define SMMIGHTREAD() \
-    if (!MODREG) { SMREAD(); }
-// Opcode has wrote
-#define SMWRITE() dyn->smwrite = 1
-// Opcode has wrote (strongmem>1 only)
-#define SMWRITE2() \
-    if (box64_dynarec_strongmem > SMREAD_MIN) dyn->smwrite = 1
-// Opcode has wrote with option forced lock
-#define SMWRITELOCK(lock)                                  \
-    if (lock || (box64_dynarec_strongmem > SMWRITE_MIN)) { \
-        SMDMB();                                           \
-    } else                                                 \
-        dyn->smwrite = 1
-// Opcode might have wrote (depend on nextop)
-#define SMMIGHTWRITE() \
-    if (!MODREG) { SMWRITE(); }
-// Start of sequence
-#define SMSTART() SMEND()
-// End of sequence
-#define SMEND()                                               \
-    if (dyn->smwrite && box64_dynarec_strongmem) { DBAR(0); } \
-    dyn->smwrite = 0;                                         \
-    dyn->smread = 0;
-// Force a Data memory barrier (for LOCK: prefix)
-#define SMDMB()       \
-    DBAR(0);          \
-    dyn->smwrite = 0; \
-    dyn->smread = 1
+    do {                 \
+        if (lock) {      \
+            DBAR(0);     \
+        } else {         \
+            SMREAD();    \
+        }                \
+    } while (0)
+
+// An opcode will write guest memory; this is put before the STORE instruction automatically.
+#define WILLWRITE()                                                                                   \
+    do { /* will_write doubles as the level threshold: STEP 1 stores 1 (SMWRITE) or 2 (SMWRITE2) */   \
+        if (box64_dynarec_strongmem >= dyn->insts[ninst].will_write && dyn->smwrite == 0) {           \
+            /* Will write but never written, this is the start of a SEQ, put a barrier. */            \
+            DBAR(0);                                                                                  \
+        } else if (box64_dynarec_strongmem >= STRONGMEM_LAST_WRITE && dyn->insts[ninst].last_write) { \
+            /* Last write, put a barrier */                                                           \
+            DBAR(0);                                                                                  \
+        } /* NOTE(review): the first test is also true when both level and will_write are 0; relies on the caller's guard */ \
+    } while (0)
+
+// Similar to WILLWRITE, but a non-zero lock always gets a barrier, whatever the strongmem level.
+#define WILLWRITELOCK(lock) \
+    do {                    \
+        if (lock) {         \
+            DBAR(0);        \
+        } else {            \
+            WILLWRITE();    \
+        }                   \
+    } while (0)
+
+// Clears the write counter at the start of a SEQ; emits nothing.
+#define SMSTART()         \
+    do {                  \
+        dyn->smwrite = 0; \
+    } while (0)
+
+// Put at the end of the SEQ: emits a barrier if a writing opcode is found, then clears the write counter.
+#define SMEND()                                             \
+    do {                                                    \
+        if (box64_dynarec_strongmem) {                      \
+            /* Check if there is any guest memory write. */ \
+            int i = ninst; /* NOTE(review): the scan runs back to opcode 0, not just to the start of this SEQ */ \
+            while (i >= 0 && !dyn->insts[i].will_write)     \
+                --i;                                        \
+            if (i >= 0) {                                   \
+                /* It's a SEQ, put a barrier here. */       \
+                DBAR(0);                                    \
+            }                                               \
+        }                                                   \
+        dyn->smwrite = 0;                                   \
+    } while (0)
+
+// The barrier: an unconditional DBAR(0) that leaves dyn->smwrite untouched.
+#define SMDMB() DBAR(0)
+#endif
 
 // LOCK_* define
 #define LOCK_LOCK (int*)1
@@ -750,7 +877,11 @@
 #define TABLE64(A, V)
 #endif
 
-#define ARCH_INIT()
+#define ARCH_INIT() /* zero both dyn->smread and dyn->smwrite */ \
+    do {                                \
+        dyn->smread = dyn->smwrite = 0; \
+    } while (0)
+
 #define ARCH_RESET()
 
 #if STEP < 2
diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h
index 1dab0696..2e64ac55 100644
--- a/src/dynarec/la64/dynarec_la64_private.h
+++ b/src/dynarec/la64/dynarec_la64_private.h
@@ -91,6 +91,7 @@ typedef struct instruction_la64_s {
     uint8_t             barrier_maybe;
     uint8_t             will_write;
     uint8_t             last_write;
+    uint8_t             lock;
     uint8_t             df_notneeded;
     flagcache_t         f_exit;     // flags status at end of instruction
     lsxcache_t          lsx;        // lsxcache at end of instruction (but before poping)
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 074a000f..7903ca06 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -32,33 +32,177 @@
 #define PK64(a) *(uint64_t*)(addr + a)
 #define PKip(a) *(uint8_t*)(ip + a)
 
+/* Box64 Strong Memory Model Emulation
+ *
+ * Definition of a SEQ:
+ * A SEQ is a sequence of opcodes that writes to guest memory, terminated by JMP, RET, CALL, etc.
+ *
+ * Memory barriers are added in the following cases to emulate the strong memory model:
+ * 1. End of a SEQ:
+ *    - Scalar operations (a1)
+ *    - SIMD operations (a2)
+ * 2. Start of a SEQ:
+ *    - Scalar operations (b1)
+ *    - SIMD operations (b2)
+ * 3. Right before the last guest memory store in a SEQ:
+ *    - Scalar operations (c1)
+ *    - SIMD operations (c2)
+ * 4. After every third guest memory store in a SEQ (d)
+ *
+ * STRONGMEM levels:
+ * LEVEL1: Includes a1, b1
+ * LEVEL2: Includes LEVEL1, plus a2, b2, c1, c2
+ * LEVEL3: Includes LEVEL2, plus d
+ */
+
+#define STRONGMEM_SIMD_WRITE 2 // Minimum strongmem level at which SIMD memory writes are tracked
+#define STRONGMEM_LAST_WRITE 2 // Minimum strongmem level at which a barrier is put before the last guest memory store
+#define STRONGMEM_SEQ_WRITE  3 // Minimum strongmem level at which a barrier is put at every third memory store
+
+#if STEP == 1
+
+#define SMWRITE() /* STEP == 1: bookkeeping only, no code emitted */ \
+    do {                                                         \
+        /* Mark that current sequence writes to guest memory. */ \
+        /* This will be used in SMEND for last_write. */         \
+        dyn->smwrite = 1;                                        \
+        /* Mark that current opcode writes to guest memory. */   \
+        dyn->insts[ninst].will_write = 1;                        \
+    } while (0)
+
+#define SMWRITELOCK(islock) /* not named `lock`: that would also macro-replace the `.lock` member below */ \
+    do {                                                                                                    \
+        dyn->insts[ninst].lock = (islock); /* STEP == 1: per-opcode lock state, read back by native_pass */ \
+        SMWRITE();                                                                                          \
+    } while (0)
+
+#define SMWRITE2() /* STEP == 1: like SMWRITE, but recorded only from STRONGMEM_SIMD_WRITE up */ \
+    do {                                                       \
+        if (box64_dynarec_strongmem >= STRONGMEM_SIMD_WRITE) { \
+            dyn->smwrite = 1;                                  \
+            dyn->insts[ninst].will_write = 2; /* 2, where SMWRITE stores 1; WILLWRITE compares it to the level */ \
+        }                                                      \
+    } while (0)
+
+#define SMREAD()            // expands to nothing when STEP == 1
+#define SMREADLOCK(lock)    // expands to nothing when STEP == 1
+#define WILLWRITE()         // expands to nothing when STEP == 1
+#define WILLWRITELOCK(lock) // expands to nothing when STEP == 1
+
+#define SMSTART()                                                  \
+    do {                                                           \
+        /* Clear the write flag at the start of a potential SEQ. */ \
+        dyn->smwrite = 0;                                          \
+    } while (0)
 
-// Strong mem emulation helpers
-#define SMREAD_MIN  2
-#define SMWRITE_MIN 1
-// Sequence of Read will trigger a DMB on "first" read if strongmem is >= SMREAD_MIN
-// Sequence of Write will trigger a DMB on "last" write if strongmem is >= 1
-// All Write operation that might use a lock all have a memory barrier if strongmem is >= SMWRITE_MIN
-// Opcode will read
-#define SMREAD() if((dyn->smread==0) && (box64_dynarec_strongmem>SMREAD_MIN)) {SMDMB();} else dyn->smread=1
-// Opcode will read with option forced lock
-#define SMREADLOCK(lock)    if((lock) || ((dyn->smread==0) && (box64_dynarec_strongmem>SMREAD_MIN))) {SMDMB();}
-// Opcode might read (depend on nextop)
-#define SMMIGHTREAD()   if(!MODREG) {SMREAD();}
-// Opcode has wrote
-#define SMWRITE()   dyn->smwrite=1
-// Opcode has wrote (strongmem>1 only)
-#define SMWRITE2()   if(box64_dynarec_strongmem>SMREAD_MIN) dyn->smwrite=1
-// Opcode has wrote with option forced lock
-#define SMWRITELOCK(lock)   if(lock || (box64_dynarec_strongmem>SMWRITE_MIN)) {SMDMB();} else dyn->smwrite=1
-// Opcode might have wrote (depend on nextop)
-#define SMMIGHTWRITE()   if(!MODREG) {SMWRITE();}
-// Start of sequence
-#define SMSTART()   SMEND()
-// End of sequence
-#define SMEND()     if(dyn->smwrite && box64_dynarec_strongmem) {FENCE();} dyn->smwrite=0; dyn->smread=0;
-// Force a Data memory barrier (for LOCK: prefix)
-#define SMDMB()     FENCE(); dyn->smwrite=0; dyn->smread=1
+#define SMEND() /* STEP == 1: flags the last writing opcode, emits nothing */                  \
+    do {                                                                                       \
+        /* If the sequence wrote guest memory (i.e. it is a SEQ), compute its last_write. */   \
+        if (dyn->smwrite && (box64_dynarec_strongmem >= STRONGMEM_LAST_WRITE)) {               \
+            int i = ninst; /* walk back to the nearest opcode with will_write set */           \
+            while (i >= 0 && !dyn->insts[i].will_write)                                        \
+                --i;                                                                           \
+            if (i >= 0) { dyn->insts[i].last_write = 1; }                                      \
+        }                                                                                      \
+        dyn->smwrite = 0; /* the sequence is over */                                           \
+    } while (0)
+
+#define SMDMB() // expands to nothing when STEP == 1
+
+#else
+
+// An opcode writes guest memory; this needs to be put after the STORE instruction manually.
+#define SMWRITE()                                                     \
+    do {                                                              \
+        /* Put a barrier at every third memory write. */              \
+        if (box64_dynarec_strongmem >= STRONGMEM_SEQ_WRITE) {         \
+            if (++dyn->smwrite >= 3 /* third write; NOTE(review): then every 2nd, as the count restarts at 1 */) { \
+                FENCE_RW_RW();                                        \
+                dyn->smwrite = 1; /* 1, not 0: WILLWRITE reads 0 as "nothing written yet" */ \
+            }                                                         \
+        } else {                                                      \
+            /* Mark that current sequence writes to guest memory. */  \
+            dyn->smwrite = 1;                                         \
+        }                                                             \
+    } while (0)
+
+// Similar to SMWRITE, but a non-zero lock gets an unconditional barrier instead of the SMWRITE bookkeeping.
+#define SMWRITELOCK(lock)  \
+    do {                   \
+        if (lock) {        \
+            FENCE_RW_RW(); \
+        } else {           \
+            SMWRITE();     \
+        }                  \
+    } while (0)
+
+// Similar to SMWRITE, but for SIMD instructions: a no-op below STRONGMEM_SIMD_WRITE.
+#define SMWRITE2()                                           \
+    do {                                                     \
+        if (box64_dynarec_strongmem >= STRONGMEM_SIMD_WRITE) \
+            SMWRITE();                                       \
+    } while (0)
+
+// An opcode reads guest memory; this needs to be put before the LOAD instruction manually. Currently expands to nothing.
+#define SMREAD()
+
+// Similar to SMREAD, but a non-zero lock gets a barrier; otherwise it falls back to SMREAD (currently empty).
+#define SMREADLOCK(lock)   \
+    do {                   \
+        if (lock) {        \
+            FENCE_RW_RW(); \
+        } else {           \
+            SMREAD();      \
+        }                  \
+    } while (0)
+
+// An opcode will write guest memory; this is put before the STORE instruction automatically.
+#define WILLWRITE()                                                                                   \
+    do { /* will_write doubles as the level threshold: STEP 1 stores 1 (SMWRITE) or 2 (SMWRITE2) */   \
+        if (box64_dynarec_strongmem >= dyn->insts[ninst].will_write && dyn->smwrite == 0) {           \
+            /* Will write but never written, this is the start of a SEQ, put a barrier. */            \
+            FENCE_RW_RW();                                                                            \
+        } else if (box64_dynarec_strongmem >= STRONGMEM_LAST_WRITE && dyn->insts[ninst].last_write) { \
+            /* Last write, put a barrier */                                                           \
+            FENCE_RW_RW();                                                                            \
+        } /* NOTE(review): the first test is also true when both level and will_write are 0; relies on the caller's guard */ \
+    } while (0)
+
+// Similar to WILLWRITE, but a non-zero lock always gets a barrier, whatever the strongmem level.
+#define WILLWRITELOCK(lock) \
+    do {                    \
+        if (lock) {         \
+            FENCE_RW_RW();  \
+        } else {            \
+            WILLWRITE();    \
+        }                   \
+    } while (0)
+
+// Clears the write counter at the start of a SEQ; emits nothing.
+#define SMSTART()         \
+    do {                  \
+        dyn->smwrite = 0; \
+    } while (0)
+
+// Put at the end of the SEQ: emits a barrier if a writing opcode is found, then clears the write counter.
+#define SMEND()                                             \
+    do {                                                    \
+        if (box64_dynarec_strongmem) {                      \
+            /* Check if there is any guest memory write. */ \
+            int i = ninst; /* NOTE(review): the scan runs back to opcode 0, not just to the start of this SEQ */ \
+            while (i >= 0 && !dyn->insts[i].will_write)     \
+                --i;                                        \
+            if (i >= 0) {                                   \
+                /* It's a SEQ, put a barrier here. */       \
+                FENCE_RW_RW();                              \
+            }                                               \
+        }                                                   \
+        dyn->smwrite = 0;                                   \
+    } while (0)
+
+// The barrier: an unconditional FENCE_RW_RW() that leaves dyn->smwrite untouched.
+#define SMDMB() FENCE_RW_RW()
+#endif
 
 // LOCK_* define
 #define LOCK_LOCK (int*)1
@@ -1099,8 +1243,10 @@
 #define FTABLE64(A, V)
 #endif
 
-#define ARCH_INIT() \
+#define ARCH_INIT() /* NOTE(review): two bare statements, no do { } while (0) wrapper; unsafe as an unbraced if/else body */ \
+    dyn->smread = dyn->smwrite = 0; \
     dyn->vector_sew = VECTOR_SEWNA;
+
 #define ARCH_RESET() \
     dyn->vector_sew = VECTOR_SEWNA;
 
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index 5c00231e..3de4b465 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -122,6 +122,9 @@ typedef struct instruction_rv64_s {
     uint16_t            ymm0_out;   // the ymm0 at th end of the opcode
     uint16_t            ymm0_pass2, ymm0_pass3;
     int                 barrier_maybe;
+    uint8_t             will_write;
+    uint8_t             last_write;
+    uint8_t             lock;
     uint8_t             df_notneeded;
     flagcache_t         f_exit;     // flags status at end of instruction
     extcache_t          e;          // extcache at end of instruction (but before poping)
diff --git a/src/dynarec/rv64/rv64_emitter.h b/src/dynarec/rv64/rv64_emitter.h
index 4e199e00..99fcf0e5 100644
--- a/src/dynarec/rv64/rv64_emitter.h
+++ b/src/dynarec/rv64/rv64_emitter.h
@@ -399,7 +399,7 @@ f28–31  ft8–11  FP temporaries                  Caller
     } while (0)
 
 #define FENCE_gen(pred, succ) (((pred) << 24) | ((succ) << 20) | 0b0001111)
-#define FENCE()               EMIT(FENCE_gen(3, 3))
+#define FENCE_RW_RW()         EMIT(FENCE_gen(3, 3)) // fence rw, rw: pred = succ = 0b0011 (R|W)
 
 #define FENCE_I_gen() ((0b001 << 12) | 0b0001111)
 #define FENCE_I()     EMIT(FENCE_I_gen())