about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
authorptitSeb <sebastien.chev@gmail.com>2025-06-23 13:00:45 +0200
committerptitSeb <sebastien.chev@gmail.com>2025-06-23 13:00:45 +0200
commitc66630da497c18622205cc58cb058a1f8cba7cd1 (patch)
treea6c9c55bff71ecfdd80b9d2bff3b15e6c8724ce0 /src
parent0cc58732fb3f6992918b10d5da7a9937edb4a0b4 (diff)
downloadbox64-c66630da497c18622205cc58cb058a1f8cba7cd1.tar.gz
box64-c66630da497c18622205cc58cb058a1f8cba7cd1.zip
[DYNAREC] Refactored BARRIER_FLOAT a bit ([ARM64] only for now, TODO for RV64 and LA64)
Diffstat (limited to 'src')
-rw-r--r--src/dynarec/arm64/dynarec_arm64_00.c14
-rw-r--r--src/dynarec/arm64/dynarec_arm64_0f.c9
-rw-r--r--src/dynarec/arm64/dynarec_arm64_66.c6
-rw-r--r--src/dynarec/arm64/dynarec_arm64_67.c1
-rw-r--r--src/dynarec/arm64/dynarec_arm64_67_32.c1
-rw-r--r--src/dynarec/arm64/dynarec_arm64_d9.c8
-rw-r--r--src/dynarec/arm64/dynarec_arm64_dd.c5
-rw-r--r--src/dynarec/arm64/dynarec_arm64_f20f.c1
-rw-r--r--src/dynarec/arm64/dynarec_arm64_functions.c62
-rw-r--r--src/dynarec/arm64/dynarec_arm64_functions.h3
-rw-r--r--src/dynarec/arm64/dynarec_arm64_helper.c28
-rw-r--r--src/dynarec/arm64/dynarec_arm64_private.h4
-rw-r--r--src/dynarec/dynarec_arch.h2
-rw-r--r--src/dynarec/dynarec_native.c12
-rw-r--r--src/tools/env.c2
15 files changed, 116 insertions, 42 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c
index 219fa44c..6273933e 100644
--- a/src/dynarec/arm64/dynarec_arm64_00.c
+++ b/src/dynarec/arm64/dynarec_arm64_00.c
@@ -1072,7 +1072,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
         #define GO(GETFLAGS, NO, YES, F)                                \
             READFLAGS(F);                                               \
             i8 = F8S;                                                   \
-            BARRIER(BARRIER_MAYBE);                                     \
             JUMP(addr+i8, 1);                                           \
             GETFLAGS;                                                   \
             if(dyn->insts[ninst].x64.jmp_insts==-1 ||                   \
@@ -2385,7 +2384,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             if(BOX64DRENV(dynarec_safeflags)) {
                 READFLAGS(X_PEND);  // lets play safe here too
             }
-            fpu_purgecache(dyn, ninst, 1, x1, x2, x3);  // using next, even if there no next
+            BARRIER(BARRIER_FLOAT);
             i32 = F16;
             retn_to_epilog(dyn, ip, ninst, rex, i32);
             *need_epilog = 0;
@@ -2397,7 +2396,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             if(BOX64DRENV(dynarec_safeflags)) {
                 READFLAGS(X_PEND);  // so instead, force the deferred flags, so it's not too slow, and flags are not lost
             }
-            fpu_purgecache(dyn, ninst, 1, x1, x2, x3);  // using next, even if there no next
+            BARRIER(BARRIER_FLOAT);
             ret_to_epilog(dyn, ip, ninst, rex);
             *need_epilog = 0;
             *ok = 0;
@@ -3282,7 +3281,6 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             addr = dynarec64_DF(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
             break;
         #define GO(Z)                                                   \
-            BARRIER(BARRIER_MAYBE);                                     \
             JUMP(addr+i8, 1);                                           \
             if(dyn->insts[ninst].x64.jmp_insts==-1 ||                   \
                 CHECK_CACHE()) {                                        \
@@ -3450,7 +3448,8 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     } else {
                         MOV64x(x2, addr);
                     }
-                    fpu_purgecache(dyn, ninst, 1, x1, x3, x4);
+                    BARRIER(BARRIER_FLOAT);
+                    //fpu_purgecache(dyn, ninst, 0, x1, x3, x4);
                     PUSH1z(x2);
                     if (BOX64DRENV(dynarec_callret)) {
                         SET_HASCALLRET();
@@ -3500,7 +3499,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             break;
         case 0xE9:
         case 0xEB:
-            BARRIER(BARRIER_MAYBE);
+            BARRIER(BARRIER_MAYBE); // there will be a barrier if there is a jump out
             if(opcode==0xEB && PK(0)==0xFF) {
                 INST_NAME("JMP ib");
                 MESSAGE(LOG_DEBUG, "Hack for EB FF opcode");
@@ -3522,7 +3521,8 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 if(dyn->insts[ninst].x64.jmp_insts==-1) {
                     // out of the block
                     SET_NODF();
-                    fpu_purgecache(dyn, ninst, 1, x1, x2, x3);
+                    BARRIER(BARRIER_FLOAT);
+                    //fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
                     jump_to_next(dyn, j64, 0, ninst, rex.is32bits);
                 } else {
                     // inside the block
diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c
index 8842d43f..83f7b3b4 100644
--- a/src/dynarec/arm64/dynarec_arm64_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_0f.c
@@ -1673,7 +1673,6 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 j64 = (uint32_t)(addr+i32_);                            \

             else                                                        \

                 j64 = addr+i32_;                                        \

-            BARRIER(BARRIER_MAYBE);                                     \

             JUMP(j64, 1);                                               \

             GETFLAGS;                                                   \

             if(dyn->insts[ninst].x64.jmp_insts==-1 ||                   \

@@ -1918,7 +1917,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     case 0:

                         INST_NAME("FXSAVE Ed");

                         MESSAGE(LOG_DUMP, "Need Optimization (FXSAVE)\n");

-                        fpu_purgecache(dyn, ninst, 0, x1, x2, x3);

+                        BARRIER(BARRIER_FLOAT);

                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);

                         if(ed!=x1) {MOVx_REG(x1, ed);}

                         CALL(rex.is32bits?const_fpu_fxsave32:const_fpu_fxsave64, -1);

@@ -1926,7 +1925,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     case 1:

                         INST_NAME("FXRSTOR Ed");

                         MESSAGE(LOG_DUMP, "Need Optimization (FXRSTOR)\n");

-                        fpu_purgecache(dyn, ninst, 0, x1, x2, x3);

+                        BARRIER(BARRIER_FLOAT);

                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);

                         if(ed!=x1) {MOVx_REG(x1, ed);}

                         CALL(rex.is32bits?const_fpu_fxrstor32:const_fpu_fxrstor64, -1);

@@ -1986,7 +1985,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     case 4:

                         INST_NAME("XSAVE Ed");

                         MESSAGE(LOG_DUMP, "Need Optimization (XSAVE)\n");

-                        fpu_purgecache(dyn, ninst, 0, x1, x2, x3);

+                        BARRIER(BARRIER_FLOAT);

                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);

                         if(ed!=x1) {MOVx_REG(x1, ed);}

                         MOV32w(x2, rex.w?0:1);

@@ -1995,7 +1994,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     case 5:

                         INST_NAME("XRSTOR Ed");

                         MESSAGE(LOG_DUMP, "Need Optimization (XRSTOR)\n");

-                        fpu_purgecache(dyn, ninst, 0, x1, x2, x3);

+                        BARRIER(BARRIER_FLOAT);

                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);

                         if(ed!=x1) {MOVx_REG(x1, ed);}

                         MOV32w(x2, rex.w?0:1);

diff --git a/src/dynarec/arm64/dynarec_arm64_66.c b/src/dynarec/arm64/dynarec_arm64_66.c
index 30b4ea0e..e99e3cef 100644
--- a/src/dynarec/arm64/dynarec_arm64_66.c
+++ b/src/dynarec/arm64/dynarec_arm64_66.c
@@ -1361,7 +1361,7 @@ uintptr_t dynarec64_66(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     case 6:

                         INST_NAME("FNSTENV Ed");

                         MESSAGE(LOG_DUMP, "Need Optimization (FNSTENV16)\n");

-                        fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE?

+                        BARRIER(BARRIER_FLOAT); // maybe only x87, not SSE?

                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);

                         if(ed!=x1) {MOVx_REG(x1, ed);}

                         MOV32w(x2, 1);

@@ -1381,7 +1381,7 @@ uintptr_t dynarec64_66(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     case 4:

                         INST_NAME("FRSTOR Ed");

                         MESSAGE(LOG_DUMP, "Need Optimization (FRSTOR16)\n");

-                        fpu_purgecache(dyn, ninst, 0, x1, x2, x3);

+                        BARRIER(BARRIER_FLOAT);

                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);

                         if(ed!=x1) {MOVx_REG(x1, ed);}

                         CALL(const_native_frstor16, -1);

@@ -1389,7 +1389,7 @@ uintptr_t dynarec64_66(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     case 6:

                         INST_NAME("FNSAVE Ed");

                         MESSAGE(LOG_DUMP, "Need Optimization (FNSAVE16)\n");

-                        fpu_purgecache(dyn, ninst, 0, x1, x2, x3);

+                        BARRIER(BARRIER_FLOAT);

                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);

                         if(ed!=x1) {MOVx_REG(x1, ed);}

                         CALL(const_native_fsave16, -1);

diff --git a/src/dynarec/arm64/dynarec_arm64_67.c b/src/dynarec/arm64/dynarec_arm64_67.c
index e6755147..0e64eaf1 100644
--- a/src/dynarec/arm64/dynarec_arm64_67.c
+++ b/src/dynarec/arm64/dynarec_arm64_67.c
@@ -1425,7 +1425,6 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
 

 

         #define GO(NO, YES)                                             \

-            BARRIER(BARRIER_MAYBE);                                     \

             JUMP(addr+i8, 1);                                           \

             if(dyn->insts[ninst].x64.jmp_insts==-1 ||                   \

                 CHECK_CACHE()) {                                        \

diff --git a/src/dynarec/arm64/dynarec_arm64_67_32.c b/src/dynarec/arm64/dynarec_arm64_67_32.c
index 477b41be..ec1fa1e5 100644
--- a/src/dynarec/arm64/dynarec_arm64_67_32.c
+++ b/src/dynarec/arm64/dynarec_arm64_67_32.c
@@ -89,7 +89,6 @@ uintptr_t dynarec64_67_32(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
             break;
 
         #define GO(NO, YES)                                             \
-            BARRIER(BARRIER_MAYBE);                                     \
             JUMP(addr+i8, 1);                                           \
             if(dyn->insts[ninst].x64.jmp_insts==-1 ||                   \
                 CHECK_CACHE()) {                                        \
diff --git a/src/dynarec/arm64/dynarec_arm64_d9.c b/src/dynarec/arm64/dynarec_arm64_d9.c
index b8cfe6e0..0d024778 100644
--- a/src/dynarec/arm64/dynarec_arm64_d9.c
+++ b/src/dynarec/arm64/dynarec_arm64_d9.c
@@ -356,7 +356,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             break;
         case 0xF6:
             INST_NAME("FDECSTP");
-            fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+            BARRIER(BARRIER_FLOAT);
             LDRw_U12(x2, xEmu, offsetof(x64emu_t, top));
             SUBw_U12(x2, x2, 1);
             ANDw_mask(x2, x2, 0, 2);    //mask=7
@@ -364,7 +364,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             break;
         case 0xF7:
             INST_NAME("FINCSTP");
-            fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+            BARRIER(BARRIER_FLOAT);
             LDRw_U12(x2, xEmu, offsetof(x64emu_t, top));
             ADDw_U12(x2, x2, 1);
             ANDw_mask(x2, x2, 0, 2);    //mask=7
@@ -525,7 +525,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 4:
                 INST_NAME("FLDENV Ed");
                 MESSAGE(LOG_DUMP, "Need Optimization (FLDENV)\n");
-                fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE?
+                BARRIER(BARRIER_FLOAT); // maybe only x87, not SSE?
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);
                 if(ed!=x1) {
                     MOVx_REG(x1, ed);
@@ -544,7 +544,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 6:
                 INST_NAME("FNSTENV Ed");
                 MESSAGE(LOG_DUMP, "Need Optimization (FNSTENV)\n");
-                fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE?
+                BARRIER(BARRIER_FLOAT); // maybe only x87, not SSE?
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);
                 if(ed!=x1) {
                     MOVx_REG(x1, ed);
diff --git a/src/dynarec/arm64/dynarec_arm64_dd.c b/src/dynarec/arm64/dynarec_arm64_dd.c
index fe640ef3..9ce8c908 100644
--- a/src/dynarec/arm64/dynarec_arm64_dd.c
+++ b/src/dynarec/arm64/dynarec_arm64_dd.c
@@ -186,7 +186,7 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 4:
                 INST_NAME("FRSTOR m108byte");
                 MESSAGE(LOG_DUMP, "Need Optimization (FRSTOR)\n");
-                fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+                BARRIER(BARRIER_FLOAT);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);
                 if(ed!=x1) {MOVx_REG(x1, ed);}
                 CALL(const_native_frstor, -1);
@@ -194,7 +194,7 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 6:
                 INST_NAME("FNSAVE m108byte");
                 MESSAGE(LOG_DUMP, "Need Optimization (FNSAVE)\n");
-                fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+                BARRIER(BARRIER_FLOAT);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);
                 if(ed!=x1) {MOVx_REG(x1, ed);}
                 CALL(const_native_fsave, -1);
@@ -202,7 +202,6 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 break;
             case 7:
                 INST_NAME("FNSTSW m2byte");
-                //fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x4, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0);
                 LDRw_U12(x2, xEmu, offsetof(x64emu_t, top));
                 LDRH_U12(x3, xEmu, offsetof(x64emu_t, sw));
diff --git a/src/dynarec/arm64/dynarec_arm64_f20f.c b/src/dynarec/arm64/dynarec_arm64_f20f.c
index e6289479..5d620da6 100644
--- a/src/dynarec/arm64/dynarec_arm64_f20f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f20f.c
@@ -456,7 +456,6 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 j64 = (uint32_t)(addr+i32_);                            \

             else                                                        \

                 j64 = addr+i32_;                                        \

-            BARRIER(BARRIER_MAYBE);                                     \

             JUMP(j64, 1);                                               \

             GETFLAGS;                                                   \

             if(dyn->insts[ninst].x64.jmp_insts==-1 ||                   \

diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index 9ecd4d29..d17eee5f 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -1236,4 +1236,66 @@ void updateUneeded(dynarec_arm_t* dyn)
                 if(dyn->insts[ninst].n.ymm_unneeded&(1<<i))
                     propagateYMMUneeded(dyn, ninst, i);
     }
+}
+
+void tryEarlyFpuBarrier(dynarec_arm_t* dyn, int last_fpu_used, int ninst)
+{
+    // ninst purges the FPU cache, and last_fpu_used is the last earlier instruction that touched an FPU reg:
+    // scan backward from ninst-1 down to last_fpu_used+1, looking for the earliest spot where BARRIER_FLOAT can be raised instead
+    int usefull = 0;    // set once a jump or a multi-predecessor instruction is crossed; without it no barrier is added
+    for(int i=ninst-1; i>last_fpu_used; --i)
+    {
+        if(!dyn->insts[i].x64.has_next)
+            return; // break of chain, don't try to be smart for now
+        if(dyn->insts[i].x64.barrier&BARRIER_FLOAT)
+            return; // a float barrier is already set inside the scanned window
+        if(dyn->insts[i].x64.jmp && dyn->insts[i].x64.jmp_insts==-1)
+            usefull = 1;    // jump that has no landing instruction inside this block
+        if(dyn->insts[i].x64.jmp && dyn->insts[i].x64.jmp_insts!=-1) {
+            int i2 = dyn->insts[i].x64.jmp_insts;   // index of the landing instruction
+            if(i2<last_fpu_used || i2>ninst) {
+                // landing outside of the scanned window: give up if a landing point past ninst uses some xmm/ymm/x87 stack
+                if(i2>ninst) {
+                    if(dyn->insts[i2].n.xmm_used || dyn->insts[i2].n.ymm_used || dyn->insts[i2].n.stack)
+                        return;
+                }
+                // stop here, not trying to guess too much: flag the instruction right after this jump
+                if((usefull && (i+1)!=ninst)) {
+                    if(BOX64ENV(dynarec_dump) || BOX64ENV(dynarec_log)>1) dynarec_log(LOG_NONE, "Putting early Float Barrier in %d for %d\n", i+1, ninst);
+                    dyn->insts[i+1].x64.barrier|=BARRIER_FLOAT;
+                }
+                return;
+            }
+            usefull = 1;    // jump landing inside the scanned window
+        }
+        for(int pred=0; pred<dyn->insts[i].pred_sz; ++pred) {
+            if(dyn->insts[i].pred[pred]<=last_fpu_used) {   // reached from last_fpu_used or before: do not go any earlier
+                if(usefull && ((i+1)!=ninst)) {
+                    if(BOX64ENV(dynarec_dump) || BOX64ENV(dynarec_log)>1) dynarec_log(LOG_NONE, "Putting early Float Barrier in %d for %d\n", i+1, ninst);
+                    dyn->insts[i+1].x64.barrier|=BARRIER_FLOAT;
+                }
+                return;
+            }
+        }
+        if(dyn->insts[i].pred_sz>1)
+            usefull = 1;    // several predecessors reach this instruction
+    }
+    if(usefull) {
+        if(BOX64ENV(dynarec_dump) || BOX64ENV(dynarec_log)>1) dynarec_log(LOG_NONE, "Putting early Float Barrier in %d for %d\n", last_fpu_used+1, ninst);    // log the index that is actually flagged
+        dyn->insts[last_fpu_used+1].x64.barrier|=BARRIER_FLOAT;
+    }
+}
+
+void propagateFpuBarrier(dynarec_arm_t* dyn)   // record per-instruction FPU use, and try to raise BARRIER_FLOAT ahead of each FPU purge
+{
+    int last_fpu_used = -1;    // index of the latest instruction seen using an FPU reg; -1 until one is seen, and again after each tryEarlyFpuBarrier attempt
+    for(int ninst=0; ninst<dyn->size; ++ninst) {
+        int fpu_used = dyn->insts[ninst].n.xmm_used || dyn->insts[ninst].n.ymm_used || dyn->insts[ninst].mmx_used || dyn->insts[ninst].x87_used;    // any of xmm/ymm/mmx/x87 flagged on this instruction
+        if(fpu_used) last_fpu_used = ninst;
+        dyn->insts[ninst].fpu_used = fpu_used; // store the combined flag on the instruction
+        if(dyn->insts[ninst].fpupurge && (last_fpu_used!=-1) && (last_fpu_used!=(ninst-1))) {    // purging instruction that does not directly follow the last FPU use
+            tryEarlyFpuBarrier(dyn, last_fpu_used, ninst);
+            last_fpu_used = -1;  // start tracking afresh after this purge
+        }
+    }
 }
\ No newline at end of file
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h
index c2d88150..d32dbddd 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.h
+++ b/src/dynarec/arm64/dynarec_arm64_functions.h
@@ -86,7 +86,8 @@ void fpu_reset_ninst(dynarec_native_t* dyn, int ninst);
 // is st freed
 int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st);
 
+// propagate BARRIER_FLOAT so the purge triggers as soon as possible (avoiding fetching an FPU reg if it's unused)
+void propagateFpuBarrier(dynarec_arm_t* dyn);
 // propage the uneeded flags on XMM/YMM regs (done between step 0 and step 1)
 void updateUneeded(dynarec_arm_t* dyn);
-
 #endif //__DYNAREC_ARM_FUNCTIONS_H__
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index e6ee1b70..643c2a97 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -1028,6 +1028,7 @@ int neoncache_st_coherency(dynarec_arm_t* dyn, int ninst, int a, int b)
 // the reg returned is *2 for FLOAT
 int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t)
 {
+    dyn->insts[ninst].x87_used = 1;
     if(dyn->n.mmxcount)
         mmx_purgecache(dyn, ninst, 0, s1);
     dyn->n.x87stack+=1;
@@ -1062,6 +1063,7 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t)
 }
 void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1)
 {
+    dyn->insts[ninst].x87_used = 1;
     if(dyn->n.mmxcount)
         mmx_purgecache(dyn, ninst, 0, s1);
     dyn->n.x87stack+=1;
@@ -1111,6 +1113,7 @@ static int internal_x87_dofree(dynarec_arm_t* dyn)
 }
 void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1)
 {
+    dyn->insts[ninst].x87_used = 1;
     if(dyn->n.mmxcount)
         mmx_purgecache(dyn, ninst, 0, s1);
     do {
@@ -1193,17 +1196,7 @@ void x87_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int
         for (int i=0; i<8; ++i)
             if(dyn->n.x87cache[i]!=-1) {
                 int st = dyn->n.x87cache[i]+dyn->n.stack_pop;
-                #if STEP == 1
-                if(!next) {   // don't force promotion here
-                    // pre-apply pop, because purge happens in-between
-                    neoncache_promote_double(dyn, ninst, st);
-                }
-                #endif
-                #if STEP == 3
-                if(!next && neoncache_get_current_st(dyn, ninst, st)!=NEON_CACHE_ST_D) {
-                    MESSAGE(LOG_DUMP, "Warning, incoherency with purged ST%d cache\n", st);
-                }
-                #endif
+                // don't force promotion here
                 ADDw_U12(s3, s2, dyn->n.x87cache[i]);   // unadjusted count, as it's relative to real top
                 ANDw_mask(s3, s3, 0, 2); //mask=7   // (emu->top + st)&7
                 switch(neoncache_get_current_st(dyn, ninst, st)) {
@@ -1378,6 +1371,7 @@ void x87_unreflectcount(dynarec_arm_t* dyn, int ninst, int s1, int s2)
 
 int x87_get_current_cache(dynarec_arm_t* dyn, int ninst, int st, int t)
 {
+    dyn->insts[ninst].x87_used = 1;
     // search in cache first
     for (int i=0; i<8; ++i) {
         if(dyn->n.x87cache[i]==st) {
@@ -1398,6 +1392,7 @@ int x87_get_current_cache(dynarec_arm_t* dyn, int ninst, int st, int t)
 
 int x87_get_cache(dynarec_arm_t* dyn, int ninst, int populate, int s1, int s2, int st, int t)
 {
+    dyn->insts[ninst].x87_used = 1;
     if(dyn->n.mmxcount)
         mmx_purgecache(dyn, ninst, 0, s1);
     int ret = x87_get_current_cache(dyn, ninst, st, t);
@@ -1431,6 +1426,7 @@ int x87_get_cache(dynarec_arm_t* dyn, int ninst, int populate, int s1, int s2, i
 }
 int x87_get_neoncache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
 {
+    dyn->insts[ninst].x87_used = 1;
     for(int ii=0; ii<24; ++ii)
         if((dyn->n.neoncache[ii].t == NEON_CACHE_ST_F
          || dyn->n.neoncache[ii].t == NEON_CACHE_ST_D
@@ -1442,10 +1438,12 @@ int x87_get_neoncache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
 }
 int x87_get_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int t)
 {
+    dyn->insts[ninst].x87_used = 1; // flag this instruction as using x87 (read back by propagateFpuBarrier); x87_get_cache below sets the same flag
     return dyn->n.x87reg[x87_get_cache(dyn, ninst, 1, s1, s2, a, t)];
 }
 int x87_get_st_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int t)
 {
+    dyn->insts[ninst].x87_used = 1; // flag this instruction as using x87 (read back by propagateFpuBarrier); x87_get_cache below sets the same flag
     return dyn->n.x87reg[x87_get_cache(dyn, ninst, 0, s1, s2, a, t)];
 }
 
@@ -1500,6 +1498,7 @@ void x87_forget(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
 
 void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
 {
+    dyn->insts[ninst].x87_used = 1;
     if(dyn->n.mmxcount)
         mmx_purgecache(dyn, ninst, 0, s1);
     // search in cache first
@@ -1550,6 +1549,7 @@ void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
 
 void x87_free(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int st)
 {
+    dyn->insts[ninst].x87_used = 1;
     int ret = -1;
     for (int i=0; (i<8) && (ret==-1); ++i)
         if(dyn->n.x87cache[i] == st)
@@ -1683,6 +1683,7 @@ static int isx87Empty(dynarec_arm_t* dyn)
 // get neon register for a MMX reg, create the entry if needed
 int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a)
 {
+    dyn->insts[ninst].mmx_used = 1;
     if(!dyn->n.x87stack && isx87Empty(dyn))
         x87_purgecache(dyn, ninst, 0, s1, s2, s3);
     if(dyn->n.mmxcache[a]!=-1)
@@ -1695,6 +1696,7 @@ int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a)
 // get neon register for a MMX reg, but don't try to synch it if it needed to be created
 int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a)
 {
+    dyn->insts[ninst].mmx_used = 1;
     if(!dyn->n.x87stack && isx87Empty(dyn))
         x87_purgecache(dyn, ninst, 0, s1, s2, s3);
     if(dyn->n.mmxcache[a]!=-1)
@@ -2067,8 +2069,10 @@ void fpu_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int
     x87_purgecache(dyn, ninst, next, s1, s2, s3);
     mmx_purgecache(dyn, ninst, next, s1);
     sse_purgecache(dyn, ninst, next, s1);
-    if(!next)
+    if(!next) {
         fpu_reset_reg(dyn);
+        dyn->insts[ninst].fpupurge = 1;
+    }
 }
 
 static int findCacheSlot(dynarec_arm_t* dyn, int ninst, int t, int n, neoncache_t* cache)
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index b3eeed49..2d8a236c 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -132,6 +132,10 @@ typedef struct instruction_arm64_s {
     unsigned            df_notneeded:1;
     unsigned            unaligned:1;    // this opcode can be re-generated for unaligned special case
     unsigned            x87precision:1; // this opcode can handle x87pc
+    unsigned            mmx_used:1; // no fine tracking, just a global "any reg used"
+    unsigned            x87_used:1; // no fine tracking, just a global "any reg used"
+    unsigned            fpu_used:1; // any xmm/ymm/x87/mmx reg used
+    unsigned            fpupurge:1;   // this opcode will purge all fpu regs
     flagcache_t         f_exit;     // flags status at end of instruction
     neoncache_t         n;          // neoncache at end of instruction (but before poping)
     flagcache_t         f_entry;    // flags status before the instruction begin
diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h
index 5424c447..994eebc3 100644
--- a/src/dynarec/dynarec_arch.h
+++ b/src/dynarec/dynarec_arch.h
@@ -24,7 +24,7 @@
 #define MAXBLOCK_SIZE ((1<<20)-200)

 

 #define RAZ_SPECIFIC(A, N)      rasNativeState(A, N)

-#define UPDATE_SPECIFICS(A)     updateNativeFlags(A)

+#define UPDATE_SPECIFICS(A)     do { updateNativeFlags(A); propagateFpuBarrier(A); } while(0)  // single statement, so both calls stay together under an unbraced if/else

 #define PREUPDATE_SPECIFICS(A)

 #define POSTUPDATE_SPECIFICS(A) updateUneeded(A)

 #define ARCH_SIZE(A)    get_size_arch(A)

diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index 8e142b57..98aa0e61 100644
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -702,6 +702,10 @@ dynablock_t* FillBlock64(uintptr_t addr, int alternate, int is32bits, int inst_m
         #endif
         {
             helper.insts[i].x64.need_after |= X_PEND;
+            if(helper.insts[i].barrier_maybe) {
+                helper.insts[i].x64.barrier|=BARRIER_FLOAT;
+                helper.insts[i].barrier_maybe = 0;
+            }
         } else {
             // find jump address instruction
             int k=-1;
@@ -734,8 +738,6 @@ dynablock_t* FillBlock64(uintptr_t addr, int alternate, int is32bits, int inst_m
                     k=i2;
             }*/
             if(k!=-1) {
-                if(!helper.insts[i].barrier_maybe)
-                    helper.insts[k].x64.barrier |= BARRIER_FULL;
                 // special case, loop on itself with some nop in between
                 if(k<i && !helper.insts[i].x64.has_next && is_nops(&helper, helper.insts[k].x64.addr, helper.insts[i].x64.addr-helper.insts[k].x64.addr)) {
                     #ifndef ARCH_NOP
@@ -746,6 +748,12 @@ dynablock_t* FillBlock64(uintptr_t addr, int alternate, int is32bits, int inst_m
                     #endif
                 }
                 helper.insts[i].x64.jmp_insts = k;
+                helper.insts[i].barrier_maybe = 0;
+            } else {
+                if(helper.insts[i].barrier_maybe) {
+                    helper.insts[i].x64.barrier|=BARRIER_FLOAT;
+                    helper.insts[i].barrier_maybe = 0;
+                }
             }
         }
     }
diff --git a/src/tools/env.c b/src/tools/env.c
index 6727d830..bb89cc49 100644
--- a/src/tools/env.c
+++ b/src/tools/env.c
@@ -804,7 +804,7 @@ done:
 #define HEADER_SIGN "DynaCache"
 #define SET_VERSION(MAJ, MIN, REV) (((MAJ)<<24)|((MIN)<<16)|(REV))
 #ifdef ARM64
-#define ARCH_VERSION SET_VERSION(0, 0, 1)
+#define ARCH_VERSION SET_VERSION(0, 0, 2)
 #elif defined(RV64)
 #define ARCH_VERSION SET_VERSION(0, 0, 1)
 #elif defined(LA64)