about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author    ptitSeb <sebastien.chev@gmail.com>  2022-10-24 20:19:09 +0200
committer ptitSeb <sebastien.chev@gmail.com>  2022-10-24 20:19:09 +0200
commit    f8f1859a0508ac574035a75aa0206cd25271d735 (patch)
tree      41d2118dc25cddee077f06a781eb1dd0d4ff4f71 /src
parent    92f2a44b28d21b7c64358b7d2883fb4793508a73 (diff)
download  box64-f8f1859a0508ac574035a75aa0206cd25271d735.tar.gz
          box64-f8f1859a0508ac574035a75aa0206cd25271d735.zip
[DYNAREC] Better (deferred) flag handling in dynarec (gives between 5% and 10% speedup)
Diffstat (limited to 'src')
-rwxr-xr-xsrc/dynarec/arm64/dynarec_arm64_00.c18
-rwxr-xr-xsrc/dynarec/arm64/dynarec_arm64_0f.c6
-rwxr-xr-xsrc/dynarec/arm64/dynarec_arm64_67.c6
-rwxr-xr-xsrc/dynarec/arm64/dynarec_arm64_emit_math.c4
-rwxr-xr-xsrc/dynarec/arm64/dynarec_arm64_functions.c44
-rwxr-xr-xsrc/dynarec/arm64/dynarec_arm64_functions.h2
-rwxr-xr-xsrc/dynarec/arm64/dynarec_arm64_helper.c54
-rwxr-xr-xsrc/dynarec/arm64/dynarec_arm64_helper.h44
-rwxr-xr-xsrc/dynarec/arm64/dynarec_arm64_pass0.h11
-rwxr-xr-xsrc/dynarec/arm64/dynarec_arm64_pass3.h7
-rwxr-xr-xsrc/dynarec/dynarec_native.c130
-rwxr-xr-xsrc/dynarec/dynarec_native_pass.c9
-rwxr-xr-xsrc/dynarec/dynarec_private.h8
13 files changed, 210 insertions, 133 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c
index 59c70674..f3f2f960 100755
--- a/src/dynarec/arm64/dynarec_arm64_00.c
+++ b/src/dynarec/arm64/dynarec_arm64_00.c
@@ -40,6 +40,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
     uint8_t wback, wb1, wb2, wb;
     int64_t fixedaddress;
     int lock;
+    int cacheupd;
 
     opcode = F8;
     MAYUSE(eb1);
@@ -47,6 +48,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
     MAYUSE(j64);
     MAYUSE(wb);
     MAYUSE(lock);
+    MAYUSE(cacheupd);
 
     switch(opcode) {
         case 0x00:
@@ -650,11 +652,11 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 i32 = dyn->insts[ninst].epilog-(dyn->native_size);      \
                 Bcond(NO, i32);                                         \
                 if(dyn->insts[ninst].x64.jmp_insts==-1) {               \
-                    if(!dyn->insts[ninst].x64.barrier)                  \
+                    if(!(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT))  \
                         fpu_purgecache(dyn, ninst, 1, x1, x2, x3);      \
                     jump_to_next(dyn, addr+i8, 0, ninst);               \
                 } else {                                                \
-                    fpuCacheTransform(dyn, ninst, x1, x2, x3);          \
+                    CacheTransform(dyn, ninst, cacheupd, x1, x2, x3);   \
                     i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);\
                     B(i32);                                             \
                 }                                                       \
@@ -1206,7 +1208,8 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 1:
             case 2:
                 if(rep==1) {INST_NAME("REPNZ CMPSB");} else {INST_NAME("REPZ CMPSB");}
-                SETFLAGS(X_ALL, SF_MAYSET);
+                MAYSETFLAGS();
+                SETFLAGS(X_ALL, SF_SET_PENDING);
                 CBZx_NEXT(xRCX);
                 TBNZ_MARK2(xFlags, F_DF);
                 MARK;   // Part with DF==0
@@ -1305,7 +1308,8 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 1:
             case 2:
                 if(rep==1) {INST_NAME("REPNZ SCASB");} else {INST_NAME("REPZ SCASB");}
-                SETFLAGS(X_ALL, SF_MAYSET);
+                MAYSETFLAGS();
+                SETFLAGS(X_ALL, SF_SET_PENDING);
                 CBZx_NEXT(xRCX);
                 UBFXw(x1, xRAX, 0, 8);
                 TBNZ_MARK2(xFlags, F_DF);
@@ -2035,11 +2039,11 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 i32 = dyn->insts[ninst].epilog-(dyn->native_size);      \
                 if(Z) {CBNZx(xRCX, i32);} else {CBZx(xRCX, i32);};      \
                 if(dyn->insts[ninst].x64.jmp_insts==-1) {               \
-                    if(!dyn->insts[ninst].x64.barrier)                  \
+                    if(!(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT))  \
                         fpu_purgecache(dyn, ninst, 1, x1, x2, x3);      \
                     jump_to_next(dyn, addr+i8, 0, ninst);               \
                 } else {                                                \
-                    fpuCacheTransform(dyn, ninst, x1, x2, x3);          \
+                    CacheTransform(dyn, ninst, cacheupd, x1, x2, x3);   \
                     i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);    \
                     Bcond(c__, i32);                                    \
                 }                                                       \
@@ -2176,7 +2180,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 jump_to_next(dyn, addr+i32, 0, ninst);
             } else {
                 // inside the block
-                fpuCacheTransform(dyn, ninst, x1, x2, x3);
+                CacheTransform(dyn, ninst, CHECK_CACHE(), x1, x2, x3);
                 tmp = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);
                 if(tmp==4) {
                     NOP;
diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c
index 24877ed8..51c9e247 100755
--- a/src/dynarec/arm64/dynarec_arm64_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_0f.c
@@ -72,6 +72,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
     uint8_t wback, wb2;

     uint8_t eb1, eb2;

     int32_t i32, i32_;

+    int cacheupd;

     int v0, v1;

     int q0, q1;

     int d0, d1;

@@ -88,6 +89,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
     MAYUSE(d1);

     MAYUSE(s0);

     MAYUSE(j64);

+    MAYUSE(cacheupd);

     #if STEP > 1

     static const int8_t mask_shift8[] = { -7, -6, -5, -4, -3, -2, -1, 0 };

     #endif

@@ -1034,11 +1036,11 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 i32 = dyn->insts[ninst].epilog-(dyn->native_size);      \

                 Bcond(NO, i32);                                         \

                 if(dyn->insts[ninst].x64.jmp_insts==-1) {               \

-                    if(!dyn->insts[ninst].x64.barrier)                  \

+                    if(!(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT))  \

                         fpu_purgecache(dyn, ninst, 1, x1, x2, x3);      \

                     jump_to_next(dyn, addr+i32_, 0, ninst);             \

                 } else {                                                \

-                    fpuCacheTransform(dyn, ninst, x1, x2, x3);          \

+                    CacheTransform(dyn, ninst, cacheupd, x1, x2, x3);   \

                     i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);    \

                     B(i32);                                             \

                 }                                                       \

diff --git a/src/dynarec/arm64/dynarec_arm64_67.c b/src/dynarec/arm64/dynarec_arm64_67.c
index 7157a5e8..17a89479 100755
--- a/src/dynarec/arm64/dynarec_arm64_67.c
+++ b/src/dynarec/arm64/dynarec_arm64_67.c
@@ -44,6 +44,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
     uint8_t u8;

     int32_t i32;

     int64_t j64, i64;

+    int cacheupd;

     int lock;

     int v0, v1, s0;

     MAYUSE(i32);

@@ -52,6 +53,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
     MAYUSE(v1);

     MAYUSE(s0);

     MAYUSE(lock);

+    MAYUSE(cacheupd);

 

     // REX prefix before the 67 are ignored

     rex.rex = 0;

@@ -762,11 +764,11 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 i32 = dyn->insts[ninst].epilog-(dyn->native_size);      \

                 Bcond(NO, i32);                                         \

                 if(dyn->insts[ninst].x64.jmp_insts==-1) {               \

-                    if(!dyn->insts[ninst].x64.barrier)                  \

+                    if(!(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT))  \

                         fpu_purgecache(dyn, ninst, 1, x1, x2, x3);      \

                     jump_to_next(dyn, addr+i8, 0, ninst);               \

                 } else {                                                \

-                    fpuCacheTransform(dyn, ninst, x1, x2, x3);          \

+                    CacheTransform(dyn, ninst, cacheupd, x1, x2, x3);   \

                     i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);\

                     B(i32);                                             \

                 }                                                       \

diff --git a/src/dynarec/arm64/dynarec_arm64_emit_math.c b/src/dynarec/arm64/dynarec_arm64_emit_math.c
index eb55a249..37952444 100755
--- a/src/dynarec/arm64/dynarec_arm64_emit_math.c
+++ b/src/dynarec/arm64/dynarec_arm64_emit_math.c
@@ -77,7 +77,7 @@ void emit_add32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
 void emit_add32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4, int s5)
 {
     MAYUSE(s5);
-    if(s1==xRSP && (!dyn->insts || dyn->insts[ninst].x64.need_flags==X_PEND))
+    if(s1==xRSP && (!dyn->insts || dyn->insts[ninst].x64.gen_flags==X_PEND))
     {
         // special case when doing math on ESP and only PEND is needed: ignoring it!
         if(c>=0 && c<0x1000) {
@@ -201,7 +201,7 @@ void emit_sub32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
 void emit_sub32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4, int s5)
 {
     MAYUSE(s5);
-    if(s1==xRSP && (!dyn->insts || dyn->insts[ninst].x64.need_flags==X_PEND))
+    if(s1==xRSP && (!dyn->insts || dyn->insts[ninst].x64.gen_flags==X_PEND))
     {
         // special case when doing math on RSP and only PEND is needed: ignoring it!
         if(c>=0 && c<0x1000) {
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index d51f5a3a..2c6ebb36 100755
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -631,7 +631,7 @@ int isCacheEmpty(dynarec_arm_t* dyn, int ninst) {
 
 }
 
-int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) {
+static int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) {
     int i2 = dyn->insts[ninst].x64.jmp_insts;
     if(i2<0)
         return 1;
@@ -678,6 +678,48 @@ int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) {
     return ret;
 }
 
+static int flagsCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) {
+    int jmp = dyn->insts[ninst].x64.jmp_insts;
+    if(jmp<0)
+        return 0;
+    if(dyn->insts[ninst].f_exit.dfnone)  // flags are fully known, nothing we can do more
+        return 0;
+/*    if((dyn->f.pending!=SF_SET)
+    && (dyn->f.pending!=SF_SET_PENDING)) {
+        if(dyn->f.pending!=SF_PENDING) {*/
+    switch (dyn->insts[jmp].f_entry.pending) {
+        case SF_UNKNOWN: return 0;
+        case SF_SET: 
+            if(dyn->insts[ninst].f_exit.pending!=SF_SET && dyn->insts[ninst].f_exit.pending!=SF_SET_PENDING) 
+                return 1; 
+            else 
+                return 0;
+        case SF_SET_PENDING:
+            if(dyn->insts[ninst].f_exit.pending!=SF_SET 
+            && dyn->insts[ninst].f_exit.pending!=SF_SET_PENDING
+            && dyn->insts[ninst].f_exit.pending!=SF_PENDING) 
+                return 1; 
+            else 
+                return 0;
+        case SF_PENDING:
+            if(dyn->insts[ninst].f_exit.pending!=SF_SET 
+            && dyn->insts[ninst].f_exit.pending!=SF_SET_PENDING
+            && dyn->insts[ninst].f_exit.pending!=SF_PENDING)
+                return 1;
+            else
+                return (dyn->insts[jmp].f_entry.dfnone  == dyn->insts[ninst].f_exit.dfnone)?0:1;
+    }
+    if(dyn->insts[jmp].f_entry.dfnone && !dyn->insts[ninst].f_exit.dfnone)
+        return 1;
+    return 0;
+}
+int CacheNeedsTransform(dynarec_arm_t* dyn, int ninst) {
+    int ret = 0;
+    if (fpuCacheNeedsTransform(dyn, ninst)) ret|=1;
+    if (flagsCacheNeedsTransform(dyn, ninst)) ret|=2;
+    return ret;
+}
+
 void neoncacheUnwind(neoncache_t* cache)
 {
     if(cache->swapped) {
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h
index 7183fd6d..f8d5d127 100755
--- a/src/dynarec/arm64/dynarec_arm64_functions.h
+++ b/src/dynarec/arm64/dynarec_arm64_functions.h
@@ -72,7 +72,7 @@ void neoncache_promote_double(dynarec_arm_t* dyn, int ninst, int a);
 int neoncache_combine_st(dynarec_arm_t* dyn, int ninst, int a, int b);  // with stack current dyn->n_stack*
 
 // FPU Cache transformation (for loops)
-int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int i1);
+int CacheNeedsTransform(dynarec_arm_t* dyn, int i1);
 
 // Undo the changes of a neoncache to get the status before the instruction
 void neoncacheUnwind(neoncache_t* cache);
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 961b0278..521b0604 100755
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -1511,7 +1511,7 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in
     cache->neoncache[i].v = 0;
 }
 
-void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
+static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
 {
 #if STEP > 1
     int i2 = dyn->insts[ninst].x64.jmp_insts;
@@ -1642,6 +1642,58 @@ void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
     MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
 #endif
 }
+static void flagsCacheTransform(dynarec_arm_t* dyn, int ninst, int s1)
+{
+#if STEP > 1
+    int j64;
+    int jmp = dyn->insts[ninst].x64.jmp_insts;
+    if(jmp<0)
+        return;
+    if(dyn->f.dfnone)  // flags are fully known, nothing we can do more
+        return;
+    MESSAGE(LOG_DUMP, "\tFlags fetch ---- ninst=%d -> %d\n", ninst, jmp);
+    int go = 0;
+    switch (dyn->insts[jmp].f_entry.pending) {
+        case SF_UNKNOWN: break;
+        case SF_SET: 
+            if(dyn->f.pending!=SF_SET && dyn->f.pending!=SF_SET_PENDING) 
+                go = 1; 
+            break;
+        case SF_SET_PENDING:
+            if(dyn->f.pending!=SF_SET 
+            && dyn->f.pending!=SF_SET_PENDING
+            && dyn->f.pending!=SF_PENDING) 
+                go = 1; 
+            break;
+        case SF_PENDING:
+            if(dyn->f.pending!=SF_SET 
+            && dyn->f.pending!=SF_SET_PENDING
+            && dyn->f.pending!=SF_PENDING)
+                go = 1;
+            else
+                go = (dyn->insts[jmp].f_entry.dfnone  == dyn->f.dfnone)?0:1;
+            break;
+    }
+    if(dyn->insts[jmp].f_entry.dfnone && !dyn->f.dfnone)
+        go = 1;
+    if(go) {
+        if(dyn->f.pending!=SF_PENDING) {
+            LDRw_U12(s1, xEmu, offsetof(x64emu_t, df));
+            j64 = (GETMARK3)-(dyn->native_size);
+            CBZw(s1, j64);
+        }
+        CALL_(UpdateFlags, -1, 0);
+        MARK3;
+    }
+#endif
+}
+
+void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3) {
+    if(cacheupd&1)
+        fpuCacheTransform(dyn, ninst, s1, s2, s3);
+    if(cacheupd&2)
+        flagsCacheTransform(dyn, ninst, s1);
+}
 
 #ifdef HAVE_TRACE
 void fpu_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 954ac5b8..ab14a8e6 100755
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -477,11 +477,11 @@
     j64 = GETMARKLOCK-(dyn->native_size);  \
     CBNZx(reg, j64)
 
-#define IFX(A)  if((dyn->insts[ninst].x64.need_flags&(A)))
-#define IFX_PENDOR0  if((dyn->insts[ninst].x64.need_flags&(X_PEND) || !dyn->insts[ninst].x64.need_flags))
-#define IFXX(A) if((dyn->insts[ninst].x64.need_flags==(A)))
-#define IFX2X(A, B) if((dyn->insts[ninst].x64.need_flags==(A) || dyn->insts[ninst].x64.need_flags==(B) || dyn->insts[ninst].x64.need_flags==((A)|(B))))
-#define IFXN(A, B)  if((dyn->insts[ninst].x64.need_flags&(A) && !(dyn->insts[ninst].x64.need_flags&(B))))
+#define IFX(A)  if((dyn->insts[ninst].x64.gen_flags&(A)))
+#define IFX_PENDOR0  if((dyn->insts[ninst].x64.gen_flags&(X_PEND) || !dyn->insts[ninst].x64.gen_flags))
+#define IFXX(A) if((dyn->insts[ninst].x64.gen_flags==(A)))
+#define IFX2X(A, B) if((dyn->insts[ninst].x64.gen_flags==(A) || dyn->insts[ninst].x64.gen_flags==(B) || dyn->insts[ninst].x64.gen_flags==((A)|(B))))
+#define IFXN(A, B)  if((dyn->insts[ninst].x64.gen_flags&(A) && !(dyn->insts[ninst].x64.gen_flags&(B))))
 
 // Generate FCOM with s1 and s2 scratch regs (the VCMP is already done)
 #define FCOM(s1, s2, s3)                                                    \
@@ -598,6 +598,10 @@
 #define SET_NODF()          dyn->f.dfnone = 0
 #define SET_DFOK()          dyn->f.dfnone = 1
 
+#ifndef MAYSETFLAGS
+#define MAYSETFLAGS()
+#endif
+
 #ifndef READFLAGS
 #define READFLAGS(A) \
     if(((A)!=X_PEND && dyn->f.pending!=SF_SET)          \
@@ -613,23 +617,21 @@
         SET_DFOK();                                     \
     }
 #endif
-// SF_MAYSET doesn't change the flags status cache
-// it also doesn't consume any needed flags
+
 #ifndef SETFLAGS
 #define SETFLAGS(A, B)                                                                          \
     if(dyn->f.pending!=SF_SET                                                                   \
     && ((B)&SF_SUB)                                                                             \
-    && (dyn->insts[ninst].x64.need_flags&(~(A))))                                               \
-        READFLAGS(((dyn->insts[ninst].x64.need_flags&X_PEND)?X_ALL:dyn->insts[ninst].x64.need_flags)&(~(A)));\
-    if(dyn->insts[ninst].x64.need_flags) switch(B) {                                            \
+    && (dyn->insts[ninst].x64.gen_flags&(~(A))))                                                \
+        READFLAGS(((dyn->insts[ninst].x64.gen_flags&X_PEND)?X_ALL:dyn->insts[ninst].x64.gen_flags)&(~(A)));\
+    if(dyn->insts[ninst].x64.gen_flags) switch(B) {                                             \
         case SF_SUBSET:                                                                         \
         case SF_SET: dyn->f.pending = SF_SET; break;                                            \
         case SF_PENDING: dyn->f.pending = SF_PENDING; break;                                    \
         case SF_SUBSET_PENDING:                                                                 \
         case SF_SET_PENDING:                                                                    \
-            dyn->f.pending = (dyn->insts[ninst].x64.need_flags&X_PEND)?SF_SET_PENDING:SF_SET;   \
+            dyn->f.pending = (dyn->insts[ninst].x64.gen_flags&X_PEND)?SF_SET_PENDING:SF_SET;    \
             break;                                                                              \
-        case SF_MAYSET: break;                                                                  \
     } else dyn->f.pending = SF_SET
 #endif
 #ifndef JUMP
@@ -641,12 +643,12 @@
 #ifndef BARRIER_NEXT
 #define BARRIER_NEXT(A)
 #endif
-#define UFLAG_OP1(A) if(dyn->insts[ninst].x64.need_flags) {STRxw_U12(A, xEmu, offsetof(x64emu_t, op1));}
-#define UFLAG_OP2(A) if(dyn->insts[ninst].x64.need_flags) {STRxw_U12(A, xEmu, offsetof(x64emu_t, op2));}
-#define UFLAG_OP12(A1, A2) if(dyn->insts[ninst].x64.need_flags) {STRxw_U12(A1, xEmu, offsetof(x64emu_t, op1));STRxw_U12(A2, 0, offsetof(x64emu_t, op2));}
-#define UFLAG_RES(A) if(dyn->insts[ninst].x64.need_flags) {STRxw_U12(A, xEmu, offsetof(x64emu_t, res));}
-#define UFLAG_DF(r, A) if(dyn->insts[ninst].x64.need_flags) {SET_DF(r, A)}
-#define UFLAG_IF if(dyn->insts[ninst].x64.need_flags)
+#define UFLAG_OP1(A) if(dyn->insts[ninst].x64.gen_flags) {STRxw_U12(A, xEmu, offsetof(x64emu_t, op1));}
+#define UFLAG_OP2(A) if(dyn->insts[ninst].x64.gen_flags) {STRxw_U12(A, xEmu, offsetof(x64emu_t, op2));}
+#define UFLAG_OP12(A1, A2) if(dyn->insts[ninst].x64.gen_flags) {STRxw_U12(A1, xEmu, offsetof(x64emu_t, op1));STRxw_U12(A2, 0, offsetof(x64emu_t, op2));}
+#define UFLAG_RES(A) if(dyn->insts[ninst].x64.gen_flags) {STRxw_U12(A, xEmu, offsetof(x64emu_t, res));}
+#define UFLAG_DF(r, A) if(dyn->insts[ninst].x64.gen_flags) {SET_DF(r, A)}
+#define UFLAG_IF if(dyn->insts[ninst].x64.gen_flags)
 #ifndef DEFAULT
 #define DEFAULT      *ok = -1; BARRIER(2)
 #endif
@@ -850,7 +852,7 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr);
 #define fpu_reflectcache STEPNAME(fpu_reflectcache)
 #endif
 
-#define fpuCacheTransform       STEPNAME(fpuCacheTransform)
+#define CacheTransform       STEPNAME(CacheTransform)
 
 /* setup r2 to address pointed by */
 uintptr_t geted(dynarec_arm_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int absmax, uint32_t mask, rex_t rex, int* l, int s, int delta);
@@ -977,12 +979,12 @@ void x87_restoreround(dynarec_arm_t* dyn, int ninst, int s1);
 // Set rounding according to mxcsr flags, return reg to restore flags
 int sse_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
 
-void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
+void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3);
 
 #if STEP < 2
 #define CHECK_CACHE()   0
 #else
-#define CHECK_CACHE()   fpuCacheNeedsTransform(dyn, ninst)
+#define CHECK_CACHE()   (cacheupd = CacheNeedsTransform(dyn, ninst))
 #endif
 
 #define neoncache_st_coherency STEPNAME(neoncache_st_coherency)
diff --git a/src/dynarec/arm64/dynarec_arm64_pass0.h b/src/dynarec/arm64/dynarec_arm64_pass0.h
index 23cbcc93..61eb1bcd 100755
--- a/src/dynarec/arm64/dynarec_arm64_pass0.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass0.h
@@ -6,16 +6,15 @@
     if(ninst) dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr
 
 #define MESSAGE(A, ...)  
+#define MAYSETFLAGS()   dyn->insts[ninst].x64.may_set = 1
 #define READFLAGS(A)    \
         dyn->insts[ninst].x64.use_flags = A; dyn->f.dfnone = 1;\
         dyn->f.pending=SF_SET
 #define SETFLAGS(A,B)   \
-        dyn->insts[ninst].x64.set_flags = A;            \
-        if(B!=SF_MAYSET) {                              \
-                dyn->insts[ninst].x64.state_flags = B;  \
-                dyn->f.pending=(B)&SF_SET_PENDING;      \
-                dyn->f.dfnone=((B)&SF_SET)?1:0;         \
-        }
+        dyn->insts[ninst].x64.set_flags = A;    \
+        dyn->insts[ninst].x64.state_flags = B;  \
+        dyn->f.pending=(B)&SF_SET_PENDING;      \
+        dyn->f.dfnone=((B)&SF_SET)?1:0;
 #define EMIT(A)     
 #define JUMP(A, C)         add_next(dyn, (uintptr_t)A); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C
 #define BARRIER(A)      if(A!=BARRIER_MAYBE) {fpu_purgecache(dyn, ninst, 0, x1, x2, x3); dyn->insts[ninst].x64.barrier = A;} else dyn->insts[ninst].barrier_maybe = 1
diff --git a/src/dynarec/arm64/dynarec_arm64_pass3.h b/src/dynarec/arm64/dynarec_arm64_pass3.h
index 099ef09a..d1d8fbba 100755
--- a/src/dynarec/arm64/dynarec_arm64_pass3.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass3.h
@@ -19,7 +19,7 @@
 #define INST_NAME(name) \
     if(box64_dynarec_dump) {\
         printf_x64_instruction(my_context->dec, &dyn->insts[ninst].x64, name); \
-        dynarec_log(LOG_NONE, "%s%p: %d emited opcodes, inst=%d, barrier=%d state=%d/%d(%d), set=%X, use=%X, need=%X", \
+        dynarec_log(LOG_NONE, "%s%p: %d emited opcodes, inst=%d, barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X", \
             (box64_dynarec_dump>1)?"\e[32m":"", \
             (void*)(dyn->native_start+dyn->insts[ninst].address),  \
             dyn->insts[ninst].size/4,           \
@@ -28,9 +28,12 @@
             dyn->insts[ninst].x64.state_flags,  \
             dyn->f.pending,                     \
             dyn->f.dfnone,                      \
+            dyn->insts[ninst].x64.may_set?"may":"set",              \
             dyn->insts[ninst].x64.set_flags,    \
+            dyn->insts[ninst].x64.gen_flags,    \
             dyn->insts[ninst].x64.use_flags,    \
-            dyn->insts[ninst].x64.need_flags);  \
+            dyn->insts[ninst].x64.need_before,  \
+            dyn->insts[ninst].x64.need_after);  \
         if(dyn->insts[ninst].pred_sz) {         \
             dynarec_log(LOG_NONE, ", pred=");   \
             for(int ii=0; ii<dyn->insts[ninst].pred_sz; ++ii)\
diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index c2a752b1..bbd4bcb7 100755
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -328,59 +328,51 @@ static void fillPredecessors(dynarec_native_t* dyn)
 
 }
 
-static void updateNeed(dynarec_native_t* dyn, int ninst, uint32_t need) {
-    uint32_t old_need = dyn->insts[ninst].x64.need_flags;
-    uint32_t new_need = old_need | need;
-    uint32_t new_use = dyn->insts[ninst].x64.use_flags;
-    uint32_t old_use = dyn->insts[ninst].x64.old_use;
-
-    if((new_need&X_PEND) && dyn->insts[ninst].x64.state_flags==SF_SUBSET) {
-        new_need &=~X_PEND;
-        new_need |= X_ALL;
-    } else if((new_need&X_PEND) && dyn->insts[ninst].x64.state_flags==SF_SUBSET_PENDING) {
-        new_need |= X_ALL&~dyn->insts[ninst].x64.set_flags;
-    }
-
-
-    uint32_t new_set = 0;
-    if(dyn->insts[ninst].x64.state_flags & SF_SET)
-        new_set = dyn->insts[ninst].x64.set_flags;
-    if(dyn->insts[ninst].x64.state_flags & SF_PENDING)
-        new_set |= X_PEND;
-    if((new_need&X_PEND) && (
-        dyn->insts[ninst].x64.state_flags==SF_SET || dyn->insts[ninst].x64.state_flags==SF_SUBSET)) {
-        new_need &=~X_PEND;
-        new_need |=X_ALL;
-    }
-    
-    dyn->insts[ninst].x64.need_flags = new_need;
-    dyn->insts[ninst].x64.old_use = new_use;
-
-    if(dyn->insts[ninst].x64.jmp_insts==-1)
-        new_need |= X_PEND;
-
-    if((new_need == old_need) && (new_use == old_use))    // no changes, bye
-        return;
-    
-    new_need &=~new_set;    // clean needed flag that were suplied
-    new_need |= new_use;    // new need
-    // a Flag Barrier will change all need to "Pending", as it clear all flags optimisation
-    if(new_need && dyn->insts[ninst].x64.barrier&BARRIER_FLAGS)
-        new_need = X_PEND;
-    
-    if((new_need == (X_ALL|X_PEND)) && (dyn->insts[ninst].x64.state_flags & SF_SET))
-        new_need = X_ALL;
-
-    //update need to new need on predecessor
-    for(int i=0; i<dyn->insts[ninst].pred_sz; ++i)
-        updateNeed(dyn, dyn->insts[ninst].pred[i], new_need);
-}
-
-static void resetNeed(dynarec_native_t* dyn) {
-    for(int i = dyn->size; i-- > 0;) {
-        dyn->insts[i].x64.old_use = 0;
-        dyn->insts[i].x64.need_flags = dyn->insts[i].x64.default_need;
+// updateNeed goes backward, from last intruction to top
+static int updateNeed(dynarec_arm_t* dyn, int ninst, uint8_t need) {
+    while (ninst>=0) {
+        // need pending but instruction is only a subset: remove pend and use an X_ALL instead
+        need |= dyn->insts[ninst].x64.need_after;
+        if((need&X_PEND) && (dyn->insts[ninst].x64.state_flags==SF_SUBSET)) {
+            need &=~X_PEND;
+            need |= X_ALL;
+        }
+        if((need&X_PEND) && (dyn->insts[ninst].x64.state_flags==SF_SET)) {
+            need &=~X_PEND;
+            need |= dyn->insts[ninst].x64.set_flags;    // SF_SET will compute all flags, it's not SUBSET!
+        }
+        if((need&X_PEND) && dyn->insts[ninst].x64.state_flags==SF_SUBSET_PENDING) {
+            need |= X_ALL&~(dyn->insts[ninst].x64.set_flags);
+        }
+        dyn->insts[ninst].x64.gen_flags = need&dyn->insts[ninst].x64.set_flags;
+        if((need&X_PEND) && (dyn->insts[ninst].x64.state_flags&SF_PENDING))
+            dyn->insts[ninst].x64.gen_flags |= X_PEND;
+        dyn->insts[ninst].x64.need_after = need;
+        need = dyn->insts[ninst].x64.need_after&~dyn->insts[ninst].x64.gen_flags;
+        if(dyn->insts[ninst].x64.may_set)
+            need |= dyn->insts[ninst].x64.gen_flags;    // forward the flags
+        // Consume X_PEND if relevant
+        if((need&X_PEND) && (dyn->insts[ninst].x64.set_flags&SF_PENDING))
+            need &=~X_PEND;
+        need |= dyn->insts[ninst].x64.use_flags;
+        if(dyn->insts[ninst].x64.need_before == need)
+            return ninst - 1;
+        dyn->insts[ninst].x64.need_before = need;
+        if(dyn->insts[ninst].x64.barrier&BARRIER_FLAGS) {
+            need = need?X_PEND:0;
+        }
+        int ok = 0;
+        for(int i=0; i<dyn->insts[ninst].pred_sz; ++i) {
+            if(dyn->insts[ninst].pred[i] == ninst-1)
+                ok = 1;
+            else
+                updateNeed(dyn, dyn->insts[ninst].pred[i], need);
+        }
+        if(!ok)
+            return ninst - 1;
+        --ninst;
     }
+    return ninst;
 }
 
 __thread void* current_helper = NULL;
@@ -446,17 +438,13 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr) {
         protectDB(addr, end-addr);  //end is 1byte after actual end
     // compute hash signature
     uint32_t hash = X31_hash_code((void*)addr, end-addr);
-    // Compute flag_need, without current barriers
-    resetNeed(&helper);
-    for(int i = helper.size; i-- > 0;)
-        updateNeed(&helper, i, 0);
     // calculate barriers
     for(int i=0; i<helper.size; ++i)
         if(helper.insts[i].x64.jmp) {
             uintptr_t j = helper.insts[i].x64.jmp;
             if(j<start || j>=end) {
                 helper.insts[i].x64.jmp_insts = -1;
-                helper.insts[i].x64.use_flags |= X_PEND;
+                helper.insts[i].x64.need_after |= X_PEND;
             } else {
                 // find jump address instruction
                 int k=-1;
@@ -472,7 +460,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr) {
     // fill predecessors with the jump address
     fillPredecessors(&helper);
     // check for the optionnal barriers now
-    for(int i=helper.size-1; i>=0; --i) {
+    /*for(int i=helper.size-1; i>=0; --i) {
         if(helper.insts[i].barrier_maybe) {
             // out-of-block jump
             if(helper.insts[i].x64.jmp_insts == -1) {
@@ -492,7 +480,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr) {
                 }
             }
     	}
-    }
+    }*/
     // check to remove useless barrier, in case of jump when destination doesn't needs flags
     /*for(int i=helper.size-1; i>=0; --i) {
         int k;
@@ -509,27 +497,9 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr) {
              }
         }
     }*/
-    // reset need_flags and compute again, now taking barrier into account (because barrier change use_flags)
-    for(int i = helper.size; i-- > 0;) {
-        int k;
-        if(helper.insts[i].x64.jmp 
-        && ((k=helper.insts[i].x64.jmp_insts)>=0)
-        ) {
-            if(helper.insts[k].x64.barrier&BARRIER_FLAGS)
-                // jumpto barrier
-                helper.insts[i].x64.use_flags |= X_PEND;
-            if(helper.insts[i].x64.barrier&BARRIER_FLAGS && (helper.insts[k].x64.need_flags | helper.insts[k].x64.use_flags))
-                helper.insts[k].x64.barrier|=BARRIER_FLAGS;
-            else
-                helper.insts[i].x64.use_flags |= (helper.insts[k].x64.need_flags | helper.insts[k].x64.use_flags);
-        }
-        if(helper.insts[i].x64.barrier&BARRIER_FLAGS && !(helper.insts[i].x64.set_flags&SF_PENDING))
-            // immediate barrier
-            helper.insts[i].x64.use_flags |= X_PEND;
-    }
-    resetNeed(&helper);
-    for(int i = helper.size; i-- > 0;)
-        updateNeed(&helper, i, 0);
+    int pos = helper.size;
+    while (pos>=0)
+        pos = updateNeed(&helper, pos, 0);
 
     // pass 1, float optimisations, first pass for flags
     native_pass1(&helper, addr);
diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index d0211197..b095889a 100755
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -30,6 +30,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr)
 {
     int ok = 1;
     int ninst = 0;
+    int j64;
     uintptr_t ip = addr;
     uintptr_t init_addr = addr;
     rex_t rex;
@@ -123,6 +124,9 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr)
         dyn->n.swapped = 0;
         NEW_INST;
         fpu_reset_scratch(dyn);
+        if((dyn->insts[ninst].x64.need_before&~X_PEND) && !dyn->insts[ninst].pred_sz) {
+            READFLAGS(dyn->insts[ninst].x64.need_before&~X_PEND);
+        }
 #ifdef HAVE_TRACE
         if(my_context->dec && box64_dynarec_trace) {
         if((trace_end == 0) 
@@ -226,10 +230,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr)
                 BARRIER(BARRIER_FLOAT);
             }
             #if STEP == 0
-            if(dyn->insts[ninst].x64.set_flags)
-                dyn->insts[ninst].x64.default_need |= X_PEND;
-            else
-                dyn->insts[ninst].x64.use_flags |= X_PEND;
+            dyn->insts[ninst].x64.need_after |= X_PEND;
             #endif
             ++ninst;
             fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
diff --git a/src/dynarec/dynarec_private.h b/src/dynarec/dynarec_private.h
index 71966bdf..0ab21df4 100755
--- a/src/dynarec/dynarec_private.h
+++ b/src/dynarec/dynarec_private.h
@@ -24,7 +24,6 @@
 #define SF_SUB      4
 #define SF_SUBSET   (SF_SUB|SF_SET)
 #define SF_SUBSET_PENDING   (SF_SUBSET|SF_PENDING)
-#define SF_MAYSET   8
 
 typedef struct instruction_x64_s {
     uintptr_t   addr;       //address of the instruction
@@ -37,9 +36,10 @@ typedef struct instruction_x64_s {
     uint8_t     state_flags;// One of SF_XXX state
     uint8_t     use_flags;  // 0 or combination of X_?F
     uint8_t     set_flags;  // 0 or combination of X_?F
-    uint8_t     default_need;// 0 or X_PEND basically
-    uint8_t     need_flags; // calculated
-    uint8_t     old_use;    // calculated
+    uint8_t     may_set;    // 1 if the flags may not be set
+    uint8_t     gen_flags;  // calculated
+    uint8_t     need_before;// calculated
+    uint8_t     need_after; // calculated
 } instruction_x64_t;
 
 void printf_x64_instruction(zydis_dec_t* dec, instruction_x64_t* inst, const char* name);