about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author ptitSeb <sebastien.chev@gmail.com> 2024-04-24 15:34:54 +0200
committer ptitSeb <sebastien.chev@gmail.com> 2024-04-24 15:34:54 +0200
commit 2a79b604546769e600600f3d85a684641b0bca28 (patch)
tree e061157fd647be5248117361a21db702d5026791 /src
parent db32e498790a13f3dc0a78748d47530cf8404015 (diff)
download box64-2a79b604546769e600600f3d85a684641b0bca28.tar.gz
box64-2a79b604546769e600600f3d85a684641b0bca28.zip
Changed x87 way of handling FFREE opcode ([DYNAREC] too, improving x87 robustness overall)
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_00.c 9
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_d9.c 66
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_functions.c 29
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_functions.h 3
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.c 142
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.h 36
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_pass0.h 2
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_private.h 4
-rw-r--r-- src/dynarec/dynarec_native.c 10
-rw-r--r-- src/dynarec/dynarec_native_pass.c 9
-rw-r--r-- src/dynarec/la64/dynarec_la64_00.c 9
-rw-r--r-- src/dynarec/la64/dynarec_la64_pass0.h 2
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_00_3.c 9
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_functions.c 20
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_functions.h 3
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.c 206
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.h 33
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_pass0.h 2
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_private.h 3
-rw-r--r-- src/emu/x64emu.c 4
-rw-r--r-- src/emu/x64emu_private.h 2
-rw-r--r-- src/emu/x64rund9.c 5
-rw-r--r-- src/emu/x64rundd.c 2
-rw-r--r-- src/emu/x64test.c 6
-rw-r--r-- src/emu/x87emu_private.c 36
-rw-r--r-- src/emu/x87emu_private.h 17
-rw-r--r-- src/include/regs.h 4
27 files changed, 428 insertions, 245 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c
index 1b5915b8..616f953c 100644
--- a/src/dynarec/arm64/dynarec_arm64_00.c
+++ b/src/dynarec/arm64/dynarec_arm64_00.c
@@ -2161,7 +2161,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             if(box64_dynarec_safeflags) {
                 READFLAGS(X_PEND);  // lets play safe here too
             }
-            BARRIER(BARRIER_FLOAT);
+            fpu_purgecache(dyn, ninst, 1, x1, x2, x3);  // using next, even if there no next
             i32 = F16;
             retn_to_epilog(dyn, ninst, rex, i32);
             *need_epilog = 0;
@@ -2173,7 +2173,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             if(box64_dynarec_safeflags) {
                 READFLAGS(X_PEND);  // so instead, force the deferred flags, so it's not too slow, and flags are not lost
             }
-            BARRIER(BARRIER_FLOAT);
+            fpu_purgecache(dyn, ninst, 1, x1, x2, x3);  // using next, even if there no next
             ret_to_epilog(dyn, ninst, rex);
             *need_epilog = 0;
             *ok = 0;
@@ -3041,19 +3041,20 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         SETFLAGS(X_ALL, SF_SET_NODF);    // Hack to set flags to "dont'care" state
                     }
                     // regular call
-                    if(box64_dynarec_callret && box64_dynarec_bigblock>1) {
+                    /*if(box64_dynarec_callret && box64_dynarec_bigblock>1) {
                         BARRIER(BARRIER_FULL);
                         BARRIER_NEXT(BARRIER_FULL);
                     } else {
                         BARRIER(BARRIER_FLOAT);
                         *need_epilog = 0;
                         *ok = 0;
-                    }
+                    }*/
                     if(rex.is32bits) {
                         MOV32w(x2, addr);
                     } else {
                         TABLE64(x2, addr);
                     }
+                    fpu_purgecache(dyn, ninst, 1, x1, x3, x4);
                     PUSH1z(x2);
                     if(box64_dynarec_callret) {
                         SET_HASCALLRET();
diff --git a/src/dynarec/arm64/dynarec_arm64_d9.c b/src/dynarec/arm64/dynarec_arm64_d9.c
index e12ecf72..5690e6be 100644
--- a/src/dynarec/arm64/dynarec_arm64_d9.c
+++ b/src/dynarec/arm64/dynarec_arm64_d9.c
@@ -136,40 +136,44 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             i1 = x87_get_current_cache(dyn, ninst, 0, NEON_CACHE_ST_D);
             // value put in x14
             if(i1==-1) {
-                // not in cache, so check Empty status and load it
-                i2 = -dyn->n.x87stack;
-                LDRw_U12(x3, xEmu, offsetof(x64emu_t, fpu_stack));
-                if(i2) {
-                    if(i2<0) {
-                        ADDw_U12(x3, x3, -i2);
-                    } else {
-                        SUBw_U12(x3, x3, i2);
+                if(fpu_is_st_freed(dyn, ninst, 0)) {
+                    MOV32w(x4, 0b100000100000000);
+                    B_MARK3_nocond;
+                } else {
+                    // not in cache, so check Empty status and load it
+                    i2 = -dyn->n.x87stack;
+                    LDRw_U12(x3, xEmu, offsetof(x64emu_t, fpu_stack));
+                    if(i2) {
+                        if(i2<0) {
+                            ADDw_U12(x3, x3, -i2);
+                        } else {
+                            SUBw_U12(x3, x3, i2);
+                        }
                     }
-                }
-                CMPSw_U12(x3, 0);
-                MOV32w(x3, 0b100000100000000);
-                CSELx(x4, x3, x4, cLE); // empty: C3,C2,C0 = 101
-                B_MARK3(cLE);
-                // x4 will be the actual top
-                LDRw_U12(x4, xEmu, offsetof(x64emu_t, top));
-                if(i2) {
-                    if(i2<0) {
-                        SUBw_U12(x4, x4, -i2);
-                    } else {
-                        ADDw_U12(x4, x4, i2);
+                    CMPSw_U12(x3, 0);
+                    MOV32w(x3, 0b100000100000000);
+                    CSELx(x4, x3, x4, cLE); // empty: C3,C2,C0 = 101
+                    B_MARK3(cLE);
+                    // x4 will be the actual top
+                    LDRw_U12(x4, xEmu, offsetof(x64emu_t, top));
+                    if(i2) {
+                        if(i2<0) {
+                            SUBw_U12(x4, x4, -i2);
+                        } else {
+                            ADDw_U12(x4, x4, i2);
+                        }
+                        ANDw_mask(x4, x4, 0, 3);    // (emu->top + i)&7
                     }
-                    ANDw_mask(x4, x4, 0, 3);    // (emu->top + i)&7
+                    // load tag
+                    LDRH_U12(x3, xEmu, offsetof(x64emu_t, fpu_tags));
+                    TSTw_mask(x3, 0, 1);    // 0b11
+                    MOV32w(x3, 0b100000100000000);
+                    CSELx(x4, x3, x4, cNE); // empty: C3,C2,C0 = 101
+                    B_MARK3(cNE);
+                    // load x2 with ST0 anyway, for sign extraction
+                    ADDx_REG_LSL(x1, xEmu, x4, 3);
+                    LDRx_U12(x2, x1, offsetof(x64emu_t, x87));
                 }
-                // load tag
-                ADDx_U12(x1, xEmu, offsetof(x64emu_t, p_regs));
-                LDRw_REG_LSL2(x3, x1, x4);
-                CMPSw_U12(x3, 0b11);    // empty
-                MOV32w(x3, 0b100000100000000);
-                CSELx(x4, x3, x4, cEQ); // empty: C3,C2,C0 = 101
-                B_MARK3(cEQ);
-                // load x2 with ST0 anyway, for sign extraction
-                ADDx_REG_LSL(x1, xEmu, x4, 3);
-                LDRx_U12(x2, x1, offsetof(x64emu_t, x87));
             } else {
                 // simply move from cache reg to x2
                 v1 = dyn->n.x87reg[i1];
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index 73946f88..f62ade0d 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -389,6 +389,7 @@ void neoncacheUnwind(neoncache_t* cache)
         // unswap
         int a = -1;
         int b = -1;
+        // in neoncache
         for(int j=0; j<24 && ((a==-1) || (b==-1)); ++j)
             if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F || cache->neoncache[j].t == NEON_CACHE_ST_I64)) {
                 if(cache->neoncache[j].n == cache->combined1)
@@ -401,11 +402,12 @@ void neoncacheUnwind(neoncache_t* cache)
             cache->neoncache[a].n = cache->neoncache[b].n;
             cache->neoncache[b].n = tmp;
         }
+        // done
         cache->swapped = 0;
         cache->combined1 = cache->combined2 = 0;
     }
     if(cache->news) {
-        // reove the newly created neoncache
+        // remove the newly created neoncache
         for(int i=0; i<24; ++i)
             if(cache->news&(1<<i))
                 cache->neoncache[i].v = 0;
@@ -422,11 +424,23 @@ void neoncacheUnwind(neoncache_t* cache)
             }
         }
         cache->x87stack-=cache->stack_push;
+        cache->tags>>=(cache->stack_push*2);
         cache->stack-=cache->stack_push;
+        if(cache->pushed>=cache->stack_push)
+            cache->pushed-=cache->stack_push;
+        else
+            cache->pushed = 0;
         cache->stack_push = 0;
     }
     cache->x87stack+=cache->stack_pop;
     cache->stack_next = cache->stack;
+    if(cache->stack_pop) {
+        if(cache->poped>=cache->stack_pop)
+            cache->poped-=cache->stack_pop;
+        else
+            cache->poped = 0;
+        cache->tags<<=(cache->stack_pop*2);
+    }
     cache->stack_pop = 0;
     cache->barrier = 0;
     // And now, rebuild the x87cache info with neoncache
@@ -594,10 +608,9 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode)
 
 static void x87_reset(neoncache_t* n)
 {
-    for (int i=0; i<8; ++i) {
+    for (int i=0; i<8; ++i)
         n->x87cache[i] = -1;
-        n->freed[i] = -1;
-    }
+    n->tags = 0;
     n->x87stack = 0;
     n->stack = 0;
     n->stack_next = 0;
@@ -606,6 +619,9 @@ static void x87_reset(neoncache_t* n)
     n->combined1 = n->combined2 = 0;
     n->swapped = 0;
     n->barrier = 0;
+    n->pushed = 0;
+    n->poped = 0;
+
     for(int i=0; i<24; ++i)
         if(n->neoncache[i].t == NEON_CACHE_ST_F
          || n->neoncache[i].t == NEON_CACHE_ST_D
@@ -641,3 +657,8 @@ void fpu_reset_ninst(dynarec_arm_t* dyn, int ninst)
     sse_reset(&dyn->insts[ninst].n);
     fpu_reset_reg_neoncache(&dyn->insts[ninst].n);
 }
+
+int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st)
+{
+    return (dyn->n.tags&(0b11<<(st*2)))?1:0;
+}
\ No newline at end of file
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h
index 56039889..abe827bb 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.h
+++ b/src/dynarec/arm64/dynarec_arm64_functions.h
@@ -58,4 +58,7 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode);
 // reset the cache
 void fpu_reset(dynarec_native_t* dyn);
 void fpu_reset_ninst(dynarec_native_t* dyn, int ninst);
+
+// is st freed
+int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st);
 #endif //__DYNAREC_ARM_FUNCTIONS_H__
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index b8eb06f2..e3c2104f 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -941,6 +941,9 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t)
     dyn->n.stack+=1;
     dyn->n.stack_next+=1;
     dyn->n.stack_push+=1;
+    ++dyn->n.pushed;
+    if(dyn->n.poped)
+        --dyn->n.poped;
     // move all regs in cache, and find a free one
     for(int j=0; j<24; ++j)
         if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D)
@@ -948,9 +951,8 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t)
          ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_I64))
             ++dyn->n.neoncache[j].n;
     int ret = -1;
+    dyn->n.tags<<=2;
     for(int i=0; i<8; ++i) {
-        if(dyn->n.freed[i]!=-1)
-            ++dyn->n.freed[i];
         if(dyn->n.x87cache[i]!=-1)
             ++dyn->n.x87cache[i];
         else if(ret==-1) {
@@ -973,6 +975,9 @@ void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1)
     dyn->n.stack+=1;
     dyn->n.stack_next+=1;
     dyn->n.stack_push+=1;
+    ++dyn->n.pushed;
+    if(dyn->n.poped)
+        --dyn->n.poped;
     // move all regs in cache
     for(int j=0; j<24; ++j)
         if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D)
@@ -980,9 +985,8 @@ void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1)
          ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_I64))
             ++dyn->n.neoncache[j].n;
     int ret = -1;
+    dyn->n.tags<<=2;
     for(int i=0; i<8; ++i) {
-        if(dyn->n.freed[i]!=-1)
-            ++dyn->n.freed[i];
         if(dyn->n.x87cache[i]!=-1)
             ++dyn->n.x87cache[i];
         else if(ret==-1)
@@ -1008,17 +1012,11 @@ void static internal_x87_dopop(dynarec_arm_t* dyn)
 }
 int static internal_x87_dofree(dynarec_arm_t* dyn)
 {
-    int ret = 0;
-    for(int i=0; i<8; ++i)
-        if(dyn->n.freed[i]!=-1) {
-            --dyn->n.freed[i];
-            if(dyn->n.freed[i]<=0) {
-                MESSAGE(LOG_DUMP, "\t--------x87 FREED ST0, poping 1 more\n");
-                dyn->n.freed[i] = -1;
-                ret = 1;
-            }
-        }
-    return ret;
+    if(dyn->n.tags&0b11) {
+        MESSAGE(LOG_DUMP, "\t--------x87 FREED ST0, poping 1 more\n");
+        return 1;
+    }
+    return 0;
 }
 void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1)
 {
@@ -1028,6 +1026,10 @@ void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1)
         dyn->n.x87stack-=1;
         dyn->n.stack_next-=1;
         dyn->n.stack_pop+=1;
+        dyn->n.tags>>=2;
+        ++dyn->n.poped;
+        if(dyn->n.pushed)
+            --dyn->n.pushed;
         // move all regs in cache, poping ST0
         internal_x87_dopop(dyn);
     } while(internal_x87_dofree(dyn));
@@ -1051,8 +1053,9 @@ void x87_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int
     int a = dyn->n.x87stack;
     if(a!=0) {
         // reset x87stack
-        if(!next)
+        if(!next) {
             dyn->n.x87stack = 0;
+        }
         // Add x87stack to emu fpu_stack
         LDRw_U12(s2, xEmu, offsetof(x64emu_t, fpu_stack));
         if(a>0) {
@@ -1063,31 +1066,33 @@ void x87_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int
         STRw_U12(s2, xEmu, offsetof(x64emu_t, fpu_stack));
         // Sub x87stack to top, with and 7
         LDRw_U12(s2, xEmu, offsetof(x64emu_t, top));
-        // update tags (and top at the same time)
         if(a>0) {
-            // new tag to fulls
-            MOVZw(s3, 0);
-            ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs));
-            for (int i=0; i<a; ++i) {
-                SUBw_U12(s2, s2, 1);
-                ANDw_mask(s2, s2, 0, 2); //mask=7    // (emu->top + st)&7
-                if(x87_is_stcached(dyn, i)) // to handle ffree
-                    STRw_REG_LSL2(s3, s1, s2);
-            }
+            SUBw_U12(s2, s2, a);
         } else {
-            // empty tags
-            MOVZw(s3, 0b11);
-            ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs));
-            for (int i=0; i<-a; ++i) {
-                STRw_REG_LSL2(s3, s1, s2);
-                ADDw_U12(s2, s2, 1);
-                ANDw_mask(s2, s2, 0, 2); //mask=7    // (emu->top + st)&7
-            }
+            ADDw_U12(s2, s2, -a);
         }
+        ANDw_mask(s2, s2, 0, 2);
         STRw_U12(s2, xEmu, offsetof(x64emu_t, top));
+        // update tags (and top at the same time)
+        LDRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+        if(a>0) {
+            LSLw_IMM(s1, s1, a*2);
+        } else {
+            ORRw_mask(s1, s1, 0b010000, 0b001111);  // 0xffff0000
+            LSRw_IMM(s1, s1, -a*2);
+        }
+        STRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags));
     } else {
         LDRw_U12(s2, xEmu, offsetof(x64emu_t, top));
     }
+    // check if free is used
+    if(dyn->n.tags) {
+        LDRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+        MOV32w(s3, dyn->n.tags);
+        ORRw_REG(s1, s1, s3);
+        STRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+    }
+
     if(ret!=0) {
         // --- set values
         // prepare offset to fpu => s1
@@ -1133,11 +1138,13 @@ void x87_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int
     }
     if(!next) {
         dyn->n.stack_next = 0;
-        for(int i=0; i<8; ++i)
-            dyn->n.freed[i] = -1;
+        dyn->n.tags = 0;
         #if STEP < 2
         // refresh the cached valued, in case it's a purge outside a instruction
         dyn->insts[ninst].n.barrier = 1;
+        dyn->n.pushed = 0;
+        dyn->n.poped = 0;
+
         #endif
     }
     MESSAGE(LOG_DUMP, "\t---Purge x87 Cache and Synch Stackcount\n");
@@ -1165,6 +1172,15 @@ static void x87_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
         }
         ANDw_mask(s2, s2, 0, 2);  //mask=7
         STRw_U12(s2, xEmu, offsetof(x64emu_t, top));
+        // update tags
+        LDRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+        if(a>0) {
+            LSLw_IMM(s1, s1, a*2);
+        } else {
+            ORRw_mask(s1, s1, 0b010000, 0b001111);  // 0xffff0000
+            LSRw_IMM(s1, s1, -a*2);
+        }
+        STRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags));
     }
     int ret = 0;
     for (int i=0; (i<8) && (!ret); ++i)
@@ -1213,6 +1229,15 @@ static void x87_unreflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, in
         }
         ANDw_mask(s2, s2, 0, 2);  //mask=7
         STRw_U12(s2, xEmu, offsetof(x64emu_t, top));
+        // update tags
+        LDRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+        if(a>0) {
+            ORRw_mask(s1, s1, 0b010000, 0b001111);  // 0xffff0000
+            LSRw_IMM(s1, s1, a*2);
+        } else {
+            LSLw_IMM(s1, s1, -a*2);
+        }
+        STRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags));
     }
 }
 
@@ -1439,17 +1464,8 @@ void x87_free(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int st)
             ANDw_mask(s2, s2, 0, 2); //mask=7    // (emu->top + i)&7
         }
     }
-    // mark as free
-    ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs));
-    MOVZw(s3, 0b11);
-    STRw_REG_LSL2(s3, s1, s2);
     // add mark in the freed array
-    for(int i=0; i<8; ++i)
-        if(dyn->n.freed[i]==-1) {
-            dyn->n.freed[i]=st;
-            MESSAGE(LOG_DUMP, "\t--------x87 Marked ST%d as Freed\n", st);
-            break;
-        }
+    dyn->n.tags |= 0b11<<(st*2);
     MESSAGE(LOG_DUMP, "\t--------x87 FFREE for ST%d\n", st);
 }
 
@@ -1984,26 +2000,15 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
         STRw_U12(s3, xEmu, offsetof(x64emu_t, fpu_stack));
         // Sub x87stack to top, with and 7
         LDRw_U12(s3, xEmu, offsetof(x64emu_t, top));
-        // update tags (and top at the same time)
+        // update tags
+        LDRH_U12(s2, xEmu, offsetof(x64emu_t, fpu_tags));
         if(a>0) {
-            // new tag to fulls
-            MOVZw(s2, 0);
-            ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs));
-            for (int i=0; i<a; ++i) {
-                SUBw_U12(s3, s3, 1);
-                ANDw_mask(s3, s3, 0, 2);    // (emu->top + st)&7
-                STRw_REG_LSL2(s2, s1, s3);    // that slot is full
-            }
+            LSLw_IMM(s2, s2, a*2);
         } else {
-            // empty tags
-            MOVZw(s2, 0b11);
-            ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs));
-            for (int i=0; i<-a; ++i) {
-                STRw_REG_LSL2(s2, s1, s3);    // empty slot before leaving it
-                ADDw_U12(s3, s3, 1);
-                ANDw_mask(s3, s3, 0, 2);    // (emu->top + st)&7
-            }
+            ORRw_mask(s2, s2, 0b010000, 0b001111);  // 0xffff0000
+            LSRw_IMM(s2, s2, -a*2);
         }
+        STRH_U12(s2, xEmu, offsetof(x64emu_t, fpu_tags));
         STRw_U12(s3, xEmu, offsetof(x64emu_t, top));
         s3_top = 0;
         stack_cnt = cache_i2.stack;
@@ -2243,8 +2248,14 @@ void fpu_reset_cache(dynarec_arm_t* dyn, int ninst, int reset_n)
     #if STEP > 1
     // for STEP 2 & 3, just need to refrest with current, and undo the changes (push & swap)
     dyn->n = dyn->insts[ninst].n;
+    #else
+    dyn->n = dyn->insts[reset_n].n;
+    #endif
     neoncacheUnwind(&dyn->n);
-    #ifdef HAVE_TRACE
+    #if STEP == 0
+    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "New x87stack=%d\n", dyn->n.x87stack);
+        #endif
+    #if defined(HAVE_TRACE) && (STEP>2)
     if(box64_dynarec_dump)
         if(memcmp(&dyn->n, &dyn->insts[reset_n].n, sizeof(neon_cache_t))) {
             MESSAGE(LOG_DEBUG, "Warning, difference in neoncache: reset=");
@@ -2274,9 +2285,6 @@ void fpu_reset_cache(dynarec_arm_t* dyn, int ninst, int reset_n)
             MESSAGE(LOG_DEBUG, "\n");
         }
     #endif //HAVE_TRACE
-    #else
-    dyn->n = dyn->insts[reset_n].n;
-    #endif
 }
 
 // propagate ST stack state, especial stack pop that are deferred
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 617aecab..d62c571c 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -812,28 +812,28 @@
 #define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch)     x87_do_push_empty(dyn, ninst, scratch)
 #define X87_POP_OR_FAIL(dyn, ninst, scratch)            x87_do_pop(dyn, ninst, scratch)
 #else
-#define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t) \
-    if (dyn->n.x87stack == +8) {                      \
-        if(box64_dynarec_dump) dynarec_log(LOG_INFO, " Warning, suspicious x87 Push, stack=%d on inst %d\n", dyn->n.x87stack, ninst); \
-        dyn->abort = 1;                               \
-        return addr;                                  \
-    }                                                 \
+#define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t)   \
+    if ((dyn->n.x87stack==8) || (dyn->n.pushed==8)) {   \
+        if(box64_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->n.x87stack, dyn->n.pushed, ninst); \
+        dyn->abort = 1;                                 \
+        return addr;                                    \
+    }                                                   \
     var = x87_do_push(dyn, ninst, scratch, t)
 
-#define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch) \
-    if (dyn->n.x87stack == +8) {                       \
-        if(box64_dynarec_dump) dynarec_log(LOG_INFO, " Warning, suspicious x87 Push, stack=%d on inst %d\n", dyn->n.x87stack, ninst); \
-        dyn->abort = 1;                             \
-        return addr;                                \
-    }                                               \
+#define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch)     \
+    if ((dyn->n.x87stack==8) || (dyn->n.pushed==8)) {   \
+        if(box64_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->n.x87stack, dyn->n.pushed, ninst); \
+        dyn->abort = 1;                                 \
+        return addr;                                    \
+    }                                                   \
     x87_do_push_empty(dyn, ninst, scratch)
 
-#define X87_POP_OR_FAIL(dyn, ninst, scratch) \
-    if (dyn->n.x87stack == -8) {                \
-        if(box64_dynarec_dump) dynarec_log(LOG_INFO, " Warning, suspicious x87 Pop, stack=%d on inst %d\n", dyn->n.x87stack, ninst); \
-        dyn->abort = 1;                      \
-        return addr;                         \
-    }                                        \
+#define X87_POP_OR_FAIL(dyn, ninst, scratch)            \
+    if ((dyn->n.x87stack==-8) || (dyn->n.poped==8)) {   \
+        if(box64_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Pop, stack=%d/%d on inst %d\n", dyn->n.x87stack, dyn->n.poped, ninst); \
+        dyn->abort = 1;                                 \
+        return addr;                                    \
+    }                                                   \
     x87_do_pop(dyn, ninst, scratch)
 #endif
 
diff --git a/src/dynarec/arm64/dynarec_arm64_pass0.h b/src/dynarec/arm64/dynarec_arm64_pass0.h
index e8a4b0a8..7d4c0c2d 100644
--- a/src/dynarec/arm64/dynarec_arm64_pass0.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass0.h
@@ -16,7 +16,7 @@
         dyn->f.pending=(B)&SF_SET_PENDING;      \
         dyn->f.dfnone=((B)&SF_SET)?(((B)==SF_SET_NODF)?0:1):0;
 #define EMIT(A)         dyn->native_size+=4
-#define JUMP(A, C)         add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); SMEND(); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C
+#define JUMP(A, C)         add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); SMEND(); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C; dyn->insts[ninst].x64.jmp_insts = 0
 #define BARRIER(A)      if(A!=BARRIER_MAYBE) {fpu_purgecache(dyn, ninst, 0, x1, x2, x3); dyn->insts[ninst].x64.barrier = A;} else dyn->insts[ninst].barrier_maybe = 1
 #define BARRIER_NEXT(A) dyn->insts[ninst].x64.barrier_next = A
 #define SET_HASCALLRET()    dyn->insts[ninst].x64.has_callret = 1
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index 6db73b1c..b26d522d 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -42,11 +42,13 @@ typedef struct neoncache_s {
     uint8_t             combined2;
     uint8_t             swapped;        // the combined reg were swapped
     uint8_t             barrier;        // is there a barrier at instruction epilog?
+    uint8_t             pushed;         // positive pushed value (to check for overflow)
+    uint8_t             poped;          // positive poped value (to check for underflow)
     uint32_t            news;           // bitmask, wich neoncache are new for this opcode
     // fpu cache
     int8_t              x87cache[8];    // cache status for the 8 x87 register behind the fpu stack
     int8_t              x87reg[8];      // reg used for x87cache entry
-    int8_t              freed[8];       // set when FFREE is used, -1 else
+    int16_t             tags;           // similar to fpu_tags
     int8_t              mmxcache[8];    // cache status for the 8 MMX registers
     sse_cache_t         ssecache[16];   // cache status for the 16 SSE(2) registers
     int8_t              fpuused[24];    // all 0..24 double reg from fpu, used by x87, sse and mmx
diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index d53861ec..8eb69080 100644
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -103,7 +103,7 @@ void add_jump(dynarec_native_t *dyn, int ninst) {
 int get_first_jump(dynarec_native_t *dyn, int next) {
     for(int i=0; i<dyn->jmp_sz; ++i)
         if(dyn->insts[dyn->jmps[i]].x64.jmp == next)
-            return i;
+            return dyn->jmps[i];
     return -2;
 }
 
@@ -544,9 +544,11 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
                 if(helper.insts[i2].x64.addr==j)
                     k=i2;
             }*/
-            if(k!=-1 && !helper.insts[i].barrier_maybe)
-                helper.insts[k].x64.barrier |= BARRIER_FULL;
-            helper.insts[i].x64.jmp_insts = k;
+            if(k!=-1) {
+                if(k!=-1 && !helper.insts[i].barrier_maybe)
+                    helper.insts[k].x64.barrier |= BARRIER_FULL;
+                helper.insts[i].x64.jmp_insts = k;
+            }
         }
     }
     // no need for next and jmps anymore
diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index 11689307..0288cbeb 100644
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -171,7 +171,6 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
         #if STEP > 0
         if(!dyn->insts[ninst].x64.has_next && dyn->insts[ninst].x64.jmp && dyn->insts[ninst].x64.jmp_insts!=-1)
             next = dyn->insts[ninst].x64.jmp_insts;
-        #endif
         if(dyn->insts[ninst].x64.has_next && dyn->insts[next].x64.barrier) {
             if(dyn->insts[next].x64.barrier&BARRIER_FLOAT) {
                 fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
@@ -182,6 +181,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
                 dyn->last_ip = 0;
             }
         }
+        #endif
         #ifndef PROT_READ
         #define PROT_READ 1
         #endif
@@ -216,7 +216,8 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
         if(dyn->forward) {
             if(dyn->forward_to == addr && !need_epilog) {
                 // we made it!
-                if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Forward extend block for %d bytes %s%p -> %p\n", dyn->forward_to-dyn->forward, dyn->insts[dyn->forward_ninst].x64.has_callret?"(opt. call) ":"", (void*)dyn->forward, (void*)dyn->forward_to);
+                reset_n = get_first_jump(dyn, addr);
+                if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Forward extend block for %d bytes %s%p -> %p (ninst %d - %d)\n", dyn->forward_to-dyn->forward, dyn->insts[dyn->forward_ninst].x64.has_callret?"(opt. call) ":"", (void*)dyn->forward, (void*)dyn->forward_to, reset_n, ninst);
                 if(dyn->insts[dyn->forward_ninst].x64.has_callret && !dyn->insts[dyn->forward_ninst].x64.has_next)
                     dyn->insts[dyn->forward_ninst].x64.has_next = 1;  // this block actually continue
                 dyn->forward = 0;
@@ -252,7 +253,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
                         // and pred table is not ready yet
                         reset_n = get_first_jump(dyn, next);
                     }
-                    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Extend block %p, %s%p -> %p (ninst=%d, jump from %d)\n", dyn, dyn->insts[ninst].x64.has_callret?"(opt. call) ":"", (void*)addr, (void*)next, ninst, dyn->insts[ninst].x64.has_callret?ninst:reset_n);
+                    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Extend block %p, %s%p -> %p (ninst=%d, jump from %d)\n", dyn, dyn->insts[ninst].x64.has_callret?"(opt. call) ":"", (void*)addr, (void*)next, ninst+1, dyn->insts[ninst].x64.has_callret?ninst:reset_n);
                 } else if(next && (next-addr)<box64_dynarec_forward && (getProtection(next)&PROT_READ)/*box64_dynarec_bigblock>=stopblock*/) {
                     if(!((box64_dynarec_bigblock<stopblock) && !isJumpTableDefault64((void*)next))) {
                         if(dyn->forward) {
@@ -295,6 +296,8 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
             }
             #endif
         }
+        if(ok && dyn->insts[ninst].x64.has_callret)
+            reset_n = -2;
         ++ninst;
         #if STEP == 0
         memset(&dyn->insts[ninst], 0, sizeof(instruction_native_t));
diff --git a/src/dynarec/la64/dynarec_la64_00.c b/src/dynarec/la64/dynarec_la64_00.c
index ebb6f4f8..706094db 100644
--- a/src/dynarec/la64/dynarec_la64_00.c
+++ b/src/dynarec/la64/dynarec_la64_00.c
@@ -1207,7 +1207,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             if (box64_dynarec_safeflags) {
                 READFLAGS(X_PEND); // lets play safe here too
             }
-            BARRIER(BARRIER_FLOAT);
+            fpu_purgecache(dyn, ninst, 1, x1, x2, x3);  // using next, even if there no next
             i32 = F16;
             retn_to_epilog(dyn, ninst, rex, i32);
             *need_epilog = 0;
@@ -1219,7 +1219,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             if (box64_dynarec_safeflags) {
                 READFLAGS(X_PEND); // so instead, force the deferred flags, so it's not too slow, and flags are not lost
             }
-            BARRIER(BARRIER_FLOAT);
+            fpu_purgecache(dyn, ninst, 1, x1, x2, x3);  // using next, even if there no next
             ret_to_epilog(dyn, ninst, rex);
             *need_epilog = 0;
             *ok = 0;
@@ -1534,19 +1534,20 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         SETFLAGS(X_ALL, SF_SET); // Hack to set flags to "dont'care" state
                     }
                     // regular call
-                    if (box64_dynarec_callret && box64_dynarec_bigblock > 1) {
+                    /*if (box64_dynarec_callret && box64_dynarec_bigblock > 1) {
                         BARRIER(BARRIER_FULL);
                     } else {
                         BARRIER(BARRIER_FLOAT);
                         *need_epilog = 0;
                         *ok = 0;
-                    }
+                    }*/
 
                     if (rex.is32bits) {
                         MOV32w(x2, addr);
                     } else {
                         TABLE64(x2, addr);
                     }
+                    fpu_purgecache(dyn, ninst, 1, x1, x3, x4);
                     PUSH1z(x2);
                     if (box64_dynarec_callret) {
                         SET_HASCALLRET();
diff --git a/src/dynarec/la64/dynarec_la64_pass0.h b/src/dynarec/la64/dynarec_la64_pass0.h
index 999b48e1..1eef76bf 100644
--- a/src/dynarec/la64/dynarec_la64_pass0.h
+++ b/src/dynarec/la64/dynarec_la64_pass0.h
@@ -16,7 +16,7 @@
     dyn->f.pending = (B) & SF_SET_PENDING; \
     dyn->f.dfnone = ((B) & SF_SET) ? 1 : 0;
 #define EMIT(A) dyn->native_size += 4
-#define JUMP(A, C) add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C
+#define JUMP(A, C)         add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); SMEND(); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C; dyn->insts[ninst].x64.jmp_insts = 0
 #define BARRIER(A)                                 \
     if (A != BARRIER_MAYBE) {                      \
         fpu_purgecache(dyn, ninst, 0, x1, x2, x3); \
diff --git a/src/dynarec/rv64/dynarec_rv64_00_3.c b/src/dynarec/rv64/dynarec_rv64_00_3.c
index df22e054..93bf1690 100644
--- a/src/dynarec/rv64/dynarec_rv64_00_3.c
+++ b/src/dynarec/rv64/dynarec_rv64_00_3.c
@@ -279,7 +279,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             if(box64_dynarec_safeflags) {
                 READFLAGS(X_PEND);  // lets play safe here too
             }
-            BARRIER(BARRIER_FLOAT);
+            fpu_purgecache(dyn, ninst, 1, x1, x2, x3);  // using next, even if there no next
             i32 = F16;
             retn_to_epilog(dyn, ninst, rex, i32);
             *need_epilog = 0;
@@ -291,7 +291,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             if(box64_dynarec_safeflags) {
                 READFLAGS(X_PEND);  // so instead, force the deferred flags, so it's not too slow, and flags are not lost
             }
-            BARRIER(BARRIER_FLOAT);
+            fpu_purgecache(dyn, ninst, 1, x1, x2, x3);  // using next, even if there no next
             ret_to_epilog(dyn, ninst, rex);
             *need_epilog = 0;
             *ok = 0;
@@ -903,19 +903,20 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         SETFLAGS(X_ALL, SF_SET);    // Hack to set flags to "dont'care" state
                     }
                     // regular call
-                    if(box64_dynarec_callret && box64_dynarec_bigblock>1) {
+                    /*if(box64_dynarec_callret && box64_dynarec_bigblock>1) {
                         BARRIER(BARRIER_FULL);
                     } else {
                         BARRIER(BARRIER_FLOAT);
                         *need_epilog = 0;
                         *ok = 0;
-                    }
+                    }*/
 
                     if(rex.is32bits) {
                         MOV32w(x2, addr);
                     } else {
                         TABLE64(x2, addr);
                     }
+                    fpu_purgecache(dyn, ninst, 1, x1, x3, x4);
                     PUSH1z(x2);
                     if(box64_dynarec_callret) {
                         SET_HASCALLRET();
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c
index 3b95c06e..b016e45b 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.c
+++ b/src/dynarec/rv64/dynarec_rv64_functions.c
@@ -422,11 +422,23 @@ void extcacheUnwind(extcache_t* cache)
             }
         }
         cache->x87stack-=cache->stack_push;
+        cache->tags>>=(cache->stack_push*2);
         cache->stack-=cache->stack_push;
+        if(cache->pushed>=cache->stack_push)
+            cache->pushed-=cache->stack_push;
+        else
+            cache->pushed = 0;
         cache->stack_push = 0;
     }
     cache->x87stack+=cache->stack_pop;
     cache->stack_next = cache->stack;
+    if(cache->stack_pop) {
+        if(cache->poped>=cache->stack_pop)
+            cache->poped-=cache->stack_pop;
+        else
+            cache->poped = 0;
+        cache->tags<<=(cache->stack_pop*2);
+    }
     cache->stack_pop = 0;
     cache->barrier = 0;
     // And now, rebuild the x87cache info with extcache
@@ -631,6 +643,9 @@ static void x87_reset(extcache_t* e)
     e->combined1 = e->combined2 = 0;
     e->swapped = 0;
     e->barrier = 0;
+    e->pushed = 0;
+    e->poped = 0;
+
     for(int i=0; i<24; ++i)
         if (e->extcache[i].t == EXT_CACHE_ST_F
             || e->extcache[i].t == EXT_CACHE_ST_D
@@ -666,3 +681,8 @@ void fpu_reset_ninst(dynarec_rv64_t* dyn, int ninst)
     sse_reset(&dyn->insts[ninst].e);
     fpu_reset_reg_extcache(&dyn->insts[ninst].e);
 }
+
+int fpu_is_st_freed(dynarec_rv64_t* dyn, int ninst, int st)
+{   // returns 1 when ST(st) has a non-zero 2-bit tag in dyn->e.tags (the mark x87_free sets), else 0; ninst is not used
+    return (dyn->e.tags&(0b11<<(st*2)))?1:0;    // the field for ST(st) occupies bits 2*st and 2*st+1
+}
\ No newline at end of file
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.h b/src/dynarec/rv64/dynarec_rv64_functions.h
index 01b5e9a4..e3a5171d 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.h
+++ b/src/dynarec/rv64/dynarec_rv64_functions.h
@@ -58,4 +58,7 @@ void print_newinst(dynarec_native_t* dyn, int ninst);
 // reset the cache
 void fpu_reset(dynarec_native_t* dyn);
 void fpu_reset_ninst(dynarec_native_t* dyn, int ninst);
+
+// is st freed
+int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st);
 #endif //__DYNAREC_RV64_FUNCTIONS_H__
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c
index 6c5d25ee..e5b9ac51 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.c
+++ b/src/dynarec/rv64/dynarec_rv64_helper.c
@@ -875,6 +875,9 @@ int x87_do_push(dynarec_rv64_t* dyn, int ninst, int s1, int t)
     dyn->e.stack+=1;
     dyn->e.stack_next+=1;
     dyn->e.stack_push+=1;
+    ++dyn->e.pushed;
+    if(dyn->e.poped)
+        --dyn->e.poped;
     // move all regs in cache, and find a free one
     for(int j=0; j<24; ++j)
         if ((dyn->e.extcache[j].t == EXT_CACHE_ST_D)
@@ -882,6 +885,7 @@ int x87_do_push(dynarec_rv64_t* dyn, int ninst, int s1, int t)
             || (dyn->e.extcache[j].t == EXT_CACHE_ST_I64))
             ++dyn->e.extcache[j].n;
     int ret = -1;
+    dyn->e.tags<<=2;
     for(int i=0; i<8; ++i)
         if(dyn->e.x87cache[i]!=-1)
             ++dyn->e.x87cache[i];
@@ -900,26 +904,31 @@ void x87_do_push_empty(dynarec_rv64_t* dyn, int ninst, int s1)
     dyn->e.stack+=1;
     dyn->e.stack_next+=1;
     dyn->e.stack_push+=1;
+    ++dyn->e.pushed;
+    if(dyn->e.poped)
+        --dyn->e.poped;
     // move all regs in cache
     for(int j=0; j<24; ++j)
         if ((dyn->e.extcache[j].t == EXT_CACHE_ST_D)
             || (dyn->e.extcache[j].t == EXT_CACHE_ST_F)
             || (dyn->e.extcache[j].t == EXT_CACHE_ST_I64))
             ++dyn->e.extcache[j].n;
+    int ret = -1;
+    dyn->e.tags<<=2;
     for(int i=0; i<8; ++i)
         if(dyn->e.x87cache[i]!=-1)
             ++dyn->e.x87cache[i];
+        else if(ret==-1)
+            ret = i;
+    if(ret==-1) {
+        MESSAGE(LOG_DUMP, "Incoherent x87 stack cache, aborting\n");
+        dyn->abort = 1;
+    }
     if(s1)
         x87_stackcount(dyn, ninst, s1);
 }
-void x87_do_pop(dynarec_rv64_t* dyn, int ninst, int s1)
+void static internal_x87_dopop(dynarec_rv64_t* dyn)
 {
-    if(dyn->e.mmxcount)
-        mmx_purgecache(dyn, ninst, 0, s1);
-    dyn->e.x87stack-=1;
-    dyn->e.stack_next-=1;
-    dyn->e.stack_pop+=1;
-    // move all regs in cache, poping ST0
     for(int i=0; i<8; ++i)
         if(dyn->e.x87cache[i]!=-1) {
             --dyn->e.x87cache[i];
@@ -929,6 +938,30 @@ void x87_do_pop(dynarec_rv64_t* dyn, int ninst, int s1)
             }
         }
 }
+// Returns 1 when the low 2-bit tag of dyn->e.tags (current ST0) is set, i.e. one more pop is due; 0 otherwise
+static int internal_x87_dofree(dynarec_rv64_t* dyn)
+{
+    if(!(dyn->e.tags&0b11))
+        return 0;
+    MESSAGE(LOG_DUMP, "\t--------x87 FREED ST0, poping 1 more\n");
+    return 1;
+}
+void x87_do_pop(dynarec_rv64_t* dyn, int ninst, int s1)
+{   // pops ST0 in the dyn->e bookkeeping, then keeps popping for as long as the new ST0 is tagged as freed
+    if(dyn->e.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, s1);
+    do {
+        dyn->e.x87stack-=1;
+        dyn->e.stack_next-=1;
+        dyn->e.stack_pop+=1;
+        dyn->e.tags>>=2;        // drop ST0's 2-bit tag, every other field moves down one slot
+        ++dyn->e.poped;         // pop counter, compared against 8 by X87_POP_OR_FAIL
+        if(dyn->e.pushed)
+            --dyn->e.pushed;    // a pop cancels one recorded push, without going below 0
+        // decrement the ST index of every cached x87 register, popping ST0
+        internal_x87_dopop(dyn);
+    } while(internal_x87_dofree(dyn)); // non-zero when the low tag field (the new ST0) is set
+}
 
 void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, int s3)
 {
@@ -952,27 +985,32 @@ void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, in
         LW(s2, xEmu, offsetof(x64emu_t, top));
         // update tags (and top at the same time)
         if(a>0) {
-            // new tag to fulls
-            for (int i=0; i<a; ++i) {
-                SUBI(s2, s2, 1);
-                ANDI(s2, s2, 7);    // (emu->top + st)&7
-                if(rv64_zba) SH2ADD(s1, s2, xEmu); else {SLLI(s1, s2, 2); ADD(s1, xEmu, s1);}
-                SW(xZR, s1, offsetof(x64emu_t, p_regs));
-            }
+            SUBI(s2, s2, a);
         } else {
-            // empty tags
-            ADDI(s3, xZR, 0b11);
-            for (int i=0; i<-a; ++i) {
-                if(rv64_zba) SH2ADD(s1, s2, xEmu); else {SLLI(s1, s2, 2); ADD(s1, xEmu, s1);}
-                SW(s3, s1, offsetof(x64emu_t, p_regs));
-                ADDI(s2, s2, 1);
-                ANDI(s2, s2, 7);    // (emu->top + st)&7
-            }
+            ADDI(s2, s2, -a);
         }
+        ANDI(s2, s2, 7);
         SW(s2, xEmu, offsetof(x64emu_t, top));
+        // update tags (and top at the same time)
+        LHU(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+        if(a>0) {
+            SLLI(s1, s1, a*2);
+        } else {
+            SLLI(s3, xMASK, 16);    // 0xffff0000 (plus some unused hipart)
+            OR(s1, s1, s3);
+            SRLI(s1, s1, -a*2);
+        }
+        SH(s1, xEmu, offsetof(x64emu_t, fpu_tags));
     } else {
         LW(s2, xEmu, offsetof(x64emu_t, top));
     }
+    // check if free is used
+    if(dyn->e.tags) {
+        LH(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+        MOV32w(s3, dyn->e.tags);
+        OR(s1, s1, s3);
+        SH(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+    }
     if(ret!=0) {
         // --- set values
         // Get top
@@ -1018,9 +1056,13 @@ void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, in
     }
     if(!next) {
         dyn->e.stack_next = 0;
+        dyn->e.tags = 0;
         #if STEP < 2
         // refresh the cached valued, in case it's a purge outside a instruction
         dyn->insts[ninst].e.barrier = 1;
+        dyn->e.pushed = 0;
+        dyn->e.poped = 0;
+
         #endif
     }
     MESSAGE(LOG_DUMP, "\t---Purge x87 Cache and Synch Stackcount\n");
@@ -1040,6 +1082,16 @@ static void x87_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int
         SUBI(s2, s2, a);
         ANDI(s2, s2, 7);
         SW(s2, xEmu, offsetof(x64emu_t, top));
+        // update tags (and top at the same time)
+        LH(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+        if(a>0) {
+            SLLI(s1, s1, a*2);
+        } else {
+            SLLI(s3, xMASK, 16);    // 0xffff0000
+            OR(s1, s1, s3);
+            SRLI(s1, s1, -a*2);
+        }
+        SH(s1, xEmu, offsetof(x64emu_t, fpu_tags));
     }
     int ret = 0;
     for (int i=0; (i<8) && (!ret); ++i)
@@ -1080,6 +1132,13 @@ static void x87_unreflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, i
         ADDI(s2, s2, a);
         ANDI(s2, s2, 7);
         SW(s2, xEmu, offsetof(x64emu_t, top));
+        LH(s1, xEmu, offsetof(x64emu_t, fpu_tags));    // reload the tag word (x87_reflectcache stored a shifted copy)
+        if(a>0) {
+            SLLI(s3, xMASK, 16);    // 0xffff0000
+            OR(s1, s1, s3);
+            SRLI(s1, s1, a*2);      // undo the left shift, refilling the upper fields with 0b11
+        } else { SLLI(s1, s1, -a*2); }
+        SH(s1, xEmu, offsetof(x64emu_t, fpu_tags));    // write it back, otherwise the shift above has no effect
     }
 }
 
@@ -1275,6 +1334,66 @@ void x87_reget_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)
     MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st);
 }
 
+void x87_free(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int st)
+{   // FFREE ST(st): flush a cached copy (if any) to emu->x87, forget it, then tag ST(st) in dyn->e.tags; s3 is unused
+    int ret = -1;
+    for (int i=0; (i<8) && (ret==-1); ++i)  // find the x87cache slot currently tracking ST(st)
+        if(dyn->e.x87cache[i] == st)
+            ret = i;
+    MESSAGE(LOG_DUMP, "\tFFREE%s x87 Cache for ST%d\n", (ret!=-1)?" (and Forget)":"", st);
+    if(ret!=-1) {
+        const int reg = dyn->e.x87reg[ret];
+        #if STEP == 1
+        if(dyn->e.extcache[reg].t==EXT_CACHE_ST_F || dyn->e.extcache[reg].t==EXT_CACHE_ST_I64) // NOTE(review): indexed by reg here but by EXTIDX(reg) below - confirm which is intended
+            extcache_promote_double(dyn, ninst, st);
+        #endif
+        // s2 = emu->top
+        LW(s2, xEmu, offsetof(x64emu_t, top));
+        // add the offset (st - dyn->e.x87stack) to get the index of the target slot
+        int ast = st - dyn->e.x87stack;
+        if(ast) {
+            if(ast>0) {
+                ADDI(s2, s2, ast);
+            } else {
+                SUBI(s2, s2, -ast);
+            }
+            ANDI(s2, s2, 7); // (emu->top + ast)&7
+        }
+        if(rv64_zba) SH3ADD(s1, s2, xEmu); else {SLLI(s2, s2, 3); ADD(s1, xEmu, s2);}   // s1 = xEmu + index*8
+        if (dyn->e.extcache[EXTIDX(reg)].t == EXT_CACHE_ST_F) {
+            FCVTDS(SCRATCH0, reg);  // cached as single: convert through SCRATCH0 before storing
+            FSD(SCRATCH0, s1, offsetof(x64emu_t, x87));
+        } else if (dyn->e.extcache[EXTIDX(reg)].t == EXT_CACHE_ST_I64) {
+            FMVXD(s2, reg);         // cached as int64: s2 is reused as a temporary, the index is no longer needed
+            FCVTDL(SCRATCH0, s2, RD_RTZ);
+            FSD(SCRATCH0, s1, offsetof(x64emu_t, x87));
+        } else {
+            FSD(reg, s1, offsetof(x64emu_t, x87));  // any other cache type: stored as-is
+        }
+        // release the register and clear this x87cache/x87reg entry
+        fpu_free_reg(dyn, reg);
+        dyn->e.extcache[reg].v = 0;
+        dyn->e.x87cache[ret] = -1;
+        dyn->e.x87reg[ret] = -1;
+    } else {
+        // s2 = emu->top
+        LW(s2, xEmu, offsetof(x64emu_t, top));
+        // same index computation as above; NOTE(review): nothing in this function reads s2 afterwards on this path
+        int ast = st - dyn->e.x87stack;
+        if(ast) {
+            if(ast>0) {
+                ADDI(s2, s2, ast);
+            } else {
+                SUBI(s2, s2, -ast);
+            }
+            ANDI(s2, s2, 7);    // (emu->top + ast)&7
+        }
+    }
+    // mark ST(st) in the tag bitfield (2 bits per register, 0b11 set for a freed one)
+    dyn->e.tags |= 0b11<<(st*2);
+    MESSAGE(LOG_DUMP, "\t--------x87 FFREE for ST%d\n", st);
+}
+
 void x87_swapreg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int b)
 {
     int i1, i2, i3;
@@ -1905,34 +2024,16 @@ static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, in
         SW(s3, xEmu, offsetof(x64emu_t, fpu_stack));
         // Sub x87stack to top, with and 7
         LWU(s3, xEmu, offsetof(x64emu_t, top));
-        // update tags (and top at the same time)
+        // update tags
+        LH(s2, xEmu, offsetof(x64emu_t, fpu_tags));
         if(a>0) {
-            // new tag to fulls
-            ADDI(s2, xZR, 0);
-            ADDI(s1, xEmu, offsetof(x64emu_t, p_regs));
-            SLLI(s3, s3, 2);
-            for (int i=0; i<a; ++i) {
-                SUBI(s3, s3, 1<<2);
-                ANDI(s3, s3, 7<<2);
-                ADD(s3, s1, s3);
-                SW(s2, s3, 0);    // that slot is full
-                SUB(s3, s3, s1);
-            }
-            SRLI(s3, s3, 2);
+            SLLI(s2, s2, a*2);
         } else {
-            // empty tags
-            ADDI(s2, xZR, 0b11);
-            ADDI(s1, xEmu, offsetof(x64emu_t, p_regs));
-            SLLI(s3, s3, 2);
-            for (int i=0; i<-a; ++i) {
-                ADD(s3, s1, s3);
-                SW(s2, s3, 0);    // empty slot before leaving it
-                SUB(s3, s3, s1);
-                ADDI(s3, s3, 1<<2);
-                ANDI(s3, s3, 7<<2);    // (emu->top + st)&7
-            }
-            SRLI(s3, s3, 2);
+            SLLI(s3, xMASK, 16);    // 0xffff0000
+            OR(s2, s2, s3);
+            SRLI(s2, s2, -a*2);
         }
+        SH(s2, xEmu, offsetof(x64emu_t, fpu_tags));
         SW(s3, xEmu, offsetof(x64emu_t, top));
         s3_top = 0;
         stack_cnt = cache_i2.stack;
@@ -2142,8 +2243,14 @@ void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n)
     #if STEP > 1
     // for STEP 2 & 3, just need to refrest with current, and undo the changes (push & swap)
     dyn->e = dyn->insts[ninst].e;
+    #else
+    dyn->e = dyn->insts[reset_n].e;
+    #endif
     extcacheUnwind(&dyn->e);
-    #ifdef HAVE_TRACE
+    #if STEP == 0
+    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "New x87stack=%d\n", dyn->e.x87stack);
+    #endif
+    #if defined(HAVE_TRACE) && (STEP>2)
     if(box64_dynarec_dump)
         if(memcmp(&dyn->e, &dyn->insts[reset_n].e, sizeof(ext_cache_t))) {
             MESSAGE(LOG_DEBUG, "Warning, difference in extcache: reset=");
@@ -2173,9 +2280,6 @@ void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n)
             MESSAGE(LOG_DEBUG, "\n");
         }
     #endif //HAVE_TRACE
-    #else
-    dyn->e = dyn->insts[reset_n].e;
-    #endif
 }
 
 // propagate ST stack state, especial stack pop that are deferred
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 3292ea2f..1ed23785 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -896,27 +896,27 @@
 #define X87_POP_OR_FAIL(dyn, ninst, scratch)            x87_do_pop(dyn, ninst, scratch)
 #else
 #define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t) \
-    if (dyn->e.stack == +8) {                         \
-        if(box64_dynarec_dump) dynarec_log(LOG_INFO, " Warning, suspicious x87 Push, stack=%d on inst %d\n", dyn->e.x87stack, ninst); \
-        dyn->abort = 1;                               \
-        return addr;                                  \
-    }                                                 \
+    if ((dyn->e.x87stack==8) || (dyn->e.pushed==8)) {   \
+        if(box64_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->e.x87stack, dyn->e.pushed, ninst); \
+        dyn->abort = 1;                                 \
+        return addr;                                    \
+    }                                                   \
     var = x87_do_push(dyn, ninst, scratch, t);
 
 #define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch) \
-    if (dyn->e.stack == +8) {                       \
-        if(box64_dynarec_dump) dynarec_log(LOG_INFO, " Warning, suspicious x87 Push, stack=%d on inst %d\n", dyn->e.x87stack, ninst); \
-        dyn->abort = 1;                               \
-        return addr;                                \
-    }                                               \
+    if ((dyn->e.x87stack==8) || (dyn->e.pushed==8)) {   \
+        if(box64_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->e.x87stack, dyn->e.pushed, ninst); \
+        dyn->abort = 1;                                 \
+        return addr;                                    \
+    }                                                   \
     x87_do_push_empty(dyn, ninst, scratch);
 
 #define X87_POP_OR_FAIL(dyn, ninst, scratch) \
-    if (dyn->e.stack == -8) {                \
-        if(box64_dynarec_dump) dynarec_log(LOG_INFO, " Warning, suspicious x87 Pop, stack=%d on inst %d\n", dyn->e.x87stack, ninst); \
-        dyn->abort = 1;                               \
-        return addr;                         \
-    }                                        \
+    if ((dyn->e.x87stack==-8) || (dyn->e.poped==8)) {   \
+        if(box64_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Pop, stack=%d/%d on inst %d\n", dyn->e.x87stack, dyn->e.poped, ninst); \
+        dyn->abort = 1;                                 \
+        return addr;                                    \
+    }                                                   \
     x87_do_pop(dyn, ninst, scratch);
 #endif
 
@@ -1196,6 +1196,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define x87_get_extcache      STEPNAME(x87_get_extcache)
 #define x87_get_st            STEPNAME(x87_get_st)
 #define x87_get_st_empty      STEPNAME(x87_get_st)
+#define x87_free              STEPNAME(x87_free)
 #define x87_refresh           STEPNAME(x87_refresh)
 #define x87_forget            STEPNAME(x87_forget)
 #define x87_reget_st          STEPNAME(x87_reget_st)
@@ -1358,6 +1359,8 @@ int x87_get_extcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a);
 int x87_get_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t);
 // get vfpu register for a x87 reg, create the entry if needed. Do not fetch the Stx if not already in cache
 int x87_get_st_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t);
+// Free st, using the FFREE opcode (so it's freed but stack is not moved)
+void x87_free(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int st);
 // refresh a value from the cache ->emu (nothing done if value is not cached)
 void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
 // refresh a value from the cache ->emu and then forget the cache (nothing done if value is not cached)
diff --git a/src/dynarec/rv64/dynarec_rv64_pass0.h b/src/dynarec/rv64/dynarec_rv64_pass0.h
index 0def2bed..d5040d9b 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass0.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass0.h
@@ -16,7 +16,7 @@
         dyn->f.pending=(B)&SF_SET_PENDING;      \
         dyn->f.dfnone=((B)&SF_SET)?1:0;
 #define EMIT(A)         dyn->native_size+=4
-#define JUMP(A, C)         add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C
+#define JUMP(A, C)         add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); SMEND(); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C; dyn->insts[ninst].x64.jmp_insts = 0
 #define BARRIER(A)      if(A!=BARRIER_MAYBE) {fpu_purgecache(dyn, ninst, 0, x1, x2, x3); dyn->insts[ninst].x64.barrier = A;} else dyn->insts[ninst].barrier_maybe = 1
 #define BARRIER_NEXT(A) dyn->insts[ninst].x64.barrier_next = A
 #define SET_HASCALLRET()    dyn->insts[ninst].x64.has_callret = 1
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index 6a870bf7..3acbdfb6 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -52,11 +52,14 @@ typedef struct extcache_s {
     uint8_t             combined2;
     uint8_t             swapped;        // the combined reg were swapped
     uint8_t             barrier;        // is there a barrier at instruction epilog?
+    uint8_t             pushed;         // positive pushed value (to check for overflow)
+    uint8_t             poped;          // positive poped value (to check for underflow)
     uint32_t            news;           // bitmask, wich neoncache are new for this opcode
     sse_old_t           olds[16];       // SSE regs has changed or has been removed
     // fpu cache
     int8_t              x87cache[8];    // cache status for the 8 x87 register behind the fpu stack
     int8_t              x87reg[8];      // reg used for x87cache entry
+    int16_t             tags;           // similar to fpu_tags
     int8_t              mmxcache[8];    // cache status for the 8 MMX registers
     sse_cache_t         ssecache[16];   // cache status for the 16 SSE(2) registers
     int8_t              fpuused[24];    // all 10..31 & 0..1 double reg from fpu, used by x87, sse and mmx
diff --git a/src/emu/x64emu.c b/src/emu/x64emu.c
index 69aed8c1..7d54d651 100644
--- a/src/emu/x64emu.c
+++ b/src/emu/x64emu.c
@@ -234,7 +234,7 @@ void CloneEmu(x64emu_t *newemu, const x64emu_t* emu)
 	memcpy(newemu->mmx, emu->mmx, sizeof(emu->mmx));
     memcpy(newemu->fpu_ld, emu->fpu_ld, sizeof(emu->fpu_ld));
     memcpy(newemu->fpu_ll, emu->fpu_ll, sizeof(emu->fpu_ll));
-	memcpy(newemu->p_regs, emu->p_regs, sizeof(emu->p_regs));
+    newemu->fpu_tags = emu->fpu_tags;
 	newemu->cw = emu->cw;
 	newemu->sw = emu->sw;
 	newemu->top = emu->top;
@@ -270,7 +270,7 @@ void CopyEmu(x64emu_t *newemu, const x64emu_t* emu)
     memcpy(newemu->xmm, emu->xmm, sizeof(emu->xmm));
     memcpy(newemu->fpu_ld, emu->fpu_ld, sizeof(emu->fpu_ld));
     memcpy(newemu->fpu_ll, emu->fpu_ll, sizeof(emu->fpu_ll));
-	memcpy(newemu->p_regs, emu->p_regs, sizeof(emu->p_regs));
+    newemu->fpu_tags = emu->fpu_tags;
 	newemu->cw = emu->cw;
 	newemu->sw = emu->sw;
 	newemu->top = emu->top;
diff --git a/src/emu/x64emu_private.h b/src/emu/x64emu_private.h
index 90c9b7b0..7ae9e1bd 100644
--- a/src/emu/x64emu_private.h
+++ b/src/emu/x64emu_private.h
@@ -76,7 +76,7 @@ typedef struct x64emu_s {
     #endif
     fpu_ld_t    fpu_ld[8]; // for long double emulation / 80bits fld fst
     fpu_ll_t    fpu_ll[8]; // for 64bits fild / fist sequence
-	fpu_p_reg_t p_regs[8];
+    uint64_t    fpu_tags;   // tags for the x87 regs, stacked, only on a 16bits anyway
     // old ip
     uintptr_t   old_ip;
     // deferred flags
diff --git a/src/emu/x64rund9.c b/src/emu/x64rund9.c
index 2cc8cdce..015dfa07 100644
--- a/src/emu/x64rund9.c
+++ b/src/emu/x64rund9.c
@@ -192,7 +192,10 @@ uintptr_t RunD9(x64emu_t *emu, rex_t rex, uintptr_t addr)
             emu->top=(emu->top-1)&7;    // this will probably break a few things

             break;

         case 0xF7:  /* FINCSTP */

-            emu->top=(emu->top+1)&7;    // this will probably break a few things

+            if(emu->fpu_tags&0b11)

+                fpu_do_pop(emu);

+            else

+                emu->top=(emu->top+1)&7;    // this will probably break a few things

             break;

         case 0xF9:  /* FYL2XP1 */

             ST(1).d *= log2(ST0.d + 1.0);

diff --git a/src/emu/x64rundd.c b/src/emu/x64rundd.c
index 35b439fe..a62c9254 100644
--- a/src/emu/x64rundd.c
+++ b/src/emu/x64rundd.c
@@ -45,7 +45,7 @@ uintptr_t RunDD(x64emu_t *emu, rex_t rex, uintptr_t addr)
         case 0xC5:

         case 0xC6:

         case 0xC7:

-            fpu_do_free(emu, nextop-0xC0);

+            fpu_do_free(emu, nextop&7);

             break;

 

         case 0xD0:  /* FST ST0, STx */

diff --git a/src/emu/x64test.c b/src/emu/x64test.c
index f1294466..447cd384 100644
--- a/src/emu/x64test.c
+++ b/src/emu/x64test.c
@@ -82,10 +82,10 @@ void x64test_check(x64emu_t* ref, uintptr_t ip)
     }
     //memcpy(ref->fpu_ld, emu->fpu_ld, sizeof(emu->fpu_ld));
     //memcpy(ref->fpu_ll, emu->fpu_ll, sizeof(emu->fpu_ll));
-	/*if(ref->p_regs != emu->p_regs) {
+	if(ref->fpu_tags != emu->fpu_tags) {
         BANNER;
-        printf_log(LOG_NONE, "X87 PREG: %x | %x\n", ref->p_regs, emu->p_regs);
-    }*/
+        printf_log(LOG_NONE, "X87 TAGS: %x | %x\n", (unsigned)ref->fpu_tags, (unsigned)emu->fpu_tags); // fpu_tags is uint64_t: cast to match %x
+    }
 	if(ref->cw.x16 != emu->cw.x16) {
         BANNER;
         printf_log(LOG_NONE, "X87 CW: %x | %x\n", ref->cw.x16, emu->cw.x16);
diff --git a/src/emu/x87emu_private.c b/src/emu/x87emu_private.c
index 67573c37..0ad3db8c 100644
--- a/src/emu/x87emu_private.c
+++ b/src/emu/x87emu_private.c
@@ -11,10 +11,10 @@
 
 void fpu_do_free(x64emu_t* emu, int i)
 {
-    emu->p_regs[(emu->top+i)&7].tag = 0b11;    // empty
+    emu->fpu_tags |= 0b11 << (i*2);   // empty: ST(i) owns bits 2*i and 2*i+1 of the stacked tag word
     // check if all empty
     for(int j=0; j<8; ++j)
-        if(emu->p_regs[j].tag != 0b11)
+        if(emu->fpu_tags != TAGS_EMPTY)   // whole-word test: any non-empty register leaves fpu_stack untouched
             return;
     emu->fpu_stack = 0;
 }
@@ -27,8 +27,7 @@ void reset_fpu(x64emu_t* emu)
     emu->sw.x16 = 0x0000;
     emu->top = 0;
     emu->fpu_stack = 0;
-    for(int i=0; i<8; ++i)
-        emu->p_regs[i].tag = 0b11;  // STx is empty
+    emu->fpu_tags = TAGS_EMPTY;
 }
 
 void fpu_fbst(x64emu_t* emu, uint8_t* d) {
@@ -258,9 +257,7 @@ void fpu_loadenv(x64emu_t* emu, char* p, int b16)
     p+=(b16)?2:4;
     // tagword: 2bits*8
     // tags... (only full = 0b11 / free = 0b00)
-    uint16_t tags = *(uint16_t*)p;
-    for(int i=0; i<8; ++i)
-        emu->p_regs[i].tag = (tags>>(i*2))&0b11;
+    emu->fpu_tags = *(uint16_t*)p;
     // intruction pointer: 16bits
     // data (operand) pointer: 16bits
     // last opcode: 11bits save: 16bits restaured (1st and 2nd opcode only)
@@ -277,10 +274,7 @@ void fpu_savenv(x64emu_t* emu, char* p, int b16)
     if(!b16) {*(uint16_t*)p = 0; p+=2;}
     // tagword: 2bits*8
     // tags...
-    uint16_t tags = 0;
-    for (int i=0; i<8; ++i)
-        tags |= (emu->p_regs[i].tag)<<(i*2);
-    *(uint16_t*)p = tags;
+    *(uint16_t*)p = emu->fpu_tags;
     // other stuff are not pushed....
 }
 
@@ -325,14 +319,14 @@ void fpu_fxsave32(x64emu_t* emu, void* ed)
     int top = emu->top&7;
     int stack = 8-top;
     if(top==0)  // check if stack is full or empty, based on tag[0]
-        stack = (emu->p_regs[0].tag)?8:0;
+        stack = (emu->fpu_tags&0b11)?0:8;  // ST0 tagged 0b11 (empty) with top==0 means the stack holds nothing
     emu->sw.f.F87_TOP = top;
     p->ControlWord = emu->cw.x16;
     p->StatusWord = emu->sw.x16;
     p->MxCsr = emu->mxcsr.x32;
     uint8_t tags = 0;
     for (int i=0; i<8; ++i)
-        tags |= ((emu->p_regs[i].tag)<<(i*2)==0b11)?0:1;
+        tags |= (((emu->fpu_tags>>(i*2))&0b11)?0:1)<<i;    // abridged tag: bit i is set when ST(i) is not empty
     p->TagWord = tags;
     p->ErrorOpcode = 0;
     p->ErrorOffset = 0;
@@ -353,15 +347,15 @@ void fpu_fxsave64(x64emu_t* emu, void* ed)
     int top = emu->top&7;
     int stack = 8-top;
     if(top==0)  // check if stack is full or empty, based on tag[0]
-        stack = (emu->p_regs[0].tag)?8:0;
+        stack = (emu->fpu_tags&0b11)?0:8;  // ST0 tagged 0b11 (empty) with top==0 means the stack holds nothing
     emu->sw.f.F87_TOP = top;
     p->ControlWord = emu->cw.x16;
     p->StatusWord = emu->sw.x16;
     p->MxCsr = emu->mxcsr.x32;
     uint8_t tags = 0;
     for (int i=0; i<8; ++i)
-        tags |= ((emu->p_regs[i].tag)<<(i*2)==0b11)?0:1;
+        tags |= (((emu->fpu_tags>>(i*2))&0b11)?0:1)<<i;    // abridged tag: bit i is set when ST(i) is not empty
     p->TagWord = tags;
     p->ErrorOpcode = 0;
     p->ErrorOffset = 0;
     p->DataOffset = 0;
@@ -382,12 +376,12 @@ void fpu_fxrstor32(x64emu_t* emu, void* ed)
         applyFlushTo0(emu);
     emu->top = emu->sw.f.F87_TOP;
     uint8_t tags = p->TagWord;
-    for(int i=0; i<8; ++i)
-        emu->p_regs[i].tag = (tags>>(i*2))?0:0b11;
+    for(int i=0; i<8; ++i)  // expand the abridged tag byte: bit i set -> 0b00 (valid), bit i clear -> 0b11 (empty)
+        emu->fpu_tags = (emu->fpu_tags & ~(0b11ULL<<(i*2))) | ((((tags>>i)&1)?0ULL:0b11ULL)<<(i*2));
     int top = emu->top&7;
     int stack = 8-top;
     if(top==0)  // check if stack is full or empty, based on tag[0]
-        stack = (emu->p_regs[0].tag)?8:0;
+        stack = (emu->fpu_tags&0b11)?0:8;  // ST0 tagged 0b11 (empty) means the stack holds nothing
     // copy back MMX regs...
     for(int i=0; i<8; ++i)
         memcpy((i<stack)?&ST(i):&emu->mmx[i], &p->FloatRegisters[i].q[0], sizeof(mmx87_regs_t));
@@ -406,11 +400,11 @@ void fpu_fxrstor64(x64emu_t* emu, void* ed)
     emu->top = emu->sw.f.F87_TOP;
     uint8_t tags = p->TagWord;
     for(int i=0; i<8; ++i)
-        emu->p_regs[i].tag = (tags>>(i*2))?0:0b11;
+        emu->fpu_tags = (emu->fpu_tags & ~(0b11ULL<<(i*2))) | ((((tags>>i)&1)?0ULL:0b11ULL)<<(i*2));  // rewrite field i from abridged bit i only
     int top = emu->top&7;
     int stack = 8-top;
     if(top==0)  // check if stack is full or empty, based on tag[0]
-        stack = (emu->p_regs[0].tag)?8:0;
+        stack = (emu->fpu_tags&0b11)?0:8;  // ST0 tagged 0b11 (empty) means the stack holds nothing
     // copy back MMX regs...
     for(int i=0; i<8; ++i)
         memcpy((i<stack)?&ST(i):&emu->mmx[i], &p->FloatRegisters[i].q[0], sizeof(mmx87_regs_t));
diff --git a/src/emu/x87emu_private.h b/src/emu/x87emu_private.h
index ae977133..a3c589df 100644
--- a/src/emu/x87emu_private.h
+++ b/src/emu/x87emu_private.h
@@ -15,6 +15,8 @@ typedef struct x64emu_s x64emu_t;
 #define LN2		0.69314718055994531
 #define LG2		0.3010299956639812
 
+#define TAGS_EMPTY 0b1111111111111111
+
 #define ST0 emu->x87[emu->top]
 #define ST1 emu->x87[(emu->top+1)&7]
 #define ST(a) emu->x87[(emu->top+(a))&7]
@@ -32,7 +34,8 @@ static inline void fpu_do_push(x64emu_t* emu)
     }*/
     if(emu->fpu_stack<8)
         ++emu->fpu_stack; 
-    emu->p_regs[newtop].tag = 0;    // full
+    emu->fpu_tags<<=2;  // st0 full
+    emu->fpu_tags &= TAGS_EMPTY;
     emu->top = newtop;
 }
 
@@ -47,8 +50,16 @@ static inline void fpu_do_pop(x64emu_t* emu)
     if(emu->fpu_stack>0)
         --emu->fpu_stack;
     
-    emu->p_regs[curtop].tag = 0b11;    // empty
+    emu->fpu_tags>>=2;
+    emu->fpu_tags |= 0b1100000000000000;    // top empty
     emu->top = (emu->top+1)&7;
+    // check tags
+    /*while((emu->fpu_tags&0b11) && emu->fpu_stack) {
+        --emu->fpu_stack;
+        emu->top = (emu->top+1)&7;
+        emu->fpu_tags>>=2;
+        emu->fpu_tags |= 0b1100000000000000;    // top empty
+    }*/
 }
 
 void fpu_do_free(x64emu_t* emu, int i);
@@ -128,7 +139,7 @@ static inline double fpu_round(x64emu_t* emu, double d) {
 
 static inline void fpu_fxam(x64emu_t* emu) {
     emu->sw.f.F87_C1 = (ST0.ud[1]&0x80000000)?1:0;
-    if((emu->fpu_stack<=0) || (emu->p_regs[(emu->top)&7].tag == 0b11)) {
+    if((emu->fpu_stack<=0) || (emu->fpu_tags&0b11)) {
         //Empty
         emu->sw.f.F87_C3 = 1;
         emu->sw.f.F87_C2 = 0;
diff --git a/src/include/regs.h b/src/include/regs.h
index dc72a648..721f155a 100644
--- a/src/include/regs.h
+++ b/src/include/regs.h
@@ -32,10 +32,6 @@ typedef union {
 	uint8_t  byte[8];
 } reg64_t;
 
-typedef struct {
-    uint32_t tag;
-} fpu_p_reg_t;
-
 typedef enum {
 	ROUND_Nearest = 0,		
 	ROUND_Down    = 1,