about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorptitSeb <sebastien.chev@gmail.com>2024-06-07 13:34:22 +0200
committerptitSeb <sebastien.chev@gmail.com>2024-06-07 13:34:22 +0200
commitb2fd651abe8aa02ff9984e5d9a84394b7869ed17 (patch)
tree6d2d767a7309df154bdbb11b243e55acd11ba5ea
parentd4b02e6a3fa74a73f42e230d920e91543a3b832b (diff)
downloadbox64-b2fd651abe8aa02ff9984e5d9a84394b7869ed17.tar.gz
box64-b2fd651abe8aa02ff9984e5d9a84394b7869ed17.zip
[ARM64_DYNAREC] Fixed YMM cache handling, especially in high pressure regs cases
-rw-r--r--src/dynarec/arm64/dynarec_arm64_functions.c71
-rw-r--r--src/dynarec/arm64/dynarec_arm64_functions.h11
-rw-r--r--src/dynarec/arm64/dynarec_arm64_helper.c113
-rw-r--r--src/dynarec/arm64/dynarec_arm64_helper.h4
-rw-r--r--src/dynarec/arm64/dynarec_arm64_pass2.h1
-rw-r--r--src/dynarec/arm64/dynarec_arm64_pass3.h6
-rw-r--r--src/dynarec/arm64/dynarec_arm64_private.h3
-rw-r--r--src/dynarec/dynarec_native.c14
8 files changed, 140 insertions, 83 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index 9e4fd46c..49adddf1 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -26,18 +26,13 @@
 #include "custommem.h"
 #include "bridge.h"
 
-#define XMM0    0
-#define XMM8    16
-#define X870    8
-#define EMM0    8
-
 // Get a FPU scratch reg
 int fpu_get_scratch(dynarec_arm_t* dyn, int ninst)
 {
     int ret = SCRATCH0 + dyn->n.fpu_scratch++;
     if(dyn->n.neoncache[ret].t==NEON_CACHE_YMMR || dyn->n.neoncache[ret].t==NEON_CACHE_YMMW) {
         // should only happens in step 0...
-        dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[ret].n); // mark as purged
+        dyn->scratchs |= (1<<(dyn->n.fpu_scratch-1)); // mark as not free
         dyn->n.neoncache[ret].v = 0; // reset it
     }
     return ret;
@@ -54,7 +49,7 @@ int fpu_get_reg_x87(dynarec_arm_t* dyn, int ninst, int t, int n)
     while (dyn->n.fpuused[i]) ++i;
     if(dyn->n.neoncache[i].t==NEON_CACHE_YMMR || dyn->n.neoncache[i].t==NEON_CACHE_YMMW) {
         // should only happens in step 0...
-        dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[i].n); // mark as purged
+        dyn->mmx87 |= (1<<(i-1-X870)); // mark as purged
         dyn->n.neoncache[i].v = 0; // reset it
     }
     dyn->n.fpuused[i] = 1;
@@ -79,7 +74,7 @@ int fpu_get_reg_emm(dynarec_arm_t* dyn, int ninst, int emm)
     int ret = EMM0 + emm;
     if(dyn->n.neoncache[ret].t==NEON_CACHE_YMMR || dyn->n.neoncache[ret].t==NEON_CACHE_YMMW) {
         // should only happens in step 0...
-        dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[ret].n); // mark as purged
+        dyn->mmx87 |= (1<<emm); // mark as purged
         dyn->n.neoncache[ret].v = 0; // reset it
     }
     dyn->n.fpuused[ret] = 1;
@@ -103,7 +98,7 @@ int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm)
     dyn->n.news |= (1<<i);
     return i;
 }
-static int internal_mark_ymm(dynarec_arm_t* dyn, int t, int ymm, int reg)
+int internal_mark_ymm(dynarec_arm_t* dyn, int t, int ymm, int reg)
 {
     if(dyn->n.neoncache[reg].t==NEON_CACHE_YMMR || dyn->n.neoncache[reg].t==NEON_CACHE_YMMW) {
         if(dyn->n.neoncache[reg].n == ymm) {
@@ -120,7 +115,7 @@ static int internal_mark_ymm(dynarec_arm_t* dyn, int t, int ymm, int reg)
     }
     return -1;
 }
-static int is_ymm_to_keep(dynarec_arm_t* dyn, int reg, int k1, int k2, int k3)
+int is_ymm_to_keep(dynarec_arm_t* dyn, int reg, int k1, int k2, int k3)
 {
     if(k1!=-1 && dyn->n.neoncache[reg].n==k1)
         return 1;
@@ -130,49 +125,7 @@ static int is_ymm_to_keep(dynarec_arm_t* dyn, int reg, int k1, int k2, int k3)
         return 1;
     return 0;
 }
-// Get an YMM quad reg, while preserving up to 3 other YMM regs
-int fpu_get_reg_ymm(dynarec_arm_t* dyn, int ninst, int t, int ymm, int k1, int k2, int k3)
-{
-    int i = EMM0;
-    // first pass see if a slot is free in EMM/x87 slots
-    for(int j=0; j<8; ++j) {
-        if(!dyn->n.fpuused[i+j]) {
-            int ret = internal_mark_ymm(dyn, t, ymm, i+j);
-            if(ret>=0) return ret;
-        }
-    }
-    // no slot in the emm space, look for scratch space in reverse
-    i = SCRATCH0;
-    for(int j=7; j>=dyn->n.fpu_scratch; --j) {
-        int ret = internal_mark_ymm(dyn, t, ymm, i+j);
-        if(ret>=0) return ret;
-    }
-    // no free slot, needs to purge a value... First loop on the YMMR, they are easier to purge
-    i = EMM0;
-    int keep = 0;
-    for(int j=0; j<8; ++j) {
-        if(!dyn->n.fpuused[i+j]) {
-            // should a test be done to check if ymm is already in the purge list?
-            if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3) && dyn->n.neoncache[i+j].t==NEON_CACHE_YMMR) {
-                dyn->insts[ninst].purge_ymm |= 1<<dyn->n.neoncache[i+j].n;
-                dyn->n.neoncache[i+j].v = 0;
-                return internal_mark_ymm(dyn, t, ymm, i+j);
-            }
-        }
-    }
-    // make space in the scratch area
-    i = SCRATCH0;
-    for(int j=dyn->n.fpu_scratch; j<8; ++j) {
-            // should a test be done to check if ymm is already in the purge list?
-            if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3)) {
-                dyn->insts[ninst].purge_ymm |= 1<<dyn->n.neoncache[i+j].n;
-                dyn->n.neoncache[i+j].v = 0;
-                return internal_mark_ymm(dyn, t, ymm, i+j);
-            }
-    }
-    printf_log(LOG_NONE, "BOX64 Dynarec: Error, unable to free a reg for YMM %d at inst=%d\n", ymm, ninst);
-    return i;
-}
+
 // Reset fpu regs counter
 static void fpu_reset_reg_neoncache(neoncache_t* n)
 {
@@ -693,10 +646,20 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
                 default:    break;
             }
         }
+        if(memcmp(dyn->insts[ninst].n.neoncache, dyn->n.neoncache, sizeof(dyn->n.neoncache))) {
+            dynarec_log(LOG_NONE, " %s(Change:", (box64_dynarec_dump>1)?"\e[1;91m":"");
+            for(int ii=0; ii<32; ++ii) if(dyn->insts[ninst].n.neoncache[ii].v!=dyn->n.neoncache[ii].v) {
+                dynarec_log(LOG_NONE, " V%d:%s", ii, getCacheName(dyn->n.neoncache[ii].t, dyn->n.neoncache[ii].n));
+                dynarec_log(LOG_NONE, "->%s", getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n));
+            }
+            dynarec_log(LOG_NONE, ")%s", (box64_dynarec_dump>1)?"\e[32m":"");
+        }
         if(dyn->ymm_zero || dyn->insts[ninst].ymm0_add || dyn->insts[ninst].ymm0_sub)
-            dynarec_log(LOG_NONE, " ymm0=%04x(+%0x4-%04x)", dyn->ymm_zero, dyn->insts[ninst].ymm0_add ,dyn->insts[ninst].ymm0_sub);
+            dynarec_log(LOG_NONE, " ymm0=%04x(+%04x-%04x)", dyn->ymm_zero, dyn->insts[ninst].ymm0_add ,dyn->insts[ninst].ymm0_sub);
         if(dyn->insts[ninst].purge_ymm)
             dynarec_log(LOG_NONE, " purgeYmm=%04x", dyn->insts[ninst].purge_ymm);
+        if(dyn->mmx87 || dyn->scratchs)
+            dynarec_log(LOG_NONE, " mask=%04x-%04x", dyn->mmx87, dyn->scratchs);
         if(dyn->n.stack || dyn->insts[ninst].n.stack_next || dyn->insts[ninst].n.x87stack)
             dynarec_log(LOG_NONE, " X87:%d/%d(+%d/-%d)%d", dyn->n.stack, dyn->insts[ninst].n.stack_next, dyn->insts[ninst].n.stack_push, dyn->insts[ninst].n.stack_pop, dyn->insts[ninst].n.x87stack);
         if(dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2)
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h
index 7da65897..342f0f33 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.h
+++ b/src/dynarec/arm64/dynarec_arm64_functions.h
@@ -4,6 +4,11 @@
 #include "../dynarec_native_functions.h"
 
 #define SCRATCH0    24
+#define XMM0    0
+#define XMM8    16
+#define X870    8
+#define EMM0    8
+
 
 // Get an FPU scratch reg
 int fpu_get_scratch(dynarec_arm_t* dyn, int ninst);
@@ -15,12 +20,14 @@ int fpu_get_reg_x87(dynarec_arm_t* dyn, int ninst, int t, int n);
 int fpu_get_reg_emm(dynarec_arm_t* dyn, int ninst, int emm);
 // Get an XMM quad reg
 int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm);
-// Get an YMM upper quad reg, while keeping up to 3 other YMM reg (-1 to no keep)
-int fpu_get_reg_ymm(dynarec_arm_t* dyn, int ninst, int t, int ymm, int k1, int k2, int k3);
 // Free a FPU/MMX/XMM reg
 void fpu_free_reg(dynarec_arm_t* dyn, int reg);
 // Reset fpu regs counter
 void fpu_reset_reg(dynarec_arm_t* dyn);
+// internal YMM handling
+int internal_mark_ymm(dynarec_arm_t* dyn, int t, int ymm, int reg);
+// is ymm neoncache[reg] one of k1, k2, k3?
+int is_ymm_to_keep(dynarec_arm_t* dyn, int reg, int k1, int k2, int k3);
 
 // ---- Neon cache functions
 // Get type for STx
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index e79d81f7..f1a67270 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -1207,8 +1207,10 @@ static void x87_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
             ADDw_U12(s3, s2, dyn->n.x87cache[i]);
             ANDw_mask(s3, s3, 0, 2); // mask=7   // (emu->top + i)&7
             if(neoncache_get_st_f(dyn, ninst, dyn->n.x87cache[i])>=0) {
-                FCVT_D_S(SCRATCH0, dyn->n.x87reg[i]);
-                VSTR64_REG_LSL3(SCRATCH0, s1, s3);
+                int scratch = fpu_get_scratch(dyn, ninst);
+                FCVT_D_S(scratch, dyn->n.x87reg[i]);
+                VSTR64_REG_LSL3(scratch, s1, s3);
+                fpu_free_reg(dyn, scratch);
             } else
                 VSTR64_REG_LSL3(dyn->n.x87reg[i], s1, s3);
         }
@@ -1981,18 +1983,15 @@ static void swapCache(dynarec_arm_t* dyn, int ninst, int i, int j, neoncache_t *
     neon_cache_t tmp;
     MESSAGE(LOG_DUMP, "\t  - Swapping %d <-> %d\n", i, j);
     // There is no VSWP in Arm64 NEON to swap 2 register contents!
-    // so use a scratch...
-    int scratch = fpu_get_scratch(dyn, ninst);
     if(quad) {
-        VMOVQ(scratch, i);
-        VMOVQ(i, j);
-        VMOVQ(j, scratch);
+        VEORQ(i, i, j);
+        VEORQ(j, i, j);
+        VEORQ(i, i, j);
     } else {
-        VMOV(scratch, i);
-        VMOV(i, j);
-        VMOV(j, scratch);
+        VEOR(i, i, j);
+        VEOR(j, i, j);
+        VEOR(i, i, j);
     }
-    fpu_free_reg(dyn, scratch);
     tmp.v = cache->neoncache[i].v;
     cache->neoncache[i].v = cache->neoncache[j].v;
     cache->neoncache[j].v = tmp.v;
@@ -2131,7 +2130,6 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in
 
 static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
 {
-#if STEP > 0
     int i2 = dyn->insts[ninst].x64.jmp_insts;
     if(i2<0)
         return;
@@ -2251,7 +2249,7 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
     }
     // ymm0
     s3_top = 1;
-    if(dyn->ymm_zero && (dyn->ymm_zero&~dyn->insts[i2].ymm_zero)) {
+    if(dyn->ymm_zero && dyn->insts[i2].purge_ymm) {
         for(int i=0; i<16; ++i)
             if(dyn->insts[i2].purge_ymm&(1<<i))
                 if(is_avx_zero(dyn, ninst, i)) {
@@ -2295,11 +2293,9 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
         stack_cnt = cache_i2.stack;
     }
     MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
-#endif
 }
 static void flagsCacheTransform(dynarec_arm_t* dyn, int ninst, int s1)
 {
-#if STEP > 1
     int j64;
     int jmp = dyn->insts[ninst].x64.jmp_insts;
     if(jmp<0)
@@ -2338,7 +2334,6 @@ static void flagsCacheTransform(dynarec_arm_t* dyn, int ninst, int s1)
         CALL_(UpdateFlags, -1, 0);
         MARKF2;
     }
-#endif
 }
 
 void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3) {
@@ -2466,8 +2461,8 @@ void fpu_reset_cache(dynarec_arm_t* dyn, int ninst, int reset_n)
     if(box64_dynarec_dump) dynarec_log(LOG_NONE, "New x87stack=%d\n", dyn->n.x87stack);
         #endif
     #if defined(HAVE_TRACE) && (STEP>2)
-    if(box64_dynarec_dump)
-        if(memcmp(&dyn->n, &dyn->insts[reset_n].n, sizeof(neon_cache_t))) {
+    if(box64_dynarec_dump && 0) //disable for now, need more work
+        if(memcmp(&dyn->n, &dyn->insts[reset_n].n, sizeof(neoncache_t))) {
             MESSAGE(LOG_DEBUG, "Warning, difference in neoncache: reset=");
             for(int i=0; i<32; ++i)
                 if(dyn->insts[reset_n].n.neoncache[i].v)
@@ -2520,11 +2515,15 @@ void fpu_propagate_stack(dynarec_arm_t* dyn, int ninst)
 
 void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, uint16_t mask, int s1)
 {
-    MESSAGE(LOG_NONE, "Purge YMM mask=%04x --------\n", dyn->insts[ninst].purge_ymm);
     int s1_set = 0;
+    int do_something = 0;
     for(int i=0; i<16; ++i)
         if(mask&(1<<i)) {
             if(is_avx_zero_unset(dyn, ninst, i)) {
+                if(!do_something) {
+                    MESSAGE(LOG_NONE, "Purge YMM mask=%04x --------\n", mask);
+                    do_something = 1;
+                }
                 if(!s1_set) {
                     ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0]));
                     s1_set = 1;
@@ -2537,10 +2536,84 @@ void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, uint16_t mask, int s1)
                     dyn->n.neoncache[j].v = 0;
                     j=32;
                 } else if(dyn->n.neoncache[j].t==NEON_CACHE_YMMW && dyn->n.neoncache[j].n==i) {
+                    if(!do_something) {
+                        MESSAGE(LOG_NONE, "Purge YMM mask=%04x --------\n", mask);
+                        do_something = 1;
+                    }
                     VSTR128_U12(j, xEmu, offsetof(x64emu_t, ymm[i]));
                     dyn->n.neoncache[j].v = 0;
                     j=32;
                 }
         }
-    MESSAGE(LOG_NONE, "---------- Purge YMM\n");
+    if(do_something)
+        MESSAGE(LOG_NONE, "---------- Purge YMM\n");
+}
+
+// Get an YMM quad reg, while preserving up to 3 other YMM regs
+int fpu_get_reg_ymm(dynarec_arm_t* dyn, int ninst, int t, int ymm, int k1, int k2, int k3)
+{
+    int i = -1;
+    #if STEP >1
+    // check the cached neoncache, it should be exact
+    // look for it
+    for(int ii=0; ii<32 && i==-1; ++ii)
+        if(dyn->insts[ninst].n.neoncache[ii].n==ymm && (dyn->insts[ninst].n.neoncache[ii].t==NEON_CACHE_YMMR || dyn->insts[ninst].n.neoncache[ii].t==NEON_CACHE_YMMW))
+            i = ii;
+    if(i!=-1) {
+        // already there!
+        if((dyn->n.neoncache[i].t==NEON_CACHE_YMMW  || dyn->n.neoncache[i].t==NEON_CACHE_YMMR) && dyn->n.neoncache[i].n==ymm) {
+            if(t==NEON_CACHE_YMMW)
+                dyn->n.neoncache[i].t=t;
+            return i;
+        }
+        // check if free or should be purge before...
+        if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW)
+            VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n]));
+        dyn->n.neoncache[i].t=t;
+        dyn->n.neoncache[i].n=ymm;
+        return i;
+    }
+    printf_log(LOG_NONE, "BOX64 Dynarec: Warning, unable to find YMM %d in neoncache at inst=%d\n", ymm, ninst);
+    #else
+    i = EMM0;
+    // first pass see if a slot is free in EMM/x87 slots
+    for(int j=0; j<8; ++j) {
+        if(!dyn->n.fpuused[i+j] && !(dyn->mmx87&(1<<j))) {
+            int ret = internal_mark_ymm(dyn, t, ymm, i+j);
+            if(ret>=0) return ret;
+        }
+    }
+    // no slot in the emm space, look for scratch space in reverse
+    i = SCRATCH0;
+    for(int j=7; j>=dyn->n.fpu_scratch; --j) 
+        if(!(dyn->scratchs&(1<<j))) {
+            int ret = internal_mark_ymm(dyn, t, ymm, i+j);
+            if(ret>=0) return ret;
+    }
+    // no free slot, needs to purge a value... First loop on the YMMR, they are easier to purge
+    i = EMM0;
+    int keep = 0;
+    for(int j=0; j<8; ++j) {
+        if(!dyn->n.fpuused[i+j] && !(dyn->mmx87&(1<<j))) {
+            // should a test be done to check if ymm is already in the purge list?
+            if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3) && dyn->n.neoncache[i+j].t==NEON_CACHE_YMMR) {
+                dyn->n.neoncache[i+j].v = 0;
+                return internal_mark_ymm(dyn, t, ymm, i+j);
+            }
+        }
+    }
+    // make space in the scratch area
+    i = SCRATCH0;
+    for(int j=dyn->n.fpu_scratch; j<8; ++j) {
+            // should a test be done to check if ymm is already in the purge list?
+            if(!(dyn->scratchs&(1<<j)) &&!is_ymm_to_keep(dyn, i+j, k1, k2, k3)) {
+                // Save the reg and recycle it
+                VSTR128_U12(i+j, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i+j].n]));
+                dyn->n.neoncache[i+j].v = 0;
+                return internal_mark_ymm(dyn, t, ymm, i+j);
+            }
+    }
+    #endif
+    printf_log(LOG_NONE, "BOX64 Dynarec: Error, unable to free a reg for YMM %d at inst=%d\n", ymm, ninst);
+    return i;
 }
\ No newline at end of file
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 34822661..62d3ec0f 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -1328,6 +1328,7 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr);
 #define ymm_get_reg       STEPNAME(ymm_get_reg)
 #define ymm_get_reg_empty STEPNAME(ymm_get_reg_empty)
 #define ymm_mark_zero     STEPNAME(ymm_mark_zero)
+#define fpu_get_reg_ymm   STEPNAME(fpu_get_reg_ymm)
 
 #define fpu_pushcache   STEPNAME(fpu_pushcache)
 #define fpu_popcache    STEPNAME(fpu_popcache)
@@ -1580,7 +1581,8 @@ int ymm_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite, int
 int ymm_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a, int k1, int k2, int k3);
 // mark an ymm upper part has zero (forgetting upper part if needed)
 void ymm_mark_zero(dynarec_arm_t* dyn, int ninst, int a);
-
+// Get an YMM upper quad reg, while keeping up to 3 other YMM reg (-1 to no keep)
+int fpu_get_reg_ymm(dynarec_arm_t* dyn, int ninst, int t, int ymm, int k1, int k2, int k3);
 
 uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
diff --git a/src/dynarec/arm64/dynarec_arm64_pass2.h b/src/dynarec/arm64/dynarec_arm64_pass2.h
index 512e4416..013dfb86 100644
--- a/src/dynarec/arm64/dynarec_arm64_pass2.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass2.h
@@ -11,6 +11,7 @@
         if(ninst) {                                                                                     \
                 dyn->insts[ninst].address = (dyn->insts[ninst-1].address+dyn->insts[ninst-1].size);     \
                 dyn->insts_size += 1+((dyn->insts[ninst-1].x64.size>(dyn->insts[ninst-1].size/4))?dyn->insts[ninst-1].x64.size:(dyn->insts[ninst-1].size/4))/15; \
+                dyn->insts[ninst].ymm0_pass2 = dyn->ymm_zero;   \
         }
 #define INST_EPILOG dyn->insts[ninst].epilog = dyn->native_size; 
 #define INST_NAME(name) 
diff --git a/src/dynarec/arm64/dynarec_arm64_pass3.h b/src/dynarec/arm64/dynarec_arm64_pass3.h
index 875e8af9..38bcf61b 100644
--- a/src/dynarec/arm64/dynarec_arm64_pass3.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass3.h
@@ -14,8 +14,10 @@
 
 #define MESSAGE(A, ...)  if(box64_dynarec_dump) dynarec_log(LOG_NONE, __VA_ARGS__)
 #define NEW_INST        \
-    if(ninst)                                                   \
-        addInst(dyn->instsize, &dyn->insts_size, dyn->insts[ninst-1].x64.size, dyn->insts[ninst-1].size/4);
+    if(ninst) {                                                  \
+        addInst(dyn->instsize, &dyn->insts_size, dyn->insts[ninst-1].x64.size, dyn->insts[ninst-1].size/4); \
+        dyn->insts[ninst].ymm0_pass3 = dyn->ymm_zero;   \
+    }
 #define INST_EPILOG     
 #define INST_NAME(name) inst_name_pass3(dyn, ninst, name, rex)
 #define TABLE64(A, V)   {int val64offset = Table64(dyn, (V), 3); MESSAGE(LOG_DUMP, "  Table64: 0x%lx\n", (V)); LDRx_literal(A, val64offset);}
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index 03958927..a2e32e80 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -85,6 +85,7 @@ typedef struct instruction_arm64_s {
     uint16_t            ymm0_add;   // the ymm0 added by the opcode
     uint16_t            ymm0_sub;   // the ymm0 removed by the opcode
     uint16_t            ymm0_out;   // the ymmm0 at th end of the opcode
+    uint16_t            ymm0_pass2, ymm0_pass3;
     uint8_t             barrier_maybe;
     uint8_t             will_write;
     uint8_t             last_write;
@@ -131,6 +132,8 @@ typedef struct dynarec_arm_s {
     uint8_t             doublepop;
     uint8_t             always_test;
     uint8_t             abort;      // abort the creation of the block
+    uint8_t             scratchs;   // mask of the 8 scratch neon register globaly used in the dynablock
+    uint8_t             mmx87;      // mask of the 8 mmx/x87 neon register globaly used in the dynablock
 } dynarec_arm_t;
 
 void add_next(dynarec_arm_t *dyn, uintptr_t addr);
diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index a9086625..161e577e 100644
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -605,8 +605,10 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
     for(int i=1; i<helper.size-1; ++i)
         if(!helper.insts[i].pred_sz) {
             int ii = i;
-            while(ii<helper.size && !helper.insts[ii].pred_sz)
+            while(ii<helper.size && !helper.insts[ii].pred_sz) {
                 fpu_reset_ninst(&helper, ii++);
+                helper.insts[ii].ymm0_sub = helper.insts[ii].ymm0_add = helper.insts[ii].ymm0_out = helper.insts[ii].purge_ymm = 0;
+            }
             i = ii;
         }
 
@@ -661,6 +663,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
     int oldtable64size = helper.table64size;
     size_t oldnativesize = helper.native_size;
     size_t oldinstsize = helper.insts_size;
+    int oldsize= helper.size;
     helper.native_size = 0;
     helper.table64size = 0; // reset table64 (but not the cap)
     helper.insts_size = 0;  // reset
@@ -696,14 +699,17 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
         return NULL;
     }
     if((oldnativesize!=helper.native_size) || (oldtable64size<helper.table64size)) {
-        printf_log(LOG_NONE, "BOX64: Warning, size difference in block between pass2 (%zu) & pass3 (%zu)!\n", sz, helper.native_size+helper.table64size*8);
+        printf_log(LOG_NONE, "BOX64: Warning, size difference in block between pass2 (%zu, %d) & pass3 (%zu, %d)!\n", oldnativesize+oldtable64size*8, oldsize, helper.native_size+helper.table64size*8, helper.size);
         uint8_t *dump = (uint8_t*)helper.start;
         printf_log(LOG_NONE, "Dump of %d x64 opcodes:\n", helper.size);
         for(int i=0; i<helper.size; ++i) {
-            printf_log(LOG_NONE, "%p:", dump);
+            printf_log(LOG_NONE, "%s%p:", (helper.insts[i].size2!=helper.insts[i].size)?"=====> ":"", dump);
             for(; dump<(uint8_t*)helper.insts[i+1].x64.addr; ++dump)
                 printf_log(LOG_NONE, " %02X", *dump);
-            printf_log(LOG_NONE, "\t%d -> %d\n", helper.insts[i].size2, helper.insts[i].size);
+            printf_log(LOG_NONE, "\t%d -> %d", helper.insts[i].size2, helper.insts[i].size);
+            if(helper.insts[i].ymm0_pass2 || helper.insts[i].ymm0_pass3)
+                printf_log(LOG_NONE, "\t %04x -> %04x", helper.insts[i].ymm0_pass2, helper.insts[i].ymm0_pass3);
+            printf_log(LOG_NONE, "\n");
         }
         printf_log(LOG_NONE, "Table64 \t%d -> %d\n", oldtable64size*8, helper.table64size*8);
         printf_log(LOG_NONE, " ------------\n");