about summary refs log tree commit diff stats
path: root/src/dynarec
diff options
context:
space:
mode:
Diffstat (limited to 'src/dynarec')
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_functions.c | 17
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_functions.h | 1
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.c | 38
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_pass1.h | 2
-rw-r--r-- src/dynarec/dynarec_arch.h | 3
-rw-r--r-- src/dynarec/dynarec_native_pass.c | 2
6 files changed, 39 insertions, 24 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index ea60745c..afb1ed6b 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -168,6 +168,10 @@ static void fpu_reset_reg_neoncache(neoncache_t* n)
         n->fpuused[i]=0;
         n->neoncache[i].v = 0;
     }
+    n->ymm_regs = 0;
+    n->ymm_removed = 0;
+    n->ymm_used = 0;
+    n->ymm_write = 0;
 
 }
 void fpu_reset_reg(dynarec_arm_t* dyn)
@@ -767,7 +771,7 @@ static void sse_reset(neoncache_t* n)
             n->neoncache[i].v = 0;
 }
 
-void fpu_reset(dynarec_arm_t* dyn)
+void fpu_reset(dynarec_native_t* dyn)
 {
     x87_reset(&dyn->n);
     mmx_reset(&dyn->n);
@@ -776,12 +780,21 @@ void fpu_reset(dynarec_arm_t* dyn)
     dyn->ymm_zero = 0;
 }
 
-void fpu_reset_ninst(dynarec_arm_t* dyn, int ninst)
+void fpu_reset_ninst(dynarec_native_t* dyn, int ninst)
 {
     x87_reset(&dyn->insts[ninst].n);
     mmx_reset(&dyn->insts[ninst].n);
     sse_reset(&dyn->insts[ninst].n);
     fpu_reset_reg_neoncache(&dyn->insts[ninst].n);
+
+}
+
+void arm64_fpu_reset(dynarec_native_t* dyn, int ninst, int step)   // zero the ymm0_in/ymm0_out fields of dyn->insts[ninst]
+{
+    if(step<2) {    // nothing is cleared when step is 2 or more
+        dyn->insts[ninst].ymm0_in = 0;
+        dyn->insts[ninst].ymm0_out = 0;
+    }
 }
 
 int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st)
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h
index b6c95904..0af490e4 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.h
+++ b/src/dynarec/arm64/dynarec_arm64_functions.h
@@ -69,6 +69,7 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode);
 // reset the cache
 void fpu_reset(dynarec_native_t* dyn);
 void fpu_reset_ninst(dynarec_native_t* dyn, int ninst);
+void arm64_fpu_reset(dynarec_native_t* dyn, int ninst, int step);
 
 // is st freed
 int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st);
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 32de5146..136c0f8c 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -743,7 +743,6 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save
     if(saveflags) {
         STRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags));
     }
-    fpu_pushcache(dyn, ninst, reg, 0);
     if(ret!=-2) {
         STPx_S7_preindex(xEmu, savereg, xSP, -16);   // ARM64 stack needs to be 16byte aligned
         STPx_S7_offset(xRAX, xRCX, xEmu, offsetof(x64emu_t, regs[_AX]));    // x9..x15, x16,x17,x18 those needs to be saved by caller
@@ -751,6 +750,7 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save
         STPx_S7_offset(xRSP, xRBP, xEmu, offsetof(x64emu_t, regs[_SP]));
         STPx_S7_offset(xRSI, xRDI, xEmu, offsetof(x64emu_t, regs[_SI]));
         STPx_S7_offset(xR8,  xR9,  xEmu, offsetof(x64emu_t, regs[_R8]));
+        fpu_pushcache(dyn, ninst, savereg, 0);
     }
     TABLE64(reg, (uintptr_t)fnc);
     BLR(reg);
@@ -772,8 +772,8 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save
         GO(RSI, RDI);
         GO(R8, R9);
         #undef GO
+        fpu_popcache(dyn, ninst, savereg, 0);   // savereg will not be used
     }
-    fpu_popcache(dyn, ninst, reg, 0);
     if(saveflags) {
         LDRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags));
     }
@@ -1658,7 +1658,7 @@ void sse_forget_reg(dynarec_arm_t* dyn, int ninst, int a)
             STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a])+8);
         }
     } else for(int i=0; i<32; ++i)
-        if((dyn->n.neoncache[i].t == NEON_CACHE_YMMW) || (dyn->n.neoncache[i].t == NEON_CACHE_YMMR)) {
+        if(((dyn->n.neoncache[i].t == NEON_CACHE_YMMW) || (dyn->n.neoncache[i].t == NEON_CACHE_YMMR)) && (dyn->n.neoncache[i].n==a)) {   // && binds tighter than ||: group the type test so n==a applies to YMMW entries too
             if(dyn->n.neoncache[i].t == NEON_CACHE_YMMW)
                 VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n]));
             fpu_free_reg(dyn, i);
@@ -1853,7 +1853,7 @@ void ymm_mark_zero(dynarec_arm_t* dyn, int ninst, int a)
                 VEORQ(i, i, i);
                 return;
             }
-            dyn->n.neoncache[i].v = 0;  // forget it!
+            fpu_free_reg(dyn, i);
         }
     #if STEP == 0
     dyn->insts[ninst].ymm0_add |= (1<<a);
@@ -1869,8 +1869,6 @@ void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07)
     for (int i=start; i<16; i++) {
         if((dyn->n.ssecache[i].v!=-1) && (dyn->n.ssecache[i].write))
             ++n;
-        if(is_avx_zero(dyn, ninst, i))
-            ++n;
     }
     for(int i=0; i<32; ++i)
         if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW)
@@ -1878,23 +1876,16 @@ void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07)
     if(!n)
         return;
     MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n);
-    int s1_set = 0;
     for (int i=start; i<16; ++i) {
         if((dyn->n.ssecache[i].v!=-1) && (dyn->n.ssecache[i].write)) {
             VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
         }
-        if(is_avx_zero(dyn, ninst, i)) {
-            if(!s1_set) {
-                ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0]));
-                s1_set = 1;
-            }
-            STPx_S7_offset(xZR, xZR, s1, i*16);
-        }
     }
-    // purge the YMM values
-    for(int i=0; i<32; ++i)
+    // push the YMM values
+    for(int i=0; i<32; ++i) {
         if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW)
             VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n]));
+    }
     MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n);
 }
 
@@ -1906,6 +1897,9 @@ void fpu_popcache(dynarec_arm_t* dyn, int ninst, int s1, int not07)
     for (int i=start; i<16; i++)
         if(dyn->n.ssecache[i].v!=-1)
             ++n;
+    for(int i=0; i<32; ++i)
+        if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW || dyn->n.neoncache[i].t==NEON_CACHE_YMMR)
+            ++n;
     if(!n)
         return;
     MESSAGE(LOG_DUMP, "\tPop XMM Cache (%d)------\n", n);
@@ -1916,7 +1910,7 @@ void fpu_popcache(dynarec_arm_t* dyn, int ninst, int s1, int not07)
             dyn->n.neoncache[dyn->n.ssecache[i].reg].t = NEON_CACHE_XMMR;*/
         }
     for(int i=0; i<32; ++i)
-        if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW)
+        if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW || dyn->n.neoncache[i].t==NEON_CACHE_YMMR)
             VLDR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n]));
     MESSAGE(LOG_DUMP, "\t------- Pop XMM Cache (%d)\n", n);
 }
@@ -2265,11 +2259,11 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
                     MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
                     SCVTFDD(i, i);
                     cache.neoncache[i].t = NEON_CACHE_ST_D;
-                } else if(cache.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW)
-                    { cache.neoncache[i].t = NEON_CACHE_XMMW; }
-                else if(cache.neoncache[i].t == NEON_CACHE_YMMR && cache_i2.neoncache[i].t == NEON_CACHE_YMMW)
-                    { cache.neoncache[i].t = NEON_CACHE_YMMW; }
-                else if(cache.neoncache[i].t == NEON_CACHE_XMMW && cache_i2.neoncache[i].t == NEON_CACHE_XMMR) {
+                } else if(cache.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW) {
+                    cache.neoncache[i].t = NEON_CACHE_XMMW;
+                } else if(cache.neoncache[i].t == NEON_CACHE_YMMR && cache_i2.neoncache[i].t == NEON_CACHE_YMMW) {
+                    cache.neoncache[i].t = NEON_CACHE_YMMW;
+                } else if(cache.neoncache[i].t == NEON_CACHE_XMMW && cache_i2.neoncache[i].t == NEON_CACHE_XMMR) {
                     // refresh cache...
                     MESSAGE(LOG_DUMP, "\t  - Refreh %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
                     VSTR128_U12(i, xEmu, offsetof(x64emu_t, xmm[cache.neoncache[i].n]));
diff --git a/src/dynarec/arm64/dynarec_arm64_pass1.h b/src/dynarec/arm64/dynarec_arm64_pass1.h
index 6cf92feb..ab1f5fc4 100644
--- a/src/dynarec/arm64/dynarec_arm64_pass1.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass1.h
@@ -5,10 +5,12 @@
 #define NEW_INST                                \
         dyn->insts[ninst].f_entry = dyn->f;     \
         dyn->n.combined1 = dyn->n.combined2 = 0;\
+        dyn->insts[ninst].ymm0_in = dyn->ymm_zero;\
         dyn->n.swapped = 0; dyn->n.barrier = 0
 
 #define INST_EPILOG                             \
         dyn->insts[ninst].n = dyn->n;           \
+        dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\
         dyn->insts[ninst].f_exit = dyn->f
 
 #define INST_NAME(name)  
diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h
index f89125a7..351d9fcd 100644
--- a/src/dynarec/dynarec_arch.h
+++ b/src/dynarec/dynarec_arch.h
@@ -17,6 +17,7 @@
 #include "arm64/dynarec_arm64_functions.h"

 // Limit here is defined by LD litteral, that is 19bits

 #define MAXBLOCK_SIZE ((1<<19)-200)

+#define ARM_FPU_RESET() arm64_fpu_reset(dyn, ninst, STEP)

 #elif defined(LA64)

 

 #define instruction_native_t        instruction_la64_t

@@ -33,6 +34,7 @@
 #include "la64/dynarec_la64_functions.h"

 // Limit here is unconditionnal jump, that is signed 28bits

 #define MAXBLOCK_SIZE ((1 << 27) - 200)

+#define ARM_FPU_RESET()

 #elif defined(RV64)

 

 #define instruction_native_t        instruction_rv64_t

@@ -49,6 +51,7 @@
 #include "rv64/dynarec_rv64_functions.h"

 // Limit here is unconditionnal jump, that is signed 21bits

 #define MAXBLOCK_SIZE ((1<<20)-200)

+#define ARM_FPU_RESET()

 #else

 #error Unsupported platform

 #endif

diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index 2ebc89cc..14f80103 100644
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -89,12 +89,14 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
                 dyn->f.dfnone = 0;
                 dyn->f.pending = 0;
                 fpu_reset(dyn);
+                ARM_FPU_RESET();
             } else {
                 fpu_reset_cache(dyn, ninst, reset_n);
                 dyn->f = dyn->insts[reset_n].f_exit;
                 if(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT) {
                     MESSAGE(LOG_DEBUG, "Apply Barrier Float\n");
                     fpu_reset(dyn);
+                    ARM_FPU_RESET();
                 }
                 if(dyn->insts[ninst].x64.barrier&BARRIER_FLAGS) {
                     MESSAGE(LOG_DEBUG, "Apply Barrier Flags\n");