about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
authorptitSeb <sebastien.chev@gmail.com>2024-06-05 10:44:31 +0200
committerptitSeb <sebastien.chev@gmail.com>2024-06-05 10:44:31 +0200
commitba411303e951cb51766d42a15be59e2b9d5e67ec (patch)
treed348eb0b9c0561bde343f686ef6e308476a2e9a5 /src
parent8848bc2e7f404c72396392b307ee6c3494392488 (diff)
downloadbox64-ba411303e951cb51766d42a15be59e2b9d5e67ec.tar.gz
box64-ba411303e951cb51766d42a15be59e2b9d5e67ec.zip
[DYNAREC] Improved handling of the Ymm0 attribute
Diffstat (limited to 'src')
-rw-r--r--src/dynarec/arm64/dynarec_arm64_functions.c5
-rw-r--r--src/dynarec/arm64/dynarec_arm64_helper.c23
-rw-r--r--src/dynarec/arm64/dynarec_arm64_helper.h6
-rw-r--r--src/dynarec/arm64/dynarec_arm64_pass0.h3
-rw-r--r--src/dynarec/arm64/dynarec_arm64_private.h4
-rw-r--r--src/dynarec/dynarec_native.c37
-rw-r--r--src/dynarec/dynarec_native_pass.c4
-rw-r--r--src/dynarec/la64/dynarec_la64_pass0.h3
-rw-r--r--src/dynarec/la64/dynarec_la64_private.h3
-rw-r--r--src/dynarec/rv64/dynarec_rv64_pass0.h3
-rw-r--r--src/dynarec/rv64/dynarec_rv64_private.h3
11 files changed, 69 insertions, 25 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index 4ee331ab..9e4fd46c 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -540,7 +540,6 @@ void neoncacheUnwind(neoncache_t* cache)
     // And now, rebuild the x87cache info with neoncache
     cache->mmxcount = 0;
     cache->fpu_scratch = 0;
-    cache->fpu_extra_qscratch = 0;
     cache->fpu_reg = 0;
     for(int i=0; i<8; ++i) {
         cache->x87cache[i] = -1;
@@ -694,8 +693,8 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
                 default:    break;
             }
         }
-        if(dyn->ymm_zero)
-            dynarec_log(LOG_NONE, " ymm0=%04x", dyn->ymm_zero);
+        if(dyn->ymm_zero || dyn->insts[ninst].ymm0_add || dyn->insts[ninst].ymm0_sub)
+            dynarec_log(LOG_NONE, " ymm0=%04x(+%0x4-%04x)", dyn->ymm_zero, dyn->insts[ninst].ymm0_add ,dyn->insts[ninst].ymm0_sub);
         if(dyn->insts[ninst].purge_ymm)
             dynarec_log(LOG_NONE, " purgeYmm=%04x", dyn->insts[ninst].purge_ymm);
         if(dyn->n.stack || dyn->insts[ninst].n.stack_next || dyn->insts[ninst].n.x87stack)
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index ded87eb3..e79d81f7 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -2002,9 +2002,9 @@ static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int
 {
     if(cache->neoncache[i].v) {
         int quad = 0;
-        if(t==NEON_CACHE_XMMR || t==NEON_CACHE_XMMW)
+        if(t==NEON_CACHE_XMMR || t==NEON_CACHE_XMMW || t==NEON_CACHE_YMMR || t==NEON_CACHE_YMMW)
             quad = 1;
-        if(cache->neoncache[i].t==NEON_CACHE_XMMR || cache->neoncache[i].t==NEON_CACHE_XMMW)
+        if(cache->neoncache[i].t==NEON_CACHE_XMMR || cache->neoncache[i].t==NEON_CACHE_XMMW || cache->neoncache[i].t==NEON_CACHE_YMMR || cache->neoncache[i].t==NEON_CACHE_YMMW)
             quad = 1;
         int j = i+1;
         while(cache->neoncache[j].v)
@@ -2171,12 +2171,17 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
     int s1_val = 0;
     int s2_val = 0;
     // unload every uneeded cache
-    // check SSE first, than MMX, in order, for optimisation issue
+    // check SSE first, then YMM, then MMX, in order, to optimise successive memory writes
     for(int i=0; i<16; ++i) {
         int j=findCacheSlot(dyn, ninst, NEON_CACHE_XMMW, i, &cache);
         if(j>=0 && findCacheSlot(dyn, ninst, NEON_CACHE_XMMW, i, &cache_i2)==-1)
             unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.neoncache[j].t, cache.neoncache[j].n);
     }
+    for(int i=0; i<16; ++i) {
+        int j=findCacheSlot(dyn, ninst, NEON_CACHE_YMMW, i, &cache);
+        if(j>=0 && findCacheSlot(dyn, ninst, NEON_CACHE_YMMW, i, &cache_i2)==-1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.neoncache[j].t, cache.neoncache[j].n);
+    }
     for(int i=0; i<8; ++i) {
         int j=findCacheSlot(dyn, ninst, NEON_CACHE_MM, i, &cache);
         if(j>=0 && findCacheSlot(dyn, ninst, NEON_CACHE_MM, i, &cache_i2)==-1)
@@ -2347,7 +2352,7 @@ void fpu_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
 {
     x87_reflectcache(dyn, ninst, s1, s2, s3);
     mmx_reflectcache(dyn, ninst, s1);
-    sse_reflectcache(dyn, ninst, s1);
+    //sse_reflectcache(dyn, ninst, s1); // no need, it's pushed/unpushed during call
 }
 
 void fpu_unreflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
@@ -2464,7 +2469,7 @@ void fpu_reset_cache(dynarec_arm_t* dyn, int ninst, int reset_n)
     if(box64_dynarec_dump)
         if(memcmp(&dyn->n, &dyn->insts[reset_n].n, sizeof(neon_cache_t))) {
             MESSAGE(LOG_DEBUG, "Warning, difference in neoncache: reset=");
-            for(int i=0; i<24; ++i)
+            for(int i=0; i<32; ++i)
                 if(dyn->insts[reset_n].n.neoncache[i].v)
                     MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[reset_n].n.neoncache[i].t, dyn->insts[reset_n].n.neoncache[i].n));
             if(dyn->insts[reset_n].n.combined1 || dyn->insts[reset_n].n.combined2)
@@ -2472,7 +2477,7 @@ void fpu_reset_cache(dynarec_arm_t* dyn, int ninst, int reset_n)
             if(dyn->insts[reset_n].n.stack_push || dyn->insts[reset_n].n.stack_pop)
                 MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[reset_n].n.stack_push, -dyn->insts[reset_n].n.stack_pop);
             MESSAGE(LOG_DEBUG, " ==> ");
-            for(int i=0; i<24; ++i)
+            for(int i=0; i<32; ++i)
                 if(dyn->insts[ninst].n.neoncache[i].v)
                     MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[ninst].n.neoncache[i].t, dyn->insts[ninst].n.neoncache[i].n));
             if(dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2)
@@ -2480,7 +2485,7 @@ void fpu_reset_cache(dynarec_arm_t* dyn, int ninst, int reset_n)
             if(dyn->insts[ninst].n.stack_push || dyn->insts[ninst].n.stack_pop)
                 MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[ninst].n.stack_push, -dyn->insts[ninst].n.stack_pop);
             MESSAGE(LOG_DEBUG, " -> ");
-            for(int i=0; i<24; ++i)
+            for(int i=0; i<32; ++i)
                 if(dyn->n.neoncache[i].v)
                     MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->n.neoncache[i].t, dyn->n.neoncache[i].n));
             if(dyn->n.combined1 || dyn->n.combined2)
@@ -2513,12 +2518,12 @@ void fpu_propagate_stack(dynarec_arm_t* dyn, int ninst)
     dyn->n.swapped = 0;
 }
 
-void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, int s1)
+void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, uint16_t mask, int s1)
 {
     MESSAGE(LOG_NONE, "Purge YMM mask=%04x --------\n", dyn->insts[ninst].purge_ymm);
     int s1_set = 0;
     for(int i=0; i<16; ++i)
-        if(dyn->insts[ninst].purge_ymm&(1<<i)) {
+        if(mask&(1<<i)) {
             if(is_avx_zero_unset(dyn, ninst, i)) {
                 if(!s1_set) {
                     ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0]));
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index ed593ca7..cb0e181c 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -562,7 +562,7 @@
 // Get EY
 #define GETEY(ey)                                                                               \
     if(MODREG)                                                                                  \
-        ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1, -1, -1, -1);                 \
+        ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, -1, -1, -1);                 \
     else                                                                                        \
         VLDR128_U12(ey, ed, fixedaddress+16);                                                   \
 
@@ -1499,7 +1499,7 @@ void x87_restoreround(dynarec_arm_t* dyn, int ninst, int s1);
 // Set rounding according to mxcsr flags, return reg to restore flags
 int sse_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
 // purge ymm_zero mask according to purge_ymm
-void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, int s1);
+void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, uint16_t mask, int s1);
 
 void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3);
 
@@ -1772,6 +1772,6 @@ uintptr_t dynarec64_AVX_F2_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
         }                                       \
     }
 
-#define PURGE_YMM()    avx_purge_ymm(dyn, ninst, x1)
+#define PURGE_YMM()    avx_purge_ymm(dyn, ninst, dyn->insts[ninst+1].purge_ymm, x1)
 
 #endif //__DYNAREC_ARM64_HELPER_H__
diff --git a/src/dynarec/arm64/dynarec_arm64_pass0.h b/src/dynarec/arm64/dynarec_arm64_pass0.h
index 6e9b8019..8b2fc6fb 100644
--- a/src/dynarec/arm64/dynarec_arm64_pass0.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass0.h
@@ -32,6 +32,9 @@
 #define INST_EPILOG                             \
         dyn->insts[ninst].f_exit = dyn->f;      \
         dyn->insts[ninst].n = dyn->n;           \
+        dyn->insts[ninst].ymm0_add = dyn->ymm_zero&~dyn->insts[ninst].ymm_zero; \
+        dyn->insts[ninst].ymm0_sub = dyn->insts[ninst].ymm_zero&~dyn->ymm_zero; \
+        dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\
         dyn->insts[ninst].x64.has_next = (ok>0)?1:0;
 #define INST_NAME(name) 
 #define DEFAULT                         \
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index 6a6647df..03958927 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -57,7 +57,6 @@ typedef struct neoncache_s {
     int8_t              x87stack;       // cache stack counter
     int8_t              mmxcount;       // number of mmx register used (not both mmx and x87 at the same time)
     int8_t              fpu_scratch;    // scratch counter
-    int8_t              fpu_extra_qscratch; // some opcode need an extra quad scratch register
     int8_t              fpu_reg;        // x87/sse/mmx reg counter
 } neoncache_t;
 
@@ -83,6 +82,9 @@ typedef struct instruction_arm64_s {
     uint16_t            retn;
     uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
     uint16_t            purge_ymm;  // need to purge some ymm
+    uint16_t            ymm0_add;   // the ymm0 added by the opcode
+    uint16_t            ymm0_sub;   // the ymm0 removed by the opcode
+    uint16_t            ymm0_out;   // the ymm0 at the end of the opcode
     uint8_t             barrier_maybe;
     uint8_t             will_write;
     uint8_t             last_write;
diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index 649ba1a1..a9086625 100644
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -357,7 +357,7 @@ static void fillPredecessors(dynarec_native_t* dyn)
     }
 }
 
-// updateNeed goes backward, from last instruction to top
+// updateNeed for the current block. recursive function that goes backward
 static int updateNeed(dynarec_native_t* dyn, int ninst, uint8_t need) {
     while (ninst>=0) {
         // need pending but instruction is only a subset: remove pend and use an X_ALL instead
@@ -400,6 +400,32 @@ static int updateNeed(dynarec_native_t* dyn, int ninst, uint8_t need) {
     return ninst;
 }
 
+// Update ymm_zero, purge_ymm and ymm0_out going forward from ninst, with mask as the incoming ymm0 set; recurses into jump targets and returns the index at which the caller resumes.
+static int updateYmm0(dynarec_native_t* dyn, int ninst, uint16_t mask) {
+    while (ninst<dyn->size) {
+        uint16_t ymm0 = mask&~dyn->insts[ninst].purge_ymm; // ymm0 set on entry: incoming mask minus the bits this instruction purges
+        uint16_t to_purge = dyn->insts[ninst].ymm_zero & ~ymm0; // bits currently in ymm_zero but not in ymm0: added to purge_ymm below
+        uint16_t ymm0_out = (mask|dyn->insts[ninst].ymm0_add)&~dyn->insts[ninst].ymm0_sub; // ymm0 at the output: mask plus ymm0_add, minus ymm0_sub
+        // check whether this instruction and its successors already match; if so there is no need to go further
+        int ok = (ymm0==dyn->insts[ninst].ymm_zero) && (!to_purge) && (ymm0_out==dyn->insts[ninst].ymm0_out);
+        if(ok && dyn->insts[ninst].x64.has_next) // the next instruction must already start from ymm0_out minus its own purge_ymm
+            ok = (dyn->insts[ninst+1].ymm_zero==(ymm0_out&~dyn->insts[ninst+1].purge_ymm));
+        if(ok && dyn->insts[ninst].x64.jmp && dyn->insts[ninst].x64.jmp_insts!=-1) // same check for the jump target, when jmp_insts is set
+            ok = (dyn->insts[dyn->insts[ninst].x64.jmp_insts].ymm_zero==(ymm0_out&~dyn->insts[dyn->insts[ninst].x64.jmp_insts].purge_ymm));
+        if(ok)
+            return ninst+1; // nothing would change: stop here
+        dyn->insts[ninst].ymm_zero = ymm0; // store the recomputed values for this instruction
+        dyn->insts[ninst].purge_ymm |= to_purge;
+        dyn->insts[ninst].ymm0_out = ymm0_out;
+        if(dyn->insts[ninst].x64.jmp && dyn->insts[ninst].x64.jmp_insts!=-1)
+            updateYmm0(dyn, dyn->insts[ninst].x64.jmp_insts, ymm0_out); // propagate ymm0_out to the jump target
+        if(!dyn->insts[ninst].x64.has_next)
+            return ninst+1; // has_next is clear: stop after this instruction
+        ++ninst; // NOTE(review): mask is left unchanged for the next instruction although successors are compared against ymm0_out - confirm this is intended
+    }
+    return ninst;
+}
+
 void* current_helper = NULL;
 static int static_jmps[MAX_INSTS+2];
 static uintptr_t static_next[MAX_INSTS+2];
@@ -556,12 +582,6 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
                 if(!helper.insts[i].barrier_maybe)
                     helper.insts[k].x64.barrier |= BARRIER_FULL;
                 helper.insts[i].x64.jmp_insts = k;
-                if(helper.insts[i].ymm_zero || helper.insts[k].ymm_zero) {
-                    // move to pureg the reg that are present in k (jump to) but not in i (jump from)
-                    uint16_t to_purge = helper.insts[k].ymm_zero & ~helper.insts[i].ymm_zero;
-                    helper.insts[k].purge_ymm |= to_purge;
-                    helper.insts[k].ymm_zero &= ~to_purge;
-                }
             }
         }
     }
@@ -578,6 +598,9 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
     int pos = helper.size;
     while (pos>=0)
         pos = updateNeed(&helper, pos, 0);
+    pos = 0;
+    while(pos<helper.size)
+        pos = updateYmm0(&helper, pos, helper.insts[pos].ymm_zero);
     // remove fpu stuff on non-executed code
     for(int i=1; i<helper.size-1; ++i)
         if(!helper.insts[i].pred_sz) {
diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index dcb4cb6f..a308e264 100644
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -104,8 +104,6 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
             }
             reset_n = -1;
         }
-        if(dyn->insts[ninst].purge_ymm)
-            PURGE_YMM();
         #if STEP > 0
         else if(ninst && (dyn->insts[ninst].pred_sz>1 || (dyn->insts[ninst].pred_sz==1 && dyn->insts[ninst].pred[0]!=ninst-1)))
             dyn->last_ip = 0;   // reset IP if some jump are coming here
@@ -310,6 +308,8 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
         }
         if((ok>0) && dyn->insts[ninst].x64.has_callret)
             reset_n = -2;
+        if((ok>0) && reset_n==-1 && dyn->insts[ninst+1].purge_ymm)
+            PURGE_YMM();
         ++ninst;
         #if STEP == 0
         memset(&dyn->insts[ninst], 0, sizeof(instruction_native_t));
diff --git a/src/dynarec/la64/dynarec_la64_pass0.h b/src/dynarec/la64/dynarec_la64_pass0.h
index 3990caa4..62f65853 100644
--- a/src/dynarec/la64/dynarec_la64_pass0.h
+++ b/src/dynarec/la64/dynarec_la64_pass0.h
@@ -36,6 +36,9 @@
 #define INST_EPILOG                    \
     dyn->insts[ninst].f_exit = dyn->f; \
     dyn->insts[ninst].lsx = dyn->lsx;  \
+    dyn->insts[ninst].ymm0_add = dyn->ymm_zero&~dyn->insts[ninst].ymm_zero; \
+    dyn->insts[ninst].ymm0_sub = dyn->insts[ninst].ymm_zero&~dyn->ymm_zero; \
+    dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\
     dyn->insts[ninst].x64.has_next = (ok > 0) ? 1 : 0;
 #define INST_NAME(name)
 #define DEFAULT                                                                                                                                     \
diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h
index 09b6698e..9086e68f 100644
--- a/src/dynarec/la64/dynarec_la64_private.h
+++ b/src/dynarec/la64/dynarec_la64_private.h
@@ -82,6 +82,9 @@ typedef struct instruction_la64_s {
     uint16_t            retn;
     uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
     uint16_t            purge_ymm;  // need to purge some ymm
+    uint16_t            ymm0_add;   // the ymm0 added by the opcode
+    uint16_t            ymm0_sub;   // the ymm0 removed by the opcode
+    uint16_t            ymm0_out;   // the ymm0 at the end of the opcode
     uint8_t             barrier_maybe;
     uint8_t             will_write;
     uint8_t             last_write;
diff --git a/src/dynarec/rv64/dynarec_rv64_pass0.h b/src/dynarec/rv64/dynarec_rv64_pass0.h
index 9c2de9ee..174bb092 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass0.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass0.h
@@ -34,6 +34,9 @@
 #define INST_EPILOG                             \
         dyn->insts[ninst].f_exit = dyn->f;      \
         dyn->insts[ninst].e = dyn->e;           \
+        dyn->insts[ninst].ymm0_add = dyn->ymm_zero&~dyn->insts[ninst].ymm_zero; \
+        dyn->insts[ninst].ymm0_sub = dyn->insts[ninst].ymm_zero&~dyn->ymm_zero; \
+        dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\
         dyn->insts[ninst].x64.has_next = (ok>0)?1:0;
 #define INST_NAME(name) 
 #define DEFAULT                         \
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index 1ba830d5..aeda741c 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -92,6 +92,9 @@ typedef struct instruction_rv64_s {
     uint16_t            retn;
     uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
     uint16_t            purge_ymm;  // need to purge some ymm
+    uint16_t            ymm0_add;   // the ymm0 added by the opcode
+    uint16_t            ymm0_sub;   // the ymm0 removed by the opcode
+    uint16_t            ymm0_out;   // the ymm0 at the end of the opcode
     int                 barrier_maybe;
     flagcache_t         f_exit;     // flags status at end of intruction
     extcache_t          e;          // extcache at end of intruction (but before poping)