about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author ptitSeb <sebastien.chev@gmail.com> 2024-07-07 10:37:51 +0200
committer ptitSeb <sebastien.chev@gmail.com> 2024-07-07 10:37:51 +0200
commit 14b0323bf64ba28b081effb78ef7d3897fd5d64d (patch)
tree 724d37afadc393dc6ffd994449cafa8f57212f86 /src
parent b4828477794a8e69a96f0ca7991ad0e619d1b2a3 (diff)
download box64-14b0323bf64ba28b081effb78ef7d3897fd5d64d.tar.gz
box64-14b0323bf64ba28b081effb78ef7d3897fd5d64d.zip
[ARM64_DYNAREC] Reworked ymm0 propagation
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_functions.c 10
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_functions.h 1
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.c 8
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_pass0.h 2
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_pass1.h 2
-rw-r--r-- src/dynarec/dynarec_arch.h 3
-rw-r--r-- src/dynarec/dynarec_native.c 100
-rw-r--r-- src/dynarec/dynarec_native_pass.c 2
-rw-r--r-- src/dynarec/la64/dynarec_la64_pass0.h 2
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_pass0.h 2
-rw-r--r-- src/include/dynarec_native.h 1
11 files changed, 83 insertions, 50 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index afb1ed6b..8fd0929e 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -688,6 +688,8 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
             dynarec_log(LOG_NONE, ", jmp=%d", dyn->insts[ninst].x64.jmp_insts);
         if(dyn->insts[ninst].x64.jmp && dyn->insts[ninst].x64.jmp_insts==-1)
             dynarec_log(LOG_NONE, ", jmp=out");
+        if(dyn->insts[ninst].x64.has_callret)
+            dynarec_log(LOG_NONE, ", callret");
         if(dyn->last_ip)
             dynarec_log(LOG_NONE, ", last_ip=%p", (void*)dyn->last_ip);
         for(int ii=0; ii<32; ++ii) {
@@ -789,14 +791,6 @@ void fpu_reset_ninst(dynarec_native_t* dyn, int ninst)
 
 }
 
-void arm64_fpu_reset(dynarec_native_t* dyn, int ninst, int step)
-{
-    if(step<2) {
-        dyn->insts[ninst].ymm0_in = 0;
-        dyn->insts[ninst].ymm0_out = 0;
-    }
-}
-
 int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st)
 {
     return (dyn->n.tags&(0b11<<(st*2)))?1:0;
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h
index 0af490e4..b6c95904 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.h
+++ b/src/dynarec/arm64/dynarec_arm64_functions.h
@@ -69,7 +69,6 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode);
 // reset the cache
 void fpu_reset(dynarec_native_t* dyn);
 void fpu_reset_ninst(dynarec_native_t* dyn, int ninst);
-void arm64_fpu_reset(dynarec_native_t* dyn, int ninst, int step);
 
 // is st freed
 int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st);
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 136c0f8c..04fa97f6 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -1806,6 +1806,8 @@ int ymm_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite, int
             return i;
         }
     // nope, grab a new one
+    if(dyn->ymm_zero&(1<<a))
+        forwrite = 1;   // if the reg was zero, then it will need to be write back
     int ret =  fpu_get_reg_ymm(dyn, ninst, forwrite?NEON_CACHE_YMMW:NEON_CACHE_YMMR, a, k1, k2, k3);
     if(dyn->ymm_zero&(1<<a)) {
         VEORQ(ret, ret, ret);
@@ -2365,6 +2367,12 @@ void fpu_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
     x87_reflectcache(dyn, ninst, s1, s2, s3);
     mmx_reflectcache(dyn, ninst, s1);
     //sse_reflectcache(dyn, ninst, s1); // no need, it's pushed/unpushed during call
+    // but ymm0 needs to be pushed
+    if(dyn->ymm_zero) {
+        ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0]));
+        for(int i=0; i<16; ++i)
+            STPx_S7_offset(xZR, xZR, s1, 16*i);
+    }
 }
 
 void fpu_unreflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
diff --git a/src/dynarec/arm64/dynarec_arm64_pass0.h b/src/dynarec/arm64/dynarec_arm64_pass0.h
index 510dd4ab..7d4c0c2d 100644
--- a/src/dynarec/arm64/dynarec_arm64_pass0.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass0.h
@@ -26,13 +26,11 @@
         dyn->n.combined1 = dyn->n.combined2 = 0;\
         dyn->n.swapped = 0; dyn->n.barrier = 0; \
         dyn->insts[ninst].f_entry = dyn->f;     \
-        dyn->insts[ninst].ymm0_in = dyn->ymm_zero;\
         if(ninst) {dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr;}
 
 #define INST_EPILOG                             \
         dyn->insts[ninst].f_exit = dyn->f;      \
         dyn->insts[ninst].n = dyn->n;           \
-        dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\
         dyn->insts[ninst].x64.has_next = (ok>0)?1:0;
 #define INST_NAME(name) 
 #define DEFAULT                         \
diff --git a/src/dynarec/arm64/dynarec_arm64_pass1.h b/src/dynarec/arm64/dynarec_arm64_pass1.h
index ab1f5fc4..6cf92feb 100644
--- a/src/dynarec/arm64/dynarec_arm64_pass1.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass1.h
@@ -5,12 +5,10 @@
 #define NEW_INST                                \
         dyn->insts[ninst].f_entry = dyn->f;     \
         dyn->n.combined1 = dyn->n.combined2 = 0;\
-        dyn->insts[ninst].ymm0_in = dyn->ymm_zero;\
         dyn->n.swapped = 0; dyn->n.barrier = 0
 
 #define INST_EPILOG                             \
         dyn->insts[ninst].n = dyn->n;           \
-        dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\
         dyn->insts[ninst].f_exit = dyn->f
 
 #define INST_NAME(name)  
diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h
index 351d9fcd..f89125a7 100644
--- a/src/dynarec/dynarec_arch.h
+++ b/src/dynarec/dynarec_arch.h
@@ -17,7 +17,6 @@
 #include "arm64/dynarec_arm64_functions.h"

 // Limit here is defined by LD litteral, that is 19bits

 #define MAXBLOCK_SIZE ((1<<19)-200)

-#define ARM_FPU_RESET() arm64_fpu_reset(dyn, ninst, STEP)

 #elif defined(LA64)

 

 #define instruction_native_t        instruction_la64_t

@@ -34,7 +33,6 @@
 #include "la64/dynarec_la64_functions.h"

 // Limit here is unconditionnal jump, that is signed 28bits

 #define MAXBLOCK_SIZE ((1 << 27) - 200)

-#define ARM_FPU_RESET()

 #elif defined(RV64)

 

 #define instruction_native_t        instruction_rv64_t

@@ -51,7 +49,6 @@
 #include "rv64/dynarec_rv64_functions.h"

 // Limit here is unconditionnal jump, that is signed 21bits

 #define MAXBLOCK_SIZE ((1<<20)-200)

-#define ARM_FPU_RESET()

 #else

 #error Unsupported platform

 #endif

diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index 15ecdce8..a233c690 100644
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -405,35 +405,81 @@ static int updateNeed(dynarec_native_t* dyn, int ninst, uint8_t need) {
     return ninst;
 }
 
-// update Ymm0 and Purge_ymm0.
-static int updateYmm0(dynarec_native_t* dyn, int ninst) {
+static void updateYmm0s(dynarec_native_t* dyn, int ninst, int max_ninst_reached) {
+    int can_incr = ninst == max_ninst_reached; // Are we the top-level call?
     int ok = 1;
-    while (ok && ninst<dyn->size) {
-        uint16_t ymm0 = dyn->insts[ninst].ymm0_in; // entry ymm0
-        ymm0&=~dyn->insts[ninst].purge_ymm; // entry after purge
-        uint16_t ymm0_out = (ymm0|dyn->insts[ninst].ymm0_add)&~dyn->insts[ninst].ymm0_sub;  // ymm0 after the opcode
-        ok = dyn->insts[ninst].x64.has_next;    // continue?
-        if(ok) ok = (dyn->insts[ninst].ymm0_in!=ymm0) || (dyn->insts[ninst+1].ymm0_in!=ymm0_out); // continue if there has been any change...
-        if(ok) dyn->insts[ninst+1].ymm0_in=ymm0_out;   // make the change
-        dyn->insts[ninst].ymm0_out = ymm0_out;  // update ymm0_out
-        dyn->insts[ninst].ymm0_in = ymm0;  // write purged ymm0, as it's done at the entry
-        int jmp = (dyn->insts[ninst].x64.jmp)?dyn->insts[ninst].x64.jmp_insts:-1;
-        if(jmp!=-1) {
-            // check if a purge is needed at jump point
-            ymm0_out&=~dyn->insts[jmp].purge_ymm;
-            ok = (dyn->insts[jmp].pred_sz==1) && (dyn->insts[jmp].ymm0_in!=ymm0_out);
-            if(dyn->insts[jmp].pred_sz==1)
-                dyn->insts[jmp].ymm0_in = ymm0_out;
-            uint16_t ymm0_jmp = dyn->insts[jmp].ymm0_in;
-            uint16_t to_purge = ymm0_jmp&~ymm0_out; // if there are too many ymm0 at jump point
-            if(to_purge)
-                dyn->insts[jmp].purge_ymm|=to_purge;
-            if(to_purge || ok)
-                updateYmm0(dyn, jmp);
+    while ((can_incr || ok) && ninst<dyn->size) {
+        //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "update ninst=%d (%d): can_incr=%d\n", ninst, max_ninst_reached, can_incr);
+        uint16_t new_purge_ymm, new_ymm0_in, new_ymm0_out;
+
+        if (ninst && dyn->insts[ninst].pred_sz && dyn->insts[ninst].x64.alive) {
+            uint16_t ymm0_union = 0, ymm0_inter = (uint16_t)-1; // The union of the empty set is empty, the intersection is the universe
+            for (int i = 0; i < dyn->insts[ninst].pred_sz; ++i) {
+                int pred = dyn->insts[ninst].pred[i];
+                //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "\twith pred[%d] = %d", i, pred);
+                if (pred >= max_ninst_reached) {
+                    //if(box64_dynarec_dump) dynarec_log(LOG_NONE, " (skipped)\n");
+                    continue;
+                }
+
+                int pred_out = dyn->insts[pred].x64.has_callret ? 0 : dyn->insts[pred].ymm0_out;
+                //if(box64_dynarec_dump) dynarec_log(LOG_NONE, " ~> %04X\n", pred_out);
+                ymm0_union |= pred_out;
+                ymm0_inter &= pred_out;
+            }
+            //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "\t=> %04X,%04X\n", ymm0_union, ymm0_inter);
+            // Notice the default values yield something coherent here (if all pred are after ninst)
+            new_purge_ymm = ymm0_union & ~ymm0_inter;
+            new_ymm0_in = ymm0_inter;
+            new_ymm0_out = (ymm0_inter | dyn->insts[ninst].ymm0_add) & ~dyn->insts[ninst].ymm0_sub;
+
+            if ((dyn->insts[ninst].purge_ymm != new_purge_ymm) || (dyn->insts[ninst].ymm0_in != new_ymm0_in) || (dyn->insts[ninst].ymm0_out != new_ymm0_out)) {
+                // Need to update self and next(s)
+                dyn->insts[ninst].purge_ymm = new_purge_ymm;
+                dyn->insts[ninst].ymm0_in = new_ymm0_in;
+                dyn->insts[ninst].ymm0_out = new_ymm0_out;
+
+                if (can_incr) {
+                    // We always have ninst == max_ninst_reached when can_incr == 1
+                    ++max_ninst_reached;
+                } else {
+                    // We need to stop here if the opcode has no "real" next or if we reached the ninst of the toplevel
+                    ok = (max_ninst_reached - 1 != ninst) && dyn->insts[ninst].x64.has_next && !dyn->insts[ninst].x64.has_callret;
+                }
+
+                int jmp = (dyn->insts[ninst].x64.jmp)?dyn->insts[ninst].x64.jmp_insts:-1;
+                if((jmp!=-1) && (jmp < max_ninst_reached)) {
+                    //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "\t! jump to %d\n", jmp);
+                    // The jump goes before the last instruction reached, update the destination
+                    // If this is the top level call, this means the jump goes backward (jmp != ninst)
+                    // Otherwise, since we don't update all instructions, we may miss the update (don't use jmp < ninst)
+                    updateYmm0s(dyn, jmp, max_ninst_reached);
+                }
+            } else {
+                if (can_incr) {
+                    // We always have ninst == max_ninst_reached when can_incr == 1
+                    ++max_ninst_reached;
+
+                    // Also update jumps to before (they are skipped otherwise)
+                    int jmp = (dyn->insts[ninst].x64.jmp)?dyn->insts[ninst].x64.jmp_insts:-1;
+                    if((jmp!=-1) && (jmp < max_ninst_reached)) {
+                        //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "\t! jump to %d\n", jmp);
+                        updateYmm0s(dyn, jmp, max_ninst_reached);
+                    }
+                } else {
+                    // We didn't update anything, we can leave
+                    ok = 0;
+                }
+            }
+        } else if (can_incr) {
+            // We always have ninst == max_ninst_reached when can_incr == 1
+            ++max_ninst_reached;
+        } else {
+            // We didn't update anything, we can leave
+            ok = 0;
         }
         ++ninst;
     }
-    return ninst;
 }
 
 void* current_helper = NULL;
@@ -628,9 +674,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
         CancelBlock64(0);
         return CreateEmptyBlock(block, addr);
     }
-    pos = 0;
-    while(pos<helper.size)
-        pos = updateYmm0(&helper, pos);
+    updateYmm0s(&helper, 0, 0);
 
 
     // pass 1, float optimizations, first pass for flags
diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index 14f80103..2ebc89cc 100644
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -89,14 +89,12 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
                 dyn->f.dfnone = 0;
                 dyn->f.pending = 0;
                 fpu_reset(dyn);
-                ARM_FPU_RESET();
             } else {
                 fpu_reset_cache(dyn, ninst, reset_n);
                 dyn->f = dyn->insts[reset_n].f_exit;
                 if(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT) {
                     MESSAGE(LOG_DEBUG, "Apply Barrier Float\n");
                     fpu_reset(dyn);
-                    ARM_FPU_RESET();
                 }
                 if(dyn->insts[ninst].x64.barrier&BARRIER_FLAGS) {
                     MESSAGE(LOG_DEBUG, "Apply Barrier Flags\n");
diff --git a/src/dynarec/la64/dynarec_la64_pass0.h b/src/dynarec/la64/dynarec_la64_pass0.h
index 0ea40a86..99a897a6 100644
--- a/src/dynarec/la64/dynarec_la64_pass0.h
+++ b/src/dynarec/la64/dynarec_la64_pass0.h
@@ -30,13 +30,11 @@
     dyn->lsx.combined1 = dyn->lsx.combined2 = 0; \
     dyn->lsx.swapped = 0;                        \
     dyn->lsx.barrier = 0;                        \
-    dyn->insts[ninst].ymm0_in = dyn->ymm_zero;   \
     dyn->insts[ninst].f_entry = dyn->f;          \
     if (ninst) { dyn->insts[ninst - 1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst - 1].x64.addr; }
 #define INST_EPILOG                    \
     dyn->insts[ninst].f_exit = dyn->f; \
     dyn->insts[ninst].lsx = dyn->lsx;  \
-    dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\
     dyn->insts[ninst].x64.has_next = (ok > 0) ? 1 : 0;
 #define INST_NAME(name)
 #define DEFAULT                                                                                                                                     \
diff --git a/src/dynarec/rv64/dynarec_rv64_pass0.h b/src/dynarec/rv64/dynarec_rv64_pass0.h
index 04857e8c..3ee1685f 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass0.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass0.h
@@ -28,13 +28,11 @@
         dyn->e.swapped = 0; dyn->e.barrier = 0; \
         for(int i=0; i<16; ++i) dyn->e.olds[i].v = 0;\
         dyn->insts[ninst].f_entry = dyn->f;     \
-        dyn->insts[ninst].ymm0_in = dyn->ymm_zero;\
         if(ninst) {dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr;}
 
 #define INST_EPILOG                             \
         dyn->insts[ninst].f_exit = dyn->f;      \
         dyn->insts[ninst].e = dyn->e;           \
-        dyn->insts[ninst].ymm0_out = dyn->ymm_zero;\
         dyn->insts[ninst].x64.has_next = (ok>0)?1:0;
 #define INST_NAME(name) 
 #define DEFAULT                         \
diff --git a/src/include/dynarec_native.h b/src/include/dynarec_native.h
index 3bda443c..dd5218f6 100644
--- a/src/include/dynarec_native.h
+++ b/src/include/dynarec_native.h
@@ -5,6 +5,7 @@ typedef struct dynablock_s dynablock_t;
 typedef struct x64emu_s x64emu_t;
 typedef struct instsize_s instsize_t;
 
+
 //#define USE_CUSTOM_MEM
 #ifdef USE_CUSTOM_MEM
 #define dynaMalloc customMalloc