about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author ptitSeb <sebastien.chev@gmail.com> 2024-06-08 16:51:51 +0200
committer ptitSeb <sebastien.chev@gmail.com> 2024-06-08 16:51:51 +0200
commit a147c12f728cd50a9b462244334cf5a720a2b435 (patch)
tree 4da1ad5915eb654e2e51df00dfce3c373bcea0c3 /src
parent 874828c2ac6ede4302b5f86a3405ba6650a9ebd4 (diff)
download box64-a147c12f728cd50a9b462244334cf5a720a2b435.tar.gz
download box64-a147c12f728cd50a9b462244334cf5a720a2b435.zip
[ARM64_DYNAREC] Another fix for YMM selection in high register pressure cases
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_functions.c | 17
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.c | 42
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_private.h | 2
3 files changed, 44 insertions, 17 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index b05cbcc3..269197b3 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -58,7 +58,7 @@ int fpu_get_double_scratch(dynarec_arm_t* dyn, int ninst)
 void fpu_reset_scratch(dynarec_arm_t* dyn)
 {
     dyn->n.fpu_scratch = 0;
-    dyn->ymm_used = 0;
+    dyn->n.ymm_used = 0;
 }
 // Get a x87 double reg
 int fpu_get_reg_x87(dynarec_arm_t* dyn, int ninst, int t, int n)
@@ -118,14 +118,14 @@ int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm)
 }
 int internal_mark_ymm(dynarec_arm_t* dyn, int t, int ymm, int reg)
 {
-    if(dyn->n.neoncache[reg].t==NEON_CACHE_YMMR || dyn->n.neoncache[reg].t==NEON_CACHE_YMMW) {
+    if((dyn->n.neoncache[reg].t==NEON_CACHE_YMMR) || (dyn->n.neoncache[reg].t==NEON_CACHE_YMMW)) {
         if(dyn->n.neoncache[reg].n == ymm) {
             // already there!
             if(t==NEON_CACHE_YMMW)
                 dyn->n.neoncache[reg].t=t;
             return reg;
         }
-    } else {
+    } else if(!dyn->n.neoncache[reg].v) {
         // found a slot!
         dyn->n.neoncache[reg].t=t;
         dyn->n.neoncache[reg].n=ymm;
@@ -135,11 +135,13 @@ int internal_mark_ymm(dynarec_arm_t* dyn, int t, int ymm, int reg)
 }
 int is_ymm_to_keep(dynarec_arm_t* dyn, int reg, int k1, int k2, int k3)
 {
-    if(k1!=-1 && dyn->n.neoncache[reg].n==k1)
+    if((k1!=-1) && (dyn->n.neoncache[reg].n==k1))
+        return 1;
+    if((k2!=-1) && (dyn->n.neoncache[reg].n==k2))
         return 1;
-    if(k2!=-1 && dyn->n.neoncache[reg].n==k2)
+    if((k3!=-1) && (dyn->n.neoncache[reg].n==k3))
         return 1;
-    if(k3!=-1 && dyn->n.neoncache[reg].n==k3)
+    if((dyn->n.neoncache[reg].t==NEON_CACHE_YMMR || dyn->n.neoncache[reg].t==NEON_CACHE_YMMW) && (dyn->n.ymm_used&(1<<dyn->n.neoncache[reg].n)))
         return 1;
     return 0;
 }
@@ -537,6 +539,7 @@ void neoncacheUnwind(neoncache_t* cache)
                     break;
                 case NEON_CACHE_YMMR:
                 case NEON_CACHE_YMMW:
+                    cache->fpuused[i] = 0;  // YMM does not mark the fpu reg as used
                     break;
                 case NEON_CACHE_ST_F:
                 case NEON_CACHE_ST_D:
@@ -672,6 +675,8 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
             }
             dynarec_log(LOG_NONE, ")%s", (box64_dynarec_dump>1)?"\e[32m":"");
         }
+        if(dyn->insts[ninst].n.ymm_used)
+            dynarec_log(LOG_NONE, " ymmUsed=%04x", dyn->insts[ninst].n.ymm_used);
         if(dyn->ymm_zero || dyn->insts[ninst].ymm0_add || dyn->insts[ninst].ymm0_sub)
             dynarec_log(LOG_NONE, " ymm0=%04x(+%04x-%04x)", dyn->ymm_zero, dyn->insts[ninst].ymm0_add ,dyn->insts[ninst].ymm0_sub);
         if(dyn->insts[ninst].purge_ymm)
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 528035de..4eedadd1 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -1834,7 +1834,7 @@ void ymm_mark_zero(dynarec_arm_t* dyn, int ninst, int a)
     // look if already exist
     for(int i=0; i<32; ++i)
         if((dyn->n.neoncache[i].t==NEON_CACHE_YMMR || dyn->n.neoncache[i].t==NEON_CACHE_YMMW) && dyn->n.neoncache[i].n==a) {
-            if(dyn->ymm_used&(1<<a)) {
+            if(dyn->n.ymm_used&(1<<a)) {
                 // special case, the reg was just added in the opcode and cannot be marked as 0, so just RAZ it now
                 dyn->n.neoncache[i].t = NEON_CACHE_YMMW;
                 VEORQ(i, i, i);
@@ -2559,8 +2559,8 @@ void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, uint16_t mask, int s1)
 int fpu_get_reg_ymm(dynarec_arm_t* dyn, int ninst, int t, int ymm, int k1, int k2, int k3)
 {
     int i = -1;
-    dyn->ymm_used|=(1<<ymm);
-    #if STEP >1
+    dyn->n.ymm_used|=(1<<ymm);
+    #if STEP > 1
     // check the cached neoncache, it should be exact
     // look for it
     for(int ii=0; ii<32 && i==-1; ++ii)
@@ -2599,28 +2599,50 @@ int fpu_get_reg_ymm(dynarec_arm_t* dyn, int ninst, int t, int ymm, int k1, int k
     }
     // no free slot, needs to purge a value... First loop on the YMMR, they are easier to purge
     i = EMM0;
-    int keep = 0;
     for(int j=0; j<8; ++j) {
         if(!dyn->n.fpuused[i+j] && !(dyn->mmx87&(1<<j))) {
             // should a test be done to check if ymm is already in the purge list?
-            if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3) && dyn->n.neoncache[i+j].t==NEON_CACHE_YMMR) {
+            if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3) && (dyn->n.neoncache[i+j].t==NEON_CACHE_YMMR)) {
                 dyn->n.neoncache[i+j].v = 0;
-                return internal_mark_ymm(dyn, t, ymm, i+j);
+                int ret = internal_mark_ymm(dyn, t, ymm, i+j);
+                if(ret>=0) return ret;
             }
         }
     }
-    // make space in the scratch area
     i = SCRATCH0;
+    for(int j=dyn->n.fpu_scratch; j<8; ++j) 
+        if(!(dyn->scratchs&(1<<j))) {
+            if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3) && (dyn->n.neoncache[i+j].t==NEON_CACHE_YMMR)) {
+                dyn->n.neoncache[i+j].v = 0;
+                int ret = internal_mark_ymm(dyn, t, ymm, i+j);
+                if(ret>=0) return ret;
+            }
+    }
+    // make space in the scratch area
     for(int j=dyn->n.fpu_scratch; j<8; ++j) {
             // should a test be done to check if ymm is already in the purge list?
-            if(!(dyn->scratchs&(1<<j)) &&!is_ymm_to_keep(dyn, i+j, k1, k2, k3)) {
+            if(!(dyn->scratchs&(1<<j)) && !is_ymm_to_keep(dyn, i+j, k1, k2, k3)) {
                 // Save the reg and recycle it
                 VSTR128_U12(i+j, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i+j].n]));
                 dyn->n.neoncache[i+j].v = 0;
-                return internal_mark_ymm(dyn, t, ymm, i+j);
+                int ret = internal_mark_ymm(dyn, t, ymm, i+j);
+                if(ret>=0) return ret;
             }
     }
+    // last resort, go back in the EMM area...
+    i = EMM0;
+    for(int j=7; j>=0; --j) {
+        if(!dyn->n.fpuused[i+j] && !(dyn->mmx87&(1<<j))) {
+            // should a test be done to check if ymm is already in the purge list?
+            if((dyn->n.neoncache[i+j].t==NEON_CACHE_YMMW) && !is_ymm_to_keep(dyn, i+j, k1, k2, k3)) {
+                VSTR128_U12(i+j, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i+j].n]));
+                dyn->n.neoncache[i+j].v = 0;
+                int ret = internal_mark_ymm(dyn, t, ymm, i+j);
+                if(ret>=0) return ret;
+            }
+        }
+    }
     #endif
-    printf_log(LOG_NONE, "BOX64 Dynarec: Error, unable to free a reg for YMM %d at inst=%d\n", ymm, ninst);
+    printf_log(LOG_NONE, "BOX64 Dynarec: Error, unable to free a reg for YMM %d at inst=%d on pass %d\n", ymm, ninst, STEP);
     return i;
 }
\ No newline at end of file
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index 07b66a39..db205a3b 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -58,6 +58,7 @@ typedef struct neoncache_s {
     int8_t              mmxcount;       // number of mmx register used (not both mmx and x87 at the same time)
     int8_t              fpu_scratch;    // scratch counter
     int8_t              fpu_reg;        // x87/sse/mmx reg counter
+    uint16_t            ymm_used;       // mask of the ymm regs used in this opcode
 } neoncache_t;
 
 typedef struct flagcache_s {
@@ -126,7 +127,6 @@ typedef struct dynarec_arm_s {
     int32_t             forward_size;   // size at the forward point
     int                 forward_ninst;  // ninst at the forward point
     uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
-    uint16_t            ymm_used;   // mask of the ymm regs used in this opcode
     uint8_t             smwrite;    // for strongmem model emulation
     uint8_t             smread;
     uint8_t             doublepush;