about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author: ptitSeb <sebastien.chev@gmail.com> 2024-02-11 16:12:46 +0100
committer: ptitSeb <sebastien.chev@gmail.com> 2024-02-11 16:12:46 +0100
commit: 5e6af3753292b8da43d4fbee186a78f3e5068141 (patch)
tree: 3cd5ec700844143a309d9448bc650f9d35738c91 /src
parent: c0184f926dd98792f313194d3b80a92f4fe1c04a (diff)
download: box64-5e6af3753292b8da43d4fbee186a78f3e5068141.tar.gz
          box64-5e6af3753292b8da43d4fbee186a78f3e5068141.zip
[ARM64_DYNAREC] Improved FFREE handling (fixing gameplay of Serious Sam 2, probably some other game too)
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_d9.c | 2
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_dd.c | 9
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.c | 119
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.h | 3
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_private.h | 1
5 files changed, 115 insertions, 19 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_d9.c b/src/dynarec/arm64/dynarec_arm64_d9.c
index f0fa18ac..49ab9fe0 100644
--- a/src/dynarec/arm64/dynarec_arm64_d9.c
+++ b/src/dynarec/arm64/dynarec_arm64_d9.c
@@ -162,7 +162,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 }
                 // load tag
                 ADDx_U12(x1, xEmu, offsetof(x64emu_t, p_regs));
-                LDRw_REG_LSL2(x3, x1, x2);
+                LDRw_REG_LSL2(x3, x1, x4);
                 CMPSw_U12(x3, 0b11);    // empty
                 MOV32w(x3, 0b100000100000000);
                 CSELx(x4, x3, x4, cEQ); // empty: C3,C2,C0 = 101
diff --git a/src/dynarec/arm64/dynarec_arm64_dd.c b/src/dynarec/arm64/dynarec_arm64_dd.c
index 42a6f634..e349053c 100644
--- a/src/dynarec/arm64/dynarec_arm64_dd.c
+++ b/src/dynarec/arm64/dynarec_arm64_dd.c
@@ -52,14 +52,7 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
         case 0xC7:
             INST_NAME("FFREE STx");
             #if 1
-            x87_forget(dyn, ninst, x1, x2, nextop&7);
-            // empty tags
-            MOVZw(x3, 0b11);
-            ADDx_U12(x1, xEmu, offsetof(x64emu_t, p_regs));
-            LDRw_U12(x2, xEmu, offsetof(x64emu_t, top));
-            ADDw_U12(x2, x2, nextop&7);
-            ANDw_mask(x2, x2, 0, 2);    // mask=7
-            STRw_REG_LSL2(x3, x1, x2);
+            x87_free(dyn, ninst, x1, x2, x3, nextop&7);
             #else
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             x87_purgecache(dyn, ninst, 0, x1, x2, x3);
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 88008fe8..efeb883e 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -853,8 +853,10 @@ void grab_segdata(dynarec_arm_t* dyn, uintptr_t addr, int ninst, int reg, int se
 // x87 stuffs
 static void x87_reset(dynarec_arm_t* dyn)
 {
-    for (int i=0; i<8; ++i)
+    for (int i=0; i<8; ++i) {
         dyn->n.x87cache[i] = -1;
+        dyn->n.freed[i] = -1;
+    }
     dyn->n.x87stack = 0;
     dyn->n.stack = 0;
     dyn->n.stack_next = 0;
@@ -967,7 +969,9 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t)
          ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_I64))
             ++dyn->n.neoncache[j].n;
     int ret = -1;
-    for(int i=0; i<8; ++i)
+    for(int i=0; i<8; ++i) {
+        if(dyn->n.freed[i]!=-1)
+            ++dyn->n.freed[i];
         if(dyn->n.x87cache[i]!=-1)
             ++dyn->n.x87cache[i];
         else if(ret==-1) {
@@ -975,6 +979,7 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t)
             ret=dyn->n.x87reg[i]=fpu_get_reg_x87(dyn, t, 0);
             dyn->n.neoncache[ret].t = X87_ST0;
         }
+    }
     return ret;
 }
 void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1)
@@ -991,20 +996,17 @@ void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1)
          ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_F)
          ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_I64))
             ++dyn->n.neoncache[j].n;
-    for(int i=0; i<8; ++i)
+    for(int i=0; i<8; ++i) {
+        if(dyn->n.freed[i]!=-1)
+            ++dyn->n.freed[i];
         if(dyn->n.x87cache[i]!=-1)
             ++dyn->n.x87cache[i];
+    }
     if(s1)
         x87_stackcount(dyn, ninst, s1);
 }
-void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1)
+void static internal_x87_dopop(dynarec_arm_t* dyn)
 {
-    if(dyn->n.mmxcount)
-        mmx_purgecache(dyn, ninst, 0, s1);
-    dyn->n.x87stack-=1;
-    dyn->n.stack_next-=1;
-    dyn->n.stack_pop+=1;
-    // move all regs in cache, poping ST0
     for(int i=0; i<8; ++i)
         if(dyn->n.x87cache[i]!=-1) {
             --dyn->n.x87cache[i];
@@ -1014,6 +1016,32 @@ void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1)
             }
         }
 }
+int static internal_x87_dofree(dynarec_arm_t* dyn)
+{
+    int ret = 0;
+    for(int i=0; i<8; ++i)
+        if(dyn->n.freed[i]!=-1) {
+            --dyn->n.freed[i];
+            if(dyn->n.freed[i]<=0) {
+                MESSAGE(LOG_DUMP, "\t--------x87 FREED ST0, poping 1 more\n");
+                dyn->n.freed[i] = -1;
+                ret = 1;
+            }
+        }
+    return ret;
+}
+void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1)
+{
+    if(dyn->n.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, s1);
+    do {
+        dyn->n.x87stack-=1;
+        dyn->n.stack_next-=1;
+        dyn->n.stack_pop+=1;
+        // move all regs in cache, poping ST0
+        internal_x87_dopop(dyn);
+    } while(internal_x87_dofree(dyn));
+}
 static int x87_is_stcached(dynarec_arm_t* dyn, int st)
 {
     for (int i=0; i<8; ++i)
@@ -1110,6 +1138,8 @@ void x87_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int
     }
     if(!next) {
         dyn->n.stack_next = 0;
+        for(int i=0; i<8; ++i)
+            dyn->n.freed[i] = -1;
         #if STEP < 2
         // refresh the cached valued, in case it's a purge outside a instruction
         dyn->insts[ninst].n.barrier = 1;
@@ -1359,6 +1389,75 @@ void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
     MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st);
 }
 
+void x87_free(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int st)
+{
+    int ret = -1;
+    for (int i=0; (i<8) && (ret==-1); ++i)
+        if(dyn->n.x87cache[i] == st)
+            ret = i;
+    MESSAGE(LOG_DUMP, "\tFFREE%s x87 Cache for ST%d\n", (ret!=-1)?" (and Forget)":"", st);
+    if(ret!=-1) {
+        const int reg = dyn->n.x87reg[ret];
+        #if STEP == 1
+        if(dyn->n.neoncache[reg].t==NEON_CACHE_ST_F || dyn->n.neoncache[reg].t==NEON_CACHE_ST_I64)
+            neoncache_promote_double(dyn, ninst, st);
+        #endif
+        // prepare offset to fpu => s1
+        ADDx_U12(s1, xEmu, offsetof(x64emu_t, x87));
+        // Get top
+        LDRw_U12(s2, xEmu, offsetof(x64emu_t, top));
+        // Update
+        int ast = st - dyn->n.x87stack;
+        if(ast) {
+            if(ast>0) {
+                ADDw_U12(s2, s2, ast);
+            } else {
+                SUBw_U12(s2, s2, -ast);
+            }
+            ANDw_mask(s2, s2, 0, 2); //mask=7    // (emu->top + i)&7
+        }
+        if(dyn->n.neoncache[reg].t==NEON_CACHE_ST_F) {
+            FCVT_D_S(31, reg);
+            VSTR64_REG_LSL3(31, s1, s2);
+        } else if(dyn->n.neoncache[reg].t==NEON_CACHE_ST_I64) {
+            SCVTFDD(31, reg);
+            VSTR64_REG_LSL3(31, s1, s2);
+        } else {
+            VSTR64_REG_LSL3(reg, s1, s2);
+        }
+        // and forget that cache
+        fpu_free_reg(dyn, reg);
+        dyn->n.neoncache[reg].v = 0;
+        dyn->n.x87cache[ret] = -1;
+        dyn->n.x87reg[ret] = -1;
+    } else {
+        // Get top
+        LDRw_U12(s2, xEmu, offsetof(x64emu_t, top));
+        // Update
+        int ast = st - dyn->n.x87stack;
+        if(ast) {
+            if(ast>0) {
+                ADDw_U12(s2, s2, ast);
+            } else {
+                SUBw_U12(s2, s2, -ast);
+            }
+            ANDw_mask(s2, s2, 0, 2); //mask=7    // (emu->top + i)&7
+        }
+    }
+    // mark as free
+    ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs));
+    MOVZw(s3, 0b11);
+    STRw_REG_LSL2(s3, s1, s2);
+    // add mark in the freed array
+    for(int i=0; i<8; ++i)
+        if(dyn->n.freed[i]==-1) {
+            dyn->n.freed[i]=st;
+            MESSAGE(LOG_DUMP, "\t--------x87 Marked ST%d as Freed\n", st);
+            break;
+        }
+    MESSAGE(LOG_DUMP, "\t--------x87 FFREE for ST%d\n", st);
+}
+
 void x87_swapreg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int b)
 {
     int i1, i2, i3;
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 918b8b2a..fa7c63b0 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -1090,6 +1090,7 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr);
 #define x87_get_neoncache STEPNAME(x87_get_neoncache)
 #define x87_get_st      STEPNAME(x87_get_st)
 #define x87_get_st_empty  STEPNAME(x87_get_st)
+#define x87_free        STEPNAME(x87_free)
 #define x87_forget      STEPNAME(x87_forget)
 #define x87_reget_st    STEPNAME(x87_reget_st)
 #define x87_stackcount  STEPNAME(x87_stackcount)
@@ -1253,6 +1254,8 @@ int x87_get_neoncache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a);
 int x87_get_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int t);
 // get vfpu register for a x87 reg, create the entry if needed. Do not fetch the Stx if not already in cache
 int x87_get_st_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int t);
+// Free st, using the FFREE opcode (so it's freed but stack is not moved)
+void x87_free(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int st);
 // refresh a value from the cache ->emu and then forget the cache (nothing done if value is not cached)
 void x87_forget(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st);
 // refresh the cache value from emu
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index 1a7b387e..f7e7b008 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -46,6 +46,7 @@ typedef struct neoncache_s {
     // fpu cache
     int8_t              x87cache[8];    // cache status for the 8 x87 register behind the fpu stack
     int8_t              x87reg[8];      // reg used for x87cache entry
+    int8_t              freed[8];       // set when FFREE is used, -1 else
     int8_t              mmxcache[8];    // cache status for the 8 MMX registers
     sse_cache_t         ssecache[16];   // cache status for the 16 SSE(2) registers
     int8_t              fpuused[24];    // all 0..24 double reg from fpu, used by x87, sse and mmx