about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author ptitSeb <sebastien.chev@gmail.com> 2023-06-25 16:58:43 +0200
committer ptitSeb <sebastien.chev@gmail.com> 2023-06-25 16:58:43 +0200
commit 735d7ab1b4bead627d2a380913864be49c214705 (patch)
tree 002cacbb04db542e58c867aa79c67808c8063100
parent 3c9b556ca80c86938b9a042ccfc48f7e06e80e95 (diff)
download box64-735d7ab1b4bead627d2a380913864be49c214705.tar.gz
box64-735d7ab1b4bead627d2a380913864be49c214705.zip
[ARM64_DYNAREC] Improved handling of FILD/FISTP i64 sequence, important for 32bits process (for #860)
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_df.c 54
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_functions.c 61
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_functions.h 8
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_helper.c 69
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_helper.h 3
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_private.h 15
6 files changed, 153 insertions, 57 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_df.c b/src/dynarec/arm64/dynarec_arm64_df.c
index 522fe8f3..3e99ceae 100644
--- a/src/dynarec/arm64/dynarec_arm64_df.c
+++ b/src/dynarec/arm64/dynarec_arm64_df.c
@@ -287,10 +287,12 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     break;
                 case 5:
                     INST_NAME("FILD ST0, i64");
-                    v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D);
+                    v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_I64);
                     addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
-                    LDx(x1, wback, fixedaddress);
-                    SCVTFDx(v1, x1);
+                    VLD64(v1, wback, fixedaddress);
+                    if(!ST_IS_I64(0)) {
+                        SCVTFDD(v1, v1);
+                    }
                     break;
                 case 6:
                     INST_NAME("FBSTP tbytes, ST0");
@@ -302,29 +304,35 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     break;
                 case 7:
                     INST_NAME("FISTP i64, ST0");
-                    v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                    u8 = x87_setround(dyn, ninst, x1, x2, x4);
+                    v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_I64);
+                    if(!ST_IS_I64(0)) {
+                        u8 = x87_setround(dyn, ninst, x1, x2, x4);
+                    }
                     addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                     ed = x1;
                     s0 = fpu_get_scratch(dyn);
-                    #if 0
-                    FRINT64XD(s0, v1);
-                    VFCVTZSd(s0, s0);
-                    VSTR64_U12(s0, wback, fixedaddress);
-                    #else
-                    MRS_fpsr(x5);
-                    BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                    MSR_fpsr(x5);
-                    FRINTXD(s0, v1);
-                    VFCVTZSd(s0, s0);
-                    VST64(s0, wback, fixedaddress);
-                    MRS_fpsr(x5);   // get back FPSR to check the IOC bit
-                    TBZ_MARK3(x5, FPSR_IOC);
-                    ORRx_mask(x5, xZR, 1, 1, 0);    //0x8000000000000000
-                    STx(x5, wback, fixedaddress);
-                    MARK3;
-                    #endif
-                    x87_restoreround(dyn, ninst, u8);
+                    if(ST_IS_I64(0)) {
+                        VST64(v1, wback, fixedaddress);
+                    } else {
+                        #if 0
+                        FRINT64XD(s0, v1);
+                        VFCVTZSd(s0, s0);
+                        VSTR64_U12(s0, wback, fixedaddress);
+                        #else
+                        MRS_fpsr(x5);
+                        BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
+                        MSR_fpsr(x5);
+                        FRINTXD(s0, v1);
+                        VFCVTZSd(s0, s0);
+                        VST64(s0, wback, fixedaddress);
+                        MRS_fpsr(x5);   // get back FPSR to check the IOC bit
+                        TBZ_MARK3(x5, FPSR_IOC);
+                        ORRx_mask(x5, xZR, 1, 1, 0);    //0x8000000000000000
+                        STx(x5, wback, fixedaddress);
+                        MARK3;
+                        #endif
+                        x87_restoreround(dyn, ninst, u8);
+                    }
                     x87_do_pop(dyn, ninst, x3);
                     break;
                 default:
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index 42dfd09e..d2963b55 100755
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -58,7 +58,7 @@ void fpu_free_reg(dynarec_arm_t* dyn, int reg)
 {
     // TODO: check upper limit?
     dyn->n.fpuused[reg] = 0;
-    if(dyn->n.neoncache[reg].t!=NEON_CACHE_ST_F && dyn->n.neoncache[reg].t!=NEON_CACHE_ST_D)
+    if(dyn->n.neoncache[reg].t!=NEON_CACHE_ST_F && dyn->n.neoncache[reg].t!=NEON_CACHE_ST_D && dyn->n.neoncache[reg].t!=NEON_CACHE_ST_I64)
         dyn->n.neoncache[reg].v = 0;
 }
 // Get an MMX double reg
@@ -106,7 +106,8 @@ int neoncache_get_st(dynarec_arm_t* dyn, int ninst, int a)
     }
     for(int i=0; i<24; ++i)
         if((dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F
-         || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D)
+         || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D
+         || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64)
          && dyn->insts[ninst].n.neoncache[i].n==a)
             return dyn->insts[ninst].n.neoncache[i].t;
     // not in the cache yet, so will be fetched...
@@ -120,7 +121,8 @@ int neoncache_get_current_st(dynarec_arm_t* dyn, int ninst, int a)
         return NEON_CACHE_ST_D;
     for(int i=0; i<24; ++i)
         if((dyn->n.neoncache[i].t==NEON_CACHE_ST_F
-         || dyn->n.neoncache[i].t==NEON_CACHE_ST_D)
+         || dyn->n.neoncache[i].t==NEON_CACHE_ST_D
+         || dyn->n.neoncache[i].t==NEON_CACHE_ST_I64)
          && dyn->n.neoncache[i].n==a)
             return dyn->n.neoncache[i].t;
     // not in the cache yet, so will be fetched...
@@ -138,6 +140,17 @@ int neoncache_get_st_f(dynarec_arm_t* dyn, int ninst, int a)
             return i;
     return -1;
 } 
+int neoncache_get_st_f_i64(dynarec_arm_t* dyn, int ninst, int a)   // index of the neoncache entry of instruction ninst typed ST_F or ST_I64 with n==a, or -1
+{
+    /*if(a+dyn->insts[ninst].n.stack_next-st<0)
+        // The STx has been pushed at the end of instruction, so stop going back
+        return -1;*/
+    for(int i=0; i<24; ++i)    // scan all 24 entries of the per-instruction neoncache snapshot
+        if((dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64 || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F)
+         && dyn->insts[ninst].n.neoncache[i].n==a)
+            return i;   // first match wins
+    return -1;  // no ST_F / ST_I64 entry with n==a at this instruction
+} 
 int neoncache_get_st_f_noback(dynarec_arm_t* dyn, int ninst, int a)
 {
     for(int i=0; i<24; ++i)
@@ -146,6 +159,14 @@ int neoncache_get_st_f_noback(dynarec_arm_t* dyn, int ninst, int a)
             return i;
     return -1;
 } 
+int neoncache_get_st_f_i64_noback(dynarec_arm_t* dyn, int ninst, int a)    // index of the neoncache entry of instruction ninst typed ST_F or ST_I64 with n==a, or -1
+{
+    for(int i=0; i<24; ++i)    // scan all 24 entries of the per-instruction neoncache snapshot
+        if((dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64 || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F)
+         && dyn->insts[ninst].n.neoncache[i].n==a)
+            return i;   // first match wins
+    return -1;  // no ST_F / ST_I64 entry with n==a at this instruction
+} 
 int neoncache_get_current_st_f(dynarec_arm_t* dyn, int a)
 {
     for(int i=0; i<24; ++i)
@@ -154,6 +175,14 @@ int neoncache_get_current_st_f(dynarec_arm_t* dyn, int a)
             return i;
     return -1;
 } 
+int neoncache_get_current_st_f_i64(dynarec_arm_t* dyn, int a)  // index of the entry in the current cache (dyn->n) typed ST_F or ST_I64 with n==a, or -1
+{
+    for(int i=0; i<24; ++i)    // scan all 24 entries of dyn->n.neoncache (not the per-instruction copy)
+        if((dyn->n.neoncache[i].t==NEON_CACHE_ST_I64 || dyn->n.neoncache[i].t==NEON_CACHE_ST_F)
+         && dyn->n.neoncache[i].n==a)
+            return i;   // first match wins
+    return -1;  // no ST_F / ST_I64 entry with n==a in the current cache
+} 
 static void neoncache_promote_double_forward(dynarec_arm_t* dyn, int ninst, int maxinst, int a);
 static void neoncache_promote_double_internal(dynarec_arm_t* dyn, int ninst, int maxinst, int a);
 static void neoncache_promote_double_combined(dynarec_arm_t* dyn, int ninst, int maxinst, int a)
@@ -163,7 +192,7 @@ static void neoncache_promote_double_combined(dynarec_arm_t* dyn, int ninst, int
             a = dyn->insts[ninst].n.combined2;
         } else 
             a = dyn->insts[ninst].n.combined1;
-        int i = neoncache_get_st_f_noback(dyn, ninst, a);
+        int i = neoncache_get_st_f_i64_noback(dyn, ninst, a);
         //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_combined, ninst=%d combined%c %d i=%d (stack:%d/%d)\n", ninst, (a == dyn->insts[ninst].n.combined2)?'2':'1', a ,i, dyn->insts[ninst].n.stack_push, -dyn->insts[ninst].n.stack_pop);
         if(i>=0) {
             dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D;
@@ -182,7 +211,7 @@ static void neoncache_promote_double_internal(dynarec_arm_t* dyn, int ninst, int
         return;
     while(ninst>=0) {
         a+=dyn->insts[ninst].n.stack_pop;    // adjust Stack depth: add pop'd ST (going backward)
-        int i = neoncache_get_st_f(dyn, ninst, a);
+        int i = neoncache_get_st_f_i64(dyn, ninst, a);
         //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_internal, ninst=%d, a=%d st=%d:%d, i=%d\n", ninst, a, dyn->insts[ninst].n.stack, dyn->insts[ninst].n.stack_next, i);
         if(i<0) return;
         dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D;
@@ -217,7 +246,7 @@ static void neoncache_promote_double_forward(dynarec_arm_t* dyn, int ninst, int
             else if (a==dyn->insts[ninst].n.combined2)
                 a = dyn->insts[ninst].n.combined1;
         }
-        int i = neoncache_get_st_f_noback(dyn, ninst, a);
+        int i = neoncache_get_st_f_i64_noback(dyn, ninst, a);
         //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_forward, ninst=%d, a=%d st=%d:%d(%d/%d), i=%d\n", ninst, a, dyn->insts[ninst].n.stack, dyn->insts[ninst].n.stack_next, dyn->insts[ninst].n.stack_push, -dyn->insts[ninst].n.stack_pop, i);
         if(i<0) return;
         dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D;
@@ -238,7 +267,7 @@ static void neoncache_promote_double_forward(dynarec_arm_t* dyn, int ninst, int
 
 void neoncache_promote_double(dynarec_arm_t* dyn, int ninst, int a)
 {
-    int i = neoncache_get_current_st_f(dyn, a);
+    int i = neoncache_get_current_st_f_i64(dyn, a);
     //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double, ninst=%d a=%d st=%d i=%d\n", ninst, a, dyn->n.stack, i);
     if(i<0) return;
     dyn->n.neoncache[i].t = NEON_CACHE_ST_D;
@@ -271,6 +300,9 @@ int neoncache_combine_st(dynarec_arm_t* dyn, int ninst, int a, int b)
     if( neoncache_get_current_st(dyn, ninst, a)==NEON_CACHE_ST_F
      && neoncache_get_current_st(dyn, ninst, b)==NEON_CACHE_ST_F )
         return NEON_CACHE_ST_F;
+    if( neoncache_get_current_st(dyn, ninst, a)==NEON_CACHE_ST_I64
+     && neoncache_get_current_st(dyn, ninst, b)==NEON_CACHE_ST_I64 )
+        return NEON_CACHE_ST_I64;
     return NEON_CACHE_ST_D;
 }
 
@@ -281,7 +313,9 @@ static int isCacheEmpty(dynarec_native_t* dyn, int ninst) {
     for(int i=0; i<24; ++i)
         if(dyn->insts[ninst].n.neoncache[i].v) {       // there is something at ninst for i
             if(!(
-            (dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D)
+            (dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F 
+             || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D
+             || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64)
             && dyn->insts[ninst].n.neoncache[i].n<dyn->insts[ninst].n.stack_pop))
                 return 0;
         }
@@ -304,7 +338,9 @@ int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) {
         for(int i=0; i<24 && !ret; ++i)
             if(dyn->insts[ninst].n.neoncache[i].v) {       // there is something at ninst for i
                 if(!(
-                (dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D)
+                (dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F 
+                || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D
+                || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64)
                 && dyn->insts[ninst].n.neoncache[i].n<dyn->insts[ninst].n.stack_pop))
                     ret = 1;
             }
@@ -343,7 +379,7 @@ void neoncacheUnwind(neoncache_t* cache)
         int a = -1; 
         int b = -1;
         for(int j=0; j<24 && ((a==-1) || (b==-1)); ++j)
-            if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F)) {
+            if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F || cache->neoncache[j].t == NEON_CACHE_ST_I64)) {
                 if(cache->neoncache[j].n == cache->combined1)
                     a = j;
                 else if(cache->neoncache[j].n == cache->combined2)
@@ -367,7 +403,7 @@ void neoncacheUnwind(neoncache_t* cache)
     if(cache->stack_push) {
         // unpush
         for(int j=0; j<24; ++j) {
-            if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F)) {
+            if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F || cache->neoncache[j].t == NEON_CACHE_ST_I64)) {
                 if(cache->neoncache[j].n<cache->stack_push)
                     cache->neoncache[j].v = 0;
                 else
@@ -412,6 +448,7 @@ void neoncacheUnwind(neoncache_t* cache)
                     break;
                 case NEON_CACHE_ST_F:
                 case NEON_CACHE_ST_D:
+                case NEON_CACHE_ST_I64:
                     cache->x87cache[x87reg] = cache->neoncache[i].n;
                     cache->x87reg[x87reg] = i;
                     ++x87reg;
@@ -477,6 +514,7 @@ const char* getCacheName(int t, int n)
     switch(t) {
         case NEON_CACHE_ST_D: sprintf(buff, "ST%d", n); break;
         case NEON_CACHE_ST_F: sprintf(buff, "st%d", n); break;
+        case NEON_CACHE_ST_I64: sprintf(buff, "STi%d", n); break;
         case NEON_CACHE_MM: sprintf(buff, "MM%d", n); break;
         case NEON_CACHE_XMMW: sprintf(buff, "XMM%d", n); break;
         case NEON_CACHE_XMMR: sprintf(buff, "xmm%d", n); break;
@@ -521,6 +559,7 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
             switch(dyn->insts[ninst].n.neoncache[ii].t) {
                 case NEON_CACHE_ST_D: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
                 case NEON_CACHE_ST_F: dynarec_log(LOG_NONE, " S%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
+                case NEON_CACHE_ST_I64: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
                 case NEON_CACHE_MM: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
                 case NEON_CACHE_XMMW: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
                 case NEON_CACHE_XMMR: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h
index 950345fa..201cfcea 100755
--- a/src/dynarec/arm64/dynarec_arm64_functions.h
+++ b/src/dynarec/arm64/dynarec_arm64_functions.h
@@ -21,12 +21,16 @@ void fpu_reset_reg(dynarec_arm_t* dyn);
 // ---- Neon cache functions
 // Get type for STx
 int neoncache_get_st(dynarec_arm_t* dyn, int ninst, int a);
-// Get if STx is FLOAT or DOUBLE
+// Get if STx is FLOAT
 int neoncache_get_st_f(dynarec_arm_t* dyn, int ninst, int a);
+// Get if STx is FLOAT or I64
+int neoncache_get_st_f_i64(dynarec_arm_t* dyn, int ninst, int a);
 // Get actual type for STx
 int neoncache_get_current_st(dynarec_arm_t* dyn, int ninst, int a);
-// Get actual STx is FLOAT or DOUBLE
+// Get actual STx is FLOAT
 int neoncache_get_current_st_f(dynarec_arm_t* dyn, int a);
+// Get actual STx is FLOAT or I64
+int neoncache_get_current_st_f_i64(dynarec_arm_t* dyn, int a);
 // Back-propagate a change float->double
 void neoncache_promote_double(dynarec_arm_t* dyn, int ninst, int a);
 // Combine and propagate if needed (pass 1 only)
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index e1b93026..153f82a4 100755
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -846,7 +846,9 @@ static void x87_reset(dynarec_arm_t* dyn)
     dyn->n.swapped = 0;
     dyn->n.barrier = 0;
     for(int i=0; i<24; ++i)
-        if(dyn->n.neoncache[i].t == NEON_CACHE_ST_F || dyn->n.neoncache[i].t == NEON_CACHE_ST_D)
+        if(dyn->n.neoncache[i].t == NEON_CACHE_ST_F
+         || dyn->n.neoncache[i].t == NEON_CACHE_ST_D
+         || dyn->n.neoncache[i].t == NEON_CACHE_ST_I64)
             dyn->n.neoncache[i].v = 0;
 }
 
@@ -907,7 +909,9 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t)
     dyn->n.stack_push+=1;
     // move all regs in cache, and find a free one
     for(int j=0; j<24; ++j)
-        if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D) || (dyn->n.neoncache[j].t == NEON_CACHE_ST_F))
+        if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D)
+         ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_F)
+         ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_I64))
             ++dyn->n.neoncache[j].n;
     int ret = -1;
     for(int i=0; i<8; ++i)
@@ -916,13 +920,7 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t)
         else if(ret==-1) {
             dyn->n.x87cache[i] = 0;
             ret=dyn->n.x87reg[i]=fpu_get_reg_x87(dyn, t, 0);
-            #if STEP == 1
-            // need to check if reg is compatible with float
-            if((ret>15) && (t == NEON_CACHE_ST_F))
-                dyn->n.neoncache[ret].t = NEON_CACHE_ST_D;
-            #else
             dyn->n.neoncache[ret].t = X87_ST0;
-            #endif
         }
     return ret;
 }
@@ -936,7 +934,9 @@ void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1)
     dyn->n.stack_push+=1;
     // move all regs in cache
     for(int j=0; j<24; ++j)
-        if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D) || (dyn->n.neoncache[j].t == NEON_CACHE_ST_F))
+        if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D)
+         ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_F)
+         ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_I64))
             ++dyn->n.neoncache[j].n;
     for(int i=0; i<8; ++i)
         if(dyn->n.x87cache[i]!=-1)
@@ -1133,7 +1133,7 @@ int x87_get_current_cache(dynarec_arm_t* dyn, int ninst, int st, int t)
     for (int i=0; i<8; ++i) {
         if(dyn->n.x87cache[i]==st) {
             #if STEP == 1
-            if(t==NEON_CACHE_ST_D && (dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F))
+            if(t==NEON_CACHE_ST_D && (dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F || dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_I64))
                 neoncache_promote_double(dyn, ninst, st);
             #endif
             return i;
@@ -1179,7 +1179,9 @@ int x87_get_cache(dynarec_arm_t* dyn, int ninst, int populate, int s1, int s2, i
 int x87_get_neoncache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
 {
     for(int ii=0; ii<24; ++ii)
-        if((dyn->n.neoncache[ii].t == NEON_CACHE_ST_F || dyn->n.neoncache[ii].t == NEON_CACHE_ST_D)
+        if((dyn->n.neoncache[ii].t == NEON_CACHE_ST_F
+         || dyn->n.neoncache[ii].t == NEON_CACHE_ST_D
+         || dyn->n.neoncache[ii].t == NEON_CACHE_ST_I64)
          && dyn->n.neoncache[ii].n==st)
             return ii;
     assert(0);
@@ -1217,6 +1219,9 @@ void x87_refresh(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
     if(dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_F) {
         FCVT_D_S(31, dyn->n.x87reg[ret]);
         VSTR64_REG_LSL3(31, s1, s2);
+    } else if(dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_I64) {
+        SCVTFDD(31, dyn->n.x87reg[ret]);
+        VSTR64_REG_LSL3(31, s1, s2);
     } else {
         VSTR64_REG_LSL3(dyn->n.x87reg[ret], s1, s2);
     }
@@ -1234,7 +1239,7 @@ void x87_forget(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
         return;
     MESSAGE(LOG_DUMP, "\tForget x87 Cache for ST%d\n", st);
     #if STEP == 1
-    if(dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_F)
+    if(dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_F || dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_I64)
         neoncache_promote_double(dyn, ninst, st);
     #endif
     // prepare offset to fpu => s1
@@ -1265,7 +1270,7 @@ void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
             // refresh the value
             MESSAGE(LOG_DUMP, "\tRefresh x87 Cache for ST%d\n", st);
             #if STEP == 1
-            if(dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F)
+            if(dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F || dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_I64)
                 neoncache_promote_double(dyn, ninst, st);
             #endif
             ADDx_U12(s1, xEmu, offsetof(x64emu_t, x87));
@@ -1591,10 +1596,20 @@ static int findCacheSlot(dynarec_arm_t* dyn, int ninst, int t, int n, neoncache_
                 case NEON_CACHE_ST_F:
                     if (t==NEON_CACHE_ST_D)
                         return i;
+                    if (t==NEON_CACHE_ST_I64)
+                        return i;
                     break;
                 case NEON_CACHE_ST_D:
                     if (t==NEON_CACHE_ST_F)
                         return i;
+                    if (t==NEON_CACHE_ST_I64)
+                        return i;
+                    break;
+                case NEON_CACHE_ST_I64:
+                    if (t==NEON_CACHE_ST_F)
+                        return i;
+                    if (t==NEON_CACHE_ST_D)
+                        return i;
                     break;
                 case NEON_CACHE_XMMR:
                     if(t==NEON_CACHE_XMMW)
@@ -1684,6 +1699,7 @@ static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int
             break;
         case NEON_CACHE_ST_D:
         case NEON_CACHE_ST_F:
+        case NEON_CACHE_ST_I64:
             MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
             if((*s3_top) == 0xffff) {
                 LDRw_U12(s3, xEmu, offsetof(x64emu_t, top));
@@ -1705,6 +1721,9 @@ static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int
             if(t==NEON_CACHE_ST_F) {
                 FCVT_S_D(i, i);
             }
+            if(t==NEON_CACHE_ST_I64) {
+                VFCVTZSQD(i, i);
+            }
             break;
         case NEON_CACHE_NONE:
         case NEON_CACHE_SCR:
@@ -1732,6 +1751,7 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in
             break;
         case NEON_CACHE_ST_D:
         case NEON_CACHE_ST_F:
+        case NEON_CACHE_ST_I64:
             MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
             if((*s3_top)==0xffff) {
                 LDRw_U12(s3, xEmu, offsetof(x64emu_t, top));
@@ -1751,6 +1771,8 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in
             *s2_val = 0;
             if(t==NEON_CACHE_ST_F) {
                 FCVT_D_S(i, i);
+            } else if (t==NEON_CACHE_ST_I64) {
+                SCVTFDD(i, i);
             }
             VSTR64_U12(i, s2, offsetof(x64emu_t, x87));
             break;
@@ -1880,6 +1902,23 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
                     MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
                     FCVT_D_S(i, i);
                     cache.neoncache[i].t = NEON_CACHE_ST_D;
+                } else if(cache.neoncache[i].t == NEON_CACHE_ST_D && cache_i2.neoncache[i].t == NEON_CACHE_ST_I64) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    VFCVTZSQD(i, i);
+                    cache.neoncache[i].t = NEON_CACHE_ST_I64;
+                } else if(cache.neoncache[i].t == NEON_CACHE_ST_F && cache_i2.neoncache[i].t == NEON_CACHE_ST_I64) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    VFCVTZSQS(i, i);
+                    cache.neoncache[i].t = NEON_CACHE_ST_D;
+                } else if(cache.neoncache[i].t == NEON_CACHE_ST_I64 && cache_i2.neoncache[i].t == NEON_CACHE_ST_F) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    SCVTFDD(i, i);
+                    FCVT_S_D(i, i);
+                    cache.neoncache[i].t = NEON_CACHE_ST_F;
+                } else if(cache.neoncache[i].t == NEON_CACHE_ST_I64 && cache_i2.neoncache[i].t == NEON_CACHE_ST_D) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    SCVTFDD(i, i);
+                    cache.neoncache[i].t = NEON_CACHE_ST_D;
                 } else if(cache.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW)
                     { cache.neoncache[i].t = NEON_CACHE_XMMW; }
                 else if(cache.neoncache[i].t == NEON_CACHE_XMMW && cache_i2.neoncache[i].t == NEON_CACHE_XMMR) {
@@ -2031,7 +2070,9 @@ void fpu_propagate_stack(dynarec_arm_t* dyn, int ninst)
 {
     if(dyn->n.stack_pop) {
         for(int j=0; j<24; ++j)
-            if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D || dyn->n.neoncache[j].t == NEON_CACHE_ST_F)) {
+            if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D
+             || dyn->n.neoncache[j].t == NEON_CACHE_ST_F
+             || dyn->n.neoncache[j].t == NEON_CACHE_ST_I64)) {
                 if(dyn->n.neoncache[j].n<dyn->n.stack_pop)
                     dyn->n.neoncache[j].v = 0;
                 else
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index ec3dbf33..64866147 100755
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -1142,16 +1142,19 @@ int neoncache_st_coherency(dynarec_arm_t* dyn, int ninst, int a, int b);
 
 #if STEP == 0
 #define ST_IS_F(A)          0
+#define ST_IS_I64(A)        0
 #define X87_COMBINE(A, B)   NEON_CACHE_ST_D
 #define X87_ST0             NEON_CACHE_ST_D
 #define X87_ST(A)           NEON_CACHE_ST_D
 #elif STEP == 1
 #define ST_IS_F(A) (neoncache_get_current_st(dyn, ninst, A)==NEON_CACHE_ST_F)
+#define ST_IS_I64(A) (neoncache_get_current_st(dyn, ninst, A)==NEON_CACHE_ST_I64)
 #define X87_COMBINE(A, B) neoncache_combine_st(dyn, ninst, A, B)
 #define X87_ST0     neoncache_get_current_st(dyn, ninst, 0)
 #define X87_ST(A)   neoncache_get_current_st(dyn, ninst, A)
 #else
 #define ST_IS_F(A) (neoncache_get_st(dyn, ninst, A)==NEON_CACHE_ST_F)
+#define ST_IS_I64(A) (neoncache_get_st(dyn, ninst, A)==NEON_CACHE_ST_I64)
 #if STEP == 3
 #define X87_COMBINE(A, B) neoncache_st_coherency(dyn, ninst, A, B)
 #else
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index 52ad44b4..ba802217 100755
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -9,13 +9,14 @@ typedef struct instsize_s instsize_t;
 
 #define BARRIER_MAYBE   8
 
-#define NEON_CACHE_NONE 0
-#define NEON_CACHE_ST_D 1
-#define NEON_CACHE_ST_F 2
-#define NEON_CACHE_MM   3
-#define NEON_CACHE_XMMW 4
-#define NEON_CACHE_XMMR 5
-#define NEON_CACHE_SCR  6
+#define NEON_CACHE_NONE     0
+#define NEON_CACHE_ST_D     1
+#define NEON_CACHE_ST_F     2
+#define NEON_CACHE_ST_I64   3
+#define NEON_CACHE_MM       4
+#define NEON_CACHE_XMMW     5
+#define NEON_CACHE_XMMR     6
+#define NEON_CACHE_SCR      7
 typedef union neon_cache_s {
     int8_t           v;
     struct {