about summary refs log tree commit diff stats
path: root/src/dynarec/arm64/dynarec_arm64_helper.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/dynarec/arm64/dynarec_arm64_helper.c')
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_helper.c | 309
1 file changed, 250 insertions(+), 59 deletions(-)
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 7aaf098e..13b58359 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -1,7 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include <pthread.h>
 #include <errno.h>
 #include <assert.h>
 #include <string.h>
@@ -19,7 +18,6 @@
 #include "x64trace.h"
 #include "dynarec_native.h"
 #include "../dynablock_private.h"
-#include "../tools/bridge_private.h"
 #include "custommem.h"
 
 #include "arm64_printer.h"
@@ -27,11 +25,16 @@
 #include "dynarec_arm64_functions.h"
 #include "dynarec_arm64_helper.h"
 
+static uintptr_t geted_32(dynarec_arm_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int* unscaled, int absmax, uint32_t mask, int* l, int s);
+
 /* setup r2 to address pointed by ED, also fixaddress is an optionnal delta in the range [-absmax, +absmax], with delta&mask==0 to be added to ed for LDR/STR */
 uintptr_t geted(dynarec_arm_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int* unscaled, int absmax, uint32_t mask, rex_t rex, int *l, int s, int delta)
 {
     MAYUSE(dyn); MAYUSE(ninst); MAYUSE(delta);
 
+    if(rex.is32bits)
+        return geted_32(dyn, addr, ninst, nextop, ed, hint, fixaddress, unscaled, absmax, mask, l, s);
+
     int lock = l?((l==LOCK_LOCK)?1:2):0;
     if(unscaled)
         *unscaled = 0;
@@ -119,7 +122,7 @@ uintptr_t geted(dynarec_arm_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, u
         }
         if(nextop&0x80)
             i64 = F32S;
-        else 
+        else
             i64 = F8S;
         if(i64==0 || ((i64>=absmin) && (i64<=absmax)  && !(i64&mask)) || (unscaled && (i64>-256) && (i64<256))) {
             *fixaddress = i64;
@@ -183,6 +186,141 @@ uintptr_t geted(dynarec_arm_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, u
     return addr;
 }
 
+static uintptr_t geted_32(dynarec_arm_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int* unscaled, int absmax, uint32_t mask, int* l, int s)
+{
+    MAYUSE(dyn); MAYUSE(ninst);
+
+    int lock = l?((l==LOCK_LOCK)?1:2):0;
+    if(unscaled)
+        *unscaled = 0;
+    if(lock==2)
+        *l = 0;
+    uint8_t ret = x2;
+    uint8_t scratch = x2;
+    *fixaddress = 0;
+    if(hint>0) ret = hint;
+    if(hint>0 && hint<xRAX) scratch = hint;
+    int absmin = 0;
+    if(s) absmin=-absmax;
+    MAYUSE(scratch);
+    if(!(nextop&0xC0)) {
+        if((nextop&7)==4) {
+            uint8_t sib = F8;
+            int sib_reg = (sib>>3)&7;
+            if((sib&0x7)==5) {
+                int64_t tmp = F32S;
+                if (sib_reg!=4) {
+                    if(tmp && (!((tmp>=absmin) && (tmp<=absmax) && !(tmp&mask))) || !(unscaled && (tmp>-256) && (tmp<256))) {
+                        MOV32w(scratch, tmp);
+                        ADDw_REG_LSL(ret, scratch, xRAX+sib_reg, (sib>>6));
+                    } else {
+                        LSLw(ret, xRAX+sib_reg, (sib>>6));
+                        *fixaddress = tmp;
+                        if(unscaled && (tmp>-256) && (tmp<256))
+                            *unscaled = 1;
+                    }
+                } else {
+                    switch(lock) {
+                        case 1: addLockAddress((int32_t)tmp); break;
+                        case 2: if(isLockAddress((int32_t)tmp)) *l=1; break;
+                    }
+                    MOV32w(ret, tmp);
+                }
+            } else {
+                if (sib_reg!=4) {
+                    ADDw_REG_LSL(ret, xRAX+(sib&0x7), xRAX+sib_reg, (sib>>6));
+                } else {
+                    ret = xRAX+(sib&0x7);
+                }
+            }
+        } else if((nextop&7)==5) {
+            uint64_t tmp = F32;
+            MOV32w(ret, tmp);
+            switch(lock) {
+                case 1: addLockAddress(tmp); break;
+                case 2: if(isLockAddress(tmp)) *l=1; break;
+            }
+        } else {
+            ret = xRAX+(nextop&7);
+            if(ret==hint) {
+                MOVw_REG(hint, ret);    //to clear upper part
+            }
+        }
+    } else {
+        int64_t i32;
+        uint8_t sib = 0;
+        int sib_reg = 0;
+        if((nextop&7)==4) {
+            sib = F8;
+            sib_reg = (sib>>3)&7;
+        }
+        if(nextop&0x80)
+            i32 = F32S;
+        else 
+            i32 = F8S;
+        if(i32==0 || ((i32>=absmin) && (i32<=absmax)  && !(i32&mask)) || (unscaled && (i32>-256) && (i32<256))) {
+            *fixaddress = i32;
+            if(unscaled && (i32>-256) && (i32<256))
+                *unscaled = 1;
+            if((nextop&7)==4) {
+                if (sib_reg!=4) {
+                    ADDw_REG_LSL(ret, xRAX+(sib&0x07), xRAX+sib_reg, (sib>>6));
+                } else {
+                    ret = xRAX+(sib&0x07);
+                }
+            } else {
+                ret = xRAX+(nextop&0x07);
+            }
+        } else {
+            int64_t sub = (i32<0)?1:0;
+            if(sub) i32 = -i32;
+            if(i32<0x1000) {
+                if((nextop&7)==4) {
+                    if (sib_reg!=4) {
+                        ADDw_REG_LSL(scratch, xRAX+(sib&0x07), xRAX+sib_reg, (sib>>6));
+                    } else {
+                        scratch = xRAX+(sib&0x07);
+                    }
+                } else
+                    scratch = xRAX+(nextop&0x07);
+                if(sub) {
+                    SUBw_U12(ret, scratch, i32);
+                } else {
+                    ADDw_U12(ret, scratch, i32);
+                }
+            } else {
+                MOV32w(scratch, i32);
+                if((nextop&7)==4) {
+                    if (sib_reg!=4) {
+                        if(sub) {
+                            SUBw_REG(scratch, xRAX+(sib&0x07), scratch);
+                        } else {
+                            ADDw_REG(scratch, scratch, xRAX+(sib&0x07));
+                        }
+                        ADDw_REG_LSL(ret, scratch, xRAX+sib_reg, (sib>>6));
+                    } else {
+                        PASS3(int tmp = xRAX+(sib&0x07));
+                        if(sub) {
+                            SUBw_REG(ret, tmp, scratch);
+                        } else {
+                            ADDw_REG(ret, tmp, scratch);
+                        }
+                    }
+                } else {
+                    PASS3(int tmp = xRAX+(nextop&0x07));
+                    if(sub) {
+                        SUBw_REG(ret, tmp, scratch);
+                    } else {
+                        ADDw_REG(ret, tmp, scratch);
+                    }
+                }
+            }
+        }
+    }
+    *ed = ret;
+    return addr;
+}
+
 /* setup r2 to address pointed by ED, also fixaddress is an optionnal delta in the range [-absmax, +absmax], with delta&mask==0 to be added to ed for LDR/STR */
 uintptr_t geted32(dynarec_arm_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int* unscaled, int absmax, uint32_t mask, rex_t rex, int* l, int s, int delta)
 {
@@ -256,9 +394,9 @@ uintptr_t geted32(dynarec_arm_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
         }
         if(nextop&0x80)
             i64 = F32S;
-        else 
+        else
             i64 = F8S;
-        if(i64==0 || ((i64>=absmin) && (i64<=absmax)  && !(i64&mask)) || (unscaled && (i64>-256) && (i64>256))) {
+        if(i64==0 || ((i64>=absmin) && (i64<=absmax)  && !(i64&mask)) || (unscaled && (i64>-256) && (i64<256))) {
             *fixaddress = i64;
             if(unscaled && (i64>-256) && (i64<256))
                 *unscaled = 1;
@@ -339,8 +477,8 @@ uintptr_t geted16(dynarec_arm_t* dyn, uintptr_t addr, int ninst, uint8_t nextop,
     int64_t offset = 0;
     int absmin = 0;
     if(s) absmin = -absmax;
-    if(!n && m==6) {
-        offset = F16;
+    if(!n && (m&7)==6) {
+        offset = F16S;
         MOVZw(ret, offset);
     } else {
         switch(n) {
@@ -458,18 +596,18 @@ void jump_to_next(dynarec_arm_t* dyn, uintptr_t ip, int reg, int ninst)
     }
     CLEARIP();
     #ifdef HAVE_TRACE
-    //MOVx(x3, 15);    no access to PC reg 
+    //MOVx(x3, 15);    no access to PC reg
     #endif
     SMEND();
     BLR(x2); // save LR...
 }
 
-void ret_to_epilog(dynarec_arm_t* dyn, int ninst)
+void ret_to_epilog(dynarec_arm_t* dyn, int ninst, rex_t rex)
 {
     MAYUSE(dyn); MAYUSE(ninst);
     MESSAGE(LOG_DUMP, "Ret to epilog\n");
-    POP1(xRIP);
-    MOVx_REG(x1, xRIP);
+    POP1z(xRIP);
+    MOVz_REG(x1, xRIP);
     SMEND();
     if(box64_dynarec_callret) {
         // pop the actual return address for ARM stack
@@ -496,18 +634,18 @@ void ret_to_epilog(dynarec_arm_t* dyn, int ninst)
     CLEARIP();
 }
 
-void retn_to_epilog(dynarec_arm_t* dyn, int ninst, int n)
+void retn_to_epilog(dynarec_arm_t* dyn, int ninst, rex_t rex, int n)
 {
     MAYUSE(dyn); MAYUSE(ninst);
     MESSAGE(LOG_DUMP, "Retn to epilog\n");
-    POP1(xRIP);
+    POP1z(xRIP);
     if(n>0xfff) {
         MOV32w(w1, n);
-        ADDx_REG(xRSP, xRSP, x1);
+        ADDz_REG(xRSP, xRSP, x1);
     } else {
-        ADDx_U12(xRSP, xRSP, n);
+        ADDz_U12(xRSP, xRSP, n);
     }
-    MOVx_REG(x1, xRIP);
+    MOVz_REG(x1, xRIP);
     SMEND();
     if(box64_dynarec_callret) {
         // pop the actual return address for ARM stack
@@ -541,24 +679,34 @@ void iret_to_epilog(dynarec_arm_t* dyn, int ninst, int is64bits)
     MESSAGE(LOG_DUMP, "IRet to epilog\n");
     // POP IP
     NOTEST(x2);
-    POP1(xRIP);
-    // POP CS
-    POP1(x2);
+    if(is64bits) {
+        POP1(xRIP);
+        POP1(x2);
+        POP1(xFlags);
+    } else {
+        POP1_32(xRIP);
+        POP1_32(x2);
+        POP1_32(xFlags);
+    }
+    // x2 is CS
     STRH_U12(x2, xEmu, offsetof(x64emu_t, segs[_CS]));
-    MOVZw(x1, 0);
-    STRx_U12(x1, xEmu, offsetof(x64emu_t, segs_serial[_CS]));
-    STRx_U12(x1, xEmu, offsetof(x64emu_t, segs_serial[_SS]));
-    // POP EFLAGS
-    POP1(xFlags);
+    STRw_U12(xZR, xEmu, offsetof(x64emu_t, segs_serial[_CS]));
+    // clean EFLAGS
     MOV32w(x1, 0x3F7FD7);
     ANDx_REG(xFlags, xFlags, x1);
-    ORRx_mask(xFlags, xFlags, 1, 0b111111, 0);
+    ORRx_mask(xFlags, xFlags, 1, 0b111111, 0); // xFlags | 0b10
     SET_DFNONE(x1);
     // POP RSP
-    POP1(x3);
+    if(is64bits) {
+        POP1(x3);   //rsp
+        POP1(x2);   //ss
+    } else {
+        POP1_32(x3);   //rsp
+        POP1_32(x2);   //ss
+    }
     // POP SS
-    POP1(x2);
     STRH_U12(x2, xEmu, offsetof(x64emu_t, segs[_SS]));
+    STRw_U12(xZR, xEmu, offsetof(x64emu_t, segs_serial[_SS]));
     // set new RSP
     MOVx_REG(xRSP, x3);
     // Ret....
@@ -698,7 +846,9 @@ static void x87_reset(dynarec_arm_t* dyn)
     dyn->n.swapped = 0;
     dyn->n.barrier = 0;
     for(int i=0; i<24; ++i)
-        if(dyn->n.neoncache[i].t == NEON_CACHE_ST_F || dyn->n.neoncache[i].t == NEON_CACHE_ST_D)
+        if(dyn->n.neoncache[i].t == NEON_CACHE_ST_F
+         || dyn->n.neoncache[i].t == NEON_CACHE_ST_D
+         || dyn->n.neoncache[i].t == NEON_CACHE_ST_I64)
             dyn->n.neoncache[i].v = 0;
 }
 
@@ -759,7 +909,9 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t)
     dyn->n.stack_push+=1;
     // move all regs in cache, and find a free one
     for(int j=0; j<24; ++j)
-        if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D) || (dyn->n.neoncache[j].t == NEON_CACHE_ST_F))
+        if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D)
+         ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_F)
+         ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_I64))
             ++dyn->n.neoncache[j].n;
     int ret = -1;
     for(int i=0; i<8; ++i)
@@ -768,13 +920,7 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t)
         else if(ret==-1) {
             dyn->n.x87cache[i] = 0;
             ret=dyn->n.x87reg[i]=fpu_get_reg_x87(dyn, t, 0);
-            #if STEP == 1
-            // need to check if reg is compatible with float
-            if((ret>15) && (t == NEON_CACHE_ST_F))
-                dyn->n.neoncache[ret].t = NEON_CACHE_ST_D;
-            #else
             dyn->n.neoncache[ret].t = X87_ST0;
-            #endif
         }
     return ret;
 }
@@ -788,7 +934,9 @@ void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1)
     dyn->n.stack_push+=1;
     // move all regs in cache
     for(int j=0; j<24; ++j)
-        if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D) || (dyn->n.neoncache[j].t == NEON_CACHE_ST_F))
+        if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D)
+         ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_F)
+         ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_I64))
             ++dyn->n.neoncache[j].n;
     for(int i=0; i<8; ++i)
         if(dyn->n.x87cache[i]!=-1)
@@ -985,7 +1133,9 @@ int x87_get_current_cache(dynarec_arm_t* dyn, int ninst, int st, int t)
     for (int i=0; i<8; ++i) {
         if(dyn->n.x87cache[i]==st) {
             #if STEP == 1
-            if(t==NEON_CACHE_ST_D && (dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F))
+            if(t==NEON_CACHE_ST_D && (dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F || dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_I64))
+                neoncache_promote_double(dyn, ninst, st);
+            else if(t==NEON_CACHE_ST_F && (dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_I64))
                 neoncache_promote_double(dyn, ninst, st);
             #endif
             return i;
@@ -1031,7 +1181,9 @@ int x87_get_cache(dynarec_arm_t* dyn, int ninst, int populate, int s1, int s2, i
 int x87_get_neoncache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
 {
     for(int ii=0; ii<24; ++ii)
-        if((dyn->n.neoncache[ii].t == NEON_CACHE_ST_F || dyn->n.neoncache[ii].t == NEON_CACHE_ST_D)
+        if((dyn->n.neoncache[ii].t == NEON_CACHE_ST_F
+         || dyn->n.neoncache[ii].t == NEON_CACHE_ST_D
+         || dyn->n.neoncache[ii].t == NEON_CACHE_ST_I64)
          && dyn->n.neoncache[ii].n==st)
             return ii;
     assert(0);
@@ -1069,6 +1221,9 @@ void x87_refresh(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
     if(dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_F) {
         FCVT_D_S(31, dyn->n.x87reg[ret]);
         VSTR64_REG_LSL3(31, s1, s2);
+    } else if(dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_I64) {
+        SCVTFDD(31, dyn->n.x87reg[ret]);
+        VSTR64_REG_LSL3(31, s1, s2);
     } else {
         VSTR64_REG_LSL3(dyn->n.x87reg[ret], s1, s2);
     }
@@ -1086,7 +1241,7 @@ void x87_forget(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
         return;
     MESSAGE(LOG_DUMP, "\tForget x87 Cache for ST%d\n", st);
     #if STEP == 1
-    if(dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_F)
+    if(dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_F || dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_I64)
         neoncache_promote_double(dyn, ninst, st);
     #endif
     // prepare offset to fpu => s1
@@ -1117,7 +1272,7 @@ void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
             // refresh the value
             MESSAGE(LOG_DUMP, "\tRefresh x87 Cache for ST%d\n", st);
             #if STEP == 1
-            if(dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F)
+            if(dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F || dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_I64)
                 neoncache_promote_double(dyn, ninst, st);
             #endif
             ADDx_U12(s1, xEmu, offsetof(x64emu_t, x87));
@@ -1443,10 +1598,20 @@ static int findCacheSlot(dynarec_arm_t* dyn, int ninst, int t, int n, neoncache_
                 case NEON_CACHE_ST_F:
                     if (t==NEON_CACHE_ST_D)
                         return i;
+                    if (t==NEON_CACHE_ST_I64)
+                        return i;
                     break;
                 case NEON_CACHE_ST_D:
                     if (t==NEON_CACHE_ST_F)
                         return i;
+                    if (t==NEON_CACHE_ST_I64)
+                        return i;
+                    break;
+                case NEON_CACHE_ST_I64:
+                    if (t==NEON_CACHE_ST_F)
+                        return i;
+                    if (t==NEON_CACHE_ST_D)
+                        return i;
                     break;
                 case NEON_CACHE_XMMR:
                     if(t==NEON_CACHE_XMMW)
@@ -1471,7 +1636,7 @@ static void swapCache(dynarec_arm_t* dyn, int ninst, int i, int j, neoncache_t *
         quad =1;
     if(cache->neoncache[j].t==NEON_CACHE_XMMR || cache->neoncache[j].t==NEON_CACHE_XMMW)
         quad =1;
-    
+
     if(!cache->neoncache[i].v) {
         // a mov is enough, no need to swap
         MESSAGE(LOG_DUMP, "\t  - Moving %d <- %d\n", i, j);
@@ -1531,12 +1696,13 @@ static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int
             VLDR128_U12(i, xEmu, offsetof(x64emu_t, xmm[n]));
             break;
         case NEON_CACHE_MM:
-            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));                    
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
             VLDR64_U12(i, xEmu, offsetof(x64emu_t, mmx[i]));
             break;
         case NEON_CACHE_ST_D:
         case NEON_CACHE_ST_F:
-            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));                    
+        case NEON_CACHE_ST_I64:
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
             if((*s3_top) == 0xffff) {
                 LDRw_U12(s3, xEmu, offsetof(x64emu_t, top));
                 *s3_top = 0;
@@ -1557,12 +1723,15 @@ static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int
             if(t==NEON_CACHE_ST_F) {
                 FCVT_S_D(i, i);
             }
-            break;                    
+            if(t==NEON_CACHE_ST_I64) {
+                VFCVTZSQD(i, i);
+            }
+            break;
         case NEON_CACHE_NONE:
         case NEON_CACHE_SCR:
         default:    /* nothing done */
             MESSAGE(LOG_DUMP, "\t  - ignoring %s\n", getCacheName(t, n));
-            break; 
+            break;
     }
     cache->neoncache[i].n = n;
     cache->neoncache[i].t = t;
@@ -1579,12 +1748,13 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in
             VSTR128_U12(i, xEmu, offsetof(x64emu_t, xmm[n]));
             break;
         case NEON_CACHE_MM:
-            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));                    
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
             VSTR64_U12(i, xEmu, offsetof(x64emu_t, mmx[n]));
             break;
         case NEON_CACHE_ST_D:
         case NEON_CACHE_ST_F:
-            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));                    
+        case NEON_CACHE_ST_I64:
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
             if((*s3_top)==0xffff) {
                 LDRw_U12(s3, xEmu, offsetof(x64emu_t, top));
                 *s3_top = 0;
@@ -1603,14 +1773,16 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in
             *s2_val = 0;
             if(t==NEON_CACHE_ST_F) {
                 FCVT_D_S(i, i);
+            } else if (t==NEON_CACHE_ST_I64) {
+                SCVTFDD(i, i);
             }
             VSTR64_U12(i, s2, offsetof(x64emu_t, x87));
-            break;                    
+            break;
         case NEON_CACHE_NONE:
         case NEON_CACHE_SCR:
         default:    /* nothing done */
             MESSAGE(LOG_DUMP, "\t  - ignoring %s\n", getCacheName(t, n));
-            break; 
+            break;
     }
     cache->neoncache[i].v = 0;
 }
@@ -1732,6 +1904,23 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
                     MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
                     FCVT_D_S(i, i);
                     cache.neoncache[i].t = NEON_CACHE_ST_D;
+                } else if(cache.neoncache[i].t == NEON_CACHE_ST_D && cache_i2.neoncache[i].t == NEON_CACHE_ST_I64) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    VFCVTZSQD(i, i);
+                    cache.neoncache[i].t = NEON_CACHE_ST_I64;
+                } else if(cache.neoncache[i].t == NEON_CACHE_ST_F && cache_i2.neoncache[i].t == NEON_CACHE_ST_I64) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    VFCVTZSQS(i, i);
+                    cache.neoncache[i].t = NEON_CACHE_ST_D;
+                } else if(cache.neoncache[i].t == NEON_CACHE_ST_I64 && cache_i2.neoncache[i].t == NEON_CACHE_ST_F) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    SCVTFDD(i, i);
+                    FCVT_S_D(i, i);
+                    cache.neoncache[i].t = NEON_CACHE_ST_F;
+                } else if(cache.neoncache[i].t == NEON_CACHE_ST_I64 && cache_i2.neoncache[i].t == NEON_CACHE_ST_D) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    SCVTFDD(i, i);
+                    cache.neoncache[i].t = NEON_CACHE_ST_D;
                 } else if(cache.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW)
                     { cache.neoncache[i].t = NEON_CACHE_XMMW; }
                 else if(cache.neoncache[i].t == NEON_CACHE_XMMW && cache_i2.neoncache[i].t == NEON_CACHE_XMMR) {
@@ -1759,18 +1948,18 @@ static void flagsCacheTransform(dynarec_arm_t* dyn, int ninst, int s1)
     int go = 0;
     switch (dyn->insts[jmp].f_entry.pending) {
         case SF_UNKNOWN: break;
-        case SF_SET: 
-            if(dyn->f.pending!=SF_SET && dyn->f.pending!=SF_SET_PENDING) 
-                go = 1; 
+        case SF_SET:
+            if(dyn->f.pending!=SF_SET && dyn->f.pending!=SF_SET_PENDING)
+                go = 1;
             break;
         case SF_SET_PENDING:
-            if(dyn->f.pending!=SF_SET 
+            if(dyn->f.pending!=SF_SET
             && dyn->f.pending!=SF_SET_PENDING
-            && dyn->f.pending!=SF_PENDING) 
-                go = 1; 
+            && dyn->f.pending!=SF_PENDING)
+                go = 1;
             break;
         case SF_PENDING:
-            if(dyn->f.pending!=SF_SET 
+            if(dyn->f.pending!=SF_SET
             && dyn->f.pending!=SF_SET_PENDING
             && dyn->f.pending!=SF_PENDING)
                 go = 1;
@@ -1783,11 +1972,11 @@ static void flagsCacheTransform(dynarec_arm_t* dyn, int ninst, int s1)
     if(go) {
         if(dyn->f.pending!=SF_PENDING) {
             LDRw_U12(s1, xEmu, offsetof(x64emu_t, df));
-            j64 = (GETMARK3)-(dyn->native_size);
+            j64 = (GETMARKF2)-(dyn->native_size);
             CBZw(s1, j64);
         }
         CALL_(UpdateFlags, -1, 0);
-        MARK3;
+        MARKF2;
     }
 #endif
 }
@@ -1883,7 +2072,9 @@ void fpu_propagate_stack(dynarec_arm_t* dyn, int ninst)
 {
     if(dyn->n.stack_pop) {
         for(int j=0; j<24; ++j)
-            if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D || dyn->n.neoncache[j].t == NEON_CACHE_ST_F)) {
+            if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D
+             || dyn->n.neoncache[j].t == NEON_CACHE_ST_F
+             || dyn->n.neoncache[j].t == NEON_CACHE_ST_I64)) {
                 if(dyn->n.neoncache[j].n<dyn->n.stack_pop)
                     dyn->n.neoncache[j].v = 0;
                 else