about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorptitSeb <sebastien.chev@gmail.com>2023-03-21 20:20:35 +0000
committerptitSeb <sebastien.chev@gmail.com>2023-03-21 20:20:35 +0000
commit900f38779da63d30625c6951291ce0e39ff3c598 (patch)
treed68903d1156ce7566efda38fd00d03ae9e9ae0ec
parentd369ac8bf5d0c5de5e8222f5ef416df943298050 (diff)
downloadbox64-900f38779da63d30625c6951291ce0e39ff3c598.tar.gz
box64-900f38779da63d30625c6951291ce0e39ff3c598.zip
[RV64_DYNAREC] Added x87/SSE/mmx infrastructure, and a few x87 D9 opcodes
-rwxr-xr-xCMakeLists.txt2
-rwxr-xr-xsrc/dynarec/dynarec_arch.h6
-rw-r--r--src/dynarec/rv64/dynarec_rv64_00.c4
-rw-r--r--src/dynarec/rv64/dynarec_rv64_d9.c383
-rw-r--r--src/dynarec/rv64/dynarec_rv64_functions.c414
-rw-r--r--src/dynarec/rv64/dynarec_rv64_functions.h34
-rw-r--r--src/dynarec/rv64/dynarec_rv64_helper.c1087
-rw-r--r--src/dynarec/rv64/dynarec_rv64_helper.h77
-rw-r--r--src/dynarec/rv64/dynarec_rv64_pass2.h1
-rw-r--r--src/dynarec/rv64/dynarec_rv64_pass3.h1
-rw-r--r--src/dynarec/rv64/dynarec_rv64_private.h53
-rw-r--r--src/dynarec/rv64/rv64_emitter.h61
12 files changed, 2082 insertions, 41 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 866acaca..619e25b0 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -615,7 +615,7 @@ if(RV64_DYNAREC)
     "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_66.c"
     #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_67.c"
     #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_d8.c"
-    #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_d9.c"
+    "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_d9.c"
     #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_da.c"
     #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_db.c"
     #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_dc.c"
diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h
index d88f591b..51e74d4e 100755
--- a/src/dynarec/dynarec_arch.h
+++ b/src/dynarec/dynarec_arch.h
@@ -32,9 +32,11 @@
 #define instruction_native_t        instruction_rv64_t

 #define dynarec_native_t            dynarec_rv64_t

 

-#define ADDITIONNAL_DEFINITION()

+#define ADDITIONNAL_DEFINITION()  \

+    int fpuCacheNeedsTransform(dynarec_native_t* dyn, int ninst);

 

-#define OTHER_CACHE()

+#define OTHER_CACHE()   \

+    if (fpuCacheNeedsTransform(dyn, ninst)) ret|=2;

 

 #include "rv64/rv64_printer.h"

 #include "rv64/dynarec_rv64_private.h"

diff --git a/src/dynarec/rv64/dynarec_rv64_00.c b/src/dynarec/rv64/dynarec_rv64_00.c
index d91a3216..b6153b06 100644
--- a/src/dynarec/rv64/dynarec_rv64_00.c
+++ b/src/dynarec/rv64/dynarec_rv64_00.c
@@ -1003,6 +1003,10 @@ uintptr_t dynarec64_00(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             }
             break;
 
+        case 0xD9:
+            addr = dynarec64_D9(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+
         case 0xE8:
             INST_NAME("CALL Id");
             i32 = F32S;
diff --git a/src/dynarec/rv64/dynarec_rv64_d9.c b/src/dynarec/rv64/dynarec_rv64_d9.c
new file mode 100644
index 00000000..bff399e6
--- /dev/null
+++ b/src/dynarec/rv64/dynarec_rv64_d9.c
@@ -0,0 +1,383 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <pthread.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "emu/x87emu_private.h"
+#include "dynarec_native.h"
+
+#include "rv64_printer.h"
+#include "dynarec_rv64_private.h"
+#include "dynarec_rv64_helper.h"
+#include "dynarec_rv64_functions.h"
+
+
+uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    (void)ip; (void)rep; (void)need_epilog;
+
+    uint8_t nextop = F8;
+    uint8_t ed;
+    uint8_t wback, wb1;
+    uint8_t u8;
+    int64_t fixedaddress;
+    int unscaled;
+    int v1, v2;
+    int s0;
+    int i1, i2, i3;
+
+    MAYUSE(s0);
+    MAYUSE(v2);
+    MAYUSE(v1);
+
+    switch(nextop) {
+        case 0xC0:
+        case 0xC1:
+        case 0xC2:
+        case 0xC3:
+        case 0xC4:
+        case 0xC5:
+        case 0xC6:
+        case 0xC7:
+            INST_NAME("FLD STx");
+            v2 = x87_do_push(dyn, ninst, x1, X87_ST(nextop&7));
+            v1 = x87_get_st(dyn, ninst, x1, x2, (nextop&7)+1, X87_COMBINE(0, (nextop&7)+1));
+            if(ST_IS_F(0)) {
+                FMVS(v2, v1);
+            } else {
+                FMVD(v2, v1);
+            }
+            break;
+
+        case 0xC8:
+            INST_NAME("FXCH ST0");
+            break;
+        case 0xC9:
+        case 0xCA:
+        case 0xCB:
+        case 0xCC:
+        case 0xCD:
+        case 0xCE:
+        case 0xCF:
+            INST_NAME("FXCH STx");
+            // swap the cache value, not the double value itself :p
+            x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_ST(nextop&7));
+            x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+            x87_swapreg(dyn, ninst, x1, x2, 0, nextop&7);
+            // should set C1 to 0
+            break;
+
+        case 0xD0:
+            INST_NAME("FNOP");
+            break;
+
+        case 0xD8:
+            INST_NAME("FSTPNCE ST0, ST0");
+            x87_do_pop(dyn, ninst, x3);
+            break;
+        case 0xD9:
+        case 0xDA:
+        case 0xDB:
+        case 0xDC:
+        case 0xDD:
+        case 0xDE:
+        case 0xDF:
+            INST_NAME("FSTPNCE ST0, STx");
+            // copy the cache value for st0 to stx
+            x87_get_st_empty(dyn, ninst, x1, x2, nextop&7, X87_ST(nextop&7));
+            x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+            x87_swapreg(dyn, ninst, x1, x2, 0, nextop&7);
+            x87_do_pop(dyn, ninst, x3);
+            break;
+        case 0xE0:
+            INST_NAME("FCHS");
+            v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+            if(ST_IS_F(0)) {
+                FNEGS(v1, v1);
+            } else {
+                FNEGD(v1, v1);
+            }
+            break;
+        case 0xE1:
+            INST_NAME("FABS");
+            v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+            if(ST_IS_F(0)) {
+                FABSS(v1, v1);
+            } else {
+                FABSD(v1, v1);
+            }
+            break;
+
+        case 0xE4:
+            INST_NAME("FTST");
+            DEFAULT
+            break;
+        case 0xE5:
+            INST_NAME("FXAM");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_refresh(dyn, ninst, x1, x2, 0);
+            CALL(fpu_fxam, -1);  // should be possible inline, but is it worth it?
+            break;
+
+        case 0xE8:
+            INST_NAME("FLD1");
+            v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_F);
+            if(ST_IS_F(0)) {
+                MOV32w(x1, 0x3f800000);
+                FMVWX(v1, x1);
+            } else {
+                MOV64x(x1, 0x3FF0000000000000);
+                FMVDX(v1, x1);
+            }
+            break;
+        case 0xE9:
+            INST_NAME("FLDL2T");
+            v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D);
+            FTABLE64(v1, L2T);
+            break;
+        case 0xEA:     
+            INST_NAME("FLDL2E");
+            v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D);
+            FTABLE64(v1, L2E);
+            break;
+        case 0xEB:
+            INST_NAME("FLDPI");
+            v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D);
+            FTABLE64(v1, PI);
+            break;
+        case 0xEC:
+            INST_NAME("FLDLG2");
+            v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D);
+            FTABLE64(v1, LG2);
+            break;
+        case 0xED:
+            INST_NAME("FLDLN2");
+            v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D);
+            FTABLE64(v1, LN2);
+            break;
+        case 0xEE:
+            INST_NAME("FLDZ");
+            v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_F);
+            if(ST_IS_F(0)) {
+                FMVWX(v1, xZR);
+            } else {
+                FMVDX(v1, xZR);
+            }
+            break;
+
+        case 0xF0:
+            INST_NAME("F2XM1");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_forget(dyn, ninst, x1, x2, 0);
+            CALL(native_f2xm1, -1);
+            break;
+        case 0xF1:
+            INST_NAME("FYL2X");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_forget(dyn, ninst, x1, x2, 0);
+            x87_forget(dyn, ninst, x1, x2, 1);
+            CALL(native_fyl2x, -1);
+            x87_do_pop(dyn, ninst, x3);
+            break;
+        case 0xF2:
+            INST_NAME("FPTAN");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_forget(dyn, ninst, x1, x2, 0);
+            CALL(native_ftan, -1);
+            v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_F);
+            if(ST_IS_F(0)) {
+                MOV32w(x1, 0x3f800000);
+                FMVWX(v1, x1);
+            } else {
+                MOV64x(x1, 0x3FF0000000000000);
+                FMVDX(v1, x1);
+            }
+            break;
+        case 0xF3:
+            INST_NAME("FPATAN");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_forget(dyn, ninst, x1, x2, 0);
+            x87_forget(dyn, ninst, x1, x2, 1);
+            CALL(native_fpatan, -1);
+            x87_do_pop(dyn, ninst, x3);
+            break;
+        case 0xF4:
+            INST_NAME("FXTRACT");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_do_push_empty(dyn, ninst, 0);
+            x87_forget(dyn, ninst, x1, x2, 1);
+            CALL(native_fxtract, -1);
+            break;
+        case 0xF5:
+            INST_NAME("FPREM1");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_forget(dyn, ninst, x1, x2, 0);
+            x87_forget(dyn, ninst, x1, x2, 1);
+            CALL(native_fprem1, -1);
+            break;
+        case 0xF6:
+            INST_NAME("FDECSTP");
+            fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+            LW(x2, xEmu, offsetof(x64emu_t, top));
+            ADDI(x2, x2, -1);
+            ANDI(x2, x2, 7);
+            SW(x2, xEmu, offsetof(x64emu_t, top));
+            break;
+        case 0xF7:
+            INST_NAME("FINCSTP");
+            fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+            LW(x2, xEmu, offsetof(x64emu_t, top));
+            ADDI(x2, x2, 1);
+            ANDI(x2, x2, 7);
+            SW(x2, xEmu, offsetof(x64emu_t, top));
+            break;
+        case 0xF8:
+            INST_NAME("FPREM");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_forget(dyn, ninst, x1, x2, 0);
+            x87_forget(dyn, ninst, x1, x2, 1);
+            CALL(native_fprem, -1);
+            break;
+        case 0xF9:
+            INST_NAME("FYL2XP1");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_forget(dyn, ninst, x1, x2, 0);
+            x87_forget(dyn, ninst, x1, x2, 1);
+            CALL(native_fyl2xp1, -1);
+            x87_do_pop(dyn, ninst, x3);
+            break;
+        case 0xFA:
+            INST_NAME("FSQRT");
+            DEFAULT;
+            break;
+        case 0xFB:
+            INST_NAME("FSINCOS");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_do_push_empty(dyn, ninst, 0);
+            x87_forget(dyn, ninst, x1, x2, 1);
+            CALL(native_fsincos, -1);
+            break;
+        case 0xFC:
+            INST_NAME("FRNDINT");
+            DEFAULT;
+            break;
+        case 0xFD:
+            INST_NAME("FSCALE");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_forget(dyn, ninst, x1, x2, 0);
+            x87_forget(dyn, ninst, x1, x2, 1);
+            CALL(native_fscale, -1);
+            break;
+        case 0xFE:
+            INST_NAME("FSIN");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_forget(dyn, ninst, x1, x2, 0);
+            CALL(native_fsin, -1);
+            break;
+        case 0xFF:
+            INST_NAME("FCOS");
+            MESSAGE(LOG_DUMP, "Need Optimization\n");
+            x87_forget(dyn, ninst, x1, x2, 0);
+            CALL(native_fcos, -1);
+            break;
+
+
+        case 0xD1:
+        case 0xD4:
+        case 0xD5:
+        case 0xD6:
+        case 0xD7:
+        case 0xE2:
+        case 0xE3:
+        case 0xE6:
+        case 0xE7:
+        case 0xEF:
+            DEFAULT;
+            break;
+             
+        default:
+            switch((nextop>>3)&7) {
+                case 0:
+                    INST_NAME("FLD ST0, float[ED]");
+                    v1 = x87_do_push(dyn, ninst, x1, box64_dynarec_x87double?EXT_CACHE_ST_D:EXT_CACHE_ST_F);
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                    FLW(v1, ed, fixedaddress);
+                    if(!ST_IS_F(0)) {
+                        FCVTDS(v1, v1);
+                    }
+                    break;
+                case 2:
+                    INST_NAME("FST float[ED], ST0");
+                    v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_F);
+                    if(ST_IS_F(0))
+                        s0 = v1;
+                    else {
+                        s0 = fpu_get_scratch(dyn);
+                        FCVTSD(s0, v1);
+                    }
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                    FSW(s0, ed, fixedaddress);
+                    break;
+                case 3:
+                    INST_NAME("FSTP float[ED], ST0");
+                    v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_F);
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                    if(!ST_IS_F(0)) {
+                        FCVTSD(v1, v1);
+                    }
+                    FSW(v1, ed, fixedaddress);
+                    x87_do_pop(dyn, ninst, x3);
+                    break;
+                case 4:
+                    INST_NAME("FLDENV Ed");
+                    MESSAGE(LOG_DUMP, "Need Optimization\n");
+                    fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE?
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
+                    if(ed!=x1) {
+                        MV(x1, ed);
+                    }
+                    MOV32w(x2, 0);
+                    CALL(fpu_loadenv, -1);
+                    break;
+                case 5:
+                    INST_NAME("FLDCW Ew");
+                    GETEW(x1, 0);
+                    SH(x1, xEmu, offsetof(x64emu_t, cw));    // hopefully cw is not too far for an imm8
+                    break;
+                case 6:
+                    INST_NAME("FNSTENV Ed");
+                    MESSAGE(LOG_DUMP, "Need Optimization\n");
+                    fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE?
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
+                    if(ed!=x1) {
+                        MV(x1, ed);
+                    }
+                    MOV32w(x2, 0);
+                    CALL(fpu_savenv, -1);
+                    break;
+                case 7:
+                    INST_NAME("FNSTCW Ew");
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, x1, &fixedaddress, rex, NULL, 0, 0);
+                    ed = x1;
+                    wb1 = 1;
+                    LH(x1, xEmu, offsetof(x64emu_t, cw));
+                    EWBACK;
+                    break;
+                default:
+                    DEFAULT;
+            }
+    }
+    return addr;
+}
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c
index 2c3de1b5..8994e0e5 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.c
+++ b/src/dynarec/rv64/dynarec_rv64_functions.c
@@ -29,12 +29,406 @@
 #include "bridge.h"
 #include "rv64_lock.h"
 
+#define XMM0    0
+#define X870    XMM0+16
+#define EMM0    XMM0+16
 
+#define SCRATCH0    0
+
+// Get a FPU scratch reg
+int fpu_get_scratch(dynarec_rv64_t* dyn)
+{
+    return SCRATCH0 + dyn->e.fpu_scratch++;  // return an Sx
+}
+// Reset scratch regs counter
 void fpu_reset_scratch(dynarec_rv64_t* dyn)
 {
-    //TODO
+    dyn->e.fpu_scratch = 0;
+}
+// Get a x87 double reg
+int fpu_get_reg_x87(dynarec_rv64_t* dyn, int t, int n)
+{
+    int i=X870;
+    while (dyn->e.fpuused[i]) ++i;
+    dyn->e.fpuused[i] = 1;
+    dyn->e.extcache[i].n = n;
+    dyn->e.extcache[i].t = t;
+    dyn->e.news |= (1<<i);
+    return EXTREG(i); // return a Dx
+}
+// Free a FPU double reg
+void fpu_free_reg(dynarec_rv64_t* dyn, int reg)
+{
+    int idx = EXTIDX(reg);
+    // TODO: check upper limit?
+    dyn->e.fpuused[idx] = 0;
+    if(dyn->e.extcache[idx].t!=EXT_CACHE_ST_F && dyn->e.extcache[idx].t!=EXT_CACHE_ST_D)
+        dyn->e.extcache[idx].v = 0;
+}
+// Get an MMX double reg
+int fpu_get_reg_emm(dynarec_rv64_t* dyn, int emm)
+{
+    dyn->e.fpuused[EMM0 + emm] = 1;
+    dyn->e.extcache[EMM0 + emm].t = EXT_CACHE_MM;
+    dyn->e.extcache[EMM0 + emm].n = emm;
+    dyn->e.news |= (1<<(EMM0 + emm));
+    return EXTREG(EMM0 + emm);
+}
+// Get an XMM quad reg
+int fpu_get_reg_xmm(dynarec_rv64_t* dyn, int t, int xmm)
+{
+    int i = XMM0+xmm;
+    dyn->e.fpuused[i] = 1;
+    dyn->e.extcache[i].t = t;
+    dyn->e.extcache[i].n = xmm;
+    dyn->e.news |= (1<<i);
+    return EXTREG(i);
+}
+// Reset fpu regs counter
+void fpu_reset_reg(dynarec_rv64_t* dyn)
+{
+    dyn->e.fpu_reg = 0;
+    for (int i=0; i<24; ++i) {
+        dyn->e.fpuused[i]=0;
+        dyn->e.extcache[i].v = 0;
+    }
+}
+
+int extcache_get_st(dynarec_rv64_t* dyn, int ninst, int a)
+{
+    if (dyn->insts[ninst].e.swapped) {
+        if(dyn->insts[ninst].e.combined1 == a)
+            a = dyn->insts[ninst].e.combined2;
+        else if(dyn->insts[ninst].e.combined2 == a)
+            a = dyn->insts[ninst].e.combined1;
+    }
+    for(int i=0; i<24; ++i)
+        if((dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_F
+         || dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_D)
+         && dyn->insts[ninst].e.extcache[i].n==a)
+            return dyn->insts[ninst].e.extcache[i].t;
+    // not in the cache yet, so will be fetched...
+    return EXT_CACHE_ST_D;
+}
+
+int extcache_get_current_st(dynarec_rv64_t* dyn, int ninst, int a)
+{
+    (void)ninst;
+    if(!dyn->insts)
+        return EXT_CACHE_ST_D;
+    for(int i=0; i<24; ++i)
+        if((dyn->e.extcache[i].t==EXT_CACHE_ST_F
+         || dyn->e.extcache[i].t==EXT_CACHE_ST_D)
+         && dyn->e.extcache[i].n==a)
+            return dyn->e.extcache[i].t;
+    // not in the cache yet, so will be fetched...
+    return EXT_CACHE_ST_D;
+}
+
+int extcache_get_st_f(dynarec_rv64_t* dyn, int ninst, int a)
+{
+    for(int i=0; i<24; ++i)
+        if(dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_F
+         && dyn->insts[ninst].e.extcache[i].n==a)
+            return i;
+    return -1;
+} 
+int extcache_get_st_f_noback(dynarec_rv64_t* dyn, int ninst, int a)
+{
+    for(int i=0; i<24; ++i)
+        if(dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_F
+         && dyn->insts[ninst].e.extcache[i].n==a)
+            return i;
+    return -1;
+} 
+int extcache_get_current_st_f(dynarec_rv64_t* dyn, int a)
+{
+    for(int i=0; i<24; ++i)
+        if(dyn->e.extcache[i].t==EXT_CACHE_ST_F
+         && dyn->e.extcache[i].n==a)
+            return i;
+    return -1;
+} 
+
+static void extcache_promote_double_forward(dynarec_rv64_t* dyn, int ninst, int maxinst, int a);
+static void extcache_promote_double_internal(dynarec_rv64_t* dyn, int ninst, int maxinst, int a);
+static void extcache_promote_double_combined(dynarec_rv64_t* dyn, int ninst, int maxinst, int a)
+{
+    if(a == dyn->insts[ninst].e.combined1 || a == dyn->insts[ninst].e.combined2) {
+        if(a == dyn->insts[ninst].e.combined1) {
+            a = dyn->insts[ninst].e.combined2;
+        } else 
+            a = dyn->insts[ninst].e.combined1;
+        int i = extcache_get_st_f_noback(dyn, ninst, a);
+        //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_combined, ninst=%d combined%c %d i=%d (stack:%d/%d)\n", ninst, (a == dyn->insts[ninst].e.combined2)?'2':'1', a ,i, dyn->insts[ninst].e.stack_push, -dyn->insts[ninst].e.stack_pop);
+        if(i>=0) {
+            dyn->insts[ninst].e.extcache[i].t = EXT_CACHE_ST_D;
+            if(!dyn->insts[ninst].e.barrier)
+                extcache_promote_double_internal(dyn, ninst-1, maxinst, a-dyn->insts[ninst].e.stack_push);
+            // go forward if combined is not pop'd
+            if(a-dyn->insts[ninst].e.stack_pop>=0)
+                if(!dyn->insts[ninst+1].e.barrier)
+                    extcache_promote_double_forward(dyn, ninst+1, maxinst, a-dyn->insts[ninst].e.stack_pop);
+        }
+    }
+}
+static void extcache_promote_double_internal(dynarec_rv64_t* dyn, int ninst, int maxinst, int a)
+{
+    if(dyn->insts[ninst+1].e.barrier)
+        return;
+    while(ninst>=0) {
+        a+=dyn->insts[ninst].e.stack_pop;    // adjust Stack depth: add pop'd ST (going backward)
+        int i = extcache_get_st_f(dyn, ninst, a);
+        //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_internal, ninst=%d, a=%d st=%d:%d, i=%d\n", ninst, a, dyn->insts[ninst].e.stack, dyn->insts[ninst].e.stack_next, i);
+        if(i<0) return;
+        dyn->insts[ninst].e.extcache[i].t = EXT_CACHE_ST_D;
+        // check combined propagation too
+        if(dyn->insts[ninst].e.combined1 || dyn->insts[ninst].e.combined2) {
+            if(dyn->insts[ninst].e.swapped) {
+                //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_internal, ninst=%d swapped %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack);
+                if (a==dyn->insts[ninst].e.combined1)
+                    a = dyn->insts[ninst].e.combined2;
+                else if (a==dyn->insts[ninst].e.combined2)
+                    a = dyn->insts[ninst].e.combined1;
+            } else {
+                //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_internal, ninst=%d combined %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack);
+                extcache_promote_double_combined(dyn, ninst, maxinst, a);
+            }
+        }
+        a-=dyn->insts[ninst].e.stack_push;  // adjust Stack depth: remove push'd ST (going backward)
+        --ninst;
+        if(ninst<0 || a<0 || dyn->insts[ninst].e.barrier)
+            return;
+    }
+}
+
+static void extcache_promote_double_forward(dynarec_rv64_t* dyn, int ninst, int maxinst, int a)
+{
+    while((ninst!=-1) && (ninst<maxinst) && (a>=0)) {
+        a+=dyn->insts[ninst].e.stack_push;  // adjust Stack depth: add push'd ST (going forward)
+        if((dyn->insts[ninst].e.combined1 || dyn->insts[ninst].e.combined2) && dyn->insts[ninst].e.swapped) {
+            //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_forward, ninst=%d swapped %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack);
+            if (a==dyn->insts[ninst].e.combined1)
+                a = dyn->insts[ninst].e.combined2;
+            else if (a==dyn->insts[ninst].e.combined2)
+                a = dyn->insts[ninst].e.combined1;
+        }
+        int i = extcache_get_st_f_noback(dyn, ninst, a);
+        //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_forward, ninst=%d, a=%d st=%d:%d(%d/%d), i=%d\n", ninst, a, dyn->insts[ninst].e.stack, dyn->insts[ninst].e.stack_next, dyn->insts[ninst].e.stack_push, -dyn->insts[ninst].e.stack_pop, i);
+        if(i<0) return;
+        dyn->insts[ninst].e.extcache[i].t = EXT_CACHE_ST_D;
+        // check combined propagation too
+        if((dyn->insts[ninst].e.combined1 || dyn->insts[ninst].e.combined2) && !dyn->insts[ninst].e.swapped) {
+            //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_forward, ninst=%d combined %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack);
+            extcache_promote_double_combined(dyn, ninst, maxinst, a);
+        }
+        a-=dyn->insts[ninst].e.stack_pop;    // adjust Stack depth: remove pop'd ST (going forward)
+        if(dyn->insts[ninst].x64.has_next && !dyn->insts[ninst].e.barrier)
+            ++ninst;
+        else
+            ninst=-1;
+    }
+    if(ninst==maxinst)
+        extcache_promote_double(dyn, ninst, a);
+}
+
+void extcache_promote_double(dynarec_rv64_t* dyn, int ninst, int a)
+{
+    int i = extcache_get_current_st_f(dyn, a);
+    //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double, ninst=%d a=%d st=%d i=%d\n", ninst, a, dyn->e.stack, i);
+    if(i<0) return;
+    dyn->e.extcache[i].t = EXT_CACHE_ST_D;
+    dyn->insts[ninst].e.extcache[i].t = EXT_CACHE_ST_D;
+    // check combined propagation too
+    if(dyn->e.combined1 || dyn->e.combined2) {
+        if(dyn->e.swapped) {
+            //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double, ninst=%d swapped! %d/%d vs %d\n", ninst, dyn->e.combined1 ,dyn->e.combined2, a);
+            if(dyn->e.combined1 == a)
+                a = dyn->e.combined2;
+            else if(dyn->e.combined2 == a)
+                a = dyn->e.combined1;
+        } else {
+            //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double, ninst=%d combined! %d/%d vs %d\n", ninst, dyn->e.combined1 ,dyn->e.combined2, a);
+            if(dyn->e.combined1 == a)
+                extcache_promote_double(dyn, ninst, dyn->e.combined2);
+            else if(dyn->e.combined2 == a)
+                extcache_promote_double(dyn, ninst, dyn->e.combined1);
+        }
+    }
+    a-=dyn->insts[ninst].e.stack_push;  // adjust Stack depth: remove push'd ST (going backward)
+    if(!ninst || a<0) return;
+    extcache_promote_double_internal(dyn, ninst-1, ninst, a);
+}
+
+int extcache_combine_st(dynarec_rv64_t* dyn, int ninst, int a, int b)
+{
+    dyn->e.combined1=a;
+    dyn->e.combined2=b;
+    if( extcache_get_current_st(dyn, ninst, a)==EXT_CACHE_ST_F
+     && extcache_get_current_st(dyn, ninst, b)==EXT_CACHE_ST_F )
+        return EXT_CACHE_ST_F;
+    return EXT_CACHE_ST_D;
+}
+
+static int isCacheEmpty(dynarec_native_t* dyn, int ninst) {
+    if(dyn->insts[ninst].e.stack_next) {
+        return 0;
+    }
+    for(int i=0; i<24; ++i)
+        if(dyn->insts[ninst].e.extcache[i].v) {       // there is something at ninst for i
+            if(!(
+            (dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_F || dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_D)
+            && dyn->insts[ninst].e.extcache[i].n<dyn->insts[ninst].e.stack_pop))
+                return 0;
+        }
+    return 1;
+
+}
+
+int fpuCacheNeedsTransform(dynarec_rv64_t* dyn, int ninst) {
+    int i2 = dyn->insts[ninst].x64.jmp_insts;
+    if(i2<0)
+        return 1;
+    if((dyn->insts[i2].x64.barrier&BARRIER_FLOAT))
+        // if the barrier has already been applied, no transform needed
+        return ((dyn->insts[ninst].x64.barrier&BARRIER_FLOAT))?0:(isCacheEmpty(dyn, ninst)?0:1);
+    int ret = 0;
+    if(!i2) { // just purge
+        if(dyn->insts[ninst].e.stack_next) {
+            return 1;
+        }
+        for(int i=0; i<24 && !ret; ++i)
+            if(dyn->insts[ninst].e.extcache[i].v) {       // there is something at ninst for i
+                if(!(
+                (dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_F || dyn->insts[ninst].e.extcache[i].t==EXT_CACHE_ST_D)
+                && dyn->insts[ninst].e.extcache[i].n<dyn->insts[ninst].e.stack_pop))
+                    ret = 1;
+            }
+        return ret;
+    }
+    // Check if ninst can be compatible to i2
+    if(dyn->insts[ninst].e.stack_next != dyn->insts[i2].e.stack-dyn->insts[i2].e.stack_push) {
+        return 1;
+    }
+    extcache_t cache_i2 = dyn->insts[i2].e;
+    extcacheUnwind(&cache_i2);
+
+    for(int i=0; i<24; ++i) {
+        if(dyn->insts[ninst].e.extcache[i].v) {       // there is something at ninst for i
+            if(!cache_i2.extcache[i].v) {    // but there is nothing at i2 for i
+                ret = 1;
+            } else if(dyn->insts[ninst].e.extcache[i].v!=cache_i2.extcache[i].v) {  // there is something different
+                if(dyn->insts[ninst].e.extcache[i].n!=cache_i2.extcache[i].n) {   // not the same x64 reg
+                    ret = 1;
+                }
+                else if(dyn->insts[ninst].e.extcache[i].t == EXT_CACHE_SS && cache_i2.extcache[i].t == EXT_CACHE_SD)
+                    {/* nothing */ }
+                else
+                    ret = 1;
+            }
+        } else if(cache_i2.extcache[i].v)
+            ret = 1;
+    }
+    return ret;
+}
+
+void extcacheUnwind(extcache_t* cache)
+{
+    if(cache->swapped) {
+        // unswap
+        int a = -1; 
+        int b = -1;
+        for(int j=0; j<24 && ((a==-1) || (b==-1)); ++j)
+            if((cache->extcache[j].t == EXT_CACHE_ST_D || cache->extcache[j].t == EXT_CACHE_ST_F)) {
+                if(cache->extcache[j].n == cache->combined1)
+                    a = j;
+                else if(cache->extcache[j].n == cache->combined2)
+                    b = j;
+            }
+        if(a!=-1 && b!=-1) {
+            int tmp = cache->extcache[a].n;
+            cache->extcache[a].n = cache->extcache[b].n;
+            cache->extcache[b].n = tmp;
+        }
+        cache->swapped = 0;
+        cache->combined1 = cache->combined2 = 0;
+    }
+    if(cache->news) {
+        // remove the newly created extcache entries
+        for(int i=0; i<24; ++i)
+            if(cache->news&(1<<i))
+                cache->extcache[i].v = 0;
+        cache->news = 0;
+    }
+    if(cache->stack_push) {
+        // unpush
+        for(int j=0; j<24; ++j) {
+            if((cache->extcache[j].t == EXT_CACHE_ST_D || cache->extcache[j].t == EXT_CACHE_ST_F)) {
+                if(cache->extcache[j].n<cache->stack_push)
+                    cache->extcache[j].v = 0;
+                else
+                    cache->extcache[j].n-=cache->stack_push;
+            }
+        }
+        cache->x87stack-=cache->stack_push;
+        cache->stack-=cache->stack_push;
+        cache->stack_push = 0;
+    }
+    cache->x87stack+=cache->stack_pop;
+    cache->stack_next = cache->stack;
+    cache->stack_pop = 0;
+    cache->barrier = 0;
+    // And now, rebuild the x87cache info with extcache
+    cache->mmxcount = 0;
+    cache->fpu_scratch = 0;
+    cache->fpu_extra_qscratch = 0;
+    cache->fpu_reg = 0;
+    for(int i=0; i<8; ++i) {
+        cache->x87cache[i] = -1;
+        cache->mmxcache[i] = -1;
+        cache->x87reg[i] = 0;
+        cache->ssecache[i*2].v = -1;
+        cache->ssecache[i*2+1].v = -1;
+    }
+    int x87reg = 0;
+    for(int i=0; i<24; ++i) {
+        if(cache->extcache[i].v) {
+            cache->fpuused[i] = 1;
+            switch (cache->extcache[i].t) {
+                case EXT_CACHE_MM:
+                    cache->mmxcache[cache->extcache[i].n] = i;
+                    ++cache->mmxcount;
+                    ++cache->fpu_reg;
+                    break;
+                case EXT_CACHE_SS:
+                    cache->ssecache[cache->extcache[i].n].reg = i;
+                    cache->ssecache[cache->extcache[i].n].single = 1;
+                    ++cache->fpu_reg;
+                    break;
+                case EXT_CACHE_SD:
+                    cache->ssecache[cache->extcache[i].n].reg = i;
+                    cache->ssecache[cache->extcache[i].n].single = 0;
+                    ++cache->fpu_reg;
+                    break;
+                case EXT_CACHE_ST_F:
+                case EXT_CACHE_ST_D:
+                    cache->x87cache[x87reg] = cache->extcache[i].n;
+                    cache->x87reg[x87reg] = i;
+                    ++x87reg;
+                    ++cache->fpu_reg;
+                    break;
+                case EXT_CACHE_SCR:
+                    cache->fpuused[i] = 0;
+                    cache->extcache[i].v = 0;
+                    break;
+            }
+        } else {
+            cache->fpuused[i] = 0;
+        }
+    }
 }
 
+
 uint8_t extract_byte(uint32_t val, void* address){
     int idx = (((uintptr_t)address)&3)*8;
     return (val>>idx)&0xff;
@@ -80,4 +474,20 @@ int rv64_lock_cas_h(void* addr, uint16_t ref, uint16_t val)
     uint32_t* aligned = (uint32_t*)(((uintptr_t)addr)&~3);
     uint32_t tmp = *aligned;
     return rv64_lock_cas_d(aligned, tmp, insert_half(tmp, val, addr));
-}
\ No newline at end of file
+}
+
+
+const char* getCacheName(int t, int n)
+{
+    static char buff[20];
+    switch(t) {
+        case EXT_CACHE_ST_D: sprintf(buff, "ST%d", n); break;
+        case EXT_CACHE_ST_F: sprintf(buff, "st%d", n); break;
+        case EXT_CACHE_MM: sprintf(buff, "MM%d", n); break;
+        case EXT_CACHE_SS: sprintf(buff, "SS%d", n); break;
+        case EXT_CACHE_SD: sprintf(buff, "SD%d", n); break;
+        case EXT_CACHE_SCR: sprintf(buff, "Scratch"); break;
+        case EXT_CACHE_NONE: buff[0]='\0'; break;
+    }
+    return buff;
+}
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.h b/src/dynarec/rv64/dynarec_rv64_functions.h
index af55ad81..63640b0b 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.h
+++ b/src/dynarec/rv64/dynarec_rv64_functions.h
@@ -6,7 +6,41 @@
 typedef struct x64emu_s x64emu_t;
 typedef struct dynarec_rv64_s dynarec_rv64_t;
 
+// Get an FPU scratch reg
+int fpu_get_scratch(dynarec_rv64_t* dyn);
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_rv64_t* dyn);
+// Get an x87 double reg
+int fpu_get_reg_x87(dynarec_rv64_t* dyn, int t, int n);
+// Get an MMX double reg
+int fpu_get_reg_emm(dynarec_rv64_t* dyn, int emm);
+// Get an XMM quad reg
+int fpu_get_reg_xmm(dynarec_rv64_t* dyn, int t, int xmm);
+// Free a FPU/MMX/XMM reg
+void fpu_free_reg(dynarec_rv64_t* dyn, int reg);
+// Reset fpu regs counter
+void fpu_reset_reg(dynarec_rv64_t* dyn);
+
+// ---- Ext cache functions (RV64 FPU register cache)
+// Get type for STx
+int extcache_get_st(dynarec_rv64_t* dyn, int ninst, int a);
+// Get if STx is FLOAT or DOUBLE
+int extcache_get_st_f(dynarec_rv64_t* dyn, int ninst, int a);
+// Get actual type for STx
+int extcache_get_current_st(dynarec_rv64_t* dyn, int ninst, int a);
+// Get actual STx is FLOAT or DOUBLE
+int extcache_get_current_st_f(dynarec_rv64_t* dyn, int a);
+// Back-propagate a change float->double
+void extcache_promote_double(dynarec_rv64_t* dyn, int ninst, int a);
+// Combine and propagate if needed (pass 1 only)
+int extcache_combine_st(dynarec_rv64_t* dyn, int ninst, int a, int b);  // with stack current dyn->n_stack*
+
+// FPU Cache transformation (for loops) // Specific, needs to be written per backend
+int fpuCacheNeedsTransform(dynarec_rv64_t* dyn, int ninst);
+
+// Undo the changes of a extcache to get the status before the instruction
+void extcacheUnwind(extcache_t* cache);
+
+const char* getCacheName(int t, int n);
 
 #endif //__DYNAREC_RV64_FUNCTIONS_H__
\ No newline at end of file
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c
index 5b57d4dc..b049a93b 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.c
+++ b/src/dynarec/rv64/dynarec_rv64_helper.c
@@ -4,6 +4,7 @@
 #include <pthread.h>
 #include <errno.h>
 #include <assert.h>
+#include <string.h>
 
 #include "bitutils.h"
 #include "debug.h"
@@ -523,55 +524,1027 @@ void grab_segdata(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, int reg, int s
     MESSAGE(LOG_DUMP, "----%s Offset\n", (segment==_FS)?"FS":"GS");
 }
 
-void fpu_reset(dynarec_rv64_t* dyn)
+// x87 stuffs
+static void x87_reset(dynarec_rv64_t* dyn)
 {
-    //TODO
+    for (int i=0; i<8; ++i)
+        dyn->e.x87cache[i] = -1;
+    dyn->e.x87stack = 0;
+    dyn->e.stack = 0;
+    dyn->e.stack_next = 0;
+    dyn->e.stack_pop = 0;
+    dyn->e.stack_push = 0;
+    dyn->e.combined1 = dyn->e.combined2 = 0;
+    dyn->e.swapped = 0;
+    dyn->e.barrier = 0;
+    for(int i=0; i<24; ++i)
+        if(dyn->e.extcache[i].t == EXT_CACHE_ST_F || dyn->e.extcache[i].t == EXT_CACHE_ST_D)
+            dyn->e.extcache[i].v = 0;
 }
 
-void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n)
+void x87_stackcount(dynarec_rv64_t* dyn, int ninst, int scratch)
 {
-    //TODO
+    MAYUSE(scratch);
+    if(!dyn->e.x87stack)
+        return;
+    if(dyn->e.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, scratch);
+    MESSAGE(LOG_DUMP, "\tSynch x87 Stackcount (%d)\n", dyn->e.x87stack);
+    int a = dyn->e.x87stack;
+    // Add x87stack to emu fpu_stack
+    LW(scratch, xEmu, offsetof(x64emu_t, fpu_stack));
+    ADDI(scratch, scratch, a);
+    SW(scratch, xEmu, offsetof(x64emu_t, fpu_stack));
+    // Sub x87stack from top, then AND with 7
+    LW(scratch, xEmu, offsetof(x64emu_t, top));
+    ADDI(scratch, scratch, -a);
+    ANDI(scratch, scratch, 7);
+    SW(scratch, xEmu, offsetof(x64emu_t, top));
+    // reset x87stack, but not the stack count of extcache
+    dyn->e.x87stack = 0;
+    dyn->e.stack_next -= dyn->e.stack;
+    dyn->e.stack = 0;
+    MESSAGE(LOG_DUMP, "\t------x87 Stackcount\n");
 }
 
-void fpu_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, int s3)
+int extcache_st_coherency(dynarec_rv64_t* dyn, int ninst, int a, int b)
 {
-    //TODO
+    int i1 = extcache_get_st(dyn, ninst, a);
+    int i2 = extcache_get_st(dyn, ninst, b);
+    if(i1!=i2) {
+        MESSAGE(LOG_DUMP, "Warning, ST cache incoherent between ST%d(%d) and ST%d(%d)\n", a, i1, b, i2);
+    }
+
+    return i1;
 }
 
-// propagate ST stack state, especial stack pop that are defered
-void fpu_propagate_stack(dynarec_rv64_t* dyn, int ninst)
+// On step 1, Float/Double for ST is actually computed and back-propagated
+// On step 2-3, the value is just read from inst[...].e.extcache[..]
+// the reg returned is *2 for FLOAT
+int x87_do_push(dynarec_rv64_t* dyn, int ninst, int s1, int t)
+{
+    if(dyn->e.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, s1);
+    dyn->e.x87stack+=1;
+    dyn->e.stack+=1;
+    dyn->e.stack_next+=1;
+    dyn->e.stack_push+=1;
+    // move all regs in cache, and find a free one
+    for(int j=0; j<24; ++j)
+        if((dyn->e.extcache[j].t == EXT_CACHE_ST_D) || (dyn->e.extcache[j].t == EXT_CACHE_ST_F))
+            ++dyn->e.extcache[j].n;
+    int ret = -1;
+    for(int i=0; i<8; ++i)
+        if(dyn->e.x87cache[i]!=-1)
+            ++dyn->e.x87cache[i];
+        else if(ret==-1) {
+            dyn->e.x87cache[i] = 0;
+            ret=dyn->e.x87reg[i]=fpu_get_reg_x87(dyn, t, 0);
+            #if STEP == 1
+            // need to check if reg is compatible with float
+            if((ret>15) && (t == EXT_CACHE_ST_F))
+                dyn->e.extcache[ret].t = EXT_CACHE_ST_D;
+            #else
+            dyn->e.extcache[ret].t = X87_ST0;
+            #endif
+        }
+    return ret;
+}
+void x87_do_push_empty(dynarec_rv64_t* dyn, int ninst, int s1)
 {
-    //TODO
+    if(dyn->e.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, s1);
+    dyn->e.x87stack+=1;
+    dyn->e.stack+=1;
+    dyn->e.stack_next+=1;
+    dyn->e.stack_push+=1;
+    // move all regs in cache
+    for(int j=0; j<24; ++j)
+        if((dyn->e.extcache[j].t == EXT_CACHE_ST_D) || (dyn->e.extcache[j].t == EXT_CACHE_ST_F))
+            ++dyn->e.extcache[j].n;
+    for(int i=0; i<8; ++i)
+        if(dyn->e.x87cache[i]!=-1)
+            ++dyn->e.x87cache[i];
+    if(s1)
+        x87_stackcount(dyn, ninst, s1);
+}
+void x87_do_pop(dynarec_rv64_t* dyn, int ninst, int s1)
+{
+    if(dyn->e.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, s1);
+    dyn->e.x87stack-=1;
+    dyn->e.stack_next-=1;
+    dyn->e.stack_pop+=1;
+    // move all regs in cache, popping ST0
+    for(int i=0; i<8; ++i)
+        if(dyn->e.x87cache[i]!=-1) {
+            --dyn->e.x87cache[i];
+            if(dyn->e.x87cache[i]==-1) {
+                fpu_free_reg(dyn, dyn->e.x87reg[i]);
+                dyn->e.x87reg[i] = -1;
+            }
+        }
 }
 
+void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, int s3)
+{
+    int ret = 0;
+    for (int i=0; i<8 && !ret; ++i)
+        if(dyn->e.x87cache[i] != -1)
+            ret = 1;
+    if(!ret && !dyn->e.x87stack)    // nothing to do
+        return;
+    MESSAGE(LOG_DUMP, "\tPurge %sx87 Cache and Synch Stackcount (%+d)---\n", next?"locally ":"", dyn->e.x87stack);
+    int a = dyn->e.x87stack;
+    if(a!=0) {
+        // reset x87stack
+        if(!next)
+            dyn->e.x87stack = 0;
+        // Add x87stack to emu fpu_stack
+        LW(s2, xEmu, offsetof(x64emu_t, fpu_stack));
+        ADDI(s2, s2, a);
+        SW(s2, xEmu, offsetof(x64emu_t, fpu_stack));
+        // Sub x87stack from top, then AND with 7
+        LW(s2, xEmu, offsetof(x64emu_t, top));
+        // update tags (and top at the same time)
+        if(a>0) {
+            // new tag to fulls
+            ADDI(s3, xZR, 0);
+            for (int i=0; i<a; ++i) {
+                ADDI(s2, s2, -1);
+                ANDI(s2, s2, 7);    // (emu->top + st)&7
+                SLLI(s1, s2, 2);
+                ADD(s1, xEmu, s1);
+                SW(s3, s1, offsetof(x64emu_t, p_regs));
+            }
+        } else {
+            // empty tags
+            ADDI(s3, xZR, 0b11);
+            for (int i=0; i<-a; ++i) {
+                SLLI(s1, s2, 2);
+                ADD(s1, xEmu, s1);
+                SW(s3, s1, offsetof(x64emu_t, p_regs));
+                ADDI(s2, s2, 1);
+                ANDI(s2, s2, 7);    // (emu->top + st)&7
+            }
+        }
+        SW(s2, xEmu, offsetof(x64emu_t, top));
+    } else {
+        LW(s2, xEmu, offsetof(x64emu_t, top));
+    }
+    if(ret!=0) {
+        // --- set values
+        // Get top
+        // loop all cache entries
+        for (int i=0; i<8; ++i)
+            if(dyn->e.x87cache[i]!=-1) {
+                #if STEP == 1
+                if(!next) {   // don't force promotion here
+                    // pre-apply pop, because purge happens in-between
+                    extcache_promote_double(dyn, ninst, dyn->e.x87cache[i]+dyn->e.stack_pop);
+                }
+                #endif
+                #if STEP == 3
+                if(!next && extcache_get_st_f(dyn, ninst, dyn->e.x87cache[i])>=0) {
+                    MESSAGE(LOG_DUMP, "Warning, incoherency with purged ST%d cache\n", dyn->e.x87cache[i]);
+                }
+                #endif
+                ADDI(s3, s2, dyn->e.x87cache[i]);
+                ANDI(s3, s3, 7);   // (emu->top + st)&7
+                SLLI(s1, s3, 3);
+                ADD(s1, xEmu, s1);
+                if(next) {
+                    // need to check if a ST_F need local promotion
+                    if(extcache_get_st_f(dyn, ninst, dyn->e.x87cache[i])>=0) {
+                        FCVTDS(0, dyn->e.x87reg[i]);
+                        FSD(0, s1, offsetof(x64emu_t, x87));    // save the value
+                    } else {
+                        FSD(dyn->e.x87reg[i], s1, offsetof(x64emu_t, x87));    // save the value
+                    }
+                } else {
+                    FSD(dyn->e.x87reg[i], s1, offsetof(x64emu_t, x87));
+                    fpu_free_reg(dyn, dyn->e.x87reg[i]);
+                    dyn->e.x87reg[i] = -1;
+                    dyn->e.x87cache[i] = -1;
+                    //dyn->e.stack_pop+=1; //no pop, but the purge because of barrier will have the e.barrier flags set
+                }
+            }
+    }
+    if(!next) {
+        dyn->e.stack_next = 0;
+        #if STEP < 2
+        // refresh the cached valued, in case it's a purge outside a instruction
+        dyn->insts[ninst].e.barrier = 1;
+        #endif
+    }
+    MESSAGE(LOG_DUMP, "\t---Purge x87 Cache and Synch Stackcount\n");
+}
+
+#ifdef HAVE_TRACE
+static void x87_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3)
+{
+    x87_stackcount(dyn, ninst, s1);
+    int ret = 0;
+    for (int i=0; (i<8) && (!ret); ++i)
+        if(dyn->e.x87cache[i] != -1)
+            ret = 1;
+    if(!ret)    // nothing to do
+        return;
+    // prepare offset to fpu => s1
+    // Get top
+    LW(s2, xEmu, offsetof(x64emu_t, top));
+    // loop all cache entries
+    for (int i=0; i<8; ++i)
+        if(dyn->e.x87cache[i]!=-1) {
+            ADDI(s3, s2, dyn->e.x87cache[i]);
+            ANDI(s3, s3, 7);   // (emu->top + i)&7
+            SLLI(s1, s3, 3);
+            ADD(s1, xEmu, s1);
+            FSD(dyn->e.x87reg[i], s1, offsetof(x64emu_t, x87));
+        }
+}
+#endif
+
+int x87_get_current_cache(dynarec_rv64_t* dyn, int ninst, int st, int t)
+{
+    // search in cache first
+    for (int i=0; i<8; ++i) {
+        if(dyn->e.x87cache[i]==st) {
+            #if STEP == 1
+            if(t==EXT_CACHE_ST_D && (dyn->e.extcache[dyn->e.x87reg[i]].t==EXT_CACHE_ST_F))
+                extcache_promote_double(dyn, ninst, st);
+            #endif
+            return i;
+        }
+        assert(dyn->e.x87cache[i]<8);
+    }
+    return -1;
+}
+
+int x87_get_cache(dynarec_rv64_t* dyn, int ninst, int populate, int s1, int s2, int st, int t)
+{
+    if(dyn->e.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, s1);
+    int ret = x87_get_current_cache(dyn, ninst, st, t);
+    if(ret!=-1)
+        return ret;
+    MESSAGE(LOG_DUMP, "\tCreate %sx87 Cache for ST%d\n", populate?"and populate ":"", st);
+    // get a free spot
+    for (int i=0; (i<8) && (ret==-1); ++i)
+        if(dyn->e.x87cache[i]==-1)
+            ret = i;
+    // found, setup and grab the value
+    dyn->e.x87cache[ret] = st;
+    dyn->e.x87reg[ret] = fpu_get_reg_x87(dyn, EXT_CACHE_ST_D, st);
+    if(populate) {
+        LW(s2, xEmu, offsetof(x64emu_t, top));
+        int a = st - dyn->e.x87stack;
+        if(a) {
+            ADDI(s2, s2, a);
+            ANDI(s2, s2, 7);
+        }
+        ADD(s1, xEmu, s2);
+        FLD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87));
+    }
+    MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st);
+
+    return ret;
+}
+int x87_get_extcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)
+{
+    for(int ii=0; ii<24; ++ii)
+        if((dyn->e.extcache[ii].t == EXT_CACHE_ST_F || dyn->e.extcache[ii].t == EXT_CACHE_ST_D)
+         && dyn->e.extcache[ii].n==st)
+            return ii;
+    assert(0);
+    return -1;
+}
+int x87_get_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t)
+{
+    return dyn->e.x87reg[x87_get_cache(dyn, ninst, 1, s1, s2, a, t)];
+}
+int x87_get_st_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t)
+{
+    return dyn->e.x87reg[x87_get_cache(dyn, ninst, 0, s1, s2, a, t)];
+}
+
+
+void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)
+{
+    x87_stackcount(dyn, ninst, s1);
+    int ret = -1;
+    for (int i=0; (i<8) && (ret==-1); ++i)
+        if(dyn->e.x87cache[i] == st)
+            ret = i;
+    if(ret==-1)    // nothing to do
+        return;
+    MESSAGE(LOG_DUMP, "\tRefresh x87 Cache for ST%d\n", st);
+    // prepare offset to fpu => s1
+    // Get top
+    LW(s2, xEmu, offsetof(x64emu_t, top));
+    // Update
+    if(st) {
+        ADDI(s2, s2, st);
+        ANDI(s2, s2, 7);    // (emu->top + i)&7
+    }
+    ADD(s1, xEmu, s2);
+    if(dyn->e.extcache[dyn->e.x87reg[ret]].t==EXT_CACHE_ST_F) {
+        FCVTDS(0, dyn->e.x87reg[ret]);
+        FSD(31, s1, offsetof(x64emu_t, x87));
+    } else {
+        FSD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87));
+    }
+    MESSAGE(LOG_DUMP, "\t--------x87 Cache for ST%d\n", st);
+}
+
+void x87_forget(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)
+{
+    x87_stackcount(dyn, ninst, s1);
+    int ret = -1;
+    for (int i=0; (i<8) && (ret==-1); ++i)
+        if(dyn->e.x87cache[i] == st)
+            ret = i;
+    if(ret==-1)    // nothing to do
+        return;
+    MESSAGE(LOG_DUMP, "\tForget x87 Cache for ST%d\n", st);
+    #if STEP == 1
+    if(dyn->e.extcache[dyn->e.x87reg[ret]].t==EXT_CACHE_ST_F)
+        extcache_promote_double(dyn, ninst, st);
+    #endif
+    // prepare offset to fpu => s1
+    // Get top
+    LW(s2, xEmu, offsetof(x64emu_t, top));
+    // Update
+    if(st) {
+        ADDI(s2, s2, st);
+        ANDI(s2, s2, 7);    // (emu->top + i)&7
+    }
+    ADD(s1, xEmu, s2);
+    FSD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87));
+    MESSAGE(LOG_DUMP, "\t--------x87 Cache for ST%d\n", st);
+    // and forget that cache
+    fpu_free_reg(dyn, dyn->e.x87reg[ret]);
+    dyn->e.extcache[dyn->e.x87reg[ret]].v = 0;
+    dyn->e.x87cache[ret] = -1;
+    dyn->e.x87reg[ret] = -1;
+}
+
+void x87_reget_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st)
+{
+    if(dyn->e.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, s1);
+    // search in cache first
+    for (int i=0; i<8; ++i)
+        if(dyn->e.x87cache[i]==st) {
+            // refresh the value
+            MESSAGE(LOG_DUMP, "\tRefresh x87 Cache for ST%d\n", st);
+            #if STEP == 1
+            if(dyn->e.extcache[dyn->e.x87reg[i]].t==EXT_CACHE_ST_F)
+                extcache_promote_double(dyn, ninst, st);
+            #endif
+            LW(s2, xEmu, offsetof(x64emu_t, top));
+            int a = st - dyn->e.x87stack;
+            ADDI(s2, s2, a);
+            AND(s2, s2, 7);
+            SLLI(s2, s2, 3);
+            ADD(s1, xEmu, s2);
+            FLD(dyn->e.x87reg[i], s1, offsetof(x64emu_t, x87));
+            MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st);
+            // ok
+            return;
+        }
+    // Was not in the cache? creating it....
+    MESSAGE(LOG_DUMP, "\tCreate x87 Cache for ST%d\n", st);
+    // get a free spot
+    int ret = -1;
+    for (int i=0; (i<8) && (ret==-1); ++i)
+        if(dyn->e.x87cache[i]==-1)
+            ret = i;
+    // found, setup and grab the value
+    dyn->e.x87cache[ret] = st;
+    dyn->e.x87reg[ret] = fpu_get_reg_x87(dyn, EXT_CACHE_ST_D, st);
+    LW(s2, xEmu, offsetof(x64emu_t, top));
+    int a = st - dyn->e.x87stack;
+    ADDI(s2, s2, a);
+    ANDI(s2, s2, 7);    // (emu->top + i)&7
+    SLLI(s2, s2, 3);
+    ADD(s1, xEmu, s2);
+    FLD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87));
+    MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st);
+}
+
+void x87_swapreg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int b)
+{
+    int i1, i2, i3;
+    i1 = x87_get_cache(dyn, ninst, 1, s1, s2, b, X87_ST(b));
+    i2 = x87_get_cache(dyn, ninst, 1, s1, s2, a, X87_ST(a));
+    i3 = dyn->e.x87cache[i1];
+    dyn->e.x87cache[i1] = dyn->e.x87cache[i2];
+    dyn->e.x87cache[i2] = i3;
+    // swap those too
+    int j1, j2, j3;
+    j1 = x87_get_extcache(dyn, ninst, s1, s2, b);
+    j2 = x87_get_extcache(dyn, ninst, s1, s2, a);
+    j3 = dyn->e.extcache[j1].n;
+    dyn->e.extcache[j1].n = dyn->e.extcache[j2].n;
+    dyn->e.extcache[j2].n = j3;
+    // mark as swapped
+    dyn->e.swapped = 1;
+    dyn->e.combined1= a; dyn->e.combined2=b;
+}
+
+// Set rounding according to cw flags, return reg to restore flags
+int x87_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3)
+{
+    MAYUSE(dyn); MAYUSE(ninst);
+    MAYUSE(s1); MAYUSE(s2);
+    LW(s1, xEmu, offsetof(x64emu_t, cw));
+    SRLI(s1, s1, 10);
+    ANDI(s1, s1, 0b11);
+    // MMX/x87 Round mode: 0..3: Nearest, Down, Up, Chop
+    // RV64: 0..7: Nearest, Toward Zero (Chop), Down, Up, Nearest tie to Max, invalid, invalid, dynamic (invalid here)
+    // 0->0, 1->2, 2->3, 3->1
+    SLLI(s1, s1, 1);
+    ADDI(s2, xZR, 3);
+    BGE(s1, s2, 4+8);
+    ADDI(s1, s1, -4);
+    XORI(s3, s1, 0b11);
+    // transform done (is there a faster way?)
+    FSRM(s3);               // exchange RM with current
+    return s3;
+}
+
+// Set rounding according to mxcsr flags, return reg to restore flags
+int sse_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3)
+{
+    MAYUSE(dyn); MAYUSE(ninst);
+    MAYUSE(s1); MAYUSE(s2);
+    LW(s1, xEmu, offsetof(x64emu_t, mxcsr));
+    SRLI(s1, s1, 13);
+    ANDI(s1, s1, 0b11);
+    // MMX/x87 Round mode: 0..3: Nearest, Down, Up, Chop
+    // RV64: 0..7: Nearest, Toward Zero (Chop), Down, Up, Nearest tie to Max, invalid, invalid, dynamic (invalid here)
+    // 0->0, 1->2, 2->3, 3->1
+    SLLI(s1, s1, 1);
+    ADDI(s2, xZR, 3);
+    BGE(s1, s2, 4+8);
+    ADDI(s1, s1, -4);
+    XORI(s3, s1, 0b11);
+    // transform done (is there a faster way?)
+    FSRM(s3);               // exchange RM with current
+    return s3;
+}
+
+// Restore round flag, destroy s1 doing so
+void x87_restoreround(dynarec_rv64_t* dyn, int ninst, int s1)
+{
+    MAYUSE(dyn); MAYUSE(ninst);
+    MAYUSE(s1);
+    FSRM(s1);               // put back fpscr
+}
+
+// MMX helpers
+static void mmx_reset(dynarec_rv64_t* dyn)
+{
+    dyn->e.mmxcount = 0;
+    for (int i=0; i<8; ++i)
+        dyn->e.mmxcache[i] = -1;
+}
+static int isx87Empty(dynarec_rv64_t* dyn)
+{
+    for (int i=0; i<8; ++i)
+        if(dyn->e.x87cache[i] != -1)
+            return 0;
+    return 1;
+}
+
+// get ext register for a MMX reg, create the entry if needed
+int mmx_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a)
+{
+    if(!dyn->e.x87stack && isx87Empty(dyn))
+        x87_purgecache(dyn, ninst, 0, s1, s2, s3);
+    if(dyn->e.mmxcache[a]!=-1)
+        return dyn->e.mmxcache[a];
+    ++dyn->e.mmxcount;
+    int ret = dyn->e.mmxcache[a] = fpu_get_reg_emm(dyn, a);
+    FLD(ret, xEmu, offsetof(x64emu_t, mmx[a]));
+    return ret;
+}
+// get ext register for a MMX reg, but don't try to synch it if it needs to be created
+int mmx_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a)
+{
+    if(!dyn->e.x87stack && isx87Empty(dyn))
+        x87_purgecache(dyn, ninst, 0, s1, s2, s3);
+    if(dyn->e.mmxcache[a]!=-1)
+        return dyn->e.mmxcache[a];
+    ++dyn->e.mmxcount;
+    int ret = dyn->e.mmxcache[a] = fpu_get_reg_emm(dyn, a);
+    return ret;
+}
+// purge the MMX cache only (needs 1 scratch register)
 void mmx_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1)
 {
-    // TODO
+    if(!dyn->e.mmxcount)
+        return;
+    if(!next)
+        dyn->e.mmxcount = 0;
+    int old = -1;
+    for (int i=0; i<8; ++i)
+        if(dyn->e.mmxcache[i]!=-1) {
+            if (old==-1) {
+                MESSAGE(LOG_DUMP, "\tPurge %sMMX Cache ------\n", next?"locally ":"");
+                ++old;
+            }
+            FSD(dyn->e.mmxcache[i], xEmu, offsetof(x64emu_t, mmx[i]));
+            if(!next) {
+                fpu_free_reg(dyn, dyn->e.mmxcache[i]);
+                dyn->e.mmxcache[i] = -1;
+            }
+        }
+    if(old!=-1) {
+        MESSAGE(LOG_DUMP, "\t------ Purge MMX Cache\n");
+    }
+}
+#ifdef HAVE_TRACE
+static void mmx_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1)
+{
+    for (int i=0; i<8; ++i)
+        if(dyn->e.mmxcache[i]!=-1) {
+            FLD(dyn->e.mmxcache[i], xEmu, offsetof(x64emu_t, mmx[i]));
+        }
 }
+#endif
 
-void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, int s3)
+// SSE / SSE2 helpers
+static void sse_reset(dynarec_rv64_t* dyn)
 {
-    //TODO
+    for (int i=0; i<16; ++i)
+        dyn->e.ssecache[i].v = -1;
+}
+// get ext register for a SSE reg, create the entry if needed
+int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
+{
+    if(dyn->e.ssecache[a].v!=-1) {
+        // forget / reload if change of size
+        if(dyn->e.ssecache[a].single!=single) {
+            sse_forget_reg(dyn, ninst, a);
+            return sse_get_reg(dyn, ninst, s1, a, single);
+        }
+        return dyn->e.ssecache[a].reg;
+    }
+    dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, single?EXT_CACHE_SS:EXT_CACHE_SD, a);
+    int ret =  dyn->e.ssecache[a].reg;
+    dyn->e.ssecache[a].single = single;
+    if(dyn->e.ssecache[a].single)
+        FLW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
+    else
+        FLD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
+    return ret;
+}
+// get ext register for a SSE reg, but don't try to synch it if it needs to be created
+int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
+{
+    if(dyn->e.ssecache[a].v!=-1) {
+        dyn->e.ssecache[a].single = single;
+        dyn->e.extcache[dyn->e.ssecache[a].reg].t = single?EXT_CACHE_SS:EXT_CACHE_SD;
+        return dyn->e.ssecache[a].reg;
+    }
+    dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, single?EXT_CACHE_SS:EXT_CACHE_SD, a);
+    dyn->e.ssecache[a].single = 1; // it will be written to anyway...
+    return dyn->e.ssecache[a].reg;
+}
+// forget ext register for a SSE reg, writing it back to the emu if needed
+void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a)
+{
+    if(dyn->e.ssecache[a].v==-1)
+        return;
+    if(dyn->e.ssecache[a].single)
+        FSW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
+    else
+        FSD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
+    fpu_free_reg(dyn, dyn->e.ssecache[a].reg);
+    dyn->e.ssecache[a].v = -1;
+    return;
+}
+// purge the SSE cache for XMM0..XMM7 (to use before function native call)
+void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1)
+{
+    int old = -1;
+    for (int i=0; i<8; ++i)
+        if(dyn->e.ssecache[i].v!=-1) {
+            if (old==-1) {
+                MESSAGE(LOG_DUMP, "\tPurge XMM0..7 Cache ------\n");
+                ++old;
+            }
+            if(dyn->e.ssecache[i].single)
+                FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+            else
+                FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+            fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
+            dyn->e.ssecache[i].v = -1;
+        }
+    if(old!=-1) {
+        MESSAGE(LOG_DUMP, "\t------ Purge XMM0..7 Cache\n");
+    }
 }
 
+// purge the SSE cache only
+static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1)
+{
+    int old = -1;
+    for (int i=0; i<16; ++i)
+        if(dyn->e.ssecache[i].v!=-1) {
+            if (old==-1) {
+                MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":"");
+                ++old;
+            }
+            if(dyn->e.ssecache[i].single)
+                FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+            else
+                FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+            if(!next) {
+                fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
+                dyn->e.ssecache[i].v = -1;
+            }
+        }
+    if(old!=-1) {
+        MESSAGE(LOG_DUMP, "\t------ Purge SSE Cache\n");
+    }
+}
 #ifdef HAVE_TRACE
-void fpu_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3)
+static void sse_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1)
 {
-    //TODO
+    for (int i=0; i<16; ++i)
+        if(dyn->e.ssecache[i].v!=-1) {
+            if(dyn->e.ssecache[i].single)
+                FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+            else
+                FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+        }
 }
 #endif
 void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
 {
-    //TODO
+    int start = not07?8:0;
+    // only SSE regs need to be pushed back to xEmu (they need to be "written")
+    int n=0;
+    for (int i=start; i<16; i++)
+        if(dyn->e.ssecache[i].v!=-1)
+            ++n;
+    if(!n)
+        return;
+    MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n);
+    for (int i=start; i<16; ++i)
+        if(dyn->e.ssecache[i].v!=-1) {
+            if(dyn->e.ssecache[i].single)
+                FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+            else
+                FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+        }
+    MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n);
 }
 void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
 {
-    //TODO
+    int start = not07?8:0;
+    // only SSE regs need to be popped back from xEmu (don't need to be "written" this time)
+    int n=0;
+    for (int i=start; i<16; i++)
+        if(dyn->e.ssecache[i].v!=-1)
+            ++n;
+    if(!n)
+        return;
+    MESSAGE(LOG_DUMP, "\tPop XMM Cache (%d)------\n", n);
+    for (int i=start; i<16; ++i)
+        if(dyn->e.ssecache[i].v!=-1) {
+            if(dyn->e.ssecache[i].single)
+                FLW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+            else
+                FLD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+        }
+    MESSAGE(LOG_DUMP, "\t------- Pop XMM Cache (%d)\n", n);
+}
+
+void fpu_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, int s3)
+{
+    x87_purgecache(dyn, ninst, next, s1, s2, s3);
+    mmx_purgecache(dyn, ninst, next, s1);
+    sse_purgecache(dyn, ninst, next, s1);
+    if(!next)
+        fpu_reset_reg(dyn);
+}
+
+static int findCacheSlot(dynarec_rv64_t* dyn, int ninst, int t, int n, extcache_t* cache)
+{
+    ext_cache_t f;
+    f.n = n; f.t = t;
+    for(int i=0; i<24; ++i) {
+        if(cache->extcache[i].v == f.v)
+            return i;
+        if(cache->extcache[i].n == n) {
+            switch(cache->extcache[i].t) {
+                case EXT_CACHE_ST_F:
+                    if (t==EXT_CACHE_ST_D)
+                        return i;
+                    break;
+                case EXT_CACHE_ST_D:
+                    if (t==EXT_CACHE_ST_F)
+                        return i;
+                    break;
+            }
+        }
+    }
+    return -1;
+}
+
+static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t *cache)
+{
+    if (i==j)
+        return;
+    int reg_i = EXTREG(i);
+    int reg_j = EXTREG(j);
+    int i_single = 0;
+    if(cache->extcache[i].t==EXT_CACHE_SS || cache->extcache[i].t==EXT_CACHE_ST_F)
+        i_single =1;
+    int j_single = 0;
+    if(cache->extcache[j].t==EXT_CACHE_SS || cache->extcache[j].t==EXT_CACHE_ST_F)
+        j_single =1;
+    
+    if(!cache->extcache[i].v) {
+        // a mov is enough, no need to swap
+        MESSAGE(LOG_DUMP, "\t  - Moving %d <- %d\n", i, j);
+        if(j_single) {
+            FMVS(reg_i, reg_j);
+        } else {
+            FMVD(reg_i, reg_j);
+        }
+        cache->extcache[i].v = cache->extcache[j].v;
+        cache->extcache[j].v = 0;
+        return;
+    }
+    // SWAP
+    ext_cache_t tmp;
+    MESSAGE(LOG_DUMP, "\t  - Swaping %d <-> %d\n", i, j);
+    // RISC-V has no instruction to swap 2 FP register contents,
+    // so use a scratch...
+    #define SCRATCH 0
+    if(i_single)
+        FMVS(SCRATCH, reg_i);
+    else
+        FMVD(SCRATCH, reg_i);
+    if(j_single)
+        FMVS(reg_i, reg_j);
+    else
+        FMVD(reg_i, reg_j);
+    if(i_single)
+        FMVS(reg_j, SCRATCH);
+    else
+        FMVD(reg_j, SCRATCH);
+    #undef SCRATCH
+    tmp.v = cache->extcache[i].v;
+    cache->extcache[i].v = cache->extcache[j].v;
+    cache->extcache[j].v = tmp.v;
+}
+
+// Load the value described by (t, n) from the emu structure into the
+// extension-register cache slot i, evicting any current occupant to the
+// first free slot. s1-s3 are integer scratch registers; *s3_top caches
+// the delta currently applied to s3 relative to emu->top (0xffff meaning
+// "s3 content unknown"); *s2_val is invalidated whenever s2 is clobbered.
+static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, extcache_t *cache, int i, int t, int n)
+{
+    int reg = EXTREG(i);
+    if(cache->extcache[i].v) {
+        // Slot is busy: move its content out of the way first.
+        // Use a single-precision move if either side holds a single.
+        int single = 0;
+        if(t==EXT_CACHE_SS || t==EXT_CACHE_ST_F)
+            single = 1;
+        if(cache->extcache[i].t==EXT_CACHE_SS || cache->extcache[i].t==EXT_CACHE_ST_F)
+            single = 1;
+        int j = i+1;
+        while(cache->extcache[j].v)
+            ++j;
+        MESSAGE(LOG_DUMP, "\t  - Moving away %d\n", i);
+        if(single) {
+            FMVS(EXTREG(j), reg);
+        } else {
+            FMVD(EXTREG(j), reg);
+        }
+        cache->extcache[j].v = cache->extcache[i].v;
+    }
+    switch(t) {
+        case EXT_CACHE_SS:
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
+            FLW(reg, xEmu, offsetof(x64emu_t, xmm[n]));
+            break;
+        case EXT_CACHE_SD:
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
+            FLD(reg, xEmu, offsetof(x64emu_t, xmm[n]));
+            break;
+        case EXT_CACHE_MM:
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
+            // BUGFIX: index with the MMX register number n, not the cache
+            // slot i (the unload path already uses mmx[n]).
+            FLD(reg, xEmu, offsetof(x64emu_t, mmx[n]));
+            break;
+        case EXT_CACHE_ST_D:
+        case EXT_CACHE_ST_F:
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
+            if((*s3_top) == 0xffff) {
+                // s3 does not hold emu->top yet: fetch it
+                LW(s3, xEmu, offsetof(x64emu_t, top));
+                *s3_top = 0;
+            }
+            int a = n - (*s3_top) - stack_cnt;
+            if(a) {
+                ADDI(s3, s3, a);
+                ANDI(s3, s3, 7);    // (emu->top + i)&7
+            }
+            *s3_top += a;
+            *s2_val = 0;
+            SLLI(s2, s3, 3);
+            ADD(s2, xEmu, s2);
+            FLD(reg, s2, offsetof(x64emu_t, x87));
+            if(t==EXT_CACHE_ST_F) {
+                FCVTSD(reg, reg);   // cache keeps the single-precision flavor
+            }
+            break;
+        case EXT_CACHE_NONE:
+        case EXT_CACHE_SCR:
+        default:    /* nothing done */
+            MESSAGE(LOG_DUMP, "\t  - ignoring %s\n", getCacheName(t, n));
+            break;
+    }
+    cache->extcache[i].n = n;
+    cache->extcache[i].t = t;
+}
+
+// Write the value cached in extension-register slot i (described by type t
+// and number n) back to the emu structure, then free the slot.
+// s2/s3 are integer scratch registers; *s3_top tracks the delta currently
+// applied to s3 relative to emu->top (0xffff = s3 content unknown);
+// *s2_val is invalidated whenever s2 is clobbered.
+static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, extcache_t *cache, int i, int t, int n)
+{
+    int reg = EXTREG(i);
+    switch(t) {
+        case EXT_CACHE_SS:
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
+            FSW(reg, xEmu, offsetof(x64emu_t, xmm[n]));
+            break;
+        case EXT_CACHE_SD:
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
+            FSD(reg, xEmu, offsetof(x64emu_t, xmm[n]));
+            break;
+        case EXT_CACHE_MM:
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
+            FSD(reg, xEmu, offsetof(x64emu_t, mmx[n]));
+            break;
+        case EXT_CACHE_ST_D:
+        case EXT_CACHE_ST_F:
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
+            if((*s3_top)==0xffff) {
+                // s3 does not hold emu->top yet: fetch it
+                LW(s3, xEmu, offsetof(x64emu_t, top));
+                *s3_top = 0;
+            }
+            int a = n - (*s3_top) - stack_cnt;
+            if(a) {
+                ADDI(s3, s3, a);
+                ANDI(s3, s3, 7);
+            }
+            *s3_top += a;
+            SLLI(s2, s3, 3);
+            ADD(s2, xEmu, s2);
+            *s2_val = 0;
+            if(t==EXT_CACHE_ST_F) {
+                // x87 regs are stored as double: widen the cached single first
+                FCVTDS(reg, reg);
+            }
+            FSD(reg, s2, offsetof(x64emu_t, x87));
+            break;
+        case EXT_CACHE_NONE:
+        case EXT_CACHE_SCR:
+        default:    /* nothing done */
+            MESSAGE(LOG_DUMP, "\t  - ignoring %s\n", getCacheName(t, n));
+            break;
+    }
+    cache->extcache[i].v = 0;
 }
 
 static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3)
 {
-    //TODO
+// Emit code so that, at the jump from ninst to its target, the extension
+// register cache layout matches what the target expects: purge when the
+// target has a float barrier or no cached state; otherwise adjust the x87
+// stack count, unload stale entries, then load/swap/convert the missing ones.
+#if STEP > 1
+    int i2 = dyn->insts[ninst].x64.jmp_insts;
+    if(i2<0)
+        return;
+    MESSAGE(LOG_DUMP, "\tCache Transform ---- ninst=%d -> %d\n", ninst, i2);
+    // Target unknown or behind a float barrier: purge if anything is live
+    if((!i2) || (dyn->insts[i2].x64.barrier&BARRIER_FLOAT)) {
+        if(dyn->e.stack_next)  {
+            fpu_purgecache(dyn, ninst, 1, s1, s2, s3);
+            MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+            return;
+        }
+        for(int i=0; i<24; ++i)
+            if(dyn->e.extcache[i].v) {       // there is something at ninst for i
+                fpu_purgecache(dyn, ninst, 1, s1, s2, s3);
+                MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+                return;
+            }
+        MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+        return;
+    }
+    extcache_t cache_i2 = dyn->insts[i2].e;
+    extcacheUnwind(&cache_i2);
+
+    // Target caches nothing and tracks no stack: a plain purge is enough
+    if(!cache_i2.stack) {
+        int purge = 1;
+        for (int i=0; i<24 && purge; ++i)
+            if(cache_i2.extcache[i].v)
+                purge = 0;
+        if(purge) {
+            fpu_purgecache(dyn, ninst, 1, s1, s2, s3);
+            MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+            return;
+        }
+    }
+    int stack_cnt = dyn->e.stack_next;
+    int s3_top = 0xffff;    // s3 content unknown for now
+    if(stack_cnt != cache_i2.stack) {
+        MESSAGE(LOG_DUMP, "\t    - adjust stack count %d -> %d -\n", stack_cnt, cache_i2.stack);
+        int a = stack_cnt - cache_i2.stack;
+        // Add x87stack to emu fpu_stack
+        LWU(s3, xEmu, offsetof(x64emu_t, fpu_stack));
+        ADDI(s3, s3, a);
+        SW(s3, xEmu, offsetof(x64emu_t, fpu_stack));
+        // Sub x87stack to top, with and 7
+        LWU(s3, xEmu, offsetof(x64emu_t, top));
+        // update tags (and top at the same time)
+        if(a>0) {
+            // new tag to fulls
+            ADDI(s2, xZR, 0);
+            ADDI(s1, xEmu, offsetof(x64emu_t, p_regs));
+            SLLI(s3, s3, 2)\u003b
+            for (int i=0; i<a; ++i) {
+                ADDI(s3, s3, -1<<2);
+                ANDI(s3, s3, 7<<2);
+                ADD(s3, s1, s3);
+                SW(s2, s3, 0);    // that slot is full
+                SUB(s3, s3, s1);
+            }
+            SRLI(s3, s3, 2);
+        } else {
+            // empty tags
+            ADDI(s2, xZR, 0b11);
+            ADDI(s1, xEmu, offsetof(x64emu_t, p_regs));
+            SLLI(s3, s3, 2);
+            for (int i=0; i<-a; ++i) {
+                ADD(s3, s1, s3);
+                SW(s2, s3, 0);    // empty slot before leaving it
+                SUB(s3, s3, s1);
+                ADDI(s3, s3, 1<<2);
+                ANDI(s3, s3, 7<<2);    // (emu->top + st)&7
+            }
+            SRLI(s3, s3, 2);
+        }
+        SW(s3, xEmu, offsetof(x64emu_t, top));
+        s3_top = 0;
+        stack_cnt = cache_i2.stack;
+    }
+    extcache_t cache = dyn->e;
+    int s1_val = 0;
+    int s2_val = 0;
+    // unload every unneeded cache
+    // check SSE first, then MMX, in order, for optimisation issue
+    for(int i=0; i<16; ++i) {
+        int j=findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache);
+        if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache_i2)==-1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
+        j=findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache);
+        if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache_i2)==-1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
+    }
+    for(int i=0; i<8; ++i) {
+        int j=findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache);
+        if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache_i2)==-1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
+    }
+    for(int i=0; i<24; ++i) {
+        if(cache.extcache[i].v)
+            if(findCacheSlot(dyn, ninst, cache.extcache[i].t, cache.extcache[i].n, &cache_i2)==-1)
+                unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache.extcache[i].t, cache.extcache[i].n);
+    }
+    // and now load/swap the missing one
+    for(int i=0; i<24; ++i) {
+        if(cache_i2.extcache[i].v) {
+            if(cache_i2.extcache[i].v != cache.extcache[i].v) {
+                int j;
+                if((j=findCacheSlot(dyn, ninst, cache_i2.extcache[i].t, cache_i2.extcache[i].n, &cache))==-1)
+                    loadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache_i2.extcache[i].t, cache_i2.extcache[i].n);
+                else {
+                    // it's here, lets swap if needed
+                    if(j!=i)
+                        swapCache(dyn, ninst, i, j, &cache);
+                }
+            }
+            // same slot, but D<->F precision mismatch: convert in place
+            if(cache.extcache[i].t != cache_i2.extcache[i].t) {
+                if(cache.extcache[i].t == EXT_CACHE_ST_D && cache_i2.extcache[i].t == EXT_CACHE_ST_F) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n));
+                    FCVTSD(EXTREG(i), EXTREG(i));
+                    cache.extcache[i].t = EXT_CACHE_ST_F;
+                } else if(cache.extcache[i].t == EXT_CACHE_ST_F && cache_i2.extcache[i].t == EXT_CACHE_ST_D) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n));
+                    FCVTDS(EXTREG(i), EXTREG(i));
+                    cache.extcache[i].t = EXT_CACHE_ST_D;
+                }
+            }
+        }
+    }
+    MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+#endif
 }
 static void flagsCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1)
 {
@@ -669,6 +1642,25 @@ void rv64_move64(dynarec_rv64_t* dyn, int ninst, int reg, int64_t val)
     }
 }
 
+#ifdef HAVE_TRACE
+// Mirror cached FPU values back into the emu structure so the tracer
+// can display a consistent x87/MMX/SSE state.
+void fpu_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3)
+{
+    x87_reflectcache(dyn, ninst, s1, s2, s3);
+    if(trace_emm) {
+        mmx_reflectcache(dyn, ninst, s1);
+    }
+    if(trace_xmm) {
+        sse_reflectcache(dyn, ninst, s1);
+    }
+}
+#endif
+
+// Full FPU cache reset for the dynablock: clear x87, MMX and SSE
+// tracking state, then mark every extension register as free.
+void fpu_reset(dynarec_rv64_t* dyn)
+{
+    x87_reset(dyn);
+    mmx_reset(dyn);
+    sse_reset(dyn);
+    fpu_reset_reg(dyn);
+}
+
 void emit_pf(dynarec_rv64_t* dyn, int ninst, int s1, int s3, int s4)
 {
     MAYUSE(dyn); MAYUSE(ninst);
@@ -685,3 +1677,64 @@ void emit_pf(dynarec_rv64_t* dyn, int ninst, int s1, int s3, int s4)
     BEQZ(s4, 8);
     ORI(xFlags, xFlags, 1 << F_PF);
 }
+
+// Reset the dynarec FPU cache state to what it was at instruction reset_n
+// (used when re-entering a point whose cache layout was recorded earlier).
+void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n)
+{
+    MESSAGE(LOG_DEBUG, "Reset Caches with %d\n",reset_n);
+    #if STEP > 1
+    // for STEP 2 & 3, just need to refresh with current, and undo the changes (push & swap)
+    dyn->e = dyn->insts[ninst].e;
+    extcacheUnwind(&dyn->e);
+    #ifdef HAVE_TRACE
+    if(box64_dynarec_dump)
+        // NOTE(review): this memcmp covers only sizeof(ext_cache_t) (one 1-byte
+        // entry), not sizeof(extcache_t) (the whole struct); it likely intends
+        // the full compare. Only this debug warning is affected — TODO confirm.
+        if(memcmp(&dyn->e, &dyn->insts[reset_n].e, sizeof(ext_cache_t))) {
+            MESSAGE(LOG_DEBUG, "Warning, difference in extcache: reset=");
+            for(int i=0; i<24; ++i)
+                if(dyn->insts[reset_n].e.extcache[i].v)
+                    MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[reset_n].e.extcache[i].t, dyn->insts[reset_n].e.extcache[i].n));
+            if(dyn->insts[reset_n].e.combined1 || dyn->insts[reset_n].e.combined2)
+                MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->insts[reset_n].e.swapped?"SWP":"CMB", dyn->insts[reset_n].e.combined1, dyn->insts[reset_n].e.combined2);
+            if(dyn->insts[reset_n].e.stack_push || dyn->insts[reset_n].e.stack_pop)
+                MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[reset_n].e.stack_push, -dyn->insts[reset_n].e.stack_pop);
+            MESSAGE(LOG_DEBUG, " ==> ");
+            for(int i=0; i<24; ++i)
+                if(dyn->insts[ninst].e.extcache[i].v)
+                    MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[ninst].e.extcache[i].t, dyn->insts[ninst].e.extcache[i].n));
+            if(dyn->insts[ninst].e.combined1 || dyn->insts[ninst].e.combined2)
+                MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->insts[ninst].e.swapped?"SWP":"CMB", dyn->insts[ninst].e.combined1, dyn->insts[ninst].e.combined2);
+            if(dyn->insts[ninst].e.stack_push || dyn->insts[ninst].e.stack_pop)
+                MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[ninst].e.stack_push, -dyn->insts[ninst].e.stack_pop);
+            MESSAGE(LOG_DEBUG, " -> ");
+            for(int i=0; i<24; ++i)
+                if(dyn->e.extcache[i].v)
+                    MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->e.extcache[i].t, dyn->e.extcache[i].n));
+            if(dyn->e.combined1 || dyn->e.combined2)
+                MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->e.swapped?"SWP":"CMB", dyn->e.combined1, dyn->e.combined2);
+            if(dyn->e.stack_push || dyn->e.stack_pop)
+                MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->e.stack_push, -dyn->e.stack_pop);
+            MESSAGE(LOG_DEBUG, "\n");
+        }
+    #endif //HAVE_TRACE
+    #else
+    dyn->e = dyn->insts[reset_n].e;
+    #endif
+}
+
+// Propagate the ST stack state, applying deferred stack pops.
+void fpu_propagate_stack(dynarec_rv64_t* dyn, int ninst)
+{
+    // Apply any deferred x87 pop: drop cache entries that fell off the
+    // stack and renumber the remaining ST slots.
+    if(dyn->e.stack_pop) {
+        for(int idx=0; idx<24; ++idx) {
+            int ty = dyn->e.extcache[idx].t;
+            if(ty == EXT_CACHE_ST_D || ty == EXT_CACHE_ST_F) {
+                if(dyn->e.extcache[idx].n < dyn->e.stack_pop)
+                    dyn->e.extcache[idx].v = 0;     // popped away
+                else
+                    dyn->e.extcache[idx].n -= dyn->e.stack_pop;
+            }
+        }
+        dyn->e.stack_pop = 0;
+    }
+    // Commit the stack counter and clear the per-opcode transient state
+    dyn->e.stack = dyn->e.stack_next;
+    dyn->e.news = 0;
+    dyn->e.stack_push = 0;
+    dyn->e.swapped = 0;
+}
\ No newline at end of file
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index d1c5dc2e..e8e2cf6c 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -469,6 +469,9 @@
 #ifndef TABLE64
 #define TABLE64(A, V)
 #endif
+#ifndef FTABLE64
+#define FTABLE64(A, V)
+#endif
 
 #define ARCH_INIT()
 
@@ -638,7 +641,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define x87_do_pop      STEPNAME(x87_do_pop)
 #define x87_get_current_cache   STEPNAME(x87_get_current_cache)
 #define x87_get_cache   STEPNAME(x87_get_cache)
-#define x87_get_neoncache STEPNAME(x87_get_neoncache)
+#define x87_get_extcache STEPNAME(x87_get_extcache)
 #define x87_get_st      STEPNAME(x87_get_st)
 #define x87_get_st_empty  STEPNAME(x87_get_st)
 #define x87_refresh     STEPNAME(x87_refresh)
@@ -654,6 +657,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define sse_get_reg     STEPNAME(sse_get_reg)
 #define sse_get_reg_empty STEPNAME(sse_get_reg_empty)
 #define sse_forget_reg   STEPNAME(sse_forget_reg)
+#define sse_purge07cache STEPNAME(sse_purge07cache)
 
 #define fpu_pushcache   STEPNAME(fpu_pushcache)
 #define fpu_popcache    STEPNAME(fpu_popcache)
@@ -663,6 +667,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define fpu_purgecache  STEPNAME(fpu_purgecache)
 #define mmx_purgecache  STEPNAME(mmx_purgecache)
 #define x87_purgecache  STEPNAME(x87_purgecache)
+#define sse_purgecache  STEPNAME(sse_purgecache)
 #ifdef HAVE_TRACE
 #define fpu_reflectcache STEPNAME(fpu_reflectcache)
 #endif
@@ -766,37 +771,37 @@ void emit_pf(dynarec_rv64_t* dyn, int ninst, int s1, int s3, int s4);
 
 // x87 helper
 // cache of the local stack counter, to avoid upadte at every call
-//void x87_stackcount(dynarec_rv64_t* dyn, int ninst, int scratch);
+void x87_stackcount(dynarec_rv64_t* dyn, int ninst, int scratch);
 // fpu push. Return the Dd value to be used
-//int x87_do_push(dynarec_rv64_t* dyn, int ninst, int s1, int t);
+int x87_do_push(dynarec_rv64_t* dyn, int ninst, int s1, int t);
 // fpu push. Do not allocate a cache register. Needs a scratch register to do x87stack synch (or 0 to not do it)
-//void x87_do_push_empty(dynarec_rv64_t* dyn, int ninst, int s1);
+void x87_do_push_empty(dynarec_rv64_t* dyn, int ninst, int s1);
 // fpu pop. All previous returned Dd should be considered invalid
-//void x87_do_pop(dynarec_rv64_t* dyn, int ninst, int s1);
+void x87_do_pop(dynarec_rv64_t* dyn, int ninst, int s1);
 // get cache index for a x87 reg, return -1 if cache doesn't exist
-//int x87_get_current_cache(dynarec_rv64_t* dyn, int ninst, int st, int t);
+int x87_get_current_cache(dynarec_rv64_t* dyn, int ninst, int st, int t);
 // get cache index for a x87 reg, create the entry if needed
-//int x87_get_cache(dynarec_rv64_t* dyn, int ninst, int populate, int s1, int s2, int a, int t);
-// get neoncache index for a x87 reg
-//int x87_get_neoncache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a);
+int x87_get_cache(dynarec_rv64_t* dyn, int ninst, int populate, int s1, int s2, int a, int t);
+// get extcache index for a x87 reg
+int x87_get_extcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a);
 // get vfpu register for a x87 reg, create the entry if needed
-//int x87_get_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t);
+int x87_get_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t);
 // get vfpu register for a x87 reg, create the entry if needed. Do not fetch the Stx if not already in cache
-//int x87_get_st_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t);
+int x87_get_st_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t);
 // refresh a value from the cache ->emu (nothing done if value is not cached)
-//void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
+void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
 // refresh a value from the cache ->emu and then forget the cache (nothing done if value is not cached)
-//void x87_forget(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
+void x87_forget(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
 // refresh the cache value from emu
-//void x87_reget_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
+void x87_reget_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st);
 // swap 2 x87 regs
-//void x87_swapreg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int b);
+void x87_swapreg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int b);
 // Set rounding according to cw flags, return reg to restore flags
-//int x87_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3);
+int x87_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3);
 // Restore round flag
-//void x87_restoreround(dynarec_rv64_t* dyn, int ninst, int s1);
+void x87_restoreround(dynarec_rv64_t* dyn, int ninst, int s1);
 // Set rounding according to mxcsr flags, return reg to restore flags
-//int sse_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3);
+int sse_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3);
 
 void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3);
 
@@ -808,6 +813,39 @@ void rv64_move32(dynarec_rv64_t* dyn, int ninst, int reg, int32_t val, int zerou
 #else
 #define CHECK_CACHE()   (cacheupd = CacheNeedsTransform(dyn, ninst))
 #endif
+#define extcache_st_coherency STEPNAME(extcache_st_coherency)
+int extcache_st_coherency(dynarec_rv64_t* dyn, int ninst, int a, int b);
+
+#if STEP == 0
+#define ST_IS_F(A)          0
+#define X87_COMBINE(A, B)   EXT_CACHE_ST_D
+#define X87_ST0             EXT_CACHE_ST_D
+#define X87_ST(A)           EXT_CACHE_ST_D
+#elif STEP == 1
+#define ST_IS_F(A) (extcache_get_current_st(dyn, ninst, A)==EXT_CACHE_ST_F)
+#define X87_COMBINE(A, B) extcache_combine_st(dyn, ninst, A, B)
+#define X87_ST0     extcache_get_current_st(dyn, ninst, 0)
+#define X87_ST(A)   extcache_get_current_st(dyn, ninst, A)
+#else
+#define ST_IS_F(A) (extcache_get_st(dyn, ninst, A)==EXT_CACHE_ST_F)
+#if STEP == 3
+#define X87_COMBINE(A, B) extcache_st_coherency(dyn, ninst, A, B)
+#else
+#define X87_COMBINE(A, B) extcache_get_st(dyn, ninst, A)
+#endif
+#define X87_ST0     extcache_get_st(dyn, ninst, 0)
+#define X87_ST(A)   extcache_get_st(dyn, ninst, A)
+#endif
+
+//SSE/SSE2 helpers
+// get neon register for a SSE reg, create the entry if needed
+int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
+// get neon register for a SSE reg, but don't try to synch it if it needed to be created
+int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
+// forget neon register for a SSE reg, create the entry if needed
+void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
+// purge the XMM0..XMM7 cache (before function call)
+void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1);
 
 // common coproc helpers
 // reset the cache
@@ -828,6 +866,7 @@ void fpu_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3);
 void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07);
 void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07);
 
+
 uintptr_t dynarec64_00(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int seg, int* ok, int* need_epilog);
@@ -835,7 +874,7 @@ uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
 uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 //uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 //uintptr_t dynarec64_D8(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
-//uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 //uintptr_t dynarec64_DA(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 //uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 //uintptr_t dynarec64_DC(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
diff --git a/src/dynarec/rv64/dynarec_rv64_pass2.h b/src/dynarec/rv64/dynarec_rv64_pass2.h
index 408c3e97..176d512d 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass2.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass2.h
@@ -15,3 +15,4 @@
 #define INST_EPILOG dyn->insts[ninst].epilog = dyn->native_size; 
 #define INST_NAME(name) 
 #define TABLE64(A, V)   {Table64(dyn, (V)); EMIT(0); EMIT(0);}
+#define FTABLE64(A, V)  {mmx87_regs_t v = {.d = V}; Table64(dyn, v.q); EMIT(0); EMIT(0);}
\ No newline at end of file
diff --git a/src/dynarec/rv64/dynarec_rv64_pass3.h b/src/dynarec/rv64/dynarec_rv64_pass3.h
index e6aa268f..dac190cd 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass3.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass3.h
@@ -57,3 +57,4 @@
     }
 
 #define TABLE64(A, V)   {int val64offset = Table64(dyn, (V)); MESSAGE(LOG_DUMP, "  Table64: 0x%lx\n", (V)); AUIPC(A, SPLIT20(val64offset)); LD(A, A, SPLIT12(val64offset));}
+#define FTABLE64(A, V)  {mmx87_regs_t v = {.d = V}; int val64offset = Table64(dyn, v.q); MESSAGE(LOG_DUMP, "  FTable64: %g\n", v.d); AUIPC(x1, SPLIT20(val64offset)); FLD(A, x1, SPLIT12(val64offset));}
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index ac403464..c00325e1 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -10,6 +10,52 @@ typedef struct instsize_s instsize_t;
 
 #define BARRIER_MAYBE   8
 
+#define EXT_CACHE_NONE 0
+#define EXT_CACHE_ST_D 1
+#define EXT_CACHE_ST_F 2
+#define EXT_CACHE_MM   3
+#define EXT_CACHE_SS   4
+#define EXT_CACHE_SD   5
+#define EXT_CACHE_SCR  6
+typedef union ext_cache_s {
+    int8_t           v;
+    struct {
+        uint8_t t:4;   // reg type
+        uint8_t n:4;   // reg number
+    };
+} ext_cache_t;
+typedef union sse_cache_s {
+    int8_t      v;
+    struct {
+        uint8_t     reg:7;
+        uint8_t     single:1;
+    };
+} sse_cache_t;
+typedef struct extcache_s {
+    // ext cache
+    ext_cache_t         extcache[24];
+    int8_t              stack;
+    int8_t              stack_next;
+    int8_t              stack_pop;
+    int8_t              stack_push;
+    uint8_t             combined1;
+    uint8_t             combined2;
+    uint8_t             swapped;        // the combined reg were swapped
+    uint8_t             barrier;        // is there a barrier at instruction epilog?
+    uint32_t            news;           // bitmask, which extcache entries are new for this opcode
+    // fpu cache
+    int8_t              x87cache[8];    // cache status for the 8 x87 register behind the fpu stack
+    int8_t              x87reg[8];      // reg used for x87cache entry
+    int8_t              mmxcache[8];    // cache status for the 8 MMX registers
+    sse_cache_t         ssecache[16];   // cache status for the 16 SSE(2) registers
+    int8_t              fpuused[24];    // all 10..31 & 0..1 double reg from fpu, used by x87, sse and mmx
+    int8_t              x87stack;       // cache stack counter
+    int8_t              mmxcount;       // number of mmx register used (not both mmx and x87 at the same time)
+    int8_t              fpu_scratch;    // scratch counter
+    int8_t              fpu_extra_qscratch; // some opcode need an extra quad scratch register
+    int8_t              fpu_reg;        // x87/sse/mmx reg counter
+} extcache_t;
+
 typedef struct flagcache_s {
     int                 pending;    // is there a pending flags here, or to check?
     int                 dfnone;     // if defered flags is already set to df_none
@@ -32,6 +78,7 @@ typedef struct instruction_rv64_s {
     int                 retn;
     int                 barrier_maybe;
     flagcache_t         f_exit;     // flags status at end of intruction
+    extcache_t          e;          // extcache at end of intruction (but before poping)
     flagcache_t         f_entry;    // flags status before the instruction begin
 } instruction_rv64_t;
 
@@ -50,6 +97,7 @@ typedef struct dynarec_rv64_s {
     int                 table64cap;
     uintptr_t           tablestart;
     flagcache_t         f;
+    extcache_t          e;          // cache for the 10..31 0..1 double reg from fpu, plus x87 stack delta
     uintptr_t*          next;       // variable array of "next" jump address
     int                 next_sz;
     int                 next_cap;
@@ -65,6 +113,11 @@ typedef struct dynarec_rv64_s {
     int                 forward_ninst;  // ninst at the forward point
 } dynarec_rv64_t;
 
+// convert idx (0..24) to reg index (10..31 0..1)
+#define EXTREG(A)   (((A)+10)&31)
+// convert reg index (10..31 0..1) to idx (0..24)
+#define EXTIDX(A)   (((A)-10)&31)
+
 void add_next(dynarec_rv64_t *dyn, uintptr_t addr);
 uintptr_t get_closest_next(dynarec_rv64_t *dyn, uintptr_t addr);
 int is_nops(dynarec_rv64_t *dyn, uintptr_t addr, int n);
diff --git a/src/dynarec/rv64/rv64_emitter.h b/src/dynarec/rv64/rv64_emitter.h
index ca8b0891..ab12fa55 100644
--- a/src/dynarec/rv64/rv64_emitter.h
+++ b/src/dynarec/rv64/rv64_emitter.h
@@ -305,6 +305,13 @@ f28–31  ft8–11  FP temporaries                  Caller
 // Shift Right Arithmetic Immediate
 #define SRAIxw(rd, rs1, imm)        if (rex.w) { SRAI(rd, rs1, imm); } else { SRAIW(rd, rs1, imm); }
 
+#define CSRRW(rd, rs1, csr)         EMIT(I_type(csr, rs1, 0b001, rd, 0b1110011))
+#define CSRRS(rd, rs1, csr)         EMIT(I_type(csr, rs1, 0b010, rd, 0b1110011))
+#define CSRRC(rd, rs1, csr)         EMIT(I_type(csr, rs1, 0b011, rd, 0b1110011))
+#define CSRRWI(rd, imm, csr)        EMIT(I_type(csr, imm, 0b101, rd, 0b1110011))
+#define CSRRSI(rd, imm, csr)        EMIT(I_type(csr, imm, 0b110, rd, 0b1110011))
+#define CSRRCI(rd, imm, csr)        EMIT(I_type(csr, imm, 0b111, rd, 0b1110011))
+
 // RV32M
 // rd =(lower) rs1 * rs2 (both signed)
 #define MUL(rd, rs1, rs2)           EMIT(R_type(0b0000001, rs2, rs1, 0b000, rd, 0b0110011))
@@ -350,4 +357,58 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define LRxw(rd, rs1, aq, rl)       EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010|rex.w, rd, 0b0101111))
 #define SCxw(rd, rs2, rs1, aq, rl)  EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b010|rex.w, rd, 0b0101111))
 
+// RV32F
+// Read round mode
+#define FRRM(rd)                    CSRRS(rd, xZR, 0x002)
+// Swap round mode with rd
+#define FSRM(rd)                    CSRRWI(rd, 0b111, 0x002)
+// load single precision from rs1+imm12 to frd
+#define FLW(frd, rs1, imm12)        EMIT(I_type(imm12, rs1, 0b010, frd, 0b0000111))
+// store single precision frs2 to rs1+imm12
+#define FSW(frs2, rs1, imm12)       EMIT(S_type(imm12, frs2, rs1, 0b010, 0b0100111))
+// store rs1 with rs2 sign bit to rd
+#define FSGNJS(rd, rs1, rs2)        EMIT(R_type(0b0010000, rs2, rs1, 0b000, rd, 0b1010011))
+// move rs1 to rd
+#define FMVS(rd, rs1)               FSGNJS(rd, rs1, rs1)
+// store rs1 with oposite rs2 sign bit to rd
+#define FSGNJNS(rd, rs1, rs2)       EMIT(R_type(0b0010000, rs2, rs1, 0b001, rd, 0b1010011))
+// -rs1 => rd
+#define FNEGS(rd, rs1)              FSGNJNS(rd, rs1, rs1)
+// store rs1 with rs1^rs2 sign bit to rd
+#define FSGNJXS(rd, rs1, rs2)       EMIT(R_type(0b0010000, rs2, rs1, 0b010, rd, 0b1010011))
+// |rs1| => rd
+#define FABSS(rd, rs1)              FSGNJXS(rd, rs1, rs1)
+// Move from Single
+#define FMVXW(rd, frs1)             EMIT(R_type(0b1110000, 0b00000, frs1, 0b000, rd, 0b1010011))
+// Move to Single
+#define FMVWX(frd, rs1)             EMIT(R_type(0b1111000, 0b00000, rs1, 0b000, frd, 0b1010011))
+
+// RV32D
+// load double precision from rs1+imm12 to frd
+#define FLD(frd, rs1, imm12)        EMIT(I_type(imm12, rs1, 0b011, frd, 0b0000111))
+// store double precision frs2 to rs1+imm12
+#define FSD(frs2, rs1, imm12)       EMIT(S_type(imm12, frs2, rs1, 0b011, 0b0100111))
+// Convert Double frs1 to Single frd
+#define FCVTSD(frd, frs1)           EMIT(R_type(0b0100000, 0b00001, frs1, 0b000, frd, 0b1010011))
+// Convert Single frs1 to Double frd
+#define FCVTDS(frd, frs1)           EMIT(R_type(0b0100001, 0b00000, frs1, 0b000, frd, 0b1010011))
+// store rs1 with rs2 sign bit to rd
+#define FSGNJD(rd, rs1, rs2)        EMIT(R_type(0b0010001, rs2, rs1, 0b000, rd, 0b1010011))
+// move rs1 to rd
+#define FMVD(rd, rs1)               FSGNJD(rd, rs1, rs1)
+// store rs1 with oposite rs2 sign bit to rd
+#define FSGNJND(rd, rs1, rs2)       EMIT(R_type(0b0010001, rs2, rs1, 0b001, rd, 0b1010011))
+// -rs1 => rd
+#define FNEGD(rd, rs1)              FSGNJND(rd, rs1, rs1)
+// store rs1 with rs1^rs2 sign bit to rd
+#define FSGNJXD(rd, rs1, rs2)       EMIT(R_type(0b0010001, rs2, rs1, 0b010, rd, 0b1010011))
+// |rs1| => rd
+#define FABSD(rd, rs1)              FSGNJXD(rd, rs1, rs1)
+
+//RV64D
+// Move from Double
+#define FMVXD(rd, frs1)             EMIT(R_type(0b1110001, 0b00000, frs1, 0b000, rd, 0b1010011))
+// Move to Double
+#define FMVDX(frd, rs1)             EMIT(R_type(0b1111001, 0b00000, rs1, 0b000, frd, 0b1010011))
+
 #endif //__RV64_EMITTER_H__