Diffstat (limited to 'src/dynarec')
-rw-r--r--  src/dynarec/dynarec_arch.h                 |    2
-rw-r--r--  src/dynarec/la64/dynarec_la64_00.c         |   29
-rw-r--r--  src/dynarec/la64/dynarec_la64_d8.c         |  276
-rw-r--r--  src/dynarec/la64/dynarec_la64_d9.c         |  516
-rw-r--r--  src/dynarec/la64/dynarec_la64_da.c         |  210
-rw-r--r--  src/dynarec/la64/dynarec_la64_db.c         |  256
-rw-r--r--  src/dynarec/la64/dynarec_la64_dc.c         |  227
-rw-r--r--  src/dynarec/la64/dynarec_la64_dd.c         |  192
-rw-r--r--  src/dynarec/la64/dynarec_la64_de.c         |  158
-rw-r--r--  src/dynarec/la64/dynarec_la64_df.c         |  294
-rw-r--r--  src/dynarec/la64/dynarec_la64_functions.c  |  237
-rw-r--r--  src/dynarec/la64/dynarec_la64_functions.h  |   23
-rw-r--r--  src/dynarec/la64/dynarec_la64_helper.c     |  739
-rw-r--r--  src/dynarec/la64/dynarec_la64_helper.h     |  226
-rw-r--r--  src/dynarec/la64/dynarec_la64_pass0.h      |    4
-rw-r--r--  src/dynarec/la64/dynarec_la64_pass1.h      |    7
-rw-r--r--  src/dynarec/la64/dynarec_la64_pass2.h      |    7
-rw-r--r--  src/dynarec/la64/dynarec_la64_pass3.h      |    8
-rw-r--r--  src/dynarec/la64/dynarec_la64_private.h    |    1
-rw-r--r--  src/dynarec/la64/la64_mapping.h            |    4
20 files changed, 3384 insertions, 32 deletions
diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h
index 6df0b53c..b57d7771 100644
--- a/src/dynarec/dynarec_arch.h
+++ b/src/dynarec/dynarec_arch.h
@@ -68,8 +68,6 @@ extern void* create_updateflags();
 #define STOP_NATIVE_FLAGS(A, B) {}
 
 #define ARCH_UNALIGNED(A, B) 0
-// NYI
-#define NATIVE_RESTORE_X87PC()
 #elif defined(RV64)
 
 #define instruction_native_t        instruction_rv64_t
diff --git a/src/dynarec/la64/dynarec_la64_00.c b/src/dynarec/la64/dynarec_la64_00.c
index 35dd0005..74a543b5 100644
--- a/src/dynarec/la64/dynarec_la64_00.c
+++ b/src/dynarec/la64/dynarec_la64_00.c
@@ -1234,6 +1234,9 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 BSTRPICK_D(xRDX, xRDX, 31, 0);
             }
             break;
+        case 0x9B:
+            INST_NAME("FWAIT");
+            break;
         case 0x9C:
             INST_NAME("PUSHF");
             READFLAGS(X_ALL);
@@ -2294,6 +2297,32 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             }
             break;
 
+        case 0xD8:
+            addr = dynarec64_D8(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xD9:
+            addr = dynarec64_D9(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xDA:
+            addr = dynarec64_DA(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xDB:
+            addr = dynarec64_DB(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xDC:
+            addr = dynarec64_DC(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xDD:
+            addr = dynarec64_DD(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xDE:
+            addr = dynarec64_DE(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
+        case 0xDF:
+            addr = dynarec64_DF(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            break;
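+            // (0xD8..0xDF are the x87 escape opcodes; each one is dispatched to its
+            // own dynarec64_Dx() translation unit, the new files in this diff)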
+
 #define GO(Z)                                                                               \
     BARRIER(BARRIER_MAYBE);                                                                 \
     JUMP(addr + i8, 1);                                                                     \
diff --git a/src/dynarec/la64/dynarec_la64_d8.c b/src/dynarec/la64/dynarec_la64_d8.c
new file mode 100644
index 00000000..e2051185
--- /dev/null
+++ b/src/dynarec/la64/dynarec_la64_d8.c
@@ -0,0 +1,276 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "emu/x87emu_private.h"
+#include "dynarec_native.h"
+
+#include "la64_printer.h"
+#include "dynarec_la64_private.h"
+#include "../dynarec_helper.h"
+#include "dynarec_la64_functions.h"
+
+
+uintptr_t dynarec64_D8(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)rep;
+    (void)need_epilog;
+
+    uint8_t nextop = F8;
+    uint8_t ed;
+    uint8_t wback, wb1;
+    uint8_t u8;
+    int64_t fixedaddress;
+    int unscaled;
+    int v1, v2;
+    int s0;
+    int i1, i2, i3;
+
+    MAYUSE(s0);
+    MAYUSE(v2);
+    MAYUSE(v1);
+
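+    // mod == 0b11 in the ModRM byte (MODREG) selects an x87 register operand ST(i);
+    // any other mod value is a memory operand, decoded below with geted()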
+    if (MODREG)
+        switch (nextop) {
+            case 0xC0 ... 0xC7:
+                INST_NAME("FADD ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
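+                // v1/v2 are the native FPRs caching ST0 and ST(i); X87_COMBINE picks a
+                // common cache width so both slots are kept as float or both as double
+                // (a reading of the helper's contract, not spelled out in this diff)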
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                if (ST_IS_F(0)) {
+                    FADD_S(v1, v1, v2);
+                } else {
+                    FADD_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xC8 ... 0xCF:
+                INST_NAME("FMUL ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                if (ST_IS_F(0)) {
+                    FMUL_S(v1, v1, v2);
+                } else {
+                    FMUL_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xD0 ... 0xD7:
+                INST_NAME("FCOM ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FCOMS(v1, v2, x1, x2, x3);
+                } else {
+                    FCOMD(v1, v2, x1, x2, x3);
+                }
+                break;
+            case 0xD8 ... 0xDF:
+                INST_NAME("FCOMP ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FCOMS(v1, v2, x1, x2, x3);
+                } else {
+                    FCOMD(v1, v2, x1, x2, x3);
+                }
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xE0 ... 0xE7:
+                INST_NAME("FSUB ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                if (ST_IS_F(0)) {
+                    FSUB_S(v1, v1, v2);
+                } else {
+                    FSUB_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xE8 ... 0xEF:
+                INST_NAME("FSUBR ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                if (ST_IS_F(0)) {
+                    FSUB_S(v1, v2, v1);
+                } else {
+                    FSUB_D(v1, v2, v1);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xF0 ... 0xF7:
+                INST_NAME("FDIV ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                if (ST_IS_F(0)) {
+                    FDIV_S(v1, v1, v2);
+                } else {
+                    FDIV_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xF8 ... 0xFF:
+                INST_NAME("FDIVR ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                if (ST_IS_F(0)) {
+                    FDIV_S(v1, v2, v1);
+                } else {
+                    FDIV_D(v1, v2, v1);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            default:
+                DEFAULT;
+                break;
+        }
+    else
+        switch ((nextop >> 3) & 7) {
+            case 0:
+                INST_NAME("FADD ST0, float[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                s0 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(s0, ed, fixedaddress);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3);
+                if (ST_IS_F(0)) {
+                    FADD_S(v1, v1, s0);
+                } else {
+                    FCVT_D_S(s0, s0);
+                    FADD_D(v1, v1, s0);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 1:
+                INST_NAME("FMUL ST0, float[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                s0 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(s0, ed, fixedaddress);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3);
+                if (ST_IS_F(0)) {
+                    FMUL_S(v1, v1, s0);
+                } else {
+                    FCVT_D_S(s0, s0);
+                    FMUL_D(v1, v1, s0);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 2:
+                INST_NAME("FCOM ST0, float[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                s0 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(s0, ed, fixedaddress);
+                if (ST_IS_F(0)) {
+                    FCOMS(v1, s0, x1, x6, x3);
+                } else {
+                    FCVT_D_S(s0, s0);
+                    FCOMD(v1, s0, x1, x6, x3);
+                }
+                break;
+            case 3:
+                INST_NAME("FCOMP ST0, float[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                s0 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(s0, ed, fixedaddress);
+                if (ST_IS_F(0)) {
+                    FCOMS(v1, s0, x1, x6, x3);
+                } else {
+                    FCVT_D_S(s0, s0);
+                    FCOMD(v1, s0, x1, x6, x3);
+                }
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 4:
+                INST_NAME("FSUB ST0, float[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                s0 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(s0, ed, fixedaddress);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3);
+                if (ST_IS_F(0)) {
+                    FSUB_S(v1, v1, s0);
+                } else {
+                    FCVT_D_S(s0, s0);
+                    FSUB_D(v1, v1, s0);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 5:
+                INST_NAME("FSUBR ST0, float[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                s0 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(s0, ed, fixedaddress);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3);
+                if (ST_IS_F(0)) {
+                    FSUB_S(v1, s0, v1);
+                } else {
+                    FCVT_D_S(s0, s0);
+                    FSUB_D(v1, s0, v1);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 6:
+                INST_NAME("FDIV ST0, float[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                s0 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(s0, ed, fixedaddress);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3);
+                if (ST_IS_F(0)) {
+                    FDIV_S(v1, v1, s0);
+                } else {
+                    FCVT_D_S(s0, s0);
+                    FDIV_D(v1, v1, s0);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 7:
+                INST_NAME("FDIVR ST0, float[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                s0 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(s0, ed, fixedaddress);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3);
+                if (ST_IS_F(0)) {
+                    FDIV_S(v1, s0, v1);
+                } else {
+                    FCVT_D_S(s0, s0);
+                    FDIV_D(v1, s0, v1);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+        }
+    return addr;
+}
diff --git a/src/dynarec/la64/dynarec_la64_d9.c b/src/dynarec/la64/dynarec_la64_d9.c
new file mode 100644
index 00000000..29285b3e
--- /dev/null
+++ b/src/dynarec/la64/dynarec_la64_d9.c
@@ -0,0 +1,516 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "la64_emitter.h"
+#include "la64_mapping.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "emu/x87emu_private.h"
+#include "dynarec_native.h"
+
+#include "la64_printer.h"
+#include "dynarec_la64_private.h"
+#include "../dynarec_helper.h"
+#include "dynarec_la64_functions.h"
+
+
+uintptr_t dynarec64_D9(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)rep;
+    (void)need_epilog;
+
+    uint8_t nextop = F8;
+    uint8_t ed;
+    uint8_t wback, wb1;
+    uint8_t u8;
+    int64_t fixedaddress;
+    int unscaled;
+    int v0, v1, v2;
+    int s0;
+    int i1, i2, i3;
+    int64_t j64;
+
+    MAYUSE(s0);
+    MAYUSE(v0);
+    MAYUSE(v1);
+    MAYUSE(v2);
+    MAYUSE(j64);
+
+    if (MODREG)
+        switch (nextop) {
+            case 0xC0 ... 0xC7:
+                INST_NAME("FLD STx");
+                X87_PUSH_OR_FAIL(v2, dyn, ninst, x1, X87_ST(nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, (nextop & 7) + 1, X87_COMBINE(0, (nextop & 7) + 1));
+                if (ST_IS_F(0)) {
+                    FMOV_S(v2, v1);
+                } else {
+                    FMOV_D(v2, v1);
+                }
+                break;
+
+            case 0xC8:
+                INST_NAME("FXCH ST0");
+                break;
+            case 0xC9 ... 0xCF:
+                INST_NAME("FXCH STx");
+                // swap the cache value, not the double value itself :p
+                x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_ST(nextop & 7));
+                x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                x87_swapreg(dyn, ninst, x1, x2, 0, nextop & 7);
+                // should set C1 to 0
+                break;
+
+            case 0xD0:
+                INST_NAME("FNOP");
+                break;
+
+            case 0xD8:
+                INST_NAME("FSTPNCE ST0, ST0");
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xD9 ... 0xDF:
+                INST_NAME("FSTPNCE ST0, STx");
+                // copy the cache value for st0 to stx
+                x87_get_st_empty(dyn, ninst, x1, x2, nextop & 7, X87_ST(nextop & 7));
+                x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                x87_swapreg(dyn, ninst, x1, x2, 0, nextop & 7);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xE0:
+                INST_NAME("FCHS");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                if (ST_IS_F(0)) {
+                    FNEG_S(v1, v1);
+                } else {
+                    FNEG_D(v1, v1);
+                }
+                break;
+            case 0xE1:
+                INST_NAME("FABS");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                if (ST_IS_F(0)) {
+                    FABS_S(v1, v1);
+                } else {
+                    FABS_D(v1, v1);
+                }
+                break;
+
+            case 0xE4:
+                INST_NAME("FTST");
+                DEFAULT;
+                break;
+            case 0xE5:
+                INST_NAME("FXAM");
+#if 1
+                i1 = x87_get_current_cache(dyn, ninst, 0, LSX_CACHE_ST_D);
+                // value put in x4
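+                // status-word bit layout: C0 = bit 8, C1 = bit 9, C2 = bit 10, C3 = bit 14;
+                // the binary constants below encode the C3,C2,C0 class patterns named in
+                // the comments (empty = 101, zero = 100, denormal = 110, ...)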
+                if (i1 == -1) {
+                    if (fpu_is_st_freed(dyn, ninst, 0)) {
+                        MOV32w(x4, 0b100000100000000);
+                        B_MARK3_nocond;
+                    } else {
+                        // not in cache, so check Empty status and load it
+                        i2 = -dyn->lsx.x87stack;
+                        LD_WU(x3, xEmu, offsetof(x64emu_t, fpu_stack));
+                        if (i2) {
+                            ADDI_D(x3, x3, -i2);
+                        }
+                        MOV32w(x4, 0b100000100000000); // empty: C3,C2,C0 = 101
+                        BGE_MARK3(xZR, x3);
+                        // x5 will be the actual top
+                        LD_WU(x5, xEmu, offsetof(x64emu_t, top));
+                        if (i2) {
+                            ADDI_D(x5, x5, i2);
+                            ANDI(x5, x5, 7); // (emu->top + i)&7
+                        }
+                        // load x2 with ST0 anyway, for sign extraction
+                        SLLI_D(x5, x5, 3);
+                        ADD_D(x1, xEmu, x5);
+                        LD_D(x2, x1, offsetof(x64emu_t, x87));
+                        // load tag
+                        if (i2 >= 0) {
+                            LD_HU(x3, xEmu, offsetof(x64emu_t, fpu_tags));
+                            if (i2 > 0) {
+                                BSTRINS_D(x3, xZR, 15, 0);
+                                SRLI_D(x3, x3, i2 * 2);
+                            }
+                            ANDI(x3, x3, 0b11);
+                            BNEZ_MARK3(x3); // empty: C3,C2,C0 = 101
+                        }
+                    }
+                } else {
+                    // simply move from cache reg to x2
+                    v1 = dyn->lsx.x87reg[i1];
+                    MOVFR2GR_D(x2, v1);
+                }
+                // get exponant in x1
+                SRLI_D(x1, x2, 20 + 32);
+                ANDI(x1, x1, 0x7ff); // 0x7ff
+                BNEZ_MARK(x1);       // not zero or denormal
+                MOV64x(x3, 0x7fffffffffffffff);
+                AND(x1, x2, x3);
+                MOV32w(x4, 0b100000000000000); // Zero: C3,C2,C0 = 100
+                BEQZ_MARK3(x1);
+                MOV32w(x4, 0b100010000000000); // Denormal: C3,C2,C0 = 110
+                B_MARK3_nocond;
+                MARK;
+                ADDI_D(x3, xZR, 0x7ff);        // all-ones exponent: infinity or NaN?
+                MOV32w(x4, 0b000010000000000); // normal: C3,C2,C0 = 010
+                BNE_MARK3(x1, x3);
+                SLLI_D(x3, x2, 12);
+                SRLI_D(x3, x3, 12);            // and 0x000fffffffffffff
+                MOV32w(x4, 0b000010100000000); // infinity: C3,C2,C0 = 011
+                BEQZ_MARK3(x3);
+                MOV32w(x4, 0b000000100000000); // NaN: C3,C2,C0 = 001
+                MARK3;
+                // Extract sign & Update SW
+                SRLI_D(x1, x2, 63);
+                SLLI_D(x1, x1, 9);
+                OR(x4, x4, x1); // C1
+                LD_HU(x1, xEmu, offsetof(x64emu_t, sw));
+                MOV32w(x2, ~0b0100011100000000);
+                AND(x1, x1, x2);
+                OR(x4, x4, x1);
+                ST_H(x4, xEmu, offsetof(x64emu_t, sw));
+#else
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_refresh(dyn, ninst, x1, x2, 0);
+                s0 = x87_stackcount(dyn, ninst, x1);
+                CALL(fpu_fxam, -1, 0, 0); // should be possible inline, but is it worth it?
+                x87_unstackcount(dyn, ninst, x1, s0);
+#endif
+                break;
+
+            case 0xE8:
+                INST_NAME("FLD1");
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_F);
+                if (ST_IS_F(0)) {
+                    MOV32w(x1, 0x3f800000);
+                    MOVGR2FR_W(v1, x1);
+                } else {
+                    MOV64x(x1, 0x3FF0000000000000);
+                    MOVGR2FR_D(v1, x1);
+                }
+                break;
+            case 0xE9:
+                INST_NAME("FLDL2T");
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D);
+                FTABLE64(v1, L2T);
+                break;
+            case 0xEA:
+                INST_NAME("FLDL2E");
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D);
+                FTABLE64(v1, L2E);
+                break;
+            case 0xEB:
+                INST_NAME("FLDPI");
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D);
+                FTABLE64(v1, PI);
+                break;
+            case 0xEC:
+                INST_NAME("FLDLG2");
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D);
+                FTABLE64(v1, LG2);
+                break;
+            case 0xED:
+                INST_NAME("FLDLN2");
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D);
+                FTABLE64(v1, LN2);
+                break;
+            case 0xEE:
+                INST_NAME("FLDZ");
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_F);
+                if (ST_IS_F(0))
+                    MOVGR2FR_W(v1, xZR);
+                else
+                    MOVGR2FR_D(v1, xZR);
+                break;
+
+            case 0xF0:
+                INST_NAME("F2XM1");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_forget(dyn, ninst, x1, x2, 0);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                CALL(const_native_f2xm1, -1, 0, 0);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                break;
+            case 0xF1:
+                INST_NAME("FYL2X");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_forget(dyn, ninst, x1, x2, 0);
+                x87_forget(dyn, ninst, x1, x2, 1);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                CALL(const_native_fyl2x, -1, 0, 0);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xF2:
+                INST_NAME("FPTAN");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_forget(dyn, ninst, x1, x2, 0);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                CALL_(const_native_ftan, -1, BOX64ENV(dynarec_fastround) ? 0 : u8, 0, 0);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                x87_unstackcount(dyn, ninst, x3, s0);
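+                // per FPTAN semantics, 1.0 is pushed on top once tan(ST0) is computed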
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_F);
+                if (ST_IS_F(0)) {
+                    MOV32w(x1, 0x3f800000);
+                    MOVGR2FR_W(v1, x1);
+                } else {
+                    MOV64x(x1, 0x3FF0000000000000);
+                    MOVGR2FR_D(v1, x1);
+                }
+                break;
+            case 0xF3:
+                INST_NAME("FPATAN");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_forget(dyn, ninst, x1, x2, 0);
+                x87_forget(dyn, ninst, x1, x2, 1);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                CALL_(const_native_fpatan, -1, BOX64ENV(dynarec_fastround) ? 0 : u8, 0, 0);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xF4:
+                INST_NAME("FXTRACT");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, x3);
+                x87_forget(dyn, ninst, x1, x2, 1);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                CALL(const_native_fxtract, -1, 0, 0);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                break;
+            case 0xF5:
+                INST_NAME("FPREM1");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_forget(dyn, ninst, x1, x2, 0);
+                x87_forget(dyn, ninst, x1, x2, 1);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                CALL(const_native_fprem1, -1, 0, 0);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                break;
+            case 0xF6:
+                INST_NAME("FDECSTP");
+                fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+                LD_W(x2, xEmu, offsetof(x64emu_t, top));
+                ADDI_D(x2, x2, -1);
+                ANDI(x2, x2, 7);
+                ST_W(x2, xEmu, offsetof(x64emu_t, top));
+                break;
+            case 0xF7:
+                INST_NAME("FINCSTP");
+                fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+                LD_W(x2, xEmu, offsetof(x64emu_t, top));
+                ADDI_D(x2, x2, 1);
+                ANDI(x2, x2, 7);
+                ST_W(x2, xEmu, offsetof(x64emu_t, top));
+                break;
+            case 0xF8:
+                INST_NAME("FPREM");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_forget(dyn, ninst, x1, x2, 0);
+                x87_forget(dyn, ninst, x1, x2, 1);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                CALL(const_native_fprem, -1, 0, 0);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                break;
+            case 0xF9:
+                INST_NAME("FYL2XP1");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_forget(dyn, ninst, x1, x2, 0);
+                x87_forget(dyn, ninst, x1, x2, 1);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                CALL(const_native_fyl2xp1, -1, 0, 0);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xFA:
+                INST_NAME("FSQRT");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                if (ST_IS_F(0)) {
+                    FSQRT_S(v1, v1);
+                } else {
+                    FSQRT_D(v1, v1);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xFB:
+                INST_NAME("FSINCOS");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, x3);
+                x87_forget(dyn, ninst, x1, x2, 1);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                CALL_(const_native_fsincos, -1, BOX64ENV(dynarec_fastround) ? 0 : u8, 0, 0);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                break;
+            case 0xFC:
+                INST_NAME("FRNDINT");
+                v0 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                v1 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn);
+                u8 = x87_setround(dyn, ninst, x1, x2);
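+                // rounding plan: NaN falls through untouched; magnitudes at or above
+                // 2^MANT_DIG are already integral (and FTINT could overflow), so they
+                // are skipped too; anything else is rounded via FTINT/FFINT with the
+                // sign copied back so -0.0 survives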
+
+                if (ST_IS_F(0)) {
+                    FCMP_S(fcc0, v0, v0, cEQ);
+                    BCNEZ_MARK(fcc0);
+                    B_NEXT_nocond;
+                    MARK; // v0 is not nan
+                    FABS_S(v1, v0);
+                    MOV64x(x3, 1ULL << __FLT_MANT_DIG__);
+                    MOVGR2FR_W(v2, x3);
+                    FFINT_S_L(v2, v2);
+                    FCMP_S(fcc1, v1, v2, cLT);
+                    BCNEZ_MARK2(fcc1);
+                    B_NEXT_nocond;
+                    MARK2;
+                    FTINT_L_S(v1, v0);
+                    FFINT_S_L(v1, v1);
+                    FCOPYSIGN_S(v0, v1, v0);
+                } else {
+                    FCMP_D(fcc0, v0, v0, cEQ);
+                    BCNEZ_MARK(fcc0);
+                    B_NEXT_nocond;
+                    MARK; // v0 is not nan
+                    FABS_D(v1, v0);
+                    MOV64x(x3, 1ULL << __DBL_MANT_DIG__);
+                    MOVGR2FR_D(v2, x3);
+                    FFINT_D_L(v2, v2);
+                    FCMP_D(fcc1, v1, v2, cLT);
+                    BCNEZ_MARK2(fcc1);
+                    B_NEXT_nocond;
+                    MARK2;
+                    FTINT_L_D(v1, v0);
+                    FFINT_D_L(v1, v1);
+                    FCOPYSIGN_D(v0, v1, v0);
+                }
+                x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xFD:
+                INST_NAME("FSCALE");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_forget(dyn, ninst, x1, x2, 0);
+                x87_forget(dyn, ninst, x1, x2, 1);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                CALL_(const_native_fscale, -1, BOX64ENV(dynarec_fastround) ? 0 : u8, 0, 0);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                break;
+            case 0xFE:
+                INST_NAME("FSIN");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_forget(dyn, ninst, x1, x2, 0);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                CALL_(const_native_fsin, -1, BOX64ENV(dynarec_fastround) ? 0 : u8, 0, 0);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                break;
+            case 0xFF:
+                INST_NAME("FCOS");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_forget(dyn, ninst, x1, x2, 0);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                CALL_(const_native_fcos, -1, BOX64ENV(dynarec_fastround) ? 0 : u8, 0, 0);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                break;
+            default:
+                DEFAULT;
+                break;
+        }
+    else
+        switch ((nextop >> 3) & 7) {
+            case 0:
+                INST_NAME("FLD ST0, float[ED]");
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, (BOX64ENV(dynarec_x87double) == 1) ? LSX_CACHE_ST_D : LSX_CACHE_ST_F);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(v1, ed, fixedaddress);
+                if (!ST_IS_F(0)) {
+                    FCVT_D_S(v1, v1);
+                }
+                break;
+            case 2:
+                INST_NAME("FST float[ED], ST0");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_F);
+                if (ST_IS_F(0))
+                    s0 = v1;
+                else {
+                    s0 = fpu_get_scratch(dyn);
+                    if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x2);
+                    FCVT_S_D(s0, v1);
+                    if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                }
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FST_S(s0, ed, fixedaddress);
+                break;
+            case 3:
+                INST_NAME("FSTP float[ED], ST0");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_F);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                if (!ST_IS_F(0)) {
+                    if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x3);
+                    FCVT_S_D(v1, v1);
+                    if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                }
+                FST_S(v1, ed, fixedaddress);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 4:
+                INST_NAME("FLDENV Ed");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE?
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
+                MOV32w(x2, 0);
+                CALL(const_fpu_loadenv, -1, ed, x2);
+                NATIVE_RESTORE_X87PC();
+                break;
+            case 5:
+                INST_NAME("FLDCW Ew");
+                GETEW(x1, 0);
+                ST_H(x1, xEmu, offsetof(x64emu_t, cw)); // hopefully cw is close enough for the si12 store offset
+                if (dyn->need_x87check) {
+                    SRLI_D(x87pc, x1, 8);
+                    ANDI(x87pc, x87pc, 0b11);
+                }
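+                // cw bits 9:8 hold the precision-control field; x87pc caches it here
+                // (assuming the precision checks read x87pc, as the helpers suggest)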
+                break;
+            case 6:
+                INST_NAME("FNSTENV Ed");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE?
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
+                MOV32w(x2, 0);
+                CALL(const_fpu_savenv, -1, ed, x2);
+                break;
+            case 7:
+                INST_NAME("FNSTCW Ew");
+                addr = geted(dyn, addr, ninst, nextop, &wback, x3, x1, &fixedaddress, rex, NULL, 0, 0);
+                ed = x1;
+                wb1 = 1;
+                LD_H(x1, xEmu, offsetof(x64emu_t, cw));
+                EWBACK;
+                break;
+            default:
+                DEFAULT;
+        }
+    return addr;
+}
diff --git a/src/dynarec/la64/dynarec_la64_da.c b/src/dynarec/la64/dynarec_la64_da.c
new file mode 100644
index 00000000..9fc06fff
--- /dev/null
+++ b/src/dynarec/la64/dynarec_la64_da.c
@@ -0,0 +1,210 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "la64_emitter.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "emu/x87emu_private.h"
+#include "dynarec_native.h"
+
+#include "la64_printer.h"
+#include "dynarec_la64_private.h"
+#include "../dynarec_helper.h"
+#include "dynarec_la64_functions.h"
+
+
+uintptr_t dynarec64_DA(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    uint8_t nextop = F8;
+    int64_t j64;
+    uint8_t ed;
+    uint8_t u8;
+    uint8_t wback;
+    int v1, v2;
+    int d0;
+    int s0;
+    int64_t fixedaddress;
+    int unscaled;
+
+    MAYUSE(s0);
+    MAYUSE(d0);
+    MAYUSE(v2);
+    MAYUSE(v1);
+    MAYUSE(ed);
+    MAYUSE(j64);
+
+    if (MODREG)
+        switch (nextop) {
+            case 0xC0 ... 0xC7:
+                INST_NAME("FCMOVB ST0, STx");
+                READFLAGS(X_CF);
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                RESTORE_EFLAGS(x5);
+                ANDI(x1, xFlags, 1 << F_CF);
+                CBZ_NEXT(x1);
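+                // CBZ_NEXT: CF clear means no move, jump straight to the next opcode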
+                if (ST_IS_F(0))
+                    FMOV_S(v1, v2);
+                else
+                    FMOV_D(v1, v2);
+                break;
+            case 0xC8 ... 0xCF:
+                INST_NAME("FCMOVE ST0, STx");
+                READFLAGS(X_ZF);
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                RESTORE_EFLAGS(x5);
+                ANDI(x1, xFlags, 1 << F_ZF);
+                CBZ_NEXT(x1);
+                if (ST_IS_F(0))
+                    FMOV_S(v1, v2);
+                else
+                    FMOV_D(v1, v2);
+                break;
+            case 0xD0 ... 0xD7:
+                INST_NAME("FCMOVBE ST0, STx");
+                READFLAGS(X_CF | X_ZF);
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                RESTORE_EFLAGS(x5);
+                ANDI(x1, xFlags, (1 << F_CF) | (1 << F_ZF));
+                CBZ_NEXT(x1);
+                if (ST_IS_F(0))
+                    FMOV_S(v1, v2);
+                else
+                    FMOV_D(v1, v2);
+                break;
+            case 0xD8 ... 0xDF:
+                INST_NAME("FCMOVU ST0, STx");
+                READFLAGS(X_PF);
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                RESTORE_EFLAGS(x5);
+                ANDI(x1, xFlags, (1 << F_PF));
+                CBZ_NEXT(x1);
+                if (ST_IS_F(0))
+                    FMOV_S(v1, v2);
+                else
+                    FMOV_D(v1, v2);
+                break;
+            case 0xE9:
+                INST_NAME("FUCOMPP ST0, ST1");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, 1, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FCOMS(v1, v2, x1, x2, x3);
+                } else {
+                    FCOMD(v1, v2, x1, x2, x3);
+                }
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            default:
+                DEFAULT;
+                break;
+        }
+    else
+        switch ((nextop >> 3) & 7) {
+            case 0:
+                INST_NAME("FIADD ST0, Ed");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(v2, ed, fixedaddress);
+                FFINT_D_W(v2, v2); // i32 -> double
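+                // note: FLD_S above is only a raw 32-bit load into the FPR; FFINT_D_W
+                // then treats those bits as an int32 and converts them to double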
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                FADD_D(v1, v1, v2);
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 1:
+                INST_NAME("FIMUL ST0, Ed");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(v2, ed, fixedaddress);
+                FFINT_D_W(v2, v2); // i32 -> double
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                FMUL_D(v1, v1, v2);
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 2:
+                INST_NAME("FICOM ST0, Ed");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(v2, ed, fixedaddress);
+                FFINT_D_W(v2, v2); // i32 -> double
+                FCOMD(v1, v2, x1, x2, x3);
+                break;
+            case 3:
+                INST_NAME("FICOMP ST0, Ed");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(v2, ed, fixedaddress);
+                FFINT_D_W(v2, v2); // i32 -> double
+                FCOMD(v1, v2, x1, x2, x3);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 4:
+                INST_NAME("FISUB ST0, Ed");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(v2, ed, fixedaddress);
+                FFINT_D_W(v2, v2); // i32 -> double
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                FSUB_D(v1, v1, v2);
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 5:
+                INST_NAME("FISUBR ST0, Ed");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(v2, ed, fixedaddress);
+                FFINT_D_W(v2, v2); // i32 -> double
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                FSUB_D(v1, v2, v1);
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 6:
+                INST_NAME("FIDIV ST0, Ed");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(v2, ed, fixedaddress);
+                FFINT_D_W(v2, v2); // i32 -> double
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                FDIV_D(v1, v1, v2);
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 7:
+                INST_NAME("FIDIVR ST0, Ed");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(v2, ed, fixedaddress);
+                FFINT_D_W(v2, v2); // i32 -> double
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                FDIV_D(v1, v2, v1);
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+        }
+    return addr;
+}
diff --git a/src/dynarec/la64/dynarec_la64_db.c b/src/dynarec/la64/dynarec_la64_db.c
new file mode 100644
index 00000000..7b81af4a
--- /dev/null
+++ b/src/dynarec/la64/dynarec_la64_db.c
@@ -0,0 +1,256 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "la64_emitter.h"
+#include "la64_mapping.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "emu/x87emu_private.h"
+#include "dynarec_native.h"
+
+#include "la64_printer.h"
+#include "dynarec_la64_private.h"
+#include "../dynarec_helper.h"
+#include "dynarec_la64_functions.h"
+
+
+uintptr_t dynarec64_DB(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)rep;
+    (void)need_epilog;
+
+    uint8_t nextop = F8;
+    uint8_t ed;
+    uint8_t wback;
+    uint8_t u8;
+    int64_t fixedaddress;
+    int unscaled;
+    int v1, v2;
+    int s0;
+    int64_t j64;
+
+    MAYUSE(s0);
+    MAYUSE(v2);
+    MAYUSE(v1);
+    MAYUSE(j64);
+
+    if (MODREG)
+        switch (nextop) {
+            case 0xC0 ... 0xC7:
+                INST_NAME("FCMOVNB ST0, STx");
+                READFLAGS(X_CF);
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                RESTORE_EFLAGS(x5);
+                ANDI(x1, xFlags, 1 << F_CF);
+                CBNZ_NEXT(x1);
+                if (ST_IS_F(0)) {
+                    FMOV_S(v1, v2);
+                } else {
+                    FMOV_D(v1, v2); // F_CF==0
+                }
+                break;
+            case 0xC8 ... 0xCF:
+                INST_NAME("FCMOVNE ST0, STx");
+                READFLAGS(X_ZF);
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                RESTORE_EFLAGS(x5);
+                ANDI(x1, xFlags, 1 << F_ZF);
+                CBNZ_NEXT(x1);
+                if (ST_IS_F(0)) {
+                    FMOV_S(v1, v2);
+                } else {
+                    FMOV_D(v1, v2); // F_ZF==0
+                }
+                break;
+            case 0xD0 ... 0xD7:
+                INST_NAME("FCMOVNBE ST0, STx");
+                READFLAGS(X_CF | X_ZF);
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                RESTORE_EFLAGS(x5);
+                ANDI(x1, xFlags, (1 << F_CF) | (1 << F_ZF));
+                CBNZ_NEXT(x1);
+                if (ST_IS_F(0)) {
+                    FMOV_S(v1, v2);
+                } else {
+                    FMOV_D(v1, v2); // F_CF==0 & F_ZF==0
+                }
+                break;
+            case 0xD8 ... 0xDF:
+                INST_NAME("FCMOVNU ST0, STx");
+                READFLAGS(X_PF);
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                RESTORE_EFLAGS(x5);
+                ANDI(x1, xFlags, 1 << F_PF);
+                CBNZ_NEXT(x1);
+                if (ST_IS_F(0)) {
+                    FMOV_S(v1, v2);
+                } else {
+                    FMOV_D(v1, v2); // F_PF==0
+                }
+                break;
+            case 0xE1:
+                INST_NAME("FDISI8087_NOP"); // so.. NOP?
+                break;
+            case 0xE2:
+                INST_NAME("FNCLEX");
+                LD_H(x2, xEmu, offsetof(x64emu_t, sw));
+                BSTRINS_D(x2, xZR, 7, 0);   // clear IE..PE, SF, ES (bits 7:0)
+                BSTRINS_D(x2, xZR, 15, 15); // clear B (bit 15)
+                ST_H(x2, xEmu, offsetof(x64emu_t, sw));
+                break;
+            case 0xE3:
+                INST_NAME("FNINIT");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_purgecache(dyn, ninst, 0, x1, x2, x3);
+                CALL(const_reset_fpu, -1, 0, 0);
+                NATIVE_RESTORE_X87PC();
+                break;
+            case 0xE8 ... 0xEF:
+                INST_NAME("FUCOMI ST0, STx");
+                SETFLAGS(X_ALL, SF_SET, NAT_FLAGS_NOFUSION);
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FCOMIS(v1, v2, x1, x2);
+                } else {
+                    FCOMID(v1, v2, x1, x2);
+                }
+                break;
+            case 0xF0 ... 0xF7:
+                INST_NAME("FCOMI ST0, STx");
+                SETFLAGS(X_ALL, SF_SET, NAT_FLAGS_NOFUSION);
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FCOMIS(v1, v2, x1, x2);
+                } else {
+                    FCOMID(v1, v2, x1, x2);
+                }
+                break;
+            default:
+                DEFAULT;
+                break;
+        }
+    else
+        switch ((nextop >> 3) & 7) {
+            case 0:
+                INST_NAME("FILD ST0, Ed");
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_S(v1, ed, fixedaddress);
+                FFINT_D_W(v1, v1); // i32 -> double
+                break;
+            case 1:
+                INST_NAME("FISTTP Ed, ST0");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 1, 0);
+                if (!BOX64ENV(dynarec_fastround)) {
+                    MOVGR2FCSR(FCSR2, xZR); // reset all bits
+                }
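+                // FTINTRZ truncates toward zero, matching FISTTP; on an invalid
+                // conversion (NaN or overflow) x86 stores the integer indefinite
+                // 0x80000000, substituted below when FCSR2 reports FR_V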
+                FTINTRZ_W_D(v2, v1);
+                if (!BOX64ENV(dynarec_fastround)) {
+                    MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
+                    BSTRPICK_D(x5, x5, FR_V, FR_V);
+                    BEQZ_MARK(x5);
+                    MOV32w(x4, 0x80000000);
+                    MOVGR2FR_W(v2, x4);
+                    MARK;
+                }
+                FST_S(v2, wback, fixedaddress);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 2:
+                INST_NAME("FIST Ed, ST0");
+                DEFAULT;
+                break;
+            case 3:
+                INST_NAME("FISTP Ed, ST0");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                u8 = x87_setround(dyn, ninst, x1, x5);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+                v2 = fpu_get_scratch(dyn);
+                if (!BOX64ENV(dynarec_fastround)) {
+                    MOVGR2FCSR(FCSR2, xZR); // reset all bits
+                }
+                FTINT_W_D(v2, v1);
+                if (!BOX64ENV(dynarec_fastround)) {
+                    MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
+                    BSTRPICK_D(x5, x5, FR_V, FR_V);
+                    BEQZ_MARK(x5);
+                    MOV32w(x4, 0x80000000);
+                    MOVGR2FR_W(v2, x4);
+                    MARK;
+                }
+                FST_S(v2, wback, fixedaddress);
+                x87_restoreround(dyn, ninst, u8);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 5:
+                INST_NAME("FLD tbyte");
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 8, 0);
+                if ((PK(0) == 0xDB && ((PK(1) >> 3) & 7) == 7) || (!rex.is32bits && PK(0) >= 0x40 && PK(0) <= 0x4f && PK(1) == 0xDB && ((PK(2) >> 3) & 7) == 7)) {
+                    NOTEST(x5);
+                    // the FLD is immediately followed by an FSTP
+                    LD_D(x5, ed, fixedaddress + 0);
+                    LD_H(x6, ed, fixedaddress + 8);
+                    // no persistent scratch register, so unroll both instructions here...
+                    MESSAGE(LOG_DUMP, "\tHack: FSTP tbyte\n");
+                    nextop = F8; // 0xDB or rex
+                    if (!rex.is32bits && nextop >= 0x40 && nextop <= 0x4f) {
+                        rex.rex = nextop;
+                        nextop = F8; // 0xDB
+                    } else
+                        rex.rex = 0;
+                    nextop = F8; // modrm
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 8, 0);
+                    ST_D(x5, ed, fixedaddress + 0);
+                    ST_H(x6, ed, fixedaddress + 8);
+                } else {
+                    if (BOX64ENV(x87_no80bits)) {
+                        X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D);
+                        FLD_D(v1, ed, fixedaddress);
+                    } else {
+                        ADDI_D(x1, ed, fixedaddress);
+                        X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, x3);
+                        x87_reflectcount(dyn, ninst, x3, x4);
+                        CALL(const_native_fld, -1, x1, 0);
+                        x87_unreflectcount(dyn, ninst, x3, x4);
+                    }
+                }
+                break;
+            case 7:
+                INST_NAME("FSTP tbyte");
+                if (BOX64ENV(x87_no80bits)) {
+                    v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                    FST_D(v1, wback, fixedaddress);
+                } else {
+                    x87_forget(dyn, ninst, x1, x3, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
+                    x87_reflectcount(dyn, ninst, x3, x4);
+                    CALL(const_native_fstp, -1, ed, 0);
+                    x87_unreflectcount(dyn, ninst, x3, x4);
+                }
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            default:
+                DEFAULT;
+        }
+    return addr;
+}
diff --git a/src/dynarec/la64/dynarec_la64_dc.c b/src/dynarec/la64/dynarec_la64_dc.c
new file mode 100644
index 00000000..4d639822
--- /dev/null
+++ b/src/dynarec/la64/dynarec_la64_dc.c
@@ -0,0 +1,227 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "emu/x87emu_private.h"
+#include "dynarec_native.h"
+
+#include "la64_printer.h"
+#include "dynarec_la64_private.h"
+#include "../dynarec_helper.h"
+#include "dynarec_la64_functions.h"
+
+
+uintptr_t dynarec64_DC(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)rep;
+    (void)need_epilog;
+
+    uint8_t nextop = F8;
+    uint8_t wback;
+    uint8_t u8;
+    int64_t fixedaddress;
+    int unscaled;
+    int v1, v2;
+
+    MAYUSE(v2);
+    MAYUSE(v1);
+
+    if (MODREG)
+        switch (nextop) {
+            case 0xC0 ... 0xC7:
+                INST_NAME("FADD STx, ST0");
+                v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                if (ST_IS_F(0)) {
+                    FADD_S(v1, v1, v2);
+                } else {
+                    FADD_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xC8 ... 0xCF:
+                INST_NAME("FMUL STx, ST0");
+                v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                if (ST_IS_F(0)) {
+                    FMUL_S(v1, v1, v2);
+                } else {
+                    FMUL_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xD0 ... 0xD7:
+                INST_NAME("FCOM ST0, STx"); // yep
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FCOMS(v1, v2, x1, x2, x3);
+                } else {
+                    FCOMD(v1, v2, x1, x2, x3);
+                }
+                break;
+            case 0xD8 ... 0xDF:
+                INST_NAME("FCOMP ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FCOMS(v1, v2, x1, x2, x3);
+                } else {
+                    FCOMD(v1, v2, x1, x2, x3);
+                }
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xE0 ... 0xE7:
+                INST_NAME("FSUBR STx, ST0");
+                v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                if (ST_IS_F(0)) {
+                    FSUB_S(v1, v2, v1);
+                } else {
+                    FSUB_D(v1, v2, v1);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xE8 ... 0xEF:
+                INST_NAME("FSUB STx, ST0");
+                v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                if (ST_IS_F(0)) {
+                    FSUB_S(v1, v1, v2);
+                } else {
+                    FSUB_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xF0 ... 0xF7:
+                INST_NAME("FDIVR STx, ST0");
+                v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                if (ST_IS_F(0)) {
+                    FDIV_S(v1, v2, v1);
+                } else {
+                    FDIV_D(v1, v2, v1);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 0xF8 ... 0xFF:
+                INST_NAME("FDIV STx, ST0");
+                v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                if (ST_IS_F(0)) {
+                    FDIV_S(v1, v1, v2);
+                } else {
+                    FDIV_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            default:
+                DEFAULT;
+                break;
+        }
+    else
+        switch ((nextop >> 3) & 7) {
+            case 0:
+                INST_NAME("FADD ST0, double[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_D(v2, wback, fixedaddress);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                FADD_D(v1, v1, v2);
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 1:
+                INST_NAME("FMUL ST0, double[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_D(v2, wback, fixedaddress);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                FMUL_D(v1, v1, v2);
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 2:
+                INST_NAME("FCOM ST0, double[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_D(v2, wback, fixedaddress);
+                FCOMD(v1, v2, x1, x6, x3);
+                break;
+            case 3:
+                INST_NAME("FCOMP ST0, double[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_D(v2, wback, fixedaddress);
+                FCOMD(v1, v2, x1, x6, x3);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 4:
+                INST_NAME("FSUB ST0, double[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_D(v2, wback, fixedaddress);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                FSUB_D(v1, v1, v2);
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 5:
+                INST_NAME("FSUBR ST0, double[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_D(v2, wback, fixedaddress);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                FSUB_D(v1, v2, v1);
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 6:
+                INST_NAME("FDIV ST0, double[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_D(v2, wback, fixedaddress);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                FDIV_D(v1, v1, v2);
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+            case 7:
+                INST_NAME("FDIVR ST0, double[ED]");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_D(v2, wback, fixedaddress);
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                FDIV_D(v1, v2, v1);
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                break;
+        }
+    return addr;
+}
diff --git a/src/dynarec/la64/dynarec_la64_dd.c b/src/dynarec/la64/dynarec_la64_dd.c
new file mode 100644
index 00000000..ea1101b6
--- /dev/null
+++ b/src/dynarec/la64/dynarec_la64_dd.c
@@ -0,0 +1,192 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "la64_emitter.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "emu/x87emu_private.h"
+#include "dynarec_native.h"
+
+#include "la64_printer.h"
+#include "dynarec_la64_private.h"
+#include "../dynarec_helper.h"
+#include "dynarec_la64_functions.h"
+
+
+uintptr_t dynarec64_DD(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)rep;
+    (void)need_epilog;
+
+    uint8_t nextop = F8;
+    uint8_t ed, wback;
+    int64_t fixedaddress;
+    int unscaled;
+    int v1, v2;
+    int s0;
+    int64_t j64;
+
+    MAYUSE(s0);
+    MAYUSE(v2);
+    MAYUSE(v1);
+    MAYUSE(j64);
+
+    if (MODREG)
+        switch (nextop) {
+            case 0xC0 ... 0xC7:
+                INST_NAME("FFREE STx");
+#if 1
+                if ((nextop & 7) == 0 && PK(0) == 0xD9 && PK(1) == 0xF7) {
+                    MESSAGE(LOG_DUMP, "Hack for FFREE ST0 / FINCSTP\n");
+                    x87_do_pop(dyn, ninst, x1);
+                    addr += 2;
+                    SKIPTEST(x1);
+                } else
+                    x87_free(dyn, ninst, x1, x2, x3, nextop & 7);
+#else
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                x87_purgecache(dyn, ninst, 0, x1, x2, x3);
+                MOV32w(x1, nextop & 7);
+                CALL(fpu_do_free, -1, x1, 0);
+#endif
+                break;
+            case 0xD0 ... 0xD7:
+                INST_NAME("FST ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FMOV_S(v2, v1);
+                } else {
+                    FMOV_D(v2, v1);
+                }
+                break;
+            case 0xD8:
+                INST_NAME("FSTP ST0, ST0");
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xD9 ... 0xDF:
+                INST_NAME("FSTP ST0, STx");
+                // copy the cache value for st0 to stx
+                x87_get_st_empty(dyn, ninst, x1, x2, nextop & 7, X87_ST(nextop & 7));
+                x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+                x87_swapreg(dyn, ninst, x1, x2, 0, nextop & 7);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xE0 ... 0xE7:
+                INST_NAME("FUCOM ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FCOMS(v1, v2, x1, x2, x3);
+                } else {
+                    FCOMD(v1, v2, x1, x2, x3);
+                }
+                break;
+            case 0xE8 ... 0xEF:
+                INST_NAME("FUCOMP ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FCOMS(v1, v2, x1, x2, x3);
+                } else {
+                    FCOMD(v1, v2, x1, x2, x3);
+                }
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            default:
+                DEFAULT;
+                break;
+        }
+    else
+        switch ((nextop >> 3) & 7) {
+            case 0:
+                INST_NAME("FLD double");
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_D);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FLD_D(v1, wback, fixedaddress);
+                break;
+            case 1:
+                INST_NAME("FISTTP i64, ST0");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_I64);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 1, 0);
+                if (ST_IS_I64(0)) {
+                    FST_D(v1, wback, fixedaddress);
+                } else {
+                    if (!BOX64ENV(dynarec_fastround)) {
+                        MOVGR2FCSR(FCSR2, xZR); // reset all bits
+                    }
+                    FTINTRZ_L_D(v2, v1);
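+                    // FISTTP always truncates toward zero, so no x87_setround round-trip is needed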
+                    if (!BOX64ENV(dynarec_fastround)) {
+                        MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
+                        MOV32w(x3, (1 << FR_V));
+                        AND(x5, x5, x3);
+                        BEQZ_MARK(x5);
+                        MOV64x(x4, 0x8000000000000000);
+                        MOVGR2FR_D(v2, x4);
+                        MARK;
+                    }
+                    FST_D(v2, wback, fixedaddress);
+                }
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 2:
+                INST_NAME("FST double");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FST_D(v1, wback, fixedaddress);
+                break;
+            case 3:
+                INST_NAME("FSTP double");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_D);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0);
+                FST_D(v1, wback, fixedaddress);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 4:
+                INST_NAME("FRSTOR m108byte");
+                MESSAGE(LOG_DUMP, "Need Optimization (FRSTOR)\n");
+                fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x4, x6, &fixedaddress, rex, NULL, 0, 0);
+                CALL(const_native_frstor, -1, ed, 0);
+                break;
+            case 6:
+                INST_NAME("FNSAVE m108byte");
+                MESSAGE(LOG_DUMP, "Need Optimization\n");
+                fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x4, x6, &fixedaddress, rex, NULL, 0, 0);
+                CALL(const_native_fsave, -1, ed, 0);
+                NATIVE_RESTORE_X87PC();
+                break;
+            case 7:
+                INST_NAME("FNSTSW m2byte");
+                // fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x4, x6, &fixedaddress, rex, NULL, 0, 0);
+                LD_WU(x2, xEmu, offsetof(x64emu_t, top));
+                LD_HU(x3, xEmu, offsetof(x64emu_t, sw));
+                if (dyn->lsx.x87stack) {
+                    // update top
+                    ADDI_D(x2, x2, -dyn->lsx.x87stack);
+                    ANDI(x2, x2, 7);
+                }
+                MOV32w(x5, ~0x3800);
+                AND(x3, x3, x5);    // mask out TOP
+                SLLI_D(x2, x2, 11); // shift TOP to bit 11
+                OR(x3, x3, x2);     // inject TOP
+                ST_H(x3, xEmu, offsetof(x64emu_t, sw));
+                ST_H(x3, ed, fixedaddress); // store whole sw flags
+                break;
+            default:
+                DEFAULT;
+        }
+    return addr;
+}
diff --git a/src/dynarec/la64/dynarec_la64_de.c b/src/dynarec/la64/dynarec_la64_de.c
new file mode 100644
index 00000000..cb73d76b
--- /dev/null
+++ b/src/dynarec/la64/dynarec_la64_de.c
@@ -0,0 +1,158 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "emu/x87emu_private.h"
+#include "dynarec_native.h"
+
+#include "la64_printer.h"
+#include "dynarec_la64_private.h"
+#include "../dynarec_helper.h"
+#include "dynarec_la64_functions.h"
+
+
+uintptr_t dynarec64_DE(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)rep;
+    (void)need_epilog;
+
+    uint8_t nextop = F8;
+    uint8_t wback;
+    uint8_t u8;
+    int64_t fixedaddress;
+    int v1, v2;
+
+    MAYUSE(v2);
+    MAYUSE(v1);
+
+    if (MODREG)
+        switch (nextop) {
+            case 0xC0 ... 0xC7:
+                INST_NAME("FADDP STx, ST0");
+                v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                if (ST_IS_F(0)) {
+                    FADD_S(v1, v1, v2);
+                } else {
+                    FADD_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xC8 ... 0xCF:
+                INST_NAME("FMULP STx, ST0");
+                v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                if (ST_IS_F(0)) {
+                    FMUL_S(v1, v1, v2);
+                } else {
+                    FMUL_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xD0 ... 0xD7:
+                INST_NAME("FCOMP ST0, STx"); // yep
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FCOMS(v1, v2, x1, x2, x3);
+                } else {
+                    FCOMD(v1, v2, x1, x2, x3);
+                }
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xD9:
+                INST_NAME("FCOMPP ST0, STx");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FCOMS(v1, v2, x1, x2, x3);
+                } else {
+                    FCOMD(v1, v2, x1, x2, x3);
+                }
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xE0 ... 0xE7:
+                INST_NAME("FSUBRP STx, ST0");
+                v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                if (ST_IS_F(0)) {
+                    FSUB_S(v1, v2, v1);
+                } else {
+                    FSUB_D(v1, v2, v1);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xE8 ... 0xEF:
+                INST_NAME("FSUBP STx, ST0");
+                v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                if (ST_IS_F(0)) {
+                    FSUB_S(v1, v1, v2);
+                } else {
+                    FSUB_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xF0 ... 0xF7:
+                INST_NAME("FDIVRP STx, ST0");
+                v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                if (ST_IS_F(0)) {
+                    FDIV_S(v1, v2, v1);
+                } else {
+                    FDIV_D(v1, v2, v1);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 0xF8 ... 0xFF:
+                INST_NAME("FDIVP STx, ST0");
+                v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v1 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (!BOX64ENV(dynarec_fastround)) u8 = x87_setround(dyn, ninst, x1, x5);
+                if (ST_IS_F(0)) {
+                    FDIV_S(v1, v1, v2);
+                } else {
+                    FDIV_D(v1, v1, v2);
+                }
+                X87_CHECK_PRECISION(v1);
+                if (!BOX64ENV(dynarec_fastround)) x87_restoreround(dyn, ninst, u8);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            default:
+                DEFAULT;
+                break;
+        }
+    else
+        switch ((nextop >> 3) & 7) {
+            default:
+                DEFAULT;
+        }
+    return addr;
+}
diff --git a/src/dynarec/la64/dynarec_la64_df.c b/src/dynarec/la64/dynarec_la64_df.c
new file mode 100644
index 00000000..07994fb5
--- /dev/null
+++ b/src/dynarec/la64/dynarec_la64_df.c
@@ -0,0 +1,294 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "la64_emitter.h"
+#include "la64_mapping.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "emu/x87emu_private.h"
+#include "dynarec_native.h"
+
+#include "la64_printer.h"
+#include "dynarec_la64_private.h"
+#include "../dynarec_helper.h"
+#include "dynarec_la64_functions.h"
+
+uintptr_t dynarec64_DF(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)rep;
+    (void)need_epilog;
+
+    uint8_t nextop = F8;
+    uint8_t ed, wback, u8;
+    int v1, v2;
+    int s0;
+    int64_t j64;
+    int64_t fixedaddress;
+
+    MAYUSE(s0);
+    MAYUSE(v2);
+    MAYUSE(v1);
+    MAYUSE(j64);
+
+    if (MODREG)
+        switch (nextop) {
+            case 0xC0 ... 0xC7:
+                INST_NAME("FFREEP STx");
+                // not handling Tag...
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+
+            case 0xE0:
+                INST_NAME("FNSTSW AX");
+                LD_WU(x2, xEmu, offsetof(x64emu_t, top));
+                if (dyn->lsx.x87stack) {
+                    ADDI_D(x2, x2, -dyn->lsx.x87stack);
+                    ANDI(x2, x2, 0x7);
+                }
+                LD_HU(x1, xEmu, offsetof(x64emu_t, sw));
+                MOV32w(x3, 0b1100011111111111); // mask that clears TOP (bits 13:11)
+                AND(x1, x1, x3);
+                SLLI_D(x2, x2, 11);
+                OR(x1, x1, x2); // inject top
+                ST_H(x1, xEmu, offsetof(x64emu_t, sw));
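+                // clear AX, then OR in the freshly built status word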
+                SRLI_D(xRAX, xRAX, 16);
+                SLLI_D(xRAX, xRAX, 16);
+                OR(xRAX, xRAX, x1);
+                break;
+            case 0xE8 ... 0xF7:
+                if (nextop < 0xF0) {
+                    INST_NAME("FUCOMIP ST0, STx");
+                } else {
+                    INST_NAME("FCOMIP ST0, STx");
+                }
+                SETFLAGS(X_ALL, SF_SET, NAT_FLAGS_NOFUSION);
+                SET_DFNONE();
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop & 7));
+                v2 = x87_get_st(dyn, ninst, x1, x2, nextop & 7, X87_COMBINE(0, nextop & 7));
+                if (ST_IS_F(0)) {
+                    FCOMIS(v1, v2, x1, x2);
+                } else {
+                    FCOMID(v1, v2, x1, x2);
+                }
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            default:
+                DEFAULT;
+                break;
+        }
+    else
+        switch ((nextop >> 3) & 7) {
+            case 0:
+                INST_NAME("FILD ST0, Ew");
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_F);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 1, 0);
+                LD_H(x1, wback, fixedaddress);
+                MOVGR2FR_D(v1, x1);
+                if (ST_IS_F(0)) {
+                    FFINT_S_L(v1, v1);
+                } else {
+                    FFINT_D_L(v1, v1);
+                }
+                break;
+            case 1:
+                INST_NAME("FISTTP Ew, ST0");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_F);
+                v2 = fpu_get_scratch(dyn);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 1, 0);
+                if (!BOX64ENV(dynarec_fastround)) {
+                    MOVGR2FCSR(FCSR2, xZR); // reset all bits
+                }
+                if (ST_IS_F(0)) {
+                    FTINTRZ_W_S(v2, v1);
+                    MOVFR2GR_S(x4, v2);
+                } else {
+                    FTINTRZ_W_D(v2, v1);
+                    MOVFR2GR_S(x4, v2);
+                }
+                if (!BOX64ENV(dynarec_fastround)) {
+                    MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
+                    BSTRPICK_D(x5, x5, FR_V, FR_V);
+                    BNEZ_MARK(x5);
+                    SLLI_W(x5, x4, 16);
+                    SRAI_W(x5, x5, 16);
+                    BEQ_MARK2(x5, x4);
+                    MARK;
+                    MOV32w(x4, 0x8000);
+                }
+                MARK2;
+                ST_H(x4, wback, fixedaddress);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 2:
+                INST_NAME("FIST Ew, ST0");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_F);
+                v2 = fpu_get_scratch(dyn);
+                u8 = x87_setround(dyn, ninst, x1, x5);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+                if (!BOX64ENV(dynarec_fastround)) {
+                    MOVGR2FCSR(FCSR2, xZR); // reset all bits
+                }
+                if (ST_IS_F(0)) {
+                    FTINT_W_S(v2, v1);
+                    MOVFR2GR_S(x4, v2);
+                } else {
+                    FTINT_W_D(v2, v1);
+                    MOVFR2GR_S(x4, v2);
+                }
+                x87_restoreround(dyn, ninst, u8);
+                if (!BOX64ENV(dynarec_fastround)) {
+                    MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
+                    BSTRPICK_D(x5, x5, FR_V, FR_V);
+                    BNEZ_MARK(x5);
+                    SLLI_W(x5, x4, 16);
+                    SRAI_W(x5, x5, 16);
+                    BEQ_MARK2(x5, x4);
+                    MARK;
+                    MOV32w(x4, 0x8000);
+                }
+                MARK2;
+                ST_H(x4, wback, fixedaddress);
+                break;
+            case 3:
+                INST_NAME("FISTP Ew, ST0");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_F);
+                v2 = fpu_get_scratch(dyn);
+                u8 = x87_setround(dyn, ninst, x1, x5);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+                if (!BOX64ENV(dynarec_fastround)) {
+                    MOVGR2FCSR(FCSR2, xZR); // reset all bits
+                }
+                if (ST_IS_F(0)) {
+                    FTINT_W_S(v2, v1);
+                    MOVFR2GR_S(x4, v2);
+                } else {
+                    FTINT_W_D(v2, v1);
+                    MOVFR2GR_S(x4, v2);
+                }
+                x87_restoreround(dyn, ninst, u8);
+                if (!BOX64ENV(dynarec_fastround)) {
+                    MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
+                    BSTRPICK_D(x5, x5, FR_V, FR_V);
+                    BNEZ_MARK(x5);
+                    SLLI_W(x5, x4, 16);
+                    SRAI_W(x5, x5, 16);
+                    BEQ_MARK2(x5, x4);
+                    MARK;
+                    MOV32w(x4, 0x8000);
+                }
+                MARK2;
+                ST_H(x4, wback, fixedaddress);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 4:
+                INST_NAME("FBLD ST0, tbytes");
+                X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, x1);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                CALL(const_fpu_fbld, -1, ed, 0);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                break;
+            case 5:
+                INST_NAME("FILD ST0, i64");
+                X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, LSX_CACHE_ST_I64);
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+
+                if (ST_IS_I64(0)) {
+                    FLD_D(v1, wback, fixedaddress);
+                } else {
+                    LD_D(x1, wback, fixedaddress);
+                    if (rex.is32bits) {
+                        // need to also feed the STll stuff...
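+                        // fpu_ll[top] pairs the original i64 ("ll") with its double image ("ref"),
+                        // so a later FISTP can reproduce all 64 bits (see DF /7 below)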
+                        ADDI_D(x4, xEmu, offsetof(x64emu_t, fpu_ll));
+                        LD_WU(x5, xEmu, offsetof(x64emu_t, top));
+                        int a = 0 - dyn->lsx.x87stack;
+                        if (a) {
+                            ADDI_W(x5, x5, a);
+                            ANDI(x5, x5, 0x7);
+                        }
+                        SLLI_D(x5, x5, 4); // fpu_ll is 2 i64
+                        ADD_D(x5, x5, x4);
+                        ST_D(x1, x5, 8); // ll
+                    }
+                    MOVGR2FR_D(v1, x1);
+                    FFINT_D_L(v1, v1);
+                    if (rex.is32bits) {
+                        FST_D(v1, x5, 0); // ref
+                    }
+                }
+                break;
+            case 6:
+                INST_NAME("FBSTP tbytes, ST0");
+                x87_forget(dyn, ninst, x1, x2, 0);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
+                s0 = x87_stackcount(dyn, ninst, x3);
+                CALL(const_fpu_fbst, -1, ed, 0);
+                x87_unstackcount(dyn, ninst, x3, s0);
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            case 7:
+                INST_NAME("FISTP i64, ST0");
+                v1 = x87_get_st(dyn, ninst, x1, x2, 0, LSX_CACHE_ST_I64);
+                v2 = fpu_get_scratch(dyn);
+                if (!ST_IS_I64(0)) {
+                    u8 = x87_setround(dyn, ninst, x1, x7);
+                }
+                addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0);
+
+                if (ST_IS_I64(0)) {
+                    FST_D(v1, wback, fixedaddress);
+                } else {
+                    if (rex.is32bits) {
+                        // need to check STll first...
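+                        // if the cached double still matches the saved ref, store the original
+                        // i64 as-is and skip the lossy double->i64 conversion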
+                        ADDI_D(x4, xEmu, offsetof(x64emu_t, fpu_ll));
+                        LD_WU(x5, xEmu, offsetof(x64emu_t, top));
+                        int a = 0 - dyn->lsx.x87stack;
+                        if (a) {
+                            ADDI_W(x5, x5, a);
+                            ANDI(x5, x5, 0x7);
+                        }
+                        SLLI_D(x5, x5, 4); // fpu_ll is 2 i64
+                        ADD_D(x5, x5, x4);
+                        MOVFR2GR_D(x3, v1);
+                        LD_D(x6, x5, 0); // ref
+                        BNE_MARK(x6, x3);
+                        LD_D(x6, x5, 8); // ll
+                        ST_D(x6, wback, fixedaddress);
+                        B_MARK3_nocond;
+                        MARK;
+                    }
+
+                    if (!BOX64ENV(dynarec_fastround)) {
+                        MOVGR2FCSR(FCSR2, xZR); // reset all bits
+                    }
+                    FTINT_L_D(v2, v1);
+                    if (!BOX64ENV(dynarec_fastround)) {
+                        MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
+                        BSTRPICK_D(x5, x5, FR_V, FR_V);
+                        BEQ_MARK2(x5, xZR);
+                        MOV64x(x4, 0x8000000000000000LL);
+                        MOVGR2FR_D(v2, x4);
+                    }
+                    MARK2;
+                    FST_D(v2, wback, fixedaddress);
+                    MARK3;
+                    x87_restoreround(dyn, ninst, u8);
+                }
+                X87_POP_OR_FAIL(dyn, ninst, x3);
+                break;
+            default:
+                DEFAULT;
+                break;
+        }
+    return addr;
+}
diff --git a/src/dynarec/la64/dynarec_la64_functions.c b/src/dynarec/la64/dynarec_la64_functions.c
index f2674483..a38b2a47 100644
--- a/src/dynarec/la64/dynarec_la64_functions.c
+++ b/src/dynarec/la64/dynarec_la64_functions.c
@@ -29,8 +29,8 @@
 #include "elfloader.h"
 
 #define XMM0 0
-#define X870 16
-#define EMM0 16
+#define X870 (XMM0 + 16)
+#define EMM0 (XMM0 + 16)
 
 // Get a FPU scratch reg
 int fpu_get_scratch(dynarec_la64_t* dyn)
@@ -42,7 +42,18 @@ void fpu_reset_scratch(dynarec_la64_t* dyn)
 {
     dyn->lsx.fpu_scratch = 0;
 }
-
+// Get an x87 double reg
+int fpu_get_reg_x87(dynarec_la64_t* dyn, int t, int n)
+{
+    int i = X870;
+    while (dyn->lsx.fpuused[i])
+        ++i;
+    dyn->lsx.fpuused[i] = 1;
+    dyn->lsx.lsxcache[i].n = n;
+    dyn->lsx.lsxcache[i].t = t;
+    dyn->lsx.news |= (1 << i);
+    return i; // return a Dx
+}
 // Free a FPU double reg
 void fpu_free_reg(dynarec_la64_t* dyn, int reg)
 {
@@ -101,6 +112,220 @@ void fpu_reset_reg(dynarec_la64_t* dyn)
     fpu_reset_reg_lsxcache(&dyn->lsx);
 }
 
+
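+// Refuse the I64 shape: promote STx to double and return the adjusted cache type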
+int lsxcache_no_i64(dynarec_la64_t* dyn, int ninst, int st, int a)
+{
+    if (a == LSX_CACHE_ST_I64) {
+        lsxcache_promote_double(dyn, ninst, st);
+        return LSX_CACHE_ST_D;
+    }
+    return a;
+}
+
+int lsxcache_get_st(dynarec_la64_t* dyn, int ninst, int a)
+{
+    if (dyn->insts[ninst].lsx.swapped) {
+        if (dyn->insts[ninst].lsx.combined1 == a)
+            a = dyn->insts[ninst].lsx.combined2;
+        else if (dyn->insts[ninst].lsx.combined2 == a)
+            a = dyn->insts[ninst].lsx.combined1;
+    }
+    for (int i = 0; i < 24; ++i)
+        if ((dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_F
+                || dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_D
+                || dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_I64)
+            && dyn->insts[ninst].lsx.lsxcache[i].n == a)
+            return dyn->insts[ninst].lsx.lsxcache[i].t;
+    // not in the cache yet, so will be fetched...
+    return LSX_CACHE_ST_D;
+}
+
+int lsxcache_get_current_st(dynarec_la64_t* dyn, int ninst, int a)
+{
+    (void)ninst;
+    if (!dyn->insts)
+        return LSX_CACHE_ST_D;
+    for (int i = 0; i < 24; ++i)
+        if ((dyn->lsx.lsxcache[i].t == LSX_CACHE_ST_F
+                || dyn->lsx.lsxcache[i].t == LSX_CACHE_ST_D
+                || dyn->lsx.lsxcache[i].t == LSX_CACHE_ST_I64)
+            && dyn->lsx.lsxcache[i].n == a)
+            return dyn->lsx.lsxcache[i].t;
+    // not in the cache yet, so will be fetched...
+    return LSX_CACHE_ST_D;
+}
+
+int lsxcache_get_st_f(dynarec_la64_t* dyn, int ninst, int a)
+{
+    for (int i = 0; i < 24; ++i)
+        if (dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_F
+            && dyn->insts[ninst].lsx.lsxcache[i].n == a)
+            return i;
+    return -1;
+}
+
+int lsxcache_get_st_f_i64(dynarec_la64_t* dyn, int ninst, int a)
+{
+    for (int i = 0; i < 24; ++i)
+        if ((dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_I64 || dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_F)
+            && dyn->insts[ninst].lsx.lsxcache[i].n == a)
+            return i;
+    return -1;
+}
+
+int lsxcache_get_st_f_noback(dynarec_la64_t* dyn, int ninst, int a)
+{
+    for (int i = 0; i < 24; ++i)
+        if (dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_F
+            && dyn->insts[ninst].lsx.lsxcache[i].n == a)
+            return i;
+    return -1;
+}
+
+int lsxcache_get_st_f_i64_noback(dynarec_la64_t* dyn, int ninst, int a)
+{
+    for (int i = 0; i < 24; ++i)
+        if ((dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_I64 || dyn->insts[ninst].lsx.lsxcache[i].t == LSX_CACHE_ST_F)
+            && dyn->insts[ninst].lsx.lsxcache[i].n == a)
+            return i;
+    return -1;
+}
+
+int lsxcache_get_current_st_f(dynarec_la64_t* dyn, int a)
+{
+    for (int i = 0; i < 24; ++i)
+        if (dyn->lsx.lsxcache[i].t == LSX_CACHE_ST_F
+            && dyn->lsx.lsxcache[i].n == a)
+            return i;
+    return -1;
+}
+
+int lsxcache_get_current_st_f_i64(dynarec_la64_t* dyn, int a)
+{
+    for (int i = 0; i < 24; ++i)
+        if ((dyn->lsx.lsxcache[i].t == LSX_CACHE_ST_I64 || dyn->lsx.lsxcache[i].t == LSX_CACHE_ST_F)
+            && dyn->lsx.lsxcache[i].n == a)
+            return i;
+    return -1;
+}
+
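+// Promotion: once an ST slot is known to need double precision, rewrite its cached
+// type in every instruction that touches it, walking backward and forward and
+// following swapped/combined pairs so all passes agree on the slot's type.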
+static void lsxcache_promote_double_forward(dynarec_la64_t* dyn, int ninst, int maxinst, int a);
+static void lsxcache_promote_double_internal(dynarec_la64_t* dyn, int ninst, int maxinst, int a);
+static void lsxcache_promote_double_combined(dynarec_la64_t* dyn, int ninst, int maxinst, int a)
+{
+    if (a == dyn->insts[ninst].lsx.combined1 || a == dyn->insts[ninst].lsx.combined2) {
+        if (a == dyn->insts[ninst].lsx.combined1) {
+            a = dyn->insts[ninst].lsx.combined2;
+        } else
+            a = dyn->insts[ninst].lsx.combined1;
+        int i = lsxcache_get_st_f_i64_noback(dyn, ninst, a);
+        if (i >= 0) {
+            dyn->insts[ninst].lsx.lsxcache[i].t = LSX_CACHE_ST_D;
+            if (dyn->insts[ninst].x87precision) dyn->need_x87check = 2;
+            if (!dyn->insts[ninst].lsx.barrier)
+                lsxcache_promote_double_internal(dyn, ninst - 1, maxinst, a - dyn->insts[ninst].lsx.stack_push);
+            // go forward is combined is not pop'd
+            if (a - dyn->insts[ninst].lsx.stack_pop >= 0)
+                if (!dyn->insts[ninst + 1].lsx.barrier)
+                    lsxcache_promote_double_forward(dyn, ninst + 1, maxinst, a - dyn->insts[ninst].lsx.stack_pop);
+        }
+    }
+}
+static void lsxcache_promote_double_internal(dynarec_la64_t* dyn, int ninst, int maxinst, int a)
+{
+    if (dyn->insts[ninst + 1].lsx.barrier)
+        return;
+    while (ninst >= 0) {
+        a += dyn->insts[ninst].lsx.stack_pop; // adjust Stack depth: add pop'd ST (going backward)
+        int i = lsxcache_get_st_f_i64(dyn, ninst, a);
+        if (i < 0) return;
+        dyn->insts[ninst].lsx.lsxcache[i].t = LSX_CACHE_ST_D;
+        if (dyn->insts[ninst].x87precision) dyn->need_x87check = 2;
+        // check combined propagation too
+        if (dyn->insts[ninst].lsx.combined1 || dyn->insts[ninst].lsx.combined2) {
+            if (dyn->insts[ninst].lsx.swapped) {
+                // if(dyn->need_dump) dynarec_log(LOG_NONE, "lsxcache_promote_double_internal, ninst=%d swapped %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack);
+                if (a == dyn->insts[ninst].lsx.combined1)
+                    a = dyn->insts[ninst].lsx.combined2;
+                else if (a == dyn->insts[ninst].lsx.combined2)
+                    a = dyn->insts[ninst].lsx.combined1;
+            } else {
+                lsxcache_promote_double_combined(dyn, ninst, maxinst, a);
+            }
+        }
+        a -= dyn->insts[ninst].lsx.stack_push; // adjust Stack depth: remove push'd ST (going backward)
+        --ninst;
+        if (ninst < 0 || a < 0 || dyn->insts[ninst].lsx.barrier)
+            return;
+    }
+}
+
+static void lsxcache_promote_double_forward(dynarec_la64_t* dyn, int ninst, int maxinst, int a)
+{
+    while ((ninst != -1) && (ninst < maxinst) && (a >= 0)) {
+        a += dyn->insts[ninst].lsx.stack_push; // adjust Stack depth: add push'd ST (going forward)
+        if ((dyn->insts[ninst].lsx.combined1 || dyn->insts[ninst].lsx.combined2) && dyn->insts[ninst].lsx.swapped) {
+            // if(dyn->need_dump) dynarec_log(LOG_NONE, "lsxcache_promote_double_forward, ninst=%d swapped %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack);
+            if (a == dyn->insts[ninst].lsx.combined1)
+                a = dyn->insts[ninst].lsx.combined2;
+            else if (a == dyn->insts[ninst].lsx.combined2)
+                a = dyn->insts[ninst].lsx.combined1;
+        }
+        int i = lsxcache_get_st_f_i64_noback(dyn, ninst, a);
+        if (i < 0) return;
+        dyn->insts[ninst].lsx.lsxcache[i].t = LSX_CACHE_ST_D;
+        if (dyn->insts[ninst].x87precision) dyn->need_x87check = 2;
+        // check combined propagation too
+        if ((dyn->insts[ninst].lsx.combined1 || dyn->insts[ninst].lsx.combined2) && !dyn->insts[ninst].lsx.swapped) {
+            // if(dyn->need_dump) dynarec_log(LOG_NONE, "lsxcache_promote_double_forward, ninst=%d combined %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].e.combined1 ,dyn->insts[ninst].e.combined2, a, dyn->insts[ninst].e.stack);
+            lsxcache_promote_double_combined(dyn, ninst, maxinst, a);
+        }
+        a -= dyn->insts[ninst].lsx.stack_pop; // adjust Stack depth: remove pop'd ST (going forward)
+        if (dyn->insts[ninst].x64.has_next && !dyn->insts[ninst].lsx.barrier)
+            ++ninst;
+        else
+            ninst = -1;
+    }
+    if (ninst == maxinst)
+        lsxcache_promote_double(dyn, ninst, a);
+}
+
+void lsxcache_promote_double(dynarec_la64_t* dyn, int ninst, int a)
+{
+    int i = lsxcache_get_current_st_f_i64(dyn, a);
+    if (i < 0) return;
+    dyn->lsx.lsxcache[i].t = LSX_CACHE_ST_D;
+    dyn->insts[ninst].lsx.lsxcache[i].t = LSX_CACHE_ST_D;
+    if (dyn->insts[ninst].x87precision) dyn->need_x87check = 2;
+    // check combined propagation too
+    if (dyn->lsx.combined1 || dyn->lsx.combined2) {
+        if (dyn->lsx.swapped) {
+            if (dyn->lsx.combined1 == a)
+                a = dyn->lsx.combined2;
+            else if (dyn->lsx.combined2 == a)
+                a = dyn->lsx.combined1;
+        } else {
+            if (dyn->lsx.combined1 == a)
+                lsxcache_promote_double(dyn, ninst, dyn->lsx.combined2);
+            else if (dyn->lsx.combined2 == a)
+                lsxcache_promote_double(dyn, ninst, dyn->lsx.combined1);
+        }
+    }
+    a -= dyn->insts[ninst].lsx.stack_push; // adjust Stack depth: remove push'd ST (going backward)
+    if (!ninst || a < 0) return;
+    lsxcache_promote_double_internal(dyn, ninst - 1, ninst, a);
+}
+
+int lsxcache_combine_st(dynarec_la64_t* dyn, int ninst, int a, int b)
+{
+    dyn->lsx.combined1 = a;
+    dyn->lsx.combined2 = b;
+    if (lsxcache_get_current_st(dyn, ninst, a) == LSX_CACHE_ST_F
+        && lsxcache_get_current_st(dyn, ninst, b) == LSX_CACHE_ST_F)
+        return LSX_CACHE_ST_F;
+    return LSX_CACHE_ST_D;
+}
+
 static int isCacheEmpty(dynarec_native_t* dyn, int ninst)
 {
     if (dyn->insts[ninst].lsx.stack_next) {
@@ -549,6 +774,12 @@ void fpu_reset(dynarec_la64_t* dyn)
     fpu_reset_reg(dyn);
 }
 
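+// dyn->lsx.tags holds two tag bits per ST slot; x87_free sets both (0b11) to mark the slot freed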
+int fpu_is_st_freed(dynarec_la64_t* dyn, int ninst, int st)
+{
+    return (dyn->lsx.tags & (0b11 << (st * 2))) ? 1 : 0;
+}
+
+
 void fpu_reset_ninst(dynarec_la64_t* dyn, int ninst)
 {
     // TODO: x87 and mmx
diff --git a/src/dynarec/la64/dynarec_la64_functions.h b/src/dynarec/la64/dynarec_la64_functions.h
index 4b96b497..18c8b0d3 100644
--- a/src/dynarec/la64/dynarec_la64_functions.h
+++ b/src/dynarec/la64/dynarec_la64_functions.h
@@ -12,6 +12,8 @@ typedef struct dynarec_la64_s dynarec_la64_t;
 int fpu_get_scratch(dynarec_la64_t* dyn);
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_la64_t* dyn);
+// Get an x87 double reg
+int fpu_get_reg_x87(dynarec_la64_t* dyn, int t, int n);
 // Get an XMM quad reg
 int fpu_get_reg_xmm(dynarec_la64_t* dyn, int t, int xmm);
 // Get an YMM quad reg
@@ -23,6 +25,25 @@ void fpu_reset_reg(dynarec_la64_t* dyn);
 // Get an MMX double reg
 int fpu_get_reg_emm(dynarec_la64_t* dyn, int emm);
 
+// Get type for STx
+int lsxcache_get_st(dynarec_la64_t* dyn, int ninst, int a);
+// Get cache index if STx is cached as FLOAT (else -1), from per-instruction pass data
+int lsxcache_get_st_f(dynarec_la64_t* dyn, int ninst, int a);
+// Get cache index if STx is cached as FLOAT or I64 (else -1), from per-instruction pass data
+int lsxcache_get_st_f_i64(dynarec_la64_t* dyn, int ninst, int a);
+// Get actual type for STx
+int lsxcache_get_current_st(dynarec_la64_t* dyn, int ninst, int a);
+// Get cache index if STx is currently cached as FLOAT (else -1)
+int lsxcache_get_current_st_f(dynarec_la64_t* dyn, int a);
+// Get cache index if STx is currently cached as FLOAT or I64 (else -1)
+int lsxcache_get_current_st_f_i64(dynarec_la64_t* dyn, int a);
+// Back-propagate a change float->double
+void lsxcache_promote_double(dynarec_la64_t* dyn, int ninst, int a);
+// Combine and propagate if needed (pass 1 only)
+int lsxcache_combine_st(dynarec_la64_t* dyn, int ninst, int a, int b); // with stack current dyn->n_stack*
+// Do not allow i64 type
+int lsxcache_no_i64(dynarec_la64_t* dyn, int ninst, int st, int a);
+
 // FPU Cache transformation (for loops) // Specific, need to be written by backend
 int fpuCacheNeedsTransform(dynarec_la64_t* dyn, int ninst);
 
@@ -39,6 +60,8 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode);
 // reset the cache
 void fpu_reset(dynarec_native_t* dyn);
 void fpu_reset_ninst(dynarec_native_t* dyn, int ninst);
+// is st freed
+int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st);
 
 void updateNativeFlags(dynarec_la64_t* dyn);
 void get_free_scratch(dynarec_la64_t* dyn, int ninst, uint8_t* tmp1, uint8_t* tmp2, uint8_t* tmp3, uint8_t s1, uint8_t s2, uint8_t s3, uint8_t s4, uint8_t s5);
diff --git a/src/dynarec/la64/dynarec_la64_helper.c b/src/dynarec/la64/dynarec_la64_helper.c
index d283c09f..5a774e46 100644
--- a/src/dynarec/la64/dynarec_la64_helper.c
+++ b/src/dynarec/la64/dynarec_la64_helper.c
@@ -695,7 +695,7 @@ void call_c(dynarec_la64_t* dyn, int ninst, la64_consts_t fnc, int reg, int ret,
 {
     MAYUSE(fnc);
     if (savereg == 0)
-        savereg = x6;
+        savereg = x87pc;
     if (saveflags) {
         RESTORE_EFLAGS(reg);
         ST_D(xFlags, xEmu, offsetof(x64emu_t, eflags));
@@ -753,6 +753,8 @@ void call_c(dynarec_la64_t* dyn, int ninst, la64_consts_t fnc, int reg, int ret,
         LD_D(xFlags, xEmu, offsetof(x64emu_t, eflags));
         SPILL_EFLAGS();
     }
+    if (savereg != x87pc && dyn->need_x87check)
+        NATIVE_RESTORE_X87PC();
     // SET_NODF();
     dyn->last_ip = 0;
 }
@@ -783,10 +785,240 @@ void grab_segdata(dynarec_la64_t* dyn, uintptr_t addr, int ninst, int reg, int s
     MESSAGE(LOG_DUMP, "----%s Offset\n", (segment == _FS) ? "FS" : "GS");
 }
 
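+// Flush the virtual x87 stack counter into the emu (fpu_stack/top), e.g. before
+// calling a helper; returns the old count so x87_unstackcount can rebuild it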
+int x87_stackcount(dynarec_la64_t* dyn, int ninst, int scratch)
+{
+    MAYUSE(scratch);
+    if (!dyn->lsx.x87stack)
+        return 0;
+    if (dyn->lsx.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, scratch);
+    MESSAGE(LOG_DUMP, "\tSynch x87 Stackcount (%d)\n", dyn->lsx.x87stack);
+    int a = dyn->lsx.x87stack;
+    // Add x87stack to emu fpu_stack
+    LD_W(scratch, xEmu, offsetof(x64emu_t, fpu_stack));
+    ADDI_D(scratch, scratch, a);
+    ST_W(scratch, xEmu, offsetof(x64emu_t, fpu_stack));
+    // Sub x87stack from top, AND with 7
+    LD_W(scratch, xEmu, offsetof(x64emu_t, top));
+    ADDI_D(scratch, scratch, -a);
+    ANDI(scratch, scratch, 7);
+    ST_W(scratch, xEmu, offsetof(x64emu_t, top));
+    // reset x87stack, but not the stack count of lsxcache
+    dyn->lsx.x87stack = 0;
+    dyn->lsx.stack_next -= dyn->lsx.stack;
+    int ret = dyn->lsx.stack;
+    dyn->lsx.stack = 0;
+    MESSAGE(LOG_DUMP, "\t------x87 Stackcount\n");
+    return ret;
+}
+void x87_unstackcount(dynarec_la64_t* dyn, int ninst, int scratch, int count)
+{
+    MAYUSE(scratch);
+    if (!count)
+        return;
+    if (dyn->lsx.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, scratch);
+    MESSAGE(LOG_DUMP, "\tSynch x87 Unstackcount (%d)\n", count);
+    int a = -count;
+    // Add a (= -count) to emu fpu_stack
+    LD_W(scratch, xEmu, offsetof(x64emu_t, fpu_stack));
+    ADDI_D(scratch, scratch, a);
+    ST_W(scratch, xEmu, offsetof(x64emu_t, fpu_stack));
+    // Sub a from top, AND with 7
+    LD_W(scratch, xEmu, offsetof(x64emu_t, top));
+    ADDI_D(scratch, scratch, -a);
+    ANDI(scratch, scratch, 7);
+    ST_W(scratch, xEmu, offsetof(x64emu_t, top));
+    // restore x87stack and stack, but not the stack count of lsxcache
+    dyn->lsx.x87stack = count;
+    dyn->lsx.stack = count;
+    dyn->lsx.stack_next += dyn->lsx.stack;
+    MESSAGE(LOG_DUMP, "\t------x87 Unstackcount\n");
+}
 
 void x87_forget(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st)
 {
-    // TODO
+    int ret = -1;
+    for (int i = 0; (i < 8) && (ret == -1); ++i)
+        if (dyn->lsx.x87cache[i] == st)
+            ret = i;
+    if (ret == -1) // nothing to do
+        return;
+    MESSAGE(LOG_DUMP, "\tForget x87 Cache for ST%d\n", st);
+    const int reg = dyn->lsx.x87reg[ret];
+#if STEP == 1
+    if (dyn->lsx.lsxcache[dyn->lsx.x87reg[ret]].t == LSX_CACHE_ST_F
+        || dyn->lsx.lsxcache[dyn->lsx.x87reg[ret]].t == LSX_CACHE_ST_I64)
+        lsxcache_promote_double(dyn, ninst, st);
+#endif
+    // prepare offset to fpu => s1
+    // Get top
+    LD_W(s2, xEmu, offsetof(x64emu_t, top));
+    // Update
+    int a = st - dyn->lsx.x87stack;
+    if (a) {
+        ADDI_D(s2, s2, a);
+        ANDI(s2, s2, 7); // (emu->top + i)&7
+    }
+    SLLI_D(s2, s2, 3);
+    ADD_D(s1, xEmu, s2);
+    if (dyn->lsx.lsxcache[reg].t == LSX_CACHE_ST_F) {
+        FCVT_D_S(SCRATCH0, reg);
+        FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87));
+    } else if (dyn->lsx.lsxcache[reg].t == LSX_CACHE_ST_I64) {
+        FFINT_D_L(SCRATCH0, reg);
+        FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87));
+    } else {
+        FST_D(reg, s1, offsetof(x64emu_t, x87));
+    }
+    MESSAGE(LOG_DUMP, "\t--------x87 Cache for ST%d\n", st);
+    // and forget that cache
+    fpu_free_reg(dyn, dyn->lsx.x87reg[ret]);
+    dyn->lsx.lsxcache[reg].v = 0;
+    dyn->lsx.x87cache[ret] = -1;
+    dyn->lsx.x87reg[ret] = -1;
+}
+
+
+void x87_reget_st(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st)
+{
+    if (dyn->lsx.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, s1);
+    // search in cache first
+    for (int i = 0; i < 8; ++i)
+        if (dyn->lsx.x87cache[i] == st) {
+            // refresh the value
+            MESSAGE(LOG_DUMP, "\tRefresh x87 Cache for ST%d\n", st);
+#if STEP == 1
+            if (dyn->lsx.lsxcache[dyn->lsx.x87reg[i]].t == LSX_CACHE_ST_F
+                || dyn->lsx.lsxcache[dyn->lsx.x87reg[i]].t == LSX_CACHE_ST_I64)
+                lsxcache_promote_double(dyn, ninst, st);
+#endif
+            LD_W(s2, xEmu, offsetof(x64emu_t, top));
+            int a = st - dyn->lsx.x87stack;
+            if (a) {
+                ADDI_D(s2, s2, a);
+                ANDI(s2, s2, 7); // (emu->top + i)&7
+            }
+            SLLI_D(s2, s2, 3);
+            ADD_D(s1, xEmu, s2);
+            FLD_D(dyn->lsx.x87reg[i], s1, offsetof(x64emu_t, x87));
+            MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st);
+            // ok
+            return;
+        }
+    // Was not in the cache? creating it....
+    MESSAGE(LOG_DUMP, "\tCreate x87 Cache for ST%d\n", st);
+    // get a free spot
+    int ret = -1;
+    for (int i = 0; (i < 8) && (ret == -1); ++i)
+        if (dyn->lsx.x87cache[i] == -1)
+            ret = i;
+    // found, setup and grab the value
+    dyn->lsx.x87cache[ret] = st;
+    dyn->lsx.x87reg[ret] = fpu_get_reg_x87(dyn, LSX_CACHE_ST_D, st);
+    LD_W(s2, xEmu, offsetof(x64emu_t, top));
+    int a = st - dyn->lsx.x87stack;
+    ADDI_D(s2, s2, a);
+    ANDI(s2, s2, 7); // (emu->top + i)&7
+    SLLI_D(s2, s2, 3);
+    ADD_D(s1, xEmu, s2);
+    FLD_D(dyn->lsx.x87reg[ret], s1, offsetof(x64emu_t, x87));
+    MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st);
+}
+
+void x87_free(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int st)
+{
+    int ret = -1;
+    for (int i = 0; (i < 8) && (ret == -1); ++i)
+        if (dyn->lsx.x87cache[i] == st)
+            ret = i;
+    MESSAGE(LOG_DUMP, "\tFFREE%s x87 Cache for ST%d\n", (ret != -1) ? " (and Forget)" : "", st);
+    if (ret != -1) {
+        const int reg = dyn->lsx.x87reg[ret];
+#if STEP == 1
+        if (dyn->lsx.lsxcache[reg].t == LSX_CACHE_ST_F || dyn->lsx.lsxcache[reg].t == LSX_CACHE_ST_I64)
+            lsxcache_promote_double(dyn, ninst, st);
+#endif
+        // Get top
+        LD_W(s2, xEmu, offsetof(x64emu_t, top));
+        // Update
+        int ast = st - dyn->lsx.x87stack;
+        if (ast) {
+            ADDI_D(s2, s2, ast);
+            ANDI(s2, s2, 7); // (emu->top + i)&7
+        }
+        SLLI_D(s2, s2, 3);
+        ADD_D(s1, xEmu, s2);
+        if (dyn->lsx.lsxcache[reg].t == LSX_CACHE_ST_F) {
+            FCVT_D_S(SCRATCH0, reg);
+            FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87));
+        } else if (dyn->lsx.lsxcache[reg].t == LSX_CACHE_ST_I64) {
+            FFINT_D_L(SCRATCH0, reg);
+            FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87));
+        } else {
+            FST_D(reg, s1, offsetof(x64emu_t, x87));
+        }
+        // and forget that cache
+        fpu_free_reg(dyn, reg);
+        dyn->lsx.lsxcache[reg].v = 0;
+        dyn->lsx.x87cache[ret] = -1;
+        dyn->lsx.x87reg[ret] = -1;
+    } else {
+        // Get top
+        LD_W(s2, xEmu, offsetof(x64emu_t, top));
+        // Update
+        int ast = st - dyn->lsx.x87stack;
+        if (ast) {
+            ADDI_D(s2, s2, ast);
+            ANDI(s2, s2, 7); // (emu->top + i)&7
+        }
+    }
+    // add mark in the freed array
+    dyn->lsx.tags |= 0b11 << (st * 2);
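+    // 0b11 marks the tag pair for st as empty; it is merged into emu->fpu_tags at purge time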
+    MESSAGE(LOG_DUMP, "\t--------x87 FFREE for ST%d\n", st);
+}
+
+void x87_swapreg(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a, int b)
+{
+    int i1, i2, i3;
+    i1 = x87_get_cache(dyn, ninst, 1, s1, s2, b, X87_ST(b));
+    i2 = x87_get_cache(dyn, ninst, 1, s1, s2, a, X87_ST(a));
+    i3 = dyn->lsx.x87cache[i1];
+    dyn->lsx.x87cache[i1] = dyn->lsx.x87cache[i2];
+    dyn->lsx.x87cache[i2] = i3;
+    // swap those too
+    int j1, j2, j3;
+    j1 = x87_get_lsxcache(dyn, ninst, s1, s2, b);
+    j2 = x87_get_lsxcache(dyn, ninst, s1, s2, a);
+    j3 = dyn->lsx.lsxcache[j1].n;
+    dyn->lsx.lsxcache[j1].n = dyn->lsx.lsxcache[j2].n;
+    dyn->lsx.lsxcache[j2].n = j3;
+    // mark as swapped
+    dyn->lsx.swapped = 1;
+    dyn->lsx.combined1 = a;
+    dyn->lsx.combined2 = b;
+}
+
+// Set rounding according to cw flags, return reg to restore flags
+int x87_setround(dynarec_la64_t* dyn, int ninst, int s1, int s2)
+{
+    MAYUSE(dyn);
+    MAYUSE(ninst);
+    MAYUSE(s1);
+    MAYUSE(s2);
+    LD_W(s1, xEmu, offsetof(x64emu_t, cw));
+    BSTRPICK_W(s1, s1, 11, 10);
+    // MMX/x87 rounding mode: 0..3: Nearest, Down, Up, Chop
+    // LA64 FCSR RM:          0..3: Nearest, Toward Zero, Toward +Inf, Toward -Inf
+    // mapping: 0->0, 1->3, 2->2, 3->1
+    SUB_W(s1, xZR, s1);
+    ANDI(s1, s1, 3);
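+    // negating modulo 4 ((0 - rc) & 3) realizes exactly that permutation without a lookup table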
+    // done
+    SLLI_D(s1, s1, 8);
+    MOVFCSR2GR(s2, FCSR3);
+    MOVGR2FCSR(FCSR3, s1); // exchange RM with current
+    return s2;
 }
 
 // Set rounding according to mxcsr flags, return reg to restore flags
@@ -810,10 +1042,394 @@ int sse_setround(dynarec_la64_t* dyn, int ninst, int s1, int s2)
     return s2;
 }
 
+int lsxcache_st_coherency(dynarec_la64_t* dyn, int ninst, int a, int b)
+{
+    int i1 = lsxcache_get_st(dyn, ninst, a);
+    int i2 = lsxcache_get_st(dyn, ninst, b);
+    if (i1 != i2) {
+        MESSAGE(LOG_DUMP, "Warning, ST cache incoherent between ST%d(%d) and ST%d(%d)\n", a, i1, b, i2);
+    }
+
+    return i1;
+}
+
+// On step 1, Float/Double for ST is actually computed and back-propagated
+// On steps 2-3, the value is just read from inst[...].lsx.lsxcache[..]
+int x87_do_push(dynarec_la64_t* dyn, int ninst, int s1, int t)
+{
+    if (dyn->lsx.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, s1);
+    dyn->lsx.x87stack += 1;
+    dyn->lsx.stack += 1;
+    dyn->lsx.stack_next += 1;
+    dyn->lsx.stack_push += 1;
+    ++dyn->lsx.pushed;
+    if (dyn->lsx.poped)
+        --dyn->lsx.poped;
+    // move all regs in cache, and find a free one
+    for (int j = 0; j < 24; ++j)
+        if ((dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_D)
+            || (dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_F)
+            || (dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_I64))
+            ++dyn->lsx.lsxcache[j].n;
+    int ret = -1;
+    dyn->lsx.tags <<= 2;
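+    // tags hold 2 bits per ST slot (x87 tag encoding, 0b11 = empty); a push shifts in 0b00 (valid) at ST0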
+    for (int i = 0; i < 8; ++i)
+        if (dyn->lsx.x87cache[i] != -1)
+            ++dyn->lsx.x87cache[i];
+        else if (ret == -1) {
+            dyn->lsx.x87cache[i] = 0;
+            ret = dyn->lsx.x87reg[i] = fpu_get_reg_x87(dyn, t, 0);
+            dyn->lsx.lsxcache[ret].t = X87_ST0;
+        }
+    if (ret == -1) {
+        MESSAGE(LOG_DUMP, "Incoherent x87 stack cache, aborting\n");
+        dyn->abort = 1;
+    }
+    return ret;
+}
+void x87_do_push_empty(dynarec_la64_t* dyn, int ninst, int s1)
+{
+    if (dyn->lsx.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, s1);
+    dyn->lsx.x87stack += 1;
+    dyn->lsx.stack += 1;
+    dyn->lsx.stack_next += 1;
+    dyn->lsx.stack_push += 1;
+    ++dyn->lsx.pushed;
+    if (dyn->lsx.poped)
+        --dyn->lsx.poped;
+    // move all regs in cache
+    for (int j = 0; j < 24; ++j)
+        if ((dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_D)
+            || (dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_F)
+            || (dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_I64))
+            ++dyn->lsx.lsxcache[j].n;
+    int ret = -1;
+    dyn->lsx.tags <<= 2;
+    for (int i = 0; i < 8; ++i)
+        if (dyn->lsx.x87cache[i] != -1)
+            ++dyn->lsx.x87cache[i];
+        else if (ret == -1)
+            ret = i;
+    if (ret == -1) {
+        MESSAGE(LOG_DUMP, "Incoherent x87 stack cache, aborting\n");
+        dyn->abort = 1;
+    }
+}
+static void internal_x87_dopop(dynarec_la64_t* dyn)
+{
+    for (int i = 0; i < 8; ++i)
+        if (dyn->lsx.x87cache[i] != -1) {
+            --dyn->lsx.x87cache[i];
+            if (dyn->lsx.x87cache[i] == -1) {
+                fpu_free_reg(dyn, dyn->lsx.x87reg[i]);
+                dyn->lsx.x87reg[i] = -1;
+            }
+        }
+}
+static int internal_x87_dofree(dynarec_la64_t* dyn)
+{
+    if (dyn->lsx.tags & 0b11) {
+        MESSAGE(LOG_DUMP, "\t--------x87 FREED ST0, popping 1 more\n");
+        return 1;
+    }
+    return 0;
+}
+void x87_do_pop(dynarec_la64_t* dyn, int ninst, int s1)
+{
+    if (dyn->lsx.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, s1);
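+    // pop once, then keep popping while the slot exposed at ST0 was marked freed by FFREE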
+    do {
+        dyn->lsx.x87stack -= 1;
+        dyn->lsx.stack_next -= 1;
+        dyn->lsx.stack_pop += 1;
+        dyn->lsx.tags >>= 2;
+        ++dyn->lsx.poped;
+        if (dyn->lsx.pushed)
+            --dyn->lsx.pushed;
+        // move all regs in cache, popping ST0
+        internal_x87_dopop(dyn);
+    } while (internal_x87_dofree(dyn));
+}
+
 
 void x87_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3)
 {
-    // TODO
+    int ret = 0;
+    for (int i = 0; i < 8 && !ret; ++i)
+        if (dyn->lsx.x87cache[i] != -1)
+            ret = 1;
+    if (!ret && !dyn->lsx.x87stack) // nothing to do
+        return;
+    MESSAGE(LOG_DUMP, "\tPurge %sx87 Cache and Synch Stackcount (%+d)---\n", next ? "locally " : "", dyn->lsx.x87stack);
+    int a = dyn->lsx.x87stack;
+    if (a != 0) {
+        // reset x87stack
+        if (!next)
+            dyn->lsx.x87stack = 0;
+        // Add x87stack to emu fpu_stack
+        LD_W(s2, xEmu, offsetof(x64emu_t, fpu_stack));
+        ADDI_D(s2, s2, a);
+        ST_W(s2, xEmu, offsetof(x64emu_t, fpu_stack));
+        // Subtract x87stack from top, masked with 7
+        LD_W(s2, xEmu, offsetof(x64emu_t, top));
+        ADDI_D(s2, s2, -a);
+        ANDI(s2, s2, 7);
+        ST_W(s2, xEmu, offsetof(x64emu_t, top));
+        // update tags
+        LD_HU(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+        if (a > 0) {
+            SLLI_D(s1, s1, a * 2); // net pushes: shift in 0b00 (valid) at ST0
+        } else {
+            MOV32w(s3, 0xffff0000); // net pops: shift in 0b11 (empty) from above
+            OR(s1, s1, s3);
+            SRLI_D(s1, s1, -a * 2);
+        }
+        ST_H(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+    } else {
+        LD_W(s2, xEmu, offsetof(x64emu_t, top));
+    }
+    // merge the FFREE marks into emu->fpu_tags
+    if (dyn->lsx.tags) {
+        LD_H(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+        MOV32w(s3, dyn->lsx.tags);
+        OR(s1, s1, s3);
+        ST_H(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+    }
+    if (ret != 0) {
+        // --- set values
+        // Get top
+        // loop all cache entries
+        for (int i = 0; i < 8; ++i)
+            if (dyn->lsx.x87cache[i] != -1) {
+                int st = dyn->lsx.x87cache[i] + dyn->lsx.stack_pop;
+#if STEP == 1
+                if (!next) { // don't force promotion here
+                    // pre-apply pop, because purge happens in-between
+                    lsxcache_promote_double(dyn, ninst, st);
+                }
+#endif
+#if STEP == 3
+                if (!next && lsxcache_get_current_st(dyn, ninst, st) != LSX_CACHE_ST_D) {
+                    MESSAGE(LOG_DUMP, "Warning, incoherency with purged ST%d cache\n", st);
+                }
+#endif
+                ADDI_D(s3, s2, dyn->lsx.x87cache[i]); // unadjusted count, as it's relative to real top
+                ANDI(s3, s3, 7);                      // (emu->top + st)&7
+                SLLI_D(s1, s3, 3);
+                ADD_D(s1, xEmu, s1);
+                switch (lsxcache_get_current_st(dyn, ninst, st)) {
+                    case LSX_CACHE_ST_D:
+                        FST_D(dyn->lsx.x87reg[i], s1, offsetof(x64emu_t, x87)); // save the value
+                        break;
+                    case LSX_CACHE_ST_F:
+                        FCVT_D_S(SCRATCH0, dyn->lsx.x87reg[i]);
+                        FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87)); // save the value
+                        break;
+                    case LSX_CACHE_ST_I64:
+                        FFINT_D_L(SCRATCH0, dyn->lsx.x87reg[i]);
+                        FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87)); // save the value
+                        break;
+                }
+                if (!next) {
+                    fpu_free_reg(dyn, dyn->lsx.x87reg[i]);
+                    dyn->lsx.x87reg[i] = -1;
+                    dyn->lsx.x87cache[i] = -1;
+                    // dyn->lsx.stack_pop+=1; //no pop, but the purge because of barrier will have the n.barrier flags set
+                }
+            }
+    }
+    if (!next) {
+        dyn->lsx.stack_next = 0;
+        dyn->lsx.tags = 0;
+#if STEP < 2
+        // refresh the cached values, in case it's a purge outside an instruction
+        dyn->insts[ninst].lsx.barrier = 1;
+        dyn->lsx.pushed = 0;
+        dyn->lsx.poped = 0;
+
+#endif
+    }
+    MESSAGE(LOG_DUMP, "\t---Purge x87 Cache and Synch Stackcount\n");
+}
+
+void x87_reflectcount(dynarec_la64_t* dyn, int ninst, int s1, int s2)
+{
+    // Synch top and stack count
+    int a = dyn->lsx.x87stack;
+    if (a) {
+        MESSAGE(LOG_DUMP, "\tSync x87 Count of %d-----\n", a);
+        // Add x87stack to emu fpu_stack
+        LD_W(s2, xEmu, offsetof(x64emu_t, fpu_stack));
+        ADDI_D(s2, s2, a);
+        ST_W(s2, xEmu, offsetof(x64emu_t, fpu_stack));
+        // Subtract x87stack from top, masked with 7
+        LD_W(s2, xEmu, offsetof(x64emu_t, top));
+        ADDI_D(s2, s2, -a);
+        ANDI(s2, s2, 7);
+        ST_W(s2, xEmu, offsetof(x64emu_t, top));
+        // update tags
+        LD_H(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+        if (a > 0) {
+            SLLI_D(s1, s1, a * 2);
+        } else {
+            MOV32w(s2, 0xffff0000);
+            OR(s1, s1, s2);
+            SRLI_D(s1, s1, -a * 2);
+        }
+        ST_H(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+    }
+}
+
+static void x87_reflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3)
+{
+    // Sync top and stack count
+    int a = dyn->lsx.x87stack;
+    if (a) {
+        // Add x87stack to emu fpu_stack
+        LD_W(s2, xEmu, offsetof(x64emu_t, fpu_stack));
+        ADDI_D(s2, s2, a);
+        ST_W(s2, xEmu, offsetof(x64emu_t, fpu_stack));
+        // Subtract x87stack from top, masked with 7
+        LD_W(s2, xEmu, offsetof(x64emu_t, top));
+        ADDI_D(s2, s2, -a);
+        ANDI(s2, s2, 7);
+        ST_W(s2, xEmu, offsetof(x64emu_t, top));
+        // update tags
+        LD_H(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+        if (a > 0) {
+            SLLI_D(s1, s1, a * 2);
+        } else {
+            MOV32w(s3, 0xffff0000);
+            OR(s1, s1, s3);
+            SRLI_D(s1, s1, -a * 2);
+        }
+        ST_H(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+    }
+    int ret = 0;
+    for (int i = 0; (i < 8) && (!ret); ++i)
+        if (dyn->lsx.x87cache[i] != -1)
+            ret = 1;
+    if (!ret) // nothing to do
+        return;
+    // prepare offset to fpu => s1
+    // Get top
+    if (!a) {
+        LD_W(s2, xEmu, offsetof(x64emu_t, top));
+    }
+    // loop all cache entries
+    for (int i = 0; i < 8; ++i)
+        if (dyn->lsx.x87cache[i] != -1) {
+            ADDI_D(s3, s2, dyn->lsx.x87cache[i]);
+            ANDI(s3, s3, 7); // (emu->top + i)&7
+            SLLI_D(s1, s3, 3);
+            ADD_D(s1, xEmu, s1);
+            if (lsxcache_get_current_st_f(dyn, dyn->lsx.x87cache[i]) >= 0) {
+                FCVT_D_S(SCRATCH0, dyn->lsx.x87reg[i]);
+                FST_D(SCRATCH0, s1, offsetof(x64emu_t, x87));
+            } else
+                FST_D(dyn->lsx.x87reg[i], s1, offsetof(x64emu_t, x87));
+        }
+}
+
+
+void x87_unreflectcount(dynarec_la64_t* dyn, int ninst, int s1, int s2)
+{
+    // revert top and stack count
+    int a = dyn->lsx.x87stack;
+    if (a) {
+        // Subtract x87stack from emu fpu_stack
+        LD_W(s2, xEmu, offsetof(x64emu_t, fpu_stack));
+        ADDI_D(s2, s2, -a);
+        ST_W(s2, xEmu, offsetof(x64emu_t, fpu_stack));
+        // Add x87stack to top, masked with 7
+        LD_W(s2, xEmu, offsetof(x64emu_t, top));
+        ADDI_D(s2, s2, a);
+        ANDI(s2, s2, 7);
+        ST_W(s2, xEmu, offsetof(x64emu_t, top));
+        // update tags
+        LD_H(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+        if (a > 0) {
+            MOV32w(s2, 0xffff0000);
+            OR(s1, s1, s2);
+            SRLI_D(s1, s1, a * 2);
+        } else {
+            SLLI_D(s1, s1, -a * 2);
+        }
+        ST_H(s1, xEmu, offsetof(x64emu_t, fpu_tags));
+    }
+}
+
+int x87_get_current_cache(dynarec_la64_t* dyn, int ninst, int st, int t)
+{
+    // search in cache first
+    for (int i = 0; i < 8; ++i) {
+        if (dyn->lsx.x87cache[i] == st) {
+#if STEP == 1
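+            // any mismatch between the requested type and the cached one degrades the slot to double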
+            if (t == LSX_CACHE_ST_D && (dyn->lsx.lsxcache[dyn->lsx.x87reg[i]].t == LSX_CACHE_ST_F || dyn->lsx.lsxcache[dyn->lsx.x87reg[i]].t == LSX_CACHE_ST_I64))
+                lsxcache_promote_double(dyn, ninst, st);
+            else if (t == LSX_CACHE_ST_I64 && (dyn->lsx.lsxcache[dyn->lsx.x87reg[i]].t == LSX_CACHE_ST_F))
+                lsxcache_promote_double(dyn, ninst, st);
+            else if (t == LSX_CACHE_ST_F && (dyn->lsx.lsxcache[dyn->lsx.x87reg[i]].t == LSX_CACHE_ST_I64))
+                lsxcache_promote_double(dyn, ninst, st);
+#endif
+            return i;
+        }
+        assert(dyn->lsx.x87cache[i] < 8);
+    }
+    return -1;
+}
+
+int x87_get_cache(dynarec_la64_t* dyn, int ninst, int populate, int s1, int s2, int st, int t)
+{
+    if (dyn->lsx.mmxcount)
+        mmx_purgecache(dyn, ninst, 0, s1);
+    int ret = x87_get_current_cache(dyn, ninst, st, t);
+    if (ret != -1)
+        return ret;
+    MESSAGE(LOG_DUMP, "\tCreate %sx87 Cache for ST%d\n", populate ? "and populate " : "", st);
+    // get a free spot
+    for (int i = 0; (i < 8) && (ret == -1); ++i)
+        if (dyn->lsx.x87cache[i] == -1)
+            ret = i;
+    // found, setup and grab the value
+    dyn->lsx.x87cache[ret] = st;
+    dyn->lsx.x87reg[ret] = fpu_get_reg_x87(dyn, LSX_CACHE_ST_D, st);
+    if (populate) {
+        LD_W(s2, xEmu, offsetof(x64emu_t, top));
+        int a = st - dyn->lsx.x87stack;
+        if (a) {
+            ADDI_D(s2, s2, a);
+            ANDI(s2, s2, 7);
+        }
+        SLLI_D(s2, s2, 3);
+        ADD_D(s1, xEmu, s2);
+        FLD_D(dyn->lsx.x87reg[ret], s1, offsetof(x64emu_t, x87));
+    }
+    MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st);
+
+    return ret;
+}
+int x87_get_lsxcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st)
+{
+    for (int ii = 0; ii < 24; ++ii)
+        if ((dyn->lsx.lsxcache[ii].t == LSX_CACHE_ST_F
+                || dyn->lsx.lsxcache[ii].t == LSX_CACHE_ST_D
+                || dyn->lsx.lsxcache[ii].t == LSX_CACHE_ST_I64)
+            && dyn->lsx.lsxcache[ii].n == st)
+            return ii;
+    assert(0);
+    return -1;
+}
+int x87_get_st(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a, int t)
+{
+    return dyn->lsx.x87reg[x87_get_cache(dyn, ninst, 1, s1, s2, a, t)];
+}
+int x87_get_st_empty(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a, int t)
+{
+    return dyn->lsx.x87reg[x87_get_cache(dyn, ninst, 0, s1, s2, a, t)];
 }
 
 // Restore round flag
@@ -1259,7 +1875,7 @@ void fpu_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, in
 
 void fpu_reflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3)
 {
-    // TODO: x87_reflectcache(dyn, ninst, s1, s2, s3);
+    x87_reflectcache(dyn, ninst, s1, s2, s3);
     mmx_reflectcache(dyn, ninst, s1);
     sse_reflectcache(dyn, ninst, s1);
     avx_reflectcache(dyn, ninst, s1);
@@ -1267,7 +1883,8 @@ void fpu_reflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3)
 
 void fpu_unreflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3)
 {
-    // TODO
+    // undo the top and stack-count changes that were reflected into the emu but must not persist yet
+    x87_unreflectcount(dyn, ninst, s1, s2);
 }
 
 void emit_pf(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4)
@@ -1294,13 +1911,43 @@ void fpu_reset_cache(dynarec_la64_t* dyn, int ninst, int reset_n)
 #if STEP > 1
     // for STEP 2 & 3, just need to refresh with current, and undo the changes (push & swap)
     dyn->lsx = dyn->insts[ninst].lsx;
-    lsxcacheUnwind(&dyn->lsx);
-#ifdef HAVE_TRACE
-// TODO: trace
-#endif // HAVE_TRACE
 #else
     dyn->lsx = dyn->insts[reset_n].lsx;
 #endif
+    lsxcacheUnwind(&dyn->lsx);
+#if STEP == 0
+    if (dyn->need_dump) dynarec_log(LOG_NONE, "New x87stack=%d\n", dyn->lsx.x87stack);
+#endif
+#if defined(HAVE_TRACE) && (STEP > 2)
+    if (dyn->need_dump)
+        if (memcmp(&dyn->lsx, &dyn->insts[reset_n].lsx, sizeof(lsx_cache_t))) {
+            MESSAGE(LOG_DEBUG, "Warning, difference in lsxcache: reset=");
+            for (int i = 0; i < 24; ++i)
+                if (dyn->insts[reset_n].lsx.lsxcache[i].v)
+                    MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[reset_n].lsx.lsxcache[i].t, dyn->insts[reset_n].lsx.lsxcache[i].n));
+            if (dyn->insts[reset_n].lsx.combined1 || dyn->insts[reset_n].lsx.combined2)
+                MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->insts[reset_n].lsx.swapped ? "SWP" : "CMB", dyn->insts[reset_n].lsx.combined1, dyn->insts[reset_n].lsx.combined2);
+            if (dyn->insts[reset_n].lsx.stack_push || dyn->insts[reset_n].lsx.stack_pop)
+                MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[reset_n].lsx.stack_push, -dyn->insts[reset_n].lsx.stack_pop);
+            MESSAGE(LOG_DEBUG, " ==> ");
+            for (int i = 0; i < 24; ++i)
+                if (dyn->insts[ninst].lsx.lsxcache[i].v)
+                    MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[ninst].lsx.lsxcache[i].t, dyn->insts[ninst].lsx.lsxcache[i].n));
+            if (dyn->insts[ninst].lsx.combined1 || dyn->insts[ninst].lsx.combined2)
+                MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->insts[ninst].lsx.swapped ? "SWP" : "CMB", dyn->insts[ninst].lsx.combined1, dyn->insts[ninst].lsx.combined2);
+            if (dyn->insts[ninst].lsx.stack_push || dyn->insts[ninst].lsx.stack_pop)
+                MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[ninst].lsx.stack_push, -dyn->insts[ninst].lsx.stack_pop);
+            MESSAGE(LOG_DEBUG, " -> ");
+            for (int i = 0; i < 24; ++i)
+                if (dyn->lsx.lsxcache[i].v)
+                    MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->lsx.lsxcache[i].t, dyn->lsx.lsxcache[i].n));
+            if (dyn->lsx.combined1 || dyn->lsx.combined2)
+                MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->lsx.swapped ? "SWP" : "CMB", dyn->lsx.combined1, dyn->lsx.combined2);
+            if (dyn->lsx.stack_push || dyn->lsx.stack_pop)
+                MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->lsx.stack_push, -dyn->lsx.stack_pop);
+            MESSAGE(LOG_DEBUG, "\n");
+        }
+#endif // HAVE_TRACE
 }
 
 // propagate ST stack state, especially stack pops that are deferred
@@ -1491,7 +2138,26 @@ static void loadCache(dynarec_la64_t* dyn, int ninst, int stack_cnt, int s1, int
         case LSX_CACHE_ST_F:
         case LSX_CACHE_ST_I64:
             MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
-            // TODO: x87
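+            // s3 caches the current top-relative index across entries; 0xffff means s3 is not loaded yet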
+            if ((*s3_top) == 0xffff) {
+                LD_W(s3, xEmu, offsetof(x64emu_t, top));
+                *s3_top = 0;
+            }
+            int a = n - (*s3_top) - stack_cnt;
+            if (a) {
+                ADDI_D(s3, s3, a);
+                ANDI(s3, s3, 7); // (emu->top + i)&7
+            }
+            *s3_top += a;
+            *s2_val = 0;
+            SLLI_D(s2, s3, 3);
+            ADD_D(s2, xEmu, s2);
+            FLD_D(i, s2, offsetof(x64emu_t, x87));
+            if (t == LSX_CACHE_ST_F) {
+                FCVT_S_D(i, i);
+            }
+            if (t == LSX_CACHE_ST_I64) {
+                FTINTRZ_L_D(i, i);
+            }
             break;
         case LSX_CACHE_NONE:
         case LSX_CACHE_SCR:
@@ -1528,7 +2194,26 @@ static void unloadCache(dynarec_la64_t* dyn, int ninst, int stack_cnt, int s1, i
         case LSX_CACHE_ST_F:
         case LSX_CACHE_ST_I64:
             MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
-            // TODO: x87
+            if ((*s3_top) == 0xffff) {
+                LD_W(s3, xEmu, offsetof(x64emu_t, top));
+                *s3_top = 0;
+            }
+            int a = n - (*s3_top) - stack_cnt;
+            if (a) {
+                ADDI_D(s3, s3, a);
+                ANDI(s3, s3, 7);
+            }
+            *s3_top += a;
+            SLLI_D(s2, s3, 3);
+            ADD_D(s2, xEmu, s2);
+            *s2_val = 0;
+            if (t == LSX_CACHE_ST_F) {
+                FCVT_D_S(i, i);
+            }
+            if (t == LSX_CACHE_ST_I64) {
+                FFINT_D_L(i, i);
+            }
+            FST_D(i, s2, offsetof(x64emu_t, x87));
             break;
         case LSX_CACHE_NONE:
         case LSX_CACHE_SCR:
@@ -1576,13 +2261,6 @@ static void fpuCacheTransform(dynarec_la64_t* dyn, int ninst, int s1, int s2, in
     }
     int stack_cnt = dyn->lsx.stack_next;
     int s3_top = 0xffff;
-    if (stack_cnt != cache_i2.stack) {
-        MESSAGE(LOG_DUMP, "\t    - adjust stack count %d -> %d -\n", stack_cnt, cache_i2.stack);
-        int a = stack_cnt - cache_i2.stack;
-        // TODO: x87
-        s3_top = 0;
-        stack_cnt = cache_i2.stack;
-    }
     lsxcache_t cache = dyn->lsx;
     int s1_val = 0;
     int s2_val = 0;
@@ -1666,6 +2344,31 @@ static void fpuCacheTransform(dynarec_la64_t* dyn, int ninst, int s1, int s2, in
             }
         }
     }
+    if (stack_cnt != cache_i2.stack) {
+        MESSAGE(LOG_DUMP, "\t    - adjust stack count %d -> %d -\n", stack_cnt, cache_i2.stack);
+        int a = stack_cnt - cache_i2.stack;
+        // Add the stack delta to emu fpu_stack
+        LD_WU(s3, xEmu, offsetof(x64emu_t, fpu_stack));
+        ADDI_D(s3, s3, a);
+        ST_W(s3, xEmu, offsetof(x64emu_t, fpu_stack));
+        // Subtract the delta from top, masked with 7
+        LD_WU(s3, xEmu, offsetof(x64emu_t, top));
+        ADDI_D(s3, s3, -a);
+        ANDI(s3, s3, 7);
+        ST_W(s3, xEmu, offsetof(x64emu_t, top));
+        // update tags
+        LD_H(s2, xEmu, offsetof(x64emu_t, fpu_tags));
+        if (a > 0) {
+            SLLI_D(s2, s2, a * 2);
+        } else {
+            MOV32w(s3, 0xffff0000);
+            OR(s2, s2, s3);
+            SRLI_D(s2, s2, -a * 2);
+        }
+        ST_H(s2, xEmu, offsetof(x64emu_t, fpu_tags));
+        s3_top = 0;
+        stack_cnt = cache_i2.stack;
+    }
     MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
 }
 
diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h
index 9cb25114..440cfba5 100644
--- a/src/dynarec/la64/dynarec_la64_helper.h
+++ b/src/dynarec/la64/dynarec_la64_helper.h
@@ -794,6 +794,10 @@
 #define BLTU_MARK(reg1, reg2) Bxx_gen(LTU, MARK, reg1, reg2)
 // Branch to MARK if reg1>=reg2 (use j64)
 #define BGE_MARK(reg1, reg2) Bxx_gen(GE, MARK, reg1, reg2)
+// Branch to MARK2 if reg1>=reg2 (use j64)
+#define BGE_MARK2(reg1, reg2) Bxx_gen(GE, MARK2, reg1, reg2)
+// Branch to MARK3 if reg1>=reg2 (use j64)
+#define BGE_MARK3(reg1, reg2) Bxx_gen(GE, MARK3, reg1, reg2)
 
 // Branch to MARK instruction unconditionally (use j64)
 #define B_MARK_nocond Bxx_gen(__, MARK, 0, 0)
@@ -849,6 +853,23 @@
 #define IFX2X(A, B) if ((dyn->insts[ninst].x64.gen_flags == (A) || dyn->insts[ninst].x64.gen_flags == (B) || dyn->insts[ninst].x64.gen_flags == ((A) | (B))))
 #define IFXN(A, B)  if ((dyn->insts[ninst].x64.gen_flags & (A) && !(dyn->insts[ninst].x64.gen_flags & (B))))
 
+#ifndef NATIVE_RESTORE_X87PC
+#define NATIVE_RESTORE_X87PC()                     \
+    if (dyn->need_x87check) {                      \
+        LD_D(x87pc, xEmu, offsetof(x64emu_t, cw)); \
+        SRLI_D(x87pc, x87pc, 8);                   \
+        ANDI(x87pc, x87pc, 0b11);                  \
+    }
+#endif
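+// When the x87 precision control (cw bits 9:8, cached in x87pc) is 0b00 (single),
+// a double result must be rounded through float to mimic the 24-bit mantissa.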
+#ifndef X87_CHECK_PRECISION
+#define X87_CHECK_PRECISION(A)               \
+    if (!ST_IS_F(0) && dyn->need_x87check) { \
+        BNEZ(x87pc, 4 + 8);                  \
+        FCVT_S_D(A, A);                      \
+        FCVT_D_S(A, A);                      \
+    }
+#endif
+
 #define STORE_REG(A) ST_D(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
 #define LOAD_REG(A)  LD_D(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
 
@@ -951,6 +972,37 @@
         }                                                             \
     }
 
+
+#if STEP == 0
+#define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t) var = x87_do_push(dyn, ninst, scratch, t)
+#define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch)   x87_do_push_empty(dyn, ninst, scratch)
+#define X87_POP_OR_FAIL(dyn, ninst, scratch)          x87_do_pop(dyn, ninst, scratch)
+#else
+#define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t)                                                                                                    \
+    if ((dyn->lsx.x87stack == 8) || (dyn->lsx.pushed == 8)) {                                                                                            \
+        if (dyn->need_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->lsx.x87stack, dyn->lsx.pushed, ninst); \
+        dyn->abort = 1;                                                                                                                                  \
+        return addr;                                                                                                                                     \
+    }                                                                                                                                                    \
+    var = x87_do_push(dyn, ninst, scratch, t);
+
+#define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch)                                                                                                      \
+    if ((dyn->lsx.x87stack == 8) || (dyn->lsx.pushed == 8)) {                                                                                            \
+        if (dyn->need_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->lsx.x87stack, dyn->lsx.pushed, ninst); \
+        dyn->abort = 1;                                                                                                                                  \
+        return addr;                                                                                                                                     \
+    }                                                                                                                                                    \
+    x87_do_push_empty(dyn, ninst, scratch);
+
+#define X87_POP_OR_FAIL(dyn, ninst, scratch)                                                                                                           \
+    if ((dyn->lsx.x87stack == -8) || (dyn->lsx.poped == 8)) {                                                                                          \
+        if (dyn->need_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Pop, stack=%d/%d on inst %d\n", dyn->lsx.x87stack, dyn->lsx.poped, ninst); \
+        dyn->abort = 1;                                                                                                                                \
+        return addr;                                                                                                                                   \
+    }                                                                                                                                                  \
+    x87_do_pop(dyn, ninst, scratch);
+#endif
+
 #ifndef MAYSETFLAGS
 #define MAYSETFLAGS() \
     do {              \
@@ -1053,6 +1105,9 @@
 #ifndef TABLE64
 #define TABLE64(A, V)
 #endif
+#ifndef FTABLE64
+#define FTABLE64(A, V)
+#endif
 #ifndef TABLE64C
 #define TABLE64C(A, V)
 #endif
@@ -1148,6 +1203,16 @@
 #define dynarec64_AVX_F3_0F   STEPNAME(dynarec64_AVX_F3_0F)
 #define dynarec64_AVX_F3_0F38 STEPNAME(dynarec64_AVX_F3_0F38)
 
+#define dynarec64_D8 STEPNAME(dynarec64_D8)
+#define dynarec64_D9 STEPNAME(dynarec64_D9)
+#define dynarec64_DA STEPNAME(dynarec64_DA)
+#define dynarec64_DB STEPNAME(dynarec64_DB)
+#define dynarec64_DC STEPNAME(dynarec64_DC)
+#define dynarec64_DD STEPNAME(dynarec64_DD)
+#define dynarec64_DE STEPNAME(dynarec64_DE)
+#define dynarec64_DF STEPNAME(dynarec64_DF)
+#define dynarec64_F0 STEPNAME(dynarec64_F0)
+
 #define geted               STEPNAME(geted)
 #define geted32             STEPNAME(geted32)
 #define jump_to_epilog      STEPNAME(jump_to_epilog)
@@ -1234,11 +1299,30 @@
 
 #define emit_pf STEPNAME(emit_pf)
 
-#define x87_restoreround  STEPNAME(x87_restoreround)
+#define x87_do_push           STEPNAME(x87_do_push)
+#define x87_do_push_empty     STEPNAME(x87_do_push_empty)
+#define x87_do_pop            STEPNAME(x87_do_pop)
+#define x87_get_current_cache STEPNAME(x87_get_current_cache)
+#define x87_get_cache         STEPNAME(x87_get_cache)
+#define x87_get_lsxcache      STEPNAME(x87_get_lsxcache)
+#define x87_get_st            STEPNAME(x87_get_st)
+#define x87_get_st_empty      STEPNAME(x87_get_st_empty)
+#define x87_free              STEPNAME(x87_free)
+#define x87_refresh           STEPNAME(x87_refresh)
+#define x87_forget            STEPNAME(x87_forget)
+#define x87_reget_st          STEPNAME(x87_reget_st)
+#define x87_stackcount        STEPNAME(x87_stackcount)
+#define x87_unstackcount      STEPNAME(x87_unstackcount)
+#define x87_swapreg           STEPNAME(x87_swapreg)
+#define x87_setround          STEPNAME(x87_setround)
+#define x87_restoreround      STEPNAME(x87_restoreround)
+#define x87_reflectcount      STEPNAME(x87_reflectcount)
+#define x87_unreflectcount    STEPNAME(x87_unreflectcount)
+#define x87_purgecache        STEPNAME(x87_purgecache)
+
 #define sse_setround      STEPNAME(sse_setround)
 #define mmx_get_reg       STEPNAME(mmx_get_reg)
 #define mmx_get_reg_empty STEPNAME(mmx_get_reg_empty)
-#define x87_forget        STEPNAME(x87_forget)
 #define sse_purge07cache  STEPNAME(sse_purge07cache)
 #define sse_get_reg       STEPNAME(sse_get_reg)
 #define sse_get_reg_empty STEPNAME(sse_get_reg_empty)
@@ -1259,7 +1343,6 @@
 #define fpu_propagate_stack STEPNAME(fpu_propagate_stack)
 #define fpu_purgecache      STEPNAME(fpu_purgecache)
 #define mmx_purgecache      STEPNAME(mmx_purgecache)
-#define x87_purgecache      STEPNAME(x87_purgecache)
 #define fpu_reflectcache    STEPNAME(fpu_reflectcache)
 #define fpu_unreflectcache  STEPNAME(fpu_unreflectcache)
 
@@ -1359,22 +1442,58 @@ void emit_rol32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c,
 void emit_pf(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4);
 
 // common coproc helpers
+
+// x87 helper
+// synch the local stack counter with the emu one; return the old value (restored with x87_unstackcount)
+int x87_stackcount(dynarec_la64_t* dyn, int ninst, int scratch);
+// restore local stack counter
+void x87_unstackcount(dynarec_la64_t* dyn, int ninst, int scratch, int count);
+// fpu push. Return the Dd value to be used
+int x87_do_push(dynarec_la64_t* dyn, int ninst, int s1, int t);
+// fpu push. Do not allocate a cache register. Needs a scratch register to do x87stack synch (or 0 to not do it)
+void x87_do_push_empty(dynarec_la64_t* dyn, int ninst, int s1);
+// fpu pop. All previous returned Dd should be considered invalid
+void x87_do_pop(dynarec_la64_t* dyn, int ninst, int s1);
+// get cache index for a x87 reg, return -1 if cache doesn't exist
+int x87_get_current_cache(dynarec_la64_t* dyn, int ninst, int st, int t);
+// get cache index for a x87 reg, create the entry if needed
+int x87_get_cache(dynarec_la64_t* dyn, int ninst, int populate, int s1, int s2, int a, int t);
+// get lsxcache index for a x87 reg
+int x87_get_lsxcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a);
+// get lsx register for a x87 reg, create the entry if needed
+int x87_get_st(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a, int t);
+// get lsx register for a x87 reg, create the entry if needed. Do not fetch the STx if not already in cache
+int x87_get_st_empty(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a, int t);
+// Free st, using the FFREE opcode (so it's freed but stack is not moved)
+void x87_free(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int st);
+// refresh a value from the cache ->emu (nothing done if value is not cached)
+void x87_refresh(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st);
+// refresh a value from the cache ->emu and then forget the cache (nothing done if value is not cached)
+void x87_forget(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st);
+// refresh the cache value from emu
+void x87_reget_st(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st);
+// swap 2 x87 regs
+void x87_swapreg(dynarec_la64_t* dyn, int ninst, int s1, int s2, int a, int b);
+// Set rounding according to cw flags, return reg to restore flags
+int x87_setround(dynarec_la64_t* dyn, int ninst, int s1, int s2);
+// Restore round flag
+void x87_restoreround(dynarec_la64_t* dyn, int ninst, int s1);
+// Reflect/unreflect the local x87 stack count (fpu_stack, top, tags) into the emu struct
+void x87_reflectcount(dynarec_la64_t* dyn, int ninst, int s1, int s2);
+void x87_unreflectcount(dynarec_la64_t* dyn, int ninst, int s1, int s2);
+void x87_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3);
+
 // reset the cache with n
 void fpu_reset_cache(dynarec_la64_t* dyn, int ninst, int reset_n);
 void fpu_propagate_stack(dynarec_la64_t* dyn, int ninst);
 void fpu_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3);
 void mmx_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1);
-void x87_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3);
 void fpu_reflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3);
 void fpu_unreflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3);
 void fpu_pushcache(dynarec_la64_t* dyn, int ninst, int s1, int not07);
 void fpu_popcache(dynarec_la64_t* dyn, int ninst, int s1, int not07);
-// Restore round flag
-void x87_restoreround(dynarec_la64_t* dyn, int ninst, int s1);
 // Set rounding according to mxcsr flags, return reg to restore flags
 int sse_setround(dynarec_la64_t* dyn, int ninst, int s1, int s2);
-// refresh a value from the cache ->emu and then forget the cache (nothing done if value is not cached)
-void x87_forget(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st);
 
 // SSE/SSE2 helpers
 // purge the XMM0..XMM7 cache (before function call)
@@ -1417,6 +1536,34 @@ void la64_move32(dynarec_la64_t* dyn, int ninst, int reg, int32_t val, int zerou
 #define CHECK_CACHE() (cacheupd = CacheNeedsTransform(dyn, ninst))
 #endif
 
+#define lsxcache_st_coherency STEPNAME(lsxcache_st_coherency)
+int lsxcache_st_coherency(dynarec_la64_t* dyn, int ninst, int a, int b);
+
+#if STEP == 0
+#define ST_IS_F(A)        0
+#define ST_IS_I64(A)      0
+#define X87_COMBINE(A, B) LSX_CACHE_ST_D
+#define X87_ST0           LSX_CACHE_ST_D
+#define X87_ST(A)         LSX_CACHE_ST_D
+#elif STEP == 1
+#define ST_IS_F(A)        (lsxcache_get_current_st(dyn, ninst, A) == LSX_CACHE_ST_F)
+#define ST_IS_I64(A)      (lsxcache_get_current_st(dyn, ninst, A) == LSX_CACHE_ST_I64)
+#define X87_COMBINE(A, B) lsxcache_combine_st(dyn, ninst, A, B)
+#define X87_ST0           lsxcache_no_i64(dyn, ninst, 0, lsxcache_get_current_st(dyn, ninst, 0))
+#define X87_ST(A)         lsxcache_no_i64(dyn, ninst, A, lsxcache_get_current_st(dyn, ninst, A))
+#else
+#define ST_IS_F(A)   (lsxcache_get_st(dyn, ninst, A) == LSX_CACHE_ST_F)
+#define ST_IS_I64(A) (lsxcache_get_st(dyn, ninst, A) == LSX_CACHE_ST_I64)
+#if STEP == 3
+#define X87_COMBINE(A, B) lsxcache_st_coherency(dyn, ninst, A, B)
+#else
+#define X87_COMBINE(A, B) lsxcache_get_st(dyn, ninst, A)
+#endif
+#define X87_ST0   lsxcache_get_st(dyn, ninst, 0)
+#define X87_ST(A) lsxcache_get_st(dyn, ninst, A)
+#endif
+
+
 uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_F30F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
@@ -1441,6 +1588,14 @@ uintptr_t dynarec64_AVX_F2_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
 uintptr_t dynarec64_AVX_F2_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 uintptr_t dynarec64_AVX_F3_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 uintptr_t dynarec64_AVX_F3_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
+uintptr_t dynarec64_D8(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_D9(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_DA(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_DB(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_DC(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_DD(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_DE(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_DF(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 
 
 #if STEP < 3
@@ -1570,6 +1725,61 @@ uintptr_t dynarec64_AVX_F3_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             opcode = F8;                           \
         }
 
+
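+// x87 status-word condition bits: C0 = bit 8, C1 = bit 9, C2 = bit 10, C3 = bit 14.
+// FCOM sets C0 for less-than, C3 for equal, and C3|C2|C0 for unordered/NaN.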
+#define FCOM(w, v1, v2, s1, s2, s3)                            \
+    LD_HU(s3, xEmu, offsetof(x64emu_t, sw));                   \
+    MOV32w(s1, 0b1011100011111111); /* mask off c0,c1,c2,c3 */ \
+    AND(s3, s3, s1);                                           \
+    FCMP_##w(fcc0, v1, v2, cOR);                               \
+    BCEQZ(fcc0, 28); /* undefined/NaN */                       \
+    FCMP_##w(fcc1, v1, v2, cEQ);                               \
+    BCNEZ(fcc1, 32);             /* equal */                   \
+    FCMP_##w(fcc2, v1, v2, cLT); /* x2 = (v1<v2)?1:0 */        \
+    MOVCF2GR(s2, fcc2);                                        \
+    SLLI_D(s1, s2, 8);                                         \
+    B(20); /* end */                                           \
+    /* undefined/NaN */                                        \
+    LU12I_W(s1, 4);                                            \
+    ADDI_D(s1, s1, 0b010100000000);                            \
+    B(8); /* end */                                            \
+    /* equal */                                                \
+    LU12I_W(s1, 4);                                            \
+    /* end */                                                  \
+    OR(s3, s3, s1);                                            \
+    ST_H(s3, xEmu, offsetof(x64emu_t, sw));
+
+#define FCOMS(v1, v2, s1, s2, s3) FCOM(S, v1, v2, s1, s2, s3)
+#define FCOMD(v1, v2, s1, s2, s3) FCOM(D, v1, v2, s1, s2, s3)
+
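+// FCOMI reports through EFLAGS instead: CF (bit 0) = less, ZF (bit 6) = equal,
+// and unordered/NaN sets ZF|PF|CF (0b01000101).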
+#define FCOMI(w, v1, v2, s1, s2)                               \
+    IFX (X_OF | X_AF | X_SF | X_PEND) {                        \
+        MOV64x(s2, ((1 << F_OF) | (1 << F_AF) | (1 << F_SF))); \
+        ANDN(xFlags, xFlags, s2);                              \
+    }                                                          \
+    IFX (X_CF | X_PF | X_ZF | X_PEND) {                        \
+        MOV32w(s2, 0b01000101);                                \
+        ANDN(xFlags, xFlags, s2);                              \
+        FCMP_##w(fcc0, v1, v2, cOR);                           \
+        BCEQZ(fcc0, 24); /* undefined/NaN */                   \
+        FCMP_##w(fcc1, v1, v2, cEQ);                           \
+        BCNEZ(fcc1, 24);             /* equal */               \
+        FCMP_##w(fcc2, v1, v2, cLT); /* s1 = (v1<v2)?1:0 */    \
+        MOVCF2GR(s1, fcc2);                                    \
+        B(4 * 4); /* end */                                    \
+        /* undefined/NaN */                                    \
+        MV(s1, s2);                                            \
+        B(2 * 4); /* end */                                    \
+        /* equal */                                            \
+        ADDI_D(s1, xZR, 0b01000000);                           \
+        /* end */                                              \
+        OR(xFlags, xFlags, s1);                                \
+    }                                                          \
+    SPILL_EFLAGS();                                            \
+    SET_DFNONE()
+
+#define FCOMIS(v1, v2, s1, s2) FCOMI(S, v1, v2, s1, s2)
+#define FCOMID(v1, v2, s1, s2) FCOMI(D, v1, v2, s1, s2)
+
 // Restore xFlags from LBT.eflags
 #define RESTORE_EFLAGS(s)             \
     do {                              \
diff --git a/src/dynarec/la64/dynarec_la64_pass0.h b/src/dynarec/la64/dynarec_la64_pass0.h
index c235bbcd..31cbadcf 100644
--- a/src/dynarec/la64/dynarec_la64_pass0.h
+++ b/src/dynarec/la64/dynarec_la64_pass0.h
@@ -68,3 +68,7 @@
         PrintFunctionAddr(ip, " => ");                                                                                    \
         dynarec_log_prefix(0, LOG_NONE, "\n");                                                                            \
     }
+
+
+#define NATIVE_RESTORE_X87PC()
+#define X87_CHECK_PRECISION(A)
\ No newline at end of file
diff --git a/src/dynarec/la64/dynarec_la64_pass1.h b/src/dynarec/la64/dynarec_la64_pass1.h
index 20366bd0..b0dde230 100644
--- a/src/dynarec/la64/dynarec_la64_pass1.h
+++ b/src/dynarec/la64/dynarec_la64_pass1.h
@@ -13,3 +13,10 @@
     dyn->insts[ninst].f_exit = dyn->f
 
 #define INST_NAME(name)
+
+#define NATIVE_RESTORE_X87PC()
+#define X87_CHECK_PRECISION(A)      \
+    do {                            \
+        if (dyn->need_x87check)     \
+            dyn->need_x87check = 2; \
+    } while (0)
diff --git a/src/dynarec/la64/dynarec_la64_pass2.h b/src/dynarec/la64/dynarec_la64_pass2.h
index eb722e42..26ce8fab 100644
--- a/src/dynarec/la64/dynarec_la64_pass2.h
+++ b/src/dynarec/la64/dynarec_la64_pass2.h
@@ -33,6 +33,13 @@
         EMIT(0);              \
         EMIT(0);              \
     } while (0)
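+// FTABLE64 mirrors TABLE64 for double constants: pass 2 only reserves the two
+// instruction slots; the PC-relative FLD_D is emitted in pass 3.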
+#define FTABLE64(A, V)               \
+    do {                             \
+        mmx87_regs_t v = { .d = V }; \
+        Table64(dyn, v.q, 2);        \
+        EMIT(0);                     \
+        EMIT(0);                     \
+    } while (0)
 #define TABLE64C(A, V)                                       \
     do {                                                     \
         if (dyn->need_reloc && !isTable64(dyn, getConst(V))) \
diff --git a/src/dynarec/la64/dynarec_la64_pass3.h b/src/dynarec/la64/dynarec_la64_pass3.h
index 8188e761..f0eb1419 100644
--- a/src/dynarec/la64/dynarec_la64_pass3.h
+++ b/src/dynarec/la64/dynarec_la64_pass3.h
@@ -40,6 +40,14 @@
         PCADDU12I(A, SPLIT20(val64offset));           \
         LD_D(A, A, SPLIT12(val64offset));             \
     } while (0)
+#define FTABLE64(A, V)                              \
+    do {                                            \
+        mmx87_regs_t v = { .d = V };                \
+        int val64offset = Table64(dyn, v.q, 3);     \
+        MESSAGE(LOG_DUMP, "  FTable64: %g\n", v.d); \
+        PCADDU12I(x1, SPLIT20(val64offset));        \
+        FLD_D(A, x1, SPLIT12(val64offset));         \
+    } while (0)
 #define TABLE64C(A, V)                                       \
     do {                                                     \
         if (dyn->need_reloc && !isTable64(dyn, getConst(V))) \
diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h
index 120fc14e..65beba34 100644
--- a/src/dynarec/la64/dynarec_la64_private.h
+++ b/src/dynarec/la64/dynarec_la64_private.h
@@ -114,6 +114,7 @@ typedef struct instruction_la64_s {
     uint8_t             nat_flags_needsign:1;
     uint8_t             nat_flags_op1;
     uint8_t             nat_flags_op2;
+    uint8_t             x87precision:1; // this opcode can handle x87pc
     flagcache_t         f_exit;     // flags status at end of instruction
     lsxcache_t          lsx;        // lsxcache at end of instruction (but before poping)
     flagcache_t         f_entry;    // flags status before the instruction begin
diff --git a/src/dynarec/la64/la64_mapping.h b/src/dynarec/la64/la64_mapping.h
index 3bb6c1d4..6446f55f 100644
--- a/src/dynarec/la64/la64_mapping.h
+++ b/src/dynarec/la64/la64_mapping.h
@@ -26,7 +26,7 @@ r17     t5     x4           Temporary                       Scratch
 r18     t6     x5           Temporary                       Scratch                 Caller
 r19     t7     x6           Temporary                       Scratch                 Caller
 r20     t8     x7           Temporary                       Scratch                 Caller
-r21     rx     -            Reserved                        N/A                     -
+r21     rx     -            Reserved                        X87 Precision Control   -
 r22     fp     SavedSP      Saved register/frame pointer    -                       Callee
 r23     s0     R10          Saved register                  -                       Callee
 r24     s1     R11          Saved register                  -                       Callee
@@ -74,6 +74,8 @@ r31     s8     xEmu         Saved register                  The Emu struct
 #define x6 19
 #define x7 20
 
+#define x87pc 21
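+// r21 is reserved by the LoongArch ABI, so it can be dedicated to the cached x87 precision control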
+
 // emu is $r31
 #define xEmu 31
 // LA64 RA