author    Yang Liu <numbksco@gmail.com>  2024-03-28 21:17:26 +0800
committer GitHub <noreply@github.com>    2024-03-28 14:17:26 +0100
commit    912622bf1f3db7f3cba32b44f4f1c106e3fb7dbf (patch)
tree      074034e80742c29628d8d9288e2f62c53e7c567a /src
parent    70fec208d83899e5da663302c1ec7c632393afd7 (diff)
[LA64_DYNAREC] Added basic SSE infra and 66 0F 6E MOVD opcode (#1391)
Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/la64/dynarec_la64_660f.c       |  15
-rw-r--r--  src/dynarec/la64/dynarec_la64_functions.c  | 209
-rw-r--r--  src/dynarec/la64/dynarec_la64_functions.h  |  15
-rw-r--r--  src/dynarec/la64/dynarec_la64_helper.c     | 410
-rw-r--r--  src/dynarec/la64/dynarec_la64_helper.h     |   7
-rw-r--r--  src/dynarec/la64/dynarec_la64_private.h    |  53
-rw-r--r--  src/dynarec/la64/la64_emitter.h            |  37
7 files changed, 718 insertions(+), 28 deletions(-)
diff --git a/src/dynarec/la64/dynarec_la64_660f.c b/src/dynarec/la64/dynarec_la64_660f.c
index 5f7192ca..effaf5bf 100644
--- a/src/dynarec/la64/dynarec_la64_660f.c
+++ b/src/dynarec/la64/dynarec_la64_660f.c
@@ -55,6 +55,21 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             FAKEED;
             break;
+        case 0x6E:
+            INST_NAME("MOVD Gx, Ed");
+            nextop = F8;
+            GETGX_empty(v0);
+            v1 = fpu_get_scratch(dyn);
+            GETED(0);
+            VXOR_V(v0, v0, v0);
+            if (rex.w) {
+                MOVGR2FR_D(v1, ed);
+                VEXTRINS_D(v0, v1, 0); // v0[63:0] = v1[63:0]
+            } else {
+                MOVGR2FR_W(v1, ed);
+                VEXTRINS_W(v0, v1, 0); // v0[31:0] = v1[31:0]
+            }
+            break;
         default:
             DEFAULT;
     }
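
For reference, the sequence zeroes the destination before inserting the low element, so the upper bits of Gx always read as zero after MOVD. A minimal C sketch of the computed result (illustrative only, modelling an XMM register as two 64-bit lanes; not part of the commit):

    #include <stdint.h>

    /* Model of an XMM register as two 64-bit lanes. */
    typedef struct { uint64_t lo, hi; } xmm_t;

    /* MOVD/MOVQ Gx, Ed: VXOR_V zeroes v0, then VEXTRINS_{W,D} inserts the
       low 32 (rex.w == 0) or 64 (rex.w == 1) bits moved in from the GPR. */
    static xmm_t movd_gx_ed(uint64_t ed, int rex_w)
    {
        xmm_t v0 = { 0, 0 };               /* VXOR_V(v0, v0, v0)        */
        v0.lo = rex_w ? ed : (uint32_t)ed; /* VEXTRINS_{D,W}(v0, v1, 0) */
        return v0;
    }
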
diff --git a/src/dynarec/la64/dynarec_la64_functions.c b/src/dynarec/la64/dynarec_la64_functions.c
index c73f91e4..01eaee06 100644
--- a/src/dynarec/la64/dynarec_la64_functions.c
+++ b/src/dynarec/la64/dynarec_la64_functions.c
@@ -26,10 +26,169 @@
 #include "custommem.h"
 #include "bridge.h"
 
+#define XMM0 0
+#define XMM8 16
+#define X870 8
+#define EMM0 8
+
+// Get an FPU scratch reg
+int fpu_get_scratch(dynarec_la64_t* dyn)
+{
+    return SCRATCH0 + dyn->lsx.fpu_scratch++; // return a scratch LSX reg
+}
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_la64_t* dyn)
 {
-    // TODO
+    dyn->lsx.fpu_scratch = 0;
+}
+
+// Free an FPU/MMX/XMM reg
+void fpu_free_reg(dynarec_la64_t* dyn, int reg)
+{
+    // TODO: check upper limit?
+    dyn->lsx.fpuused[reg] = 0;
+    if (dyn->lsx.lsxcache[reg].t != LSX_CACHE_ST_F && dyn->lsx.lsxcache[reg].t != LSX_CACHE_ST_D && dyn->lsx.lsxcache[reg].t != LSX_CACHE_ST_I64)
+        dyn->lsx.lsxcache[reg].v = 0;
+}
+
+// Get an XMM quad reg
+int fpu_get_reg_xmm(dynarec_la64_t* dyn, int t, int xmm)
+{
+    int i;
+    if (xmm > 7) {
+        i = XMM8 + xmm - 8;
+    } else {
+        i = XMM0 + xmm;
+    }
+    dyn->lsx.fpuused[i] = 1;
+    dyn->lsx.lsxcache[i].t = t;
+    dyn->lsx.lsxcache[i].n = xmm;
+    dyn->lsx.news |= (1 << i);
+    return i;
+}
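
The XMM0/XMM8 defines above, together with X870/EMM0 and SCRATCH0 (24, from dynarec_la64_functions.h), pin down how guest FPU state maps onto the 32 LSX vector registers. A sketch of the assumed layout (illustrative, not part of the commit):

    /* v0..v7   : XMM0..XMM7                 (XMM0 == 0)
       v8..v15  : x87 and MMX, shared        (X870 == EMM0 == 8)
       v16..v23 : XMM8..XMM15                (XMM8 == 16)
       v24..    : scratch, fpu_get_scratch() (SCRATCH0 == 24) */
    static int lsx_reg_for_xmm(int xmm) /* xmm in 0..15 */
    {
        return (xmm > 7) ? 16 + (xmm - 8) : xmm; /* mirrors fpu_get_reg_xmm() */
    }
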
+
+// Reset fpu regs counter
+static void fpu_reset_reg_lsxcache(lsxcache_t* lsx)
+{
+    lsx->fpu_reg = 0;
+    for (int i = 0; i < 24; ++i) {
+        lsx->fpuused[i] = 0;
+        lsx->lsxcache[i].v = 0;
+    }
+}
+
+void fpu_reset_reg(dynarec_la64_t* dyn)
+{
+    fpu_reset_reg_lsxcache(&dyn->lsx);
+}
+
+void lsxcacheUnwind(lsxcache_t* cache)
+{
+    if (cache->swapped) {
+        // unswap
+        int a = -1;
+        int b = -1;
+        for (int j = 0; j < 24 && ((a == -1) || (b == -1)); ++j)
+            if ((cache->lsxcache[j].t == LSX_CACHE_ST_D || cache->lsxcache[j].t == LSX_CACHE_ST_F || cache->lsxcache[j].t == LSX_CACHE_ST_I64)) {
+                if (cache->lsxcache[j].n == cache->combined1)
+                    a = j;
+                else if (cache->lsxcache[j].n == cache->combined2)
+                    b = j;
+            }
+        if (a != -1 && b != -1) {
+            int tmp = cache->lsxcache[a].n;
+            cache->lsxcache[a].n = cache->lsxcache[b].n;
+            cache->lsxcache[b].n = tmp;
+        }
+        cache->swapped = 0;
+        cache->combined1 = cache->combined2 = 0;
+    }
+    if (cache->news) {
+        // remove the newly created lsxcache entries
+        for (int i = 0; i < 24; ++i)
+            if (cache->news & (1 << i))
+                cache->lsxcache[i].v = 0;
+        cache->news = 0;
+    }
+    if (cache->stack_push) {
+        // unpush
+        for (int j = 0; j < 24; ++j) {
+            if ((cache->lsxcache[j].t == LSX_CACHE_ST_D || cache->lsxcache[j].t == LSX_CACHE_ST_F || cache->lsxcache[j].t == LSX_CACHE_ST_I64)) {
+                if (cache->lsxcache[j].n < cache->stack_push)
+                    cache->lsxcache[j].v = 0;
+                else
+                    cache->lsxcache[j].n -= cache->stack_push;
+            }
+        }
+        cache->x87stack -= cache->stack_push;
+        cache->stack -= cache->stack_push;
+        cache->stack_push = 0;
+    }
+    cache->x87stack += cache->stack_pop;
+    cache->stack_next = cache->stack;
+    cache->stack_pop = 0;
+    cache->barrier = 0;
+    // And now, rebuild the x87cache info with lsxcache
+    cache->mmxcount = 0;
+    cache->fpu_scratch = 0;
+    cache->fpu_extra_qscratch = 0;
+    cache->fpu_reg = 0;
+    for (int i = 0; i < 8; ++i) {
+        cache->x87cache[i] = -1;
+        cache->mmxcache[i] = -1;
+        cache->x87reg[i] = 0;
+        cache->ssecache[i * 2].v = -1;
+        cache->ssecache[i * 2 + 1].v = -1;
+    }
+    int x87reg = 0;
+    for (int i = 0; i < 24; ++i) {
+        if (cache->lsxcache[i].v) {
+            cache->fpuused[i] = 1;
+            switch (cache->lsxcache[i].t) {
+                case LSX_CACHE_MM:
+                    cache->mmxcache[cache->lsxcache[i].n] = i;
+                    ++cache->mmxcount;
+                    ++cache->fpu_reg;
+                    break;
+                case LSX_CACHE_XMMR:
+                case LSX_CACHE_XMMW:
+                    cache->ssecache[cache->lsxcache[i].n].reg = i;
+                    cache->ssecache[cache->lsxcache[i].n].write = (cache->lsxcache[i].t == LSX_CACHE_XMMW) ? 1 : 0;
+                    ++cache->fpu_reg;
+                    break;
+                case LSX_CACHE_ST_F:
+                case LSX_CACHE_ST_D:
+                case LSX_CACHE_ST_I64:
+                    cache->x87cache[x87reg] = cache->lsxcache[i].n;
+                    cache->x87reg[x87reg] = i;
+                    ++x87reg;
+                    ++cache->fpu_reg;
+                    break;
+                case LSX_CACHE_SCR:
+                    cache->fpuused[i] = 0;
+                    cache->lsxcache[i].v = 0;
+                    break;
+            }
+        } else {
+            cache->fpuused[i] = 0;
+        }
+    }
+}
+
+const char* getCacheName(int t, int n)
+{
+    static char buff[20];
+    switch (t) {
+        case LSX_CACHE_ST_D: sprintf(buff, "ST%d", n); break;
+        case LSX_CACHE_ST_F: sprintf(buff, "st%d", n); break;
+        case LSX_CACHE_ST_I64: sprintf(buff, "STi%d", n); break;
+        case LSX_CACHE_MM: sprintf(buff, "MM%d", n); break;
+        case LSX_CACHE_XMMW: sprintf(buff, "XMM%d", n); break;
+        case LSX_CACHE_XMMR: sprintf(buff, "xmm%d", n); break;
+        case LSX_CACHE_SCR: sprintf(buff, "Scratch"); break;
+        case LSX_CACHE_NONE: buff[0] = '\0'; break;
+    }
+    return buff;
 }
 
 void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t rex)
@@ -63,23 +222,23 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
             dynarec_log(LOG_NONE, ", jmp=out");
         if (dyn->last_ip)
             dynarec_log(LOG_NONE, ", last_ip=%p", (void*)dyn->last_ip);
-        // for (int ii = 0; ii < 24; ++ii) {
-        //     switch (dyn->insts[ninst].n.neoncache[ii].t) {
-        //         case NEON_CACHE_ST_D: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
-        //         case NEON_CACHE_ST_F: dynarec_log(LOG_NONE, " S%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
-        //         case NEON_CACHE_ST_I64: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
-        //         case NEON_CACHE_MM: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
-        //         case NEON_CACHE_XMMW: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
-        //         case NEON_CACHE_XMMR: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
-        //         case NEON_CACHE_SCR: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
-        //         case NEON_CACHE_NONE:
-        //         default: break;
-        //     }
-        // }
-        // if (dyn->n.stack || dyn->insts[ninst].n.stack_next || dyn->insts[ninst].n.x87stack)
-        //     dynarec_log(LOG_NONE, " X87:%d/%d(+%d/-%d)%d", dyn->n.stack, dyn->insts[ninst].n.stack_next, dyn->insts[ninst].n.stack_push, dyn->insts[ninst].n.stack_pop, dyn->insts[ninst].n.x87stack);
-        // if (dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2)
-        //     dynarec_log(LOG_NONE, " %s:%d/%d", dyn->insts[ninst].n.swapped ? "SWP" : "CMB", dyn->insts[ninst].n.combined1, dyn->insts[ninst].n.combined2);
+        for (int ii = 0; ii < 24; ++ii) {
+            switch (dyn->insts[ninst].lsx.lsxcache[ii].t) {
+                case LSX_CACHE_ST_D: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break;
+                case LSX_CACHE_ST_F: dynarec_log(LOG_NONE, " S%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break;
+                case LSX_CACHE_ST_I64: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break;
+                case LSX_CACHE_MM: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break;
+                case LSX_CACHE_XMMW: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break;
+                case LSX_CACHE_XMMR: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break;
+                case LSX_CACHE_SCR: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].lsx.lsxcache[ii].t, dyn->insts[ninst].lsx.lsxcache[ii].n)); break;
+                case LSX_CACHE_NONE:
+                default: break;
+            }
+        }
+        if (dyn->lsx.stack || dyn->insts[ninst].lsx.stack_next || dyn->insts[ninst].lsx.x87stack)
+            dynarec_log(LOG_NONE, " X87:%d/%d(+%d/-%d)%d", dyn->lsx.stack, dyn->insts[ninst].lsx.stack_next, dyn->insts[ninst].lsx.stack_push, dyn->insts[ninst].lsx.stack_pop, dyn->insts[ninst].lsx.x87stack);
+        if (dyn->insts[ninst].lsx.combined1 || dyn->insts[ninst].lsx.combined2)
+            dynarec_log(LOG_NONE, " %s:%d/%d", dyn->insts[ninst].lsx.swapped ? "SWP" : "CMB", dyn->insts[ninst].lsx.combined1, dyn->insts[ninst].lsx.combined2);
         dynarec_log(LOG_NONE, "%s\n", (box64_dynarec_dump > 1) ? "\e[m" : "");
     }
 }
@@ -147,12 +306,22 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode)
     dynarec_log(LOG_NONE, "\t%08x\t%s\n", opcode, la64_print(opcode, (uintptr_t)dyn->block));
 }
 
+static void sse_reset(lsxcache_t* lsx)
+{
+    for (int i = 0; i < 16; ++i)
+        lsx->ssecache[i].v = -1;
+}
+
 void fpu_reset(dynarec_la64_t* dyn)
 {
-    // TODO
+    // TODO: x87 and mmx
+    sse_reset(&dyn->lsx);
+    fpu_reset_reg(dyn);
 }
 
 void fpu_reset_ninst(dynarec_la64_t* dyn, int ninst)
 {
-    // TODO
+    // TODO: x87 and mmx
+    sse_reset(&dyn->insts[ninst].lsx);
+    fpu_reset_reg_lsxcache(&dyn->insts[ninst].lsx);
 }
\ No newline at end of file
diff --git a/src/dynarec/la64/dynarec_la64_functions.h b/src/dynarec/la64/dynarec_la64_functions.h
index 67608783..2badba4d 100644
--- a/src/dynarec/la64/dynarec_la64_functions.h
+++ b/src/dynarec/la64/dynarec_la64_functions.h
@@ -3,11 +3,26 @@
 
 #include "../dynarec_native_functions.h"
 
+#define SCRATCH0 24
+
 typedef struct x64emu_s x64emu_t;
 typedef struct dynarec_la64_s dynarec_la64_t;
 
+// Get an FPU scratch reg
+int fpu_get_scratch(dynarec_la64_t* dyn);
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_la64_t* dyn);
+// Get an XMM quad reg
+int fpu_get_reg_xmm(dynarec_la64_t* dyn, int t, int xmm);
+// Free an FPU/MMX/XMM reg
+void fpu_free_reg(dynarec_la64_t* dyn, int reg);
+// Reset fpu regs counter
+void fpu_reset_reg(dynarec_la64_t* dyn);
+
+// Undo the changes of a lsxcache to get the status before the instruction
+void lsxcacheUnwind(lsxcache_t* cache);
+
+const char* getCacheName(int t, int n);
 
 void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t rex);
 void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode);
diff --git a/src/dynarec/la64/dynarec_la64_helper.c b/src/dynarec/la64/dynarec_la64_helper.c
index cbd97fa8..9ad715e6 100644
--- a/src/dynarec/la64/dynarec_la64_helper.c
+++ b/src/dynarec/la64/dynarec_la64_helper.c
@@ -511,30 +511,122 @@ void x87_forget(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st)
     // TODO
 }
 
+// get the LSX reg for an SSE reg, but don't try to sync it if it needs to be created
+int sse_get_reg_empty(dynarec_la64_t* dyn, int ninst, int s1, int a)
+{
+    if (dyn->lsx.ssecache[a].v != -1) {
+        dyn->lsx.ssecache[a].write = 1;
+        dyn->lsx.lsxcache[dyn->lsx.ssecache[a].reg].t = LSX_CACHE_XMMW;
+        return dyn->lsx.ssecache[a].reg;
+    }
+    dyn->lsx.ssecache[a].reg = fpu_get_reg_xmm(dyn, LSX_CACHE_XMMW, a);
+    dyn->lsx.ssecache[a].write = 1; // it will be write...
+    return dyn->lsx.ssecache[a].reg;
+}
+
 // purge the SSE cache for XMM0..XMM7 (use before a native function call)
 void sse_purge07cache(dynarec_la64_t* dyn, int ninst, int s1)
 {
-    // TODO
+    int old = -1;
+    for (int i = 0; i < 8; ++i)
+        if (dyn->lsx.ssecache[i].v != -1) {
+            if (old == -1) {
+                MESSAGE(LOG_DUMP, "\tPurge XMM0..7 Cache ------\n");
+                ++old;
+            }
+            if (dyn->lsx.lsxcache[dyn->lsx.ssecache[i].reg].t == LSX_CACHE_XMMW) {
+                VST(dyn->lsx.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+            }
+            fpu_free_reg(dyn, dyn->lsx.ssecache[i].reg);
+            dyn->lsx.ssecache[i].v = -1;
+        }
+    if (old != -1) {
+        MESSAGE(LOG_DUMP, "\t------ Purge XMM0..7 Cache\n");
+    }
+}
+
+// purge the SSE cache only
+static void sse_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1)
+{
+    int old = -1;
+    for (int i = 0; i < 16; ++i)
+        if (dyn->lsx.ssecache[i].v != -1) {
+            if (dyn->lsx.ssecache[i].write) {
+                if (old == -1) {
+                    MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next ? "locally " : "");
+                    ++old;
+                }
+                VST(dyn->lsx.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+            }
+            if (!next) {
+                fpu_free_reg(dyn, dyn->lsx.ssecache[i].reg);
+                dyn->lsx.ssecache[i].v = -1;
+            }
+        }
+    if (old != -1) {
+        MESSAGE(LOG_DUMP, "\t------ Purge SSE Cache\n");
+    }
+}
+
+static void sse_reflectcache(dynarec_la64_t* dyn, int ninst, int s1)
+{
+    for (int i = 0; i < 16; ++i)
+        if (dyn->lsx.ssecache[i].v != -1 && dyn->lsx.ssecache[i].write) {
+            VST(dyn->lsx.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+        }
 }
 
 void fpu_pushcache(dynarec_la64_t* dyn, int ninst, int s1, int not07)
 {
-    // TODO
+    int start = not07 ? 8 : 0;
+    // only SSE regs need to be pushed back to xEmu (they need to be "write")
+    int n = 0;
+    for (int i = start; i < 16; i++)
+        if ((dyn->lsx.ssecache[i].v != -1) && (dyn->lsx.ssecache[i].write))
+            ++n;
+    if (!n)
+        return;
+    MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n);
+    for (int i = start; i < 16; ++i)
+        if ((dyn->lsx.ssecache[i].v != -1) && (dyn->lsx.ssecache[i].write)) {
+            VST(dyn->lsx.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+        }
+    MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n);
 }
 
 void fpu_popcache(dynarec_la64_t* dyn, int ninst, int s1, int not07)
 {
-    // TODO
+    int start = not07 ? 8 : 0;
+    // only SSE regs need to be popped back from xEmu (they don't need to be "write" this time)
+    int n = 0;
+    for (int i = start; i < 16; i++)
+        if (dyn->lsx.ssecache[i].v != -1)
+            ++n;
+    if (!n)
+        return;
+    MESSAGE(LOG_DUMP, "\tPop XMM Cache (%d)------\n", n);
+    for (int i = start; i < 16; ++i)
+        if (dyn->lsx.ssecache[i].v != -1) {
+            VLD(dyn->lsx.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+        }
+    MESSAGE(LOG_DUMP, "\t------- Pop XMM Cache (%d)\n", n);
 }
 
 void fpu_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3)
 {
-    // TODO
+    // TODO: x87_purgecache(dyn, ninst, next, s1, s2, s3);
+    // TODO: mmx_purgecache(dyn, ninst, next, s1);
+
+    sse_purgecache(dyn, ninst, next, s1);
+    if (!next)
+        fpu_reset_reg(dyn);
 }
 
 void fpu_reflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3)
 {
-    // TODO
+    // TODO: x87_reflectcache(dyn, ninst, s1, s2, s3);
+    // TODO: mmx_reflectcache(dyn, ninst, s1);
+    sse_reflectcache(dyn, ninst, s1);
 }
 
 void fpu_unreflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3)
@@ -562,19 +654,321 @@ void emit_pf(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4)
 
 void fpu_reset_cache(dynarec_la64_t* dyn, int ninst, int reset_n)
 {
-    // TODO
+    MESSAGE(LOG_DEBUG, "Reset Caches with %d\n", reset_n);
+#if STEP > 1
+    // for STEP 2 & 3, just need to refresh with current, and undo the changes (push & swap)
+    dyn->lsx = dyn->insts[ninst].lsx;
+    lsxcacheUnwind(&dyn->lsx);
+#ifdef HAVE_TRACE
+// TODO: trace
+#endif // HAVE_TRACE
+#else
+    dyn->lsx = dyn->insts[reset_n].lsx;
+#endif
 }
 
 // propagate ST stack state, especially stack pops that are deferred
 void fpu_propagate_stack(dynarec_la64_t* dyn, int ninst)
 {
-    // TODO
+    if (dyn->lsx.stack_pop) {
+        for (int j = 0; j < 24; ++j)
+            if ((dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_D
+                    || dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_F
+                    || dyn->lsx.lsxcache[j].t == LSX_CACHE_ST_I64)) {
+                if (dyn->lsx.lsxcache[j].n < dyn->lsx.stack_pop)
+                    dyn->lsx.lsxcache[j].v = 0;
+                else
+                    dyn->lsx.lsxcache[j].n -= dyn->lsx.stack_pop;
+            }
+        dyn->lsx.stack_pop = 0;
+    }
+    dyn->lsx.stack = dyn->lsx.stack_next;
+    dyn->lsx.news = 0;
+    dyn->lsx.stack_push = 0;
+    dyn->lsx.swapped = 0;
+}
+
+
+static int findCacheSlot(dynarec_la64_t* dyn, int ninst, int t, int n, lsxcache_t* cache)
+{
+    lsx_cache_t f;
+    f.n = n;
+    f.t = t;
+    for (int i = 0; i < 24; ++i) {
+        if (cache->lsxcache[i].v == f.v)
+            return i;
+        if (cache->lsxcache[i].n == n) {
+            switch (cache->lsxcache[i].t) {
+                case LSX_CACHE_ST_F:
+                    if (t == LSX_CACHE_ST_D)
+                        return i;
+                    if (t == LSX_CACHE_ST_I64)
+                        return i;
+                    break;
+                case LSX_CACHE_ST_D:
+                    if (t == LSX_CACHE_ST_F)
+                        return i;
+                    if (t == LSX_CACHE_ST_I64)
+                        return i;
+                    break;
+                case LSX_CACHE_ST_I64:
+                    if (t == LSX_CACHE_ST_F)
+                        return i;
+                    if (t == LSX_CACHE_ST_D)
+                        return i;
+                    break;
+                case LSX_CACHE_XMMR:
+                    if (t == LSX_CACHE_XMMW)
+                        return i;
+                    break;
+                case LSX_CACHE_XMMW:
+                    if (t == LSX_CACHE_XMMR)
+                        return i;
+                    break;
+            }
+        }
+    }
+    return -1;
+}
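
The cache->lsxcache[i].v == f.v fast path works because lsx_cache_t is a one-byte union: comparing v checks the packed {t, n} pair in a single compare. A self-contained sketch of the trick, reusing the layout from dynarec_la64_private.h below:

    #include <assert.h>
    #include <stdint.h>

    typedef union {
        int8_t v;
        struct { uint8_t t : 4; uint8_t n : 4; };
    } lsx_cache_t;

    int main(void)
    {
        lsx_cache_t a = { .v = 0 }, b = { .v = 0 };
        a.t = 5; a.n = 3; /* e.g. LSX_CACHE_XMMW, xmm3 */
        b.t = 5; b.n = 3;
        assert(a.v == b.v); /* one byte compare matches type and number */
        b.n = 4;
        assert(a.v != b.v);
        return 0;
    }
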
+
+static void swapCache(dynarec_la64_t* dyn, int ninst, int i, int j, lsxcache_t* cache)
+{
+    if (i == j)
+        return;
+    int quad = 0;
+    if (cache->lsxcache[i].t == LSX_CACHE_XMMR || cache->lsxcache[i].t == LSX_CACHE_XMMW)
+        quad = 1;
+    if (cache->lsxcache[j].t == LSX_CACHE_XMMR || cache->lsxcache[j].t == LSX_CACHE_XMMW)
+        quad = 1;
+
+    if (!cache->lsxcache[i].v) {
+        // a mov is enough, no need to swap
+        MESSAGE(LOG_DUMP, "\t  - Moving %d <- %d\n", i, j);
+        if (quad) {
+            VOR_V(i, j, j);
+        } else {
+            VXOR_V(i, i, i);
+            VEXTRINS_D(i, j, 0);
+        }
+        cache->lsxcache[i].v = cache->lsxcache[j].v;
+        cache->lsxcache[j].v = 0;
+        return;
+    }
+    // SWAP
+    lsx_cache_t tmp;
+    MESSAGE(LOG_DUMP, "\t  - Swapping %d <-> %d\n", i, j);
+    // LoongArch LSX has no swap instruction to exchange 2 register contents,
+    // so use a scratch...
+#define SCRATCH 31
+    if (quad) {
+        VOR_V(SCRATCH, i, i);
+        VOR_V(i, j, j);
+        VOR_V(j, SCRATCH, SCRATCH);
+    } else {
+        VXOR_V(SCRATCH, SCRATCH, SCRATCH);
+        VEXTRINS_D(SCRATCH, i, 0);
+        VXOR_V(i, i, i);
+        VEXTRINS_D(i, j, 0);
+        VXOR_V(j, j, j);
+        VEXTRINS_D(j, SCRATCH, 0);
+    }
+#undef SCRATCH
+    tmp.v = cache->lsxcache[i].v;
+    cache->lsxcache[i].v = cache->lsxcache[j].v;
+    cache->lsxcache[j].v = tmp.v;
 }
 
+static void loadCache(dynarec_la64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, lsxcache_t* cache, int i, int t, int n)
+{
+    if (cache->lsxcache[i].v) {
+        int quad = 0;
+        if (t == LSX_CACHE_XMMR || t == LSX_CACHE_XMMW)
+            quad = 1;
+        if (cache->lsxcache[i].t == LSX_CACHE_XMMR || cache->lsxcache[i].t == LSX_CACHE_XMMW)
+            quad = 1;
+        int j = i + 1;
+        while (cache->lsxcache[j].v)
+            ++j;
+        MESSAGE(LOG_DUMP, "\t  - Moving away %d\n", i);
+        if (quad) {
+            VOR_V(j, i, i);
+        } else {
+            VXOR_V(j, j, j);
+            VEXTRINS_D(j, i, 0);
+        }
+        cache->lsxcache[j].v = cache->lsxcache[i].v;
+    }
+    switch (t) {
+        case LSX_CACHE_XMMR:
+        case LSX_CACHE_XMMW:
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
+            VLD(i, xEmu, offsetof(x64emu_t, xmm[n]));
+            break;
+        case LSX_CACHE_MM:
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
+            VLD(i, xEmu, offsetof(x64emu_t, mmx[n]));
+            break;
+        case LSX_CACHE_ST_D:
+        case LSX_CACHE_ST_F:
+        case LSX_CACHE_ST_I64:
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
+            // TODO: x87
+            break;
+        case LSX_CACHE_NONE:
+        case LSX_CACHE_SCR:
+        default: /* nothing done */
+            MESSAGE(LOG_DUMP, "\t  - ignoring %s\n", getCacheName(t, n));
+            break;
+    }
+    cache->lsxcache[i].n = n;
+    cache->lsxcache[i].t = t;
+}
+
+static void unloadCache(dynarec_la64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, lsxcache_t* cache, int i, int t, int n)
+{
+    switch (t) {
+        case LSX_CACHE_XMMR:
+            MESSAGE(LOG_DUMP, "\t  - ignoring %s\n", getCacheName(t, n));
+            break;
+        case LSX_CACHE_XMMW:
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
+            VST(i, xEmu, offsetof(x64emu_t, xmm[n]));
+            break;
+        case LSX_CACHE_MM:
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
+            VST(i, xEmu, offsetof(x64emu_t, mmx[n]));
+            break;
+        case LSX_CACHE_ST_D:
+        case LSX_CACHE_ST_F:
+        case LSX_CACHE_ST_I64:
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
+            // TODO: x87
+            break;
+        case LSX_CACHE_NONE:
+        case LSX_CACHE_SCR:
+        default: /* nothing done */
+            MESSAGE(LOG_DUMP, "\t  - ignoring %s\n", getCacheName(t, n));
+            break;
+    }
+    cache->lsxcache[i].v = 0;
+}
 
 static void fpuCacheTransform(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3)
 {
-    // TODO
+#if STEP > 1
+    int i2 = dyn->insts[ninst].x64.jmp_insts;
+    if (i2 < 0)
+        return;
+    MESSAGE(LOG_DUMP, "\tCache Transform ---- ninst=%d -> %d\n", ninst, i2);
+    if ((!i2) || (dyn->insts[i2].x64.barrier & BARRIER_FLOAT)) {
+        if (dyn->lsx.stack_next) {
+            fpu_purgecache(dyn, ninst, 1, s1, s2, s3);
+            MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+            return;
+        }
+        for (int i = 0; i < 24; ++i)
+            if (dyn->lsx.lsxcache[i].v) { // there is something at ninst for i
+                fpu_purgecache(dyn, ninst, 1, s1, s2, s3);
+                MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+                return;
+            }
+        MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+        return;
+    }
+    lsxcache_t cache_i2 = dyn->insts[i2].lsx;
+    lsxcacheUnwind(&cache_i2);
+
+    if (!cache_i2.stack) {
+        int purge = 1;
+        for (int i = 0; i < 24 && purge; ++i)
+            if (cache_i2.lsxcache[i].v)
+                purge = 0;
+        if (purge) {
+            fpu_purgecache(dyn, ninst, 1, s1, s2, s3);
+            MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+            return;
+        }
+    }
+    int stack_cnt = dyn->lsx.stack_next;
+    int s3_top = 0xffff;
+    if (stack_cnt != cache_i2.stack) {
+        MESSAGE(LOG_DUMP, "\t    - adjust stack count %d -> %d -\n", stack_cnt, cache_i2.stack);
+        int a = stack_cnt - cache_i2.stack;
+        // TODO: x87
+        s3_top = 0;
+        stack_cnt = cache_i2.stack;
+    }
+    lsxcache_t cache = dyn->lsx;
+    int s1_val = 0;
+    int s2_val = 0;
+    // unload every unneeded cache entry
+    // check SSE first, then MMX, in order, for optimization purposes
+    for (int i = 0; i < 16; ++i) {
+        int j = findCacheSlot(dyn, ninst, LSX_CACHE_XMMW, i, &cache);
+        if (j >= 0 && findCacheSlot(dyn, ninst, LSX_CACHE_XMMW, i, &cache_i2) == -1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.lsxcache[j].t, cache.lsxcache[j].n);
+    }
+    for (int i = 0; i < 8; ++i) {
+        int j = findCacheSlot(dyn, ninst, LSX_CACHE_MM, i, &cache);
+        if (j >= 0 && findCacheSlot(dyn, ninst, LSX_CACHE_MM, i, &cache_i2) == -1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.lsxcache[j].t, cache.lsxcache[j].n);
+    }
+    for (int i = 0; i < 24; ++i) {
+        if (cache.lsxcache[i].v)
+            if (findCacheSlot(dyn, ninst, cache.lsxcache[i].t, cache.lsxcache[i].n, &cache_i2) == -1)
+                unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache.lsxcache[i].t, cache.lsxcache[i].n);
+    }
+    // and now load/swap the missing one
+    for (int i = 0; i < 24; ++i) {
+        if (cache_i2.lsxcache[i].v) {
+            if (cache_i2.lsxcache[i].v != cache.lsxcache[i].v) {
+                int j;
+                if ((j = findCacheSlot(dyn, ninst, cache_i2.lsxcache[i].t, cache_i2.lsxcache[i].n, &cache)) == -1)
+                    loadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache_i2.lsxcache[i].t, cache_i2.lsxcache[i].n);
+                else {
+                    // it's here, lets swap if needed
+                    if (j != i)
+                        swapCache(dyn, ninst, i, j, &cache);
+                }
+            }
+            if (cache.lsxcache[i].t != cache_i2.lsxcache[i].t) {
+                if (cache.lsxcache[i].t == LSX_CACHE_ST_D && cache_i2.lsxcache[i].t == LSX_CACHE_ST_F) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n));
+                    FCVT_S_D(i, i);
+                    cache.lsxcache[i].t = LSX_CACHE_ST_F;
+                } else if (cache.lsxcache[i].t == LSX_CACHE_ST_F && cache_i2.lsxcache[i].t == LSX_CACHE_ST_D) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n));
+                    FCVT_D_S(i, i);
+                    cache.lsxcache[i].t = LSX_CACHE_ST_D;
+                } else if (cache.lsxcache[i].t == LSX_CACHE_ST_D && cache_i2.lsxcache[i].t == LSX_CACHE_ST_I64) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n));
+                    VFTINTRZ_L_D(i, i);
+                    cache.lsxcache[i].t = LSX_CACHE_ST_I64;
+                } else if (cache.lsxcache[i].t == LSX_CACHE_ST_F && cache_i2.lsxcache[i].t == LSX_CACHE_ST_I64) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n));
+                    VFTINTRZL_L_S(i, i);
+                    cache.lsxcache[i].t = LSX_CACHE_ST_I64;
+                } else if (cache.lsxcache[i].t == LSX_CACHE_ST_I64 && cache_i2.lsxcache[i].t == LSX_CACHE_ST_F) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n));
+                    VFFINT_S_L(i, i, i);
+                    cache.lsxcache[i].t = LSX_CACHE_ST_F;
+                } else if (cache.lsxcache[i].t == LSX_CACHE_ST_I64 && cache_i2.lsxcache[i].t == LSX_CACHE_ST_D) {
+                    MESSAGE(LOG_DUMP, "\t  - Convert %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n));
+                    VFFINT_D_L(i, i);
+                    cache.lsxcache[i].t = LSX_CACHE_ST_D;
+                } else if (cache.lsxcache[i].t == LSX_CACHE_XMMR && cache_i2.lsxcache[i].t == LSX_CACHE_XMMW) {
+                    cache.lsxcache[i].t = LSX_CACHE_XMMW;
+                } else if (cache.lsxcache[i].t == LSX_CACHE_XMMW && cache_i2.lsxcache[i].t == LSX_CACHE_XMMR) {
+                    // refresh cache...
+                    MESSAGE(LOG_DUMP, "\t  - Refresh %s\n", getCacheName(cache.lsxcache[i].t, cache.lsxcache[i].n));
+                    VST(i, xEmu, offsetof(x64emu_t, xmm[cache.lsxcache[i].n]));
+                    cache.lsxcache[i].t = LSX_CACHE_XMMR;
+                }
+            }
+        }
+    }
+    MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+#endif
 }
 
 static void flagsCacheTransform(dynarec_la64_t* dyn, int ninst, int s1)
diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h
index f26dfba7..30f341e8 100644
--- a/src/dynarec/la64/dynarec_la64_helper.h
+++ b/src/dynarec/la64/dynarec_la64_helper.h
@@ -174,6 +174,10 @@
     gd = i;                                                   \
     BSTRPICK_D(gd, gb1, gb2 + 7, gb2);
 
+#define GETGX_empty(a)                          \
+    gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \
+    a = sse_get_reg_empty(dyn, ninst, x1, gd)
+
 // Write gb (gd) back to original register / memory, using s1 as scratch
 #define GBBACK(s1) BSTRINS_D(gb1, gd, gb2 + 7, gb2);
 
@@ -523,6 +527,7 @@ void* la64_next(x64emu_t* emu, uintptr_t addr);
 
 #define x87_forget       STEPNAME(x87_forget)
 #define sse_purge07cache STEPNAME(sse_purge07cache)
+#define sse_get_reg_empty STEPNAME(sse_get_reg_empty)
 
 #define fpu_pushcache       STEPNAME(fpu_pushcache)
 #define fpu_popcache        STEPNAME(fpu_popcache)
@@ -595,6 +600,8 @@ void x87_forget(dynarec_la64_t* dyn, int ninst, int s1, int s2, int st);
 // SSE/SSE2 helpers
 // purge the XMM0..XMM7 cache (before function call)
 void sse_purge07cache(dynarec_la64_t* dyn, int ninst, int s1);
+// get the LSX reg for an SSE reg, but don't try to sync it if it needs to be created
+int sse_get_reg_empty(dynarec_la64_t* dyn, int ninst, int s1, int a);
 
 void CacheTransform(dynarec_la64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3);
 
diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h
index 1ecb88f6..7b76a75f 100644
--- a/src/dynarec/la64/dynarec_la64_private.h
+++ b/src/dynarec/la64/dynarec_la64_private.h
@@ -9,6 +9,57 @@ typedef struct instsize_s instsize_t;
 
 #define BARRIER_MAYBE   8
 
+#define LSX_CACHE_NONE   0
+#define LSX_CACHE_ST_D   1
+#define LSX_CACHE_ST_F   2
+#define LSX_CACHE_ST_I64 3
+#define LSX_CACHE_MM     4
+#define LSX_CACHE_XMMW   5
+#define LSX_CACHE_XMMR   6
+#define LSX_CACHE_SCR    7
+
+typedef union lsx_cache_s {
+    int8_t v;
+    struct {
+        uint8_t t : 4; // reg type
+        uint8_t n : 4; // reg number
+    };
+} lsx_cache_t;
+
+typedef union sse_cache_s {
+    int8_t v;
+    struct {
+        uint8_t reg : 7;
+        uint8_t write : 1;
+    };
+} sse_cache_t;
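
Both unions fit in a single byte, which is what makes v = -1 usable as the "empty" sentinel stored by sse_reset(): all bits set decodes to reg == 127, a value no real LSX register (0..31) can take. A small demonstration (illustrative, not part of the commit):

    #include <stdint.h>
    #include <stdio.h>

    typedef union {
        int8_t v;
        struct { uint8_t reg : 7; uint8_t write : 1; };
    } sse_cache_t;

    int main(void)
    {
        sse_cache_t c = { .v = -1 };                 /* "no reg cached" */
        printf("reg=%d write=%d\n", c.reg, c.write); /* reg=127 write=1 */
        return 0;
    }
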
+
+typedef struct lsxcache_s {
+    // LSX cache
+    lsx_cache_t     lsxcache[24];
+    int8_t          stack;
+    int8_t          stack_next;
+    int8_t          stack_pop;
+    int8_t          stack_push;
+    uint8_t         combined1;
+    uint8_t         combined2;
+    uint8_t         swapped;        // the combined reg were swapped
+    uint8_t         barrier;        // is there a barrier at instruction epilog?
+    uint32_t        news;           // bitmask of which lsxcache entries are new for this opcode
+    // fpu cache
+    int8_t          x87cache[8];    // cache status for the 8 x87 register behind the fpu stack
+    int8_t          x87reg[8];      // reg used for x87cache entry
+    int8_t          freed[8];       // set when FFREE is used, -1 otherwise
+    int8_t          mmxcache[8];    // cache status for the 8 MMX registers
+    sse_cache_t     ssecache[16];   // cache status for the 16 SSE(2) registers
+    int8_t          fpuused[24];    // all 24 FPU regs (0..23), used by x87, SSE and MMX
+    int8_t          x87stack;       // cache stack counter
+    int8_t          mmxcount;       // number of MMX registers used (MMX and x87 are not used at the same time)
+    int8_t          fpu_scratch;    // scratch counter
+    int8_t          fpu_extra_qscratch; // some opcodes need an extra quad scratch register
+    int8_t          fpu_reg;        // x87/sse/mmx reg counter
+} lsxcache_t;
+
 typedef struct flagcache_s {
     int                 pending;    // is there a pending flags here, or to check?
     int                 dfnone;     // if deferred flags is already set to df_none
@@ -33,6 +84,7 @@ typedef struct instruction_la64_s {
     uint8_t             will_write;
     uint8_t             last_write;
     flagcache_t         f_exit;     // flags status at end of instruction
+    lsxcache_t          lsx;        // lsxcache at end of instruction (but before popping)
     flagcache_t         f_entry;    // flags status before the instruction begin
 } instruction_la64_t;
 
@@ -52,6 +104,7 @@ typedef struct dynarec_la64_s {
     uintptr_t            tablestart;
     uintptr_t            jmp_next;   // address of the jump_next address
     flagcache_t          f;
+    lsxcache_t           lsx;
     uintptr_t*           next;       // variable array of "next" jump address
     int                  next_sz;
     int                  next_cap;
diff --git a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h
index c9c67407..117bb99e 100644
--- a/src/dynarec/la64/la64_emitter.h
+++ b/src/dynarec/la64/la64_emitter.h
@@ -947,11 +947,41 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define VFMINA_D(vd, vj, vk)        EMIT(type_3R(0b01110001010000110, vk, vj, vd))
 #define VFCVT_H_S(vd, vj, vk)       EMIT(type_3R(0b01110001010001100, vk, vj, vd))
 #define VFCVT_S_D(vd, vj, vk)       EMIT(type_3R(0b01110001010001101, vk, vj, vd))
+#define VFTINT_W_S(vd, vj)          EMIT(type_2R(0b0111001010011110001100, vj, vd))
+#define VFTINT_L_D(vd, vj)          EMIT(type_2R(0b0111001010011110001101, vj, vd))
+#define VFTINTRM_W_S(vd, vj)        EMIT(type_2R(0b0111001010011110001110, vj, vd))
+#define VFTINTRM_L_D(vd, vj)        EMIT(type_2R(0b0111001010011110001111, vj, vd))
+#define VFTINTRP_W_S(vd, vj)        EMIT(type_2R(0b0111001010011110010000, vj, vd))
+#define VFTINTRP_L_D(vd, vj)        EMIT(type_2R(0b0111001010011110010001, vj, vd))
+#define VFTINTRZ_W_S(vd, vj)        EMIT(type_2R(0b0111001010011110010010, vj, vd))
+#define VFTINTRZ_L_D(vd, vj)        EMIT(type_2R(0b0111001010011110010011, vj, vd))
+#define VFTINTRNE_W_S(vd, vj)       EMIT(type_2R(0b0111001010011110010100, vj, vd))
+#define VFTINTRNE_L_D(vd, vj)       EMIT(type_2R(0b0111001010011110010101, vj, vd))
+#define VFTINT_WU_S(vd, vj)         EMIT(type_2R(0b0111001010011110010110, vj, vd))
+#define VFTINT_LU_D(vd, vj)         EMIT(type_2R(0b0111001010011110010111, vj, vd))
+#define VFTINTRZ_WU_S(vd, vj)       EMIT(type_2R(0b0111001010011110011100, vj, vd))
+#define VFTINTRZ_LU_D(vd, vj)       EMIT(type_2R(0b0111001010011110011101, vj, vd))
 #define VFTINT_W_D(vd, vj, vk)      EMIT(type_3R(0b01110001010010011, vk, vj, vd))
 #define VFTINTRM_W_D(vd, vj, vk)    EMIT(type_3R(0b01110001010010100, vk, vj, vd))
 #define VFTINTRP_W_D(vd, vj, vk)    EMIT(type_3R(0b01110001010010101, vk, vj, vd))
 #define VFTINTRZ_W_D(vd, vj, vk)    EMIT(type_3R(0b01110001010010110, vk, vj, vd))
 #define VFTINTRNE_W_D(vd, vj, vk)   EMIT(type_3R(0b01110001010010111, vk, vj, vd))
+#define VFTINTL_L_S(vd, vj)         EMIT(type_2R(0b0111001010011110100000, vj, vd))
+#define VFTINTH_L_S(vd, vj)         EMIT(type_2R(0b0111001010011110100001, vj, vd))
+#define VFTINTRML_L_S(vd, vj)       EMIT(type_2R(0b0111001010011110100010, vj, vd))
+#define VFTINTRMH_L_S(vd, vj)       EMIT(type_2R(0b0111001010011110100011, vj, vd))
+#define VFTINTRPL_L_S(vd, vj)       EMIT(type_2R(0b0111001010011110100100, vj, vd))
+#define VFTINTRPH_L_S(vd, vj)       EMIT(type_2R(0b0111001010011110100101, vj, vd))
+#define VFTINTRZL_L_S(vd, vj)       EMIT(type_2R(0b0111001010011110100110, vj, vd))
+#define VFTINTRZH_L_S(vd, vj)       EMIT(type_2R(0b0111001010011110100111, vj, vd))
+#define VFTINTRNEL_L_S(vd, vj)      EMIT(type_2R(0b0111001010011110101000, vj, vd))
+#define VFTINTRNEH_L_S(vd, vj)      EMIT(type_2R(0b0111001010011110101001, vj, vd))
+#define VFFINT_S_W(vd, vj)          EMIT(type_2R(0b0111001010011110000000, vj, vd))
+#define VFFINT_S_WU(vd, vj)         EMIT(type_2R(0b0111001010011110000001, vj, vd))
+#define VFFINT_D_L(vd, vj)          EMIT(type_2R(0b0111001010011110000010, vj, vd))
+#define VFFINT_D_LU(vd, vj)         EMIT(type_2R(0b0111001010011110000011, vj, vd))
+#define VFFINTL_D_W(vd, vj)         EMIT(type_2R(0b0111001010011110000100, vj, vd))
+#define VFFINTH_D_W(vd, vj)         EMIT(type_2R(0b0111001010011110000101, vj, vd))
 #define VFFINT_S_L(vd, vj, vk)      EMIT(type_3R(0b01110001010010000, vk, vj, vd))
 #define VSEQ_B(vd, vj, vk)          EMIT(type_3R(0b01110000000000000, vk, vj, vd))
 #define VSEQ_H(vd, vj, vk)          EMIT(type_3R(0b01110000000000001, vk, vj, vd))
@@ -1000,6 +1030,13 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define VSHUF_H(vd, vj, vk)         EMIT(type_3R(0b01110001011110101, vk, vj, vd))
 #define VSHUF_W(vd, vj, vk)         EMIT(type_3R(0b01110001011110110, vk, vj, vd))
 #define VSHUF_D(vd, vj, vk)         EMIT(type_3R(0b01110001011110111, vk, vj, vd))
+#define VEXTRINS_D(vd, vj, imm8)    EMIT(type_2RI8(0b01110011100000, imm8, vj, vd))
+#define VEXTRINS_W(vd, vj, imm8)    EMIT(type_2RI8(0b01110011100001, imm8, vj, vd))
+#define VEXTRINS_H(vd, vj, imm8)    EMIT(type_2RI8(0b01110011100010, imm8, vj, vd))
+#define VEXTRINS_B(vd, vj, imm8)    EMIT(type_2RI8(0b01110011100011, imm8, vj, vd))
+#define VLD(vd, rj, imm12)          EMIT(type_2RI12(0b0010110000, imm12, rj, vd))
+#define VST(vd, rj, imm12)          EMIT(type_2RI12(0b0010110001, imm12, rj, vd))
+
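
VLD/VST follow the LoongArch 2RI12 instruction format. A hedged sketch of the bit packing (field positions assumed from the standard 2RI12 layout; the real type_2RI12 macro is defined earlier in la64_emitter.h):

    #include <stdint.h>

    /* Assumed 2RI12 layout: [31:22] opcode, [21:10] si12, [9:5] rj, [4:0] vd. */
    static uint32_t enc_2ri12(uint32_t opc10, int32_t si12, int rj, int vd)
    {
        return (opc10 << 22) | ((uint32_t)(si12 & 0xfff) << 10)
             | ((uint32_t)(rj & 0x1f) << 5) | (uint32_t)(vd & 0x1f);
    }

    /* vld $vr0, $r4, 16  ->  enc_2ri12(0b0010110000, 16, 4, 0) */
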
 
 #define XVADD_B(vd, vj, vk)          EMIT(type_3R(0b01110100000010100, vk, vj, vd))
 #define XVADD_H(vd, vj, vk)          EMIT(type_3R(0b01110100000010101, vk, vj, vd))