about summary refs log tree commit diff stats
path: root/src/dynarec
diff options
context:
space:
mode:
Diffstat (limited to 'src/dynarec')
-rw-r--r--src/dynarec/arm64/dynarec_arm64_00.c27
-rw-r--r--src/dynarec/arm64/dynarec_arm64_avx.c60
-rw-r--r--src/dynarec/arm64/dynarec_arm64_avx_0f.c115
-rw-r--r--src/dynarec/arm64/dynarec_arm64_functions.c16
-rw-r--r--src/dynarec/arm64/dynarec_arm64_helper.c64
-rw-r--r--src/dynarec/arm64/dynarec_arm64_helper.h17
-rw-r--r--src/dynarec/arm64/dynarec_arm64_pass0.h1
-rw-r--r--src/dynarec/arm64/dynarec_arm64_private.h13
-rw-r--r--src/dynarec/dynarec_native.c8
-rw-r--r--src/dynarec/dynarec_native_functions.c30
-rw-r--r--src/dynarec/dynarec_native_functions.h9
-rw-r--r--src/dynarec/dynarec_native_pass.c2
-rw-r--r--src/dynarec/la64/dynarec_la64_helper.h2
-rw-r--r--src/dynarec/la64/dynarec_la64_pass0.h1
-rw-r--r--src/dynarec/la64/dynarec_la64_private.h5
-rw-r--r--src/dynarec/rv64/dynarec_rv64_helper.h2
-rw-r--r--src/dynarec/rv64/dynarec_rv64_pass0.h1
-rw-r--r--src/dynarec/rv64/dynarec_rv64_private.h5
18 files changed, 359 insertions, 19 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c
index 91e0159d..f09f0803 100644
--- a/src/dynarec/arm64/dynarec_arm64_00.c
+++ b/src/dynarec/arm64/dynarec_arm64_00.c
@@ -2206,7 +2206,19 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 STRH_U12(x1, xEmu, offsetof(x64emu_t, segs[_ES]));
                 STRw_U12(xZR, xEmu, offsetof(x64emu_t, segs_serial[_ES]));
             } else {
-                DEFAULT;
+                vex_t vex = {0};
+                vex.rex = rex;
+                u8 = nextop;
+                vex.m = u8&0b00011111;
+                vex.rex.b = (u8&0b00100000)?0:1;
+                vex.rex.x = (u8&0b01000000)?0:1;
+                vex.rex.r = (u8&0b10000000)?0:1;
+                u8 = F8;
+                vex.p = u8&0b00000011;
+                vex.l = (u8>>2)&1;
+                vex.v = ((~u8)>>3)&0b1111;
+                vex.rex.w = (u8>>7)&1;
+                addr = dynarec64_AVX(dyn, addr, ip, ninst, vex, ok, need_epilog);
             }
             break;
         case 0xC5:
@@ -2220,7 +2232,18 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 STRH_U12(x1, xEmu, offsetof(x64emu_t, segs[_DS]));
                 STRw_U12(xZR, xEmu, offsetof(x64emu_t, segs_serial[_DS]));
             } else {
-                DEFAULT;
+                vex_t vex = {0};
+                vex.rex = rex;
+                u8 = nextop;
+                vex.p = u8&0b00000011;
+                vex.l = (u8>>2)&1;
+                vex.v = ((~u8)>>3)&0b1111;
+                vex.rex.r = (u8&0b10000000)?0:1;
+                vex.rex.b = 0;
+                vex.rex.x = 0;
+                vex.rex.w = 0;
+                vex.m = VEX_M_0F;
+                addr = dynarec64_AVX(dyn, addr, ip, ninst, vex, ok, need_epilog);
             }
             break;
         case 0xC6:
diff --git a/src/dynarec/arm64/dynarec_arm64_avx.c b/src/dynarec/arm64/dynarec_arm64_avx.c
new file mode 100644
index 00000000..53ff0cf5
--- /dev/null
+++ b/src/dynarec/arm64/dynarec_arm64_avx.c
@@ -0,0 +1,60 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+
+#include "arm64_printer.h"
+#include "dynarec_arm64_private.h"
+#include "dynarec_arm64_functions.h"
+#include "dynarec_arm64_helper.h"
+
+static const char* avx_prefix_string(uint16_t p)
+{
+    // Text form of a VEX prefix value, used to build the "unimplemented AVX opcode" log line
+    if(p==VEX_P_NONE) return "0";
+    if(p==VEX_P_66) return "66";
+    if(p==VEX_P_F2) return "F2";
+    if(p==VEX_P_F3) return "F3";
+    // not one of the known prefix values
+    return "??";
+}
+static const char* avx_map_string(uint16_t m)
+{
+    // Text form of a VEX opcode-map value, used to build the "unimplemented AVX opcode" log line
+    if(m==VEX_M_NONE) return "0";
+    if(m==VEX_M_0F) return "0F";
+    if(m==VEX_M_0F38) return "0F38";
+    if(m==VEX_M_0F3A) return "0F3A";
+    // not one of the known map values
+    return "??";
+}
+
+uintptr_t dynarec64_AVX(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog)
+{
+    // Entry point for VEX-encoded opcodes: forwards to the handler matching the decoded map/prefix pair
+    (void)ip; (void)need_epilog;
+    uint8_t opcode = PK(0);     // opcode byte, reported in the log line below
+    rex_t rex = vex.rex;
+    if( (vex.m!=VEX_M_0F) || (vex.p!=VEX_P_NONE)) {
+        DEFAULT;                // no handler for this map/prefix pair
+    } else
+        addr = dynarec64_AVX_0F(dyn, addr, ip, ninst, vex, ok, need_epilog);
+
+    if(*ok==-1) {   // opcode was rejected (here or by the sub-handler): log what was decoded
+        printf_log(LOG_INFO, "Dynarec unimplemented AVX opcode size %d prefix %s map %s opcode %02X ", 128<<vex.l, avx_prefix_string(vex.p), avx_map_string(vex.m), opcode);
+    }
+    return addr;
+}
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
new file mode 100644
index 00000000..8addb9b1
--- /dev/null
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -0,0 +1,115 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+#include "my_cpuid.h"
+#include "emu/x87emu_private.h"
+#include "emu/x64shaext.h"
+
+#include "arm64_printer.h"
+#include "dynarec_arm64_private.h"
+#include "dynarec_arm64_functions.h"
+#include "dynarec_arm64_helper.h"
+
+uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog)
+{
+    (void)ip; (void)need_epilog;
+    // Handler for VEX opcodes of map 0F without prefix (see dynarec64_AVX); returns the updated addr
+    uint8_t opcode = F8;
+    uint8_t nextop, u8;
+    uint8_t gd, ed;
+    uint8_t wback, wb1, wb2;
+    uint8_t eb1, eb2, gb1, gb2;
+    int32_t i32, i32_;
+    int cacheupd = 0;
+    int v0, v1, v2;
+    int q0, q1, q2;
+    int d0, d1, d2;
+    int s0;
+    uint64_t tmp64u;
+    int64_t j64;
+    int64_t fixedaddress;
+    int unscaled;
+    MAYUSE(wb1);
+    MAYUSE(wb2);
+    MAYUSE(eb1);
+    MAYUSE(eb2);
+    MAYUSE(gb1);
+    MAYUSE(gb2);
+    MAYUSE(q0);
+    MAYUSE(q1);
+    MAYUSE(d0);
+    MAYUSE(d1);
+    MAYUSE(s0);
+    MAYUSE(j64);
+    MAYUSE(cacheupd);
+    #if STEP > 1
+    static const int8_t mask_shift8[] = { -7, -6, -5, -4, -3, -2, -1, 0 };
+    #endif
+
+    rex_t rex = vex.rex;
+
+    switch(opcode) {
+
+        case 0xC6:
+            INST_NAME("VSHUFPS Gx, Vx, Ex, Ib");
+            nextop = F8;
+            GETVX(v2, 0);       // v2: first source (Vx)
+            GETGX_empty(v0);    // v0: destination (Gx)
+            if(!MODREG) {
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
+                v1 = -1; // to avoid a warning
+            } else
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);    // v1: second source (Ex) when it is a register
+            u8 = F8;    // imm8: four 2-bit element selectors
+            if(v2==v1 && (u8&0x3)==((u8>>2)&3) && (u8&0xf)==((u8>>4)&0xf)) {   // same source, all four selectors equal: broadcast
+                VDUPQ_32(v0, v2, u8&3);
+            } else if(v2==v1 && (u8==0xe0)) {   // easy special case: same source, elements 0,0,2,3
+                VMOVQ(v0, v2);
+                VMOVeS(v0, 1, v0, 0);
+            } else if(v2==v1 && (u8==0xe5)) {   // easy special case: same source, elements 1,1,2,3 (both sources must match, as v0 is overwritten with v2)
+                VMOVQ(v0, v2);
+                VMOVeS(v0, 0, v0, 1);
+            } else {
+                d0 = fpu_get_scratch(dyn);  // built in a scratch so v0 may alias a source
+                // first two elements from Vx
+                for(int i=0; i<2; ++i) {
+                    VMOVeS(d0, i, v2, (u8>>(i*2))&3);
+                }
+                // second two from Ex
+                if(MODREG) {
+                    for(int i=2; i<4; ++i) {
+                        VMOVeS(d0, i, v1, (u8>>(i*2))&3);
+                    }
+                } else {
+                    SMREAD();
+                    for(int i=2; i<4; ++i) {
+                        ADDx_U12(x2, ed, ((u8>>(i*2))&3)*4);    // address of the selected 32-bit element
+                        VLD1_32(d0, i, x2);
+                    }
+                }
+                VMOVQ(v0, d0);
+            }
+            if(vex.l) {
+                DEFAULT;    /* TODO: 256-bit form */
+            } else YMM0(gd);
+            break;
+
+        default:
+            DEFAULT;
+    }
+    return addr;
+}
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index f62ade0d..7c4bac22 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -88,7 +88,7 @@ int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm)
 static void fpu_reset_reg_neoncache(neoncache_t* n)
 {
     n->fpu_reg = 0;
-    for (int i=0; i<24; ++i) {
+    for (int i=0; i<32; ++i) {
         n->fpuused[i]=0;
         n->neoncache[i].v = 0;
     }
@@ -456,7 +456,7 @@ void neoncacheUnwind(neoncache_t* cache)
         cache->ssecache[i*2+1].v = -1;
     }
     int x87reg = 0;
-    for(int i=0; i<24; ++i) {
+    for(int i=0; i<32; ++i) {
         if(cache->neoncache[i].v) {
             cache->fpuused[i] = 1;
             switch (cache->neoncache[i].t) {
@@ -467,6 +467,8 @@ void neoncacheUnwind(neoncache_t* cache)
                     break;
                 case NEON_CACHE_XMMR:
                 case NEON_CACHE_XMMW:
+                case NEON_CACHE_YMMR:
+                case NEON_CACHE_YMMW:
                     cache->ssecache[cache->neoncache[i].n].reg = i;
                     cache->ssecache[cache->neoncache[i].n].write = (cache->neoncache[i].t==NEON_CACHE_XMMW)?1:0;
                     ++cache->fpu_reg;
@@ -543,6 +545,8 @@ const char* getCacheName(int t, int n)
         case NEON_CACHE_MM: sprintf(buff, "MM%d", n); break;
         case NEON_CACHE_XMMW: sprintf(buff, "XMM%d", n); break;
         case NEON_CACHE_XMMR: sprintf(buff, "xmm%d", n); break;
+        case NEON_CACHE_YMMW: sprintf(buff, "YMM%d", n); break;
+        case NEON_CACHE_YMMR: sprintf(buff, "ymm%d", n); break;
         case NEON_CACHE_SCR: sprintf(buff, "Scratch"); break;
         case NEON_CACHE_NONE: buff[0]='\0'; break;
     }
@@ -580,7 +584,7 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
             dynarec_log(LOG_NONE, ", jmp=out");
         if(dyn->last_ip)
             dynarec_log(LOG_NONE, ", last_ip=%p", (void*)dyn->last_ip);
-        for(int ii=0; ii<24; ++ii) {
+        for(int ii=0; ii<32; ++ii) {
             switch(dyn->insts[ninst].n.neoncache[ii].t) {
                 case NEON_CACHE_ST_D: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
                 case NEON_CACHE_ST_F: dynarec_log(LOG_NONE, " S%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
@@ -588,11 +592,15 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
                 case NEON_CACHE_MM: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
                 case NEON_CACHE_XMMW: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
                 case NEON_CACHE_XMMR: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
-                case NEON_CACHE_SCR: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
+                case NEON_CACHE_YMMW: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
+                case NEON_CACHE_YMMR: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
+                //case NEON_CACHE_SCR: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
                 case NEON_CACHE_NONE:
                 default:    break;
             }
         }
+        if(dyn->ymm_zero)
+            dynarec_log(LOG_NONE, " ymm0_mask=%04x", dyn->ymm_zero);
         if(dyn->n.stack || dyn->insts[ninst].n.stack_next || dyn->insts[ninst].n.x87stack)
             dynarec_log(LOG_NONE, " X87:%d/%d(+%d/-%d)%d", dyn->n.stack, dyn->insts[ninst].n.stack_next, dyn->insts[ninst].n.stack_push, dyn->insts[ninst].n.stack_pop, dyn->insts[ninst].n.x87stack);
         if(dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2)
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 568483dd..5e406588 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -1673,6 +1673,18 @@ static void sse_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1)
                 dyn->n.ssecache[i].v = -1;
             }
         }
+    //AVX
+    if(dyn->ymm_zero) {
+        if (old==-1) {
+            MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":"");
+            ++old;
+        }
+        for(int i=0; i<16; ++i)
+            if(is_avx_zero(dyn, ninst, i))
+                STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+        if(!next)
+            avx_mark_zero_reset(dyn, ninst);
+    }
     if(old!=-1) {
         MESSAGE(LOG_DUMP, "\t------ Purge SSE Cache\n");
     }
@@ -1684,10 +1696,17 @@ static void sse_reflectcache(dynarec_arm_t* dyn, int ninst, int s1)
         if(dyn->n.ssecache[i].v!=-1 && dyn->n.ssecache[i].write) {
             VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
         }
+    //AVX
+    if(dyn->ymm_zero)
+        for(int i=0; i<16; ++i)
+            if(is_avx_zero(dyn, ninst, i))
+                STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i]));
 }
 
 void sse_reflect_reg(dynarec_arm_t* dyn, int ninst, int a)
 {
+    if(is_avx_zero(dyn, ninst, a))
+        STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[a]));
     if(dyn->n.ssecache[a].v==-1)
         return;
     if(dyn->n.neoncache[dyn->n.ssecache[a].reg].t == NEON_CACHE_XMMW) {
@@ -1708,10 +1727,13 @@ void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07)
     if(!n)
         return;
     MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n);
-    for (int i=start; i<16; ++i)
+    for (int i=start; i<16; ++i) {
         if((dyn->n.ssecache[i].v!=-1) && (dyn->n.ssecache[i].write)) {
             VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
         }
+        if(is_avx_zero(dyn, ninst, i))
+            STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+    }
     MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n);
 }
 
@@ -1778,6 +1800,13 @@ static int findCacheSlot(dynarec_arm_t* dyn, int ninst, int t, int n, neoncache_
                 case NEON_CACHE_XMMW:
                     if(t==NEON_CACHE_XMMR)
                         return i;
+                case NEON_CACHE_YMMR:
+                    if(t==NEON_CACHE_YMMW)
+                        return i;
+                    break;
+                case NEON_CACHE_YMMW:
+                    if(t==NEON_CACHE_YMMR)
+                        return i;
                     break;
             }
         }
@@ -1790,9 +1819,9 @@ static void swapCache(dynarec_arm_t* dyn, int ninst, int i, int j, neoncache_t *
     if (i==j)
         return;
     int quad = 0;
-    if(cache->neoncache[i].t==NEON_CACHE_XMMR || cache->neoncache[i].t==NEON_CACHE_XMMW)
+    if(cache->neoncache[i].t==NEON_CACHE_XMMR || cache->neoncache[i].t==NEON_CACHE_XMMW || cache->neoncache[i].t==NEON_CACHE_YMMR || cache->neoncache[i].t==NEON_CACHE_YMMW)
         quad =1;
-    if(cache->neoncache[j].t==NEON_CACHE_XMMR || cache->neoncache[j].t==NEON_CACHE_XMMW)
+    if(cache->neoncache[j].t==NEON_CACHE_XMMR || cache->neoncache[j].t==NEON_CACHE_XMMW || cache->neoncache[j].t==NEON_CACHE_YMMR || cache->neoncache[j].t==NEON_CACHE_YMMW)
         quad =1;
 
     if(!cache->neoncache[i].v) {
@@ -1821,7 +1850,6 @@ static void swapCache(dynarec_arm_t* dyn, int ninst, int i, int j, neoncache_t *
         VMOV(i, j);
         VMOV(j, SCRATCH);
     }
-    #undef SCRATCH
     tmp.v = cache->neoncache[i].v;
     cache->neoncache[i].v = cache->neoncache[j].v;
     cache->neoncache[j].v = tmp.v;
@@ -1852,6 +1880,11 @@ static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int
             MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
             VLDR128_U12(i, xEmu, offsetof(x64emu_t, xmm[n]));
             break;
+        case NEON_CACHE_YMMR:
+        case NEON_CACHE_YMMW:
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
+            VLDR128_U12(i, xEmu, offsetof(x64emu_t, ymm[n]));
+            break;
         case NEON_CACHE_MM:
             MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
             VLDR64_U12(i, xEmu, offsetof(x64emu_t, mmx[n]));
@@ -1900,12 +1933,17 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in
 {
     switch(t) {
         case NEON_CACHE_XMMR:
+        case NEON_CACHE_YMMR:
             MESSAGE(LOG_DUMP, "\t  - ignoring %s\n", getCacheName(t, n));
             break;
         case NEON_CACHE_XMMW:
             MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
             VSTR128_U12(i, xEmu, offsetof(x64emu_t, xmm[n]));
             break;
+        case NEON_CACHE_YMMW:
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
+            VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[n]));
+            break;
         case NEON_CACHE_MM:
             MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
             VSTR64_U12(i, xEmu, offsetof(x64emu_t, mmx[n]));
@@ -2047,11 +2085,18 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
                     cache.neoncache[i].t = NEON_CACHE_ST_D;
                 } else if(cache.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW)
                     { cache.neoncache[i].t = NEON_CACHE_XMMW; }
+                else if(cache.neoncache[i].t == NEON_CACHE_YMMR && cache_i2.neoncache[i].t == NEON_CACHE_YMMW)
+                    { cache.neoncache[i].t = NEON_CACHE_YMMW; }
                 else if(cache.neoncache[i].t == NEON_CACHE_XMMW && cache_i2.neoncache[i].t == NEON_CACHE_XMMR) {
                     // refresh cache...
                     MESSAGE(LOG_DUMP, "\t  - Refreh %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
                     VSTR128_U12(i, xEmu, offsetof(x64emu_t, xmm[cache.neoncache[i].n]));
                     cache.neoncache[i].t = NEON_CACHE_XMMR;
+                } else if(cache.neoncache[i].t == NEON_CACHE_YMMW && cache_i2.neoncache[i].t == NEON_CACHE_YMMR) {
+                    // refresh cache...
+                    MESSAGE(LOG_DUMP, "\t  - Refreh %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[cache.neoncache[i].n]));
+                    cache.neoncache[i].t = NEON_CACHE_YMMR;
                 }
             }
         }
@@ -2309,3 +2354,14 @@ void fpu_propagate_stack(dynarec_arm_t* dyn, int ninst)
     dyn->n.stack_push = 0;
     dyn->n.swapped = 0;
 }
+
+void avx_purge_ymm0(dynarec_arm_t* dyn, int ninst)
+{   // for each ymm flagged in insts[ninst].purge_ymm0 and still tracked in ymm_zero: emit a store of the xZR pair to emu->ymm[] and untrack it
+    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Purge YMM Zero mask=%04x --------\n", dyn->insts[ninst].purge_ymm0);
+    for(int reg=0; reg<16; ++reg) {
+        if(!(dyn->insts[ninst].purge_ymm0&(1<<reg)) || !is_avx_zero(dyn, ninst, reg)) continue;
+        STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[reg]));   // NOTE(review): assumes this offset fits the STP immediate range - confirm
+        avx_unmark_zero(dyn, ninst, reg);
+    }
+    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "---------- Purge YMM Zero\n");
+}
\ No newline at end of file
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 96c56be8..36950291 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -449,6 +449,14 @@
     gd = ((nextop&0x38)>>3)+(rex.r<<3); \
     a = sse_get_reg_empty(dyn, ninst, x1, gd)
 
+// Get VX (the register selected by vex.v) as a quad into a, through sse_get_reg (x1 is handed over as scratch, w is forwarded as last argument)
+#define GETVX(a, w)                     \
+    a = sse_get_reg(dyn, ninst, x1, vex.v, w)
+
+// Get an empty VX (the register selected by vex.v) into a, through sse_get_reg_empty (x1 is handed over as scratch)
+#define GETVX_empty(a)                  \
+    a = sse_get_reg_empty(dyn, ninst, x1, vex.v)
+
 // Get EX as a quad, (x1 is used)
 #define GETEX(a, w, D)                                                                                  \
     if(MODREG) {                                                                                        \
@@ -530,6 +538,7 @@
         SMWRITE2();                         \
     }
 
+#define YMM0(a) avx_mark_zero(dyn, ninst, a)    // flag ymm register a in dyn->ymm_zero; the caller supplies the terminating ';'
 
 // Get Direction with size Z and based of F_DF flag, on register r ready for LDR/STR fetching
 // F_DF is 1<<10, so 1 ROR 11*2 (so F_OF)
@@ -1016,6 +1025,8 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr);
 #define dynarec64_66F0     STEPNAME(dynarec64_66F0)
 #define dynarec64_F20F     STEPNAME(dynarec64_F20F)
 #define dynarec64_F30F     STEPNAME(dynarec64_F30F)
+#define dynarec64_AVX      STEPNAME(dynarec64_AVX)
+#define dynarec64_AVX_0F   STEPNAME(dynarec64_AVX_0F)
 
 #define geted           STEPNAME(geted)
 #define geted32         STEPNAME(geted32)
@@ -1162,6 +1173,7 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr);
 #define x87_purgecache  STEPNAME(x87_purgecache)
 #define fpu_reflectcache STEPNAME(fpu_reflectcache)
 #define fpu_unreflectcache STEPNAME(fpu_unreflectcache)
+#define avx_purge_ymm0  STEPNAME(avx_purge_ymm0)
 
 #define CacheTransform       STEPNAME(CacheTransform)
 
@@ -1322,6 +1334,8 @@ int x87_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
 void x87_restoreround(dynarec_arm_t* dyn, int ninst, int s1);
 // Set rounding according to mxcsr flags, return reg to restore flags
 int sse_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
+// purge ymm_zero mask according to purge_ymm0
+void avx_purge_ymm0(dynarec_arm_t* dyn, int ninst);
 
 void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3);
 
@@ -1420,6 +1434,8 @@ uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
 uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
+uintptr_t dynarec64_AVX(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
+uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 
 #if STEP < 2
 #define PASS2(A)
@@ -1577,5 +1593,6 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
         }                                       \
     }
 
+#define PURGE_YMM0()    avx_purge_ymm0(dyn, ninst)
 
 #endif //__DYNAREC_ARM64_HELPER_H__
diff --git a/src/dynarec/arm64/dynarec_arm64_pass0.h b/src/dynarec/arm64/dynarec_arm64_pass0.h
index 7d4c0c2d..6e9b8019 100644
--- a/src/dynarec/arm64/dynarec_arm64_pass0.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass0.h
@@ -26,6 +26,7 @@
         dyn->n.combined1 = dyn->n.combined2 = 0;\
         dyn->n.swapped = 0; dyn->n.barrier = 0; \
         dyn->insts[ninst].f_entry = dyn->f;     \
+        dyn->insts[ninst].ymm_zero = dyn->ymm_zero;\
         if(ninst) {dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr;}
 
 #define INST_EPILOG                             \
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index b26d522d..2788ddc4 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -16,7 +16,9 @@ typedef struct instsize_s instsize_t;
 #define NEON_CACHE_MM       4
 #define NEON_CACHE_XMMW     5
 #define NEON_CACHE_XMMR     6
-#define NEON_CACHE_SCR      7
+#define NEON_CACHE_YMMW     7
+#define NEON_CACHE_YMMR     8
+#define NEON_CACHE_SCR      9
 typedef union neon_cache_s {
     int8_t           v;
     struct {
@@ -33,7 +35,7 @@ typedef union sse_cache_s {
 } sse_cache_t;
 typedef struct neoncache_s {
     // Neon cache
-    neon_cache_t        neoncache[24];
+    neon_cache_t        neoncache[32];
     int8_t              stack;
     int8_t              stack_next;
     int8_t              stack_pop;
@@ -51,7 +53,7 @@ typedef struct neoncache_s {
     int16_t             tags;           // similar to fpu_tags
     int8_t              mmxcache[8];    // cache status for the 8 MMX registers
     sse_cache_t         ssecache[16];   // cache status for the 16 SSE(2) registers
-    int8_t              fpuused[24];    // all 0..24 double reg from fpu, used by x87, sse and mmx
+    int8_t              fpuused[32];    // all neon regs, used by x87, mmx, sse and avx
     int8_t              x87stack;       // cache stack counter
     int8_t              mmxcount;       // number of mmx register used (not both mmx and x87 at the same time)
     int8_t              fpu_scratch;    // scratch counter
@@ -78,7 +80,9 @@ typedef struct instruction_arm64_s {
     uintptr_t           marklock;
     int                 pass2choice;// value for choices that are fixed on pass2 for pass3
     uintptr_t           natcall;
-    int                 retn;
+    uint16_t            retn;
+    uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
+    uint16_t            purge_ymm0; // need to purge some ymm0 because of a loop
     uint8_t             barrier_maybe;
     uint8_t             will_write;
     uint8_t             last_write;
@@ -118,6 +122,7 @@ typedef struct dynarec_arm_s {
     uintptr_t           forward_to; // address of the next jump to (to check if everything is ok)
     int32_t             forward_size;   // size at the forward point
     int                 forward_ninst;  // ninst at the forward point
+    uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
     uint8_t             smwrite;    // for strongmem model emulation
     uint8_t             smread;
     uint8_t             doublepush;
diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index 7ac19234..4f509ac1 100644
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -553,9 +553,15 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
                     k=i2;
             }*/
             if(k!=-1) {
-                if(k!=-1 && !helper.insts[i].barrier_maybe)
+                if(!helper.insts[i].barrier_maybe)
                     helper.insts[k].x64.barrier |= BARRIER_FULL;
                 helper.insts[i].x64.jmp_insts = k;
+                if(helper.insts[i].ymm_zero || helper.insts[k].ymm_zero) {
+                    // move to purge the regs that are present in k (jump to) but not in i (jump from)
+                    uint16_t to_purge = helper.insts[k].ymm_zero & ~helper.insts[i].ymm_zero;
+                    helper.insts[k].purge_ymm0 |= to_purge;
+                    helper.insts[k].ymm_zero &= ~to_purge;
+                }
             }
         }
     }
diff --git a/src/dynarec/dynarec_native_functions.c b/src/dynarec/dynarec_native_functions.c
index 1cb2834d..10d6f333 100644
--- a/src/dynarec/dynarec_native_functions.c
+++ b/src/dynarec/dynarec_native_functions.c
@@ -507,7 +507,7 @@ uint8_t geted_ib(dynarec_native_t* dyn, uintptr_t addr, int ninst, uint8_t nexto
 }
 #undef F8
 
-int isNativeCall(dynarec_native_t* dyn, uintptr_t addr, uintptr_t* calladdress, int* retn)
+int isNativeCall(dynarec_native_t* dyn, uintptr_t addr, uintptr_t* calladdress, uint16_t* retn)
 {
     (void)dyn;
 
@@ -533,3 +533,31 @@ int isNativeCall(dynarec_native_t* dyn, uintptr_t addr, uintptr_t* calladdress,
 #undef PK32
 #undef PK
 }
+
+// AVX
+void avx_mark_zero(dynarec_native_t* dyn, int ninst, int reg)
+{   // set the bit of reg in the dyn->ymm_zero mask (ninst is unused)
+    dyn->ymm_zero = dyn->ymm_zero | (1<<reg);
+}
+
+int is_avx_zero(dynarec_native_t* dyn, int ninst, int reg)
+{   // 1 when the bit of reg is set in the dyn->ymm_zero mask, 0 otherwise (ninst is unused)
+    return (dyn->ymm_zero&(1<<reg))?1:0;
+}
+int is_avx_zero_unset(dynarec_native_t* dyn, int ninst, int reg)
+{
+    // test-and-clear: report whether the bit of reg was set in dyn->ymm_zero, and clear it if so (ninst is unused)
+    const int was_set = (dyn->ymm_zero>>reg)&1;
+    if(was_set)
+        dyn->ymm_zero &= ~(1<<reg);
+    return was_set;
+}
+void avx_mark_zero_reset(dynarec_native_t* dyn, int ninst)
+{   // forget every tracked ymm: the whole dyn->ymm_zero mask is cleared (ninst is unused)
+    dyn->ymm_zero = 0;
+}
+
+void avx_unmark_zero(dynarec_native_t* dyn, int ninst, int reg)
+{   // clear the bit of reg in the dyn->ymm_zero mask (ninst is unused)
+    dyn->ymm_zero = dyn->ymm_zero & ~(1<<reg);
+}
diff --git a/src/dynarec/dynarec_native_functions.h b/src/dynarec/dynarec_native_functions.h
index 533dfeeb..3e81081b 100644
--- a/src/dynarec/dynarec_native_functions.h
+++ b/src/dynarec/dynarec_native_functions.h
@@ -65,7 +65,14 @@ uintptr_t fakeed(dynarec_native_t* dyn, uintptr_t addr, int ninst, uint8_t nexto
 uint8_t geted_ib(dynarec_native_t* dyn, uintptr_t addr, int ninst, uint8_t nextop);
 
 // Is what pointed at addr a native call? And if yes, to what function?
-int isNativeCall(dynarec_native_t* dyn, uintptr_t addr, uintptr_t* calladdress, int* retn);
+int isNativeCall(dynarec_native_t* dyn, uintptr_t addr, uintptr_t* calladdress, uint16_t* retn);
+
+// AVX utilities: bookkeeping of dyn->ymm_zero, a 16-bit mask with one bit per ymm register (ninst is currently unused by all of them)
+void avx_mark_zero(dynarec_native_t* dyn, int ninst, int reg);      // set the bit of reg
+int is_avx_zero(dynarec_native_t* dyn, int ninst, int reg);         // 1 if the bit of reg is set, else 0
+int is_avx_zero_unset(dynarec_native_t* dyn, int ninst, int reg);   // same test as is_avx_zero, but also clears the bit when it was set
+void avx_mark_zero_reset(dynarec_native_t* dyn, int ninst);         // clear the whole mask
+void avx_unmark_zero(dynarec_native_t* dyn, int ninst, int reg);    // clear the bit of reg
 
 ADDITIONNAL_DEFINITION()
 
diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index 2772a973..cab03222 100644
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -80,6 +80,8 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
         }
         #endif
         fpu_propagate_stack(dyn, ninst);
+        if(dyn->insts[ninst].purge_ymm0)
+            PURGE_YMM0();
         ip = addr;
         if (reset_n!=-1) {
             dyn->last_ip = 0;
diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h
index fbc91e34..def767b4 100644
--- a/src/dynarec/la64/dynarec_la64_helper.h
+++ b/src/dynarec/la64/dynarec_la64_helper.h
@@ -1075,4 +1075,6 @@ uintptr_t dynarec64_F20F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
         }                                  \
     } while (0)
 
+#define PURGE_YMM0()    /* TODO */
+
 #endif //__DYNAREC_LA64_HELPER_H__
\ No newline at end of file
diff --git a/src/dynarec/la64/dynarec_la64_pass0.h b/src/dynarec/la64/dynarec_la64_pass0.h
index 99a897a6..3990caa4 100644
--- a/src/dynarec/la64/dynarec_la64_pass0.h
+++ b/src/dynarec/la64/dynarec_la64_pass0.h
@@ -30,6 +30,7 @@
     dyn->lsx.combined1 = dyn->lsx.combined2 = 0; \
     dyn->lsx.swapped = 0;                        \
     dyn->lsx.barrier = 0;                        \
+    dyn->insts[ninst].ymm_zero = dyn->ymm_zero;  \
     dyn->insts[ninst].f_entry = dyn->f;          \
     if (ninst) { dyn->insts[ninst - 1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst - 1].x64.addr; }
 #define INST_EPILOG                    \
diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h
index 7b76a75f..b31d3f2e 100644
--- a/src/dynarec/la64/dynarec_la64_private.h
+++ b/src/dynarec/la64/dynarec_la64_private.h
@@ -79,7 +79,9 @@ typedef struct instruction_la64_s {
     uintptr_t           marklock;
     int                 pass2choice;// value for choices that are fixed on pass2 for pass3
     uintptr_t           natcall;
-    int                 retn;
+    uint16_t            retn;
+    uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
+    uint16_t            purge_ymm0; // need to purge some ymm0 because of a loop
     uint8_t             barrier_maybe;
     uint8_t             will_write;
     uint8_t             last_write;
@@ -119,6 +121,7 @@ typedef struct dynarec_la64_s {
     uintptr_t            forward_to; // address of the next jump to (to check if everything is ok)
     int32_t              forward_size;   // size at the forward point
     int                  forward_ninst;  // ninst at the forward point
+    uint16_t             ymm_zero;   // bitmap of ymm to zero at purge
     uint8_t              smread;    // for strongmem model emulation
     uint8_t              smwrite;    // for strongmem model emulation
     uint8_t              always_test;
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 6a374499..fd680474 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -1671,4 +1671,6 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
     BLT(reg, s, 4 + 4);           \
     ADDIW(reg, s, -1);
 
+#define PURGE_YMM0()    /* TODO */
+
 #endif //__DYNAREC_RV64_HELPER_H__
diff --git a/src/dynarec/rv64/dynarec_rv64_pass0.h b/src/dynarec/rv64/dynarec_rv64_pass0.h
index 3ee1685f..9c2de9ee 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass0.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass0.h
@@ -28,6 +28,7 @@
         dyn->e.swapped = 0; dyn->e.barrier = 0; \
         for(int i=0; i<16; ++i) dyn->e.olds[i].v = 0;\
         dyn->insts[ninst].f_entry = dyn->f;     \
+        dyn->insts[ninst].ymm_zero = dyn->ymm_zero;\
         if(ninst) {dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr;}
 
 #define INST_EPILOG                             \
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index 3acbdfb6..dff6f84e 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -89,7 +89,9 @@ typedef struct instruction_rv64_s {
     uintptr_t           marklock;
     int                 pass2choice;// value for choices that are fixed on pass2 for pass3
     uintptr_t           natcall;
-    int                 retn;
+    uint16_t            retn;
+    uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
+    uint16_t            purge_ymm0; // need to purge some ymm0 because of a loop
     int                 barrier_maybe;
     flagcache_t         f_exit;     // flags status at end of intruction
     extcache_t          e;          // extcache at end of intruction (but before poping)
@@ -129,6 +131,7 @@ typedef struct dynarec_rv64_s {
     uintptr_t           forward_to; // address of the next jump to (to check if everything is ok)
     int32_t             forward_size;   // size at the forward point
     int                 forward_ninst;  // ninst at the forward point
+    uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
     uint8_t             always_test;
     uint8_t             abort;
 } dynarec_rv64_t;