about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
authorYang Liu <liuyang22@iscas.ac.cn>2024-10-26 23:50:24 +0800
committerGitHub <noreply@github.com>2024-10-26 17:50:24 +0200
commit5563103453e96b6894a8cddc7cbbe7dba234a983 (patch)
treebb845edb96c07bd04ef9ee05e3cd727706901ae9 /src
parenta1d62eb92bd8139e7e8ce61ee18743492d429974 (diff)
downloadbox64-5563103453e96b6894a8cddc7cbbe7dba234a983.tar.gz
box64-5563103453e96b6894a8cddc7cbbe7dba234a983.zip
[RV64_DYNAREC] Implemented the first AVX128 opcode for scalar only (#1962)
Diffstat (limited to 'src')
-rw-r--r--src/dynarec/rv64/dynarec_rv64_00_3.c40
-rw-r--r--src/dynarec/rv64/dynarec_rv64_avx.c63
-rw-r--r--src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c96
-rw-r--r--src/dynarec/rv64/dynarec_rv64_f30f.c2
-rw-r--r--src/dynarec/rv64/dynarec_rv64_functions.c19
-rw-r--r--src/dynarec/rv64/dynarec_rv64_helper.c129
-rw-r--r--src/dynarec/rv64/dynarec_rv64_helper.h25
-rw-r--r--src/dynarec/rv64/dynarec_rv64_private.h6
8 files changed, 368 insertions, 12 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_00_3.c b/src/dynarec/rv64/dynarec_rv64_00_3.c
index 1b76badc..cd2914cd 100644
--- a/src/dynarec/rv64/dynarec_rv64_00_3.c
+++ b/src/dynarec/rv64/dynarec_rv64_00_3.c
@@ -300,7 +300,45 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             *need_epilog = 0;
             *ok = 0;
             break;
-
+        case 0xC4:
+            nextop = F8;
+            if (rex.is32bits && !(MODREG)) {
+                DEFAULT;
+            } else {
+                vex_t vex = { 0 };
+                vex.rex = rex;
+                u8 = nextop;
+                vex.m = u8 & 0b00011111;
+                vex.rex.b = (u8 & 0b00100000) ? 0 : 1;
+                vex.rex.x = (u8 & 0b01000000) ? 0 : 1;
+                vex.rex.r = (u8 & 0b10000000) ? 0 : 1;
+                u8 = F8;
+                vex.p = u8 & 0b00000011;
+                vex.l = (u8 >> 2) & 1;
+                vex.v = ((~u8) >> 3) & 0b1111;
+                vex.rex.w = (u8 >> 7) & 1;
+                addr = dynarec64_AVX(dyn, addr, ip, ninst, vex, ok, need_epilog);
+            }
+            break;
+        case 0xC5:
+            nextop = F8;
+            if (rex.is32bits && !(MODREG)) {
+                DEFAULT;
+            } else {
+                vex_t vex = { 0 };
+                vex.rex = rex;
+                u8 = nextop;
+                vex.p = u8 & 0b00000011;
+                vex.l = (u8 >> 2) & 1;
+                vex.v = ((~u8) >> 3) & 0b1111;
+                vex.rex.r = (u8 & 0b10000000) ? 0 : 1;
+                vex.rex.b = 0;
+                vex.rex.x = 0;
+                vex.rex.w = 0;
+                vex.m = VEX_M_0F;
+                addr = dynarec64_AVX(dyn, addr, ip, ninst, vex, ok, need_epilog);
+            }
+            break;
         case 0xC6:
             INST_NAME("MOV Eb, Ib");
             nextop=F8;
diff --git a/src/dynarec/rv64/dynarec_rv64_avx.c b/src/dynarec/rv64/dynarec_rv64_avx.c
new file mode 100644
index 00000000..c9d80f2a
--- /dev/null
+++ b/src/dynarec/rv64/dynarec_rv64_avx.c
@@ -0,0 +1,63 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+
+#include "rv64_printer.h"
+#include "dynarec_rv64_private.h"
+#include "dynarec_rv64_functions.h"
+#include "dynarec_rv64_helper.h"
+
+static const char* avx_prefix_string(uint16_t p)
+{
+    switch (p) {
+        case VEX_P_NONE: return "0";
+        case VEX_P_66: return "66";
+        case VEX_P_F2: return "F2";
+        case VEX_P_F3: return "F3";
+        default: return "??";
+    }
+}
+static const char* avx_map_string(uint16_t m)
+{
+    switch (m) {
+        case VEX_M_NONE: return "0";
+        case VEX_M_0F: return "0F";
+        case VEX_M_0F38: return "0F38";
+        case VEX_M_0F3A: return "0F3A";
+        default: return "??";
+    }
+}
+
+uintptr_t dynarec64_AVX(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)need_epilog;
+
+    uint8_t opcode = PK(0);
+    rex_t rex = vex.rex;
+
+    if ((vex.m == VEX_M_0F) && (vex.p == VEX_P_F3))
+        addr = dynarec64_AVX_F3_0F(dyn, addr, ip, ninst, vex, ok, need_epilog);
+    else {
+        DEFAULT;
+    }
+
+    if ((*ok == -1) && (box64_dynarec_log >= LOG_INFO || box64_dynarec_dump || box64_dynarec_missing == 1)) {
+        dynarec_log(LOG_NONE, "Dynarec unimplemented AVX opcode size %d prefix %s map %s opcode %02X ", 128 << vex.l, avx_prefix_string(vex.p), avx_map_string(vex.m), opcode);
+    }
+    return addr;
+}
diff --git a/src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c b/src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c
new file mode 100644
index 00000000..443ef949
--- /dev/null
+++ b/src/dynarec/rv64/dynarec_rv64_avx_f3_0f.c
@@ -0,0 +1,96 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+#include "my_cpuid.h"
+#include "emu/x87emu_private.h"
+#include "emu/x64shaext.h"
+
+#include "rv64_printer.h"
+#include "dynarec_rv64_private.h"
+#include "dynarec_rv64_functions.h"
+#include "dynarec_rv64_helper.h"
+
+uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)need_epilog;
+
+    uint8_t opcode = F8;
+    uint8_t nextop, u8;
+    uint8_t gd, ed, vd;
+    uint8_t wback, wb1, wb2, gback, vback;
+    uint8_t eb1, eb2, gb1, gb2;
+    int32_t i32, i32_;
+    int cacheupd = 0;
+    int v0, v1, v2;
+    int q0, q1, q2;
+    int d0, d1, d2;
+    int s0;
+    uint64_t tmp64u, u64;
+    int64_t j64;
+    int64_t fixedaddress, gdoffset, vxoffset;
+    int unscaled;
+
+    rex_t rex = vex.rex;
+
+    switch (opcode) {
+        case 0x10:
+            INST_NAME("VMOVSS Gx, [Vx,] Ex");
+            nextop = F8;
+            GETG;
+            if (MODREG) {
+                if (gd == vex.v) {
+                    v0 = sse_get_reg(dyn, ninst, x1, gd, 1);
+                    q0 = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 1);
+                    FMVS(v0, q0);
+                } else {
+                    GETGX();
+                    GETVX();
+                    GETEX(x2, 0, 1);
+                    if (rv64_xtheadmempair) {
+                        ADD(x1, vback, vxoffset);
+                        TH_LDD(x3, x4, x1, 0);
+                    } else {
+                        LD(x3, vback, vxoffset);
+                        LD(x4, vback, vxoffset + 8);
+                    }
+                    LWU(x5, wback, fixedaddress);
+                    if (rv64_xtheadmempair) {
+                        ADDI(x1, gback, gdoffset);
+                        TH_SDD(x3, x4, x1, 0);
+                    } else {
+                        SD(x3, gback, gdoffset);
+                        SD(x4, gback, gdoffset + 8);
+                    }
+                    SW(x5, gback, gdoffset);
+                }
+            } else {
+                v0 = sse_get_reg_empty(dyn, ninst, x1, gd, 1);
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, 0);
+                FLW(v0, ed, fixedaddress);
+                // reset upper part
+                SW(xZR, xEmu, offsetof(x64emu_t, xmm[gd]) + 4);
+                SD(xZR, xEmu, offsetof(x64emu_t, xmm[gd]) + 8);
+            }
+            YMM0(gd);
+            break;
+        default:
+            DEFAULT;
+    }
+    return addr;
+}
diff --git a/src/dynarec/rv64/dynarec_rv64_f30f.c b/src/dynarec/rv64/dynarec_rv64_f30f.c
index 29a65c33..97fa1935 100644
--- a/src/dynarec/rv64/dynarec_rv64_f30f.c
+++ b/src/dynarec/rv64/dynarec_rv64_f30f.c
@@ -59,7 +59,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             } else {
                 v0 = sse_get_reg_empty(dyn, ninst, x1, gd, 1);
                 SMREAD();
-                addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 8, 0);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, 0);
                 FLW(v0, ed, fixedaddress);
                 // reset upper part
                 SW(xZR, xEmu, offsetof(x64emu_t, xmm[gd]) + 4);
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c
index 619041be..234f3b6d 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.c
+++ b/src/dynarec/rv64/dynarec_rv64_functions.c
@@ -92,11 +92,11 @@ int fpu_get_reg_xmm(dynarec_rv64_t* dyn, int t, int xmm)
     return EXTREG(i);
 }
 // Reset fpu regs counter
-void fpu_reset_reg_extcache(dynarec_rv64_t* dyn, extcache_t* e)
+static void fpu_reset_reg_extcache(dynarec_rv64_t* dyn, extcache_t* e)
 {
     e->fpu_reg = 0;
-    for (int i=0; i<24; ++i) {
-        e->fpuused[i]=0;
+    for (int i = 0; i < 32; ++i) {
+        e->fpuused[i] = 0;
         e->extcache[i].v = 0;
     }
     dyn->vector_sew = VECTOR_SEWNA;
@@ -492,7 +492,7 @@ void extcacheUnwind(extcache_t* cache)
         cache->ssecache[i*2+1].v = -1;
     }
     int x87reg = 0;
-    for(int i=0; i<24; ++i) {
+    for (int i = 0; i < 32; ++i) {
         if(cache->extcache[i].v) {
             cache->fpuused[i] = 1;
             switch (cache->extcache[i].t) {
@@ -515,6 +515,8 @@ void extcacheUnwind(extcache_t* cache)
                     break;
                 case EXT_CACHE_XMMR:
                 case EXT_CACHE_XMMW:
+                case EXT_CACHE_YMMR:
+                case EXT_CACHE_YMMW:
                     cache->ssecache[cache->extcache[i].n].reg = EXTREG(i);
                     cache->ssecache[cache->extcache[i].n].vector = 1;
                     cache->ssecache[cache->extcache[i].n].write = (cache->extcache[i].t == EXT_CACHE_XMMW) ? 1 : 0;
@@ -605,6 +607,8 @@ const char* getCacheName(int t, int n)
         case EXT_CACHE_SCR: sprintf(buff, "Scratch"); break;
         case EXT_CACHE_XMMW: sprintf(buff, "XMM%d", n); break;
         case EXT_CACHE_XMMR: sprintf(buff, "xmm%d", n); break;
+        case EXT_CACHE_YMMW: sprintf(buff, "YMM%d", n); break;
+        case EXT_CACHE_YMMR: sprintf(buff, "ymm%d", n); break;
         case EXT_CACHE_NONE: buff[0]='\0'; break;
     }
     return buff;
@@ -654,7 +658,7 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
             dynarec_log(LOG_NONE, ", jmp=out");
         if(dyn->last_ip)
             dynarec_log(LOG_NONE, ", last_ip=%p", (void*)dyn->last_ip);
-        for(int ii=0; ii<24; ++ii) {
+        for (int ii = 0; ii < 32; ++ii) {
             switch(dyn->insts[ninst].e.extcache[ii].t) {
                 case EXT_CACHE_ST_D: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_ST_F: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
@@ -664,11 +668,15 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
                 case EXT_CACHE_SD: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_XMMR: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_XMMW: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
+                case EXT_CACHE_YMMW: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
+                case EXT_CACHE_YMMR: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_SCR: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_NONE:
                 default:    break;
             }
         }
+        if (dyn->ymm_zero)
+            dynarec_log(LOG_NONE, " ymm0_mask = %04x", dyn->ymm_zero);
         if(dyn->e.stack || dyn->insts[ninst].e.stack_next || dyn->insts[ninst].e.x87stack)
             dynarec_log(LOG_NONE, " X87:%d/%d(+%d/-%d)%d", dyn->e.stack, dyn->insts[ninst].e.stack_next, dyn->insts[ninst].e.stack_push, dyn->insts[ninst].e.stack_pop, dyn->insts[ninst].e.x87stack);
         if(dyn->insts[ninst].e.combined1 || dyn->insts[ninst].e.combined2)
@@ -733,6 +741,7 @@ void fpu_reset(dynarec_rv64_t* dyn)
     mmx_reset(&dyn->e);
     sse_reset(&dyn->e);
     fpu_reset_reg(dyn);
+    dyn->ymm_zero = 0;
 }
 
 void fpu_reset_ninst(dynarec_rv64_t* dyn, int ninst)
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c
index be4298a8..9b2d69ee 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.c
+++ b/src/dynarec/rv64/dynarec_rv64_helper.c
@@ -1897,6 +1897,26 @@ static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1)
                 dyn->e.ssecache[i].v = -1;
             }
         }
+
+    // AVX
+    if (dyn->ymm_zero) {
+        if (old == -1) {
+            MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next ? "locally " : "");
+            ++old;
+        }
+        for (int i = 0; i < 16; ++i)
+            if (is_avx_zero(dyn, ninst, i)) {
+                if (rv64_xtheadmempair) {
+                    ADDI(s1, xEmu, offsetof(x64emu_t, ymm[i]));
+                    TH_SDD(xZR, xZR, s1, 0);
+                } else {
+                    SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+                    SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]) + 8);
+                }
+            }
+        if (!next)
+            avx_mark_zero_reset(dyn, ninst);
+    }
     if(old!=-1) {
         MESSAGE(LOG_DUMP, "\t------ Purge SSE Cache\n");
     }
@@ -1915,10 +1935,32 @@ static void sse_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1)
             else
                 FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
         }
+
+    // AVX
+    if (dyn->ymm_zero)
+        for (int i = 0; i < 16; ++i)
+            if (is_avx_zero(dyn, ninst, i)) {
+                if (rv64_xtheadmempair) {
+                    ADDI(s1, xEmu, offsetof(x64emu_t, ymm[i]));
+                    TH_SDD(xZR, xZR, s1, 0);
+                } else {
+                    SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+                    SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]) + 8);
+                }
+            }
 }
 
 void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a)
 {
+    if (is_avx_zero(dyn, ninst, a)) {
+        if (rv64_xtheadmempair) {
+            ADDI(s1, xEmu, offsetof(x64emu_t, ymm[a]));
+            TH_SDD(xZR, xZR, s1, 0);
+        } else {
+            SD(xZR, xEmu, offsetof(x64emu_t, ymm[a]));
+            SD(xZR, xEmu, offsetof(x64emu_t, ymm[a]) + 8);
+        }
+    }
     if (dyn->e.ssecache[a].v == -1)
         return;
     if (dyn->e.ssecache[a].vector) {
@@ -1931,6 +1973,14 @@ void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a)
         FSD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
 }
 
+void ymm_mark_zero(dynarec_rv64_t* dyn, int ninst, int a)
+{
+#if STEP == 0
+    dyn->insts[ninst].ymm0_add |= (1 << a);
+#endif
+    avx_mark_zero(dyn, ninst, a);
+}
+
 void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
 {
     // for float registers, we might lost f0..f7, f10..f17 and f28..f31, that means
@@ -1949,6 +1999,15 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
                     FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
                 else
                     FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
+                if (is_avx_zero(dyn, ninst, i)) {
+                    if (rv64_xtheadmempair) {
+                        ADDI(s1, xEmu, offsetof(x64emu_t, ymm[i]));
+                        TH_SDD(xZR, xZR, s1, 0);
+                    } else {
+                        SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+                        SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]) + 8);
+                    }
+                }
             }
         MESSAGE(LOG_DUMP, "\t------- Push (float) XMM Cache (%d)\n", n);
     }
@@ -1990,6 +2049,15 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
                     ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
                     VSE_V(dyn->e.ssecache[i].reg, s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1);
                 }
+                if (is_avx_zero(dyn, ninst, i)) {
+                    if (rv64_xtheadmempair) {
+                        ADDI(s1, xEmu, offsetof(x64emu_t, ymm[i]));
+                        TH_SDD(xZR, xZR, s1, 0);
+                    } else {
+                        SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+                        SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]) + 8);
+                    }
+                }
             }
         MESSAGE(LOG_DUMP, "\t------- Push (vector) XMM Cache (%d)\n", n);
     }
@@ -2100,6 +2168,13 @@ static int findCacheSlot(dynarec_rv64_t* dyn, int ninst, int t, int n, extcache_
                 case EXT_CACHE_XMMW:
                     if (t == EXT_CACHE_XMMR)
                         return i;
+                case EXT_CACHE_YMMR:
+                    if (t == EXT_CACHE_YMMW)
+                        return i;
+                    break;
+                case EXT_CACHE_YMMW:
+                    if (t == EXT_CACHE_YMMR)
+                        return i;
                     break;
             }
         }
@@ -2111,7 +2186,10 @@ static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t *
 {
     if (i == j) return;
 
-    if (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW || cache->extcache[j].t == EXT_CACHE_XMMR || cache->extcache[j].t == EXT_CACHE_XMMW) {
+    if (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW
+        || cache->extcache[j].t == EXT_CACHE_XMMR || cache->extcache[j].t == EXT_CACHE_XMMW
+        || cache->extcache[i].t == EXT_CACHE_YMMR || cache->extcache[i].t == EXT_CACHE_YMMW
+        || cache->extcache[j].t == EXT_CACHE_YMMR || cache->extcache[j].t == EXT_CACHE_YMMW) {
         int reg_i = EXTREG(i);
         int reg_j = EXTREG(j);
         if (!cache->extcache[i].v) {
@@ -2178,7 +2256,7 @@ static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t *
 static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, extcache_t* cache, int i, int t, int n)
 {
     int reg = EXTREG(i);
-    if (cache->extcache[i].v && (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW)) {
+    if (cache->extcache[i].v && (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW || cache->extcache[i].t == EXT_CACHE_YMMR || cache->extcache[i].t == EXT_CACHE_YMMW)) {
         int j = i + 1;
         while (cache->extcache[j].v) ++j;
         MESSAGE(LOG_DUMP, "\t  - Moving away %d\n", i);
@@ -2208,6 +2286,13 @@ static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int
             ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n]));
             VLE_V(reg, s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1);
             break;
+        case EXT_CACHE_YMMR:
+        case EXT_CACHE_YMMW:
+            MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
+            SET_ELEMENT_WIDTH(s1, VECTOR_SEWANY, 0);
+            ADDI(s1, xEmu, offsetof(x64emu_t, ymm[n]));
+            VLE_V(reg, s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            break;
         case EXT_CACHE_SS:
             MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
             FLW(reg, xEmu, offsetof(x64emu_t, xmm[n]));
@@ -2260,6 +2345,7 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i
     int reg = EXTREG(i);
     switch(t) {
         case EXT_CACHE_XMMR:
+        case EXT_CACHE_YMMR:
             MESSAGE(LOG_DUMP, "\t  - ignoring %s\n", getCacheName(t, n));
             break;
         case EXT_CACHE_XMMW:
@@ -2268,6 +2354,12 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i
             ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n]));
             VSE_V(reg, s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1);
             break;
+        case EXT_CACHE_YMMW:
+            MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
+            SET_ELEMENT_WIDTH(s1, VECTOR_SEWANY, 0);
+            ADDI(s1, xEmu, offsetof(x64emu_t, ymm[n]));
+            VSE_V(reg, s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            break;
         case EXT_CACHE_SS:
             MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
             FSW(reg, xEmu, offsetof(x64emu_t, xmm[n]));
@@ -2421,6 +2513,8 @@ static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, in
                     cache.extcache[i].t = EXT_CACHE_ST_D;
                 } else if (cache.extcache[i].t == EXT_CACHE_XMMR && cache_i2.extcache[i].t == EXT_CACHE_XMMW) {
                     cache.extcache[i].t = EXT_CACHE_XMMW;
+                } else if (cache.extcache[i].t == EXT_CACHE_YMMR && cache_i2.extcache[i].t == EXT_CACHE_YMMW) {
+                    cache.extcache[i].t = EXT_CACHE_YMMW;
                 } else if (cache.extcache[i].t == EXT_CACHE_XMMW && cache_i2.extcache[i].t == EXT_CACHE_XMMR) {
                     // refresh cache...
                     MESSAGE(LOG_DUMP, "\t  - Refreh %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n));
@@ -2428,6 +2522,13 @@ static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, in
                     ADDI(s1, xEmu, offsetof(x64emu_t, xmm[cache.extcache[i].n]));
                     VSE_V(EXTREG(i), s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1);
                     cache.extcache[i].t = EXT_CACHE_XMMR;
+                } else if (cache.extcache[i].t == EXT_CACHE_YMMW && cache_i2.extcache[i].t == EXT_CACHE_YMMR) {
+                    // refresh cache...
+                    MESSAGE(LOG_DUMP, "\t  - Refreh %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n));
+                    SET_ELEMENT_WIDTH(s1, VECTOR_SEWANY, 0);
+                    ADDI(s1, xEmu, offsetof(x64emu_t, ymm[cache.extcache[i].n]));
+                    VSE_V(EXTREG(i), s1, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1);
+                    cache.extcache[i].t = EXT_CACHE_YMMR;
                 }
             }
         }
@@ -2835,3 +2936,27 @@ void vector_loadmask(dynarec_rv64_t* dyn, int ninst, int vreg, uint64_t imm, int
     }
 #endif
 }
+
+
+void avx_purge_ymm(dynarec_rv64_t* dyn, int ninst, uint16_t mask, int s1)
+{
+    int do_something = 0;
+    for (int i = 0; i < 16; ++i)
+        if (mask & (1 << i)) {
+            if (is_avx_zero_unset(dyn, ninst, i)) {
+                if (!do_something) {
+                    MESSAGE(LOG_NONE, "Purge YMM mask=%04x --------\n", mask);
+                    do_something = 1;
+                }
+                if (rv64_xtheadmempair) {
+                    ADDI(s1, xEmu, offsetof(x64emu_t, ymm[i]));
+                    TH_SDD(xZR, xZR, s1, 0);
+                } else {
+                    SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+                    SD(xZR, xEmu, offsetof(x64emu_t, ymm[i]) + 8);
+                }
+            }
+        }
+    if (do_something)
+        MESSAGE(LOG_NONE, "---------- Purge YMM\n");
+}
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index d2d84e90..d69addee 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -420,6 +420,9 @@
         OR(wback, wback, ed);             \
     }
 
+
+#define YMM0(a) ymm_mark_zero(dyn, ninst, a);
+
 // Get direction with size Z and based of F_DF flag, on register r ready for load/store fetching
 // using s as scratch.
 #define GETDIR(r, s, Z)            \
@@ -480,6 +483,11 @@
     gback = xEmu;                               \
     gdoffset = offsetof(x64emu_t, xmm[gd])
 
+#define GETVX()                            \
+    sse_forget_reg(dyn, ninst, x3, vex.v); \
+    vback = xEmu;                          \
+    vxoffset = offsetof(x64emu_t, xmm[vex.v])
+
 // Get Ex address in general register a, will purge SS or SD if it's reg and is loaded. May use x3. Use wback as load address!
 #define GETEX(a, D, I12)                                                                         \
     if (MODREG) {                                                                                \
@@ -1148,6 +1156,9 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define dynarec64_F20F_vector   STEPNAME(dynarec64_F20F_vector)
 #define dynarec64_F30F_vector   STEPNAME(dynarec64_F30F_vector)
 
+#define dynarec64_AVX       STEPNAME(dynarec64_AVX)
+#define dynarec64_AVX_F3_0F STEPNAME(dynarec64_AVX_F3_0F)
+
 #define geted               STEPNAME(geted)
 #define geted32             STEPNAME(geted32)
 #define geted16             STEPNAME(geted16)
@@ -1279,6 +1290,8 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define sse_purge07cache         STEPNAME(sse_purge07cache)
 #define sse_reflect_reg          STEPNAME(sse_reflect_reg)
 
+#define ymm_mark_zero STEPNAME(ymm_mark_zero)
+
 #define sse_get_reg_empty_vector STEPNAME(sse_get_reg_empty_vector)
 #define sse_get_reg_vector       STEPNAME(sse_get_reg_vector)
 #define sse_forget_reg_vector    STEPNAME(sse_forget_reg_vector)
@@ -1293,6 +1306,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define sse_purgecache      STEPNAME(sse_purgecache)
 #define fpu_reflectcache    STEPNAME(fpu_reflectcache)
 #define fpu_unreflectcache  STEPNAME(fpu_unreflectcache)
+#define avx_purge_ymm       STEPNAME(avx_purge_ymm)
 
 #define CacheTransform STEPNAME(CacheTransform)
 #define rv64_move64    STEPNAME(rv64_move64)
@@ -1450,6 +1464,9 @@ void x87_restoreround(dynarec_rv64_t* dyn, int ninst, int s1);
 // Set rounding according to mxcsr flags, return reg to restore flags
 int sse_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2);
 
+// purge ymm_zero mask according to purge_ymm
+void avx_purge_ymm(dynarec_rv64_t* dyn, int ninst, uint16_t mask, int s1);
+
 void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3);
 
 void rv64_move64(dynarec_rv64_t* dyn, int ninst, int reg, int64_t val);
@@ -1518,6 +1535,9 @@ void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1);
 // Push current value to the cache
 void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a);
 
+// mark a ymm upper part as zero (forgetting the upper part if needed)
+void ymm_mark_zero(dynarec_rv64_t* dyn, int ninst, int a);
+
 // common coproc helpers
 // reset the cache with n
 void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n);
@@ -1571,6 +1591,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
 uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 
+uintptr_t dynarec64_AVX(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
+uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
+
 #if STEP < 2
 #define PASS2(A)
 #else
@@ -1754,7 +1777,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
 #define FCOMIS(v1, v2, s1, s2, s3, s4, s5) FCOMI(S, v1, v2, s1, s2, s3, s4, s5)
 #define FCOMID(v1, v2, s1, s2, s3, s4, s5) FCOMI(D, v1, v2, s1, s2, s3, s4, s5)
 
-#define PURGE_YMM()    /* TODO */
+#define PURGE_YMM() avx_purge_ymm(dyn, ninst, dyn->insts[ninst + 1].purge_ymm, x1)
 
 // reg = (reg < -32768) ? -32768 : ((reg > 32767) ? 32767 : reg)
 #define SAT16(reg, s)             \
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index 4f552e6c..b591ecee 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -20,6 +20,8 @@ typedef struct instsize_s instsize_t;
 #define EXT_CACHE_SCR    7
 #define EXT_CACHE_XMMW   8
 #define EXT_CACHE_XMMR   9
+#define EXT_CACHE_YMMW   10
+#define EXT_CACHE_YMMR   11
 
 #define EXT_CACHE_OLD_SD   0
 #define EXT_CACHE_OLD_SS   1
@@ -56,7 +58,7 @@ typedef union sse_old_s {
 
 typedef struct extcache_s {
     // ext cache
-    ext_cache_t         extcache[24];
+    ext_cache_t         extcache[32];
     int8_t              stack;
     int8_t              stack_next;
     int8_t              stack_pop;
@@ -75,7 +77,7 @@ typedef struct extcache_s {
     int16_t             tags;           // similar to fpu_tags
     int8_t              mmxcache[8];    // cache status for the 8 MMX registers
     sse_cache_t         ssecache[16];   // cache status for the 16 SSE(2) registers
-    int8_t              fpuused[24];    // all 10..31 & 0..1 double reg from fpu, used by x87, sse and mmx
+    int8_t              fpuused[32];    // all double reg from fpu, used by x87, mmx, sse and avx
     int8_t              x87stack;       // cache stack counter
     int8_t              mmxcount;       // number of mmx register used (not both mmx and x87 at the same time)
     int8_t              fpu_scratch;    // scratch counter