about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Yang Liu <liuyang22@iscas.ac.cn> 2024-07-19 17:34:45 +0800
committer GitHub <noreply@github.com> 2024-07-19 11:34:45 +0200
commit 3e20a60c37b77b782a81f1a6b234a3c4ddd61d2c (patch)
tree 19dbc080091a1eff074f42eebca9db8b743e17c3
parent 984c13634980f4fdd7d7c67c3e25cb38cb360874 (diff)
download box64-3e20a60c37b77b782a81f1a6b234a3c4ddd61d2c.tar.gz
box64-3e20a60c37b77b782a81f1a6b234a3c4ddd61d2c.zip
[RV64_DYNAREC] Added vector SEW cache (#1698)
* [RV64_DYNAREC] Added vector SEW cache

* handling reset_n
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.h 1
-rw-r--r-- src/dynarec/dynarec_arch.h 10
-rw-r--r-- src/dynarec/dynarec_native_pass.c 1
-rw-r--r-- src/dynarec/la64/dynarec_la64_helper.h 1
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_660f_vector.c 20
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_functions.c 27
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_functions.h 2
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.c 49
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.h 21
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_pass0.h 35
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_pass1.h 18
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_pass2.h 18
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_pass3.h 12
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_private.h 2
-rw-r--r-- src/dynarec/rv64/rv64_emitter.h 2
15 files changed, 143 insertions, 76 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index dbc3c29f..7d7114ed 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -1107,6 +1107,7 @@
     dyn->smread = dyn->smwrite = 0; \
     dyn->doublepush = 0;            \
     dyn->doublepop = 0;
+#define ARCH_RESET()
 
 #if STEP < 2
 #define GETIP(A) TABLE64(0, 0)
diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h
index f89125a7..6a5c4977 100644
--- a/src/dynarec/dynarec_arch.h
+++ b/src/dynarec/dynarec_arch.h
@@ -38,11 +38,13 @@
 #define instruction_native_t        instruction_rv64_t

 #define dynarec_native_t            dynarec_rv64_t

 

-#define ADDITIONNAL_DEFINITION()  \

-    int fpuCacheNeedsTransform(dynarec_native_t* dyn, int ninst);

+#define ADDITIONNAL_DEFINITION()                                  \

+    int fpuCacheNeedsTransform(dynarec_native_t* dyn, int ninst); \

+    int sewNeedsTransform(dynarec_rv64_t* dyn, int ninst);

 

-#define OTHER_CACHE()   \

-    if (fpuCacheNeedsTransform(dyn, ninst)) ret|=2;

+#define OTHER_CACHE()                                 \

+    if (fpuCacheNeedsTransform(dyn, ninst)) ret |= 2; \

+    if (sewNeedsTransform(dyn, ninst)) ret |= 3;

 

 #include "rv64/rv64_printer.h"

 #include "rv64/dynarec_rv64_private.h"

diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index a6bcd4ec..779ba9be 100644
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -89,6 +89,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
                 dyn->f.dfnone = 0;
                 dyn->f.pending = 0;
                 fpu_reset(dyn);
+                ARCH_RESET();
             } else {
                 fpu_reset_cache(dyn, ninst, reset_n);
                 dyn->f = dyn->insts[reset_n].f_exit;
diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h
index 573c115b..f2972274 100644
--- a/src/dynarec/la64/dynarec_la64_helper.h
+++ b/src/dynarec/la64/dynarec_la64_helper.h
@@ -710,6 +710,7 @@
 #endif
 
 #define ARCH_INIT()
+#define ARCH_RESET()
 
 #if STEP < 2
 #define GETIP(A) TABLE64(0, 0)
diff --git a/src/dynarec/rv64/dynarec_rv64_660f_vector.c b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
index deb95d26..4ca426e6 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
@@ -50,11 +50,9 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
         case 0x28:
             INST_NAME("MOVAPD Gx, Ex");
             nextop = F8;
-            // FIXME
-            vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW8);
-
             GETG;
             if (MODREG) {
+                SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY);
                 ed = (nextop & 7) + (rex.b << 3);
                 v1 = sse_get_reg_vector(dyn, ninst, x1, ed, 0);
                 v0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd);
@@ -63,7 +61,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 SMREAD();
                 v0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0);
-                VLE8_V(v0, ed, VECTOR_UNMASKED, VECTOR_NFIELD1);
+                VL1RE64_V(v0, ed);
             }
             break;
         case 0x38: // SSSE3 opcodes
@@ -72,9 +70,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 case 0x00:
                     INST_NAME("PSHUFB Gx, Ex");
                     nextop = F8;
-                    // FIXME
-                    vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW8);
-
+                    SET_ELEMENT_WIDTH(x1, VECTOR_SEW8);
                     GETGX_vector(q0, 1);
                     GETEX_vector(q1, 0, 0);
                     v0 = fpu_get_scratch(dyn);
@@ -94,10 +90,8 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
         case 0x6F:
             INST_NAME("MOVDQA Gx, Ex");
             nextop = F8;
-            // FIXME
-            vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW8);
-
             if (MODREG) {
+                SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY);
                 v1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0);
                 GETGX_empty_vector(v0);
                 VMV_V_V(v0, v1);
@@ -105,7 +99,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 GETGX_empty_vector(v0);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 0, 0);
-                VLE8_V(v0, ed, VECTOR_UNMASKED, VECTOR_NFIELD1);
+                VL1RE64_V(v0, ed);
             }
             break;
         case 0x7E:
@@ -113,9 +107,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
         case 0xEF:
             INST_NAME("PXOR Gx, Ex");
             nextop = F8;
-            // FIXME: we should try to minimize vsetvl usage as it may hurts performance a lot.
-            vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW8);
-
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY);
             GETG;
             if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
                 // special case
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c
index 6ce97b8a..2d46d5e2 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.c
+++ b/src/dynarec/rv64/dynarec_rv64_functions.c
@@ -18,6 +18,7 @@
 #include "callback.h"
 #include "emu/x64run_private.h"
 #include "emu/x87emu_private.h"
+#include "rv64_emitter.h"
 #include "x64trace.h"
 #include "signals.h"
 #include "dynarec_native.h"
@@ -370,6 +371,20 @@ int fpuCacheNeedsTransform(dynarec_rv64_t* dyn, int ninst) {
     return ret;
 }
 
+int sewNeedsTransform(dynarec_rv64_t* dyn, int ninst) // returns 0 when the SEW recorded for ninst already satisfies the one recorded for its jmp_insts entry, 1 otherwise
+{
+    int i2 = dyn->insts[ninst].x64.jmp_insts; // index into dyn->insts, presumably the jump target; NOTE(review): not checked for < 0 here, unlike sewTransform - confirm callers guarantee a valid index
+
+    if (dyn->insts[i2].vector_sew == VECTOR_SEWNA) // i2 records no SEW (N/A): nothing to match
+        return 0;
+    else if (dyn->insts[i2].vector_sew == VECTOR_SEWANY && dyn->insts[ninst].vector_sew != VECTOR_SEWNA) // i2 accepts any SEW and ninst has one recorded (not N/A)
+        return 0;
+    else if (dyn->insts[i2].vector_sew == dyn->insts[ninst].vector_sew) // both record the same value
+        return 0;
+
+    return 1; // recorded values differ: report that a transform is needed
+}
+
 void extcacheUnwind(extcache_t* cache)
 {
     if(cache->swapped) {
@@ -592,22 +607,22 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
     };
     if(box64_dynarec_dump) {
         printf_x64_instruction(rex.is32bits?my_context->dec32:my_context->dec, &dyn->insts[ninst].x64, name);
-        dynarec_log(LOG_NONE, "%s%p: %d emitted opcodes, inst=%d, barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X, sm=%d/%d",
-            (box64_dynarec_dump>1)?"\e[32m":"",
-            (void*)(dyn->native_start+dyn->insts[ninst].address),
-            dyn->insts[ninst].size/4,
+        dynarec_log(LOG_NONE, "%s%p: %d emitted opcodes, inst=%d, barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X, sm=%d/%d, sew=%d",
+            (box64_dynarec_dump > 1) ? "\e[32m" : "",
+            (void*)(dyn->native_start + dyn->insts[ninst].address),
+            dyn->insts[ninst].size / 4,
             ninst,
             dyn->insts[ninst].x64.barrier,
             dyn->insts[ninst].x64.state_flags,
             dyn->f.pending,
             dyn->f.dfnone,
-            dyn->insts[ninst].x64.may_set?"may":"set",
+            dyn->insts[ninst].x64.may_set ? "may" : "set",
             dyn->insts[ninst].x64.set_flags,
             dyn->insts[ninst].x64.gen_flags,
             dyn->insts[ninst].x64.use_flags,
             dyn->insts[ninst].x64.need_before,
             dyn->insts[ninst].x64.need_after,
-            dyn->smread, dyn->smwrite);
+            dyn->smread, dyn->smwrite, dyn->insts[ninst].vector_sew);
         if(dyn->insts[ninst].pred_sz) {
             dynarec_log(LOG_NONE, ", pred=");
             for(int ii=0; ii<dyn->insts[ninst].pred_sz; ++ii)
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.h b/src/dynarec/rv64/dynarec_rv64_functions.h
index e3a5171d..41d01c4b 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.h
+++ b/src/dynarec/rv64/dynarec_rv64_functions.h
@@ -46,6 +46,8 @@ int extcache_no_i64(dynarec_rv64_t* dyn, int ninst, int st, int a);
 // FPU Cache transformation (for loops) // Specific, need to be written par backend
 int fpuCacheNeedsTransform(dynarec_rv64_t* dyn, int ninst);
 
+int sewNeedsTransform(dynarec_rv64_t* dyn, int ninst);
+
 // Undo the changes of a extcache to get the status before the instruction
 void extcacheUnwind(extcache_t* cache);
 
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c
index 4c132350..3229b3da 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.c
+++ b/src/dynarec/rv64/dynarec_rv64_helper.c
@@ -1675,7 +1675,7 @@ int sse_get_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a, int forwri
     dyn->e.ssecache[a].vector = 1;
     dyn->e.ssecache[a].single = 0; // just to be clean
     ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a]));
-    VLE8_V(ret, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+    VL1RE64_V(ret, s1);
     return ret;
 }
 
@@ -1710,7 +1710,7 @@ void sse_forget_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a)
         return sse_forget_reg(dyn, ninst, s1, a);
     if (dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t == EXT_CACHE_XMMW) {
         ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a]));
-        VSE8_V(dyn->e.ssecache[a].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+        VS1R_V(dyn->e.ssecache[a].reg, s1);
     }
     fpu_free_reg(dyn, dyn->e.ssecache[a].reg);
     dyn->e.ssecache[a].v = -1;
@@ -1729,7 +1729,7 @@ void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1)
             }
             if (dyn->e.ssecache[i].vector) {
                 ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
-                VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+                VS1R_V(dyn->e.ssecache[i].reg, s1);
             } else if (dyn->e.ssecache[i].single)
                 FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
@@ -1754,7 +1754,7 @@ static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1)
             }
             if (dyn->e.ssecache[i].vector) {
                 ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
-                VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+                VS1R_V(dyn->e.ssecache[i].reg, s1);
             } else if (dyn->e.ssecache[i].single)
                 FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
@@ -1784,7 +1784,7 @@ static void sse_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1)
         if (dyn->e.ssecache[i].v != -1) {
             if (dyn->e.ssecache[i].vector) {
                 ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
-                VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+                VS1R_V(dyn->e.ssecache[i].reg, s1);
             } else if (dyn->e.ssecache[i].single)
                 FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
@@ -1798,7 +1798,7 @@ void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a)
         return;
     if (dyn->e.ssecache[a].vector) {
         ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a]));
-        VSE8_V(dyn->e.ssecache[a].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+        VS1R_V(dyn->e.ssecache[a].reg, s1);
     } else if (dyn->e.ssecache[a].single)
         FSW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
     else
@@ -1821,7 +1821,7 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
             if(dyn->e.ssecache[i].v!=-1) {
                 if (dyn->e.ssecache[i].vector) {
                     ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
-                    VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+                    VS1R_V(dyn->e.ssecache[i].reg, s1);
                 } else if (dyn->e.ssecache[i].single)
                     FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
                 else
@@ -1867,7 +1867,7 @@ void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
             if(dyn->e.ssecache[i].v!=-1) {
                 if (dyn->e.ssecache[i].vector) {
                     ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
-                    VLE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+                    VL1RE64_V(dyn->e.ssecache[i].reg, s1);
                 } else if (dyn->e.ssecache[i].single)
                     FLW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
                 else
@@ -2047,7 +2047,7 @@ static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int
         case EXT_CACHE_XMMW:
             MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
             ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n]));
-            VLE8_V(i, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            VL1RE64_V(i, s1);
             break;
         case EXT_CACHE_SS:
             MESSAGE(LOG_DUMP, "\t  - Loading %s\n", getCacheName(t, n));
@@ -2106,7 +2106,7 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i
         case EXT_CACHE_XMMW:
             MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
             ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n]));
-            VSE8_V(i, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            VS1R_V(i, s1);
             break;
         case EXT_CACHE_SS:
             MESSAGE(LOG_DUMP, "\t  - Unloading %s\n", getCacheName(t, n));
@@ -2197,7 +2197,6 @@ static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, in
     int s2_val = 0;
     // unload every uneeded cache
     // check SSE first, than MMX, in order, for optimisation issue
-    if (rv64_vector) vector_vsetvl_emul1(dyn, ninst, s1, VECTOR_SEW8);
     for (int i = 0; i < 16; ++i) {
         int j = findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache);
         if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache_i2) == -1)
@@ -2339,10 +2338,24 @@ static void flagsCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1)
 #endif
 }
 
+static void sewTransform(dynarec_rv64_t* dyn, int ninst, int s1) // for STEP > 1, calls vector_vsetvl_emul1 with the SEW recorded for ninst's jmp_insts entry; empty body in earlier steps
+{
+#if STEP > 1
+    int j64; // NOTE(review): appears unused in this function - confirm no macro expands to it
+    int jmp = dyn->insts[ninst].x64.jmp_insts; // index into dyn->insts of the instruction whose SEW is applied
+    if (jmp < 0) return; // negative index: nothing to look up
+    if (dyn->insts[jmp].vector_sew == VECTOR_SEWNA) return; // that instruction records no SEW (N/A)
+    MESSAGE(LOG_DUMP, "\tSEW changed to %d ---- ninst=%d -> %d\n", dyn->insts[jmp].vector_sew, ninst, jmp);
+    vector_vsetvl_emul1(dyn, ninst, s1, dyn->insts[jmp].vector_sew); // s1 is forwarded unchanged; vector_vsetvl_emul1 itself maps VECTOR_SEWANY to VECTOR_SEW8
+#endif
+}
+
 void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3) {
-    if(cacheupd&2)
+    if (cacheupd & 3)
+        sewTransform(dyn, ninst, s1);
+    if (cacheupd & 2)
         fpuCacheTransform(dyn, ninst, s1, s2, s3);
-    if(cacheupd&1)
+    if (cacheupd & 1)
         flagsCacheTransform(dyn, ninst, s1);
 }
 
@@ -2426,16 +2439,18 @@ void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n)
 {
     MESSAGE(LOG_DEBUG, "Reset Caches with %d\n",reset_n);
     #if STEP > 1
-    // for STEP 2 & 3, just need to refrest with current, and undo the changes (push & swap)
+    // for STEP 2 & 3, just need to refresh with current, and undo the changes (push & swap)
     dyn->e = dyn->insts[ninst].e;
+    dyn->vector_sew = dyn->insts[ninst].vector_sew;
     #else
     dyn->e = dyn->insts[reset_n].e;
+    dyn->vector_sew = dyn->insts[reset_n].vector_sew;
     #endif
     extcacheUnwind(&dyn->e);
     #if STEP == 0
     if(box64_dynarec_dump) dynarec_log(LOG_NONE, "New x87stack=%d\n", dyn->e.x87stack);
     #endif
-    #if defined(HAVE_TRACE) && (STEP>2)
+    #if defined(HAVE_TRACE) && (STEP > 2)
     if(box64_dynarec_dump)
         if(memcmp(&dyn->e, &dyn->insts[reset_n].e, sizeof(ext_cache_t))) {
             MESSAGE(LOG_DEBUG, "Warning, difference in extcache: reset=");
@@ -2464,7 +2479,7 @@ void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n)
                 MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->e.stack_push, -dyn->e.stack_pop);
             MESSAGE(LOG_DEBUG, "\n");
         }
-    #endif //HAVE_TRACE
+#endif // HAVE_TRACE
 }
 
 // propagate ST stack state, especial stack pop that are deferred
@@ -2492,6 +2507,8 @@ void fpu_propagate_stack(dynarec_rv64_t* dyn, int ninst)
 // other configs are set automatically.
 void vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew)
 {
+    if (sew == VECTOR_SEWNA) return;
+    if (sew == VECTOR_SEWANY) sew = VECTOR_SEW8;
     /* mu:   mask undisturbed
      * tu:   tail undisturbed
      * sew:  selected element width
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 17cca67d..e3c3bbf2 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -508,7 +508,7 @@
         addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 1, D); \
         a = fpu_get_scratch(dyn);                                                            \
         ADDI(x2, ed, fixedaddress);                                                          \
-        VLE8_V(a, x2, VECTOR_UNMASKED, VECTOR_NFIELD1);                                      \
+        VL1RE64_V(a, x2);                                                                    \
     }
 
 #define GETGM()                     \
@@ -1031,8 +1031,10 @@
 #define FTABLE64(A, V)
 #endif
 
-#define ARCH_INIT()
-
+#define ARCH_INIT() \
+    dyn->vector_sew = VECTOR_SEWNA;
+#define ARCH_RESET() \
+    dyn->vector_sew = VECTOR_SEWNA;
 
 #if STEP < 2
 #define GETIP(A) TABLE64(0, 0)
@@ -1078,6 +1080,19 @@
 
 #define MODREG ((nextop & 0xC0) == 0xC0)
 
+#ifndef SET_ELEMENT_WIDTH
+#define SET_ELEMENT_WIDTH(s1, sew)                                            \
+    do {                                                                      \
+        if (sew == VECTOR_SEWNA) {                                            \
+        } else if (sew == VECTOR_SEWANY && dyn->vector_sew != VECTOR_SEWNA) { \
+        } else if (sew == dyn->vector_sew) {                                  \
+        } else {                                                              \
+            vector_vsetvl_emul1(dyn, ninst, s1, sew);                         \
+        }                                                                     \
+        dyn->vector_sew = sew;                                                \
+    } while (0)
+#endif
+
 void rv64_epilog(void);
 void rv64_epilog_fast(void);
 void* rv64_next(x64emu_t* emu, uintptr_t addr);
diff --git a/src/dynarec/rv64/dynarec_rv64_pass0.h b/src/dynarec/rv64/dynarec_rv64_pass0.h
index ed3c321b..8924cae0 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass0.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass0.h
@@ -20,20 +20,26 @@
 #define BARRIER(A)      if(A!=BARRIER_MAYBE) {fpu_purgecache(dyn, ninst, 0, x1, x2, x3); dyn->insts[ninst].x64.barrier = A;} else dyn->insts[ninst].barrier_maybe = 1
 #define BARRIER_NEXT(A) dyn->insts[ninst].x64.barrier_next = A
 #define SET_HASCALLRET()    dyn->insts[ninst].x64.has_callret = 1
-#define NEW_INST \
-        ++dyn->size;                            \
-        memset(&dyn->insts[ninst], 0, sizeof(instruction_native_t));     \
-        dyn->insts[ninst].x64.addr = ip;        \
-        dyn->e.combined1 = dyn->e.combined2 = 0;\
-        dyn->e.swapped = 0; dyn->e.barrier = 0; \
-        for(int i=0; i<16; ++i) dyn->e.olds[i].v = 0;\
-        dyn->insts[ninst].f_entry = dyn->f;     \
-        if(ninst) {dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr;}
+#define NEW_INST                                                                   \
+    ++dyn->size;                                                                   \
+    memset(&dyn->insts[ninst], 0, sizeof(instruction_native_t));                   \
+    dyn->insts[ninst].x64.addr = ip;                                               \
+    dyn->e.combined1 = dyn->e.combined2 = 0;                                       \
+    dyn->e.swapped = 0;                                                            \
+    dyn->e.barrier = 0;                                                            \
+    for (int i = 0; i < 16; ++i)                                                   \
+        dyn->e.olds[i].v = 0;                                                      \
+    dyn->insts[ninst].f_entry = dyn->f;                                            \
+    if (reset_n != -1)                                                             \
+        dyn->vector_sew = ninst ? dyn->insts[ninst - 1].vector_sew : VECTOR_SEWNA; \
+    if (ninst)                                                                     \
+        dyn->insts[ninst - 1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst - 1].x64.addr;
 
-#define INST_EPILOG                             \
-        dyn->insts[ninst].f_exit = dyn->f;      \
-        dyn->insts[ninst].e = dyn->e;           \
-        dyn->insts[ninst].x64.has_next = (ok>0)?1:0;
+#define INST_EPILOG                                 \
+    dyn->insts[ninst].f_exit = dyn->f;              \
+    dyn->insts[ninst].e = dyn->e;                   \
+    dyn->insts[ninst].vector_sew = dyn->vector_sew; \
+    dyn->insts[ninst].x64.has_next = (ok > 0) ? 1 : 0;
 #define INST_NAME(name)
 #define DEFAULT                         \
         --dyn->size;                    \
@@ -66,3 +72,6 @@
         dynarec_log(LOG_NONE, "\n");                                                                         \
     }                                                                                                        \
     return 0
+
+#define SET_ELEMENT_WIDTH(s1, sew) \
+    dyn->vector_sew = sew;
diff --git a/src/dynarec/rv64/dynarec_rv64_pass1.h b/src/dynarec/rv64/dynarec_rv64_pass1.h
index b76d7e97..c7813ba0 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass1.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass1.h
@@ -1,15 +1,19 @@
-#define INIT    
+#define INIT
 #define FINI
 #define MESSAGE(A, ...) do {} while (0)
 #define EMIT(A) do {} while (0)
-#define NEW_INST                                \
-        dyn->insts[ninst].f_entry = dyn->f;     \
-        dyn->e.combined1 = dyn->e.combined2 = 0;\
-        for(int i=0; i<16; ++i) dyn->e.olds[i].v = 0;\
-        dyn->e.swapped = 0; dyn->e.barrier = 0
+#define NEW_INST                                                                   \
+    dyn->insts[ninst].f_entry = dyn->f;                                            \
+    dyn->e.combined1 = dyn->e.combined2 = 0;                                       \
+    for (int i = 0; i < 16; ++i)                                                   \
+        dyn->e.olds[i].v = 0;                                                      \
+    if (reset_n != -1)                                                             \
+        dyn->vector_sew = ninst ? dyn->insts[ninst - 1].vector_sew : VECTOR_SEWNA; \
+    dyn->e.swapped = 0;                                                            \
+    dyn->e.barrier = 0
 
 #define INST_EPILOG                             \
         dyn->insts[ninst].e = dyn->e;           \
         dyn->insts[ninst].f_exit = dyn->f
 
-#define INST_NAME(name)  
+#define INST_NAME(name)
diff --git a/src/dynarec/rv64/dynarec_rv64_pass2.h b/src/dynarec/rv64/dynarec_rv64_pass2.h
index 6761a454..37a71b9a 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass2.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass2.h
@@ -7,13 +7,15 @@
 
 #define MESSAGE(A, ...) do {} while (0)
 #define EMIT(A)     do {dyn->insts[ninst].size+=4; dyn->native_size+=4;}while(0)
-#define NEW_INST                                                                                        \
-        if(ninst) {                                                                                     \
-                dyn->insts[ninst].address = (dyn->insts[ninst-1].address+dyn->insts[ninst-1].size);     \
-                dyn->insts_size += 1+((dyn->insts[ninst-1].x64.size>(dyn->insts[ninst-1].size/4))?dyn->insts[ninst-1].x64.size:(dyn->insts[ninst-1].size/4))/15; \
-                dyn->insts[ninst].ymm0_pass2 = dyn->ymm_zero;                                           \
-        }
-#define INST_EPILOG dyn->insts[ninst].epilog = dyn->native_size; 
-#define INST_NAME(name) 
+#define NEW_INST                                                                                                                                                               \
+    if (reset_n != -1)                                                                                                                                                         \
+        dyn->vector_sew = ninst ? dyn->insts[ninst - 1].vector_sew : VECTOR_SEWNA;                                                                                             \
+    if (ninst) {                                                                                                                                                               \
+        dyn->insts[ninst].address = (dyn->insts[ninst - 1].address + dyn->insts[ninst - 1].size);                                                                              \
+        dyn->insts_size += 1 + ((dyn->insts[ninst - 1].x64.size > (dyn->insts[ninst - 1].size / 4)) ? dyn->insts[ninst - 1].x64.size : (dyn->insts[ninst - 1].size / 4)) / 15; \
+        dyn->insts[ninst].ymm0_pass2 = dyn->ymm_zero;                                                                                                                          \
+    }
+#define INST_EPILOG dyn->insts[ninst].epilog = dyn->native_size;
+#define INST_NAME(name)
 #define TABLE64(A, V)   {Table64(dyn, (V), 2); EMIT(0); EMIT(0);}
 #define FTABLE64(A, V)  {mmx87_regs_t v = {.d = V}; Table64(dyn, v.q, 2); EMIT(0); EMIT(0);}
diff --git a/src/dynarec/rv64/dynarec_rv64_pass3.h b/src/dynarec/rv64/dynarec_rv64_pass3.h
index 4a32a728..1dce2bc4 100644
--- a/src/dynarec/rv64/dynarec_rv64_pass3.h
+++ b/src/dynarec/rv64/dynarec_rv64_pass3.h
@@ -12,11 +12,13 @@
     }while(0)
 
 #define MESSAGE(A, ...)  if(box64_dynarec_dump) dynarec_log(LOG_NONE, __VA_ARGS__)
-#define NEW_INST        \
-    if(box64_dynarec_dump) print_newinst(dyn, ninst);   \
-    if(ninst) {                                         \
-        addInst(dyn->instsize, &dyn->insts_size, dyn->insts[ninst-1].x64.size, dyn->insts[ninst-1].size/4); \
-        dyn->insts[ninst].ymm0_pass3 = dyn->ymm_zero;   \
+#define NEW_INST                                                                                                  \
+    if (reset_n != -1)                                                                                            \
+        dyn->vector_sew = ninst ? dyn->insts[ninst - 1].vector_sew : VECTOR_SEWNA;                                \
+    if (box64_dynarec_dump) print_newinst(dyn, ninst);                                                            \
+    if (ninst) {                                                                                                  \
+        addInst(dyn->instsize, &dyn->insts_size, dyn->insts[ninst - 1].x64.size, dyn->insts[ninst - 1].size / 4); \
+        dyn->insts[ninst].ymm0_pass3 = dyn->ymm_zero;                                                             \
     }
 #define INST_EPILOG
 #define INST_NAME(name) inst_name_pass3(dyn, ninst, name, rex)
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index 94c4cf23..70f58661 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -110,6 +110,7 @@ typedef struct instruction_rv64_s {
     flagcache_t         f_exit;     // flags status at end of intruction
     extcache_t          e;          // extcache at end of intruction (but before poping)
     flagcache_t         f_entry;    // flags status before the instruction begin
+    uint8_t             vector_sew;
 } instruction_rv64_t;
 
 typedef struct dynarec_rv64_s {
@@ -148,6 +149,7 @@ typedef struct dynarec_rv64_s {
     uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
     uint8_t             always_test;
     uint8_t             abort;
+    uint8_t             vector_sew;
 } dynarec_rv64_t;
 
 // convert idx (0..24) to reg index (10..31 0..1)
diff --git a/src/dynarec/rv64/rv64_emitter.h b/src/dynarec/rv64/rv64_emitter.h
index e0fe4403..4d574684 100644
--- a/src/dynarec/rv64/rv64_emitter.h
+++ b/src/dynarec/rv64/rv64_emitter.h
@@ -1215,6 +1215,8 @@ f28–31  ft8–11  FP temporaries                  Caller
 #define VECTOR_SEW16 0b001
 #define VECTOR_SEW32 0b010
 #define VECTOR_SEW64 0b011
+#define VECTOR_SEWNA  0b111  // N/A
+#define VECTOR_SEWANY 0b1000 // any sew would be ok, but not N/A.
 
 #define VECTOR_MASKED   0
 #define VECTOR_UNMASKED 1