From 2835a2f87d293ce56ddf40f88520ad971de4f06b Mon Sep 17 00:00:00 2001 From: Yang Liu Date: Mon, 28 Oct 2024 05:31:20 +0800 Subject: [RV64_DYNAREC] Added more opcodes for vector (#1966) --- src/dynarec/arm64/dynarec_arm64_0f.c | 2 -- src/dynarec/rv64/dynarec_rv64_0f_vector.c | 48 +++++++++++++++++++++++++++++ src/dynarec/rv64/dynarec_rv64_660f_vector.c | 24 +++++++++++++++ src/dynarec/rv64/dynarec_rv64_helper.c | 8 ++--- 4 files changed, 76 insertions(+), 6 deletions(-) (limited to 'src') diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c index 3969a0cf..9326ea60 100644 --- a/src/dynarec/arm64/dynarec_arm64_0f.c +++ b/src/dynarec/arm64/dynarec_arm64_0f.c @@ -283,7 +283,6 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0x14: INST_NAME("UNPCKLPS Gx, Ex"); nextop = F8; - SMREAD(); GETEX(q0, 0, 0); GETGX(v0, 1); VZIP1Q_32(v0, v0, q0); @@ -291,7 +290,6 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0x15: INST_NAME("UNPCKHPS Gx, Ex"); nextop = F8; - SMREAD(); GETEX(q0, 0, 0); GETGX(v0, 1); VZIP2Q_32(v0, v0, q0); diff --git a/src/dynarec/rv64/dynarec_rv64_0f_vector.c b/src/dynarec/rv64/dynarec_rv64_0f_vector.c index e01ed34b..1f5c0089 100644 --- a/src/dynarec/rv64/dynarec_rv64_0f_vector.c +++ b/src/dynarec/rv64/dynarec_rv64_0f_vector.c @@ -120,6 +120,54 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, VLE8_V(v0, ed, VECTOR_MASKED, VECTOR_NFIELD1); } break; + case 0x14: + INST_NAME("UNPCKLPS Gx, Ex"); + nextop = F8; + SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1); + GETGX_vector(q0, 1, VECTOR_SEW32); + GETEX_vector(q1, 0, 0, VECTOR_SEW32); + if (q0 == q1) { + q1 = fpu_get_scratch(dyn); + VMV_V_V(q1, q0); + } + v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); + v1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); + d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); // no more scratches! + // Zvbb VWSLL would help here.... + VWADDU_VX(v0, q0, xZR, VECTOR_UNMASKED); + VWADDU_VX(v1, q1, xZR, VECTOR_UNMASKED); + VSLIDE1UP_VX(d0, v1, xZR, VECTOR_UNMASKED); + VOR_VV(q0, v0, d0, VECTOR_UNMASKED); + break; + case 0x15: + INST_NAME("UNPCKHPS Gx, Ex"); + nextop = F8; + SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1); + GETGX_vector(q0, 1, VECTOR_SEW32); + GETEX_vector(q1, 0, 0, VECTOR_SEW32); + if (q0 == q1) { + q1 = fpu_get_scratch(dyn); + VMV_V_V(q1, q0); + } + v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); + v1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); + d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); // no more scratches! + if (rv64_vlen >= 256) { + VWADDU_VX(v0, q0, xZR, VECTOR_UNMASKED); + VWADDU_VX(v1, q1, xZR, VECTOR_UNMASKED); + vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL2, 2); + VSLIDEDOWN_VI(d0, v1, 3, VECTOR_UNMASKED); + VSLIDEDOWN_VI(v1, v0, 4, VECTOR_UNMASKED); + vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 1); + VOR_VV(q0, v1, d0, VECTOR_UNMASKED); + } else { + // Zvbb VWSLL would help here.... + VWADDU_VX(v0, q0, xZR, VECTOR_UNMASKED); + VWADDU_VX(v1, q1, xZR, VECTOR_UNMASKED); + VSLIDE1UP_VX(d0, v1 + 1, xZR, VECTOR_UNMASKED); + VOR_VV(q0, v0 + 1, d0, VECTOR_UNMASKED); + } + break; case 0x16: nextop = F8; if (MODREG) { diff --git a/src/dynarec/rv64/dynarec_rv64_660f_vector.c b/src/dynarec/rv64/dynarec_rv64_660f_vector.c index 3327adf8..5d835fb4 100644 --- a/src/dynarec/rv64/dynarec_rv64_660f_vector.c +++ b/src/dynarec/rv64/dynarec_rv64_660f_vector.c @@ -1831,6 +1831,30 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i MARK; VSLL_VX(q0, q0, x4, VECTOR_UNMASKED); break; + case 0xF4: + INST_NAME("PMULUDQ Gx, Ex"); + nextop = F8; + SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1); + GETGX_vector(v0, 1, VECTOR_SEW32); + GETEX_vector(v1, 0, 0, VECTOR_SEW32); + d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); + d1 = fpu_get_scratch(dyn); + VWMULU_VV(d0, v0, v1, VECTOR_UNMASKED); + if (rv64_vlen >= 256) { + VXOR_VV(v0, v0, v0, VECTOR_UNMASKED); + q0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); + vector_vsetvli(dyn, ninst, x1, VECTOR_SEW64, VECTOR_LMUL2, 2); + VSLIDE1DOWN_VX((v0 & 1) ? q0 : v0, d0, xZR, VECTOR_UNMASKED); + SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1); + if (v0 & 1) VMV_V_V(v0, q0); + VMV_X_S(x4, d0); + VMV_S_X(v0, x4); + } else { + SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1); + VSLIDEUP_VI(d0, d1, 1, VECTOR_UNMASKED); + VMV_V_V(v0, d0); + } + break; case 0xF5: INST_NAME("PMADDWD Gx, Ex"); nextop = F8; diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c index 9b2d69ee..3a942f9b 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.c +++ b/src/dynarec/rv64/dynarec_rv64_helper.c @@ -1988,7 +1988,7 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) int start = not07 ? 8 : 0; int n = 0; for (int i = start; i < 8; i++) - if (dyn->e.ssecache[i].v != -1) ++n; + if (dyn->e.ssecache[i].v != -1 && !dyn->e.ssecache[i].vector) ++n; if(n) { MESSAGE(LOG_DUMP, "\tPush (float) XMM Cache (%d)------\n", n); for (int i = start; i < 8; ++i) @@ -2039,7 +2039,7 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) // TODO: save MMX registers too when we add support for MMX vector. n = 0; for (int i = start; i < 16; i++) - if (dyn->e.ssecache[i].v != -1) ++n; + if (dyn->e.ssecache[i].v != -1 && dyn->e.ssecache[i].vector) ++n; if (n) { MESSAGE(LOG_DUMP, "\tPush (vector) XMM Cache (%d)------\n", n); for (int i = start; i < 16; ++i) @@ -2068,7 +2068,7 @@ void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) int start = not07 ? 8 : 0; int n = 0; for (int i = start; i < 8; i++) - if (dyn->e.ssecache[i].v != -1) ++n; + if (dyn->e.ssecache[i].v != -1 && !dyn->e.ssecache[i].vector) ++n; if (n) { MESSAGE(LOG_DUMP, "\tPop (float) XMM Cache (%d)------\n", n); for (int i = start; i < 8; ++i) @@ -2109,7 +2109,7 @@ void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) // TODO: restore MMX registers too when we add support for MMX vector. n = 0; for (int i = start; i < 16; i++) - if (dyn->e.ssecache[i].v != -1) ++n; + if (dyn->e.ssecache[i].v != -1 && dyn->e.ssecache[i].vector) ++n; if (n) { MESSAGE(LOG_DUMP, "\tPop (vector) XMM Cache (%d)------\n", n); for (int i = start; i < 16; ++i) -- cgit 1.4.1