diff options
| author | Yang Liu <liuyang22@iscas.ac.cn> | 2024-09-11 16:25:04 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-09-11 10:25:04 +0200 |
| commit | 4d60b75240ef44b7b15b73dda5c45b6aabdbb7e7 (patch) | |
| tree | 4995de70927f3f6093ad1e3f786d6910ad7e4b92 | |
| parent | fc9900c8f6b29185f285c0f687d9a666206071d8 (diff) | |
| download | box64-4d60b75240ef44b7b15b73dda5c45b6aabdbb7e7.tar.gz box64-4d60b75240ef44b7b15b73dda5c45b6aabdbb7e7.zip | |
[RV64_DYNAREC] Added more 0F opcodes for vector and optimized some opcodes too (#1816)
* [RV64_DYNAREC] Optimized 66 0F 67 PACKUSWB opcode
* [RV64_DYNAREC] Optimized 66 0F 6C PUNPCKLQDQ opcode
* [RV64_DYNAREC] Added some 0F opcodes for vector
* review
| -rw-r--r-- | CMakeLists.txt | 2 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_00_0.c | 5 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_0f_vector.c | 145 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_66.c | 6 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_660f_vector.c | 23 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.c | 9 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.h | 24 |
7 files changed, 180 insertions, 34 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index e84f5c6f..aa46ca6e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -910,8 +910,8 @@ if(RV64_DYNAREC) "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_00_2.c" "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_00_3.c" "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_0f.c" + "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_0f_vector.c" "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_64.c" - #"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_65.c" "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_66.c" "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_67.c" "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_67_32.c" diff --git a/src/dynarec/rv64/dynarec_rv64_00_0.c b/src/dynarec/rv64/dynarec_rv64_00_0.c index cb50df7f..4bb35312 100644 --- a/src/dynarec/rv64/dynarec_rv64_00_0.c +++ b/src/dynarec/rv64/dynarec_rv64_00_0.c @@ -40,6 +40,7 @@ uintptr_t dynarec64_00_0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int int64_t fixedaddress; int lock; int cacheupd = 0; + uintptr_t retaddr = 0; opcode = F8; MAYUSE(eb1); @@ -177,7 +178,9 @@ uintptr_t dynarec64_00_0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x0F: switch(rep) { case 0: - addr = dynarec64_0F(dyn, addr, ip, ninst, rex, ok, need_epilog); + if (rv64_vector) + retaddr = dynarec64_0F_vector(dyn, addr, ip, ninst, rex, ok, need_epilog); + addr = retaddr ? 
retaddr : dynarec64_0F(dyn, addr, ip, ninst, rex, ok, need_epilog); break; case 1: addr = dynarec64_F20F(dyn, addr, ip, ninst, rex, ok, need_epilog); diff --git a/src/dynarec/rv64/dynarec_rv64_0f_vector.c b/src/dynarec/rv64/dynarec_rv64_0f_vector.c new file mode 100644 index 00000000..6de7eaef --- /dev/null +++ b/src/dynarec/rv64/dynarec_rv64_0f_vector.c @@ -0,0 +1,145 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "dynarec.h" +#include "emu/x64emu_private.h" +#include "emu/x64run_private.h" +#include "x64run.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" +#include "my_cpuid.h" +#include "emu/x87emu_private.h" +#include "emu/x64shaext.h" +#include "bitutils.h" + +#include "rv64_printer.h" +#include "dynarec_rv64_private.h" +#include "dynarec_rv64_functions.h" +#include "dynarec_rv64_helper.h" + +uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + uint8_t nextop, u8; + uint8_t gd, ed; + uint8_t wb1, wback, wb2, gback; + uint8_t eb1, eb2; + uint8_t gb1, gb2; + int32_t i32, i32_; + int cacheupd = 0; + int v0, v1; + int q0, q1; + int d0, d1; + int s0, s1; + uint64_t tmp64u; + int64_t j64; + int64_t fixedaddress, gdoffset; + int unscaled; + MAYUSE(wb2); + MAYUSE(gback); + MAYUSE(eb1); + MAYUSE(eb2); + MAYUSE(q0); + MAYUSE(q1); + MAYUSE(d0); + MAYUSE(d1); + MAYUSE(s0); + MAYUSE(j64); + MAYUSE(cacheupd); + + switch (opcode) { + case 0x10: + INST_NAME("MOVUPS Gx, Ex"); + nextop = F8; + GETG; + SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1); + if (MODREG) { + ed = (nextop & 7) + (rex.b << 3); + v1 = sse_get_reg_vector(dyn, ninst, x1, ed, 0, dyn->vector_eew); + v0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd); + VMV_V_V(v0, v1); + } else { 
+ SMREAD(); + v0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0); + VLE_V(v0, ed, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1); + } + break; + case 0x11: + INST_NAME("MOVUPS Ex, Gx"); + nextop = F8; + SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1); + GETGX_vector(v0, 0, dyn->vector_eew); + if (MODREG) { + ed = (nextop & 7) + (rex.b << 3); + v1 = sse_get_reg_empty_vector(dyn, ninst, x1, ed); + VMV_V_V(v1, v0); + } else { + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0); + VSE_V(v0, ed, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1); + SMWRITE2(); + } + break; + case 0x16: + nextop = F8; + if (MODREG) { + INST_NAME("MOVLHPS Gx, Ex"); + SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1); + GETGX_vector(v0, 1, VECTOR_SEW64); + v1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64); + if (v0 == v1) { + // for vslideup.vi, cannot be overlapped + v1 = fpu_get_scratch(dyn); + VMV_V_V(v1, v0); + } + VSLIDEUP_VI(v0, 1, v1, VECTOR_UNMASKED); + } else { + INST_NAME("MOVHPS Gx, Ex"); + SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1); + GETGX_vector(v0, 1, VECTOR_SEW64); + q0 = fpu_get_scratch(dyn); + VXOR_VV(q0, q0, q0, VECTOR_UNMASKED); + VMV_V_I(VMASK, 0b10); + SMREAD(); + addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0); + VLUXEI64_V(v0, ed, q0, VECTOR_MASKED, VECTOR_NFIELD1); + } + break; + case 0x29: + INST_NAME("MOVAPS Ex, Gx"); + nextop = F8; + SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1); + GETGX_vector(v0, 0, dyn->vector_eew); + if (MODREG) { + ed = (nextop & 7) + (rex.b << 3); + v1 = sse_get_reg_empty_vector(dyn, ninst, x1, ed); + VMV_V_V(v1, v0); + } else { + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0); + VSE_V(v0, ed, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1); + SMWRITE2(); + } + break; + case 0x00 ... 
0x0F: + case 0x18: + case 0x1F: + case 0x31: + case 0x40 ... 0x4F: + case 0x80 ... 0xBF: + return 0; + default: + DEFAULT_VECTOR; + } + return addr; +} diff --git a/src/dynarec/rv64/dynarec_rv64_66.c b/src/dynarec/rv64/dynarec_rv64_66.c index 5ad815af..bf46d302 100644 --- a/src/dynarec/rv64/dynarec_rv64_66.c +++ b/src/dynarec/rv64/dynarec_rv64_66.c @@ -131,11 +131,9 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0x0F: switch(rep) { case 0: { - if (rv64_vector) { + if (rv64_vector) retaddr = dynarec64_660F_vector(dyn, addr, ip, ninst, rex, ok, need_epilog); - addr = retaddr ? retaddr : dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog); - } else - addr = dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog); + addr = retaddr ? retaddr : dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog); break; } case 1: addr = dynarec64_66F20F(dyn, addr, ip, ninst, rex, ok, need_epilog); break; diff --git a/src/dynarec/rv64/dynarec_rv64_660f_vector.c b/src/dynarec/rv64/dynarec_rv64_660f_vector.c index 96d4ea89..44e4b3c5 100644 --- a/src/dynarec/rv64/dynarec_rv64_660f_vector.c +++ b/src/dynarec/rv64/dynarec_rv64_660f_vector.c @@ -149,14 +149,13 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment! d0 = fpu_get_scratch(dyn); d1 = fpu_get_scratch(dyn); - VMAX_VX(d0, xZR, q0, VECTOR_UNMASKED); - VMAX_VX(d1, xZR, q1, VECTOR_UNMASKED); if (rv64_vlen >= 256) { - /* mu tu sew lmul=1 */ - vtypei = (0b0 << 7) | (0b0 << 6) | (VECTOR_SEW16 << 3) | 0b000; - ADDI(x1, xZR, 16); // double the vl for slideup. - VSETVLI(xZR, x1, vtypei); - VSLIDEUP_VI(d0, 8, d1, VECTOR_UNMASKED); // splice d0 and d1 here! + vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW16, 2); // double the vl for slideup. + VSLIDEUP_VI(q0, 8, q1, VECTOR_UNMASKED); // splice q0 and q1 here! 
+ VMAX_VX(d0, xZR, q0, VECTOR_UNMASKED); + } else { + VMAX_VX(d0, xZR, q0, VECTOR_UNMASKED); + VMAX_VX(d1, xZR, q1, VECTOR_UNMASKED); } SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1); VNCLIPU_WI(q0, 0, d0, VECTOR_UNMASKED); @@ -185,18 +184,18 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i // GX->q[0] = GX->q[0]; -> unchanged // GX->q[1] = EX->q[0]; GETGX_vector(v0, 1, VECTOR_SEW64); - q0 = fpu_get_scratch(dyn); - VXOR_VV(q0, q0, q0, VECTOR_UNMASKED); - VMV_V_I(VMASK, 0b10); if (MODREG) { v1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64); if (v0 == v1) { - // for vrgather.vv, cannot be overlapped + // for vslideup.vi, cannot be overlapped v1 = fpu_get_scratch(dyn); VMV_V_V(v1, v0); } - VRGATHER_VV(v0, q0, v1, VECTOR_MASKED); + VSLIDEUP_VI(v0, 1, v1, VECTOR_UNMASKED); } else { + q0 = fpu_get_scratch(dyn); + VXOR_VV(q0, q0, q0, VECTOR_UNMASKED); + VMV_V_I(VMASK, 0b10); SMREAD(); addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0); VLUXEI64_V(v0, ed, q0, VECTOR_MASKED, VECTOR_NFIELD1); diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c index 75ad99a8..a8ef8f21 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.c +++ b/src/dynarec/rv64/dynarec_rv64_helper.c @@ -2434,7 +2434,7 @@ static void sewTransform(dynarec_rv64_t* dyn, int ninst, int s1) if (jmp < 0) return; if (dyn->insts[jmp].vector_sew == VECTOR_SEWNA) return; MESSAGE(LOG_DUMP, "\tSEW changed to %d ---- ninst=%d -> %d\n", dyn->insts[jmp].vector_sew, ninst, jmp); - vector_vsetvl_emul1(dyn, ninst, s1, dyn->insts[jmp].vector_sew); + vector_vsetvl_emul1(dyn, ninst, s1, dyn->insts[jmp].vector_sew, 1); } void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3) @@ -2590,9 +2590,8 @@ void fpu_propagate_stack(dynarec_rv64_t* dyn, int ninst) dyn->e.swapped = 0; } -// Use vector extension as like SIMD for now, this function sets the specified element 
width, -// other configs are set automatically. -int vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew) +// Simple wrapper for vsetvli +int vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew, int multiple) { if (sew == VECTOR_SEWNA) return VECTOR_SEW8; if (sew == VECTOR_SEWANY) sew = VECTOR_SEW8; @@ -2603,7 +2602,7 @@ int vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew) * * mu tu sew lmul=1 */ uint32_t vtypei = (0b0 << 7) | (0b0 << 6) | (sew << 3) | 0b000; - ADDI(s1, xZR, 16 >> sew); + ADDI(s1, xZR, (16 >> sew) * multiple); VSETVLI(xZR, s1, vtypei); return sew; } diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h index cd3990a8..345b3c0f 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.h +++ b/src/dynarec/rv64/dynarec_rv64_helper.h @@ -1081,16 +1081,16 @@ #define MODREG ((nextop & 0xC0) == 0xC0) #ifndef SET_ELEMENT_WIDTH -#define SET_ELEMENT_WIDTH(s1, sew, set) \ - do { \ - if (sew == VECTOR_SEWANY && dyn->vector_sew != VECTOR_SEWNA) { \ - dyn->vector_eew = dyn->vector_sew; \ - } else if (sew == dyn->vector_sew) { \ - dyn->vector_eew = dyn->vector_sew; \ - } else { \ - dyn->vector_eew = vector_vsetvl_emul1(dyn, ninst, s1, sew); \ - } \ - if (set) dyn->vector_sew = dyn->vector_eew; \ +#define SET_ELEMENT_WIDTH(s1, sew, set) \ + do { \ + if (sew == VECTOR_SEWANY && dyn->vector_sew != VECTOR_SEWNA) { \ + dyn->vector_eew = dyn->vector_sew; \ + } else if (sew == dyn->vector_sew) { \ + dyn->vector_eew = dyn->vector_sew; \ + } else { \ + dyn->vector_eew = vector_vsetvl_emul1(dyn, ninst, s1, sew, 1); \ + } \ + if (set) dyn->vector_sew = dyn->vector_eew; \ } while (0) #endif @@ -1134,6 +1134,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr); #define dynarec64_F20F STEPNAME(dynarec64_F20F) #define dynarec64_F30F STEPNAME(dynarec64_F30F) +#define dynarec64_0F_vector STEPNAME(dynarec64_0F_vector) #define dynarec64_660F_vector STEPNAME(dynarec64_660F_vector) #define geted 
STEPNAME(geted) @@ -1441,7 +1442,7 @@ void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2 void rv64_move64(dynarec_rv64_t* dyn, int ninst, int reg, int64_t val); void rv64_move32(dynarec_rv64_t* dyn, int ninst, int reg, int32_t val, int zeroup); -int vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew); +int vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew, int multiple); #if STEP < 2 #define CHECK_CACHE() 0 @@ -1546,6 +1547,7 @@ uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); #if STEP < 2 |