| | | |
|---|---|---|
| author | Yang Liu <liuyang22@iscas.ac.cn> | 2024-09-18 15:23:54 +0800 |
| committer | GitHub <noreply@github.com> | 2024-09-18 09:23:54 +0200 |
| commit | b37b6addac68d908102db1de6d753b4b059a5516 (patch) | |
| tree | ef0b9d8784359a068fac4768569eb2b1a095c28d /src | |
| parent | a2dfe70cd86749107a8aeb27869e0efe34ee9c4b (diff) | |
| download | box64-b37b6addac68d908102db1de6d753b4b059a5516.tar.gz, box64-b37b6addac68d908102db1de6d753b4b059a5516.zip | |
[RV64_DYNAREC] Added more opcodes for vector (#1833)
* [RV64_DYNAREC] Added more opcodes for vector
* review
Diffstat (limited to 'src')
| | | |
|---|---|---|
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_660f_vector.c | 170 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_functions.c | 10 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_functions.h | 2 |
3 files changed, 139 insertions, 43 deletions
```diff
diff --git a/src/dynarec/rv64/dynarec_rv64_660f_vector.c b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
index 3f03f50d..ddbb9268 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
@@ -129,8 +129,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_vector(q0, 1, VECTOR_SEW16);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             fpu_get_scratch(dyn);
             VWMUL_VV(v0, q0, q1, VECTOR_UNMASKED);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL2, 2);
@@ -203,8 +202,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
             GETGX_empty_vector(q0);
             GETEX_vector(q1, 0, 0, VECTOR_SEW8);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 0.5);
             VWADD_VX(v0, xZR, q1, VECTOR_UNMASKED);
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
@@ -216,8 +214,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
             GETGX_empty_vector(q0);
             GETEX_vector(q1, 0, 0, VECTOR_SEW8);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             fpu_get_scratch(dyn);
             v1 = fpu_get_scratch(dyn);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 0.25);
@@ -233,8 +230,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
             GETGX_empty_vector(q0);
             GETEX_vector(q1, 0, 0, VECTOR_SEW8);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             fpu_get_scratch(dyn);
             v1 = fpu_get_scratch(dyn);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 0.125);
@@ -252,8 +248,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_empty_vector(q0);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.5);
             VWADD_VX(v0, xZR, q1, VECTOR_UNMASKED);
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
@@ -265,8 +260,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_empty_vector(q0);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             fpu_get_scratch(dyn);
             v1 = fpu_get_scratch(dyn);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.25);
@@ -282,8 +276,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
             GETGX_empty_vector(q0);
             GETEX_vector(q1, 0, 0, VECTOR_SEW32);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 0.5);
             VWADD_VX(v0, xZR, q1, VECTOR_UNMASKED);
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
@@ -295,8 +288,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
             GETGX_empty_vector(q0);
             GETEX_vector(q1, 0, 0, VECTOR_SEW8);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 0.5);
             VWADDU_VX(v0, xZR, q1, VECTOR_UNMASKED);
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
@@ -308,8 +300,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
             GETGX_empty_vector(q0);
             GETEX_vector(q1, 0, 0, VECTOR_SEW8);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             fpu_get_scratch(dyn);
             v1 = fpu_get_scratch(dyn);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 0.25);
@@ -325,8 +316,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
             GETGX_empty_vector(q0);
             GETEX_vector(q1, 0, 0, VECTOR_SEW8);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             fpu_get_scratch(dyn);
             v1 = fpu_get_scratch(dyn);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW8, VECTOR_LMUL1, 0.125);
@@ -344,8 +334,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_empty_vector(q0);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.5);
             VWADDU_VX(v0, xZR, q1, VECTOR_UNMASKED);
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
@@ -357,8 +346,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_empty_vector(q0);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             fpu_get_scratch(dyn);
             v1 = fpu_get_scratch(dyn);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 0.25);
@@ -374,8 +362,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
             GETGX_empty_vector(q0);
             GETEX_vector(q1, 0, 0, VECTOR_SEW32);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            v0 = fpu_get_scratch(dyn);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 0.5);
             VWADDU_VX(v0, xZR, q1, VECTOR_UNMASKED);
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
@@ -655,12 +642,61 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             VRGATHER_VV(d1, v0, q1, VECTOR_UNMASKED);
             VMERGE_VVM(q0, d1, d0);
             break;
-        case 0x66:
-            INST_NAME("PCMPGTD Gx, Ex");
+        case 0x62:
+            INST_NAME("PUNPCKLDQ Gx,Ex");
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
+            ADDI(x1, xZR, 0b1010);
+            VMV_V_X(VMASK, x1); // VMASK = 0b1010
+            v0 = fpu_get_scratch(dyn);
+            VIOTA_M(v0, VMASK, VECTOR_UNMASKED); // v0 = 1 1 0 0
             GETGX_vector(q0, 1, VECTOR_SEW32);
             GETEX_vector(q1, 0, 0, VECTOR_SEW32);
+            d0 = fpu_get_scratch(dyn);
+            d1 = fpu_get_scratch(dyn);
+            VRGATHER_VV(d0, v0, q0, VECTOR_UNMASKED);
+            VRGATHER_VV(d1, v0, q1, VECTOR_UNMASKED);
+            VMERGE_VVM(q0, d1, d0);
+            break;
+        case 0x63:
+            INST_NAME("PACKSSWB Gx, Ex");
+            nextop = F8;
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
+            GETGX_vector(q0, 1, VECTOR_SEW16);
+            GETEX_vector(q1, 0, 0, VECTOR_SEW16);
+            d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
+            d1 = fpu_get_scratch(dyn);
+            if (rv64_vlen >= 256) {
+                vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 2); // double the vl for slideup.
+                if (q0 == q1) {
+                    VMV_V_V(d0, q0);
+                    VSLIDEUP_VI(d0, 8, q1, VECTOR_UNMASKED); // splice q0 and q1 here!
+                } else {
+                    VSLIDEUP_VI(q0, 8, q1, VECTOR_UNMASKED); // splice q0 and q1 here!
+                    d0 = q0;
+                }
+            } else {
+                VMV_V_V(d0, q0);
+                VMV_V_V(d1, q1);
+            }
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
+            VNCLIP_WI(q0, 0, d0, VECTOR_UNMASKED);
+            break;
+        case 0x64 ... 0x66:
+            if (opcode == 0x64) {
+                INST_NAME("PCMPGTB Gx,Ex");
+                u8 = VECTOR_SEW8;
+            } else if (opcode == 0x65) {
+                INST_NAME("PCMPGTW Gx,Ex");
+                u8 = VECTOR_SEW16;
+            } else {
+                INST_NAME("PCMPGTD Gx, Ex");
+                u8 = VECTOR_SEW32;
+            }
+            nextop = F8;
+            SET_ELEMENT_WIDTH(x1, u8, 1);
+            GETGX_vector(q0, 1, dyn->vector_eew);
+            GETEX_vector(q1, 0, 0, dyn->vector_eew);
             VMSLT_VV(VMASK, q0, q1, VECTOR_UNMASKED);
             VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
             VMERGE_VIM(q0, 1, q0); // implies vmask and widened it
@@ -672,13 +708,18 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
             GETGX_vector(q0, 1, VECTOR_SEW16);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
-            fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
-            d0 = fpu_get_scratch(dyn);
+            d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             d1 = fpu_get_scratch(dyn);
             if (rv64_vlen >= 256) {
                 vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL1, 2); // double the vl for slideup.
-                VSLIDEUP_VI(q0, 8, q1, VECTOR_UNMASKED); // splice q0 and q1 here!
-                VMAX_VX(d0, xZR, q0, VECTOR_UNMASKED);
+                if (q0 == q1) {
+                    VMV_V_V(d0, q0);
+                    VSLIDEUP_VI(d0, 8, q1, VECTOR_UNMASKED); // splice q0 and q1 here!
+                    VMAX_VX(d0, xZR, d0, VECTOR_UNMASKED);
+                } else {
+                    VSLIDEUP_VI(q0, 8, q1, VECTOR_UNMASKED); // splice q0 and q1 here!
+                    VMAX_VX(d0, xZR, q0, VECTOR_UNMASKED);
+                }
             } else {
                 VMAX_VX(d0, xZR, q0, VECTOR_UNMASKED);
                 VMAX_VX(d1, xZR, q1, VECTOR_UNMASKED);
@@ -686,23 +727,66 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
             VNCLIPU_WI(q0, 0, d0, VECTOR_UNMASKED);
             break;
-        case 0x69:
-            INST_NAME("PUNPCKHWD Gx, Ex");
-            nextop = F8;
-            SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
-            ADDI(x1, xZR, 0b10101010);
-            VMV_V_X(VMASK, x1); // VMASK = 0b10101010
-            v0 = fpu_get_scratch(dyn);
-            VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
-            VADD_VI(v0, 4, v0, VECTOR_UNMASKED); // v0 = 7 7 6 6 5 5 4 4
-            GETGX_vector(q0, 1, VECTOR_SEW16);
-            GETEX_vector(q1, 0, 0, VECTOR_SEW16);
+        case 0x68 ... 0x6A:
+            if (opcode == 0x68) {
+                INST_NAME("PUNPCKHBW Gx,Ex");
+                nextop = F8;
+                SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
+                ADDI(x1, xZR, 0b1010101010101010);
+                VMV_V_X(VMASK, x1); // VMASK = 0b1010101010101010
+                v0 = fpu_get_scratch(dyn);
+                VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
+                VADD_VI(v0, 8, v0, VECTOR_UNMASKED); // v0 = 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8
+            } else if (opcode == 0x69) {
+                INST_NAME("PUNPCKHWD Gx, Ex");
+                nextop = F8;
+                SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
+                ADDI(x1, xZR, 0b10101010);
+                VMV_V_X(VMASK, x1); // VMASK = 0b10101010
+                v0 = fpu_get_scratch(dyn);
+                VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
+                VADD_VI(v0, 4, v0, VECTOR_UNMASKED); // v0 = 7 7 6 6 5 5 4 4
+            } else {
+                INST_NAME("PUNPCKHDQ Gx, Ex");
+                nextop = F8;
+                SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
+                VMV_V_I(VMASK, 0b1010);
+                v0 = fpu_get_scratch(dyn);
+                VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
+                VADD_VI(v0, 2, v0, VECTOR_UNMASKED); // v0 = 3 3 2 2
+            }
+            GETGX_vector(q0, 1, dyn->vector_eew);
+            GETEX_vector(q1, 0, 0, dyn->vector_eew);
             d0 = fpu_get_scratch(dyn);
             d1 = fpu_get_scratch(dyn);
             VRGATHER_VV(d0, v0, q0, VECTOR_UNMASKED);
             VRGATHER_VV(d1, v0, q1, VECTOR_UNMASKED);
             VMERGE_VVM(q0, d1, d0);
             break;
+        case 0x6B:
+            INST_NAME("PACKSSDW Gx, Ex");
+            nextop = F8;
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
+            GETGX_vector(q0, 1, VECTOR_SEW32);
+            GETEX_vector(q1, 0, 0, VECTOR_SEW32);
+            d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
+            d1 = fpu_get_scratch(dyn);
+            if (rv64_vlen >= 256) {
+                vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 2); // double the vl for slideup.
+                if (q0 == q1) {
+                    VMV_V_V(d0, q0);
+                    VSLIDEUP_VI(d0, 4, q1, VECTOR_UNMASKED); // splice q0 and q1 here!
+                } else {
+                    VSLIDEUP_VI(q0, 4, q1, VECTOR_UNMASKED); // splice q0 and q1 here!
+                    d0 = q0;
+                }
+            } else {
+                VMV_V_V(d0, q0);
+                VMV_V_V(d1, q1);
+            }
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
+            VNCLIP_WI(q0, 0, d0, VECTOR_UNMASKED);
+            break;
         case 0x6C:
             INST_NAME("PUNPCKLQDQ Gx, Ex");
             nextop = F8;
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c
index 3f0a04ad..9ff6b4ff 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.c
+++ b/src/dynarec/rv64/dynarec_rv64_functions.c
@@ -37,6 +37,16 @@ int fpu_get_scratch(dynarec_rv64_t* dyn)
 {
     return SCRATCH0 + dyn->e.fpu_scratch++; // return an Sx
 }
+
+// Get a FPU scratch reg aligned to LMUL
+int fpu_get_scratch_lmul(dynarec_rv64_t* dyn, int lmul)
+{
+    int reg = SCRATCH0 + dyn->e.fpu_scratch;
+    int skip = (1 << lmul) - (reg % (1 << lmul));
+    dyn->e.fpu_scratch += skip + 1;
+    return reg + skip;
+}
+
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_rv64_t* dyn)
 {
diff --git a/src/dynarec/rv64/dynarec_rv64_functions.h b/src/dynarec/rv64/dynarec_rv64_functions.h
index 04cf7c47..03a20925 100644
--- a/src/dynarec/rv64/dynarec_rv64_functions.h
+++ b/src/dynarec/rv64/dynarec_rv64_functions.h
@@ -11,6 +11,8 @@ typedef struct dynarec_rv64_s dynarec_rv64_t;
 
 // Get an FPU scratch reg
 int fpu_get_scratch(dynarec_rv64_t* dyn);
+// Get a FPU scratch reg aligned to LMUL
+int fpu_get_scratch_lmul(dynarec_rv64_t* dyn, int lmul);
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_rv64_t* dyn);
 // Get an x87 double reg
```
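The core of this patch is the new `fpu_get_scratch_lmul` helper, which replaces the old "skip v3" hack: instead of burning a scratch register by hand so the next allocation lands on a vector-register-group boundary, the allocator rounds the next scratch index up to the LMUL group itself, and call sites then pick up the second register of the LMUL2 group with a plain `fpu_get_scratch(dyn)` where they need it. The sketch below is only meant to replay that index arithmetic in isolation: the `SCRATCH0` value, the `fake_dyn_t` stand-in for the dynarec state, and the assumption that `VECTOR_LMUL2` encodes as `1` (which the `(1 << lmul)` arithmetic in the patch implies) are all hypothetical; only the two lines of index math come from the diff above.

```c
#include <stdio.h>

/* Stand-in for the dynarec state; only the scratch counter matters here.
   SCRATCH0 = 24 is a made-up base index, not the value box64 actually uses. */
#define SCRATCH0 24
typedef struct { int fpu_scratch; } fake_extcache_t;
typedef struct { fake_extcache_t e; } fake_dyn_t;

/* Same arithmetic as the patch: skip = distance to the next multiple of
   2^lmul (a full group when reg is already aligned), then reserve the
   skipped slots plus the returned register. */
static int fpu_get_scratch_lmul(fake_dyn_t* dyn, int lmul)
{
    int reg = SCRATCH0 + dyn->e.fpu_scratch;
    int skip = (1 << lmul) - (reg % (1 << lmul));
    dyn->e.fpu_scratch += skip + 1;
    return reg + skip;
}

int main(void)
{
    /* lmul == 1 stands in for VECTOR_LMUL2, i.e. groups of 2 registers. */
    for (int start = 0; start < 4; ++start) {
        fake_dyn_t dyn = { .e = { .fpu_scratch = start } };
        int reg = fpu_get_scratch_lmul(&dyn, 1);
        printf("counter %d -> returns register %d, counter now %d\n",
               start, reg, dyn.e.fpu_scratch);
    }
    return 0;
}
```

Running the sketch shows that the returned register index is always a multiple of 2, which is the group alignment the removed `// HACK: skip v3` comments were enforcing manually before widening operations such as `VWMUL_VV` and `VWADD(U)_VX` with an LMUL2 destination.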