| author | Yang Liu <liuyang22@iscas.ac.cn> | 2025-05-15 15:00:50 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-05-15 09:00:50 +0200 |
| commit | 274949f9d0dd0c0c9e783430d16304d1ac471366 (patch) | |
| tree | 367525f84e63db42bc8c4b0e2908e535e3cc8fb0 /src | |
| parent | 4a70577e6f30da25decabb11e5f22ca2c22c50bd (diff) | |
| download | box64-274949f9d0dd0c0c9e783430d16304d1ac471366.tar.gz box64-274949f9d0dd0c0c9e783430d16304d1ac471366.zip | |
[RV64_DYNAREC] Added and optimized some SSE/MMX opcodes (#2635)
Diffstat (limited to 'src')
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_0f_vector.c | 28 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_660f_vector.c | 77 |
2 files changed, 60 insertions, 45 deletions
```diff
diff --git a/src/dynarec/rv64/dynarec_rv64_0f_vector.c b/src/dynarec/rv64/dynarec_rv64_0f_vector.c
index 20ce41aa..2d15e636 100644
--- a/src/dynarec/rv64/dynarec_rv64_0f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_0f_vector.c
@@ -1191,10 +1191,16 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             SET_ELEMENT_WIDTH(x1, u8, 1);
             SLTIU(x3, x4, i32);
             SUB(x3, xZR, x3);
-            NOT(x3, x3); // mask
             VSRL_VX(v0, v0, x4, VECTOR_UNMASKED);
-            VAND_VX(q0, v0, x3, VECTOR_UNMASKED);
-            VXOR_VV(v0, v0, q0, VECTOR_UNMASKED);
+            VAND_VX(v0, v0, x3, VECTOR_UNMASKED);
+            break;
+        case 0xD4:
+            INST_NAME("PADDQ Gm, Em");
+            nextop = F8;
+            GETGM_vector(q0);
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
+            GETEM_vector(q1, 0);
+            VADD_VV(q0, q1, q0, VECTOR_UNMASKED);
             break;
         case 0xD5:
             INST_NAME("PMULLW Gm, Em");
@@ -1393,7 +1399,6 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 i32 = 64;
             }
             nextop = F8;
-            q0 = fpu_get_scratch(dyn);
             GETGM_vector(v0);
             if (MODREG) {
                 v1 = mmx_get_reg_vector(dyn, ninst, x1, x2, x3, (nextop & 7));
@@ -1407,10 +1412,19 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             SET_ELEMENT_WIDTH(x1, u8, 1);
             SLTIU(x3, x4, i32);
             SUB(x3, xZR, x3);
-            NOT(x3, x3); // mask
             VSLL_VX(v0, v0, x4, VECTOR_UNMASKED);
-            VAND_VX(q0, v0, x3, VECTOR_UNMASKED);
-            VXOR_VV(v0, v0, q0, VECTOR_UNMASKED);
+            VAND_VX(v0, v0, x3, VECTOR_UNMASKED);
+            break;
+        case 0xF4:
+            INST_NAME("PMULUDQ Gx, Ex");
+            nextop = F8;
+            GETGM_vector(v0);
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
+            GETEM_vector(v1, 0);
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
+            d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
+            VWMULU_VV(d0, v0, v1, VECTOR_UNMASKED);
+            VMV_V_V(v0, d0);
             break;
         case 0xF5:
             INST_NAME("PMADDWD Gm, Em");
diff --git a/src/dynarec/rv64/dynarec_rv64_660f_vector.c b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
index 5baaa65d..42a8e449 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
@@ -647,23 +647,28 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 case 0x28:
                     INST_NAME("PMULDQ Gx, Ex");
                     nextop = F8;
-                    SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
-                    GETGX_vector(q0, 0, VECTOR_SEW64);
-                    GETEX_vector(q1, 0, 0, VECTOR_SEW64);
-                    d0 = fpu_get_scratch(dyn);
-                    d1 = fpu_get_scratch(dyn);
-                    // make sure the alignments before vnsrl...
-                    // v0 cannot be the same with q0 in xtheadvector for the lack of tail undisturbed
-                    v0 = ((q0 & 1) || rv64_xtheadvector) ? fpu_get_scratch_lmul(dyn, VECTOR_LMUL2) : q0;
-                    v1 = (q1 & 1) ? fpu_get_scratch_lmul(dyn, VECTOR_LMUL2) : q1;
-                    if (v0 != q0) VMV_V_V(v0, q0);
-                    if (v1 != q1) VMV_V_V(v1, q1);
-                    vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL1, 0.5);
-                    VNSRL_WX(d0, v0, xZR, VECTOR_UNMASKED);
-                    VNSRL_WX(d1, v1, xZR, VECTOR_UNMASKED);
-                    VWMUL_VV(v0, d1, d0, VECTOR_UNMASKED);
-                    vector_vsetvli(dyn, ninst, x1, VECTOR_SEW64, VECTOR_LMUL1, 1);
-                    if (v0 != q0) VMV_V_V(q0, v0);
+                    if (rv64_vlen >= 256) {
+                        SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
+                        GETGX_vector(v0, 1, VECTOR_SEW64);
+                        GETEX_vector(v1, 0, 0, VECTOR_SEW64);
+                        d1 = fpu_get_scratch(dyn);
+                        ADDI(x2, xZR, 32);
+                        VSLL_VX(v0, v0, x2, VECTOR_UNMASKED);
+                        VSRA_VX(v0, v0, x2, VECTOR_UNMASKED);
+                        VSLL_VX(d1, v1, x2, VECTOR_UNMASKED);
+                        VSRA_VX(d1, d1, x2, VECTOR_UNMASKED);
+                        VMUL_VV(v0, v0, d1, VECTOR_UNMASKED);
+                    } else {
+                        SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
+                        GETGX_vector(v0, 1, VECTOR_SEW32);
+                        GETEX_vector(v1, 0, 0, VECTOR_SEW32);
+                        d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
+                        d1 = fpu_get_scratch(dyn);
+                        VWMUL_VV(d0, v0, v1, VECTOR_UNMASKED);
+                        SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
+                        VSLIDEUP_VI(d0, d1, 1, VECTOR_UNMASKED);
+                        VMV_V_V(v0, d0);
+                    }
                     break;
                 case 0x29:
                     INST_NAME("PCMPEQQ Gx, Ex"); // SSE4 opcode!
@@ -1951,13 +1956,10 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 LD(x4, wback, fixedaddress);
             }
             SET_ELEMENT_WIDTH(x1, u8, 1);
-            v0 = fpu_get_scratch(dyn);
             SLTIU(x3, x4, i32);
             SUB(x3, xZR, x3);
-            NOT(x3, x3); // mask
             VSRL_VX(q0, q0, x4, VECTOR_UNMASKED);
-            VAND_VX(v0, q0, x3, VECTOR_UNMASKED);
-            VXOR_VV(q0, q0, v0, VECTOR_UNMASKED);
+            VAND_VX(q0, q0, x3, VECTOR_UNMASKED);
             break;
         case 0xD4:
             INST_NAME("PADDQ Gx, Ex");
@@ -2307,33 +2309,32 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 LD(x4, wback, fixedaddress);
             }
             SET_ELEMENT_WIDTH(x1, u8, 1);
-            v0 = fpu_get_scratch(dyn);
             SLTIU(x3, x4, i32);
             SUB(x3, xZR, x3);
-            NOT(x3, x3); // mask
             VSLL_VX(q0, q0, x4, VECTOR_UNMASKED);
-            VAND_VX(v0, q0, x3, VECTOR_UNMASKED);
-            VXOR_VV(q0, q0, v0, VECTOR_UNMASKED);
+            VAND_VX(q0, q0, x3, VECTOR_UNMASKED);
             break;
         case 0xF4:
             INST_NAME("PMULUDQ Gx, Ex");
             nextop = F8;
-            SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
-            GETGX_vector(v0, 1, VECTOR_SEW32);
-            GETEX_vector(v1, 0, 0, VECTOR_SEW32);
-            d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-            d1 = fpu_get_scratch(dyn);
-            VWMULU_VV(d0, v0, v1, VECTOR_UNMASKED);
             if (rv64_vlen >= 256) {
-                VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
-                q0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-                vector_vsetvli(dyn, ninst, x1, VECTOR_SEW64, VECTOR_LMUL2, 2);
-                VSLIDE1DOWN_VX((v0 & 1) ? q0 : v0, d0, xZR, VECTOR_UNMASKED);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
-                if (v0 & 1) VMV_V_V(v0, q0);
-                VMV_X_S(x4, d0);
-                VMV_S_X(v0, x4);
+                GETGX_vector(v0, 1, VECTOR_SEW64);
+                GETEX_vector(v1, 0, 0, VECTOR_SEW64);
+                d1 = fpu_get_scratch(dyn);
+                ADDI(x2, xZR, 32);
+                VSLL_VX(v0, v0, x2, VECTOR_UNMASKED);
+                VSRL_VX(v0, v0, x2, VECTOR_UNMASKED);
+                VSLL_VX(d1, v1, x2, VECTOR_UNMASKED);
+                VSRL_VX(d1, d1, x2, VECTOR_UNMASKED);
+                VMUL_VV(v0, v0, d1, VECTOR_UNMASKED);
             } else {
+                SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
+                GETGX_vector(v0, 1, VECTOR_SEW32);
+                GETEX_vector(v1, 0, 0, VECTOR_SEW32);
+                d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
+                d1 = fpu_get_scratch(dyn);
+                VWMULU_VV(d0, v0, v1, VECTOR_UNMASKED);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                 VSLIDEUP_VI(d0, d1, 1, VECTOR_UNMASKED);
                 VMV_V_V(v0, d0);
```
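Background on the PSRLQ/PSLLQ change: `SLTIU` followed by `SUB` from `xZR` already produces an all-ones mask when the shift count is in range and zero otherwise, so the previous `NOT` + `VAND` + `VXOR` sequence (and its scratch register) collapses to a single `VAND` against the mask. This preserves x86 semantics, where counts ≥ 64 must zero the destination even though RVV shifts only use the low bits of the count. A minimal scalar sketch of the same branchless idea (`psrlq_model` is a hypothetical name, not box64 code):

```c
#include <stdint.h>
#include <stdio.h>

// Scalar model of the emitted RVV sequence for x86 PSRLQ semantics:
// x86 zeroes the destination for shift counts >= 64, while RVV's
// vsrl.vx only uses the low 6 bits of the count, hence the mask.
static uint64_t psrlq_model(uint64_t v, uint64_t count) {
    uint64_t mask = -(uint64_t)(count < 64); // SLTIU + SUB: all-ones if in range, else 0
    return (v >> (count & 63)) & mask;       // VSRL_VX + VAND_VX, no NOT/VXOR needed
}

int main(void) {
    printf("%016llx\n", (unsigned long long)psrlq_model(0x8000000000000001ull, 1));  // 4000000000000000
    printf("%016llx\n", (unsigned long long)psrlq_model(0x8000000000000001ull, 64)); // 0000000000000000
    return 0;
}
```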
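Similarly, on builds where `rv64_vlen >= 256`, PMULDQ and PMULUDQ now stay at SEW64: each lane's low 32 bits are sign- or zero-extended in place with a shift-left-by-32 followed by `VSRA` (signed) or `VSRL` (unsigned), then multiplied with one plain `VMUL_VV`, replacing the earlier narrow/widen register shuffling. A scalar sketch of one lane under those assumptions (hypothetical helper names; the cast-then-shift relies on arithmetic right shift of signed values, as on mainstream compilers):

```c
#include <stdint.h>
#include <stdio.h>

// One 64-bit lane of PMULDQ: multiply the sign-extended low 32 bits.
// The shift pair mirrors VSLL_VX 32 + VSRA_VX 32.
static int64_t pmuldq_lane(uint64_t a, uint64_t b) {
    int64_t sa = (int64_t)(a << 32) >> 32;
    int64_t sb = (int64_t)(b << 32) >> 32;
    return sa * sb; // VMUL_VV at SEW64
}

// One 64-bit lane of PMULUDQ: same, but zero-extended (VSLL_VX + VSRL_VX).
static uint64_t pmuludq_lane(uint64_t a, uint64_t b) {
    return ((a << 32) >> 32) * ((b << 32) >> 32);
}

int main(void) {
    // low halves are -1 (signed) and 0xFFFFFFFF (unsigned)
    printf("%lld\n", (long long)pmuldq_lane(0xdeadbeefffffffffull, 0xffffffffull));      // 1
    printf("%016llx\n", (unsigned long long)pmuludq_lane(0xffffffffull, 0xffffffffull)); // fffffffe00000001
    return 0;
}
```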