diff options
| author | xctan <xctan@cirno.icu> | 2024-05-31 20:09:10 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-05-31 14:09:10 +0200 |
| commit | 6749c2d1a6d1762b3a2ad87c0bc70a65359d1c08 (patch) | |
| tree | ebfbb4181819a0fe32e8540b23b17f0015d5958d /src | |
| parent | 7eb24ff51407562fe29fc1259cd3a8ce47f143e8 (diff) | |
| download | box64-6749c2d1a6d1762b3a2ad87c0bc70a65359d1c08.tar.gz box64-6749c2d1a6d1762b3a2ad87c0bc70a65359d1c08.zip | |
[RV64_DYNAREC] Added more MMX opcodes and some optimizations too (#1542)
* [RV64_DYNAREC] Added 0F 38 06 PHSUBD opcode
* [RV64_DYNAREC] Added 0F 38 07 PHSUBSW opcode
* [RV64_DYNAREC] Added 0F 38 05 PHSUBW opcode
* [RV64_DYNAREC] Added 0F C4 PINSRW opcode
* [RV64_DYNAREC] Added 0F 38 04 PMADDUBSW opcode
* [RV64_DYNAREC] Added 0F EE PMAXSW opcode
* [RV64_DYNAREC] Optimized SSE packed min/max
* [RV64_DYNAREC] Added 0F DE PMAXUB opcode
* [RV64_DYNAREC] Added 0F EA PMINSW opcode
* [RV64_DYNAREC] Added 0F DA PMINUB opcode
* [RV64_DYNAREC] Optimized 0F D9 PSUBUSW opcode
* [RV64_DYNAREC] Added 0F D7 PMOVMSKB opcode
* [RV64_DYNAREC] Optimized (66) 0F D7 PMOVMSKB opcode
* [RV64_DYNAREC] Switched to the simpler implementation for PMOVMSKB
Diffstat (limited to 'src')
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_0f.c | 2 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_0f.c | 240 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_660f.c | 78 |
3 files changed, 305 insertions, 15 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c index 8773b054..596c9fd1 100644 --- a/src/dynarec/arm64/dynarec_arm64_0f.c +++ b/src/dynarec/arm64/dynarec_arm64_0f.c @@ -2642,7 +2642,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin SQADD_16(d0, d0, d1); break; case 0xEE: - INST_NAME("PMAXSW Gx,Ex"); + INST_NAME("PMAXSW Gm,Em"); nextop = F8; GETGM(v0); GETEM(q0, 0); diff --git a/src/dynarec/rv64/dynarec_rv64_0f.c b/src/dynarec/rv64/dynarec_rv64_0f.c index a7f96a32..0b63c070 100644 --- a/src/dynarec/rv64/dynarec_rv64_0f.c +++ b/src/dynarec/rv64/dynarec_rv64_0f.c @@ -541,6 +541,130 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni } } break; + case 0x04: + INST_NAME("PMADDUBSW Gm,Em"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + MOV64x(x5, 32767); + MOV64x(x6, -32768); + for (int i = 0; i < 4; ++i) { + LBU(x3, gback, gdoffset + i * 2); + LB(x4, wback, fixedaddress + i * 2); + MUL(x9, x3, x4); + LBU(x3, gback, gdoffset + i * 2 + 1); + LB(x4, wback, fixedaddress + i * 2 + 1); + MUL(x3, x3, x4); + ADD(x3, x3, x9); + if (rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, x6); + } else { + BLT(x3, x5, 4 + 4); + MV(x3, x5); + BLT(x6, x3, 4 + 4); + MV(x3, x6); + } + SH(x3, gback, gdoffset + i * 2); + } + break; + case 0x05: + INST_NAME("PHSUBW Gm,Em"); + nextop = F8; + GETGM(); + for (int i = 0; i < 2; ++i) { + // tmp32s = GX->sw[i*2+0]+GX->sw[i*2+1]; + // GX->sw[i] = sat(tmp32s); + LH(x3, gback, gdoffset + 2 * (i * 2 + 0)); + LH(x4, gback, gdoffset + 2 * (i * 2 + 1)); + SUBW(x3, x3, x4); + SH(x3, gback, gdoffset + i * 2); + } + if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { + // GM->d[1] = GM->d[0]; + LW(x3, gback, gdoffset + 0); + SW(x3, gback, gdoffset + 4); + } else { + GETEM(x2, 0); + for (int i = 0; i < 2; ++i) { + // tmp32s = EX->sw[i*2+0] + EX->sw[i*2+1]; + // GX->sw[4+i] = sat(tmp32s); + LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); + LH(x4, wback, 
fixedaddress + 2 * (i * 2 + 1)); + SUBW(x3, x3, x4); + SH(x3, gback, gdoffset + 2 * (2 + i)); + } + } + break; + case 0x06: + INST_NAME("PHSUBD Gm,Em"); + nextop = F8; + GETGM(); + // GM->sd[0] += GM->sd[1]; + LW(x3, gback, gdoffset + 0 * 4); + LW(x4, gback, gdoffset + 1 * 4); + SUBW(x3, x3, x4); + SW(x3, gback, gdoffset + 0 * 4); + if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { + // GM->sd[1] = GM->sd[0]; + SW(x3, gback, gdoffset + 1 * 4); + } else { + GETEM(x2, 0); + // GM->sd[1] = EM->sd[0] + EM->sd[1]; + LW(x3, wback, fixedaddress + 0 * 4); + LW(x4, wback, fixedaddress + 1 * 4); + SUBW(x3, x3, x4); + SW(x3, gback, gdoffset + 1 * 4); + } + break; + case 0x07: + INST_NAME("PHSUBSW Gm,Em"); + nextop = F8; + GETGM(); + MOV64x(x5, 32767); + MOV64x(x6, -32768); + for (int i = 0; i < 2; ++i) { + // tmp32s = GX->sw[i*2+0]+GX->sw[i*2+1]; + // GX->sw[i] = sat(tmp32s); + LH(x3, gback, gdoffset + 2 * (i * 2 + 0)); + LH(x4, gback, gdoffset + 2 * (i * 2 + 1)); + SUBW(x3, x3, x4); + if (rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, x6); + } else { + BLT(x3, x5, 4 + 4); + MV(x3, x5); + BLT(x6, x3, 4 + 4); + MV(x3, x6); + } + SH(x3, gback, gdoffset + i * 2); + } + if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { + // GM->d[1] = GM->d[0]; + LW(x3, gback, gdoffset + 0); + SW(x3, gback, gdoffset + 4); + } else { + GETEM(x2, 0); + for (int i = 0; i < 2; ++i) { + // tmp32s = EX->sw[i*2+0] + EX->sw[i*2+1]; + // GX->sw[4+i] = sat(tmp32s); + LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); + LH(x4, wback, fixedaddress + 2 * (i * 2 + 1)); + SUBW(x3, x3, x4); + if (rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, x6); + } else { + BLT(x3, x5, 4 + 4); + MV(x3, x5); + BLT(x6, x3, 4 + 4); + MV(x3, x6); + } + SH(x3, gback, gdoffset + 2 * (2 + i)); + } + } + break; case 0x1C: INST_NAME("PABSB Gm,Em"); nextop = F8; @@ -2122,6 +2246,14 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni SDxw(gd, ed, fixedaddress); } break; + case 0xC4: + INST_NAME("PINSRW 
Gm,Ed,Ib"); + nextop = F8; + GETED(1); + GETGM(); + u8 = (F8)&3; + SH(ed, gback, gdoffset + u8 * 2); + break; case 0xC5: INST_NAME("PEXTRW Gd,Em,Ib"); nextop = F8; @@ -2182,12 +2314,70 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni GETEM(x2, 0); MMX_LOOP_WS(x3, x4, MULW(x3, x3, x4)); break; + case 0xD7: + INST_NAME("PMOVMSKB Gd, Em"); + nextop = F8; + GETGD; + GETEM(x2, 0); + LD(x1, wback, fixedaddress + 0); + for (int i = 0; i < 8; i++) { + if (rv64_zbs) { + if (i == 0) { + BEXTI(gd, x1, 63); + } else { + BEXTI(x6, x1, 63 - i * 8); + } + } else { + if (i == 0) { + SRLI(gd, x1, 63); + } else { + SRLI(x6, x1, 63 - i * 8); + ANDI(x6, x6, 1); + } + } + if (i != 0) { + if (rv64_zba) { + SH1ADD(gd, gd, x6); + } else { + SLLI(gd, gd, 1); + OR(gd, gd, x6); + } + } + } + break; case 0xD9: INST_NAME("PSUBUSW Gm, Em"); nextop = F8; GETGM(); GETEM(x2, 0); - MMX_LOOP_W(x3, x4, SUB(x3, x3, x4); SLT(x4, xZR, x3); NEG(x4, x4); AND(x3, x3, x4)); + MMX_LOOP_W(x3, x4, + SUB(x3, x3, x4); + if (rv64_zbb) { + MAX(x3, x3, xZR); + } else { + NOT(x4, x3); + SRAI(x4, x4, 63); + AND(x3, x3, x4); + } + SH(x3, gback, gdoffset + i * 2); + ); + break; + case 0xDA: + INST_NAME("PMINUB Gm, Em"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + for (int i = 0; i < 8; ++i) { + LBU(x3, gback, gdoffset + i); + LBU(x4, wback, fixedaddress + i); + if (rv64_zbb) { + MINU(x3, x3, x4); + } else { + BLTU(x3, x4, 8); + MV(x3, x4); + } + SB(x3, gback, gdoffset + i); + } break; case 0xDB: INST_NAME("PAND Gm, Em"); @@ -2239,6 +2429,23 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni SH(x3, gback, gdoffset + i * 2); } break; + case 0xDE: + INST_NAME("PMAXUB Gm, Em"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + for (int i = 0; i < 8; ++i) { + LBU(x3, gback, gdoffset + i); + LBU(x4, wback, fixedaddress + i); + if (rv64_zbb) { + MAXU(x3, x3, x4); + } else { + BLTU(x4, x3, 8); + MV(x3, x4); + } + SB(x3, gback, gdoffset + i); + } + break; case 0xDF: 
INST_NAME("PANDN Gm, Em"); nextop = F8; @@ -2347,6 +2554,23 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni SH(x3, gback, gdoffset + 2 * i); } break; + case 0xEA: + INST_NAME("PMINSW Gx,Ex"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + for (int i = 0; i < 4; ++i) { + LH(x3, gback, gdoffset + 2 * i); + LH(x4, wback, fixedaddress + 2 * i); + if (rv64_zbb) { + MIN(x3, x3, x4); + } else { + BLT(x3, x4, 8); + MV(x3, x4); + } + SH(x3, gback, gdoffset + 2 * i); + } + break; case 0xEB: INST_NAME("POR Gm, Em"); nextop = F8; @@ -2407,6 +2631,20 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni SH(x3, gback, gdoffset + 2 * i); } break; + case 0xEE: + INST_NAME("PMAXSW Gm,Em"); + nextop = F8; + GETGM(); + GETEM(x1, 0); + MMX_LOOP_WS(x3, x4, + if (rv64_zbb) { + MAX(x3, x3, x4); + } else { + BGE(x3, x4, 8); + MV(x3, x4); + } + ); + break; case 0xEF: INST_NAME("PXOR Gm,Em"); nextop = F8; diff --git a/src/dynarec/rv64/dynarec_rv64_660f.c b/src/dynarec/rv64/dynarec_rv64_660f.c index cb6831ee..c6d3c62c 100644 --- a/src/dynarec/rv64/dynarec_rv64_660f.c +++ b/src/dynarec/rv64/dynarec_rv64_660f.c @@ -2755,12 +2755,45 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int nextop = F8; GETEX(x2, 0); GETGD; - MV(gd, xZR); - for (int i = 0; i < 16; ++i) { - LB(x1, wback, fixedaddress + i); - SLT(x3, x1, xZR); - if (i > 0) SLLI(x3, x3, i); - OR(gd, gd, x3); + LD(x1, wback, fixedaddress + 8); // high part + LD(x2, wback, fixedaddress + 0); // low part, also destroyed wback(x2) + for (int i = 0; i < 8; i++) { + if (rv64_zbs) { + if (i == 0) { + BEXTI(gd, x1, 63); + } else { + BEXTI(x6, x1, 63 - i * 8); + } + } else { + if (i == 0) { + SRLI(gd, x1, 63); + } else { + SRLI(x6, x1, 63 - i * 8); + ANDI(x6, x6, 1); + } + } + if (i != 0) { + if (rv64_zba) { + SH1ADD(gd, gd, x6); + } else { + SLLI(gd, gd, 1); + OR(gd, gd, x6); + } + } + } + for (int i = 0; i < 8; i++) { + if (rv64_zbs) { + BEXTI(x6, x2, 
63 - i * 8); + } else { + SRLI(x6, x2, 63 - i * 8); + ANDI(x6, x6, 1); + } + if (rv64_zba) { + SH1ADD(gd, gd, x6); + } else { + SLLI(gd, gd, 1); + OR(gd, gd, x6); + } } break; case 0xD8: @@ -2807,8 +2840,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int for (int i = 0; i < 16; ++i) { LBU(x3, gback, gdoffset + i); LBU(x4, wback, fixedaddress + i); - BLTU(x3, x4, 8); - MV(x3, x4); + if (rv64_zbb) { + MINU(x3, x3, x4); + } else { + BLTU(x3, x4, 8); + MV(x3, x4); + } SB(x3, gback, gdoffset + i); } break; @@ -2867,8 +2904,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int for (int i = 0; i < 16; ++i) { LBU(x3, gback, gdoffset + i); LBU(x4, wback, fixedaddress + i); - BLTU(x4, x3, 8); - MV(x3, x4); + if (rv64_zbb) { + MAXU(x3, x3, x4); + } else { + BLTU(x4, x3, 8); + MV(x3, x4); + } SB(x3, gback, gdoffset + i); } break; @@ -3061,8 +3102,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int for (int i = 0; i < 8; ++i) { LH(x3, gback, gdoffset + 2 * i); LH(x4, wback, fixedaddress + 2 * i); - BLT(x3, x4, 8); - MV(x3, x4); + if (rv64_zbb) { + MIN(x3, x3, x4); + } else { + BLT(x3, x4, 8); + MV(x3, x4); + } SH(x3, gback, gdoffset + 2 * i); } break; @@ -3128,7 +3173,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int nextop = F8; GETGX(); GETEX(x2, 0); - SSE_LOOP_WS(x3, x4, BGE(x3, x4, 8); MV(x3, x4)); + SSE_LOOP_WS(x3, x4, + if (rv64_zbb) { + MAX(x3, x3, x4); + } else { + BGE(x3, x4, 8); + MV(x3, x4); + } + ); break; case 0xEF: INST_NAME("PXOR Gx, Ex"); |