diff options
| author | xctan <xctan@cirno.icu> | 2024-05-30 16:54:48 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-05-30 10:54:48 +0200 |
| commit | 98d9f36f9cc98343217d09473db5587de1fe05b6 (patch) | |
| tree | d2ecc9b296fef890e72b61a90428676f73d28589 | |
| parent | 22bc6872f4aac9964b8bc0eca0540cd4dfbef629 (diff) | |
| download | box64-98d9f36f9cc98343217d09473db5587de1fe05b6.tar.gz box64-98d9f36f9cc98343217d09473db5587de1fe05b6.zip | |
Added more MMX opcodes and some optimizations too (#1537)
* [RV64_DYNAREC] Added 0F DD PADDUSW opcode and optimized 66 0F DD PADDUSW opcode
* [RV64_DYNAREC] Added 0F 3A 0F PALIGNR opcode
* [RV64_DYNAREC] Optimized 66 0F 3A 0F PALIGNR opcode
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_0f.c | 49 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_660f.c | 83 |
2 files changed, 110 insertions, 22 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_0f.c b/src/dynarec/rv64/dynarec_rv64_0f.c index 8fb37279..46313b6d 100644 --- a/src/dynarec/rv64/dynarec_rv64_0f.c +++ b/src/dynarec/rv64/dynarec_rv64_0f.c @@ -564,6 +564,34 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0x3A: // more SSE3 opcodes opcode = F8; switch (opcode) { + case 0x0F: + INST_NAME("PALIGNR Gm, Em, Ib"); + nextop = F8; + GETGM(); + GETEM(x2, 1); + u8 = F8; + if (u8 > 15) { + SD(xZR, gback, gdoffset); + } else if (u8 > 7) { + if (u8 > 8) { + LD(x1, gback, gdoffset); + SRLI(x1, x1, (u8 - 8) * 8); + SD(x1, gback, gdoffset); + } + } else { + if (u8 > 0) { + LD(x3, wback, fixedaddress); + LD(x1, gback, gdoffset); + SRLI(x3, x3, u8 * 8); + SLLI(x1, x1, (8 - u8) * 8); + OR(x1, x1, x3); + SD(x1, gback, gdoffset); + } else { + LD(x1, wback, fixedaddress); + SD(x1, gback, gdoffset); + } + } + break; case 0xCC: INST_NAME("SHA1RNDS4 Gx, Ex, Ib"); nextop = F8; @@ -2036,6 +2064,27 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni SB(x3, gback, gdoffset + i); } break; + case 0xDD: + INST_NAME("PADDUSW Gm,Em"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + MOV32w(x5, 65535); + for (int i = 0; i < 4; ++i) { + // tmp32s = (int32_t)GX->uw[i] + EX->uw[i]; + // GX->uw[i] = (tmp32s>65535)?65535:tmp32s; + LHU(x3, gback, gdoffset + i * 2); + LHU(x4, wback, fixedaddress + i * 2); + ADDW(x3, x3, x4); + if (rv64_zbb) { + MINU(x3, x3, x5); + } else { + BGE(x5, x3, 8); // tmp32s <= 65535? 
+ MV(x3, x5); + } + SH(x3, gback, gdoffset + i * 2); + } + break; case 0xE2: INST_NAME("PSRAD Gm, Em"); nextop = F8; diff --git a/src/dynarec/rv64/dynarec_rv64_660f.c b/src/dynarec/rv64/dynarec_rv64_660f.c index bc1ab239..019c6cf6 100644 --- a/src/dynarec/rv64/dynarec_rv64_660f.c +++ b/src/dynarec/rv64/dynarec_rv64_660f.c @@ -266,7 +266,6 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int nextop = F8; GETGX(); GETEX(x2, 0); - sse_forget_reg(dyn, ninst, x5); ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); @@ -1170,28 +1169,64 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int GETGX(); GETEX(x2, 1); u8 = F8; - sse_forget_reg(dyn, ninst, x5); - ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); - // perserve gd - LD(x3, gback, gdoffset + 0); - LD(x4, gback, gdoffset + 8); - SD(x3, x5, 0); - SD(x4, x5, 8); if (u8 > 31) { SD(xZR, gback, gdoffset + 0); SD(xZR, gback, gdoffset + 8); + } else if (u8 > 23) { + LD(x5, gback, gdoffset + 8); + if (u8 > 24) { + SRLI(x5, x5, 8 * (u8 - 24)); + } + SD(x5, gback, gdoffset + 0); + SD(xZR, gback, gdoffset + 8); + } else if (u8 > 15) { + if (u8 > 16) { + LD(x5, gback, gdoffset + 8); + LD(x4, gback, gdoffset + 0); + SRLI(x3, x5, 8 * (u8 - 16)); // lower of higher 64 bits + SLLI(x5, x5, 8 * (24 - u8)); // higher of lower 64 bits + SD(x3, gback, gdoffset + 8); + SRLI(x4, x4, 8 * (u8 - 16)); // lower of lower 64 bits + OR(x4, x4, x5); // lower 64 bits + SD(x4, gback, gdoffset + 0); + } + } else if (u8 > 7) { + if (u8 > 8) { + LD(x5, gback, gdoffset + 8); + LD(x4, gback, gdoffset + 0); + LD(x3, wback, fixedaddress + 8); + SLLI(x5, x5, 8 * (16 - u8)); // higher of higher 64 bits + SRLI(x1, x4, 8 * (u8 - 8)); // lower of higher 64 bits + SLLI(x4, x4, 8 * (16 - u8)); // higher of lower 64 bits + OR(x5, x1, x5); // higher 64 bits + SRLI(x3, x3, 8 * (u8 - 8)); // lower of lower 64 bits + SD(x5, gback, gdoffset + 8); + OR(x4, x4, x3); // lower 64 bits + SD(x4, gback, gdoffset + 0); + } else 
{ + LD(x5, gback, gdoffset + 0); + LD(x4, wback, fixedaddress + 8); + SD(x5, gback, gdoffset + 8); + SD(x4, gback, gdoffset + 0); + } } else { - for (int i = 0; i < 16; ++i, ++u8) { - if (u8 > 15) { - if (u8 > 31) { - SB(xZR, gback, gdoffset + i); - continue; - } else - LBU(x3, x5, u8 - 16); - } else { - LBU(x3, wback, fixedaddress + u8); - } - SB(x3, gback, gdoffset + i); + if (u8 > 0) { + LD(x5, gback, gdoffset + 0); + LD(x4, wback, fixedaddress + 8); + LD(x3, wback, fixedaddress + 0); + SLLI(x5, x5, 8 * (8 - u8)); // higher of higher 64 bits + SRLI(x1, x4, 8 * (u8 - 0)); // lower of higher 64 bits + SLLI(x4, x4, 8 * (8 - u8)); // higher of lower 64 bits + OR(x5, x1, x5); // higher 64 bits + SRLI(x3, x3, 8 * (u8 - 0)); // lower of lower 64 bits + SD(x5, gback, gdoffset + 8); + OR(x4, x4, x3); // lower 64 bits + SD(x4, gback, gdoffset + 0); + } else { + LD(x5, wback, fixedaddress + 8); + LD(x4, wback, fixedaddress + 0); + SD(x5, gback, gdoffset + 8); + SD(x4, gback, gdoffset + 0); } } break; @@ -2776,15 +2811,19 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int nextop = F8; GETGX(); GETEX(x2, 0); + MOV32w(x5, 65535); for (int i = 0; i < 8; ++i) { // tmp32s = (int32_t)GX->uw[i] + EX->uw[i]; // GX->uw[i] = (tmp32s>65535)?65535:tmp32s; LHU(x3, gback, gdoffset + i * 2); LHU(x4, wback, fixedaddress + i * 2); ADDW(x3, x3, x4); - MOV32w(x4, 65536); - BLT(x3, x4, 8); - ADDIW(x3, x4, -1); + if (rv64_zbb) { + MINU(x3, x3, x5); + } else { + BGE(x5, x3, 8); // tmp32s <= 65535? + MV(x3, x5); + } SH(x3, gback, gdoffset + i * 2); } break; |