diff options
| author | Yang Liu <liuyang22@iscas.ac.cn> | 2024-10-11 18:25:13 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-10-11 12:25:13 +0200 |
| commit | 143fc90a4c40d93553ce42e23b696628095363b9 (patch) | |
| tree | 7d2098908793537eadcb5ddb51341de86f3c7395 /src | |
| parent | f65feaf9dabed3f2802864b5a8a8464cb7dc6b88 (diff) | |
| download | box64-143fc90a4c40d93553ce42e23b696628095363b9.tar.gz box64-143fc90a4c40d93553ce42e23b696628095363b9.zip | |
[RV64_DYNAREC] Split 660f.c to speedup compilation a bit (#1924)
Diffstat (limited to 'src')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_0f.c | 118 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_660f.c | 1250 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_660f38.c | 1266 |
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.h | 2 |
4 files changed, 1348 insertions, 1288 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_0f.c b/src/dynarec/rv64/dynarec_rv64_0f.c index 4529763b..fd210e2a 100644 --- a/src/dynarec/rv64/dynarec_rv64_0f.c +++ b/src/dynarec/rv64/dynarec_rv64_0f.c @@ -1652,69 +1652,67 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni LD(x3, gback, gdoffset + 0); SD(x3, wback, fixedaddress); break; -#define GO(GETFLAGS, NO, YES, F) \ - READFLAGS(F); \ - i32_ = F32S; \ - if(rex.is32bits) \ - j64 = (uint32_t)(addr+i32_); \ - else \ - j64 = addr+i32_; \ - BARRIER(BARRIER_MAYBE); \ - JUMP(j64, 1); \ - GETFLAGS; \ - if (dyn->insts[ninst].x64.jmp_insts == -1 || CHECK_CACHE()) { \ - /* out of the block */ \ - i32 = dyn->insts[ninst].epilog - (dyn->native_size); \ - B##NO##_safe(x1, i32); \ - if (dyn->insts[ninst].x64.jmp_insts == -1) { \ - if (!(dyn->insts[ninst].x64.barrier & BARRIER_FLOAT)) \ - fpu_purgecache(dyn, ninst, 1, x1, x2, x3); \ - jump_to_next(dyn, j64, 0, ninst, rex.is32bits); \ - } else { \ - CacheTransform(dyn, ninst, cacheupd, x1, x2, x3); \ - i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address - (dyn->native_size); \ - B(i32); \ - } \ - } else { \ - /* inside the block */ \ - i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address - (dyn->native_size); \ - B##YES##_safe(x1, i32); \ - } - + #define GO(GETFLAGS, NO, YES, F) \ + READFLAGS(F); \ + i32_ = F32S; \ + if(rex.is32bits) \ + j64 = (uint32_t)(addr+i32_); \ + else \ + j64 = addr+i32_; \ + BARRIER(BARRIER_MAYBE); \ + JUMP(j64, 1); \ + GETFLAGS; \ + if (dyn->insts[ninst].x64.jmp_insts == -1 || CHECK_CACHE()) { \ + /* out of the block */ \ + i32 = dyn->insts[ninst].epilog - (dyn->native_size); \ + B##NO##_safe(x1, i32); \ + if (dyn->insts[ninst].x64.jmp_insts == -1) { \ + if (!(dyn->insts[ninst].x64.barrier & BARRIER_FLOAT)) \ + fpu_purgecache(dyn, ninst, 1, x1, x2, x3); \ + jump_to_next(dyn, j64, 0, ninst, rex.is32bits); \ + } else { \ + CacheTransform(dyn, ninst, cacheupd, x1, x2, x3); \ + i32 = 
dyn->insts[dyn->insts[ninst].x64.jmp_insts].address - (dyn->native_size); \ + B(i32); \ + } \ + } else { \ + /* inside the block */ \ + i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address - (dyn->native_size); \ + B##YES##_safe(x1, i32); \ + } GOCOND(0x80, "J", "Id"); -#undef GO - -#define GO(GETFLAGS, NO, YES, F) \ - READFLAGS(F); \ - GETFLAGS; \ - nextop = F8; \ - S##YES(x3, x1); \ - if (MODREG) { \ - if (rex.rex) { \ - eb1 = xRAX + (nextop & 7) + (rex.b << 3); \ - eb2 = 0; \ - } else { \ - ed = (nextop & 7); \ - eb2 = (ed >> 2) * 8; \ - eb1 = xRAX + (ed & 3); \ - } \ - if (eb2) { \ - LUI(x1, 0xffff0); \ - ORI(x1, x1, 0xff); \ - AND(eb1, eb1, x1); \ - SLLI(x3, x3, 8); \ - } else { \ - ANDI(eb1, eb1, 0xf00); \ - } \ - OR(eb1, eb1, x3); \ - } else { \ - addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); \ - SB(x3, ed, fixedaddress); \ - SMWRITE(); \ - } + #undef GO + #define GO(GETFLAGS, NO, YES, F) \ + READFLAGS(F); \ + GETFLAGS; \ + nextop = F8; \ + S##YES(x3, x1); \ + if (MODREG) { \ + if (rex.rex) { \ + eb1 = xRAX + (nextop & 7) + (rex.b << 3); \ + eb2 = 0; \ + } else { \ + ed = (nextop & 7); \ + eb2 = (ed >> 2) * 8; \ + eb1 = xRAX + (ed & 3); \ + } \ + if (eb2) { \ + LUI(x1, 0xffff0); \ + ORI(x1, x1, 0xff); \ + AND(eb1, eb1, x1); \ + SLLI(x3, x3, 8); \ + } else { \ + ANDI(eb1, eb1, 0xf00); \ + } \ + OR(eb1, eb1, x3); \ + } else { \ + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); \ + SB(x3, ed, fixedaddress); \ + SMWRITE(); \ + } GOCOND(0x90, "SET", "Eb"); -#undef GO + #undef GO case 0xA2: INST_NAME("CPUID"); diff --git a/src/dynarec/rv64/dynarec_rv64_660f.c b/src/dynarec/rv64/dynarec_rv64_660f.c index 0236012f..e6a4d524 100644 --- a/src/dynarec/rv64/dynarec_rv64_660f.c +++ b/src/dynarec/rv64/dynarec_rv64_660f.c @@ -259,1237 +259,31 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int } break; case 0x38: // SSSE3 opcodes - nextop = F8; - switch (nextop) 
{ - case 0x00: - INST_NAME("PSHUFB Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 15); - - ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); - - // preserve gd - LD(x3, gback, gdoffset + 0); - LD(x4, gback, gdoffset + 8); - SD(x3, x5, 0); - SD(x4, x5, 8); - - for (int i = 0; i < 16; ++i) { - LBU(x3, wback, fixedaddress + i); - ANDI(x4, x3, 128); - BEQZ(x4, 12); - SB(xZR, gback, gdoffset + i); - BEQZ(xZR, 20); // continue - ANDI(x4, x3, 15); - ADD(x4, x4, x5); - LBU(x4, x4, 0); - SB(x4, gback, gdoffset + i); - } - break; - case 0x01: - INST_NAME("PHADDW Gx, Ex"); - nextop = F8; - GETGX(); - for (int i = 0; i < 4; ++i) { - // GX->sw[i] = GX->sw[i*2+0]+GX->sw[i*2+1]; - LH(x3, gback, gdoffset + 2 * (i * 2 + 0)); - LH(x4, gback, gdoffset + 2 * (i * 2 + 1)); - ADDW(x3, x3, x4); - SH(x3, gback, gdoffset + 2 * i); - } - if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { - // GX->q[1] = GX->q[0]; - LD(x3, gback, gdoffset + 0); - SD(x3, gback, gdoffset + 8); - } else { - GETEX(x2, 0, 14); - for (int i = 0; i < 4; ++i) { - // GX->sw[4+i] = EX->sw[i*2+0] + EX->sw[i*2+1]; - LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); - LH(x4, wback, fixedaddress + 2 * (i * 2 + 1)); - ADDW(x3, x3, x4); - SH(x3, gback, gdoffset + 2 * (4 + i)); - } - } - break; - case 0x02: - INST_NAME("PHADDD Gx, Ex"); - nextop = F8; - GETGX(); - // GX->sd[0] += GX->sd[1]; - LW(x3, gback, gdoffset + 0 * 4); - LW(x4, gback, gdoffset + 1 * 4); - ADDW(x3, x3, x4); - SW(x3, gback, gdoffset + 0 * 4); - // GX->sd[1] = GX->sd[2] + GX->sd[3]; - LW(x3, gback, gdoffset + 2 * 4); - LW(x4, gback, gdoffset + 3 * 4); - ADDW(x3, x3, x4); - SW(x3, gback, gdoffset + 1 * 4); - if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { - // GX->q[1] = GX->q[0]; - LD(x3, gback, gdoffset + 0); - SD(x3, gback, gdoffset + 8); - } else { - GETEX(x2, 0, 12); - // GX->sd[2] = EX->sd[0] + EX->sd[1]; - LW(x3, wback, fixedaddress + 0 * 4); - LW(x4, wback, fixedaddress + 1 * 4); - ADDW(x3, x3, x4); - SW(x3, gback, gdoffset + 2 * 4); - // GX->sd[3] 
= EX->sd[2] + EX->sd[3]; - LW(x3, wback, fixedaddress + 2 * 4); - LW(x4, wback, fixedaddress + 3 * 4); - ADDW(x3, x3, x4); - SW(x3, gback, gdoffset + 3 * 4); - } - break; - case 0x03: - INST_NAME("PHADDSW Gx, Ex"); - nextop = F8; - GETGX(); - MOV64x(x5, 32767); - MOV64x(x6, -32768); - for (int i = 0; i < 4; ++i) { - // tmp32s = GX->sw[i*2+0]+GX->sw[i*2+1]; - // GX->sw[i] = sat(tmp32s); - LH(x3, gback, gdoffset + 2 * (i * 2 + 0)); - LH(x4, gback, gdoffset + 2 * (i * 2 + 1)); - ADDW(x3, x3, x4); - if (rv64_zbb) { - MIN(x3, x3, x5); - MAX(x3, x3, x6); - } else { - BLT(x3, x5, 4 + 4); - MV(x3, x5); - BLT(x6, x3, 4 + 4); - MV(x3, x6); - } - SH(x3, gback, gdoffset + i * 2); - } - if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { - // GX->q[1] = GX->q[0]; - LD(x3, gback, gdoffset + 0); - SD(x3, gback, gdoffset + 8); - } else { - GETEX(x2, 0, 14); - for (int i = 0; i < 4; ++i) { - // tmp32s = EX->sw[i*2+0] + EX->sw[i*2+1]; - // GX->sw[4+i] = sat(tmp32s); - LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); - LH(x4, wback, fixedaddress + 2 * (i * 2 + 1)); - ADDW(x3, x3, x4); - if (rv64_zbb) { - MIN(x3, x3, x5); - MAX(x3, x3, x6); - } else { - BLT(x3, x5, 4 + 4); - MV(x3, x5); - BLT(x6, x3, 4 + 4); - MV(x3, x6); - } - SH(x3, gback, gdoffset + 2 * (4 + i)); - } - } - break; - case 0x04: - INST_NAME("PMADDUBSW Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 15); - MOV64x(x5, 32767); - MOV64x(x6, -32768); - for (int i = 0; i < 8; ++i) { - LBU(x3, gback, gdoffset + i * 2); - LB(x4, wback, fixedaddress + i * 2); - MUL(x9, x3, x4); - LBU(x3, gback, gdoffset + i * 2 + 1); - LB(x4, wback, fixedaddress + i * 2 + 1); - MUL(x3, x3, x4); - ADD(x3, x3, x9); - if (rv64_zbb) { - MIN(x3, x3, x5); - MAX(x3, x3, x6); - } else { - BLT(x3, x5, 4 + 4); - MV(x3, x5); - BLT(x6, x3, 4 + 4); - MV(x3, x6); - } - SH(x3, gback, gdoffset + i * 2); - } - break; - case 0x05: - INST_NAME("PHSUBW Gx, Ex"); - nextop = F8; - GETGX(); - for (int i = 0; i < 4; ++i) { - // GX->sw[i] = GX->sw[i*2+0] - 
GX->sw[i*2+1]; - LH(x3, gback, gdoffset + 2 * (i * 2 + 0)); - LH(x4, gback, gdoffset + 2 * (i * 2 + 1)); - SUBW(x3, x3, x4); - SH(x3, gback, gdoffset + 2 * i); - } - if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { - // GX->q[1] = GX->q[0]; - LD(x3, gback, gdoffset + 0); - SD(x3, gback, gdoffset + 8); - } else { - GETEX(x2, 0, 14); - for (int i = 0; i < 4; ++i) { - // GX->sw[4+i] = EX->sw[i*2+0] - EX->sw[i*2+1]; - LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); - LH(x4, wback, fixedaddress + 2 * (i * 2 + 1)); - SUBW(x3, x3, x4); - SH(x3, gback, gdoffset + 2 * (4 + i)); - } - } - break; - case 0x08: - INST_NAME("PSIGNB Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 15); - for (int i = 0; i < 16; ++i) { - LB(x3, gback, gdoffset + i); - LB(x4, wback, fixedaddress + i); - SLT(x1, xZR, x4); - SRAI(x5, x4, 63); - OR(x1, x1, x5); - MUL(x3, x1, x3); - SB(x3, gback, gdoffset + i); - } - break; - case 0x09: - INST_NAME("PSIGNW Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 14); - for (int i = 0; i < 8; ++i) { - LH(x3, gback, gdoffset + i * 2); - LH(x4, wback, fixedaddress + i * 2); - SLT(x1, xZR, x4); - SRAI(x5, x4, 63); - OR(x1, x1, x5); - MUL(x3, x1, x3); - SH(x3, gback, gdoffset + i * 2); - } - break; - case 0x0A: - INST_NAME("PSIGND Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 12); - for (int i = 0; i < 4; ++i) { - LW(x3, gback, gdoffset + i * 4); - LW(x4, wback, fixedaddress + i * 4); - SLT(x1, xZR, x4); - SRAI(x5, x4, 63); - OR(x1, x1, x5); - MUL(x3, x1, x3); - SW(x3, gback, gdoffset + i * 4); - } - break; - case 0x0B: - INST_NAME("PMULHRSW Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 14); - for (int i = 0; i < 8; ++i) { - LH(x3, gback, gdoffset + i * 2); - LH(x4, wback, fixedaddress + i * 2); - MUL(x3, x3, x4); - SRAI(x3, x3, 14); - ADDI(x3, x3, 1); - SRAI(x3, x3, 1); - SH(x3, gback, gdoffset + i * 2); - } - break; - case 0x10: - INST_NAME("PBLENDVB Gx,Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 15); - sse_forget_reg(dyn, ninst, x6, 0); // 
forget xmm[0] - for (int i = 0; i < 16; ++i) { - LB(x3, xEmu, offsetof(x64emu_t, xmm[0]) + i); - BGE(x3, xZR, 12); // continue - LBU(x3, wback, fixedaddress + i); - SB(x3, gback, gdoffset + i); - // continue - } - break; - case 0x14: - INST_NAME("PBLENDVPS Gx,Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 12); - for (int i = 0; i < 4; ++i) { - LW(x3, xEmu, offsetof(x64emu_t, xmm[0]) + i * 4); - BGE(x3, xZR, 4 + 4 * 2); - LWU(x3, wback, fixedaddress + i * 4); - SW(x3, gback, gdoffset + i * 4); - } - break; - case 0x17: - INST_NAME("PTEST Gx, Ex"); - nextop = F8; - SETFLAGS(X_ALL, SF_SET); - GETGX(); - GETEX(x2, 0, 8); - CLEAR_FLAGS(); - SET_DFNONE(); - IFX(X_ZF | X_CF) - { - LD(x5, wback, fixedaddress + 0); - LD(x6, wback, fixedaddress + 8); - - IFX(X_ZF) - { - LD(x3, gback, gdoffset + 0); - LD(x4, gback, gdoffset + 8); - AND(x3, x3, x5); - AND(x4, x4, x6); - OR(x3, x3, x4); - BNEZ(x3, 8); - ORI(xFlags, xFlags, 1 << F_ZF); - } - IFX(X_CF) - { - LD(x3, gback, gdoffset + 0); - NOT(x3, x3); - LD(x4, gback, gdoffset + 8); - NOT(x4, x4); - AND(x3, x3, x5); - AND(x4, x4, x6); - OR(x3, x3, x4); - BNEZ(x3, 8); - ORI(xFlags, xFlags, 1 << F_CF); - } - } - break; - - case 0x1C: - INST_NAME("PABSB Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 15); - for (int i = 0; i < 16; ++i) { - LB(x4, wback, fixedaddress + i); - BGE(x4, xZR, 4 + 4); - NEG(x4, x4); - SB(x4, gback, gdoffset + i); - } - break; - case 0x1D: - INST_NAME("PABSW Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 14); - for (int i = 0; i < 8; ++i) { - LH(x4, wback, fixedaddress + i * 2); - BGE(x4, xZR, 4 + 4); - NEG(x4, x4); - SH(x4, gback, gdoffset + i * 2); - } - break; - case 0x1E: - INST_NAME("PABSD Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 12); - for (int i = 0; i < 4; ++i) { - LW(x4, wback, fixedaddress + i * 4); - BGE(x4, xZR, 4 + 4); - NEG(x4, x4); - SW(x4, gback, gdoffset + i * 4); - } - break; - case 0x20: - INST_NAME("PMOVSXBW Gx, Ex"); // SSE4 opcode! 
- nextop = F8; - GETGX(); - GETEX(x2, 0, 7); - for (int i = 7; i >= 0; --i) { - // GX->sw[i] = EX->sb[i]; - LB(x3, wback, fixedaddress + i); - SH(x3, gback, gdoffset + i * 2); - } - break; - case 0x21: - INST_NAME("PMOVSXBD Gx, Ex"); // SSE4 opcode! - nextop = F8; - GETGX(); - GETEX(x2, 0, 3); - for (int i = 3; i >= 0; --i) { - // GX->sd[i] = EX->sb[i]; - LB(x3, wback, fixedaddress + i); - SW(x3, gback, gdoffset + i * 4); - } - break; - case 0x22: - INST_NAME("PMOVSXBQ Gx, Ex"); // SSE4 opcode! - nextop = F8; - GETGX(); - GETEX(x2, 0, 1); - for (int i = 1; i >= 0; --i) { - // GX->sq[i] = EX->sb[i]; - LB(x3, wback, fixedaddress + i); - SD(x3, gback, gdoffset + i * 8); - } - break; - case 0x23: - INST_NAME("PMOVSXWD Gx, Ex"); // SSE4 opcode! - nextop = F8; - GETGX(); - GETEX(x2, 0, 6); - for (int i = 3; i >= 0; --i) { - // GX->sd[i] = EX->sw[i]; - LH(x3, wback, fixedaddress + i * 2); - SW(x3, gback, gdoffset + i * 4); - } - break; - case 0x24: - INST_NAME("PMOVSXWQ Gx, Ex"); // SSE4 opcode! - nextop = F8; - GETGX(); - GETEX(x2, 0, 2); - for (int i = 1; i >= 0; --i) { - // GX->sq[i] = EX->sw[i]; - LH(x3, wback, fixedaddress + i * 2); - SD(x3, gback, gdoffset + i * 8); - } - break; - case 0x25: - INST_NAME("PMOVSXDQ Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 4); - for (int i = 1; i >= 0; --i) { - // GX->sq[i] = EX->sd[i]; - LW(x4, wback, fixedaddress + i * 4); - SD(x4, gback, gdoffset + i * 8); - } - break; - case 0x28: - INST_NAME("PMULDQ Gx, Ex"); - nextop = F8; - GETEX(x2, 0, 8); - GETGX(); - for (int i = 1; i >= 0; --i) { - LW(x3, wback, fixedaddress + i * 8); - LW(x4, gback, gdoffset + i * 8); - MUL(x3, x3, x4); - SD(x3, gback, gdoffset + i * 8); - } - break; - case 0x29: - INST_NAME("PCMPEQQ Gx, Ex"); // SSE4 opcode! 
- nextop = F8; - GETGX(); - GETEX(x2, 0, 8); - SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4); SNEZ(x3, x3); ADDI(x3, x3, -1)); - break; - case 0x2B: - INST_NAME("PACKUSDW Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 12); - MOV64x(x5, 65535); - for (int i = 0; i < 4; ++i) { - LW(x3, gback, gdoffset + i * 4); - if (rv64_zbb) { - MIN(x3, x3, x5); - MAX(x3, x3, xZR); - } else { - BGE(x3, xZR, 4 + 4); - MV(x3, xZR); - BLT(x3, x5, 4 + 4); - MV(x3, x5); - } - SH(x3, gback, gdoffset + i * 2); - } - if (MODREG && gd == ed) { - LD(x3, gback, gdoffset + 0); - SD(x3, gback, gdoffset + 8); - } else - for (int i = 0; i < 4; ++i) { - LW(x3, wback, fixedaddress + i * 4); - if (rv64_zbb) { - MIN(x3, x3, x5); - MAX(x3, x3, xZR); - } else { - BGE(x3, xZR, 4 + 4); - MV(x3, xZR); - BLT(x3, x5, 4 + 4); - MV(x3, x5); - } - SH(x3, gback, gdoffset + 8 + i * 2); - } - break; - - case 0x30: - INST_NAME("PMOVZXBW Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 7); - for (int i = 7; i >= 0; --i) { - LBU(x3, wback, fixedaddress + i); - SH(x3, gback, gdoffset + i * 2); - } - break; - case 0x31: - INST_NAME("PMOVZXBD Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 3); - for (int i = 3; i >= 0; --i) { - LBU(x3, wback, fixedaddress + i); - SW(x3, gback, gdoffset + i * 4); - } - break; - case 0x32: - INST_NAME("PMOVZXBQ Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 2); - for (int i = 1; i >= 0; --i) { - LBU(x3, wback, fixedaddress + i); - SD(x3, gback, gdoffset + i * 8); - } - break; - case 0x33: - INST_NAME("PMOVZXWD Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 6); - for (int i = 3; i >= 0; --i) { - LHU(x3, wback, fixedaddress + i * 2); - SW(x3, gback, gdoffset + i * 4); - } - break; - case 0x34: - INST_NAME("PMOVZXWQ Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 2); - for (int i = 1; i >= 0; --i) { - LHU(x3, wback, fixedaddress + i * 2); - SD(x3, gback, gdoffset + i * 8); - } - break; - case 0x35: - INST_NAME("PMOVZXDQ Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 4); - for 
(int i = 1; i >= 0; --i) { - LWU(x3, wback, fixedaddress + i * 4); - SD(x3, gback, gdoffset + i * 8); - } - break; - - case 0x38: - INST_NAME("PMINSB Gx, Ex"); // SSE4 opcode! - nextop = F8; - GETGX(); - GETEX(x2, 0, 15); - for (int i = 0; i < 16; ++i) { - LB(x3, gback, gdoffset + i); - LB(x4, wback, fixedaddress + i); - if (rv64_zbb) - MIN(x4, x3, x4); - else - BLT(x3, x4, 4 + 4); - SB(x4, gback, gdoffset + i); - } - break; - case 0x39: - INST_NAME("PMINSD Gx, Ex"); // SSE4 opcode! - nextop = F8; - GETGX(); - GETEX(x2, 0, 12); - for (int i = 0; i < 4; ++i) { - LW(x3, gback, gdoffset + i * 4); - LW(x4, wback, fixedaddress + i * 4); - if (rv64_zbb) - MIN(x4, x3, x4); - else - BLT(x3, x4, 4 + 4); - SW(x4, gback, gdoffset + i * 4); - } - break; - case 0x3A: - INST_NAME("PMINUW Gx, Ex"); // SSE4 opcode! - nextop = F8; - GETGX(); - GETEX(x2, 0, 14); - for (int i = 0; i < 8; ++i) { - LHU(x3, gback, gdoffset + i * 2); - LHU(x4, wback, fixedaddress + i * 2); - if (rv64_zbb) - MINU(x4, x3, x4); - else - BLTU(x3, x4, 4 + 4); - SH(x4, gback, gdoffset + i * 2); - } - break; - case 0x3B: - INST_NAME("PMINUD Gx, Ex"); // SSE4 opcode! - nextop = F8; - GETGX(); - GETEX(x2, 0, 12); - for (int i = 0; i < 4; ++i) { - LWU(x3, gback, gdoffset + i * 4); - LWU(x4, wback, fixedaddress + i * 4); - if (rv64_zbb) - MINU(x4, x3, x4); - else - BLTU(x3, x4, 4 + 4); - SW(x4, gback, gdoffset + i * 4); - } - break; - case 0x3C: - INST_NAME("PMAXSB Gx, Ex"); // SSE4 opcode! - nextop = F8; - GETGX(); - GETEX(x2, 0, 15); - for (int i = 0; i < 16; ++i) { - LB(x3, gback, gdoffset + i); - LB(x4, wback, fixedaddress + i); - if (rv64_zbb) - MAX(x4, x3, x4); - else - BLT(x4, x3, 4 + 4); - SB(x4, gback, gdoffset + i); - } - break; - case 0x3D: - INST_NAME("PMAXSD Gx, Ex"); // SSE4 opcode! 
- nextop = F8; - GETGX(); - GETEX(x2, 0, 12); - for (int i = 0; i < 4; ++i) { - LW(x3, gback, gdoffset + i * 4); - LW(x4, wback, fixedaddress + i * 4); - if (rv64_zbb) - MAX(x4, x3, x4); - else - BLT(x4, x3, 4 + 4); - SW(x4, gback, gdoffset + i * 4); - } - break; - case 0x3E: - INST_NAME("PMAXUW Gx, Ex"); // SSE4 opcode! - nextop = F8; - GETGX(); - GETEX(x2, 0, 14); - for (int i = 0; i < 8; ++i) { - LHU(x3, gback, gdoffset + i * 2); - LHU(x4, wback, fixedaddress + i * 2); - if (rv64_zbb) - MAXU(x4, x3, x4); - else - BLTU(x4, x3, 4 + 4); - SH(x4, gback, gdoffset + i * 2); - } - break; - case 0x3F: - INST_NAME("PMAXUD Gx, Ex"); // SSE4 opcode! - nextop = F8; - GETGX(); - GETEX(x2, 0, 12); - for (int i = 0; i < 4; ++i) { - LWU(x3, gback, gdoffset + i * 4); - LWU(x4, wback, fixedaddress + i * 4); - if (rv64_zbb) - MAXU(x4, x3, x4); - else - BLTU(x4, x3, 4 + 4); - SW(x4, gback, gdoffset + i * 4); - } - break; - case 0x40: - INST_NAME("PMULLD Gx, Ex"); - nextop = F8; - GETGX(); - GETEX(x2, 0, 12); - for (int i = 0; i < 4; ++i) { - LW(x3, gback, gdoffset + i * 4); - LW(x4, wback, fixedaddress + i * 4); - MUL(x3, x3, x4); - SW(x3, gback, gdoffset + i * 4); - } - break; - case 0x61: - INST_NAME("PCMPESTRI Gx, Ex, Ib"); - SETFLAGS(X_ALL, SF_SET_DF); - nextop = F8; - GETG; - sse_reflect_reg(dyn, ninst, x6, gd); - ADDI(x3, xEmu, offsetof(x64emu_t, xmm[gd])); - if (MODREG) { - ed = (nextop & 7) + (rex.b << 3); - sse_reflect_reg(dyn, ninst, x6, ed); - ADDI(x1, xEmu, offsetof(x64emu_t, xmm[ed])); - } else { - addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 1); - if (ed != x1) MV(x1, ed); - } - // prepare rest arguments - MV(x2, xRDX); - MV(x4, xRAX); - u8 = F8; - MOV32w(x5, u8); - CALL(sse42_compare_string_explicit_len, x1); - ZEROUP(x1); - BNEZ_MARK(x1); - MOV32w(xRCX, (u8 & 1) ? 
8 : 16); - B_NEXT_nocond; - MARK; - if (u8 & 0b1000000) { - CLZxw(xRCX, x1, 0, x2, x3, x4); - ADDI(x2, xZR, 31); - SUB(xRCX, x2, xRCX); - } else { - CTZxw(xRCX, x1, 0, x2, x3); - } - break; - case 0xDB: - INST_NAME("AESIMC Gx, Ex"); // AES-NI - nextop = F8; - GETGX(); - GETEX(x2, 0, 8); - SSE_LOOP_MV_Q(x3); - sse_forget_reg(dyn, ninst, x6, gd); - MOV32w(x1, gd); - CALL(native_aesimc, -1); - break; - case 0xDC: - INST_NAME("AESENC Gx, Ex"); // AES-NI - nextop = F8; - GETG; - sse_forget_reg(dyn, ninst, x6, gd); - MOV32w(x1, gd); - CALL(native_aese, -1); - GETGX(); - GETEX(x2, 0, 8); - SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); - break; - case 0xDD: - INST_NAME("AESENCLAST Gx, Ex"); // AES-NI - nextop = F8; - GETG; - sse_forget_reg(dyn, ninst, x6, gd); - MOV32w(x1, gd); - CALL(native_aeselast, -1); - GETGX(); - GETEX(x2, 0, 8); - SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); - break; - case 0xDE: - INST_NAME("AESDEC Gx, Ex"); // AES-NI - nextop = F8; - GETG; - sse_forget_reg(dyn, ninst, x6, gd); - MOV32w(x1, gd); - CALL(native_aesd, -1); - GETGX(); - GETEX(x2, 0, 8); - SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); - break; - - case 0xDF: - INST_NAME("AESDECLAST Gx, Ex"); // AES-NI - nextop = F8; - GETG; - sse_forget_reg(dyn, ninst, x6, gd); - MOV32w(x1, gd); - CALL(native_aesdlast, -1); - GETGX(); - GETEX(x2, 0, 8); - SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); - break; - case 0xF0: - INST_NAME("MOVBE Gw, Ew"); - nextop = F8; - GETGD; - SMREAD(); - addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 1, 0); - LHU(x1, ed, fixedaddress); - if (rv64_zbb) { - REV8(x1, x1); - SRLI(x1, x1, 48); - } else if (rv64_xtheadbb) { - TH_REVW(x1, x1); - SRLI(x1, x1, 16); - } else { - ANDI(x2, x1, 0xff); - SLLI(x2, x2, 8); - SRLI(x1, x1, 8); - OR(x1, x1, x2); - } - LUI(x2, 0xffff0); - AND(gd, gd, x2); - OR(gd, gd, x1); - break; - case 0xF1: - INST_NAME("MOVBE Ew, Gw"); - nextop = F8; - GETGD; - SMREAD(); - addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, 
0); - if (rv64_zbb) { - REV8(x1, gd); - SRLI(x1, x1, 48); - } else if (rv64_xtheadbb) { - TH_REVW(x1, gd); - SRLI(x1, x1, 16); - } else { - ANDI(x1, gd, 0xff); - SLLI(x1, x1, 8); - SRLI(x2, gd, 8); - ANDI(x2, x2, 0xff); - OR(x1, x1, x2); - } - SH(x1, wback, fixedaddress); - break; - default: - DEFAULT; - } - break; case 0x3A: // these are some more SSSE3+ opcodes - opcode = F8; - switch (opcode) { - case 0x09: - INST_NAME("ROUNDPD Gx, Ex, Ib"); - nextop = F8; - GETGX(); - GETEX(x2, 1, 8); - u8 = F8; - d0 = fpu_get_scratch(dyn); - d1 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); - MOV64x(x3, 1ULL << __DBL_MANT_DIG__); - FCVTDL(d1, x3, RD_RTZ); - - // i = 0 - FLD(d0, wback, fixedaddress); - FEQD(x4, d0, d0); - BNEZ(x4, 8); - B_MARK_nocond; - // d0 is not nan - FABSD(v1, d0); - FLTD(x4, v1, d1); - BNEZ(x4, 8); - B_MARK_nocond; - if (u8 & 4) { - u8 = sse_setround(dyn, ninst, x4, x5); - FCVTLD(x5, d0, RD_DYN); - FCVTDL(d0, x5, RD_RTZ); - x87_restoreround(dyn, ninst, u8); - } else { - FCVTLD(x5, d0, round_round[u8 & 3]); - FCVTDL(d0, x5, RD_RTZ); - } - MARK; - FSD(d0, gback, gdoffset + 0); - - // i = 1 - FLD(d0, wback, fixedaddress + 8); - FEQD(x4, d0, d0); - BNEZ(x4, 8); - B_MARK2_nocond; - // d0 is not nan - FABSD(v1, d0); - FLTD(x4, v1, d1); - BNEZ(x4, 8); - B_MARK2_nocond; - if (u8 & 4) { - u8 = sse_setround(dyn, ninst, x4, x5); - FCVTLD(x5, d0, RD_DYN); - FCVTDL(d0, x5, RD_RTZ); - x87_restoreround(dyn, ninst, u8); - } else { - FCVTLD(x5, d0, round_round[u8 & 3]); - FCVTDL(d0, x5, RD_RTZ); - } - MARK2; - FSD(d0, gback, gdoffset + 8); - break; - case 0x0A: - INST_NAME("ROUNDSS Gx, Ex, Ib"); - nextop = F8; - GETEXSS(d0, 1); - GETGXSS_empty(v0); - d1 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); - u8 = F8; - FEQS(x2, d0, d0); - BNEZ_MARK(x2); - if (v0 != d0) FMVS(v0, d0); - B_NEXT_nocond; - MARK; // d0 is not nan - FABSS(v1, d0); - MOV64x(x3, 1ULL << __FLT_MANT_DIG__); - FCVTSW(d1, x3, RD_RTZ); - FLTS(x3, v1, d1); - BNEZ_MARK2(x3); - if (v0 != d0) FMVS(v0, 
d0); - B_NEXT_nocond; - MARK2; - if (u8 & 4) { - u8 = sse_setround(dyn, ninst, x4, x2); - FCVTWS(x5, d0, RD_DYN); - FCVTSW(v0, x5, RD_RTZ); - x87_restoreround(dyn, ninst, u8); - } else { - FCVTWS(x5, d0, round_round[u8 & 3]); - FCVTSW(v0, x5, RD_RTZ); - } - break; - case 0x0B: - INST_NAME("ROUNDSD Gx, Ex, Ib"); - nextop = F8; - GETEXSD(d0, 1); - GETGXSD_empty(v0); - d1 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); - u8 = F8; - FEQD(x2, d0, d0); - BNEZ_MARK(x2); - if (v0 != d0) FMVD(v0, d0); - B_NEXT_nocond; - MARK; // d0 is not nan - FABSD(v1, d0); - MOV64x(x3, 1ULL << __DBL_MANT_DIG__); - FCVTDL(d1, x3, RD_RTZ); - FLTD(x3, v1, d1); - BNEZ_MARK2(x3); - if (v0 != d0) FMVD(v0, d0); - B_NEXT_nocond; - MARK2; - if (u8 & 4) { - u8 = sse_setround(dyn, ninst, x4, x2); - FCVTLD(x5, d0, RD_DYN); - FCVTDL(v0, x5, RD_RTZ); - x87_restoreround(dyn, ninst, u8); - } else { - FCVTLD(x5, d0, round_round[u8 & 3]); - FCVTDL(v0, x5, RD_RTZ); - } - break; - case 0x0C: - INST_NAME("BLENDPS Gx, Ex, Ib"); - nextop = F8; - GETGX(); - GETEX(x2, 1, 12); - u8 = F8 & 0b1111; - for (int i = 0; i < 4; ++i) - if (u8 & (1 << i)) { - LWU(x1, wback, fixedaddress + i * 4); - SW(x1, gback, gdoffset + i * 4); - } - break; - case 0x0E: - INST_NAME("PBLENDW Gx, Ex, Ib"); - nextop = F8; - GETGX(); - GETEX(x2, 1, 14); - u8 = F8; - i32 = 0; - if (MODREG && gd == ed) break; - while (u8) - if (u8 & 1) { - if (!(i32 & 1) && u8 & 2) { - if (!(i32 & 3) && (u8 & 0xf) == 0xf) { - // whole 64bits - LD(x3, wback, fixedaddress + 8 * (i32 >> 2)); - SD(x3, gback, gdoffset + 8 * (i32 >> 2)); - i32 += 4; - u8 >>= 4; - } else { - // 32bits - LWU(x3, wback, fixedaddress + 4 * (i32 >> 1)); - SW(x3, gback, gdoffset + 4 * (i32 >> 1)); - i32 += 2; - u8 >>= 2; - } - } else { - // 16 bits - LHU(x3, wback, fixedaddress + 2 * i32); - SH(x3, gback, gdoffset + 2 * i32); - i32++; - u8 >>= 1; - } - } else { - // nope - i32++; - u8 >>= 1; - } - break; - case 0x0F: - INST_NAME("PALIGNR Gx, Ex, Ib"); - nextop = F8; - GETGX(); - 
GETEX(x2, 1, 8); - u8 = F8; - if (u8 > 31) { - SD(xZR, gback, gdoffset + 0); - SD(xZR, gback, gdoffset + 8); - } else if (u8 > 23) { - LD(x5, gback, gdoffset + 8); - if (u8 > 24) { - SRLI(x5, x5, 8 * (u8 - 24)); - } - SD(x5, gback, gdoffset + 0); - SD(xZR, gback, gdoffset + 8); - } else if (u8 > 15) { - if (u8 > 16) { - LD(x5, gback, gdoffset + 8); - LD(x4, gback, gdoffset + 0); - SRLI(x3, x5, 8 * (u8 - 16)); // lower of higher 64 bits - SLLI(x5, x5, 8 * (24 - u8)); // higher of lower 64 bits - SD(x3, gback, gdoffset + 8); - SRLI(x4, x4, 8 * (u8 - 16)); // lower of lower 64 bits - OR(x4, x4, x5); // lower 64 bits - SD(x4, gback, gdoffset + 0); - } - } else if (u8 > 7) { - if (u8 > 8) { - LD(x5, gback, gdoffset + 8); - LD(x4, gback, gdoffset + 0); - LD(x3, wback, fixedaddress + 8); - SLLI(x5, x5, 8 * (16 - u8)); // higher of higher 64 bits - SRLI(x1, x4, 8 * (u8 - 8)); // lower of higher 64 bits - SLLI(x4, x4, 8 * (16 - u8)); // higher of lower 64 bits - OR(x5, x1, x5); // higher 64 bits - SRLI(x3, x3, 8 * (u8 - 8)); // lower of lower 64 bits - SD(x5, gback, gdoffset + 8); - OR(x4, x4, x3); // lower 64 bits - SD(x4, gback, gdoffset + 0); - } else { - LD(x5, gback, gdoffset + 0); - LD(x4, wback, fixedaddress + 8); - SD(x5, gback, gdoffset + 8); - SD(x4, gback, gdoffset + 0); - } - } else { - if (u8 > 0) { - LD(x5, gback, gdoffset + 0); - LD(x4, wback, fixedaddress + 8); - LD(x3, wback, fixedaddress + 0); - SLLI(x5, x5, 8 * (8 - u8)); // higher of higher 64 bits - SRLI(x1, x4, 8 * (u8 - 0)); // lower of higher 64 bits - SLLI(x4, x4, 8 * (8 - u8)); // higher of lower 64 bits - OR(x5, x1, x5); // higher 64 bits - SRLI(x3, x3, 8 * (u8 - 0)); // lower of lower 64 bits - SD(x5, gback, gdoffset + 8); - OR(x4, x4, x3); // lower 64 bits - SD(x4, gback, gdoffset + 0); - } else { - LD(x5, wback, fixedaddress + 8); - LD(x4, wback, fixedaddress + 0); - SD(x5, gback, gdoffset + 8); - SD(x4, gback, gdoffset + 0); - } - } - break; - case 0x16: - if (rex.w) { - INST_NAME("PEXTRQ Ed, 
Gx, Ib"); - } else { - INST_NAME("PEXTRD Ed, Gx, Ib"); - } - nextop = F8; - GETGX(); - GETED(1); - u8 = F8; - if (rex.w) - LD(ed, gback, gdoffset + 8 * (u8 & 1)); - else - LWU(ed, gback, gdoffset + 4 * (u8 & 3)); - if (wback) { - SDxw(ed, wback, fixedaddress); - SMWRITE2(); - } - break; - case 0x17: - INST_NAME("EXTRACTPS Ew, Gx, Ib"); - nextop = F8; - GETGX(); - GETED(1); - u8 = F8; - LWU(ed, gback, gdoffset + 4 * (u8 & 3)); - if (wback) { - SW(ed, wback, fixedaddress); - SMWRITE2(); - } - break; - case 0x20: - INST_NAME("PINSRB Gx, ED, Ib"); - nextop = F8; - GETGX(); - GETED(1); - u8 = F8; - SB(ed, gback, gdoffset + (u8 & 0xF)); - break; - case 0x21: - INST_NAME("INSERTPS GX, EX, Ib"); - nextop = F8; - GETGX(); - GETEX(x2, 1, 12); - u8 = F8; - if (MODREG) - s8 = (u8 >> 6) & 3; - else - s8 = 0; - // GX->ud[(tmp8u>>4)&3] = EX->ud[tmp8s]; - LWU(x3, wback, fixedaddress + 4 * s8); - SW(x3, gback, gdoffset + 4 * (u8 >> 4)); - for (int i = 0; i < 4; ++i) { - if (u8 & (1 << i)) - // GX->ud[i] = 0; - SW(xZR, gback, gdoffset + 4 * i); - } - break; - case 0x22: - INST_NAME("PINSRD Gx, ED, Ib"); - nextop = F8; - GETGX(); - GETED(1); - u8 = F8; - if (rex.w) { - SD(ed, gback, gdoffset + 8 * (u8 & 0x1)); - } else { - SW(ed, gback, gdoffset + 4 * (u8 & 0x3)); - } - break; - case 0x40: - INST_NAME("DPPS Gx, Ex, Ib"); - nextop = F8; - GETGX(); - GETEX(x2, 1, 12); - u8 = F8; - d0 = fpu_get_scratch(dyn); - d1 = fpu_get_scratch(dyn); - d2 = fpu_get_scratch(dyn); - FMVWX(d2, xZR); - for (int i = 0; i < 4; ++i) - if (u8 & (1 << (i + 4))) { - FLW(d0, gback, gdoffset + i * 4); - FLW(d1, wback, fixedaddress + i * 4); - FMULS(d0, d0, d1); - FADDS(d2, d2, d0); - } - for (int i = 0; i < 4; ++i) - if (u8 & (1 << i)) - FSW(d2, gback, gdoffset + i * 4); - else - SW(xZR, gback, gdoffset + i * 4); - break; - case 0x44: - INST_NAME("PCLMULQDQ Gx, Ex, Ib"); - nextop = F8; - GETG; - sse_forget_reg(dyn, ninst, x6, gd); - MOV32w(x1, gd); // gx - if (MODREG) { - ed = (nextop & 7) + (rex.b << 3); - 
sse_forget_reg(dyn, ninst, x6, ed); - MOV32w(x2, ed); - MOV32w(x3, 0); // p = NULL - } else { - MOV32w(x2, 0); - addr = geted(dyn, addr, ninst, nextop, &ed, x3, x5, &fixedaddress, rex, NULL, 0, 1); - if (ed != x3) { - MV(x3, ed); - } - } - u8 = F8; - MOV32w(x4, u8); - CALL(native_pclmul, -1); - break; - case 0x63: - INST_NAME("PCMPISTRI Gx, Ex, Ib"); - SETFLAGS(X_ALL, SF_SET_DF); - nextop = F8; - GETG; - sse_reflect_reg(dyn, ninst, x6, gd); - ADDI(x2, xEmu, offsetof(x64emu_t, xmm[gd])); - if (MODREG) { - ed = (nextop & 7) + (rex.b << 3); - sse_reflect_reg(dyn, ninst, x6, ed); - ADDI(x1, xEmu, offsetof(x64emu_t, xmm[ed])); - } else { - addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 1); - if (ed != x1) MV(x1, ed); - } - u8 = F8; - MOV32w(x3, u8); - CALL(sse42_compare_string_implicit_len, x1); - ZEROUP(x1); - BNEZ_MARK(x1); - MOV32w(xRCX, (u8 & 1) ? 8 : 16); - B_NEXT_nocond; - MARK; - if (u8 & 0b1000000) { - CLZxw(xRCX, x1, 0, x2, x3, x4); - ADDI(x2, xZR, 31); - SUB(xRCX, x2, xRCX); - } else { - CTZxw(xRCX, x1, 0, x2, x3); - } - break; - case 0xDF: - INST_NAME("AESKEYGENASSIST Gx, Ex, Ib"); // AES-NI - nextop = F8; - GETG; - sse_forget_reg(dyn, ninst, x6, gd); - MOV32w(x1, gd); // gx - if (MODREG) { - ed = (nextop & 7) + (rex.b << 3); - sse_forget_reg(dyn, ninst, x6, ed); - MOV32w(x2, ed); - MOV32w(x3, 0); // p = NULL - } else { - MOV32w(x2, 0); - addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 1); - if (ed != x3) { - MV(x3, ed); - } - } - u8 = F8; - MOV32w(x4, u8); - CALL(native_aeskeygenassist, -1); - break; - default: - DEFAULT; - } - break; -#define GO(GETFLAGS, NO, YES, F) \ - READFLAGS(F); \ - GETFLAGS; \ - nextop = F8; \ - GETGD; \ - if (MODREG) { \ - ed = xRAX + (nextop & 7) + (rex.b << 3); \ - ZEXTH(x4, ed); \ - ed = x4; \ - } else { \ - SMREAD(); \ - addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); \ - LHU(x4, ed, fixedaddress); \ - ed = x4; \ - } \ - 
B##NO(x1, 4 + 3 * 4); \ - LUI(x3, 0xffff0); \ - AND(gd, gd, x3); \ - OR(gd, gd, ed); + addr = dynarec64_660F38(dyn, addr, opcode, ip, ninst, rex, ok, need_epilog); + break; + #define GO(GETFLAGS, NO, YES, F) \ + READFLAGS(F); \ + GETFLAGS; \ + nextop = F8; \ + GETGD; \ + if (MODREG) { \ + ed = xRAX + (nextop & 7) + (rex.b << 3); \ + ZEXTH(x4, ed); \ + ed = x4; \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); \ + LHU(x4, ed, fixedaddress); \ + ed = x4; \ + } \ + B##NO(x1, 4 + 3 * 4); \ + LUI(x3, 0xffff0); \ + AND(gd, gd, x3); \ + OR(gd, gd, ed); GOCOND(0x40, "CMOV", "Gw, Ew"); -#undef GO + #undef GO case 0x50: INST_NAME("PMOVMSKD Gd, Ex"); nextop = F8; diff --git a/src/dynarec/rv64/dynarec_rv64_660f38.c b/src/dynarec/rv64/dynarec_rv64_660f38.c new file mode 100644 index 00000000..475bf1da --- /dev/null +++ b/src/dynarec/rv64/dynarec_rv64_660f38.c @@ -0,0 +1,1266 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "dynarec.h" +#include "emu/x64emu_private.h" +#include "emu/x64run_private.h" +#include "x64run.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" +#include "bitutils.h" + +#include "rv64_printer.h" +#include "dynarec_rv64_private.h" +#include "dynarec_rv64_functions.h" +#include "dynarec_rv64_helper.h" +#include "emu/x64compstrings.h" + +uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t nextop, u8, s8; + int32_t i32; + uint8_t gd, ed; + uint8_t wback, wb1, wb2, gback; + uint8_t eb1, eb2; + int64_t j64; + uint64_t tmp64u, tmp64u2; + int v0, v1; + int q0, q1; + int d0, d1, d2; + int64_t fixedaddress, gdoffset; + int unscaled; + + MAYUSE(d0); + MAYUSE(d1); + 
MAYUSE(q0); + MAYUSE(q1); + MAYUSE(eb1); + MAYUSE(eb2); + MAYUSE(j64); + + static const int8_t round_round[] = { RD_RNE, RD_RDN, RD_RUP, RD_RTZ }; + + switch (opcode) { + case 0x38: // SSSE3 opcodes + nextop = F8; + switch (nextop) { + case 0x00: + INST_NAME("PSHUFB Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 15); + + ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); + + // preserve gd + LD(x3, gback, gdoffset + 0); + LD(x4, gback, gdoffset + 8); + SD(x3, x5, 0); + SD(x4, x5, 8); + + for (int i = 0; i < 16; ++i) { + LBU(x3, wback, fixedaddress + i); + ANDI(x4, x3, 128); + BEQZ(x4, 12); + SB(xZR, gback, gdoffset + i); + BEQZ(xZR, 20); // continue + ANDI(x4, x3, 15); + ADD(x4, x4, x5); + LBU(x4, x4, 0); + SB(x4, gback, gdoffset + i); + } + break; + case 0x01: + INST_NAME("PHADDW Gx, Ex"); + nextop = F8; + GETGX(); + for (int i = 0; i < 4; ++i) { + // GX->sw[i] = GX->sw[i*2+0]+GX->sw[i*2+1]; + LH(x3, gback, gdoffset + 2 * (i * 2 + 0)); + LH(x4, gback, gdoffset + 2 * (i * 2 + 1)); + ADDW(x3, x3, x4); + SH(x3, gback, gdoffset + 2 * i); + } + if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { + // GX->q[1] = GX->q[0]; + LD(x3, gback, gdoffset + 0); + SD(x3, gback, gdoffset + 8); + } else { + GETEX(x2, 0, 14); + for (int i = 0; i < 4; ++i) { + // GX->sw[4+i] = EX->sw[i*2+0] + EX->sw[i*2+1]; + LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); + LH(x4, wback, fixedaddress + 2 * (i * 2 + 1)); + ADDW(x3, x3, x4); + SH(x3, gback, gdoffset + 2 * (4 + i)); + } + } + break; + case 0x02: + INST_NAME("PHADDD Gx, Ex"); + nextop = F8; + GETGX(); + // GX->sd[0] += GX->sd[1]; + LW(x3, gback, gdoffset + 0 * 4); + LW(x4, gback, gdoffset + 1 * 4); + ADDW(x3, x3, x4); + SW(x3, gback, gdoffset + 0 * 4); + // GX->sd[1] = GX->sd[2] + GX->sd[3]; + LW(x3, gback, gdoffset + 2 * 4); + LW(x4, gback, gdoffset + 3 * 4); + ADDW(x3, x3, x4); + SW(x3, gback, gdoffset + 1 * 4); + if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { + // GX->q[1] = GX->q[0]; + LD(x3, gback, gdoffset + 0); + SD(x3, gback, 
gdoffset + 8); + } else { + GETEX(x2, 0, 12); + // GX->sd[2] = EX->sd[0] + EX->sd[1]; + LW(x3, wback, fixedaddress + 0 * 4); + LW(x4, wback, fixedaddress + 1 * 4); + ADDW(x3, x3, x4); + SW(x3, gback, gdoffset + 2 * 4); + // GX->sd[3] = EX->sd[2] + EX->sd[3]; + LW(x3, wback, fixedaddress + 2 * 4); + LW(x4, wback, fixedaddress + 3 * 4); + ADDW(x3, x3, x4); + SW(x3, gback, gdoffset + 3 * 4); + } + break; + case 0x03: + INST_NAME("PHADDSW Gx, Ex"); + nextop = F8; + GETGX(); + MOV64x(x5, 32767); + MOV64x(x6, -32768); + for (int i = 0; i < 4; ++i) { + // tmp32s = GX->sw[i*2+0]+GX->sw[i*2+1]; + // GX->sw[i] = sat(tmp32s); + LH(x3, gback, gdoffset + 2 * (i * 2 + 0)); + LH(x4, gback, gdoffset + 2 * (i * 2 + 1)); + ADDW(x3, x3, x4); + if (rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, x6); + } else { + BLT(x3, x5, 4 + 4); + MV(x3, x5); + BLT(x6, x3, 4 + 4); + MV(x3, x6); + } + SH(x3, gback, gdoffset + i * 2); + } + if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { + // GX->q[1] = GX->q[0]; + LD(x3, gback, gdoffset + 0); + SD(x3, gback, gdoffset + 8); + } else { + GETEX(x2, 0, 14); + for (int i = 0; i < 4; ++i) { + // tmp32s = EX->sw[i*2+0] + EX->sw[i*2+1]; + // GX->sw[4+i] = sat(tmp32s); + LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); + LH(x4, wback, fixedaddress + 2 * (i * 2 + 1)); + ADDW(x3, x3, x4); + if (rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, x6); + } else { + BLT(x3, x5, 4 + 4); + MV(x3, x5); + BLT(x6, x3, 4 + 4); + MV(x3, x6); + } + SH(x3, gback, gdoffset + 2 * (4 + i)); + } + } + break; + case 0x04: + INST_NAME("PMADDUBSW Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 15); + MOV64x(x5, 32767); + MOV64x(x6, -32768); + for (int i = 0; i < 8; ++i) { + LBU(x3, gback, gdoffset + i * 2); + LB(x4, wback, fixedaddress + i * 2); + MUL(x9, x3, x4); + LBU(x3, gback, gdoffset + i * 2 + 1); + LB(x4, wback, fixedaddress + i * 2 + 1); + MUL(x3, x3, x4); + ADD(x3, x3, x9); + if (rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, x6); + } else { + BLT(x3, x5, 4 + 4); + MV(x3, x5); 
+ BLT(x6, x3, 4 + 4); + MV(x3, x6); + } + SH(x3, gback, gdoffset + i * 2); + } + break; + case 0x05: + INST_NAME("PHSUBW Gx, Ex"); + nextop = F8; + GETGX(); + for (int i = 0; i < 4; ++i) { + // GX->sw[i] = GX->sw[i*2+0] - GX->sw[i*2+1]; + LH(x3, gback, gdoffset + 2 * (i * 2 + 0)); + LH(x4, gback, gdoffset + 2 * (i * 2 + 1)); + SUBW(x3, x3, x4); + SH(x3, gback, gdoffset + 2 * i); + } + if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { + // GX->q[1] = GX->q[0]; + LD(x3, gback, gdoffset + 0); + SD(x3, gback, gdoffset + 8); + } else { + GETEX(x2, 0, 14); + for (int i = 0; i < 4; ++i) { + // GX->sw[4+i] = EX->sw[i*2+0] - EX->sw[i*2+1]; + LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); + LH(x4, wback, fixedaddress + 2 * (i * 2 + 1)); + SUBW(x3, x3, x4); + SH(x3, gback, gdoffset + 2 * (4 + i)); + } + } + break; + case 0x08: + INST_NAME("PSIGNB Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 15); + for (int i = 0; i < 16; ++i) { + LB(x3, gback, gdoffset + i); + LB(x4, wback, fixedaddress + i); + SLT(x1, xZR, x4); + SRAI(x5, x4, 63); + OR(x1, x1, x5); + MUL(x3, x1, x3); + SB(x3, gback, gdoffset + i); + } + break; + case 0x09: + INST_NAME("PSIGNW Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 14); + for (int i = 0; i < 8; ++i) { + LH(x3, gback, gdoffset + i * 2); + LH(x4, wback, fixedaddress + i * 2); + SLT(x1, xZR, x4); + SRAI(x5, x4, 63); + OR(x1, x1, x5); + MUL(x3, x1, x3); + SH(x3, gback, gdoffset + i * 2); + } + break; + case 0x0A: + INST_NAME("PSIGND Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 12); + for (int i = 0; i < 4; ++i) { + LW(x3, gback, gdoffset + i * 4); + LW(x4, wback, fixedaddress + i * 4); + SLT(x1, xZR, x4); + SRAI(x5, x4, 63); + OR(x1, x1, x5); + MUL(x3, x1, x3); + SW(x3, gback, gdoffset + i * 4); + } + break; + case 0x0B: + INST_NAME("PMULHRSW Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 14); + for (int i = 0; i < 8; ++i) { + LH(x3, gback, gdoffset + i * 2); + LH(x4, wback, fixedaddress + i * 2); + MUL(x3, x3, x4); + SRAI(x3, x3, 
14); + ADDI(x3, x3, 1); + SRAI(x3, x3, 1); + SH(x3, gback, gdoffset + i * 2); + } + break; + case 0x10: + INST_NAME("PBLENDVB Gx,Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 15); + sse_forget_reg(dyn, ninst, x6, 0); // forget xmm[0] + for (int i = 0; i < 16; ++i) { + LB(x3, xEmu, offsetof(x64emu_t, xmm[0]) + i); + BGE(x3, xZR, 12); // continue + LBU(x3, wback, fixedaddress + i); + SB(x3, gback, gdoffset + i); + // continue + } + break; + case 0x14: + INST_NAME("PBLENDVPS Gx,Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 12); + for (int i = 0; i < 4; ++i) { + LW(x3, xEmu, offsetof(x64emu_t, xmm[0]) + i * 4); + BGE(x3, xZR, 4 + 4 * 2); + LWU(x3, wback, fixedaddress + i * 4); + SW(x3, gback, gdoffset + i * 4); + } + break; + case 0x17: + INST_NAME("PTEST Gx, Ex"); + nextop = F8; + SETFLAGS(X_ALL, SF_SET); + GETGX(); + GETEX(x2, 0, 8); + CLEAR_FLAGS(); + SET_DFNONE(); + IFX (X_ZF | X_CF) { + LD(x5, wback, fixedaddress + 0); + LD(x6, wback, fixedaddress + 8); + + IFX (X_ZF) { + LD(x3, gback, gdoffset + 0); + LD(x4, gback, gdoffset + 8); + AND(x3, x3, x5); + AND(x4, x4, x6); + OR(x3, x3, x4); + BNEZ(x3, 8); + ORI(xFlags, xFlags, 1 << F_ZF); + } + IFX (X_CF) { + LD(x3, gback, gdoffset + 0); + NOT(x3, x3); + LD(x4, gback, gdoffset + 8); + NOT(x4, x4); + AND(x3, x3, x5); + AND(x4, x4, x6); + OR(x3, x3, x4); + BNEZ(x3, 8); + ORI(xFlags, xFlags, 1 << F_CF); + } + } + break; + + case 0x1C: + INST_NAME("PABSB Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 15); + for (int i = 0; i < 16; ++i) { + LB(x4, wback, fixedaddress + i); + BGE(x4, xZR, 4 + 4); + NEG(x4, x4); + SB(x4, gback, gdoffset + i); + } + break; + case 0x1D: + INST_NAME("PABSW Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 14); + for (int i = 0; i < 8; ++i) { + LH(x4, wback, fixedaddress + i * 2); + BGE(x4, xZR, 4 + 4); + NEG(x4, x4); + SH(x4, gback, gdoffset + i * 2); + } + break; + case 0x1E: + INST_NAME("PABSD Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 12); + for (int i = 0; i < 4; ++i) { + 
LW(x4, wback, fixedaddress + i * 4); + BGE(x4, xZR, 4 + 4); + NEG(x4, x4); + SW(x4, gback, gdoffset + i * 4); + } + break; + case 0x20: + INST_NAME("PMOVSXBW Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0, 7); + for (int i = 7; i >= 0; --i) { + // GX->sw[i] = EX->sb[i]; + LB(x3, wback, fixedaddress + i); + SH(x3, gback, gdoffset + i * 2); + } + break; + case 0x21: + INST_NAME("PMOVSXBD Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0, 3); + for (int i = 3; i >= 0; --i) { + // GX->sd[i] = EX->sb[i]; + LB(x3, wback, fixedaddress + i); + SW(x3, gback, gdoffset + i * 4); + } + break; + case 0x22: + INST_NAME("PMOVSXBQ Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0, 1); + for (int i = 1; i >= 0; --i) { + // GX->sq[i] = EX->sb[i]; + LB(x3, wback, fixedaddress + i); + SD(x3, gback, gdoffset + i * 8); + } + break; + case 0x23: + INST_NAME("PMOVSXWD Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0, 6); + for (int i = 3; i >= 0; --i) { + // GX->sd[i] = EX->sw[i]; + LH(x3, wback, fixedaddress + i * 2); + SW(x3, gback, gdoffset + i * 4); + } + break; + case 0x24: + INST_NAME("PMOVSXWQ Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0, 2); + for (int i = 1; i >= 0; --i) { + // GX->sq[i] = EX->sw[i]; + LH(x3, wback, fixedaddress + i * 2); + SD(x3, gback, gdoffset + i * 8); + } + break; + case 0x25: + INST_NAME("PMOVSXDQ Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 4); + for (int i = 1; i >= 0; --i) { + // GX->sq[i] = EX->sd[i]; + LW(x4, wback, fixedaddress + i * 4); + SD(x4, gback, gdoffset + i * 8); + } + break; + case 0x28: + INST_NAME("PMULDQ Gx, Ex"); + nextop = F8; + GETEX(x2, 0, 8); + GETGX(); + for (int i = 1; i >= 0; --i) { + LW(x3, wback, fixedaddress + i * 8); + LW(x4, gback, gdoffset + i * 8); + MUL(x3, x3, x4); + SD(x3, gback, gdoffset + i * 8); + } + break; + case 0x29: + INST_NAME("PCMPEQQ Gx, Ex"); // SSE4 opcode! 
+ nextop = F8; + GETGX(); + GETEX(x2, 0, 8); + SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4); SNEZ(x3, x3); ADDI(x3, x3, -1)); + break; + case 0x2B: + INST_NAME("PACKUSDW Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 12); + MOV64x(x5, 65535); + for (int i = 0; i < 4; ++i) { + LW(x3, gback, gdoffset + i * 4); + if (rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, xZR); + } else { + BGE(x3, xZR, 4 + 4); + MV(x3, xZR); + BLT(x3, x5, 4 + 4); + MV(x3, x5); + } + SH(x3, gback, gdoffset + i * 2); + } + if (MODREG && gd == ed) { + LD(x3, gback, gdoffset + 0); + SD(x3, gback, gdoffset + 8); + } else + for (int i = 0; i < 4; ++i) { + LW(x3, wback, fixedaddress + i * 4); + if (rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, xZR); + } else { + BGE(x3, xZR, 4 + 4); + MV(x3, xZR); + BLT(x3, x5, 4 + 4); + MV(x3, x5); + } + SH(x3, gback, gdoffset + 8 + i * 2); + } + break; + + case 0x30: + INST_NAME("PMOVZXBW Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 7); + for (int i = 7; i >= 0; --i) { + LBU(x3, wback, fixedaddress + i); + SH(x3, gback, gdoffset + i * 2); + } + break; + case 0x31: + INST_NAME("PMOVZXBD Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 3); + for (int i = 3; i >= 0; --i) { + LBU(x3, wback, fixedaddress + i); + SW(x3, gback, gdoffset + i * 4); + } + break; + case 0x32: + INST_NAME("PMOVZXBQ Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 2); + for (int i = 1; i >= 0; --i) { + LBU(x3, wback, fixedaddress + i); + SD(x3, gback, gdoffset + i * 8); + } + break; + case 0x33: + INST_NAME("PMOVZXWD Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 6); + for (int i = 3; i >= 0; --i) { + LHU(x3, wback, fixedaddress + i * 2); + SW(x3, gback, gdoffset + i * 4); + } + break; + case 0x34: + INST_NAME("PMOVZXWQ Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 2); + for (int i = 1; i >= 0; --i) { + LHU(x3, wback, fixedaddress + i * 2); + SD(x3, gback, gdoffset + i * 8); + } + break; + case 0x35: + INST_NAME("PMOVZXDQ Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 4); + for 
(int i = 1; i >= 0; --i) { + LWU(x3, wback, fixedaddress + i * 4); + SD(x3, gback, gdoffset + i * 8); + } + break; + + case 0x38: + INST_NAME("PMINSB Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0, 15); + for (int i = 0; i < 16; ++i) { + LB(x3, gback, gdoffset + i); + LB(x4, wback, fixedaddress + i); + if (rv64_zbb) + MIN(x4, x3, x4); + else + BLT(x3, x4, 4 + 4); + SB(x4, gback, gdoffset + i); + } + break; + case 0x39: + INST_NAME("PMINSD Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0, 12); + for (int i = 0; i < 4; ++i) { + LW(x3, gback, gdoffset + i * 4); + LW(x4, wback, fixedaddress + i * 4); + if (rv64_zbb) + MIN(x4, x3, x4); + else + BLT(x3, x4, 4 + 4); + SW(x4, gback, gdoffset + i * 4); + } + break; + case 0x3A: + INST_NAME("PMINUW Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0, 14); + for (int i = 0; i < 8; ++i) { + LHU(x3, gback, gdoffset + i * 2); + LHU(x4, wback, fixedaddress + i * 2); + if (rv64_zbb) + MINU(x4, x3, x4); + else + BLTU(x3, x4, 4 + 4); + SH(x4, gback, gdoffset + i * 2); + } + break; + case 0x3B: + INST_NAME("PMINUD Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0, 12); + for (int i = 0; i < 4; ++i) { + LWU(x3, gback, gdoffset + i * 4); + LWU(x4, wback, fixedaddress + i * 4); + if (rv64_zbb) + MINU(x4, x3, x4); + else + BLTU(x3, x4, 4 + 4); + SW(x4, gback, gdoffset + i * 4); + } + break; + case 0x3C: + INST_NAME("PMAXSB Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0, 15); + for (int i = 0; i < 16; ++i) { + LB(x3, gback, gdoffset + i); + LB(x4, wback, fixedaddress + i); + if (rv64_zbb) + MAX(x4, x3, x4); + else + BLT(x4, x3, 4 + 4); + SB(x4, gback, gdoffset + i); + } + break; + case 0x3D: + INST_NAME("PMAXSD Gx, Ex"); // SSE4 opcode! 
+ nextop = F8; + GETGX(); + GETEX(x2, 0, 12); + for (int i = 0; i < 4; ++i) { + LW(x3, gback, gdoffset + i * 4); + LW(x4, wback, fixedaddress + i * 4); + if (rv64_zbb) + MAX(x4, x3, x4); + else + BLT(x4, x3, 4 + 4); + SW(x4, gback, gdoffset + i * 4); + } + break; + case 0x3E: + INST_NAME("PMAXUW Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0, 14); + for (int i = 0; i < 8; ++i) { + LHU(x3, gback, gdoffset + i * 2); + LHU(x4, wback, fixedaddress + i * 2); + if (rv64_zbb) + MAXU(x4, x3, x4); + else + BLTU(x4, x3, 4 + 4); + SH(x4, gback, gdoffset + i * 2); + } + break; + case 0x3F: + INST_NAME("PMAXUD Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0, 12); + for (int i = 0; i < 4; ++i) { + LWU(x3, gback, gdoffset + i * 4); + LWU(x4, wback, fixedaddress + i * 4); + if (rv64_zbb) + MAXU(x4, x3, x4); + else + BLTU(x4, x3, 4 + 4); + SW(x4, gback, gdoffset + i * 4); + } + break; + case 0x40: + INST_NAME("PMULLD Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0, 12); + for (int i = 0; i < 4; ++i) { + LW(x3, gback, gdoffset + i * 4); + LW(x4, wback, fixedaddress + i * 4); + MUL(x3, x3, x4); + SW(x3, gback, gdoffset + i * 4); + } + break; + case 0x61: + INST_NAME("PCMPESTRI Gx, Ex, Ib"); + SETFLAGS(X_ALL, SF_SET_DF); + nextop = F8; + GETG; + sse_reflect_reg(dyn, ninst, x6, gd); + ADDI(x3, xEmu, offsetof(x64emu_t, xmm[gd])); + if (MODREG) { + ed = (nextop & 7) + (rex.b << 3); + sse_reflect_reg(dyn, ninst, x6, ed); + ADDI(x1, xEmu, offsetof(x64emu_t, xmm[ed])); + } else { + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 1); + if (ed != x1) MV(x1, ed); + } + // prepare rest arguments + MV(x2, xRDX); + MV(x4, xRAX); + u8 = F8; + MOV32w(x5, u8); + CALL(sse42_compare_string_explicit_len, x1); + ZEROUP(x1); + BNEZ_MARK(x1); + MOV32w(xRCX, (u8 & 1) ? 
8 : 16); + B_NEXT_nocond; + MARK; + if (u8 & 0b1000000) { + CLZxw(xRCX, x1, 0, x2, x3, x4); + ADDI(x2, xZR, 31); + SUB(xRCX, x2, xRCX); + } else { + CTZxw(xRCX, x1, 0, x2, x3); + } + break; + case 0xDB: + INST_NAME("AESIMC Gx, Ex"); // AES-NI + nextop = F8; + GETGX(); + GETEX(x2, 0, 8); + SSE_LOOP_MV_Q(x3); + sse_forget_reg(dyn, ninst, x6, gd); + MOV32w(x1, gd); + CALL(native_aesimc, -1); + break; + case 0xDC: + INST_NAME("AESENC Gx, Ex"); // AES-NI + nextop = F8; + GETG; + sse_forget_reg(dyn, ninst, x6, gd); + MOV32w(x1, gd); + CALL(native_aese, -1); + GETGX(); + GETEX(x2, 0, 8); + SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); + break; + case 0xDD: + INST_NAME("AESENCLAST Gx, Ex"); // AES-NI + nextop = F8; + GETG; + sse_forget_reg(dyn, ninst, x6, gd); + MOV32w(x1, gd); + CALL(native_aeselast, -1); + GETGX(); + GETEX(x2, 0, 8); + SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); + break; + case 0xDE: + INST_NAME("AESDEC Gx, Ex"); // AES-NI + nextop = F8; + GETG; + sse_forget_reg(dyn, ninst, x6, gd); + MOV32w(x1, gd); + CALL(native_aesd, -1); + GETGX(); + GETEX(x2, 0, 8); + SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); + break; + + case 0xDF: + INST_NAME("AESDECLAST Gx, Ex"); // AES-NI + nextop = F8; + GETG; + sse_forget_reg(dyn, ninst, x6, gd); + MOV32w(x1, gd); + CALL(native_aesdlast, -1); + GETGX(); + GETEX(x2, 0, 8); + SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); + break; + case 0xF0: + INST_NAME("MOVBE Gw, Ew"); + nextop = F8; + GETGD; + SMREAD(); + addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 1, 0); + LHU(x1, ed, fixedaddress); + if (rv64_zbb) { + REV8(x1, x1); + SRLI(x1, x1, 48); + } else if (rv64_xtheadbb) { + TH_REVW(x1, x1); + SRLI(x1, x1, 16); + } else { + ANDI(x2, x1, 0xff); + SLLI(x2, x2, 8); + SRLI(x1, x1, 8); + OR(x1, x1, x2); + } + LUI(x2, 0xffff0); + AND(gd, gd, x2); + OR(gd, gd, x1); + break; + case 0xF1: + INST_NAME("MOVBE Ew, Gw"); + nextop = F8; + GETGD; + SMREAD(); + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, 
0); + if (rv64_zbb) { + REV8(x1, gd); + SRLI(x1, x1, 48); + } else if (rv64_xtheadbb) { + TH_REVW(x1, gd); + SRLI(x1, x1, 16); + } else { + ANDI(x1, gd, 0xff); + SLLI(x1, x1, 8); + SRLI(x2, gd, 8); + ANDI(x2, x2, 0xff); + OR(x1, x1, x2); + } + SH(x1, wback, fixedaddress); + break; + default: + DEFAULT; + } + break; + case 0x3A: // these are some more SSSE3+ opcodes + opcode = F8; + switch (opcode) { + case 0x09: + INST_NAME("ROUNDPD Gx, Ex, Ib"); + nextop = F8; + GETGX(); + GETEX(x2, 1, 8); + u8 = F8; + d0 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + MOV64x(x3, 1ULL << __DBL_MANT_DIG__); + FCVTDL(d1, x3, RD_RTZ); + + // i = 0 + FLD(d0, wback, fixedaddress); + FEQD(x4, d0, d0); + BNEZ(x4, 8); + B_MARK_nocond; + // d0 is not nan + FABSD(v1, d0); + FLTD(x4, v1, d1); + BNEZ(x4, 8); + B_MARK_nocond; + if (u8 & 4) { + u8 = sse_setround(dyn, ninst, x4, x5); + FCVTLD(x5, d0, RD_DYN); + FCVTDL(d0, x5, RD_RTZ); + x87_restoreround(dyn, ninst, u8); + } else { + FCVTLD(x5, d0, round_round[u8 & 3]); + FCVTDL(d0, x5, RD_RTZ); + } + MARK; + FSD(d0, gback, gdoffset + 0); + + // i = 1 + FLD(d0, wback, fixedaddress + 8); + FEQD(x4, d0, d0); + BNEZ(x4, 8); + B_MARK2_nocond; + // d0 is not nan + FABSD(v1, d0); + FLTD(x4, v1, d1); + BNEZ(x4, 8); + B_MARK2_nocond; + if (u8 & 4) { + u8 = sse_setround(dyn, ninst, x4, x5); + FCVTLD(x5, d0, RD_DYN); + FCVTDL(d0, x5, RD_RTZ); + x87_restoreround(dyn, ninst, u8); + } else { + FCVTLD(x5, d0, round_round[u8 & 3]); + FCVTDL(d0, x5, RD_RTZ); + } + MARK2; + FSD(d0, gback, gdoffset + 8); + break; + case 0x0A: + INST_NAME("ROUNDSS Gx, Ex, Ib"); + nextop = F8; + GETEXSS(d0, 1); + GETGXSS_empty(v0); + d1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + u8 = F8; + FEQS(x2, d0, d0); + BNEZ_MARK(x2); + if (v0 != d0) FMVS(v0, d0); + B_NEXT_nocond; + MARK; // d0 is not nan + FABSS(v1, d0); + MOV64x(x3, 1ULL << __FLT_MANT_DIG__); + FCVTSW(d1, x3, RD_RTZ); + FLTS(x3, v1, d1); + BNEZ_MARK2(x3); + if (v0 != d0) 
FMVS(v0, d0); + B_NEXT_nocond; + MARK2; + if (u8 & 4) { + u8 = sse_setround(dyn, ninst, x4, x2); + FCVTWS(x5, d0, RD_DYN); + FCVTSW(v0, x5, RD_RTZ); + x87_restoreround(dyn, ninst, u8); + } else { + FCVTWS(x5, d0, round_round[u8 & 3]); + FCVTSW(v0, x5, RD_RTZ); + } + break; + case 0x0B: + INST_NAME("ROUNDSD Gx, Ex, Ib"); + nextop = F8; + GETEXSD(d0, 1); + GETGXSD_empty(v0); + d1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + u8 = F8; + FEQD(x2, d0, d0); + BNEZ_MARK(x2); + if (v0 != d0) FMVD(v0, d0); + B_NEXT_nocond; + MARK; // d0 is not nan + FABSD(v1, d0); + MOV64x(x3, 1ULL << __DBL_MANT_DIG__); + FCVTDL(d1, x3, RD_RTZ); + FLTD(x3, v1, d1); + BNEZ_MARK2(x3); + if (v0 != d0) FMVD(v0, d0); + B_NEXT_nocond; + MARK2; + if (u8 & 4) { + u8 = sse_setround(dyn, ninst, x4, x2); + FCVTLD(x5, d0, RD_DYN); + FCVTDL(v0, x5, RD_RTZ); + x87_restoreround(dyn, ninst, u8); + } else { + FCVTLD(x5, d0, round_round[u8 & 3]); + FCVTDL(v0, x5, RD_RTZ); + } + break; + case 0x0C: + INST_NAME("BLENDPS Gx, Ex, Ib"); + nextop = F8; + GETGX(); + GETEX(x2, 1, 12); + u8 = F8 & 0b1111; + for (int i = 0; i < 4; ++i) + if (u8 & (1 << i)) { + LWU(x1, wback, fixedaddress + i * 4); + SW(x1, gback, gdoffset + i * 4); + } + break; + case 0x0E: + INST_NAME("PBLENDW Gx, Ex, Ib"); + nextop = F8; + GETGX(); + GETEX(x2, 1, 14); + u8 = F8; + i32 = 0; + if (MODREG && gd == ed) break; + while (u8) + if (u8 & 1) { + if (!(i32 & 1) && u8 & 2) { + if (!(i32 & 3) && (u8 & 0xf) == 0xf) { + // whole 64bits + LD(x3, wback, fixedaddress + 8 * (i32 >> 2)); + SD(x3, gback, gdoffset + 8 * (i32 >> 2)); + i32 += 4; + u8 >>= 4; + } else { + // 32bits + LWU(x3, wback, fixedaddress + 4 * (i32 >> 1)); + SW(x3, gback, gdoffset + 4 * (i32 >> 1)); + i32 += 2; + u8 >>= 2; + } + } else { + // 16 bits + LHU(x3, wback, fixedaddress + 2 * i32); + SH(x3, gback, gdoffset + 2 * i32); + i32++; + u8 >>= 1; + } + } else { + // nope + i32++; + u8 >>= 1; + } + break; + case 0x0F: + INST_NAME("PALIGNR Gx, Ex, Ib"); + nextop = F8; + 
GETGX(); + GETEX(x2, 1, 8); + u8 = F8; + if (u8 > 31) { + SD(xZR, gback, gdoffset + 0); + SD(xZR, gback, gdoffset + 8); + } else if (u8 > 23) { + LD(x5, gback, gdoffset + 8); + if (u8 > 24) { + SRLI(x5, x5, 8 * (u8 - 24)); + } + SD(x5, gback, gdoffset + 0); + SD(xZR, gback, gdoffset + 8); + } else if (u8 > 15) { + if (u8 > 16) { + LD(x5, gback, gdoffset + 8); + LD(x4, gback, gdoffset + 0); + SRLI(x3, x5, 8 * (u8 - 16)); // lower of higher 64 bits + SLLI(x5, x5, 8 * (24 - u8)); // higher of lower 64 bits + SD(x3, gback, gdoffset + 8); + SRLI(x4, x4, 8 * (u8 - 16)); // lower of lower 64 bits + OR(x4, x4, x5); // lower 64 bits + SD(x4, gback, gdoffset + 0); + } + } else if (u8 > 7) { + if (u8 > 8) { + LD(x5, gback, gdoffset + 8); + LD(x4, gback, gdoffset + 0); + LD(x3, wback, fixedaddress + 8); + SLLI(x5, x5, 8 * (16 - u8)); // higher of higher 64 bits + SRLI(x1, x4, 8 * (u8 - 8)); // lower of higher 64 bits + SLLI(x4, x4, 8 * (16 - u8)); // higher of lower 64 bits + OR(x5, x1, x5); // higher 64 bits + SRLI(x3, x3, 8 * (u8 - 8)); // lower of lower 64 bits + SD(x5, gback, gdoffset + 8); + OR(x4, x4, x3); // lower 64 bits + SD(x4, gback, gdoffset + 0); + } else { + LD(x5, gback, gdoffset + 0); + LD(x4, wback, fixedaddress + 8); + SD(x5, gback, gdoffset + 8); + SD(x4, gback, gdoffset + 0); + } + } else { + if (u8 > 0) { + LD(x5, gback, gdoffset + 0); + LD(x4, wback, fixedaddress + 8); + LD(x3, wback, fixedaddress + 0); + SLLI(x5, x5, 8 * (8 - u8)); // higher of higher 64 bits + SRLI(x1, x4, 8 * (u8 - 0)); // lower of higher 64 bits + SLLI(x4, x4, 8 * (8 - u8)); // higher of lower 64 bits + OR(x5, x1, x5); // higher 64 bits + SRLI(x3, x3, 8 * (u8 - 0)); // lower of lower 64 bits + SD(x5, gback, gdoffset + 8); + OR(x4, x4, x3); // lower 64 bits + SD(x4, gback, gdoffset + 0); + } else { + LD(x5, wback, fixedaddress + 8); + LD(x4, wback, fixedaddress + 0); + SD(x5, gback, gdoffset + 8); + SD(x4, gback, gdoffset + 0); + } + } + break; + case 0x16: + if (rex.w) { + 
INST_NAME("PEXTRQ Ed, Gx, Ib"); + } else { + INST_NAME("PEXTRD Ed, Gx, Ib"); + } + nextop = F8; + GETGX(); + GETED(1); + u8 = F8; + if (rex.w) + LD(ed, gback, gdoffset + 8 * (u8 & 1)); + else + LWU(ed, gback, gdoffset + 4 * (u8 & 3)); + if (wback) { + SDxw(ed, wback, fixedaddress); + SMWRITE2(); + } + break; + case 0x17: + INST_NAME("EXTRACTPS Ew, Gx, Ib"); + nextop = F8; + GETGX(); + GETED(1); + u8 = F8; + LWU(ed, gback, gdoffset + 4 * (u8 & 3)); + if (wback) { + SW(ed, wback, fixedaddress); + SMWRITE2(); + } + break; + case 0x20: + INST_NAME("PINSRB Gx, ED, Ib"); + nextop = F8; + GETGX(); + GETED(1); + u8 = F8; + SB(ed, gback, gdoffset + (u8 & 0xF)); + break; + case 0x21: + INST_NAME("INSERTPS GX, EX, Ib"); + nextop = F8; + GETGX(); + GETEX(x2, 1, 12); + u8 = F8; + if (MODREG) + s8 = (u8 >> 6) & 3; + else + s8 = 0; + // GX->ud[(tmp8u>>4)&3] = EX->ud[tmp8s]; + LWU(x3, wback, fixedaddress + 4 * s8); + SW(x3, gback, gdoffset + 4 * (u8 >> 4)); + for (int i = 0; i < 4; ++i) { + if (u8 & (1 << i)) + // GX->ud[i] = 0; + SW(xZR, gback, gdoffset + 4 * i); + } + break; + case 0x22: + INST_NAME("PINSRD Gx, ED, Ib"); + nextop = F8; + GETGX(); + GETED(1); + u8 = F8; + if (rex.w) { + SD(ed, gback, gdoffset + 8 * (u8 & 0x1)); + } else { + SW(ed, gback, gdoffset + 4 * (u8 & 0x3)); + } + break; + case 0x40: + INST_NAME("DPPS Gx, Ex, Ib"); + nextop = F8; + GETGX(); + GETEX(x2, 1, 12); + u8 = F8; + d0 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn); + d2 = fpu_get_scratch(dyn); + FMVWX(d2, xZR); + for (int i = 0; i < 4; ++i) + if (u8 & (1 << (i + 4))) { + FLW(d0, gback, gdoffset + i * 4); + FLW(d1, wback, fixedaddress + i * 4); + FMULS(d0, d0, d1); + FADDS(d2, d2, d0); + } + for (int i = 0; i < 4; ++i) + if (u8 & (1 << i)) + FSW(d2, gback, gdoffset + i * 4); + else + SW(xZR, gback, gdoffset + i * 4); + break; + case 0x44: + INST_NAME("PCLMULQDQ Gx, Ex, Ib"); + nextop = F8; + GETG; + sse_forget_reg(dyn, ninst, x6, gd); + MOV32w(x1, gd); // gx + if (MODREG) { + ed = (nextop & 7) 
+ (rex.b << 3); + sse_forget_reg(dyn, ninst, x6, ed); + MOV32w(x2, ed); + MOV32w(x3, 0); // p = NULL + } else { + MOV32w(x2, 0); + addr = geted(dyn, addr, ninst, nextop, &ed, x3, x5, &fixedaddress, rex, NULL, 0, 1); + if (ed != x3) { + MV(x3, ed); + } + } + u8 = F8; + MOV32w(x4, u8); + CALL(native_pclmul, -1); + break; + case 0x63: + INST_NAME("PCMPISTRI Gx, Ex, Ib"); + SETFLAGS(X_ALL, SF_SET_DF); + nextop = F8; + GETG; + sse_reflect_reg(dyn, ninst, x6, gd); + ADDI(x2, xEmu, offsetof(x64emu_t, xmm[gd])); + if (MODREG) { + ed = (nextop & 7) + (rex.b << 3); + sse_reflect_reg(dyn, ninst, x6, ed); + ADDI(x1, xEmu, offsetof(x64emu_t, xmm[ed])); + } else { + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 1); + if (ed != x1) MV(x1, ed); + } + u8 = F8; + MOV32w(x3, u8); + CALL(sse42_compare_string_implicit_len, x1); + ZEROUP(x1); + BNEZ_MARK(x1); + MOV32w(xRCX, (u8 & 1) ? 8 : 16); + B_NEXT_nocond; + MARK; + if (u8 & 0b1000000) { + CLZxw(xRCX, x1, 0, x2, x3, x4); + ADDI(x2, xZR, 31); + SUB(xRCX, x2, xRCX); + } else { + CTZxw(xRCX, x1, 0, x2, x3); + } + break; + case 0xDF: + INST_NAME("AESKEYGENASSIST Gx, Ex, Ib"); // AES-NI + nextop = F8; + GETG; + sse_forget_reg(dyn, ninst, x6, gd); + MOV32w(x1, gd); // gx + if (MODREG) { + ed = (nextop & 7) + (rex.b << 3); + sse_forget_reg(dyn, ninst, x6, ed); + MOV32w(x2, ed); + MOV32w(x3, 0); // p = NULL + } else { + MOV32w(x2, 0); + addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 1); + if (ed != x3) { + MV(x3, ed); + } + } + u8 = F8; + MOV32w(x4, u8); + CALL(native_aeskeygenassist, -1); + break; + default: + DEFAULT; + } + break; + default: + DEFAULT; + } + return addr; +} diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h index 683ca3e5..12dd594b 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.h +++ b/src/dynarec/rv64/dynarec_rv64_helper.h @@ -1133,6 +1133,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr); #define 
dynarec64_DF STEPNAME(dynarec64_DF) #define dynarec64_F0 STEPNAME(dynarec64_F0) #define dynarec64_660F STEPNAME(dynarec64_660F) +#define dynarec64_660F38 STEPNAME(dynarec64_660F38) #define dynarec64_66F20F STEPNAME(dynarec64_66F20F) #define dynarec64_66F30F STEPNAME(dynarec64_66F30F) #define dynarec64_6664 STEPNAME(dynarec64_6664) @@ -1556,6 +1557,7 @@ uintptr_t dynarec64_DE(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); uintptr_t dynarec64_66F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); uintptr_t dynarec64_66F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); uintptr_t dynarec64_6664(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog); |