diff options
| author | Yang Liu <liuyang22@iscas.ac.cn> | 2025-08-22 19:13:04 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-08-22 13:13:04 +0200 |
| commit | 893ffe8205c8400db51337066eb34009f2e04b78 (patch) | |
| tree | 48cfe3022d9843254914deb7b585c10a073d067c /src | |
| parent | 82a91e717afa01781e367d200801eb29275b062a (diff) | |
| download | box64-893ffe8205c8400db51337066eb34009f2e04b78.tar.gz box64-893ffe8205c8400db51337066eb34009f2e04b78.zip | |
[RV64_DYNAREC] Added more scalar avx opcodes (#2965)
Diffstat (limited to 'src')
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_avx_66_0f.c | 490 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_avx_66_0f3a.c | 26 |
2 files changed, 516 insertions, 0 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c b/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c index 4acef003..1841d2e7 100644 --- a/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c +++ b/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c @@ -90,6 +90,45 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, } if (!MODREG) SMWRITE2(); break; + case 0x2E: + // no special check... + case 0x2F: + if (opcode == 0x2F) { + INST_NAME("VCOMISD Gx, Ex"); + } else { + INST_NAME("VUCOMISD Gx, Ex"); + } + SETFLAGS(X_ALL, SF_SET, NAT_FLAGS_NOFUSION); + SET_DFNONE(); + nextop = F8; + GETGXSD(d0); + GETEXSD(v0, 0); + CLEAR_FLAGS(); + // if isnan(d0) || isnan(v0) + IFX (X_ZF | X_PF | X_CF) { + FEQD(x3, d0, d0); + FEQD(x2, v0, v0); + AND(x2, x2, x3); + BNE_MARK(x2, xZR); + ORI(xFlags, xFlags, (1 << F_ZF) | (1 << F_PF) | (1 << F_CF)); + B_NEXT_nocond; + } + MARK; + // else if isless(d0, v0) + IFX (X_CF) { + FLTD(x2, d0, v0); + BEQ_MARK2(x2, xZR); + ORI(xFlags, xFlags, 1 << F_CF); + B_NEXT_nocond; + } + MARK2; + // else if d0 == v0 + IFX (X_ZF) { + FEQD(x2, d0, v0); + CBZ_NEXT(x2); + ORI(xFlags, xFlags, 1 << F_ZF); + } + break; case 0x50: INST_NAME("VMOVMSKPD Gd, Ex"); nextop = F8; @@ -292,6 +331,292 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, SD(xZR, gback, gyoffset + 8); } break; + case 0x58: + INST_NAME("VADDPD Gx, Vx, Ex"); + nextop = F8; + GETEX(x1, 0, vex.l ? 24 : 8); + GETGX(); + GETGY(); + GETVX(); + GETVY(); + v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + for (int i = 0; i < 2; ++i) { + FLD(v0, wback, fixedaddress + 8 * i); + FLD(v1, vback, vxoffset + 8 * i); + if (!BOX64ENV(dynarec_fastnan)) { + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + } + FADDD(v0, v0, v1); + if (!BOX64ENV(dynarec_fastnan)) { + AND(x3, x3, x4); + BEQZ(x3, 16); + FEQD(x3, v0, v0); + BNEZ(x3, 8); + FNEGD(v0, v0); + } + FSD(v0, gback, gdoffset + 8 * i); + } + if (vex.l) { + GETEY(); + for (int i = 0; i < 2; ++i) { + FLD(v0, wback, fixedaddress + 8 * i); + FLD(v1, vback, vyoffset + 8 * i); + if (!BOX64ENV(dynarec_fastnan)) { + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + } + FADDD(v0, v0, v1); + if (!BOX64ENV(dynarec_fastnan)) { + AND(x3, x3, x4); + BEQZ(x3, 16); + FEQD(x3, v0, v0); + BNEZ(x3, 8); + FNEGD(v0, v0); + } + FSD(v0, gback, gyoffset + 8 * i); + } + } else { + SD(xZR, gback, gyoffset); + SD(xZR, gback, gyoffset + 8); + } + break; + case 0x59: + INST_NAME("VMULPD Gx, Vx, Ex"); + nextop = F8; + GETEX(x1, 0, vex.l ? 24 : 8); + GETGX(); + GETGY(); + GETVX(); + GETVY(); + v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + for (int i = 0; i < 2; ++i) { + FLD(v0, wback, fixedaddress + 8 * i); + FLD(v1, vback, vxoffset + 8 * i); + if (!BOX64ENV(dynarec_fastnan)) { + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + } + FMULD(v0, v0, v1); + if (!BOX64ENV(dynarec_fastnan)) { + AND(x3, x3, x4); + BEQZ(x3, 16); + FEQD(x3, v0, v0); + BNEZ(x3, 8); + FNEGD(v0, v0); + } + FSD(v0, gback, gdoffset + 8 * i); + } + if (vex.l) { + GETEY(); + for (int i = 0; i < 2; ++i) { + FLD(v0, wback, fixedaddress + 8 * i); + FLD(v1, vback, vyoffset + 8 * i); + if (!BOX64ENV(dynarec_fastnan)) { + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + } + FMULD(v0, v0, v1); + if (!BOX64ENV(dynarec_fastnan)) { + AND(x3, x3, x4); + BEQZ(x3, 16); + FEQD(x3, v0, v0); + BNEZ(x3, 8); + FNEGD(v0, v0); + } + FSD(v0, gback, gyoffset + 8 * i); + } + } else { + SD(xZR, gback, gyoffset); + SD(xZR, gback, gyoffset + 8); + } + break; + case 0x5C: + INST_NAME("VSUBPD Gx, Vx, Ex"); + nextop = F8; + GETEX(x1, 0, vex.l ? 24 : 8); + GETGX(); + GETGY(); + GETVX(); + GETVY(); + v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + for (int i = 0; i < 2; ++i) { + FLD(v0, wback, fixedaddress + 8 * i); + FLD(v1, vback, vxoffset + 8 * i); + if (!BOX64ENV(dynarec_fastnan)) { + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + } + FSUBD(v0, v1, v0); + if (!BOX64ENV(dynarec_fastnan)) { + AND(x3, x3, x4); + BEQZ(x3, 16); + FEQD(x3, v0, v0); + BNEZ(x3, 8); + FNEGD(v0, v0); + } + FSD(v0, gback, gdoffset + 8 * i); + } + if (vex.l) { + GETEY(); + for (int i = 0; i < 2; ++i) { + FLD(v0, wback, fixedaddress + 8 * i); + FLD(v1, vback, vyoffset + 8 * i); + if (!BOX64ENV(dynarec_fastnan)) { + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + } + FSUBD(v0, v1, v0); + if (!BOX64ENV(dynarec_fastnan)) { + AND(x3, x3, x4); + BEQZ(x3, 16); + FEQD(x3, v0, v0); + BNEZ(x3, 8); + FNEGD(v0, v0); + } + FSD(v0, gback, gyoffset + 8 * i); + } + } else { + SD(xZR, gback, gyoffset); + SD(xZR, gback, gyoffset + 8); + } + break; + case 0x5D: + INST_NAME("VMINPD Gx, Vx, Ex"); + nextop = F8; + GETEX(x1, 0, vex.l ? 24 : 8); + GETGX(); + GETGY(); + GETVX(); + GETVY(); + v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + for (int i = 0; i < 2; ++i) { + FLD(v0, vback, vxoffset + 8 * i); + FLD(v1, wback, fixedaddress + 8 * i); + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + AND(x3, x3, x4); + BEQ(x3, xZR, 4 + 3 * 4); + FLTD(x3, v0, v1); + BEQ(x3, xZR, 4 + 4); // continue + FMVD(v1, v0); + FSD(v1, gback, gdoffset + 8 * i); + } + if (vex.l) { + GETEY(); + for (int i = 0; i < 2; ++i) { + FLD(v0, vback, vyoffset + 8 * i); + FLD(v1, wback, fixedaddress + 8 * i); + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + AND(x3, x3, x4); + BEQ(x3, xZR, 4 + 3 * 4); + FLTD(x3, v0, v1); + BEQ(x3, xZR, 4 + 4); // continue + FMVD(v1, v0); + FSD(v1, gback, gyoffset + 8 * i); + } + } else { + SD(xZR, gback, gyoffset); + SD(xZR, gback, gyoffset + 8); + } + break; + case 0x5E: + INST_NAME("VDIVPD Gx, Vx, Ex"); + nextop = F8; + GETEX(x1, 0, vex.l ? 24 : 8); + GETGX(); + GETGY(); + GETVX(); + GETVY(); + v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + for (int i = 0; i < 2; ++i) { + FLD(v0, wback, fixedaddress + 8 * i); + FLD(v1, vback, vxoffset + 8 * i); + if (!BOX64ENV(dynarec_fastnan)) { + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + } + FDIVD(v0, v1, v0); + if (!BOX64ENV(dynarec_fastnan)) { + AND(x3, x3, x4); + BEQZ(x3, 16); + FEQD(x3, v0, v0); + BNEZ(x3, 8); + FNEGD(v0, v0); + } + FSD(v0, gback, gdoffset + 8 * i); + } + if (vex.l) { + GETEY(); + for (int i = 0; i < 2; ++i) { + FLD(v0, wback, fixedaddress + 8 * i); + FLD(v1, vback, vyoffset + 8 * i); + if (!BOX64ENV(dynarec_fastnan)) { + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + } + FDIVD(v0, v1, v0); + if (!BOX64ENV(dynarec_fastnan)) { + AND(x3, x3, x4); + BEQZ(x3, 16); + FEQD(x3, v0, v0); + BNEZ(x3, 8); + FNEGD(v0, v0); + } + FSD(v0, gback, gyoffset + 8 * i); + } + } else { + SD(xZR, gback, gyoffset); + SD(xZR, gback, gyoffset + 8); + } + break; + case 0x5F: + INST_NAME("VMAXPD Gx, Vx, Ex"); + nextop = F8; + GETEX(x1, 0, vex.l ? 24 : 8); + GETGX(); + GETGY(); + GETVX(); + GETVY(); + v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + for (int i = 0; i < 2; ++i) { + FLD(v0, vback, vxoffset + 8 * i); + FLD(v1, wback, fixedaddress + 8 * i); + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + AND(x3, x3, x4); + BEQ(x3, xZR, 4 + 3 * 4); + FLTD(x3, v1, v0); + BEQ(x3, xZR, 4 + 4); // continue + FMVD(v1, v0); + FSD(v1, gback, gdoffset + 8 * i); + } + if (vex.l) { + GETEY(); + for (int i = 0; i < 2; ++i) { + FLD(v0, vback, vyoffset + 8 * i); + FLD(v1, wback, fixedaddress + 8 * i); + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + AND(x3, x3, x4); + BEQ(x3, xZR, 4 + 3 * 4); + FLTD(x3, v1, v0); + BEQ(x3, xZR, 4 + 4); // continue + FMVD(v1, v0); + FSD(v1, gback, gyoffset + 8 * i); + } + } else { + SD(xZR, gback, gyoffset); + SD(xZR, gback, gyoffset + 8); + } + break; case 0x66: INST_NAME("VPCMPGTD Gx, Vx, Ex"); nextop = F8; @@ -404,6 +729,171 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, SD(xZR, wback, fixedaddress + 8); } break; + case 0xC2: + INST_NAME("VCMPPD Gx, Vx, Ex, Ib"); + nextop = F8; + GETEX(x2, 1, vex.l ? 24 : 8); + GETGX(); + GETVX(); + GETGY(); + GETVY(); + u8 = F8; + d0 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn); + for (int i = 0; i < 2 + (vex.l ? 2 : 0); ++i) { + if (i == 2) { GETEY(); } + if (i < 2) { + FLD(d0, vback, vxoffset + 8 * i); + FLD(d1, wback, fixedaddress + 8 * i); + } else { + FLD(d0, vback, vyoffset + 8 * (i - 2)); + FLD(d1, wback, fixedaddress + 8 * (i - 2)); + } + + if ((u8 & 0xf) != 0x0b && (u8 & 0xf) != 0xf) { + // x6 = !(isnan(d0) || isnan(d1)) + FEQD(x4, d0, d0); + FEQD(x3, d1, d1); + AND(x6, x3, x4); + } + + switch (u8 & 0x7) { + case 0: + FEQD(x3, d0, d1); + break; // Equal + case 1: + BEQ(x6, xZR, 8); + FLTD(x3, d0, d1); + break; // Less than + case 2: + BEQ(x6, xZR, 8); + FLED(x3, d0, d1); + break; // Less or equal + case 3: + if (u8 & 0x8) + ADDI(x3, xZR, 0); + else + XORI(x3, x6, 1); + break; + case 4: + FEQD(x3, d0, d1); + XORI(x3, x3, 1); + break; // Not Equal or unordered + case 5: + BEQ(x6, xZR, 12); + FLED(x3, d1, d0); + J(8); + ADDI(x3, xZR, 1); + break; // Greater or equal or unordered + case 6: + BEQ(x6, xZR, 12); + FLTD(x3, d1, d0); + J(8); + ADDI(x3, xZR, 1); + break; // Greater or unordered + case 7: + if (u8 & 0x8) + ADDI(x3, xZR, 1); + else + MV(x3, x6); + break; // Not NaN + } + if ((u8 & 0x3) != 0x3) { + if ((u8 & 0xC) == 0x8 || (u8 & 0xC) == 0x4) { + XORI(x7, x6, 1); + OR(x3, x3, x7); + } else + AND(x3, x3, x6); + } + NEG(x3, x3); + if (i < 2) { + SD(x3, gback, gdoffset + 8 * i); + } else { + SD(x3, gback, gyoffset + 8 * (i - 2)); + } + } + break; + case 0xC6: + INST_NAME("VSHUFPD Gx, Vx, Ex, Ib"); + nextop = F8; + GETGX(); + GETEX(x2, 1, 8); + GETGY(); + GETVX(); + GETVY(); + u8 = F8; + LD(x3, vback, vxoffset + 8 * (u8 & 1)); + LD(x4, wback, fixedaddress + 8 * ((u8 >> 1) & 1)); + SD(x3, gback, gdoffset + 0); + SD(x4, gback, gdoffset + 8); + if (vex.l) { + GETEY(); + LD(x3, vback, vyoffset + 8 * ((u8 >> 2) & 1)); + LD(x4, wback, fixedaddress + 8 * ((u8 >> 3) & 1)); + SD(x3, gback, gyoffset + 0); + SD(x4, gback, gyoffset + 8); + } else { + SD(xZR, gback, gyoffset + 0); + SD(xZR, gback, gyoffset + 8); + } + break; + case 0xD0: + INST_NAME("VADDSUBPD Gx, Vx, Ex"); + nextop = F8; + GETEX(x1, 0, vex.l ? 24 : 8); + GETGX(); + GETGY(); + GETVX(); + GETVY(); + v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + for (int i = 0; i < 2; ++i) { + FLD(v0, wback, fixedaddress + 8 * i); + FLD(v1, vback, vxoffset + 8 * i); + if (!BOX64ENV(dynarec_fastnan)) { + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + } + if (i == 0) + FSUBD(v0, v1, v0); + else + FADDD(v0, v1, v0); + if (!BOX64ENV(dynarec_fastnan)) { + AND(x3, x3, x4); + BEQZ(x3, 16); + FEQD(x3, v0, v0); + BNEZ(x3, 8); + FNEGD(v0, v0); + } + FSD(v0, gback, gdoffset + 8 * i); + } + if (vex.l) { + GETEY(); + for (int i = 0; i < 2; ++i) { + FLD(v0, wback, fixedaddress + 8 * i); + FLD(v1, vback, vyoffset + 8 * i); + if (!BOX64ENV(dynarec_fastnan)) { + FEQD(x3, v0, v0); + FEQD(x4, v1, v1); + } + if (i == 0) + FSUBD(v0, v1, v0); + else + FADDD(v0, v1, v0); + if (!BOX64ENV(dynarec_fastnan)) { + AND(x3, x3, x4); + BEQZ(x3, 16); + FEQD(x3, v0, v0); + BNEZ(x3, 8); + FNEGD(v0, v0); + } + FSD(v0, gback, gyoffset + 8 * i); + } + } else { + SD(xZR, gback, gyoffset); + SD(xZR, gback, gyoffset + 8); + } + break; case 0xEF: INST_NAME("VPXOR Gx, Vx, Ex"); nextop = F8; diff --git a/src/dynarec/rv64/dynarec_rv64_avx_66_0f3a.c b/src/dynarec/rv64/dynarec_rv64_avx_66_0f3a.c index 6b8cde4f..b1c19693 100644 --- a/src/dynarec/rv64/dynarec_rv64_avx_66_0f3a.c +++ b/src/dynarec/rv64/dynarec_rv64_avx_66_0f3a.c @@ -230,6 +230,32 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i SD(xZR, gback, gyoffset + 8); } break; + case 0x22: + if (rex.w) { + INST_NAME("VPINSRQ Gx, Vx, ED, Ib"); + } else { + INST_NAME("VPINSRD Gx, Vx, ED, Ib"); + } + nextop = F8; + GETGX(); + GETED(1); + GETGY(); + GETVX(); + u8 = F8; + if (gd != vex.v) { + LD(x4, vback, vxoffset + 0); + LD(x5, vback, vxoffset + 8); + SD(x4, gback, gdoffset + 0); + SD(x5, gback, gdoffset + 8); + } + if (rex.w) { + SD(ed, gback, gdoffset + 8 * (u8 & 0x1)); + } else { + SW(ed, gback, gdoffset + 4 * (u8 & 0x3)); + } + SD(xZR, gback, gyoffset + 0); + SD(xZR, gback, gyoffset + 8); + break; case 0x4A: INST_NAME("VBLENDVPS Gx, Vx, Ex, XMMImm8"); nextop = F8; |