diff options
Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_660f38.c     |  19
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_avx_66_0f.c  | 144
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c |  34
-rw-r--r--  src/dynarec/rv64/dynarec_rv64_helper.h     |  46
4 files changed, 176 insertions, 67 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_660f38.c b/src/dynarec/rv64/dynarec_rv64_660f38.c index 6ba960e7..1e9a357a 100644 --- a/src/dynarec/rv64/dynarec_rv64_660f38.c +++ b/src/dynarec/rv64/dynarec_rv64_660f38.c @@ -141,13 +141,15 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode, INST_NAME("PHADDSW Gx, Ex"); nextop = F8; GETGX(); + LUI(x6, 0xFFFF8); // -32768 + LUI(x7, 0x8); // 32768 for (int i = 0; i < 4; ++i) { // tmp32s = GX->sw[i*2+0]+GX->sw[i*2+1]; // GX->sw[i] = sat(tmp32s); LH(x3, gback, gdoffset + 2 * (i * 2 + 0)); LH(x4, gback, gdoffset + 2 * (i * 2 + 1)); ADDW(x3, x3, x4); - SAT16(x3, x6); + SATw(x3, x6, x7); SH(x3, gback, gdoffset + i * 2); } if (MODREG && gd == (nextop & 7) + (rex.b << 3)) { @@ -162,7 +164,7 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode, LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); LH(x4, wback, fixedaddress + 2 * (i * 2 + 1)); ADDW(x3, x3, x4); - SAT16(x3, x6); + SATw(x3, x6, x7); SH(x3, gback, gdoffset + 2 * (4 + i)); } } @@ -172,15 +174,17 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode, nextop = F8; GETGX(); GETEX(x2, 0, 15); + LUI(x6, 0xFFFF8); // -32768 + LUI(x7, 0x8); // 32768 for (int i = 0; i < 8; ++i) { LBU(x3, gback, gdoffset + i * 2); LB(x4, wback, fixedaddress + i * 2); - MUL(x7, x3, x4); + MUL(x1, x3, x4); LBU(x3, gback, gdoffset + i * 2 + 1); LB(x4, wback, fixedaddress + i * 2 + 1); MUL(x3, x3, x4); - ADD(x3, x3, x7); - SAT16(x3, x6); + ADD(x3, x3, x1); + SATw(x3, x6, x7); SH(x3, gback, gdoffset + i * 2); } break; @@ -467,9 +471,10 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode, nextop = F8; GETGX(); GETEX(x2, 0, 12); + LUI(x5, 0x10); // 65536 for (int i = 0; i < 4; ++i) { LW(x3, gback, gdoffset + i * 4); - SATU16(x3, x5); + SATUw(x3, x5); SH(x3, gback, gdoffset + i * 2); } if (MODREG && gd == ed) { @@ -478,7 +483,7 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, 
uint8_t opcode, } else for (int i = 0; i < 4; ++i) { LW(x3, wback, fixedaddress + i * 4); - SATU16(x3, x5); + SATUw(x3, x5); SH(x3, gback, gdoffset + 8 + i * 2); } break; diff --git a/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c b/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c index 372e0faa..f19a3f32 100644 --- a/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c +++ b/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c @@ -719,6 +719,12 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, GETGY(); GETVX(); GETVY(); + if (opcode == 0x63) { + ADDIW(x6, xZR, 0xF80); // -128 + ADDIW(x7, xZR, 0x80); // 128 + } else { + ADDIW(x6, xZR, 0x100); // 256 + } if (gd == ed) { ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); LD(x3, wback, fixedaddress + 0); @@ -731,9 +737,9 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, for (int i = 0; i < 8; ++i) { LH(x3, vback, vxoffset + i * 2); if (opcode == 0x63) - SAT8(x3, x6); + SATw(x3, x6, x7); else - SATU8(x3, x6); + SATUw(x3, x6); SB(x3, gback, gdoffset + i); } if (vex.v == ed) { @@ -743,9 +749,9 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, for (int i = 0; i < 8; ++i) { LH(x3, wback, fixedaddress + i * 2); if (opcode == 0x63) - SAT8(x3, x6); + SATw(x3, x6, x7); else - SATU8(x3, x6); + SATUw(x3, x6); SB(x3, gback, gdoffset + 8 + i); } } @@ -763,9 +769,9 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, for (int i = 0; i < 8; ++i) { LH(x3, vback, vyoffset + i * 2); if (opcode == 0x63) - SAT8(x3, x6); + SATw(x3, x6, x7); else - SATU8(x3, x6); + SATUw(x3, x6); SB(x3, gback, gyoffset + i); } if (vex.v == ed) { @@ -775,9 +781,9 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, for (int i = 0; i < 8; ++i) { LH(x3, wback, fixedaddress + i * 2); if (opcode == 0x63) - SAT8(x3, x6); + SATw(x3, x6, x7); else - SATU8(x3, x6); + SATUw(x3, x6); SB(x3, gback, gyoffset + 8 + i); } } @@ -948,6 +954,8 @@ 
uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, GETGY(); GETVX(); GETVY(); + LUI(x6, 0xFFFF8); // -32768 + LUI(x7, 0x8); // 32768 if (gd == ed) { ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); LD(x3, wback, fixedaddress + 0); @@ -959,7 +967,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, } for (int i = 0; i < 4; ++i) { LW(x3, vback, vxoffset + i * 4); - SAT16(x3, x6); + SATw(x3, x6, x7); SH(x3, gback, gdoffset + i * 2); } if (vex.v == ed) { @@ -968,7 +976,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, } else { for (int i = 0; i < 4; ++i) { LW(x3, wback, fixedaddress + i * 4); - SAT16(x3, x6); + SATw(x3, x6, x7); SH(x3, gback, gdoffset + (4 + i) * 2); } } @@ -985,7 +993,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, } for (int i = 0; i < 4; ++i) { LW(x3, vback, vyoffset + i * 4); - SAT16(x3, x6); + SATw(x3, x6, x7); SH(x3, gback, gyoffset + i * 2); } if (vex.v == ed) { @@ -994,7 +1002,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, } else { for (int i = 0; i < 4; ++i) { LW(x3, wback, fixedaddress + i * 4); - SAT16(x3, x6); + SATw(x3, x6, x7); SH(x3, gback, gyoffset + (4 + i) * 2); } } @@ -2141,6 +2149,118 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, } else YMM0(gd); break; + case 0xE4: + INST_NAME("VPMULHUW Gx, Vx, Ex"); + nextop = F8; + GETEX(x1, 0, vex.l ? 
30 : 14); + GETGX(); + GETGY(); + GETVX(); + GETVY(); + for (int i = 0; i < 8; ++i) { + LHU(x3, vback, vxoffset + i * 2); + LHU(x4, wback, fixedaddress + i * 2); + MULW(x3, x3, x4); + SRLIW(x3, x3, 16); + SH(x3, gback, gdoffset + i * 2); + } + if (vex.l) { + GETEY(); + for (int i = 0; i < 8; ++i) { + LHU(x3, vback, vyoffset + i * 2); + LHU(x4, wback, fixedaddress + i * 2); + MULW(x3, x3, x4); + SRLIW(x3, x3, 16); + SH(x3, gback, gyoffset + i * 2); + } + } else + YMM0(gd); + break; + case 0xE5: + INST_NAME("VPMULHW Gx, Vx, Ex"); + nextop = F8; + GETEX(x1, 0, vex.l ? 30 : 14); + GETGX(); + GETGY(); + GETVX(); + GETVY(); + for (int i = 0; i < 8; ++i) { + LH(x3, vback, vxoffset + i * 2); + LH(x4, wback, fixedaddress + i * 2); + MULW(x3, x3, x4); + SRAIW(x3, x3, 16); + SH(x3, gback, gdoffset + i * 2); + } + if (vex.l) { + GETEY(); + for (int i = 0; i < 8; ++i) { + LH(x3, vback, vyoffset + i * 2); + LH(x4, wback, fixedaddress + i * 2); + MULW(x3, x3, x4); + SRAIW(x3, x3, 16); + SH(x3, gback, gyoffset + i * 2); + } + } else + YMM0(gd); + break; + case 0xE8: + INST_NAME("VPSUBSB Gx, Vx, Ex"); + nextop = F8; + GETEX(x1, 0, vex.l ? 31 : 15); + GETGX(); + GETGY(); + GETVX(); + GETVY(); + ADDIW(x6, xZR, 0xF80); // -128 + ADDIW(x7, xZR, 0x80); // 128 + for (int i = 0; i < 16; ++i) { + LB(x3, vback, vxoffset + i); + LB(x4, wback, fixedaddress + i); + SUBW(x3, x3, x4); + SATw(x3, x6, x7); + SB(x3, gback, gdoffset + i); + } + if (vex.l) { + GETEY(); + for (int i = 0; i < 16; ++i) { + LB(x3, vback, vyoffset + i); + LB(x4, wback, fixedaddress + i); + SUBW(x3, x3, x4); + SATw(x3, x6, x7); + SB(x3, gback, gyoffset + i); + } + } else + YMM0(gd); + break; + case 0xE9: + INST_NAME("VPSUBSW Gx, Vx, Ex"); + nextop = F8; + GETEX(x1, 0, vex.l ? 
30 : 14); + GETGX(); + GETGY(); + GETVX(); + GETVY(); + LUI(x6, 0xFFFF8); // -32768 + LUI(x7, 0x8); // 32768 + for (int i = 0; i < 8; ++i) { + LH(x3, vback, vxoffset + i * 2); + LH(x4, wback, fixedaddress + i * 2); + SUBW(x3, x3, x4); + SATw(x3, x6, x7); + SH(x3, gback, gdoffset + i * 2); + } + if (vex.l) { + GETEY(); + for (int i = 0; i < 8; ++i) { + LH(x3, vback, vyoffset + i * 2); + LH(x4, wback, fixedaddress + i * 2); + SUBW(x3, x3, x4); + SATw(x3, x6, x7); + SH(x3, gback, gyoffset + i * 2); + } + } else + YMM0(gd); + break; case 0xEF: INST_NAME("VPXOR Gx, Vx, Ex"); nextop = F8; diff --git a/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c b/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c index 3c81181a..40ec2518 100644 --- a/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c +++ b/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c @@ -110,6 +110,8 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i GETVX(); GETGY(); GETVY(); + LUI(x6, 0xFFFF8); // -32768 + LUI(x7, 0x8); // 32768 if (gd == ed) { ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); LD(x3, wback, fixedaddress + 0); @@ -124,7 +126,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i LH(x3, vback, vxoffset + 2 * (i * 2 + 0)); LH(x4, vback, vxoffset + 2 * (i * 2 + 1)); ADDW(x3, x3, x4); - if (opcode == 0x03) SAT16(x3, x6); + if (opcode == 0x03) SATw(x3, x6, x7); SH(x3, gback, gdoffset + 2 * i); } if (MODREG && ed == vex.v) { @@ -137,7 +139,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); LH(x4, wback, fixedaddress + 2 * (i * 2 + 1)); ADDW(x3, x3, x4); - if (opcode == 0x03) SAT16(x3, x6); + if (opcode == 0x03) SATw(x3, x6, x7); SH(x3, gback, gdoffset + 2 * (4 + i)); } } @@ -157,7 +159,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i LH(x3, vback, vyoffset + 2 * (i * 2 + 0)); LH(x4, vback, vyoffset + 2 * (i * 2 + 1)); ADDW(x3, x3, x4); - if (opcode 
== 0x03) SAT16(x3, x6); + if (opcode == 0x03) SATw(x3, x6, x7); SH(x3, gback, gyoffset + 2 * i); } if (MODREG && ed == vex.v) { @@ -170,7 +172,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); LH(x4, wback, fixedaddress + 2 * (i * 2 + 1)); ADDW(x3, x3, x4); - if (opcode == 0x03) SAT16(x3, x6); + if (opcode == 0x03) SATw(x3, x6, x7); SH(x3, gback, gyoffset + 2 * (4 + i)); } } @@ -256,15 +258,17 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i GETVX(); GETGY(); GETVY(); + LUI(x6, 0xFFFF8); // -32768 + LUI(x7, 0x8); // 32768 for (int i = 0; i < 8; ++i) { LBU(x3, vback, vxoffset + i * 2); LB(x4, wback, fixedaddress + i * 2); - MUL(x7, x3, x4); + MUL(x2, x3, x4); LBU(x3, vback, vxoffset + i * 2 + 1); LB(x4, wback, fixedaddress + i * 2 + 1); MUL(x3, x3, x4); - ADD(x3, x3, x7); - SAT16(x3, x6); + ADD(x3, x3, x2); + SATw(x3, x6, x7); SH(x3, gback, gdoffset + i * 2); } if (vex.l) { @@ -272,12 +276,12 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i for (int i = 0; i < 8; ++i) { LBU(x3, vback, vyoffset + i * 2); LB(x4, wback, fixedaddress + i * 2); - MUL(x7, x3, x4); + MUL(x2, x3, x4); LBU(x3, vback, vyoffset + i * 2 + 1); LB(x4, wback, fixedaddress + i * 2 + 1); MUL(x3, x3, x4); - ADD(x3, x3, x7); - SAT16(x3, x6); + ADD(x3, x3, x2); + SATw(x3, x6, x7); SH(x3, gback, gyoffset + i * 2); } } else @@ -295,6 +299,8 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i GETVX(); GETGY(); GETVY(); + LUI(x6, 0xFFFF8); // -32768 + LUI(x7, 0x8); // 32768 if (gd == ed) { ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); LD(x3, wback, fixedaddress + 0); @@ -309,7 +315,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i LH(x3, vback, vxoffset + 2 * (i * 2 + 0)); LH(x4, vback, vxoffset + 2 * (i * 2 + 1)); SUBW(x3, x3, x4); - if (opcode == 0x07) SAT16(x3, x6); + if (opcode == 0x07) 
SATw(x3, x6, x7); SH(x3, gback, gdoffset + 2 * i); } if (MODREG && ed == vex.v) { @@ -322,7 +328,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); LH(x4, wback, fixedaddress + 2 * (i * 2 + 1)); SUBW(x3, x3, x4); - if (opcode == 0x07) SAT16(x3, x6); + if (opcode == 0x07) SATw(x3, x6, x7); SH(x3, gback, gdoffset + 2 * (4 + i)); } } @@ -342,7 +348,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i LH(x3, vback, vyoffset + 2 * (i * 2 + 0)); LH(x4, vback, vyoffset + 2 * (i * 2 + 1)); SUBW(x3, x3, x4); - if (opcode == 0x07) SAT16(x3, x6); + if (opcode == 0x07) SATw(x3, x6, x7); SH(x3, gback, gyoffset + 2 * i); } if (MODREG && ed == vex.v) { @@ -355,7 +361,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i LH(x3, wback, fixedaddress + 2 * (i * 2 + 0)); LH(x4, wback, fixedaddress + 2 * (i * 2 + 1)); SUBW(x3, x3, x4); - if (opcode == 0x07) SAT16(x3, x6); + if (opcode == 0x07) SATw(x3, x6, x7); SH(x3, gback, gyoffset + 2 * (4 + i)); } } diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h index 03bf3cf6..d59e07f2 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.h +++ b/src/dynarec/rv64/dynarec_rv64_helper.h @@ -1957,42 +1957,20 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, #define PURGE_YMM() // TODO: zbb? 
-#define SAT8(reg, s) \ - do { \ - ADDIW(s, xZR, 0xF80); /* -128 */ \ - BGE(reg, s, 4 + 4); \ - MV(reg, s); \ - ADDIW(s, xZR, 0x80); /* 128 */ \ - BLT(reg, s, 4 + 4); \ - ADDIW(reg, s, -1); \ +#define SATw(reg, min, maxp1) \ + do { \ + BGE(reg, min, 4 + 4); \ + MV(reg, min); \ + BLT(reg, maxp1, 4 + 4); \ + ADDIW(reg, maxp1, -1); \ } while (0) -#define SATU8(reg, s) \ - do { \ - ADDIW(s, xZR, 0x100); /* 256 */ \ - BGE(reg, xZR, 4 + 4); \ - MV(reg, xZR); \ - BLT(reg, s, 4 + 4); \ - ADDIW(reg, s, -1); \ - } while (0) - -#define SAT16(reg, s) \ - do { \ - LUI(s, 0xFFFF8); /* -32768 */ \ - BGE(reg, s, 4 + 4); \ - MV(reg, s); \ - LUI(s, 0x8); /* 32768 */ \ - BLT(reg, s, 4 + 4); \ - ADDIW(reg, s, -1); \ - } while (0) - -#define SATU16(reg, s) \ - do { \ - LUI(s, 0x10); /* 65536 */ \ - BGE(reg, xZR, 4 + 4); \ - MV(reg, xZR); \ - BLT(reg, s, 4 + 4); \ - ADDIW(reg, s, -1); \ +#define SATUw(reg, maxu) \ + do { \ + BGE(reg, xZR, 4 + 4); \ + MV(reg, xZR); \ + BLT(reg, maxu, 4 + 4); \ + ADDIW(reg, maxu, -1); \ } while (0) #define FAST_8BIT_OPERATION(dst, src, s1, OP) \ |