diff options
| author | Yang Liu <liuyang22@iscas.ac.cn> | 2023-04-20 14:52:40 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-04-20 08:52:40 +0200 |
| commit | f280c6498056dc747089d07725f7f6edd03efd24 (patch) | |
| tree | 9f3f4d742590ff544e5e83d460f2446ae5581f12 /src | |
| parent | d0ae6a9a7da7d77f17b97b41c14951a4af0f9c70 (diff) | |
| download | box64-f280c6498056dc747089d07725f7f6edd03efd24.tar.gz box64-f280c6498056dc747089d07725f7f6edd03efd24.zip | |
[RV64_DYNAREC] Added more opcodes and some fixes (#716)
* Fixed various bugs
* Added 66 0F 38 01 PHADDW opcode
* Added 66 0F 38 02 PHADDD opcode
* Added 66 0F EC PADDSB opcode
* Some small optimizations
Diffstat (limited to 'src')
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_660f.c | 142 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_f20f.c | 2 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_f30f.c | 49 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.c | 4 | ||||
| -rw-r--r-- | src/dynarec/rv64/dynarec_rv64_helper.h | 4 |
5 files changed, 148 insertions, 53 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_660f.c b/src/dynarec/rv64/dynarec_rv64_660f.c index 20a5b441..962ddce3 100644 --- a/src/dynarec/rv64/dynarec_rv64_660f.c +++ b/src/dynarec/rv64/dynarec_rv64_660f.c @@ -194,6 +194,64 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int SB(x4, gback, i); } break; + case 0x01: + INST_NAME("PHADDW Gx, Ex"); + nextop = F8; + GETGX(x1); + for (int i=0; i<4; ++i) { + // GX->sw[i] = GX->sw[i*2+0]+GX->sw[i*2+1]; + LH(x3, gback, 2*(i*2+0)); + LH(x4, gback, 2*(i*2+1)); + ADDW(x3, x3, x4); + SH(x3, gback, 2*i); + } + if (MODREG && gd==(nextop&7)+(rex.b<<3)) { + // GX->q[1] = GX->q[0]; + LD(x3, gback, 0); + SD(x3, gback, 8); + } else { + GETEX(x2, 0); + for (int i=0; i<4; ++i) { + // GX->sw[4+i] = EX->sw[i*2+0] + EX->sw[i*2+1]; + LH(x3, wback, fixedaddress+2*(i*2+0)); + LH(x4, wback, fixedaddress+2*(i*2+1)); + ADDW(x3, x3, x4); + SH(x3, gback, 2*(4+i)); + } + } + break; + case 0x02: + INST_NAME("PHADDD Gx, Ex"); + nextop = F8; + GETGX(x1); + // GX->sd[0] += GX->sd[1]; + LW(x3, gback, 0*4); + LW(x4, gback, 1*4); + ADDW(x3, x3, x4); + SW(x3, gback, 0*4); + // GX->sd[1] = GX->sd[2] + GX->sd[3]; + LW(x3, gback, 2*4); + LW(x4, gback, 3*4); + ADDW(x3, x3, x4); + SW(x3, gback, 1*4); + if (MODREG && gd==(nextop&7)+(rex.b<<3)) { + // GX->q[1] = GX->q[0]; + LD(x3, gback, 0); + SD(x3, gback, 8); + } else { + GETEX(x2, 0); + // GX->sd[2] = EX->sd[0] + EX->sd[1]; + LW(x3, wback, fixedaddress+0*4); + LW(x4, wback, fixedaddress+1*4); + ADDW(x3, x3, x4); + SW(x3, gback, 2*4); + // GX->sd[3] = EX->sd[2] + EX->sd[3]; + LW(x3, wback, fixedaddress+2*4); + LW(x4, wback, fixedaddress+3*4); + ADDW(x3, x3, x4); + SW(x3, gback, 3*4); + } + break; case 0x17: INST_NAME("PTEST Gx, Ex"); nextop = F8; @@ -245,29 +303,43 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int DEFAULT; } break; - // case 0x3A: // these are some more SSSE3+ opcodes - // opcode = F8; - // switch(opcode) { - // case 0x0B: - // 
INST_NAME("ROUNDSD Gx, Ex, Ib"); - // nextop = F8; - // GETEXSD(d0, 0); - // GETGXSD_empty(v0); - // u8 = F8; - // if(u8&4) { - // u8 = sse_setround(dyn, ninst, x4, x2); - // FCVTLD(x5, d0, RD_DYN); - // FCVTDL(v0, x5, RD_DYN); - // x87_restoreround(dyn, ninst, u8); - // } else { - // FCVTLD(x5, d0, round_round[u8&3]); - // FCVTDL(v0, x5, round_round[u8&3]); - // } - // break; - // default: - // DEFAULT; - // } - // break; + case 0x3A: // these are some more SSSE3+ opcodes + opcode = F8; + switch(opcode) { + case 0x0B: + INST_NAME("ROUNDSD Gx, Ex, Ib"); + nextop = F8; + GETEXSD(d0, 0); + GETGXSD_empty(v0); + d1 = fpu_get_scratch(dyn); + u8 = F8; + FEQD(x2, d0, d0); + BNEZ_MARK(x2); + FADDD(v0, d0, d0); + B_NEXT_nocond; + MARK; // d0 is not nan + FABSD(v0, d0); + MOV64x(x3, 1ULL << __DBL_MANT_DIG__); + FCVTDL(d1, x3, RD_RTZ); + FLTD(x3, v0, d1); + BNEZ_MARK2(x3); + if (v0!=d0) FMVD(v0, d0); + B_NEXT_nocond; + MARK2; + if(u8&4) { + u8 = sse_setround(dyn, ninst, x4, x2); + FCVTLD(x5, d0, RD_DYN); + FCVTDL(v0, x5, RD_DYN); + x87_restoreround(dyn, ninst, u8); + } else { + FCVTLD(x5, d0, round_round[u8&3]); + FCVTDL(v0, x5, round_round[u8&3]); + } + break; + default: + DEFAULT; + } + break; case 0x54: INST_NAME("ANDPD Gx, Ex"); @@ -1028,6 +1100,30 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int GETEX(x2, 0); SSE_LOOP_Q(x3, x4, OR(x3, x3, x4)); break; + case 0xEC: + INST_NAME("PADDSB Gx,Ex"); + nextop = F8; + GETGX(x1); + GETEX(x2, 0); + for(int i=0; i<16; ++i) { + // tmp16s = (int16_t)GX->sb[i] + EX->sb[i]; + // GX->sb[i] = (tmp16s>127)?127:((tmp16s<-128)?-128:tmp16s); + LB(x3, gback, i); + LB(x4, wback, fixedaddress+i); + ADDW(x3, x3, x4); + SLLIW(x3, x3, 16); + SRAIW(x3, x3, 16); + ADDI(x4, xZR, 0x7f); + BLT(x3, x4, 12); // tmp16s>127? + SB(x4, gback, i); + J(24); // continue + ADDI(x4, xZR, 0xf80); + BLT(x4, x3, 12); // tmp16s<-128? 
+ SB(x4, gback, i); + J(8); // continue + SB(x3, gback, i); + } + break; case 0xEE: INST_NAME("PMAXSW Gx,Ex"); nextop = F8; diff --git a/src/dynarec/rv64/dynarec_rv64_f20f.c b/src/dynarec/rv64/dynarec_rv64_f20f.c index fe902ac1..c8976aff 100644 --- a/src/dynarec/rv64/dynarec_rv64_f20f.c +++ b/src/dynarec/rv64/dynarec_rv64_f20f.c @@ -130,7 +130,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int FSFLAGSI(xZR); // // reset all bits } u8 = sse_setround(dyn, ninst, x2, x3); - FCVTLDxw(gd, v0, RD_RM); + FCVTLDxw(gd, v0, RD_DYN); x87_restoreround(dyn, ninst, u8); if(!box64_dynarec_fastround) { FRFLAGS(x5); // get back FPSR to check the IOC bit diff --git a/src/dynarec/rv64/dynarec_rv64_f30f.c b/src/dynarec/rv64/dynarec_rv64_f30f.c index e942e9b5..c3d066c8 100644 --- a/src/dynarec/rv64/dynarec_rv64_f30f.c +++ b/src/dynarec/rv64/dynarec_rv64_f30f.c @@ -228,24 +228,23 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x5B: INST_NAME("CVTTPS2DQ Gx, Ex"); nextop = F8; - GETEX(x5, 0) ; - GETGX(x6); + GETGX(x1); + GETEX(x2, 0); v0 = fpu_get_scratch(dyn); - v1 = fpu_get_scratch(dyn); - q0 = fpu_get_scratch(dyn); - q1 = fpu_get_scratch(dyn); - FLW(v0, x5, 0); - FLW(v1, x5, 4); - FLW(q0, x5, 8); - FLW(q1, x5, 12); - FCVTWS(x1, v0, RD_RTZ); - FCVTWS(x2, v1, RD_RTZ); - FCVTWS(x3, q0, RD_RTZ); - FCVTWS(x4, q1, RD_RTZ); - SW(x1, x6, 0); - SW(x2, x6, 4); - SW(x3, x6, 8); - SW(x4, x6, 12); + for(int i=0; i<4; ++i) { + if(!box64_dynarec_fastround) { + FSFLAGSI(xZR); // reset all bits + } + FLW(v0, wback, fixedaddress+i*4); + FCVTWS(x3, v0, RD_RTZ); + if(!box64_dynarec_fastround) { + FRFLAGS(x5); // get back FPSR to check the IOC bit + ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF)); + BEQZ(x5, 8); + MOV32w(x3, 0x80000000); + } + SW(x3, gback, i*4); + } break; case 0xBC: INST_NAME("TZCNT Gd, Ed"); @@ -379,16 +378,16 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0xE6: INST_NAME("CVTDQ2PD 
Gx, Ex"); nextop = F8; - GETEX(x1, 0); - GETGX(x2); + GETGX(x1); + GETEX(x2, 0); q0 = fpu_get_scratch(dyn); q1 = fpu_get_scratch(dyn); - LW(x3, x1, 0); - LW(x4, x1, 4); - FCVTDW(q0, x3, RD_DYN); - FCVTDW(q1, x4, RD_DYN); - FSD(q0, x2, 0); - FSD(q1, x2, 8); + LW(x3, wback, fixedaddress+0); + LW(x4, wback, fixedaddress+4); + FCVTDW(q0, x3, RD_RTZ); + FCVTDW(q1, x4, RD_RTZ); + FSD(q0, gback, 0); + FSD(q1, gback, 8); break; default: diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c index a395871d..a0f88502 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.c +++ b/src/dynarec/rv64/dynarec_rv64_helper.c @@ -998,7 +998,7 @@ int x87_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2) ADDI(s2, xZR, 3); BEQ(s1, s2, 12); ADDI(s1, s1, 1); - BEQ(xZR, xZR, 8); + J(8); ADDI(s1, xZR, 1); // transform done (is there a faster way?) FSRM(s1, s1); // exange RM with current @@ -1020,7 +1020,7 @@ int sse_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2) ADDI(s2, xZR, 3); BEQ(s1, s2, 12); ADDI(s1, s1, 1); - BEQ(xZR, xZR, 8); + J(8); ADDI(s1, xZR, 1); // transform done (is there a faster way?) FSRM(s1, s1); // exange RM with current diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h index 95fc3f87..76792594 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.h +++ b/src/dynarec/rv64/dynarec_rv64_helper.h @@ -281,10 +281,10 @@ gd = ((nextop&0x38)>>3)+(rex.r<<3); \ a = sse_get_reg(dyn, ninst, x1, gd, 0) -// Get GX as a Double (might use x1), no fetching old value +// Get GX as a Double (might use x2), no fetching old value #define GETGXSD_empty(a) \ gd = ((nextop&0x38)>>3)+(rex.r<<3); \ - a = sse_get_reg_empty(dyn, ninst, x1, gd, 0) + a = sse_get_reg_empty(dyn, ninst, x2, gd, 0) // Get Ex as a single, not a quad (warning, x1 get used, x2 might too) #define GETEXSS(a, D) \ |