diff options
Diffstat (limited to 'src/dynarec/rv64')
39 files changed, 5258 insertions, 1154 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_00.c b/src/dynarec/rv64/dynarec_rv64_00.c index f5bd8af7..5f529fb7 100644 --- a/src/dynarec/rv64/dynarec_rv64_00.c +++ b/src/dynarec/rv64/dynarec_rv64_00.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include <signal.h> #include <assert.h> @@ -26,8 +25,6 @@ #include "dynarec_rv64_functions.h" #include "dynarec_rv64_helper.h" -int isSimpleWrapper(wrapper_t fun); - uintptr_t dynarec64_00(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) { uint8_t opcode; diff --git a/src/dynarec/rv64/dynarec_rv64_00_0.c b/src/dynarec/rv64/dynarec_rv64_00_0.c index 0320107d..a0ff3746 100644 --- a/src/dynarec/rv64/dynarec_rv64_00_0.c +++ b/src/dynarec/rv64/dynarec_rv64_00_0.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include <signal.h> #include <assert.h> @@ -26,8 +25,6 @@ #include "dynarec_rv64_functions.h" #include "dynarec_rv64_helper.h" -int isSimpleWrapper(wrapper_t fun); - uintptr_t dynarec64_00_0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) { uint8_t nextop, opcode; @@ -178,7 +175,7 @@ uintptr_t dynarec64_00_0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int nextop = F8; GETGD; GETED(0); - emit_adc32(dyn, ninst, rex, ed, gd, x3, x4, x5); + emit_adc32(dyn, ninst, rex, ed, gd, x3, x4, x5, x6); WBACK; break; @@ -231,6 +228,15 @@ uintptr_t dynarec64_00_0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ANDI(xRAX, xRAX, ~0xff); OR(xRAX, xRAX, x1); break; + case 0x1D: + INST_NAME("SBB EAX, Id"); + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + i64 = F32S; + MOV64xw(x2, i64); + emit_sbb32(dyn, ninst, rex, xRAX, x2, x3, x4, x5); + break; + case 0x20: INST_NAME("AND Eb, Gb"); SETFLAGS(X_ALL, SF_SET_PENDING); diff --git a/src/dynarec/rv64/dynarec_rv64_00_1.c 
b/src/dynarec/rv64/dynarec_rv64_00_1.c index 3abb0444..54ca28f5 100644 --- a/src/dynarec/rv64/dynarec_rv64_00_1.c +++ b/src/dynarec/rv64/dynarec_rv64_00_1.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include <signal.h> #include <assert.h> @@ -53,7 +52,32 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int MAYUSE(cacheupd); switch(opcode) { - + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + INST_NAME("INC Reg (32bits)"); + SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING); + gd = xRAX + (opcode&7); + emit_inc32(dyn, ninst, rex, gd, x1, x2, x3, x4); + break; + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + INST_NAME("DEC Reg (32bits)"); + SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING); + gd = xRAX + (opcode&7); + emit_dec32(dyn, ninst, rex, gd, x1, x2, x3, x4); + break; case 0x50: case 0x51: case 0x52: @@ -64,8 +88,7 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x57: INST_NAME("PUSH reg"); gd = xRAX+(opcode&0x07)+(rex.b<<3); - SD(gd, xRSP, -8); - SUBI(xRSP, xRSP, 8); + PUSH1z(gd); break; case 0x58: case 0x59: @@ -77,31 +100,65 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x5F: INST_NAME("POP reg"); gd = xRAX+(opcode&0x07)+(rex.b<<3); - LD(gd, xRSP, 0); - if(gd!=xRSP) { - ADDI(xRSP, xRSP, 8); + POP1z(gd); + break; + + case 0x60: + if(rex.is32bits) { + INST_NAME("PUSHAD"); + AND(x1, xRSP, xMASK); + PUSH1_32(xRAX); + PUSH1_32(xRCX); + PUSH1_32(xRDX); + PUSH1_32(xRBX); + PUSH1_32(x1); + PUSH1_32(xRBP); + PUSH1_32(xRSI); + PUSH1_32(xRDI); + } else { + DEFAULT; + } + break; + case 0x61: + if(rex.is32bits) { + INST_NAME("POPAD"); + POP1_32(xRDI); + POP1_32(xRSI); + POP1_32(xRBP); + POP1_32(x1); + POP1_32(xRBX); + POP1_32(xRDX); + POP1_32(xRCX); + POP1_32(xRAX); + } else { + DEFAULT; } 
break; case 0x63: - INST_NAME("MOVSXD Gd, Ed"); - nextop = F8; - GETGD; - if(rex.w) { - if(MODREG) { // reg <= reg - ADDIW(gd, xRAX+(nextop&7)+(rex.b<<3), 0); - } else { // mem <= reg - SMREAD(); - addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); - LW(gd, ed, fixedaddress); - } + if(rex.is32bits) { + // this is ARPL opcode + DEFAULT; } else { - if(MODREG) { // reg <= reg - AND(gd, xRAX+(nextop&7)+(rex.b<<3), xMASK); - } else { // mem <= reg - SMREAD(); - addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); - LWU(gd, ed, fixedaddress); + INST_NAME("MOVSXD Gd, Ed"); + nextop = F8; + GETGD; + if(rex.w) { + if(MODREG) { // reg <= reg + ADDIW(gd, xRAX+(nextop&7)+(rex.b<<3), 0); + } else { // mem <= reg + SMREAD(); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + LW(gd, ed, fixedaddress); + } + } else { + if(MODREG) { // reg <= reg + AND(gd, xRAX+(nextop&7)+(rex.b<<3), xMASK); + } else { // mem <= reg + SMREAD(); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + LWU(gd, ed, fixedaddress); + } } } break; @@ -114,7 +171,9 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x66: addr = dynarec64_66(dyn, addr, ip, ninst, rex, rep, ok, need_epilog); break; - + case 0x67: + addr = dynarec64_67(dyn, addr, ip, ninst, rex, rep, ok, need_epilog); + break; case 0x68: INST_NAME("PUSH Id"); i64 = F32S; @@ -122,10 +181,10 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int MESSAGE(LOG_DUMP, "PUSH then RET, using indirect\n"); TABLE64(x3, addr-4); LW(x1, x3, 0); - PUSH1(x1); + PUSH1z(x1); } else { - MOV64x(x3, i64); - PUSH1(x3); + MOV64z(x3, i64); + PUSH1z(x3); } break; case 0x69: @@ -164,8 +223,8 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x6A: INST_NAME("PUSH Ib"); i64 = F8S; - MOV64x(x3, i64); - PUSH1(x3); + MOV64z(x3, i64); + 
PUSH1z(x3); break; case 0x6B: INST_NAME("IMUL Gd, Ed, Ib"); @@ -179,12 +238,12 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int // 64bits imul UFLAG_IF { MULH(x3, ed, x4); - MULW(gd, ed, x4); + MUL(gd, ed, x4); UFLAG_OP1(x3); UFLAG_RES(gd); UFLAG_DF(x3, d_imul64); } else { - MULxw(gd, ed, x4); + MUL(gd, ed, x4); } } else { // 32bits imul @@ -195,7 +254,7 @@ uintptr_t dynarec64_00_1(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int UFLAG_OP1(x3); UFLAG_DF(x3, d_imul32); } else { - MULxw(gd, ed, x4); + MULW(gd, ed, x4); } ZEROUP(gd); } diff --git a/src/dynarec/rv64/dynarec_rv64_00_2.c b/src/dynarec/rv64/dynarec_rv64_00_2.c index 6f0ef03e..20333f96 100644 --- a/src/dynarec/rv64/dynarec_rv64_00_2.c +++ b/src/dynarec/rv64/dynarec_rv64_00_2.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include <signal.h> #include <assert.h> @@ -26,8 +25,6 @@ #include "dynarec_rv64_functions.h" #include "dynarec_rv64_helper.h" -int isSimpleWrapper(wrapper_t fun); - uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) { uint8_t nextop, opcode; @@ -72,6 +69,15 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int emit_or8c(dyn, ninst, x1, u8, x2, x4, x5); EBBACK(x5, 0); break; + case 2: // ADC + INST_NAME("ADC Eb, Ib"); + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEB(x1, 1); + u8 = F8; + emit_adc8c(dyn, ninst, x1, u8, x2, x4, x5, x6); + EBBACK(x5, 0); + break; case 3: // SBB INST_NAME("SBB Eb, Ib"); READFLAGS(X_CF); @@ -148,7 +154,7 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int GETED((opcode==0x81)?4:1); if(opcode==0x81) i64 = F32S; else i64 = F8S; MOV64xw(x5, i64); - emit_adc32(dyn, ninst, rex, ed, x5, x3, x4, x6); + emit_adc32(dyn, ninst, rex, ed, x5, x3, x4, x6, x9); WBACK; break; case 3: // SBB @@ -297,7 +303,7 @@ 
uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ANDI(gd, gb1, 0xff); if(eb2) { MOV64x(x1, 0xffffffffffff00ffLL); - ANDI(x1, eb1, x1); + AND(x1, eb1, x1); SLLI(gd, gd, 8); OR(eb1, x1, gd); } else { @@ -316,7 +322,7 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int GETGD; if(MODREG) { // reg <= reg MVxw(xRAX+(nextop&7)+(rex.b<<3), gd); - } else { // mem <= reg + } else { // mem <= reg addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0); SDxw(gd, ed, fixedaddress); SMWRITELOCK(lock); @@ -391,15 +397,13 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int INST_NAME("LEA Gd, Ed"); nextop=F8; GETGD; - if(MODREG) { // reg <= reg? that's an invalid operation + if(MODREG) { // reg <= reg? that's an invalid operation DEFAULT; - } else { // mem <= reg - addr = geted(dyn, addr, ninst, nextop, &ed, gd, x1, &fixedaddress, rex, NULL, 0, 0); - if(gd!=ed) { // it's sometimes used as a 3 bytes NOP - MV(gd, ed); - } - if(!rex.w) { - ZEROUP(gd); //truncate the higher 32bits as asked + } else { // mem <= reg + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 0, 0); + MV(gd, ed); + if(!rex.w || rex.is32bits) { + ZEROUP(gd); // truncate the higher 32bits as asked } } break; @@ -421,17 +425,17 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int INST_NAME("POP Ed"); nextop = F8; if(MODREG) { - POP1(xRAX+(nextop&7)+(rex.b<<3)); + POP1z(xRAX+(nextop&7)+(rex.b<<3)); } else { - POP1(x2); // so this can handle POP [ESP] and maybe some variant too - addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0); + POP1z(x2); // so this can handle POP [ESP] and maybe some variant too + addr = geted(dyn, addr, ninst, nextop, &ed, x3, x1, &fixedaddress, rex, &lock, 1, 0); if(ed==xRSP) { - SD(x2, ed, fixedaddress); + SDz(x2, ed, fixedaddress); } else { // complicated to just allow a segfault 
that can be recovered correctly - SUB(xRSP, xRSP, 8); - SD(x2, ed, fixedaddress); - ADD(xRSP, xRSP, 8); + ADDIz(xRSP, xRSP, rex.is32bits?-4:-8); + SDz(x2, ed, fixedaddress); + ADDIz(xRSP, xRSP, rex.is32bits?4:8); } } break; @@ -473,39 +477,68 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ZEROUP(xRDX); } break; + case 0x9B: + INST_NAME("FWAIT"); + break; case 0x9C: INST_NAME("PUSHF"); + NOTEST(x1); READFLAGS(X_ALL); FLAGS_ADJUST_TO11(x3, xFlags, x2); - PUSH1(x3); + PUSH1z(x3); break; case 0x9D: INST_NAME("POPF"); SETFLAGS(X_ALL, SF_SET); - POP1(xFlags); + POP1z(xFlags); FLAGS_ADJUST_FROM11(xFlags, x2); MOV32w(x1, 0x3F7FD7); AND(xFlags, xFlags, x1); ORI(xFlags, xFlags, 0x2); SET_DFNONE(); + if(box64_wine) { // should this be done all the time? + ANDI(x1, xFlags, 1 << F_TF); + CBZ_NEXT(x1); + MOV64x(xRIP, addr); + STORE_XEMU_CALL(); + CALL(native_singlestep, -1); + ANDI(xFlags, xFlags, ~(1 << F_TF)); + } + break; + case 0x9F: + INST_NAME("LAHF"); + READFLAGS(X_CF|X_PF|X_AF|X_ZF|X_SF); + ANDI(x1, xFlags, 0xFF); + SLLI(x1, x1, 8); + MOV64x(x2, 0xffffffffffff00ffLL); + AND(xRAX, xRAX, x2); + OR(xRAX, xRAX, x1); + break; + case 0xA0: + INST_NAME("MOV AL,Ob"); + if(rex.is32bits) u64 = F32; else u64 = F64; + MOV64z(x1, u64); + LBU(x1, x1, 0); + ANDI(xRAX, xRAX, ~0xff); + OR(xRAX, xRAX, x1); break; case 0xA1: INST_NAME("MOV EAX,Od"); - u64 = F64; - MOV64x(x1, u64); + if(rex.is32bits) u64 = F32; else u64 = F64; + MOV64z(x1, u64); LDxw(xRAX, x1, 0); break; case 0xA2: INST_NAME("MOV Ob,AL"); - u64 = F64; - MOV64x(x1, u64); + if(rex.is32bits) u64 = F32; else u64 = F64; + MOV64z(x1, u64); SB(xRAX, x1, 0); SMWRITE(); break; case 0xA3: INST_NAME("MOV Od,EAX"); - u64 = F64; - MOV64x(x1, u64); + if(rex.is32bits) u64 = F32; else u64 = F64; + MOV64z(x1, u64); SDxw(xRAX, x1, 0); SMWRITE(); break; @@ -628,6 +661,31 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int MOV64xw(x2, i64); emit_test32(dyn, ninst, rex, xRAX, 
x2, x3, x4, x5); break; + case 0xAA: + if(rep) { + INST_NAME("REP STOSB"); + CBZ_NEXT(xRCX); + ANDI(x1, xFlags, 1<<F_DF); + BNEZ_MARK2(x1); + MARK; // Part with DF==0 + SB(xRAX, xRDI, 0); + ADDI(xRDI, xRDI, 1); + ADDI(xRCX, xRCX, -1); + BNEZ_MARK(xRCX); + B_NEXT_nocond; + MARK2; // Part with DF==1 + SB(xRAX, xRDI, 0); + ADDI(xRDI, xRDI, -1); + ADDI(xRCX, xRCX, -1); + BNEZ_MARK2(xRCX); + // done + } else { + INST_NAME("STOSB"); + GETDIR(x3, x1, 1); + SB(xRAX, xRDI, 0); + ADD(xRDI, xRDI, x3); + } + break; case 0xAB: if(rep) { INST_NAME("REP STOSD"); @@ -653,6 +711,82 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ADD(xRDI, xRDI, x3); } break; + case 0xAE: + switch (rep) { + case 1: + case 2: + if (rep==1) {INST_NAME("REPNZ SCASB");} else {INST_NAME("REPZ SCASB");} + MAYSETFLAGS(); + SETFLAGS(X_ALL, SF_SET_PENDING); + CBZ_NEXT(xRCX); + ANDI(x1, xRAX, 0xff); + ANDI(x2, xFlags, 1<<F_DF); + BNEZ_MARK2(x2); + MARK; // Part with DF==0 + LBU(x2, xRDI, 0); + ADDI(xRDI, xRDI, 1); + SUBI(xRCX, xRCX, 1); + if (rep==1) {BEQ_MARK3(x1, x2);} else {BNE_MARK3(x1, x2);} + BNE_MARK(xRCX, xZR); + B_MARK3_nocond; + MARK2; // Part with DF==1 + LBU(x2, xRDI, 0); + SUBI(xRDI, xRDI, 1); + SUBI(xRCX, xRCX, 1); + if (rep==1) {BEQ_MARK3(x1, x2);} else {BNE_MARK3(x1, x2);} + BNE_MARK2(xRCX, xZR); + MARK3; // end + emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5, x6); + break; + default: + INST_NAME("SCASB"); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETDIR(x3, x1, 1); + ANDI(x1, xRAX, 0xff); + LBU(x2, xRDI, 0); + ADD(xRDI, xRDI, x3); + emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5, x6); + break; + } + break; + case 0xAF: + switch (rep) { + case 1: + case 2: + if (rep==1) {INST_NAME("REPNZ SCASD");} else {INST_NAME("REPZ SCASD");} + MAYSETFLAGS(); + SETFLAGS(X_ALL, SF_SET_PENDING); + CBZ_NEXT(xRCX); + if (rex.w) {MV(x1, xRAX);} else {AND(x1, xRAX, xMASK);} + ANDI(x2, xFlags, 1<<F_DF); + BNEZ_MARK2(x2); + MARK; // Part with DF==0 + LDxw(x2, xRDI, 0); + ADDI(xRDI, xRDI, 
rex.w?8:4); + SUBI(xRCX, xRCX, 1); + if (rep==1) {BEQ_MARK3(x1, x2);} else {BNE_MARK3(x1, x2);} + BNE_MARK(xRCX, xZR); + B_MARK3_nocond; + MARK2; // Part with DF==1 + LDxw(x2, xRDI, 0); + SUBI(xRDI, xRDI, rex.w?8:4); + SUBI(xRCX, xRCX, 1); + if (rep==1) {BEQ_MARK3(x1, x2);} else {BNE_MARK3(x1, x2);} + BNE_MARK2(xRCX, xZR); + MARK3; // end + emit_cmp32(dyn, ninst, rex, x1, x2, x3, x4, x5, x6); + break; + default: + INST_NAME("SCASD"); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETDIR(x3, x1, rex.w?8:4); + AND(x1, xRAX, xMASK); + LDxw(x2, xRDI, 0); + ADD(xRDI, xRDI, x3); + emit_cmp32(dyn, ninst, rex, x1, x2, x3, x4, x5, x6); + break; + } + break; case 0xB0: case 0xB1: case 0xB2: diff --git a/src/dynarec/rv64/dynarec_rv64_00_3.c b/src/dynarec/rv64/dynarec_rv64_00_3.c index a19f3f68..2be53fc8 100644 --- a/src/dynarec/rv64/dynarec_rv64_00_3.c +++ b/src/dynarec/rv64/dynarec_rv64_00_3.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include <signal.h> #include <assert.h> @@ -27,6 +26,7 @@ #include "dynarec_rv64_helper.h" int isSimpleWrapper(wrapper_t fun); +int isRetX87Wrapper(wrapper_t fun); uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) { @@ -66,6 +66,16 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int CALL_(rol8, ed, x3); EBBACK(x5, 0); break; + case 1: + INST_NAME("ROR Eb, Ib"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + SETFLAGS(X_OF|X_CF, SF_SET); + GETEB(x1, 1); + u8 = F8; + MOV32w(x2, u8); + CALL_(ror8, ed, x3); + EBBACK(x5, 0); + break; case 4: case 6: INST_NAME("SHL Eb, Ib"); @@ -187,7 +197,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int } BARRIER(BARRIER_FLOAT); i32 = F16; - retn_to_epilog(dyn, ninst, i32); + retn_to_epilog(dyn, ninst, rex, i32); *need_epilog = 0; *ok = 0; break; @@ -198,7 +208,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* 
dyn, uintptr_t addr, uintptr_t ip, int READFLAGS(X_PEND); // so instead, force the deferred flags, so it's not too slow, and flags are not lost } BARRIER(BARRIER_FLOAT); - ret_to_epilog(dyn, ninst); + ret_to_epilog(dyn, ninst, rex); *need_epilog = 0; *ok = 0; break; @@ -219,7 +229,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int if (eb2) { // load a mask to x3 (ffffffffffff00ff) - LUI(x3, 0xffffffffffff0); + LUI(x3, 0xffff0); ORI(x3, x3, 0xff); // apply mask AND(eb1, eb1, x3); @@ -270,8 +280,8 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0xC9: INST_NAME("LEAVE"); - MV(xRSP, xRBP); - POP1(xRBP); + MVz(xRSP, xRBP); + POP1z(xRBP); break; case 0xCC: @@ -298,6 +308,9 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int // disabling isSimpleWrapper because all signed value less than 64bits needs to be sign extended // and return value needs to be cleanned up tmp = 0;//isSimpleWrapper(*(wrapper_t*)(addr)); + if(isRetX87Wrapper(*(wrapper_t*)(addr))) + // return value will be on the stack, so the stack depth needs to be updated + x87_purgecache(dyn, ninst, 0, x3, x1, x4); if(tmp<0 || tmp>1) tmp=0; //TODO: removed when FP is in place if((box64_log<2 && !cycle_log) && tmp) { @@ -336,6 +349,39 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int #endif } break; + case 0xCD: + u8 = F8; + if (box64_wine && u8 == 0x2D) { + INST_NAME("INT 2D"); + // lets do nothing + MESSAGE(LOG_INFO, "INT 2D Windows anti-debug hack\n"); + } else if (u8 == 0x80) { + INST_NAME("32bits SYSCALL"); + NOTEST(x1); + SMEND(); + GETIP(addr); + STORE_XEMU_CALL(); + CALL_S(x86Syscall, -1); + LOAD_XEMU_CALL(); + TABLE64(x3, addr); // expected return address + BNE_MARK(xRIP, x3); + LW(x1, xEmu, offsetof(x64emu_t, quit)); + BEQ_NEXT(x1, xZR); + MARK; + LOAD_XEMU_REM(); + jump_to_epilog(dyn, 0, xRIP, ninst); + } else { + INST_NAME("INT n"); + SETFLAGS(X_ALL, SF_SET); // Hack 
to set flags in "don't care" state + GETIP(ip); + STORE_XEMU_CALL(); + CALL(native_priv, -1); + LOAD_XEMU_CALL(); + jump_to_epilog(dyn, 0, xRIP, ninst); + *need_epilog = 0; + *ok = 0; + } + break; case 0xCF: INST_NAME("IRET"); SETFLAGS(X_ALL, SF_SET); // Not a hack, EFLAGS are restored @@ -348,6 +394,24 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0xD2: // TODO: Jump if CL is 0 nextop = F8; switch((nextop>>3)&7) { + case 0: + if(opcode==0xD0) { + INST_NAME("ROL Eb, 1"); + MOV32w(x2, 1); + } else { + INST_NAME("ROL Eb, CL"); + ANDI(x2, xRCX, 7); + } + SETFLAGS(X_OF|X_CF, SF_PENDING); + GETEB(x1, 0); + UFLAG_OP12(ed, x2); + SLL(x3, ed, x2); + SRLI(x4, x3, 8); + OR(ed, x3, x4); + EBBACK(x5, 1); + UFLAG_RES(ed); + UFLAG_DF(x3, d_rol8); + break; case 1: if(opcode==0xD0) { INST_NAME("ROR Eb, 1"); @@ -367,6 +431,23 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int UFLAG_RES(ed); UFLAG_DF(x3, d_ror8); break; + case 4: + case 6: + if(opcode==0xD0) { + INST_NAME("SHL Eb, 1"); + MOV32w(x2, 1); + } else { + INST_NAME("SHL Eb, CL"); + ANDI(x2, xRCX, 7); + } + SETFLAGS(X_ALL, SF_PENDING); + GETEB(x1, 0); + UFLAG_OP12(ed, x2) + SLL(ed, ed, x2); + EBBACK(x5, 1); + UFLAG_RES(ed); + UFLAG_DF(x3, d_shl8); + break; case 5: if(opcode==0xD0) { INST_NAME("SHR Eb, 1"); @@ -422,6 +503,16 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int WBACK; if(!wback && !rex.w) ZEROUP(ed); break; + case 3: + INST_NAME("RCR Ed, 1"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + READFLAGS(X_CF); + SETFLAGS(X_OF|X_CF, SF_SET); + MOV32w(x2, 1); + GETEDW(x4, x1, 0); + CALL_(rex.w?((void*)rcr64):((void*)rcr32), ed, x4); + WBACK; + break; case 4: case 6: INST_NAME("SHL Ed, 1"); @@ -517,6 +608,12 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0xDB: addr = dynarec64_DB(dyn, addr, ip, ninst, rex, rep, ok, need_epilog); break; + case 0xDC: + addr = dynarec64_DC(dyn, 
addr, ip, ninst, rex, rep, ok, need_epilog); + break; + case 0xDD: + addr = dynarec64_DD(dyn, addr, ip, ninst, rex, rep, ok, need_epilog); + break; case 0xDE: addr = dynarec64_DE(dyn, addr, ip, ninst, rex, rep, ok, need_epilog); @@ -534,7 +631,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int #endif } #if STEP < 2 - if(isNativeCall(dyn, addr+i32, &dyn->insts[ninst].natcall, &dyn->insts[ninst].retn)) + if(!rex.is32bits && isNativeCall(dyn, addr+i32, &dyn->insts[ninst].natcall, &dyn->insts[ninst].retn)) tmp = dyn->insts[ninst].pass2choice = 3; else tmp = dyn->insts[ninst].pass2choice = 0; @@ -564,6 +661,9 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int tmp=0; // float paramters not ready! } else tmp=0; + if(dyn->insts[ninst].natcall && isRetX87Wrapper(*(wrapper_t*)(dyn->insts[ninst].natcall+2))) + // return value will be on the stack, so the stack depth needs to be updated + x87_purgecache(dyn, ninst, 0, x3, x1, x4); if((box64_log<2 && !cycle_log) && dyn->insts[ninst].natcall && tmp) { //GETIP(ip+3+8+8); // read the 0xCC call_n(dyn, ninst, *(void**)(dyn->insts[ninst].natcall+2+8), tmp); @@ -611,12 +711,13 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int *need_epilog = 0; *ok = 0; } - if(addr<0x100000000LL) { - MOV64x(x2, addr); + + if(rex.is32bits) { + MOV32w(x2, addr); } else { TABLE64(x2, addr); } - PUSH1(x2); + PUSH1z(x2); // TODO: Add support for CALLRET optim /*if(box64_dynarec_callret) { // Push actual return address @@ -636,16 +737,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int *ok = 0; *need_epilog = 0; } - if(addr+i32==0) { // self modifying code maybe? 
so use indirect address fetching - if(addr-4<0x100000000LL) { - MOV64x(x4, addr-4); - } else { - TABLE64(x4, addr-4); - } - LD(x4, x4, 0); - jump_to_next(dyn, 0, x4, ninst); - } else - jump_to_next(dyn, addr+i32, 0, ninst); + jump_to_next(dyn, addr+i32, 0, ninst); break; } break; @@ -659,11 +751,11 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int INST_NAME("JMP Ib"); i32 = F8S; } - JUMP(addr+i32, 0); + JUMP((uintptr_t)getAlternate((void*)(addr+i32)), 0); if(dyn->insts[ninst].x64.jmp_insts==-1) { // out of the block fpu_purgecache(dyn, ninst, 1, x1, x2, x3); - jump_to_next(dyn, addr+i32, 0, ninst); + jump_to_next(dyn, (uintptr_t)getAlternate((void*)(addr+i32)), 0, ninst); } else { // inside the block CacheTransform(dyn, ninst, CHECK_CACHE(), x1, x2, x3); @@ -681,6 +773,12 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0xF0: addr = dynarec64_F0(dyn, addr, ip, ninst, rex, rep, ok, need_epilog); break; + case 0xF5: + INST_NAME("CMC"); + READFLAGS(X_CF); + SETFLAGS(X_CF, SF_SUBSET); + XORI(xFlags, xFlags, 1<<F_CF); + break; case 0xF6: nextop = F8; switch((nextop>>3)&7) { @@ -716,8 +814,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int UFLAG_RES(x1); LUI(x2, 0xffff0); AND(xRAX, xRAX, x2); - SLLI(x1, x1, 48); - SRLI(x1, x1, 48); + ZEXTH(x1, x1); OR(xRAX, xRAX, x1); break; case 5: @@ -731,8 +828,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int UFLAG_RES(x1); LUI(x2, 0xffff0); AND(xRAX, xRAX, x2); - SLLI(x1, x1, 48); - SRLI(x1, x1, 48); + ZEXTH(x1, x1); OR(xRAX, xRAX, x1); break; case 6: @@ -840,9 +936,9 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int AND(xRAX, x2, xMASK); ZEROUP(xRDX); } else { - if(ninst - && dyn->insts[ninst-1].x64.addr - && *(uint8_t*)(dyn->insts[ninst-1].x64.addr)==0x31 + if(ninst + && dyn->insts[ninst-1].x64.addr + && *(uint8_t*)(dyn->insts[ninst-1].x64.addr)==0x31 && 
*(uint8_t*)(dyn->insts[ninst-1].x64.addr+1)==0xD2) { SET_DFNONE(); GETED(0); @@ -879,7 +975,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ZEROUP(xRDX); } else { if(ninst && dyn->insts - && dyn->insts[ninst-1].x64.addr + && dyn->insts[ninst-1].x64.addr && *(uint8_t*)(dyn->insts[ninst-1].x64.addr)==0x48 && *(uint8_t*)(dyn->insts[ninst-1].x64.addr+1)==0x99) { SET_DFNONE() @@ -970,7 +1066,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int break; case 2: // CALL Ed INST_NAME("CALL Ed"); - PASS2IF((box64_dynarec_safeflags>1) || + PASS2IF((box64_dynarec_safeflags>1) || ((ninst && dyn->insts[ninst-1].x64.set_flags) || ((ninst>1) && dyn->insts[ninst-2].x64.set_flags)), 1) { @@ -978,7 +1074,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int } else { SETFLAGS(X_ALL, SF_SET); //Hack to put flag in "don't care" state } - GETEDx(0); + GETEDz(0); if(box64_dynarec_callret && box64_dynarec_bigblock>1) { BARRIER(BARRIER_FULL); } else { @@ -1001,22 +1097,41 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int } STPx_S7_preindex(x4, xRIP, xSP, -16); }*/ - PUSH1(xRIP); + PUSH1z(xRIP); jump_to_next(dyn, 0, ed, ninst); break; case 4: // JMP Ed INST_NAME("JMP Ed"); READFLAGS(X_PEND); BARRIER(BARRIER_FLOAT); - GETEDx(0); + GETEDz(0); jump_to_next(dyn, 0, ed, ninst); *need_epilog = 0; *ok = 0; break; + case 5: // JMP FAR Ed + if(MODREG) { + DEFAULT; + } else { + INST_NAME("JMP FAR Ed"); + READFLAGS(X_PEND); + BARRIER(BARRIER_FLOAT); + SMREAD() + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 0, 0); + LDxw(x1, wback, 0); + ed = x1; + LHU(x3, wback, rex.w?8:4); + SW(x3, xEmu, offsetof(x64emu_t, segs[_CS])); + SW(xZR, xEmu, offsetof(x64emu_t, segs_serial[_CS])); + jump_to_epilog(dyn, 0, ed, ninst); + *need_epilog = 0; + *ok = 0; + } + break; case 6: // Push Ed INST_NAME("PUSH Ed"); - GETEDx(0); - PUSH1(ed); + GETEDz(0); + 
PUSH1z(ed); break; default: diff --git a/src/dynarec/rv64/dynarec_rv64_0f.c b/src/dynarec/rv64/dynarec_rv64_0f.c index a3d9efc1..5c8d7b81 100644 --- a/src/dynarec/rv64/dynarec_rv64_0f.c +++ b/src/dynarec/rv64/dynarec_rv64_0f.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -42,7 +41,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni int s0, s1; uint64_t tmp64u; int64_t j64; - int64_t fixedaddress; + int64_t fixedaddress, gdoffset; int unscaled; MAYUSE(wb2); MAYUSE(gback); @@ -113,24 +112,36 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni *ok = 0; break; + case 0x0D: + nextop = F8; + switch((nextop>>3)&7) { + case 1: + INST_NAME("PREFETCHW"); + // nop without Zicbom, Zicbop, Zicboz extensions + FAKEED; + break; + default: //??? + DEFAULT; + } + break; case 0x10: INST_NAME("MOVUPS Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); LD(x3, wback, fixedaddress+0); LD(x4, wback, fixedaddress+8); - SD(x3, gback, 0); - SD(x4, gback, 8); + SD(x3, gback, gdoffset+0); + SD(x4, gback, gdoffset+8); break; case 0x11: INST_NAME("MOVUPS Ex,Gx"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); - LD(x3, gback, 0); - LD(x4, gback, 8); + LD(x3, gback, gdoffset+0); + LD(x4, gback, gdoffset+8); SD(x3, wback, fixedaddress+0); SD(x4, wback, fixedaddress+8); if(!MODREG) @@ -140,10 +151,10 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni nextop = F8; if(MODREG) { INST_NAME("MOVHLPS Gx,Ex"); - GETGX(x1); + GETGX(); GETEX(x2, 0); LD(x3, wback, fixedaddress+8); - SD(x3, gback, 0); + SD(x3, gback, gdoffset+0); } else { INST_NAME("MOVLPS Gx,Ex"); GETEXSD(v0, 0); @@ -154,9 +165,9 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0x13: INST_NAME("MOVLPS Ex,Gx"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); - LD(x3, gback, 0); + LD(x3, gback, 
gdoffset+0); SD(x3, wback, fixedaddress+0); if(!MODREG) SMWRITE2(); @@ -164,28 +175,28 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0x14: INST_NAME("UNPCKLPS Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); - LWU(x5, gback, 1*4); + LWU(x5, gback, gdoffset+1*4); LWU(x3, wback, fixedaddress+0); LWU(x4, wback, fixedaddress+4); - SW(x4, gback, 3*4); - SW(x5, gback, 2*4); - SW(x3, gback, 1*4); + SW(x4, gback, gdoffset+3*4); + SW(x5, gback, gdoffset+2*4); + SW(x3, gback, gdoffset+1*4); break; case 0x15: INST_NAME("UNPCKHPS Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); LWU(x3, wback, fixedaddress+2*4); LWU(x4, wback, fixedaddress+3*4); - LWU(x5, gback, 2*4); - LWU(x6, gback, 3*4); - SW(x5, gback, 0*4); - SW(x3, gback, 1*4); - SW(x6, gback, 2*4); - SW(x4, gback, 3*4); + LWU(x5, gback, gdoffset+2*4); + LWU(x6, gback, gdoffset+3*4); + SW(x5, gback, gdoffset+0*4); + SW(x3, gback, gdoffset+1*4); + SW(x6, gback, gdoffset+2*4); + SW(x4, gback, gdoffset+3*4); break; case 0x16: nextop = F8; @@ -195,17 +206,17 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni INST_NAME("MOVHPS Gx,Ex"); SMREAD(); } - GETGX(x1); + GETGX(); GETEX(x2, 0); LD(x4, wback, fixedaddress+0); - SD(x4, gback, 8); + SD(x4, gback, gdoffset+8); break; case 0x17: INST_NAME("MOVHPS Ex,Gx"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); - LD(x4, gback, 8); + LD(x4, gback, gdoffset+8); SD(x4, wback, fixedaddress+0); if(!MODREG) SMWRITE2(); @@ -217,16 +228,11 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni } else switch((nextop>>3)&7) { case 0: - DEFAULT; - break; case 1: - DEFAULT; - break; case 2: - DEFAULT; - break; case 3: - DEFAULT; + INST_NAME("PREFETCHh Ed"); + FAKEED; break; default: INST_NAME("NOP (multibyte)"); @@ -243,14 +249,14 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0x28: INST_NAME("MOVAPS Gx,Ex"); nextop = F8; - 
GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_MV_Q(x3); break; case 0x29: INST_NAME("MOVAPS Ex,Gx"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_MV_Q2(x3); if(!MODREG) @@ -260,10 +266,10 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0x2B: INST_NAME("MOVNTPS Ex,Gx"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); - LD(x3, gback, 0); - LD(x4, gback, 8); + LD(x3, gback, gdoffset+0); + LD(x4, gback, gdoffset+8); SD(x3, wback, fixedaddress+0); SD(x4, wback, fixedaddress+8); break; @@ -304,10 +310,11 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni break; case 0x31: INST_NAME("RDTSC"); + NOTEST(x1); MESSAGE(LOG_DUMP, "Need Optimization\n"); - CALL(ReadTSC, xRAX); // will return the u64 in xEAX - SRLI(xRDX, xRAX, 32); - ZEROUP(xRAX); // wipe upper part + CALL(ReadTSC, x3); // will return the u64 in x3 + SRLI(xRDX, x3, 32); + AND(xRAX, x3, 32); // wipe upper part break; @@ -342,12 +349,72 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni OR(gd, gd, x2); } break; + case 0x51: + INST_NAME("SQRTPS Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + d0 = fpu_get_scratch(dyn); + for(int i=0; i<4; ++i) { + FLW(d0, wback, fixedaddress+4*i); + FSQRTS(d0, d0); + FSW(d0, gback, gdoffset+4*i); + } + break; + case 0x52: + INST_NAME("RSQRTPS Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + s0 = fpu_get_scratch(dyn); + s1 = fpu_get_scratch(dyn); // 1.0f + v0 = fpu_get_scratch(dyn); // 0.0f + // do accurate computation, because riscv doesn't have rsqrt + MOV32w(x3, 1); + FCVTSW(s1, x3, RD_DYN); + if (!box64_dynarec_fastnan) { + FCVTSW(v0, xZR, RD_DYN); + } + for(int i=0; i<4; ++i) { + FLW(s0, wback, fixedaddress+i*4); + if (!box64_dynarec_fastnan) { + FLES(x3, v0, s0); // s0 >= 0.0f? + BNEZ(x3, 6*4); + FEQS(x3, s0, s0); // isnan(s0)? 
+ BEQZ(x3, 2*4); + // s0 is negative, so generate a NaN + FDIVS(s0, s1, v0); + // s0 is a NaN, just copy it + FSW(s0, gback, gdoffset+i*4); + J(4*4); + // do regular computation + } + FSQRTS(s0, s0); + FDIVS(s0, s1, s0); + FSW(s0, gback, gdoffset+i*4); + } + break; + case 0x53: + INST_NAME("RCPPS Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + d0 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn); + LUI(x3, 0x3f800); + FMVWX(d0, x3); // 1.0f + for(int i=0; i<4; ++i) { + FLW(d1, wback, fixedaddress+4*i); + FDIVS(d1, d0, d1); + FSW(d1, gback, gdoffset+4*i); + } + break; case 0x54: INST_NAME("ANDPS Gx, Ex"); nextop = F8; gd = ((nextop&0x38)>>3)+(rex.r<<3); if(!(MODREG && gd==(nextop&7)+(rex.b<<3))) { - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_Q(x3, x4, AND(x3, x3, x4)); } @@ -355,7 +422,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0x55: INST_NAME("ANDNPS Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_Q(x3, x4, NOT(x3, x3); AND(x3, x3, x4)); break; @@ -364,7 +431,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni nextop = F8; gd = ((nextop&0x38)>>3)+(rex.r<<3); if(!(MODREG && gd==(nextop&7)+(rex.b<<3))) { - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_Q(x3, x4, OR(x3, x3, x4)); } @@ -373,12 +440,12 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni INST_NAME("XORPS Gx, Ex"); nextop = F8; //TODO: it might be possible to check if SS or SD are used and not purge them to optimize a bit - GETGX(x1); + GETGX(); if(MODREG && gd==(nextop&7)+(rex.b<<3)) { // just zero dest - SD(xZR, x1, 0); - SD(xZR, x1, 8); + SD(xZR, gback, gdoffset+0); + SD(xZR, gback, gdoffset+8); } else { GETEX(x2, 0); SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); @@ -387,37 +454,37 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0x58: INST_NAME("ADDPS Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); s0 = 
fpu_get_scratch(dyn); s1 = fpu_get_scratch(dyn); for(int i=0; i<4; ++i) { // GX->f[i] += EX->f[i]; FLW(s0, wback, fixedaddress+i*4); - FLW(s1, gback, i*4); + FLW(s1, gback, gdoffset+i*4); FADDS(s1, s1, s0); - FSW(s1, gback, i*4); + FSW(s1, gback, gdoffset+i*4); } break; case 0x59: INST_NAME("MULPS Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); s0 = fpu_get_scratch(dyn); s1 = fpu_get_scratch(dyn); for(int i=0; i<4; ++i) { // GX->f[i] *= EX->f[i]; FLW(s0, wback, fixedaddress+i*4); - FLW(s1, gback, i*4); + FLW(s1, gback, gdoffset+i*4); FMULS(s1, s1, s0); - FSW(s1, gback, i*4); + FSW(s1, gback, gdoffset+i*4); } break; case 0x5A: INST_NAME("CVTPS2PD Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); s0 = fpu_get_scratch(dyn); s1 = fpu_get_scratch(dyn); @@ -425,46 +492,46 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni FLW(s1, wback, fixedaddress+4); FCVTDS(s0, s0); FCVTDS(s1, s1); - FSD(s0, gback, 0); - FSD(s1, gback, 8); + FSD(s0, gback, gdoffset+0); + FSD(s1, gback, gdoffset+8); break; case 0x5B: INST_NAME("CVTDQ2PS Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); s0 = fpu_get_scratch(dyn); for (int i=0; i<4; ++i) { LW(x3, wback, fixedaddress+i*4); FCVTSW(s0, x3, RD_RNE); - FSW(s0, gback, i*4); + FSW(s0, gback, gdoffset+i*4); } break; case 0x5C: INST_NAME("SUBPS Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); s0 = fpu_get_scratch(dyn); s1 = fpu_get_scratch(dyn); for(int i=0; i<4; ++i) { // GX->f[i] -= EX->f[i]; FLW(s0, wback, fixedaddress+i*4); - FLW(s1, gback, i*4); + FLW(s1, gback, gdoffset+i*4); FSUBS(s1, s1, s0); - FSW(s1, gback, i*4); + FSW(s1, gback, gdoffset+i*4); } break; case 0x5D: INST_NAME("MINPS Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); s0 = fpu_get_scratch(dyn); s1 = fpu_get_scratch(dyn); for(int i=0; i<4; ++i) { FLW(s0, wback, fixedaddress+i*4); - FLW(s1, gback, i*4); + FLW(s1, gback, gdoffset+i*4); if(!box64_dynarec_fastnan) { FEQS(x3, s0, s0); 
FEQS(x4, s1, s1); @@ -472,38 +539,38 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni BEQZ(x3, 12); FLTS(x3, s0, s1); BEQZ(x3, 8); - FSW(s0, gback, i*4); + FSW(s0, gback, gdoffset+i*4); } else { FMINS(s1, s1, s0); - FSW(s1, gback, i*4); + FSW(s1, gback, gdoffset+i*4); } } break; case 0x5E: INST_NAME("DIVPS Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); s0 = fpu_get_scratch(dyn); s1 = fpu_get_scratch(dyn); for(int i=0; i<4; ++i) { // GX->f[i] /= EX->f[i]; FLW(s0, wback, fixedaddress+i*4); - FLW(s1, gback, i*4); + FLW(s1, gback, gdoffset+i*4); FDIVS(s1, s1, s0); - FSW(s1, gback, i*4); + FSW(s1, gback, gdoffset+i*4); } break; case 0x5F: INST_NAME("MAXPS Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); s0 = fpu_get_scratch(dyn); s1 = fpu_get_scratch(dyn); for(int i=0; i<4; ++i) { FLW(s0, wback, fixedaddress+i*4); - FLW(s1, gback, i*4); + FLW(s1, gback, gdoffset+i*4); if(!box64_dynarec_fastnan) { FEQS(x3, s0, s0); FEQS(x4, s1, s1); @@ -511,13 +578,242 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni BEQZ(x3, 12); FLTS(x3, s1, s0); BEQZ(x3, 8); - FSW(s0, gback, i*4); + FSW(s0, gback, gdoffset+i*4); } else { FMAXS(s1, s1, s0); - FSW(s1, gback, i*4); + FSW(s1, gback, gdoffset+i*4); + } + } + break; + case 0x60: + INST_NAME("PUNPCKLBW Gm,Em"); + nextop = F8; + GETGM(); + for(int i=3; i>0; --i) { // 0 is untouched + // GX->ub[2 * i] = GX->ub[i]; + LBU(x3, gback, gdoffset+i); + SB(x3, gback, gdoffset+2*i); + } + if (MODREG && gd==(nextop&7)) { + for(int i=0; i<4; ++i) { + // GX->ub[2 * i + 1] = GX->ub[2 * i]; + LBU(x3, gback, gdoffset+2*i); + SB(x3, gback, gdoffset+2*i+1); + } + } else { + GETEM(x2, 0); + for(int i=0; i<4; ++i) { + // GX->ub[2 * i + 1] = EX->ub[i]; + LBU(x3, wback, fixedaddress+i); + SB(x3, gback, gdoffset+2*i+1); + } + } + break; + case 0x61: + INST_NAME("PUNPCKLWD Gm, Em"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + // GM->uw[3] = EM->uw[1]; + LHU(x3, wback, 
fixedaddress+2*1); + SH(x3, gback, gdoffset+2*3); + // GM->uw[2] = GM->uw[1]; + LHU(x3, gback, gdoffset+2*1); + SH(x3, gback, gdoffset+2*2); + // GM->uw[1] = EM->uw[0]; + LHU(x3, wback, fixedaddress+2*0); + SH(x3, gback, gdoffset+2*1); + break; + case 0x62: + INST_NAME("PUNPCKLDQ Gm, Em"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + // GM->ud[1] = EM->ud[0]; + LWU(x3, wback, fixedaddress); + SW(x3, gback, gdoffset+4*1); + break; + case 0x67: + INST_NAME("PACKUSWB Gm, Em"); + nextop = F8; + GETGM(); + ADDI(x5, xZR, 0xFF); + for(int i=0; i<4; ++i) { + // GX->ub[i] = (GX->sw[i]<0)?0:((GX->sw[i]>0xff)?0xff:GX->sw[i]); + LH(x3, gback, gdoffset+i*2); + BGE(x5, x3, 8); + ADDI(x3, xZR, 0xFF); + NOT(x4, x3); + SRAI(x4, x4, 63); + AND(x3, x3, x4); + SB(x3, gback, gdoffset+i); + } + if (MODREG && gd==(nextop&7)) { + // GM->ud[1] = GM->ud[0]; + LW(x3, gback, gdoffset+0*4); + SW(x3, gback, gdoffset+1*4); + } else { + GETEM(x1, 0); + for(int i=0; i<4; ++i) { + // GX->ub[4+i] = (EX->sw[i]<0)?0:((EX->sw[i]>0xff)?0xff:EX->sw[i]); + LH(x3, wback, fixedaddress+i*2); + BGE(x5, x3, 8); + ADDI(x3, xZR, 0xFF); + NOT(x4, x3); + SRAI(x4, x4, 63); + AND(x3, x3, x4); + SB(x3, gback, gdoffset+4+i); + } + } + break; + case 0x68: + INST_NAME("PUNPCKHBW Gm,Em"); + nextop = F8; + GETGM(); + for(int i=0; i<4; ++i) { + // GX->ub[2 * i] = GX->ub[i + 4]; + LBU(x3, gback, gdoffset+i+4); + SB(x3, gback, gdoffset+2*i); + } + if (MODREG && gd==(nextop&7)) { + for(int i=0; i<4; ++i) { + // GX->ub[2 * i + 1] = GX->ub[2 * i]; + LBU(x3, gback, gdoffset+2*i); + SB(x3, gback, gdoffset+2*i+1); + } + } else { + GETEM(x2, 0); + for(int i=0; i<4; ++i) { + // GX->ub[2 * i + 1] = EX->ub[i + 4]; + LBU(x3, wback, fixedaddress+i+4); + SB(x3, gback, gdoffset+2*i+1); + } + } + break; + case 0x69: + INST_NAME("PUNPCKHWD Gm,Em"); + nextop = F8; + GETGM(); + for(int i=0; i<2; ++i) { + // GX->uw[2 * i] = GX->uw[i + 2]; + LHU(x3, gback, gdoffset+(i+2)*2); + SH(x3, gback, gdoffset+2*i*2); + } + if (MODREG && gd==(nextop&7)) { 
+ for(int i=0; i<2; ++i) { + // GX->uw[2 * i + 1] = GX->uw[2 * i]; + LHU(x3, gback, gdoffset+2*i*2); + SH(x3, gback, gdoffset+(2*i+1)*2); + } + } else { + GETEM(x1, 0); + for(int i=0; i<2; ++i) { + // GX->uw[2 * i + 1] = EX->uw[i + 2]; + LHU(x3, wback, fixedaddress+(i+2)*2); + SH(x3, gback, gdoffset+(2*i+1)*2); + } + } + break; + case 0x6A: + INST_NAME("PUNPCKHDQ Gm,Em"); + nextop = F8; + GETEM(x1, 0); + GETGM(); + // GM->ud[0] = GM->ud[1]; + LWU(x3, gback, gdoffset+1*4); + SW(x3, gback, gdoffset+0*4); + if (!(MODREG && (gd==ed))) { + // GM->ud[1] = EM->ud[1]; + LWU(x3, wback, fixedaddress+1*4); + SW(x3, gback, gdoffset+1*4); + } + break; + case 0x6E: + INST_NAME("MOVD Gm, Ed"); + nextop = F8; + GETGM(); + if(MODREG) { + ed = xRAX + (nextop&7) + (rex.b<<3); + } else { + addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 1, 0); + if(rex.w) { + LD(x4, ed, fixedaddress); + } else { + LW(x4, ed, fixedaddress); } + ed = x4; + } + if(rex.w) SD(ed, gback, gdoffset+0); else SW(ed, gback, gdoffset+0); + break; + case 0x6F: + INST_NAME("MOVQ Gm, Em"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + LD(x3, wback, fixedaddress); + SD(x3, gback, gdoffset+0); + break; + case 0x71: + nextop = F8; + switch((nextop>>3)&7) { + case 2: + INST_NAME("PSRLW Em, Ib"); + GETEM(x1, 1); + u8 = F8; + if (u8>15) { + // just zero dest + SD(xZR, wback, fixedaddress); + } else if(u8) { + for (int i=0; i<4; ++i) { + // EX->uw[i] >>= u8; + LHU(x3, wback, fixedaddress+i*2); + SRLI(x3, x3, u8); + SH(x3, wback, fixedaddress+i*2); + } + } + break; + case 4: + INST_NAME("PSRAW Em, Ib"); + GETEM(x1, 1); + u8 = F8; + if(u8>15) u8=15; + if(u8) { + for (int i=0; i<4; ++i) { + // EX->sw[i] >>= u8; + LH(x3, wback, fixedaddress+i*2); + SRAI(x3, x3, u8); + SH(x3, wback, fixedaddress+i*2); + } + } + break; + case 6: + INST_NAME("PSLLW Em, Ib"); + GETEM(x1, 1); + u8 = F8; + if (u8>15) { + // just zero dest + SD(xZR, wback, fixedaddress+0); + } else if(u8) { + for (int i=0; i<4; ++i) { + // 
EX->uw[i] <<= u8; + LHU(x3, wback, fixedaddress+i*2); + SLLI(x3, x3, u8); + SH(x3, wback, fixedaddress+i*2); + } + } + break; + default: + *ok = 0; + DEFAULT; } break; + case 0x75: + INST_NAME("PCMPEQW Gm,Em"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + MMX_LOOP_W(x3, x4, SUB(x3, x3, x4); SEQZ(x3, x3); NEG(x3, x3)); + break; case 0x77: INST_NAME("EMMS"); // empty MMX, FPU now usable @@ -525,7 +821,14 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni /*emu->top = 0; emu->fpu_stack = 0;*/ //TODO: Check if something is needed here? break; - + case 0x7F: + INST_NAME("MOVQ Em, Gm"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + LD(x3, gback, gdoffset+0); + SD(x3, wback, fixedaddress); + break; #define GO(GETFLAGS, NO, YES, F) \ READFLAGS(F); \ i32_ = F32S; \ @@ -570,9 +873,10 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni eb1 = xRAX+(ed&3); \ } \ if (eb2) { \ - LUI(x1, 0xffffffffffff0); \ + LUI(x1, 0xffff0); \ ORI(x1, x1, 0xff); \ AND(eb1, eb1, x1); \ + SLLI(x3, x3, 8); \ } else { \ ANDI(eb1, eb1, 0xf00); \ } \ @@ -585,7 +889,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni GOCOND(0x90, "SET", "Eb"); #undef GO - + case 0xA2: INST_NAME("CPUID"); NOTEST(x1); @@ -787,7 +1091,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni wback = 0; } else { SMREAD(); - addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); SRAI(x1, gd, 5+rex.w); SLLI(x1, x1, 2+rex.w); ADD(x3, wback, x1); @@ -804,10 +1108,10 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni ANDI(x4, x4, 1); // F_CF is 1 ANDI(xFlags, xFlags, ~1); OR(xFlags, xFlags, x4); - ADDI(x3, xZR, 1); - SLL(x3, x3, x2); - NOT(x3, x3); - AND(ed, ed, x3); + ADDI(x5, xZR, 1); + SLL(x5, x5, x2); + NOT(x5, x5); + AND(ed, ed, x5); if(wback) { 
SDxw(ed, wback, fixedaddress); SMWRITE(); @@ -844,8 +1148,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni GETGD; if(MODREG) { ed = xRAX+(nextop&7)+(rex.b<<3); - SLLI(gd, ed, 48); - SRLI(gd, gd, 48); + ZEXTH(gd, ed); } else { SMREAD(); addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); @@ -998,14 +1301,18 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni ORI(xFlags, xFlags, 1<<F_ZF); B_NEXT_nocond; MARK; - NEG(x2, ed); - AND(x2, x2, ed); - TABLE64(x3, 0x03f79d71b4ca8b09ULL); - MUL(x2, x2, x3); - SRLI(x2, x2, 64-6); - TABLE64(x1, (uintptr_t)&deBruijn64tab); - ADD(x1, x1, x2); - LBU(gd, x1, 0); + if(rv64_zbb) { + CTZxw(gd, ed); + } else { + NEG(x2, ed); + AND(x2, x2, ed); + TABLE64(x3, 0x03f79d71b4ca8b09ULL); + MUL(x2, x2, x3); + SRLI(x2, x2, 64-6); + TABLE64(x1, (uintptr_t)&deBruijn64tab); + ADD(x1, x1, x2); + LBU(gd, x1, 0); + } ANDI(xFlags, xFlags, ~(1<<F_ZF)); break; case 0xBD: @@ -1024,37 +1331,43 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni B_NEXT_nocond; MARK; ANDI(xFlags, xFlags, ~(1<<F_ZF)); - if(ed!=gd) - u8 = gd; - else - u8 = x1; - ADDI(u8, xZR, 0); - if(rex.w) { - MV(x2, ed); - SRLI(x3, x2, 32); + if(rv64_zbb) { + MOV32w(x1, rex.w?63:31); + CLZxw(gd, ed); + SUB(gd, x1, gd); + } else { + if(ed!=gd) + u8 = gd; + else + u8 = x1; + ADDI(u8, xZR, 0); + if(rex.w) { + MV(x2, ed); + SRLI(x3, x2, 32); + BEQZ(x3, 4+2*4); + ADDI(u8, u8, 32); + MV(x2, x3); + } else { + AND(x2, ed, xMASK); + } + SRLI(x3, x2, 16); BEQZ(x3, 4+2*4); - ADDI(u8, u8, 32); + ADDI(u8, u8, 16); MV(x2, x3); - } else { - AND(x2, ed, xMASK); + SRLI(x3, x2, 8); + BEQZ(x3, 4+2*4); + ADDI(u8, u8, 8); + MV(x2, x3); + SRLI(x3, x2, 4); + BEQZ(x3, 4+2*4); + ADDI(u8, u8, 4); + MV(x2, x3); + ANDI(x2, x2, 0b1111); + TABLE64(x3, (uintptr_t)&lead0tab); + ADD(x3, x3, x2); + LBU(x2, x3, 0); + ADD(gd, u8, x2); } - SRLI(x3, x2, 16); - BEQZ(x3, 4+2*4); - ADDI(u8, u8, 
16); - MV(x2, x3); - SRLI(x3, x2, 8); - BEQZ(x3, 4+2*4); - ADDI(u8, u8, 8); - MV(x2, x3); - SRLI(x3, x2, 4); - BEQZ(x3, 4+2*4); - ADDI(u8, u8, 4); - MV(x2, x3); - ANDI(x2, x2, 0b1111); - TABLE64(x3, (uintptr_t)&lead0tab); - ADD(x3, x3, x2); - LBU(x2, x3, 0); - ADD(gd, u8, x2); break; case 0xBE: INST_NAME("MOVSX Gd, Eb"); @@ -1098,13 +1411,13 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0xC2: INST_NAME("CMPPS Gx, Ex, Ib"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 1); u8 = F8; d0 = fpu_get_scratch(dyn); d1 = fpu_get_scratch(dyn); for(int i=0; i<4; ++i) { - FLW(d0, gback, i*4); + FLW(d0, gback, gdoffset+i*4); FLW(d1, wback, fixedaddress+i*4); if ((u8&7) == 0) { // Equal FEQS(x3, d0, d1); @@ -1135,7 +1448,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni } case 7: break; // Not NaN } - + // MARK2; if ((u8&7) == 5 || (u8&7) == 6) { MOV32w(x3, 1); @@ -1143,7 +1456,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni // MARK; } NEG(x3, x3); - SW(x3, gback, i*4); + SW(x3, gback, gdoffset+i*4); } break; case 0xC3: @@ -1160,24 +1473,24 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0xC6: // TODO: Optimize this! 
INST_NAME("SHUFPS Gx, Ex, Ib"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 1); u8 = F8; int32_t idx; idx = (u8>>(0*2))&3; - LWU(x3, gback, idx*4); + LWU(x3, gback, gdoffset+idx*4); idx = (u8>>(1*2))&3; - LWU(x4, gback, idx*4); + LWU(x4, gback, gdoffset+idx*4); idx = (u8>>(2*2))&3; LWU(x5, wback, fixedaddress+idx*4); idx = (u8>>(3*2))&3; LWU(x6, wback, fixedaddress+idx*4); - SW(x3, gback, 0*4); - SW(x4, gback, 1*4); - SW(x5, gback, 2*4); - SW(x6, gback, 3*4); + SW(x3, gback, gdoffset+0*4); + SW(x4, gback, gdoffset+1*4); + SW(x5, gback, gdoffset+2*4); + SW(x6, gback, gdoffset+3*4); break; case 0xC8: @@ -1190,90 +1503,111 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0xCF: /* BSWAP reg */ INST_NAME("BSWAP Reg"); gd = xRAX+(opcode&7)+(rex.b<<3); - #if 1 - ANDI(x1, gd, 0xff); - SLLI(x1, x1, (rex.w?64:32)-8); - SRLI(x2, gd, 8); - ANDI(x3, x2, 0xff); - SLLI(x3, x3, (rex.w?64:32)-16); - OR(x1, x1, x3); - SRLI(x2, gd, 16); - ANDI(x3, x2, 0xff); - SLLI(x3, x3, (rex.w?64:32)-24); - OR(x1, x1, x3); - SRLI(x2, gd, 24); - if(rex.w) { - ANDI(x3, x2, 0xff); - SLLI(x3, x3, (rex.w?64:32)-32); - OR(x1, x1, x3); - SRLI(x2, gd, 32); - ANDI(x3, x2, 0xff); - SLLI(x3, x3, 64-40); - OR(x1, x1, x3); - SRLI(x2, gd, 40); + if(rv64_zbb) { + REV8(gd, gd); + if(!rex.w) + SRLI(gd, gd, 32); + } else { + gback = gd; + if (!rex.w) { + AND(x4, gd, xMASK); + gd = x4; + } + ANDI(x1, gd, 0xff); + SLLI(x1, x1, (rex.w?64:32)-8); + SRLI(x2, gd, 8); ANDI(x3, x2, 0xff); - SLLI(x3, x3, 64-48); + SLLI(x3, x3, (rex.w?64:32)-16); OR(x1, x1, x3); - SRLI(x2, gd, 48); + SRLI(x2, gd, 16); ANDI(x3, x2, 0xff); - SLLI(x3, x3, 64-56); + SLLI(x3, x3, (rex.w?64:32)-24); OR(x1, x1, x3); - SRLI(x2, gd, 56); + SRLI(x2, gd, 24); + if(rex.w) { + ANDI(x3, x2, 0xff); + SLLI(x3, x3, 64-32); + OR(x1, x1, x3); + SRLI(x2, gd, 32); + ANDI(x3, x2, 0xff); + SLLI(x3, x3, 64-40); + OR(x1, x1, x3); + SRLI(x2, gd, 40); + ANDI(x3, x2, 0xff); + SLLI(x3, x3, 64-48); + OR(x1, x1, x3); + SRLI(x2, 
gd, 48); + ANDI(x3, x2, 0xff); + SLLI(x3, x3, 64-56); + OR(x1, x1, x3); + SRLI(x2, gd, 56); + } + OR(gback, x1, x2); } - OR(gd, x1, x2); - #else - MOV_U12(x1, 0xff); - SLLI(x4, x1, 8); // mask 0xff00 - if (rex.w) { - SLLI(x5, x1, 16); // mask 0xff0000 - SLLI(x6, x1, 24); // mask 0xff000000 - - SRLI(x2, gd, 56); - - SRLI(x3, gd, 40); - AND(x3, x3, x4); - OR(x2, x2, x3); - - SRLI(x3, gd, 24); - AND(x3, x3, x5); - OR(x2, x2, x3); - - SRLI(x3, gd, 8); - AND(x3, x3, x6); - OR(x2, x2, x3); - - AND(x3, gd, x6); - SLLI(x3, x3, 8); - OR(x2, x2, x3); - - AND(x3, gd, x5); - SLLI(x3, x3, 24); - OR(x2, x2, x3); - - AND(x3, gd, x4); - SLLI(x3, x3, 40); - OR(x2, x2, x3); - - SLLI(x3, x3, 56); - OR(gd, x2, x3); + break; + case 0xE5: + INST_NAME("PMULHW Gm,Em"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + for(int i=0; i<4; ++i) { + LH(x3, gback, gdoffset+2*i); + LH(x4, wback, fixedaddress+2*i); + MULW(x3, x3, x4); + SRAIW(x3, x3, 16); + SH(x3, gback, gdoffset+2*i); + } + break; + case 0xED: + INST_NAME("PADDSW Gm,Em"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + for(int i=0; i<4; ++i) { + // tmp32s = (int32_t)GX->sw[i] + EX->sw[i]; + // GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s); + LH(x3, gback, gdoffset+2*i); + LH(x4, wback, fixedaddress+2*i); + ADDW(x3, x3, x4); + LUI(x4, 0xFFFF8); // -32768 + BGE(x3, x4, 12); + SH(x4, gback, gdoffset+2*i); + J(20); // continue + LUI(x4, 8); // 32768 + BLT(x3, x4, 8); + ADDIW(x3, x4, -1); + SH(x3, gback, gdoffset+2*i); + } + break; + case 0xEF: + INST_NAME("PXOR Gm,Em"); + nextop = F8; + GETGM(); + if(MODREG && gd==(nextop&7)) { + // just zero dest + SD(xZR, gback, gdoffset+0); } else { - SRLIW(x2, gd, 24); - - SRLIW(x3, gd, 8); - AND(x3, x3, x4); - OR(x2, x2, x3); - - AND(x3, gd, x4); - SLLI(x3, x3, 8); - OR(x2, x2, x3); - - AND(x3, gd, x1); - SLLI(x3, x3, 24); - OR(gd, x2, x3); + GETEM(x2, 0); + LD(x3, gback, gdoffset+0); + LD(x4, wback, fixedaddress); + XOR(x3, x3, x4); + SD(x3, gback, gdoffset+0); } - #endif break; - + 
case 0xF9: + INST_NAME("PSUBW Gm, Em"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + MMX_LOOP_W(x3, x4, SUBW(x3, x3, x4)); + break; + case 0xFD: + INST_NAME("PADDW Gm, Em"); + nextop = F8; + GETGM(); + GETEM(x2, 0); + MMX_LOOP_W(x3, x4, ADDW(x3, x3, x4)); + break; default: DEFAULT; } diff --git a/src/dynarec/rv64/dynarec_rv64_64.c b/src/dynarec/rv64/dynarec_rv64_64.c index 455a8d72..bc3b2c96 100644 --- a/src/dynarec/rv64/dynarec_rv64_64.c +++ b/src/dynarec/rv64/dynarec_rv64_64.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -23,8 +22,6 @@ #include "dynarec_rv64_helper.h" #include "dynarec_rv64_functions.h" -#define GETG gd = ((nextop&0x38)>>3)+(rex.r<<3) - uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int seg, int* ok, int* need_epilog) { (void)ip; (void)rep; (void)need_epilog; @@ -33,12 +30,13 @@ uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni uint8_t nextop; uint8_t u8; uint8_t gd, ed, eb1, eb2, gb1, gb2; - uint8_t wback, wb1, wb2, wb; + uint8_t gback, wback, wb1, wb2, wb; int64_t i64, j64; + uint64_t u64; int v0, v1; int q0; int d0; - int64_t fixedaddress; + int64_t fixedaddress, gdoffset; int unscaled; MAYUSE(eb1); MAYUSE(eb2); @@ -56,14 +54,85 @@ uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni rep = opcode-0xF1; opcode = F8; } - // REX prefix before the F0 are ignored - rex.rex = 0; - while(opcode>=0x40 && opcode<=0x4f) { - rex.rex = opcode; - opcode = F8; - } + + GETREX(); switch(opcode) { + case 0x03: + INST_NAME("ADD Gd, Seg:Ed"); + SETFLAGS(X_ALL, SF_SET_PENDING); + grab_segdata(dyn, addr, ninst, x4, seg); + nextop = F8; + GETGD; + GETEDO(x4, 0, x5); + emit_add32(dyn, ninst, rex, gd, ed, x3, x4, x5); + break; + case 0x0F: + opcode = F8; + switch(opcode) { + case 0x11: + switch(rep) { + case 0: + INST_NAME("MOVUPS Ex,Gx"); + nextop = F8; + 
GETGX(); + GETEX(x2, 0); + if(!MODREG) { + grab_segdata(dyn, addr, ninst, x4, seg); + ADD(x4, x4, wback); + wback = x4; + } + LD(x3, gback, gdoffset+0); + LD(x5, gback, gdoffset+8); + SD(x3, wback, fixedaddress+0); + SD(x5, wback, fixedaddress+8); + if(!MODREG) + SMWRITE2(); + break; + case 1: + INST_NAME("MOVSD Ex, Gx"); + nextop = F8; + GETG; + v0 = sse_get_reg(dyn, ninst, x1, gd, 0); + if(MODREG) { + ed = (nextop&7)+ (rex.b<<3); + d0 = sse_get_reg(dyn, ninst, x1, ed, 0); + FMVD(d0, v0); + } else { + grab_segdata(dyn, addr, ninst, x4, seg); + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, 0); + ADD(x4, x4, ed); + ed = x4; + FSD(v0, ed, fixedaddress); + SMWRITE2(); + } + break; + case 2: + INST_NAME("MOVSS Ex, Gx"); + nextop = F8; + GETG; + v0 = sse_get_reg(dyn, ninst, x1, gd, 1); + if(MODREG) { + q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 1); + FMVS(q0, v0); + } else { + grab_segdata(dyn, addr, ninst, x4, seg); + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, 0); + ADD(x4, x4, ed); + ed = x4; + FSW(v0, ed, fixedaddress); + SMWRITE2(); + } + break; + default: + DEFAULT; + } + break; + + default: + DEFAULT; + } + break; case 0x2B: INST_NAME("SUB Gd, Seg:Ed"); SETFLAGS(X_ALL, SF_SET_PENDING); @@ -84,6 +153,174 @@ uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni emit_xor32(dyn, ninst, rex, gd, ed, x3, x4); break; + case 0x66: + addr = dynarec64_6664(dyn, addr, ip, ninst, rex, seg, ok, need_epilog); + break; + case 0x80: + nextop = F8; + switch((nextop>>3)&7) { + case 0: // ADD + INST_NAME("ADD Eb, Ib"); + grab_segdata(dyn, addr, ninst, x1, seg); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEBO(x1, 1); + u8 = F8; + emit_add8c(dyn, ninst, x1, u8, x2, x4, x5); + EBBACK(x5, 0); + break; + case 1: // OR + INST_NAME("OR Eb, Ib"); + grab_segdata(dyn, addr, ninst, x1, seg); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEBO(x1, 1); + u8 = F8; + emit_or8c(dyn, ninst, x1, u8, 
x2, x4, x5); + EBBACK(x5, 0); + break; + case 2: // ADC + INST_NAME("ADC Eb, Ib"); + grab_segdata(dyn, addr, ninst, x1, seg); + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEBO(x1, 1); + u8 = F8; + emit_adc8c(dyn, ninst, x1, u8, x2, x4, x5, x6); + EBBACK(x5, 0); + break; + case 3: // SBB + INST_NAME("SBB Eb, Ib"); + grab_segdata(dyn, addr, ninst, x1, seg); + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEBO(x1, 1); + u8 = F8; + emit_sbb8c(dyn, ninst, x1, u8, x2, x4, x5, x6); + EBBACK(x5, 0); + break; + case 4: // AND + INST_NAME("AND Eb, Ib"); + grab_segdata(dyn, addr, ninst, x1, seg); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEBO(x1, 1); + u8 = F8; + emit_and8c(dyn, ninst, x1, u8, x2, x4); + EBBACK(x5, 0); + break; + case 5: // SUB + INST_NAME("SUB Eb, Ib"); + grab_segdata(dyn, addr, ninst, x1, seg); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEBO(x1, 1); + u8 = F8; + emit_sub8c(dyn, ninst, x1, u8, x2, x4, x5, x6); + EBBACK(x5, 0); + break; + case 6: // XOR + INST_NAME("XOR Eb, Ib"); + grab_segdata(dyn, addr, ninst, x1, seg); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEBO(x1, 1); + u8 = F8; + emit_xor8c(dyn, ninst, x1, u8, x2, x4); + EBBACK(x5, 0); + break; + case 7: // CMP + INST_NAME("CMP Eb, Ib"); + grab_segdata(dyn, addr, ninst, x1, seg); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEBO(x1, 1); + u8 = F8; + if(u8) { + MOV32w(x2, u8); + emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5, x6); + } else { + emit_cmp8_0(dyn, ninst, x1, x3, x4); + } + break; + default: + DEFAULT; + } + break; + case 0x81: + case 0x83: + nextop = F8; + grab_segdata(dyn, addr, ninst, x6, seg); + switch((nextop>>3)&7) { + case 0: // ADD + if(opcode==0x81) {INST_NAME("ADD Ed, Id");} else {INST_NAME("ADD Ed, Ib");} + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEDO(x6, (opcode==0x81)?4:1, x5); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + emit_add32c(dyn, ninst, rex, ed, i64, x3, x4, x5, x9); + WBACKO(x6); + break; + case 1: // OR + if(opcode==0x81) {INST_NAME("OR Ed, Id");} else 
{INST_NAME("OR Ed, Ib");} + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEDO(x6, (opcode==0x81)?4:1, x5); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + emit_or32c(dyn, ninst, rex, ed, i64, x3, x4); + WBACKO(x6); + break; + case 2: // ADC + if(opcode==0x81) {INST_NAME("ADC Ed, Id");} else {INST_NAME("ADC Ed, Ib");} + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEDO(x6, (opcode==0x81)?4:1, x5); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + MOV64xw(x5, i64); + SD(x6, xEmu, offsetof(x64emu_t, scratch)); + emit_adc32(dyn, ninst, rex, ed, x5, x3, x4, x6, x9); + LD(x6, xEmu, offsetof(x64emu_t, scratch)); + WBACKO(x6); + break; + case 3: // SBB + if(opcode==0x81) {INST_NAME("SBB Ed, Id");} else {INST_NAME("SBB Ed, Ib");} + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEDO(x6, (opcode==0x81)?4:1, x5); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + MOV64xw(x5, i64); + emit_sbb32(dyn, ninst, rex, ed, x5, x3, x4, x9); + WBACKO(x6); + break; + case 4: // AND + if(opcode==0x81) {INST_NAME("AND Ed, Id");} else {INST_NAME("AND Ed, Ib");} + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEDO(x6, (opcode==0x81)?4:1, x5); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + emit_and32c(dyn, ninst, rex, ed, i64, x3, x4); + WBACKO(x6); + break; + case 5: // SUB + if(opcode==0x81) {INST_NAME("SUB Ed, Id");} else {INST_NAME("SUB Ed, Ib");} + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEDO(x6, (opcode==0x81)?4:1, x5); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + emit_sub32c(dyn, ninst, rex, ed, i64, x3, x4, x5, x9); + WBACKO(x6); + break; + case 6: // XOR + if(opcode==0x81) {INST_NAME("XOR Ed, Id");} else {INST_NAME("XOR Ed, Ib");} + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEDO(x6, (opcode==0x81)?4:1, x5); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + emit_xor32c(dyn, ninst, rex, ed, i64, x3, x4); + WBACKO(x6); + break; + case 7: // CMP + if(opcode==0x81) {INST_NAME("CMP Ed, Id");} else {INST_NAME("CMP Ed, Ib");} + SETFLAGS(X_ALL, SF_SET_PENDING); + GETEDO(x6, (opcode==0x81)?4:1, 
x5); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + if(i64) { + MOV64xw(x2, i64); + emit_cmp32(dyn, ninst, rex, ed, x2, x3, x4, x5, x6); + } else + emit_cmp32_0(dyn, ninst, rex, ed, x3, x4); + break; + } + break; case 0x88: INST_NAME("MOV Seg:Eb, Gb"); grab_segdata(dyn, addr, ninst, x4, seg); @@ -156,6 +393,81 @@ uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni LDxw(gd, x4, fixedaddress); } break; + + case 0xA1: + INST_NAME("MOV EAX,FS:Od"); + grab_segdata(dyn, addr, ninst, x4, seg); + if(rex.is32bits) + u64 = F32; + else + u64 = F64; + // TODO: could be optimized. + MOV64z(x1, u64); + ADD(x1, x1, x4); + LDxw(xRAX, x1, 0); + break; + + case 0xA3: + INST_NAME("MOV FS:Od,EAX"); + grab_segdata(dyn, addr, ninst, x4, seg); + if(rex.is32bits) + u64 = F32; + else + u64 = F64; + // TODO: could be optimized. + MOV64z(x1, u64); + ADD(x1, x1, x4); + SDxw(xRAX, x1, 0); + SMWRITE2(); + break; + + case 0xC6: + INST_NAME("MOV Seg:Eb, Ib"); + grab_segdata(dyn, addr, ninst, x4, seg); + nextop=F8; + if(MODREG) { // reg <= u8 + u8 = F8; + if(!rex.rex) { + ed = (nextop&7); + eb1 = xRAX+(ed&3); // Ax, Cx, Dx or Bx + eb2 = (ed&4)>>2; // L or H + } else { + eb1 = xRAX+(nextop&7)+(rex.b<<3); + eb2 = 0; + } + + if (eb2) { + // load a mask to x3 (ffffffffffff00ff) + LUI(x3, 0xffff0); + ORI(x3, x3, 0xff); + // apply mask + AND(eb1, eb1, x3); + if(u8) { + if((u8<<8)<2048) { + ADDI(x4, xZR, u8<<8); + } else { + ADDI(x4, xZR, u8); + SLLI(x4, x4, 8); + } + OR(eb1, eb1, x4); + } + } else { + ANDI(eb1, eb1, 0xf00); // mask ffffffffffffff00 + ORI(eb1, eb1, u8); + } + } else { // mem <= u8 + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 1); + u8 = F8; + if(u8) { + ADDI(x3, xZR, u8); + ed = x3; + } else + ed = xZR; + ADD(x4, wback, x4); + SB(ed, x4, fixedaddress); + SMWRITE2(); + } + break; case 0xC7: INST_NAME("MOV Seg:Ed, Id"); grab_segdata(dyn, addr, ninst, x4, seg); @@ -165,11 +477,15 @@ uintptr_t dynarec64_64(dynarec_rv64_t* 
dyn, uintptr_t addr, uintptr_t ip, int ni ed = xRAX+(nextop&7)+(rex.b<<3); MOV64xw(ed, i64); } else { // mem <= i32 - addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 4); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 4); i64 = F32S; - MOV64xw(x3, i64); - ADD(x4, ed, x4); - SDxw(x3, x4, fixedaddress); + if(i64) { + MOV64xw(x3, i64); + ed = x3; + } else + ed = xZR; + ADD(x4, wback, x4); + SDxw(ed, x4, fixedaddress); SMWRITE2(); } break; diff --git a/src/dynarec/rv64/dynarec_rv64_66.c b/src/dynarec/rv64/dynarec_rv64_66.c index 7bc996e9..49a7ef65 100644 --- a/src/dynarec/rv64/dynarec_rv64_66.c +++ b/src/dynarec/rv64/dynarec_rv64_66.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -50,14 +49,10 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni rep = opcode-0xF1; opcode = F8; } - // REX prefix before the 66 are ignored - rex.rex = 0; - while(opcode>=0x40 && opcode<=0x4f) { - rex.rex = opcode; - opcode = F8; - } - if(rex.w && opcode!=0x0f) // rex.w cancels "66", but not for 66 0f type of prefix + GETREX(); + + if(rex.w && !(opcode==0x0f || opcode==0xf0 || opcode==0x64 || opcode==0x65)) // rex.w cancels "66", but not for 66 0f type of prefix return dynarec64_00(dyn, addr-1, ip, ninst, rex, rep, ok, need_epilog); // addr-1, to "put back" opcode switch(opcode) { @@ -83,8 +78,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni INST_NAME("ADD AX, Iw"); SETFLAGS(X_ALL, SF_SET_PENDING); i32 = F16; - SLLI(x1 , xRAX, 48); - SRLI(x1, x1, 48); + ZEXTH(x1 , xRAX); MOV32w(x2, i32); emit_add16(dyn, ninst, x1, x2, x3, x4, x6); LUI(x3, 0xffff0); @@ -113,8 +107,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni INST_NAME("OR AX, Iw"); SETFLAGS(X_ALL, SF_SET_PENDING); i32 = F16; - SLLI(x1, xRAX, 48); - SRLI(x1, x1, 48); + 
ZEXTH(x1, xRAX); MOV32w(x2, i32); emit_or16(dyn, ninst, x1, x2, x3, x4); LUI(x3, 0xffff0); @@ -124,6 +117,16 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0x0F: addr = dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog); break; + case 0x19: + INST_NAME("SBB Ew, Gw"); + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGW(x2); + GETEW(x1, 0); + emit_sbb16(dyn, ninst, x1, x2, x4, x5, x6); + EWBACK; + break; case 0x1B: INST_NAME("SBB Gw, Ew"); READFLAGS(X_CF); @@ -156,8 +159,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni INST_NAME("AND AX, Iw"); SETFLAGS(X_ALL, SF_SET_PENDING); i32 = F16; - SLLI(x1, xRAX, 48); - SRLI(x1, x1, 48); + ZEXTH(x1, xRAX); MOV32w(x2, i32); emit_and16(dyn, ninst, x1, x2, x3, x4); LUI(x3, 0xffff0); @@ -186,8 +188,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni INST_NAME("SUB AX, Iw"); SETFLAGS(X_ALL, SF_SET_PENDING); i32 = F16; - SLLI(x1, xRAX, 48); - SRLI(x1, x1, 48); + ZEXTH(x1, xRAX); MOV32w(x2, i32); emit_sub16(dyn, ninst, x1, x2, x3, x4, x5); LUI(x2, 0xffff0); @@ -216,8 +217,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni INST_NAME("XOR AX, Iw"); SETFLAGS(X_ALL, SF_SET_PENDING); i32 = F16; - SLLI(x1, xRAX, 48); - SRLI(x1, x1, 48); + ZEXTH(x1, xRAX); MOV32w(x2, i32); emit_xor16(dyn, ninst, x1, x2, x3, x4, x5); LUI(x5, 0xffff0); @@ -244,8 +244,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni INST_NAME("CMP AX, Iw"); SETFLAGS(X_ALL, SF_SET_PENDING); i32 = F16; - SLLI(x1, xRAX, 48); - SRLI(x1, x1, 48); + ZEXTH(x1, xRAX); if(i32) { MOV32w(x2, i32); emit_cmp16(dyn, ninst, x1, x2, x3, x4, x5, x6); @@ -253,6 +252,51 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni emit_cmp16_0(dyn, ninst, x1, x3, x4); } break; + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 
0x46: + case 0x47: + INST_NAME("INC Reg16 (32bits)"); + SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING); + gd = xRAX + (opcode&7); + ZEXTH(x1, gd); + emit_inc16(dyn, ninst, x1, x2, x3, x4); + LUI(x3, 0xffff0); + AND(gd, gd, x3); + OR(gd, gd, x1); + ZEROUP(gd); + break; + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + INST_NAME("DEC Reg16 (32bits)"); + SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING); + gd = xRAX + (opcode&7); + ZEXTH(x1, gd); + emit_dec16(dyn, ninst, x1, x2, x3, x4, x5); + LUI(x3, 0xffff0); + AND(gd, gd, x3); + OR(gd, gd, x1); + ZEROUP(gd); + break; + case 0x64: + addr = dynarec64_6664(dyn, addr, ip, ninst, rex, _FS, ok, need_epilog); + break; + case 0x65: + addr = dynarec64_6664(dyn, addr, ip, ninst, rex, _GS, ok, need_epilog); + break; + case 0x66: + addr = dynarec64_66(dyn, addr, ip, ninst, rex, rep, ok, need_epilog); + break; case 0x69: case 0x6B: if(opcode==0x69) { @@ -267,8 +311,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni if(opcode==0x69) i32 = F16S; else i32 = F8S; MOV32w(x2, i32); MULW(x2, x2, x1); - SLLI(x2, x2, 48); - SRLI(x2, x2, 48); + ZEXTH(x2, x2); UFLAG_RES(x2); gd=x2; GWBACK; @@ -394,8 +437,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni // we don't use GETGW above, so we need let gd & 0xffff. 
LUI(x1, 0xffff0); AND(ed, ed, x1); - SLLI(x2, gd, 48); - SRLI(x2, x2, 48); + ZEXTH(x2, gd); OR(ed, ed, x2); } } else { @@ -413,8 +455,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni if(ed!=gd) { LUI(x1, 0xffff0); AND(gd, gd, x1); - SLLI(x2, ed, 48); - SRLI(x2, x2, 48); + ZEXTH(x2, ed); OR(gd, gd, x2); } } else { @@ -443,13 +484,11 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni // x2 <- rax MV(x2, xRAX); // rax[15:0] <- gd[15:0] - SLLI(x3, gd, 48); - SRLI(x3, x3, 48); + ZEXTH(x3, gd); AND(xRAX, xRAX, x4); OR(xRAX, xRAX, x3); // gd[15:0] <- x2[15:0] - SLLI(x2, x2, 48); - SRLI(x2, x2, 48); + ZEXTH(x2, x2); AND(gd, gd, x4); OR(gd, gd, x2); } @@ -530,6 +569,54 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni ADD(xRDI, xRDI, x3); } break; + + case 0xAF: + switch (rep) { + case 1: + case 2: + if(rep==1) {INST_NAME("REPNZ SCASW");} else {INST_NAME("REPZ SCASW");} + MAYSETFLAGS(); + SETFLAGS(X_ALL, SF_SET_PENDING); + CBZ_NEXT(xRCX); + GETDIR(x3, x1, rex.w?8:2); + if (rex.w) { + MARK; + LD(x2, xRDI, 0); + ADD(xRDI, xRDI, x3); + ADDI(xRCX, xRCX, -1); + if (rep==1) {BEQ_MARK3(xRAX, x2);} else {BNE_MARK3(xRAX, x2);} + BNE_MARK(xRCX, xZR); + MARK3; + emit_cmp32(dyn, ninst, rex, xRAX, x2, x3, x4, x5, x6); + } else { + ZEXTH(x1, xRAX); + MARK; + LHU(x2, xRDI, 0); + ADD(xRDI, xRDI, x3); + ADDI(xRCX, xRCX, -1); + if (rep==1) {BEQ_MARK3(x1, x2);} else {BNE_MARK3(x1, x2);} + BNE_MARK(xRCX, xZR); + MARK3; + emit_cmp16(dyn, ninst, x1, x2, x3, x4, x5, x6); + } + break; + default: + INST_NAME("SCASW"); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETDIR(x3, x1, rex.w?8:2); + if (rex.w) { + LD(x2, xRDI, 0); + emit_cmp32(dyn, ninst, rex, xRAX, x2, x3, x4, x5, x6); + } else { + ZEXTH(x1, xRAX); + LHU(x2, xRDI, 0); + emit_cmp16(dyn, ninst, x1, x2, x3, x4, x5, x6); + } + ADD(xRDI, xRDI, x3); + break; + } + break; + case 0xB8: case 0xB9: case 0xBA: @@ -555,7 +642,7 @@ uintptr_t 
dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni } } break; - + case 0xC1: nextop = F8; switch((nextop>>3)&7) { @@ -643,8 +730,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni UFLAG_OP12(ed, x2) SRAI(ed, ed, u8&0x1f); if(MODREG) { - SLLI(ed, ed, 48); - SRLI(ed, ed, 48); + ZEXTH(ed, ed); } EWBACK; UFLAG_RES(ed); @@ -652,7 +738,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni break; } break; - + case 0xC7: INST_NAME("MOV Ew, Iw"); nextop = F8; @@ -693,6 +779,25 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni UFLAG_RES(ed); UFLAG_DF(x3, d_shr16); break; + case 4: + case 6: + if(opcode==0xD1) { + INST_NAME("SHL Ew, 1"); + MOV32w(x4, 1); + } else { + INST_NAME("SHL Ew, CL"); + ANDI(x4, xRCX, 0x1f); + } + UFLAG_IF {MESSAGE(LOG_DUMP, "Need Optimization for flags\n");} + SETFLAGS(X_ALL, SF_PENDING); + GETEW(x1, 0); + UFLAG_OP12(ed, x4) + SLL(ed, ed, x4); + ZEXTH(ed, ed); + EWBACK; + UFLAG_RES(ed); + UFLAG_DF(x3, d_shl16); + break; case 7: if(opcode==0xD1) { INST_NAME("SAR Ew, 1"); @@ -704,10 +809,9 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni UFLAG_IF {MESSAGE(LOG_DUMP, "Need Optimization for flags\n");} SETFLAGS(X_ALL, SF_PENDING); GETSEW(x1, 0); - UFLAG_OP12(ed, x4) + UFLAG_OP12(ed, x4); SRA(ed, ed, x4); - SLLI(ed, ed, 48); - SRLI(ed, ed, 48); + ZEXTH(ed, ed); EWBACK; UFLAG_RES(ed); UFLAG_DF(x3, d_sar16); @@ -716,6 +820,10 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni DEFAULT; } break; + + case 0xF0: + return dynarec64_66F0(dyn, addr, ip, ninst, rex, rep, ok, need_epilog); + case 0xF7: nextop = F8; switch((nextop>>3)&7) { @@ -745,9 +853,8 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni INST_NAME("DIV Ew"); SETFLAGS(X_ALL, SF_SET); GETEW(x1, 0); - SLLI(x2, xRAX, 48); + ZEXTH(x2, xRAX); SLLI(x3, xRDX, 48); - SRLI(x2, x2, 48); 
SRLI(x3, x3, 32); OR(x2, x2, x3); DIVUW(x3, x2, ed); @@ -766,9 +873,8 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni NOTEST(x1); SETFLAGS(X_ALL, SF_SET); GETSEW(x1, 0); - SLLI(x2, xRAX, 48); + ZEXTH(x2, xRAX); SLLI(x3, xRDX, 48); - SRLI(x2, x2, 48); SRLI(x3, x3, 32); OR(x2, x2, x3); DIVW(x3, x2, ed); diff --git a/src/dynarec/rv64/dynarec_rv64_660f.c b/src/dynarec/rv64/dynarec_rv64_660f.c index 260ea32b..3f51289e 100644 --- a/src/dynarec/rv64/dynarec_rv64_660f.c +++ b/src/dynarec/rv64/dynarec_rv64_660f.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -27,7 +26,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int (void)ip; (void)need_epilog; uint8_t opcode = F8; - uint8_t nextop, u8; + uint8_t nextop, u8, s8; int32_t i32; uint8_t gd, ed; uint8_t wback, wb1, wb2, gback; @@ -37,7 +36,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int int v0, v1; int q0, q1; int d0, d1; - int64_t fixedaddress; + int64_t fixedaddress, gdoffset; int unscaled; MAYUSE(d0); @@ -49,27 +48,27 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int MAYUSE(j64); static const int8_t round_round[] = { RD_RNE, RD_RDN, RD_RUP, RD_RTZ }; - + switch(opcode) { case 0x10: INST_NAME("MOVUPD Gx,Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_MV_Q(x3); break; case 0x11: INST_NAME("MOVUPD Ex,Gx"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_MV_Q2(x3); if(!MODREG) SMWRITE2(); break; case 0x12: INST_NAME("MOVLPD Gx, Eq"); nextop = F8; - GETGX(x1); + GETGX(); if(MODREG) { // access register instead of memory is bad opcode! 
DEFAULT; @@ -78,33 +77,47 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int SMREAD(); addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0); LD(x3, wback, fixedaddress); - SD(x3, gback, 0); + SD(x3, gback, gdoffset+0); + break; + case 0x13: + INST_NAME("MOVLPD Eq, Gx"); + nextop = F8; + GETGX(); + if(MODREG) { + // access register instead of memory is bad opcode! + DEFAULT; + return addr; + } + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0); + LD(x3, gback, gdoffset+0); + SD(x3, wback, fixedaddress); + SMWRITE2(); break; case 0x14: INST_NAME("UNPCKLPD Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); // GX->q[1] = EX->q[0]; LD(x3, wback, fixedaddress+0); - SD(x3, gback, 8); + SD(x3, gback, gdoffset+8); break; case 0x15: INST_NAME("UNPCKHPD Gx, Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); // GX->q[0] = GX->q[1]; - LD(x3, gback, 8); - SD(x3, gback, 0); + LD(x3, gback, gdoffset+8); + SD(x3, gback, gdoffset+0); // GX->q[1] = EX->q[1]; LD(x3, wback, fixedaddress+8); - SD(x3, gback, 8); + SD(x3, gback, gdoffset+8); break; case 0x16: INST_NAME("MOVHPD Gx, Eq"); nextop = F8; - GETGX(x1); + GETGX(); if(MODREG) { // access register instead of memory is bad opcode! 
DEFAULT; @@ -113,56 +126,32 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int SMREAD(); addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0); LD(x3, wback, fixedaddress); - SD(x3, gback, 8); + SD(x3, gback, gdoffset+8); break; case 0x1F: INST_NAME("NOP (multibyte)"); nextop = F8; FAKEED; break; - - #define GO(GETFLAGS, NO, YES, F) \ - READFLAGS(F); \ - GETFLAGS; \ - nextop=F8; \ - GETGD; \ - if(MODREG) { \ - ed = xRAX+(nextop&7)+(rex.b<<3); \ - SLLI(x4, ed, 48); \ - SRLI(x4, x4, 48); \ - } else { \ - SMREAD(); \ - addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); \ - LHU(x4, ed, fixedaddress); \ - ed = x4; \ - } \ - B##NO(x1, 4+4*4); \ - ADDI(x3, xZR, -1); \ - SRLI(x3, x3, 48); \ - AND(gd, gd, x3); \ - OR(gd, gd, ed); - - GOCOND(0x40, "CMOV", "Gw, Ew"); - #undef GO case 0x28: INST_NAME("MOVAPD Gx,Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_MV_Q(x3); break; case 0x29: INST_NAME("MOVAPD Ex,Gx"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_MV_Q2(x3); if(!MODREG) SMWRITE2(); break; case 0x2B: INST_NAME("MOVNTPD Ex, Gx"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_MV_Q2(x3); break; @@ -207,15 +196,15 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x00: INST_NAME("PSHUFB Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); sse_forget_reg(dyn, ninst, x5); ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); // perserve gd - LD(x3, gback, 0); - LD(x4, gback, 8); + LD(x3, gback, gdoffset+0); + LD(x4, gback, gdoffset+8); SD(x3, x5, 0); SD(x4, x5, 8); @@ -223,29 +212,29 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int LBU(x3, wback, fixedaddress+i); ANDI(x4, x3, 128); BEQZ(x4, 12); - SB(xZR, gback, i); + SB(xZR, gback, gdoffset+i); BEQZ(xZR, 20); // continue ANDI(x4, x3, 15); ADD(x4, x4, x5); LBU(x4, x4, 0); - SB(x4, gback, i); + SB(x4, gback, 
gdoffset+i); } break; case 0x01: INST_NAME("PHADDW Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); for (int i=0; i<4; ++i) { // GX->sw[i] = GX->sw[i*2+0]+GX->sw[i*2+1]; - LH(x3, gback, 2*(i*2+0)); - LH(x4, gback, 2*(i*2+1)); + LH(x3, gback, gdoffset+2*(i*2+0)); + LH(x4, gback, gdoffset+2*(i*2+1)); ADDW(x3, x3, x4); - SH(x3, gback, 2*i); + SH(x3, gback, gdoffset+2*i); } if (MODREG && gd==(nextop&7)+(rex.b<<3)) { // GX->q[1] = GX->q[0]; - LD(x3, gback, 0); - SD(x3, gback, 8); + LD(x3, gback, gdoffset+0); + SD(x3, gback, gdoffset+8); } else { GETEX(x2, 0); for (int i=0; i<4; ++i) { @@ -253,47 +242,150 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int LH(x3, wback, fixedaddress+2*(i*2+0)); LH(x4, wback, fixedaddress+2*(i*2+1)); ADDW(x3, x3, x4); - SH(x3, gback, 2*(4+i)); + SH(x3, gback, gdoffset+2*(4+i)); } } break; case 0x02: INST_NAME("PHADDD Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); // GX->sd[0] += GX->sd[1]; - LW(x3, gback, 0*4); - LW(x4, gback, 1*4); + LW(x3, gback, gdoffset+0*4); + LW(x4, gback, gdoffset+1*4); ADDW(x3, x3, x4); - SW(x3, gback, 0*4); + SW(x3, gback, gdoffset+0*4); // GX->sd[1] = GX->sd[2] + GX->sd[3]; - LW(x3, gback, 2*4); - LW(x4, gback, 3*4); + LW(x3, gback, gdoffset+2*4); + LW(x4, gback, gdoffset+3*4); ADDW(x3, x3, x4); - SW(x3, gback, 1*4); + SW(x3, gback, gdoffset+1*4); if (MODREG && gd==(nextop&7)+(rex.b<<3)) { // GX->q[1] = GX->q[0]; - LD(x3, gback, 0); - SD(x3, gback, 8); + LD(x3, gback, gdoffset+0); + SD(x3, gback, gdoffset+8); } else { GETEX(x2, 0); // GX->sd[2] = EX->sd[0] + EX->sd[1]; LW(x3, wback, fixedaddress+0*4); LW(x4, wback, fixedaddress+1*4); ADDW(x3, x3, x4); - SW(x3, gback, 2*4); + SW(x3, gback, gdoffset+2*4); // GX->sd[3] = EX->sd[2] + EX->sd[3]; LW(x3, wback, fixedaddress+2*4); LW(x4, wback, fixedaddress+3*4); ADDW(x3, x3, x4); - SW(x3, gback, 3*4); + SW(x3, gback, gdoffset+3*4); + } + break; + + case 0x04: + INST_NAME("PADDUBSW Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + 
MOV64x(x5, 32767); + MOV64x(x6, -32768); + for(int i=0; i<8; ++i) { + LBU(x3, gback, gdoffset+i*2); + LB(x4, wback, fixedaddress+i*2); + MUL(x9, x3, x4); + LBU(x3, gback, gdoffset+i*2+1); + LB(x4, wback, fixedaddress+i*2+1); + MUL(x3, x3, x4); + ADD(x3, x3, x9); + if(rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, x6); + } else { + BLT(x3, x5, 4+4); + MV(x3, x5); + BLT(x6, x3, 4+4); + MV(x3, x6); + } + SH(x3, gback, gdoffset+i*2); + } + break; + + case 0x08: + INST_NAME("PSIGNB Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<16; ++i) { + LB(x3, gback, gdoffset+i); + LB(x4, wback, fixedaddress+i); + BGE(x4, xZR, 4+4); + NEG(x3, x3); + BNE(x4, xZR, 4+4); + MOV_U12(x3, 0); + SB(x3, gback, gdoffset+i); + } + break; + case 0x09: + INST_NAME("PSIGNW Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<8; ++i) { + LH(x3, gback, gdoffset+i*2); + LH(x4, wback, fixedaddress+i*2); + BGE(x4, xZR, 4+4); + NEG(x3, x3); + BNE(x4, xZR, 4+4); + MOV_U12(x3, 0); + SH(x3, gback, gdoffset+i*2); + } + break; + case 0x0A: + INST_NAME("PSIGND Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<4; ++i) { + LW(x3, gback, gdoffset+i*4); + LW(x4, wback, fixedaddress+i*4); + BGE(x4, xZR, 4+4); + NEG(x3, x3); + BNE(x4, xZR, 4+4); + ADDI(x3, xZR, 0); + SW(x3, gback, gdoffset+i*4); + } + break; + case 0x0B: + INST_NAME("PMULHRSW Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<8; ++i) { + LH(x3, gback, gdoffset+i*2); + LH(x4, wback, fixedaddress+i*2); + MUL(x3, x3, x4); + SRAI(x3, x3, 14); + ADDI(x3, x3, 1); + SRAI(x3, x3, 1); + SH(x3, gback, gdoffset+i*2); + } + break; + case 0x10: + INST_NAME("PBLENDVB Gx,Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + sse_forget_reg(dyn, ninst, 0); // forget xmm[0] + for (int i=0; i<16; ++i) { + LB(x3, xEmu, offsetof(x64emu_t, xmm[0])+i); + BGE(x3, xZR, 12); // continue + LBU(x3, wback, fixedaddress+i); + SB(x3, gback, gdoffset+i); + // continue } break; case 0x17: INST_NAME("PTEST Gx, 
Ex"); nextop = F8; SETFLAGS(X_ALL, SF_SET); - GETGX(x1); + GETGX(); GETEX(x2, 0); CLEAR_FLAGS(); SET_DFNONE(); @@ -302,8 +394,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int LD(x6, wback, fixedaddress+8); IFX(X_ZF) { - LD(x3, gback, 0); - LD(x4, gback, 8); + LD(x3, gback, gdoffset+0); + LD(x4, gback, gdoffset+8); AND(x3, x3, x5); AND(x4, x4, x6); OR(x3, x3, x4); @@ -311,9 +403,9 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ORI(xFlags, xFlags, 1<<F_ZF); } IFX(X_CF) { - LD(x3, gback, 0); + LD(x3, gback, gdoffset+0); NOT(x3, x3); - LD(x4, gback, 8); + LD(x4, gback, gdoffset+8); NOT(x4, x4); AND(x3, x3, x5); AND(x4, x4, x6); @@ -323,19 +415,306 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int } } break; + + case 0x1C: + INST_NAME("PABSB Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<16; ++i) { + LB(x4, wback, fixedaddress+i); + BGE(x4, xZR, 4+4); + NEG(x4, x4); + SB(x4, gback, gdoffset+i); + } + break; + case 0x1D: + INST_NAME("PABSW Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<8; ++i) { + LH(x4, wback, fixedaddress+i*2); + BGE(x4, xZR, 4+4); + NEG(x4, x4); + SH(x4, gback, gdoffset+i*2); + } + break; + case 0x1E: + INST_NAME("PABSD Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + MOV64x(x5, ~(1<<31)); + for(int i=0; i<4; ++i) { + LW(x4, wback, fixedaddress+i*4); + BGE(x4, xZR, 4+4); + NEG(x4, x4); + SW(x4, gback, gdoffset+i*4); + } + break; + + case 0x2B: + INST_NAME("PACKUSDW Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + MOV64x(x5, 65535); + for(int i=0; i<4; ++i) { + LW(x3, gback, gdoffset+i*4); + if(rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, xZR); + } else { + BGE(x3, xZR, 4+4); + MV(x3, xZR); + BLT(x3, x5, 4+4); + MV(x3, x5); + } + SH(x3, gback, gdoffset+i*2); + } + if(MODREG && gd==ed) { + LD(x3, gback, gdoffset+0); + SD(x3, gback, gdoffset+8); + } else for(int i=0; i<4; ++i) { + LW(x3, wback, 
fixedaddress+i*4); + if(rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, xZR); + } else { + BGE(x3, xZR, 4+4); + MV(x3, xZR); + BLT(x3, x5, 4+4); + MV(x3, x5); + } + SH(x3, gback, gdoffset+8+i*2); + } + break; + + case 0x30: + INST_NAME("PMOVZXBW Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=7; i>=0; --i) { + LBU(x3, wback, fixedaddress+i); + SH(x3, gback, gdoffset+i*2); + } + break; + case 0x31: + INST_NAME("PMOVZXBD Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=3; i>=0; --i) { + LBU(x3, wback, fixedaddress+i); + SW(x3, gback, gdoffset+i*4); + } + break; + case 0x32: + INST_NAME("PMOVZXBQ Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=1; i>=0; --i) { + LBU(x3, wback, fixedaddress+i); + SD(x3, gback, gdoffset+i*8); + } + break; + case 0x33: + INST_NAME("PMOVZXWD Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=3; i>=0; --i) { + LHU(x3, wback, fixedaddress+i*2); + SW(x3, gback, gdoffset+i*4); + } + break; + case 0x34: + INST_NAME("PMOVZXWQ Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=1; i>=0; --i) { + LHU(x3, wback, fixedaddress+i*2); + SD(x3, gback, gdoffset+i*8); + } + break; + case 0x35: + INST_NAME("PMOVZXDQ Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=1; i>=0; --i) { + LWU(x3, wback, fixedaddress+i*4); + SD(x3, gback, gdoffset+i*8); + } + break; + + case 0x38: + INST_NAME("PMINSB Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<16; ++i) { + LB(x3, gback, gdoffset+i); + LB(x4, wback, fixedaddress+i); + if(rv64_zbb) MIN(x4, x3, x4); else BLT(x3, x4, 4+4); + SB(x4, gback, gdoffset+i); + } + break; + case 0x39: + INST_NAME("PMINSD Gx, Ex"); // SSE4 opcode! 
+ nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<4; ++i) { + LW(x3, gback, gdoffset+i*4); + LW(x4, wback, fixedaddress+i*4); + if(rv64_zbb) MIN(x4, x3, x4); else BLT(x3, x4, 4+4); + SW(x4, gback, gdoffset+i*4); + } + break; case 0x3A: INST_NAME("PMINUW Gx, Ex"); // SSE4 opcode! nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<8; ++i) { - // if(GX->uw[i]>EX->uw[i]) GX->uw[i] = EX->uw[i]; - LHU(x3, gback, i*2); + LHU(x3, gback, gdoffset+i*2); LHU(x4, wback, fixedaddress+i*2); - BLTU(x3, x4, 8); - SH(x4, gback, i*2); + if(rv64_zbb) MINU(x4, x3, x4); else BLTU(x3, x4, 4+4); + SH(x4, gback, gdoffset+i*2); } break; + case 0x3B: + INST_NAME("PMINUD Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<4; ++i) { + LWU(x3, gback, gdoffset+i*4); + LWU(x4, wback, fixedaddress+i*4); + if(rv64_zbb) MINU(x4, x3, x4); else BLTU(x3, x4, 4+4); + SW(x4, gback, gdoffset+i*4); + } + break; + case 0x3C: + INST_NAME("PMAXSB Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<16; ++i) { + LB(x3, gback, gdoffset+i); + LB(x4, wback, fixedaddress+i); + if(rv64_zbb) MAX(x4, x3, x4); else BLT(x4, x3, 4+4); + SB(x4, gback, gdoffset+i); + } + break; + case 0x3D: + INST_NAME("PMAXSD Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<4; ++i) { + LW(x3, gback, gdoffset+i*4); + LW(x4, wback, fixedaddress+i*4); + if(rv64_zbb) MAX(x4, x3, x4); else BLT(x4, x3, 4+4); + SW(x4, gback, gdoffset+i*4); + } + break; + case 0x3E: + INST_NAME("PMAXUW Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<8; ++i) { + LHU(x3, gback, gdoffset+i*2); + LHU(x4, wback, fixedaddress+i*2); + if(rv64_zbb) MAXU(x4, x3, x4); else BLTU(x4, x3, 4+4); + SH(x4, gback, gdoffset+i*2); + } + break; + case 0x3F: + INST_NAME("PMAXUD Gx, Ex"); // SSE4 opcode! 
+ nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<4; ++i) { + LWU(x3, gback, gdoffset+i*4); + LWU(x4, wback, fixedaddress+i*4); + if(rv64_zbb) MAXU(x4, x3, x4); else BLTU(x4, x3, 4+4); + SW(x4, gback, gdoffset+i*4); + } + break; + case 0x40: + INST_NAME("PMULLD Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + for(int i=0; i<4; ++i) { + LW(x3, gback, gdoffset+i*4); + LW(x4, wback, fixedaddress+i*4); + MUL(x3, x3, x4); + SW(x3, gback, gdoffset+i*4); + } + break; + case 0xDB: + INST_NAME("AESIMC Gx, Ex"); // AES-NI + nextop = F8; + GETGX(); + GETEX(x2, 0); + SSE_LOOP_MV_Q(x3); + sse_forget_reg(dyn, ninst, gd); + MOV32w(x1, gd); + CALL(native_aesimc, -1); + break; + case 0xDC: + INST_NAME("AESENC Gx, Ex"); // AES-NI + nextop = F8; + GETG; + sse_forget_reg(dyn, ninst, gd); + MOV32w(x1, gd); + CALL(native_aese, -1); + GETGX(); + GETEX(x2, 0); + SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); + break; + case 0xDD: + INST_NAME("AESENCLAST Gx, Ex"); // AES-NI + nextop = F8; + GETG; + sse_forget_reg(dyn, ninst, gd); + MOV32w(x1, gd); + CALL(native_aeselast, -1); + GETGX(); + GETEX(x2, 0); + SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); + break; + case 0xDE: + INST_NAME("AESDEC Gx, Ex"); // AES-NI + nextop = F8; + GETG; + sse_forget_reg(dyn, ninst, gd); + MOV32w(x1, gd); + CALL(native_aesd, -1); + GETGX(); + GETEX(x2, 0); + SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); + break; + + case 0xDF: + INST_NAME("AESDECLAST Gx, Ex"); // AES-NI + nextop = F8; + GETG; + sse_forget_reg(dyn, ninst, gd); + MOV32w(x1, gd); + CALL(native_aesdlast, -1); + GETGX(); + GETEX(x2, 0); + SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); + break; default: DEFAULT; } @@ -346,19 +725,20 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x0B: INST_NAME("ROUNDSD Gx, Ex, Ib"); nextop = F8; - GETEXSD(d0, 0); + GETEXSD(d0, 1); GETGXSD_empty(v0); d1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); u8 = F8; FEQD(x2, d0, d0); BNEZ_MARK(x2); - FADDD(v0, d0, d0); + if (v0!=d0) FMVD(v0, d0); 
B_NEXT_nocond; MARK; // d0 is not nan - FABSD(v0, d0); + FABSD(v1, d0); MOV64x(x3, 1ULL << __DBL_MANT_DIG__); FCVTDL(d1, x3, RD_RTZ); - FLTD(x3, v0, d1); + FLTD(x3, v1, d1); BNEZ_MARK2(x3); if (v0!=d0) FMVD(v0, d0); B_NEXT_nocond; @@ -366,17 +746,258 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int if(u8&4) { u8 = sse_setround(dyn, ninst, x4, x2); FCVTLD(x5, d0, RD_DYN); - FCVTDL(v0, x5, RD_DYN); + FCVTDL(v0, x5, RD_RTZ); x87_restoreround(dyn, ninst, u8); } else { FCVTLD(x5, d0, round_round[u8&3]); - FCVTDL(v0, x5, round_round[u8&3]); + FCVTDL(v0, x5, RD_RTZ); } break; - default: + case 0x09: + INST_NAME("ROUNDPD Gx, Ex, Ib"); + nextop = F8; + GETGX(); + GETEX(x2, 1); + u8 = F8; + d0 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + MOV64x(x3, 1ULL << __DBL_MANT_DIG__); + FCVTDL(d1, x3, RD_RTZ); + + // i = 0 + FLD(d0, wback, fixedaddress); + FEQD(x4, d0, d0); + BNEZ(x4, 8); + B_MARK_nocond; + // d0 is not nan + FABSD(v1, d0); + FLTD(x4, v1, d1); + BNEZ(x4, 8); + B_MARK_nocond; + if(u8&4) { + u8 = sse_setround(dyn, ninst, x4, x5); + FCVTLD(x5, d0, RD_DYN); + FCVTDL(d0, x5, RD_RTZ); + x87_restoreround(dyn, ninst, u8); + } else { + FCVTLD(x5, d0, round_round[u8&3]); + FCVTDL(d0, x5, RD_RTZ); + } + MARK; + FSD(d0, gback, gdoffset+0); + + // i = 1 + FLD(d0, wback, fixedaddress+8); + FEQD(x4, d0, d0); + BNEZ(x4, 8); + B_MARK2_nocond; + // d0 is not nan + FABSD(v1, d0); + FLTD(x4, v1, d1); + BNEZ(x4, 8); + B_MARK2_nocond; + if(u8&4) { + u8 = sse_setround(dyn, ninst, x4, x5); + FCVTLD(x5, d0, RD_DYN); + FCVTDL(d0, x5, RD_RTZ); + x87_restoreround(dyn, ninst, u8); + } else { + FCVTLD(x5, d0, round_round[u8&3]); + FCVTDL(d0, x5, RD_RTZ); + } + MARK2; + FSD(d0, gback, gdoffset+8); + break; + case 0x0E: + INST_NAME("PBLENDW Gx, Ex, Ib"); + nextop = F8; + GETGX(); + GETEX(x2, 1); + u8 = F8; + i32 = 0; + if (MODREG && gd==ed) break; + while (u8) + if(u8&1) { + if(!(i32&1) && u8&2) { + if(!(i32&3) && 
(u8&0xf)==0xf) { + // whole 64bits + LD(x3, wback, fixedaddress+8*(i32>>2)); + SD(x3, gback, gdoffset+8*(i32>>2)); + i32+=4; + u8>>=4; + } else { + // 32bits + LWU(x3, wback, fixedaddress+4*(i32>>1)); + SW(x3, gback, gdoffset+4*(i32>>1)); + i32+=2; + u8>>=2; + } + } else { + // 16 bits + LHU(x3, wback, fixedaddress+2*i32); + SH(x3, gback, gdoffset+2*i32); + i32++; + u8>>=1; + } + } else { + // nope + i32++; + u8>>=1; + } + break; + case 0x0F: + INST_NAME("PALIGNR Gx, Ex, Ib"); + nextop = F8; + GETGX(); + GETEX(x2, 1); + u8 = F8; + sse_forget_reg(dyn, ninst, x5); + ADDI(x5, xEmu, offsetof(x64emu_t, scratch)); + // perserve gd + LD(x3, gback, gdoffset+0); + LD(x4, gback, gdoffset+8); + SD(x3, x5, 0); + SD(x4, x5, 8); + if(u8>31) { + SD(xZR, gback, gdoffset+0); + SD(xZR, gback, gdoffset+8); + } else { + for (int i=0; i<16; ++i, ++u8) { + if (u8>15) { + if(u8>31) { + SB(xZR, gback, gdoffset+i); + continue; + } + else LBU(x3, x5, u8-16); + } else { + LBU(x3, wback, fixedaddress+u8); + } + SB(x3, gback, gdoffset+i); + } + } + break; + case 0x16: + if(rex.w) {INST_NAME("PEXTRQ Ed, Gx, Ib");} else {INST_NAME("PEXTRD Ed, Gx, Ib");} + nextop = F8; + GETGX(); + GETED(1); + u8 = F8; + if(rex.w) + LD(ed, gback, gdoffset+8*(u8&1)); + else + LWU(ed, gback, gdoffset+4*(u8&3)); + if (wback) { + SDxw(ed, wback, fixedaddress); + SMWRITE2(); + } + break; + case 0x20: + INST_NAME("PINSRB Gx, ED, Ib"); + nextop = F8; + GETGX(); + GETED(1); + u8 = F8; + SB(ed, gback, gdoffset+u8&0xF); + break; + case 0x21: + INST_NAME("INSERTPS GX, EX, Ib"); + nextop = F8; + GETGX(); + GETEX(x2, 1); + u8 = F8; + if(MODREG) s8 = (u8>>6)&3; else s8 = 0; + // GX->ud[(tmp8u>>4)&3] = EX->ud[tmp8s]; + LWU(x3, wback, fixedaddress+4*s8); + SW(x3, gback, gdoffset+4*(u8>>4)); + for(int i=0; i<4; ++i) { + if(u8&(1<<i)) + // GX->ud[i] = 0; + SW(xZR, gback, gdoffset+4*i); + } + break; + case 0x22: + INST_NAME("PINSRD Gx, ED, Ib"); + nextop = F8; + GETGX(); + GETED(1); + u8 = F8; + if(rex.w) { + SD(ed, gback, 
gdoffset+8*(u8&0x1)); + } else { + SW(ed, gback, gdoffset+4*(u8&0x3)); + } + break; + case 0x44: + INST_NAME("PCLMULQDQ Gx, Ex, Ib"); + nextop = F8; + GETG; + sse_forget_reg(dyn, ninst, gd); + MOV32w(x1, gd); // gx + if(MODREG) { + ed = (nextop&7)+(rex.b<<3); + sse_forget_reg(dyn, ninst, ed); + MOV32w(x2, ed); + MOV32w(x3, 0); // p = NULL + } else { + MOV32w(x2, 0); + addr = geted(dyn, addr, ninst, nextop, &ed, x3, x5, &fixedaddress, rex, NULL, 0, 1); + if(ed!=x3) { + MV(x3, ed); + } + } + u8 = F8; + MOV32w(x4, u8); + CALL(native_pclmul, -1); + break; + case 0xDF: + INST_NAME("AESKEYGENASSIST Gx, Ex, Ib"); // AES-NI + nextop = F8; + GETG; + sse_forget_reg(dyn, ninst, gd); + MOV32w(x1, gd); // gx + if(MODREG) { + ed = (nextop&7)+(rex.b<<3); + sse_forget_reg(dyn, ninst, ed); + MOV32w(x2, ed); + MOV32w(x3, 0); //p = NULL + } else { + MOV32w(x2, 0); + addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 1); + if(ed!=x3) { + MV(x3, ed); + } + } + u8 = F8; + MOV32w(x4, u8); + CALL(native_aeskeygenassist, -1); + break; + default: DEFAULT; } break; + #define GO(GETFLAGS, NO, YES, F) \ + READFLAGS(F); \ + GETFLAGS; \ + nextop=F8; \ + GETGD; \ + if(MODREG) { \ + ed = xRAX+(nextop&7)+(rex.b<<3); \ + ZEXTH(x4, ed); \ + ed = x4; \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); \ + LHU(x4, ed, fixedaddress); \ + ed = x4; \ + } \ + B##NO(x1, 4+3*4); \ + LUI(x3, 0xffff0); \ + AND(gd, gd, x3); \ + OR(gd, gd, ed); + + GOCOND(0x40, "CMOV", "Gw, Ew"); + #undef GO case 0x50: INST_NAME("PMOVMSKD Gd, Ex"); nextop = F8; @@ -390,11 +1011,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int if (i) SLLI(x2, x2, 1); OR(gd, gd, x2); } - break; + break; case 0x51: INST_NAME("SQRTPD Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); d0 = fpu_get_scratch(dyn); if(!box64_dynarec_fastnan) { @@ -411,42 +1032,42 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, 
uintptr_t addr, uintptr_t ip, int BEQ(x3, xZR, 8); FNEGD(d0, d0); } - FSD(d0, gback, i*8); + FSD(d0, gback, gdoffset+i*8); } break; case 0x54: INST_NAME("ANDPD Gx, Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_Q(x3, x4, AND(x3, x3, x4)); break; case 0x55: INST_NAME("ANDNPD Gx, Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_Q(x3, x4, NOT(x3, x3); AND(x3, x3, x4)); break; case 0x56: INST_NAME("ORPD Gx, Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_Q(x3, x4, OR(x3, x3, x4)); break; case 0x57: INST_NAME("XORPD Gx, Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); break; case 0x58: INST_NAME("ADDPD Gx, Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_FQ(x3, x4, { if(!box64_dynarec_fastnan) { FEQD(x3, v0, v0); @@ -466,7 +1087,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int INST_NAME("MULPD Gx, Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_FQ(x3, x4, { if(!box64_dynarec_fastnan) { FEQD(x3, v0, v0); @@ -485,24 +1106,24 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x5A: INST_NAME("CVTPD2PS Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); d0 = fpu_get_scratch(dyn); // GX->f[0] = EX->d[0]; FLD(d0, wback, fixedaddress+0); FCVTSD(d0, d0); - FSD(d0, gback, 0); + FSD(d0, gback, gdoffset+0); // GX->f[1] = EX->d[1]; FLD(d0, wback, fixedaddress+8); FCVTSD(d0, d0); - FSD(d0, gback, 4); + FSD(d0, gback, gdoffset+4); // GX->q[1] = 0; - SD(xZR, gback, 8); + SD(xZR, gback, gdoffset+8); break; case 0x5B: INST_NAME("CVTPS2DQ Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); d0 = fpu_get_scratch(dyn); u8 = sse_setround(dyn, ninst, x6, x4); @@ -513,7 +1134,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int SUB(x5, x5, x3); BEQZ(x5, 8); LUI(x3, 0x80000); // INT32_MIN - SW(x3, gback, 4*i); + SW(x3, gback, gdoffset+4*i); } 
x87_restoreround(dyn, ninst, u8); break; @@ -521,7 +1142,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int INST_NAME("SUBPD Gx, Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_FQ(x3, x4, { if(!box64_dynarec_fastnan) { FEQD(x3, v0, v0); @@ -540,12 +1161,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x5D: INST_NAME("MINPD Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); d0 = fpu_get_scratch(dyn); d1 = fpu_get_scratch(dyn); for (int i=0; i<2; ++i) { - FLD(d0, gback, 8*i); + FLD(d0, gback, gdoffset+8*i); FLD(d1, wback, fixedaddress+8*i); FEQD(x3, d0, d0); FEQD(x4, d1, d1); @@ -553,14 +1174,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int BEQ(x3, xZR, 12); FLTD(x3, d1, d0); BEQ(x3, xZR, 8); // continue - FSD(d1, gback, 8*i); + FSD(d1, gback, gdoffset+8*i); } break; case 0x5E: INST_NAME("DIVPD Gx, Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_FQ(x3, x4, { if(!box64_dynarec_fastnan) { FEQD(x3, v0, v0); @@ -579,12 +1200,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x5F: INST_NAME("MAXPD Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); d0 = fpu_get_scratch(dyn); d1 = fpu_get_scratch(dyn); for (int i=0; i<2; ++i) { - FLD(d0, gback, 8*i); + FLD(d0, gback, gdoffset+8*i); FLD(d1, wback, fixedaddress+8*i); FEQD(x3, d0, d0); FEQD(x4, d1, d1); @@ -592,54 +1213,54 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int BEQ(x3, xZR, 12); FLTD(x3, d0, d1); BEQ(x3, xZR, 8); // continue - FSD(d1, gback, 8*i); + FSD(d1, gback, gdoffset+8*i); } break; case 0x60: INST_NAME("PUNPCKLBW Gx,Ex"); nextop = F8; - GETGX(x2); + GETGX(); for(int i=7; i>0; --i) { // 0 is untouched // GX->ub[2 * i] = GX->ub[i]; - LBU(x3, gback, i); - SB(x3, gback, 2*i); + LBU(x3, gback, gdoffset+i); + SB(x3, gback, gdoffset+2*i); } if (MODREG && 
gd==(nextop&7)+(rex.b<<3)) { for(int i=0; i<8; ++i) { // GX->ub[2 * i + 1] = GX->ub[2 * i]; - LBU(x3, gback, 2*i); - SB(x3, gback, 2*i+1); + LBU(x3, gback, gdoffset+2*i); + SB(x3, gback, gdoffset+2*i+1); } } else { GETEX(x1, 0); for(int i=0; i<8; ++i) { // GX->ub[2 * i + 1] = EX->ub[i]; LBU(x3, wback, fixedaddress+i); - SB(x3, gback, 2*i+1); + SB(x3, gback, gdoffset+2*i+1); } } break; case 0x61: INST_NAME("PUNPCKLWD Gx,Ex"); nextop = F8; - GETGX(x2); + GETGX(); for(int i=3; i>0; --i) { // GX->uw[2 * i] = GX->uw[i]; - LHU(x3, gback, i*2); - SH(x3, gback, 2*i*2); + LHU(x3, gback, gdoffset+i*2); + SH(x3, gback, gdoffset+2*i*2); } if (MODREG && gd==(nextop&7)+(rex.b<<3)) { for(int i=0; i<4; ++i) { // GX->uw[2 * i + 1] = GX->uw[2 * i]; - LHU(x3, gback, 2*i*2); - SH(x3, gback, (2*i+1)*2); + LHU(x3, gback, gdoffset+2*i*2); + SH(x3, gback, gdoffset+(2*i+1)*2); } } else { GETEX(x1, 0); for(int i=0; i<4; ++i) { // GX->uw[2 * i + 1] = EX->uw[i]; LHU(x3, wback, fixedaddress+i*2); - SH(x3, gback, (2*i+1)*2); + SH(x3, gback, gdoffset+(2*i+1)*2); } } break; @@ -647,71 +1268,108 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int INST_NAME("PUNPCKLDQ Gx,Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); // GX->ud[3] = EX->ud[1]; - LWU(x3, x1, fixedaddress+1*4); - SW(x3, x2, 3*4); + LWU(x3, wback, fixedaddress+1*4); + SW(x3, gback, gdoffset+3*4); // GX->ud[2] = GX->ud[1]; - LWU(x3, x2, 1*4); - SW(x3, x2, 2*4); + LWU(x3, gback, gdoffset+1*4); + SW(x3, gback, gdoffset+2*4); // GX->ud[1] = EX->ud[0]; - LWU(x3, x1, fixedaddress+0*4); - SW(x3, x2, 1*4); + LWU(x3, wback, fixedaddress+0*4); + SW(x3, gback, gdoffset+1*4); + break; + case 0x63: + INST_NAME("PACKSSWB Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + MOV64x(x5, 127); + MOV64x(x6, -128); + for(int i=0; i<8; ++i) { + LH(x3, gback, gdoffset+i*2); + if(rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, x6); + } else { + BLT(x3, x5, 4+4); + MV(x3, x5); + BGE(x3, x6, 4+4); + MV(x3, x6); + } + 
SB(x3, gback, gdoffset+i); + } + if(MODREG && gd==ed) { + LD(x3, gback, gdoffset+0); + SD(x3, gback, gdoffset+8); + } else for(int i=0; i<8; ++i) { + LH(x3, wback, fixedaddress+i*2); + if(rv64_zbb) { + MIN(x3, x3, x5); + MAX(x3, x3, x6); + } else { + BLT(x3, x5, 4+4); + MV(x3, x5); + BGE(x3, x6, 4+4); + MV(x3, x6); + } + SB(x3, gback, gdoffset+8+i); + } break; case 0x64: INST_NAME("PCMPGTB Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<16; ++i) { // GX->ub[i] = (GX->sb[i]>EX->sb[i])?0xFF:0x00; LB(x3, wback, fixedaddress+i); - LB(x4, gback, i); + LB(x4, gback, gdoffset+i); SLT(x3, x3, x4); NEG(x3, x3); - SB(x3, gback, i); + SB(x3, gback, gdoffset+i); } break; case 0x65: INST_NAME("PCMPGTW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<8; ++i) { // GX->uw[i] = (GX->sw[i]>EX->sw[i])?0xFFFF:0x0000; LH(x3, wback, fixedaddress+i*2); - LH(x4, gback, i*2); + LH(x4, gback, gdoffset+i*2); SLT(x3, x3, x4); NEG(x3, x3); - SH(x3, gback, i*2); + SH(x3, gback, gdoffset+i*2); } break; case 0x66: INST_NAME("PCMPGTD Gx,Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); SSE_LOOP_DS(x3, x4, SLT(x4, x4, x3); SLLI(x3, x4, 63); SRAI(x3, x3, 63)); break; case 0x67: INST_NAME("PACKUSWB Gx, Ex"); nextop = F8; - GETGX(x2); + GETGX(); ADDI(x5, xZR, 0xFF); for(int i=0; i<8; ++i) { // GX->ub[i] = (GX->sw[i]<0)?0:((GX->sw[i]>0xff)?0xff:GX->sw[i]); - LH(x3, gback, i*2); + LH(x3, gback, gdoffset+i*2); BGE(x5, x3, 8); ADDI(x3, xZR, 0xFF); NOT(x4, x3); SRAI(x4, x4, 63); AND(x3, x3, x4); - SB(x3, gback, i); + SB(x3, gback, gdoffset+i); } if (MODREG && gd==(nextop&7)+(rex.b<<3)) { // GX->q[1] = GX->q[0]; - LD(x3, gback, 0*8); - SD(x3, gback, 1*8); + LD(x3, gback, gdoffset+0*8); + SD(x3, gback, gdoffset+1*8); } else { GETEX(x1, 0); for(int i=0; i<8; ++i) { @@ -722,55 +1380,55 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int NOT(x4, x3); SRAI(x4, x4, 63); AND(x3, x3, x4); - SB(x3, gback, 8+i); + SB(x3, gback, 
gdoffset+8+i); } } break; case 0x68: INST_NAME("PUNPCKHBW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); for(int i=0; i<8; ++i) { // GX->ub[2 * i] = GX->ub[i + 8]; - LBU(x3, gback, i+8); - SB(x3, gback, 2*i); + LBU(x3, gback, gdoffset+i+8); + SB(x3, gback, gdoffset+2*i); } if (MODREG && gd==(nextop&7)+(rex.b<<3)) { for(int i=0; i<8; ++i) { // GX->ub[2 * i + 1] = GX->ub[2 * i]; - LBU(x3, gback, 2*i); - SB(x3, gback, 2*i+1); + LBU(x3, gback, gdoffset+2*i); + SB(x3, gback, gdoffset+2*i+1); } } else { GETEX(x2, 0); for(int i=0; i<8; ++i) { // GX->ub[2 * i + 1] = EX->ub[i + 8]; LBU(x3, wback, fixedaddress+i+8); - SB(x3, gback, 2*i+1); + SB(x3, gback, gdoffset+2*i+1); } } break; case 0x69: INST_NAME("PUNPCKHWD Gx,Ex"); nextop = F8; - GETGX(x2); + GETGX(); for(int i=0; i<4; ++i) { // GX->uw[2 * i] = GX->uw[i + 4]; - LHU(x3, gback, (i+4)*2); - SH(x3, gback, 2*i*2); + LHU(x3, gback, gdoffset+(i+4)*2); + SH(x3, gback, gdoffset+2*i*2); } if (MODREG && gd==(nextop&7)+(rex.b<<3)) { for(int i=0; i<4; ++i) { // GX->uw[2 * i + 1] = GX->uw[2 * i]; - LHU(x3, gback, 2*i*2); - SH(x3, gback, (2*i+1)*2); + LHU(x3, gback, gdoffset+2*i*2); + SH(x3, gback, gdoffset+(2*i+1)*2); } } else { GETEX(x1, 0); for(int i=0; i<4; ++i) { // GX->uw[2 * i + 1] = EX->uw[i + 4]; LHU(x3, wback, fixedaddress+(i+4)*2); - SH(x3, gback, (2*i+1)*2); + SH(x3, gback, gdoffset+(2*i+1)*2); } } break; @@ -778,41 +1436,41 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int INST_NAME("PUNPCKHDQ Gx,Ex"); nextop = F8; GETEX(x1, 0); - GETGX(x2); + GETGX(); // GX->ud[0] = GX->ud[2]; - LWU(x3, gback, 2*4); - SW(x3, gback, 0*4); + LWU(x3, gback, gdoffset+2*4); + SW(x3, gback, gdoffset+0*4); // GX->ud[1] = EX->ud[2]; LWU(x3, wback, fixedaddress+2*4); - SW(x3, gback, 1*4); + SW(x3, gback, gdoffset+1*4); // GX->ud[2] = GX->ud[3]; - LWU(x3, gback, 3*4); - SW(x3, gback, 2*4); + LWU(x3, gback, gdoffset+3*4); + SW(x3, gback, gdoffset+2*4); // GX->ud[3] = EX->ud[3]; if (!(MODREG && (gd==ed))) { LWU(x3, 
wback, fixedaddress+3*4); - SW(x3, gback, 3*4); + SW(x3, gback, gdoffset+3*4); } break; case 0x6B: INST_NAME("PACKSSDW Gx,Ex"); nextop = F8; - GETGX(x2); + GETGX(); MOV64x(x5, 32768); NEG(x6, x5); for(int i=0; i<4; ++i) { // GX->sw[i] = (GX->sd[i]<-32768)?-32768:((GX->sd[i]>32767)?32767:GX->sd[i]); - LW(x3, gback, i*4); + LW(x3, gback, gdoffset+i*4); BGE(x5, x3, 8); ADDI(x3, x5, -1); BGE(x3, x6, 8); MV(x3, x6); - SH(x3, gback, i*2); + SH(x3, gback, gdoffset+i*2); } if (MODREG && gd==(nextop&7)+(rex.b<<3)) { // GX->q[1] = GX->q[0]; - LD(x3, gback, 0*8); - SD(x3, gback, 1*8); + LD(x3, gback, gdoffset+0*8); + SD(x3, gback, gdoffset+1*8); } else { GETEX(x1, 0); for(int i=0; i<4; ++i) { @@ -822,32 +1480,32 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ADDI(x3, x5, -1); BGE(x3, x6, 8); MV(x3, x6); - SH(x3, gback, (4+i)*2); + SH(x3, gback, gdoffset+(4+i)*2); } } break; case 0x6C: INST_NAME("PUNPCKLQDQ Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); if(MODREG) { v1 = sse_get_reg(dyn, ninst, x2, (nextop&7)+(rex.b<<3), 0); - FSD(v1, gback, 8); + FSD(v1, gback, gdoffset+8); } else { - addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0); LD(x3, ed, fixedaddress+0); - SD(x3, gback, 8); + SD(x3, gback, gdoffset+8); } break; case 0x6D: INST_NAME("PUNPCKHQDQ Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); - LD(x3, gback, 8); - SD(x3, gback, 0); + LD(x3, gback, gdoffset+8); + SD(x3, gback, gdoffset+0); LD(x3, wback, fixedaddress+8); - SD(x3, gback, 8); + SD(x3, gback, gdoffset+8); break; case 0x6E: INST_NAME("MOVD Gx, Ed"); @@ -869,14 +1527,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x6F: INST_NAME("MOVDQA Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_MV_Q(x3); break; case 0x70: // TODO: Optimize this! 
INST_NAME("PSHUFD Gx,Ex,Ib"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 1); u8 = F8; int32_t idx; @@ -890,10 +1548,10 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int idx = (u8>>(3*2))&3; LWU(x6, wback, fixedaddress+idx*4); - SW(x3, gback, 0*4); - SW(x4, gback, 1*4); - SW(x5, gback, 2*4); - SW(x6, gback, 3*4); + SW(x3, gback, gdoffset+0*4); + SW(x4, gback, gdoffset+1*4); + SW(x5, gback, gdoffset+2*4); + SW(x6, gback, gdoffset+3*4); break; case 0x71: nextop = F8; @@ -904,8 +1562,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int u8 = F8; if (u8>15) { // just zero dest - SD(xZR, x1, fixedaddress+0); - SD(xZR, x1, fixedaddress+8); + SD(xZR, wback, fixedaddress+0); + SD(xZR, wback, fixedaddress+8); } else if(u8) { for (int i=0; i<8; ++i) { // EX->uw[i] >>= u8; @@ -935,8 +1593,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int u8 = F8; if (u8>15) { // just zero dest - SD(xZR, x1, fixedaddress+0); - SD(xZR, x1, fixedaddress+8); + SD(xZR, wback, fixedaddress+0); + SD(xZR, wback, fixedaddress+8); } else if(u8) { for (int i=0; i<8; ++i) { // EX->uw[i] <<= u8; @@ -961,8 +1619,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int if(u8) { if (u8>31) { // just zero dest - SD(xZR, x1, fixedaddress+0); - SD(xZR, x1, fixedaddress+8); + SD(xZR, wback, fixedaddress+0); + SD(xZR, wback, fixedaddress+8); } else if(u8) { SSE_LOOP_D_S(x3, SRLI(x3, x3, u8)); } @@ -984,8 +1642,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int if(u8) { if (u8>31) { // just zero dest - SD(xZR, x1, fixedaddress+0); - SD(xZR, x1, fixedaddress+8); + SD(xZR, wback, fixedaddress+0); + SD(xZR, wback, fixedaddress+8); } else if(u8) { SSE_LOOP_D_S(x3, SLLI(x3, x3, u8)); } @@ -1023,24 +1681,24 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int if(!u8) break; if(u8>15) { // just zero dest - SD(xZR, x1, 
fixedaddress+0); - SD(xZR, x1, fixedaddress+8); + SD(xZR, wback, fixedaddress+0); + SD(xZR, wback, fixedaddress+8); } else { u8*=8; if (u8 < 64) { - LD(x3, x1, fixedaddress+0); - LD(x4, x1, fixedaddress+8); + LD(x3, wback, fixedaddress+0); + LD(x4, wback, fixedaddress+8); SRLI(x3, x3, u8); SLLI(x5, x4, 64-u8); OR(x3, x3, x5); - SD(x3, x1, fixedaddress+0); + SD(x3, wback, fixedaddress+0); SRLI(x4, x4, u8); - SD(x4, x1, fixedaddress+8); + SD(x4, wback, fixedaddress+8); } else { - LD(x3, x1, fixedaddress+8); + LD(x3, wback, fixedaddress+8); if (u8-64 > 0) { SRLI(x3, x3, u8-64); } - SD(x3, x1, fixedaddress+0); - SD(xZR, x1, fixedaddress+8); + SD(x3, wback, fixedaddress+0); + SD(xZR, wback, fixedaddress+8); } } break; @@ -1051,8 +1709,8 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int if(!u8) break; if(u8>63) { // just zero dest - SD(xZR, x1, fixedaddress+0); - SD(xZR, x1, fixedaddress+8); + SD(xZR, wback, fixedaddress+0); + SD(xZR, wback, fixedaddress+8); } else { LD(x3, wback, fixedaddress+0); LD(x4, wback, fixedaddress+8); @@ -1069,24 +1727,24 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int if(!u8) break; if(u8>15) { // just zero dest - SD(xZR, x1, fixedaddress+0); - SD(xZR, x1, fixedaddress+8); + SD(xZR, wback, fixedaddress+0); + SD(xZR, wback, fixedaddress+8); } else { u8*=8; if (u8 < 64) { - LD(x3, x1, fixedaddress+0); - LD(x4, x1, fixedaddress+8); + LD(x3, wback, fixedaddress+0); + LD(x4, wback, fixedaddress+8); SLLI(x4, x4, u8); SRLI(x5, x3, 64-u8); OR(x4, x4, x5); - SD(x4, x1, fixedaddress+8); + SD(x4, wback, fixedaddress+8); SLLI(x3, x3, u8); - SD(x3, x1, fixedaddress+0); + SD(x3, wback, fixedaddress+0); } else { - LD(x3, x1, fixedaddress+0); + LD(x3, wback, fixedaddress+0); if (u8-64 > 0) { SLLI(x3, x3, u8-64); } - SD(x3, x1, fixedaddress+8); - SD(xZR, x1, fixedaddress+0); + SD(x3, wback, fixedaddress+8); + SD(xZR, wback, fixedaddress+0); } } break; @@ -1097,52 +1755,94 @@ uintptr_t 
dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x74: INST_NAME("PCMPEQB Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for (int i=0; i<16; ++i) { - LBU(x3, gback, i); + LBU(x3, gback, gdoffset+i); LBU(x4, wback, fixedaddress+i); SUB(x3, x3, x4); SEQZ(x3, x3); NEG(x3, x3); - SB(x3, gback, i); + SB(x3, gback, gdoffset+i); } break; case 0x75: INST_NAME("PCMPEQW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_W(x3, x4, SUB(x3, x3, x4); SEQZ(x3, x3); NEG(x3, x3)); break; case 0x76: INST_NAME("PCMPEQD Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_D(x3, x4, XOR(x3, x3, x4); SNEZ(x3, x3); ADDI(x3, x3, -1)); break; + case 0x7C: + INST_NAME("HADDPD Gx, Ex"); + nextop = F8; + GETGX(); + d0 = fpu_get_scratch(dyn); + d1 = fpu_get_scratch(dyn); + FLD(d0, gback, gdoffset+0); + FLD(d1, gback, gdoffset+8); + if(!box64_dynarec_fastnan) { + FEQD(x3, d0, d0); + FEQD(x4, d1, d1); + AND(x3, x3, x4); + } + FADDD(d0, d0, d1); + if(!box64_dynarec_fastnan) { + FEQD(x4, d0, d0); + BEQZ(x3, 12); + BNEZ(x4, 8); + FNEGD(d0, d0); + } + FSD(d0, gback, gdoffset+0); + if(MODREG && gd==(nextop&7)+(rex.b<<3)) { + FSD(d0, gback, gdoffset+8); + } else { + GETEX(x2, 0); + FLD(d0, wback, fixedaddress+0); + FLD(d1, wback, fixedaddress+8); + if(!box64_dynarec_fastnan) { + FEQD(x3, d0, d0); + FEQD(x4, d1, d1); + AND(x3, x3, x4); + } + FADDD(d0, d0, d1); + if(!box64_dynarec_fastnan) { + FEQD(x4, d0, d0); + BEQZ(x3, 12); + BNEZ(x4, 8); + FNEGD(d0, d0); + } + FSD(d0, gback, gdoffset+8); + } + break; case 0x7E: INST_NAME("MOVD Ed,Gx"); nextop = F8; - GETGX(x1); + GETGX(); if(rex.w) { if(MODREG) { ed = xRAX + (nextop&7) + (rex.b<<3); - LD(ed, x1, 0); + LD(ed, gback, gdoffset+0); } else { - addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0); - LD(x3, x1, 0); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0); + LD(x3, gback, gdoffset+0); SD(x3, ed, fixedaddress); 
SMWRITE2(); } } else { if(MODREG) { ed = xRAX + (nextop&7) + (rex.b<<3); - LWU(ed, x1, 0); + LWU(ed, gback, gdoffset+0); } else { - addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0); - LWU(x3, x1, 0); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0); + LWU(x3, gback, gdoffset+0); SW(x3, ed, fixedaddress); SMWRITE2(); } @@ -1151,7 +1851,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x7F: INST_NAME("MOVDQA Ex,Gx"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_MV_Q2(x3); if(!MODREG) SMWRITE2(); @@ -1165,8 +1865,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int GETSGW(x2); MULW(x2, x2, x1); UFLAG_RES(x2); - SLLI(x2, x2, 48); - SRLI(x2, x2, 48); + ZEXTH(x2, x2); GWBACK; break; @@ -1188,7 +1887,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int SRAI(x1, x1, 56); } else { SMREAD(); - addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 0, 0); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x4, &fixedaddress, rex, NULL, 1, 0); LB(x1, ed, fixedaddress); } LUI(x5, 0xffff0); @@ -1200,13 +1899,13 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0xC2: INST_NAME("CMPPD Gx, Ex, Ib"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 1); u8 = F8; d0 = fpu_get_scratch(dyn); d1 = fpu_get_scratch(dyn); for(int i=0; i<2; ++i) { - FLD(d0, gback, 8*i); + FLD(d0, gback, gdoffset+8*i); FLD(d1, wback, fixedaddress+8*i); if ((u8&7) == 0) { // Equal FEQD(x3, d0, d1); @@ -1237,7 +1936,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int } case 7: break; // Not NaN } - + // MARK2; if ((u8&7) == 5 || (u8&7) == 6) { MOV32w(x3, 1); @@ -1245,16 +1944,16 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int // MARK; } NEG(x3, x3); - SD(x3, gback, 8*i); + SD(x3, gback, 
gdoffset+8*i); } break; case 0xC4: INST_NAME("PINSRW Gx,Ed,Ib"); nextop = F8; GETED(1); - GETGX(x3); + GETGX(); u8 = (F8)&7; - SH(ed, gback, u8*2); + SH(ed, gback, gdoffset+u8*2); break; case 0xC5: INST_NAME("PEXTRW Gd,Ex,Ib"); @@ -1267,90 +1966,90 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0xC6: INST_NAME("SHUFPD Gx, Ex, Ib"); nextop = F8; - GETGX(x1); + GETGX(); + GETEX(x2, 1); u8 = F8; if (MODREG && gd==(nextop&7)+(rex.b<<3) && u8==0) { - LD(x3, gback, 0); - SD(x3, gback, 8); + LD(x3, gback, gdoffset+0); + SD(x3, gback, gdoffset+8); break; } - GETEX(x2, 1) - LD(x3, gback, 8*(u8&1)); + LD(x3, gback, gdoffset+8*(u8&1)); LD(x4, wback, fixedaddress+8*((u8>>1)&1)); - SD(x3, gback, 0); - SD(x4, gback, 8); + SD(x3, gback, gdoffset+0); + SD(x4, gback, gdoffset+8); break; case 0xD1: INST_NAME("PSRLW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); LD(x3, wback, fixedaddress); ADDI(x4, xZR, 16); BLTU_MARK(x3, x4); - SD(xZR, gback, 0); - SD(xZR, gback, 8); + SD(xZR, gback, gdoffset+0); + SD(xZR, gback, gdoffset+8); B_NEXT_nocond; MARK; for (int i=0; i<8; ++i) { - LHU(x5, gback, 2*i); + LHU(x5, gback, gdoffset+2*i); SRLW(x5, x5, x3); - SH(x5, gback, 2*i); + SH(x5, gback, gdoffset+2*i); } break; case 0xD2: INST_NAME("PSRLD Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); LD(x3, wback, fixedaddress); ADDI(x4, xZR, 32); BLTU_MARK(x3, x4); - SD(xZR, gback, 0); - SD(xZR, gback, 8); + SD(xZR, gback, gdoffset+0); + SD(xZR, gback, gdoffset+8); B_NEXT_nocond; MARK; for (int i=0; i<4; ++i) { - LWU(x5, gback, 4*i); + LWU(x5, gback, gdoffset+4*i); SRLW(x5, x5, x3); - SW(x5, gback, 4*i); + SW(x5, gback, gdoffset+4*i); } break; case 0xD3: INST_NAME("PSRLQ Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); LD(x3, wback, fixedaddress); ADDI(x4, xZR, 64); BLTU_MARK(x3, x4); - SD(xZR, gback, 0); - SD(xZR, gback, 8); + SD(xZR, gback, gdoffset+0); + SD(xZR, gback, gdoffset+8); B_NEXT_nocond; MARK; for (int i=0; i<2; 
++i) { - LD(x5, gback, 8*i); + LD(x5, gback, gdoffset+8*i); SRL(x5, x5, x3); - SD(x5, gback, 8*i); + SD(x5, gback, gdoffset+8*i); } break; case 0xD4: INST_NAME("PADDQ Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_Q(x3, x4, ADD(x3, x3, x4)); break; case 0xD5: INST_NAME("PMULLW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<8; ++i) { - LH(x3, gback, 2*i); + LH(x3, gback, gdoffset+2*i); LH(x4, wback, fixedaddress+2*i); MULW(x3, x3, x4); - SH(x3, gback, 2*i); + SH(x3, gback, gdoffset+2*i); } break; case 0xD6: @@ -1381,314 +2080,347 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0xD8: INST_NAME("PSUBUSB Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<16; ++i) { - LBU(x3, gback, i); + LBU(x3, gback, gdoffset+i); LBU(x4, wback, fixedaddress+i); SUB(x3, x3, x4); NOT(x4, x3); SRAI(x4, x4, 63); AND(x3, x3, x4); - SB(x3, gback, i); + SB(x3, gback, gdoffset+i); } break; case 0xD9: INST_NAME("PSUBUSW Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_W(x3, x4, SUB(x3, x3, x4); NOT(x4, x3); SRAI(x4, x4, 63); AND(x3, x3, x4)); break; case 0xDA: INST_NAME("PMINUB Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for (int i=0; i<16; ++i) { - LBU(x3, gback, i); + LBU(x3, gback, gdoffset+i); LBU(x4, wback, fixedaddress+i); BLTU(x3, x4, 8); MV(x3, x4); - SB(x3, gback, i); + SB(x3, gback, gdoffset+i); } break; case 0xDB: INST_NAME("PAND Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_Q(x3, x4, AND(x3, x3, x4)); break; case 0xDC: INST_NAME("PADDUSB Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); ADDI(x5, xZR, 0xFF); for(int i=0; i<16; ++i) { - LBU(x3, gback, i); + LBU(x3, gback, gdoffset+i); LBU(x4, wback, fixedaddress+i); ADD(x3, x3, x4); BLT(x3, x5, 8); ADDI(x3, xZR, 0xFF); - SB(x3, gback, i); + SB(x3, gback, gdoffset+i); } break; case 0xDD: INST_NAME("PADDUSW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); 
GETEX(x2, 0); for(int i=0; i<8; ++i) { // tmp32s = (int32_t)GX->uw[i] + EX->uw[i]; // GX->uw[i] = (tmp32s>65535)?65535:tmp32s; - LHU(x3, gback, i*2); + LHU(x3, gback, gdoffset+i*2); LHU(x4, wback, fixedaddress+i*2); ADDW(x3, x3, x4); MOV32w(x4, 65536); BLT(x3, x4, 8); ADDIW(x3, x4, -1); - SH(x3, gback, i*2); + SH(x3, gback, gdoffset+i*2); } break; case 0xDE: INST_NAME("PMAXUB Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for (int i=0; i<16; ++i) { - LBU(x3, gback, i); + LBU(x3, gback, gdoffset+i); LBU(x4, wback, fixedaddress+i); BLTU(x4, x3, 8); MV(x3, x4); - SB(x3, gback, i); + SB(x3, gback, gdoffset+i); } break; case 0xDF: INST_NAME("PANDN Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_Q(x3, x4, NOT(x3, x3); AND(x3, x3, x4)); break; case 0xE0: INST_NAME("PAVGB Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for (int i=0; i<16; ++i) { - LBU(x3, gback, i); + LBU(x3, gback, gdoffset+i); LBU(x4, wback, fixedaddress+i); ADDW(x3, x3, x4); ADDIW(x3, x3, 1); SRAIW(x3, x3, 1); - SB(x3, gback, i); + SB(x3, gback, gdoffset+i); } break; case 0xE1: INST_NAME("PSRAW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); ADDI(x4, xZR, 16); LD(x3, wback, fixedaddress); BLTU(x3, x4, 8); SUBI(x3, x4, 1); for (int i=0; i<8; ++i) { - LH(x4, gback, 2*i); + LH(x4, gback, gdoffset+2*i); SRAW(x4, x4, x3); - SH(x4, gback, 2*i); + SH(x4, gback, gdoffset+2*i); } break; case 0xE2: INST_NAME("PSRAD Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); ADDI(x4, xZR, 32); LD(x3, wback, fixedaddress); BLTU(x3, x4, 8); SUBI(x3, x4, 1); for (int i=0; i<4; ++i) { - LW(x4, gback, 4*i); + LW(x4, gback, gdoffset+4*i); SRAW(x4, x4, x3); - SW(x4, gback, 4*i); + SW(x4, gback, gdoffset+4*i); } break; case 0xE3: INST_NAME("PAVGW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for (int i=0; i<8; ++i) { - LHU(x3, gback, 2*i); + LHU(x3, gback, gdoffset+2*i); LHU(x4, wback, fixedaddress+2*i); ADDW(x3, x3, x4); ADDIW(x3, x3, 1); SRAIW(x3, 
x3, 1); - SH(x3, gback, 2*i); + SH(x3, gback, gdoffset+2*i); } break; case 0xE4: INST_NAME("PMULHUW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<8; ++i) { - LHU(x3, gback, 2*i); + LHU(x3, gback, gdoffset+2*i); LHU(x4, wback, fixedaddress+2*i); MULW(x3, x3, x4); SRLIW(x3, x3, 16); - SH(x3, gback, 2*i); + SH(x3, gback, gdoffset+2*i); } break; case 0xE5: INST_NAME("PMULHW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<8; ++i) { - LH(x3, gback, 2*i); + LH(x3, gback, gdoffset+2*i); LH(x4, wback, fixedaddress+2*i); MULW(x3, x3, x4); SRAIW(x3, x3, 16); - SH(x3, gback, 2*i); + SH(x3, gback, gdoffset+2*i); } break; + case 0xE6: + INST_NAME("CVTTPD2DQ Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + FLD(v0, wback, fixedaddress+0); + FLD(v1, wback, fixedaddress+8); + if(!box64_dynarec_fastround) { + FSFLAGSI(0); // // reset all bits + } + FCVTWD(x3, v0, RD_RTZ); + if(!box64_dynarec_fastround) { + FRFLAGS(x5); // get back FPSR to check the IOC bit + ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF)); + BEQ_MARK(x5, xZR); + MOV32w(x3, 0x80000000); + MARK; + FSFLAGSI(0); // // reset all bits + } + FCVTWD(x4, v1, RD_RTZ); + if(!box64_dynarec_fastround) { + FRFLAGS(x5); // get back FPSR to check the IOC bit + ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF)); + BEQ_MARK2(x5, xZR); + MOV32w(x4, 0x80000000); + MARK2; + } + SW(x3, gback, gdoffset+0); + SW(x4, gback, gdoffset+4); + SD(xZR, gback, gdoffset+8); + break; case 0xE7: INST_NAME("MOVNTDQ Ex, Gx"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_MV_Q2(x3); break; case 0xE8: INST_NAME("PSUBSB Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<16; ++i) { // tmp16s = (int16_t)GX->sb[i] - EX->sb[i]; // GX->sb[i] = (tmp16s<-128)?-128:((tmp16s>127)?127:tmp16s); - LB(x3, gback, i); + LB(x3, gback, gdoffset+i); LB(x4, wback, fixedaddress+i); SUBW(x3, x3, x4); SLLIW(x3, x3, 16); SRAIW(x3, x3, 16); ADDI(x4, 
xZR, 0x7f); BLT(x3, x4, 12); // tmp16s>127? - SB(x4, gback, i); + SB(x4, gback, gdoffset+i); J(24); // continue ADDI(x4, xZR, 0xf80); BLT(x4, x3, 12); // tmp16s<-128? - SB(x4, gback, i); + SB(x4, gback, gdoffset+i); J(8); // continue - SB(x3, gback, i); + SB(x3, gback, gdoffset+i); } break; case 0xE9: INST_NAME("PSUBSW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<8; ++i) { // tmp32s = (int32_t)GX->sw[i] - EX->sw[i]; // GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s); - LH(x3, gback, 2*i); + LH(x3, gback, gdoffset+2*i); LH(x4, wback, fixedaddress+2*i); SUBW(x3, x3, x4); LUI(x4, 0xFFFF8); // -32768 BGE(x3, x4, 12); - SH(x4, gback, 2*i); + SH(x4, gback, gdoffset+2*i); J(20); // continue LUI(x4, 8); // 32768 BLT(x3, x4, 8); ADDIW(x3, x4, -1); - SH(x3, gback, 2*i); + SH(x3, gback, gdoffset+2*i); } break; case 0xEA: INST_NAME("PMINSW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for (int i=0; i<8; ++i) { - LH(x3, gback, 2*i); + LH(x3, gback, gdoffset+2*i); LH(x4, wback, fixedaddress+2*i); BLT(x3, x4, 8); MV(x3, x4); - SH(x3, gback, 2*i); + SH(x3, gback, gdoffset+2*i); } break; case 0xEB: INST_NAME("POR Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_Q(x3, x4, OR(x3, x3, x4)); break; case 0xEC: INST_NAME("PADDSB Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<16; ++i) { // tmp16s = (int16_t)GX->sb[i] + EX->sb[i]; // GX->sb[i] = (tmp16s>127)?127:((tmp16s<-128)?-128:tmp16s); - LB(x3, gback, i); + LB(x3, gback, gdoffset+i); LB(x4, wback, fixedaddress+i); ADDW(x3, x3, x4); SLLIW(x3, x3, 16); SRAIW(x3, x3, 16); ADDI(x4, xZR, 0x7f); BLT(x3, x4, 12); // tmp16s>127? - SB(x4, gback, i); + SB(x4, gback, gdoffset+i); J(24); // continue ADDI(x4, xZR, 0xf80); BLT(x4, x3, 12); // tmp16s<-128? 
- SB(x4, gback, i); + SB(x4, gback, gdoffset+i); J(8); // continue - SB(x3, gback, i); + SB(x3, gback, gdoffset+i); } break; case 0xED: INST_NAME("PADDSW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<8; ++i) { // tmp32s = (int32_t)GX->sw[i] + EX->sw[i]; // GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s); - LH(x3, gback, 2*i); + LH(x3, gback, gdoffset+2*i); LH(x4, wback, fixedaddress+2*i); ADDW(x3, x3, x4); LUI(x4, 0xFFFF8); // -32768 BGE(x3, x4, 12); - SH(x4, gback, 2*i); + SH(x4, gback, gdoffset+2*i); J(20); // continue LUI(x4, 8); // 32768 BLT(x3, x4, 8); ADDIW(x3, x4, -1); - SH(x3, gback, 2*i); + SH(x3, gback, gdoffset+2*i); } break; case 0xEE: INST_NAME("PMAXSW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_WS(x3, x4, BGE(x3, x4, 8); MV(x3, x4)); break; case 0xEF: INST_NAME("PXOR Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); if(MODREG && gd==(nextop&7)+(rex.b<<3)) { // just zero dest - SD(xZR, x1, 0); - SD(xZR, x1, 8); + SD(xZR, gback, gdoffset+0); + SD(xZR, gback, gdoffset+8); } else { GETEX(x2, 0); SSE_LOOP_Q(x3, x4, XOR(x3, x3, x4)); @@ -1697,102 +2429,102 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0xF1: INST_NAME("PSLLQ Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); ADDI(x4, xZR, 16); LD(x3, wback, fixedaddress+0); BLTU_MARK(x3, x4); // just zero dest - SD(xZR, gback, 0); - SD(xZR, gback, 8); + SD(xZR, gback, gdoffset+0); + SD(xZR, gback, gdoffset+8); B_NEXT_nocond; MARK; for (int i=0; i<8; ++i) { - LHU(x4, gback, 2*i); + LHU(x4, gback, gdoffset+2*i); SLLW(x4, x4, x3); - SH(x4, gback, 2*i); + SH(x4, gback, gdoffset+2*i); } break; case 0xF2: INST_NAME("PSLLQ Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); ADDI(x4, xZR, 32); LD(x3, wback, fixedaddress+0); BLTU_MARK(x3, x4); // just zero dest - SD(xZR, gback, 0); - SD(xZR, gback, 8); + SD(xZR, gback, gdoffset+0); + SD(xZR, gback, gdoffset+8); B_NEXT_nocond; MARK; for (int 
i=0; i<4; ++i) { - LWU(x4, gback, 4*i); + LWU(x4, gback, gdoffset+4*i); SLLW(x4, x4, x3); - SW(x4, gback, 4*i); + SW(x4, gback, gdoffset+4*i); } break; case 0xF3: INST_NAME("PSLLQ Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); ADDI(x4, xZR, 64); LD(x3, wback, fixedaddress+0); BLTU_MARK(x3, x4); // just zero dest - SD(xZR, gback, 0); - SD(xZR, gback, 8); + SD(xZR, gback, gdoffset+0); + SD(xZR, gback, gdoffset+8); B_NEXT_nocond; MARK; for (int i=0; i<2; ++i) { - LD(x4, gback, 8*i); + LD(x4, gback, gdoffset+8*i); SLL(x4, x4, x3); - SD(x4, gback, 8*i); + SD(x4, gback, gdoffset+8*i); } break; case 0xF4: INST_NAME("PMULUDQ Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); // GX->q[1] = (uint64_t)EX->ud[2]*GX->ud[2]; - LWU(x3, gback, 2*4); + LWU(x3, gback, gdoffset+2*4); LWU(x4, wback, fixedaddress+2*4); MUL(x3, x3, x4); - SD(x3, gback, 8); + SD(x3, gback, gdoffset+8); // GX->q[0] = (uint64_t)EX->ud[0]*GX->ud[0]; - LWU(x3, gback, 0*4); + LWU(x3, gback, gdoffset+0*4); LWU(x4, wback, fixedaddress+0*4); MUL(x3, x3, x4); - SD(x3, gback, 0); + SD(x3, gback, gdoffset+0); break; case 0xF5: INST_NAME("PMADDWD Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for (int i=0; i<4; ++i) { - // GX->sd[i] = (int32_t)(GX->sw[i*2+0])*EX->sw[i*2+0] + + // GX->sd[i] = (int32_t)(GX->sw[i*2+0])*EX->sw[i*2+0] + // (int32_t)(GX->sw[i*2+1])*EX->sw[i*2+1]; - LH(x3, gback, 2*(i*2+0)); + LH(x3, gback, gdoffset+2*(i*2+0)); LH(x4, wback, fixedaddress+2*(i*2+0)); MULW(x5, x3, x4); - LH(x3, gback, 2*(i*2+1)); + LH(x3, gback, gdoffset+2*(i*2+1)); LH(x4, wback, fixedaddress+2*(i*2+1)); MULW(x6, x3, x4); ADDW(x5, x5, x6); - SW(x5, gback, 4*i); + SW(x5, gback, gdoffset+4*i); } break; case 0xF6: INST_NAME("PSADBW Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); MV(x6, xZR); for (int i=0; i<16; ++i) { - LBU(x3, gback, i); + LBU(x3, gback, gdoffset+i); LBU(x4, wback, fixedaddress+i); SUBW(x3, x3, x4); SRAIW(x5, x3, 31); @@ -1801,7 +2533,7 @@ uintptr_t 
dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ANDI(x3, x3, 0xff); ADDW(x6, x6, x3); if (i==7 || i == 15) { - SD(x6, gback, i+1-8); + SD(x6, gback, gdoffset+i+1-8); if (i==7) MV(x6, xZR); } } @@ -1809,61 +2541,61 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0xF8: INST_NAME("PSUBB Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<16; ++i) { // GX->sb[i] -= EX->sb[i]; LB(x3, wback, fixedaddress+i); - LB(x4, gback, i); + LB(x4, gback, gdoffset+i); SUB(x3, x4, x3); - SB(x3, gback, i); + SB(x3, gback, gdoffset+i); } break; case 0xF9: INST_NAME("PSUBW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_W(x3, x4, SUBW(x3, x3, x4)); break; case 0xFA: INST_NAME("PSUBD Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_D(x3, x4, SUBW(x3, x3, x4)); break; case 0xFB: INST_NAME("PSUBQ Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_Q(x3, x4, SUB(x3, x3, x4)); break; case 0xFC: INST_NAME("PADDB Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); for(int i=0; i<16; ++i) { // GX->sb[i] += EX->sb[i]; - LB(x3, gback, i); + LB(x3, gback, gdoffset+i); LB(x4, wback, fixedaddress+i); ADDW(x3, x3, x4); - SB(x3, gback, i); + SB(x3, gback, gdoffset+i); } break; case 0xFD: INST_NAME("PADDW Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_W(x3, x4, ADDW(x3, x3, x4)); break; case 0xFE: INST_NAME("PADDD Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_D(x3, x4, ADDW(x3, x3, x4)); break; diff --git a/src/dynarec/rv64/dynarec_rv64_6664.c b/src/dynarec/rv64/dynarec_rv64_6664.c new file mode 100644 index 00000000..a139e3ae --- /dev/null +++ b/src/dynarec/rv64/dynarec_rv64_6664.c @@ -0,0 +1,77 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "dynarec.h" +#include "emu/x64emu_private.h" +#include 
"emu/x64run_private.h" +#include "x64run.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "rv64_printer.h" +#include "dynarec_rv64_private.h" +#include "dynarec_rv64_helper.h" +#include "dynarec_rv64_functions.h" + +uintptr_t dynarec64_6664(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog) +{ + (void)ip; (void)need_epilog; + + uint8_t opcode = F8; + uint8_t nextop; + uint8_t gd, ed; + int64_t j64; + int v0, v1; + int64_t fixedaddress; + int unscaled; + MAYUSE(j64); + + GETREX(); + + switch(opcode) { + case 0x8B: + INST_NAME("MOV Gd, FS:Ed"); + nextop=F8; + GETGD; + if(MODREG) { // reg <= reg + ed = xRAX+(nextop&7)+(rex.b<<3); + if(rex.w) { + MV(gd, ed); + } else { + if(ed!=gd) { + LUI(x1, 0xffff0); + AND(gd, gd, x1); + ZEXTH(x1, ed); + OR(gd, gd, x1); + } + } + } else { // mem <= reg + grab_segdata(dyn, addr, ninst, x4, seg); + SMREAD(); + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0); + ADD(x4, ed, x4); + if(rex.w) { + LD(gd, x4, fixedaddress); + } else { + LHU(x1, x4, fixedaddress); + SRLI(gd, gd, 16); + SLLI(gd, gd, 16); + OR(gd, gd, x1); + } + } + break; + + + default: + DEFAULT; + } + return addr; +} diff --git a/src/dynarec/rv64/dynarec_rv64_66f0.c b/src/dynarec/rv64/dynarec_rv64_66f0.c new file mode 100644 index 00000000..863e535d --- /dev/null +++ b/src/dynarec/rv64/dynarec_rv64_66f0.c @@ -0,0 +1,129 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "dynarec.h" +#include "emu/x64emu_private.h" +#include "emu/x64run_private.h" +#include "x64run.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "rv64_printer.h" +#include 
"dynarec_rv64_private.h" +#include "dynarec_rv64_helper.h" +#include "dynarec_rv64_functions.h" + + +uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) +{ + (void)ip; (void)rep; (void)need_epilog; + + uint8_t opcode = F8; + uint8_t nextop; + uint8_t gd, ed, u8; + uint8_t wback, wb1, wb2, gb1, gb2; + int32_t i32; + int64_t i64, j64; + int64_t fixedaddress; + int unscaled; + MAYUSE(gb1); + MAYUSE(gb2); + MAYUSE(wb1); + MAYUSE(wb2); + MAYUSE(j64); + + while((opcode==0xF2) || (opcode==0xF3)) { + rep = opcode-0xF1; + opcode = F8; + } + + GETREX(); + + switch(opcode) { + case 0x81: + case 0x83: + nextop = F8; + SMDMB(); + switch((nextop>>3)&7) { + case 0: //ADD + if(opcode==0x81) { + INST_NAME("LOCK ADD Ew, Iw"); + } else { + INST_NAME("LOCK ADD Ew, Ib"); + } + SETFLAGS(X_ALL, SF_SET_PENDING); + if(MODREG) { + if(opcode==0x81) i32 = F16S; else i32 = F8S; + ed = xRAX+(nextop&7)+(rex.b<<3); + MOV32w(x5, i32); + ZEXTH(x6, ed); + emit_add16(dyn, ninst, x6, x5, x3, x4, x2); + SRLI(ed, ed, 16); + SLLI(ed, ed, 16); + OR(ed, ed, x6); + } else { + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, LOCK_LOCK, 0, (opcode==0x81)?2:1); + if(opcode==0x81) i32 = F16S; else i32 = F8S; + MOV32w(x5, i32); + + ANDI(x3, wback, 0b10); + BNEZ_MARK(x3); + + // lower 16 bits + MARKLOCK; + LR_W(x1, wback, 1, 1); + SRLIW(x3, x1, 16); + SLLIW(x3, x3, 16); + ADD(x4, x1, x5); + SLLIW(x4, x4, 16); + SRLIW(x4, x4, 16); + OR(x4, x4, x3); + SC_W(x3, x4, wback, 1, 1); + BNEZ_MARKLOCK(x3); + IFX(X_ALL|X_PEND) { + SLLIW(x1, x1, 16); + SRLIW(x1, x1, 16); + } + B_MARK3_nocond; + + MARK; + // upper 16 bits + XORI(wback, wback, 0b10); + MARK2; + LR_W(x1, wback, 1, 1); + SLLIW(x3, x1, 16); + SRLIW(x3, x3, 16); + SRLIW(x1, x1, 16); + ADD(x4, x1, x5); + SLLIW(x4, x4, 16); + OR(x4, x4, x3); + SC_W(x3, x4, wback, 1, 1); + BNEZ_MARK2(x3); + + MARK3; + // final + IFX(X_ALL|X_PEND) { + emit_add16(dyn, ninst, x1, 
x5, x3, x4, x6); + } + } + break; + default: + DEFAULT; + } + SMDMB(); + break; + + default: + DEFAULT; + } + + return addr; +} \ No newline at end of file diff --git a/src/dynarec/rv64/dynarec_rv64_67.c b/src/dynarec/rv64/dynarec_rv64_67.c new file mode 100644 index 00000000..cb7702a8 --- /dev/null +++ b/src/dynarec/rv64/dynarec_rv64_67.c @@ -0,0 +1,574 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> +#include <assert.h> + +#include "debug.h" +#include "box64context.h" +#include "dynarec.h" +#include "emu/x64emu_private.h" +#include "emu/x64run_private.h" +#include "x64run.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "rv64_printer.h" +#include "dynarec_rv64_private.h" +#include "dynarec_rv64_helper.h" +#include "dynarec_rv64_functions.h" + +uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) +{ + (void)ip; (void)need_epilog; + + uint8_t opcode = F8; + uint8_t nextop; + uint8_t gd, ed, wback, wb, wb1, wb2, gb1, gb2, eb1, eb2; + int64_t fixedaddress; + int unscaled; + int8_t i8; + uint8_t u8; + int32_t i32; + int64_t j64, i64; + int cacheupd = 0; + int lock; + int v0, v1, s0; + MAYUSE(i32); + MAYUSE(j64); + MAYUSE(v0); + MAYUSE(v1); + MAYUSE(s0); + MAYUSE(lock); + MAYUSE(cacheupd); + + if(rex.is32bits) { + // should do a different file + DEFAULT; + return addr; + } + + GETREX(); + + rep = 0; + while((opcode==0xF2) || (opcode==0xF3)) { + rep = opcode-0xF1; + opcode = F8; + } + + switch(opcode) { + + case 0x01: + INST_NAME("ADD Ed, Gd"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_add32(dyn, ninst, rex, ed, gd, x3, x4, x5); + WBACK; + break; + case 0x02: + INST_NAME("ADD Gb, Eb"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETEB32(x2, 0); + GETGB(x1); + emit_add8(dyn, ninst, x1, x2, x3, 
x4); + GBBACK(x4); + break; + case 0x03: + INST_NAME("ADD Gd, Ed"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_add32(dyn, ninst, rex, gd, ed, x3, x4, x5); + break; + + case 0x05: + INST_NAME("ADD EAX, Id"); + SETFLAGS(X_ALL, SF_SET_PENDING); + i64 = F32S; + emit_add32c(dyn, ninst, rex, xRAX, i64, x3, x4, x5, x6); + break; + + case 0x09: + INST_NAME("OR Ed, Gd"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_or32(dyn, ninst, rex, ed, gd, x3, x4); + WBACK; + break; + case 0x0A: + INST_NAME("OR Gb, Eb"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETEB32(x2, 0); + GETGB(x1); + emit_or8(dyn, ninst, x1, x2, x3, x4); + GBBACK(x4); + break; + case 0x0B: + INST_NAME("OR Gd, Ed"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_or32(dyn, ninst, rex, gd, ed, x3, x4); + break; + + case 0x0D: + INST_NAME("OR EAX, Id"); + SETFLAGS(X_ALL, SF_SET_PENDING); + i64 = F32S; + emit_or32c(dyn, ninst, rex, xRAX, i64, x3, x4); + break; + + case 0x0F: + opcode=F8; + switch(opcode) { + case 0x2E: + // no special check... 
+ case 0x2F: + switch (rep) { + case 0: + if(opcode==0x2F) {INST_NAME("COMISS Gx, Ex");} else {INST_NAME("UCOMISS Gx, Ex");} + SETFLAGS(X_ALL, SF_SET); + nextop = F8; + GETGXSS(s0); + if(MODREG) { + v0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 1); + } else { + v0 = fpu_get_scratch(dyn); + SMREAD(); + addr = geted32(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, 0); + FLW(v0, ed, fixedaddress); + } + CLEAR_FLAGS(); + // if isnan(s0) || isnan(v0) + IFX(X_ZF | X_PF | X_CF) { + FEQS(x3, s0, s0); + FEQS(x2, v0, v0); + AND(x2, x2, x3); + BNE_MARK(x2, xZR); + ORI(xFlags, xFlags, (1<<F_ZF) | (1<<F_PF) | (1<<F_CF)); + B_NEXT_nocond; + } + MARK; + // else if isless(d0, v0) + IFX(X_CF) { + FLTS(x2, s0, v0); + BEQ_MARK2(x2, xZR); + ORI(xFlags, xFlags, 1<<F_CF); + B_NEXT_nocond; + } + MARK2; + // else if d0 == v0 + IFX(X_ZF) { + FEQS(x2, s0, v0); + CBZ_NEXT(x2); + ORI(xFlags, xFlags, 1<<F_ZF); + } + break; + default: + DEFAULT; + } + break; + default: + DEFAULT; + } + break; + + case 0x11: + INST_NAME("ADC Ed, Gd"); + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_adc32(dyn, ninst, rex, ed, gd, x3, x4, x5, x6); + WBACK; + break; + + case 0x13: + INST_NAME("ADC Gd, Ed"); + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_adc32(dyn, ninst, rex, gd, ed, x3, x4, x5, x6); + break; + + case 0x15: + INST_NAME("ADC EAX, Id"); + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + i64 = F32S; + MOV64xw(x1, i64); + emit_adc32(dyn, ninst, rex, xRAX, x1, x3, x4, x5, x6); + break; + + case 0x19: + INST_NAME("SBB Ed, Gd"); + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_sbb32(dyn, ninst, rex, ed, gd, x3, x4, x5); + WBACK; + break; + case 0x1A: + INST_NAME("SBB Gb, Eb"); + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETEB32(x2, 0); + GETGB(x1); + emit_sbb8(dyn, ninst, x1, x2, x3, x4, 
x5); + GBBACK(x4); + break; + case 0x1B: + INST_NAME("SBB Gd, Ed"); + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_sbb32(dyn, ninst, rex, gd, ed, x3, x4, x5); + break; + + case 0x1D: + INST_NAME("SBB EAX, Id"); + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + i64 = F32S; + MOV64xw(x2, i64); + emit_sbb32(dyn, ninst, rex, xRAX, x2, x3, x4, x5); + break; + + case 0x21: + INST_NAME("AND Ed, Gd"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_and32(dyn, ninst, rex, ed, gd, x3, x4); + WBACK; + break; + case 0x22: + INST_NAME("AND Gb, Eb"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETEB32(x2, 0); + GETGB(x1); + emit_and8(dyn, ninst, x1, x2, x3, x4); + GBBACK(x4); + break; + case 0x23: + INST_NAME("AND Gd, Ed"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_and32(dyn, ninst, rex, gd, ed, x3, x4); + break; + + case 0x25: + INST_NAME("AND EAX, Id"); + SETFLAGS(X_ALL, SF_SET_PENDING); + i64 = F32S; + emit_and32c(dyn, ninst, rex, xRAX, i64, x3, x4); + break; + + case 0x29: + INST_NAME("SUB Ed, Gd"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_sub32(dyn, ninst, rex, ed, gd, x3, x4, x5); + WBACK; + break; + case 0x2A: + INST_NAME("SUB Gb, Eb"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETEB32(x2, 0); + GETGB(x1); + emit_sub8(dyn, ninst, x1, x2, x3, x4, x5); + GBBACK(x5); + break; + case 0x2B: + INST_NAME("SUB Gd, Ed"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_sub32(dyn, ninst, rex, gd, ed, x3, x4, x5); + break; + + case 0x2D: + INST_NAME("SUB EAX, Id"); + SETFLAGS(X_ALL, SF_SET_PENDING); + i64 = F32S; + emit_sub32c(dyn, ninst, rex, xRAX, i64, x3, x4, x5, x6); + break; + + case 0x31: + INST_NAME("XOR Ed, Gd"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_xor32(dyn, ninst, rex, ed, gd, x3, x4); + WBACK; + break; + case 0x32: + 
INST_NAME("XOR Gb, Eb"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETEB32(x2, 0); + GETGB(x1); + emit_xor8(dyn, ninst, x1, x2, x3, x4); + GBBACK(x4); + break; + case 0x33: + INST_NAME("XOR Gd, Ed"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_xor32(dyn, ninst, rex, gd, ed, x3, x4); + break; + + case 0x35: + INST_NAME("XOR EAX, Id"); + SETFLAGS(X_ALL, SF_SET_PENDING); + i64 = F32S; + emit_xor32c(dyn, ninst, rex, xRAX, i64, x3, x4); + break; + + case 0x38: + INST_NAME("CMP Eb, Gb"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETEB32(x1, 0); + GETGB(x2); + emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5, x6); + break; + case 0x39: + INST_NAME("CMP Ed, Gd"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_cmp32(dyn, ninst, rex, ed, gd, x3, x4, x5, x6); + break; + case 0x3A: + INST_NAME("CMP Gb, Eb"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETEB32(x2, 0); + GETGB(x1); + emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5, x6); + break; + case 0x3B: + INST_NAME("CMP Gd, Ed"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + GETED32(0); + emit_cmp32(dyn, ninst, rex, gd, ed, x3, x4, x5, x6); + break; + case 0x3C: + INST_NAME("CMP AL, Ib"); + SETFLAGS(X_ALL, SF_SET_PENDING); + u8 = F8; + ANDI(x1, xRAX, 0xff); + if(u8) { + MOV32w(x2, u8); + emit_cmp8(dyn, ninst, x1, x2, x3, x4, x5, x6); + } else { + emit_cmp8_0(dyn, ninst, x1, x3, x4); + } + break; + case 0x3D: + INST_NAME("CMP EAX, Id"); + SETFLAGS(X_ALL, SF_SET_PENDING); + i64 = F32S; + if(i64) { + MOV64xw(x2, i64); + emit_cmp32(dyn, ninst, rex, xRAX, x2, x3, x4, x5, x6); + } else + emit_cmp32_0(dyn, ninst, rex, xRAX, x3, x4); + break; + + case 0x81: + case 0x83: + nextop = F8; + switch((nextop>>3)&7) { + case 0: //ADD + if(opcode==0x81) {INST_NAME("ADD Ed, Id");} else {INST_NAME("ADD Ed, Ib");} + SETFLAGS(X_ALL, SF_SET_PENDING); + GETED32((opcode==0x81)?4:1); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + emit_add32c(dyn, 
ninst, rex, ed, i64, x3, x4, x5, x6); + WBACK; + break; + case 1: //OR + if(opcode==0x81) {INST_NAME("OR Ed, Id");} else {INST_NAME("OR Ed, Ib");} + SETFLAGS(X_ALL, SF_SET_PENDING); + GETED32((opcode==0x81)?4:1); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + emit_or32c(dyn, ninst, rex, ed, i64, x3, x4); + WBACK; + break; + case 2: //ADC + if(opcode==0x81) {INST_NAME("ADC Ed, Id");} else {INST_NAME("ADC Ed, Ib");} + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETED32((opcode==0x81)?4:1); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + MOV64xw(x5, i64); + emit_adc32(dyn, ninst, rex, ed, x5, x3, x4, x5, x6); + WBACK; + break; + case 3: //SBB + if(opcode==0x81) {INST_NAME("SBB Ed, Id");} else {INST_NAME("SBB Ed, Ib");} + READFLAGS(X_CF); + SETFLAGS(X_ALL, SF_SET_PENDING); + GETED32((opcode==0x81)?4:1); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + MOV64xw(x5, i64); + emit_sbb32(dyn, ninst, rex, ed, x5, x3, x4, x5); + WBACK; + break; + case 4: //AND + if(opcode==0x81) {INST_NAME("AND Ed, Id");} else {INST_NAME("AND Ed, Ib");} + SETFLAGS(X_ALL, SF_SET_PENDING); + GETED32((opcode==0x81)?4:1); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + emit_and32c(dyn, ninst, rex, ed, i64, x3, x4); + WBACK; + break; + case 5: //SUB + if(opcode==0x81) {INST_NAME("SUB Ed, Id");} else {INST_NAME("SUB Ed, Ib");} + SETFLAGS(X_ALL, SF_SET_PENDING); + GETED32((opcode==0x81)?4:1); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + emit_sub32c(dyn, ninst, rex, ed, i64, x3, x4, x5, x6); + WBACK; + break; + case 6: //XOR + if(opcode==0x81) {INST_NAME("XOR Ed, Id");} else {INST_NAME("XOR Ed, Ib");} + SETFLAGS(X_ALL, SF_SET_PENDING); + GETED32((opcode==0x81)?4:1); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + emit_xor32c(dyn, ninst, rex, ed, i64, x3, x4); + WBACK; + break; + case 7: //CMP + if(opcode==0x81) {INST_NAME("CMP Ed, Id");} else {INST_NAME("CMP Ed, Ib");} + SETFLAGS(X_ALL, SF_SET_PENDING); + GETED32((opcode==0x81)?4:1); + if(opcode==0x81) i64 = F32S; else i64 = F8S; + 
if(i64) { + MOV64xw(x2, i64); + emit_cmp32(dyn, ninst, rex, ed, x2, x3, x4, x5, x6); + } else + emit_cmp32_0(dyn, ninst, rex, ed, x3, x4); + break; + } + break; + + case 0x88: + INST_NAME("MOV Eb, Gb"); + nextop = F8; + gd = ((nextop&0x38)>>3)+(rex.r<<3); + if(rex.rex) { + gb2 = 0; + gb1 = xRAX + gd; + } else { + gb2 = ((gd&4)>>2); + gb1 = xRAX+(gd&3); + } + gd = x4; + if(gb2) { + SRLI(x4, gb1, 8); + gb1 = x4; + } + if(MODREG) { + ed = (nextop&7) + (rex.b<<3); + if(rex.rex) { + eb1 = xRAX+ed; + eb2 = 0; + } else { + eb1 = xRAX+(ed&3); // Ax, Cx, Dx or Bx + eb2 = ((ed&4)>>2); // L or H + } + ANDI(gd, gb1, 0xff); + if(eb2) { + MOV64x(x1, 0xffffffffffff00ffLL); + AND(x1, eb1, x1); + SLLI(gd, gd, 8); + OR(eb1, x1, gd); + } else { + ANDI(x1, eb1, ~0xff); + OR(eb1, x1, gd); + } + } else { + addr = geted32(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0); + SB(gb1, ed, fixedaddress); + SMWRITELOCK(lock); + } + break; + case 0x89: + INST_NAME("MOV Ed, Gd"); + nextop=F8; + GETGD; + if(MODREG) { // reg <= reg + MVxw(xRAX+(nextop&7)+(rex.b<<3), gd); + } else { // mem <= reg + addr = geted32(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0); + SDxw(gd, ed, fixedaddress); + SMWRITELOCK(lock); + } + break; + case 0x8B: + INST_NAME("MOV Gd, Ed"); + nextop=F8; + GETGD; + if(MODREG) { + MVxw(gd, xRAX+(nextop&7)+(rex.b<<3)); + } else { + addr = geted32(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, &lock, 1, 0); + SMREADLOCK(lock); + LDxw(gd, ed, fixedaddress); + } + break; + case 0x8D: + INST_NAME("LEA Gd, Ed"); + nextop=F8; + GETGD; + if(MODREG) { // reg <= reg? 
that's an invalid operation + DEFAULT; + } else { // mem <= reg + addr = geted32(dyn, addr, ninst, nextop, &ed, gd, x1, &fixedaddress, rex, NULL, 0, 0); + if(ed!=gd) { + AND(gd, ed, xMASK); + } + } + break; + default: + DEFAULT; + } + return addr; +} diff --git a/src/dynarec/rv64/dynarec_rv64_d8.c b/src/dynarec/rv64/dynarec_rv64_d8.c index beadb202..7f14468b 100644 --- a/src/dynarec/rv64/dynarec_rv64_d8.c +++ b/src/dynarec/rv64/dynarec_rv64_d8.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -50,13 +49,73 @@ uintptr_t dynarec64_D8(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0xD0 ... 0xD7: case 0xD8 ... 0xDF: - + INST_NAME("FCOMP ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + LHU(x3, xEmu, offsetof(x64emu_t, sw)); + MOV32w(x1, 0b1110100011111111); // mask off c0,c1,c2,c3 + AND(x3, x3, x1); + if(ST_IS_F(0)) { + FEQS(x5, v1, v1); + FEQS(x4, v2, v2); + AND(x5, x5, x4); + BEQZ(x5, 24); // undefined/NaN + FEQS(x5, v1, v2); + BNEZ(x5, 28); // equal + FLTS(x3, v1, v2); // x3 = (v1<v2)?1:0 + SLLI(x1, x3, 8); + J(20); // end + // undefined/NaN + LUI(x1, 1); + ADDI(x1, x1, 0b010100000000); + J(8); // end + // equal + LUI(x1, 1); + // end + } else { + FEQD(x5, v1, v1); + FEQD(x4, v2, v2); + AND(x5, x5, x4); + BEQZ(x5, 24); // undefined/NaN + FEQD(x5, v1, v2); + BNEZ(x5, 28); // equal + FLTD(x3, v1, v2); // x3 = (v1<v2)?1:0 + SLLI(x1, x3, 8); + J(20); // end + // undefined/NaN + LUI(x1, 1); + ADDI(x1, x1, 0b010100000000); + J(8); // end + // equal + LUI(x1, 1); + // end + } + OR(x3, x3, x1); + SH(x3, xEmu, offsetof(x64emu_t, sw)); + x87_do_pop(dyn, ninst, x3); + break; case 0xE0 ... 
0xE7: - + INST_NAME("FSUB ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FSUBS(v1, v1, v2); + } else { + FSUBD(v1, v1, v2); + } + break; case 0xE8 ... 0xEF: case 0xF0 ... 0xF7: - + INST_NAME("FDIV ST0, STx"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FDIVS(v1, v1, v2); + } else { + FDIVD(v1, v1, v2); + } + break; case 0xF8 ... 0xFF: DEFAULT; break; diff --git a/src/dynarec/rv64/dynarec_rv64_d9.c b/src/dynarec/rv64/dynarec_rv64_d9.c index 9378c650..4940d6b4 100644 --- a/src/dynarec/rv64/dynarec_rv64_d9.c +++ b/src/dynarec/rv64/dynarec_rv64_d9.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -34,13 +33,16 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni uint8_t u8; int64_t fixedaddress; int unscaled; - int v1, v2; + int v0, v1, v2; int s0; int i1, i2, i3; + int64_t j64; MAYUSE(s0); - MAYUSE(v2); + MAYUSE(v0); MAYUSE(v1); + MAYUSE(v2); + MAYUSE(j64); switch(nextop) { case 0xC0: @@ -260,7 +262,12 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni break; case 0xFA: INST_NAME("FSQRT"); - DEFAULT; + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + if(ST_IS_F(0)) { + FSQRTS(v1, v1); + } else { + FSQRTD(v1, v1); + } break; case 0xFB: INST_NAME("FSINCOS"); @@ -271,7 +278,43 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni break; case 0xFC: INST_NAME("FRNDINT"); - DEFAULT; + v0 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + v1 = fpu_get_scratch(dyn); + v2 = fpu_get_scratch(dyn); + u8 = x87_setround(dyn, ninst, x1, x2); + + if(ST_IS_F(0)) { + FEQS(x2, v0, v0); + BNEZ_MARK(x2); + B_NEXT_nocond; + MARK; // v0 is not nan + FABSS(v1, v0); + 
MOV64x(x3, 1ULL << __FLT_MANT_DIG__); + FCVTSL(v2, x3, RD_RTZ); + FLTS(x3, v1, v2); + BNEZ_MARK2(x3); + B_NEXT_nocond; + MARK2; + FCVTLS(x3, v0, RD_DYN); + FCVTSL(v1, x3, RD_DYN); + FSGNJS(v0, v1, v0); + } else { + FEQD(x2, v0, v0); + BNEZ_MARK(x2); + B_NEXT_nocond; + MARK; // v0 is not nan + FABSD(v1, v0); + MOV64x(x3, 1ULL << __DBL_MANT_DIG__); + FCVTDL(v2, x3, RD_RTZ); + FLTD(x3, v1, v2); + BNEZ_MARK2(x3); + B_NEXT_nocond; + MARK2; + FCVTLD(x3, v0, RD_DYN); + FCVTDL(v1, x3, RD_DYN); + FSGNJD(v0, v1, v0); + } + x87_restoreround(dyn, ninst, u8); break; case 0xFD: INST_NAME("FSCALE"); diff --git a/src/dynarec/rv64/dynarec_rv64_db.c b/src/dynarec/rv64/dynarec_rv64_db.c index 95e350c0..7a5dddb0 100644 --- a/src/dynarec/rv64/dynarec_rv64_db.c +++ b/src/dynarec/rv64/dynarec_rv64_db.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -150,7 +149,45 @@ uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0xEE: case 0xEF: INST_NAME("FUCOMI ST0, STx"); - DEFAULT; + SETFLAGS(X_ALL, SF_SET); + SET_DFNONE(); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + IFX(F_ZF | F_PF | F_CF) { + if(ST_IS_F(0)) { + FEQS(x5, v1, v1); + FEQS(x4, v2, v2); + AND(x5, x5, x4); + BEQZ(x5, 24); // undefined/NaN + FEQS(x5, v1, v2); + BNEZ(x5, 24); // equal + FLTS(x3, v1, v2); // x3 = (v1<v2)?1:0 + OR(xFlags, xFlags, x3); // CF is the least significant bit + J(16); // end + // NaN + ORI(xFlags, xFlags, (1<<F_ZF) | (1<<F_PF) | (1<<F_CF)); + J(8); // end + // equal + ORI(xFlags, xFlags, 1<<F_ZF); + // end + } else { + FEQD(x5, v1, v1); + FEQD(x4, v2, v2); + AND(x5, x5, x4); + BEQZ(x5, 24); // undefined/NaN + FEQD(x5, v1, v2); + BNEZ(x5, 24); // equal + FLTD(x3, v1, v2); // x3 = (v1<v2)?1:0 + OR(xFlags, xFlags, x3); // CF is the least significant bit + J(16); // end + // NaN + ORI(xFlags, 
xFlags, (1<<F_ZF) | (1<<F_PF) | (1<<F_CF)); + J(8); // end + // equal + ORI(xFlags, xFlags, 1<<F_ZF); + // end + } + } break; case 0xF0: case 0xF1: @@ -191,7 +228,24 @@ uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni break; case 3: INST_NAME("FISTP Ed, ST0"); - DEFAULT; + v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_D); + u8 = x87_setround(dyn, ninst, x1, x2); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0); + v2 = fpu_get_scratch(dyn); + if(!box64_dynarec_fastround) { + FSFLAGSI(0); // reset all bits + } + FCVTWD(x4, v1, RD_DYN); + x87_restoreround(dyn, ninst, u8); + if(!box64_dynarec_fastround) { + FRFLAGS(x5); // get back FPSR to check the IOC bit + ANDI(x5, x5, 1<<FR_NV); + BEQ_MARK2(x5, xZR); + MOV32w(x4, 0x80000000); + } + MARK2; + SW(x4, wback, fixedaddress); + x87_do_pop(dyn, ninst, x3); break; case 5: INST_NAME("FLD tbyte"); diff --git a/src/dynarec/rv64/dynarec_rv64_dc.c b/src/dynarec/rv64/dynarec_rv64_dc.c new file mode 100644 index 00000000..d802e2fb --- /dev/null +++ b/src/dynarec/rv64/dynarec_rv64_dc.c @@ -0,0 +1,119 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "dynarec.h" +#include "emu/x64emu_private.h" +#include "emu/x64run_private.h" +#include "x64run.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "rv64_printer.h" +#include "dynarec_rv64_private.h" +#include "dynarec_rv64_helper.h" +#include "dynarec_rv64_functions.h" + + +uintptr_t dynarec64_DC(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) +{ + (void)ip; (void)rep; (void)need_epilog; + + uint8_t nextop = F8; + uint8_t wback; + int64_t fixedaddress; + int unscaled; + int v1, v2; + + MAYUSE(v2); + 
MAYUSE(v1); + + switch(nextop) { + case 0xC0 ... 0xC7: + INST_NAME("FADD STx, ST0"); + DEFAULT; + break; + case 0xC8 ... 0xCF: + INST_NAME("FMUL STx, ST0"); + DEFAULT; + break; + case 0xD0 ... 0xD7: + INST_NAME("FCOM ST0, STx"); //yep + DEFAULT; + break; + case 0xD8 ... 0xDF: + INST_NAME("FCOMP ST0, STx"); + DEFAULT; + break; + case 0xE0 ... 0xE7: + INST_NAME("FSUBR STx, ST0"); + DEFAULT; + break; + break; + case 0xE8 ... 0xEF: + INST_NAME("FSUB STx, ST0"); + DEFAULT; + break; + case 0xF0 ... 0xF7: + INST_NAME("FDIVR STx, ST0"); + DEFAULT; + break; + case 0xF8 ... 0xFF: + INST_NAME("FDIV STx, ST0"); + DEFAULT; + break; + default: + switch((nextop>>3)&7) { + case 3: + INST_NAME("FCOMP ST0, double[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD(v2, wback, fixedaddress); + + LHU(x3, xEmu, offsetof(x64emu_t, sw)); + MOV32w(x1, 0b1110100011111111); // mask off c0,c1,c2,c3 + AND(x3, x3, x1); + FEQD(x5, v1, v1); + FEQD(x4, v2, v2); + AND(x5, x5, x4); + BEQZ(x5, 24); // undefined/NaN + FEQD(x5, v1, v2); + BNEZ(x5, 28); // equal + FLTD(x3, v1, v2); // x3 = (v1<v2)?1:0 + SLLI(x1, x3, 8); + J(20); // end + // undefined/NaN + LUI(x1, 1); + ADDI(x1, x1, 0b010100000000); + J(8); // end + // equal + LUI(x1, 1); + // end + OR(x3, x3, x1); + SH(x3, xEmu, offsetof(x64emu_t, sw)); + + x87_do_pop(dyn, ninst, x3); + break; + case 6: + INST_NAME("FDIV ST0, double[ED]"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_D); + v2 = fpu_get_scratch(dyn); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD(v2, wback, fixedaddress); + FDIVD(v1, v1, v2); + break; + default: + DEFAULT; + } + } + return addr; +} diff --git a/src/dynarec/rv64/dynarec_rv64_dd.c b/src/dynarec/rv64/dynarec_rv64_dd.c new file mode 100644 index 00000000..044f9aab --- /dev/null +++ b/src/dynarec/rv64/dynarec_rv64_dd.c @@ -0,0 
+1,179 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> + +#include "debug.h" +#include "box64context.h" +#include "dynarec.h" +#include "emu/x64emu_private.h" +#include "emu/x64run_private.h" +#include "x64run.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "rv64_printer.h" +#include "dynarec_rv64_private.h" +#include "dynarec_rv64_helper.h" +#include "dynarec_rv64_functions.h" + + +uintptr_t dynarec64_DD(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog) +{ + (void)ip; (void)rep; (void)need_epilog; + + uint8_t nextop = F8; + uint8_t ed, wback; + int64_t fixedaddress; + int unscaled; + int v1, v2; + int s0; + int64_t j64; + + MAYUSE(s0); + MAYUSE(v2); + MAYUSE(v1); + MAYUSE(j64); + + switch(nextop) { + case 0xC0: + case 0xC1: + case 0xC2: + case 0xC3: + case 0xC4: + case 0xC5: + case 0xC6: + case 0xC7: + INST_NAME("FFREE STx"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + x87_purgecache(dyn, ninst, 0, x1, x2, x3); + MOV32w(x1, nextop&7); + CALL(fpu_do_free, -1); + break; + case 0xD0: + case 0xD1: + case 0xD2: + case 0xD3: + case 0xD4: + case 0xD5: + case 0xD6: + case 0xD7: + INST_NAME("FST ST0, STx"); + DEFAULT; + break; + case 0xD8: + INST_NAME("FSTP ST0, ST0"); + x87_do_pop(dyn, ninst, x3); + break; + case 0xD9: + case 0xDA: + case 0xDB: + case 0xDC: + case 0xDD: + case 0xDE: + case 0xDF: + INST_NAME("FSTP ST0, STx"); + // copy the cache value for st0 to stx + x87_get_st_empty(dyn, ninst, x1, x2, nextop&7, X87_ST(nextop&7)); + x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + x87_swapreg(dyn, ninst, x1, x2, 0, nextop&7); + x87_do_pop(dyn, ninst, x3); + break; + case 0xE0: + case 0xE1: + case 0xE2: + case 0xE3: + case 0xE4: + case 0xE5: + case 0xE6: + case 0xE7: + INST_NAME("FUCOM ST0, STx"); + DEFAULT; + break; + case 
0xE8: + case 0xE9: + case 0xEA: + case 0xEB: + case 0xEC: + case 0xED: + case 0xEE: + case 0xEF: + INST_NAME("FUCOMP ST0, STx"); + DEFAULT; + break; + case 0xC8: + case 0xC9: + case 0xCA: + case 0xCB: + case 0xCC: + case 0xCD: + case 0xCE: + case 0xCF: + case 0xF0: + case 0xF1: + case 0xF2: + case 0xF3: + case 0xF4: + case 0xF5: + case 0xF6: + case 0xF7: + case 0xF8: + case 0xF9: + case 0xFA: + case 0xFB: + case 0xFC: + case 0xFD: + case 0xFE: + case 0xFF: + DEFAULT; + break; + + default: + switch((nextop>>3)&7) { + case 0: + INST_NAME("FLD double"); + v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FLD(v1, wback, fixedaddress); + break; + case 2: + INST_NAME("FST double"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_D); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FSD(v1, wback, fixedaddress); + break; + case 3: + INST_NAME("FSTP double"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_D); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, 0); + FSD(v1, wback, fixedaddress); + x87_do_pop(dyn, ninst, x3); + break; + case 7: + INST_NAME("FNSTSW m2byte"); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); + addr = geted(dyn, addr, ninst, nextop, &ed, x4, x6, &fixedaddress, rex, NULL, 0, 0); + LWU(x2, xEmu, offsetof(x64emu_t, top)); + LHU(x3, xEmu, offsetof(x64emu_t, sw)); + if(dyn->e.x87stack) { + // update top + ADDI(x2, x2, -dyn->e.x87stack); + ANDI(x2, x2, 7); + } + MOV32w(x5, ~0x3800); + AND(x3, x3, x5); // mask out TOP + SLLI(x2, x2, 11); // shift TOP to bit 11 + OR(x3, x3, x2); // inject TOP + SH(x3, ed, fixedaddress); // store whole sw flags + break; + default: + DEFAULT; + } + } + return addr; +} diff --git a/src/dynarec/rv64/dynarec_rv64_de.c b/src/dynarec/rv64/dynarec_rv64_de.c index 1511c6ef..a2341b40 100644 --- a/src/dynarec/rv64/dynarec_rv64_de.c +++ 
b/src/dynarec/rv64/dynarec_rv64_de.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" diff --git a/src/dynarec/rv64/dynarec_rv64_df.c b/src/dynarec/rv64/dynarec_rv64_df.c index a96a45f1..de99b02a 100644 --- a/src/dynarec/rv64/dynarec_rv64_df.c +++ b/src/dynarec/rv64/dynarec_rv64_df.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -41,15 +40,29 @@ uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni switch(nextop) { case 0xC0 ... 0xC7: - - case 0xE0: - - case 0xE8 ... 0xEF: + INST_NAME("FFREEP STx"); DEFAULT; break; - - case 0xF0 ... 0xF7: - INST_NAME("FCOMIP ST0, STx"); + + case 0xE0: + INST_NAME("FNSTSW AX"); + LHU(x2, xEmu, offsetof(x64emu_t, top)); + LHU(x1, xEmu, offsetof(x64emu_t, sw)); + MOV32w(x3, 0b1100011111111111); // mask + AND(x1, x1, x3); + SLLI(x2, x2, 11); + OR(x1, x1, x2); // inject top + SH(x1, xEmu, offsetof(x64emu_t, sw)); + SRLI(xRAX, xRAX, 16); + SLLI(xRAX, xRAX, 16); + OR(xRAX, xRAX, x1); + break; + case 0xE8 ... 
0xF7: + if (nextop < 0xF0) { + INST_NAME("FUCOMIP ST0, STx"); + } else { + INST_NAME("FCOMIP ST0, STx"); + } SETFLAGS(X_ALL, SF_SET); SET_DFNONE(); v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); @@ -114,9 +127,9 @@ uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 1: INST_NAME("FISTTP Ew, ST0"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_F); - addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 0, 0); + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 1, 0); if(!box64_dynarec_fastround) { - FSFLAGSI(xZR); // reset all bits + FSFLAGSI(0); // reset all bits } FCVTWD(x4, v1, RD_RTZ); if(!box64_dynarec_fastround) { @@ -136,12 +149,12 @@ uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 3: INST_NAME("FISTP Ew, ST0"); v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_F); - addr = geted(dyn, addr, ninst, nextop, &wback, x3, x4, &fixedaddress, rex, NULL, 0, 0); - u8 = sse_setround(dyn, ninst, x2, x3); + u8 = x87_setround(dyn, ninst, x1, x2); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0); if(!box64_dynarec_fastround) { - FSFLAGSI(xZR); // reset all bits + FSFLAGSI(0); // reset all bits } - FCVTWD(x4, v1, RD_RM); + FCVTWD(x4, v1, RD_DYN); x87_restoreround(dyn, ninst, u8); if(!box64_dynarec_fastround) { FRFLAGS(x5); // get back FPSR to check the IOC bit @@ -157,6 +170,71 @@ uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni SH(x4, wback, fixedaddress); x87_do_pop(dyn, ninst, x3); break; + case 5: + INST_NAME("FILD ST0, i64"); + v1 = x87_do_push(dyn, ninst, x1, EXT_CACHE_ST_D); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0); + LD(x1, wback, fixedaddress); + if (rex.is32bits) { + // need to also feed the STll stuff... 
+ ADDI(x4, xEmu, offsetof(x64emu_t, fpu_ll)); + LWU(x5, xEmu, offsetof(x64emu_t, top)); + int a = 0 - dyn->e.x87stack; + if(a) { + ADDIW(x5, x5, a); + ANDI(x5, x5, 0x7); + } + SLLI(x5, x5, 4); // fpu_ll is 2 i64 + ADD(x5, x5, x4); + SD(x1, x5, 8); // ll + } + FCVTDL(v1, x1, RD_RTZ); + if(rex.is32bits) { + FSD(v1, x5, 0); // ref + } + break; + case 7: + INST_NAME("FISTP i64, ST0"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, EXT_CACHE_ST_D); + u8 = x87_setround(dyn, ninst, x1, x2); + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x3, &fixedaddress, rex, NULL, 1, 0); + + if(rex.is32bits) { + // need to check STll first... + ADDI(x4, xEmu, offsetof(x64emu_t, fpu_ll)); + LWU(x5, xEmu, offsetof(x64emu_t, top)); + int a = 0 - dyn->e.x87stack; + if(a) { + ADDIW(x5, x5, a); + ANDI(x5, x5, 0x7); + } + SLLI(x5, x5, 4); // fpu_ll is 2 i64 + ADD(x5, x5, x4); + FMVXD(x3, v1); + LD(x6, x5, 0); // ref + BNE_MARK(x6, x3); + LD(x6, x5, 8); // ll + SD(x6, wback, fixedaddress); + B_MARK3_nocond; + MARK; + } + + if(!box64_dynarec_fastround) { + FSFLAGSI(0); // reset all bits + } + FCVTLD(x4, v1, RD_DYN); + if(!box64_dynarec_fastround) { + FRFLAGS(x5); // get back FPSR to check the IOC bit + ANDI(x5, x5, 1<<FR_NV); + BEQ_MARK2(x5, xZR); + MOV64x(x4, 0x8000000000000000LL); + } + MARK2; + SD(x4, wback, fixedaddress); + MARK3; + x87_restoreround(dyn, ninst, u8); + x87_do_pop(dyn, ninst, x3); + break; default: DEFAULT; break; diff --git a/src/dynarec/rv64/dynarec_rv64_emit_logic.c b/src/dynarec/rv64/dynarec_rv64_emit_logic.c index 6d17895f..1352868b 100644 --- a/src/dynarec/rv64/dynarec_rv64_emit_logic.c +++ b/src/dynarec/rv64/dynarec_rv64_emit_logic.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -16,7 +15,6 @@ #include "emu/x64run_private.h" #include "x64trace.h" #include "dynarec_native.h" -#include "../tools/bridge_private.h" #include "rv64_printer.h" #include "dynarec_rv64_private.h" @@ -165,8 
+163,7 @@ void emit_xor16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, } XOR(s1, s1, s2); - SLLI(s1, s1, 48); - SRLI(s1, s1, 48); + ZEXTH(s1, s1); IFX(X_PEND) { SH(s1, xEmu, offsetof(x64emu_t, res)); @@ -197,8 +194,7 @@ void emit_or16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4) { } OR(s1, s1, s2); - SLLI(s1, s1, 48); - SRLI(s1, s1, 48); + ZEXTH(s1, s1); IFX(X_PEND) { SD(s1, xEmu, offsetof(x64emu_t, res)); } @@ -426,7 +422,7 @@ void emit_and32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i MOV64xw(s3, c); AND(s1, s1, s3); // res = s1 & s2 } - if (!rex.w) ZEROUP(s1); + if (!rex.w && c<0 && c>=-2048) ZEROUP(s1); IFX(X_PEND) { SDxw(s1, xEmu, offsetof(x64emu_t, res)); diff --git a/src/dynarec/rv64/dynarec_rv64_emit_math.c b/src/dynarec/rv64/dynarec_rv64_emit_math.c index 01579ea3..5d6f7e0e 100644 --- a/src/dynarec/rv64/dynarec_rv64_emit_math.c +++ b/src/dynarec/rv64/dynarec_rv64_emit_math.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -16,7 +15,6 @@ #include "emu/x64run_private.h" #include "x64trace.h" #include "dynarec_native.h" -#include "../tools/bridge_private.h" #include "rv64_printer.h" #include "dynarec_rv64_private.h" @@ -37,8 +35,7 @@ void emit_add32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s IFX(X_CF) { if (rex.w) { AND(s5, xMASK, s1); - AND(s4, xMASK, s2); - ADD(s5, s5, s4); // lo + if(rv64_zba) ADDUW(s5, s2, s5); else {AND(s4, xMASK, s2); ADD(s5, s5, s4);} // lo SRLI(s3, s1, 0x20); SRLI(s4, s2, 0x20); ADD(s4, s4, s3); @@ -65,8 +62,12 @@ void emit_add32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s SDxw(s1, xEmu, offsetof(x64emu_t, res)); } IFX(X_AF | X_OF) { - NOT(s5, s1); // s5 = ~res - AND(s3, s5, s3); // s3 = ~res & (op1 | op2) + if(rv64_zbb) { + ANDN(s3, s1, s3); // s3 = ~res & (op1 | op2) + } else { + NOT(s5, s1); // s5 = ~res + AND(s3, s5, s3); // s3 = ~res & (op1 
| op2) + } OR(s3, s3, s4); // cc = (~res & (op1 | op2)) | (op1 & op2) IFX(X_AF) { ANDI(s4, s3, 0x08); // AF: cc & 0x08 @@ -126,8 +127,7 @@ void emit_add32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i IFX(X_CF) { if (rex.w) { AND(s5, xMASK, s1); - AND(s4, xMASK, s2); - ADD(s5, s5, s4); // lo + if(rv64_zba) ADDUW(s5, s2, s5); else {AND(s4, xMASK, s2); ADD(s5, s5, s4);} // lo SRLI(s3, s1, 0x20); SRLI(s4, s2, 0x20); ADD(s4, s4, s3); @@ -159,8 +159,12 @@ void emit_add32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i SDxw(s1, xEmu, offsetof(x64emu_t, res)); } IFX(X_AF | X_OF) { - NOT(s2, s1); // s2 = ~res - AND(s3, s2, s3); // s3 = ~res & (op1 | op2) + if(rv64_zbb) { + ANDN(s3, s1, s3); // s3 = ~res & (op1 | op2) + } else { + NOT(s2, s1); // s2 = ~res + AND(s3, s2, s3); // s3 = ~res & (op1 | op2) + } OR(s3, s3, s4); // cc = (~res & (op1 | op2)) | (op1 & op2) IFX(X_AF) { ANDI(s4, s3, 0x08); // AF: cc & 0x08 @@ -213,8 +217,12 @@ void emit_add16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, SW(s1, xEmu, offsetof(x64emu_t, res)); } IFX(X_AF | X_OF) { - NOT(s5, s1); // s5 = ~res - AND(s3, s5, s3); // s3 = ~res & (op1 | op2) + if(rv64_zbb) { + ANDN(s3, s1, s3); // s3 = ~res & (op1 | op2) + } else { + NOT(s5, s1); // s5 = ~res + AND(s3, s5, s3); // s3 = ~res & (op1 | op2) + } OR(s3, s3, s4); // cc = (~res & (op1 | op2)) | (op1 & op2) IFX(X_AF) { ANDI(s4, s3, 0x08); // AF: cc & 0x08 @@ -237,8 +245,7 @@ void emit_add16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, ORI(xFlags, xFlags, 1 << F_CF); } - SLLI(s1, s1, 48); - SRLI(s1, s1, 48); + ZEXTH(s1, s1); IFX(X_ZF) { BNEZ(s1, 8); @@ -272,8 +279,12 @@ void emit_add8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4) ADD(s1, s1, s2); IFX(X_AF|X_OF) { - NOT(s4, s1); // s4 = ~res - AND(s3, s4, s3); // s3 = ~res & (op1 | op2) + if(rv64_zbb) { + ANDN(s3, s1, s3); // s3 = ~res & (op1 | op2) + } else { + NOT(s4, s1); // s4 = ~res + AND(s3, s4, s3); // s3 
= ~res & (op1 | op2) + } OR(s3, s3, s2); // cc = (~res & (op1 | op2)) | (op1 & op2) IFX(X_AF) { ANDI(s4, s3, 0x08); // AF: cc & 0x08 @@ -332,8 +343,12 @@ void emit_add8c(dynarec_rv64_t* dyn, int ninst, int s1, int c, int s2, int s3, i ADDI(s1, s1, c); IFX(X_AF|X_OF) { - NOT(s2, s1); // s2 = ~res - AND(s3, s2, s3); // s3 = ~res & (op1 | op2) + if(rv64_zbb) { + ANDN(s3, s1, s3); // s3 = ~res & (op1 | op2) + } else { + NOT(s2, s1); // s2 = ~res + AND(s3, s2, s3); // s3 = ~res & (op1 | op2) + } OR(s3, s3, s4); // cc = (~res & (op1 | op2)) | (op1 & op2) IFX(X_AF) { ANDI(s4, s3, 0x08); // AF: cc & 0x08 @@ -580,8 +595,12 @@ void emit_inc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4) SB(s1, xEmu, offsetof(x64emu_t, res)); } IFX(X_AF | X_OF) { - NOT(s2, s1); // s2 = ~res - AND(s3, s2, s3); // s3 = ~res & (op1 | op2) + if(rv64_zbb) { + ANDN(s3, s1, s3); // s3 = ~res & (op1 | op2) + } else { + NOT(s2, s1); // s2 = ~res + AND(s3, s2, s3); // s3 = ~res & (op1 | op2) + } OR(s3, s3, s4); // cc = (~res & (op1 | op2)) | (op1 & op2) IFX(X_AF) { ANDI(s2, s3, 0x08); // AF: cc & 0x08 @@ -625,8 +644,9 @@ void emit_dec8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4) SET_DFNONE(); } IFX(X_AF | X_OF) { - ORI(s3, s1, 1); // s3 = op1 | op2 - ANDI(s4, s1, 1); // s4 = op1 & op2 + NOT(s4, s1); // s4 = ~op1 + ORI(s3, s4, 1); // s3 = ~op1 | op2 + ANDI(s4, s4, 1); // s4 = ~op1 & op2 } ADDIW(s1, s1, -1); @@ -635,9 +655,8 @@ void emit_dec8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4) SB(s1, xEmu, offsetof(x64emu_t, res)); } IFX(X_AF | X_OF) { - NOT(s2, s1); // s2 = ~res - AND(s3, s2, s3); // s3 = ~res & (op1 | op2) - OR(s3, s3, s4); // cc = (~res & (op1 | op2)) | (op1 & op2) + AND(s3, s1, s3); // s3 = res & (~op1 | op2) + OR(s3, s3, s4); // cc = (res & (~op1 | op2)) | (~op1 & op2) IFX(X_AF) { ANDI(s2, s3, 0x08); // AF: cc & 0x08 BEQZ(s2, 8); @@ -689,8 +708,12 @@ void emit_inc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s 
SDxw(s1, xEmu, offsetof(x64emu_t, res)); } IFX(X_AF | X_OF) { - NOT(s2, s1); // s2 = ~res - AND(s3, s2, s3); // s3 = ~res & (op1 | op2) + if(rv64_zbb) { + ANDN(s3, s1, s3); // s3 = ~res & (op1 | op2) + } else { + NOT(s2, s1); // s2 = ~res + AND(s3, s2, s3); // s3 = ~res & (op1 | op2) + } OR(s3, s3, s5); // cc = (~res & (op1 | op2)) | (op1 & op2) IFX(X_AF) { ANDI(s2, s3, 0x08); // AF: cc & 0x08 @@ -781,6 +804,9 @@ void emit_dec32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s // emit INC16 instruction, from s1, store result in s1 using s3 and s4 as scratch void emit_inc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4) { + IFX(X_ALL) { + ANDI(xFlags, xFlags, ~((1UL<<F_AF) | (1UL<<F_OF2) | (1UL<<F_ZF) | (1UL<<F_SF) | (1UL<<F_PF))); + } IFX(X_PEND) { SH(s1, xEmu, offsetof(x64emu_t, op1)); SET_DF(s3, d_inc16); @@ -798,8 +824,12 @@ void emit_inc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4) SH(s1, xEmu, offsetof(x64emu_t, res)); } IFX(X_AF | X_OF) { - NOT(s2, s1); // s2 = ~res - AND(s3, s2, s3); // s3 = ~res & (op1 | op2) + if(rv64_zbb) { + ANDN(s3, s1, s3); // s3 = ~res & (op1 | op2) + } else { + NOT(s2, s1); // s2 = ~res + AND(s3, s2, s3); // s3 = ~res & (op1 | op2) + } OR(s3, s3, s4); // cc = (~res & (op1 | op2)) | (op1 & op2) IFX(X_AF) { ANDI(s4, s3, 0x08); // AF: cc & 0x08 @@ -816,8 +846,7 @@ void emit_inc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4) } } - SLLI(s1, s1, 48); - SRLI(s1, s1, 48); + ZEXTH(s1, s1); IFX(X_ZF) { BNEZ(s1, 8); @@ -909,6 +938,7 @@ void emit_sbb8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i SUBW(s1, s1, s3); ANDI(s1, s1, 0xff); + CLEAR_FLAGS(); IFX(X_PEND) { SB(s1, xEmu, offsetof(x64emu_t, res)); } @@ -928,6 +958,78 @@ void emit_sbb8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, i } } +// emit ADC8 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch +void emit_adc8(dynarec_rv64_t* dyn, int ninst, int s1, int 
s2, int s3, int s4, int s5) { + IFX(X_PEND) { + SH(s1, xEmu, offsetof(x64emu_t, op1)); + SH(s2, xEmu, offsetof(x64emu_t, op2)); + SET_DF(s3, d_adc8); + } else IFX(X_ALL) { + SET_DFNONE(); + } + IFX(X_AF | X_OF) { + OR(s4, s1, s2); // s3 = op1 | op2 + AND(s5, s1, s2); // s4 = op1 & op2 + } + + ADD(s1, s1, s2); + ANDI(s3, xFlags, 1 << F_CF); + ADD(s1, s1, s3); + + CLEAR_FLAGS(); + IFX(X_PEND) { + SW(s1, xEmu, offsetof(x64emu_t, res)); + } + IFX(X_AF | X_OF) { + if(rv64_zbb) { + ANDN(s3, s1, s4); // s3 = ~res & (op1 | op2) + } else { + NOT(s2, s1); // s2 = ~res + AND(s3, s2, s4); // s3 = ~res & (op1 | op2) + } + OR(s3, s3, s5); // cc = (~res & (op1 | op2)) | (op1 & op2) + IFX(X_AF) { + ANDI(s4, s3, 0x08); // AF: cc & 0x08 + BEQZ(s4, 8); + ORI(xFlags, xFlags, 1 << F_AF); + } + IFX(X_OF) { + SRLI(s3, s3, 6); + SRLI(s4, s3, 1); + XOR(s3, s3, s4); + ANDI(s3, s3, 1); // OF: xor of two MSB's of cc + BEQZ(s3, 8); + ORI(xFlags, xFlags, 1 << F_OF2); + } + } + IFX(X_CF) { + SRLI(s3, s1, 8); + BEQZ(s3, 8); + ORI(xFlags, xFlags, 1 << F_CF); + } + + ANDI(s1, s1, 0xff); + + IFX(X_ZF) { + BNEZ(s1, 8); + ORI(xFlags, xFlags, 1 << F_ZF); + } + IFX(X_SF) { + SRLI(s3, s1, 7); + BEQZ(s3, 8); + ORI(xFlags, xFlags, 1 << F_SF); + } + IFX(X_PF) { + emit_pf(dyn, ninst, s1, s3, s4); + } +} + +// emit ADC8 instruction, from s1, const c, store result in s1 using s3, s4, s5 and s6 as scratch +void emit_adc8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5, int s6) { + MOV32w(s5, c&0xff); + emit_adc8(dyn, ninst, s1, s5, s3, s4, s6); +} + // emit SBB8 instruction, from s1, constant c, store result in s1 using s3, s4, s5 and s6 as scratch void emit_sbb8c(dynarec_rv64_t* dyn, int ninst, int s1, int c, int s3, int s4, int s5, int s6) { @@ -955,6 +1057,7 @@ void emit_sbb16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, ANDI(s3, xFlags, 1 << F_CF); SUBW(s1, s1, s3); + CLEAR_FLAGS(); SLLIW(s1, s1, 16); IFX(X_SF) { BGE(s1, xZR, 8); @@ -996,6 +1099,7 @@ void 
emit_sbb32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s ANDI(s3, xFlags, 1 << F_CF); SUBxw(s1, s1, s3); + CLEAR_FLAGS(); IFX(X_SF) { BGE(s1, xZR, 8); ORI(xFlags, xFlags, 1 << F_SF); @@ -1091,8 +1195,7 @@ void emit_neg16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3) } NEG(s1, s1); - SLLI(s1, s1, 48); - SRLI(s1, s1, 48); + ZEXTH(s1, s1); IFX(X_PEND) { SH(s1, xEmu, offsetof(x64emu_t, res)); } @@ -1121,7 +1224,8 @@ void emit_neg16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3) } } IFX(X_SF) { - ANDI(s3, s1, 1 << F_SF); // 1<<F_SF is sign bit, so just mask + SRLI(s3, s1, 15-F_SF); // put sign bit in place + ANDI(s3, s3, 1 << F_SF); // 1<<F_SF is sign bit, so just mask OR(xFlags, xFlags, s3); } IFX(X_PF) { @@ -1192,7 +1296,6 @@ void emit_neg8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3) // emit ADC16 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5) { - CLEAR_FLAGS(); IFX(X_PEND) { SH(s1, xEmu, offsetof(x64emu_t, op1)); SH(s2, xEmu, offsetof(x64emu_t, op2)); @@ -1209,12 +1312,17 @@ void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, ANDI(s3, xFlags, 1 << F_CF); ADD(s1, s1, s3); + CLEAR_FLAGS(); IFX(X_PEND) { SW(s1, xEmu, offsetof(x64emu_t, res)); } IFX(X_AF | X_OF) { - NOT(s2, s1); // s2 = ~res - AND(s3, s2, s4); // s3 = ~res & (op1 | op2) + if(rv64_zbb) { + ANDN(s3, s1, s4); // s3 = ~res & (op1 | op2) + } else { + NOT(s2, s1); // s2 = ~res + AND(s3, s2, s4); // s3 = ~res & (op1 | op2) + } OR(s3, s3, s5); // cc = (~res & (op1 | op2)) | (op1 & op2) IFX(X_AF) { ANDI(s4, s3, 0x08); // AF: cc & 0x08 @@ -1236,8 +1344,7 @@ void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, ORI(xFlags, xFlags, 1 << F_CF); } - SLLI(s1, s1, 48); - SRLI(s1, s1, 48); + ZEXTH(s1, s1); IFX(X_ZF) { BNEZ(s1, 8); @@ -1254,9 +1361,8 @@ void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, 
int s2, int s3, int s4, } // emit ADC32 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch -void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5) +void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5, int s6) { - CLEAR_FLAGS(); IFX(X_PEND) { SDxw(s1, xEmu, offsetof(x64emu_t, op1)); SDxw(s2, xEmu, offsetof(x64emu_t, op2)); @@ -1267,21 +1373,16 @@ void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s IFX(X_CF) { if (rex.w) { AND(s5, xMASK, s1); - AND(s4, xMASK, s2); - ADD(s5, s5, s4); // lo + if(rv64_zba) ADDUW(s5, s2, s5); else {AND(s4, xMASK, s2); ADD(s5, s5, s4);} // lo SRLI(s3, s1, 0x20); SRLI(s4, s2, 0x20); ADD(s4, s4, s3); SRLI(s5, s5, 0x20); ADD(s5, s5, s4); // hi - SRAI(s5, s5, 0x20); - BEQZ(s5, 8); - ORI(xFlags, xFlags, 1 << F_CF); + SRAI(s6, s5, 0x20); } else { ADD(s5, s1, s2); - SRLI(s5, s5, 0x20); - BEQZ(s5, 8); - ORI(xFlags, xFlags, 1 << F_CF); + SRLI(s6, s5, 0x20); } } IFX(X_AF | X_OF) { @@ -1293,12 +1394,21 @@ void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s ANDI(s3, xFlags, 1 << F_CF); ADDxw(s1, s1, s3); + CLEAR_FLAGS(); IFX(X_PEND) { SDxw(s1, xEmu, offsetof(x64emu_t, res)); } + IFX(X_CF) { + BEQZ(s6, 8); + ORI(xFlags, xFlags, 1 << F_CF); + } IFX(X_AF | X_OF) { - NOT(s2, s1); // s2 = ~res - AND(s3, s2, s4); // s3 = ~res & (op1 | op2) + if(rv64_zbb) { + ANDN(s3, s1, s4); // s3 = ~res & (op1 | op2) + } else { + NOT(s2, s1); // s2 = ~res + AND(s3, s2, s4); // s3 = ~res & (op1 | op2) + } OR(s3, s3, s5); // cc = (~res & (op1 | op2)) | (op1 & op2) IFX(X_AF) { ANDI(s4, s3, 0x08); // AF: cc & 0x08 diff --git a/src/dynarec/rv64/dynarec_rv64_emit_shift.c b/src/dynarec/rv64/dynarec_rv64_emit_shift.c index dbcc2d5f..7030c674 100644 --- a/src/dynarec/rv64/dynarec_rv64_emit_shift.c +++ b/src/dynarec/rv64/dynarec_rv64_emit_shift.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include 
<stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -16,7 +15,6 @@ #include "emu/x64run_private.h" #include "x64trace.h" #include "dynarec_native.h" -#include "../tools/bridge_private.h" #include "rv64_printer.h" #include "dynarec_rv64_private.h" @@ -327,11 +325,15 @@ void emit_rol32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s } else { ANDI(s4, s2, 0x1f); } - SLLxw(s3, s1, s4); - NEG(s4, s4); - ADDI(s4, s4, rex.w?64:32); - SRLxw(s1, s1, s4); - OR(s1, s3, s1); + if(rv64_zbb) { + ROLxw(s1, s1, s4); + } else { + SLLxw(s3, s1, s4); + NEG(s4, s4); + ADDI(s4, s4, rex.w?64:32); + SRLxw(s1, s1, s4); + OR(s1, s3, s1); + } IFX(X_PEND) { SDxw(s1, xEmu, offsetof(x64emu_t, res)); } @@ -370,11 +372,15 @@ void emit_ror32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s } else { ANDI(s4, s2, 0x1f); } - SRLxw(s3, s1, s4); - NEG(s4, s4); - ADDI(s4, s4, rex.w?64:32); - SLLxw(s1, s1, s4); - OR(s1, s3, s1); + if(rv64_zbb) { + RORxw(s1, s1, s4); + } else { + SRLxw(s3, s1, s4); + NEG(s4, s4); + ADDI(s4, s4, rex.w?64:32); + SLLxw(s1, s1, s4); + OR(s1, s3, s1); + } IFX(X_PEND) { SDxw(s1, xEmu, offsetof(x64emu_t, res)); } @@ -413,9 +419,13 @@ void emit_rol32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, } return; } - SLLIxw(s3, s1, c); - SRLIxw(s1, s1, (rex.w?64:32)-c); - OR(s1, s3, s1); + if(rv64_zbb) { + RORIxw(s1, s1, (rex.w?64:32)-c); + } else { + SLLIxw(s3, s1, c); + SRLIxw(s1, s1, (rex.w?64:32)-c); + OR(s1, s3, s1); + } IFX(X_PEND) { SDxw(s1, xEmu, offsetof(x64emu_t, res)); } @@ -454,9 +464,13 @@ void emit_ror32c(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, } return; } - SRLIxw(s3, s1, c); - SLLIxw(s1, s1, (rex.w?64:32)-c); - OR(s1, s3, s1); + if(rv64_zbb) { + RORIxw(s1, s1, c); + } else { + SRLIxw(s3, s1, c); + SLLIxw(s1, s1, (rex.w?64:32)-c); + OR(s1, s3, s1); + } IFX(X_PEND) { SDxw(s1, xEmu, offsetof(x64emu_t, res)); } diff --git a/src/dynarec/rv64/dynarec_rv64_emit_tests.c 
b/src/dynarec/rv64/dynarec_rv64_emit_tests.c index 79ebe6cb..00c1fb7d 100644 --- a/src/dynarec/rv64/dynarec_rv64_emit_tests.c +++ b/src/dynarec/rv64/dynarec_rv64_emit_tests.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -16,7 +15,6 @@ #include "emu/x64run_private.h" #include "x64trace.h" #include "dynarec_native.h" -#include "../tools/bridge_private.h" #include "rv64_printer.h" #include "dynarec_rv64_private.h" @@ -108,8 +106,7 @@ void emit_cmp16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, // It's a cmp, we can't store the result back to s1. SUB(s6, s1, s2); IFX(X_ALL) { - SLLI(s6, s6, 48); - SRLI(s6, s6, 48); + ZEXTH(s6, s6); } IFX_PENDOR0 { SH(s6, xEmu, offsetof(x64emu_t, res)); diff --git a/src/dynarec/rv64/dynarec_rv64_f0.c b/src/dynarec/rv64/dynarec_rv64_f0.c index 3ccaafa4..348f2905 100644 --- a/src/dynarec/rv64/dynarec_rv64_f0.c +++ b/src/dynarec/rv64/dynarec_rv64_f0.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -47,12 +46,8 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni rep = opcode-0xF1; opcode = F8; } - // REX prefix before the F0 are ignored - rex.rex = 0; - while(opcode>=0x40 && opcode<=0x4f) { - rex.rex = opcode; - opcode = F8; - } + + GETREX(); // TODO: Take care of unligned memory access for all the LOCK ones. 
// https://github.com/ptitSeb/box64/pull/604 @@ -104,6 +99,101 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni case 0x0F: nextop = F8; switch(nextop) { + case 0xB0: + switch(rep) { + case 0: + INST_NAME("LOCK CMPXCHG Eb, Gb"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + ANDI(x6, xRAX, 0xff); // AL + SMDMB(); + if(MODREG) { + if(rex.rex) { + wback = xRAX+(nextop&7)+(rex.b<<3); + wb2 = 0; + } else { + wback = (nextop&7); + wb2 = (wback>>2)*8; + wback = xRAX+(wback&3); + } + if (wb2) { + MV(x2, wback); + SRLI(x2, x2, wb2); + ANDI(x2, x2, 0xff); + } else { + ANDI(x2, wback, 0xff); + } + wb1 = 0; + ed = x2; + UFLAG_IF { + emit_cmp8(dyn, ninst, x6, ed, x3, x4, x5, x1); + } + BNE_MARK2(x6, x2); + if (wb2) { + MV(wback, x2); + SRLI(wback, wback, wb2); + ANDI(wback, wback, 0xff); + } else { + ANDI(wback, x2, 0xff); + } + GETGB(x1); + MV(ed, gd); + MARK2; + ANDI(xRAX, xRAX, ~0xff); + OR(xRAX, xRAX, x2); + B_NEXT_nocond; + } else { + // this one is tricky, and did some repetitive work. + // mostly because we only got 6 scratch registers, + // and has so much to do. 
+ if(rex.rex) { + gb1 = xRAX+((nextop&0x38)>>3)+(rex.r<<3); + gb2 = 0; + } else { + gd = (nextop&0x38)>>3; + gb2 = ((gd&4)>>2); + gb1 = xRAX+(gd&3); + } + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, LOCK_LOCK, 0, 0); + ANDI(x5, wback, 0b11); + SLLI(x5, x5, 3); // shamt + MARKLOCK; + ANDI(x2, wback, ~0b11); // align to 32bit + LWU(x1, x2, 0); + LR_W(x4, x2, 1, 1); + SRL(x4, x4, x5); + ANDI(x4, x4, 0xff); + BNE_MARK(x6, x4); // compare AL with m8 + // AL == m8, r8 is loaded into m8 + ADDI(x2, xZR, 0xff); + SLL(x2, x2, x5); + NOT(x2, x2); + AND(x2, x1, x2); + if (gb2) { + MV(x1, gb1); + SRLI(x1, x1, 8); + ANDI(x1, x1, 0xff); + } else { + ANDI(x1, gb1, 0xff); + } + SLL(x1, x1, x5); + OR(x1, x1, x2); + ANDI(x2, wback, ~0b11); // align to 32bit again + SC_W(x9, x1, x2, 1, 1); + BNEZ_MARKLOCK(x9); + // done + MARK; + UFLAG_IF {emit_cmp8(dyn, ninst, x6, x4, x1, x2, x3, x5);} + // load m8 into AL + ANDI(xRAX, xRAX, ~0xff); + OR(xRAX, xRAX, x4); + } + SMDMB(); + break; + default: + DEFAULT; + } + break; case 0xB1: switch (rep) { case 0: @@ -188,9 +278,16 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni addr = geted(dyn, addr, ninst, nextop, &wback, x1, x2, &fixedaddress, rex, LOCK_LOCK, 0, 0); ANDI(xFlags, xFlags, ~(1<<F_ZF)); if (rex.w) { - // there is no atomic move on 16bytes, so faking it + // there is no atomic move on 16bytes, so implement it with mutex + LD(x9, xEmu, offsetof(x64emu_t, context)); + ADDI(x9, x9, offsetof(box64context_t, mutex_16b)); + ADDI(x4, xZR, 1); + MARKLOCK; + AMOSWAP_W(x4, x4, x9, 1, 1); + // x4 == 1 if locked + BNEZ_MARKLOCK(x4); + SMDMB(); - // MARKLOCK; LD(x2, wback, 0); LD(x3, wback, 8); BNE_MARK(x2, xRAX); @@ -204,6 +301,9 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni MV(xRDX, x3); MARK3; SMDMB(); + + // unlock + AMOSWAP_W(xZR, xZR, x9, 1, 1); } else { SMDMB(); MARKLOCK; @@ -260,6 +360,64 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, 
uintptr_t addr, uintptr_t ip, int ni } SMDMB(); break; + case 0x29: + INST_NAME("LOCK SUB Ed, Gd"); + SETFLAGS(X_ALL, SF_SET_PENDING); + nextop = F8; + GETGD; + SMDMB(); + if(MODREG) { + ed = xRAX+(nextop&7)+(rex.b<<3); + emit_sub32(dyn, ninst, rex, ed, gd, x3, x4, x5); + } else { + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, LOCK_LOCK, 0, 0); + MARKLOCK; + LRxw(x1, wback, 1, 1); + SUB(x4, x1, gd); + SCxw(x3, x4, wback, 1, 1); + BNEZ_MARKLOCK(x3); + IFX(X_ALL|X_PEND) + emit_sub32(dyn, ninst, rex, x1, gd, x3, x4, x5); + } + SMDMB(); + break; + case 0x80: + nextop = F8; + SMDMB(); + switch((nextop>>3)&7) { + case 1: // OR + INST_NAME("LOCK OR Eb, Ib"); + SETFLAGS(X_ALL, SF_SET_PENDING); + if(MODREG) { + GETEB(x1, 1); + u8 = F8; + emit_or8c(dyn, ninst, x1, u8, x2, x4, x5); + EBBACK(x5, 0); + } else { + addr = geted(dyn, addr, ninst, nextop, &wback, x5, x1, &fixedaddress, rex, LOCK_LOCK, 0, 1); + u8 = F8; + ANDI(x2, wback, 3); + SLLI(x2, x2, 3); // offset in bits + ANDI(x3, wback, ~3); // aligned addr + ADDI(x1, xZR, u8); + SLL(x1, x1, x2); // Ib << offset + MARKLOCK; + LR_W(x4, x3, 1, 1); + OR(x6, x4, x1); + SC_W(x6, x6, x3, 1, 1); + BNEZ_MARKLOCK(x6); + IFX(X_ALL|X_PEND) { + SRL(x1, x4, x2); + ANDI(x1, x1, 0xFF); + emit_or8c(dyn, ninst, x1, u8, x2, x4, x5); + } + } + break; + default: + DEFAULT; + } + SMDMB(); + break; case 0x81: case 0x83: nextop = F8; @@ -379,7 +537,7 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni emit_sub32c(dyn, ninst, rex, x1, i64, x3, x4, x5, x6); } break; - default: + default: DEFAULT; } SMDMB(); diff --git a/src/dynarec/rv64/dynarec_rv64_f20f.c b/src/dynarec/rv64/dynarec_rv64_f20f.c index 95f526f0..ac3da811 100644 --- a/src/dynarec/rv64/dynarec_rv64_f20f.c +++ b/src/dynarec/rv64/dynarec_rv64_f20f.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -35,7 +34,7 @@ uintptr_t 
dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int int v0, v1; int q0; int d0, d1; - int64_t fixedaddress; + int64_t fixedaddress, gdoffset; int unscaled; MAYUSE(d0); @@ -82,11 +81,11 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x12: INST_NAME("MOVDDUP Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); LD(x3, wback, fixedaddress+0); - SD(x3, gback, 0); - SD(x3, gback, 8); + SD(x3, gback, gdoffset+0); + SD(x3, gback, gdoffset+8); break; case 0x2A: INST_NAME("CVTSI2SD Gx, Ed"); @@ -105,7 +104,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int GETGD; GETEXSD(v0, 0); if(!box64_dynarec_fastround) { - FSFLAGSI(xZR); // // reset all bits + FSFLAGSI(0); // // reset all bits } FCVTLDxw(gd, v0, RD_RTZ); if(!rex.w) @@ -127,7 +126,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int GETGD; GETEXSD(v0, 0); if(!box64_dynarec_fastround) { - FSFLAGSI(xZR); // // reset all bits + FSFLAGSI(0); // // reset all bits } u8 = sse_setround(dyn, ninst, x2, x3); FCVTLDxw(gd, v0, RD_DYN); @@ -184,8 +183,9 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int GETEXSD(d0, 0); GETGXSD_empty(d1); if(!box64_dynarec_fastnan) { - FMVDX(d1, xZR); - FLTD(x3, d0, d1); + v0 = fpu_get_scratch(dyn); // need a scratch in case d0 == d1 + FMVDX(v0, xZR); + FLTD(x3, d0, v0); } FSQRTD(d1, d0); if(!box64_dynarec_fastnan) { @@ -275,7 +275,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x70: // TODO: Optimize this! 
INST_NAME("PSHUFLW Gx, Ex, Ib"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 1); u8 = F8; int32_t idx; @@ -289,14 +289,14 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int idx = (u8>>(3*2))&3; LHU(x6, wback, fixedaddress+idx*2); - SH(x3, gback, 0*2); - SH(x4, gback, 1*2); - SH(x5, gback, 2*2); - SH(x6, gback, 3*2); + SH(x3, gback, gdoffset+0*2); + SH(x4, gback, gdoffset+1*2); + SH(x5, gback, gdoffset+2*2); + SH(x6, gback, gdoffset+3*2); if (!(MODREG && (gd==ed))) { LD(x3, wback, fixedaddress+8); - SD(x3, gback, 8); + SD(x3, gback, gdoffset+8); } break; case 0xC2: @@ -334,7 +334,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int } case 7: break; // Not NaN } - + MARK2; if ((u8&7) == 5 || (u8&7) == 6) { MOV32w(x2, 1); @@ -347,7 +347,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0xE6: INST_NAME("CVTPD2DQ Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); d0 = fpu_get_scratch(dyn); u8 = sse_setround(dyn, ninst, x6, x4); @@ -358,10 +358,17 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int SUB(x5, x5, x3); BEQZ(x5, 8); LUI(x3, 0x80000); // INT32_MIN - SW(x3, gback, 4*i); + SW(x3, gback, gdoffset+4*i); } x87_restoreround(dyn, ninst, u8); - SD(xZR, gback, 8); + SD(xZR, gback, gdoffset+8); + break; + case 0xF0: + INST_NAME("LDDQU Gx,Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + SSE_LOOP_MV_Q(x3); break; default: DEFAULT; diff --git a/src/dynarec/rv64/dynarec_rv64_f30f.c b/src/dynarec/rv64/dynarec_rv64_f30f.c index 489d5ca0..0c0676e0 100644 --- a/src/dynarec/rv64/dynarec_rv64_f30f.c +++ b/src/dynarec/rv64/dynarec_rv64_f30f.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include "debug.h" @@ -35,7 +34,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int int v0, v1; int q0, q1; int d0, d1; - int64_t fixedaddress; + 
int64_t fixedaddress, gdoffset; int unscaled; int64_t j64; @@ -80,7 +79,22 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int SMWRITE2(); } break; - + + case 0x12: + INST_NAME("MOVSLDUP Gx, Ex"); + nextop = F8; + GETGX(); + GETEX(x2, 0); + + // GX->ud[1] = GX->ud[0] = EX->ud[0]; + // GX->ud[3] = GX->ud[2] = EX->ud[2]; + LD(x3, wback, fixedaddress+0); + SD(x3, gback, gdoffset+0); + SD(x3, gback, gdoffset+4); + LD(x3, wback, fixedaddress+8); + SD(x3, gback, gdoffset+8); + SD(x3, gback, gdoffset+12); + break; case 0x1E: INST_NAME("NOP / ENDBR32 / ENDBR64"); nextop = F8; @@ -105,7 +119,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int GETGD; GETEXSS(d0, 0); if(!box64_dynarec_fastround) { - FSFLAGSI(xZR); // // reset all bits + FSFLAGSI(0); // // reset all bits } FCVTSxw(gd, d0, RD_RTZ); if(!rex.w) @@ -121,6 +135,31 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int } } break; + case 0x2D: + INST_NAME("CVTSS2SI Gd, Ex"); + nextop = F8; + GETGD; + GETEXSS(d0, 0); + if(!box64_dynarec_fastround) { + FSFLAGSI(0); // // reset all bits + } + u8 = sse_setround(dyn, ninst, x5, x6); + FCVTSxw(gd, d0, RD_DYN); + x87_restoreround(dyn, ninst, u8); + if(!rex.w) + ZEROUP(gd); + if(!box64_dynarec_fastround) { + FRFLAGS(x5); // get back FPSR to check the IOC bit + ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF)); + CBZ_NEXT(x5); + if(rex.w) { + MOV64x(gd, 0x8000000000000000LL); + } else { + MOV32w(gd, 0x80000000); + } + } + break; + case 0x51: INST_NAME("SQRTSS Gx, Ex"); nextop = F8; @@ -128,6 +167,16 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int GETGXSS_empty(v1); FSQRTS(v1, v0); break; + case 0x53: + INST_NAME("RCPSS Gx, Ex"); + nextop = F8; + GETEXSS(v0, 0); + GETGXSS_empty(v1); + q0 = fpu_get_scratch(dyn); + LUI(x3, 0x3F800); // 1.0f + FMVWX(q0, x3); + FDIVS(v1, q0, v0); + break; case 0x58: INST_NAME("ADDSS Gx, Ex"); nextop = F8; @@ -196,14 +245,14 @@ uintptr_t 
dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x6F: INST_NAME("MOVDQU Gx,Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_MV_Q(x3); break; case 0x70: // TODO: Optimize this! INST_NAME("PSHUFHW Gx, Ex, Ib"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 1); u8 = F8; int32_t idx; @@ -217,14 +266,14 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int idx = 4+((u8>>(3*2))&3); LHU(x6, wback, fixedaddress+idx*2); - SH(x3, gback, (4+0)*2); - SH(x4, gback, (4+1)*2); - SH(x5, gback, (4+2)*2); - SH(x6, gback, (4+3)*2); + SH(x3, gback, gdoffset+(4+0)*2); + SH(x4, gback, gdoffset+(4+1)*2); + SH(x5, gback, gdoffset+(4+2)*2); + SH(x6, gback, gdoffset+(4+3)*2); if (!(MODREG && (gd==ed))) { LD(x3, wback, fixedaddress+0); - SD(x3, gback, 0); + SD(x3, gback, gdoffset+0); } break; case 0x7E: @@ -246,21 +295,21 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0x7F: INST_NAME("MOVDQU Ex,Gx"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); SSE_LOOP_MV_Q2(x3); if(!MODREG) SMWRITE2(); break; - + case 0x5B: INST_NAME("CVTTPS2DQ Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); v0 = fpu_get_scratch(dyn); for(int i=0; i<4; ++i) { if(!box64_dynarec_fastround) { - FSFLAGSI(xZR); // reset all bits + FSFLAGSI(0); // reset all bits } FLW(v0, wback, fixedaddress+i*4); FCVTWS(x3, v0, RD_RTZ); @@ -270,7 +319,49 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int BEQZ(x5, 8); MOV32w(x3, 0x80000000); } - SW(x3, gback, i*4); + SW(x3, gback, gdoffset+i*4); + } + break; + case 0xB8: + INST_NAME("POPCNT Gd, Ed"); + SETFLAGS(X_ALL, SF_SET); + SET_DFNONE(); + nextop = F8; + GETED(0); + GETGD; + if(!rex.w && MODREG) { + AND(x4, ed, xMASK); + ed = x4; + } + CLEAR_FLAGS(); + BNE_MARK(ed, xZR); + ORI(xFlags, xFlags, 1<<F_ZF); + MOV32w(gd, 0); + B_NEXT_nocond; + MARK; + if(rv64_zbb) { + CPOPxw(gd, ed); + } else { + TABLE64(x1, 0x5555555555555555uLL); + 
SRLI(x5, ed, 1); + AND(x5, x5, x1); + SUB(x5, ed, x5); + TABLE64(x3, 0x3333333333333333uLL); + SRLI(x1, x5, 2); + AND(x1, x1, x3); + AND(x5, x5, x3); + ADD(x5, x5, x1); + TABLE64(x3, 0x0F0F0F0F0F0F0F0FuLL); + SRLI(x1, x5, 4); + ADD(x5, x5, x1); + AND(x5, x5, x3); + SRLI(x1, x5, 32); + ADDW(x5, x5, x1); + SRLIW(x1, x5, 16); + ADDW(x5, x5, x1); + SRLIW(x1, x5, 8); + ADDW(x5, x5, x1); + ANDI(gd, x5, 0x7F); } break; case 0xBC: @@ -284,21 +375,24 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int AND(x4, ed, xMASK); ed = x4; } - BNE_MARK(ed, xZR); ANDI(xFlags, xFlags, ~((1<<F_ZF) | (1<<F_CF))); + BNE_MARK(ed, xZR); ORI(xFlags, xFlags, 1<<F_CF); MOV32w(gd, rex.w?64:32); B_NEXT_nocond; MARK; - NEG(x2, ed); - AND(x2, x2, ed); - TABLE64(x3, 0x03f79d71b4ca8b09ULL); - MUL(x2, x2, x3); - SRLI(x2, x2, 64-6); - TABLE64(x1, (uintptr_t)&deBruijn64tab); - ADD(x1, x1, x2); - LBU(gd, x1, 0); - ANDI(xFlags, xFlags, ~((1<<F_ZF) | (1<<F_CF))); + if(rv64_zbb) { + CTZxw(gd, ed); + } else { + NEG(x2, ed); + AND(x2, x2, ed); + TABLE64(x3, 0x03f79d71b4ca8b09ULL); + MUL(x2, x2, x3); + SRLI(x2, x2, 64-6); + TABLE64(x1, (uintptr_t)&deBruijn64tab); + ADD(x1, x1, x2); + LBU(gd, x1, 0); + } BNE(gd, xZR, 4+4); ORI(xFlags, xFlags, 1<<F_ZF); break; @@ -319,38 +413,42 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ORI(xFlags, xFlags, 1<<F_CF); B_NEXT_nocond; MARK; - if(ed!=gd) - u8 = gd; - else - u8 = x1; - ADDI(u8, xZR, rex.w?63:31); - if(rex.w) { - MV(x2, ed); - SRLI(x3, x2, 32); + if(rv64_zbb) { + CLZxw(gd, ed); + } else { + if(ed!=gd) + u8 = gd; + else + u8 = x1; + ADDI(u8, xZR, rex.w?63:31); + if(rex.w) { + MV(x2, ed); + SRLI(x3, x2, 32); + BEQZ(x3, 4+2*4); + SUBI(u8, u8, 32); + MV(x2, x3); + } else { + AND(x2, ed, xMASK); + } + SRLI(x3, x2, 16); BEQZ(x3, 4+2*4); - SUBI(u8, u8, 32); + SUBI(u8, u8, 16); MV(x2, x3); - } else { - AND(x2, ed, xMASK); + SRLI(x3, x2, 8); + BEQZ(x3, 4+2*4); + SUBI(u8, u8, 8); + MV(x2, x3); + SRLI(x3, 
x2, 4); + BEQZ(x3, 4+2*4); + SUBI(u8, u8, 4); + MV(x2, x3); + ANDI(x2, x2, 0b1111); + TABLE64(x3, (uintptr_t)&lead0tab); + ADD(x3, x3, x2); + LBU(x2, x3, 0); + SUB(gd, u8, x2); + MARK2; } - SRLI(x3, x2, 16); - BEQZ(x3, 4+2*4); - SUBI(u8, u8, 16); - MV(x2, x3); - SRLI(x3, x2, 8); - BEQZ(x3, 4+2*4); - SUBI(u8, u8, 8); - MV(x2, x3); - SRLI(x3, x2, 4); - BEQZ(x3, 4+2*4); - SUBI(u8, u8, 4); - MV(x2, x3); - ANDI(x2, x2, 0b1111); - TABLE64(x3, (uintptr_t)&lead0tab); - ADD(x3, x3, x2); - LBU(x2, x3, 0); - SUB(gd, u8, x2); - MARK2; ANDI(xFlags, xFlags, ~((1<<F_ZF) | (1<<F_CF))); BNE(gd, xZR, 4+4); ORI(xFlags, xFlags, 1<<F_ZF); @@ -391,7 +489,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int } case 7: break; // Not NaN } - + MARK2; if ((u8&7) == 5 || (u8&7) == 6) { MOV32w(x2, 1); @@ -405,7 +503,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int case 0xE6: INST_NAME("CVTDQ2PD Gx, Ex"); nextop = F8; - GETGX(x1); + GETGX(); GETEX(x2, 0); q0 = fpu_get_scratch(dyn); q1 = fpu_get_scratch(dyn); @@ -413,8 +511,8 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int LW(x4, wback, fixedaddress+4); FCVTDW(q0, x3, RD_RTZ); FCVTDW(q1, x4, RD_RTZ); - FSD(q0, gback, 0); - FSD(q1, gback, 8); + FSD(q0, gback, gdoffset+0); + FSD(q1, gback, gdoffset+8); break; default: diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c index dade3016..541ac45f 100644 --- a/src/dynarec/rv64/dynarec_rv64_functions.c +++ b/src/dynarec/rv64/dynarec_rv64_functions.c @@ -1,7 +1,6 @@ #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> -#include <pthread.h> #include <errno.h> #include <string.h> #include <math.h> @@ -13,7 +12,6 @@ #include "box64context.h" #include "dynarec.h" #include "emu/x64emu_private.h" -#include "tools/bridge_private.h" #include "x64run.h" #include "x64emu.h" #include "box64stack.h" @@ -130,7 +128,7 @@ int extcache_get_st_f(dynarec_rv64_t* dyn, 
int ninst, int a) && dyn->insts[ninst].e.extcache[i].n==a) return i; return -1; -} +} int extcache_get_st_f_noback(dynarec_rv64_t* dyn, int ninst, int a) { for(int i=0; i<24; ++i) @@ -138,7 +136,7 @@ int extcache_get_st_f_noback(dynarec_rv64_t* dyn, int ninst, int a) && dyn->insts[ninst].e.extcache[i].n==a) return i; return -1; -} +} int extcache_get_current_st_f(dynarec_rv64_t* dyn, int a) { for(int i=0; i<24; ++i) @@ -146,7 +144,7 @@ int extcache_get_current_st_f(dynarec_rv64_t* dyn, int a) && dyn->e.extcache[i].n==a) return i; return -1; -} +} static void extcache_promote_double_forward(dynarec_rv64_t* dyn, int ninst, int maxinst, int a); static void extcache_promote_double_internal(dynarec_rv64_t* dyn, int ninst, int maxinst, int a); @@ -155,7 +153,7 @@ static void extcache_promote_double_combined(dynarec_rv64_t* dyn, int ninst, int if(a == dyn->insts[ninst].e.combined1 || a == dyn->insts[ninst].e.combined2) { if(a == dyn->insts[ninst].e.combined1) { a = dyn->insts[ninst].e.combined2; - } else + } else a = dyn->insts[ninst].e.combined1; int i = extcache_get_st_f_noback(dyn, ninst, a); //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "extcache_promote_double_combined, ninst=%d combined%c %d i=%d (stack:%d/%d)\n", ninst, (a == dyn->insts[ninst].e.combined2)?'2':'1', a ,i, dyn->insts[ninst].e.stack_push, -dyn->insts[ninst].e.stack_pop); @@ -328,7 +326,7 @@ void extcacheUnwind(extcache_t* cache) { if(cache->swapped) { // unswap - int a = -1; + int a = -1; int b = -1; for(int j=0; j<24 && ((a==-1) || (b==-1)); ++j) if((cache->extcache[j].t == EXT_CACHE_ST_D || cache->extcache[j].t == EXT_CACHE_ST_F)) { @@ -346,12 +344,21 @@ void extcacheUnwind(extcache_t* cache) cache->combined1 = cache->combined2 = 0; } if(cache->news) { - // reove the newly created extcache + // remove the newly created extcache for(int i=0; i<24; ++i) if(cache->news&(1<<i)) cache->extcache[i].v = 0; cache->news = 0; } + // add/change bad regs + for(int i=0; i<16; ++i) { + if(cache->olds[i].changed) 
{ + cache->extcache[i].t = cache->olds[i].single?EXT_CACHE_SS:EXT_CACHE_SD; + } else if(cache->olds[i].purged) { + cache->extcache[i].n = i; + cache->extcache[i].t = cache->olds[i].single?EXT_CACHE_SS:EXT_CACHE_SD; + } + } if(cache->stack_push) { // unpush for(int j=0; j<24; ++j) { @@ -484,15 +491,22 @@ const char* getCacheName(int t, int n) return buff; } -void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name) +void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t rex) { + static const char* fnames[] = { + "ft0"," ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", + "fs0", "fs1", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", + "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fs8", "fs9", "fs10", "fs11", + "ft8", "ft9", "ft10", "ft11" + }; if(box64_dynarec_dump) { - printf_x64_instruction(my_context->dec, &dyn->insts[ninst].x64, name); + printf_x64_instruction(rex.is32bits?my_context->dec32:my_context->dec, &dyn->insts[ninst].x64, name); dynarec_log(LOG_NONE, "%s%p: %d emitted opcodes, inst=%d, barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X, sm=%d/%d", (box64_dynarec_dump>1)?"\e[32m":"", (void*)(dyn->native_start+dyn->insts[ninst].address), dyn->insts[ninst].size/4, - ninst, + ninst, dyn->insts[ninst].x64.barrier, dyn->insts[ninst].x64.state_flags, dyn->f.pending, @@ -517,12 +531,12 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name) dynarec_log(LOG_NONE, ", last_ip=%p", (void*)dyn->last_ip); for(int ii=0; ii<24; ++ii) { switch(dyn->insts[ninst].e.extcache[ii].t) { - case EXT_CACHE_ST_D: dynarec_log(LOG_NONE, " D%d:%s", EXTREG(ii), getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; - case EXT_CACHE_ST_F: dynarec_log(LOG_NONE, " S%d:%s", EXTREG(ii), getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; - case EXT_CACHE_MM: dynarec_log(LOG_NONE, " D%d:%s", EXTREG(ii), 
getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; - case EXT_CACHE_SS: dynarec_log(LOG_NONE, " S%d:%s", EXTREG(ii), getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; - case EXT_CACHE_SD: dynarec_log(LOG_NONE, " D%d:%s", EXTREG(ii), getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; - case EXT_CACHE_SCR: dynarec_log(LOG_NONE, " D%d:%s", EXTREG(ii), getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; + case EXT_CACHE_ST_D: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; + case EXT_CACHE_ST_F: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; + case EXT_CACHE_MM: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; + case EXT_CACHE_SS: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; + case EXT_CACHE_SD: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; + case EXT_CACHE_SCR: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break; case EXT_CACHE_NONE: default: break; } diff --git a/src/dynarec/rv64/dynarec_rv64_functions.h b/src/dynarec/rv64/dynarec_rv64_functions.h index fc53dcd7..451336bd 100644 --- a/src/dynarec/rv64/dynarec_rv64_functions.h +++ b/src/dynarec/rv64/dynarec_rv64_functions.h @@ -45,7 +45,7 @@ void extcacheUnwind(extcache_t* cache); const char* getCacheName(int t, int n); -void inst_name_pass3(dynarec_native_t* dyn, int 
ninst, const char* name); +void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t rex); void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode); void print_newinst(dynarec_native_t* dyn, int ninst); diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c index 37bcec29..a005c3b9 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.c +++ b/src/dynarec/rv64/dynarec_rv64_helper.c @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <stddef.h> -#include <pthread.h> #include <errno.h> #include <assert.h> #include <string.h> @@ -20,7 +19,6 @@ #include "x64trace.h" #include "dynarec_native.h" #include "../dynablock_private.h" -#include "../tools/bridge_private.h" #include "custommem.h" #include "rv64_printer.h" @@ -28,11 +26,16 @@ #include "dynarec_rv64_functions.h" #include "dynarec_rv64_helper.h" +static uintptr_t geted_32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, int *l, int i12); + /* setup r2 to address pointed by ED, also fixaddress is an optionnal delta in the range [-absmax, +absmax], with delta&mask==0 to be added to ed for LDR/STR */ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int *l, int i12, int delta) { MAYUSE(dyn); MAYUSE(ninst); MAYUSE(delta); + if(rex.is32bits) + return geted_32(dyn, addr, ninst, nextop, ed, hint, scratch, fixaddress, l, i12); + int lock = l?((l==LOCK_LOCK)?1:2):0; if(lock==2) *l = 0; @@ -47,14 +50,19 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, if((nextop&7)==4) { uint8_t sib = F8; int sib_reg = ((sib>>3)&7)+(rex.x<<3); + int sib_reg2 = (sib&0x7)+(rex.b<<3); if((sib&0x7)==5) { int64_t tmp = F32S; if (sib_reg!=4) { if(tmp && ((tmp<-2048) || (tmp>maxval) || !i12)) { MOV64x(scratch, tmp); if((sib>>6)) { - SLLI(ret, xRAX+sib_reg, 
(sib>>6)); - ADD(ret, ret, scratch); + if(rv64_zba) { + SHxADD(ret, xRAX+sib_reg, sib>>6, scratch); + } else { + SLLI(ret, xRAX+sib_reg, (sib>>6)); + ADD(ret, ret, scratch); + } } else { ADD(ret, xRAX+sib_reg, scratch); } @@ -75,13 +83,17 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, } else { if (sib_reg!=4) { if(sib>>6) { - SLLI(scratch, xRAX+sib_reg, (sib>>6)); - ADD(ret, xRAX+(sib&0x7)+(rex.b<<3), scratch); + if(rv64_zba) { + SHxADD(ret, xRAX+sib_reg, sib>>6, xRAX+sib_reg2); + } else { + SLLI(scratch, xRAX+sib_reg, (sib>>6)); + ADD(ret, xRAX+sib_reg2, scratch); + } } else { - ADD(ret, xRAX+(sib&0x7)+(rex.b<<3), xRAX+sib_reg); + ADD(ret, xRAX+sib_reg2, xRAX+sib_reg); } } else { - ret = xRAX+(sib&0x7)+(rex.b<<3); + ret = xRAX+sib_reg2; } } } else if((nextop&7)==5) { @@ -125,6 +137,7 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, sib = F8; sib_reg = ((sib>>3)&7)+(rex.x<<3); } + int sib_reg2 = (sib&0x07)+(rex.b<<3); if(nextop&0x80) i64 = F32S; else @@ -134,13 +147,17 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, if((nextop&7)==4) { if (sib_reg!=4) { if(sib>>6) { - SLLI(scratch, xRAX+sib_reg, (sib>>6)); - ADD(ret, xRAX+(sib&0x07)+(rex.b<<3), scratch); + if(rv64_zba) { + SHxADD(ret, xRAX+sib_reg, sib>>6, xRAX+sib_reg2); + } else { + SLLI(scratch, xRAX+sib_reg, (sib>>6)); + ADD(ret, xRAX+sib_reg2, scratch); + } } else { - ADD(ret, xRAX+(sib&0x07)+(rex.b<<3), xRAX+sib_reg); + ADD(ret, xRAX+sib_reg2, xRAX+sib_reg); } } else { - ret = xRAX+(sib&0x07)+(rex.b<<3); + ret = xRAX+sib_reg2; } } else ret = xRAX+(nextop&0x07)+(rex.b<<3); @@ -149,13 +166,17 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, if((nextop&7)==4) { if (sib_reg!=4) { if(sib>>6) { - SLLI(scratch, xRAX+sib_reg, (sib>>6)); - ADD(scratch, xRAX+(sib&0x07)+(rex.b<<3), scratch); + if(rv64_zba) { + SHxADD(scratch, xRAX+sib_reg, sib>>6, xRAX+sib_reg2); + } else { + 
SLLI(scratch, xRAX+sib_reg, (sib>>6)); + ADD(scratch, xRAX+sib_reg2, scratch); + } } else { - ADD(scratch, xRAX+(sib&0x07)+(rex.b<<3), xRAX+sib_reg); + ADD(scratch, xRAX+sib_reg2, xRAX+sib_reg); } } else { - scratch = xRAX+(sib&0x07)+(rex.b<<3); + scratch = xRAX+sib_reg2; } } else scratch = xRAX+(nextop&0x07)+(rex.b<<3); @@ -164,15 +185,19 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, MOV64x(scratch, i64); if((nextop&7)==4) { if (sib_reg!=4) { - ADD(scratch, scratch, xRAX+(sib&0x07)+(rex.b<<3)); + ADD(scratch, scratch, xRAX+sib_reg2); if(sib>>6) { - SLLI(ret, xRAX+sib_reg, (sib>>6)); - ADD(ret, scratch, ret); + if(rv64_zba) { + SHxADD(ret, xRAX+sib_reg, sib>>6, scratch); + } else { + SLLI(ret, xRAX+sib_reg, (sib>>6)); + ADD(ret, scratch, ret); + } } else { ADD(ret, scratch, xRAX+sib_reg); } } else { - PASS3(int tmp = xRAX+(sib&0x07)+(rex.b<<3)); + PASS3(int tmp = xRAX+sib_reg2); ADD(ret, tmp, scratch); } } else { @@ -186,6 +211,269 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, return addr; } +static uintptr_t geted_32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, int *l, int i12) +{ + MAYUSE(dyn); MAYUSE(ninst); + + int lock = l?((l==LOCK_LOCK)?1:2):0; + if(lock==2) + *l = 0; + uint8_t ret = x2; + *fixaddress = 0; + if(hint>0) ret = hint; + int maxval = 2047; + if(i12>1) + maxval -= i12; + MAYUSE(scratch); + if(!(nextop&0xC0)) { + if((nextop&7)==4) { + uint8_t sib = F8; + int sib_reg = (sib>>3)&0x7; + int sib_reg2 = sib&0x7; + if(sib_reg2==5) { + int64_t tmp = F32S; + if (sib_reg!=4) { + if(tmp && ((tmp<-2048) || (tmp>maxval) || !i12)) { + MOV32w(scratch, tmp); + if((sib>>6)) { + if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), scratch); else {SLLI(ret, xRAX+sib_reg, sib>>6); ADDW(ret, ret, scratch);} + } else + ADDW(ret, xRAX+sib_reg, scratch); + } else { + if(sib>>6) + SLLI(ret, xRAX+sib_reg, (sib>>6)); + 
else + ret = xRAX+sib_reg; + *fixaddress = tmp; + } + } else { + switch(lock) { + case 1: addLockAddress((int32_t)tmp); break; + case 2: if(isLockAddress((int32_t)tmp)) *l=1; break; + } + MOV32w(ret, tmp); + } + } else { + if (sib_reg!=4) { + if((sib>>6)) { + if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), xRAX+sib_reg2); else { SLLI(ret, xRAX+sib_reg, (sib>>6)); ADDW(ret, ret, xRAX+sib_reg2);} + } else + ADDW(ret, xRAX+sib_reg2, xRAX+sib_reg); + } else { + ret = xRAX+sib_reg2; + } + } + } else if((nextop&7)==5) { + uint32_t tmp = F32; + MOV32w(ret, tmp); + switch(lock) { + case 1: addLockAddress(tmp); break; + case 2: if(isLockAddress(tmp)) *l=1; break; + } + } else { + ret = xRAX+(nextop&7); + if(ret==hint) { + AND(hint, ret, xMASK); //to clear upper part + } + } + } else { + int64_t i32; + uint8_t sib = 0; + int sib_reg = 0; + if((nextop&7)==4) { + sib = F8; + sib_reg = (sib>>3)&7; + } + int sib_reg2 = sib&0x07; + if(nextop&0x80) + i32 = F32S; + else + i32 = F8S; + if(i32==0 || ((i32>=-2048) && (i32<=2047) && i12)) { + *fixaddress = i32; + if((nextop&7)==4) { + if (sib_reg!=4) { + if(sib>>6) { + if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), xRAX+sib_reg2); else {SLLI(ret, xRAX+sib_reg, (sib>>6)); ADDW(ret, ret, xRAX+sib_reg2);} + } else + ADDW(ret, xRAX+sib_reg2, xRAX+sib_reg); + } else { + ret = xRAX+sib_reg2; + } + } else { + ret = xRAX+(nextop&0x07); + } + } else { + if(i32>=-2048 && i32<=2047) { + if((nextop&7)==4) { + if (sib_reg!=4) { + if(sib>>6) { + if(rv64_zba) SHxADDUW(scratch, xRAX+sib_reg, (sib>>6), xRAX+sib_reg2); else {SLLI(scratch, xRAX+sib_reg, sib>>6); ADDW(scratch, scratch, xRAX+sib_reg2);} + } else + ADDW(scratch, xRAX+sib_reg2, xRAX+sib_reg); + } else { + scratch = xRAX+sib_reg2; + } + } else + scratch = xRAX+(nextop&0x07); + ADDIW(ret, scratch, i32); + } else { + MOV32w(scratch, i32); + if((nextop&7)==4) { + if (sib_reg!=4) { + ADDW(scratch, scratch, xRAX+sib_reg2); + if(sib>>6) { + if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), 
scratch); else {SLLI(ret, xRAX+sib_reg, (sib>>6)); ADDW(ret, ret, scratch);} + } else + ADDW(ret, scratch, xRAX+sib_reg); + } else { + PASS3(int tmp = xRAX+sib_reg2); + ADDW(ret, tmp, scratch); + } + } else { + PASS3(int tmp = xRAX+(nextop&0x07)); + ADDW(ret, tmp, scratch); + } + } + } + } + *ed = ret; + return addr; +} + +/* setup r2 to address pointed by ED, also fixaddress is an optionnal delta in the range [-absmax, +absmax], with delta&mask==0 to be added to ed for LDR/STR */ +uintptr_t geted32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int *l, int i12, int delta) +{ + MAYUSE(dyn); MAYUSE(ninst); MAYUSE(delta); + + int lock = l?((l==LOCK_LOCK)?1:2):0; + if(lock==2) + *l = 0; + uint8_t ret = x2; + *fixaddress = 0; + if(hint>0) ret = hint; + int maxval = 2047; + if(i12>1) + maxval -= i12; + MAYUSE(scratch); + if(!(nextop&0xC0)) { + if((nextop&7)==4) { + uint8_t sib = F8; + int sib_reg = ((sib>>3)&0x7)+(rex.x<<3); + int sib_reg2 = (sib&0x7)+(rex.b<<3); + if((sib&0x7)==5) { + int64_t tmp = F32S; + if (sib_reg!=4) { + if(tmp && ((tmp<-2048) || (tmp>maxval) || !i12)) { + MOV64x(scratch, tmp); + if((sib>>6)) { + if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), scratch); else {SLLI(ret, xRAX+sib_reg, sib>>6); ADDW(ret, ret, scratch);} + } else + ADDW(ret, xRAX+sib_reg, scratch); + } else { + if(sib>>6) + SLLI(ret, xRAX+sib_reg, (sib>>6)); + else + ret = xRAX+sib_reg; + *fixaddress = tmp; + } + } else { + switch(lock) { + case 1: addLockAddress(tmp); break; + case 2: if(isLockAddress(tmp)) *l=1; break; + } + MOV64x(ret, tmp); + } + } else { + if (sib_reg!=4) { + if((sib>>6)) { + if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), xRAX+sib_reg2); else { SLLI(ret, xRAX+sib_reg, (sib>>6)); ADDW(ret, ret, xRAX+sib_reg2);} + } else + ADDW(ret, xRAX+sib_reg2, xRAX+sib_reg); + } else { + ret = xRAX+sib_reg2; + } + } + } else if((nextop&7)==5) { + uint32_t tmp = F32; + MOV32w(ret, 
tmp); + GETIP(addr+delta); + ADDW(ret, ret, xRIP); + switch(lock) { + case 1: addLockAddress(addr+delta+tmp); break; + case 2: if(isLockAddress(addr+delta+tmp)) *l=1; break; + } + } else { + ret = xRAX+(nextop&7)+(rex.b<<3); + if(ret==hint) { + AND(hint, ret, xMASK); //to clear upper part + } + } + } else { + int64_t i64; + uint8_t sib = 0; + int sib_reg = 0; + if((nextop&7)==4) { + sib = F8; + sib_reg = ((sib>>3)&7)+(rex.x<<3); + } + int sib_reg2 = (sib&0x07)+(rex.b<<3); + if(nextop&0x80) + i64 = F32S; + else + i64 = F8S; + if(i64==0 || ((i64>=-2048) && (i64<=2047) && i12)) { + *fixaddress = i64; + if((nextop&7)==4) { + if (sib_reg!=4) { + if(sib>>6) { + if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), xRAX+sib_reg2); else {SLLI(ret, xRAX+sib_reg, (sib>>6)); ADDW(ret, ret, xRAX+sib_reg2);} + } else + ADDW(ret, xRAX+sib_reg2, xRAX+sib_reg); + } else { + ret = xRAX+sib_reg2; + } + } else { + ret = xRAX+(nextop&0x07)+(rex.b<<3); + } + } else { + if(i64>=-2048 && i64<=2047) { + if((nextop&7)==4) { + if (sib_reg!=4) { + if(sib>>6) { + if(rv64_zba) SHxADDUW(scratch, xRAX+sib_reg, (sib>>6), xRAX+sib_reg2); else {SLLI(scratch, xRAX+sib_reg, sib>>6); ADDW(scratch, scratch, xRAX+sib_reg2);} + } else + ADDW(scratch, xRAX+sib_reg2, xRAX+sib_reg); + } else { + scratch = xRAX+sib_reg2; + } + } else + scratch = xRAX+(nextop&0x07)+(rex.b<<3); + ADDIW(ret, scratch, i64); + } else { + MOV32w(scratch, i64); + if((nextop&7)==4) { + if (sib_reg!=4) { + ADDW(scratch, scratch, xRAX+sib_reg2); + if(sib>>6) { + if(rv64_zba) SHxADDUW(ret, xRAX+sib_reg, (sib>>6), scratch); else {SLLI(ret, xRAX+sib_reg, (sib>>6)); ADDW(ret, ret, scratch);} + } else + ADDW(ret, scratch, xRAX+sib_reg); + } else { + PASS3(int tmp = xRAX+sib_reg2); + ADDW(ret, tmp, scratch); + } + } else { + PASS3(int tmp = xRAX+(nextop&0x07)+(rex.b<<3)); + ADDW(ret, tmp, scratch); + } + } + } + } + *ed = ret; + return addr; +} + void jump_to_epilog(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst) { MAYUSE(dyn); 
MAYUSE(ip); MAYUSE(ninst); @@ -233,8 +521,7 @@ void jump_to_next(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst) MAYUSE(tbl); TABLE64(x3, tbl); SRLI(x2, xRIP, JMPTABL_START3); - SLLI(x2, x2, 3); - ADD(x3, x3, x2); + if(rv64_zba) SH3ADD(x3, x2, x3); else {SLLI(x2, x2, 3); ADD(x3, x3, x2);} LD(x3, x3, 0); // could be LR_D(x3, x3, 1, 1); for better safety MOV64x(x4, JMPTABLE_MASK2<<3); // x4 = mask SRLI(x2, xRIP, JMPTABL_START2-3); @@ -256,8 +543,7 @@ void jump_to_next(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst) } AND(x2, xRIP, x4); } - SLLI(x2, x2, 3); - ADD(x3, x3, x2); + if(rv64_zba) SH3ADD(x3, x2, x3); else {SLLI(x2, x2, 3); ADD(x3, x3, x2);} LD(x2, x3, 0); //LR_D(x2, x3, 1, 1); } else { uintptr_t p = getJumpTableAddress64(ip); @@ -277,12 +563,12 @@ void jump_to_next(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst) JALR(x2); // save LR... } -void ret_to_epilog(dynarec_rv64_t* dyn, int ninst) +void ret_to_epilog(dynarec_rv64_t* dyn, int ninst, rex_t rex) { MAYUSE(dyn); MAYUSE(ninst); MESSAGE(LOG_DUMP, "Ret to epilog\n"); - POP1(xRIP); - MV(x1, xRIP); + POP1z(xRIP); + MVz(x1, xRIP); SMEND(); /*if(box64_dynarec_callret) { // pop the actual return address from RV64 stack @@ -297,8 +583,7 @@ void ret_to_epilog(dynarec_rv64_t* dyn, int ninst) uintptr_t tbl = getJumpTable64(); MOV64x(x3, tbl); SRLI(x2, xRIP, JMPTABL_START3); - SLLI(x2, x2, 3); - ADD(x3, x3, x2); + if(rv64_zba) SH3ADD(x3, x2, x3); else {SLLI(x2, x2, 3); ADD(x3, x3, x2);} LD(x3, x3, 0); MOV64x(x4, JMPTABLE_MASK2<<3); // x4 = mask SRLI(x2, xRIP, JMPTABL_START2-3); @@ -320,25 +605,24 @@ void ret_to_epilog(dynarec_rv64_t* dyn, int ninst) } AND(x2, xRIP, x4); } - SLLI(x2, x2, 3); - ADD(x3, x3, x2); + if(rv64_zba) SH3ADD(x3, x2, x3); else {SLLI(x2, x2, 3); ADD(x3, x3, x2);} LD(x2, x3, 0); JALR(x2); // save LR CLEARIP(); } -void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, int n) +void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, rex_t rex, int n) { MAYUSE(dyn); MAYUSE(ninst); 
MESSAGE(LOG_DUMP, "Retn to epilog\n"); - POP1(xRIP); + POP1z(xRIP); if(n>0x7ff) { MOV64x(w1, n); - ADD(xRSP, xRSP, x1); + ADDz(xRSP, xRSP, x1); } else { - ADDI(xRSP, xRSP, n); + ADDIz(xRSP, xRSP, n); } - MV(x1, xRIP); + MVz(x1, xRIP); SMEND(); /*if(box64_dynarec_callret) { // pop the actual return address from RV64 stack @@ -353,8 +637,7 @@ void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, int n) uintptr_t tbl = getJumpTable64(); MOV64x(x3, tbl); SRLI(x2, xRIP, JMPTABL_START3); - SLLI(x2, x2, 3); - ADD(x3, x3, x2); + if(rv64_zba) SH3ADD(x3, x2, x3); else {SLLI(x2, x2, 3); ADD(x3, x3, x2);} LD(x3, x3, 0); MOV64x(x4, JMPTABLE_MASK2<<3); // x4 = mask SRLI(x2, xRIP, JMPTABL_START2-3); @@ -376,8 +659,7 @@ void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, int n) } AND(x2, xRIP, x4); } - SLLI(x2, x2, 3); - ADD(x3, x3, x2); + if(rv64_zba) SH3ADD(x3, x2, x3); else {SLLI(x2, x2, 3); ADD(x3, x3, x2);} LD(x2, x3, 0); JALR(x2); // save LR CLEARIP(); @@ -388,26 +670,35 @@ void iret_to_epilog(dynarec_rv64_t* dyn, int ninst, int is64bits) //#warning TODO: is64bits MAYUSE(ninst); MESSAGE(LOG_DUMP, "IRet to epilog\n"); - // POP IP NOTEST(x2); - POP1(xRIP); - // POP CS - POP1(x2); + if(is64bits) { + POP1(xRIP); + POP1(x2); + POP1(xFlags); + } else { + POP1_32(xRIP); + POP1_32(x2); + POP1_32(xFlags); + } + SH(x2, xEmu, offsetof(x64emu_t, segs[_CS])); - MV(x1, xZR); - SD(x1, xEmu, offsetof(x64emu_t, segs_serial[_CS])); - SD(x1, xEmu, offsetof(x64emu_t, segs_serial[_SS])); - // POP EFLAGS - POP1(xFlags); + SW(xZR, xEmu, offsetof(x64emu_t, segs_serial[_CS])); + // clean EFLAGS MOV32w(x1, 0x3F7FD7); AND(xFlags, xFlags, x1); ORI(xFlags, xFlags, 0x2); SET_DFNONE(); // POP RSP - POP1(x3); + if (is64bits) { + POP1(x3); //rsp + POP1(x2); //ss + } else { + POP1_32(x3); //rsp + POP1_32(x2); //ss + } // POP SS - POP1(x2); SH(x2, xEmu, offsetof(x64emu_t, segs[_SS])); + SW(xZR, xEmu, offsetof(x64emu_t, segs_serial[_SS])); // set new RSP MV(xRSP, x3); // Ret.... 
@@ -434,6 +725,7 @@ void call_c(dynarec_rv64_t* dyn, int ninst, void* fnc, int reg, int ret, int sav // x5..x8, x10..x17, x28..x31 those needs to be saved by caller STORE_REG(RAX); STORE_REG(RCX); + STORE_REG(RDX); STORE_REG(R12); STORE_REG(R13); STORE_REG(R14); @@ -452,6 +744,7 @@ void call_c(dynarec_rv64_t* dyn, int ninst, void* fnc, int reg, int ret, int sav #define GO(A) if(ret!=x##A) {LOAD_REG(A);} GO(RAX); GO(RCX); + GO(RDX); GO(R12); GO(R13); GO(R14); @@ -703,16 +996,14 @@ void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, in for (int i=0; i<a; ++i) { SUBI(s2, s2, 1); ANDI(s2, s2, 7); // (emu->top + st)&7 - SLLI(s1, s2, 2); - ADD(s1, xEmu, s1); + if(rv64_zba) SH2ADD(s1, s2, xEmu); else {SLLI(s1, s2, 2); ADD(s1, xEmu, s1);} SW(s3, s1, offsetof(x64emu_t, p_regs)); } } else { // empty tags ADDI(s3, xZR, 0b11); for (int i=0; i<-a; ++i) { - SLLI(s1, s2, 2); - ADD(s1, xEmu, s1); + if(rv64_zba) SH2ADD(s1, s2, xEmu); else {SLLI(s1, s2, 2); ADD(s1, xEmu, s1);} SW(s3, s1, offsetof(x64emu_t, p_regs)); ADDI(s2, s2, 1); ANDI(s2, s2, 7); // (emu->top + st)&7 @@ -741,8 +1032,7 @@ void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, in #endif ADDI(s3, s2, dyn->e.x87cache[i]); ANDI(s3, s3, 7); // (emu->top + st)&7 - SLLI(s1, s3, 3); - ADD(s1, xEmu, s1); + if(rv64_zba) SH3ADD(s1, s3, xEmu); else {SLLI(s1, s3, 3); ADD(s1, xEmu, s1);} if(next) { // need to check if a ST_F need local promotion if(extcache_get_st_f(dyn, ninst, dyn->e.x87cache[i])>=0) { @@ -801,8 +1091,7 @@ static void x87_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int if(dyn->e.x87cache[i]!=-1) { ADDI(s3, s2, dyn->e.x87cache[i]); ANDI(s3, s3, 7); // (emu->top + i)&7 - SLLI(s1, s3, 3); - ADD(s1, xEmu, s1); + if(rv64_zba) SH3ADD(s1, s3, xEmu); else {SLLI(s1, s3, 3); ADD(s1, xEmu, s1);} if(extcache_get_st_f(dyn, ninst, dyn->e.x87cache[i])>=0) { FCVTDS(SCRATCH0, dyn->e.x87reg[i]); FSD(SCRATCH0, s1, offsetof(x64emu_t, x87)); @@ -834,7 +1123,7 @@ int 
x87_get_current_cache(dynarec_rv64_t* dyn, int ninst, int st, int t) for (int i=0; i<8; ++i) { if(dyn->e.x87cache[i]==st) { #if STEP == 1 - if(t==EXT_CACHE_ST_D && (dyn->e.extcache[dyn->e.x87reg[i]].t==EXT_CACHE_ST_F)) + if(t==EXT_CACHE_ST_D && (dyn->e.extcache[EXTIDX(dyn->e.x87reg[i])].t==EXT_CACHE_ST_F)) extcache_promote_double(dyn, ninst, st); #endif return i; @@ -866,8 +1155,7 @@ int x87_get_cache(dynarec_rv64_t* dyn, int ninst, int populate, int s1, int s2, ADDI(s2, s2, a); ANDI(s2, s2, 7); } - SLLI(s2, s2, 3); - ADD(s1, xEmu, s2); + if(rv64_zba) SH3ADD(s1, s2, xEmu); else {SLLI(s2, s2, 3); ADD(s1, xEmu, s2);} FLD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87)); } MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); @@ -912,7 +1200,7 @@ void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st) ANDI(s2, s2, 7); // (emu->top + i)&7 } ADD(s1, xEmu, s2); - if(dyn->e.extcache[dyn->e.x87reg[ret]].t==EXT_CACHE_ST_F) { + if(dyn->e.extcache[EXTIDX(dyn->e.x87reg[ret])].t==EXT_CACHE_ST_F) { FCVTDS(SCRATCH0, dyn->e.x87reg[ret]); FSD(SCRATCH0, s1, offsetof(x64emu_t, x87)); } else { @@ -932,23 +1220,24 @@ void x87_forget(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st) return; MESSAGE(LOG_DUMP, "\tForget x87 Cache for ST%d\n", st); #if STEP == 1 - if(dyn->e.extcache[dyn->e.x87reg[ret]].t==EXT_CACHE_ST_F) + if(dyn->e.extcache[EXTIDX(dyn->e.x87reg[ret])].t==EXT_CACHE_ST_F) extcache_promote_double(dyn, ninst, st); #endif // prepare offset to fpu => s1 // Get top LW(s2, xEmu, offsetof(x64emu_t, top)); // Update - if(st) { - ADDI(s2, s2, st); + int a = st - dyn->e.x87stack; + if(a) { + ADDI(s2, s2, a); ANDI(s2, s2, 7); // (emu->top + i)&7 } - ADD(s1, xEmu, s2); + if(rv64_zba) SH3ADD(s1, s2, xEmu); else {SLLI(s2, s2, 3); ADD(s1, xEmu, s2);} FSD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87)); MESSAGE(LOG_DUMP, "\t--------x87 Cache for ST%d\n", st); // and forget that cache fpu_free_reg(dyn, dyn->e.x87reg[ret]); - dyn->e.extcache[dyn->e.x87reg[ret]].v = 
0; + dyn->e.extcache[EXTIDX(dyn->e.x87reg[ret])].v = 0; dyn->e.x87cache[ret] = -1; dyn->e.x87reg[ret] = -1; } @@ -963,15 +1252,16 @@ void x87_reget_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st) // refresh the value MESSAGE(LOG_DUMP, "\tRefresh x87 Cache for ST%d\n", st); #if STEP == 1 - if(dyn->e.extcache[dyn->e.x87reg[i]].t==EXT_CACHE_ST_F) + if(dyn->e.extcache[EXTIDX(dyn->e.x87reg[i])].t==EXT_CACHE_ST_F) extcache_promote_double(dyn, ninst, st); #endif LW(s2, xEmu, offsetof(x64emu_t, top)); int a = st - dyn->e.x87stack; - ADDI(s2, s2, a); - AND(s2, s2, 7); - SLLI(s2, s2, 3); - ADD(s1, xEmu, s2); + if(a) { + ADDI(s2, s2, a); + AND(s2, s2, 7); + } + if(rv64_zba) SH3ADD(s1, s2, xEmu); else {SLLI(s2, s2, 3); ADD(s1, xEmu, s2);} FLD(dyn->e.x87reg[i], s1, offsetof(x64emu_t, x87)); MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); // ok @@ -991,8 +1281,7 @@ void x87_reget_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st) int a = st - dyn->e.x87stack; ADDI(s2, s2, a); ANDI(s2, s2, 7); // (emu->top + i)&7 - SLLI(s2, s2, 3); - ADD(s1, xEmu, s2); + if(rv64_zba) SH3ADD(s1, s2, xEmu); else {SLLI(s2, s2, 3); ADD(s1, xEmu, s2);} FLD(dyn->e.x87reg[ret], s1, offsetof(x64emu_t, x87)); MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); } @@ -1084,6 +1373,16 @@ static int isx87Empty(dynarec_rv64_t* dyn) return 1; } +// forget ext register for a MMX reg, does nothing if the regs is not loaded +void mmx_forget_reg(dynarec_rv64_t* dyn, int ninst, int a) +{ + if (dyn->e.mmxcache[a] == -1) + return; + FSD(dyn->e.mmxcache[a], xEmu, offsetof(x64emu_t, mmx[a])); + fpu_free_reg(dyn, dyn->e.mmxcache[a]); + return; +} + // get neon register for a MMX reg, create the entry if needed int mmx_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a) { @@ -1153,6 +1452,10 @@ int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single) // forget / reload if change of size if(dyn->e.ssecache[a].single!=single) { sse_forget_reg(dyn, ninst, 
a); + // update olds after the forget... + dyn->e.olds[a].changed = 1; + dyn->e.olds[a].purged = 0; + dyn->e.olds[a].single = 1-single; return sse_get_reg(dyn, ninst, s1, a, single); } return dyn->e.ssecache[a].reg; @@ -1176,6 +1479,10 @@ int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single) // need to wipe the half high 32bits of old Double because we now have a single //SW(xZR, xEmu, offsetof(x64emu_t, xmm[a])+4); } + dyn->e.olds[a].changed = 1; + dyn->e.olds[a].purged = 0; + dyn->e.olds[a].reg = EXTIDX(dyn->e.ssecache[a].reg); + dyn->e.olds[a].single = 1-single; dyn->e.ssecache[a].single = single; dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t = single?EXT_CACHE_SS:EXT_CACHE_SD; return dyn->e.ssecache[a].reg; @@ -1194,6 +1501,10 @@ void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a) else FSD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a])); fpu_free_reg(dyn, dyn->e.ssecache[a].reg); + dyn->e.olds[a].changed = 0; + dyn->e.olds[a].purged = 1; + dyn->e.olds[a].reg = dyn->e.ssecache[a].reg; + dyn->e.olds[a].single = dyn->e.ssecache[a].single; dyn->e.ssecache[a].v = -1; return; } @@ -1235,6 +1546,10 @@ static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1) FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); if(!next) { fpu_free_reg(dyn, dyn->e.ssecache[i].reg); + dyn->e.olds[i].changed = 0; + dyn->e.olds[i].purged = 1; + dyn->e.olds[i].reg = dyn->e.ssecache[i].reg; + dyn->e.olds[i].single = dyn->e.ssecache[i].single; dyn->e.ssecache[i].v = -1; } } @@ -1286,8 +1601,8 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) for(int i=17; i<24; ++i) if(dyn->e.extcache[i].v!=0) { switch(dyn->e.extcache[i].t) { - case EXT_CACHE_ST_F: - case EXT_CACHE_SS: + case EXT_CACHE_ST_F: + case EXT_CACHE_SS: FSW(EXTREG(i), xSP, p*8); break; default: @@ -1328,8 +1643,8 @@ void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07) for(int i=17; i<24; ++i) 
if(dyn->e.extcache[i].v!=0) { switch(dyn->e.extcache[i].t) { - case EXT_CACHE_ST_F: - case EXT_CACHE_SS: + case EXT_CACHE_ST_F: + case EXT_CACHE_SS: FLW(EXTREG(i), xSP, p*8); break; default: @@ -1387,7 +1702,7 @@ static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t * int j_single = 0; if(cache->extcache[j].t==EXT_CACHE_SS || cache->extcache[j].t==EXT_CACHE_ST_F) j_single =1; - + if(!cache->extcache[i].v) { // a mov is enough, no need to swap MESSAGE(LOG_DUMP, "\t - Moving %d <- %d\n", i, j); @@ -1454,12 +1769,12 @@ static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int FLD(reg, xEmu, offsetof(x64emu_t, xmm[n])); break; case EXT_CACHE_MM: - MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); FLD(reg, xEmu, offsetof(x64emu_t, mmx[i])); break; case EXT_CACHE_ST_D: case EXT_CACHE_ST_F: - MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); if((*s3_top) == 0xffff) { LW(s3, xEmu, offsetof(x64emu_t, top)); *s3_top = 0; @@ -1471,18 +1786,17 @@ static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int } *s3_top += a; *s2_val = 0; - SLLI(s2, s3, 3); - ADD(s2, xEmu, s2); + if(rv64_zba) SH3ADD(s2, s3, xEmu); else {SLLI(s2, s3, 3); ADD(s2, xEmu, s2);} FLD(reg, s2, offsetof(x64emu_t, x87)); if(t==EXT_CACHE_ST_F) { FCVTSD(reg, reg); } - break; + break; case EXT_CACHE_NONE: case EXT_CACHE_SCR: default: /* nothing done */ MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); - break; + break; } cache->extcache[i].n = n; cache->extcache[i].t = t; @@ -1501,12 +1815,12 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i FSD(reg, xEmu, offsetof(x64emu_t, xmm[n])); break; case EXT_CACHE_MM: - MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); FSD(reg, xEmu, 
offsetof(x64emu_t, mmx[n])); break; case EXT_CACHE_ST_D: case EXT_CACHE_ST_F: - MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); if((*s3_top)==0xffff) { LW(s3, xEmu, offsetof(x64emu_t, top)); *s3_top = 0; @@ -1517,19 +1831,18 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i ANDI(s3, s3, 7); } *s3_top += a; - SLLI(s2, s3, 3); - ADD(s2, xEmu, s2); + if(rv64_zba) SH3ADD(s2, s3, xEmu); else {SLLI(s2, s3, 3); ADD(s2, xEmu, s2);} *s2_val = 0; if(t==EXT_CACHE_ST_F) { FCVTDS(reg, reg); } FSD(reg, s2, offsetof(x64emu_t, x87)); - break; + break; case EXT_CACHE_NONE: case EXT_CACHE_SCR: default: /* nothing done */ MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); - break; + break; } cache->extcache[i].v = 0; } @@ -1678,18 +1991,18 @@ static void flagsCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1) int go = 0; switch (dyn->insts[jmp].f_entry.pending) { case SF_UNKNOWN: break; - case SF_SET: - if(dyn->f.pending!=SF_SET && dyn->f.pending!=SF_SET_PENDING) - go = 1; + case SF_SET: + if(dyn->f.pending!=SF_SET && dyn->f.pending!=SF_SET_PENDING) + go = 1; break; case SF_SET_PENDING: - if(dyn->f.pending!=SF_SET + if(dyn->f.pending!=SF_SET && dyn->f.pending!=SF_SET_PENDING - && dyn->f.pending!=SF_PENDING) - go = 1; + && dyn->f.pending!=SF_PENDING) + go = 1; break; case SF_PENDING: - if(dyn->f.pending!=SF_SET + if(dyn->f.pending!=SF_SET && dyn->f.pending!=SF_SET_PENDING && dyn->f.pending!=SF_PENDING) go = 1; @@ -1702,11 +2015,11 @@ static void flagsCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1) if(go) { if(dyn->f.pending!=SF_PENDING) { LW(s1, xEmu, offsetof(x64emu_t, df)); - j64 = (GETMARK3)-(dyn->native_size); + j64 = (GETMARKF2)-(dyn->native_size); BEQZ(s1, j64); } CALL_(UpdateFlags, -1, 0); - MARK3; + MARKF2; } #endif } @@ -1734,7 +2047,7 @@ void rv64_move32(dynarec_rv64_t* dyn, int ninst, int reg, int32_t val, int zerou LUI(reg, hi20); src = reg; 
} - if (lo12 || !hi20) ADDI(reg, src, lo12); + if (lo12 || !hi20) ADDIW(reg, src, lo12); if((zeroup && ((hi20&0x80000) || (!hi20 && (lo12&0x800))) || (!zeroup && !(val&0x80000000) && ((hi20&0x80000) || (!hi20 && (lo12&0x800)))))) { ZEROUP(reg); diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h index b12ee96b..0b1023b3 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.h +++ b/src/dynarec/rv64/dynarec_rv64_helper.h @@ -99,6 +99,25 @@ LD(x1, wback, fixedaddress); \ ed = x1; \ } +#define GETEDz(D) if(MODREG) { \ + ed = xRAX+(nextop&7)+(rex.b<<3); \ + wback = 0; \ + } else { \ + SMREAD() \ + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \ + LDz(x1, wback, fixedaddress); \ + ed = x1; \ + } +// GETED32 can use r1 for ed, and r2 for wback. wback is 0 if ed is xEAX..xEDI +#define GETED32(D) if(MODREG) { \ + ed = xRAX+(nextop&7)+(rex.b<<3); \ + wback = 0; \ + } else { \ + SMREAD() \ + addr = geted32(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \ + LDxw(x1, wback, fixedaddress); \ + ed = x1; \ + } //GETEDH can use hint for ed, and x1 or x2 for wback (depending on hint), might also use x3. wback is 0 if ed is xEAX..xEDI #define GETEDH(hint, D) if(MODREG) { \ ed = xRAX+(nextop&7)+(rex.b<<3); \ @@ -109,13 +128,23 @@ LDxw(hint, wback, fixedaddress); \ ed = hint; \ } +//GETEDW can use hint for wback and ret for ed. 
wback is 0 if ed is xEAX..xEDI +#define GETEDW(hint, ret, D) if(MODREG) { \ + ed = xRAX+(nextop&7)+(rex.b<<3); \ + MV(ret, ed); \ + wback = 0; \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &wback, (hint==x2)?x1:x2, (hint==x1)?x1:x3, &fixedaddress, rex, NULL, 0, D); \ + ed = ret; \ + LDxw(ed, wback, fixedaddress); \ + } // GETGW extract x64 register in gd, that is i -#define GETGW(i) gd = xRAX+((nextop&0x38)>>3)+(rex.r<<3); SLLI(i, gd, 48); SRLI(i, i, 48); gd = i; +#define GETGW(i) gd = xRAX+((nextop&0x38)>>3)+(rex.r<<3); ZEXTH(i, gd); gd = i; //GETEWW will use i for ed, and can use w for wback. #define GETEWW(w, i, D) if(MODREG) { \ wback = xRAX+(nextop&7)+(rex.b<<3);\ - SLLI(i, wback, 48); \ - SRLI(i, i, 48); \ + ZEXTH(i, wback); \ ed = i; \ wb1 = 0; \ } else { \ @@ -130,8 +159,7 @@ //GETSEW will use i for ed, and can use r3 for wback. This is the Signed version #define GETSEW(i, D) if(MODREG) { \ wback = xRAX+(nextop&7)+(rex.b<<3);\ - SLLI(i, wback, 48); \ - SRAI(i, i, 48); \ + if(rv64_zbb) SEXTH(i, wback); else {SLLI(i, wback, 48); SRAI(i, i, 48);}\ ed = i; \ wb1 = 0; \ } else { \ @@ -159,6 +187,7 @@ LDxw(x1, S, fixedaddress); \ ed = x1; \ } +#define WBACKO(O) if(wback) {ADD(O, wback, O); SDxw(ed, O, 0); SMWRITE2();} // FAKEED like GETED, but doesn't get anything #define FAKEED if(!MODREG) { \ @@ -191,6 +220,28 @@ wb1 = 1; \ ed = i; \ } +//GETEBO will use i for ed, i is also Offset, and can use r3 for wback. 
+#define GETEBO(i, D) if(MODREG) { \ + if(rex.rex) { \ + wback = xRAX+(nextop&7)+(rex.b<<3); \ + wb2 = 0; \ + } else { \ + wback = (nextop&7); \ + wb2 = (wback>>2)*8; \ + wback = xRAX+(wback&3); \ + } \ + if (wb2) {MV(i, wback); SRLI(i, i, wb2); ANDI(i, i, 0xff);} else {ANDI(i, wback, 0xff);} \ + wb1 = 0; \ + ed = i; \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \ + ADD(x3, wback, i); \ + if(wback!=x3) wback = x3; \ + LBU(i, wback, fixedaddress);\ + wb1 = 1; \ + ed = i; \ + } //GETSEB sign extend EB, will use i for ed, and can use r3 for wback. #define GETSEB(i, D) if(MODREG) { \ if(rex.rex) { \ @@ -213,6 +264,26 @@ wb1 = 1; \ ed = i; \ } +// GETEB32 will use i for ed, and can use r3 for wback. +#define GETEB32(i, D) if(MODREG) { \ + if(rex.rex) { \ + wback = xRAX+(nextop&7)+(rex.b<<3); \ + wb2 = 0; \ + } else { \ + wback = (nextop&7); \ + wb2 = (wback>>2)*8; \ + wback = xRAX+(wback&3); \ + } \ + if (wb2) {MV(i, wback); SRLI(i, i, wb2); ANDI(i, i, 0xff);} else {ANDI(i, wback, 0xff);} \ + wb1 = 0; \ + ed = i; \ + } else { \ + SMREAD(); \ + addr = geted32(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \ + LBU(i, wback, fixedaddress);\ + wb1 = 1; \ + ed = i; \ + } //GETGB will use i for gd #define GETGB(i) if(rex.rex) { \ @@ -228,7 +299,6 @@ // Write gb (gd) back to original register / memory, using s1 as scratch #define GBBACK(s1) if(gb2) { \ - assert(gb2 == 8); \ MOV64x(s1, 0xffffffffffff00ffLL); \ AND(gb1, gb1, s1); \ SLLI(s1, gd, 8); \ @@ -243,7 +313,6 @@ SB(ed, wback, fixedaddress); \ SMWRITE(); \ } else if(wb2) { \ - assert(wb2 == 8); \ MOV64x(s1, 0xffffffffffff00ffLL); \ AND(wback, wback, s1); \ if (c) {ANDI(ed, ed, 0xff);} \ @@ -309,31 +378,49 @@ } // Will get pointer to GX in general register a, will purge SS or SD if loaded. 
can use gback as load address -#define GETGX(a) \ - gd = ((nextop&0x38)>>3)+(rex.r<<3); \ - sse_forget_reg(dyn, ninst, gd); \ - gback = a; \ - ADDI(a, xEmu, offsetof(x64emu_t, xmm[gd])) +#define GETGX() \ + gd = ((nextop&0x38)>>3)+(rex.r<<3); \ + sse_forget_reg(dyn, ninst, gd); \ + gback = xEmu; \ + gdoffset = offsetof(x64emu_t, xmm[gd]) // Get Ex address in general register a, will purge SS or SD if it's reg and is loaded. May use x3. Use wback as load address! #define GETEX(a, D) \ if(MODREG) { \ ed = (nextop&7)+(rex.b<<3); \ sse_forget_reg(dyn, ninst, ed); \ - fixedaddress = 0; \ - ADDI(a, xEmu, offsetof(x64emu_t, xmm[ed])); \ - wback = a; \ + fixedaddress = offsetof(x64emu_t, xmm[ed]); \ + wback = xEmu; \ } else { \ SMREAD(); \ ed=16; \ addr = geted(dyn, addr, ninst, nextop, &wback, a, x3, &fixedaddress, rex, NULL, 1, D); \ } +#define GETGM() \ + gd = ((nextop&0x38)>>3); \ + mmx_forget_reg(dyn, ninst, gd); \ + gback = xEmu; \ + gdoffset = offsetof(x64emu_t, mmx[gd]) + +// Get EM, might use x3 +#define GETEM(a, D) \ + if(MODREG) { \ + ed = (nextop&7); \ + mmx_forget_reg(dyn, ninst, ed); \ + fixedaddress = offsetof(x64emu_t, mmx[ed]); \ + wback = xEmu; \ + } else { \ + SMREAD(); \ + ed=8; \ + addr = geted(dyn, addr, ninst, nextop, &wback, a, x3, &fixedaddress, rex, NULL, 1, D); \ + } + #define SSE_LOOP_D_ITEM(GX1, EX1, F, i) \ - LWU(GX1, gback, i*4); \ + LWU(GX1, gback, gdoffset+i*4); \ LWU(EX1, wback, fixedaddress+i*4); \ F; \ - SW(GX1, gback, i*4); + SW(GX1, gback, gdoffset+i*4); // Loop for SSE opcode that use 32bits value and write to GX. #define SSE_LOOP_D(GX1, EX1, F) \ @@ -343,10 +430,10 @@ SSE_LOOP_D_ITEM(GX1, EX1, F, 3) #define SSE_LOOP_DS_ITEM(GX1, EX1, F, i) \ - LW(GX1, gback, i*4); \ + LW(GX1, gback, gdoffset+i*4); \ LW(EX1, wback, fixedaddress+i*4); \ F; \ - SW(GX1, gback, i*4); + SW(GX1, gback, gdoffset+i*4); // Loop for SSE opcode that use 32bits value and write to GX. 
#define SSE_LOOP_DS(GX1, EX1, F) \ @@ -355,20 +442,28 @@ SSE_LOOP_DS_ITEM(GX1, EX1, F, 2) \ SSE_LOOP_DS_ITEM(GX1, EX1, F, 3) +#define MMX_LOOP_W(GX1, EX1, F) \ + for (int i=0; i<4; ++i) { \ + LHU(GX1, gback, gdoffset+i*2); \ + LHU(EX1, wback, fixedaddress+i*2); \ + F; \ + SH(GX1, gback, gdoffset+i*2); \ + } + #define SSE_LOOP_W(GX1, EX1, F) \ for (int i=0; i<8; ++i) { \ - LHU(GX1, gback, i*2); \ + LHU(GX1, gback, gdoffset+i*2); \ LHU(EX1, wback, fixedaddress+i*2); \ F; \ - SH(GX1, gback, i*2); \ + SH(GX1, gback, gdoffset+i*2); \ } #define SSE_LOOP_WS(GX1, EX1, F) \ for (int i=0; i<8; ++i) { \ - LH(GX1, gback, i*2); \ + LH(GX1, gback, gdoffset+i*2); \ LH(EX1, wback, fixedaddress+i*2); \ F; \ - SH(GX1, gback, i*2); \ + SH(GX1, gback, gdoffset+i*2); \ } #define SSE_LOOP_D_S_ITEM(EX1, F, i) \ @@ -384,10 +479,10 @@ SSE_LOOP_D_S_ITEM(EX1, F, 3) #define SSE_LOOP_Q_ITEM(GX1, EX1, F, i) \ - LD(GX1, gback, i*8); \ + LD(GX1, gback, gdoffset+i*8); \ LD(EX1, wback, fixedaddress+i*8); \ F; \ - SD(GX1, gback, i*8); + SD(GX1, gback, gdoffset+i*8); // Loop for SSE opcode that use 64bits value and write to GX. #define SSE_LOOP_Q(GX1, EX1, F) \ @@ -396,10 +491,10 @@ #define SSE_LOOP_FQ_ITEM(GX1, EX1, F, i) \ - FLD(v0, gback, i*8); \ + FLD(v0, gback, gdoffset+i*8); \ FLD(v1, wback, fixedaddress+i*8); \ F; \ - FSD(v0, gback, i*8); + FSD(v0, gback, gdoffset+i*8); #define SSE_LOOP_FQ(GX1, EX1, F) \ v0 = fpu_get_scratch(dyn); \ @@ -410,7 +505,7 @@ #define SSE_LOOP_MV_Q_ITEM(s, i) \ LD(s, wback, fixedaddress+i*8); \ - SD(s, gback, i*8); + SD(s, gback, gdoffset+i*8); // Loop for SSE opcode that moves 64bits value from wback to gback, use s as scratch. #define SSE_LOOP_MV_Q(s) \ @@ -418,7 +513,7 @@ SSE_LOOP_MV_Q_ITEM(s, 1) #define SSE_LOOP_MV_Q_ITEM2(s, i) \ - LD(s, gback, i*8); \ + LD(s, gback, gdoffset+i*8); \ SD(s, wback, fixedaddress+i*8); // Loop for SSE opcode that moves 64bits value from gback to wback, use s as scratch. 
@@ -436,17 +531,19 @@ // R0 will not be pushed/popd if ret is -2. Flags are not save/restored #define CALL_S(F, ret) call_c(dyn, ninst, F, x6, ret, 0, 0) -#define MARK dyn->insts[ninst].mark = dyn->native_size -#define GETMARK dyn->insts[ninst].mark -#define MARK2 dyn->insts[ninst].mark2 = dyn->native_size -#define GETMARK2 dyn->insts[ninst].mark2 -#define MARK3 dyn->insts[ninst].mark3 = dyn->native_size -#define GETMARK3 dyn->insts[ninst].mark3 -#define MARKF dyn->insts[ninst].markf = dyn->native_size -#define GETMARKF dyn->insts[ninst].markf -#define MARKSEG dyn->insts[ninst].markseg = dyn->native_size -#define GETMARKSEG dyn->insts[ninst].markseg -#define MARKLOCK dyn->insts[ninst].marklock = dyn->native_size +#define MARK dyn->insts[ninst].mark = dyn->native_size +#define GETMARK dyn->insts[ninst].mark +#define MARK2 dyn->insts[ninst].mark2 = dyn->native_size +#define GETMARK2 dyn->insts[ninst].mark2 +#define MARK3 dyn->insts[ninst].mark3 = dyn->native_size +#define GETMARK3 dyn->insts[ninst].mark3 +#define MARKF dyn->insts[ninst].markf = dyn->native_size +#define GETMARKF dyn->insts[ninst].markf +#define MARKF2 dyn->insts[ninst].markf2 = dyn->native_size +#define GETMARKF2 dyn->insts[ninst].markf2 +#define MARKSEG dyn->insts[ninst].markseg = dyn->native_size +#define GETMARKSEG dyn->insts[ninst].markseg +#define MARKLOCK dyn->insts[ninst].marklock = dyn->native_size #define GETMARKLOCK dyn->insts[ninst].marklock #define Bxx_gen(OP, M, reg1, reg2) \ @@ -526,7 +623,7 @@ #define STORE_REG(A) SD(x##A, xEmu, offsetof(x64emu_t, regs[_##A])) #define LOAD_REG(A) LD(x##A, xEmu, offsetof(x64emu_t, regs[_##A])) -// Need to also store current value of some register, as they may be used by functions like setjump +// Need to also store current value of some register, as they may be used by functions like setjmp #define STORE_XEMU_CALL() \ STORE_REG(RBX); \ STORE_REG(RDX); \ @@ -606,11 +703,11 @@ // Adjust the xFlags bit 5 -> bit 11, src and dst can be the same (and can be 
xFlags, but not s1) #define FLAGS_ADJUST_TO11(dst, src, s1) \ - MOV64x(s1, ~(1<<11)); \ - AND(dst, src, s1); \ - ANDI(s1, dst, 1<<5); \ - SLLI(s1, s1, 11-5); \ - ANDI(dst, dst, ~(1<<5)); \ + LUI(s1, 0xFFFFF); \ + ADDIW(s1, s1, 0x7DF); \ + AND(s1, src, s1); \ + ANDI(dst, src, 1<<5); \ + SLLI(dst, dst, 11-5); \ OR(dst, dst, s1) #ifndef MAYSETFLAGS @@ -721,8 +818,8 @@ #define MODREG ((nextop&0xC0)==0xC0) -void rv64_epilog(); -void rv64_epilog_fast(); +void rv64_epilog(void); +void rv64_epilog_fast(void); void* rv64_next(x64emu_t* emu, uintptr_t addr); #ifndef STEPNAME @@ -863,6 +960,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr); #define sse_setround STEPNAME(sse_setround) #define mmx_get_reg STEPNAME(mmx_get_reg) #define mmx_get_reg_empty STEPNAME(mmx_get_reg_empty) +#define mmx_forget_reg STEPNAME(mmx_forget_reg) #define sse_get_reg STEPNAME(sse_get_reg) #define sse_get_reg_empty STEPNAME(sse_get_reg_empty) #define sse_forget_reg STEPNAME(sse_forget_reg) @@ -888,7 +986,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr); uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta); /* setup r2 to address pointed by */ -//uintptr_t geted32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int absmax, uint32_t mask, rex_t rex, int* l, int s, int delta); +uintptr_t geted32(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta); /* setup r2 to address pointed by */ //uintptr_t geted16(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int absmax, uint32_t mask, int s); @@ -898,8 +996,8 @@ uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, void jump_to_epilog(dynarec_rv64_t* dyn, 
uintptr_t ip, int reg, int ninst); void jump_to_epilog_fast(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst); void jump_to_next(dynarec_rv64_t* dyn, uintptr_t ip, int reg, int ninst); -void ret_to_epilog(dynarec_rv64_t* dyn, int ninst); -void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, int n); +void ret_to_epilog(dynarec_rv64_t* dyn, int ninst, rex_t rex); +void retn_to_epilog(dynarec_rv64_t* dyn, int ninst, rex_t rex, int n); void iret_to_epilog(dynarec_rv64_t* dyn, int ninst, int is64bits); void call_c(dynarec_rv64_t* dyn, int ninst, void* fnc, int reg, int ret, int saveflags, int save_reg); void call_n(dynarec_rv64_t* dyn, int ninst, void* fnc, int w); @@ -950,10 +1048,10 @@ void emit_inc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4); void emit_dec32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5); void emit_dec16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); void emit_dec8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4); -void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5); +void emit_adc32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5, int s6); //void emit_adc32c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4); -//void emit_adc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4); -//void emit_adc8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5); +void emit_adc8(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_adc8c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5, int s6); void emit_adc16(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); //void emit_adc16c(dynarec_rv64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4); void emit_sbb32(dynarec_rv64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5); @@ -1047,12 
+1145,20 @@ int extcache_st_coherency(dynarec_rv64_t* dyn, int ninst, int a, int b); #define X87_ST(A) extcache_get_st(dyn, ninst, A) #endif +//MMX helpers +// get float register for a MMX reg, create the entry if needed +int mmx_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a); +// get float register for a MMX reg, but don't try to synch it if it needed to be created +int mmx_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int a); +// forget float register for a MMX reg, create the entry if needed +void mmx_forget_reg(dynarec_rv64_t* dyn, int ninst, int a); + //SSE/SSE2 helpers -// get neon register for a SSE reg, create the entry if needed +// get float register for a SSE reg, create the entry if needed int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single); -// get neon register for a SSE reg, but don't try to synch it if it needed to be created +// get float register for a SSE reg, but don't try to synch it if it needed to be created int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single); -// forget neon register for a SSE reg, create the entry if needed +// forget float register for a SSE reg, create the entry if needed void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a); // purge the XMM0..XMM7 cache (before function call) void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1); @@ -1085,19 +1191,19 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni uintptr_t dynarec64_64(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int seg, int* ok, int* need_epilog); //uintptr_t dynarec64_65(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep,int* ok, int* need_epilog); uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); -//uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, 
int ninst, rex_t rex, int rep, int* ok, int* need_epilog); +uintptr_t dynarec64_67(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); uintptr_t dynarec64_D8(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); //uintptr_t dynarec64_DA(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); uintptr_t dynarec64_DB(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); -//uintptr_t dynarec64_DC(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); -//uintptr_t dynarec64_DD(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); +uintptr_t dynarec64_DC(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); +uintptr_t dynarec64_DD(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); uintptr_t dynarec64_DE(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); uintptr_t dynarec64_DF(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); -//uintptr_t dynarec64_6664(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog); -//uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, 
uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); +uintptr_t dynarec64_6664(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog); +uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog); uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); @@ -1231,4 +1337,12 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int SW(s2, xEmu, offsetof(x64emu_t, test.test)); \ } +#define GETREX() \ + rex.rex = 0; \ + if(!rex.is32bits) \ + while(opcode>=0x40 && opcode<=0x4f) { \ + rex.rex = opcode; \ + opcode = F8; \ + } + #endif //__DYNAREC_RV64_HELPER_H__ diff --git a/src/dynarec/rv64/dynarec_rv64_pass0.h b/src/dynarec/rv64/dynarec_rv64_pass0.h index b07162eb..fbba8f22 100644 --- a/src/dynarec/rv64/dynarec_rv64_pass0.h +++ b/src/dynarec/rv64/dynarec_rv64_pass0.h @@ -22,13 +22,14 @@ #define NEW_INST \ ++dyn->size; \ if(dyn->size+3>=dyn->cap) { \ - dyn->insts = (instruction_native_t*)customRealloc(dyn->insts, sizeof(instruction_native_t)*dyn->cap*2);\ + dyn->insts = (instruction_native_t*)dynaRealloc(dyn->insts, sizeof(instruction_native_t)*dyn->cap*2);\ memset(&dyn->insts[dyn->cap], 0, sizeof(instruction_native_t)*dyn->cap); \ dyn->cap *= 2; \ } \ dyn->insts[ninst].x64.addr = ip; \ dyn->e.combined1 = dyn->e.combined2 = 0;\ dyn->e.swapped = 0; dyn->e.barrier = 0; \ + for(int i=0; i<16; ++i) dyn->e.olds[i].v = 0;\ dyn->insts[ninst].f_entry = dyn->f; \ if(ninst) {dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr;} @@ -40,9 +41,10 @@ #define DEFAULT \ --dyn->size; \ *ok = -1; \ - if(box64_dynarec_log>=LOG_INFO) {\ - dynarec_log(LOG_NONE, "%p: Dynarec stopped because of 
Opcode %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X", \ - (void*)ip, PKip(0), \ + if(box64_dynarec_log>=LOG_INFO || box64_dynarec_dump || box64_dynarec_missing) {\ + dynarec_log(LOG_NONE, "%p: Dynarec stopped because of %sOpcode %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X", \ + (void*)ip, rex.is32bits?"32bits ":"",\ + PKip(0), \ PKip(1), PKip(2), PKip(3), \ PKip(4), PKip(5), PKip(6), \ PKip(7), PKip(8), PKip(9), \ diff --git a/src/dynarec/rv64/dynarec_rv64_pass1.h b/src/dynarec/rv64/dynarec_rv64_pass1.h index c818c26c..34d0a468 100644 --- a/src/dynarec/rv64/dynarec_rv64_pass1.h +++ b/src/dynarec/rv64/dynarec_rv64_pass1.h @@ -5,6 +5,7 @@ #define NEW_INST \ dyn->insts[ninst].f_entry = dyn->f; \ dyn->e.combined1 = dyn->e.combined2 = 0;\ + for(int i=0; i<16; ++i) dyn->e.olds[i].v = 0;\ dyn->e.swapped = 0; dyn->e.barrier = 0 #define INST_EPILOG \ diff --git a/src/dynarec/rv64/dynarec_rv64_pass2.h b/src/dynarec/rv64/dynarec_rv64_pass2.h index d71f9180..1c6e4734 100644 --- a/src/dynarec/rv64/dynarec_rv64_pass2.h +++ b/src/dynarec/rv64/dynarec_rv64_pass2.h @@ -2,7 +2,7 @@ #define FINI \ if(ninst) { \ dyn->insts[ninst].address = (dyn->insts[ninst-1].address+dyn->insts[ninst-1].size); \ - dyn->insts_size += 1+((dyn->insts[ninst].x64.size>dyn->insts[ninst].size)?dyn->insts[ninst].x64.size:dyn->insts[ninst].size)/15; \ + dyn->insts_size += 1+((dyn->insts[ninst].x64.size>(dyn->insts[ninst].size/4))?dyn->insts[ninst].x64.size:(dyn->insts[ninst].size/4))/15; \ } #define MESSAGE(A, ...) 
@@ -10,7 +10,7 @@ #define NEW_INST \ if(ninst) { \ dyn->insts[ninst].address = (dyn->insts[ninst-1].address+dyn->insts[ninst-1].size); \ - dyn->insts_size += 1+((dyn->insts[ninst-1].x64.size>dyn->insts[ninst-1].size)?dyn->insts[ninst-1].x64.size:dyn->insts[ninst-1].size)/15; \ + dyn->insts_size += 1+((dyn->insts[ninst-1].x64.size>(dyn->insts[ninst-1].size/4))?dyn->insts[ninst-1].x64.size:(dyn->insts[ninst-1].size/4))/15; \ } #define INST_EPILOG dyn->insts[ninst].epilog = dyn->native_size; #define INST_NAME(name) diff --git a/src/dynarec/rv64/dynarec_rv64_pass3.h b/src/dynarec/rv64/dynarec_rv64_pass3.h index dafef0c5..459c4e13 100644 --- a/src/dynarec/rv64/dynarec_rv64_pass3.h +++ b/src/dynarec/rv64/dynarec_rv64_pass3.h @@ -1,4 +1,4 @@ -#define INIT +#define INIT #define FINI \ if(ninst) \ addInst(dyn->instsize, &dyn->insts_size, dyn->insts[ninst].x64.size, dyn->insts[ninst].size/4); \ @@ -16,8 +16,8 @@ if(box64_dynarec_dump) print_newinst(dyn, ninst); \ if(ninst) \ addInst(dyn->instsize, &dyn->insts_size, dyn->insts[ninst-1].x64.size, dyn->insts[ninst-1].size/4); -#define INST_EPILOG -#define INST_NAME(name) inst_name_pass3(dyn, ninst, name) +#define INST_EPILOG +#define INST_NAME(name) inst_name_pass3(dyn, ninst, name, rex) #define TABLE64(A, V) {int val64offset = Table64(dyn, (V), 3); MESSAGE(LOG_DUMP, " Table64: 0x%lx\n", (V)); AUIPC(A, SPLIT20(val64offset)); LD(A, A, SPLIT12(val64offset));} #define FTABLE64(A, V) {mmx87_regs_t v = {.d = V}; int val64offset = Table64(dyn, v.q, 3); MESSAGE(LOG_DUMP, " FTable64: %g\n", v.d); AUIPC(x1, SPLIT20(val64offset)); FLD(A, x1, SPLIT12(val64offset));} diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h index 01657427..b9cbb2af 100644 --- a/src/dynarec/rv64/dynarec_rv64_private.h +++ b/src/dynarec/rv64/dynarec_rv64_private.h @@ -31,6 +31,15 @@ typedef union sse_cache_s { uint8_t single:1; }; } sse_cache_t; +typedef union sse_old_s { + int8_t v; + struct { + uint8_t changed:1; + 
uint8_t purged:1; + uint8_t reg:4; + uint8_t single:1; + }; +} sse_old_t; typedef struct extcache_s { // ext cache ext_cache_t extcache[24]; @@ -43,6 +52,7 @@ typedef struct extcache_s { uint8_t swapped; // the combined reg were swapped uint8_t barrier; // is there a barrier at instruction epilog? uint32_t news; // bitmask, wich neoncache are new for this opcode + sse_old_t olds[16]; // SSE regs has changed or has been removed // fpu cache int8_t x87cache[8]; // cache status for the 8 x87 register behind the fpu stack int8_t x87reg[8]; // reg used for x87cache entry @@ -70,7 +80,7 @@ typedef struct instruction_rv64_s { int pred_sz; // size of predecessor list int *pred; // predecessor array uintptr_t mark, mark2, mark3; - uintptr_t markf; + uintptr_t markf, markf2; uintptr_t markseg; uintptr_t marklock; int pass2choice;// value for choices that are fixed on pass2 for pass3 diff --git a/src/dynarec/rv64/rv64_emitter.h b/src/dynarec/rv64/rv64_emitter.h index 29336895..e9fa2f6d 100644 --- a/src/dynarec/rv64/rv64_emitter.h +++ b/src/dynarec/rv64/rv64_emitter.h @@ -74,6 +74,7 @@ f28–31 ft8–11 FP temporaries Caller #define x4 14 #define x5 15 #define x6 6 +#define x9 9 // used to clear the upper 32bits #define xMASK 5 // 32bits version of scratch @@ -112,6 +113,7 @@ f28–31 ft8–11 FP temporaries Caller #define MOV64x(A, B) rv64_move64(dyn, ninst, A, B) #define MOV32w(A, B) rv64_move32(dyn, ninst, A, B, 1) #define MOV64xw(A, B) if(rex.w) {MOV64x(A, B);} else {MOV32w(A, B);} +#define MOV64z(A, B) if(rex.is32bits) {MOV32w(A, B);} else {MOV64x(A, B);} // ZERO the upper part #define ZEROUP(r) AND(r, r, xMASK) @@ -174,12 +176,16 @@ f28–31 ft8–11 FP temporaries Caller #define ADDW(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, 0b0111011)) // rd = rs1 + rs2 #define ADDxw(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, rex.w?0b0110011:0b0111011)) +// rd = rs1 + rs2 +#define ADDz(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b000, rd, 
rex.is32bits?0b0111011:0b0110011)) // rd = rs1 - rs2 #define SUB(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, 0b0110011)) // rd = rs1 - rs2 #define SUBW(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, 0b0111011)) // rd = rs1 - rs2 #define SUBxw(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, rex.w?0b0110011:0b0111011)) +// rd = rs1 - rs2 +#define SUBz(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b000, rd, rex.is32bits?0b0111011:0b0110011)) // rd = rs1<<rs2 #define SLL(rd, rs1, rs2) EMIT(R_type(0b0000000, rs2, rs1, 0b001, rd, 0b0110011)) // rd = (rs1<rs2)?1:0 @@ -202,7 +208,9 @@ f28–31 ft8–11 FP temporaries Caller // rd = rs1 (pseudo instruction) #define MV(rd, rs1) ADDI(rd, rs1, 0) // rd = rs1 (pseudo instruction) -#define MVxw(rd, rs1) if(rex.w) {MV(rd, rs1); } else {AND(rd, rs1, xMASK);} +#define MVxw(rd, rs1) if(rex.w) {MV(rd, rs1);} else {AND(rd, rs1, xMASK);} +// rd = rs1 (pseudo instruction) +#define MVz(rd, rs1) if(rex.is32bits) {AND(rd, rs1, xMASK);} else {MV(rd, rs1);} // rd = !rs1 #define NOT(rd, rs1) XORI(rd, rs1, -1) // rd = -rs1 @@ -254,7 +262,12 @@ f28–31 ft8–11 FP temporaries Caller #define SW(rs2, rs1, imm12) EMIT(S_type(imm12, rs2, rs1, 0b010, 0b0100011)) #define PUSH1(reg) do {SD(reg, xRSP, -8); SUBI(xRSP, xRSP, 8);} while(0) -#define POP1(reg) do {LD(reg, xRSP, 0); ADDI(xRSP, xRSP, 8);}while(0) +#define POP1(reg) do {LD(reg, xRSP, 0); if (reg!=xRSP) ADDI(xRSP, xRSP, 8);} while(0) +#define PUSH1_32(reg) do {SW(reg, xRSP, -4); SUBIW(xRSP, xRSP, 4);} while(0) +#define POP1_32(reg) do {LWU(reg, xRSP, 0); if (reg!=xRSP) ADDIW(xRSP, xRSP, 4);} while(0) + +#define POP1z(reg) if(rex.is32bits) {POP1_32(reg);} else {POP1(reg);} +#define PUSH1z(reg) if(rex.is32bits) {PUSH1_32(reg);} else {PUSH1(reg);} #define FENCE_gen(pred, succ) (((pred)<<24) | ((succ)<<20) | 0b0001111) #define FENCE() EMIT(FENCE_gen(3, 3)) @@ -271,10 +284,14 @@ f28–31 ft8–11 FP temporaries Caller #define LD(rd, rs1, imm12) EMIT(I_type(imm12, rs1, 
0b011, rd, 0b0000011)) // rd = [rs1 + imm12] #define LDxw(rd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b011<<(1-rex.w), rd, 0b0000011)) +// rd = [rs1 + imm12] +#define LDz(rd, rs1, imm12) EMIT(I_type(imm12, rs1, 0b011<<rex.is32bits, rd, 0b0000011)) // [rs1 + imm12] = rs2 #define SD(rs2, rs1, imm12) EMIT(S_type(imm12, rs2, rs1, 0b011, 0b0100011)) // [rs1 + imm12] = rs2 #define SDxw(rs2, rs1, imm12) EMIT(S_type(imm12, rs2, rs1, 0b010+rex.w, 0b0100011)) +// [rs1 + imm12] = rs2 +#define SDz(rs2, rs1, imm12) EMIT(S_type(imm12, rs2, rs1, 0b010+(1-rex.is32bits), 0b0100011)) // Shift Left Immediate #define SLLI(rd, rs1, imm6) EMIT(I_type(imm6, rs1, 0b001, rd, 0b0010011)) @@ -285,8 +302,12 @@ f28–31 ft8–11 FP temporaries Caller // rd = rs1 + imm12 #define ADDIW(rd, rs1, imm12) EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, 0b0011011)) +// rd = rs1 - imm12 +#define SUBIW(rd, rs1, imm12) EMIT(I_type((-imm12)&0b111111111111, rs1, 0b000, rd, 0b0011011)) // rd = rs1 + imm12 #define ADDIxw(rd, rs1, imm12) EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, rex.w?0b0010011:0b0011011)) +// rd = rs1 + imm12 +#define ADDIz(rd, rs1, imm12) EMIT(I_type((imm12)&0b111111111111, rs1, 0b000, rd, rex.is32bits?0b0011011:0b0010011)) #define SEXT_W(rd, rs1) ADDIW(rd, rs1, 0) @@ -359,6 +380,8 @@ f28–31 ft8–11 FP temporaries Caller #define LR_W(rd, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010, rd, 0b0101111)) #define SC_W(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b010, rd, 0b0101111)) +#define AMOSWAP_W(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00001, aq, rl), rs2, rs1, 0b010, rd, 0b0101111)) + // RV64A #define LR_D(rd, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b011, rd, 0b0101111)) #define SC_D(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b011, rd, 0b0101111)) @@ -366,6 +389,8 @@ f28–31 ft8–11 FP temporaries Caller #define LRxw(rd, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00010, aq, rl), 0, rs1, 0b010|rex.w, rd, 
0b0101111)) #define SCxw(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00011, aq, rl), rs2, rs1, 0b010|rex.w, rd, 0b0101111)) +#define AMOSWAP_D(rd, rs2, rs1, aq, rl) EMIT(R_type(AQ_RL(0b00001, aq, rl), rs2, rs1, 0b011, rd, 0b0101111)) + // RV32F // Read round mode #define FRRM(rd) CSRRS(rd, xZR, 0x002) @@ -509,4 +534,120 @@ f28–31 ft8–11 FP temporaries Caller // Convert from Double to unsigned integer #define FCVTLUDxw(rd, frs1, rm) EMIT(R_type(0b1100001, 0b00001+(rex.w?0b10:0b00), frs1, rm, rd, 0b1010011)) +//Zba +// Add unsigned word (Wz(rs1) + X(rs2)) +#define ADDUW(rd, rs1, rs2) EMIT(R_type(0b0000100, rs2, rs1, 0b000, rd, 0b0111011)) +// Zero-extend Word +#define ZEXTW(rd, rs1) ADDUW(rd, rs1, xZR) +// Shift left by 1 and add (rd = X(rs2) + X(rs1)<<1) +#define SH1ADD(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b010, rd, 0b0110011)) +// Shift unsigned word left by 1 and add (rd = X(rs2) + Wz(rs1)<<1) +#define SH1ADDUW(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b010, rd, 0b0111011)) +// Shift left by 2 and add (rd = X(rs2) + X(rs1)<<2) +#define SH2ADD(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b100, rd, 0b0110011)) +// Shift unsigned word left by 2 and add (rd = X(rs2) + Wz(rs1)<<2) +#define SH2ADDUW(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b100, rd, 0b0111011)) +// Shift left by 3 and add (rd = X(rs2) + X(rs1)<<3) +#define SH3ADD(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b110, rd, 0b0110011)) +// Shift unsigned word left by 3 and add (rd = X(rs2) + Wz(rs1)<<3) +#define SH3ADDUW(rd, rs1, rs2) EMIT(R_type(0b0010000, rs2, rs1, 0b110, rd, 0b0111011)) +// Shift left unsigned word (immediate) +#define SLLIUW(rd, rs1, imm) EMIT(R_type(0b0000100, imm, rs1, 0b001, rd, 0b0011011)) +// Shift left by 1,2 or 3 and add (rd = X(rs2) + X(rs1)<<x) +#define SHxADD(rd, rs1, x, rs2) EMIT(R_type(0b0010000, rs2, rs1, (x)<<1, rd, 0b0110011)) +// Shift unsigned word left by 1,2 or 3 and add (rd = X(rs2) + Wz(rs1)<<x) +#define SHxADDUW(rd, rs1, x, rs2) 
EMIT(R_type(0b0010000, rs2, rs1, (x)<<1, rd, 0b0111011)) + +//Zbb +// AND with inverted operand (rs1 & ~rs2) +#define ANDN(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b111, rd, 0b0110011)) +// OR with inverted operand (rs1 | ~rs2) +#define ORN(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b110, rd, 0b0110011)) +// Exclusive NOR (~(rs1 ^ rs2)) +#define XNOR(rd, rs1, rs2) EMIT(R_type(0b0100000, rs2, rs1, 0b100, rd, 0b0110011)) +// Count leading zero bits +#define CLZ(rd, rs) EMIT(R_type(0b0110000, 0b00000, rs, 0b001, rd, 0b0010011)) +// Count leading zero bits in word +#define CLZW(rd, rs) EMIT(R_type(0b0110000, 0b00000, rs, 0b001, rd, 0b0011011)) +// Count leading zero bits +#define CLZxw(rd, rs) EMIT(R_type(0b0110000, 0b00000, rs, 0b001, rd, rex.w?0b0010011:0b0011011)) +// Count trailing zero bits +#define CTZ(rd, rs) EMIT(R_type(0b0110000, 0b00001, rs, 0b001, rd, 0b0010011)) +// Count trailing zero bits in word +#define CTZW(rd, rs) EMIT(R_type(0b0110000, 0b00001, rs, 0b001, rd, 0b0011011)) +// Count trailing zero bits +#define CTZxw(rd, rs) EMIT(R_type(0b0110000, 0b00001, rs, 0b001, rd, rex.w?0b0010011:0b0011011)) +// Count set bits +#define CPOP(rd, rs) EMIT(R_type(0b0110000, 0b00010, rs, 0b001, rd, 0b0010011)) +// Count set bits in word +#define CPOPW(rd, rs) EMIT(R_type(0b0110000, 0b00010, rs, 0b001, rd, 0b0011011)) +// Count set bits +#define CPOPxw(rd, rs) EMIT(R_type(0b0110000, 0b00010, rs, 0b001, rd, rex.w?0b0010011:0b0011011)) +// Maximum +#define MAX(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b110, rd, 0b0110011)) +// Unsigned maximum +#define MAXU(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b111, rd, 0b0110011)) +// Minimum +#define MIN(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b100, rd, 0b0110011)) +// Unsigned minimum +#define MINU(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b101, rd, 0b0110011)) +// Sign-extend byte +#define SEXTB(rd, rs) EMIT(R_type(0b0110000, 0b00100, rs, 0b001, rd, 0b0010011)) +// Sign-extend half-word 
+#define SEXTH(rd, rs) EMIT(R_type(0b0110000, 0b00101, rs, 0b001, rd, 0b0010011)) +// Zero-extend half-word +#define ZEXTH_(rd, rs) EMIT(R_type(0b0000100, 0b00000, rs, 0b100, rd, 0b0111011)) +// Zero-extend half-word +#define ZEXTH(rd, rs) if(rv64_zbb) ZEXTH_(rd, rs); else {SLLI(rd, rs, 48); SRLI(rd, rd, 48);} +// Rotate left (register) +#define ROL(rd, rs1, rs2) EMIT(R_type(0b0110000, rs2, rs1, 0b001, rd, 0b0110011)) +// Rotate left word (register) +#define ROLW(rd, rs1, rs2) EMIT(R_type(0b0110000, rs2, rs1, 0b001, rd, 0b0111011)) +// Rotate left (register) +#define ROLxw(rd, rs1, rs2) EMIT(R_type(0b0110000, rs2, rs1, 0b001, rd, rex.w?0b0110011:0b0111011)) +// Rotate right (register) +#define ROR(rd, rs1, rs2) EMIT(R_type(0b0110000, rs2, rs1, 0b101, rd, 0b0110011)) +// Rotate right (immediate) +#define RORI(rd, rs1, shamt) EMIT(R_type(0b0110000, shamt, rs1, 0b101, rd, 0b0010011)) +// Rotate right word (immediate) +#define RORIW(rd, rs1, shamt) EMIT(R_type(0b0110000, shamt, rs1, 0b101, rd, 0b0011011)) +// Rotate right (immediate) +#define RORIxw(rd, rs1, shamt) EMIT(R_type(0b0110000, shamt, rs1, 0b101, rd, rex.w?0b0010011:0b0011011)) +// Rotate right word (register) +#define RORW(rd, rs1, rs2) EMIT(R_type(0b0110000, rs2, rs1, 0b101, rd, 0b0111011)) +// Rotate right (register) +#define RORxw(rd, rs1, rs2) EMIT(R_type(0b0110000, rs2, rs1, 0b101, rd, rex.w?0b0110011:0b0111011)) +// Bitwise OR Combine, byte granule (for each byte, if byte==0, res.byte=0, else res.byte=0xff) +#define ORCB(rd, rs) EMIT(I_type(0b001010000111, rs, 0b101, rd, 0b0010011)) +// Byte-reverse register +#define REV8(rd, rs) EMIT(I_type(0b011010111000, rs, 0b101, rd, 0b0010011)) + +//Zbc +// Carry-less multiply (low-part) +#define CLMUL(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b001, rd, 0b0110011)) +// Carry-less multiply (high-part) +#define CLMULH(rd, rs1, rs2) EMIT(R_type(0b0000101, rs2, rs1, 0b011, rd, 0b0110011)) +// Carry-less multiply (reversed) +#define CLMULR(rd, rs1, rs2) 
EMIT(R_type(0b0000101, rs2, rs1, 0b010, rd, 0b0110011)) + +//Zbs +// encoding of the "imm" on RV64 uses a slightly different mask, but it will work using R_type with high bit of imm overwriting low bit of func +// Single-bit Clear (Register) +#define BCLR(rd, rs1, rs2) EMIT(R_type(0b0100100, rs2, rs1, 0b001, rd, 0b0110011)) +// Single-bit Clear (Immediate) +#define BCLI(rd, rs1, imm) EMIT(R_type(0b0100100, imm, rs1, 0b001, rd, 0b0010011)) +// Single-bit Extract (Register) +#define BEXT(rd, rs1, rs2) EMIT(R_type(0b0100100, rs2, rs1, 0b101, rd, 0b0110011)) +// Single-bit Extract (Immediate) +#define BEXTI(rd, rs1, imm) EMIT(R_type(0b0100100, imm, rs1, 0b101, rd, 0b0010011)) +// Single-bit Invert (Register) +#define BINV(rd, rs1, rs2) EMIT(R_type(0b0110100, rs2, rs1, 0b001, rd, 0b0110011)) +// Single-bit Invert (Immediate) +#define BINVI(rd, rs1, imm) EMIT(R_type(0b0110100, imm, rs1, 0b001, rd, 0b0010011)) +// Single-bit Set (Register) +#define BSET(rd, rs1, rs2) EMIT(R_type(0b0010100, rs2, rs1, 0b001, rd, 0b0110011)) +// Single-bit Set (Immediate) +#define BSETI(rd, rs1, imm) EMIT(R_type(0b0010100, imm, rs1, 0b001, rd, 0b0010011)) + + #endif //__RV64_EMITTER_H__ diff --git a/src/dynarec/rv64/rv64_epilog.S b/src/dynarec/rv64/rv64_epilog.S index 6a299d9d..17dc117f 100644 --- a/src/dynarec/rv64/rv64_epilog.S +++ b/src/dynarec/rv64/rv64_epilog.S @@ -39,26 +39,27 @@ rv64_epilog: rv64_epilog_fast: ld ra, (sp) // save ra ld x8, 8(sp) // save fp - ld x18, 16(sp) - ld x19, 24(sp) - ld x20, 32(sp) - ld x21, 40(sp) - ld x22, 48(sp) - ld x23, 56(sp) - ld x24, 64(sp) - ld x25, 72(sp) - ld x26, 80(sp) - ld x27, 88(sp) - fld f18, (12*8)(sp) - fld f19, (13*8)(sp) - fld f20, (14*8)(sp) - fld f21, (15*8)(sp) - fld f22, (16*8)(sp) - fld f23, (17*8)(sp) - fld f24, (18*8)(sp) - fld f25, (19*8)(sp) - fld f26, (20*8)(sp) - fld f27, (21*8)(sp) - addi sp, sp, (8 * 22) + ld x18, (2*8)(sp) + ld x19, (3*8)(sp) + ld x20, (4*8)(sp) + ld x21, (5*8)(sp) + ld x22, (6*8)(sp) + ld x23, (7*8)(sp) + ld 
x24, (8*8)(sp) + ld x25, (9*8)(sp) + ld x26, (10*8)(sp) + ld x27, (11*8)(sp) + ld x9, (12*8)(sp) + fld f18, (13*8)(sp) + fld f19, (14*8)(sp) + fld f20, (15*8)(sp) + fld f21, (16*8)(sp) + fld f22, (17*8)(sp) + fld f23, (18*8)(sp) + fld f24, (19*8)(sp) + fld f25, (20*8)(sp) + fld f26, (21*8)(sp) + fld f27, (22*8)(sp) + addi sp, sp, (8 * 24) //end, return... ret diff --git a/src/dynarec/rv64/rv64_printer.c b/src/dynarec/rv64/rv64_printer.c index bdc424c1..db013c32 100644 --- a/src/dynarec/rv64/rv64_printer.c +++ b/src/dynarec/rv64/rv64_printer.c @@ -785,6 +785,9 @@ const char* rv64_print(uint32_t data, uintptr_t addr) } else if (imm116 == 0x10) { /* SRAI */ insn.name = "srai"; insn.imm&=0b111111; + } else if (insn.imm==0b011010111000) { + insn.name = "rev8"; + PRINT_rd_rs1(); } break; } @@ -968,6 +971,20 @@ const char* rv64_print(uint32_t data, uintptr_t addr) } } break; + case 0x10: { + switch (funct3) { + case 0b010: + insn.name = "sh1add"; + break; + case 0b100: + insn.name = "sh2add"; + break; + case 0b110: + insn.name = "sh3add"; + break; + } + } + break; case 0x20: { switch (funct3) { case 0x0: /* SUB */ diff --git a/src/dynarec/rv64/rv64_prolog.S b/src/dynarec/rv64/rv64_prolog.S index 0817bdc1..96a85d3b 100644 --- a/src/dynarec/rv64/rv64_prolog.S +++ b/src/dynarec/rv64/rv64_prolog.S @@ -11,29 +11,30 @@ .global rv64_prolog rv64_prolog: //save all 18 used register - addi sp, sp, -(8 * 22) + addi sp, sp, -(8 * 24) // 16 bytes aligned sd ra, (sp) // save ra sd x8, 8(sp) // save fp - sd x18, 16(sp) - sd x19, 24(sp) - sd x20, 32(sp) - sd x21, 40(sp) - sd x22, 48(sp) - sd x23, 56(sp) - sd x24, 64(sp) - sd x25, 72(sp) - sd x26, 80(sp) - sd x27, 88(sp) - fsd f18, (12*8)(sp) - fsd f19, (13*8)(sp) - fsd f20, (14*8)(sp) - fsd f21, (15*8)(sp) - fsd f22, (16*8)(sp) - fsd f23, (17*8)(sp) - fsd f24, (18*8)(sp) - fsd f25, (19*8)(sp) - fsd f26, (20*8)(sp) - fsd f27, (21*8)(sp) + sd x18, (2*8)(sp) + sd x19, (3*8)(sp) + sd x20, (4*8)(sp) + sd x21, (5*8)(sp) + sd x22, (6*8)(sp) + 
sd x23, (7*8)(sp) + sd x24, (8*8)(sp) + sd x25, (9*8)(sp) + sd x26, (10*8)(sp) + sd x27, (11*8)(sp) + sd x9, (12*8)(sp) + fsd f18, (13*8)(sp) + fsd f19, (14*8)(sp) + fsd f20, (15*8)(sp) + fsd f21, (16*8)(sp) + fsd f22, (17*8)(sp) + fsd f23, (18*8)(sp) + fsd f24, (19*8)(sp) + fsd f25, (20*8)(sp) + fsd f26, (21*8)(sp) + fsd f27, (22*8)(sp) //setup emu -> register ld x16, (a0) ld x17, 8(a0) |