| author | ptitSeb <sebastien.chev@gmail.com> | 2022-04-09 18:48:12 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2022-04-09 18:48:12 +0200 |
| commit | f779d4fadb524857a8c6d74c62013ea2f381bc14 (patch) | |
| tree | 5e266a3ffac83db5a892f7546bbe1303908a1204 /src | |
| parent | c6b08a936b575392f194c1ec07fd7ca8cf168b86 (diff) | |
| download | box64-f779d4fadb524857a8c6d74c62013ea2f381bc14.tar.gz box64-f779d4fadb524857a8c6d74c62013ea2f381bc14.zip | |
[DYNAREC] Refactored dynarec, using box86 refactor
Diffstat (limited to 'src')
29 files changed, 2638 insertions, 1232 deletions
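Reading the diff, three themes recur: numeric barrier levels (`BARRIER(1)`, `BARRIER(2)`) become named constants (`BARRIER_FULL`, `BARRIER_FLOAT`, `BARRIER_MAYBE`), the SSE/MMX register-cache getters (`sse_get_reg`, `mmx_get_reg`, `GETGX`, `GETEX`) gain an explicit write flag so the cache knows when a register is dirtied, and conditional jumps can now keep the FPU cache alive across block-internal edges via `fpuCacheTransform` instead of always purging with `fpu_purgecache`. The sketch below is a minimal, self-contained model of that jump-handling decision, assuming simplified types and stub emitters; the struct layout and constant values are illustrative assumptions, not box64's actual definitions.

```c
#include <stdio.h>

/* Assumed names/values for illustration only; the real enum lives in box64. */
enum { BARRIER_NONE = 0, BARRIER_FULL = 1, BARRIER_FLOAT = 2, BARRIER_MAYBE = 8 };

typedef struct {
    int jmp_insts;   /* index of the jump target inside the block, or -1 */
    int barrier;     /* barrier already planned at the target */
} inst_t;

/* Stubs standing in for the real code emitters. */
static void fpu_purgecache(void)    { puts("  purge FPU/SSE cache to memory"); }
static void fpuCacheTransform(void) { puts("  remap cached regs to target layout"); }
static void jump_to_next(void)      { puts("  emit indirect jump to next block"); }
static void branch_to(int idx)      { printf("  emit direct branch to inst %d\n", idx); }

/* Decision logic modeled on the new GO() macro for Jcc/LOOPcc (simplified:
   the flag test and the conditional branch to the epilog are omitted). */
static void emit_jump(const inst_t *insts, int ninst, int cache_mismatch)
{
    int target = insts[ninst].jmp_insts;
    if (target == -1 || cache_mismatch) {
        if (target == -1) {
            if (!insts[ninst].barrier)
                fpu_purgecache();    /* leaving the block: flush cached state */
            jump_to_next();
        } else {
            fpuCacheTransform();     /* same block, different cache layout */
            branch_to(target);
        }
    } else {
        branch_to(target);           /* same block, same layout: plain branch */
    }
}

int main(void)
{
    inst_t insts[4] = { {-1, 0}, {3, 0}, {3, 0}, {-1, BARRIER_FLOAT} };
    puts("jump out of block:");          emit_jump(insts, 0, 0);
    puts("jump inside, cache mismatch:"); emit_jump(insts, 1, 1);
    puts("jump inside, cache match:");    emit_jump(insts, 2, 0);
    return 0;
}
```

On this model, a jump whose in-block target keeps a different set of x87/SSE registers mapped appears to get a register-to-register remap instead of a full spill and reload, which is what the refactor is after.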
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c
index 3e6098de..a247b382 100755
--- a/src/dynarec/arm64/dynarec_arm64_00.c
+++ b/src/dynarec/arm64/dynarec_arm64_00.c
@@ -636,21 +636,30 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            }
            break;
-        #define GO(GETFLAGS, NO, YES, F) \
-            READFLAGS(F); \
-            i8 = F8S; \
-            BARRIER(2); \
-            JUMP(addr+i8);\
-            GETFLAGS; \
-            if(dyn->insts[ninst].x64.jmp_insts==-1) { \
-                /* out of the block */ \
-                i32 = dyn->insts[ninst+1].address-(dyn->native_size); \
-                Bcond(NO, i32); \
-                jump_to_next(dyn, addr+i8, 0, ninst); \
-            } else { \
-                /* inside the block */ \
+        #define GO(GETFLAGS, NO, YES, F) \
+            READFLAGS(F); \
+            i8 = F8S; \
+            BARRIER(BARRIER_MAYBE); \
+            JUMP(addr+i8, 1); \
+            GETFLAGS; \
+            if(dyn->insts[ninst].x64.jmp_insts==-1 || \
+                CHECK_CACHE()) { \
+                /* out of the block */ \
+                i32 = dyn->insts[ninst].epilog-(dyn->native_size); \
+                Bcond(NO, i32); \
+                if(dyn->insts[ninst].x64.jmp_insts==-1) { \
+                    if(!dyn->insts[ninst].x64.barrier) \
+                        fpu_purgecache(dyn, ninst, 1, x1, x2, x3); \
+                    jump_to_next(dyn, addr+i8, 0, ninst); \
+                } else { \
+                    fpuCacheTransform(dyn, ninst, x1, x2, x3); \
+                    i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);\
+                    B(i32); \
+                } \
+            } else { \
+                /* inside the block */ \
                i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size); \
-                Bcond(YES, i32); \
+                Bcond(YES, i32); \
            }
        GOCOND(0x70, "J", "ib");
@@ -1480,7 +1489,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            switch((nextop>>3)&7) {
                case 0:
                    INST_NAME("ROL Ed, Ib");
-                    SETFLAGS(X_OF|X_CF, SF_SUBSET);
+                    SETFLAGS(X_OF|X_CF, SF_SUBSET_PENDING);
                    GETED(1);
                    u8 = (F8)&(rex.w?0x3f:0x1f);
                    emit_rol32c(dyn, ninst, rex, ed, u8, x3, x4);
@@ -1488,7 +1497,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                    break;
                case 1:
                    INST_NAME("ROR Ed, Ib");
-                    SETFLAGS(X_OF|X_CF, SF_SUBSET);
+                    SETFLAGS(X_OF|X_CF, SF_SUBSET_PENDING);
                    GETED(1);
                    u8 = (F8)&(rex.w?0x3f:0x1f);
                    emit_ror32c(dyn, ninst, rex, ed, u8, x3, x4);
@@ -1551,7 +1560,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            INST_NAME("RETN");
            //SETFLAGS(X_ALL, SF_SET);    // Hack, set all flags (to an unknown state...)
            READFLAGS(X_PEND);  // lets play safe here too
-            BARRIER(2);
+            BARRIER(BARRIER_FLOAT);
            i32 = F16;
            retn_to_epilog(dyn, ninst, i32);
            *need_epilog = 0;
@@ -1561,7 +1570,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            INST_NAME("RET");
            // SETFLAGS(X_ALL, SF_SET);    // Hack, set all flags (to an unknown state...)
            READFLAGS(X_PEND);  // so instead, force the defered flags, so it's not too slow, and flags are not lost
-            BARRIER(2);
+            BARRIER(BARRIER_FLOAT);
            ret_to_epilog(dyn, ninst);
            *need_epilog = 0;
            *ok = 0;
@@ -1614,7 +1623,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            SETFLAGS(X_ALL, SF_SET);    // Hack, set all flags (to an unknown state...)
            if(PK(0)=='S' && PK(1)=='C') {
                addr+=2;
-                BARRIER(2);
+                BARRIER(BARRIER_FLOAT);
                INST_NAME("Special Box64 instruction");
                if((PK64(0)==0)) {
@@ -1672,7 +1681,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
        case 0xCF:
            INST_NAME("IRET");
            SETFLAGS(X_ALL, SF_SET);    // Not a hack, EFLAGS are restored
-            BARRIER(2);
+            BARRIER(BARRIER_FLOAT);
            iret_to_epilog(dyn, ninst, rex.w);
            *need_epilog = 0;
            *ok = 0;
@@ -1785,14 +1794,14 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            switch((nextop>>3)&7) {
                case 0:
                    INST_NAME("ROL Ed, 1");
-                    SETFLAGS(X_OF|X_CF, SF_SUBSET);
+                    SETFLAGS(X_OF|X_CF, SF_SUBSET_PENDING);
                    GETED(0);
                    emit_rol32c(dyn, ninst, rex, ed, 1, x3, x4);
                    WBACK;
                    break;
                case 1:
                    INST_NAME("ROR Ed, 1");
-                    SETFLAGS(X_OF|X_CF, SF_SUBSET);
+                    SETFLAGS(X_OF|X_CF, SF_SUBSET_PENDING);
                    GETED(0);
                    emit_ror32c(dyn, ninst, rex, ed, 1, x3, x4);
                    WBACK;
@@ -1997,17 +2006,26 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            addr = dynarec64_DF(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
            break;
        #define GO(Z) \
-            BARRIER(2); \
-            JUMP(addr+i8); \
-            if(dyn->insts[ninst].x64.jmp_insts==-1) { \
-                /* out of the block */ \
-                i32 = dyn->insts[ninst+1].address-(dyn->native_size); \
-                if(Z) {CBNZx(xRCX, i32);} else {CBZx(xRCX, i32);}; \
-                jump_to_next(dyn, addr+i8, 0, ninst); \
-            } else { \
-                /* inside the block */ \
+            BARRIER(BARRIER_MAYBE); \
+            JUMP(addr+i8, 1); \
+            if(dyn->insts[ninst].x64.jmp_insts==-1 || \
+                CHECK_CACHE()) { \
+                /* out of the block */ \
+                i32 = dyn->insts[ninst].epilog-(dyn->native_size); \
+                if(Z) {CBNZx(xRCX, i32);} else {CBZx(xRCX, i32);}; \
+                if(dyn->insts[ninst].x64.jmp_insts==-1) { \
+                    if(!dyn->insts[ninst].x64.barrier) \
+                        fpu_purgecache(dyn, ninst, 1, x1, x2, x3); \
+                    jump_to_next(dyn, addr+i8, 0, ninst); \
+                } else { \
+                    fpuCacheTransform(dyn, ninst, x1, x2, x3); \
+                    i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size); \
+                    Bcond(c__, i32); \
+                } \
+            } else { \
+                /* inside the block */ \
                i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size); \
-                if(Z) {CBZx(xRCX, i32);} else {CBNZx(xRCX, i32);}; \
+                if(Z) {CBZx(xRCX, i32);} else {CBNZx(xRCX, i32);}; \
            }
        case 0xE0:
            INST_NAME("LOOPNZ");
@@ -2057,8 +2075,8 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            switch(tmp) {
                case 3:
                    SETFLAGS(X_ALL, SF_SET);    // Hack to set flags to "dont'care" state
-                    BARRIER(1);
-                    BARRIER_NEXT(1);
+                    BARRIER(BARRIER_FULL);
+                    //BARRIER_NEXT(BARRIER_FULL);
                    TABLE64(x2, addr);
                    PUSH1(x2);
                    MESSAGE(LOG_DUMP, "Native Call to %s (retn=%d)\n", GetNativeName(GetNativeFnc(dyn->insts[ninst].natcall-1)), dyn->insts[ninst].retn);
@@ -2098,8 +2116,8 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                SETFLAGS(X_ALL, SF_SET);    // Hack to set flags to "dont'care" state
            }
            // regular call
-            BARRIER(1);
-            BARRIER_NEXT(1);
+            BARRIER(BARRIER_FULL);
+            //BARRIER_NEXT(1);
            *need_epilog = 0;
            *ok = 0;
            TABLE64(x2, addr);
@@ -2115,7 +2133,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            break;
        case 0xE9:
        case 0xEB:
-            BARRIER(1);
+            BARRIER(BARRIER_MAYBE);
            if(opcode==0xE9) {
                INST_NAME("JMP Id");
                i32 = F32S;
@@ -2123,12 +2141,14 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                INST_NAME("JMP Ib");
                i32 = F8S;
            }
-            JUMP(addr+i32);
-            PASS2IF(dyn->insts[ninst].x64.jmp_insts==-1, 1) {
+            JUMP(addr+i32, 0);
+            if(dyn->insts[ninst].x64.jmp_insts==-1) {
                // out of the block
+                fpu_purgecache(dyn, ninst, 1, x1, x2, x3);
                jump_to_next(dyn, addr+i32, 0, ninst);
            } else {
                // inside the block
+                fpuCacheTransform(dyn, ninst, x1, x2, x3);
                tmp = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);
                if(tmp==4) {
                    NOP;
@@ -2373,14 +2393,14 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            switch((nextop>>3)&7) {
                case 0:
                    INST_NAME("INC Eb");
-                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET);
+                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING);
                    GETEB(x1, 0);
                    emit_inc8(dyn, ninst, x1, x2, x4);
                    EBBACK;
                    break;
                case 1:
                    INST_NAME("DEC Eb");
-                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET);
+                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING);
                    GETEB(x1, 0);
                    emit_dec8(dyn, ninst, x1, x2, x4);
                    EBBACK;
@@ -2394,14 +2414,14 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            switch((nextop>>3)&7) {
                case 0: // INC Ed
                    INST_NAME("INC Ed");
-                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET);
+                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING);
                    GETED(0);
                    emit_inc32(dyn, ninst, rex, ed, x3, x4);
                    WBACK;
                    break;
                case 1: //DEC Ed
                    INST_NAME("DEC Ed");
-                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET);
+                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING);
                    GETED(0);
                    emit_dec32(dyn, ninst, rex, ed, x3, x4);
                    WBACK;
@@ -2416,19 +2436,19 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                        SETFLAGS(X_ALL, SF_SET);    //Hack to put flag in "don't care" state
                    }
                    GETEDx(0);
-                    BARRIER(1);
-                    BARRIER_NEXT(1);
+                    BARRIER(BARRIER_FLOAT);
+                    //BARRIER_NEXT(BARRIER_FULL);
                    if(!dyn->insts || ninst==dyn->size-1) {
                        *need_epilog = 0;
                        *ok = 0;
                    }
-                    GETIP(addr);
+                    GETIP_(addr);
                    PUSH1(xRIP);
                    jump_to_next(dyn, 0, ed, ninst);
                    break;
                case 4: // JMP Ed
                    INST_NAME("JMP Ed");
-                    BARRIER(1);
+                    BARRIER(BARRIER_FULL);
                    GETEDx(0);
                    jump_to_next(dyn, 0, ed, ninst);
                    *need_epilog = 0;
diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c
index df36e90f..733eb5dd 100755
--- a/src/dynarec/arm64/dynarec_arm64_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_0f.c
@@ -27,34 +27,34 @@
 #define GETG \
    gd = ((nextop&0x38)>>3)+(rex.r<<3) \

-#define GETGX(a) \
+#define GETGX(a, w) \
    gd = ((nextop&0x38)>>3)+(rex.r<<3); \
-    a = sse_get_reg(dyn, ninst, x1, gd)
+    a = sse_get_reg(dyn, ninst, x1, gd, w)

 #define GETGX_empty(a) \
    gd = ((nextop&0x38)>>3)+(rex.r<<3); \
    a = sse_get_reg_empty(dyn, ninst, x1, gd)

-#define GETEX(a, D) \
-    if(MODREG) { \
-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); \
-    } else { \
+#define GETEX(a, w, D) \
+    if(MODREG) { \
+        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w); \
+    } else { \
        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, 0, D); \
-        a = fpu_get_scratch(dyn); \
-        VLDR128_U12(a, ed, fixedaddress); \
+        a = fpu_get_scratch(dyn); \
+        VLDR128_U12(a, ed, fixedaddress); \
    }

 #define GETGM(a) \
    gd = ((nextop&0x38)>>3); \
-    a = mmx_get_reg(dyn, ninst, x1, gd)
+    a = mmx_get_reg(dyn, ninst, x1, x2, x3, gd)

-#define GETEM(a, D) \
-    if(MODREG) { \
-        a = mmx_get_reg(dyn, ninst, x1, (nextop&7));\
-    } else { \
+#define GETEM(a, D) \
+    if(MODREG) { \
+        a = mmx_get_reg(dyn, ninst, x1, x2, x3, (nextop&7)); \
+    } else { \
        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, D); \
-        a = fpu_get_scratch(dyn); \
-        VLDR64_U12(a, ed, fixedaddress); \
+        a = fpu_get_scratch(dyn); \
+        VLDR64_U12(a, ed, fixedaddress); \
    }

 #define PUTEM(a) \
@@ -151,7 +151,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            GETG;
            if(MODREG) {
                ed = (nextop&7)+(rex.b<<3);
-                v1 = sse_get_reg(dyn, ninst, x1, ed);
+                v1 = sse_get_reg(dyn, ninst, x1, ed, 0);
                v0 = sse_get_reg_empty(dyn, ninst, x1, gd);
                VMOVQ(v0, v1);
            } else {
@@ -164,7 +164,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            INST_NAME("MOVUPS Ex,Gx");
            nextop = F8;
            GETG;
-            v0 = sse_get_reg(dyn, ninst, x1, gd);
+            v0 = sse_get_reg(dyn, ninst, x1, gd, 0);
            if(MODREG) {
                ed = (nextop&7)+(rex.b<<3);
                v1 = sse_get_reg_empty(dyn, ninst, x1, ed);
@@ -178,12 +178,12 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            nextop = F8;
            if(MODREG) {
                INST_NAME("MOVHLPS Gx,Ex");
-                GETGX(v0);
-                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3));
+                GETGX(v0, 1);
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);
                VMOVeD(v0, 0, v1, 1);
            } else {
                INST_NAME("MOVLPS Gx,Ex");
-                GETGX(v0);
+                GETGX(v0, 1);
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0);
                VLD1_64(v0, 0, ed);
            }
@@ -191,9 +191,9 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
        case 0x13:
            nextop = F8;
            INST_NAME("MOVLPS Ex,Gx");
-            GETGX(v0);
+            GETGX(v0, 0);
            if(MODREG) {
-                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3));
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1);
                VMOVeD(v1, 0, v0, 0);
            } else {
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0);
@@ -203,27 +203,27 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
        case 0x14:
            INST_NAME("UNPCKLPS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
-            GETGX(v0);
+            GETEX(q0, 0, 0);
+            GETGX(v0, 1);
            VZIP1Q_32(v0, v0, q0);
            break;
        case 0x15:
            INST_NAME("UNPCKHPS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
-            GETGX(v0);
+            GETEX(q0, 0, 0);
+            GETGX(v0, 1);
            VZIP2Q_32(v0, v0, q0);
            break;
        case 0x16:
            nextop = F8;
            if(MODREG) {
                INST_NAME("MOVLHPS Gx,Ex");
-                GETGX(v0);
-                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3));
+                GETGX(v0, 1);
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);
                VMOVeD(v0, 1, v1, 0);
            } else {
                INST_NAME("MOVHPS Gx,Ex");
-                GETGX(v0);
+                GETGX(v0, 1);
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0);
                VLD1_64(v0, 1, ed);
            }
@@ -231,9 +231,9 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
        case 0x17:
            nextop = F8;
            INST_NAME("MOVHPS Ex,Gx");
-            GETGX(v0);
+            GETGX(v0, 0);
            if(MODREG) {
-                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3));
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1);
                VMOVeD(v1, 0, v0, 1);
            } else {
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0);
@@ -284,7 +284,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            GETG;
            if(MODREG) {
                ed = (nextop&7)+(rex.b<<3);
-                v1 = sse_get_reg(dyn, ninst, x1, ed);
+                v1 = sse_get_reg(dyn, ninst, x1, ed, 0);
                v0 = sse_get_reg_empty(dyn, ninst, x1, gd);
                VMOVQ(v0, v1);
            } else {
@@ -297,7 +297,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            INST_NAME("MOVAPS Ex,Gx");
            nextop = F8;
            GETG;
-            v0 = sse_get_reg(dyn, ninst, x1, gd);
+            v0 = sse_get_reg(dyn, ninst, x1, gd, 0);
            if(MODREG) {
                ed = (nextop&7)+(rex.b<<3);
                v1 = sse_get_reg_empty(dyn, ninst, x1, ed);
@@ -312,7 +312,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            INST_NAME("MOVNTPS Ex,Gx");
            nextop = F8;
            GETG;
-            v0 = sse_get_reg(dyn, ninst, x1, gd);
+            v0 = sse_get_reg(dyn, ninst, x1, gd, 0);
            if(MODREG) {
                ed = (nextop&7)+(rex.b<<3);
                v1 = sse_get_reg_empty(dyn, ninst, x1, ed);
@@ -329,9 +329,9 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            if(opcode==0x2F) {INST_NAME("COMISS Gx, Ex");} else {INST_NAME("UCOMISS Gx, Ex");}
            SETFLAGS(X_ALL, SF_SET);
            nextop = F8;
-            GETGX(v0);
+            GETGX(v0, 0);
            if(MODREG) {
-                s0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3));
+                s0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 0);
            } else {
                s0 = fpu_get_scratch(dyn);
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<2, 3, rex, 0, 0);
@@ -439,7 +439,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            MOV32w(gd, 0);
            if((nextop&0xC0)==0xC0) {
                // EX is an xmm reg
-                GETEX(q0, 0);
+                GETEX(q0, 0, 0);
                VMOVQDto(x1, q0, 0);
                LSRx(x1, x1, 31);
                BFIx(gd, x1, 0, 1);
@@ -468,14 +468,14 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
        case 0x51:
            INST_NAME("SQRTPS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
+            GETEX(q0, 0, 0);
            GETGX_empty(v0);
            VFSQRTQS(v0, q0);
            break;
        case 0x52:
            INST_NAME("RSQRTPS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
+            GETEX(q0, 0, 0);
            GETGX_empty(q1);
            v0 = fpu_get_scratch(dyn);
            // more precise
@@ -491,7 +491,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
        case 0x53:
            INST_NAME("RCPPS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
+            GETEX(q0, 0, 0);
            GETGX_empty(q1);
            if(q0 == q1)
                v1 = fpu_get_scratch(dyn);
@@ -505,22 +505,22 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
        case 0x54:
            INST_NAME("ANDPS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
-            GETGX(v0);
+            GETEX(q0, 0, 0);
+            GETGX(v0, 1);
            VANDQ(v0, v0, q0);
            break;
        case 0x55:
            INST_NAME("ANDNPS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
-            GETGX(v0);
+            GETEX(q0, 0, 0);
+            GETGX(v0, 1);
            VBICQ(v0, q0, v0);
            break;
        case 0x56:
            INST_NAME("ORPS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
-            GETGX(v0);
+            GETEX(q0, 0, 0);
+            GETGX(v0, 1);
            VORRQ(v0, v0, q0);
            break;
        case 0x57:
@@ -532,65 +532,65 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                q0 = sse_get_reg_empty(dyn, ninst, x1, gd);
                VEORQ(q0, q0, q0);
            } else {
-                q0 = sse_get_reg(dyn, ninst, x1, gd);
-                GETEX(q1, 0);
+                q0 = sse_get_reg(dyn, ninst, x1, gd, 1);
+                GETEX(q1, 0, 0);
                VEORQ(q0, q0, q1);
            }
            break;
        case 0x58:
            INST_NAME("ADDPS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
-            GETGX(v0);
+            GETEX(q0, 0, 0);
+            GETGX(v0, 1);
            VFADDQS(v0, v0, q0);
            break;
        case 0x59:
            INST_NAME("MULPS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
-            GETGX(v0);
+            GETEX(q0, 0, 0);
+            GETGX(v0, 1);
            VFMULQS(v0, v0, q0);
            break;
        case 0x5A:
            INST_NAME("CVTPS2PD Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
-            GETGX(q1);
+            GETEX(q0, 0, 0);
+            GETGX(q1, 1);
            FCVTL(q1, q0);
            break;
        case 0x5B:
            INST_NAME("CVTDQ2PS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
+            GETEX(q0, 0, 0);
            GETGX_empty(q1);
            SCVTQFS(q1, q0);
            break;
        case 0x5C:
            INST_NAME("SUBPS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
-            GETGX(v0);
+            GETEX(q0, 0, 0);
+            GETGX(v0, 1);
            VFSUBQS(v0, v0, q0);
            break;
        case 0x5D:
            INST_NAME("MINPS Gx, Ex");
            nextop = F8;
-            GETGX(v0);
-            GETEX(v1, 0);
+            GETGX(v0, 1);
+            GETEX(v1, 0, 0);
            VFMINQS(v0, v0, v1);
            break;
        case 0x5E:
            INST_NAME("DIVPS Gx, Ex");
            nextop = F8;
-            GETEX(q0, 0);
-            GETGX(v0);
+            GETEX(q0, 0, 0);
+            GETGX(v0, 1);
            VFDIVQS(v0, v0, q0);
            break;
        case 0x5F:
            INST_NAME("MAXPS Gx, Ex");
            nextop = F8;
-            GETGX(v0);
-            GETEX(v1, 0);
+            GETGX(v0, 1);
+            GETEX(v1, 0, 0);
            VFMAXQS(v0, v0, v1);
            break;
        case 0x60:
@@ -625,21 +625,21 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            SQXTN_8(d0, q0);
            break;
        case 0x64:
-            INST_NAME("PCMPGTB Gx,Ex");
+            INST_NAME("PCMPGTB Gm,Em");
            nextop = F8;
            GETGM(v0);
            GETEM(v1, 0);
            VCMGT_8(v0, v0, v1);
            break;
        case 0x65:
-            INST_NAME("PCMPGTW Gx,Ex");
+            INST_NAME("PCMPGTW Gm,Em");
            nextop = F8;
            GETGM(v0);
            GETEM(v1, 0);
            VCMGT_16(v0, v0, v1);
            break;
        case 0x66:
-            INST_NAME("PCMPGTD Gx,Ex");
+            INST_NAME("PCMPGTD Gm,Em");
            nextop = F8;
            GETGM(v0);
            GETEM(v1, 0);
@@ -652,7 +652,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            q0 = fpu_get_scratch(dyn);
            VMOVeD(q0, 0, v0, 0);
            if(MODREG) {
-                v1 = mmx_get_reg(dyn, ninst, x1, (nextop&7));
+                v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, (nextop&7));
                VMOVeD(q0, 1, v1, 0);
            } else {
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0);
@@ -702,7 +702,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            INST_NAME("MOVD Gm, Ed");
            nextop = F8;
            gd = (nextop&0x38)>>3;
-            v0 = mmx_get_reg_empty(dyn, ninst, x3, gd);
+            v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd);
            if(MODREG) {
                ed = xRAX + (nextop&7) + (rex.b<<3);
                if(rex.w) {
@@ -711,7 +711,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                    FMOVSw(v0, ed);
                }
            } else {
-                v0 = mmx_get_reg_empty(dyn, ninst, x1, gd);
+                v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd);
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<(2+rex.w), (1<<(2+rex.w))-1, rex, 0, 0);
                if(rex.w) {
                    VLDR64_U12(v0, ed, fixedaddress);
@@ -725,11 +725,11 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            nextop = F8;
            GETG;
            if(MODREG) {
-                v1 = mmx_get_reg(dyn, ninst, x1, nextop&7); // no rex.b on MMX
-                v0 = mmx_get_reg_empty(dyn, ninst, x1, gd);
+                v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop&7); // no rex.b on MMX
+                v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd);
                VMOVeD(v0, 0, v1, 0);
            } else {
-                v0 = mmx_get_reg_empty(dyn, ninst, x1, gd);
+                v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd);
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, 0);
                VLDR64_U12(v0, ed, fixedaddress);
            }
@@ -740,8 +740,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            gd = (nextop&0x38)>>3;
            if(MODREG) {
                u8 = F8;
-                v1 = mmx_get_reg(dyn, ninst, x1, (nextop&7));
-                v0 = mmx_get_reg_empty(dyn, ninst, x1, gd);
+                v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, (nextop&7));
+                v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd);
                if(u8==0x4E) {
                    if(v0==v1) {
                        VEXT_8(v0, v0, v0, 4); // Swap Up/Lower 32bits parts
@@ -797,7 +797,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                    VTBL1_8(v0, v1, d0);
                }
            } else {
-                v0 = mmx_get_reg_empty(dyn, ninst, x1, gd);
+                v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd);
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 1);
                u8 = F8;
                if (u8) {
@@ -971,7 +971,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
        case 0x77:
            INST_NAME("EMMS");
            // empty MMX, FPU now usable
-            mmx_purgecache(dyn, ninst, x1);
+            mmx_purgecache(dyn, ninst, 0, x1);
            /*emu->top = 0;
            emu->fpu_stack = 0;*/ //TODO: Check if something is needed here?
            break;
@@ -1002,7 +1002,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            nextop = F8;
            GETGM(v0);
            if(MODREG) {
-                v1 = mmx_get_reg_empty(dyn, ninst, x1, nextop&7);
+                v1 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, nextop&7);
                VMOV(v1, v0);
            } else {
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, 0);
@@ -1011,21 +1011,30 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            break;
        #define GO(GETFLAGS, NO, YES, F) \
-            READFLAGS(F); \
-            i32_ = F32S; \
-            BARRIER(2); \
-            JUMP(addr+i32_);\
-            GETFLAGS; \
-            if(dyn->insts[ninst].x64.jmp_insts==-1) { \
-                /* out of the block */ \
-                i32 = dyn->insts[ninst+1].address-(dyn->native_size); \
-                Bcond(NO, i32); \
-                jump_to_next(dyn, addr+i32_, 0, ninst); \
-            } else { \
-                /* inside the block */ \
+            READFLAGS(F); \
+            i32_ = F32S; \
+            BARRIER(BARRIER_MAYBE); \
+            JUMP(addr+i32_, 1); \
+            GETFLAGS; \
+            if(dyn->insts[ninst].x64.jmp_insts==-1 || \
+                CHECK_CACHE()) { \
+                /* out of the block */ \
+                i32 = dyn->insts[ninst].epilog-(dyn->native_size); \
+                Bcond(NO, i32); \
+                if(dyn->insts[ninst].x64.jmp_insts==-1) { \
+                    if(!dyn->insts[ninst].x64.barrier) \
+                        fpu_purgecache(dyn, ninst, 1, x1, x2, x3); \
+                    jump_to_next(dyn, addr+i32_, 0, ninst); \
+                } else { \
+                    fpuCacheTransform(dyn, ninst, x1, x2, x3); \
+                    i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size); \
+                    B(i32); \
+                } \
+            } else { \
+                /* inside the block */ \
                i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size); \
-                Bcond(YES, i32); \
-            } \
+                Bcond(YES, i32); \
+            }
        GOCOND(0x80, "J", "Id");
        #undef GO
@@ -1179,7 +1188,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                case 0:
                    INST_NAME("FXSAVE Ed");
                    MESSAGE(LOG_DUMP, "Need Optimization\n");
-                    fpu_purgecache(dyn, ninst, x1, x2, x3);
+                    fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
                    if(MODREG) {
                        DEFAULT;
                    } else {
@@ -1191,7 +1200,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                case 1:
                    INST_NAME("FXRSTOR Ed");
                    MESSAGE(LOG_DUMP, "Need Optimization\n");
-                    fpu_purgecache(dyn, ninst, x1, x2, x3);
+                    fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
                    if(MODREG) {
                        DEFAULT;
                    } else {
@@ -1535,8 +1544,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
        case 0xC2:
            INST_NAME("CMPPS Gx, Ex, Ib");
            nextop = F8;
-            GETGX(v0);
-            GETEX(v1, 1);
+            GETGX(v0, 1);
+            GETEX(v1, 0, 1);
            u8 = F8;
            switch(u8&7) {
                // the inversion of the params in the comparison is there to handle NaN the same way SSE does
@@ -1605,7 +1614,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
        case 0xC6:
            INST_NAME("SHUFPS Gx, Ex, Ib");
            nextop = F8;
-            GETGX(v0);
+            GETGX(v0, 1);
            if(!MODREG)
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 1);
            u8 = F8;
@@ -1616,7 +1625,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            }
            // second two from Ex
            if(MODREG) {
-                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3));
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);
                for(int i=2; i<4; ++i) {
                    VMOVeS(d0, i, v1, (u8>>(i*2)&3));
                }
@@ -1768,7 +1777,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            if((nextop&0xC0)==0xC0) {
                DEFAULT;
            } else {
-                v0 = mmx_get_reg(dyn, ninst, x1, gd);
+                v0 = mmx_get_reg(dyn, ninst, x1, x2, x3, gd);
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, 0);
                VSTR64_U12(v0, ed, fixedaddress);
            }
@@ -1816,10 +1825,10 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            gd = ((nextop&0x38)>>3);
            if(MODREG && ((nextop&7))==gd) {
                // special case for PXOR Gx, Gx
-                q0 = mmx_get_reg_empty(dyn, ninst, x1, gd);
+                q0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd);
                VEOR(q0, q0, q0);
            } else {
-                q0 = mmx_get_reg(dyn, ninst, x1, gd);
+                q0 = mmx_get_reg(dyn, ninst, x1, x2, x3, gd);
                GETEM(q1, 0);
                VEOR(q0, q0, q1);
            }
diff --git a/src/dynarec/arm64/dynarec_arm64_64.c b/src/dynarec/arm64/dynarec_arm64_64.c
index da52b278..9927f2e3 100644
--- a/src/dynarec/arm64/dynarec_arm64_64.c
+++ b/src/dynarec/arm64/dynarec_arm64_64.c
@@ -85,8 +85,8 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            GETG;
            if(MODREG) {
                ed = (nextop&7)+ (rex.b<<3);
-                v0 = sse_get_reg(dyn, ninst, x1, gd);
-                d0 = sse_get_reg(dyn, ninst, x1, ed);
+                v0 = sse_get_reg(dyn, ninst, x1, gd, 1);
+                d0 = sse_get_reg(dyn, ninst, x1, ed, 0);
                VMOVeD(v0, 0, d0, 0);
            } else {
                grab_segdata(dyn, addr, ninst, x4, seg);
@@ -101,8 +101,8 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            nextop = F8;
            GETG;
            if(MODREG) {
-                v0 = sse_get_reg(dyn, ninst, x1, gd);
-                q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3));
+                v0 = sse_get_reg(dyn, ninst, x1, gd, 1);
+                q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 0);
                VMOVeS(v0, 0, q0, 0);
            } else {
                grab_segdata(dyn, addr, ninst, x4, seg);
@@ -122,7 +122,7 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            INST_NAME("MOVUPS Ex,Gx");
            nextop = F8;
            GETG;
-            v0 = sse_get_reg(dyn, ninst, x1, gd);
+            v0 = sse_get_reg(dyn, ninst, x1, gd, 0);
            if(MODREG) {
                ed = (nextop&7)+(rex.b<<3);
                v1 = sse_get_reg_empty(dyn, ninst, x1, ed);
@@ -138,10 +138,10 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            INST_NAME("MOVSD Ex, Gx");
            nextop = F8;
            GETG;
-            v0 = sse_get_reg(dyn, ninst, x1, gd);
+            v0 = sse_get_reg(dyn, ninst, x1, gd, 0);
            if(MODREG) {
                ed = (nextop&7)+ (rex.b<<3);
-                d0 = sse_get_reg(dyn, ninst, x1, ed);
+                d0 = sse_get_reg(dyn, ninst, x1, ed, 1);
                VMOVeD(d0, 0, v0, 0);
            } else {
                grab_segdata(dyn, addr, ninst, x4, seg);
@@ -154,9 +154,9 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            INST_NAME("MOVSS Ex, Gx");
            nextop = F8;
            GETG;
-            v0 = sse_get_reg(dyn, ninst, x1, gd);
+            v0 = sse_get_reg(dyn, ninst, x1, gd, 0);
            if(MODREG) {
-                q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3));
+                q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 1);
                VMOVeS(q0, 0, v0, 0);
            } else {
                grab_segdata(dyn, addr, ninst, x4, seg);
@@ -178,7 +178,7 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            GETG;
            v0 = sse_get_reg_empty(dyn, ninst, x1, gd);
            if(MODREG) {
-                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3));
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);
                VMOVQ(v0, v1);
            } else {
                grab_segdata(dyn, addr, ninst, x4, seg);
@@ -577,14 +577,14 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            switch((nextop>>3)&7) {
                case 0:
                    INST_NAME("ROL Ed, 1");
-                    SETFLAGS(X_OF|X_CF, SF_SUBSET);
+                    SETFLAGS(X_OF|X_CF, SF_SUBSET_PENDING);
                    GETEDO(x6, 0);
                    emit_rol32c(dyn, ninst, rex, ed, 1, x3, x4);
                    WBACKO(x6);
                    break;
                case 1:
                    INST_NAME("ROR Ed, 1");
-                    SETFLAGS(X_OF|X_CF, SF_SUBSET);
+                    SETFLAGS(X_OF|X_CF, SF_SUBSET_PENDING);
                    GETEDO(x6, 0);
                    emit_ror32c(dyn, ninst, rex, ed, 1, x3, x4);
                    WBACKO(x6);
@@ -923,14 +923,14 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            switch((nextop>>3)&7) {
                case 0: // INC Ed
                    INST_NAME("INC Ed");
-                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET);
+                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING);
                    GETEDO(x6, 0);
                    emit_inc32(dyn, ninst, rex, ed, x3, x4);
                    WBACKO(x6);
                    break;
                case 1: //DEC Ed
                    INST_NAME("DEC Ed");
-                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET);
+                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING);
                    GETEDO(x6, 0);
                    emit_dec32(dyn, ninst, rex, ed, x3, x4);
                    WBACKO(x6);
@@ -945,13 +945,12 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                        SETFLAGS(X_ALL, SF_SET);    //Hack to put flag in "don't care" state
                    }
                    GETEDOx(x6, 0);
-                    BARRIER(1);
-                    BARRIER_NEXT(1);
+                    BARRIER(BARRIER_FLOAT);
                    if(!dyn->insts || ninst==dyn->size-1) {
                        *need_epilog = 0;
                        *ok = 0;
                    }
-                    GETIP(addr);
+                    GETIP_(addr);
                    PUSH1(xRIP);
                    jump_to_next(dyn, 0, ed, ninst);
                    break;
diff --git a/src/dynarec/arm64/dynarec_arm64_66.c b/src/dynarec/arm64/dynarec_arm64_66.c
index 3135c399..b3b34b49 100755
--- a/src/dynarec/arm64/dynarec_arm64_66.c
+++ b/src/dynarec/arm64/dynarec_arm64_66.c
@@ -867,14 +867,14 @@ uintptr_t dynarec64_66(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
            switch((nextop>>3)&7) {
                case 0:
                    INST_NAME("INC Ew");
-                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET);
+                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING);
                    GETEW(x1, 0);
                    emit_inc16(dyn, ninst, x1, x2, x4);
                    EWBACK;
                    break;
                case 1:
                    INST_NAME("DEC Ew");
-                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET);
+                    SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING);
                    GETEW(x1, 0);
                    emit_dec16(dyn, ninst, x1, x2, x4);
                    EWBACK;
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index b90e49a9..ae6b04d0 100755
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -23,9 +23,9 @@
 #include "dynarec_arm64_helper.h"

 // Get EX as a quad
-#define GETEX(a, D) \
+#define GETEX(a, w, D) \
    if(MODREG) { \
-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); \
+        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w); \
    } else { \
        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, 0, D); \
        a = fpu_get_scratch(dyn); \
@@ -34,9 +34,9 @@
 #define GETG gd = ((nextop&0x38)>>3)+(rex.r<<3)

-#define GETGX(a) \
+#define GETGX(a, w) \
    gd = ((nextop&0x38)>>3)+(rex.r<<3); \
-    a = sse_get_reg(dyn, ninst, x1, gd)
+    a = sse_get_reg(dyn, ninst, x1, gd, w)

 #define GETGX_empty(a) \
    gd = ((nextop&0x38)>>3)+(rex.r<<3); \
@@ -77,7 +77,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
            nextop = F8;
            GETG;
            if(MODREG) {
-                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3));
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);
                v0 = sse_get_reg_empty(dyn, ninst, x1, gd);
                VMOVQ(v0, v1);
            } else {
@@ -90,7 +90,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
            INST_NAME("MOVUPD Ex,Gx");
            nextop = F8;
            GETG;
-            v0 = sse_get_reg(dyn, ninst, x1, gd);
+            v0 = sse_get_reg(dyn, ninst, x1, gd, 0);
            if(MODREG) {
                v1 = sse_get_reg_empty(dyn, ninst, x1, (nextop&7)+(rex.b<<3));
                VMOVQ(v1, v0);
@@ -102,7 +102,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0x12:
            INST_NAME("MOVLPD Gx, Eq");
            nextop = F8;
-            GETGX(v0);
+            GETGX(v0, 1);
            if(MODREG) {
                // access register instead of memory is bad opcode!
                DEFAULT;
@@ -114,7 +114,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0x13:
            INST_NAME("MOVLPD Eq, Gx");
            nextop = F8;
-            GETGX(v0);
+            GETGX(v0, 0);
            if(MODREG) {
                // access register instead of memory is bad opcode!
                DEFAULT;
@@ -126,9 +126,9 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0x14:
            INST_NAME("UNPCKLPD Gx, Ex");
            nextop = F8;
-            GETGX(v0);
+            GETGX(v0, 1);
            if(MODREG) {
-                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3));
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);
                VMOVeD(v0, 1, v1, 0);
            } else {
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0);
@@ -138,10 +138,10 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0x15:
            INST_NAME("UNPCKHPD Gx, Ex");
            nextop = F8;
-            GETGX(v0);
+            GETGX(v0, 1);
            VMOVeD(v0, 0, v0, 1);
            if(MODREG) {
-                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3));
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1);
                VMOVeD(v0, 1, v1, 1);
            } else {
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0);
@@ -153,7 +153,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0x16:
            INST_NAME("MOVHPD Gx, Eq");
            nextop = F8;
-            GETGX(v0);
+            GETGX(v0, 1);
            if(MODREG) {
                // access register instead of memory is bad opcode!
                DEFAULT;
@@ -165,7 +165,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0x17:
            INST_NAME("MOVHPD Eq, Gx");
            nextop = F8;
-            GETGX(v0);
+            GETGX(v0, 0);
            if(MODREG) {
                // access register instead of memory is bad opcode!
                DEFAULT;
@@ -187,7 +187,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
            GETG;
            if(MODREG) {
                ed = (nextop&7)+(rex.b<<3);
-                v1 = sse_get_reg(dyn, ninst, x1, ed);
+                v1 = sse_get_reg(dyn, ninst, x1, ed, 0);
                v0 = sse_get_reg_empty(dyn, ninst, x1, gd);
                VMOVQ(v0, v1);
            } else {
@@ -200,7 +200,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
            INST_NAME("MOVAPD Ex,Gx");
            nextop = F8;
            GETG;
-            v0 = sse_get_reg(dyn, ninst, x1, gd);
+            v0 = sse_get_reg(dyn, ninst, x1, gd, 0);
            if(MODREG) {
                ed = (nextop&7)+(rex.b<<3);
                v1 = sse_get_reg_empty(dyn, ninst, x1, ed);
@@ -217,8 +217,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
            if(opcode==0x2F) {INST_NAME("COMISD Gx, Ex");} else {INST_NAME("UCOMISD Gx, Ex");}
            SETFLAGS(X_ALL, SF_SET);
            nextop = F8;
-            GETGX(v0);
-            GETEX(q0, 0);
+            GETGX(v0, 0);
+            GETEX(q0, 0, 0);
            FCMPD(v0, q0);
            FCOMI(x1, x2);
            break;
@@ -229,8 +229,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x00:
                    INST_NAME("PSHUFB Gx, Ex");
                    nextop = F8;
-                    GETGX(q0);
-                    GETEX(q1, 0);
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 0);
                    d0 = fpu_get_scratch(dyn);
                    MOVIQ_8(d0, 0b10001111);
                    VANDQ(d0, d0, q1);  // mask the index
@@ -239,23 +239,23 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x01:
                    INST_NAME("PHADDW Gx, Ex");
                    nextop = F8;
-                    GETGX(q0);
-                    GETEX(q1, 0);
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 0);
                    VADDPQ_16(q0, q0, q1);
                    break;
                case 0x02:
                    INST_NAME("PHADDD Gx, Ex");
                    nextop = F8;
-                    GETGX(q0);
-                    GETEX(q1, 0);
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 0);
                    VADDPQ_32(q0, q0, q1);
                    break;
                case 0x04:
                    INST_NAME("PMADDUBSW Gx,Ex");
                    nextop = F8;
-                    GETGX(q0);
-                    GETEX(q1, 0);
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 0);
                    v0 = fpu_get_scratch(dyn);
                    v1 = fpu_get_scratch(dyn);
                    UXTL_8(v0, q0);   // this is unsigned, so 0 extended
@@ -273,8 +273,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x08:
                    INST_NAME("PSIGNB Gx, Ex");
                    nextop = F8;
-                    GETGX(q0);
-                    GETEX(q1, 0);
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 0);
                    v1 = fpu_get_scratch(dyn);
                    v0 = fpu_get_scratch(dyn);
                    NEGQ_8(v0, q0);     // get NEG
@@ -288,8 +288,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x09:
                    INST_NAME("PSIGNW Gx, Ex");
                    nextop = F8;
-                    GETGX(q0);
-                    GETEX(q1, 0);
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 0);
                    v1 = fpu_get_scratch(dyn);
                    v0 = fpu_get_scratch(dyn);
                    NEGQ_16(v0, q0);    // get NEG
@@ -303,8 +303,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x0A:
                    INST_NAME("PSIGND Gx, Ex");
                    nextop = F8;
-                    GETGX(q0);
-                    GETEX(q1, 0);
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 0);
                    v1 = fpu_get_scratch(dyn);
                    v0 = fpu_get_scratch(dyn);
                    NEGQ_32(v0, q0);    // get NEG
@@ -318,47 +318,44 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x0B:
                    INST_NAME("PMULHRSW Gx,Ex");
                    nextop = F8;
-                    GETGX(q0);
-                    GETEX(q1, 0);
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 0);
                    SQRDMULHQ_16(q0, q0, q1);
                    break;
                case 0x1C:
                    INST_NAME("PABSB Gx,Ex");
                    nextop = F8;
-                    GETEX(q1, 0);
-                    GETG;
-                    q0 = sse_get_reg_empty(dyn, ninst, x1, gd);
+                    GETEX(q1, 0, 0);
+                    GETGX_empty(q0);
                    ABSQ_8(q0, q1);
                    break;
                case 0x1D:
                    INST_NAME("PABSW Gx,Ex");
                    nextop = F8;
-                    GETEX(q1, 0);
-                    GETG;
-                    q0 = sse_get_reg_empty(dyn, ninst, x1, gd);
+                    GETEX(q1, 0, 0);
+                    GETGX_empty(q0);
                    ABSQ_16(q0, q1);
                    break;
                case 0x1E:
                    INST_NAME("PABSD Gx,Ex");
                    nextop = F8;
-                    GETEX(q1, 0);
-                    GETG;
-                    q0 = sse_get_reg_empty(dyn, ninst, x1, gd);
+                    GETEX(q1, 0, 0);
+                    GETGX_empty(q0);
                    ABSQ_32(q0, q1);
                    break;
                case 0x20:
                    INST_NAME("PMOVSXBW Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
+                    GETEX(q1, 0, 0);
                    GETGX_empty(q0);
                    SXTL_8(q0, q1);     // 8bits->16bits
                    break;
                case 0x21:
                    INST_NAME("PMOVSXBD Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
+                    GETEX(q1, 0, 0);
                    GETGX_empty(q0);
                    SXTL_8(q0, q1);     // 8bits->16bits
                    SXTL_16(q0, q0);    //16bits->32bits
@@ -366,7 +363,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x22:
                    INST_NAME("PMOVSXBQ Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
+                    GETEX(q1, 0, 0);
                    GETGX_empty(q0);
                    SXTL_8(q0, q1);     // 8bits->16bits
                    SXTL_16(q0, q0);    //16bits->32bits
@@ -375,14 +372,14 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x23:
                    INST_NAME("PMOVSXWD Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
+                    GETEX(q1, 0, 0);
                    GETGX_empty(q0);
                    SXTL_16(q0, q1);    // 16bits->32bits
                    break;
                case 0x24:
                    INST_NAME("PMOVSXWQ Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
+                    GETEX(q1, 0, 0);
                    GETGX_empty(q0);
                    SXTL_16(q0, q1);    // 16bits->32bits
                    SXTL_32(q0, q0);    // 32bits->64bits
@@ -390,7 +387,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x25:
                    INST_NAME("PMOVSXDQ Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
+                    GETEX(q1, 0, 0);
                    GETGX_empty(q0);
                    SXTL_32(q0, q1);    // 32bits->64bits
                    break;
@@ -398,14 +395,14 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x30:
                    INST_NAME("PMOVZXBW Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
+                    GETEX(q1, 0, 0);
                    GETGX_empty(q0);
                    UXTL_8(q0, q1);     // 8bits->16bits
                    break;
                case 0x31:
                    INST_NAME("PMOVZXBD Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
+                    GETEX(q1, 0, 0);
                    GETGX_empty(q0);
                    UXTL_8(q0, q1);     // 8bits->16bits
                    UXTL_16(q0, q0);    //16bits->32bits
@@ -413,7 +410,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x32:
                    INST_NAME("PMOVZXBQ Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
+                    GETEX(q1, 0, 0);
                    GETGX_empty(q0);
                    UXTL_8(q0, q1);     // 8bits->16bits
                    UXTL_16(q0, q0);    //16bits->32bits
@@ -422,14 +419,14 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x33:
                    INST_NAME("PMOVZXWD Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
+                    GETEX(q1, 0, 0);
                    GETGX_empty(q0);
                    UXTL_16(q0, q1);    // 16bits->32bits
                    break;
                case 0x34:
                    INST_NAME("PMOVZXWQ Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
+                    GETEX(q1, 0, 0);
                    GETGX_empty(q0);
                    UXTL_16(q0, q1);    // 16bits->32bits
                    UXTL_32(q0, q0);    // 32bits->64bits
@@ -437,7 +434,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x35:
                    INST_NAME("PMOVZXDQ Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
+                    GETEX(q1, 0, 0);
                    GETGX_empty(q0);
                    UXTL_32(q0, q1);    // 32bits->64bits
                    break;
@@ -445,16 +442,16 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                case 0x39:
                    INST_NAME("PMINSD Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
-                    GETGX(q0);
+                    GETEX(q1, 0, 0);
+                    GETGX(q0, 1);
                    SMINQ_32(q0, q0, q1);
                    break;
                case 0x3D:
                    INST_NAME("PMAXSD Gx, Ex");  // SSE4 opcode!
                    nextop = F8;
-                    GETEX(q1, 0);
-                    GETGX(q0);
+                    GETEX(q1, 0, 0);
+                    GETGX(q0, 1);
                    SMAXQ_32(q0, q0, q1);
                    break;
@@ -462,11 +459,11 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                    INST_NAME("AESIMC Gx, Ex");  // AES-NI
                    nextop = F8;
                    if(arm64_aes) {
-                        GETEX(q1, 0);
+                        GETEX(q1, 0, 0);
                        GETGX_empty(q0);
                        AESIMC(q0, q1);
                    } else {
-                        GETEX(q1, 0);
+                        GETEX(q1, 0, 0);
                        GETGX_empty(q0);
                        if(q0!=q1) {
                            VMOVQ(q0, q1);
@@ -480,8 +477,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                    INST_NAME("AESENC Gx, Ex");  // AES-NI
                    nextop = F8;
                    if(arm64_aes) {
-                        GETEX(q1, 0);
-                        GETGX(q0);
+                        GETEX(q1, 0, 0);
+                        GETGX(q0, 1);
                        v0 = fpu_get_scratch(dyn);  // ARM64 internal operation differs a bit from x86_64
                        VEORQ(v0, q0, q1);
                        AESE(v0, q1);
@@ -492,8 +489,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                        sse_forget_reg(dyn, ninst, gd);
                        MOV32w(x1, gd);
                        CALL(arm_aese, -1);
-                        GETGX(q0);
-                        GETEX(q1, 0);
+                        GETGX(q0, 1);
+                        GETEX(q1, 0, 0);
                        VEORQ(q0, q0, q1);
                    }
                    break;
@@ -501,8 +498,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                    INST_NAME("AESENCLAST Gx, Ex");  // AES-NI
                    nextop = F8;
                    if(arm64_aes) {
-                        GETEX(q1, 0);
-                        GETGX(q0);
+                        GETEX(q1, 0, 0);
+                        GETGX(q0, 1);
                        v0 = fpu_get_scratch(dyn);  // ARM64 internal operation differs a bit from x86_64
                        VEORQ(v0, q0, q1);
                        AESE(v0, q1);
@@ -512,8 +509,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                        sse_forget_reg(dyn, ninst, gd);
                        MOV32w(x1, gd);
                        CALL(arm_aeselast, -1);
-                        GETGX(q0);
-                        GETEX(q1, 0);
+                        GETGX(q0, 1);
+                        GETEX(q1, 0, 0);
                        VEORQ(q0, q0, q1);
                    }
                    break;
@@ -521,8 +518,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                    INST_NAME("AESDEC Gx, Ex");  // AES-NI
                    nextop = F8;
                    if(arm64_aes) {
-                        GETEX(q1, 0);
-                        GETGX(q0);
+                        GETEX(q1, 0, 0);
+                        GETGX(q0, 1);
                        v0 = fpu_get_scratch(dyn);  // ARM64 internal operation differs a bit from x86_64
                        VEORQ(v0, q0, q1);
                        AESD(v0, q1);
@@ -533,8 +530,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                        sse_forget_reg(dyn, ninst, gd);
                        MOV32w(x1, gd);
                        CALL(arm_aesd, -1);
-                        GETGX(q0);
-                        GETEX(q1, 0);
+                        GETGX(q0, 1);
+                        GETEX(q1, 0, 0);
                        VEORQ(q0, q0, q1);
                    }
                    break;
@@ -542,8 +539,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
INST_NAME("AESDECLAST Gx, Ex"); // AES-NI nextop = F8; if(arm64_aes) { - GETEX(q1, 0); - GETGX(q0); + GETEX(q1, 0, 0); + GETGX(q0, 1); v0 = fpu_get_scratch(dyn); // ARM64 internal operation differs a bit from x86_64 VEORQ(v0, q0, q1); AESD(v0, q1); @@ -553,8 +550,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n sse_forget_reg(dyn, ninst, gd); MOV32w(x1, gd); CALL(arm_aesdlast, -1); - GETGX(q0); - GETEX(q1, 0); + GETGX(q0, 1); + GETEX(q1, 0, 0); VEORQ(q0, q0, q1); } break; @@ -570,8 +567,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x0B: INST_NAME("ROUNDSD Gx, Ex, Ib"); nextop = F8; - GETGX(q0); - GETEX(q1, 1); + GETGX(q0, 1); + GETEX(q1, 0, 1); u8 = F8; v1 = fpu_get_scratch(dyn); if(u8&4) { @@ -589,8 +586,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x0F: INST_NAME("PALIGNR Gx, Ex, Ib"); nextop = F8; - GETGX(q0); - GETEX(q1, 1); + GETGX(q0, 1); + GETEX(q1, 0, 1); u8 = F8; if(u8>31) { VEORQ(q0, q0, q0); @@ -606,7 +603,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x16: if(rex.w) {INST_NAME("PEXTRQ Ed, Gx, Ib");} else {INST_NAME("PEXTRD Ed, Gx, Ib");} nextop = F8; - GETGX(q0); + GETGX(q0, 0); GETED(1); u8 = F8; if(rex.w) { @@ -619,7 +616,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x22: INST_NAME("PINSRD Gx, ED, Ib"); nextop = F8; - GETGX(q0); + GETGX(q0, 1); GETED(1); u8 = F8; if(rex.w) { @@ -655,7 +652,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x50: nextop = F8; INST_NAME("PMOVMSKD Gd, Ex"); - GETEX(q0, 0); + GETEX(q0, 0, 0); GETGD; VMOVQDto(x1, q0, 1); VMOVQDto(gd, q0, 0); @@ -667,43 +664,43 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x54: INST_NAME("ANDPD Gx, Ex"); nextop = F8; - GETEX(q0, 0); - GETGX(v0); + GETEX(q0, 0,0); + GETGX(v0, 1); VANDQ(v0, v0, q0); break; case 0x55: INST_NAME("ANDNPD Gx, Ex"); nextop = F8; - GETEX(q0, 0); - GETGX(v0); + GETEX(q0, 0, 0); + GETGX(v0, 1); VBICQ(v0, q0, v0); break; case 0x56: INST_NAME("ORPD Gx, Ex"); nextop = F8; - GETEX(q0, 0); - GETGX(v0); + GETEX(q0, 0, 0); + GETGX(v0, 1); VORRQ(v0, v0, q0); break; case 0x57: INST_NAME("XORPD Gx, Ex"); nextop = F8; - GETEX(q0, 0); - GETGX(v0); + GETEX(q0, 0, 0); + GETGX(v0, 1); VEORQ(v0, v0, q0); break; case 0x58: INST_NAME("ADDPD Gx, Ex"); nextop = F8; - GETEX(q0, 0); - GETGX(v0); + GETEX(q0, 0, 0); + GETGX(v0, 1); VFADDQD(v0, v0, q0); break; case 0x59: INST_NAME("MULPD Gx, Ex"); nextop = F8; - GETEX(q0, 0); - GETGX(q1); + GETEX(q0, 0, 0); + GETGX(q1, 1); if(!box64_dynarec_fastnan) { v0 = fpu_get_scratch(dyn); v1 = fpu_get_scratch(dyn); @@ -722,14 +719,14 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x5A: INST_NAME("CVTPD2PS Gx, Ex"); nextop = F8; - GETEX(v1, 0); + GETEX(v1, 0, 0); GETGX_empty(v0); FCVTXN(v0, v1); break; case 0x5B: INST_NAME("CVTPS2DQ Gx, Ex"); nextop = F8; - GETEX(v1, 0); + GETEX(v1, 0, 0); GETGX_empty(v0); #ifdef PRECISE_CVT LDRH_U12(x1, xEmu, offsetof(x64emu_t, mxcsr)); @@ -754,22 +751,22 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x5C: INST_NAME("SUBPD Gx, Ex"); nextop = F8; - GETEX(q0, 0); - GETGX(v0); + GETEX(q0, 0, 0); + GETGX(v0, 1); VFSUBQD(v0, v0, q0); break; case 0x5D: INST_NAME("MINPD Gx, Ex"); nextop = F8; - GETEX(q0, 0); - GETGX(v0); + GETEX(q0, 0, 0); + GETGX(v0, 1); VFMINQD(v0, 
v0, q0); break; case 0x5E: INST_NAME("DIVPD Gx, Ex"); nextop = F8; - GETEX(q0, 0); - GETGX(q1); + GETEX(q0, 0, 0); + GETGX(q1, 1); if(!box64_dynarec_fastnan) { v0 = fpu_get_scratch(dyn); v1 = fpu_get_scratch(dyn); @@ -788,36 +785,36 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x5F: INST_NAME("MAXPD Gx, Ex"); nextop = F8; - GETEX(q0, 0); - GETGX(v0); + GETEX(q0, 0, 0); + GETGX(v0, 1); VFMAXQD(v0, v0, q0); break; case 0x60: INST_NAME("PUNPCKLBW Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); VZIP1Q_8(v0, v0, q0); break; case 0x61: INST_NAME("PUNPCKLWD Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); VZIP1Q_16(v0, v0, q0); break; case 0x62: INST_NAME("PUNPCKLDQ Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); VZIP1Q_32(v0, v0, q0); break; case 0x63: INST_NAME("PACKSSWB Gx,Ex"); nextop = F8; - GETGX(q0); - GETEX(q1, 0); + GETGX(q0, 1); + GETEX(q1, 0, 0); SQXTN_8(q0, q0); if(q0==q1) { VMOVeD(q0, 1, q0, 0); @@ -828,29 +825,29 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x64: INST_NAME("PCMPGTB Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); VCMGTQ_8(v0, v0, v1); break; case 0x65: INST_NAME("PCMPGTW Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); VCMGTQ_16(v0, v0, v1); break; case 0x66: INST_NAME("PCMPGTD Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); VCMGTQ_32(v0, v0, v1); break; case 0x67: INST_NAME("PACKUSWB Gx, Ex"); nextop = F8; - GETGX(v0); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); SQXTUN_8(v0, v0); if(v0==v1) { VMOVeD(v0, 1, v0, 0); @@ -861,29 +858,29 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x68: INST_NAME("PUNPCKHBW Gx,Ex"); nextop = F8; - GETGX(q0); - GETEX(q1, 1); + GETGX(q0, 1); + GETEX(q1, 0, 1); VZIP2Q_8(q0, q0, q1); break; case 0x69: INST_NAME("PUNPCKHWD Gx,Ex"); nextop = F8; - GETGX(q0); - GETEX(q1, 1); + GETGX(q0, 1); + GETEX(q1, 0, 1); VZIP2Q_16(q0, q0, q1); break; case 0x6A: INST_NAME("PUNPCKHDQ Gx,Ex"); nextop = F8; - GETGX(q0); - GETEX(q1, 1); + GETGX(q0, 1); + GETEX(q1, 0, 1); VZIP2Q_32(q0, q0, q1); break; case 0x6B: INST_NAME("PACKSSDW Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); SQXTN_16(v0, v0); if(v0==v1) { VMOVeD(v0, 1, v0, 0); @@ -894,9 +891,9 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x6C: INST_NAME("PUNPCKLQDQ Gx,Ex"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); if(MODREG) { - v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); + v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); VMOVeD(v0, 1, v1, 0); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0); @@ -906,10 +903,10 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x6D: INST_NAME("PUNPCKHQDQ Gx,Ex"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); VMOVeD(v0, 0, v0, 1); if(MODREG) { - v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); + v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); VMOVeD(v0, 1, v1, 1); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0); @@ -920,9 +917,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x6E: INST_NAME("MOVD Gx, Ed"); nextop = F8; - GETG; + 
GETGX_empty(v0); GETED(0); - v0 = sse_get_reg_empty(dyn, ninst, x1, gd); if(rex.w) { FMOVDx(v0, ed); } else { @@ -933,12 +929,12 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x6F: INST_NAME("MOVDQA Gx,Ex"); nextop = F8; - GETG; - v0 = sse_get_reg_empty(dyn, ninst, x1, gd); if(MODREG) { - v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); + v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); + GETGX_empty(v0); VMOVQ(v0, v1); } else { + GETGX_empty(v0); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, 0, 0); VLDR128_U12(v0, ed, fixedaddress); } @@ -946,12 +942,11 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x70: INST_NAME("PSHUFD Gx,Ex,Ib"); nextop = F8; - GETG; i32 = -1; if(MODREG) { u8 = F8; - v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); - v0 = sse_get_reg_empty(dyn, ninst, x1, gd); + v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); + GETGX_empty(v0); if(u8==0x4E) { if(v0==v1) { VEXTQ_8(v0, v0, v0, 8); // Swap Up/Lower 64bits parts @@ -1013,7 +1008,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n VTBLQ1_8(v0, v1, d0); } } else { - v0 = sse_get_reg_empty(dyn, ninst, x1, gd); + GETGX_empty(v0); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 1); u8 = F8; if (u8) { @@ -1035,7 +1030,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n switch((nextop>>3)&7) { case 2: INST_NAME("PSRLW Ex, Ib"); - GETEX(q0, 1); + GETEX(q0, 1, 1); u8 = F8; if(u8) { if (u8>15) { @@ -1050,7 +1045,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n break; case 4: INST_NAME("PSRAW Ex, Ib"); - GETEX(q0, 1); + GETEX(q0, 1, 1); u8 = F8; if(u8>15) u8=15; if(u8) { @@ -1062,7 +1057,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n break; case 6: INST_NAME("PSLLW Ex, Ib"); - GETEX(q0, 1); + GETEX(q0, 1, 1); u8 = F8; if(u8) { if (u8>15) { @@ -1085,7 +1080,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n switch((nextop>>3)&7) { case 2: INST_NAME("PSRLD Ex, Ib"); - GETEX(q0, 1); + GETEX(q0, 1, 1); u8 = F8; if(u8) { if (u8>31) { @@ -1100,7 +1095,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n break; case 4: INST_NAME("PSRAD Ex, Ib"); - GETEX(q0, 1); + GETEX(q0, 1, 1); u8 = F8; if(u8>31) u8=31; if(u8) { @@ -1112,7 +1107,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n break; case 6: INST_NAME("PSLLD Ex, Ib"); - GETEX(q0, 1); + GETEX(q0, 1, 1); u8 = F8; if(u8) { if (u8>31) { @@ -1134,7 +1129,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n switch((nextop>>3)&7) { case 2: INST_NAME("PSRLQ Ex, Ib"); - GETEX(q0, 1); + GETEX(q0, 1, 1); u8 = F8; if(u8) { if (u8>63) { @@ -1149,7 +1144,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n break; case 3: INST_NAME("PSRLDQ Ex, Ib"); - GETEX(q0, 1); + GETEX(q0, 1, 1); u8 = F8; if(u8) { if(u8>15) { @@ -1166,7 +1161,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n break; case 6: INST_NAME("PSLLQ Ex, Ib"); - GETEX(q0, 1); + GETEX(q0, 1, 1); u8 = F8; if(u8) { if (u8>63) { @@ -1181,7 +1176,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n break; case 7: INST_NAME("PSLLDQ Ex, Ib"); - GETEX(q0, 1); + GETEX(q0, 1, 1); u8 = F8; 
                    if(u8) {
                        if(u8>15) {
@@ -1204,29 +1199,29 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0x74:
            INST_NAME("PCMPEQB Gx,Ex");
            nextop = F8;
-            GETGX(v0);
-            GETEX(q0, 0);
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
            VCMEQQ_8(v0, v0, q0);
            break;
        case 0x75:
            INST_NAME("PCMPEQW Gx,Ex");
            nextop = F8;
-            GETGX(v0);
-            GETEX(q0, 0);
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
            VCMEQQ_16(v0, v0, q0);
            break;
        case 0x76:
            INST_NAME("PCMPEQD Gx,Ex");
            nextop = F8;
-            GETGX(v0);
-            GETEX(q0, 0);
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
            VCMEQQ_32(v0, v0, q0);
            break;
        case 0x7E:
            INST_NAME("MOVD Ed,Gx");
            nextop = F8;
-            GETGX(v0);
+            GETGX(v0, 0);
            if(rex.w) {
                if(MODREG) {
                    ed = xRAX + (nextop&7) + (rex.b<<3);
@@ -1248,9 +1243,9 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0x7F:
            INST_NAME("MOVDQA Ex,Gx");
            nextop = F8;
-            GETGX(v0);
+            GETGX(v0, 0);
            if(MODREG) {
-                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3));
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1);
                VMOVQ(v1, v0);
            } else {
                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, 0, 0);
@@ -1511,8 +1506,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xC2:
            INST_NAME("CMPPD Gx, Ex, Ib");
            nextop = F8;
-            GETGX(v0);
-            GETEX(v1, 1);
+            GETGX(v0, 1);
+            GETEX(v1, 0, 1);
            u8 = F8;
            switch(u8&7) {
                // the inversion of the params in the comparison is there to handle NaN the same way SSE does
@@ -1543,7 +1538,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xC4:
            INST_NAME("PINSRW Gx,Ed,Ib");
            nextop = F8;
-            GETGX(v0);
+            GETGX(v0, 1);
            if(MODREG) {
                u8 = (F8)&7;
                ed = xRAX+(nextop&7)+(rex.b<<3);
@@ -1559,7 +1554,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
            nextop = F8;
            GETGD;
            if(MODREG) {
-                GETEX(v0, 1);
+                GETEX(v0, 0, 1);
                u8 = (F8)&7;
                VMOVHto(gd, v0, u8);
            } else {
@@ -1571,8 +1566,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xC6:
            INST_NAME("SHUFPD Gx, Ex, Ib");
            nextop = F8;
-            GETGX(v0);
-            GETEX(v1, 1);
+            GETGX(v0, 1);
+            GETEX(v1, 0, 1);
            u8 = F8;
            if(v0==v1 && u8==0) {
                VMOVeD(v0, 1, v0, 0);
@@ -1610,8 +1605,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xD1:
            INST_NAME("PSRLW Gx,Ex");
            nextop = F8;
-            GETGX(q0);
-            GETEX(q1, 0);
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
            v0 = fpu_get_scratch(dyn);
            VDUPQ_16(v0, q1, 0);
            NEGQ_16(v0, v0);    // neg, because SHR
@@ -1620,8 +1615,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xD2:
            INST_NAME("PSRLD Gx,Ex");
            nextop = F8;
-            GETGX(q0);
-            GETEX(q1, 0);
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
            v0 = fpu_get_scratch(dyn);
            VDUPQ_32(v0, q1, 0);
            NEGQ_32(v0, v0);    // neg, because SHR
@@ -1630,8 +1625,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xD3:
            INST_NAME("PSRLQ Gx,Ex");
            nextop = F8;
-            GETGX(q0);
-            GETEX(q1, 0);
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
            v0 = fpu_get_scratch(dyn);
            NEG_64(v0, q1);
            VMOVeD(v0, 1, v0, 0);
@@ -1640,22 +1635,21 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xD4:
            INST_NAME("PADDQ Gx,Ex");
            nextop = F8;
-            GETGX(v0);
-            GETEX(q0, 0);
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
            VADDQ_64(v0, v0, q0);
            break;
        case 0xD5:
            INST_NAME("PMULLW Gx,Ex");
            nextop = F8;
-            GETGX(q0);
-            GETEX(q1, 0);
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
            VMULQ_16(q0, q0, q1);
            break;
        case 0xD6:
            INST_NAME("MOVQ Ex, Gx");
            nextop = F8;
-            GETG;
-            v0 = sse_get_reg(dyn, ninst, x1, gd);
+            GETGX(v0, 0);
            if(MODREG) {
                v1 = sse_get_reg_empty(dyn, ninst, x1, (nextop&7) + (rex.b<<3));
                FMOVD(v1, v0);
@@ -1670,7 +1664,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
            v0 = fpu_get_scratch(dyn);
            v1 = fpu_get_scratch(dyn);
            q1 = fpu_get_scratch(dyn);
-            GETEX(q0, 0);
+            GETEX(q0, 0, 0);
            GETGD;
            TABLE64(x1, (uintptr_t)&mask_shift8);
            VLDR64_U12(v0, x1, 0);     // load shift
@@ -1690,73 +1684,73 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xD8:
            INST_NAME("PSUBUSB Gx, Ex");
            nextop = F8;
-            GETGX(q0);
-            GETEX(q1, 0);
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
            UQSUBQ_8(q0, q0, q1);
            break;
        case 0xD9:
            INST_NAME("PSUBUSW Gx, Ex");
            nextop = F8;
-            GETGX(q0);
-            GETEX(q1, 0);
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
            UQSUBQ_16(q0, q0, q1);
            break;
        case 0xDA:
            INST_NAME("PMINUB Gx, Ex");
            nextop = F8;
-            GETGX(q0);
-            GETEX(q1,0);
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
            UMINQ_8(q0, q0, q1);
            break;
        case 0xDB:
            INST_NAME("PAND Gx,Ex");
            nextop = F8;
-            GETGX(v0);
-            GETEX(q0, 0);
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
            VANDQ(v0, v0, q0);
            break;
        case 0xDC:
            INST_NAME("PADDUSB Gx,Ex");
            nextop = F8;
-            GETGX(q0);
-            GETEX(q1, 0);
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
            UQADDQ_8(q0, q0, q1);
            break;
        case 0xDD:
            INST_NAME("PADDUSW Gx,Ex");
            nextop = F8;
-            GETGX(q0);
-            GETEX(q1, 0);
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
            UQADDQ_16(q0, q0, q1);
            break;
        case 0xDE:
            INST_NAME("PMAXUB Gx, Ex");
            nextop = F8;
-            GETGX(q0);
-            GETEX(q1, 0);
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
            UMAXQ_8(q0, q0, q1);
            break;
        case 0xDF:
            INST_NAME("PANDN Gx,Ex");
            nextop = F8;
-            GETGX(v0);
-            GETEX(q0, 0);
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
            VBICQ(v0, q0, v0);
            break;
        case 0xE0:
            INST_NAME("PAVGB Gx, Ex");
            nextop = F8;
-            GETGX(v0);
-            GETEX(v1, 0);
+            GETGX(v0, 1);
+            GETEX(v1, 0, 0);
            URHADDQ_8(v0, v0, v1);
            break;
        case 0xE1:
            INST_NAME("PSRAW Gx,Ex");
            nextop = F8;
-            GETGX(q0);
-            GETEX(q1, 0);
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
            v0 = fpu_get_scratch(dyn);
            VMOVeD(v0, 0, q1, 0);
            VMOVeD(v0, 1, q1, 0);
@@ -1770,8 +1764,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xE2:
            INST_NAME("PSRAD Gx,Ex");
            nextop = F8;
-            GETGX(q0);
-            GETEX(q1, 0);
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
            v0 = fpu_get_scratch(dyn);
            VMOVeD(v0, 0, q1, 0);
            VMOVeD(v0, 1, q1, 0);
@@ -1783,15 +1777,15 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xE3:
            INST_NAME("PAVGW Gx,Ex");
            nextop = F8;
-            GETGX(v0);
-            GETEX(q0, 0);
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
            URHADDQ_16(v0, v0, q0);
            break;
        case 0xE4:
            INST_NAME("PMULHUW Gx,Ex");
            nextop = F8;
-            GETGX(v0);
-            GETEX(v1, 0);
+            GETGX(v0, 1);
+            GETEX(v1, 0, 0);
            q0 = fpu_get_scratch(dyn);
            q1 = fpu_get_scratch(dyn);
            VUMULL_16(q0, v0, v1);
@@ -1802,8 +1796,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xE5:
            INST_NAME("PMULHW Gx,Ex");
            nextop = F8;
-            GETGX(v0);
-            GETEX(v1, 0);
+            GETGX(v0, 1);
+            GETEX(v1, 0, 0);
            q0 = fpu_get_scratch(dyn);
            q1 = fpu_get_scratch(dyn);
            VSMULL_16(q0, v0, v1);
@@ -1814,7 +1808,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xE6:
            INST_NAME("CVTTPD2DQ Gx, Ex");
            nextop = F8;
-            GETEX(v1, 0);
+            GETEX(v1, 0, 0);
            GETGX_empty(v0);
            VFCVTNSQD(v0, v1);  // convert double -> int64
            SQXTN_32(v0, v0);   // convert int64 -> int32 with saturation in lower part, RaZ high part
@@ -1822,7 +1816,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
        case 0xE7:
            INST_NAME("MOVNTDQ Ex, Gx");
            nextop = F8;
- GETGX(v0); + GETGX(v0, 0); if(MODREG) { v1 = sse_get_reg_empty(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); VMOVQ(v1, v0); @@ -1834,50 +1828,50 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0xE8: INST_NAME("PSUBSB Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); SQSUBQ_8(v0, v0, q0); break; case 0xE9: INST_NAME("PSUBSW Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); SQSUBQ_16(v0, v0, q0); break; case 0xEA: INST_NAME("PMINSW Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); SMINQ_16(v0, v0, q0); break; case 0xEB: INST_NAME("POR Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); VORRQ(v0, v0, q0); break; case 0xEC: INST_NAME("PADDSB Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); SQADDQ_8(v0, v0, q0); break; case 0xED: INST_NAME("PADDSW Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); SQADDQ_16(v0, v0, q0); break; case 0xEE: INST_NAME("PMAXSW Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); SMAXQ_16(v0, v0, q0); break; case 0xEF: @@ -1889,8 +1883,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n q0 = sse_get_reg_empty(dyn, ninst, x1, gd); VEORQ(q0, q0, q0); } else { - q0 = sse_get_reg(dyn, ninst, x1, gd); - GETEX(q1, 0); + q0 = sse_get_reg(dyn, ninst, x1, gd, 1); + GETEX(q1, 0, 0); VEORQ(q0, q0, q1); } break; @@ -1898,8 +1892,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0xF2: INST_NAME("PSLLD Gx,Ex"); nextop = F8; - GETGX(q0); - GETEX(q1, 0); + GETGX(q0, 1); + GETEX(q1, 0, 0); v0 = fpu_get_scratch(dyn); VMOVeD(v0, 0, q1, 0); VMOVeD(v0, 1, q1, 0); @@ -1910,8 +1904,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0xF3: INST_NAME("PSLLQ Gx,Ex"); nextop = F8; - GETGX(q0); - GETEX(q1, 0); + GETGX(q0, 1); + GETEX(q1, 0, 0); v0 = fpu_get_scratch(dyn); VMOVQ(v0, q1); VMOVeD(v0, 1, v0, 0); @@ -1920,8 +1914,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0xF4: INST_NAME("PMULUDQ Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); q0 = fpu_get_scratch(dyn); VUZP1Q_32(q0, v0, v0); //A3 A2 A1 A0 -> A3 A1 A2 A0 if(MODREG) { @@ -1935,8 +1929,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0xF5: INST_NAME("PMADDWD Gx, Ex"); nextop = F8; - GETGX(v0); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); q0 = fpu_get_scratch(dyn); q1 = fpu_get_scratch(dyn); VSMULL_16(q0, v0, v1); @@ -1946,8 +1940,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0xF6: INST_NAME("PSADBW Gx, Ex"); nextop = F8; - GETGX(q0); - GETEX(q1, 0); + GETGX(q0, 1); + GETEX(q1, 0, 0); d0 = fpu_get_scratch(dyn); d1 = fpu_get_scratch(dyn); VEOR(d1, d1, d1); // is it necessary? 
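The one-instruction bodies above (PSUBUSB as UQSUB, PADDSB as SQADD, and so on) work because NEON's saturating integer ops match the SSE semantics exactly. A host-side check with intrinsics — this is a sanity test, not what the dynarec emits, and it assumes an AArch64 compiler with <arm_neon.h>:

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    int8x16_t  a = vdupq_n_s8(100), b = vdupq_n_s8(100);
    uint8x16_t c = vdupq_n_u8(10),  d = vdupq_n_u8(20);

    int8x16_t  padd = vqaddq_s8(a, b); /* PADDSB: 100+100 clamps to 127 */
    uint8x16_t psub = vqsubq_u8(c, d); /* PSUBUSB: 10-20 clamps to 0 */

    printf("SQADD.8: %d (expect 127)\n", (int)vgetq_lane_s8(padd, 0));
    printf("UQSUB.8: %d (expect 0)\n",   (int)vgetq_lane_u8(psub, 0));
    return 0;
}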
@@ -1961,8 +1955,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0xF7: INST_NAME("MASKMOVDQU Gx, Ex") nextop = F8; - GETGX(q0); - GETEX(q1, 0); + GETGX(q0, 1); + GETEX(q1, 0, 0); v0 = fpu_get_scratch(dyn); VLDR128_U12(v0, xRDI, 0); if(MODREG) @@ -1978,50 +1972,50 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0xF8: INST_NAME("PSUBB Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); VSUBQ_8(v0, v0, q0); break; case 0xF9: INST_NAME("PSUBW Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); VSUBQ_16(v0, v0, q0); break; case 0xFA: INST_NAME("PSUBD Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); VSUBQ_32(v0, v0, q0); break; case 0xFB: INST_NAME("PSUBQ Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); VSUBQ_64(v0, v0, q0); break; case 0xFC: INST_NAME("PADDB Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); VADDQ_8(v0, v0, q0); break; case 0xFD: INST_NAME("PADDW Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); VADDQ_16(v0, v0, q0); break; case 0xFE: INST_NAME("PADDD Gx,Ex"); nextop = F8; - GETGX(v0); - GETEX(q0, 0); + GETGX(v0, 1); + GETEX(q0, 0, 0); VADDQ_32(v0, v0, q0); break; diff --git a/src/dynarec/arm64/dynarec_arm64_6664.c b/src/dynarec/arm64/dynarec_arm64_6664.c index 422c673b..363b99ce 100644 --- a/src/dynarec/arm64/dynarec_arm64_6664.c +++ b/src/dynarec/arm64/dynarec_arm64_6664.c @@ -56,7 +56,7 @@ uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("MOVQ Ex, Gx"); nextop = F8; GETG; - v0 = sse_get_reg(dyn, ninst, x1, gd); + v0 = sse_get_reg(dyn, ninst, x1, gd, 0); if(MODREG) { v1 = sse_get_reg_empty(dyn, ninst, x1, (nextop&7) + (rex.b<<3)); FMOVD(v1, v0); diff --git a/src/dynarec/arm64/dynarec_arm64_67.c b/src/dynarec/arm64/dynarec_arm64_67.c index 0e846784..7b379488 100755 --- a/src/dynarec/arm64/dynarec_arm64_67.c +++ b/src/dynarec/arm64/dynarec_arm64_67.c @@ -22,13 +22,13 @@ #include "dynarec_arm64_helper.h" #include "dynarec_arm64_functions.h" -#define GETGX(a) \ +#define GETGX(a, w) \ gd = ((nextop&0x38)>>3)+(rex.r<<3); \ - a = sse_get_reg(dyn, ninst, x1, gd) + a = sse_get_reg(dyn, ninst, x1, gd, w) #define GETGM(a) \ gd = ((nextop&0x38)>>3); \ - a = mmx_get_reg(dyn, ninst, x1, gd) + a = mmx_get_reg(dyn, ninst, x1, x2, x3, gd) #define GETGm gd = ((nextop&0x38)>>3) @@ -78,9 +78,9 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin if(opcode==0x2F) {INST_NAME("COMISS Gx, Ex");} else {INST_NAME("UCOMISS Gx, Ex");} SETFLAGS(X_ALL, SF_SET); nextop = F8; - GETGX(v0); + GETGX(v0, 0); if(MODREG) { - s0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3)); + s0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 0); } else { s0 = fpu_get_scratch(dyn); addr = geted32(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); @@ -96,11 +96,11 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGm; if(MODREG) { - v1 = mmx_get_reg(dyn, ninst, x1, nextop&7); // no rex.b on MMX - v0 = mmx_get_reg_empty(dyn, ninst, x1, gd); + v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop&7); // no rex.b on MMX + v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd); VMOVeD(v0, 0, v1, 0); } else { - v0 = mmx_get_reg_empty(dyn, ninst, x1, gd); + v0 = mmx_get_reg_empty(dyn, ninst, x1, 
x2, x3, gd); addr = geted32(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, 0); VLDR64_U12(v0, ed, fixedaddress); } @@ -111,7 +111,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin nextop = F8; GETGM(v0); if(MODREG) { - v1 = mmx_get_reg_empty(dyn, ninst, x1, nextop&7); + v1 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, nextop&7); VMOV(v1, v0); } else { addr = geted32(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, 0); @@ -156,7 +156,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin switch((nextop>>3)&7) { case 0: INST_NAME("ROL Ed, Ib"); - SETFLAGS(X_OF|X_CF, SF_SUBSET); + SETFLAGS(X_OF|X_CF, SF_SUBSET_PENDING); GETED32(1); u8 = (F8)&(rex.w?0x3f:0x1f); emit_rol32c(dyn, ninst, rex, ed, u8, x3, x4); @@ -164,7 +164,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 1: INST_NAME("ROR Ed, Ib"); - SETFLAGS(X_OF|X_CF, SF_SUBSET); + SETFLAGS(X_OF|X_CF, SF_SUBSET_PENDING); GETED32(1); u8 = (F8)&(rex.w?0x3f:0x1f); emit_ror32c(dyn, ninst, rex, ed, u8, x3, x4); @@ -224,14 +224,23 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin } break; - #define GO(NO, YES) \ - BARRIER(2); \ - JUMP(addr+i8);\ - if(dyn->insts[ninst].x64.jmp_insts==-1) { \ - /* out of the block */ \ - i32 = dyn->insts[ninst+1].address-(dyn->native_size); \ - Bcond(NO, i32); \ - jump_to_next(dyn, addr+i8, 0, ninst); \ + #define GO(NO, YES) \ + BARRIER(BARRIER_MAYBE); \ + JUMP(addr+i8, 1); \ + if(dyn->insts[ninst].x64.jmp_insts==-1 || \ + CHECK_CACHE()) { \ + /* out of the block */ \ + i32 = dyn->insts[ninst].epilog-(dyn->native_size); \ + Bcond(NO, i32); \ + if(dyn->insts[ninst].x64.jmp_insts==-1) { \ + if(!dyn->insts[ninst].x64.barrier) \ + fpu_purgecache(dyn, ninst, 1, x1, x2, x3); \ + jump_to_next(dyn, addr+i8, 0, ninst); \ + } else { \ + fpuCacheTransform(dyn, ninst, x1, x2, x3); \ + i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size);\ + B(i32); \ + } \ } else { \ /* inside the block */ \ i32 = dyn->insts[dyn->insts[ninst].x64.jmp_insts].address-(dyn->native_size); \ diff --git a/src/dynarec/arm64/dynarec_arm64_d8.c b/src/dynarec/arm64/dynarec_arm64_d8.c index 2a963bb4..a7f8d652 100644 --- a/src/dynarec/arm64/dynarec_arm64_d8.c +++ b/src/dynarec/arm64/dynarec_arm64_d8.c @@ -48,9 +48,13 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xC6: case 0xC7: INST_NAME("FADD ST0, STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FADDD(v1, v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FADDS(v1, v1, v2); + } else { + FADDD(v1, v1, v2); + } break; case 0xC8: case 0xC9: @@ -61,9 +65,13 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xCE: case 0xCF: INST_NAME("FMUL ST0, STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FMULD(v1, v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FMULS(v1, v1, v2); + } else { + FMULD(v1, v1, v2); + } break; case 0xD0: case 0xD1: @@ -74,9 +82,13 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xD6: case 0xD7: INST_NAME("FCOM ST0, 
STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FCMPD(v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FCMPS(v1, v2); + } else { + FCMPD(v1, v2); + } FCOM(x1, x2, x3); break; case 0xD8: @@ -88,11 +100,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xDE: case 0xDF: INST_NAME("FCOMP ST0, STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FCMPD(v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FCMPS(v1, v2); + } else { + FCMPD(v1, v2); + } FCOM(x1, x2, x3); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xE0: case 0xE1: @@ -103,9 +119,13 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xE6: case 0xE7: INST_NAME("FSUB ST0, STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FSUBD(v1, v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FSUBS(v1, v1, v2); + } else { + FSUBD(v1, v1, v2); + } break; case 0xE8: case 0xE9: @@ -116,9 +136,13 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xEE: case 0xEF: INST_NAME("FSUBR ST0, STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FSUBD(v1, v2, v1); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FSUBS(v1, v2, v1); + } else { + FSUBD(v1, v2, v1); + } break; case 0xF0: case 0xF1: @@ -129,9 +153,13 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xF6: case 0xF7: INST_NAME("FDIV ST0, STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FDIVD(v1, v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FDIVS(v1, v1, v2); + } else { + FDIVD(v1, v1, v2); + } break; case 0xF8: case 0xF9: @@ -142,87 +170,123 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xFE: case 0xFF: INST_NAME("FDIVR ST0, STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FDIVD(v1, v2, v1); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FDIVS(v1, v2, v1); + } else { + FDIVD(v1, v2, v1); + } break; default: switch((nextop>>3)&7) { case 0: INST_NAME("FADD ST0, float[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); s0 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); VLDR32_U12(s0, ed, fixedaddress); - FCVT_D_S(s0, s0); - FADDD(v1, v1, s0); + if(ST_IS_F(0)) { + FADDS(v1, v1, s0); + } else { + FCVT_D_S(s0, s0); + FADDD(v1, v1, s0); + } break; case 1: INST_NAME("FMUL ST0, float[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, 
X87_ST0); s0 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); VLDR32_U12(s0, ed, fixedaddress); - FCVT_D_S(s0, s0); - FMULD(v1, v1, s0); + if(ST_IS_F(0)) { + FMULS(v1, v1, s0); + } else { + FCVT_D_S(s0, s0); + FMULD(v1, v1, s0); + } break; case 2: INST_NAME("FCOM ST0, float[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); s0 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); VLDR32_U12(s0, ed, fixedaddress); - FCVT_D_S(s0, s0); - FCMPD(v1, s0); + if(ST_IS_F(0)) { + FCMPS(v1, s0); + } else { + FCVT_D_S(s0, s0); + FCMPD(v1, s0); + } FCOM(x1, x2, x3); break; case 3: INST_NAME("FCOMP ST0, float[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); s0 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); VLDR32_U12(s0, ed, fixedaddress); - FCVT_D_S(s0, s0); - FCMPD(v1, s0); + if(ST_IS_F(0)) { + FCMPS(v1, s0); + } else { + FCVT_D_S(s0, s0); + FCMPD(v1, s0); + } FCOM(x1, x2, x3); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 4: INST_NAME("FSUB ST0, float[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); s0 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); VLDR32_U12(s0, ed, fixedaddress); - FCVT_D_S(s0, s0); - FSUBD(v1, v1, s0); + if(ST_IS_F(0)) { + FSUBS(v1, v1, s0); + } else { + FCVT_D_S(s0, s0); + FSUBD(v1, v1, s0); + } break; case 5: INST_NAME("FSUBR ST0, float[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); s0 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); VLDR32_U12(s0, ed, fixedaddress); - FCVT_D_S(s0, s0); - FSUBD(v1, s0, v1); + if(ST_IS_F(0)) { + FSUBS(v1, s0, v1); + } else { + FCVT_D_S(s0, s0); + FSUBD(v1, s0, v1); + } break; case 6: INST_NAME("FDIV ST0, float[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); s0 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); VLDR32_U12(s0, ed, fixedaddress); - FCVT_D_S(s0, s0); - FDIVD(v1, v1, s0); + if(ST_IS_F(0)) { + FDIVS(v1, v1, s0); + } else { + FCVT_D_S(s0, s0); + FDIVD(v1, v1, s0); + } break; case 7: INST_NAME("FDIVR ST0, float[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); s0 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); VLDR32_U12(s0, ed, fixedaddress); - FCVT_D_S(s0, s0); - FDIVD(v1, s0, v1); + if(ST_IS_F(0)) { + FDIVS(v1, s0, v1); + } else { + FCVT_D_S(s0, s0); + FDIVD(v1, s0, v1); + } break; default: DEFAULT; diff --git a/src/dynarec/arm64/dynarec_arm64_d9.c b/src/dynarec/arm64/dynarec_arm64_d9.c index cd11b6f2..4a24504f 100644 --- a/src/dynarec/arm64/dynarec_arm64_d9.c +++ b/src/dynarec/arm64/dynarec_arm64_d9.c @@ -50,12 +50,18 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xC6: case 0xC7: INST_NAME("FLD STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - v2 = x87_do_push(dyn, ninst); - FMOVD(v2, v1); + v2 = x87_do_push(dyn, ninst, x1, X87_ST(nextop&7)); + v1 = x87_get_st(dyn, ninst, x1, x2, (nextop&7)+1, X87_COMBINE(0, (nextop&7)+1)); + 
if(ST_IS_F(0)) { + FMOVS(v2, v1); + } else { + FMOVD(v2, v1); + } break; case 0xC8: + INST_NAME("FXCH ST0"); + break; case 0xC9: case 0xCA: case 0xCB: @@ -65,11 +71,8 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xCF: INST_NAME("FXCH STx"); // swap the cache value, not the double value itself :p - i1 = x87_get_cache(dyn, ninst, x1, x2, nextop&7); - i2 = x87_get_cache(dyn, ninst, x1, x2, 0); - i3 = dyn->x87cache[i1]; - dyn->x87cache[i1] = dyn->x87cache[i2]; - dyn->x87cache[i2] = i3; + x87_swapreg(dyn, ninst, x1, x2, 0, nextop&7); + // should set C1 to 0 break; case 0xD0: @@ -78,19 +81,31 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xE0: INST_NAME("FCHS"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - FNEGD(v1, v1); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + if(ST_IS_F(0)) { + FNEGS(v1, v1); + } else { + FNEGD(v1, v1); + } break; case 0xE1: INST_NAME("FABS"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - FABSD(v1, v1); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + if(ST_IS_F(0)) { + FABSS(v1, v1); + } else { + FABSD(v1, v1); + } break; case 0xE4: INST_NAME("FTST"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - FCMPD_0(v1); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + if(ST_IS_F(0)) { + FCMPS_0(v1); + } else { + FCMPD_0(v1); + } FCOM(x1, x2, x3); // same flags... break; case 0xE5: @@ -102,64 +117,44 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xE8: INST_NAME("FLD1"); - v1 = x87_do_push(dyn, ninst); - FTABLE64(v1, 1.0); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_F); + if(ST_IS_F(0)) { + FMOVS_8(v1, 0b01110000); + } else { + FMOVD_8(v1, 0b01110000); + } break; case 0xE9: INST_NAME("FLDL2T"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D); FTABLE64(v1, L2T); break; case 0xEA: INST_NAME("FLDL2E"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D); FTABLE64(v1, L2E); break; case 0xEB: INST_NAME("FLDPI"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D); FTABLE64(v1, PI); break; case 0xEC: INST_NAME("FLDLG2"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D); FTABLE64(v1, LG2); break; case 0xED: INST_NAME("FLDLN2"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D); FTABLE64(v1, LN2); break; case 0xEE: INST_NAME("FLDZ"); - v1 = x87_do_push(dyn, ninst); - FTABLE64(v1, 0.0); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_F); + VEOR(v1, v1, v1); break; - case 0xFA: - INST_NAME("FSQRT"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - FSQRTD(v1, v1); - break; - - case 0xFC: - INST_NAME("FRNDINT"); - MESSAGE(LOG_DUMP, "Need Optimization\n"); - // use C helper for now, nothing staightforward is available - x87_forget(dyn, ninst, x1, x2, 0); - CALL(arm_frndint, -1); - /* - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - VCMP_F64_0(v1); - VMRS_APSR(); - B_NEXT(cVS); // Unordered, skip - B_NEXT(cEQ); // Zero, skip - u8 = x87_setround(dyn, ninst, x1, x2, x3); - VCVT_S32_F64(x1, v1); // limit to 32bits.... 
- VCVT_F64_S32(v1, x1); - x87_restoreround(dyn, ninst, u8); - */ - break; case 0xF0: INST_NAME("F2XM1"); MESSAGE(LOG_DUMP, "Need Optimization\n"); @@ -172,15 +167,19 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin x87_forget(dyn, ninst, x1, x2, 0); x87_forget(dyn, ninst, x1, x2, 1); CALL(arm_fyl2x, -1); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xF2: - INST_NAME("FTAN"); + INST_NAME("FPTAN"); MESSAGE(LOG_DUMP, "Need Optimization\n"); x87_forget(dyn, ninst, x1, x2, 0); CALL(arm_ftan, -1); - v1 = x87_do_push(dyn, ninst); - FTABLE64(v1, 1.0); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_F); + if(ST_IS_F(0)) { + FMOVS_8(v1, 0b01110000); + } else { + FMOVD_8(v1, 0b01110000); + } break; case 0xF3: INST_NAME("FPATAN"); @@ -188,7 +187,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin x87_forget(dyn, ninst, x1, x2, 0); x87_forget(dyn, ninst, x1, x2, 1); CALL(arm_fpatan, -1); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xF4: INST_NAME("FXTRACT"); @@ -206,7 +205,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 0xF6: INST_NAME("FDECSTP"); - fpu_purgecache(dyn, ninst, x1, x2, x3); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); LDRw_U12(x2, xEmu, offsetof(x64emu_t, top)); SUBw_U12(x2, x2, 1); ANDw_mask(x2, x2, 0, 2); //mask=7 @@ -214,7 +213,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 0xF7: INST_NAME("FINCSTP"); - fpu_purgecache(dyn, ninst, x1, x2, x3); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); LDRw_U12(x2, xEmu, offsetof(x64emu_t, top)); ADDw_U12(x2, x2, 1); ANDw_mask(x2, x2, 0, 2); //mask=7 @@ -233,7 +232,16 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin x87_forget(dyn, ninst, x1, x2, 0); x87_forget(dyn, ninst, x1, x2, 1); CALL(arm_fyl2xp1, -1); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); + break; + case 0xFA: + INST_NAME("FSQRT"); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0); + if(ST_IS_F(0)) { + FSQRTS(v1, v1); + } else { + FSQRTD(v1, v1); + } break; case 0xFB: INST_NAME("FSINCOS"); @@ -242,6 +250,24 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin x87_forget(dyn, ninst, x1, x2, 1); CALL(arm_fsincos, -1); break; + case 0xFC: + INST_NAME("FRNDINT"); + MESSAGE(LOG_DUMP, "Need Optimization\n"); + // use C helper for now, nothing staightforward is available + x87_forget(dyn, ninst, x1, x2, 0); + CALL(arm_frndint, -1); + /* + v1 = x87_get_st(dyn, ninst, x1, x2, 0); + VCMP_F64_0(v1); + VMRS_APSR(); + B_NEXT(cVS); // Unordered, skip + B_NEXT(cEQ); // Zero, skip + u8 = x87_setround(dyn, ninst, x1, x2, x3); + VCVT_S32_F64(x1, v1); // limit to 32bits.... 
+ VCVT_F64_S32(v1, x1); + x87_restoreround(dyn, ninst, u8); + */ + break; case 0xFD: INST_NAME("FSCALE"); MESSAGE(LOG_DUMP, "Need Optimization\n"); @@ -288,33 +314,46 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin switch((nextop>>3)&7) { case 0: INST_NAME("FLD ST0, float[ED]"); - v1 = x87_do_push(dyn, ninst); - s0 = fpu_get_scratch(dyn); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_F); + if(ST_IS_F(0)) + s0 = v1; + else + s0 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); VLDR32_U12(s0, ed, fixedaddress); - FCVT_D_S(v1, s0); + if(!ST_IS_F(0)) { + FCVT_D_S(v1, s0); + } break; case 2: INST_NAME("FST float[ED], ST0"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - s0 = fpu_get_scratch(dyn); - FCVT_S_D(s0, v1); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_F); + if(ST_IS_F(0)) + s0 = v1; + else { + s0 = fpu_get_scratch(dyn); + FCVT_S_D(s0, v1); + } addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); VSTR32_U12(s0, ed, fixedaddress); break; case 3: INST_NAME("FSTP float[ED], ST0"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - s0 = fpu_get_scratch(dyn); - FCVT_S_D(s0, v1); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_F); + if(ST_IS_F(0)) + s0 = v1; + else { + s0 = fpu_get_scratch(dyn); + FCVT_S_D(s0, v1); + } addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); VSTR32_U12(s0, ed, fixedaddress); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 4: INST_NAME("FLDENV Ed"); MESSAGE(LOG_DUMP, "Need Optimization\n"); - fpu_purgecache(dyn, ninst, x1, x2, x3); // maybe only x87, not SSE? + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE? addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0); if(ed!=x1) { MOVx_REG(x1, ed); @@ -332,7 +371,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 6: INST_NAME("FNSTENV Ed"); MESSAGE(LOG_DUMP, "Need Optimization\n"); - fpu_purgecache(dyn, ninst, x1, x2, x3); // maybe only x87, not SSE? + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); // maybe only x87, not SSE? 
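The ST_IS_F()/NEON_CACHE_ST_F branches threaded through these x87 hunks let stack slots that only ever see 32-bit data stay cached as singles, so an FLD float / FSTP float round trip drops both FCVTs. The shape of that decision in plain C; st_is_float stands in for the cache query and is illustrative only:

#include <stdio.h>

typedef union { float f; double d; } stslot_t;

/* FLD float[ED]: a single-precision slot takes the VLDR32 directly; a
 * double slot needs the extra FCVT_D_S widening seen above. */
static void fld_float(stslot_t* st0, int st_is_float, float mem)
{
    if (st_is_float)
        st0->f = mem;
    else
        st0->d = (double)mem;
}

/* FSTP float[ED]: symmetric, FCVT_S_D only when the slot is a double. */
static float fstp_float(const stslot_t* st0, int st_is_float)
{
    return st_is_float ? st0->f : (float)st0->d;
}

int main(void)
{
    stslot_t s;
    fld_float(&s, 1, 1.5f);
    printf("%g\n", fstp_float(&s, 1)); /* all-float flow: no converts */
    return 0;
}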
addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0); if(ed!=x1) { MOVx_REG(x1, ed); diff --git a/src/dynarec/arm64/dynarec_arm64_db.c b/src/dynarec/arm64/dynarec_arm64_db.c index cbc2c0ef..1836ce64 100644 --- a/src/dynarec/arm64/dynarec_arm64_db.c +++ b/src/dynarec/arm64/dynarec_arm64_db.c @@ -53,10 +53,14 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xC7: INST_NAME("FCMOVNB ST0, STx"); READFLAGS(X_CF); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); TSTw_mask(xFlags, 0, 0); //mask=1<<F_CF - FCSELD(v1, v2, v1, cEQ); // F_CF==0 + if(ST_IS_F(0)) { + FCSELS(v1, v2, v1, cEQ); + } else { + FCSELD(v1, v2, v1, cEQ); // F_CF==0 + } break; case 0xC8: case 0xC9: @@ -68,10 +72,14 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xCF: INST_NAME("FCMOVNE ST0, STx"); READFLAGS(X_ZF); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); TSTw_mask(xFlags, 0b011010, 0); //mask=1<<F_ZF - FCSELD(v1, v2, v1, cEQ); // F_ZF==0 + if(ST_IS_F(0)) { + FCSELS(v1, v2, v1, cEQ); + } else { + FCSELD(v1, v2, v1, cEQ); // F_ZF==0 + } break; case 0xD0: case 0xD1: @@ -83,11 +91,15 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xD7: INST_NAME("FCMOVNBE ST0, STx"); READFLAGS(X_CF|X_ZF); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); MOV32w(x1, (1<<F_CF)|(1<<F_ZF)); TSTw_REG(xFlags, x1); - FCSELD(v1, v2, v1, cEQ); // F_CF==0 & F_ZF==0 + if(ST_IS_F(0)) { + FCSELS(v1, v2, v1, cEQ); + } else { + FCSELD(v1, v2, v1, cEQ); // F_CF==0 & F_ZF==0 + } break; case 0xD8: case 0xD9: @@ -99,10 +111,14 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xDF: INST_NAME("FCMOVNU ST0, STx"); READFLAGS(X_PF); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); TSTw_mask(xFlags, 0b011110, 0); //mask=1<<F_PF - FCSELD(v1, v2, v1, cEQ); // F_PF==0 + if(ST_IS_F(0)) { + FCSELS(v1, v2, v1, cEQ); + } else { + FCSELD(v1, v2, v1, cEQ); // F_PF==0 + } break; case 0xE1: INST_NAME("FDISI8087_NOP"); // so.. NOP? 
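The FCMOVcc lowering in the hunks above is compact enough to miss: TST against the emulated xFlags sets Z exactly when the tested flag is clear, and FCSEL with the EQ condition then picks the source. A scalar model, using the x86 fact that CF is bit 0 of the flags:

#include <stdio.h>

#define F_CF 0 /* TSTw_mask(xFlags, 0, 0) encodes mask 1 = 1<<F_CF */

static double fcmovnb(unsigned xFlags, double st0, double stx)
{
    int z = !(xFlags & (1u << F_CF)); /* TST sets Z when CF is clear */
    return z ? stx : st0;             /* FCSEL v1, v2, v1, eq */
}

int main(void)
{
    printf("CF=0 -> %g (moves)\n", fcmovnb(0u, 1.0, 2.0));
    printf("CF=1 -> %g (keeps)\n", fcmovnb(1u, 1.0, 2.0));
    return 0;
}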
@@ -118,7 +134,7 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xE3: INST_NAME("FNINIT"); MESSAGE(LOG_DUMP, "Need Optimization\n"); - x87_purgecache(dyn, ninst, x1, x2, x3); + x87_purgecache(dyn, ninst, 0, x1, x2, x3); CALL(reset_fpu, -1); break; case 0xE8: @@ -131,9 +147,13 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xEF: INST_NAME("FUCOMI ST0, STx"); SETFLAGS(X_ALL, SF_SET); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FCMPD(v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FCMPS(v1, v2); + } else { + FCMPD(v1, v2); + } FCOMI(x1, x2); break; case 0xF0: @@ -146,9 +166,13 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xF7: INST_NAME("FCOMI ST0, STx"); SETFLAGS(X_ALL, SF_SET); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FCMPD(v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FCMPS(v1, v2); + } else { + FCMPD(v1, v2); + } FCOMI(x1, x2); break; @@ -164,7 +188,7 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin switch((nextop>>3)&7) { case 0: INST_NAME("FILD ST0, Ed"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D); s0 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); VLDR32_U12(s0, ed, fixedaddress); @@ -173,7 +197,7 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 1: INST_NAME("FISTTP Ed, ST0"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); if(MODREG) { ed = xRAX+(nextop&7)+(rex.b<<3); wback = 0; @@ -200,11 +224,11 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin STRw_U12(x5, wback, fixedaddress); MARK3; #endif - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 2: INST_NAME("FIST Ed, ST0"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); u8 = x87_setround(dyn, ninst, x1, x2, x4); // x1 have the modified RPSCR reg if(MODREG) { ed = xRAX+(nextop&7)+(rex.b<<3); @@ -236,7 +260,7 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 3: INST_NAME("FISTP Ed, ST0"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); u8 = x87_setround(dyn, ninst, x1, x2, x4); // x1 have the modified RPSCR reg if(MODREG) { ed = xRAX+(nextop&7)+(rex.b<<3); @@ -265,7 +289,7 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin MARK3; #endif x87_restoreround(dyn, ninst, u8); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 5: INST_NAME("FLD tbyte"); @@ -297,7 +321,7 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin MOVx_REG(x1, ed); } CALL(arm_fstp, -1); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; default: DEFAULT; diff --git a/src/dynarec/arm64/dynarec_arm64_dc.c b/src/dynarec/arm64/dynarec_arm64_dc.c index 3877bcc9..58572038 100644 --- a/src/dynarec/arm64/dynarec_arm64_dc.c +++ b/src/dynarec/arm64/dynarec_arm64_dc.c @@ -46,9 
+46,13 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xC6: case 0xC7: INST_NAME("FADD STx, ST0"); - v2 = x87_get_st(dyn, ninst, x1, x2, 0); - v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FADDD(v1, v1, v2); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FADDS(v1, v1, v2); + } else { + FADDD(v1, v1, v2); + } break; case 0xC8: case 0xC9: @@ -59,9 +63,13 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xCE: case 0xCF: INST_NAME("FMUL STx, ST0"); - v2 = x87_get_st(dyn, ninst, x1, x2, 0); - v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FMULD(v1, v1, v2); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FMULS(v1, v1, v2); + } else { + FMULD(v1, v1, v2); + } break; case 0xD0: case 0xD1: @@ -72,9 +80,13 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xD6: case 0xD7: INST_NAME("FCOM ST0, STx"); //yep - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FCMPD(v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FCMPS(v1, v2); + } else { + FCMPD(v1, v2); + } FCOM(x1, x2, x3); break; case 0xD8: @@ -86,11 +98,15 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xDE: case 0xDF: INST_NAME("FCOMP ST0, STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FCMPD(v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FCMPS(v1, v2); + } else { + FCMPD(v1, v2); + } FCOM(x1, x2, x3); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xE0: case 0xE1: @@ -101,9 +117,13 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xE6: case 0xE7: INST_NAME("FSUBR STx, ST0"); - v2 = x87_get_st(dyn, ninst, x1, x2, 0); - v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FSUBD(v1, v2, v1); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FSUBS(v1, v2, v1); + } else { + FSUBD(v1, v2, v1); + } break; case 0xE8: case 0xE9: @@ -114,9 +134,13 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xEE: case 0xEF: INST_NAME("FSUB STx, ST0"); - v2 = x87_get_st(dyn, ninst, x1, x2, 0); - v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FSUBD(v1, v1, v2); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FSUBS(v1, v1, v2); + } else { + FSUBD(v1, v1, v2); + } break; case 0xF0: case 0xF1: @@ -127,9 +151,13 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xF6: case 0xF7: INST_NAME("FDIVR STx, ST0"); - v2 = x87_get_st(dyn, ninst, x1, x2, 0); - v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FDIVD(v1, v2, v1); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + 
FDIVS(v1, v2, v1); + } else { + FDIVD(v1, v2, v1); + } break; case 0xF8: case 0xF9: @@ -140,15 +168,19 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xFE: case 0xFF: INST_NAME("FDIV STx, ST0"); - v2 = x87_get_st(dyn, ninst, x1, x2, 0); - v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FDIVD(v1, v1, v2); + v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FDIVS(v1, v1, v2); + } else { + FDIVD(v1, v1, v2); + } break; default: switch((nextop>>3)&7) { case 0: INST_NAME("FADD ST0, double[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); v2 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, 0, 0); VLDR64_U12(v2, wback, fixedaddress); @@ -156,7 +188,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 1: INST_NAME("FMUL ST0, double[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); v2 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, 0, 0); VLDR64_U12(v2, wback, fixedaddress); @@ -164,7 +196,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 2: INST_NAME("FCOM ST0, double[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); v2 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, 0, 0); VLDR64_U12(v2, wback, fixedaddress); @@ -173,17 +205,17 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 3: INST_NAME("FCOMP ST0, double[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); v2 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, 0, 0); VLDR64_U12(v2, wback, fixedaddress); FCMPD(v1, v2); FCOM(x1, x2, x3); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 4: INST_NAME("FSUB ST0, double[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); v2 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, 0, 0); VLDR64_U12(v2, wback, fixedaddress); @@ -191,7 +223,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 5: INST_NAME("FSUBR ST0, double[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); v2 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, 0, 0); VLDR64_U12(v2, wback, fixedaddress); @@ -199,7 +231,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 6: INST_NAME("FDIV ST0, double[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); v2 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, 0, 0); VLDR64_U12(v2, wback, fixedaddress); @@ -207,7 +239,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 7: INST_NAME("FDIVR ST0, double[ED]"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = 
x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); v2 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, 0, 0); VLDR64_U12(v2, wback, fixedaddress); diff --git a/src/dynarec/arm64/dynarec_arm64_dd.c b/src/dynarec/arm64/dynarec_arm64_dd.c index 4b73cc97..fbd481a9 100644 --- a/src/dynarec/arm64/dynarec_arm64_dd.c +++ b/src/dynarec/arm64/dynarec_arm64_dd.c @@ -49,8 +49,8 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xC7: INST_NAME("FFREE STx"); MESSAGE(LOG_DUMP, "Need Optimization\n"); - x87_purgecache(dyn, ninst, x1, x2, x3); - MOV32w(x1, nextop-0xC0); + x87_purgecache(dyn, ninst, 0, x1, x2, x3); + MOV32w(x1, nextop&7); CALL(fpu_do_free, -1); break; case 0xD0: @@ -62,13 +62,17 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xD6: case 0xD7: INST_NAME("FST ST0, STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FMOVD(v2, v1); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FMOVS(v2, v1); + } else { + FMOVD(v2, v1); + } break; case 0xD8: INST_NAME("FSTP ST0, ST0"); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xD9: case 0xDA: @@ -78,10 +82,9 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xDE: case 0xDF: INST_NAME("FSTP ST0, STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FMOVD(v2, v1); - x87_do_pop(dyn, ninst); + // copy the cache value for st0 to stx + x87_swapreg(dyn, ninst, x1, x2, 0, nextop&7); + x87_do_pop(dyn, ninst, x3); break; case 0xE0: @@ -93,9 +96,13 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xE6: case 0xE7: INST_NAME("FUCOM ST0, STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FCMPD(v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FCMPS(v1, v2); + } else { + FCMPD(v1, v2); + } FCOM(x1, x2, x3); break; case 0xE8: @@ -107,11 +114,15 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xEE: case 0xEF: INST_NAME("FUCOMP ST0, STx"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FCMPD(v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) { + FCMPS(v1, v2); + } else { + FCMPD(v1, v2); + } FCOM(x1, x2, x3); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xC8: @@ -145,37 +156,37 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin switch((nextop>>3)&7) { case 0: INST_NAME("FLD double"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x3, NEON_CACHE_ST_D); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, 0); VLDR64_U12(v1, ed, fixedaddress); break; case 1: INST_NAME("FISTTP i64, ST0"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x3, NEON_CACHE_ST_D); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, 0); s0 = fpu_get_scratch(dyn); FRINT64ZD(s0, v1); FCVTZSxD(x2, s0); STRx_U12(x2, ed, fixedaddress); - 
x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 2: INST_NAME("FST double"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, 0); VSTR64_U12(v1, ed, fixedaddress); break; case 3: INST_NAME("FSTP double"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, 0); VSTR64_U12(v1, ed, fixedaddress); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 4: INST_NAME("FRSTOR m108byte"); MESSAGE(LOG_DUMP, "Need Optimization\n"); - fpu_purgecache(dyn, ninst, x1, x2, x3); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0); if(ed!=x1) {MOVx_REG(x1, ed);} CALL(arm_frstor, -1); @@ -183,18 +194,27 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 6: INST_NAME("FSAVE m108byte"); MESSAGE(LOG_DUMP, "Need Optimization\n"); - fpu_purgecache(dyn, ninst, x1, x2, x3); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0); if(ed!=x1) {MOVx_REG(x1, ed);} CALL(arm_fsave, -1); break; case 7: INST_NAME("FNSTSW m2byte"); - fpu_purgecache(dyn, ninst, x1, x2, x3); + fpu_purgecache(dyn, ninst, 0, x1, x2, x3); addr = geted(dyn, addr, ninst, nextop, &ed, x4, &fixedaddress, 0xfff<<1, 1, rex, 0, 0); - LDRw_U12(x1, xEmu, offsetof(x64emu_t, top)); + LDRw_U12(x2, xEmu, offsetof(x64emu_t, top)); LDRH_U12(x3, xEmu, offsetof(x64emu_t, sw)); - BFIw(x3, x1, 11, 3); // inject TOP at bit 11 (3 bits) + if(dyn->n.x87stack) { + // update top + if(dyn->n.x87stack>0) { + SUBw_U12(x2, x2, dyn->n.x87stack); + } else { + ADDw_U12(x2, x2, -dyn->n.x87stack); + } + ANDw_mask(x2, x2, 0, 2); + } + BFIw(x3, x2, 11, 3); // inject TOP at bit 11 (3 bits) STRH_U12(x3, ed, fixedaddress); // store whole sw flags break; default: diff --git a/src/dynarec/arm64/dynarec_arm64_df.c b/src/dynarec/arm64/dynarec_arm64_df.c index a90f3331..15fd8d32 100644 --- a/src/dynarec/arm64/dynarec_arm64_df.c +++ b/src/dynarec/arm64/dynarec_arm64_df.c @@ -51,7 +51,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xC7: INST_NAME("FFREEP STx"); // not handling Tag... 
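The FNSTSW change above is the subtle one: the refactor keeps a compile-time net push/pop delta (dyn->n.x87stack) rather than updating TOP in memory on every push or pop, and folds the delta in only when software actually reads the status word. A scalar model of that fold — the assumption here is that a positive delta means pending pushes, which is what the SUBw path implies since x87 pushes decrement TOP:

#include <stdio.h>
#include <stdint.h>

static uint16_t fnstsw(uint16_t sw, unsigned top, int x87stack)
{
    unsigned effective = (top - (unsigned)x87stack) & 7;        /* SUBw/ADDw + ANDw, mask=7 */
    return (uint16_t)((sw & ~(7u << 11)) | (effective << 11));  /* BFIw at bit 11 */
}

int main(void)
{
    /* stored top=0, two pushes still pending in the cache: real TOP is 6 */
    uint16_t sw = fnstsw(0, 0, 2);
    printf("sw=%04x TOP=%u\n", sw, (sw >> 11) & 7);
    return 0;
}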
- x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xE0: @@ -71,11 +71,16 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xEF: INST_NAME("FUCOMIP ST0, STx"); SETFLAGS(X_ALL, SF_SET); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FCMPD(v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) + { + FCMPS(v1, v2); + } else { + FCMPD(v1, v2); + } FCOMI(x1, x2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xF0: case 0xF1: @@ -87,11 +92,16 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin case 0xF7: INST_NAME("FCOMIP ST0, STx"); SETFLAGS(X_ALL, SF_SET); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); - v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - FCMPD(v1, v2); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7)); + v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7)); + if(ST_IS_F(0)) + { + FCMPS(v1, v2); + } else { + FCMPD(v1, v2); + } FCOMI(x1, x2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xC8: @@ -140,19 +150,23 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin switch((nextop>>3)&7) { case 0: INST_NAME("FILD ST0, Ew"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_F); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<1, 1, rex, 0, 0); LDRSHw_U12(x1, wback, fixedaddress); - SCVTFDw(v1, x1); + if(ST_IS_F(0)) { + SCVTFSw(v1, x1); + } else { + SCVTFDw(v1, x1); + } break; case 1: INST_NAME("FISTTP Ew, ST0"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_F); addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0xfff<<1, 1, rex, 0, 0); ed = x1; s0 = fpu_get_scratch(dyn); #if 0 - // this version needs ARM v8.5, //TODO: add detection of this extensio to use it + // this version needs ARM v8.5, //TODO: add detection of this extension to use it FRINT32ZD(s0, v1); // no saturation instruction on Arm, so using NEON VFCVTZSd(s0, s0); @@ -163,8 +177,12 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin MRS_fpsr(x5); BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); - VFCVTZSd(s0, v1); - SQXTN_S_D(s0, s0); + if(ST_IS_F(0)) { + VFCVTZSs(s0, v1); + } else { + VFCVTZSd(s0, v1); + SQXTN_S_D(s0, s0); + } SQXTN_H_S(s0, s0); VSTR16_U12(s0, wback, fixedaddress); MRS_fpsr(x5); // get back FPSR to check the IOC bit @@ -173,11 +191,11 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin STRH_U12(x5, wback, fixedaddress); MARK3; #endif - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 2: INST_NAME("FIST Ew, ST0"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_F); u8 = x87_setround(dyn, ninst, x1, x2, x4); addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0xfff<<1, 1, rex, 0, 0); ed = x1; @@ -193,9 +211,14 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin MRS_fpsr(x5); BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); - FRINTXD(s0, v1); - VFCVTZSd(s0, s0); - SQXTN_S_D(s0, s0); + if(ST_IS_F(0)) { + FRINTXS(s0, v1); + VFCVTZSs(s0, s0); + } else { + FRINTXD(s0, v1); + VFCVTZSd(s0, s0); + SQXTN_S_D(s0, s0); + } SQXTN_H_S(s0, s0); VSTR16_U12(s0, wback, 
fixedaddress); MRS_fpsr(x5); // get back FPSR to check the IOC bit @@ -208,7 +231,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 3: INST_NAME("FISTP Ew, ST0"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_F); u8 = x87_setround(dyn, ninst, x1, x2, x4); addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0xfff<<1, 1, rex, 0, 0); ed = x1; @@ -224,9 +247,14 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin MRS_fpsr(x5); BFCw(x5, FPSR_IOC, 1); // reset IOC bit MSR_fpsr(x5); - FRINTXD(s0, v1); - VFCVTZSd(s0, s0); - SQXTN_S_D(s0, s0); + if(ST_IS_F(0)) { + FRINTXS(s0, v1); + VFCVTZSs(s0, s0); + } else { + FRINTXD(s0, v1); + VFCVTZSd(s0, s0); + SQXTN_S_D(s0, s0); + } SQXTN_H_S(s0, s0); VSTR16_U12(s0, wback, fixedaddress); MRS_fpsr(x5); // get back FPSR to check the IOC bit @@ -235,7 +263,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin STRH_U12(x5, wback, fixedaddress); MARK3; #endif - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); x87_restoreround(dyn, ninst, u8); break; case 4: @@ -247,7 +275,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 5: INST_NAME("FILD ST0, i64"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 7, rex, 0, 0); LDRx_U12(x1, wback, fixedaddress); SCVTFDx(v1, x1); @@ -258,11 +286,11 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, 0, 0); if(ed!=x1) {MOVx_REG(x1, ed);} CALL(fpu_fbst, -1); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 7: INST_NAME("FISTP i64, ST0"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); u8 = x87_setround(dyn, ninst, x1, x2, x4); addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0xfff<<3, 7, rex, 0, 0); ed = x1; @@ -285,7 +313,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin MARK3; #endif x87_restoreround(dyn, ninst, u8); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; default: DEFAULT; diff --git a/src/dynarec/arm64/dynarec_arm64_f0.c b/src/dynarec/arm64/dynarec_arm64_f0.c index 76540039..aea67cae 100644 --- a/src/dynarec/arm64/dynarec_arm64_f0.c +++ b/src/dynarec/arm64/dynarec_arm64_f0.c @@ -917,7 +917,7 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin { case 0: // INC Ed INST_NAME("LOCK INC Ed"); - SETFLAGS(X_ALL&~X_CF, SF_SUBSET); + SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING); DMB_ISH(); if(MODREG) { ed = xRAX+(nextop&7)+(rex.b<<3); @@ -945,7 +945,7 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 1: //DEC Ed INST_NAME("LOCK DEC Ed"); - SETFLAGS(X_ALL&~X_CF, SF_SUBSET); + SETFLAGS(X_ALL&~X_CF, SF_SUBSET_PENDING); DMB_ISH(); if(MODREG) { ed = xRAX+(nextop&7)+(rex.b<<3); diff --git a/src/dynarec/arm64/dynarec_arm64_f20f.c b/src/dynarec/arm64/dynarec_arm64_f20f.c index 0db99b64..f6169f3a 100755 --- a/src/dynarec/arm64/dynarec_arm64_f20f.c +++ b/src/dynarec/arm64/dynarec_arm64_f20f.c @@ -23,9 +23,9 @@ #include "dynarec_arm64_helper.h" // Get Ex as a double, not a quad (warning, x2 get used) -#define GETEX(a, D) \ +#define GETEX(a, w, D) \ if(MODREG) { \ - a = sse_get_reg(dyn, 
ninst, x1, (nextop&7)+(rex.b<<3)); \ + a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w); \ } else { \ a = fpu_get_scratch(dyn); \ addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, D); \ @@ -34,15 +34,15 @@ #define GETG gd = ((nextop&0x38)>>3)+(rex.r<<3) -#define GETGX(a) gd = ((nextop&0x38)>>3)+(rex.r<<3); \ - a = sse_get_reg(dyn, ninst, x1, gd) +#define GETGX(a, w) gd = ((nextop&0x38)>>3)+(rex.r<<3); \ + a = sse_get_reg(dyn, ninst, x1, gd, w) #define GETGX_empty(a) gd = ((nextop&0x38)>>3)+(rex.r<<3); \ a = sse_get_reg_empty(dyn, ninst, x1, gd) #define GETGM(a) \ gd = ((nextop&0x38)>>3); \ - a = mmx_get_reg(dyn, ninst, x1, gd) + a = mmx_get_reg(dyn, ninst, x1, x2, x3, gd) uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) { @@ -77,8 +77,8 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETG; if(MODREG) { ed = (nextop&7)+ (rex.b<<3); - v0 = sse_get_reg(dyn, ninst, x1, gd); - d0 = sse_get_reg(dyn, ninst, x1, ed); + v0 = sse_get_reg(dyn, ninst, x1, gd, 1); + d0 = sse_get_reg(dyn, ninst, x1, ed, 0); VMOVeD(v0, 0, d0, 0); } else { v0 = sse_get_reg_empty(dyn, ninst, x1, gd); @@ -90,10 +90,10 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("MOVSD Ex, Gx"); nextop = F8; GETG; - v0 = sse_get_reg(dyn, ninst, x1, gd); + v0 = sse_get_reg(dyn, ninst, x1, gd, 0); if(MODREG) { ed = (nextop&7)+ (rex.b<<3); - d0 = sse_get_reg(dyn, ninst, x1, ed); + d0 = sse_get_reg(dyn, ninst, x1, ed, 1); VMOVeD(d0, 0, v0, 0); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, 0); @@ -105,7 +105,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETG; if(MODREG) { - d0 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); + d0 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); v0 = sse_get_reg_empty(dyn, ninst, x1, gd); VMOVeD(v0, 0, d0, 0); } else { @@ -119,7 +119,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x2A: INST_NAME("CVTSI2SD Gx, Ed"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); GETED(0); d1 = fpu_get_scratch(dyn); if(rex.w) { @@ -134,14 +134,14 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("CVTTSD2SI Gd, Ex"); nextop = F8; GETGD; - GETEX(q0, 0); + GETEX(q0, 0, 0); FCVTZSxwD(gd, q0); break; case 0x2D: INST_NAME("CVTSD2SI Gd, Ex"); nextop = F8; GETGD; - GETEX(q0, 0); + GETEX(q0, 0, 0); #ifdef PRECISE_CVT LDRH_U12(x1, xEmu, offsetof(x64emu_t, mxcsr)); UBFXx(x1, x1, 13, 2); // extract round requested @@ -167,9 +167,9 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x51: INST_NAME("SQRTSD Gx, Ex"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); d1 = fpu_get_scratch(dyn); - GETEX(d0, 0); + GETEX(d0, 0, 0); if(!box64_dynarec_fastnan) { v1 = fpu_get_scratch(dyn); FCMLTD_0(v1, d0); @@ -185,18 +185,18 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x58: INST_NAME("ADDSD Gx, Ex"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); d1 = fpu_get_scratch(dyn); - GETEX(d0, 0); + GETEX(d0, 0, 0); FADDD(d1, v0, d0); // the high part of the vector is erased... 
VMOVeD(v0, 0, d1, 0); break; case 0x59: INST_NAME("MULSD Gx, Ex"); nextop = F8; - GETGX(d1); + GETGX(d1, 1); v1 = fpu_get_scratch(dyn); - GETEX(d0, 0); + GETEX(d0, 0, 0); if(!box64_dynarec_fastnan) { v0 = fpu_get_scratch(dyn); q0 = fpu_get_scratch(dyn); @@ -216,8 +216,8 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x5A: INST_NAME("CVTSD2SS Gx, Ex"); nextop = F8; - GETGX(v0); - GETEX(d0, 0); + GETGX(v0, 1); + GETEX(d0, 0, 0); d1 = fpu_get_scratch(dyn); FCVT_S_D(d1, d0); VMOVeS(v0, 0, d1, 0); @@ -226,18 +226,17 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x5C: INST_NAME("SUBSD Gx, Ex"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); d1 = fpu_get_scratch(dyn); - GETEX(d0, 0); + GETEX(d0, 0, 0); FSUBD(d1, v0, d0); VMOVeD(v0, 0, d1, 0); break; case 0x5D: INST_NAME("MINSD Gx, Ex"); nextop = F8; - GETG; - v0 = sse_get_reg(dyn, ninst, x1, gd); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); // MINSD: if any input is NaN, or Ex[0]<Gx[0], copy Ex[0] -> Gx[0] #if 0 d0 = fpu_get_scratch(dyn); @@ -252,9 +251,9 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x5E: INST_NAME("DIVSD Gx, Ex"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); d1 = fpu_get_scratch(dyn); - GETEX(v1, 0); + GETEX(v1, 0, 0); if(!box64_dynarec_fastnan) { d0 = fpu_get_scratch(dyn); q0 = fpu_get_scratch(dyn); @@ -274,9 +273,8 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x5F: INST_NAME("MAXSD Gx, Ex"); nextop = F8; - GETG; - v0 = sse_get_reg(dyn, ninst, x1, gd); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); // MAXSD: if any input is NaN, or Ex[0]>Gx[0], copy Ex[0] -> Gx[0] #if 0 d0 = fpu_get_scratch(dyn); @@ -292,8 +290,8 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x70: INST_NAME("PSHUFLW Gx, Ex, Ib"); nextop = F8; - GETEX(v1, 1); - GETGX(v0); + GETEX(v1, 0, 1); + GETGX(v0, 1); u8 = F8; // only the low part needs to be shuffled.
VTBL only handles 8-bit values, so the 16-bit shuffles need to be recomposed as 8-bit ones @@ -315,9 +313,9 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x7C: INST_NAME("HADDPS Gx, Ex"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); if(MODREG) { - v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); + v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, 0, 0); v1 = fpu_get_scratch(dyn); @@ -329,8 +327,8 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0xC2: INST_NAME("CMPSD Gx, Ex, Ib"); nextop = F8; - GETGX(v0); - GETEX(v1, 1); + GETGX(v0, 1); + GETEX(v1, 0, 1); u8 = F8; FCMPD(v0, v1); switch(u8&7) { @@ -349,8 +347,8 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0xD0: INST_NAME("ADDSUBPS Gx, Ex"); nextop = F8; - GETGX(v0); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); q0 = fpu_get_scratch(dyn); static float addsubps[4] = {-1.f, 1.f, -1.f, 1.f}; MAYUSE(addsubps); @@ -363,14 +361,14 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("MOVDQ2Q Gm, Ex"); nextop = F8; GETGM(v0); - GETEX(v1, 0); + GETEX(v1, 0, 0); VMOV(v0, v1); break; case 0xE6: INST_NAME("CVTPD2DQ Gx, Ex"); nextop = F8; - GETEX(v1, 0); + GETEX(v1, 0, 0); GETGX_empty(v0); u8 = sse_setround(dyn, ninst, x1, x2, x3); VFRINTIDQ(v0, v1); @@ -384,7 +382,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETG; if(MODREG) { - v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); + v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); v0 = sse_get_reg_empty(dyn, ninst, x1, gd); VMOVQ(v0, v1); } else { diff --git a/src/dynarec/arm64/dynarec_arm64_f30f.c b/src/dynarec/arm64/dynarec_arm64_f30f.c index 4e441172..d0996f6e 100755 --- a/src/dynarec/arm64/dynarec_arm64_f30f.c +++ b/src/dynarec/arm64/dynarec_arm64_f30f.c @@ -23,9 +23,9 @@ #include "dynarec_arm64_helper.h" // Get Ex as a single, not a quad (warning, x2 gets used) -#define GETEX(a, D) \ +#define GETEX(a, w, D) \ if(MODREG) { \ - a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); \ + a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w); \ } else { \ a = fpu_get_scratch(dyn); \ addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<2, 3, rex, 0, D); \ @@ -34,8 +34,8 @@ #define GETG gd = ((nextop&0x38)>>3)+(rex.r<<3) -#define GETGX(a) gd = ((nextop&0x38)>>3)+(rex.r<<3); \ - a = sse_get_reg(dyn, ninst, x1, gd) +#define GETGX(a, w) gd = ((nextop&0x38)>>3)+(rex.r<<3); \ + a = sse_get_reg(dyn, ninst, x1, gd, w) #define GETGX_empty(a) gd = ((nextop&0x38)>>3)+(rex.r<<3); \ a = sse_get_reg_empty(dyn, ninst, x1, gd) @@ -70,8 +70,8 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETG; if(MODREG) { - v0 = sse_get_reg(dyn, ninst, x1, gd); - q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3)); + v0 = sse_get_reg(dyn, ninst, x1, gd, 1); + q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 0); VMOVeS(v0, 0, q0, 0); } else { v0 = sse_get_reg_empty(dyn, ninst, x1, gd); @@ -83,9 +83,9 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("MOVSS Ex, Gx"); nextop = F8; GETG; - v0 = sse_get_reg(dyn, ninst, x1, gd); + v0 = sse_get_reg(dyn, ninst, x1, gd, 1); if(MODREG) { - q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3)); + q0 = sse_get_reg(dyn, ninst, x1,
(nextop&7) + (rex.b<<3), 0); VMOVeS(q0, 0, v0, 0); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<2, 3, rex, 0, 0); @@ -96,7 +96,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("MOVSLDUP Gx, Ex"); nextop = F8; if(MODREG) { - q1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); + q1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, 0, 0); q1 = fpu_get_scratch(dyn); @@ -110,7 +110,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("MOVSHDUP Gx, Ex"); nextop = F8; if(MODREG) { - q1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); + q1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, 0, 0); q1 = fpu_get_scratch(dyn); @@ -129,7 +129,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x2A: INST_NAME("CVTSI2SS Gx, Ed"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); GETED(0); d1 = fpu_get_scratch(dyn); if(rex.w) { @@ -144,14 +144,14 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("CVTTSS2SI Gd, Ex"); nextop = F8; GETGD; - GETEX(d0, 0); + GETEX(d0, 0, 0); FCVTZSxwS(gd, d0); break; case 0x2D: INST_NAME("CVTSS2SI Gd, Ex"); nextop = F8; GETGD; - GETEX(q0, 0); + GETEX(q0, 0, 0); #ifdef PRECISE_CVT LDRH_U12(x1, xEmu, offsetof(x64emu_t, mxcsr)); UBFXx(x1, x1, 13, 2); // extract round requested @@ -174,17 +174,17 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x51: INST_NAME("SQRTSS Gx, Ex"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); d1 = fpu_get_scratch(dyn); - GETEX(d0, 0); + GETEX(d0, 0, 0); FSQRTS(d1, d0); VMOVeS(v0, 0, d1, 0); break; case 0x52: INST_NAME("RSQRTSS Gx, Ex"); nextop = F8; - GETEX(v1, 0); - GETGX_empty(v0); + GETGX(v0, 1); + GETEX(v1, 0, 0); d0 = fpu_get_scratch(dyn); d1 = fpu_get_scratch(dyn); // so here: F32: Imm8 = abcd efgh that gives => aBbbbbbc defgh000 00000000 00000000 @@ -200,8 +200,8 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x53: INST_NAME("RCPSS Gx, Ex"); nextop = F8; - GETGX(v0); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); d0 = fpu_get_scratch(dyn); FMOVS_8(d0, 0b01110000); //1.0f FDIVS(d0, d0, v1); @@ -211,26 +211,26 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x58: INST_NAME("ADDSS Gx, Ex"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); d1 = fpu_get_scratch(dyn); - GETEX(d0, 0); + GETEX(d0, 0, 0); FADDS(d1, v0, d0); // the high part of the vector is erased... 
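The other pervasive change in these hunks is the new w argument threaded through GETGX, GETEX and sse_get_reg: it records whether the instruction will write the cached XMM register, and sse_purgecache later uses that bit to skip storing back entries that were only read. A standalone sketch of the bookkeeping, with simplified types and a trivial allocator (illustrative names, not the real neoncache structures):

    #include <string.h>

    typedef struct { int reg; int write; } ssecache_t;

    static ssecache_t ssecache[16];
    static double     xmm_mem[16][2];   /* stand-in for emu->xmm[]            */
    static double     host_reg[32][2];  /* stand-in for the NEON register file */

    static void sse_reset(void) { for (int i = 0; i < 16; ++i) ssecache[i].reg = -1; }

    static int sse_get(int a, int forwrite)
    {
        if (ssecache[a].reg == -1) {             /* miss: allocate and load     */
            ssecache[a].reg = a;                 /* trivial allocator, demo only */
            memcpy(host_reg[a], xmm_mem[a], sizeof xmm_mem[a]);
        }
        ssecache[a].write |= forwrite;           /* remember the write intent   */
        return ssecache[a].reg;
    }

    static void sse_purge(void)
    {
        for (int i = 0; i < 16; ++i)
            if (ssecache[i].reg != -1) {
                if (ssecache[i].write)           /* only dirty entries hit memory */
                    memcpy(xmm_mem[i], host_reg[ssecache[i].reg], sizeof xmm_mem[i]);
                ssecache[i].reg = -1;
                ssecache[i].write = 0;
            }
    }

    int main(void)
    {
        sse_reset();
        sse_get(0, 0);       /* GETEX(..., 0, ...): source, stays clean      */
        sse_get(1, 1);       /* GETGX(..., 1): destination, marked dirty     */
        host_reg[1][0] = 1.0;
        sse_purge();         /* stores xmm1 back, skips the clean xmm0       */
        return 0;
    }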
VMOVeS(v0, 0, d1, 0); break; case 0x59: INST_NAME("MULSS Gx, Ex"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); d1 = fpu_get_scratch(dyn); - GETEX(d0, 0); + GETEX(d0, 0, 0); FMULS(d1, v0, d0); VMOVeS(v0, 0, d1, 0); break; case 0x5A: INST_NAME("CVTSS2SD Gx, Ex"); nextop = F8; - GETGX(v0); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); d0 = fpu_get_scratch(dyn); FCVT_D_S(d0, v1); VMOVeD(v0, 0, d0, 0); @@ -238,7 +238,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x5B: INST_NAME("CVTPS2DQ Gx, Ex"); nextop = F8; - GETEX(d0, 0); + GETEX(d0, 0, 0); GETGX_empty(v0); VFCVTZSQS(v0, d0); break; @@ -246,17 +246,17 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x5C: INST_NAME("SUBSS Gx, Ex"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); d1 = fpu_get_scratch(dyn); - GETEX(d0, 0); + GETEX(d0, 0, 0); FSUBS(d1, v0, d0); VMOVeS(v0, 0, d1, 0); break; case 0x5D: INST_NAME("MINSS Gx, Ex"); nextop = F8; - GETGX(v0); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); // MINSS: if any input is NaN, or Ex[0]<Gx[0], copy Ex[0] -> Gx[0] #if 0 d0 = fpu_get_scratch(dyn); @@ -271,17 +271,17 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x5E: INST_NAME("DIVSS Gx, Ex"); nextop = F8; - GETGX(v0); + GETGX(v0, 1); d1 = fpu_get_scratch(dyn); - GETEX(d0, 0); + GETEX(d0, 0, 0); FDIVS(d1, v0, d0); VMOVeS(v0, 0, d1, 0); break; case 0x5F: INST_NAME("MAXSS Gx, Ex"); nextop = F8; - GETGX(v0); - GETEX(v1, 0); + GETGX(v0, 1); + GETEX(v1, 0, 0); // MAXSS: if any input is NaN, or Ex[0]>Gx[0], copy Ex[0] -> Gx[0] #if 0 d0 = fpu_get_scratch(dyn); @@ -298,11 +298,12 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n INST_NAME("MOVDQU Gx,Ex");// no alignment constraint on NEON here, so same as MOVDQA nextop = F8; GETG; - v0 = sse_get_reg_empty(dyn, ninst, x1, gd); if(MODREG) { - v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); + v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); + GETGX_empty(v0); VMOVQ(v0, v1); } else { + GETGX_empty(v0); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, 0, 0); VLDR128_U12(v0, ed, fixedaddress); } @@ -310,9 +311,8 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x70: INST_NAME("PSHUFHW Gx, Ex, Ib"); nextop = F8; - GETEX(v1, 1); - GETGX(v0); - + GETEX(v1, 0, 1); + GETGX(v0, 1); u8 = F8; // only the high part needs to be shuffled.
VTBL only handles 8-bit values, so the 16-bit shuffles need to be recomposed as 8-bit ones u64 = 0; @@ -333,13 +333,12 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x7E: INST_NAME("MOVQ Gx, Ex"); nextop = F8; - GETG; if(MODREG) { - v1 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3)); - v0 = sse_get_reg_empty(dyn, ninst, x1, gd); + v1 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 0); + GETGX_empty(v0); FMOVD(v0, v1); } else { - v0 = sse_get_reg_empty(dyn, ninst, x1, gd); + GETGX_empty(v0); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, 0); VLDR64_U12(v0, ed, fixedaddress); } @@ -347,13 +346,11 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0x7F: INST_NAME("MOVDQU Ex,Gx"); nextop = F8; - GETG; + GETGX(v0, 0); if(MODREG) { - v0 = sse_get_reg(dyn, ninst, x1, gd); v1 = sse_get_reg_empty(dyn, ninst, x1, (nextop&7) + (rex.b<<3)); VMOVQ(v1, v0); } else { - v0 = sse_get_reg(dyn, ninst, x1, gd); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, 0, 0); VSTR128_U12(v0, ed, fixedaddress); } @@ -392,8 +389,8 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n case 0xC2: INST_NAME("CMPSS Gx, Ex, Ib"); nextop = F8; - GETGX(v0); - GETEX(v1, 1); + GETGX(v0, 1); + GETEX(v1, 0, 1); u8 = F8; FCMPS(v0, v1); switch(u8&7) { @@ -414,7 +411,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; GETGX_empty(v0); if(MODREG) { - v1 = mmx_get_reg(dyn, ninst, x1, (nextop&7)); + v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, (nextop&7)); VEORQ(v0, v0, v0); // useful? VMOV(v0, v1); } else { @@ -428,7 +425,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n nextop = F8; // cannot use GETEX because we want 64bits not 32bits if(MODREG) { - v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3)); + v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0); } else { v1 = fpu_get_scratch(dyn); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, 0, 0); diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c index 32bea7d6..900e2b7a 100755 --- a/src/dynarec/arm64/dynarec_arm64_functions.c +++ b/src/dynarec/arm64/dynarec_arm64_functions.c @@ -319,50 +319,409 @@ void arm_aese(x64emu_t* emu, int xmm) // Get a FPU scratch reg int fpu_get_scratch(dynarec_arm_t* dyn) { - return SCRATCH0 + dyn->fpu_scratch++; // return an Sx + return SCRATCH0 + dyn->n.fpu_scratch++; // return an Sx } // Reset scratch regs counter void fpu_reset_scratch(dynarec_arm_t* dyn) { - dyn->fpu_scratch = 0; + dyn->n.fpu_scratch = 0; } // Get a x87 double reg -int fpu_get_reg_x87(dynarec_arm_t* dyn) +int fpu_get_reg_x87(dynarec_arm_t* dyn, int t, int n) { int i=X870; - while (dyn->fpuused[i]) ++i; - dyn->fpuused[i] = 1; + while (dyn->n.fpuused[i]) ++i; + dyn->n.fpuused[i] = 1; + dyn->n.neoncache[i].n = n; + dyn->n.neoncache[i].t = t; + dyn->n.news |= (1<<i); return i; // return a Dx } // Free a FPU double reg void fpu_free_reg(dynarec_arm_t* dyn, int reg) { // TODO: check upper limit?
- dyn->fpuused[reg] = 0; + dyn->n.fpuused[reg] = 0; + if(dyn->n.neoncache[reg].t!=NEON_CACHE_ST_F && dyn->n.neoncache[reg].t!=NEON_CACHE_ST_D) + dyn->n.neoncache[reg].v = 0; } // Get an MMX double reg int fpu_get_reg_emm(dynarec_arm_t* dyn, int emm) { - dyn->fpuused[EMM0 + emm] = 1; + dyn->n.fpuused[EMM0 + emm] = 1; + dyn->n.neoncache[EMM0 + emm].t = NEON_CACHE_MM; + dyn->n.neoncache[EMM0 + emm].n = emm; return EMM0 + emm; } // Get an XMM quad reg -int fpu_get_reg_xmm(dynarec_arm_t* dyn, int xmm) +int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm) { + int i; if(xmm>7) { - dyn->fpuused[XMM8 + xmm - 8] = 1; - return XMM8 + xmm - 8; + i = XMM8 + xmm - 8; } else { - dyn->fpuused[XMM0 + xmm] = 1; - return XMM0 + xmm; + i = XMM0 + xmm; } + dyn->n.fpuused[i] = 1; + dyn->n.neoncache[i].t = t; + dyn->n.neoncache[i].n = xmm; + return i; } // Reset fpu regs counter void fpu_reset_reg(dynarec_arm_t* dyn) { - dyn->fpu_reg = 0; - for (int i=0; i<32; ++i) - dyn->fpuused[i]=0; + dyn->n.fpu_reg = 0; + for (int i=0; i<24; ++i) { + dyn->n.fpuused[i]=0; + dyn->n.neoncache[i].v = 0; + } + +} + +int neoncache_get_st(dynarec_arm_t* dyn, int ninst, int a) +{ + if (dyn->insts[ninst].n.swapped) { + if(dyn->insts[ninst].n.combined1 == a) + a = dyn->insts[ninst].n.combined2; + else if(dyn->insts[ninst].n.combined2 == a) + a = dyn->insts[ninst].n.combined1; + } + for(int i=0; i<24; ++i) + if((dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F + || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D) + && dyn->insts[ninst].n.neoncache[i].n==a) + return dyn->insts[ninst].n.neoncache[i].t; + // not in the cache yet, so will be fetched... + return NEON_CACHE_ST_D; +} + +int neoncache_get_current_st(dynarec_arm_t* dyn, int ninst, int a) +{ + (void)ninst; + if(!dyn->insts) + return NEON_CACHE_ST_D; + for(int i=0; i<24; ++i) + if((dyn->n.neoncache[i].t==NEON_CACHE_ST_F + || dyn->n.neoncache[i].t==NEON_CACHE_ST_D) + && dyn->n.neoncache[i].n==a) + return dyn->n.neoncache[i].t; + // not in the cache yet, so will be fetched... 
+ return NEON_CACHE_ST_D; +} + +int neoncache_get_st_f(dynarec_arm_t* dyn, int ninst, int a) +{ + /*if(a+dyn->insts[ninst].n.stack_next-st<0) + // The STx has been pushed at the end of instruction, so stop going back + return -1;*/ + for(int i=0; i<24; ++i) + if(dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F + && dyn->insts[ninst].n.neoncache[i].n==a) + return i; + return -1; +} +int neoncache_get_st_f_noback(dynarec_arm_t* dyn, int ninst, int a) +{ + for(int i=0; i<24; ++i) + if(dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F + && dyn->insts[ninst].n.neoncache[i].n==a) + return i; + return -1; +} +int neoncache_get_current_st_f(dynarec_arm_t* dyn, int a) +{ + for(int i=0; i<24; ++i) + if(dyn->n.neoncache[i].t==NEON_CACHE_ST_F + && dyn->n.neoncache[i].n==a) + return i; + return -1; +} +static void neoncache_promote_double_forward(dynarec_arm_t* dyn, int ninst, int maxinst, int a); +static void neoncache_promote_double_internal(dynarec_arm_t* dyn, int ninst, int maxinst, int a); +static void neoncache_promote_double_combined(dynarec_arm_t* dyn, int ninst, int maxinst, int a) +{ + if(a == dyn->insts[ninst].n.combined1 || a == dyn->insts[ninst].n.combined2) { + if(a == dyn->insts[ninst].n.combined1) { + a = dyn->insts[ninst].n.combined2; + } else + a = dyn->insts[ninst].n.combined1; + int i = neoncache_get_st_f_noback(dyn, ninst, a); + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_combined, ninst=%d combined%c %d i=%d (stack:%d/%d)\n", ninst, (a == dyn->insts[ninst].n.combined2)?'2':'1', a ,i, dyn->insts[ninst].n.stack_push, -dyn->insts[ninst].n.stack_pop); + if(i>=0) { + dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D; + if(!dyn->insts[ninst].n.barrier) + neoncache_promote_double_internal(dyn, ninst-1, maxinst, a-dyn->insts[ninst].n.stack_push); + // go forward if combined is not pop'd + if(a-dyn->insts[ninst].n.stack_pop>=0) + if(!dyn->insts[ninst+1].n.barrier) + neoncache_promote_double_forward(dyn, ninst+1, maxinst, a-dyn->insts[ninst].n.stack_pop); + } + } +} +static void neoncache_promote_double_internal(dynarec_arm_t* dyn, int ninst, int maxinst, int a) +{ + if(dyn->insts[ninst+1].n.barrier) + return; + while(ninst>=0) { + a+=dyn->insts[ninst].n.stack_pop; // adjust Stack depth: add pop'd ST (going backward) + int i = neoncache_get_st_f(dyn, ninst, a); + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_internal, ninst=%d, a=%d st=%d:%d, i=%d\n", ninst, a, dyn->insts[ninst].n.stack, dyn->insts[ninst].n.stack_next, i); + if(i<0) return; + dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D; + // check combined propagation too + if(dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2) { + if(dyn->insts[ninst].n.swapped) { + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_internal, ninst=%d swapped %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].n.combined1 ,dyn->insts[ninst].n.combined2, a, dyn->insts[ninst].n.stack); + if (a==dyn->insts[ninst].n.combined1) + a = dyn->insts[ninst].n.combined2; + else if (a==dyn->insts[ninst].n.combined2) + a = dyn->insts[ninst].n.combined1; + } else { + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_internal, ninst=%d combined %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].n.combined1 ,dyn->insts[ninst].n.combined2, a, dyn->insts[ninst].n.stack); + neoncache_promote_double_combined(dyn, ninst, maxinst, a); + } + } + a-=dyn->insts[ninst].n.stack_push; // adjust Stack depth: remove push'd ST (going backward) + --ninst; +
if(ninst<0 || a<0 || dyn->insts[ninst].n.barrier) + return; + } +} + +static void neoncache_promote_double_forward(dynarec_arm_t* dyn, int ninst, int maxinst, int a) +{ + while((ninst!=-1) && (ninst<maxinst) && (a>=0)) { + a+=dyn->insts[ninst].n.stack_push; // adjust Stack depth: add push'd ST (going forward) + if((dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2) && dyn->insts[ninst].n.swapped) { + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_forward, ninst=%d swapped %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].n.combined1 ,dyn->insts[ninst].n.combined2, a, dyn->insts[ninst].n.stack); + if (a==dyn->insts[ninst].n.combined1) + a = dyn->insts[ninst].n.combined2; + else if (a==dyn->insts[ninst].n.combined2) + a = dyn->insts[ninst].n.combined1; + } + int i = neoncache_get_st_f_noback(dyn, ninst, a); + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_forward, ninst=%d, a=%d st=%d:%d(%d/%d), i=%d\n", ninst, a, dyn->insts[ninst].n.stack, dyn->insts[ninst].n.stack_next, dyn->insts[ninst].n.stack_push, -dyn->insts[ninst].n.stack_pop, i); + if(i<0) return; + dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D; + // check combined propagation too + if((dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2) && !dyn->insts[ninst].n.swapped) { + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_forward, ninst=%d combined %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].n.combined1 ,dyn->insts[ninst].n.combined2, a, dyn->insts[ninst].n.stack); + neoncache_promote_double_combined(dyn, ninst, maxinst, a); + } + a-=dyn->insts[ninst].n.stack_pop; // adjust Stack depth: remove pop'd ST (going forward) + if(dyn->insts[ninst].x64.has_next && !dyn->insts[ninst].n.barrier) + ++ninst; + else + ninst=-1; + } + if(ninst==maxinst) + neoncache_promote_double(dyn, ninst, a); +} + +void neoncache_promote_double(dynarec_arm_t* dyn, int ninst, int a) +{ + int i = neoncache_get_current_st_f(dyn, a); + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double, ninst=%d a=%d st=%d i=%d\n", ninst, a, dyn->n.stack, i); + if(i<0) return; + dyn->n.neoncache[i].t = NEON_CACHE_ST_D; + dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D; + // check combined propagation too + if(dyn->n.combined1 || dyn->n.combined2) { + if(dyn->n.swapped) { + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double, ninst=%d swapped! %d/%d vs %d\n", ninst, dyn->n.combined1 ,dyn->n.combined2, a); + if(dyn->n.combined1 == a) + a = dyn->n.combined2; + else if(dyn->n.combined2 == a) + a = dyn->n.combined1; + } else { + //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double, ninst=%d combined!
 %d/%d vs %d\n", ninst, dyn->n.combined1 ,dyn->n.combined2, a); + if(dyn->n.combined1 == a) + neoncache_promote_double(dyn, ninst, dyn->n.combined2); + else if(dyn->n.combined2 == a) + neoncache_promote_double(dyn, ninst, dyn->n.combined1); + } + } + a-=dyn->insts[ninst].n.stack_push; // adjust Stack depth: remove push'd ST (going backward) + if(!ninst || a<0) return; + neoncache_promote_double_internal(dyn, ninst-1, ninst, a); +} + +int neoncache_combine_st(dynarec_arm_t* dyn, int ninst, int a, int b) +{ + dyn->n.combined1=a; + dyn->n.combined2=b; + if( neoncache_get_current_st(dyn, ninst, a)==NEON_CACHE_ST_F + && neoncache_get_current_st(dyn, ninst, b)==NEON_CACHE_ST_F ) + return NEON_CACHE_ST_F; + return NEON_CACHE_ST_D; +} + +int isPred(dynarec_arm_t* dyn, int ninst, int pred) { + for(int i=0; i<dyn->insts[ninst].pred_sz; ++i) + if(dyn->insts[ninst].pred[i]==pred) + return pred; + return -1; +} +int getNominalPred(dynarec_arm_t* dyn, int ninst) { + if((ninst<=0) || !dyn->insts[ninst].pred_sz) + return -1; + if(isPred(dyn, ninst, ninst-1)!=-1) + return ninst-1; + return dyn->insts[ninst].pred[0]; +} + +int isCacheEmpty(dynarec_arm_t* dyn, int ninst) { + if(dyn->insts[ninst].n.stack_next) { + return 0; + } + for(int i=0; i<24; ++i) + if(dyn->insts[ninst].n.neoncache[i].v) { // there is something at ninst for i + if(!( + (dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D) + && dyn->insts[ninst].n.neoncache[i].n<dyn->insts[ninst].n.stack_pop)) + return 0; + } + return 1; + +} + +int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) { + int i2 = dyn->insts[ninst].x64.jmp_insts; + if(i2<0) + return 1; + if((dyn->insts[i2].x64.barrier&BARRIER_FLOAT)) + // if the barrier has already been applied, no transform is needed + return ((dyn->insts[ninst].x64.barrier&BARRIER_FLOAT))?0:(isCacheEmpty(dyn, ninst)?0:1); + int ret = 0; + if(!i2) { // just purge + if(dyn->insts[ninst].n.stack_next) { + return 1; + } + for(int i=0; i<24 && !ret; ++i) + if(dyn->insts[ninst].n.neoncache[i].v) { // there is something at ninst for i + if(!( + (dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D) + && dyn->insts[ninst].n.neoncache[i].n<dyn->insts[ninst].n.stack_pop)) + ret = 1; + } + return ret; + } + // Check if ninst can be compatible with i2 + if(dyn->insts[ninst].n.stack_next != dyn->insts[i2].n.stack-dyn->insts[i2].n.stack_push) { + return 1; + } + neoncache_t cache_i2 = dyn->insts[i2].n; + neoncacheUnwind(&cache_i2); + + for(int i=0; i<24; ++i) { + if(dyn->insts[ninst].n.neoncache[i].v) { // there is something at ninst for i + if(!cache_i2.neoncache[i].v) { // but there is nothing at i2 for i + ret = 1; + } else if(dyn->insts[ninst].n.neoncache[i].v!=cache_i2.neoncache[i].v) { // there is something different + if(dyn->insts[ninst].n.neoncache[i].n!=cache_i2.neoncache[i].n) { // not the same x64 reg + ret = 1; + } + else if(dyn->insts[ninst].n.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW) + {/* nothing */ } + else + ret = 1; + } + } else if(cache_i2.neoncache[i].v) + ret = 1; + } + return ret; +} + +void neoncacheUnwind(neoncache_t* cache) +{ + if(cache->swapped) { + // unswap + int a = -1; + int b = -1; + for(int j=0; j<24 && ((a==-1) || (b==-1)); ++j) + if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F)) { + if(cache->neoncache[j].n == cache->combined1) + a = j; + else if(cache->neoncache[j].n == cache->combined2) + b = j; +
} + if(a!=-1 && b!=-1) { + int tmp = cache->neoncache[a].n; + cache->neoncache[a].n = cache->neoncache[b].n; + cache->neoncache[b].n = tmp; + } + cache->swapped = 0; + cache->combined1 = cache->combined2 = 0; + } + if(cache->news) { + // remove the newly created neoncache + for(int i=0; i<24; ++i) + if(cache->news&(1<<i)) + cache->neoncache[i].v = 0; + cache->news = 0; + } + if(cache->stack_push) { + // unpush + for(int j=0; j<24; ++j) { + if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F)) { + if(cache->neoncache[j].n<cache->stack_push) + cache->neoncache[j].v = 0; + else + cache->neoncache[j].n-=cache->stack_push; + } + } + cache->x87stack-=cache->stack_push; + cache->stack-=cache->stack_push; + cache->stack_push = 0; + } + cache->x87stack+=cache->stack_pop; + cache->stack_next = cache->stack; + cache->stack_pop = 0; + cache->barrier = 0; + // And now, rebuild the x87cache info with neoncache + cache->mmxcount = 0; + cache->fpu_scratch = 0; + cache->fpu_extra_qscratch = 0; + cache->fpu_reg = 0; + for(int i=0; i<8; ++i) { + cache->x87cache[i] = -1; + cache->mmxcache[i] = -1; + cache->x87reg[i] = 0; + cache->ssecache[i*2].v = -1; + cache->ssecache[i*2+1].v = -1; + } + int x87reg = 0; + for(int i=0; i<24; ++i) { + if(cache->neoncache[i].v) { + cache->fpuused[i] = 1; + switch (cache->neoncache[i].t) { + case NEON_CACHE_MM: + cache->mmxcache[cache->neoncache[i].n] = i; + ++cache->mmxcount; + ++cache->fpu_reg; + break; + case NEON_CACHE_XMMR: + case NEON_CACHE_XMMW: + cache->ssecache[cache->neoncache[i].n].reg = i; + cache->ssecache[cache->neoncache[i].n].write = (cache->neoncache[i].t==NEON_CACHE_XMMW)?1:0; + ++cache->fpu_reg; + break; + case NEON_CACHE_ST_F: + case NEON_CACHE_ST_D: + cache->x87cache[x87reg] = cache->neoncache[i].n; + cache->x87reg[x87reg] = i; + ++x87reg; + ++cache->fpu_reg; + break; + case NEON_CACHE_SCR: + cache->fpuused[i] = 0; + cache->neoncache[i].v = 0; + break; + } + } else { + cache->fpuused[i] = 0; + } + } } #define F8 *(uint8_t*)(addr++) @@ -464,3 +823,36 @@ int isNativeCall(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t* calladdress, int #undef PK32 #undef PK } + +const char* getCacheName(int t, int n) +{ + static char buff[20]; + switch(t) { + case NEON_CACHE_ST_D: sprintf(buff, "ST%d", n); break; + case NEON_CACHE_ST_F: sprintf(buff, "st%d", n); break; + case NEON_CACHE_MM: sprintf(buff, "MM%d", n); break; + case NEON_CACHE_XMMW: sprintf(buff, "XMM%d", n); break; + case NEON_CACHE_XMMR: sprintf(buff, "xmm%d", n); break; + case NEON_CACHE_SCR: sprintf(buff, "Scratch"); break; + case NEON_CACHE_NONE: buff[0]='\0'; break; + } + return buff; +} +
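fpuCacheNeedsTransform above reduces to a slot-by-slot comparison between the cache state at the jump site and the unwound state the target block expects. A reduced model of that test in plain C (simplified types; the real code compares neon_cache_t values and also checks the x87 stack depth):

    enum { NONE = 0, ST_F, ST_D, MM, XMMR, XMMW };
    typedef struct { int t, n; } slot_t;     /* cached type + x64 register number */

    /* does the cache at the branch need fixing up before jumping to a
       block whose entry state is 'want'? */
    static int needs_transform(const slot_t* have, const slot_t* want, int nslots)
    {
        for (int i = 0; i < nslots; ++i) {
            if (!have[i].t && !want[i].t) continue;   /* both slots empty        */
            if (!have[i].t || !want[i].t) return 1;   /* one side empty          */
            if (have[i].n != want[i].n)   return 1;   /* not the same x64 reg    */
            if (have[i].t == want[i].t)   continue;   /* exact match             */
            if (have[i].t == XMMR && want[i].t == XMMW)
                continue; /* a clean cached copy still satisfies a writable slot */
            return 1;     /* any other type mismatch needs a transform           */
        }
        return 0;
    }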
+// is inst clean for a son branch? +int isInstClean(dynarec_arm_t* dyn, int ninst) +{ + // check flags cache + if(dyn->insts[ninst].f_entry.dfnone || dyn->insts[ninst].f_entry.pending) + return 0; + if(dyn->insts[ninst].x64.state_flags) + return 0; + // check neoncache + neoncache_t* n = &dyn->insts[ninst].n; + if(n->news || n->stack || n->stack_next) + return 0; + for(int i=0; i<24; ++i) + if(n->neoncache[i].v) + return 0; + return 1; +} diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h index d4c861c9..bf6c270f 100755 --- a/src/dynarec/arm64/dynarec_arm64_functions.h +++ b/src/dynarec/arm64/dynarec_arm64_functions.h @@ -43,16 +43,43 @@ int fpu_get_scratch(dynarec_arm_t* dyn); // Reset scratch regs counter void fpu_reset_scratch(dynarec_arm_t* dyn); // Get an x87 double reg -int fpu_get_reg_x87(dynarec_arm_t* dyn); +int fpu_get_reg_x87(dynarec_arm_t* dyn, int t, int n); // Get an MMX double reg int fpu_get_reg_emm(dynarec_arm_t* dyn, int emm); // Get an XMM quad reg -int fpu_get_reg_xmm(dynarec_arm_t* dyn, int xmm); +int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm); // Free a FPU/MMX/XMM reg void fpu_free_reg(dynarec_arm_t* dyn, int reg); // Reset fpu regs counter void fpu_reset_reg(dynarec_arm_t* dyn); +// ---- Neon cache functions +// Get type for STx +int neoncache_get_st(dynarec_arm_t* dyn, int ninst, int a); +// Get if STx is FLOAT or DOUBLE +int neoncache_get_st_f(dynarec_arm_t* dyn, int ninst, int a); +// Get actual type for STx +int neoncache_get_current_st(dynarec_arm_t* dyn, int ninst, int a); +// Get if actual STx is FLOAT or DOUBLE +int neoncache_get_current_st_f(dynarec_arm_t* dyn, int a); +// Back-propagate a change float->double +void neoncache_promote_double(dynarec_arm_t* dyn, int ninst, int a); +// Combine and propagate if needed (pass 1 only) +int neoncache_combine_st(dynarec_arm_t* dyn, int ninst, int a, int b); // with stack current dyn->n_stack* + +// FPU Cache transformation (for loops) +int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int i1); + +// Undo the changes of a neoncache to get the status before the instruction +void neoncacheUnwind(neoncache_t* cache); + +// is inst clean for a son branch? +int isInstClean(dynarec_arm_t* dyn, int ninst); + +// predecessor access +int isPred(dynarec_arm_t* dyn, int ninst, int pred); +int getNominalPred(dynarec_arm_t* dyn, int ninst); + // Get if ED will have the correct parity. Not emitting anything. Parity is 2 for DWORD or 3 for QWORD int getedparity(dynarec_arm_t* dyn, int ninst, uintptr_t addr, uint8_t nextop, int parity, int delta); // Do the GETED, but don't emit anything... @@ -61,4 +88,6 @@ uintptr_t fakeed(dynarec_arm_t* dyn, uintptr_t addr, int ninst, uint8_t nextop); // Is what pointed at addr a native call? And if yes, to what function?
int isNativeCall(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t* calladdress, int* retn); +const char* getCacheName(int t, int n); + #endif //__DYNAREC_ARM_FUNCTIONS_H__ \ No newline at end of file diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c index fc28663c..5fb55a6a 100755 --- a/src/dynarec/arm64/dynarec_arm64_helper.c +++ b/src/dynarec/arm64/dynarec_arm64_helper.c @@ -3,6 +3,7 @@ #include <stddef.h> #include <pthread.h> #include <errno.h> +#include <assert.h> #include "debug.h" #include "box64context.h" @@ -539,7 +540,7 @@ void call_n(dynarec_arm_t* dyn, int ninst, void* fnc, int w) if(abs(w)>1) { MESSAGE(LOG_DUMP, "Getting %d XMM args\n", abs(w)-1); for(int i=0; i<abs(w)-1; ++i) { - sse_get_reg(dyn, ninst, x7, i); + sse_get_reg(dyn, ninst, x7, i, w); } } if(w<0) { @@ -556,7 +557,7 @@ void call_n(dynarec_arm_t* dyn, int ninst, void* fnc, int w) // native call TABLE64(16, (uintptr_t)fnc); // using x16 as scratch reg for call address BLR(16); - // put return value in x86 regs + // put return value in x64 regs if(w>0) { MOVx_REG(xRAX, 0); MOVx_REG(xRDX, x1); @@ -598,26 +599,32 @@ void grab_segdata(dynarec_arm_t* dyn, uintptr_t addr, int ninst, int reg, int se } // x87 stuff -static void x87_reset(dynarec_arm_t* dyn, int ninst) +static void x87_reset(dynarec_arm_t* dyn) { - (void)ninst; -#if STEP > 1 for (int i=0; i<8; ++i) - dyn->x87cache[i] = -1; - dyn->x87stack = 0; -#else - (void)dyn; -#endif + dyn->n.x87cache[i] = -1; + dyn->n.x87stack = 0; + dyn->n.stack = 0; + dyn->n.stack_next = 0; + dyn->n.stack_pop = 0; + dyn->n.stack_push = 0; + dyn->n.combined1 = dyn->n.combined2 = 0; + dyn->n.swapped = 0; + dyn->n.barrier = 0; + for(int i=0; i<24; ++i) + if(dyn->n.neoncache[i].t == NEON_CACHE_ST_F || dyn->n.neoncache[i].t == NEON_CACHE_ST_D) + dyn->n.neoncache[i].v = 0; } void x87_stackcount(dynarec_arm_t* dyn, int ninst, int scratch) { -#if STEP > 1 MAYUSE(scratch); - if(!dyn->x87stack) + if(!dyn->n.x87stack) return; - MESSAGE(LOG_DUMP, "\tSynch x87 Stackcount (%d)\n", dyn->x87stack); - int a = dyn->x87stack; + if(dyn->n.mmxcount) + mmx_purgecache(dyn, ninst, 0, scratch); + MESSAGE(LOG_DUMP, "\tSynch x87 Stackcount (%d)\n", dyn->n.x87stack); + int a = dyn->n.x87stack; // Add x87stack to emu fpu_stack LDRw_U12(scratch, xEmu, offsetof(x64emu_t, fpu_stack)); if(a>0) { @@ -635,83 +642,106 @@ void x87_stackcount(dynarec_arm_t* dyn, int ninst, int scratch) } ANDw_mask(scratch, scratch, 0, 2); //mask=7 STRw_U12(scratch, xEmu, offsetof(x64emu_t, top)); - // reset x87stack - dyn->x87stack = 0; + // reset x87stack, but not the stack count of neoncache + dyn->n.x87stack = 0; + dyn->n.stack_next -= dyn->n.stack; + dyn->n.stack = 0; MESSAGE(LOG_DUMP, "\t------x87 Stackcount\n"); -#else - (void)dyn; (void)ninst; (void)scratch; -#endif } -int x87_do_push(dynarec_arm_t* dyn, int ninst) +int neoncache_st_coherency(dynarec_arm_t* dyn, int ninst, int a, int b) { - (void)ninst; -#if STEP > 1 - dyn->x87stack+=1; + int i1 = neoncache_get_st(dyn, ninst, a); + int i2 = neoncache_get_st(dyn, ninst, b); + if(i1!=i2) { + MESSAGE(LOG_DUMP, "Warning, ST cache incoherent between ST%d(%d) and ST%d(%d)\n", a, i1, b, i2); + } + + return i1; +} + +// On step 1, Float/Double for ST is actually computed and back-propagated +// On step 2-3, the value is just read from inst[...].n.neoncache[..]
+// the reg returned is *2 for FLOAT +int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t) +{ + if(dyn->n.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + dyn->n.x87stack+=1; + dyn->n.stack+=1; + dyn->n.stack_next+=1; + dyn->n.stack_push+=1; // move all regs in cache, and find a free one + for(int j=0; j<24; ++j) + if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D) || (dyn->n.neoncache[j].t == NEON_CACHE_ST_F)) + ++dyn->n.neoncache[j].n; int ret = -1; for(int i=0; i<8; ++i) - if(dyn->x87cache[i]!=-1) - ++dyn->x87cache[i]; + if(dyn->n.x87cache[i]!=-1) + ++dyn->n.x87cache[i]; else if(ret==-1) { - dyn->x87cache[i] = 0; - ret=dyn->x87reg[i]=fpu_get_reg_x87(dyn); + dyn->n.x87cache[i] = 0; + ret=dyn->n.x87reg[i]=fpu_get_reg_x87(dyn, t, 0); + #if STEP == 1 + // need to check if reg is compatible with float + if((ret>15) && (t == NEON_CACHE_ST_F)) + dyn->n.neoncache[ret].t = NEON_CACHE_ST_D; + #else + dyn->n.neoncache[ret].t = X87_ST0; + #endif } return ret; -#else - (void)dyn; - return 0; -#endif } void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1) { -#if STEP > 1 - dyn->x87stack+=1; + if(dyn->n.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + dyn->n.x87stack+=1; + dyn->n.stack+=1; + dyn->n.stack_next+=1; + dyn->n.stack_push+=1; // move all regs in cache + for(int j=0; j<24; ++j) + if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D) || (dyn->n.neoncache[j].t == NEON_CACHE_ST_F)) + ++dyn->n.neoncache[j].n; for(int i=0; i<8; ++i) - if(dyn->x87cache[i]!=-1) - ++dyn->x87cache[i]; + if(dyn->n.x87cache[i]!=-1) + ++dyn->n.x87cache[i]; if(s1) x87_stackcount(dyn, ninst, s1); -#else - (void)dyn; (void)ninst; (void)s1; -#endif } -void x87_do_pop(dynarec_arm_t* dyn, int ninst) +void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1) { - (void)ninst; -#if STEP > 1 - dyn->x87stack-=1; + if(dyn->n.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + dyn->n.x87stack-=1; + dyn->n.stack_next-=1; + dyn->n.stack_pop+=1; // move all regs in cache, popping ST0 for(int i=0; i<8; ++i) - if(dyn->x87cache[i]!=-1) { - --dyn->x87cache[i]; - if(dyn->x87cache[i]==-1) { - fpu_free_reg(dyn, dyn->x87reg[i]); - dyn->x87reg[i] = -1; + if(dyn->n.x87cache[i]!=-1) { + --dyn->n.x87cache[i]; + if(dyn->n.x87cache[i]==-1) { + fpu_free_reg(dyn, dyn->n.x87reg[i]); + dyn->n.x87reg[i] = -1; } } -#else - (void)dyn; -#endif } -void x87_purgecache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) +void x87_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int s3) { - (void)ninst; -#if STEP > 1 - MAYUSE(s1); MAYUSE(s2); MAYUSE(s3); int ret = 0; for (int i=0; i<8 && !ret; ++i) - if(dyn->x87cache[i] != -1) + if(dyn->n.x87cache[i] != -1) ret = 1; - if(!ret && !dyn->x87stack) // nothing to do + if(!ret && !dyn->n.x87stack) // nothing to do return; - MESSAGE(LOG_DUMP, "\tPurge x87 Cache and Synch Stackcount (%+d)\n", dyn->x87stack); - int a = dyn->x87stack; + MESSAGE(LOG_DUMP, "\tPurge %sx87 Cache and Synch Stackcount (%+d)---\n", next?"locally ":"", dyn->n.x87stack); + int a = dyn->n.x87stack; if(a!=0) { // reset x87stack - dyn->x87stack = 0; + if(!next) + dyn->n.x87stack = 0; // Add x87stack to emu fpu_stack LDRw_U12(s2, xEmu, offsetof(x64emu_t, fpu_stack)); if(a>0) { @@ -753,29 +783,54 @@ void x87_purgecache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) // Get top // loop all cache entries for (int i=0; i<8; ++i) - if(dyn->x87cache[i]!=-1) { - ADDw_U12(s3, s2, dyn->x87cache[i]); + #if STEP == 1 + if(!next) { // don't force promotion here + // pre-apply pop, because purge happens
in-between + neoncache_promote_double(dyn, ninst, dyn->n.x87cache[i]+dyn->n.stack_pop); + } + #endif + #if STEP == 3 + if(!next && neoncache_get_st_f(dyn, ninst, dyn->n.x87cache[i])>=0) { + MESSAGE(LOG_DUMP, "Warning, incoherency with purged ST%d cache\n", dyn->n.x87cache[i]); + } + #endif + ADDw_U12(s3, s2, dyn->n.x87cache[i]); ANDw_mask(s3, s3, 0, 2); //mask=7 // (emu->top + st)&7 - VSTR64_REG_LSL3(dyn->x87reg[i], s1, s3); - fpu_free_reg(dyn, dyn->x87reg[i]); - dyn->x87reg[i] = -1; - dyn->x87cache[i] = -1; + if(next) { + // need to check if a ST_F needs local promotion + if(neoncache_get_st_f(dyn, ninst, dyn->n.x87cache[i])>=0) { + FCVT_S_D(0, dyn->n.x87reg[i]); + VSTR64_REG_LSL3(0, s1, s3); // save the value + } else { + VSTR64_REG_LSL3(dyn->n.x87reg[i], s1, s3); // save the value + } + } else { + VSTR64_REG_LSL3(dyn->n.x87reg[i], s1, s3); + fpu_free_reg(dyn, dyn->n.x87reg[i]); + dyn->n.x87reg[i] = -1; + dyn->n.x87cache[i] = -1; + //dyn->n.stack_pop+=1; //no pop, but the purge because of barrier will have the n.barrier flags set + } } } -#else - (void)dyn; (void)s1; (void)s2; (void)s3; -#endif + if(!next) { + dyn->n.stack_next = 0; + #if STEP < 2 + // refresh the cached values, in case it's a purge outside an instruction + dyn->insts[ninst].n.barrier = 1; + #endif + } + MESSAGE(LOG_DUMP, "\t---Purge x87 Cache and Synch Stackcount\n"); } #ifdef HAVE_TRACE static void x87_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) { -#if STEP > 1 - MAYUSE(s2); MAYUSE(s3); x87_stackcount(dyn, ninst, s1); int ret = 0; for (int i=0; (i<8) && (!ret); ++i) - if(dyn->x87cache[i] != -1) + if(dyn->n.x87cache[i] != -1) ret = 1; if(!ret) // nothing to do return; @@ -785,75 +840,88 @@ static void x87_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int LDRw_U12(s2, xEmu, offsetof(x64emu_t, top)); // loop all cache entries for (int i=0; i<8; ++i) - if(dyn->x87cache[i]!=-1) { - ADDw_U12(s3, s2, dyn->x87cache[i]); + if(dyn->n.x87cache[i]!=-1) { + ADDw_U12(s3, s2, dyn->n.x87cache[i]); ANDw_mask(s3, s3, 0, 2); // mask=7 // (emu->top + i)&7 - VSTR64_REG_LSL3(dyn->x87reg[i], s1, s3); + VSTR64_REG_LSL3(dyn->n.x87reg[i], s1, s3); } -#else - (void)dyn; (void)ninst; (void)s1; (void)s2; (void)s3; -#endif } #endif -int x87_get_cache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) +int x87_get_current_cache(dynarec_arm_t* dyn, int ninst, int st, int t) { - (void)ninst; -#if STEP > 1 - MAYUSE(s1); MAYUSE(s2); // search in cache first - for (int i=0; i<8; ++i) - if(dyn->x87cache[i]==st) + for (int i=0; i<8; ++i) { + if(dyn->n.x87cache[i]==st) { + #if STEP == 1 + if(t==NEON_CACHE_ST_D && (dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F)) + neoncache_promote_double(dyn, ninst, st); + #endif return i; - MESSAGE(LOG_DUMP, "\tCreate x87 Cache for ST%d\n", st); + } + assert(dyn->n.x87cache[i]<8); + } + return -1; +} + +int x87_get_cache(dynarec_arm_t* dyn, int ninst, int populate, int s1, int s2, int st, int t) +{ + if(dyn->n.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + int ret = x87_get_current_cache(dyn, ninst, st, t); + if(ret!=-1) + return ret; + MESSAGE(LOG_DUMP, "\tCreate %sx87 Cache for ST%d\n", populate?"and populate ":"", st); // get a free spot - int ret = -1; for (int i=0; (i<8) && (ret==-1); ++i) - if(dyn->x87cache[i]==-1) + if(dyn->n.x87cache[i]==-1) ret = i; // found, setup and grab the value - dyn->x87cache[ret] = st; - dyn->x87reg[ret] = fpu_get_reg_x87(dyn); - ADDx_U12(s1, xEmu, offsetof(x64emu_t, x87)); - LDRw_U12(s2, xEmu, offsetof(x64emu_t, top)); - int a = st -
dyn->x87stack; - if(a) { - if(a<0) { - SUBw_U12(s2, s2, -a); - } else { - ADDw_U12(s2, s2, a); + dyn->n.x87cache[ret] = st; + dyn->n.x87reg[ret] = fpu_get_reg_x87(dyn, t, st); + if(populate) { + ADDx_U12(s1, xEmu, offsetof(x64emu_t, x87)); + LDRw_U12(s2, xEmu, offsetof(x64emu_t, top)); + int a = st - dyn->n.x87stack; + if(a) { + if(a<0) { + SUBw_U12(s2, s2, -a); + } else { + ADDw_U12(s2, s2, a); + } + ANDw_mask(s2, s2, 0, 2); //mask=7 } - ANDw_mask(s2, s2, 0, 2); //mask=7 + VLDR64_REG_LSL3(dyn->n.x87reg[ret], s1, s2); } - VLDR64_REG_LSL3(dyn->x87reg[ret], s1, s2); MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); return ret; -#else - (void)dyn; (void)s1; (void)s2; (void)st; - return 0; -#endif } - -int x87_get_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a) +int x87_get_neoncache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) { -#if STEP > 1 - return dyn->x87reg[x87_get_cache(dyn, ninst, s1, s2, a)]; -#else - (void)dyn; (void)ninst; (void)s1; (void)s2; (void)a; - return 0; -#endif + for(int ii=0; ii<24; ++ii) + if((dyn->n.neoncache[ii].t == NEON_CACHE_ST_F || dyn->n.neoncache[ii].t == NEON_CACHE_ST_D) + && dyn->n.neoncache[ii].n==st) + return ii; + assert(0); + return -1; +} +int x87_get_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int t) +{ + return dyn->n.x87reg[x87_get_cache(dyn, ninst, 1, s1, s2, a, t)]; +} +int x87_get_st_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int t) +{ + return dyn->n.x87reg[x87_get_cache(dyn, ninst, 0, s1, s2, a, t)]; } void x87_refresh(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) { -#if STEP > 1 - MAYUSE(s2); x87_stackcount(dyn, ninst, s1); int ret = -1; for (int i=0; (i<8) && (ret==-1); ++i) - if(dyn->x87cache[i] == st) + if(dyn->n.x87cache[i] == st) ret = i; if(ret==-1) // nothing to do return; @@ -867,21 +935,16 @@ void x87_refresh(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) ADDw_U12(s2, s2, st); ANDw_mask(s2, s2, 0, 2); //mask=7 // (emu->top + i)&7 } - VLDR64_REG_LSL3(dyn->x87reg[ret], s1, s2); + VLDR64_REG_LSL3(dyn->n.x87reg[ret], s1, s2); MESSAGE(LOG_DUMP, "\t--------x87 Cache for ST%d\n", st); -#else - (void)dyn; (void)ninst; (void)s1; (void)s2; (void)st; -#endif } void x87_forget(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) { -#if STEP > 1 - MAYUSE(s2); x87_stackcount(dyn, ninst, s1); int ret = -1; for (int i=0; (i<8) && (ret==-1); ++i) - if(dyn->x87cache[i] == st) + if(dyn->n.x87cache[i] == st) ret = i; if(ret==-1) // nothing to do return; @@ -895,37 +958,34 @@ void x87_forget(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) ADDw_U12(s2, s2, st); ANDw_mask(s2, s2, 0, 2); //mask=7 // (emu->top + i)&7 } - VSTR64_REG_LSL3(dyn->x87reg[ret], s1, s2); + VSTR64_REG_LSL3(dyn->n.x87reg[ret], s1, s2); MESSAGE(LOG_DUMP, "\t--------x87 Cache for ST%d\n", st); // and forget that cache - fpu_free_reg(dyn, dyn->x87reg[ret]); - dyn->x87cache[ret] = -1; - dyn->x87reg[ret] = -1; -#else - (void)dyn; (void)ninst; (void)s1; (void)s2; (void)st; -#endif + fpu_free_reg(dyn, dyn->n.x87reg[ret]); + dyn->n.neoncache[dyn->n.x87reg[ret]].v = 0; + dyn->n.x87cache[ret] = -1; + dyn->n.x87reg[ret] = -1; } void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) { - (void)ninst; -#if STEP > 1 - MAYUSE(s1); MAYUSE(s2); + if(dyn->n.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); // search in cache first for (int i=0; i<8; ++i) - if(dyn->x87cache[i]==st) { + if(dyn->n.x87cache[i]==st) { // refresh the value MESSAGE(LOG_DUMP, "\tRefresh x87 Cache for ST%d\n", st); ADDx_U12(s1, xEmu, 
offsetof(x64emu_t, x87)); LDRw_U12(s2, xEmu, offsetof(x64emu_t, top)); - int a = st - dyn->x87stack; + int a = st - dyn->n.x87stack; if(a<0) { SUBw_U12(s2, s2, -a); } else { ADDw_U12(s2, s2, a); } ANDw_mask(s2, s2, 0, 2); //mask=7 // (emu->top + i)&7 - VLDR64_REG_LSL3(dyn->x87reg[i], s1, s2); + VLDR64_REG_LSL3(dyn->n.x87reg[i], s1, s2); MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); // ok return; @@ -935,25 +995,42 @@ void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) // get a free spot int ret = -1; for (int i=0; (i<8) && (ret==-1); ++i) - if(dyn->x87cache[i]==-1) + if(dyn->n.x87cache[i]==-1) ret = i; // found, setup and grab the value - dyn->x87cache[ret] = st; - dyn->x87reg[ret] = fpu_get_reg_x87(dyn); + dyn->n.x87cache[ret] = st; + dyn->n.x87reg[ret] = fpu_get_reg_x87(dyn, NEON_CACHE_ST_D, st); ADDx_U12(s1, xEmu, offsetof(x64emu_t, x87)); LDRw_U12(s2, xEmu, offsetof(x64emu_t, top)); - int a = st - dyn->x87stack; + int a = st - dyn->n.x87stack; if(a<0) { SUBw_U12(s2, s2, -a); } else { ADDw_U12(s2, s2, a); } ANDw_mask(s2, s2, 0, 2); //mask=7 // (emu->top + i)&7 - VLDR64_REG_LSL3(dyn->x87reg[ret], s1, s2); + VLDR64_REG_LSL3(dyn->n.x87reg[ret], s1, s2); MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); -#else - (void)dyn; (void)s1; (void)s2; (void)st; -#endif +} + +void x87_swapreg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int b) +{ + int i1, i2, i3; + i1 = x87_get_cache(dyn, ninst, 1, s1, s2, b, X87_ST(b)); + i2 = x87_get_cache(dyn, ninst, 1, s1, s2, a, X87_ST(a)); + i3 = dyn->n.x87cache[i1]; + dyn->n.x87cache[i1] = dyn->n.x87cache[i2]; + dyn->n.x87cache[i2] = i3; + // swap those too + int j1, j2, j3; + j1 = x87_get_neoncache(dyn, ninst, s1, s2, b); + j2 = x87_get_neoncache(dyn, ninst, s1, s2, a); + j3 = dyn->n.neoncache[j1].n; + dyn->n.neoncache[j1].n = dyn->n.neoncache[j2].n; + dyn->n.neoncache[j2].n = j3; + // mark as swapped + dyn->n.swapped = 1; + dyn->n.combined1= a; dyn->n.combined2=b; } // Set rounding according to cw flags, return reg to restore flags @@ -977,7 +1054,7 @@ int sse_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) MAYUSE(dyn); MAYUSE(ninst); MAYUSE(s1); MAYUSE(s2); LDRH_U12(s1, xEmu, offsetof(x64emu_t, mxcsr)); - RBITw(s2, s1); // round is on bits 13-14 on x86, + RBITw(s2, s1); // round is on bits 13-14 on x64, LSRw(s2, s2, 17); // but we want the reverse of that MRS_fpcr(s1); // get fpscr MOVx_REG(s3, s1); @@ -995,254 +1072,536 @@ void x87_restoreround(dynarec_arm_t* dyn, int ninst, int s1) } // MMX helpers -static void mmx_reset(dynarec_arm_t* dyn, int ninst) +static void mmx_reset(dynarec_arm_t* dyn) { - (void)ninst; -#if STEP > 1 - MAYUSE(dyn); + dyn->n.mmxcount = 0; for (int i=0; i<8; ++i) - dyn->mmxcache[i] = -1; -#else - (void)dyn; -#endif + dyn->n.mmxcache[i] = -1; +} +static int isx87Empty(dynarec_arm_t* dyn) +{ + for (int i=0; i<8; ++i) + if(dyn->n.x87cache[i] != -1) + return 0; + return 1; } + // get neon register for a MMX reg, create the entry if needed -int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a) +int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a) { - (void)ninst; (void)s1; -#if STEP > 1 - if(dyn->mmxcache[a]!=-1) - return dyn->mmxcache[a]; - int ret = dyn->mmxcache[a] = fpu_get_reg_emm(dyn, a); + if(!dyn->n.x87stack && isx87Empty(dyn)) + x87_purgecache(dyn, ninst, 0, s1, s2, s3); + if(dyn->n.mmxcache[a]!=-1) + return dyn->n.mmxcache[a]; + ++dyn->n.mmxcount; + int ret = dyn->n.mmxcache[a] = fpu_get_reg_emm(dyn, a); VLDR64_U12(ret, xEmu, 
offsetof(x64emu_t, mmx[a])); return ret; -#else - (void)dyn; (void)a; - return 0; -#endif } // get neon register for a MMX reg, but don't try to synch it if it needed to be created -int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a) +int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a) { - (void)ninst; (void)s1; -#if STEP > 1 - if(dyn->mmxcache[a]!=-1) - return dyn->mmxcache[a]; - int ret = dyn->mmxcache[a] = fpu_get_reg_emm(dyn, a); + if(!dyn->n.x87stack && isx87Empty(dyn)) + x87_purgecache(dyn, ninst, 0, s1, s2, s3); + if(dyn->n.mmxcache[a]!=-1) + return dyn->n.mmxcache[a]; + ++dyn->n.mmxcount; + int ret = dyn->n.mmxcache[a] = fpu_get_reg_emm(dyn, a); return ret; -#else - (void)dyn; (void)a; - return 0; -#endif } // purge the MMX cache only(needs 3 scratch registers) -void mmx_purgecache(dynarec_arm_t* dyn, int ninst, int s1) +void mmx_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1) { - (void)ninst; (void)s1; -#if STEP > 1 + if(!dyn->n.mmxcount) + return; + if(!next) + dyn->n.mmxcount = 0; int old = -1; for (int i=0; i<8; ++i) - if(dyn->mmxcache[i]!=-1) { + if(dyn->n.mmxcache[i]!=-1) { if (old==-1) { - MESSAGE(LOG_DUMP, "\tPurge MMX Cache ------\n"); + MESSAGE(LOG_DUMP, "\tPurge %sMMX Cache ------\n", next?"locally ":""); ++old; } - VSTR64_U12(dyn->mmxcache[i], xEmu, offsetof(x64emu_t, mmx[i])); - fpu_free_reg(dyn, dyn->mmxcache[i]); - dyn->mmxcache[i] = -1; + VSTR64_U12(dyn->n.mmxcache[i], xEmu, offsetof(x64emu_t, mmx[i])); + if(!next) { + fpu_free_reg(dyn, dyn->n.mmxcache[i]); + dyn->n.mmxcache[i] = -1; + } } if(old!=-1) { MESSAGE(LOG_DUMP, "\t------ Purge MMX Cache\n"); } -#else - (void)dyn; -#endif } #ifdef HAVE_TRACE static void mmx_reflectcache(dynarec_arm_t* dyn, int ninst, int s1) { - (void) ninst; (void)s1; -#if STEP > 1 for (int i=0; i<8; ++i) - if(dyn->mmxcache[i]!=-1) { - VLDR64_U12(dyn->mmxcache[i], xEmu, offsetof(x64emu_t, mmx[i])); + if(dyn->n.mmxcache[i]!=-1) { + VLDR64_U12(dyn->n.mmxcache[i], xEmu, offsetof(x64emu_t, mmx[i])); } -#else - (void)dyn; -#endif } #endif // SSE / SSE2 helpers -static void sse_reset(dynarec_arm_t* dyn, int ninst) +static void sse_reset(dynarec_arm_t* dyn) { - (void)ninst; -#if STEP > 1 for (int i=0; i<16; ++i) - dyn->ssecache[i] = -1; -#else - (void)dyn; -#endif + dyn->n.ssecache[i].v = -1; } // get neon register for a SSE reg, create the entry if needed -int sse_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a) +int sse_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite) { - (void) ninst; (void)s1; -#if STEP > 1 - if(dyn->ssecache[a]!=-1) - return dyn->ssecache[a]; - int ret = dyn->ssecache[a] = fpu_get_reg_xmm(dyn, a); + if(dyn->n.ssecache[a].v!=-1) { + if(forwrite) { + dyn->n.ssecache[a].write = 1; // update only if forwrite + dyn->n.neoncache[dyn->n.ssecache[a].reg].t = NEON_CACHE_XMMW; + } + return dyn->n.ssecache[a].reg; + } + dyn->n.ssecache[a].reg = fpu_get_reg_xmm(dyn, forwrite?NEON_CACHE_XMMW:NEON_CACHE_XMMR, a); + int ret = dyn->n.ssecache[a].reg; + dyn->n.ssecache[a].write = forwrite; VLDR128_U12(ret, xEmu, offsetof(x64emu_t, xmm[a])); return ret; -#else - (void)dyn; (void)a; - return 0; -#endif } // get neon register for a SSE reg, but don't try to synch it if it needed to be created int sse_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a) { - (void) ninst; (void)s1; -#if STEP > 1 - if(dyn->ssecache[a]!=-1) - return dyn->ssecache[a]; - int ret = dyn->ssecache[a] = fpu_get_reg_xmm(dyn, a); - return ret; -#else - (void)dyn; (void)a; - return 0; 
-#endif + if(dyn->n.ssecache[a].v!=-1) { + dyn->n.ssecache[a].write = 1; + dyn->n.neoncache[dyn->n.ssecache[a].reg].t = NEON_CACHE_XMMW; + return dyn->n.ssecache[a].reg; + } + dyn->n.ssecache[a].reg = fpu_get_reg_xmm(dyn, NEON_CACHE_XMMW, a); + dyn->n.ssecache[a].write = 1; // it will be written... + return dyn->n.ssecache[a].reg; } // forget neon register for a SSE reg, writing it back if needed void sse_forget_reg(dynarec_arm_t* dyn, int ninst, int a) { - (void) ninst; -#if STEP > 1 - if(dyn->ssecache[a]==-1) + if(dyn->n.ssecache[a].v==-1) return; - VSTR128_U12(dyn->ssecache[a], xEmu, offsetof(x64emu_t, xmm[a])); - fpu_free_reg(dyn, dyn->ssecache[a]); - dyn->ssecache[a] = -1; -#else - (void)dyn; (void)a; -#endif + if(dyn->n.neoncache[dyn->n.ssecache[a].reg].t == NEON_CACHE_XMMW) { + VSTR128_U12(dyn->n.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a])); + } + fpu_free_reg(dyn, dyn->n.ssecache[a].reg); + dyn->n.ssecache[a].v = -1; return; } // purge the SSE cache for XMM0..XMM7 (to use before function native call) void sse_purge07cache(dynarec_arm_t* dyn, int ninst, int s1) { - (void) ninst; (void)s1; -#if STEP > 1 int old = -1; for (int i=0; i<8; ++i) - if(dyn->ssecache[i]!=-1) { + if(dyn->n.ssecache[i].v!=-1) { if (old==-1) { MESSAGE(LOG_DUMP, "\tPurge XMM0..7 Cache ------\n"); ++old; } - VSTR128_U12(dyn->ssecache[i], xEmu, offsetof(x64emu_t, xmm[i])); - fpu_free_reg(dyn, dyn->ssecache[i]); - dyn->ssecache[i] = -1; + if(dyn->n.neoncache[dyn->n.ssecache[i].reg].t == NEON_CACHE_XMMW) { + VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + } + fpu_free_reg(dyn, dyn->n.ssecache[i].reg); + dyn->n.ssecache[i].v = -1; } if(old!=-1) { MESSAGE(LOG_DUMP, "\t------ Purge XMM0..7 Cache\n"); } -#else - (void)dyn; -#endif } // purge the SSE cache only -static void sse_purgecache(dynarec_arm_t* dyn, int ninst, int s1) +static void sse_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1) { - (void) ninst; (void)s1; -#if STEP > 1 int old = -1; for (int i=0; i<16; ++i) - if(dyn->ssecache[i]!=-1) { - if (old==-1) { - MESSAGE(LOG_DUMP, "\tPurge SSE Cache ------\n"); - ++old; + if(dyn->n.ssecache[i].v!=-1) { + if(dyn->n.ssecache[i].write) { + if (old==-1) { + MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":""); + ++old; + } + VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); + if(!next) { + fpu_free_reg(dyn, dyn->n.ssecache[i].reg); + dyn->n.ssecache[i].v = -1; + } } - VSTR128_U12(dyn->ssecache[i], xEmu, offsetof(x64emu_t, xmm[i])); - fpu_free_reg(dyn, dyn->ssecache[i]); - dyn->ssecache[i] = -1; } if(old!=-1) { MESSAGE(LOG_DUMP, "\t------ Purge SSE Cache\n"); } -#else - (void)dyn; -#endif } #ifdef HAVE_TRACE static void sse_reflectcache(dynarec_arm_t* dyn, int ninst, int s1) { - (void) ninst; (void)s1; -#if STEP > 1 for (int i=0; i<16; ++i) - if(dyn->ssecache[i]!=-1) { - VSTR128_U12(dyn->ssecache[i], xEmu, offsetof(x64emu_t, xmm[i])); + if(dyn->n.ssecache[i].v!=-1 && dyn->n.ssecache[i].write) { + VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); } -#else - (void)dyn; -#endif } #endif void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07) { - (void) ninst; (void)s1; -#if STEP > 1 int start = not07?8:0; // only SSE regs need to be pushed back to xEmu int n=0; for (int i=start; i<16; i++) - if(dyn->ssecache[i]!=-1) + if(dyn->n.ssecache[i].v!=-1) ++n; if(!n) return; MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n); for (int i=start; i<16; ++i) - if(dyn->ssecache[i]!=-1) { - VSTR128_U12(dyn->ssecache[i], xEmu,
offsetof(x64emu_t, xmm[i])); + if(dyn->n.ssecache[i].v!=-1) { + VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); } MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n); -#else - (void)dyn; -#endif } void fpu_popcache(dynarec_arm_t* dyn, int ninst, int s1, int not07) { - (void) ninst; (void)s1; -#if STEP > 1 int start = not07?8:0; // only SSE regs need to be popped back from xEmu int n=0; for (int i=start; i<16; i++) - if(dyn->ssecache[i]!=-1) + if(dyn->n.ssecache[i].v!=-1) ++n; if(!n) return; MESSAGE(LOG_DUMP, "\tPop XMM Cache (%d)------\n", n); for (int i=start; i<16; ++i) - if(dyn->ssecache[i]!=-1) { - VLDR128_U12(dyn->ssecache[i], xEmu, offsetof(x64emu_t, xmm[i])); + if(dyn->n.ssecache[i].v!=-1) { + VLDR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i])); } MESSAGE(LOG_DUMP, "\t------- Pop XMM Cache (%d)\n", n); -#else - (void)dyn; -#endif } -void fpu_purgecache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) +void fpu_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int s3) { - x87_purgecache(dyn, ninst, s1, s2, s3); - mmx_purgecache(dyn, ninst, s1); - sse_purgecache(dyn, ninst, s1); - fpu_reset_reg(dyn); + x87_purgecache(dyn, ninst, next, s1, s2, s3); + mmx_purgecache(dyn, ninst, next, s1); + sse_purgecache(dyn, ninst, next, s1); + if(!next) + fpu_reset_reg(dyn); +} + +static int findCacheSlot(dynarec_arm_t* dyn, int ninst, int t, int n, neoncache_t* cache) +{ + neon_cache_t f; + f.n = n; f.t = t; + for(int i=0; i<24; ++i) { + if(cache->neoncache[i].v == f.v) + return i; + if(cache->neoncache[i].n == n) { + switch(cache->neoncache[i].t) { + case NEON_CACHE_ST_F: + if (t==NEON_CACHE_ST_D) + return i; + break; + case NEON_CACHE_ST_D: + if (t==NEON_CACHE_ST_F) + return i; + break; + case NEON_CACHE_XMMR: + if(t==NEON_CACHE_XMMW) + return i; + break; + case NEON_CACHE_XMMW: + if(t==NEON_CACHE_XMMR) + return i; + break; + } + } + } + return -1; +} + +static void swapCache(dynarec_arm_t* dyn, int ninst, int i, int j, neoncache_t *cache) +{ + if (i==j) + return; + int quad = 0; + if(cache->neoncache[i].t==NEON_CACHE_XMMR || cache->neoncache[i].t==NEON_CACHE_XMMW) + quad =1; + if(cache->neoncache[j].t==NEON_CACHE_XMMR || cache->neoncache[j].t==NEON_CACHE_XMMW) + quad =1; + + if(!cache->neoncache[i].v) { + // a mov is enough, no need to swap + MESSAGE(LOG_DUMP, "\t - Moving %d <- %d\n", i, j); + if(quad) { + VMOVQ(i, j); + } else { + VMOV(i, j); + } + cache->neoncache[i].v = cache->neoncache[j].v; + cache->neoncache[j].v = 0; + return; + } + // SWAP + neon_cache_t tmp; + MESSAGE(LOG_DUMP, "\t - Swapping %d <-> %d\n", i, j); + // There is no VSWP in Arm64 NEON to swap 2 register contents! + // so use a scratch...
+ #define SCRATCH 31 + if(quad) { + VMOVQ(SCRATCH, i); + VMOVQ(i, j); + VMOVQ(j, SCRATCH); + } else { + VMOV(SCRATCH, i); + VMOV(i, j); + VMOV(j, SCRATCH); + } + #undef SCRATCH + tmp.v = cache->neoncache[i].v; + cache->neoncache[i].v = cache->neoncache[j].v; + cache->neoncache[j].v = tmp.v; +} + +static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, neoncache_t *cache, int i, int t, int n) +{ + if(cache->neoncache[i].v) { + int quad = 0; + if(t==NEON_CACHE_XMMR || t==NEON_CACHE_XMMW) + quad = 1; + if(cache->neoncache[i].t==NEON_CACHE_XMMR || cache->neoncache[i].t==NEON_CACHE_XMMW) + quad = 1; + int j = i+1; + while(cache->neoncache[j].v) + ++j; + MESSAGE(LOG_DUMP, "\t - Moving away %d\n", i); + if(quad) { + VMOVQ(j, i); + } else { + VMOV(j, i); + } + cache->neoncache[j].v = cache->neoncache[i].v; + } + switch(t) { + case NEON_CACHE_XMMR: + case NEON_CACHE_XMMW: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + VLDR128_U12(i, xEmu, offsetof(x64emu_t, xmm[n])); + break; + case NEON_CACHE_MM: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + VLDR64_U12(i, xEmu, offsetof(x64emu_t, mmx[n])); + break; + case NEON_CACHE_ST_D: + case NEON_CACHE_ST_F: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + if((*s3_top) == 0xffff) { + LDRw_U12(s3, xEmu, offsetof(x64emu_t, top)); + *s3_top = 0; + } + int a = n - (*s3_top) - stack_cnt; + if(a) { + if(a<0) { + SUBw_U12(s3, s3, -a); + } else { + ADDw_U12(s3, s3, a); + } + ANDw_mask(s3, s3, 0, 2); //mask=7 // (emu->top + i)&7 + } + *s3_top += a; + *s2_val = 0; + ADDw_REG_LSL(s2, xEmu, s3, 3); + VLDR64_U12(i, s2, offsetof(x64emu_t, x87)); + if(t==NEON_CACHE_ST_F) { + FCVT_S_D(i, i); + } + break; + case NEON_CACHE_NONE: + case NEON_CACHE_SCR: + default: /* nothing done */ + MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); + break; + } + cache->neoncache[i].n = n; + cache->neoncache[i].t = t; +} + +static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, neoncache_t *cache, int i, int t, int n) +{ + switch(t) { + case NEON_CACHE_XMMR: + MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); + break; + case NEON_CACHE_XMMW: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + VSTR128_U12(i, xEmu, offsetof(x64emu_t, xmm[n])); + break; + case NEON_CACHE_MM: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + VSTR64_U12(i, xEmu, offsetof(x64emu_t, mmx[n])); + break; + case NEON_CACHE_ST_D: + case NEON_CACHE_ST_F: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + if((*s3_top)==0xffff) { + LDRw_U12(s3, xEmu, offsetof(x64emu_t, top)); + *s3_top = 0; + } + int a = n - (*s3_top) - stack_cnt; + if(a) { + if(a<0) { + SUBw_U12(s3, s3, -a); + } else { + ADDw_U12(s3, s3, a); + } + ANDw_mask(s3, s3, 0, 2); //mask=7 // (emu->top + i)&7 + } + *s3_top += a; + ADDw_REG_LSL(s2, xEmu, s3, 3); + *s2_val = 0; + if(t==NEON_CACHE_ST_F) { + FCVT_D_S(i, i); + } + VSTR64_U12(i, s2, offsetof(x64emu_t, x87)); + break; + case NEON_CACHE_NONE: + case NEON_CACHE_SCR: + default: /* nothing done */ + MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); + break; + } + cache->neoncache[i].v = 0; +} + +void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) +{ +#if STEP > 1 + int i2 = dyn->insts[ninst].x64.jmp_insts; + if(i2<0) + return; + MESSAGE(LOG_DUMP, "\tCache Transform ---- ninst=%d -> %d\n", ninst, i2);
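/* The transform below works in three steps. 1) If the target block expects
   an empty cache (a jump to the block start, or across a FLOAT barrier), a
   local purge is enough. 2) Otherwise the x87 stack depth is reconciled
   first: fpu_stack, top and the p_regs tags are updated so the emulated TOP
   matches what the target assumes. 3) Finally, every cached register the
   target does not want is unloaded (SSE first, then MMX, then the rest),
   and the remaining entries are loaded or swapped slot by slot until the
   layout matches the unwound cache_i2 state. */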
+
+void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
+{
+#if STEP > 1
+    int i2 = dyn->insts[ninst].x64.jmp_insts;
+    if(i2<0)
+        return;
+    MESSAGE(LOG_DUMP, "\tCache Transform ---- ninst=%d -> %d\n", ninst, i2);
+    if((!i2) || (dyn->insts[i2].x64.barrier&BARRIER_FLOAT)) {
+        if(dyn->n.stack_next) {
+            fpu_purgecache(dyn, ninst, 1, s1, s2, s3);
+            MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+            return;
+        }
+        for(int i=0; i<24; ++i)
+            if(dyn->n.neoncache[i].v) {     // there is something at ninst for i
+                fpu_purgecache(dyn, ninst, 1, s1, s2, s3);
+                MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+                return;
+            }
+        MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+        return;
+    }
+    neoncache_t cache_i2 = dyn->insts[i2].n;
+    neoncacheUnwind(&cache_i2);
+
+    if(!cache_i2.stack) {
+        int purge = 1;
+        for (int i=0; i<24 && purge; ++i)
+            if(cache_i2.neoncache[i].v)
+                purge = 0;
+        if(purge) {
+            fpu_purgecache(dyn, ninst, 1, s1, s2, s3);
+            MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+            return;
+        }
+    }
+    int stack_cnt = dyn->n.stack_next;
+    int s3_top = 0xffff;
+    if(stack_cnt != cache_i2.stack) {
+        MESSAGE(LOG_DUMP, "\t - adjust stack count %d -> %d -\n", stack_cnt, cache_i2.stack);
+        int a = stack_cnt - cache_i2.stack;
+        // Add x87stack to emu fpu_stack
+        LDRw_U12(s3, xEmu, offsetof(x64emu_t, fpu_stack));
+        if(a>0) {
+            ADDw_U12(s3, s3, a);
+        } else {
+            SUBw_U12(s3, s3, -a);
+        }
+        STRw_U12(s3, xEmu, offsetof(x64emu_t, fpu_stack));
+        // Sub x87stack from top, masked with 7
+        LDRw_U12(s3, xEmu, offsetof(x64emu_t, top));
+        // update tags (and top at the same time)
+        if(a>0) {
+            // new tags are set to full
+            MOVZw(s2, 0);
+            ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs));
+            for (int i=0; i<a; ++i) {
+                SUBw_U12(s3, s3, 1);
+                ANDw_mask(s3, s3, 0, 2);    // (emu->top + st)&7
+                STRw_REG_LSL2(s2, s1, s3);  // that slot is full
+            }
+        } else {
+            // empty tags
+            MOVZw(s2, 0b11);
+            ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs));
+            for (int i=0; i<-a; ++i) {
+                STRw_REG_LSL2(s2, s1, s3);  // empty slot before leaving it
+                ADDw_U12(s3, s3, 1);
+                ANDw_mask(s3, s3, 0, 2);    // (emu->top + st)&7
+            }
+        }
+        STRw_U12(s3, xEmu, offsetof(x64emu_t, top));
+        s3_top = 0;
+        stack_cnt = cache_i2.stack;
+    }
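Editor's note: when the virtual ST stack depth cached in registers differs from what the jump target expects, the generated code materializes the difference: fpu_stack is adjusted, top is walked one slot at a time, and each slot's tag in p_regs is marked full (0b00) or empty (0b11) as it is crossed. The same bookkeeping in plain C, on a toy state (field names follow the emulator, but the struct is illustrative):

```c
#include <stdio.h>

typedef struct {
    int top;        /* index of ST(0) in the 8-slot ring     */
    int fpu_stack;  /* current depth                          */
    int tags[8];    /* 0b00 = full, 0b11 = empty (x87 style)  */
} x87_toy_t;

/* Commit 'a' pending pushes (a>0) or pops (a<0) to the state,
   mirroring the tag/top walk in fpuCacheTransform. */
static void commit_stack_delta(x87_toy_t* st, int a)
{
    st->fpu_stack += a;
    if(a>0)
        for(int i=0; i<a; ++i) {
            st->top = (st->top - 1) & 7;
            st->tags[st->top] = 0b00;   /* slot becomes full  */
        }
    else
        for(int i=0; i<-a; ++i) {
            st->tags[st->top] = 0b11;   /* slot becomes empty */
            st->top = (st->top + 1) & 7;
        }
}

int main(void)
{
    x87_toy_t st = { .top = 0, .fpu_stack = 0,
                     .tags = {3,3,3,3,3,3,3,3} };
    commit_stack_delta(&st, 2);   /* two pending pushes */
    printf("top=%d depth=%d\n", st.top, st.fpu_stack); /* top=6 depth=2 */
    return 0;
}
```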
+    neoncache_t cache = dyn->n;
+    int s1_val = 0;
+    int s2_val = 0;
+    // unload every unneeded cache
+    // check SSE first, then MMX, in order, for optimisation reasons
+    for(int i=0; i<16; ++i) {
+        int j=findCacheSlot(dyn, ninst, NEON_CACHE_XMMW, i, &cache);
+        if(j>=0 && findCacheSlot(dyn, ninst, NEON_CACHE_XMMW, i, &cache_i2)==-1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.neoncache[j].t, cache.neoncache[j].n);
+    }
+    for(int i=0; i<8; ++i) {
+        int j=findCacheSlot(dyn, ninst, NEON_CACHE_MM, i, &cache);
+        if(j>=0 && findCacheSlot(dyn, ninst, NEON_CACHE_MM, i, &cache_i2)==-1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.neoncache[j].t, cache.neoncache[j].n);
+    }
+    for(int i=0; i<24; ++i) {
+        if(cache.neoncache[i].v)
+            if(findCacheSlot(dyn, ninst, cache.neoncache[i].t, cache.neoncache[i].n, &cache_i2)==-1)
+                unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache.neoncache[i].t, cache.neoncache[i].n);
+    }
+    // and now load/swap the missing ones
+    for(int i=0; i<24; ++i) {
+        if(cache_i2.neoncache[i].v) {
+            if(cache_i2.neoncache[i].v != cache.neoncache[i].v) {
+                int j;
+                if((j=findCacheSlot(dyn, ninst, cache_i2.neoncache[i].t, cache_i2.neoncache[i].n, &cache))==-1)
+                    loadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache_i2.neoncache[i].t, cache_i2.neoncache[i].n);
+                else {
+                    // it's here, let's swap if needed
+                    if(j!=i)
+                        swapCache(dyn, ninst, i, j, &cache);
+                }
+            }
+            if(cache.neoncache[i].t != cache_i2.neoncache[i].t) {
+                if(cache.neoncache[i].t == NEON_CACHE_ST_D && cache_i2.neoncache[i].t == NEON_CACHE_ST_F) {
+                    MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    FCVT_S_D(i, i);
+                    cache.neoncache[i].t = NEON_CACHE_ST_F;
+                } else if(cache.neoncache[i].t == NEON_CACHE_ST_F && cache_i2.neoncache[i].t == NEON_CACHE_ST_D) {
+                    MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    FCVT_D_S(i, i);
+                    cache.neoncache[i].t = NEON_CACHE_ST_D;
+                } else if(cache.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW) {
+                    cache.neoncache[i].t = NEON_CACHE_XMMW;
+                } else if(cache.neoncache[i].t == NEON_CACHE_XMMW && cache_i2.neoncache[i].t == NEON_CACHE_XMMR) {
+                    // refresh cache...
+                    MESSAGE(LOG_DUMP, "\t - Refresh %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n));
+                    VSTR128_U12(i, xEmu, offsetof(x64emu_t, xmm[cache.neoncache[i].n]));
+                    cache.neoncache[i].t = NEON_CACHE_XMMR;
+                }
+            }
+        }
+    }
+    MESSAGE(LOG_DUMP, "\t---- Cache Transform\n");
+#endif
 }
 
 #ifdef HAVE_TRACE
@@ -1256,11 +1615,11 @@ void fpu_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
 }
 #endif
 
-void fpu_reset(dynarec_arm_t* dyn, int ninst)
+void fpu_reset(dynarec_arm_t* dyn)
 {
-    x87_reset(dyn, ninst);
-    mmx_reset(dyn, ninst);
-    sse_reset(dyn, ninst);
+    x87_reset(dyn);
+    mmx_reset(dyn);
+    sse_reset(dyn);
     fpu_reset_reg(dyn);
 }
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 8a8e3022..c2355f3a 100755
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -570,35 +570,47 @@
     LDP_REGS(R12, R13); \
     LDP_REGS(R14, R15)
 
-#define SET_DFNONE(S)    if(!dyn->dfnone) {MOVZw(S, d_none); STRw_U12(S, xEmu, offsetof(x64emu_t, df)); dyn->dfnone=1;}
-#define SET_DF(S, N)     if((N)!=d_none) {MOVZw(S, (N)); STRw_U12(S, xEmu, offsetof(x64emu_t, df)); dyn->dfnone=0;} else SET_DFNONE(S)
-#define SET_NODF()       dyn->dfnone = 0
-#define SET_DFOK()       dyn->dfnone = 1
+#define SET_DFNONE(S)    if(!dyn->f.dfnone) {MOVZw(S, d_none); STRw_U12(S, xEmu, offsetof(x64emu_t, df)); dyn->f.dfnone=1;}
+#define SET_DF(S, N)     if((N)!=d_none) {MOVZw(S, (N)); STRw_U12(S, xEmu, offsetof(x64emu_t, df)); dyn->f.dfnone=0;} else SET_DFNONE(S)
+#define SET_NODF()       dyn->f.dfnone = 0
+#define SET_DFOK()       dyn->f.dfnone = 1
 
 #ifndef READFLAGS
 #define READFLAGS(A) \
-    if(((A)!=X_PEND) && dyn->state_flags!=SF_SET && dyn->state_flags!=SF_SET_PENDING) { \
-        if(dyn->state_flags!=SF_PENDING) { \
+    if(((A)!=X_PEND && dyn->f.pending!=SF_SET)          \
+    && (dyn->f.pending!=SF_SET_PENDING)) {              \
+        if(dyn->f.pending!=SF_PENDING) {                \
             LDRw_U12(x3, xEmu, offsetof(x64emu_t, df)); \
-            j64 = (GETMARKF)-(dyn->native_size); \
+            j64 = (GETMARKF)-(dyn->native_size);        \
             CBZw(x3, j64); \
         } \
         CALL_(UpdateFlags, -1, 0); \
        MARKF; \
-        dyn->state_flags = SF_SET; \
+        dyn->f.pending = SF_SET; \
        SET_DFOK(); \
    }
 #endif
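Editor's note: READFLAGS implements the deferred-flags scheme: when the status cache says flags may still be pending, the generated code tests emu->df and only then calls UpdateFlags, so the expensive x86 flag computation runs once, on demand. A stripped-down model of the idea (d_none, the df token, and update_flags are simplified stand-ins for the emulator's):

```c
#include <stdio.h>

enum { d_none = 0, d_add32 = 1 };   /* deferred-op tokens (tiny subset) */

typedef struct {
    int df;                 /* which op still owes its flags       */
    int op1, op2, res;      /* operands of that op                 */
    unsigned eflags;        /* architectural flags, once computed  */
} toy_emu_t;

/* Lazy evaluation: compute flags only if an op is pending. */
static void update_flags(toy_emu_t* emu)
{
    if(emu->df == d_none)
        return;             /* nothing pending: flags already good */
    if(emu->df == d_add32)  /* CF only, for brevity */
        emu->eflags = ((unsigned)emu->res < (unsigned)emu->op1);
    emu->df = d_none;
}

int main(void)
{
    toy_emu_t emu = { .df = d_add32, .op1 = 0xffffffff, .op2 = 1, .res = 0 };
    update_flags(&emu);                 /* like the CBZ + CALL_(UpdateFlags) pair */
    printf("CF=%u\n", emu.eflags & 1);  /* CF=1 */
    return 0;
}
```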
+// SF_MAYSET doesn't change the flags status cache
+// it also doesn't consume any needed flags
 #ifndef SETFLAGS
-#define SETFLAGS(A, B) \
-    if(dyn->state_flags!=SF_SET && B==SF_SUBSET && (dyn->insts[ninst].x64.need_flags&(~((A)/*|X_PEND*/)))) \
-        READFLAGS(dyn->insts[ninst].x64.need_flags&(~(A)|X_PEND)); \
-    dyn->state_flags = (B==SF_SUBSET)?SF_SET: \
-            ((B==SF_SET_PENDING && !(dyn->insts[ninst].x64.need_flags&X_PEND)?SF_SET:B))
-
+#define SETFLAGS(A, B) \
+    if(dyn->f.pending!=SF_SET \
+    && (B==SF_SUBSET || B==SF_SUBSET_PENDING) \
+    && (dyn->insts[ninst].x64.need_flags&(~((A)|((B==SF_SUBSET_PENDING)?X_PEND:0))))) \
+        READFLAGS(dyn->insts[ninst].x64.need_flags&(~(A)|X_PEND)); \
+    if(dyn->insts[ninst].x64.need_flags) switch(B) { \
+        case SF_SUBSET: \
+        case SF_SET: dyn->f.pending = SF_SET; break; \
+        case SF_PENDING: dyn->f.pending = SF_PENDING; break; \
+        case SF_SUBSET_PENDING: \
+        case SF_SET_PENDING: \
+            dyn->f.pending = (dyn->insts[ninst].x64.need_flags&X_PEND)?SF_SET_PENDING:SF_SET; \
+            break; \
+        case SF_MAYSET: break; \
+    } else dyn->f.pending = SF_SET
 #endif
 #ifndef JUMP
-#define JUMP(A)
+#define JUMP(A, C)
 #endif
 #ifndef BARRIER
 #define BARRIER(A)
@@ -783,12 +795,16 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr);
 #define x87_do_push      STEPNAME(x87_do_push)
 #define x87_do_push_empty STEPNAME(x87_do_push_empty)
 #define x87_do_pop       STEPNAME(x87_do_pop)
+#define x87_get_current_cache STEPNAME(x87_get_current_cache)
 #define x87_get_cache    STEPNAME(x87_get_cache)
+#define x87_get_neoncache STEPNAME(x87_get_neoncache)
 #define x87_get_st       STEPNAME(x87_get_st)
+#define x87_get_st_empty STEPNAME(x87_get_st_empty)
 #define x87_refresh      STEPNAME(x87_refresh)
 #define x87_forget       STEPNAME(x87_forget)
 #define x87_reget_st     STEPNAME(x87_reget_st)
 #define x87_stackcount   STEPNAME(x87_stackcount)
+#define x87_swapreg      STEPNAME(x87_swapreg)
 #define x87_setround     STEPNAME(x87_setround)
 #define x87_restoreround STEPNAME(x87_restoreround)
 #define sse_setround     STEPNAME(sse_setround)
@@ -809,6 +825,8 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr);
 #define fpu_reflectcache STEPNAME(fpu_reflectcache)
 #endif
 
+#define fpuCacheTransform STEPNAME(fpuCacheTransform)
+
 /* setup r2 to address pointed by */
 uintptr_t geted(dynarec_arm_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, int64_t* fixaddress, int absmax, uint32_t mask, rex_t rex, int s, int delta);
 
@@ -904,21 +922,29 @@ void emit_pf(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4);
 // cache of the local stack counter, to avoid an update at every call
 void x87_stackcount(dynarec_arm_t* dyn, int ninst, int scratch);
 // fpu push. Return the Dd value to be used
-int x87_do_push(dynarec_arm_t* dyn, int ninst);
+int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t);
 // fpu push. Do not allocate a cache register. Needs a scratch register to do x87stack sync (or 0 to not do it)
 void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1);
 // fpu pop. All previous returned Dd should be considered invalid
-void x87_do_pop(dynarec_arm_t* dyn, int ninst);
+void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1);
+// get cache index for a x87 reg, return -1 if cache doesn't exist
+int x87_get_current_cache(dynarec_arm_t* dyn, int ninst, int st, int t);
 // get cache index for a x87 reg, create the entry if needed
-int x87_get_cache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a);
+int x87_get_cache(dynarec_arm_t* dyn, int ninst, int populate, int s1, int s2, int a, int t);
+// get neoncache index for a x87 reg
+int x87_get_neoncache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a);
 // get vfpu register for a x87 reg, create the entry if needed
-int x87_get_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a);
+int x87_get_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int t);
+// get vfpu register for a x87 reg, create the entry if needed. Do not fetch the Stx if not already in cache
+int x87_get_st_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int t);
 // refresh a value from the cache ->emu (nothing done if value is not cached)
 void x87_refresh(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st);
 // refresh a value from the cache ->emu and then forget the cache (nothing done if value is not cached)
 void x87_forget(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st);
 // refresh the cache value from emu
 void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st);
+// swap 2 x87 regs
+void x87_swapreg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a, int b);
 // Set rounding according to cw flags, return reg to restore flags
 int x87_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
 // Restore round flag
@@ -926,15 +952,47 @@ void x87_restoreround(dynarec_arm_t* dyn, int ninst, int s1);
 // Set rounding according to mxcsr flags, return reg to restore flags
 int sse_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
 
+void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
+
+#if STEP < 2
+#define CHECK_CACHE()   0
+#else
+#define CHECK_CACHE()   fpuCacheNeedsTransform(dyn, ninst)
+#endif
+
+#define neoncache_st_coherency STEPNAME(neoncache_st_coherency)
+int neoncache_st_coherency(dynarec_arm_t* dyn, int ninst, int a, int b);
+
+#if STEP == 0
+#define ST_IS_F(A) 0
+#define X87_COMBINE(A, B) NEON_CACHE_ST_D
+#define X87_ST0 NEON_CACHE_ST_D
+#define X87_ST(A) NEON_CACHE_ST_D
+#elif STEP == 1
+#define ST_IS_F(A) (neoncache_get_current_st(dyn, ninst, A)==NEON_CACHE_ST_F)
+#define X87_COMBINE(A, B) neoncache_combine_st(dyn, ninst, A, B)
+#define X87_ST0 neoncache_get_current_st(dyn, ninst, 0)
+#define X87_ST(A) neoncache_get_current_st(dyn, ninst, A)
+#else
+#define ST_IS_F(A) (neoncache_get_st(dyn, ninst, A)==NEON_CACHE_ST_F)
+#if STEP == 3
+#define X87_COMBINE(A, B) neoncache_st_coherency(dyn, ninst, A, B)
+#else
+#define X87_COMBINE(A, B) neoncache_get_st(dyn, ninst, A)
+#endif
+#define X87_ST0 neoncache_get_st(dyn, ninst, 0)
+#define X87_ST(A) neoncache_get_st(dyn, ninst, A)
+#endif
+
 //MMX helpers
 // get neon register for a MMX reg, create the entry if needed
-int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a);
+int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a);
 // get neon register for a MMX reg, but don't try to sync it if it needs to be created
-int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a);
+int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a);
 //SSE/SSE2 helpers
 // get neon register for a SSE reg, create the entry if needed
-int sse_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a);
+int sse_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite);
 // get neon register for a SSE reg, but don't try to sync it if it needs to be created
 int sse_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a);
 // forget neon register for a SSE reg, create the entry if needed
@@ -944,13 +1002,13 @@ void sse_purge07cache(dynarec_arm_t* dyn, int ninst, int s1);
 
 // common coproc helpers
 // reset the cache
-void fpu_reset(dynarec_arm_t* dyn, int ninst);
+void fpu_reset(dynarec_arm_t* dyn);
 // purge the FPU cache (needs 3 scratch registers)
-void fpu_purgecache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
+void fpu_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int s3);
 // purge MMX cache
-void mmx_purgecache(dynarec_arm_t* dyn, int ninst, int s1);
+void mmx_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1);
 // purge x87 cache
-void x87_purgecache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
+void x87_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int s3);
 #ifdef HAVE_TRACE
 void fpu_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
 #endif
diff --git a/src/dynarec/arm64/dynarec_arm64_pass0.h b/src/dynarec/arm64/dynarec_arm64_pass0.h
index d4818ac5..87b256b5 100755
--- a/src/dynarec/arm64/dynarec_arm64_pass0.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass0.h
@@ -6,22 +6,37 @@
     if(ninst) dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr
 
 #define MESSAGE(A, ...)
-#define SETFLAGS(A, B)
-#define READFLAGS(A)
+#define READFLAGS(A) \
+    dyn->insts[ninst].x64.use_flags = A; dyn->f.dfnone = 1;\
+    dyn->f.pending=SF_SET
+#define SETFLAGS(A,B) \
+    dyn->insts[ninst].x64.set_flags = A; \
+    if(B!=SF_MAYSET) { \
+        dyn->insts[ninst].x64.state_flags = B; \
+        dyn->f.pending=(B)&SF_SET_PENDING; \
+        dyn->f.dfnone=((B)&SF_SET)?1:0; \
+    }
 #define EMIT(A)
-#define JUMP(A) add_next(dyn, (uintptr_t)A); dyn->insts[ninst].x64.jmp = A
-#define BARRIER(A) dyn->insts[ninst].x64.barrier = A
-#define BARRIER_NEXT(A) if(ninst<dyn->size) dyn->insts[ninst+1].x64.barrier = A
+#define JUMP(A, C) add_next(dyn, (uintptr_t)A); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C
+#define BARRIER(A) if(A!=BARRIER_MAYBE) {fpu_purgecache(dyn, ninst, 0, x1, x2, x3); dyn->insts[ninst].x64.barrier = A;} else dyn->insts[ninst].barrier_maybe = 1
+#define BARRIER_NEXT(A) dyn->insts[ninst+1].x64.barrier = A
 
 #define NEW_INST \
+    ++dyn->size; \
     if(dyn->size+3>=dyn->cap) { \
-        dyn->insts = (instruction_arm64_t*)realloc(dyn->insts, sizeof(instruction_arm64_t)*dyn->cap*2); \
-        memset(&dyn->insts[dyn->cap], 0, sizeof(instruction_arm64_t)*dyn->cap); \
+        dyn->insts = (instruction_native_t*)realloc(dyn->insts, sizeof(instruction_native_t)*dyn->cap*2); \
+        memset(&dyn->insts[dyn->cap], 0, sizeof(instruction_native_t)*dyn->cap); \
         dyn->cap *= 2; \
     } \
-    ++dyn->size; \
     dyn->insts[ninst].x64.addr = ip; \
-    if(ninst) dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr
-#define INST_EPILOG
+    dyn->n.combined1 = dyn->n.combined2 = 0;\
+    dyn->n.swapped = 0; dyn->n.barrier = 0; \
+    dyn->insts[ninst].f_entry = dyn->f; \
+    if(ninst) {dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr;}
+
+#define INST_EPILOG \
+    dyn->insts[ninst].f_exit = dyn->f; \
+    dyn->insts[ninst].n = dyn->n; \
+    dyn->insts[ninst].x64.has_next = (ok>0)?1:0;
 #define INST_NAME(name)
 #define DEFAULT \
     --dyn->size; \
diff --git a/src/dynarec/arm64/dynarec_arm64_pass1.h b/src/dynarec/arm64/dynarec_arm64_pass1.h
index a4abcf19..6cf92feb 100755
--- a/src/dynarec/arm64/dynarec_arm64_pass1.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass1.h
@@ -2,9 +2,13 @@
 #define FINI
 #define MESSAGE(A, ...)
 #define EMIT(A)
-#define READFLAGS(A)   dyn->insts[ninst].x64.use_flags = A
-#define SETFLAGS(A,B)  {dyn->insts[ninst].x64.set_flags = A; dyn->insts[ninst].x64.state_flags = B;}
+#define NEW_INST \
+    dyn->insts[ninst].f_entry = dyn->f; \
+    dyn->n.combined1 = dyn->n.combined2 = 0;\
+    dyn->n.swapped = 0; dyn->n.barrier = 0
+
+#define INST_EPILOG \
+    dyn->insts[ninst].n = dyn->n; \
+    dyn->insts[ninst].f_exit = dyn->f
 
-#define NEW_INST
-#define INST_EPILOG
 #define INST_NAME(name)
diff --git a/src/dynarec/arm64/dynarec_arm64_pass2.h b/src/dynarec/arm64/dynarec_arm64_pass2.h
index 3d4b6f03..3de38aad 100755
--- a/src/dynarec/arm64/dynarec_arm64_pass2.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass2.h
@@ -3,9 +3,15 @@
 #define MESSAGE(A, ...)
 
 #define EMIT(A) dyn->insts[ninst].size+=4; dyn->native_size+=4
-#define NEW_INST if(ninst) {dyn->insts[ninst].address = (dyn->insts[ninst-1].address+dyn->insts[ninst-1].size);}
+#define NEW_INST \
+    if(ninst) { \
+        dyn->insts[ninst].address = (dyn->insts[ninst-1].address+dyn->insts[ninst-1].size); \
+        if(ninst && isInstClean(dyn, ninst)) { \
+            dyn->last_ip = 0; \
+            ++dyn->sons_size; \
+        } \
+    }
 #define INST_EPILOG dyn->insts[ninst].epilog = dyn->native_size;
 #define INST_NAME(name)
-#define NEW_BARRIER_INST if(ninst) ++dyn->sons_size
 #define TABLE64(A, V) {Table64(dyn, (V)); EMIT(0);}
 #define FTABLE64(A, V)  {mmx87_regs_t v = {.d = V}; Table64(dyn, v.q); EMIT(0);}
\ No newline at end of file
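Editor's note: these per-pass headers all redefine the same macro set (NEW_INST, INST_EPILOG, EMIT, ...) over one shared instruction-walking source, so each pass reinterprets identical code: pass 0 discovers instructions, pass 1 does the float/flags analysis, pass 2 accumulates sizes, pass 3 emits. A tiny illustration of that multi-include technique (the file layout and macro names here are invented for the example):

```c
#include <stdio.h>

/* One "walker" body, reinterpreted by redefining EMIT per pass,
   in the spirit of dynarec_arm64_passN.h over dynarec_native_pass.c. */
#define WALK_BLOCK \
    EMIT(0x11111111);  \
    EMIT(0x22222222);  \
    EMIT(0x33333333);

int main(void)
{
    /* pass "2": only measure */
    int size = 0;
#define EMIT(op) size += 4
    WALK_BLOCK
#undef EMIT

    /* pass "3": actually emit */
    unsigned buf[16]; int n = 0;
#define EMIT(op) buf[n++] = (op)
    WALK_BLOCK
#undef EMIT

    printf("size=%d emitted=%d\n", size, n * 4); /* both 12 */
    return 0;
}
```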
dynarec_log(LOG_NONE, ", jmp=out"); \ + if(dyn->last_ip) \ + dynarec_log(LOG_NONE, ", last_ip=%p", dyn->last_ip);\ + for(int ii=0; ii<24; ++ii) { \ + switch(dyn->insts[ninst].n.neoncache[ii].t) { \ + case NEON_CACHE_ST_D: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; \ + case NEON_CACHE_ST_F: dynarec_log(LOG_NONE, " S%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; \ + case NEON_CACHE_MM: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; \ + case NEON_CACHE_XMMW: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; \ + case NEON_CACHE_XMMR: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; \ + case NEON_CACHE_SCR: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; \ + case NEON_CACHE_NONE: \ + default: break; \ + } \ + } \ + if(dyn->n.stack || dyn->insts[ninst].n.stack_next || dyn->insts[ninst].n.x87stack) \ + dynarec_log(LOG_NONE, " X87:%d/%d(+%d/-%d)%d", dyn->n.stack, dyn->insts[ninst].n.stack_next, dyn->insts[ninst].n.stack_push, dyn->insts[ninst].n.stack_pop, dyn->insts[ninst].n.x87stack); \ + if(dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2) \ + dynarec_log(LOG_NONE, " %s:%d/%d", dyn->insts[ninst].n.swapped?"SWP":"CMB", dyn->insts[ninst].n.combined1, dyn->insts[ninst].n.combined2); \ + dynarec_log(LOG_NONE, "%s\n", (box64_dynarec_dump>1)?"\e[m":""); \ } #define TABLE64(A, V) {int val64offset = Table64(dyn, (V)); MESSAGE(LOG_DUMP, " Table64: 0x%lx\n", (V)); LDRx_literal(A, val64offset);} diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h index b8a19756..23fe4af5 100755 --- a/src/dynarec/arm64/dynarec_arm64_private.h +++ b/src/dynarec/arm64/dynarec_arm64_private.h @@ -6,12 +6,67 @@ typedef struct x64emu_s x64emu_t; typedef struct dynablock_s dynablock_t; +#define BARRIER_MAYBE 8 + +#define NEON_CACHE_NONE 0 +#define NEON_CACHE_ST_D 1 +#define NEON_CACHE_ST_F 2 +#define NEON_CACHE_MM 3 +#define NEON_CACHE_XMMW 4 +#define NEON_CACHE_XMMR 5 +#define NEON_CACHE_SCR 6 +typedef union neon_cache_s { + int8_t v; + struct { + unsigned int t:4; // reg type + unsigned int n:4; // reg number + }; +} neon_cache_t; +typedef union sse_cache_s { + int8_t v; + struct { + unsigned int reg:7; + unsigned int write:1; + }; +} sse_cache_t; +typedef struct neoncache_s { + // Neon cache + neon_cache_t neoncache[24]; + int8_t stack; + int8_t stack_next; + int8_t stack_pop; + int8_t stack_push; + uint8_t combined1; + uint8_t combined2; + uint8_t swapped; // the combined reg were swapped + uint8_t barrier; // is there a barrier at instruction epilog? 
+    uint32_t news;      // bitmask, which neoncaches are new for this opcode
+    // fpu cache
+    int8_t x87cache[8]; // cache status for the 8 x87 registers behind the fpu stack
+    int8_t x87reg[8];   // reg used for x87cache entry
+    int8_t mmxcache[8]; // cache status for the 8 MMX registers
+    sse_cache_t ssecache[16];   // cache status for the 16 SSE(2) registers
+    int8_t fpuused[24]; // all 0..24 double regs from fpu, used by x87, sse and mmx
+    int8_t x87stack;    // cache stack counter
+    int8_t mmxcount;    // number of mmx registers used (not both mmx and x87 at the same time)
+    int8_t fpu_scratch; // scratch counter
+    int8_t fpu_extra_qscratch; // some opcodes need an extra quad scratch register
+    int8_t fpu_reg;     // x87/sse/mmx reg counter
+} neoncache_t;
+
+typedef struct flagcache_s {
+    int pending;    // are there pending flags here, or to check?
+    int dfnone;     // if deferred flags are already set to df_none
+} flagcache_t;
+
 typedef struct instruction_arm64_s {
     instruction_x64_t x64;
     uintptr_t address;  // (start) address of the arm emitted instruction
-    uintptr_t epilog;   // epilog of current instruction (can be start of next, of barrier stuff)
+    uintptr_t epilog;   // epilog of current instruction (can be start of next, or barrier stuff)
     int size;           // size of the arm emitted instruction
     int size2;          // size of the arm emitted instruction after pass2
+    int pred_sz;        // size of predecessor list
+    int *pred;          // predecessor array
     uintptr_t mark, mark2, mark3;
     uintptr_t markf;
     uintptr_t markseg;
@@ -19,6 +74,10 @@ typedef struct instruction_arm64_s {
     int pass2choice;// value for choices that are fixed on pass2 for pass3
     uintptr_t natcall;
     int retn;
+    int barrier_maybe;
+    flagcache_t f_exit;     // flags status at end of instruction
+    neoncache_t n;          // neoncache at end of instruction (but before popping)
+    flagcache_t f_entry;    // flags status before the instruction begins
 } instruction_arm64_t;
 
 typedef struct dynarec_arm_s {
@@ -30,27 +89,20 @@ typedef struct dynarec_arm_s {
     void*   block;  // memory pointer where next instruction is emitted
     uintptr_t native_start; // start of the arm code
     size_t  native_size;    // size of emitted arm code
-    int     state_flags;// actual state for on-demand flags
-    uintptr_t last_ip;  // last set IP in RIP (or NULL if unclean state)
-    int8_t  x87cache[8];// cache status for the 8 x87 register behind the fpu stack
-    int8_t  x87reg[8];  // reg used for x87cache entry
-    int8_t  mmxcache[8];// cache status for the 8 MMX registers
-    int8_t  ssecache[16];// cache status for the 16 SSE(2) registers
-    int8_t  fpuused[32];// all 8..31 Q reg from fpu, used by x87, sse and mmx
-    int     x87stack;   // cache stack counter
-    int     fpu_scratch;// scratch counter
-    int     fpu_reg;    // x87/sse/mmx reg counter
-    int     dfnone;     // if defered flags is already set to df_none
+    uintptr_t last_ip;  // last set IP in RIP (or NULL if unclean state) TODO: move to a cache something
     uint64_t *table64;  // table of 64bits value
     int     table64size;// size of table (will be appended at end of executable code)
     int     table64cap;
     uintptr_t tablestart;
+    flagcache_t f;
+    neoncache_t n;      // cache for the 8..31 double regs from fpu, plus x87 stack delta
     uintptr_t* next;    // variable array of "next" jump address
     int     next_sz;
     int     next_cap;
     uintptr_t* sons_x64;    // the x64 address of potential dynablock sons
     void**  sons_native;    // the arm address of potential dynablock sons
     int     sons_size;      // number of potential dynablock sons
+    int*    predecessor;    // single array of all predecessors
     dynablock_t* dynablock;
 } dynarec_arm_t;
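Editor's note: neon_cache_t above packs a register's type (4 bits) and number (4 bits) into one byte, so findCacheSlot can compare a whole cache entry against a wanted (type, number) pair with a single .v byte compare. A self-contained sketch of that trick (same field layout, simplified constants; bitfield packing is implementation-defined, but box64 relies on one compiler/ABI where both values pack identically):

```c
#include <stdint.h>
#include <stdio.h>

#define CACHE_ST_D 1
#define CACHE_XMMW 4

/* one byte describing what a NEON register currently holds */
typedef union {
    int8_t v;
    struct {
        unsigned int t:4;   /* reg type   */
        unsigned int n:4;   /* reg number */
    };
} cache_ent_t;

int main(void)
{
    cache_ent_t slot = {0}, wanted = {0};
    slot.t = CACHE_XMMW; slot.n = 3;     /* NEON reg caches XMM3 for write */
    wanted.t = CACHE_XMMW; wanted.n = 3;
    /* single-byte equality stands in for comparing both fields */
    printf("match=%d\n", slot.v == wanted.v);   /* match=1 */
    wanted.n = 4;
    printf("match=%d\n", slot.v == wanted.v);   /* match=0 */
    return 0;
}
```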
@@ -62,7 +114,7 @@ int is_instructions(dynarec_arm_t *dyn, uintptr_t addr, int n);
 int Table64(dynarec_arm_t *dyn, uint64_t val);  // add a value to table64 (if needed) and gives back the imm19 to use in LDR_literal
 
 #define GO_TRACE() \
-    GETIP(ip); \
+    GETIP_(ip); \
     MOVx_REG(x1, xRIP); \
     STORE_XEMU_CALL(xRIP); \
     MOV32w(x2, 1); \
diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index 97f5c881..b7ab0ff9 100755
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -3,6 +3,7 @@
 #include <pthread.h>
 #include <errno.h>
 #include <string.h>
+#include <assert.h>
 
 #include "debug.h"
 #include "box64context.h"
@@ -231,53 +232,6 @@ int is_instructions(dynarec_native_t *dyn, uintptr_t addr, int n)
     return (i==n)?1:0;
 }
 
-uint32_t needed_flags(dynarec_native_t *dyn, int ninst, uint32_t setf, int recurse)
-{
-    if(recurse == 10)
-        return X_PEND;
-    if(ninst == dyn->size)
-        return X_PEND;  // no more instructions, or too many jmp loop, stop
-
-    uint32_t needed = dyn->insts[ninst].x64.use_flags;
-    if(needed) {
-        setf &= ~needed;
-        if(!setf)   // all flags already used, no need to continue
-            return needed;
-    }
-
-    if(!needed && !dyn->insts[ninst].x64.set_flags && !dyn->insts[ninst].x64.jmp_insts) {
-        int start = ninst;
-        int end = ninst;
-        while(end<dyn->size && !dyn->insts[end].x64.use_flags && !dyn->insts[end].x64.set_flags && !dyn->insts[end].x64.jmp_insts)
-            ++end;
-        needed = needed_flags(dyn, end, setf, recurse);
-        for(int i=start; i<end; ++i)
-            dyn->insts[i].x64.need_flags = needed;
-        return needed;
-    }
-
-    if(dyn->insts[ninst].x64.set_flags && (dyn->insts[ninst].x64.state_flags!=SF_MAYSET)) {
-        if((setf & ~dyn->insts[ninst].x64.set_flags) == 0)
-            return needed;  // all done, gives all the flags needed
-        setf |= dyn->insts[ninst].x64.set_flags;    // add new flags to continue
-    }
-
-    int jinst = dyn->insts[ninst].x64.jmp_insts;
-    if(dyn->insts[ninst].x64.jmp) {
-        dyn->insts[ninst].x64.need_flags = (jinst==-1)?X_PEND:needed_flags(dyn, jinst, setf, recurse+1);
-        if(dyn->insts[ninst].x64.use_flags)  // conditionnal jump
-            dyn->insts[ninst].x64.need_flags |= needed_flags(dyn, ninst+1, setf, recurse);
-    } else
-        dyn->insts[ninst].x64.need_flags = needed_flags(dyn, ninst+1, setf, recurse);
-    if(dyn->insts[ninst].x64.state_flags==SF_MAYSET)
-        needed |= dyn->insts[ninst].x64.need_flags;
-    else
-        needed |= (dyn->insts[ninst].x64.need_flags & ~dyn->insts[ninst].x64.set_flags);
-    if(needed == (X_PEND|X_ALL))
-        needed = X_ALL;
-    return needed;
-}
-
 instsize_t* addInst(instsize_t* insts, size_t* size, size_t* cap, int x64_size, int native_size)
 {
     // x64 instruction is <16 bytes
@@ -329,6 +283,98 @@ int Table64(dynarec_native_t *dyn, uint64_t val)
     return delta;
 }
 
+static void fillPredecessors(dynarec_native_t* dyn)
+{
+    int pred_sz = 0;
+    // compute the total size of the predecessors array to allocate it
+    // first count the jumps
+    for(int i=0; i<dyn->size; ++i) {
+        if(dyn->insts[i].x64.jmp && dyn->insts[i].x64.jmp_insts!=-1) {
+            ++pred_sz;
+            ++dyn->insts[dyn->insts[i].x64.jmp_insts].pred_sz;
+        }
+    }
+    // second, the "has_next"
+    for(int i=0; i<dyn->size; ++i) {
+        if(i!=dyn->size-1 && dyn->insts[i].x64.has_next && (!i || dyn->insts[i].pred_sz)) {
+            ++pred_sz;
+            ++dyn->insts[i+1].pred_sz;
+        }
+    }
+    dyn->predecessor = (int*)malloc(pred_sz*sizeof(int));
+    // fill the pred pointers
+    int* p = dyn->predecessor;
+    for(int i=0; i<dyn->size; ++i) {
+        dyn->insts[i].pred = p;
+        p += dyn->insts[i].pred_sz;
+        dyn->insts[i].pred_sz=0;    // reset size, it's reused to actually fill pred[]
+    }
+    assert(p==dyn->predecessor+pred_sz);
+    // fill pred
+    for(int i=0; i<dyn->size; ++i) {
+        if(i!=dyn->size-1 && dyn->insts[i].x64.has_next && (!i || dyn->insts[i].pred_sz))
+            dyn->insts[i+1].pred[dyn->insts[i+1].pred_sz++] = i;
+        if(dyn->insts[i].x64.jmp && dyn->insts[i].x64.jmp_insts!=-1)
+            dyn->insts[dyn->insts[i].x64.jmp_insts].pred[dyn->insts[dyn->insts[i].x64.jmp_insts].pred_sz++] = i;
+    }
+
+}
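Editor's note: fillPredecessors builds, in one flat allocation, a per-instruction list of control-flow predecessors: a first walk only counts incoming edges (jumps, then fall-throughs), pointers are then carved out of the single array, and a second walk writes the edges while reusing pred_sz as a fill cursor. The same count-then-fill pattern in miniature (toy edge list, illustrative names):

```c
#include <stdio.h>
#include <stdlib.h>

/* count-then-fill: per-node predecessor lists carved from one array */
int main(void)
{
    /* edges of a tiny CFG: 0->1, 0->2, 1->2 */
    int from[] = {0, 0, 1}, to[] = {1, 2, 2}, nedge = 3, nnode = 3;

    int cnt[3] = {0};
    for(int e=0; e<nedge; ++e) ++cnt[to[e]];        /* pass 1: count   */

    int* storage = malloc(nedge*sizeof(int));
    int* pred[3]; int fill[3] = {0};
    for(int n=0, off=0; n<nnode; off+=cnt[n++])     /* carve pointers  */
        pred[n] = storage + off;
    for(int e=0; e<nedge; ++e)                      /* pass 2: fill    */
        pred[to[e]][fill[to[e]]++] = from[e];

    for(int p=0; p<fill[2]; ++p)
        printf("pred of 2: %d\n", pred[2][p]);      /* 0 then 1 */
    free(storage);
    return 0;
}
```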
+
+static void updateNeed(dynarec_native_t* dyn, int ninst, uint32_t need) {
+    uint32_t old_need = dyn->insts[ninst].x64.need_flags;
+    uint32_t new_need = old_need | need;
+    uint32_t new_use = dyn->insts[ninst].x64.use_flags;
+    uint32_t old_use = dyn->insts[ninst].x64.old_use;
+
+    if((new_need&X_PEND) && dyn->insts[ninst].x64.state_flags==SF_SUBSET) {
+        new_need &=~X_PEND;
+        new_need |= X_ALL;
+    }
+
+    uint32_t new_set = 0;
+    if(dyn->insts[ninst].x64.state_flags & SF_SET)
+        new_set = dyn->insts[ninst].x64.set_flags;
+    if(dyn->insts[ninst].x64.state_flags & SF_PENDING)
+        new_set |= X_PEND;
+    if((new_need&X_PEND) && (
+            dyn->insts[ninst].x64.state_flags==SF_SET || dyn->insts[ninst].x64.state_flags==SF_SUBSET)) {
+        new_need &=~X_PEND;
+        new_need |=X_ALL;
+    }
+
+    dyn->insts[ninst].x64.need_flags = new_need;
+    dyn->insts[ninst].x64.old_use = new_use;
+
+    if(dyn->insts[ninst].x64.jmp_insts==-1)
+        new_need |= X_PEND;
+
+    // a Flag Barrier will change all needs to "Pending", as it clears all flags optimisation
+    if(new_need && dyn->insts[ninst].x64.barrier&BARRIER_FLAGS)
+        new_need = X_PEND;
+
+    if((new_need == old_need) && (new_use == old_use))  // no changes, bye
+        return;
+
+    if(!(new_need && dyn->insts[ninst].x64.barrier&BARRIER_FLAGS)) {
+        new_need &=~new_set;    // clear needed flags that were supplied
+        new_need |= new_use;    // new need
+    }
+
+    if((new_need == (X_ALL|X_PEND)) && (dyn->insts[ninst].x64.state_flags & SF_SET))
+        new_need = X_ALL;
+
+    // propagate the new need to the predecessors
+    for(int i=0; i<dyn->insts[ninst].pred_sz; ++i)
+        updateNeed(dyn, dyn->insts[ninst].pred[i], new_need);
+}
+
+static void resetNeed(dynarec_native_t* dyn) {
+    for(int i = dyn->size; i-- > 0;) {
+        dyn->insts[i].x64.old_use = 0;
+        dyn->insts[i].x64.need_flags = dyn->insts[i].x64.default_need;
+    }
+}
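Editor's note: updateNeed is effectively a backward liveness analysis over x86 flags: an instruction's needed flags are what its successors need, minus what it sets itself, plus what it uses, and changes are pushed recursively to predecessors until nothing moves. The core recurrence, reduced to its dataflow skeleton (toy linear CFG, illustrative code):

```c
#include <stdio.h>

#define NNODE 3
/* toy linear CFG: 0 -> 1 -> 2 ; the predecessor of node i is i-1 */
static unsigned use[NNODE]  = {0, 0, 1};  /* node 2 uses flag bit 0 (e.g. CF) */
static unsigned set_[NNODE] = {0, 1, 0};  /* node 1 sets it                   */
static unsigned need[NNODE];

/* propagate "this flag is needed" backward, like updateNeed */
static void update_need(int n, unsigned incoming)
{
    unsigned new_need = need[n] | incoming;
    if(new_need == need[n] && incoming)   /* nothing new: stop recursing */
        return;
    need[n] = new_need;
    unsigned up = (new_need & ~set_[n]) | use[n]; /* killed by set, revived by use */
    if(n > 0)
        update_need(n-1, up);
}

int main(void)
{
    for(int i = NNODE; i-- > 0;)   /* like resetNeed + the updateNeed sweep */
        update_need(i, use[i]);
    for(int i=0; i<NNODE; ++i)
        printf("need[%d]=%u\n", i, need[i]); /* 0 1 1: node 1 satisfies the need */
    return 0;
}
```

Running the sweep twice, as FillBlock64 does after barriers are placed, simply reaches the same fixed point under the updated use_flags.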
 
 __thread void* current_helper = NULL;
 
 void CancelBlock64()
@@ -370,7 +416,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr) {
     uintptr_t start = addr;
     helper.cap = 64; // needs epilog handling
     helper.insts = (instruction_native_t*)calloc(helper.cap, sizeof(instruction_native_t));
-    // pass 0, addresses, x86 jump addresses, overall size of the block
+    // pass 0, addresses, x64 jump addresses, overall size of the block
     uintptr_t end = native_pass0(&helper, addr);
     // no need for next anymore
     free(helper.next);
@@ -392,33 +438,93 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr) {
     protectDB(addr, end-addr);  //end is 1byte after actual end
     // compute hash signature
     uint32_t hash = X31_hash_code((void*)addr, end-addr);
+    // Compute flag_need, without the current barriers
+    resetNeed(&helper);
+    for(int i = helper.size; i-- > 0;)
+        updateNeed(&helper, i, 0);
     // calculate barriers
     for(int i=0; i<helper.size; ++i)
         if(helper.insts[i].x64.jmp) {
             uintptr_t j = helper.insts[i].x64.jmp;
-            if(j<start || j>=end)
+            if(j<start || j>=end) {
                 helper.insts[i].x64.jmp_insts = -1;
-            else {
+                helper.insts[i].x64.use_flags |= X_PEND;
+            } else {
                 // find jump address instruction
                 int k=-1;
-                for(int i2=0; (i2<helper.size) && (k==-1); ++i2) {
+                for(int i2=0; i2<helper.size && k==-1; ++i2) {
                     if(helper.insts[i2].x64.addr==j)
                         k=i2;
                 }
-
-                if(k!=-1)   // -1 if not found, mmm, probably wrong, exit anyway
-                    helper.insts[k].x64.barrier = 1;
+                if(k!=-1 && !helper.insts[i].barrier_maybe)
+                    helper.insts[k].x64.barrier |= BARRIER_FULL;
                 helper.insts[i].x64.jmp_insts = k;
             }
         }
-    // pass 1, flags
-    native_pass1(&helper, addr);
-    for(int i=0; i<helper.size; ++i)
-        if(helper.insts[i].x64.set_flags && !helper.insts[i].x64.need_flags) {
-            helper.insts[i].x64.need_flags = needed_flags(&helper, i+1, helper.insts[i].x64.set_flags, 0);
-            if((helper.insts[i].x64.need_flags&X_PEND) && (helper.insts[i].x64.state_flags==SF_MAYSET))
-                helper.insts[i].x64.need_flags = X_ALL;
+    // fill predecessors with the jump addresses
+    fillPredecessors(&helper);
+    // check for the optional barriers now
+    for(int i=helper.size-1; i>=0; --i) {
+        if(helper.insts[i].barrier_maybe) {
+            // out-of-block jump
+            if(helper.insts[i].x64.jmp_insts == -1) {
+                // nothing for now
+            } else {
+                // inside-block jump
+                int k = helper.insts[i].x64.jmp_insts;
+                if(k>i) {
+                    // jump in the future
+                    if(helper.insts[k].pred_sz>1) {
+                        // with multiple flows, put a barrier
+                        helper.insts[k].x64.barrier|=BARRIER_FLAGS;
+                    }
+                } else {
+                    // jump back
+                    helper.insts[k].x64.barrier|=BARRIER_FLAGS;
+                }
+            }
+        }
+    }
+    // check to remove useless barriers, in case of a jump whose destination doesn't need flags
+    /*for(int i=helper.size-1; i>=0; --i) {
+        int k;
+        if(helper.insts[i].x64.jmp
+        && ((k=helper.insts[i].x64.jmp_insts)>=0)
+        && helper.insts[k].x64.barrier&BARRIER_FLAGS) {
+            //TODO: optimize FPU barrier too
+            if((!helper.insts[k].x64.need_flags)
+            ||(helper.insts[k].x64.set_flags==X_ALL
+                && helper.insts[k].x64.state_flags==SF_SET)
+            ||(helper.insts[k].x64.state_flags==SF_SET_PENDING)) {
+                //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Removed barrier for inst %d\n", k);
+                helper.insts[k].x64.barrier &= ~BARRIER_FLAGS;  // remove flag barrier
+            }
+        }
+    }*/
+    // reset need_flags and compute again, now taking barriers into account (because barriers change use_flags)
+    for(int i = helper.size; i-- > 0;) {
+        int k;
+        if(helper.insts[i].x64.jmp
+        && ((k=helper.insts[i].x64.jmp_insts)>=0)
+        ) {
+            if(helper.insts[k].x64.barrier&BARRIER_FLAGS)
+                // jump to a barrier
+                helper.insts[i].x64.use_flags |= X_PEND;
+            if(helper.insts[i].x64.barrier&BARRIER_FLAGS && (helper.insts[k].x64.need_flags | helper.insts[k].x64.use_flags))
+                helper.insts[k].x64.barrier|=BARRIER_FLAGS;
            else
                helper.insts[i].x64.use_flags |= (helper.insts[k].x64.need_flags | helper.insts[k].x64.use_flags);
         }
+        if(helper.insts[i].x64.barrier&BARRIER_FLAGS)
+            // immediate barrier
+            helper.insts[i].x64.use_flags |= X_PEND;
+    }
+    resetNeed(&helper);
+    for(int i = helper.size; i-- > 0;)
+        updateNeed(&helper, i, 0);
+
+    // pass 1, float optimisations, first pass for flags
+    native_pass1(&helper, addr);
 
     // pass 2, instruction size
     native_pass2(&helper, addr);
@@ -496,13 +602,14 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr) {
         dynarec_log(LOG_INFO, "Warning, a block changed while beeing processed hash(%p:%ld)=%x/%x\n", block->x64_addr, block->x64_size, block->hash, hash);
         CancelBlock64();
         return NULL;
-    }   // fill sons if any
+    }
     if(!isprotectedDB(addr, end-addr)) {
         dynarec_log(LOG_INFO, "Warning, block unprotected while beeing processed %p:%ld, cancelling\n", block->x64_addr, block->x64_size);
         CancelBlock64();
         return NULL;
         //protectDB(addr, end-addr);
     }
+    // fill sons if any
     dynablock_t** sons = NULL;
     int sons_size = 0;
     if(helper.sons_size) {
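Editor's note: the optional-barrier pass encodes a simple heuristic: a conditional jump's target only needs a flag barrier when several control flows merge there (a forward jump with more than one predecessor) or when the jump goes backward (a loop, whose entry state was already compiled). A condensed restatement of that decision (illustrative helper, not the box64 function):

```c
#include <stdio.h>

/* Decide whether a jump from instruction 'i' to 'k' forces a flags
   barrier at 'k', following the rule used on barrier_maybe instructions. */
static int needs_flag_barrier(int i, int k, int pred_count_at_k)
{
    if(k < 0)
        return 0;               /* out-of-block: handled elsewhere */
    if(k > i)
        return pred_count_at_k > 1; /* forward: only on merge points */
    return 1;                   /* backward: target already compiled */
}

int main(void)
{
    printf("%d\n", needs_flag_barrier(5, 9, 1));  /* 0: straight-line forward */
    printf("%d\n", needs_flag_barrier(5, 9, 2));  /* 1: merge point           */
    printf("%d\n", needs_flag_barrier(9, 2, 1));  /* 1: loop back-edge        */
    return 0;
}
```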
diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index 24690250..f46b3623 100755
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -37,19 +37,90 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr)
     int need_epilog = 1;
     dyn->sons_size = 0;
     // Clean up (because there are multiple passes)
-    dyn->state_flags = 0;
-    dyn->dfnone = 0;
+    dyn->f.pending = 0;
+    dyn->f.dfnone = 0;
+    fpu_reset(dyn);
+    int reset_n = -1;
     dyn->last_ip = ip;  // RIP is always set at start of block!
-    MAYUSE(init_addr);
-    fpu_reset(dyn, ninst);
     // ok, go now
     INIT;
     while(ok) {
         ip = addr;
-        if((dyn->insts[ninst].x64.barrier==1)) {
+        if (reset_n!=-1) {
             dyn->last_ip = 0;
-            NEW_BARRIER_INST;
+            if(reset_n==-2) {
+                MESSAGE(LOG_DEBUG, "Reset Caches to zero\n");
+                dyn->f.dfnone = 0;
+                dyn->f.pending = 0;
+                fpu_reset(dyn);
+            } else {
+                MESSAGE(LOG_DEBUG, "Reset Caches with %d\n",reset_n);
+                #if STEP > 1
+                // for STEP 2 & 3, just refresh with the current state, and undo the changes (push & swap)
+                dyn->n = dyn->insts[ninst].n;
+                neoncacheUnwind(&dyn->n);
+                #ifdef HAVE_TRACE
+                if(box64_dynarec_dump)
+                    if(memcmp(&dyn->n, &dyn->insts[reset_n].n, sizeof(neon_cache_t))) {
+                        MESSAGE(LOG_DEBUG, "Warning, difference in neoncache: reset=");
+                        for(int i=0; i<24; ++i)
+                            if(dyn->insts[reset_n].n.neoncache[i].v)
+                                MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[reset_n].n.neoncache[i].t, dyn->insts[reset_n].n.neoncache[i].n));
+                        if(dyn->insts[reset_n].n.combined1 || dyn->insts[reset_n].n.combined2)
+                            MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->insts[reset_n].n.swapped?"SWP":"CMB", dyn->insts[reset_n].n.combined1, dyn->insts[reset_n].n.combined2);
+                        if(dyn->insts[reset_n].n.stack_push || dyn->insts[reset_n].n.stack_pop)
+                            MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[reset_n].n.stack_push, -dyn->insts[reset_n].n.stack_pop);
+                        MESSAGE(LOG_DEBUG, " ==> ");
+                        for(int i=0; i<24; ++i)
+                            if(dyn->insts[ninst].n.neoncache[i].v)
+                                MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[ninst].n.neoncache[i].t, dyn->insts[ninst].n.neoncache[i].n));
+                        if(dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2)
+                            MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->insts[ninst].n.swapped?"SWP":"CMB", dyn->insts[ninst].n.combined1, dyn->insts[ninst].n.combined2);
+                        if(dyn->insts[ninst].n.stack_push || dyn->insts[ninst].n.stack_pop)
+                            MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[ninst].n.stack_push, -dyn->insts[ninst].n.stack_pop);
+                        MESSAGE(LOG_DEBUG, " -> ");
+                        for(int i=0; i<24; ++i)
+                            if(dyn->n.neoncache[i].v)
+                                MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->n.neoncache[i].t, dyn->n.neoncache[i].n));
+                        if(dyn->n.combined1 || dyn->n.combined2)
+                            MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->n.swapped?"SWP":"CMB", dyn->n.combined1, dyn->n.combined2);
+                        if(dyn->n.stack_push || dyn->n.stack_pop)
+                            MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->n.stack_push, -dyn->n.stack_pop);
+                        MESSAGE(LOG_DEBUG, "\n");
+                    }
+                #endif //HAVE_TRACE
+                #else
+                dyn->n = dyn->insts[reset_n].n;
+                #endif
+                dyn->f = dyn->insts[reset_n].f_exit;
+                if(dyn->insts[ninst].x64.barrier&BARRIER_FLOAT) {
+                    MESSAGE(LOG_DEBUG, "Apply Barrier Float\n");
+                    fpu_reset(dyn);
+                }
+                if(dyn->insts[ninst].x64.barrier&BARRIER_FLAGS) {
+                    MESSAGE(LOG_DEBUG, "Apply Barrier Flags\n");
+                    dyn->f.dfnone = 0;
+                    dyn->f.pending = 0;
+                }
+            }
+            reset_n = -1;
+        } else if(dyn->insts[ninst].pred_sz!=1)
+            dyn->last_ip = 0;   // reset IP if some jumps are coming here
+        // propagate ST stack state, especially stack pops that are deferred
+        if(dyn->n.stack_pop) {
+            for(int j=0; j<24; ++j)
+                if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D || dyn->n.neoncache[j].t == NEON_CACHE_ST_F)) {
+                    if(dyn->n.neoncache[j].n<dyn->n.stack_pop)
+                        dyn->n.neoncache[j].v = 0;
+                    else
+                        dyn->n.neoncache[j].n-=dyn->n.stack_pop;
+                }
+            dyn->n.stack_pop = 0;
        }
+        dyn->n.stack = dyn->n.stack_next;
+        dyn->n.news = 0;
+        dyn->n.stack_push = 0;
+        dyn->n.swapped = 0;
         NEW_INST;
         fpu_reset_scratch(dyn);
 #ifdef HAVE_TRACE
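Editor's note: the "propagate ST stack state" step above shows how pops are deferred: an FPU pop only records stack_pop, and at the next instruction every cached ST(n) entry is renumbered down by that amount, with entries whose index falls below zero being discarded. In isolation (array of toy entries, illustrative names):

```c
#include <stdio.h>

typedef struct { int valid; int st; } st_ent_t;  /* cached ST(st) in some NEON reg */

/* Apply a deferred pop of 'count' to the cached ST entries,
   like the stack_pop propagation at the top of native_pass. */
static void apply_deferred_pop(st_ent_t* e, int n, int count)
{
    for(int j=0; j<n; ++j)
        if(e[j].valid) {
            if(e[j].st < count)
                e[j].valid = 0;       /* popped away: drop the entry   */
            else
                e[j].st -= count;     /* ST(2) becomes ST(1) after pop */
        }
}

int main(void)
{
    st_ent_t cache[3] = { {1, 0}, {1, 2}, {0, 0} };
    apply_deferred_pop(cache, 3, 1);
    for(int j=0; j<3; ++j)
        printf("slot%d: valid=%d st=%d\n", j, cache[j].valid, cache[j].st);
    /* slot0 dropped, slot1 now caches ST(1) */
    return 0;
}
```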
@@ -86,51 +157,88 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr)
 
         INST_EPILOG;
 
-        if(dyn->insts[ninst+1].x64.barrier) {
-            fpu_purgecache(dyn, ninst, x1, x2, x3);
-            if(dyn->insts[ninst+1].x64.barrier!=2) {
-                dyn->state_flags = 0;
-                dyn->dfnone = 0;
+        int next = ninst+1;
+        #if STEP > 0
+        if(!dyn->insts[ninst].x64.has_next && dyn->insts[ninst].x64.jmp && dyn->insts[ninst].x64.jmp_insts!=-1)
+            next = dyn->insts[ninst].x64.jmp_insts;
+        #endif
+        if(dyn->insts[ninst].x64.has_next && dyn->insts[next].x64.barrier) {
+            if(dyn->insts[next].x64.barrier&BARRIER_FLOAT)
+                fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
+            if(dyn->insts[next].x64.barrier&BARRIER_FLAGS) {
+                dyn->f.pending = 0;
+                dyn->f.dfnone = 0;
+                dyn->last_ip = 0;
             }
         }
-        #if STEP == 0
-        if(!ok && !need_epilog && box64_dynarec_bigblock && getProtection(addr+3)&~PROT_CUSTOM && !IsInHotPage(addr+3))
+        #if STEP != 0
+        if(!ok && !need_epilog && (addr < (dyn->start+dyn->isize))) {
+            ok = 1;
+            // we use the 1st predecessor here
+            int ii = ninst+1;
+            while(ii<dyn->size && !dyn->insts[ii].pred_sz)
+                ++ii;
+            if((dyn->insts[ii].x64.barrier&BARRIER_FULL)==BARRIER_FULL)
+                reset_n = -2;   // hack to say Barrier!
+            else {
+                reset_n = getNominalPred(dyn, ii);  // may get -1 if no predecessor is available
+                if(reset_n==-1) {
+                    reset_n = -2;
+                    MESSAGE(LOG_DEBUG, "Warning, Reset Caches mark not found\n");
+                }
+            }
+        }
+        #else
+        if(!ok && !need_epilog && box64_dynarec_bigblock && getProtection(addr+3)&~PROT_CUSTOM)
            if(*(uint32_t*)addr!=0) {   // check if need to continue (but if next 4 bytes are 0, stop)
                uintptr_t next = get_closest_next(dyn, addr);
                if(next && (
                    (((next-addr)<15) && is_nops(dyn, addr, next-addr))
-                    ||(((next-addr)<30) && is_instructions(dyn, addr, next-addr)) ))
+                    /*||(((next-addr)<30) && is_instructions(dyn, addr, next-addr))*/ ))
                {
-                    dynarec_log(LOG_DEBUG, "Extend block %p, %p -> %p (ninst=%d)\n", dyn, (void*)addr, (void*)next, ninst);
                    ok = 1;
+                    // need to find that instruction back to copy the caches, as the previous version cannot be used anymore
+                    reset_n = -2;
+                    for(int ii=0; ii<ninst; ++ii)
+                        if(dyn->insts[ii].x64.jmp == next) {
+                            reset_n = ii;
+                            ii=ninst;
+                        }
+                    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Extend block %p, %p -> %p (ninst=%d, jump from %d)\n", dyn, (void*)addr, (void*)next, ninst, reset_n);
                } else if(next && (next-addr)<30) {
-                    dynarec_log(LOG_DEBUG, "Cannot extend block %p -> %p (%02X %02X %02X %02X %02X %02X %02X %02x)\n", (void*)addr, (void*)next, PK(0), PK(1), PK(2), PK(3), PK(4), PK(5), PK(6), PK(7));
+                    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Cannot extend block %p -> %p (%02X %02X %02X %02X %02X %02X %02X %02x)\n", (void*)addr, (void*)next, PK(0), PK(1), PK(2), PK(3), PK(4), PK(5), PK(6), PK(7));
                }
            }
-        #else
-        if(!ok && !need_epilog && (addr < (dyn->start+dyn->isize))) {
-            ok = 1;
-        }
         #endif
         if(ok<0)    {ok = 0; need_epilog=1;}
         ++ninst;
         #if STEP == 0
-        if(ok && !isJumpTableDefault64((void*)addr))
+        if(ok && !isJumpTableDefault64((void*)addr) && (box64_dynarec_bigblock<2))
         #else
         if(ok && (ninst==dyn->size))
         #endif
         {
-            #if STEP == 3
-            dynarec_log(LOG_DEBUG, "Stopping block %p (%d / %d)\n",(void*)init_addr, ninst, dyn->size);
+            int j32;
+            MAYUSE(j32);
+            MESSAGE(LOG_DEBUG, "Stopping block %p (%d / %d)\n",(void*)init_addr, ninst, dyn->size);
+            --ninst;
+            if(!dyn->insts[ninst].x64.barrier) {
+                BARRIER(BARRIER_FLOAT);
+            }
+            #if STEP == 0
+            if(dyn->insts[ninst].x64.set_flags)
+                dyn->insts[ninst].x64.default_need |= X_PEND;
+            else
+                dyn->insts[ninst].x64.use_flags |= X_PEND;
             #endif
-            BARRIER(2);
-            fpu_purgecache(dyn, ninst, x1, x2, x3);
+            ++ninst;
+            fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
             jump_to_next(dyn, addr, 0, ninst);
             ok=0;
             need_epilog=0;
         }
     }
     if(need_epilog) {
-        fpu_purgecache(dyn, ninst, x1, x2, x3);
+        fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
         jump_to_epilog(dyn, ip, 0, ninst);  // no linker here, it's an unknown instruction
     }
     FINI;
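Editor's note: the header change just below turns both the barrier kinds and the SF_* flag states into bitmasks: BARRIER_FULL is BARRIER_FLAGS|BARRIER_FLOAT, and SF_SUBSET_PENDING decomposes into SF_SUB|SF_SET|SF_PENDING, which is what lets pass 0 derive the pending state with plain bit tests like (B)&SF_SET_PENDING. A quick check of that arithmetic (constants copied from the patch):

```c
#include <stdio.h>

#define BARRIER_FLAGS 1
#define BARRIER_FLOAT 2
#define BARRIER_FULL  3   /* FLAGS|FLOAT */

#define SF_SET     1
#define SF_PENDING 2
#define SF_SET_PENDING (SF_SET|SF_PENDING)
#define SF_SUB     4
#define SF_SUBSET  (SF_SUB|SF_SET)
#define SF_SUBSET_PENDING (SF_SUBSET|SF_PENDING)

int main(void)
{
    /* the pass-0 SETFLAGS computation: keep only the SET/PENDING bits */
    int B = SF_SUBSET_PENDING;
    printf("pending bits = %d\n", B & SF_SET_PENDING); /* 3: set + pending */
    /* a FULL barrier satisfies both single-purpose barrier tests */
    printf("float? %d flags? %d\n",
           !!(BARRIER_FULL & BARRIER_FLOAT), !!(BARRIER_FULL & BARRIER_FLAGS)); /* 1 1 */
    return 0;
}
```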
diff --git a/src/dynarec/dynarec_private.h b/src/dynarec/dynarec_private.h
index f6800148..71966bdf 100755
--- a/src/dynarec/dynarec_private.h
+++ b/src/dynarec/dynarec_private.h
@@ -1,6 +1,11 @@
 #ifndef __DYNAREC_PRIVATE_H_
 #define __DYNAREC_PRIVATE_H_
 
+#define BARRIER_NONE    0
+#define BARRIER_FLAGS   1
+#define BARRIER_FLOAT   2
+#define BARRIER_FULL    3
+
 // all flags for the use_flags field
 #define X_CF (1<<0)
 #define X_PF (1<<1)
@@ -9,26 +14,32 @@
 #define X_SF (1<<4)
 #define X_OF (1<<5)
 #define X_ALL ((1<<6)-1)
-#define X_PEND (0x1000)
+#define X_PEND (0x80)
 // all state flags
 #define SF_UNKNOWN 0
 #define SF_SET     1
 #define SF_PENDING 2
-#define SF_SET_PENDING 3
-#define SF_MAYSET  4
-#define SF_SUBSET  5
+#define SF_SET_PENDING (SF_SET|SF_PENDING)
+#define SF_SUB     4
+#define SF_SUBSET  (SF_SUB|SF_SET)
+#define SF_SUBSET_PENDING (SF_SUBSET|SF_PENDING)
+#define SF_MAYSET  8
 
 typedef struct instruction_x64_s {
-    uintptr_t addr;     //address of the instruction
-    int32_t size;       // size of the instruction
-    int barrier;        // next instruction is a jump point, so no optim allowed
-    uintptr_t jmp;      // offset to jump to, even if conditionnal (0 if not), no relative offset here
+    uintptr_t addr;     // address of the instruction
+    int32_t size;       // size of the instruction
+    uintptr_t jmp;      // offset to jump to, even if conditional (0 if not), no relative offset here
     int jmp_insts;      // instruction to jump to (-1 if out of the block)
-    uint32_t use_flags; // 0 or combination of X_?F
-    uint32_t set_flags; // 0 or combination of X_?F
-    uint32_t need_flags;    // calculated
-    int state_flags;    // One of SF_XXX state
+    uint8_t jmp_cond;   // 1 if conditional jump
+    uint8_t has_next;   // can this opcode continue to the next one?
+    uint8_t barrier;    // next instruction is a jump point, so no optim allowed
+    uint8_t state_flags;// One of the SF_XXX states
+    uint8_t use_flags;  // 0 or combination of X_?F
+    uint8_t set_flags;  // 0 or combination of X_?F
+    uint8_t default_need;// 0 or X_PEND basically
+    uint8_t need_flags; // calculated
+    uint8_t old_use;    // calculated
 } instruction_x64_t;
 
 void printf_x64_instruction(zydis_dec_t* dec, instruction_x64_t* inst, const char* name);
diff --git a/src/main.c b/src/main.c
index 92d23936..777641ee 100755
--- a/src/main.c
+++ b/src/main.c
@@ -391,11 +391,14 @@ void LoadLogEnv()
     p = getenv("BOX64_DYNAREC_BIGBLOCK");
     if(p) {
         if(strlen(p)==1) {
-            if(p[0]>='0' && p[0]<='1')
+            if(p[0]>='0' && p[0]<='2')
                 box64_dynarec_bigblock = p[0]-'0';
         }
         if(!box64_dynarec_bigblock)
-            printf_log(LOG_INFO, "Dynarec will not try to make big block\n");
+            printf_log(LOG_INFO, "Dynarec will not try to make big block\n");
+        else if (box64_dynarec_bigblock>1)
+            printf_log(LOG_INFO, "Dynarec will try to make bigger blocks\n");
+
     }
     p = getenv("BOX64_DYNAREC_STRONGMEM");
     if(p) {