-rw-r--r--   src/dynarec/arm64/arm64_emitter.h       |   9
-rw-r--r--   src/dynarec/arm64/dynarec_arm64_660f.c  | 164
-rw-r--r--   src/dynarec/dynarec_private.h           |   2
3 files changed, 152 insertions, 23 deletions
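
Note for context while reading the patch below: the new inline path in dynarec_arm64_660f.c only covers PCMPESTRI with the "equal each" aggregation, i.e. imm8 bits [3:2] equal to 0b10, which is what the (u8&0b1100)==0b1000 test selects; every other imm8 combination still falls back to the sse42_compare_string_explicit_len helper call. A minimal illustration of that instruction class with the SSE4.2 intrinsics follows (this example is mine, not part of the patch; build with -msse4.2):

#include <stdio.h>
#include <nmmintrin.h>   /* SSE4.2 intrinsics */

int main(void)
{
    __m128i a = _mm_setr_epi8('b','o','x','6','4',0,0,0,0,0,0,0,0,0,0,0);
    __m128i b = _mm_setr_epi8('b','o','x','8','6',0,0,0,0,0,0,0,0,0,0,0);

    /* imm8 = 0x08: unsigned bytes, "equal each", positive polarity, least index */
    int idx = _mm_cmpestri(a, 5, b, 5, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
    printf("first matching byte index: %d\n", idx);   /* 0 ('b' == 'b') */

    /* imm8 = 0x18: same comparison, negated -> index of first mismatch */
    idx = _mm_cmpestri(a, 5, b, 5, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH |
                                   _SIDD_NEGATIVE_POLARITY);
    printf("first differing byte index: %d\n", idx);  /* 3 ('6' != '8') */
    return 0;
}

Bit 0 of imm8 selects byte vs word elements and bits [5:4] select the polarity, which is why the inline code below branches on (u8&1) and switches on (u8>>4)&3.
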
diff --git a/src/dynarec/arm64/arm64_emitter.h b/src/dynarec/arm64/arm64_emitter.h
index 05728675..027bb0b0 100644
--- a/src/dynarec/arm64/arm64_emitter.h
+++ b/src/dynarec/arm64/arm64_emitter.h
@@ -1564,6 +1564,15 @@ int convert_bitmask(uint64_t bitmask);
 // Signed saturating extract Unsigned Narrow, takes Rn element and reduce 16->8 with Unsigned saturation and fit higher part of Rd
 #define SQXTUN2_8(Rd, Rn)           EMIT(QXTUN_vector(1, 1, 0b00, Rn, Rd))
 
+#define XTN_vector(Q, size, Rn, Rd)     ((Q)<<30 | 0b01110<<24 | (size)<<22 | 0b10000<<17 | 0b10010<<12 | 0b10<<10 | (Rn)<<5 | (Rd))
+// Xtract narrow to X bits
+#define XTN_8(Vd, Vn)               EMIT(XTN_vector(0, 0b00, Vn, Vd))
+#define XTN_16(Vd, Vn)              EMIT(XTN_vector(0, 0b01, Vn, Vd))
+#define XTN_32(Vd, Vn)              EMIT(XTN_vector(0, 0b10, Vn, Vd))
+#define XTN2_8(Vd, Vn)              EMIT(XTN_vector(1, 0b00, Vn, Vd))
+#define XTN2_16(Vd, Vn)             EMIT(XTN_vector(1, 0b01, Vn, Vd))
+#define XTN2_32(Vd, Vn)             EMIT(XTN_vector(1, 0b10, Vn, Vd))
+
 // Integer CMP
 // EQual
 #define CMEQ_vector(Q, U, size, Rm, Rn, Rd)    ((Q)<<30 | (U)<<29 | 0b01110<<24 | (size)<<22 | 1<<21 | (Rm)<<16 | 0b10001<<11 | 1<<10 | (Rn)<<5 | (Rd))
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index 505e135e..bed76d36 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -1236,7 +1236,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             ADDx_U12(x3, xEmu, offsetof(x64emu_t, xmm[gd]));
             if(MODREG) {
                 ed = (nextop&7)+(rex.b<<3);
-                sse_reflect_reg(dyn, ninst, ed);
+                if(ed>7)
+                    sse_reflect_reg(dyn, ninst, ed);
                 ADDx_U12(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
@@ -1286,33 +1287,147 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             SETFLAGS(X_ALL, SF_SET);
             nextop = F8;
             GETG;
-            sse_reflect_reg(dyn, ninst, gd);
-            ADDx_U12(x3, xEmu, offsetof(x64emu_t, xmm[gd]));
-            if(MODREG) {
-                ed = (nextop&7)+(rex.b<<3);
-                sse_reflect_reg(dyn, ninst, ed);
-                ADDx_U12(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
+            u8 = geted_ib(dyn, addr, ninst, nextop);
+            if((u8&0b1100)==0b1000) {
+                // this case is (un)signed word, equal each
+                GETGX(v0, 0);
+                GETEX(v1, 0, 1);
+                u8 = F8;
+                q0 = fpu_get_scratch(dyn);
+                if(u8&1) {
+                    //16bits
+                    VCMEQQ_16(q0, v0, v1);  // equal => mask regs
+                    XTN_8(q0, q0);          // 8 bits mask, in lower 64bits
+                    // transform that a mask in x1
+                    q1 = fpu_get_scratch(dyn);
+                    VSHL_8(q0, q0, 7);      // keep only bit 0x80
+                    TABLE64(x1, (uintptr_t)&mask_shift8);
+                    VLDR64_U12(q1, x1, 0);  // load shift
+                    USHL_8(q0, q0, q1);     // shift
+                    UADDLV_8(q0, q0);       // accumulate
+                    VMOVBto(x1, q0, 0);
+                } else {
+                    //8 bits
+                    VCMEQQ_8(q0, v0, v1);   // equal => mask regs
+                    // transform that a mask in x1
+                    q1 = fpu_get_scratch(dyn);
+                    d0 = fpu_get_scratch(dyn);
+                    VSHL_8(d0, q0, 7);      // keep only bit 0x80
+                    TABLE64(x1, (uintptr_t)&mask_shift8);
+                    VLDR64_U12(q1, x1, 0);  // load shift
+                    USHL_8(d0, d0, q1);     // shift
+                    UADDLV_8(d0, d0);       // accumulate
+                    VMOVBto(x1, d0, 0);
+                    // high part
+                    VMOVeD(d0, 0, q0, 1);
+                    VSHL_8(d0, d0, 7);      // keep only bit 0x80
+                    USHL_8(d0, d0, q1);     // shift
+                    UADDLV_8(d0, d0);       // accumulate
+                    VMOVBto(x2, d0, 0);
+                    BFIw(x1, x2, 8, 8);     // insert
+                }
+                // get abs of eax / edx and find min
+                ADDSxw_U12(x2, xRAX, 0);
+                Bcond(cPL, 4+4);
+                NEGxw_REG(x2, x2);
+                ADDSxw_U12(x3, xRDX, 0);
+                Bcond(cPL, 4+4);
+                NEGxw_REG(x3, x3);
+                MOV32w(x4, (u8&1)?8:16);
+                CMPSw_REG(x3, x4);
+                CSELw(x3, x3, x4, cLT);     // x3 is lmem
+                CMPSw_REG(x2, x4);
+                CSELw(x2, x2, x4, cLT);     // x2 is lreg
+                CMPSw_REG(x2, x3);
+                CSELw(x5, x3, x2, cLT);     // x5 is max(lmem, lreg)
+                CSELw(x2, x2, x3, cLT);     // x2 is min(lmem, lreg)
+                // x2 is min length 0-n_packed
+                MVNw_REG(x4, xZR);
+                LSLw_REG(x7, x4, x2);
+                BICw_REG(x1, x1, x7);
+                LSLw_REG(x4, x4, x5);
+                ORRw_REG(x1, x1, x4);
+                ANDw_mask(x1, x1, 0, (u8&1)?7:15);
+                // x1 is intres1, transform to intres2
+                switch((u8>>4)&3) {
+                    case 0b01:
+                        MOV32w(x4, (1<<((u8&1)?8:16))-1);
+                        EORw_REG(x1, x1, x4);
+                        break;
+                    case 0b11:
+                        MOV32w(x4, 1);
+                        LSLw_REG(x4, x4, x3);
+                        SUBw_U12(x4, x4, 1);
+                        EORw_REG(x1, x1, x4);
+                }
+                // flags
+                IFX(X_ALL) {
+                    SET_DFNONE(x4);
+                    IFX(X_CF) {
+                        CMPSw_REG(x1, xZR);
+                        CSETw(x4, cNE);
+                        BFIw(xFlags, x4, F_CF, 1);
+                    }
+                    IFX(X_ZF|X_SF) {
+                        MOV32w(x4, 8);
+                        IFX(X_ZF) {
+                            CMPSw_REG(x3, x4);
+                            CSETw(x4, cLT);
+                            BFIw(xFlags, x4, F_ZF, 1);
+                        }
+                        IFX(F_SF) {
+                            CMPSw_REG(x2, x4);
+                            CSETw(x4, cLT);
+                            BFIw(xFlags, x4, F_SF, 1);
+                        }
+                    }
+                    IFX(X_OF) {
+                        BFIw(xFlags, x1, F_OF, 1);
+                    }
+                    IFX(X_AF) {
+                        CMPSw_U12(x1, 0);
+                        CSETw(x4, cEQ);
+                        CMPSw_U12(x3, (u8&1)?8:16);
+                        CSETw(x5, cEQ);
+                        ANDw_REG(x4, x4, x5);
+                        BFIw(xFlags, x4, F_AF, 1);
+                    }
+                    IFX(X_PF) {
+                        BFCw(xFlags, F_PF, 1);
+                    }
+                }
             } else {
-                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
-                if(ed!=x1) {
-                    MOVx_REG(x1, ed);
+                if(gd>7)    // no need to reflect cache as xmm0-xmm7 will be saved before the function call anyway
+                    sse_reflect_reg(dyn, ninst, gd);
+                ADDx_U12(x3, xEmu, offsetof(x64emu_t, xmm[gd]));
+                if(MODREG) {
+                    ed = (nextop&7)+(rex.b<<3);
+                    if(ed>7)
+                        sse_reflect_reg(dyn, ninst, ed);
+                    ADDx_U12(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
+                } else {
+                    addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
+                    if(ed!=x1) {
+                        MOVx_REG(x1, ed);
+                    }
                 }
+                MOVw_REG(x2, xRDX);
+                MOVw_REG(x4, xRAX);
+                u8 = F8;
+                MOV32w(x5, u8);
+                CALL(sse42_compare_string_explicit_len, x1);
             }
-            MOVx_REG(x2, xRDX);
-            MOVx_REG(x4, xRAX);
-            u8 = F8;
-            MOV32w(x5, u8);
-            CALL(sse42_compare_string_explicit_len, x1);
-            CBNZw_MARK(x1);
-            MOV32w(xRCX, (u8&1)?8:16);
-            B_NEXT_nocond;
-            MARK;
             if(u8&0b1000000) {
+                CBNZw_MARK(x1);
+                MOV32w(xRCX, (u8&1)?8:16);
+                B_NEXT_nocond;
+                MARK;
                 CLZw(xRCX, x1);
                 MOV32w(x2, 31);
                 SUBw_REG(xRCX, x2, xRCX);
             } else {
-                RBITxw(xRCX, x1);
+                ORRw_mask(xRCX, x1, (u8&1)?0b011000:0b010000, 0);
+                RBITw(xRCX, xRCX);
                 CLZw(xRCX, xRCX);
             }
             break;
@@ -1325,7 +1440,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             ADDx_U12(x2, xEmu, offsetof(x64emu_t, xmm[gd]));
             if(MODREG) {
                 ed = (nextop&7)+(rex.b<<3);
-                sse_reflect_reg(dyn, ninst, ed);
+                if(ed>7)
+                    sse_reflect_reg(dyn, ninst, ed);
                 ADDx_U12(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
@@ -1373,11 +1489,13 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             SETFLAGS(X_OF|X_CF|X_AF|X_ZF|X_SF|X_PF, SF_SET);
             nextop = F8;
             GETG;
-            sse_reflect_reg(dyn, ninst, gd);
+            if(gd>7)
+                sse_reflect_reg(dyn, ninst, gd);
             ADDx_U12(x2, xEmu, offsetof(x64emu_t, xmm[gd]));
             if(MODREG) {
                 ed = (nextop&7)+(rex.b<<3);
-                sse_reflect_reg(dyn, ninst, ed);
+                if(ed>7)
+                    sse_reflect_reg(dyn, ninst, ed);
                 ADDx_U12(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
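
A note on the bitmask extraction used above: the VSHL by 7 / USHL with per-lane shift counts loaded from mask_shift8 / UADDLV sequence is the usual NEON substitute for PMOVMSKB, turning the 0x00/0xFF lanes produced by VCMEQ into a packed bitmask in a general register. The mask_shift8 table itself is defined elsewhere in the dynarec sources and is not visible in this diff; the scalar model below assumes it holds the signed per-byte shift amounts {-7..0}.

#include <stdint.h>
#include <stdio.h>

/* Assumed contents of mask_shift8: signed per-lane shifts so that the 0x80
   bit of byte i ends up at bit position i (negative count = shift right). */
static const int8_t mask_shift8_model[8] = { -7, -6, -5, -4, -3, -2, -1, 0 };

/* Model of: VSHL_8(v, v, 7); USHL_8(v, v, shifts); UADDLV_8(v, v)
   'cmp' holds 8 comparison-result bytes, each 0x00 or 0xFF (as from VCMEQ). */
static uint8_t movemask8_model(const uint8_t cmp[8])
{
    uint16_t acc = 0;
    for (int i = 0; i < 8; i++) {
        uint8_t lane = (uint8_t)(cmp[i] << 7);      /* keep only bit 0x80 */
        int8_t  sh   = mask_shift8_model[i];
        lane = (sh < 0) ? (uint8_t)(lane >> -sh)    /* USHL with a negative */
                        : (uint8_t)(lane << sh);    /* count shifts right   */
        acc += lane;                                /* UADDLV: horizontal add */
    }
    return (uint8_t)acc;                            /* bit i == (cmp[i] != 0) */
}

int main(void)
{
    const uint8_t cmp[8] = { 0xFF, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0xFF };
    printf("mask = 0x%02x\n", movemask8_model(cmp));   /* prints 0x8d */
    return 0;
}

In the patch, the word-element path narrows the comparison result with XTN_8 first so one such pass covers all 8 words, while the byte path runs the pass twice (low half, then the high half moved down with VMOVeD) and merges the two 8-bit masks with BFIw.
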
diff --git a/src/dynarec/dynarec_private.h b/src/dynarec/dynarec_private.h
index 86920700..0c1094d3 100644
--- a/src/dynarec/dynarec_private.h
+++ b/src/dynarec/dynarec_private.h
@@ -24,6 +24,8 @@
 #define SF_SUB              4
 #define SF_SUBSET           (SF_SUB|SF_SET)
 #define SF_SUBSET_PENDING   (SF_SUBSET|SF_PENDING)
+#define SF_DF               8
+#define SF_SET_DF           (SF_SET|SF_DF)
 
 typedef struct instruction_x64_s {
     uintptr_t addr;     //address of the instruction
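
One more note, referring back to the dynarec_arm64_660f.c hunk above: in the "least significant index" case (imm8 bit 6 clear), the old code branched to load a default ECX when the result mask was zero, while the rewritten tail first ORs a sentinel bit into IntRes2 (bit 8 for words, bit 16 for bytes, if I read the ORRw_mask logical-immediate arguments correctly) and then performs RBITw+CLZw, i.e. a count of trailing zeros, so an all-zero result naturally yields ECX equal to the element count without a branch. A scalar sketch of that computation (my illustration, not code from the patch):

#include <stdint.h>
#include <stdio.h>

/* "Least significant index" tail: OR in a sentinel bit just above the valid
   lanes, then count trailing zeros. n is 8 for word elements, 16 for bytes. */
static int pcmpestri_lsb_index(uint32_t intres2, int n)
{
    uint32_t v = intres2 | (1u << n);     /* sentinel: result is n when intres2 == 0 */
    int idx = 0;
    while (!(v & 1)) { v >>= 1; idx++; }  /* RBIT+CLZ == count trailing zeros */
    return idx;
}

int main(void)
{
    printf("%d\n", pcmpestri_lsb_index(0x0028, 8));  /* 3: lowest set bit of IntRes2 */
    printf("%d\n", pcmpestri_lsb_index(0x0000, 8));  /* 8: no match -> ECX = #elements */
    return 0;
}
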