author    ptitSeb <sebastien.chev@gmail.com>  2024-04-22 17:14:53 +0200
committer ptitSeb <sebastien.chev@gmail.com>  2024-04-22 17:14:53 +0200
commit    b6f58caf240f2e7c19f83efd774904ec862677ad
tree      00026d364e7aa9639e6d77618a07aadd603d786b
parent    8c19c3a72c7f163e29f7da858c610b79e238dfc9
[ARM64_DYNAREC] Improved some specific cases of pcmpestri opcode
-rw-r--r--  src/dynarec/arm64/arm64_emitter.h         9
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_660f.c    164
-rw-r--r--  src/dynarec/dynarec_private.h             2
3 files changed, 152 insertions, 23 deletions
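
Note (illustration only, not part of the commit): the fast path added below handles PCMPESTRI with the "equal each" aggregation (imm8 bits 3:2 == 0b10) and explicit lengths taken from EAX/EDX. A minimal scalar sketch of those semantics follows; helper and parameter names are invented for this sketch, and the flag computation (CF/ZF/SF/OF/AF/PF) is omitted.

#include <stdint.h>

/* |EAX| or |EDX| clamped to the number of packed elements */
static int clamp_len(int64_t v, int n) {
    int64_t a = (v < 0) ? -v : v;
    return (a > n) ? n : (int)a;
}

/* reg/mem: the two 128-bit operands viewed as byte arrays; returns the index for ECX */
static uint32_t pcmpestri_equal_each(const uint8_t *reg, const uint8_t *mem,
                                     int64_t rax, int64_t rdx, uint8_t imm8)
{
    int wide = imm8 & 1;                     /* 0: 16 bytes, 1: 8 words        */
    int n    = wide ? 8 : 16;                /* number of packed elements      */
    int lreg = clamp_len(rax, n);            /* valid elements in reg operand  */
    int lmem = clamp_len(rdx, n);            /* valid elements in mem operand  */
    int lmin = (lreg < lmem) ? lreg : lmem;
    int lmax = (lreg < lmem) ? lmem : lreg;

    uint32_t res1 = 0;                       /* IntRes1 */
    for (int i = 0; i < n; i++) {
        int eq = wide ? (((const uint16_t *)reg)[i] == ((const uint16_t *)mem)[i])
                      : (reg[i] == mem[i]);
        if (i >= lmax)      eq = 1;          /* both elements invalid: forced match   */
        else if (i >= lmin) eq = 0;          /* only one element valid: forced mismatch */
        res1 |= (uint32_t)eq << i;
    }

    uint32_t res2 = res1;                    /* IntRes2, after polarity (imm8 bits 5:4) */
    switch ((imm8 >> 4) & 3) {
        case 1: res2 ^= (1u << n) - 1;    break;   /* negate all elements          */
        case 3: res2 ^= (1u << lmem) - 1; break;   /* negate only valid elements   */
    }

    if (res2 == 0) return (uint32_t)n;       /* no match: ECX = element count */
    return (imm8 & 0x40) ? 31 - (uint32_t)__builtin_clz(res2)   /* most significant set bit  */
                         : (uint32_t)__builtin_ctz(res2);       /* least significant set bit */
}
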
diff --git a/src/dynarec/arm64/arm64_emitter.h b/src/dynarec/arm64/arm64_emitter.h
index 05728675..027bb0b0 100644
--- a/src/dynarec/arm64/arm64_emitter.h
+++ b/src/dynarec/arm64/arm64_emitter.h
@@ -1564,6 +1564,15 @@ int convert_bitmask(uint64_t bitmask);
 // Signed saturating extract Unsigned Narrow, takes Rn element and reduce 16->8 with Unsigned saturation and fit higher part of Rd
 #define SQXTUN2_8(Rd, Rn)           EMIT(QXTUN_vector(1, 1, 0b00, Rn, Rd))
 
+#define XTN_vector(Q, size, Rn, Rd) ((Q)<<30 | 0b01110<<24 | (size)<<22 | 0b10000<<17 | 0b10010<<12 | 0b10<<10 | (Rn)<<5 | (Rd))
+// Xtract narrow to X bits
+#define XTN_8(Vd, Vn)               EMIT(XTN_vector(0, 0b00, Vn, Vd))
+#define XTN_16(Vd, Vn)              EMIT(XTN_vector(0, 0b01, Vn, Vd))
+#define XTN_32(Vd, Vn)              EMIT(XTN_vector(0, 0b10, Vn, Vd))
+#define XTN2_8(Vd, Vn)              EMIT(XTN_vector(1, 0b00, Vn, Vd))
+#define XTN2_16(Vd, Vn)             EMIT(XTN_vector(1, 0b01, Vn, Vd))
+#define XTN2_32(Vd, Vn)             EMIT(XTN_vector(1, 0b10, Vn, Vd))
+
 // Integer CMP
 // EQual
 #define CMEQ_vector(Q, U, size, Rm, Rn, Rd)     ((Q)<<30 | (U)<<29 | 0b01110<<24 | (size)<<22 | 1<<21 | (Rm)<<16 | 0b10001<<11 | 1<<10 | (Rn)<<5 | (Rd))
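
Aside (not part of the patch): XTN/XTN2 narrow each source element to half its width by truncation; XTN writes the packed result into the low 64 bits of Vd, XTN2 into the high 64 bits. A scalar model of the effect of the new XTN_8 macro (eight 16-bit lanes down to eight bytes) could look like this:

#include <stdint.h>

/* Scalar model of the effect of "XTN Vd.8B, Vn.8H": truncate eight 16-bit
 * lanes to their low byte and pack them into the low 64 bits of Vd. */
static void xtn_8(uint8_t dst[8], const uint16_t src[8]) {
    for (int i = 0; i < 8; i++)
        dst[i] = (uint8_t)src[i];    /* keep the low 8 bits of each lane */
}

In the pcmpestri path below, this is used after VCMEQQ_16 to compact the 0x0000/0xFFFF word mask into 0x00/0xFF bytes so the existing 8-byte mask-extraction sequence can be reused.
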
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index 505e135e..bed76d36 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -1236,7 +1236,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     ADDx_U12(x3, xEmu, offsetof(x64emu_t, xmm[gd]));
                     if(MODREG) {
                         ed = (nextop&7)+(rex.b<<3);
-                        sse_reflect_reg(dyn, ninst, ed);
+                        if(ed>7)
+                            sse_reflect_reg(dyn, ninst, ed);
                         ADDx_U12(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
                     } else {
                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
@@ -1286,33 +1287,147 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     SETFLAGS(X_ALL, SF_SET);
                     nextop = F8;
                     GETG;
-                    sse_reflect_reg(dyn, ninst, gd);
-                    ADDx_U12(x3, xEmu, offsetof(x64emu_t, xmm[gd]));
-                    if(MODREG) {
-                        ed = (nextop&7)+(rex.b<<3);
-                        sse_reflect_reg(dyn, ninst, ed);
-                        ADDx_U12(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
+                    u8 = geted_ib(dyn, addr, ninst, nextop);
+                    if((u8&0b1100)==0b1000) {
+                        // this case is (un)signed word, equal each
+                        GETGX(v0, 0);
+                        GETEX(v1, 0, 1);
+                        u8 = F8;
+                        q0 = fpu_get_scratch(dyn);
+                        if(u8&1) {
+                            //16bits
+                            VCMEQQ_16(q0, v0, v1);   // equal => mask regs
+                            XTN_8(q0, q0);          // 8 bits mask, in lower 64bits
+                            // transform that a mask in x1
+                            q1 = fpu_get_scratch(dyn);
+                            VSHL_8(q0, q0, 7);  // keep only bit 0x80
+                            TABLE64(x1, (uintptr_t)&mask_shift8);
+                            VLDR64_U12(q1, x1, 0);     // load shift
+                            USHL_8(q0, q0, q1); // shift
+                            UADDLV_8(q0, q0);   // accumulate
+                            VMOVBto(x1, q0, 0);
+                        } else {
+                            //8 bits
+                            VCMEQQ_8(q0, v0, v1);   // equal => mask regs
+                            // transform that a mask in x1
+                            q1 = fpu_get_scratch(dyn);
+                            d0 = fpu_get_scratch(dyn);
+                            VSHL_8(d0, q0, 7);  // keep only bit 0x80
+                            TABLE64(x1, (uintptr_t)&mask_shift8);
+                            VLDR64_U12(q1, x1, 0);     // load shift
+                            USHL_8(d0, d0, q1); // shift
+                            UADDLV_8(d0, d0);   // accumulate
+                            VMOVBto(x1, d0, 0);
+                            // high part
+                            VMOVeD(d0, 0, q0, 1);
+                            VSHL_8(d0, d0, 7);  // keep only bit 0x80
+                            USHL_8(d0, d0, q1); // shift
+                            UADDLV_8(d0, d0);   // accumulate
+                            VMOVBto(x2, d0, 0);
+                            BFIw(x1, x2, 8, 8); // insert
+                        }
+                        // get abs of eax / edx and find min
+                        ADDSxw_U12(x2, xRAX, 0);
+                        Bcond(cPL, 4+4);
+                        NEGxw_REG(x2, x2);
+                        ADDSxw_U12(x3, xRDX, 0);
+                        Bcond(cPL, 4+4);
+                        NEGxw_REG(x3, x3);
+                        MOV32w(x4, (u8&1)?8:16);
+                        CMPSw_REG(x3, x4);
+                        CSELw(x3, x3, x4, cLT); // x3 is lmem
+                        CMPSw_REG(x2, x4);
+                        CSELw(x2, x2, x4, cLT); // x2 is lreg
+                        CMPSw_REG(x2, x3);
+                        CSELw(x5, x3, x2, cLT); // x5 is max(lmem, lreg)
+                        CSELw(x2, x2, x3, cLT); // x2 is min(lmem, lreg)
+                        // x2 is min length 0-n_packed
+                        MVNw_REG(x4, xZR);
+                        LSLw_REG(x7, x4, x2);
+                        BICw_REG(x1, x1, x7);
+                        LSLw_REG(x4, x4, x5);
+                        ORRw_REG(x1, x1, x4);
+                        ANDw_mask(x1, x1, 0, (u8&1)?7:15);
+                        // x1 is intres1, transform to intres2
+                        switch((u8>>4)&3) {
+                            case 0b01:
+                                MOV32w(x4, (1<<((u8&1)?8:16))-1);
+                                EORw_REG(x1, x1, x4);
+                                break;
+                            case 0b11:
+                                MOV32w(x4, 1);
+                                LSLw_REG(x4, x4, x3);
+                                SUBw_U12(x4, x4, 1);
+                                EORw_REG(x1, x1, x4);
+                        }
+                        // flags
+                        IFX(X_ALL) {
+                            SET_DFNONE(x4);
+                            IFX(X_CF) {
+                                CMPSw_REG(x1, xZR);
+                                CSETw(x4, cNE);
+                                BFIw(xFlags, x4, F_CF, 1);
+                            }
+                            IFX(X_ZF|X_SF) {
+                                MOV32w(x4, 8);
+                                IFX(X_ZF) {
+                                    CMPSw_REG(x3, x4);
+                                    CSETw(x4, cLT);
+                                    BFIw(xFlags, x4, F_ZF, 1);
+                                }
+                                IFX(F_SF) {
+                                    CMPSw_REG(x2, x4);
+                                    CSETw(x4, cLT);
+                                    BFIw(xFlags, x4, F_SF, 1);
+                                }
+                            }
+                            IFX(X_OF) {
+                                BFIw(xFlags, x1, F_OF, 1);
+                            }
+                            IFX(X_AF) {
+                                CMPSw_U12(x1, 0);
+                                CSETw(x4, cEQ);
+                                CMPSw_U12(x3, (u8&1)?8:16);
+                                CSETw(x5, cEQ);
+                                ANDw_REG(x4, x4, x5);
+                                BFIw(xFlags, x4, F_AF, 1);
+                            }
+                            IFX(X_PF) {
+                                BFCw(xFlags, F_PF, 1);
+                            }
+                        }
                     } else {
-                        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
-                        if(ed!=x1) {
-                            MOVx_REG(x1, ed);
+                        if(gd>7)    // no need to reflect cache as xmm0-xmm7 will be saved before the function call anyway
+                            sse_reflect_reg(dyn, ninst, gd);
+                        ADDx_U12(x3, xEmu, offsetof(x64emu_t, xmm[gd]));
+                        if(MODREG) {
+                            ed = (nextop&7)+(rex.b<<3);
+                            if(ed>7)
+                                sse_reflect_reg(dyn, ninst, ed);
+                            ADDx_U12(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
+                        } else {
+                            addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
+                            if(ed!=x1) {
+                                MOVx_REG(x1, ed);
+                            }
                         }
+                        MOVw_REG(x2, xRDX);
+                        MOVw_REG(x4, xRAX);
+                        u8 = F8;
+                        MOV32w(x5, u8);
+                        CALL(sse42_compare_string_explicit_len, x1);
                     }
-                    MOVx_REG(x2, xRDX);
-                    MOVx_REG(x4, xRAX);
-                    u8 = F8;
-                    MOV32w(x5, u8);
-                    CALL(sse42_compare_string_explicit_len, x1);
-                    CBNZw_MARK(x1);
-                    MOV32w(xRCX, (u8&1)?8:16);
-                    B_NEXT_nocond;
-                    MARK;
                     if(u8&0b1000000) {
+                        CBNZw_MARK(x1);
+                        MOV32w(xRCX, (u8&1)?8:16);
+                        B_NEXT_nocond;
+                        MARK;
                         CLZw(xRCX, x1);
                         MOV32w(x2, 31);
                         SUBw_REG(xRCX, x2, xRCX);
                     } else {
-                        RBITxw(xRCX, x1);
+                        ORRw_mask(xRCX, x1, (u8&1)?0b011000:0b010000,0);
+                        RBITw(xRCX, xRCX);
                         CLZw(xRCX, xRCX);
                     }
                     break;
@@ -1325,7 +1440,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     ADDx_U12(x2, xEmu, offsetof(x64emu_t, xmm[gd]));
                     if(MODREG) {
                         ed = (nextop&7)+(rex.b<<3);
-                        sse_reflect_reg(dyn, ninst, ed);
+                        if(ed>7)
+                            sse_reflect_reg(dyn, ninst, ed);
                         ADDx_U12(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
                     } else {
                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
@@ -1373,11 +1489,13 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     SETFLAGS(X_OF|X_CF|X_AF|X_ZF|X_SF|X_PF, SF_SET);
                     nextop = F8;
                     GETG;
-                    sse_reflect_reg(dyn, ninst, gd);
+                    if(gd>7)
+                        sse_reflect_reg(dyn, ninst, gd);
                     ADDx_U12(x2, xEmu, offsetof(x64emu_t, xmm[gd]));
                     if(MODREG) {
                         ed = (nextop&7)+(rex.b<<3);
-                        sse_reflect_reg(dyn, ninst, ed);
+                        if(ed>7)
+                            sse_reflect_reg(dyn, ninst, ed);
                         ADDx_U12(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
                     } else {
                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
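
Aside (illustration only): the mask extraction used in the new fast path above (VSHL_8 by 7, USHL_8 with per-lane shift amounts loaded from mask_shift8, then UADDLV_8) is the usual NEON substitute for x86 PMOVMSKB. A scalar sketch of the idea follows; the contents of the shift table are an assumption here, standing in for box64's mask_shift8, and a negative USHL amount shifts right.

#include <stdint.h>

/* Scalar sketch of the NEON movemask idiom. cmp[i] is the byte compare
 * result for lane i (0x00 or 0xFF). */
static uint32_t neon_movemask8(const uint8_t cmp[8]) {
    static const int8_t shift[8] = { -7, -6, -5, -4, -3, -2, -1, 0 };  /* assumed mask_shift8 */
    uint32_t acc = 0;
    for (int i = 0; i < 8; i++) {
        uint8_t b = (uint8_t)(cmp[i] << 7);                 /* VSHL_8 #7: keep only bit 0x80 */
        b = (shift[i] < 0) ? (uint8_t)(b >> -shift[i])
                           : (uint8_t)(b << shift[i]);      /* USHL_8 by the per-lane shift  */
        acc += b;                                           /* UADDLV_8: horizontal add      */
    }
    return acc;                 /* bit i is set iff lane i compared equal */
}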

diff --git a/src/dynarec/dynarec_private.h b/src/dynarec/dynarec_private.h
index 86920700..0c1094d3 100644
--- a/src/dynarec/dynarec_private.h
+++ b/src/dynarec/dynarec_private.h
@@ -24,6 +24,8 @@
 #define SF_SUB      4
 #define SF_SUBSET   (SF_SUB|SF_SET)
 #define SF_SUBSET_PENDING   (SF_SUBSET|SF_PENDING)
+#define SF_DF       8
+#define SF_SET_DF   (SF_SET|SF_DF)
 
 typedef struct instruction_x64_s {
     uintptr_t   addr;       //address of the instruction