about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author: ptitSeb <sebastien.chev@gmail.com> 2025-04-24 19:04:42 +0200
committer: ptitSeb <sebastien.chev@gmail.com> 2025-04-24 19:04:42 +0200
commit: 4585b74310598068c36f9b9b30a940d7e96a1e1d (patch)
tree: c7feb164f55ff84a8429cd0e1a0b9207191d9b6d /src
parent: 768dfd37bb83c9214d05f70e035b140f7269e460 (diff)
download: box64-4585b74310598068c36f9b9b30a940d7e96a1e1d.tar.gz
box64-4585b74310598068c36f9b9b30a940d7e96a1e1d.zip
[ARM64_DYNAREC] Some optimisation to some (V)(P)BLEND* opcodes
Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_660f.c       | 41
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c | 43
2 files changed, 21 insertions, 63 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index 41ff0349..f6b5bbe4 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -466,9 +466,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     v1 = fpu_get_scratch(dyn, ninst);

                     if(q0!=q1) {

                         VSSHRQ_8(v1, v0, 7);    // bit[7]-> bit[7..0]

-                        VBICQ(q0, q0, v1);

-                        VANDQ(v1, q1, v1);

-                        VORRQ(q0, q0, v1);

+                        VBITQ(q0, q1, v1);

                     }

                     break;

 

@@ -1050,38 +1048,13 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 case 0x0E:

                     INST_NAME("PBLENDW Gx, Ex, Ib");

                     nextop = F8;

-                    GETGX(q0, 1);

-                    GETEX(q1, 0, 1);

+                    GETGX(v0, 1);

+                    GETEX(v1, 0, 1);

                     u8 = F8;

-                    i32 = 0;

-                    if(q0!=q1)

-                        while(u8) {

-                            if(u8&1) {

-                                if(!(i32&1) && u8&2) {

-                                    if(!(i32&3) && (u8&0xf)==0xf) {

-                                        // whole 64bits

-                                        VMOVeD(q0, i32>>2, q1, i32>>2);

-                                        i32+=4;

-                                        u8>>=4;

-                                    } else {

-                                        // 32bits

-                                        VMOVeS(q0, i32>>1, q1, i32>>1);

-                                        i32+=2;

-                                        u8>>=2;

-                                    }

-                                } else {

-                                    // 16 bits

-                                    VMOVeH(q0, i32, q1, i32);

-                                    i32++;

-                                    u8>>=1;

-                                }

-                            } else {

-                                // nope

-                                i32++;

-                                u8>>=1;

-                            }

-

-                        }

+                    q0 = fpu_get_scratch(dyn, ninst);

+                    MOVI_64(q0, u8);

+                    SXTL_8(q0, q0);    // expand 8bits to 16bits...

+                    VBITQ(v0, v1, q0);

                     break;

                 case 0x0F:

                     INST_NAME("PALIGNR Gx, Ex, Ib");

diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index 5830a2f2..549ec78d 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -343,12 +343,8 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                     MOVI_64(q0, u8);
                     SXTL_8(q0, q0);    // expand 8bits to 16bits...
                 }
-                if(v0==v1) {
-                    VBIFQ(v0, v2, q0);
-                } else {
-                    if(v0!=v2) VMOVQ(v0, v2);
-                    VBITQ(v0, v1, q0);
-                }
+                if(v0!=v2) VBIFQ(v0, v2, q0);
+                if(v0!=v1) VBITQ(v0, v1, q0);
             }
             if(!vex.l) YMM0(gd);
             break;
@@ -747,12 +743,8 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                     v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, u8, (MODREG)?((nextop&7)+(rex.b<<3)):-1);
                 }
                 VSSHRQ_32(q0, q1, 31);   // create mask
-                if(v0==v1)
-                    VBIFQ(v0, v2, q0);
-                else {
-                    if(v0!=v2) VMOVQ(v0, v2);
-                    VBITQ(v0, v1, q0);
-                }
+                if(v0!=v2) VBIFQ(v0, v2, q0);
+                if(v0!=v1) VBITQ(v0, v1, q0);
             }
             if(!vex.l) YMM0(gd);
             break;
@@ -776,41 +768,34 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                     v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, u8, (MODREG)?((nextop&7)+(rex.b<<3)):-1);
                 }
                 VSSHRQ_64(q0, q1, 63);   // create mask
-                if(v0==v1)
-                    VBIFQ(v0, v2, q0);
-                else {
-                    if(v0!=v2) VMOVQ(v0, v2);
-                    VBITQ(v0, v1, q0);
-                }
+                if(v0!=v2) VBIFQ(v0, v2, q0);
+                if(v0!=v1) VBITQ(v0, v1, q0);
             }
             if(!vex.l) YMM0(gd);
             break;
         case 0x4C:
-            INST_NAME("VBLENDPVB Gx, Vx, Ex, XMMImm8");
+            INST_NAME("VPBLENDVB Gx, Vx, Ex, XMMImm8");
             nextop = F8;
             q0 = fpu_get_scratch(dyn, ninst);
             u8 = geted_ib(dyn, addr, ninst, nextop)>>4;
+            ed = (nextop&7)+(rex.b<<3);
             for(int l=0; l<1+vex.l; ++l) {
                 if(!l) { 
                     q1 = sse_get_reg(dyn, ninst, x1, u8, 0);
                     GETGX_empty_VXEX(v0, v2, v1, 1); 
                     F8;
                 } else { 
-                    v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, u8, (MODREG)?((nextop&7)+(rex.b<<3)):-1);
+                    v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, u8, (MODREG)?ed:-1);
                     if(MODREG)
-                        v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, vex.v, u8);
+                        v1 = ymm_get_reg(dyn, ninst, x1, ed, 0, gd, vex.v, u8);
                     else
                         VLDR128_U12(v1, ed, fixedaddress+16);
-                    q1 = ymm_get_reg(dyn, ninst, x1, u8, 0, vex.v, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1);
-                    v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, u8, (MODREG)?((nextop&7)+(rex.b<<3)):-1);
+                    q1 = ymm_get_reg(dyn, ninst, x1, u8, 0, vex.v, gd, (MODREG)?ed:-1);
+                    v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, u8, (MODREG)?ed:-1);
                 }
                 VSSHRQ_8(q0, q1, 7);   // create mask
-                if(v0==v1)
-                    VBIFQ(v0, v2, q0);
-                else {
-                    if(v0!=v2) VMOVQ(v0, v2);
-                    VBITQ(v0, v1, q0);
-                }
+                if(v0!=v2) VBIFQ(v0, v2, q0);
+                if(v0!=v1) VBITQ(v0, v1, q0);
             }
             if(!vex.l) YMM0(gd);
             break;