author    ptitSeb <sebastien.chev@gmail.com>  2024-06-23 20:23:04 +0200
committer ptitSeb <sebastien.chev@gmail.com>  2024-06-23 20:23:04 +0200
commit    b0727d6f2b4f2119e835019be2bab577293f0e4f (patch)
tree      216009dc61e2fe03d79e4d38f30afb1c820f57b2 /src
parent    c66be063aa30b8ec8e3366fa7767596ef2ecc8bb (diff)
download  box64-b0727d6f2b4f2119e835019be2bab577293f0e4f.tar.gz
          box64-b0727d6f2b4f2119e835019be2bab577293f0e4f.zip
[ARM64_DYNAREC] Some small optims to a few AVX opcodes
Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_0f.c       17
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f.c    10
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c  23
3 files changed, 44 insertions, 6 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
index cfe57d6f..4a048e00 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -640,8 +640,16 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
             } else if(v2==v1 && (u8==0xe5)) {   // easy special case
                 VMOVQ(v0, v2);
                 VMOVeS(v0, 0, v0, 1);
+            } else if(MODREG && u8==0x88) {
+                VUZP1Q_32(v0, v2, v1);
+            } else if(MODREG && u8==0xdd) {
+                VUZP2Q_32(v0, v2, v1);
             } else {
-                d0 = fpu_get_scratch(dyn, ninst);
+                if((v0==v1) || (v0==v2)) {
+                    d0 = fpu_get_scratch(dyn, ninst); s0 = 0;
+                } else {
+                    d0 = v0; s0 = 1;
+                }
                 // first two elements from Vx
                 for(int i=0; i<2; ++i) {
                     VMOVeS(d0, i, v2, (u8>>(i*2))&3);
@@ -658,7 +666,7 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
                         VLD1_32(d0, i, x2);
                     }
                 }
-                VMOVQ(v0, d0);
+                if(v0!=d0) VMOVQ(v0, d0);
             }
             if(vex.l) {
                 if(MODREG)
@@ -672,7 +680,12 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
                 } else if(v2==v1 && (u8==0xe5)) {
                     VMOVQ(v0, v2);
                     VMOVeS(v0, 0, v0, 1);
+                } else if(MODREG && u8==0x88) {
+                    VUZP1Q_32(v0, v2, v1);
+                } else if(MODREG && u8==0xdd) {
+                    VUZP2Q_32(v0, v2, v1);
                 } else {
+                    if(s0) d0 = v0;
                     for(int i=0; i<2; ++i) {
                         VMOVeS(d0, i, v2, (u8>>(i*2))&3);
                     }
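
For reference, the VSHUFPS special cases added above map two common immediates onto single NEON unzip instructions. A minimal sketch of the x86 lane selection (an illustrative model, not box64 code; shufps_ref is a made-up name):

static void shufps_ref(float dst[4], const float a[4], const float b[4], unsigned imm8)
{
    // Lanes 0-1 come from the first source (Vx), lanes 2-3 from the second (Ex),
    // each picked by a 2-bit field of the immediate.
    dst[0] = a[(imm8 >> 0) & 3];
    dst[1] = a[(imm8 >> 2) & 3];
    dst[2] = b[(imm8 >> 4) & 3];
    dst[3] = b[(imm8 >> 6) & 3];
}

With imm8 == 0x88 the result is {a[0], a[2], b[0], b[2]}, the even lanes of each source, which is exactly UZP1 Vd.4S, Va.4S, Vb.4S; 0xDD picks the odd lanes, i.e. UZP2. The other change in this file lets the generic path build the result directly in v0 when Gx aliases neither Vx nor Ex, recording that choice in s0 so the upper-lane (vex.l) pass also skips the scratch register and the final VMOVQ.
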
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
index 2e23b0e1..ca141f08 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
@@ -770,7 +770,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             if(!vex.l) YMM0(gd);
             break;
         case 0x70:
-            INST_NAME("VPSHUFD Gx,Ex,Ib");
+            INST_NAME("VPSHUFD Gx, Ex, Ib");
             nextop = F8;
             if(MODREG) {
                 u8 = F8;
@@ -803,6 +803,14 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
                         VDUPQ_64(v0, v1, 1);
                     } else if(u8==0xB1) {
                         VREV64Q_32(v0, v1);
+                    } else if(u8==0xFA) {
+                        VZIP2Q_32(v0, v1, v1);
+                    } else if(u8==0x50) {
+                        VZIP1Q_32(v0, v1, v1);
+                    } else if(u8==0xF5) {
+                        VTRNQ2_32(v0, v1, v1);
+                    } else if(u8==0xA0) {
+                        VTRNQ1_32(v0, v1, v1);
                     } else if(v0!=v1) {
                         VMOVeS(v0, 0, v1, (u8>>(0*2))&3);
                         VMOVeS(v0, 1, v1, (u8>>(1*2))&3);
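
The four new VPSHUFD fast paths follow the same idea: a handful of immediates describe permutations that AArch64 can do in a single ZIP/TRN instruction. A sketch of the lane selection (illustrative only, not box64 code; pshufd_ref is a made-up name):

#include <stdint.h>

static void pshufd_ref(uint32_t dst[4], const uint32_t s[4], unsigned imm8)
{
    // Destination lane i takes source lane (imm8 >> 2*i) & 3.
    for (int i = 0; i < 4; ++i)
        dst[i] = s[(imm8 >> (2 * i)) & 3];
}

Decoding the special-cased immediates: 0x50 gives {s0,s0,s1,s1} (ZIP1 of the source with itself), 0xFA gives {s2,s2,s3,s3} (ZIP2), 0xA0 gives {s0,s0,s2,s2} (TRN1), and 0xF5 gives {s1,s1,s3,s3} (TRN2). The same four immediates are special-cased again for the VPERMILPS-style handler in dynarec_arm64_avx_66_0f3a.c below, which uses the identical 2-bit-per-lane encoding.
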
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index ae065c3f..a79c95a0 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -115,6 +115,14 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                 }
                 if(u8==0x00 || u8==0x55 || u8==0xAA || u8==0xFF)
                     VDUPQ_32(v0, (v0==v1)?q1:v1, u8&3);
+                else if(u8==0x50)
+                    VZIP1Q_32(v0, v1, v1);
+                else if(u8==0xFA)
+                    VZIP2Q_32(v0, v1, v1);
+                else if(u8==0xA0)
+                    VTRNQ1_32(v0, v1, v1);
+                else if(u8==0xF5)
+                    VTRNQ2_32(v0, v1, v1);
                 else for(int i=0; i<4; ++i)
                     VMOVeS(v0, i, (v0==v1)?q1:v1, (u8>>(i*2))&3);
             }
@@ -254,7 +262,9 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             nextop = F8;
             GETGX_empty_VXEX(q0, q2, q1, 1);
             u8 = F8;
-            if(q0==q1) {
+            if((u8&0xf)==0xf) {
+                if(q0!=q1) VMOVQ(q0, q1);
+            } else if(q0==q1) {
                 for(int i=0; i<4; ++i)
                     if(u8&(1<<i)) {
                         VMOVeS(q0, i, q1, i);
@@ -274,7 +284,9 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             }
             if(vex.l) {
                 GETGY_empty_VYEY(q0, q2, q1);
-                if(q0==q1) {
+                if((u8&0xf0)==0xf0) {
+                    if(q0!=q1) VMOVQ(q0, q1);
+                } else if(q0==q1) {
                     for(int i=0; i<4; ++i)
                         if(u8&(1<<(i+4))) {
                             VMOVeS(q0, i, q1, i);
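
The change to the 32-bit blend handler above (VBLENDPS-style semantics: one immediate bit per dword lane) short-circuits the all-ones mask: when every select bit of a 128-bit half is set, the whole half comes from Ex, so a single VMOVQ (or nothing, if the registers already alias) replaces four element moves. A sketch of the selection being modeled (illustrative, not box64 code; blendps_ref is a made-up name):

static void blendps_ref(float dst[4], const float a[4], const float b[4], unsigned mask4)
{
    // Bit i of the 4-bit mask picks the second source for 32-bit lane i.
    for (int i = 0; i < 4; ++i)
        dst[i] = (mask4 & (1u << i)) ? b[i] : a[i];
}

The low xmm half uses imm8 bits 0-3 (hence the (u8&0xf)==0xf test) and the upper ymm half uses bits 4-7 ((u8&0xf0)==0xf0), so each half can take the shortcut independently.
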
@@ -435,7 +447,12 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             u8 = F8;
             GETVX(v2, 0);
             GETGX_empty(v0);
-            GETGY_empty_VY(q0, q2, 0, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
+            if(v0==v2 && u8==1) {
+                GETGY_empty(q0, 0, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
+                q2 = q0;
+            } else {
+                GETGY_empty_VY(q0, q2, 0, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
+            }
             if(MODREG)
                 VMOVQ((u8&1)?q0:v0, v1);
             else
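
Finally, the last hunk avoids fetching the upper source lane for the VINSERTF128-style insert when it cannot be observed: if Gx and Vx are the same register and the immediate selects the upper half (u8==1), the destination's low xmm lane already holds the right data, so only the destination's upper lane needs to be claimed, and aliasing q2 to q0 presumably turns any later source-to-destination copy into a no-op (the rest of the handler is not shown in this hunk). A sketch of the instruction's semantics that makes the shortcut visible (illustrative model, not box64 code; names are made up):

#include <stdint.h>
#include <string.h>

typedef struct { uint8_t lane[2][16]; } ymm_t;   // two 128-bit lanes

static void vinsertf128_ref(ymm_t *dst, const ymm_t *src1,
                            const uint8_t src2[16], unsigned imm8)
{
    *dst = *src1;                            // copy both lanes of the first source
    memcpy(dst->lane[imm8 & 1], src2, 16);   // then overwrite the selected lane
}
/* When dst and src1 alias and imm8 & 1 == 1, the initial copy is a no-op and the
   upper lane is simply replaced, so the upper lane of src1 never has to be read. */
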