author    ptitSeb <sebastien.chev@gmail.com>  2024-06-05 20:40:26 +0200
committer ptitSeb <sebastien.chev@gmail.com>  2024-06-05 20:40:26 +0200
commit    79c1ad1431277d17cc7ce34f255b0af4c23ccbaa (patch)
tree      9b59ff894dd7bc771e31ca841c7c6349c4ce69b3 /src
parent    c0ebe095213b5048b54ff41d0d5550750af2cbdb (diff)
[ARM64_DYNAREC] Added a few more AVX opcodes
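New opcodes: VSQRTPS, VPSRAW, VPSRAD, VPHADDD, VPBLENDVB, VPSHUFHW and VCVTDQ2PD, plus a rework of the VCMPSS comparison setup.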
Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_0f.c      |  11
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f.c   |  32
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c |  10
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c |  29
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c   |  64
5 files changed, 138 insertions, 8 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
index 268e1f86..044c131d 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -320,7 +320,16 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
                 }
             }
             break;
-
+        case 0x51:
+            INST_NAME("VSQRTPS Gx, Ex");
+            nextop = F8;
+            SKIPTEST(x1);
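+            // one pass for the low 128-bit lane, a second for the upper lane when
+            // VEX.L (256-bit) is set; NEON FSQRT maps directly onto SQRTPS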
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_EX(q0, q1, 0); } else { GETGY_empty_EY(q0, q1); }
+                VFSQRTQS(q0, q1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
         case 0x52:
             INST_NAME("VRSQRTPS Gx, Ex");
             nextop = F8;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
index b7a3f80a..c9243180 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
@@ -1396,6 +1396,38 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             if(!vex.l) YMM0(gd);
             break;
+        case 0xE1:
+            INST_NAME("VPSRAW Gx, Vx, Ex");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            q1 = fpu_get_scratch(dyn, ninst);
+            MOVI_32(q1, 15);
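+            // NEON SSHL has no right-shift-by-register form, but a negative count
+            // shifts right: clamp the x86 count to 15 (larger counts fill with the
+            // sign bit anyway) and negate it before broadcasting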
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                UQXTN_32(q0, v1);
+                UMIN_32(q0, q0, q1);    // limit to 0 .. +15 values
+                NEG_16(q0, q0);
+                VDUPQ_16(q0, q0, 0);    // only the low 8bits will be used anyway
+                SSHLQ_16(v0, v2, q0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+        case 0xE2:
+            INST_NAME("VPSRAD Gx, Vx, Ex");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            q1 = fpu_get_scratch(dyn, ninst);
+            MOVI_32(q1, 31);
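+            // same scheme as VPSRAW above, with the count clamped to 31 for 32-bit lanes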
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                UQXTN_32(q0, v1);
+                UMIN_32(q0, q0, q1);        // limit to 0 .. +31 values
+                NEG_32(q0, q0);
+                VDUPQ_32(q0, q0, 0);    // only the low 8bits will be used anyway
+                SSHLQ_32(v0, v2, q0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
 
         case 0xE4:
             INST_NAME("VPMULHUW Gx, Vx, Ex");
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index f3ea8f41..783b77c2 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -77,6 +77,16 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             if(!vex.l) YMM0(gd);
             break;
 
+        case 0x02:
+            INST_NAME("VPHADDD Gx, Vx, Ex");
+            nextop = F8;
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
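+                // ADDP pairwise-adds adjacent 32-bit elements of Vx then Ex,
+                // matching PHADDD within each 128-bit lane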
+                VADDPQ_32(v0, v2, v1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
         case 0x08:
             INST_NAME("VPSIGNB Gx, Vx, Ex");
             nextop = F8;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index 371fe25a..cdbe93f6 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -647,6 +647,35 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             }
             if(!vex.l) YMM0(gd);
             break;
+        case 0x4C:
+            INST_NAME("VPBLENDVB Gx, Vx, Ex, XMMImm8");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            u8 = geted_ib(dyn, addr, ninst, nextop)>>4;
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { 
+                    q1 = sse_get_reg(dyn, ninst, x1, u8, 0);
+                    GETGX_empty_VXEX(v0, v2, v1, 1); 
+                    F8;
+                } else { 
+                    v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, u8, (MODREG)?((nextop&7)+(rex.b<<3)):-1);
+                    if(MODREG)
+                        v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, vex.v, u8);
+                    else
+                        VLDR128_U12(v1, ed, fixedaddress+16);
+                    q1 = ymm_get_reg(dyn, ninst, x1, u8, 0, vex.v, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1);
+                    v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, u8, (MODREG)?((nextop&7)+(rex.b<<3)):-1);
+                }
+                VSSHRQ_8(q0, q1, 7);   // create mask
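+                // PBLENDVB takes the Ex byte where the mask MSB is set, the Vx byte
+                // otherwise; BIF or BIT is chosen so that whichever operand already
+                // sits in the destination register does not need an extra move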
+                if(v0==v1)
+                    VBIFQ(v0, v2, q0);
+                else {
+                    if(v0!=v2) VMOVQ(v0, v2);
+                    VBITQ(v0, v1, q0);
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
 
         default:
             DEFAULT;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
index 11aee1d2..8e2ed65c 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
@@ -39,7 +39,7 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
     int q0, q1, q2;
     int d0, d1, d2;
     int s0;
-    uint64_t tmp64u;
+    uint64_t tmp64u, u64;
     int64_t j64;
     int64_t fixedaddress;
     int unscaled;
@@ -375,6 +375,34 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             if(!vex.l) YMM0(gd);
             break;
+        case 0x70:
+            INST_NAME("VPSHUFHW Gx, Ex, Ib");
+            nextop = F8;
+            d0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETEX(v1, 0, 1); GETGX(v0, 1); u8 = F8; } else { GETGY(v0, 1, MODREG?((nextop&7)+(rex.b<<3)):-1, -1, -1); GETEY(v1); }
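+                // fast path: all four 2-bit selectors are identical, so one DUP of
+                // a single high-half 16-bit element is enough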
+                if(u8==0b00000000 || u8==0b01010101 || u8==0b10101010 || u8==0b11111111) {
+                    VDUP_16(d0, v1, 4+(u8&3));
+                } else {
+                    // only the high part needs to be shuffled. VTBL only handles 8-bit values, so the 16-bit shuffle is expanded into per-byte indices
+                    if(!l) {
+                        u64 = 0;
+                        for (int i=0; i<4; ++i) {
+                            u64 |= ((uint64_t)((u8>>(i*2))&3)*2+8)<<(i*16+0);
+                            u64 |= ((uint64_t)((u8>>(i*2))&3)*2+9)<<(i*16+8);
+                        }
+                        MOV64x(x2, u64);
+                    }
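+                    // the byte-pair indices all point into bytes 8..15 of v1 (its
+                    // high qword); TBL gathers the shuffled bytes into d0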
+                    VMOVQDfrom(d0, 0, x2);
+                    VTBL1_8(d0, v1, d0);
+                }
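+                // the shuffled qword lands in the high half of Gx; the low half of
+                // Ex passes through unchanged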
+                VMOVeD(v0, 1, d0, 0);
+                if(v0!=v1) {
+                    VMOVeD(v0, 0, v1, 0);
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
 
         case 0x7E:
             INST_NAME("MOVQ Gx, Ex");
@@ -420,10 +448,13 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             GETEXSS(v1, 0, 1);
             GETGX_empty_VX(v0, v2);
             u8 = F8;
-            if(((u8&15)==12)||((u8&15)==13)||((u8&15)==9)||((u8&15)==10))
-                FCMPS(v1, v2);
-            else
-                FCMPS(v2, v1);
+            if(((u8&15)!=11) && ((u8&15)!=15)) {
+                if(((u8&15)==12)||((u8&15)==13)||((u8&15)==9)||((u8&15)==10))
+                    FCMPS(v1, v2);
+                else
+                    FCMPS(v2, v1);
+            }
+            // TODO: create a test for this one, there might be an issue with cases 9, 10 and 13
             if(v0!=v2) VMOVQ(v0, v2);
             switch(u8&0xf) {
                 case 0x00: CSETMw(x2, cEQ); break;  // Equal
@@ -435,8 +466,8 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
                 case 0x06: CSETMw(x2, cHI); break;  // Greater or unordered
                 case 0x07: CSETMw(x2, cVC); break;  // not NaN
                 case 0x08: CSETMw(x2, cEQ); CSETMw(x3, cVS); ORRw_REG(x2, x2, x3); break;  // Equal or unordered
-                case 0x09: CSETMw(x2, cCS); break;  // Less than or ordered
-                case 0x0a: CSETMw(x2, cHI); break;  // Less or equal or ordered
+                case 0x09: CSETMw(x2, cCS); break;  // Less than or unordered
+                case 0x0a: CSETMw(x2, cHI); break;  // Less or equal or unordered
                 case 0x0b: MOV32w(x2, 0); break;    // false
                 case 0x0c: CSETMw(x2, cNE); CSETMw(x3, cVC); ANDw_REG(x2, x2, x3); break;  // Not Equal not unordered
                 case 0x0d: CSETMw(x2, cCC); break;  // Greater or equal not unordered
@@ -447,6 +478,25 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             YMM0(gd);
             break;
 
+        case 0xE6:
+            INST_NAME("VCVTDQ2PD Gx, Ex");
+            nextop = F8;
+            if(vex.l) {
+                GETEX_Y(v1, 0, 0);
+            } else {
+                GETEXSD(v1, 0, 0);
+            }
+            GETGX_empty(v0);
+            d0 = fpu_get_scratch(dyn, ninst);
+            if(vex.l) {
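+                // widen the upper pair first: if Gx aliases Ex, the SXTL_32 below
+                // would clobber the high int32s before SXTL2 could read them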
+                q0 = ymm_get_reg_empty(dyn, ninst, x1, gd, -1, -1, -1);
+                SXTL2_32(q0, v1);
+                SCVTQFD(q0, q0);
+            } else YMM0(gd);
+            SXTL_32(v0, v1);
+            SCVTQFD(v0, v0);
+            break;
+
         default:
             DEFAULT;
     }