about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author ptitSeb <sebastien.chev@gmail.com> 2024-06-02 09:15:37 +0200
committer ptitSeb <sebastien.chev@gmail.com> 2024-06-02 09:15:37 +0200
commit dc71840a5a1e065e5f98e3a230f6714ec8d946ed (patch)
tree 7779dd3f00d277b3254ac6b472d83ae346ba1f5b /src
parent 07b17cd0f63dfea2318cb19d08673265984eedfa (diff)
download box64-dc71840a5a1e065e5f98e3a230f6714ec8d946ed.tar.gz
box64-dc71840a5a1e065e5f98e3a230f6714ec8d946ed.zip
[ARM64_DYNAREC] Added AVX.66.0F3A 21 and fixed a bunch of issues
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_avx_0f.c 12
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c 8
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c 33
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.c 12
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.h 18
5 files changed, 62 insertions, 21 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
index 2acb719a..b8d21d9e 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -81,11 +81,11 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
             } else {
                 v0 = sse_get_reg_empty(dyn, ninst, x1, gd);
                 SMREAD();
-                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xffe<<4, 15, rex, NULL, 0, 0);
-                VLD128(v0, ed, fixedaddress);   // no alignment issue with ARMv8 NEON :)
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
+                VLDR128_U12(v0, ed, fixedaddress);   // no alignment issue with ARMv8 NEON :)
                 if(vex.l) {
                     v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, -1, -1, -1);
-                    VLD128(v0, ed, fixedaddress+16);
+                    VLDR128_U12(v0, ed, fixedaddress+16);
                 }
             }
             if(!vex.l) YMM0(gd);
@@ -105,11 +105,11 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
                     VMOVQ(v1, v0);
                 }
             } else {
-                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xffe<<4, 15, rex, NULL, 0, 0);
-                VST128(v0, ed, fixedaddress);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
+                VSTR128_U12(v0, ed, fixedaddress);
                 if(vex.l) {
                     v0 = ymm_get_reg(dyn, ninst, x1, gd, 0, ed, -1, -1);
-                    VST128(v0, ed, fixedaddress+16);
+                    VSTR128_U12(v0, ed, fixedaddress+16);
                 }
                 SMWRITE2();
             }
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index afebb155..be983b14 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -132,14 +132,14 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             VSSHRQ_32(q0, v2, 31);
             VBITQ(v1, v0, q0);
             if(!MODREG) {
-                VST128(v1, ed, fixedaddress);
+                VSTR128_U12(v1, ed, fixedaddress);
             }
             if(vex.l) {
                 GETGYVYEY(v0, v2, v1);
                 VSSHRQ_32(q0, v2, 31);
                 VBITQ(v1, v0, q0);
                 if(!MODREG)
-                    VST128(v1, ed, fixedaddress+16);
+                    VSTR128_U12(v1, ed, fixedaddress+16);
             }
             break;
         case 0x2F:
@@ -151,14 +151,14 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             VSSHRQ_64(q0, v2, 63);
             VBITQ(v1, v0, q0);
             if(!MODREG) {
-                VST128(v1, ed, fixedaddress);
+                VSTR128_U12(v1, ed, fixedaddress);
             }
             if(vex.l) {
                 GETGYVYEY(v0, v2, v1);
                 VSSHRQ_64(q0, v2, 63);
                 VBITQ(v1, v0, q0);
                 if(!MODREG) {
-                    VST128(v1, ed, fixedaddress+16);
+                    VSTR128_U12(v1, ed, fixedaddress+16);
                 }
             }
             break;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index 14b3c30e..901bcef0 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -223,6 +223,39 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             F8; // read u8, but it's been already handled
             break;
 
+        case 0x21:
+            INST_NAME("VINSERTPS Gx, Vx, Ex, Ib");
+            nextop = F8;
+            GETGX_empty_VX(v0, v2);
+            if (MODREG) {
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0);
+                u8 = F8;
+                if(v0==v1) {
+                    d0 = fpu_get_scratch(dyn, ninst);
+                    VMOVQ(d0, v1);
+                    if(v0!=v2) VMOVQ(v0, v2);
+                    VMOVeS(v0, (u8>>4)&3, d0, (u8>>6)&3);
+                } else {
+                    if(v0!=v2) VMOVQ(v0, v2);
+                    VMOVeS(v0, (u8>>4)&3, v1, (u8>>6)&3);
+                }
+            } else {
+                if(v0!=v2) VMOVQ(v0, v2);
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &wback, x1, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 1);
+                u8 = F8;
+                LDW(x2, wback, fixedaddress);
+                VMOVQSfrom(v0, (u8>>4)&3, x2);
+            }
+            uint8_t zmask = u8 & 0xf;
+            for (uint8_t i=0; i<4; i++) {
+                if (zmask & (1<<i)) {
+                    VMOVQSfrom(v0, i, wZR);
+                }
+            }
+            YMM0(gd);
+            break;
+
         case 0x44:
             INST_NAME("PCLMULQDQ Gx, Vx, Ex, Ib");
             nextop = F8;
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 908b757d..e054e99c 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -2112,7 +2112,7 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in
 
 static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
 {
-#if 1//STEP > 1
+#if STEP > 0
     int i2 = dyn->insts[ninst].x64.jmp_insts;
     if(i2<0)
         return;
@@ -2494,27 +2494,27 @@ void fpu_propagate_stack(dynarec_arm_t* dyn, int ninst)
 
 void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, int s1)
 {
-    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Purge YMM mask=%04x --------\n", dyn->insts[ninst].purge_ymm);
+    MESSAGE(LOG_NONE, "Purge YMM mask=%04x --------\n", dyn->insts[ninst].purge_ymm);
     int s1_set = 0;
     for(int i=0; i<16; ++i)
         if(dyn->insts[ninst].purge_ymm&(1<<i)) {
-            if(is_avx_zero(dyn, ninst, i)) {
+            if(is_avx_zero_unset(dyn, ninst, i)) {
                 if(!s1_set) {
                     ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0]));
                     s1_set = 1;
                 }
                 STPx_S7_offset(xZR, xZR, s1, i*16);
-                avx_unmark_zero(dyn, ninst, i);
             }
-            int reg = -1;
             for(int j=0; j<32; ++j)
                 if(dyn->n.neoncache[j].t==NEON_CACHE_YMMR && dyn->n.neoncache[j].n==i) {
                     // just forget the reg....
                     dyn->n.neoncache[j].v = 0;
+                    j=32;
                 } else if(dyn->n.neoncache[j].t==NEON_CACHE_YMMW && dyn->n.neoncache[j].n==i) {
                     VSTR128_U12(j, xEmu, offsetof(x64emu_t, ymm[i]));
                     dyn->n.neoncache[j].v = 0;
+                    j=32;
                 }
         }
-    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "---------- Purge YMM\n");
+    MESSAGE(LOG_NONE, "---------- Purge YMM\n");
 }
\ No newline at end of file
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 30f967a7..0252a052 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -517,7 +517,7 @@
     if(MODREG)                                                                                  \
         ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, vex.v, -1);              \
     else                                                                                        \
-        VLD128(ey, ed, fixedaddress+16);                                                        \
+        VLDR128_U12(ey, ed, fixedaddress+16);                                                   \
     gy = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1)
 
 // Get EY and non-writen VY and GY
@@ -526,9 +526,16 @@
     if(MODREG)                                                                                  \
         ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1, gd, vex.v, -1);              \
     else                                                                                        \
-        VLD128(ey, ed, fixedaddress+16);                                                        \
+        VLDR128_U12(ey, ed, fixedaddress+16);                                                   \
     gy = ymm_get_reg(dyn, ninst, x1, gd, 0, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1)
 
+// Get empty EY and non-writen VY and GY
+#define GETGYVYEY_empty(gy, vy, ey)                                                             \
+    vy = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);    \
+    gy = ymm_get_reg(dyn, ninst, x1, gd, 0, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);    \
+    if(MODREG)                                                                                  \
+        ey = ymm_get_reg_empty(dyn, ninst, x1, (nextop&7)+(rex.b<<3), gd, vex.v, -1)
+
 // Get EY and non-writen GY
 #define GETGYEY(gy, ey)                                                                         \
     if(MODREG)                                                                                  \
@@ -561,9 +568,9 @@
 // Get empty VY, and non-writen EY
 #define GETVY_empty_EY(vy, ey)                                                      \
     if(MODREG)                                                                      \
-        ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, vex.v, -1, -1);     \
+        ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, vex.v, -1, -1);  \
     else                                                                            \
-        VLD128(ey, ed, fixedaddress+16);                                            \
+        VLDR128_U12(ey, ed, fixedaddress+16);                                       \
     vy = ymm_get_reg_empty(dyn, ninst, x1, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1, -1)
 
 // Get EX as a quad, (x3 is used)
@@ -575,7 +582,7 @@
         addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, D);  \
         unscaled = 0;                                                                                   \
         a = fpu_get_scratch(dyn, ninst);                                                                \
-        VLD128(a, ed, fixedaddress);                                                                    \
+        VLDR128_U12(a, ed, fixedaddress);                                                               \
     }
 // Get EX as a quad, (x3 is used)
 #define GETEX_empty_Y(a, D)                                                                             \
@@ -583,6 +590,7 @@
         a = sse_get_reg_empty(dyn, ninst, x3, (nextop&7)+(rex.b<<3));                                   \
     } else {                                                                                            \
         WILLWRITE2();                                                                                   \
+        a = fpu_get_scratch(dyn, ninst);                                                                \
         addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, D);  \
         unscaled = 0;                                                                                   \
     }