about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author    ptitSeb <sebastien.chev@gmail.com>    2025-01-31 16:50:51 +0100
committer ptitSeb <sebastien.chev@gmail.com>    2025-01-31 16:50:51 +0100
commit    2239f92816ab68ea99e8a756af1d01360644eb6d (patch)
tree      417360752c3d374f2540cdb077c8d90688b711e1 /src
parent    637e177408d5cd7f04e52fb79b6183bee3088988 (diff)
download  box64-2239f92816ab68ea99e8a756af1d01360644eb6d.tar.gz
          box64-2239f92816ab68ea99e8a756af1d01360644eb6d.zip
[ARM64_DYNAREC] Various small fixes and optims in a few AVX opcodes
Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_0f.c    | 22
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_0f38.c  | 72
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f.c |  2
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c |  7
4 files changed, 69 insertions, 34 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
index 9e9e6a76..387faf1b 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -643,13 +643,13 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
                                 4 -> 5  // Inexact
                                 5 -> 1  // denormal
                             */
-                            // doing X86 -> ARM here, 0 1 2 3 4 5 -> 0 5 1 2 3 4
+                            // doing X86 -> ARM here, 0 1 2 3 4 5 -> 0 2 3 4 5 1
                             if(ed!=x1)
-                                MOVw_REG(x1, ed);
-                            BFXILw(x2, x1, 1, 5);   // x2 = 1 2 3 4 5 ...
-                            BFIw(x1, x2, 2, 4); // x1 = 0 1 1 2 3 4
-                            RORw(x2, x2, 4);    // x2 = 5 .... 1 2 3 4
-                            BFIw(x1, x2, 1, 1); // x1 = 0 5 1 2 3 4
+                                MOVw_REG(x1, ed);   // x1 = 543210
+                            RORw(x3, x1, 2);    // x3 = 10.....5432
+                            BFIw(x1, x3, 1, 4); // x1 = 54320
+                            RORw(x3, x3, 32-1); // x3 = 0.....54321
+                            BFIw(x1, x3, 5, 1); // x1 = 154320
                             MRS_fpsr(x2);
                             BFIx(x2, x1, 0, 6);
                             MSR_fpsr(x2);
@@ -661,12 +661,12 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
                         LDRw_U12(x4, xEmu, offsetof(x64emu_t, mxcsr));
                         if(BOX64ENV(sse_flushto0)) {
                             // sync with fpsr, with mask from mxcsr
-                            // doing ARM -> X86 here, 0 1 2 3 4 5 -> 0 2 3 4 5 1
                             MRS_fpsr(x1);
-                            RORw(x3, x1, 2);    //x3 = 2 3 4 5 .... 0 1
-                            BFIw(x1, x3, 1, 4);
-                            RORw(x3, x3, 32-1);
-                            BFIw(x1, x3, 5, 1); // x1 is Flags
+                            // doing ARM -> X86 here,  543210 => 432150
+                            UBFXw(x2, x1, 1, 5);   // x2 = 54321
+                            BFIw(x1, x2, 2, 4); // x1 = 432110
+                            LSRw(x2, x2, 4);    // x2 = 5
+                            BFIw(x1, x2, 1, 1); // x1 = 432150
                             //BFXILw(x3, x4, 7, 6); // this would be the mask, but let's ignore that for now
                             BFIw(x4, x1, 0, 6); // inject back the flags
                         }
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_0f38.c
index da34d5f7..bd0a1baf 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f38.c
@@ -39,6 +39,7 @@ uintptr_t dynarec64_AVX_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, i
     int q0, q1, q2;
     int d0, d1, d2;
     int s0;
+    int need_tst;
     uint64_t tmp64u;
     int64_t j64;
     int64_t fixedaddress;
@@ -68,21 +69,32 @@ uintptr_t dynarec64_AVX_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, i
             GETGD;
             GETED(0);
             GETVD;
-            IFX(X_ZF)
+            need_tst = 0;
+            IFX(X_ZF) need_tst = 1;
+            IFXNATIVE(X_SF, NF_SF) need_tst = 1;
+            IFXNATIVE(X_OF, NF_VF) need_tst = 1;
+            IFXNATIVE(X_CF, NF_CF) need_tst = 1;
+            if(need_tst)
                 BICSxw(gd, ed, vd);
             else
                 BICxw(gd, ed, vd);
             IFX(X_ZF) {
-                CSETw(x1, cEQ);
-                BFIw(xFlags, x1, F_ZF, 1);
+                IFNATIVE(NF_EQ) {} else {
+                    CSETw(x1, cEQ);
+                    BFIw(xFlags, x1, F_ZF, 1);
+                }
+            }
+            IFX(X_OF) {
+                IFNATIVE(NF_VF) {} else {BFCw(xFlags, F_OF, 1);}
+            }
+            IFX(X_CF) {
+                IFNATIVE(NF_CF) {} else {BFCw(xFlags, F_CF, 1);}
             }
-            IFX(X_OF)
-                BFCw(xFlags, F_OF, 1);
-            IFX(X_CF)
-                BFCw(xFlags, F_CF, 1);
             IFX(X_SF) {
-                LSRxw_IMM(x1, gd, rex.w?63:31);
-                BFIw(xFlags, x1, F_SF, 1);
+                IFNATIVE(NF_SF) {} else {
+                    LSRxw_IMM(x1, gd, rex.w?63:31);
+                    BFIw(xFlags, x1, F_SF, 1);
+                }   
             }
             break;
         case 0xF3:
@@ -99,19 +111,27 @@ uintptr_t dynarec64_AVX_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, i
                         BFIw(xFlags, x3, F_CF, 1);
                     }
                     SUBxw_U12(x3, ed, 1);
-                    IFX(X_ZF)
+                    need_tst = 0;
+                    IFX(X_ZF) need_tst = 1;
+                    IFXNATIVE(X_SF, NF_SF) need_tst = 1;
+                    IFXNATIVE(X_OF, NF_VF) need_tst = 1;
+                    if(need_tst)
                         ANDSxw_REG(vd, ed, x3);
                     else
                         ANDxw_REG(vd, ed, x3);
                     IFX(X_ZF) {
-                        CSETMw(x3, cEQ);
-                        BFIw(xFlags, x3, F_ZF, 1);
+                        IFNATIVE(NF_EQ) {} else {
+                            CSETMw(x3, cEQ);
+                            BFIw(xFlags, x3, F_ZF, 1);
+                        }
                     }
                     IFX(X_SF) {
-                        LSRxw(x3, vd, rex.w?63:31);
-                        BFIw(xFlags, x3, F_SF, 1);
+                        IFNATIVE(NF_SF) {} else {
+                            LSRxw(x3, vd, rex.w?63:31);
+                            BFIw(xFlags, x3, F_SF, 1);
+                        }
                     }
-                    IFX(X_OF) BFCw(xFlags, F_OF, 1);
+                    IFX(X_OF) IFNATIVE(NF_VF) {} else {BFCw(xFlags, F_OF, 1);}
                     if (BOX64ENV(dynarec_test)) {
                         IFX(X_AF) BFCw(xFlags, F_AF, 1);
                         IFX(X_PF) BFCw(xFlags, F_PF, 1);
@@ -139,19 +159,29 @@ uintptr_t dynarec64_AVX_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, i
             B_MARK(cPL);
             LSLxw_REG(x2, x2, x1);
             MARK;
-            IFX(X_ZF) {
+            need_tst = 0;
+            IFX(X_ZF) need_tst = 1;
+            IFXNATIVE(X_SF, NF_SF) need_tst = 1;
+            IFXNATIVE(X_OF, NF_VF) need_tst = 1;
+            if(need_tst) {
                 BICSxw(gd, ed, x2);
-                CSETw(x3, cEQ);
-                BFIw(xFlags, x3, F_ZF, 1);
             } else
                 BICxw(gd, ed, x2);
+            IFX(X_ZF) {
+                IFNATIVE(NF_EQ) {} else {
+                    CSETw(x3, cEQ);
+                    BFIw(xFlags, x3, F_ZF, 1);
+                }
+            }
             IFX(X_SF) {
-                LSRxw(x3, gd, rex.w?63:31);
-                BFIw(xFlags, x3, F_SF, 1);
+                IFNATIVE(NF_SF) {} else {
+                    LSRxw(x3, gd, rex.w?63:31);
+                    BFIw(xFlags, x3, F_SF, 1);
+                }
             }
             IFX(X_AF) BFCw(xFlags, F_AF, 1);
             IFX(X_PF) BFCw(xFlags, F_PF, 1);
-            IFX(X_OF) BFCw(xFlags, F_OF, 1);
+            IFX(X_OF) IFNATIVE(NF_VF) {} else {BFCw(xFlags, F_OF, 1);}
             break;
 
         default:
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
index 81169c55..642cf169 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
@@ -454,7 +454,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
                             FRINTIS(d0, d0);
                             VFCVTZSs(d0, d0);
                             MRS_fpsr(x5);   // get back FPSR to check the IOC bit
-                            TSTw_mask(x5, 0, 0);    // mask=1
+                            TSTw_mask(x5, 0, 0);    // mask=(1<<IOC)
                             FCSELS(d0, d0, d1, cEQ);
                             VMOVeS(v0, i, d0, 0);
                         }
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
index 35e30357..afc1ed55 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
@@ -258,6 +258,7 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
                 VMOVQ(v0, v2);
             }
             VMOVeD(v0, 0, q2, 0);
+            YMM0(gd);
             break;
         case 0x5A:
             INST_NAME("VCVTSD2SS Gx, Vx, Ex");
@@ -461,7 +462,11 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             for(int l=0; l<1+vex.l; ++l) {
                 if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
                 if(v0==v1) {
-                    VFMLAQS(v0, v2, q0);
+                    //TODO: find a better way
+                    if(!l) q1 = fpu_get_scratch(dyn, ninst);
+                    VMOVQ(q1, v2);
+                    VFMLAQS(q1, v1, q0);
+                    VMOVQ(v0, q1);
                 } else {
                     if(v0!=v2) VMOVQ(v0, v2);
                     VFMLAQS(v0, v1, q0);