author     ptitSeb <sebastien.chev@gmail.com>    2024-06-05 19:44:02 +0200
committer  ptitSeb <sebastien.chev@gmail.com>    2024-06-05 19:44:02 +0200
commit     b568cc529e9b10b6b42b2139351b4b3cb0858a28 (patch)
tree       437dada7817da906f7a89cf7f503d029e9d20f88
parent     dc8e24c7b785874eb6d7cca0df75f0fe6b597ebb (diff)
download   box64-b568cc529e9b10b6b42b2139351b4b3cb0858a28.tar.gz
           box64-b568cc529e9b10b6b42b2139351b4b3cb0858a28.zip
[ARM64_DYNAREC] Added a bunch of AVX opcodes and some fixes too
-rw-r--r--  src/dynarec/arm64/arm64_emitter.h              |   2
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_660f.c         |   2
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_0f.c       |  55
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f.c    | 164
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c  |  31
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c  |   8
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c    |  78
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c    |  50
8 files changed, 301 insertions, 89 deletions
diff --git a/src/dynarec/arm64/arm64_emitter.h b/src/dynarec/arm64/arm64_emitter.h
index b8c71b6b..c7bb614d 100644
--- a/src/dynarec/arm64/arm64_emitter.h
+++ b/src/dynarec/arm64/arm64_emitter.h
@@ -1862,9 +1862,11 @@ int convert_bitmask(uint64_t bitmask);
 #define MOVIQ_8(Rd, imm8)           EMIT(MOVI_vector(1, 0, (((imm8)>>5)&0b111), 0b1110, ((imm8)&0b11111), Rd))
 #define MOVIQ_16(Rd, imm8, lsl8)    EMIT(MOVI_vector(1, 0, (((imm8)>>5)&0b111), 0b1000|((lsl8)?0b10:0), ((imm8)&0b11111), Rd))
 #define MOVIQ_32(Rd, imm8)          EMIT(MOVI_vector(1, 0, (((imm8)>>5)&0b111), 0b0000, ((imm8)&0b11111), Rd))
+#define MOVIQ_32_lsl(Rd, imm8, lsl8) EMIT(MOVI_vector(1, 0, (((imm8)>>5)&0b111), (lsl8<<1), ((imm8)&0b11111), Rd))
 #define MOVIQ_64(Rd, imm8)          EMIT(MOVI_vector(1, 1, (((imm8)>>5)&0b111), 0b1110, ((imm8)&0b11111), Rd))
 #define MOVI_8(Rd, imm8)            EMIT(MOVI_vector(0, 0, (((imm8)>>5)&0b111), 0b1110, ((imm8)&0b11111), Rd))
 #define MOVI_16(Rd, imm8, lsl8)     EMIT(MOVI_vector(0, 0, (((imm8)>>5)&0b111), 0b1000|((lsl8)?0b10:0), ((imm8)&0b11111), Rd))
+#define MOVI_32_lsl(Rd, imm8, lsl8) EMIT(MOVI_vector(0, 0, (((imm8)>>5)&0b111), (lsl8<<1), ((imm8)&0b11111), Rd))
 #define MOVI_32(Rd, imm8)           EMIT(MOVI_vector(0, 0, (((imm8)>>5)&0b111), 0b0000, ((imm8)&0b11111), Rd))
 #define MOVI_64(Rd, imm8)           EMIT(MOVI_vector(0, 1, (((imm8)>>5)&0b111), 0b1110, ((imm8)&0b11111), Rd))
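
For reference, the constant the new MOVI_32_lsl macro materialises per 32-bit lane, as a scalar sketch (assuming lsl8 is the byte-shift count 0..3 encoded into MOVI's cmode field, which matches the (lsl8<<1) encoding above):

#include <stdint.h>

/* MOVI_32_lsl(Rd, imm8, lsl8) broadcasts imm8 << (8*lsl8) into every 32-bit
 * lane of Rd; e.g. MOVI_32_lsl(d1, 0x80, 3) used later in this patch
 * yields 0x80000000 per lane. */
static uint32_t movi_32_lsl_value(uint8_t imm8, unsigned lsl8)
{
    return (uint32_t)imm8 << (8u * lsl8);
}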
 
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index b6450d96..caeaa465 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -1742,8 +1742,6 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 VFCVTZSQS(v0, v0);
             } else {
                 MRS_fpsr(x5);
-                BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                MSR_fpsr(x5);
                 u8 = sse_setround(dyn, ninst, x1, x2, x3);
                 MOV32w(x4, 0x80000000);
                 d0 = fpu_get_scratch(dyn, ninst);
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
index 3b7d453c..268e1f86 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -326,8 +326,11 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             SKIPTEST(x1);
             v0 = fpu_get_scratch(dyn, ninst);
+            VFMOVSQ_8(v0, 0b01110000);    //1.0f
             for(int l=0; l<1+vex.l; ++l) {
                 if(!l) { GETGX_empty_EX(q0, q1, 0); } else { GETGY_empty_EY(q0, q1); }
+                #if 0
+                // the approximation doesn't work on Death Stranding; code around 0x1419c9100 fails...
                 if(!l) {
                     if(q1==q0)
                         v1 = fpu_get_scratch(dyn, ninst);
@@ -339,6 +342,10 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
                 VFMULQS(v1, v0, q1);
                 VFRSQRTSQS(v1, v1, v0);
                 VFMULQS(q0, v1, v0);
+                #else
+                VFSQRTQS(q0, q1);
+                VFDIVQS(q0, v0, q0);
+                #endif
             }
             if(!vex.l) YMM0(gd);
             break;
@@ -347,18 +354,25 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             SKIPTEST(x1);
             q0 = fpu_get_scratch(dyn, ninst);
+            VFMOVSQ_8(q0, 0b01110000);    //1.0f
             for(int l=0; l<1+vex.l; ++l) {
                 if(!l) {
                     GETGX_empty_EX(v0, v1, 0);
+                    #if 0
                     if(v0==v1)
                         q1 = fpu_get_scratch(dyn, ninst);
+                    #endif
                 } else {
                     GETGY_empty_EY(v0, v1);
                 }
+                #if 0
                 if(v0!=v1) q1 = v0;
                 VFRECPEQS(q0, v1);
                 VFRECPSQS(q1, q0, v1);
                 VFMULQS(v0, q0, q1);
+                #else
+                VFDIVQS(v0, q0, v1);
+                #endif
             }
             if(!vex.l) YMM0(gd);
             break;
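
Both cases above drop the NEON estimate-plus-refinement sequences (now in #if 0 blocks) in favour of exact computations; a scalar sketch of the two strategies, illustrative only, using the FRSQRTS step formula (3 - a*b)/2:

#include <math.h>

/* Disabled fast path: a reciprocal-square-root estimate refined by one
 * Newton-Raphson step, which is what the VFMULQS/VFRSQRTSQS/VFMULQS chain
 * in the #if 0 block computes. */
static float vrsqrtps_estimate(float x, float est /* rough 1/sqrt(x) */)
{
    return est * ((3.0f - (x * est) * est) * 0.5f);
}

/* Exact path now emitted instead: VFSQRTQS then VFDIVQS against 1.0f for
 * VRSQRTPS, and a plain VFDIVQS for VRCPPS. */
static float vrsqrtps_exact(float x) { return 1.0f / sqrtf(x); }
static float vrcpps_exact(float x)   { return 1.0f / x; }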
@@ -550,27 +564,28 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
             q0 = fpu_get_scratch(dyn, ninst);
             for(int l=0; l<1+vex.l; ++l) {
                 if(!l) { GETGX_empty_VXEX(v0, v2, v1, 1); u8 = F8; } else { GETGY_empty_VYEY(v0, v2, v1); }
-                switch(u8&7) {
+                if(((u8&15)==3) || ((u8&15)==7) || ((u8&15)==8) || ((u8&15)==9) || ((u8&15)==10) || ((u8&15)==12) || ((u8&15)==13) || ((u8&15)==14)) {
+                    VFMAXQS(q0, v2, v1);    // propagate NAN
+                    VFCMEQQS(((u8&15)==7)?v0:q0, q0, q0);   // 0 if NAN, 1 if not NAN
+                }
+                switch(u8&0xf) {
                     // the inversion of the params in the comparison is there to handle NaN the same way SSE does
-                    case 0: VFCMEQQS(v0, v2, v1); break;   // Equal
-                    case 1: VFCMGTQS(v0, v1, v2); break;   // Less than
-                    case 2: VFCMGEQS(v0, v1, v2); break;   // Less or equal
-                    case 3: VFCMEQQS(v0, (v0==v1)?v1:v2, (v0==v1)?v1:v2);
-                            if(v2!=v1) {
-                                VFCMEQQS(q0, (v0==v1)?v2:v1, (v0==v1)?v2:v1);
-                                VANDQ(v0, v0, q0);
-                            }
-                            VMVNQ(v0, v0);
-                            break;   // NaN (NaN is not equal to himself)
-                    case 4: VFCMEQQS(v0, v2, v1); VMVNQ(v0, v0); break;   // Not Equal (or unordered on ARM, not on X86...)
-                    case 5: VFCMGTQS(v0, v1, v2); VMVNQ(v0, v0); break;   // Greater or equal or unordered
-                    case 6: VFCMGEQS(v0, v1, v2); VMVNQ(v0, v0); break;   // Greater or unordered
-                    case 7: VFCMEQQS(v0, (v0==v1)?v1:v2, (v0==v1)?v1:v2);
-                            if(v2!=v1) {
-                                VFCMEQQS(q0, (v0==v1)?v2:v1, (v0==v1)?v2:v1);
-                                VANDQ(v0, v0, q0);
-                            }
-                            break;   // not NaN
+                    case 0x00: VFCMEQQS(v0, v2, v1); break;   // Equal, not unordered
+                    case 0x01: VFCMGTQS(v0, v1, v2); break;   // Less than
+                    case 0x02: VFCMGEQS(v0, v1, v2); break;   // Less or equal
+                    case 0x03: VMVNQ(v0, q0); break;   // unordered
+                    case 0x04: VFCMEQQS(v0, v2, v1); VMVNQ(v0, v0); break;   // Not Equal (or unordered on ARM, not on X86...)
+                    case 0x05: VFCMGTQS(v0, v1, v2); VMVNQ(v0, v0); break;   // Greater or equal or unordered
+                    case 0x06: VFCMGEQS(v0, v1, v2); VMVNQ(v0, v0); break;   // Greater or unordered
+                    case 0x07: break;  // ordered
+                    case 0x08: VFCMEQQS(v0, v2, v1); VORNQ(v0, v0, q0); break;   // Equal, or unordered
+                    case 0x09: VFCMGTQS(v0, v1, v2); VORNQ(v0, v0, q0); break;   // Less than or unordered
+                    case 0x0a: VFCMGEQS(v0, v1, v2); VORNQ(v0, v0, q0); break;   // Less or equal or unordered
+                    case 0x0b: VEORQ(v0, v0, v0); break; // false
+                    case 0x0c: VFCMEQQS(v0, v2, v1); VBICQ(v0, q0, v0); break;
+                    case 0x0d: VFCMGEQS(v0, v2, v1); break;
+                    case 0x0e: VFCMGTQS(v0, v2, v1); break;
+                    case 0x0f: MOVIQ_64(v0, 0xff); break; //true
                 }
             }
             if(!vex.l) YMM0(gd);
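
A scalar view of the NaN handling the reworked predicate table relies on (illustrative; the emitted code operates on whole NEON registers):

#include <stdint.h>

/* VFMAXQS propagates a NaN from either input (unlike C's fmaxf), and
 * VFCMEQQS of the result with itself is then zero exactly for NaN lanes,
 * which gives an "ordered" mask.  Scalar equivalent of that mask: */
static uint32_t cmpps_ordered(float a, float b)
{
    return (a == a && b == b) ? 0xFFFFFFFFu : 0u;   /* NaN != NaN */
}

/* Predicate 0x03 (unordered) is just the inverse of that mask, and
 * predicates 0x08..0x0a OR the inverse into the basic compare result
 * (the VORNQ above). */
static uint32_t cmpps_unordered(float a, float b)
{
    return ~cmpps_ordered(a, b);
}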
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
index 12dc5144..b7a3f80a 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
@@ -319,22 +319,50 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x58:
             INST_NAME("VADDPD Gx, Vx, Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
-            VFADDQD(v0, v2, v1);
-            if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+            if(!box64_dynarec_fastnan) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                q1 = fpu_get_scratch(dyn, ninst);
+            }
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(!box64_dynarec_fastnan) {
+                    // check if any input value was NAN
+                    VFMAXQD(q0, v2, v1);    // propagate NAN
+                    VFCMEQQD(q0, q0, q0);    // 0 if NAN, 1 if not NAN
+                }
                 VFADDQD(v0, v2, v1);
-            } else YMM0(gd)
+                if(!box64_dynarec_fastnan) {
+                    VFCMEQQD(q1, v0, v0);    // 0 => out is NAN
+                    VBICQ(q1, q0, q1);      // forget it if any input was already a NaN
+                    VSHLQ_64(q1, q1, 63);   // only keep the sign bit
+                    VORRQ(v0, v0, q1);      // NAN -> -NAN
+                }
+            }
+            if(!vex.l) YMM0(gd)
             break;
         case 0x59:
             INST_NAME("VMULPD Gx, Vx, Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
-            VFMULQD(v0, v2, v1);
-            if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+            if(!box64_dynarec_fastnan) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                q1 = fpu_get_scratch(dyn, ninst);
+            }
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(!box64_dynarec_fastnan) {
+                    // check if any input value was NAN
+                    VFMAXQD(q0, v2, v1);    // propagate NAN
+                    VFCMEQQD(q0, q0, q0);    // 0 if NAN, 1 if not NAN
+                }
                 VFMULQD(v0, v2, v1);
-            } else YMM0(gd)
+                if(!box64_dynarec_fastnan) {
+                    VFCMEQQD(q1, v0, v0);    // 0 => out is NAN
+                    VBICQ(q1, q0, q1);      // forget it if any input was already a NaN
+                    VSHLQ_64(q1, q1, 63);   // only keep the sign bit
+                    VORRQ(v0, v0, q1);      // NAN -> -NAN
+                }
+            }
+            if(!vex.l) YMM0(gd)
             break;
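
What the !box64_dynarec_fastnan fixup in the two cases above emulates, per lane, as a scalar sketch (assuming the usual convention that x86 SSE produces the negative QNaN "indefinite" while AArch64 produces the default positive NaN):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* If the operation generated a NaN from non-NaN inputs, force the sign bit
 * so the result matches x86's -NaN; NaNs that were merely propagated from
 * the inputs are left alone (the VBICQ step above). */
static double fixup_generated_nan(double a, double b, double result)
{
    if (isnan(result) && !isnan(a) && !isnan(b)) {
        uint64_t bits;
        memcpy(&bits, &result, sizeof(bits));
        bits |= 1ull << 63;                  /* NAN -> -NAN, the VORRQ step */
        memcpy(&result, &bits, sizeof(bits));
    }
    return result;
}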
         case 0x5A:
             INST_NAME("VCVTPD2PS Gx, Ex");
@@ -362,59 +390,39 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x5B:
             INST_NAME("VCVTPS2DQ Gx, Ex");
             nextop = F8;
-            GETEX(v1, 0, 0);
-            GETGX_empty(v0);
-            if(box64_dynarec_fastround) {
-                u8 = sse_setround(dyn, ninst, x1, x2, x3);
-                VFRINTISQ(v0, v1);
-                if(!vex.l) x87_restoreround(dyn, ninst, u8);
-                VFCVTZSQS(v0, v0);
-            } else {
-                MRS_fpsr(x5);
-                BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                MSR_fpsr(x5);
-                u8 = sse_setround(dyn, ninst, x1, x2, x3);
-                MOV32w(x4, 0x80000000);
+            u8 = sse_setround(dyn, ninst, x1, x2, x6);
+            if(!box64_dynarec_fastround && !arm64_frintts) {
                 d0 = fpu_get_scratch(dyn, ninst);
-                for(int i=0; i<4; ++i) {
-                    BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                    MSR_fpsr(x5);
-                    VMOVeS(d0, 0, v1, i);
-                    FRINTIS(d0, d0);
-                    VFCVTZSs(d0, d0);
-                    MRS_fpsr(x5);   // get back FPSR to check the IOC bit
-                    TBZ(x5, FPSR_IOC, 4+4);
-                    VMOVQSfrom(d0, 0, x4);
-                    VMOVeS(v0, i, d0, 0);
-                }
-                if(!vex.l) x87_restoreround(dyn, ninst, u8);
+                d1 = fpu_get_scratch(dyn, ninst);
+                MOVI_32_lsl(d1, 0x80, 3);
             }
-            if(vex.l) {
-                GETGY_empty_EY(v0, v1);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETEX(v1, 0, 0); GETGX_empty(v0); } else { GETGY_empty_EY(v0, v1); }
                 if(box64_dynarec_fastround) {
                     VFRINTISQ(v0, v1);
-                    x87_restoreround(dyn, ninst, u8);
                     VFCVTZSQS(v0, v0);
                 } else {
-                    MRS_fpsr(x5);
-                    BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                    MSR_fpsr(x5);
-                    MOV32w(x4, 0x80000000);
-                    d0 = fpu_get_scratch(dyn, ninst);
-                    for(int i=0; i<4; ++i) {
-                        BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
-                        MSR_fpsr(x5);
-                        VMOVeS(d0, 0, v1, i);
-                        FRINTIS(d0, d0);
-                        VFCVTZSs(d0, d0);
-                        MRS_fpsr(x5);   // get back FPSR to check the IOC bit
-                        TBZ(x5, FPSR_IOC, 4+4);
-                        VMOVQSfrom(d0, 0, x4);
-                        VMOVeS(v0, i, d0, 0);
+                    if(arm64_frintts) {
+                         VFRINT32XSQ(v0, v1);
+                         VFCVTZSQS(v0, v0);
+                    } else {
+                        if(!l) MRS_fpsr(x5);
+                        for(int i=0; i<4; ++i) {
+                            BFCx(x5, FPSR_IOC, 1);  // reset IOC bit
+                            MSR_fpsr(x5);
+                            VMOVeS(d0, 0, v1, i);
+                            FRINTIS(d0, d0);
+                            VFCVTZSs(d0, d0);
+                            MRS_fpsr(x5);   // get back FPSR to check the IOC bit
+                            TSTw_mask(x5, 0, 0);    // mask=1
+                            FCSELS(d0, d0, d1, cEQ);
+                            VMOVeS(v0, i, d0, 0);
+                        }
                     }
-                    x87_restoreround(dyn, ninst, u8);
                 }
-            } else YMM0(gd);
+            }
+            x87_restoreround(dyn, ninst, u8);
+            if(!vex.l) YMM0(gd);
             break;
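
Per-lane behaviour the non-fastround path reproduces, as a scalar sketch (hypothetical helper; on CPUs with FEAT_FRINTTS the clamping to 0x80000000 is done by FRINT32X itself):

#include <math.h>
#include <stdint.h>

/* x86 CVTPS2DQ returns the "integer indefinite" 0x80000000 when the rounded
 * value is NaN or does not fit in int32; the FPSR.IOC check (or FRINT32X)
 * detects exactly that case. */
static int32_t cvtps2dq_lane(float x)
{
    float r = rintf(x);                       /* current rounding mode */
    if (isnan(r) || r < -2147483648.0f || r >= 2147483648.0f)
        return INT32_MIN;                     /* 0x80000000 */
    return (int32_t)r;
}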
         case 0x5C:
             INST_NAME("VSUBPD Gx, Vx, Ex");
@@ -1552,7 +1560,51 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             if(!vex.l) YMM0(gd);
             break;
-
+        case 0xF1:
+            INST_NAME("VPSLLW Gx, Vx, Ex");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            q1 = fpu_get_scratch(dyn, ninst);
+            MOVI_32(q1, 16);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                UQXTN_32(q0, v1);
+                UMIN_32(q0, q0, q1);    // limit to 0 .. +16 values
+                VDUPQ_16(q0, q0, 0);    // only the low 8bits will be used anyway
+                USHLQ_16(v0, v2, q0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+        case 0xF2:
+            INST_NAME("VPSLLD Gx, Vx, Ex");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            q1 = fpu_get_scratch(dyn, ninst);
+            MOVI_32(q1, 32);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                UQXTN_32(q0, v1);
+                UMIN_32(q0, q0, q1);    // limit to 0 .. +32 values
+                VDUPQ_32(q0, q0, 0);    // only the low 8bits will be used anyway
+                USHLQ_32(v0, v2, q0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+        case 0xF3:
+            INST_NAME("VPSLLQ Gx, Vx, Ex");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            q1 = fpu_get_scratch(dyn, ninst);
+            MOVI_32(q1, 64);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                UQXTN_32(q0, v1);
+                UMIN_32(q0, q0, q1);    // limit to 0 .. +64 values
+                VDUPQ_64(q0, q0, 0);    // only the low 8bits will be used anyway
+                USHLQ_64(v0, v2, q0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
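
The three shift cases above share one pattern; a per-lane sketch of the 16-bit variant (illustrative; the D and Q variants only change the limit to 32 or 64):

#include <stdint.h>

/* x86 PSLLW takes the full 64-bit count from Ex and zeroes the lane once the
 * count reaches 16; NEON USHL reads a per-lane count (low byte), so the count
 * is first saturate-narrowed (UQXTN), clamped (UMIN) and broadcast (VDUPQ). */
static uint16_t vpsllw_lane(uint16_t v, uint64_t count)
{
    unsigned c = (count > 16) ? 16u : (unsigned)count;
    return (c >= 16) ? 0 : (uint16_t)(v << c);   /* shift by >=16 -> 0 */
}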
         case 0xF4:
             INST_NAME("VPMULUDQ Gx, Vx, Ex");
             nextop = F8;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index d358597d..f3ea8f41 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -123,6 +123,31 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             if(!vex.l) YMM0(gd);
             break;
 
+        case 0x0C:
+            INST_NAME("VPERMILPS Gx, Vx, Ex");
+            nextop = F8;
+            q1 = fpu_get_scratch(dyn, ninst);
+            q0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                // transform u32 index in V1 to 4 u8 index in q0 for VTBL
+                MOVIQ_32(q0, 3); // index and 3
+                VANDQ(q0, v1, q0);
+                SQXTN_16(q0, q0);   // index in 16bits
+                VSHL_16(q0, q0, 1); // double the index
+                VZIP1Q_16(q0, q0, q0);   // repeat the index by pair
+                MOVIQ_32_lsl(q1, 1, 2);    // q1 as 16bits is 0 / 1
+                VADDQ_16(q0, q0, q1);
+                SQXTN_8(q0, q0);   // index in 8bits
+                VSHL_8(q0, q0, 1); // double the index
+                VZIP1Q_8(q0, q0, q0);   // repeat the index by pair
+                MOVIQ_16(q1, 1, 1);
+                VADDQ_8(q0, q0, q1);
+                VTBLQ1_8(v0, v2, q0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
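
The index arithmetic above widens each 2-bit dword selector into four byte indices so a single TBL can perform the 32-bit permute; a sketch of the mapping (illustrative helper, not the emitter code):

#include <stdint.h>

/* For each 32-bit lane, selector s becomes the byte indices
 * {4*s, 4*s+1, 4*s+2, 4*s+3}; the double-and-offset steps above perform the
 * same widening in-register. */
static void vpermilps_byte_indices(uint8_t idx[16], const uint32_t sel[4])
{
    for (int lane = 0; lane < 4; ++lane) {
        uint32_t s = sel[lane] & 3;              /* MOVIQ_32(q0,3) + VANDQ */
        for (int b = 0; b < 4; ++b)
            idx[lane * 4 + b] = (uint8_t)(s * 4 + b);
    }
}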
         case 0x17:
             INST_NAME("VPTEST GX, EX");
             SETFLAGS(X_ALL, SF_SET);
@@ -148,17 +173,19 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             }
             IFX(X_ZF) {
                 VANDQ(v2, v0, v1);
+                CMEQQ_0_64(v2, v2);
                 UQXTN_32(v2, v2);
                 VMOVQDto(x2, v2, 0);
-                CMPSw_U12(x2, 0);
+                ADDSx_U12(xZR, x2, 1);
                 CSETw(x2, cEQ);
                 BFIw(xFlags, x2, F_ZF, 1);
             }
             IFX(X_CF) {
                 VBICQ(v2, v1, v0);
+                CMEQQ_0_64(v2, v2);
                 UQXTN_32(v2, v2);
                 VMOVQDto(x2, v2, 0);
-                CMPSw_U12(x2, 0);
+                ADDSx_U12(xZR, x2, 1);
                 CSETw(x2, cEQ);
                 BFIw(xFlags, x2, F_CF, 1);
             }
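
For reference, the flag semantics the VPTEST hunk computes, per the x86 definition (scalar sketch over one 128-bit half):

#include <stdint.h>

/* ZF = ((Ex AND Gx) == 0), CF = ((Ex AND NOT Gx) == 0).  The added
 * CMEQQ_0_64 / UQXTN / ADDS(+1) sequence just tests whether every 64-bit
 * lane of the AND (resp. BIC) result is zero. */
static void vptest_flags(const uint64_t gx[2], const uint64_t ex[2],
                         int *zf, int *cf)
{
    *zf = ((ex[0] & gx[0]) | (ex[1] & gx[1])) == 0;
    *cf = ((ex[0] & ~gx[0]) | (ex[1] & ~gx[1])) == 0;
}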
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index 71b50105..371fe25a 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -111,7 +111,9 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                     GETGY_empty_EY(v0, v1);
                     if(v0==v1) {VMOVQ(q1, v1);}
                 }
-                for(int i=0; i<4; ++i)
+                if(u8==0x00 || u8==0x55 || u8==0xAA || u8==0xFF)
+                    VDUPQ_32(v0, (v0==v1)?q1:v1, u8&3);
+                else for(int i=0; i<4; ++i)
                     VMOVeS(v0, i, (v0==v1)?q1:v1, (u8>>(i*2))&3);
             }
             if(!vex.l) YMM0(gd);
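
The VDUP shortcut above applies whenever the immediate picks the same source lane for every destination lane (the 64-bit hunk below applies the same idea per 128-bit half); a sketch of that check (hypothetical helper):

/* 0x00, 0x55, 0xAA and 0xFF are the only imm8 values whose four 2-bit fields
 * are identical, i.e. a pure broadcast, so a single VDUPQ_32 can replace the
 * four VMOVeS element moves. */
static int is_broadcast_imm8(unsigned u8)
{
    return u8 == (u8 & 3) * 0x55u;
}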
@@ -128,7 +130,9 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                     GETGY_empty_EY(v0, v1);
                     if(v0==v1) {VMOVQ(q1, v1);}
                 }
-                for(int i=0; i<2; ++i)
+                if(((u8>>(l*2))&1)==((u8>>(1+l*2))&1))
+                    VDUPQ_64(v0, (v0==v1)?q1:v1, ((u8>>(l*2))&1));
+                else for(int i=0; i<2; ++i)
                     VMOVeD(v0, i, (v0==v1)?q1:v1, (u8>>(i+l*2))&1);
             }
             if(!vex.l) YMM0(gd);
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
index fb791452..3391a293 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
@@ -39,7 +39,7 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
     int q0, q1, q2;
     int d0, d1, d2;
     int s0;
-    uint64_t tmp64u;
+    uint64_t tmp64u, u64;
     int64_t j64;
     int64_t fixedaddress;
     int unscaled;
@@ -242,8 +242,9 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             d1 = fpu_get_scratch(dyn, ninst);
             GETEXSD(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
+            // VMINSD: if any input is NaN, or Ex[0]<Vx[0], copy Ex[0] -> Gx[0]
             FCMPD(v2, v1);
-            FCSELD(d1, v2, v1, cLS);
+            FCSELD(d1, v1, v2, cCS);
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
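
The FCSEL changes here (and in the MAXSD, MINSS and MAXSS hunks below) implement the asymmetric x86 min/max rule; a scalar sketch:

/* x86 MINSD/MAXSD: the Ex operand is returned whenever either input is NaN,
 * when the values are equal, or when Vx does not strictly win the compare;
 * there is no IEEE-style NaN suppression. */
static double x86_minsd(double vx, double ex) { return (vx < ex) ? vx : ex; }
static double x86_maxsd(double vx, double ex) { return (vx > ex) ? vx : ex; }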
@@ -256,7 +257,20 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             d1 = fpu_get_scratch(dyn, ninst);
             GETEXSD(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
+            if(!box64_dynarec_fastnan) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                q1 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXD(q0, v2, v1);    // propagate NAN
+                FCMEQD(q0, q0, q0);    // 0 if NAN, 1 if not NAN
+            }
             FDIVD(d1, v2, v1);
+            if(!box64_dynarec_fastnan) {
+                FCMEQD(q1, d1, d1);    // 0 => out is NAN
+                VBIC(q1, q0, q1);      // forget it if any input was already a NaN
+                VSHLQ_64(q1, q1, 63);   // only keep the sign bit
+                VORR(d1, d1, q1);      // NAN -> -NAN
+            }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
@@ -269,8 +283,8 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             d1 = fpu_get_scratch(dyn, ninst);
             GETEXSD(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            FCMPD(v2, v1);
-            FCSELD(d1, v2, v1, cGE);
+            FCMPD(v1, v2);
+            FCSELD(d1, v1, v2, cCS);
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
@@ -278,6 +292,62 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             YMM0(gd)
             break;
 
+        case 0x70:
+            INST_NAME("VPSHUFLW Gx, Ex, Ib");
+            nextop = F8;
+            d0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETEX(v1, 0, 1); GETGX(v0, 1); u8 = F8; } else { GETGY(v0, 1, MODREG?((nextop&7)+(rex.b<<3)):-1, -1, -1); GETEY(v1); }
+                if(u8==0b00000000 || u8==0b01010101 || u8==0b10101010 || u8==0b11111111) {
+                    if(v0==v1) {
+                        VMOVQ(d0, v1);
+                    }
+                    VDUP_16(v0, v1, u8&3);
+                    if(v0==v1)
+                        v1 = d0;
+                } else {
+                    // only the low part needs to be shuffled. VTBL only handles 8-bit values, so the 16-bit shuffle indices are converted to 8-bit ones
+                    if(!l) {
+                        u64 = 0;
+                        for (int i=0; i<4; ++i) {
+                            u64 |= ((uint64_t)((u8>>(i*2))&3)*2+0)<<(i*16+0);
+                            u64 |= ((uint64_t)((u8>>(i*2))&3)*2+1)<<(i*16+8);
+                        }
+                        MOV64x(x2, u64);
+                    }
+                    VMOVQDfrom(d0, 0, x2);
+                    VTBL1_8(d0, v1, d0);
+                    VMOVeD(v0, 0, d0, 0);
+                }
+                if(v0!=v1) {
+                    VMOVeD(v0, 1, v1, 1);
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
+        case 0xC2:
+            INST_NAME("VCMPSD Gx, Vx, Ex, Ib");
+            nextop = F8;
+            GETEXSD(v1, 0, 1);
+            GETGX_empty_VX(v0, v2);
+            u8 = F8;
+            FCMPD(v2, v1);
+            if(v0!=v2) VMOVQ(v0, v2);
+            switch(u8&7) {
+                case 0: CSETMx(x2, cEQ); break;   // Equal
+                case 1: CSETMx(x2, cCC); break;   // Less than
+                case 2: CSETMx(x2, cLS); break;   // Less or equal
+                case 3: CSETMx(x2, cVS); break;   // NaN
+                case 4: CSETMx(x2, cNE); break;   // Not Equal or unordered
+                case 5: CSETMx(x2, cCS); break;   // Greater or equal or unordered
+                case 6: CSETMx(x2, cHI); break;   // Greater or unordered
+                case 7: CSETMx(x2, cVC); break;   // not NaN
+            }
+            VMOVQDfrom(v0, 0, x2);
+            YMM0(gd);
+            break;
+
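
A scalar reference for the predicate switch above (illustrative; after FCMPD an unordered compare sets C and clears Z, so cNE, cCS and cHI are all true for NaN, which is why predicates 4..6 read "or unordered"):

#include <math.h>
#include <stdint.h>

static uint64_t cmpsd_predicate(double a, double b, unsigned pred)
{
    int unord = isnan(a) || isnan(b);
    int r;
    switch (pred & 7) {
        case 0: r = !unord && a == b; break;   /* EQ    */
        case 1: r = !unord && a <  b; break;   /* LT    */
        case 2: r = !unord && a <= b; break;   /* LE    */
        case 3: r = unord;            break;   /* UNORD */
        case 4: r = unord || a != b;  break;   /* NEQ   */
        case 5: r = unord || a >= b;  break;   /* NLT   */
        case 6: r = unord || a >  b;  break;   /* NLE   */
        default:r = !unord;           break;   /* ORD   */
    }
    return r ? ~0ull : 0ull;                   /* all-ones mask or zero */
}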
         default:
             DEFAULT;
     }
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
index 47c6391d..11aee1d2 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
@@ -182,6 +182,17 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             break;
 
+        case 0x51:
+            INST_NAME("VSQRTSS Gx, Vx, Ex");
+            nextop = F8;
+            GETEXSS(d0, 0, 0);
+            GETGX_empty_VX(v0, v2);
+            d1 = fpu_get_scratch(dyn, ninst);
+            FSQRTS(d1, d0);
+            if(v0!=v2) VMOVQ(v0, v2);
+            VMOVeS(v0, 0, d1, 0);
+            YMM0(gd);
+            break;
         case 0x52:
             INST_NAME("VRSQRTSS Gx, Vx, Ex");
             nextop = F8;
@@ -305,7 +316,7 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             GETEXSS(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
             FCMPS(v2, v1);
-            FCSELS(d1, v2, v1, cLS);
+            FCSELS(d1, v1, v2, cCS);
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
@@ -331,8 +342,8 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             d1 = fpu_get_scratch(dyn, ninst);
             GETEXSS(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            FCMPS(v2, v1);
-            FCSELS(d1, v2, v1, cGE);
+            FCMPS(v1, v2);
+            FCSELS(d1, v1, v2, cCS);
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
@@ -403,6 +414,39 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             break;
 
+        case 0xC2:
+            INST_NAME("VCMPSS Gx, Vx, Ex, Ib");
+            nextop = F8;
+            GETEXSS(v1, 0, 1);
+            GETGX_empty_VX(v0, v2);
+            u8 = F8;
+            if(((u8&15)==12)||((u8&15)==13)||((u8&15)==9)||((u8&15)==10))
+                FCMPS(v1, v2);
+            else
+                FCMPS(v2, v1);
+            if(v0!=v2) VMOVQ(v0, v2);
+            switch(u8&0xf) {
+                case 0x00: CSETMw(x2, cEQ); break;  // Equal
+                case 0x01: CSETMw(x2, cCC); break;  // Less than
+                case 0x02: CSETMw(x2, cLS); break;  // Less or equal
+                case 0x03: CSETMw(x2, cVS); break;  // NaN
+                case 0x04: CSETMw(x2, cNE); break;  // Not Equal or unordered
+                case 0x05: CSETMw(x2, cCS); break;  // Greater or equal or unordered
+                case 0x06: CSETMw(x2, cHI); break;  // Greater or unordered
+                case 0x07: CSETMw(x2, cVC); break;  // not NaN
+                case 0x08: CSETMw(x2, cEQ); CSETMw(x3, cVS); ORRw_REG(x2, x2, x3); break;  // Equal or unordered
+                case 0x09: CSETMw(x2, cCS); break;  // Less than or unordered
+                case 0x0a: CSETMw(x2, cHI); break;  // Less or equal or unordered
+                case 0x0b: MOV32w(x2, 0); break;    // false
+                case 0x0c: CSETMw(x2, cNE); CSETMw(x3, cVC); ANDw_REG(x2, x2, x3); break;  // Not Equal not unordered
+                case 0x0d: CSETMw(x2, cCC); break;  // Greater or equal not unordered
+                case 0x0e: CSETMw(x2, cLS); break;  // Greater not unordered
+                case 0x0f: MOV32w(x2, 0xffffffff); break; // true
+            }
+            VMOVQSfrom(v0, 0, x2);
+            YMM0(gd);
+            break;
+
         default:
             DEFAULT;
     }