about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
authorptitSeb <sebastien.chev@gmail.com>2025-01-22 20:37:17 +0100
committerptitSeb <sebastien.chev@gmail.com>2025-01-22 20:37:17 +0100
commitf4a9f8dd79456cbb16d2048c36b48074c831c0bf (patch)
tree58fa4036ab4372d8b50185bca6e011df0a62926c /src
parentf235c7f702a4a5873e3b7ef04e3fdf17627e6ca9 (diff)
downloadbox64-f4a9f8dd79456cbb16d2048c36b48074c831c0bf.tar.gz
box64-f4a9f8dd79456cbb16d2048c36b48074c831c0bf.zip
[ARM64_DYNAREC] Added a few AVX opcodes and Improved/Fixed some existing SSE and AVX ones
Diffstat (limited to 'src')
-rw-r--r--src/dynarec/arm64/dynarec_arm64_0f.c68
-rw-r--r--src/dynarec/arm64/dynarec_arm64_660f.c20
-rw-r--r--src/dynarec/arm64/dynarec_arm64_avx_0f.c124
-rw-r--r--src/dynarec/arm64/dynarec_arm64_avx_66_0f.c16
-rw-r--r--src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c137
-rw-r--r--src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c60
-rw-r--r--src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c112
-rw-r--r--src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c84
-rw-r--r--src/dynarec/arm64/dynarec_arm64_f20f.c2
-rw-r--r--src/dynarec/arm64/dynarec_arm64_f30f.c69
10 files changed, 557 insertions, 135 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c
index 98f84863..71c844d2 100644
--- a/src/dynarec/arm64/dynarec_arm64_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_0f.c
@@ -1136,15 +1136,41 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("ADDPS Gx, Ex");

             nextop = F8;

             GETEX(q0, 0, 0);

-            GETGX(v0, 1);

-            VFADDQS(v0, v0, q0);

+            GETGX(q1, 1);

+            if(!BOX64ENV(dynarec_fastnan)) {

+                v0 = fpu_get_scratch(dyn, ninst);

+                v1 = fpu_get_scratch(dyn, ninst);

+                // check if any input value was NAN

+                VFMAXQS(v0, q0, q1);    // propagate NAN

+                VFCMEQQS(v0, v0, v0);    // 0 if NAN, 1 if not NAN

+            }

+            VFADDQS(q1, q1, q0);

+            if(!BOX64ENV(dynarec_fastnan)) {

+                VFCMEQQS(v1, q1, q1);    // 0 => out is NAN

+                VBICQ(v1, v0, v1);      // forget it if any input was a NAN already

+                VSHLQ_32(v1, v1, 31);   // only keep the sign bit

+                VORRQ(q1, q1, v1);      // NAN -> -NAN

+            }

             break;

         case 0x59:

             INST_NAME("MULPS Gx, Ex");

             nextop = F8;

             GETEX(q0, 0, 0);

-            GETGX(v0, 1);

-            VFMULQS(v0, v0, q0);

+            GETGX(q1, 1);

+            if(!BOX64ENV(dynarec_fastnan)) {

+                v0 = fpu_get_scratch(dyn, ninst);

+                v1 = fpu_get_scratch(dyn, ninst);

+                // check if any input value was NAN

+                VFMAXQS(v0, q0, q1);    // propagate NAN

+                VFCMEQQS(v0, v0, v0);    // 0 if NAN, 1 if not NAN

+            }

+            VFMULQS(q1, q1, q0);

+            if(!BOX64ENV(dynarec_fastnan)) {

+                VFCMEQQS(v1, q1, q1);    // 0 => out is NAN

+                VBICQ(v1, v0, v1);      // forget it if any input was a NAN already

+                VSHLQ_32(v1, v1, 31);   // only keep the sign bit

+                VORRQ(q1, q1, v1);      // NAN -> -NAN

+            }

             break;

         case 0x5A:

             INST_NAME("CVTPS2PD Gx, Ex");

@@ -1164,8 +1190,21 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("SUBPS Gx, Ex");

             nextop = F8;

             GETEX(q0, 0, 0);

-            GETGX(v0, 1);

-            VFSUBQS(v0, v0, q0);

+            GETGX(q1, 1);

+            if(!BOX64ENV(dynarec_fastnan)) {

+                v0 = fpu_get_scratch(dyn, ninst);

+                v1 = fpu_get_scratch(dyn, ninst);

+                // check if any input value was NAN

+                VFMAXQS(v0, q0, q1);    // propagate NAN

+                VFCMEQQS(v0, v0, v0);    // 0 if NAN, 1 if not NAN

+            }

+            VFSUBQS(q1, q1, q0);

+            if(!BOX64ENV(dynarec_fastnan)) {

+                VFCMEQQS(v1, q1, q1);    // 0 => out is NAN

+                VBICQ(v1, v0, v1);      // forget it if any input was a NAN already

+                VSHLQ_32(v1, v1, 31);   // only keep the sign bit

+                VORRQ(q1, q1, v1);      // NAN -> -NAN

+            }

             break;

         case 0x5D:

             INST_NAME("MINPS Gx, Ex");

@@ -1185,8 +1224,21 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("DIVPS Gx, Ex");

             nextop = F8;

             GETEX(q0, 0, 0);

-            GETGX(v0, 1);

-            VFDIVQS(v0, v0, q0);

+            GETGX(q1, 1);

+            if(!BOX64ENV(dynarec_fastnan)) {

+                v0 = fpu_get_scratch(dyn, ninst);

+                v1 = fpu_get_scratch(dyn, ninst);

+                // check if any input value was NAN

+                VFMAXQS(v0, q0, q1);    // propagate NAN

+                VFCMEQQS(v0, v0, v0);    // 0 if NAN, 1 if not NAN

+            }

+            VFDIVQS(q1, q1, q0);

+            if(!BOX64ENV(dynarec_fastnan)) {

+                VFCMEQQS(v1, q1, q1);    // 0 => out is NAN

+                VBICQ(v1, v0, v1);      // forget it if any input was a NAN already

+                VSHLQ_32(v1, v1, 31);   // only keep the sign bit

+                VORRQ(q1, q1, v1);      // NAN -> -NAN

+            }

             break;

         case 0x5F:

             INST_NAME("MAXPS Gx, Ex");

diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index 54f98516..893ac0ce 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -475,7 +475,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     break;

 

                 case 0x14:

-                    INST_NAME("PBLENDVPS Gx,Ex");

+                    INST_NAME("BLENDVPS Gx,Ex");

                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

@@ -483,13 +483,11 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     v1 = fpu_get_scratch(dyn, ninst);

                     if(q0!=q1) {

                         VSSHRQ_32(v1, v0, 31);    // bit[31]-> bit[31..0]

-                        VBICQ(q0, q0, v1);

-                        VANDQ(v1, q1, v1);

-                        VORRQ(q0, q0, v1);

+                        VBITQ(q0, q1, v1);

                     }

                     break;

                 case 0x15:

-                    INST_NAME("PBLENDVPD Gx,Ex");

+                    INST_NAME("BLENDVPD Gx,Ex");

                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

@@ -497,9 +495,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     v1 = fpu_get_scratch(dyn, ninst);

                     if(q0!=q1) {

                         VSSHRQ_64(v1, v0, 63);    // bit[63]-> bit[63..0]

-                        VBICQ(q0, q0, v1);

-                        VANDQ(v1, q1, v1);

-                        VORRQ(q0, q0, v1);

+                        VBITQ(q0, q1, v1);

                     }

                     break;

 

@@ -1028,9 +1024,13 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     u8 = F8&0b1111;

                     if(u8==0b0011) {

                         VMOVeD(q0, 0, q1, 0);

-                    } else if(u8==0b1100) {

+                        u8&=~0b0011;

+                    }

+                    if(u8==0b1100) {

                         VMOVeD(q0, 1, q1, 1);

-                    } else for(int i=0; i<4; ++i)

+                        u8&=~0b1100;

+                    }

+                    for(int i=0; i<4; ++i)

                         if(u8&(1<<i)) {

                             VMOVeS(q0, i, q1, i);

                         }

diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
index 8c333f7d..5ce00466 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -440,22 +440,50 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x58:
             INST_NAME("VADDPS Gx, Vx, Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
-            VFADDQS(v0, v2, v1);
-            if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                q1 = fpu_get_scratch(dyn, ninst);
+            }
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    // check if any input value was NAN
+                    VFMAXQS(q0, v2, v1);    // propagate NAN
+                    VFCMEQQS(q0, q0, q0);    // 0 if NAN, 1 if not NAN
+                }
                 VFADDQS(v0, v2, v1);
-            } else YMM0(gd)
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    VFCMEQQS(q1, v0, v0);    // 0 => out is NAN
+                    VBICQ(q1, q0, q1);      // forget it if any input was a NAN already
+                    VSHLQ_32(q1, q1, 31);   // only keep the sign bit
+                    VORRQ(v0, v0, q1);      // NAN -> -NAN
+                }
+            }
+            if(!vex.l) YMM0(gd)
             break;
         case 0x59:
             INST_NAME("VMULPS Gx, Vx, Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
-            VFMULQS(v0, v2, v1);
-            if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                q1 = fpu_get_scratch(dyn, ninst);
+            }
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    // check if any input value was NAN
+                    VFMAXQS(q0, v2, v1);    // propagate NAN
+                    VFCMEQQS(q0, q0, q0);    // 0 if NAN, 1 if not NAN
+                }
                 VFMULQS(v0, v2, v1);
-            } else YMM0(gd)
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    VFCMEQQS(q1, v0, v0);    // 0 => out is NAN
+                    VBICQ(q1, q0, q1);      // forget it if any input was a NAN already
+                    VSHLQ_32(q1, q1, 31);   // only keep the sign bit
+                    VORRQ(v0, v0, q1);      // NAN -> -NAN
+                }
+            }
+            if(!vex.l) YMM0(gd)
             break;
         case 0x5A:
             INST_NAME("VCVTPS2PD Gx, Ex");
@@ -480,12 +508,26 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5C:
             INST_NAME("VSUBPS Gx, Vx, Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
-            VFSUBQS(v0, v2, v1);
-            if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                q1 = fpu_get_scratch(dyn, ninst);
+            }
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    // check if any input value was NAN
+                    VFMAXQS(q0, v2, v1);    // propagate NAN
+                    VFCMEQQS(q0, q0, q0);    // 0 if NAN, 1 if not NAN
+                }
                 VFSUBQS(v0, v2, v1);
-            } else YMM0(gd)
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    VFCMEQQS(q1, v0, v0);    // 0 => out is NAN
+                    VBICQ(q1, q0, q1);      // forget it if any input was a NAN already
+                    VSHLQ_32(q1, q1, 31);   // only keep the sign bit
+                    VORRQ(v0, v0, q1);      // NAN -> -NAN
+                }
+            }
+            if(!vex.l) YMM0(gd)
             break;
         case 0x5D:
             INST_NAME("VMINPS Gx, Vx, Ex");
@@ -508,12 +550,26 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5E:
             INST_NAME("VDIVPS Gx, Vx, Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
-            VFDIVQS(v0, v2, v1);
-            if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                q1 = fpu_get_scratch(dyn, ninst);
+            }
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    // check if any input value was NAN
+                    VFMAXQS(q0, v2, v1);    // propagate NAN
+                    VFCMEQQS(q0, q0, q0);    // 0 if NAN, 1 if not NAN
+                }
                 VFDIVQS(v0, v2, v1);
-            } else YMM0(gd)
+                if(!BOX64ENV(dynarec_fastnan)) {
+                    VFCMEQQS(q1, v0, v0);    // 0 => out is NAN
+                    VBICQ(q1, q0, q1);      // forget it if any input was a NAN already
+                    VSHLQ_32(q1, q1, 31);   // only keep the sign bit
+                    VORRQ(v0, v0, q1);      // NAN -> -NAN
+                }
+            }
+            if(!vex.l) YMM0(gd)
             break;
         case 0x5F:
             INST_NAME("VMAXPS Gx, Vx, Ex");
@@ -634,15 +690,21 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
             u8 = F8;
             if(v2==v1 && (u8&0x3)==((u8>>2)&3) && (u8&0xf)==((u8>>4)&0xf)) {
                 VDUPQ_32(v0, v2, u8&3);
-            } else if(v2==v1 && (u8==0xe0)) {   // easy special case
+            } else if(v2==v1 && (u8==0xe0)) {   // elements 3 2 0 0
                 VMOVQ(v0, v2);
                 VMOVeS(v0, 1, v0, 0);
-            } else if(v2==v1 && (u8==0xe5)) {   // easy special case
+            } else if(v2==v1 && (u8==0xe5)) {   // elements 3 2 1 1
                 VMOVQ(v0, v2);
                 VMOVeS(v0, 0, v0, 1);
-            } else if(MODREG && u8==0x88) {
+            } else if(v2==v1 && (u8==0xa0)) {   // elements 2 2 0 0
+                VTRNQ1_32(v0, v1, v2);
+            } else if(v2==v1 && (u8==0xf5)) {   // elements 3 3 1 1 
+                VTRNQ2_32(v0, v1, v2);
+            } else if(v2==v1 && (u8==0xb1)) {   // elements 2 3 0 1
+                VREV64Q_32(v0, v1);
+            } else if(MODREG && u8==0x88) {     // elements 2 0 2 0
                 VUZP1Q_32(v0, v2, v1);
-            } else if(MODREG && u8==0xdd) {
+            } else if(MODREG && u8==0xdd) {     // elements 3 1 3 1
                 VUZP2Q_32(v0, v2, v1);
             } else {
                 if((v0==v1) || (v0==v2)) {
@@ -674,15 +736,21 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
                 GETGY_empty_VY(v0, v2, 0, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
                 if(v2==v1 && (u8&0x3)==((u8>>2)&3) && (u8&0xf)==((u8>>4)&0xf)) {
                     VDUPQ_32(v0, v2, u8&3);
-                } else if(v2==v1 && (u8==0xe0)) {
+                } else if(v2==v1 && (u8==0xe0)) {   // elements 3 2 0 0
                     VMOVQ(v0, v2);
                     VMOVeS(v0, 1, v0, 0);
-                } else if(v2==v1 && (u8==0xe5)) {
+                } else if(v2==v1 && (u8==0xe5)) {   // elements 3 2 1 1
                     VMOVQ(v0, v2);
                     VMOVeS(v0, 0, v0, 1);
-                } else if(MODREG && u8==0x88) {
+                } else if(v2==v1 && (u8==0xa0)) {   // elements 2 2 0 0
+                    VTRNQ1_32(v0, v1, v2);
+                } else if(v2==v1 && (u8==0xf5)) {   // elements 3 3 1 1 
+                    VTRNQ2_32(v0, v1, v2);
+                } else if(v2==v1 && (u8==0xb1)) {   // elements 2 3 0 1
+                    VREV64Q_32(v0, v1);
+                } else if(MODREG && u8==0x88) {     // elements 2 0 2 0
                     VUZP1Q_32(v0, v2, v1);
-                } else if(MODREG && u8==0xdd) {
+                } else if(MODREG && u8==0xdd) {     // elements 3 1 3 1
                     VUZP2Q_32(v0, v2, v1);
                 } else {
                     if(s0) d0 = v0;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
index 570098e4..81169c55 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
@@ -1877,7 +1877,21 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             if(!vex.l) YMM0(gd);
             break;
-
+        case 0xF7:
+            INST_NAME("VMASKMOVDQU Gx, Ex");
+            nextop = F8;
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
+            v0 = fpu_get_scratch(dyn, ninst);
+            VLDR128_U12(v0, xRDI, 0);
+            if(MODREG)
+                v1 = fpu_get_scratch(dyn, ninst); // need to preserve the register
+            else
+                v1 = q1;
+            VSSHRQ_8(v1, q1, 7);  // get the mask
+            VBITQ(v0, q0, v1);
+            VSTR128_U12(v0, xRDI, 0);  // put back
+            break;
         case 0xF8:
             INST_NAME("VPSUBB Gx, Vx, Ex");
             nextop = F8;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index 0647dca1..dcc455e1 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -26,6 +26,8 @@
 
 static const float addsubps[4] = {-1.f, 1.f, -1.f, 1.f};
 static const double addsubpd[2] = {-1., 1.};
+static const float subaddps[4] = {1.f, -1.f, 1.f, -1.f};
+static const double subaddpd[2] = {1., -1.};
 
 uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog)
 {
@@ -204,6 +206,109 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             if(!vex.l) YMM0(gd);
             break;
 
+        case 0x0E:
+            INST_NAME("VTESTPS Gx, Ex");
+            SETFLAGS(X_ALL, SF_SET);
+            nextop = F8;
+            GETGX(v0, 0);
+            GETEX(v1, 0, 0);
+            v2 = fpu_get_scratch(dyn, ninst);
+            if(vex.l) {
+                if(!MODREG)
+                    q1 = fpu_get_scratch(dyn, ninst);
+                q2 = fpu_get_scratch(dyn, ninst);
+                GETGY(q0, 0, MODREG?((nextop&7)+(rex.b<<3)):-1, -1, -1);
+                GETEY(q1);
+            }
+            IFX(X_CF) {
+                VBICQ(v2, v1, v0);
+                VSHRQ_32(v2, v2, 31);
+                if(vex.l) {
+                    VBICQ(q2, q1, q0);
+                    VSHRQ_32(q2, q2, 31);
+                    VORRQ(v2, v2, q2);
+                }
+                CMEQQ_0_64(v2, v2);
+                UQXTN_32(v2, v2);
+                VMOVQDto(x2, v2, 0);
+                ADDSx_U12(xZR, x2, 1);
+                CSETw(x2, cEQ);
+                BFIw(xFlags, x2, F_CF, 1);
+            }
+            IFX(X_ZF) {
+                VANDQ(v2, v0, v1);
+                VSHRQ_32(v2, v2, 31);
+                if(vex.l) {
+                    VANDQ(q2, q0, q1);
+                    VSHRQ_32(q2, q2, 31);
+                    VORRQ(v2, v2, q2);
+                }
+                CMEQQ_0_64(v2, v2);
+                UQXTN_32(v2, v2);
+                VMOVQDto(x2, v2, 0);
+                ADDSx_U12(xZR, x2, 1);
+                IFNATIVE(NF_EQ) {} else {
+                    CSETw(x2, cEQ);
+                    BFIw(xFlags, x2, F_ZF, 1);
+                }
+            }
+            IFX(X_AF|X_SF|X_OF|X_PF) {
+                MOV32w(x2, (1<<F_AF) | (1<<F_OF) | (1<<F_SF) | (1<<F_PF));
+                BICw(xFlags, xFlags, x2);
+            }
+            break;
+        case 0x0F:
+            INST_NAME("VTESTPD Gx, Ex");
+            SETFLAGS(X_ALL, SF_SET);
+            nextop = F8;
+            GETGX(v0, 0);
+            GETEX(v1, 0, 0);
+            v2 = fpu_get_scratch(dyn, ninst);
+            if(vex.l) {
+                if(!MODREG)
+                    q1 = fpu_get_scratch(dyn, ninst);
+                q2 = fpu_get_scratch(dyn, ninst);
+                GETGY(q0, 0, MODREG?((nextop&7)+(rex.b<<3)):-1, -1, -1);
+                GETEY(q1);
+            }
+            IFX(X_CF) {
+                VBICQ(v2, v1, v0);
+                VSHRQ_64(v2, v2, 63);
+                if(vex.l) {
+                    VBICQ(q2, q1, q0);
+                    VSHRQ_64(q2, q2, 63);
+                    VORRQ(v2, v2, q2);
+                }
+                CMEQQ_0_64(v2, v2);
+                UQXTN_32(v2, v2);
+                VMOVQDto(x2, v2, 0);
+                ADDSx_U12(xZR, x2, 1);
+                CSETw(x2, cEQ);
+                BFIw(xFlags, x2, F_CF, 1);
+            }
+            IFX(X_ZF) {
+                VANDQ(v2, v0, v1);
+                VSHRQ_64(v2, v2, 63);
+                if(vex.l) {
+                    VANDQ(q2, q0, q1);
+                    VSHRQ_64(q2, q2, 63);
+                    VORRQ(v2, v2, q2);
+                }
+                CMEQQ_0_64(v2, v2);
+                UQXTN_32(v2, v2);
+                VMOVQDto(x2, v2, 0);
+                ADDSx_U12(xZR, x2, 1);
+                IFNATIVE(NF_EQ) {} else {
+                    CSETw(x2, cEQ);
+                    BFIw(xFlags, x2, F_ZF, 1);
+                }
+            }
+            IFX(X_AF|X_SF|X_OF|X_PF) {
+                MOV32w(x2, (1<<F_AF) | (1<<F_OF) | (1<<F_SF) | (1<<F_PF));
+                BICw(xFlags, xFlags, x2);
+            }
+            break;
+
         case 0x13:
             INST_NAME("VCVTPH2PS Gx, Ex");
             nextop = F8;
@@ -1015,6 +1120,10 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                     if(!l) {
                         GETGX_empty_VX(v0, v2);
                         addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);
+                        if(ed!=x3) {
+                            MOVx_REG(x3, ed);
+                            ed = x3;
+                        }
                         v1 = fpu_get_scratch(dyn, ninst);
                     } else {
                         GETGY_empty_VY(v0, v2, 0, -1, -1);
@@ -1028,12 +1137,12 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                         VMOVQDto(x4, q0, 0);
                         CBZx(x4, 4+1*4);
                         VLD1_64(v1, 0, ed);
-                        ADDx_U12(ed, ed, 4);
+                        ADDx_U12(ed, ed, 8);
                         VMOVQDto(x4, q0, 1);
                         CBZx(x4, 4+1*4);
                         VLD1_64(v1, 1, ed);
                         if(!l && vex.l)
-                            ADDx_U12(ed, ed, 4);
+                            ADDx_U12(ed, ed, 8);
                     } else {
                         VSSHRQ_32(q0, v2, 31);
                         VMOVSto(x4, q0, 0);
@@ -1503,7 +1612,29 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             }
             if(!vex.l) YMM0(gd);
             break;
-
+        case 0xB7:
+            INST_NAME("VFMSUBADD231PS/D Gx, Vx, Ex");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            TABLE64(x2, (rex.w)?((uintptr_t)&subaddpd):((uintptr_t)&subaddps));
+            VLDR128_U12(q0, x2, 0);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_VXEX(v0, v2, v1, 0); if(v0==v2 || v0==v1) q1 = fpu_get_scratch(dyn, ninst); } else { GETGY_VYEY(v0, v2, v1); }
+                if(v0!=v1 && v0!=v2) {
+                    q1 = v0;
+                }
+                if(rex.w) {
+                    VFMULQD(q1, v0, q0);
+                    VFMLAQD(q1, v1, v2);
+                } else {
+                    VFMULQS(q1, v0, q0);
+                    VFMLAQS(q1, v1, v2);
+                }
+                if(q1!=v0)
+                    VMOVQ(v0, q1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
         case 0xB8:
             INST_NAME("VFMADD231PS/D Gx, Vx, Ex");
             nextop = F8;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index dee4a568..a2111a4e 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -135,15 +135,21 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                 if(!l) {
                     GETGX_empty_EX(v0, v1, 1);
                     u8 = F8;
-                    if(v0==v1) {q1 = fpu_get_scratch(dyn, ninst); VMOVQ(q1, v1);}
                 } else {
                     GETGY_empty_EY(v0, v1);
-                    if(v0==v1) {VMOVQ(q1, v1);}
                 }
-                if(((u8>>(l*2))&1)==((u8>>(1+l*2))&1))
-                    VDUPQ_64(v0, (v0==v1)?q1:v1, ((u8>>(l*2))&1));
-                else for(int i=0; i<2; ++i)
-                    VMOVeD(v0, i, (v0==v1)?q1:v1, (u8>>(i+l*2))&1);
+                switch(((u8>>(l*2))&3)) {
+                    case 0b00:
+                    case 0b11:
+                        VDUPQ_64(v0, v1, ((u8>>(l*2))&1));
+                        break;
+                    case 0b10:
+                        if(v0!=v1) VMOVQ(v0, v1);
+                        break;
+                    case 0b01:
+                       VEXTQ_8(v0, v1, v1, 8); // invert 64bits values
+                       break;
+                }
             }
             if(!vex.l) YMM0(gd);
             break;
@@ -308,7 +314,7 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             } else YMM0(gd);
             break;
         case 0x0D:
-            INST_NAME("VPBLENDPD Gx, Vx, Ex, Ib");
+            INST_NAME("VBLENDPD Gx, Vx, Ex, Ib");
             nextop = F8;
             for(int l=0; l<1+vex.l; ++l) {
                 if(!l) { GETGX_empty_VXEX(q0, q2, q1, 1); u8 = F8; } else { GETGY_empty_VYEY(q0, q2, q1); }
@@ -592,28 +598,36 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             u8 = geted_ib(dyn, addr, ninst, nextop);
             q0 = fpu_get_scratch(dyn, ninst);
             // first mask
-            wb1 = 0; // mask
-            for(int i=0; i<4; ++i)
-                if(u8&(1<<i))
-                    wb1 |= (3<<(i*2));
-            MOVI_64(q0, wb1);   // load 8bits value as a 8bytes mask
-            SXTL_16(q0, q0);    // expand 16bits to 32bits...
-            q1 = fpu_get_scratch(dyn, ninst);
-            // second mask
-            wb1 = 0; // mask
-            for(int i=0; i<4; ++i)
-                if((u8>>4)&(1<<i))
-                    wb1 |= (3<<(i*2));
-            MOVI_64(q1, wb1);   // load 8bits value as a 8bytes mask
-            SXTL_16(q1, q1);    // expand 16bits to 32bits...
+            if((u8&0x0f)!=0x0f) {
+                wb1 = 0; // mask
+                for(int i=0; i<4; ++i)
+                    if(u8&(1<<i))
+                        wb1 |= (3<<(i*2));
+                MOVI_64(q0, wb1);   // load 8bits value as a 8bytes mask
+                SXTL_16(q0, q0);    // expand 16bits to 32bits...
+            }
+            if((u8&0xf0)!=0xf0) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                // second mask
+                wb1 = 0; // mask
+                for(int i=0; i<4; ++i)
+                    if((u8>>4)&(1<<i))
+                        wb1 |= (3<<(i*2));
+                MOVI_64(q1, wb1);   // load 8bits value as a 8bytes mask
+                SXTL_16(q1, q1);    // expand 16bits to 32bits...
+            }
             for(int l=0; l<1+vex.l; ++l) {
                 if(!l) { GETGX_empty_VXEX(v0, v2, v1, 1); u8 = F8; } else { GETGY_empty_VYEY(v0, v2, v1); }
                 VFMULQS(v0, v2, v1);
-                VANDQ(v0, v0, q1);  // second mask
+                if((u8&0xf0)!=0xf0) {
+                    VANDQ(v0, v0, q1);  // second mask
+                }
                 VFADDPQS(v0, v0, v0);
                 FADDPS(v0, v0);
                 VDUPQ_32(v0, v0, 0);
-                VANDQ(v0, v0, q0);  // first mask
+                if((u8&0x0f)!=0x0f) {
+                    VANDQ(v0, v0, q0);  // first mask
+                }
             }
             if(!vex.l) YMM0(gd);
             break;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
index 8ee698ab..35e30357 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f.c
@@ -211,28 +211,53 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x58:
             INST_NAME("VADDSD Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSD(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            FADDD(d1, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXD(q1, v0, v1);    // propagate NAN
+                FCMEQD(q1, q1, q1);    // 0 if NAN, 1 if not NAN
+                FADDD(q2, v1, v2);  // the high part of the vector is erased...
+                FCMEQD(q0, q2, q2);    // 0 => out is NAN
+                VBIC(q0, q1, q0);      // forget it if any input was a NAN already
+                SHL_64(q0, q0, 63);     // only keep the sign bit
+                VORR(q2, q2, q0);      // NAN -> -NAN
+            } else {
+                FADDD(q2, v1, v2);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
-            VMOVeD(v0, 0, d1, 0);
+            VMOVeD(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x59:
             INST_NAME("VMULSD Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSD(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            FMULD(d1, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXD(q1, v0, v1);    // propagate NAN
+                FCMEQD(q1, q1, q1);    // 0 if NAN, 1 if not NAN
+                FMULD(q2, v1, v2);  // the high part of the vector is erased...
+                FCMEQD(q0, q2, q2);    // 0 => out is NAN
+                VBIC(q0, q1, q0);      // forget it if any input was a NAN already
+                SHL_64(q0, q0, 63);     // only keep the sign bit
+                VORR(q2, q2, q0);      // NAN -> -NAN
+            } else {
+                FMULD(q2, v1, v2);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
-            VMOVeD(v0, 0, d1, 0);
-            YMM0(gd)
+            VMOVeD(v0, 0, q2, 0);
             break;
         case 0x5A:
             INST_NAME("VCVTSD2SS Gx, Vx, Ex");
@@ -257,14 +282,27 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x5C:
             INST_NAME("VSUBSD Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSD(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            FSUBD(d1, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXD(q1, v0, v1);    // propagate NAN
+                FCMEQD(q1, q1, q1);    // 0 if NAN, 1 if not NAN
+                FSUBD(q2, v2, v1);  // the high part of the vector is erased...
+                FCMEQD(q0, q2, q2);    // 0 => out is NAN
+                VBIC(q0, q1, q0);      // forget it if any input was a NAN already
+                SHL_64(q0, q0, 63);     // only keep the sign bit
+                VORR(q2, q2, q0);      // NAN -> -NAN
+            } else {
+                FSUBD(q2, v2, v1);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
-            VMOVeD(v0, 0, d1, 0);
+            VMOVeD(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x5D:
@@ -285,27 +323,27 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x5E:
             INST_NAME("VDIVSD Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSD(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
             if(!BOX64ENV(dynarec_fastnan)) {
-                q0 = fpu_get_scratch(dyn, ninst);
                 q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
                 // check if any input value was NAN
-                FMAXD(q0, v2, v1);    // propagate NAN
-                FCMEQD(q0, q0, q0);    // 0 if NAN, 1 if not NAN
-            }
-            FDIVD(d1, v2, v1);
-            if(!BOX64ENV(dynarec_fastnan)) {
-                FCMEQD(q1, d1, d1);    // 0 => out is NAN
-                VBIC(q1, q0, q1);      // forget it in any input was a NAN already
-                VSHLQ_64(q1, q1, 63);   // only keep the sign bit
-                VORR(d1, d1, q1);      // NAN -> -NAN
+                FMAXD(q1, v0, v1);    // propagate NAN
+                FCMEQD(q1, q1, q1);    // 0 if NAN, 1 if not NAN
+                FDIVD(q2, v2, v1);  // the high part of the vector is erased...
+                FCMEQD(q0, q2, q2);    // 0 => out is NAN
+                VBIC(q0, q1, q0);      // forget it if any input was a NAN already
+                SHL_64(q0, q0, 63);     // only keep the sign bit
+                VORR(q2, q2, q0);      // NAN -> -NAN
+            } else {
+                FDIVD(q2, v2, v1);  // the high part of the vector is erased...
             }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
-            VMOVeD(v0, 0, d1, 0);
+            VMOVeD(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x5F:
@@ -366,6 +404,29 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             if(!vex.l) YMM0(gd);
             break;
+        case 0x7D:
+            INST_NAME("VHSUBPS Gx, Vx, Ex");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            if(MODREG || (v1==v2)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+            } else 
+                q1 = v1;
+            if(vex.l)
+                q2 = fpu_get_scratch(dyn, ninst);
+            else
+                q2 = q0;
+            // q0 will contain -1 / 0 / -1 / 0
+            MOVIQ_64(q0, 0xf0);
+            VSHLQ_32(q0, q0, 31);   // keep sign bit
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                VEORQ(q1, v1, q0);
+                VEORQ(q2, v2, q0);
+                VFADDPQS(v0, q2, q1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
 
         case 0xC2:
             INST_NAME("CMPSD Gx, Ex, Ib");
@@ -395,13 +456,12 @@ uintptr_t dynarec64_AVX_F2_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             q0 = fpu_get_scratch(dyn, ninst);
             static float addsubps[4] = {-1.f, 1.f, -1.f, 1.f};
             MAYUSE(addsubps);
-            TABLE64(x2, (uintptr_t)&addsubps);
+            MOV64x(x2, (uintptr_t)&addsubps);
             VLDR128_U12(q0, x2, 0);
             for(int l=0; l<1+vex.l; ++l) {
-                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); if(v0==v1) q1 = fpu_get_scratch(dyn, ninst); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
                 if(v0==v1) {
-                    VFMULQS(q1, v1, q0);
-                    VFADDQS(v0, v2, q1);
+                    VFMLAQS(v0, v2, q0);
                 } else {
                     if(v0!=v2) VMOVQ(v0, v2);
                     VFMLAQS(v0, v1, q0);
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
index eed9fb59..1dc4c55b 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
@@ -234,35 +234,53 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x58:
             INST_NAME("VADDSS Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSS(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXS(q1, v0, v1);    // propagate NAN
+                FCMEQS(q1, q1, q1);    // 0 if NAN, 1 if not NAN
+                FADDS(q2, v1, v2);  // the high part of the vector is erased...
+                FCMEQS(q0, q2, q2);    // 0 => out is NAN
+                VBIC(q0, q1, q0);      // forget it if any input was a NAN already
+                VSHL_32(q0, q0, 31);     // only keep the sign bit
+                VORR(q2, q2, q0);      // NAN -> -NAN
+            } else {
+                FADDS(q2, v1, v2);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
-                if(v0==v1)  {
-                    VMOV(d1, v1);
-                    v1 = d1;
-                }
                 VMOVQ(v0, v2);
             }
-            FADDS(d1, v0, v1);
-            VMOVeS(v0, 0, d1, 0);
+            VMOVeS(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x59:
             INST_NAME("VMULSS Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSS(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXS(q1, v0, v1);    // propagate NAN
+                FCMEQS(q1, q1, q1);    // 0 if NAN, 1 if not NAN
+                FMULS(q2, v1, v2);  // the high part of the vector is erased...
+                FCMEQS(q0, q2, q2);    // 0 => out is NAN
+                VBIC(q0, q1, q0);      // forget it if any input was a NAN already
+                VSHL_32(q0, q0, 31);     // only keep the sign bit
+                VORR(q2, q2, q0);      // NAN -> -NAN
+            } else {
+                FMULS(q2, v1, v2);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
-                if(v0==v1)  {
-                    VMOV(d1, v1);
-                    v1 = d1;
-                }
                 VMOVQ(v0, v2);
             }
-            FMULS(d1, v0, v1);
-            VMOVeS(v0, 0, d1, 0);
+            VMOVeS(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x5A:
@@ -314,14 +332,27 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x5C:
             INST_NAME("VSUBSS Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSS(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            FSUBS(d1, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXS(q1, v0, v1);    // propagate NAN
+                FCMEQS(q1, q1, q1);    // 0 if NAN, 1 if not NAN
+                FSUBS(q2, v2, v1);  // the high part of the vector is erased...
+                FCMEQS(q0, q2, q2);    // 0 => out is NAN
+                VBIC(q0, q1, q0);      // forget it if any input was a NAN already
+                VSHL_32(q0, q0, 31);     // only keep the sign bit
+                VORR(q2, q2, q0);      // NAN -> -NAN
+            } else {
+                FSUBS(q2, v2, v1);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
-            VMOVeS(v0, 0, d1, 0);
+            VMOVeS(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x5D:
@@ -341,14 +372,27 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x5E:
             INST_NAME("VDIVSS Gx, Vx, Ex");
             nextop = F8;
-            d1 = fpu_get_scratch(dyn, ninst);
+            q2 = fpu_get_scratch(dyn, ninst);
             GETEXSS(v1, 0, 0);
             GETGX_empty_VX(v0, v2);
-            FDIVS(d1, v2, v1);
+            if(!BOX64ENV(dynarec_fastnan)) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                q0 = fpu_get_scratch(dyn, ninst);
+                // check if any input value was NAN
+                FMAXS(q1, v0, v1);    // propagate NAN
+                FCMEQS(q1, q1, q1);    // 0 if NAN, 1 if not NAN
+                FDIVS(q2, v2, v1);  // the high part of the vector is erased...
+                FCMEQS(q0, q2, q2);    // 0 => out is NAN
+                VBIC(q0, q1, q0);      // forget it if any input was a NAN already
+                VSHL_32(q0, q0, 31);     // only keep the sign bit
+                VORR(q2, q2, q0);      // NAN -> -NAN
+            } else {
+                FDIVS(q2, v2, v1);  // the high part of the vector is erased...
+            }
             if(v0!=v2) {
                 VMOVQ(v0, v2);
             }
-            VMOVeS(v0, 0, d1, 0);
+            VMOVeS(v0, 0, q2, 0);
             YMM0(gd)
             break;
         case 0x5F:
diff --git a/src/dynarec/arm64/dynarec_arm64_f20f.c b/src/dynarec/arm64/dynarec_arm64_f20f.c
index 984ebc2e..d1ff597b 100644
--- a/src/dynarec/arm64/dynarec_arm64_f20f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f20f.c
@@ -497,7 +497,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             q0 = fpu_get_scratch(dyn, ninst);

             static float addsubps[4] = {-1.f, 1.f, -1.f, 1.f};

             MAYUSE(addsubps);

-            TABLE64(x2, (uintptr_t)&addsubps);

+            MOV64x(x2, (uintptr_t)&addsubps);   // no need to use table64, as box64 is loaded in low memory

             VLDR128_U12(q0, x2, 0);

             VFMLAQS(v0, v1, q0);

             break;

diff --git a/src/dynarec/arm64/dynarec_arm64_f30f.c b/src/dynarec/arm64/dynarec_arm64_f30f.c
index 45a5c454..b136c59c 100644
--- a/src/dynarec/arm64/dynarec_arm64_f30f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f30f.c
@@ -246,11 +246,24 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
         case 0x58:

             INST_NAME("ADDSS Gx, Ex");

             nextop = F8;

-            GETGX(v0, 1);

-            d1 = fpu_get_scratch(dyn, ninst);

+            GETGX(d1, 1);

+            v1 = fpu_get_scratch(dyn, ninst);

             GETEXSS(d0, 0, 0);

-            FADDS(d1, v0, d0);  // the high part of the vector is erased...

-            VMOVeS(v0, 0, d1, 0);

+            if(!BOX64ENV(dynarec_fastnan)) {

+                v0 = fpu_get_scratch(dyn, ninst);

+                q0 = fpu_get_scratch(dyn, ninst);

+                // check if any input value was NAN

+                FMAXS(v0, d0, d1);    // propagate NAN

+                FCMEQS(v0, v0, v0);    // 0 if NAN, 1 if not NAN

+                FADDS(v1, d1, d0);  // the high part of the vector is erased...

+                FCMEQS(q0, v1, v1);    // 0 => out is NAN

+                VBIC(q0, v0, q0);      // forget it if any input was a NAN already

+                VSHL_32(q0, q0, 31);     // only keep the sign bit

+                VORR(v1, v1, q0);      // NAN -> -NAN

+            } else {

+                FADDS(v1, d1, d0);  // the high part of the vector is erased...

+            }

+            VMOVeS(d1, 0, v1, 0);

             break;

         case 0x59:

             INST_NAME("MULSS Gx, Ex");

@@ -264,13 +277,13 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 // check if any input value was NAN

                 FMAXS(v0, d0, d1);    // propagate NAN

                 FCMEQS(v0, v0, v0);    // 0 if NAN, 1 if not NAN

-                FMULS(v1, d1, d0);

+                FMULS(v1, d1, d0);  // the high part of the vector is erased...

                 FCMEQS(q0, v1, v1);    // 0 => out is NAN

                 VBIC(q0, v0, q0);      // forget it in any input was a NAN already

-                VSHL_32(q0, q0, 31);   // only keep the sign bit

+                VSHL_32(q0, q0, 31);     // only keep the sign bit

                 VORR(v1, v1, q0);      // NAN -> -NAN

             } else {

-                FMULS(v1, d1, d0);

+                FMULS(v1, d1, d0);  // the high part of the vector is erased...

             }

             VMOVeS(d1, 0, v1, 0);

             break;

@@ -311,11 +324,24 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
         case 0x5C:

             INST_NAME("SUBSS Gx, Ex");

             nextop = F8;

-            GETGX(v0, 1);

-            d1 = fpu_get_scratch(dyn, ninst);

+            GETGX(d1, 1);

+            v1 = fpu_get_scratch(dyn, ninst);

             GETEXSS(d0, 0, 0);

-            FSUBS(d1, v0, d0);

-            VMOVeS(v0, 0, d1, 0);

+            if(!BOX64ENV(dynarec_fastnan)) {

+                v0 = fpu_get_scratch(dyn, ninst);

+                q0 = fpu_get_scratch(dyn, ninst);

+                // check if any input value was NAN

+                FMAXS(v0, d0, d1);    // propagate NAN

+                FCMEQS(v0, v0, v0);    // 0 if NAN, 1 if not NAN

+                FSUBS(v1, d1, d0);  // the high part of the vector is erased...

+                FCMEQS(q0, v1, v1);    // 0 => out is NAN

+                VBIC(q0, v0, q0);      // forget it if any input was a NAN already

+                VSHL_32(q0, q0, 31);     // only keep the sign bit

+                VORR(v1, v1, q0);      // NAN -> -NAN

+            } else {

+                FSUBS(v1, d1, d0);  // the high part of the vector is erased...

+            }

+            VMOVeS(d1, 0, v1, 0);

             break;

         case 0x5D:

             INST_NAME("MINSS Gx, Ex");

@@ -336,11 +362,24 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
         case 0x5E:

             INST_NAME("DIVSS Gx, Ex");

             nextop = F8;

-            GETGX(v0, 1);

-            d1 = fpu_get_scratch(dyn, ninst);

+            GETGX(d1, 1);

+            v1 = fpu_get_scratch(dyn, ninst);

             GETEXSS(d0, 0, 0);

-            FDIVS(d1, v0, d0);

-            VMOVeS(v0, 0, d1, 0);

+            if(!BOX64ENV(dynarec_fastnan)) {

+                v0 = fpu_get_scratch(dyn, ninst);

+                q0 = fpu_get_scratch(dyn, ninst);

+                // check if any input value was NAN

+                FMAXS(v0, d0, d1);    // propagate NAN

+                FCMEQS(v0, v0, v0);    // 0 if NAN, 1 if not NAN

+                FDIVS(v1, d1, d0);  // the high part of the vector is erased...

+                FCMEQS(q0, v1, v1);    // 0 => out is NAN

+                VBIC(q0, v0, q0);      // forget it if any input was a NAN already

+                VSHL_32(q0, q0, 31);     // only keep the sign bit

+                VORR(v1, v1, q0);      // NAN -> -NAN

+            } else {

+                FDIVS(v1, d1, d0);  // the high part of the vector is erased...

+            }

+            VMOVeS(d1, 0, v1, 0);

             break;

         case 0x5F:

             INST_NAME("MAXSS Gx, Ex");