More test17 improvements, fixed NAN for mulpd ([DYNAREC] too, introducing BOX64_DYNAREC_FASTNAN env. var. to keep old faster behaviour selectable)

author: ptitSeb <sebastien.chev@gmail.com> 2022-03-31 10:28:18 +0200
committer: ptitSeb <sebastien.chev@gmail.com> 2022-03-31 10:28:18 +0200
commit: 716eb97af90b21ed1085c9c6e1eb8d132d9f3f18 (patch)
tree: 5414a91c1be56719b37c54c1ffec9ace90be1a5d
parent: 3623cb9785a1c7b593ebc65c42c23a4db981ada4 (diff)
download: box64-716eb97af90b21ed1085c9c6e1eb8d132d9f3f18.tar.gz
box64-716eb97af90b21ed1085c9c6e1eb8d132d9f3f18.zip
8 files changed, 97 insertions, 6 deletions
diff --git a/docs/USAGE.md b/docs/USAGE.md
index d9de9d93..052239b9 100755
--- a/docs/USAGE.md
+++ b/docs/USAGE.md
@@ -131,6 +131,11 @@ Enable/Disable simulation of Strong Memory model
 * 1 : Enable some Memory Barrier when reading from memory (on some MOV opcode) to simulate Strong Memory Model while trying to limit performance impact (Default when libmonobdwgc-2.0.so is loaded)
 * 2 : Enable some Memory Barrier when reading from memory (on some MOV opcode) to simulate Strong Memory Model
 
+#### BOX64_DYNAREC_FASTNAN
+Enable/Disable generation of -NAN
+* 0 : Generate -NAN like on x86 (Default.)
+* 1 : Don't do anything special with NAN, to go as fast as possible (was default before this option existed)
+
 #### BOX64_LIBGL
  * libXXXX set the name for libGL (defaults to libGL.so.1).
  * /PATH/TO/libGLXXX : Sets the name and path for libGL
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index c739ee15..abd12c04 100755
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -703,8 +703,21 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("MULPD Gx, Ex");
             nextop = F8;
             GETEX(q0, 0);
-            GETGX(v0);
-            VFMULQD(v0, v0, q0);
+            GETGX(q1);
+            if(!box64_dynarec_fastnan) {
+                v0 = fpu_get_scratch(dyn);
+                v1 = fpu_get_scratch(dyn);
+                // check if any input value was NAN
+                VFMAXQD(v0, q0, q1);    // propagate NAN
+                FCMEQQD(v0, v0, v0);    // 0 if NAN, 1 if not NAN
+            }
+            VFMULQD(q1, q1, q0);
+            if(!box64_dynarec_fastnan) {
+                FCMEQQD(v1, q1, q1);    // 0 => out is NAN
+                VBICQ(v1, v0, v1);      // forget it in any input was a NAN already
+                VSHLQ_64(v1, v1, 63);   // only keep the sign bit
+                VORRQ(q1, q1, v1);      // NAN -> -NAN
+            }
             break;
         case 0x5A:
             INST_NAME("CVTPD2PS Gx, Ex");
diff --git a/src/emu/x64run660f.c b/src/emu/x64run660f.c
index 09354192..5db9994a 100644
--- a/src/emu/x64run660f.c
+++ b/src/emu/x64run660f.c
@@ -796,8 +796,8 @@ int Run660F(x64emu_t *emu, rex_t rex)
         GETGX;
         for (int i=0; i<2; ++i) {
             #ifndef NOALIGN
-            if(EX->d[i]<0.0) // on x86, default nan are negative
-                GX->d[i] = -NAN;
+            if(EX->d[i]<0.0)        // on x86, default nan are negative
+                GX->d[i] = -NAN;    // but input NAN are not touched (so sqrt(+nan) -> +nan)
             else
             #endif
             GX->d[i] = sqrt(EX->d[i]);
@@ -843,8 +843,15 @@ int Run660F(x64emu_t *emu, rex_t rex)
         nextop = F8;
         GETEX(0);
         GETGX;
-        GX->d[0] *= EX->d[0];
-        GX->d[1] *= EX->d[1];
+        for(int i=0; i<2; ++i) {
+            #ifndef NOALIGN
+                // mul generate a -NAN only if doing (+/-)inf * (+/-)0
+                if((isinf(GX->d[i]) && EX->d[i]==0.0) || (isinf(EX->d[i]) && GX->d[i]==0.0))
+                    GX->d[i] = -NAN;
+                else
+            #endif
+            GX->d[i] *= EX->d[i];
+        }
         break;
     case 0x5A:                      /* CVTPD2PS Gx, Ex */
         nextop = F8;
diff --git a/src/include/debug.h b/src/include/debug.h
index b5f08599..b8ce4995 100755
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -15,6 +15,7 @@ extern int box64_dynarec_forced;
 extern uintptr_t box64_nodynarec_start, box64_nodynarec_end;
 extern int box64_dynarec_bigblock;
 extern int box64_dynarec_strongmem;
+extern int box64_dynarec_fastnan;
 #ifdef ARM64
 extern int arm64_asimd;
 extern int arm64_aes;
diff --git a/src/main.c b/src/main.c
index 208c251d..92d23936 100755
--- a/src/main.c
+++ b/src/main.c
@@ -45,6 +45,7 @@ int box64_dynarec_dump = 0;
 int box64_dynarec_forced = 0;
 int box64_dynarec_bigblock = 1;
 int box64_dynarec_strongmem = 0;
+int box64_dynarec_fastnan = 0;
 uintptr_t box64_nodynarec_start = 0;
 uintptr_t box64_nodynarec_end = 0;
 #ifdef ARM64
@@ -405,6 +406,15 @@ void LoadLogEnv()
         if(box64_dynarec_strongmem)
             printf_log(LOG_INFO, "Dynarec will try to emulate a strong memory model%s\n", (box64_dynarec_strongmem==1)?" with limited performance loss":"");
     }
+    p = getenv("BOX64_DYNAREC_FASTNAN");
+    if(p) {
+        if(strlen(p)==1) {
+            if(p[0]>='0' && p[0]<='1')
+                box64_dynarec_fastnan = p[0]-'0';
+        }
+        if(box64_dynarec_fastnan)
+            printf_log(LOG_INFO, "Dynarec will not try to normalize generated NAN\n");
+    }
     p = getenv("BOX64_NODYNAREC");
     if(p) {
         if (strchr(p,'-')) {
diff --git a/tests/ref17.txt b/tests/ref17.txt
index 4ed0bbc0..674f824d 100644
--- a/tests/ref17.txt
+++ b/tests/ref17.txt
@@ -198,3 +198,39 @@ psqrtpd(1 2 ) = 1 1.41421
 psqrtpd(0 -2 ) = 0 0xfff8000000000000 
 psqrtpd(inf -inf ) = inf 0xfff8000000000000 
 psqrtpd(0x7ff8000000000000 -0 ) = 0x7ff8000000000000 -0 
+andpd(1 2 , 0 -2 ) = 0 2 
+andpd(0 -2 , inf -inf ) = 0 -2 
+andpd(1 2 , 0x7ff8000000000000 -0 ) = 1 0 
+andpd(0 -2 , 0x7ff8000000000000 -0 ) = 0 -0 
+andpd(inf -inf , 0x7ff8000000000000 -0 ) = inf -0 
+andpd(0x7ff8000000000000 -0 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 -0 
+andnpd(1 2 , 0 -2 ) = 0 -0 
+andnpd(0 -2 , inf -inf ) = inf 1 
+andnpd(1 2 , 0x7ff8000000000000 -0 ) = 3 -0 
+andnpd(0 -2 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 0 
+andnpd(inf -inf , 0x7ff8000000000000 -0 ) = 1.11254e-308 0 
+andnpd(0x7ff8000000000000 -0 , 0x7ff8000000000000 -0 ) = 0 0 
+orpd(1 2 , 0 -2 ) = 1 -2 
+orpd(0 -2 , inf -inf ) = inf -inf 
+orpd(1 2 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 -2 
+orpd(0 -2 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 -2 
+orpd(inf -inf , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 -inf 
+orpd(0x7ff8000000000000 -0 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 -0 
+xorpd(1 2 , 0 -2 ) = 1 -0 
+xorpd(0 -2 , inf -inf ) = inf 1 
+xorpd(1 2 , 0x7ff8000000000000 -0 ) = 3 -2 
+xorpd(0 -2 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 2 
+xorpd(inf -inf , 0x7ff8000000000000 -0 ) = 1.11254e-308 inf 
+xorpd(0x7ff8000000000000 -0 , 0x7ff8000000000000 -0 ) = 0 0 
+addpd(1 2 , 0 -2 ) = 1 0 
+addpd(0 -2 , inf -inf ) = inf -inf 
+addpd(1 2 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 2 
+addpd(0 -2 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 -2 
+addpd(inf -inf , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 -inf 
+addpd(0x7ff8000000000000 -0 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 -0 
+mulpd(1 2 , 0 -2 ) = 0 -4 
+mulpd(0 -2 , inf -inf ) = 0xfff8000000000000 inf 
+mulpd(1 2 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 -0 
+mulpd(0 -2 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 0 
+mulpd(inf -inf , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 0xfff8000000000000 
+mulpd(0x7ff8000000000000 -0 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 0 
diff --git a/tests/test17 b/tests/test17
index 264d8d40..9c23e0a3 100755
--- a/tests/test17
+++ b/tests/test17
Binary files differ
diff --git a/tests/test17.c b/tests/test17.c
index 2f55b431..e885892d 100644
--- a/tests/test17.c
+++ b/tests/test17.c
@@ -281,6 +281,11 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
  a128.md = _mm_##A##_pd(A1.md);                     \
  printf("%s(", #C); print_pd(A1);                   \
  printf(") = "); print_pd(a128); printf("\n");
+ #define GO2pd(A, C, A1, A2)                        \
+ a128.md = _mm_##A##_pd(A1.md, A2.md);              \
+ printf("%s(", #C); print_pd(A1);                   \
+ printf(", "); print_pd(A2);                        \
+ printf(") = "); print_pd(a128); printf("\n");
  
 
  GO2(shuffle, 8, pshufb, a128_8, b128_8)
@@ -329,6 +334,20 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
  GO1pd(sqrt, psqrtpd, b128_pd)
  GO1pd(sqrt, psqrtpd, c128_pd)
  GO1pd(sqrt, psqrtpd, d128_pd)
+ #define MULITGO2pd(A, B)       \
+ GO2pd(A, B, a128_pd, b128_pd)  \
+ GO2pd(A, B, b128_pd, c128_pd)  \
+ GO2pd(A, B, a128_pd, d128_pd)  \
+ GO2pd(A, B, b128_pd, d128_pd)  \
+ GO2pd(A, B, c128_pd, d128_pd)  \
+ GO2pd(A, B, d128_pd, d128_pd)
+ MULITGO2pd(and, andpd)
+ MULITGO2pd(andnot, andnpd)
+ MULITGO2pd(or, orpd)
+ MULITGO2pd(xor, xorpd)
+ MULITGO2pd(add, addpd)
+ MULITGO2pd(mul, mulpd)
 
  return 0;
 }
+
author	ptitSeb <sebastien.chev@gmail.com>	2022-03-31 10:28:18 +0200
committer	ptitSeb <sebastien.chev@gmail.com>	2022-03-31 10:28:18 +0200
commit	716eb97af90b21ed1085c9c6e1eb8d132d9f3f18 (patch)
tree	5414a91c1be56719b37c54c1ffec9ace90be1a5d
parent	3623cb9785a1c7b593ebc65c42c23a4db981ada4 (diff)
download	box64-716eb97af90b21ed1085c9c6e1eb8d132d9f3f18.tar.gz box64-716eb97af90b21ed1085c9c6e1eb8d132d9f3f18.zip