about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author ptitSeb <sebastien.chev@gmail.com> 2022-03-31 09:11:54 +0200
committer ptitSeb <sebastien.chev@gmail.com> 2022-03-31 09:11:54 +0200
commit 3623cb9785a1c7b593ebc65c42c23a4db981ada4 (patch)
tree 1f2a376d8eb416f2e0a9577ffc126241dad8e592
parent 3f641c2b67ae210caa6ebe7a6e4253038dada6a5 (diff)
download box64-3623cb9785a1c7b593ebc65c42c23a4db981ada4.tar.gz
box64-3623cb9785a1c7b593ebc65c42c23a4db981ada4.zip
Improved test17 (sse), and fixed NaN border case for PSQRTPD opcode
-rw-r--r-- src/emu/x64run660f.c 10
-rw-r--r-- tests/ref17.txt 17
-rwxr-xr-x tests/test17 bin 95272 -> 101568 bytes
-rw-r--r-- tests/test17.c 61
4 files changed, 83 insertions, 5 deletions
diff --git a/src/emu/x64run660f.c b/src/emu/x64run660f.c
index 148c99eb..09354192 100644
--- a/src/emu/x64run660f.c
+++ b/src/emu/x64run660f.c
@@ -794,8 +794,14 @@ int Run660F(x64emu_t *emu, rex_t rex)
         nextop = F8;
         GETEX(0);
         GETGX;
-        GX->d[0] = sqrt(EX->d[0]);
-        GX->d[1] = sqrt(EX->d[1]);
+        for (int i=0; i<2; ++i) {
+            #ifndef NOALIGN
+            if(EX->d[i]<0.0) // on x86, default nan are negative
+                GX->d[i] = -NAN;
+            else
+            #endif
+            GX->d[i] = sqrt(EX->d[i]);
+        }
         break;
 
     case 0x54:                      /* ANDPD Gx, Ex */
diff --git a/tests/ref17.txt b/tests/ref17.txt
index 43882799..4ed0bbc0 100644
--- a/tests/ref17.txt
+++ b/tests/ref17.txt
@@ -181,3 +181,20 @@ pmovzxbq(0xffffffffffffffff 0x8000000000000000 ) = 0xff 0xff
 pmovzxwd(0xffffffff 0x80000000 0x7fffffff 0x0 ) = 0xffff 0xffff 0x0 0x8000 
 pmovzxwq(0xffffffffffffffff 0x8000000000000000 ) = 0xffff 0xffff 
 pmovzxdq(0xffffffffffffffff 0x8000000000000000 ) = 0xffffffff 0xffffffff 
+pminsd(0xffffffff 0x80000000 0x7fffffff 0x0 , 0x1 0x80000000 0x5 0xfffffffe ) = 0xffffffff 0x80000000 0x5 0xfffffffe 
+pmaxsd(0xffffffff 0x80000000 0x7fffffff 0x0 , 0x1 0x80000000 0x5 0xfffffffe ) = 0x1 0x80000000 0x7fffffff 0x0 
+pblendw(0xffff 0x8000 0x7fff 0x0 0x1 0x2 0x3 0x8001 , 0x8000 0x7fff 0xffff 0xffff 0x50 0x9000 0xfffe 0x8001 0) = 0xffff 0x8000 0x7fff 0x0 0x1 0x2 0x3 0x8001 
+pblendw(0xffff 0x8000 0x7fff 0x0 0x1 0x2 0x3 0x8001 , 0x8000 0x7fff 0xffff 0xffff 0x50 0x9000 0xfffe 0x8001 255) = 0x8000 0x7fff 0xffff 0xffff 0x50 0x9000 0xfffe 0x8001 
+pblendw(0xffff 0x8000 0x7fff 0x0 0x1 0x2 0x3 0x8001 , 0x8000 0x7fff 0xffff 0xffff 0x50 0x9000 0xfffe 0x8001 170) = 0xffff 0x7fff 0x7fff 0xffff 0x1 0x9000 0x3 0x8001 
+pblendw(0xffff 0x8000 0x7fff 0x0 0x1 0x2 0x3 0x8001 , 0x8000 0x7fff 0xffff 0xffff 0x50 0x9000 0xfffe 0x8001 2) = 0xffff 0x7fff 0x7fff 0x0 0x1 0x2 0x3 0x8001 
+palignr(0xff 0x80 0x7f 0x0 0x1 0x2 0x3 0x81 0xfe 0x84 0x72 0x52 0xa5 0x0 0xc0 0x32 , 0x0 0x1 0x5 0x15 0x20 0x80 0xff 0x0 0x8 0x7 0x81 0x6 0xa 0xf 0x10 0x1 0) = 0x0 0x1 0x5 0x15 0x20 0x80 0xff 0x0 0x8 0x7 0x81 0x6 0xa 0xf 0x10 0x1 
+palignr(0xff 0x80 0x7f 0x0 0x1 0x2 0x3 0x81 0xfe 0x84 0x72 0x52 0xa5 0x0 0xc0 0x32 , 0x0 0x1 0x5 0x15 0x20 0x80 0xff 0x0 0x8 0x7 0x81 0x6 0xa 0xf 0x10 0x1 2) = 0x5 0x15 0x20 0x80 0xff 0x0 0x8 0x7 0x81 0x6 0xa 0xf 0x10 0x1 0xff 0x80 
+palignr(0xff 0x80 0x7f 0x0 0x1 0x2 0x3 0x81 0xfe 0x84 0x72 0x52 0xa5 0x0 0xc0 0x32 , 0x0 0x1 0x5 0x15 0x20 0x80 0xff 0x0 0x8 0x7 0x81 0x6 0xa 0xf 0x10 0x1 7) = 0x0 0x8 0x7 0x81 0x6 0xa 0xf 0x10 0x1 0xff 0x80 0x7f 0x0 0x1 0x2 0x3 
+palignr(0xff 0x80 0x7f 0x0 0x1 0x2 0x3 0x81 0xfe 0x84 0x72 0x52 0xa5 0x0 0xc0 0x32 , 0x0 0x1 0x5 0x15 0x20 0x80 0xff 0x0 0x8 0x7 0x81 0x6 0xa 0xf 0x10 0x1 15) = 0x1 0xff 0x80 0x7f 0x0 0x1 0x2 0x3 0x81 0xfe 0x84 0x72 0x52 0xa5 0x0 0xc0 
+palignr(0xff 0x80 0x7f 0x0 0x1 0x2 0x3 0x81 0xfe 0x84 0x72 0x52 0xa5 0x0 0xc0 0x32 , 0x0 0x1 0x5 0x15 0x20 0x80 0xff 0x0 0x8 0x7 0x81 0x6 0xa 0xf 0x10 0x1 16) = 0xff 0x80 0x7f 0x0 0x1 0x2 0x3 0x81 0xfe 0x84 0x72 0x52 0xa5 0x0 0xc0 0x32 
+palignr(0xff 0x80 0x7f 0x0 0x1 0x2 0x3 0x81 0xfe 0x84 0x72 0x52 0xa5 0x0 0xc0 0x32 , 0x0 0x1 0x5 0x15 0x20 0x80 0xff 0x0 0x8 0x7 0x81 0x6 0xa 0xf 0x10 0x1 255) = 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 
+movmskpd(0xffffffffffffffff 0x8000000000000000 ) = 0x3
+psqrtpd(1 2 ) = 1 1.41421 
+psqrtpd(0 -2 ) = 0 0xfff8000000000000 
+psqrtpd(inf -inf ) = inf 0xfff8000000000000 
+psqrtpd(0x7ff8000000000000 -0 ) = 0x7ff8000000000000 -0 
diff --git a/tests/test17 b/tests/test17
index b6be07fc..264d8d40 100755
--- a/tests/test17
+++ b/tests/test17
Binary files differ
diff --git a/tests/test17.c b/tests/test17.c
index d3992ea2..2f55b431 100644
--- a/tests/test17.c
+++ b/tests/test17.c
@@ -1,3 +1,5 @@
+// build with  gcc -march=corei7 -O2 -g -msse -msse2 test17.c -o test17
+// and -m32 for 32bits version
 #include <string.h>
 #include <stdio.h>
 #include <stddef.h>
@@ -103,10 +105,18 @@ const v128 b128_16 = {.u16 = {
 const v128 b128_32 = {.u32 = {
     0x00000001, 0x80000000, 0x00000005, 0xfffffffe
 }};
+const v128 b128_64 = {.u64 = {
+    0x0000000000000001LL, 0x8000000000000000LL
+}};
 const v128 c128_32 = {.u32 = {
     0x00000001, 0x80000000, 0x80000005, 0x0000fffe
 }};
 
+const v128 a128_pd = {.d64 = { 1.0, 2.0}};
+const v128 b128_pd = {.d64 = { 0.0, -2.0}};
+const v128 c128_pd = {.d64 = { INFINITY, -INFINITY}};
+const v128 d128_pd = {.d64 = { NAN, -0.0}};
+
 void print_8(v128 v) {
     for(int i=0; i<16; ++i)
         printf("0x%x ", v.u8[i]);
@@ -123,6 +133,20 @@ void print_64(v128 v) {
     for(int i=0; i<2; ++i)
         printf("0x%llx ", v.u64[i]);
 }
+void print_ps(v128 v) {
+    for(int i=0; i<4; ++i)
+        if(isnanf(v.f32[i]))
+            printf("nan ");
+        else
+            printf("%g ", v.f32[i]);
+}
+void print_pd(v128 v) {
+    for(int i=0; i<2; ++i)
+        if(isnan(v.d64[i]))
+            printf("0x%llx ", v.u64[i]);
+        else
+            printf("%g ", v.d64[i]);
+}
 
 int main(int argc, const char** argv)
 {
@@ -226,13 +250,18 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
 
  #define GO1(A, N, C)                               \
  a128.mm = _mm_##A##_epi##N(a128_##N.mm);           \
- printf("%s(", #C); print_##N(a128_##N);           \
+ printf("%s(", #C); print_##N(a128_##N);            \
  printf(") = "); print_##N(a128); printf("\n");
  #define GO2(A, N, C, A1, A2)                       \
  a128.mm = _mm_##A##_epi##N(A1.mm, A2.mm);          \
- printf("%s(", #C); print_##N(A1);                 \
+ printf("%s(", #C); print_##N(A1);                  \
  printf(", "); print_##N(A2);                       \
  printf(") = "); print_##N(a128); printf("\n");
+ #define GO2C(A, N, C, A1, A2, I)                   \
+ a128.mm = _mm_##A##_epi##N(A1.mm, A2.mm, I);       \
+ printf("%s(", #C); print_##N(A1);                  \
+ printf(", "); print_##N(A2);                       \
+ printf("%d) = ", I); print_##N(a128); printf("\n");
  #define GO2i(A, A1, A2)                            \
  i = _mm_##A##_si128(A1.mm, A2.mm);                 \
  printf("p%s(", #A); print_64(A1);                  \
@@ -244,7 +273,15 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
  printf(", "); print_##N(A2);                       \
  printf(", "); print_##N(A3);                       \
  printf(") = "); print_##N(a128); printf("\n");
-
+ #define GO1ipd(A, C, A1)                           \
+ i = _mm_##A##_pd(A1.md);                           \
+ printf("%s(", #C); print_64(A1);                   \
+ printf(") = 0x%x\n", i);
+ #define GO1pd(A, C, A1)                            \
+ a128.md = _mm_##A##_pd(A1.md);                     \
+ printf("%s(", #C); print_pd(A1);                   \
+ printf(") = "); print_pd(a128); printf("\n");
+ 
 
  GO2(shuffle, 8, pshufb, a128_8, b128_8)
  GO2(hadd, 16, phaddw, a128_16, b128_16)
@@ -275,5 +312,23 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
  GO1(cvtepu16, 32, pmovzxwd);
  GO1(cvtepu16, 64, pmovzxwq);
  GO1(cvtepu32, 64, pmovzxdq);
+ GO2(min, 32, pminsd, a128_32, b128_32)
+ GO2(max, 32, pmaxsd, a128_32, b128_32)
+ GO2C(blend, 16, pblendw, a128_16, b128_16, 0)
+ GO2C(blend, 16, pblendw, a128_16, b128_16, 0xff)
+ GO2C(blend, 16, pblendw, a128_16, b128_16, 0xaa)
+ GO2C(blend, 16, pblendw, a128_16, b128_16, 2)
+ GO2C(alignr, 8, palignr, a128_8, b128_8, 0)
+ GO2C(alignr, 8, palignr, a128_8, b128_8, 2)
+ GO2C(alignr, 8, palignr, a128_8, b128_8, 7)
+ GO2C(alignr, 8, palignr, a128_8, b128_8, 15)
+ GO2C(alignr, 8, palignr, a128_8, b128_8, 16)
+ GO2C(alignr, 8, palignr, a128_8, b128_8, 0xff)
+ GO1ipd(movemask, movmskpd, a128_64)
+ GO1pd(sqrt, psqrtpd, a128_pd)
+ GO1pd(sqrt, psqrtpd, b128_pd)
+ GO1pd(sqrt, psqrtpd, c128_pd)
+ GO1pd(sqrt, psqrtpd, d128_pd)
+
  return 0;
 }