Added more sse2 opcode to test17, and added nan handling to SQRTSD and MULSD ([DYNAREC] too)

author: ptitSeb <sebastien.chev@gmail.com> 2022-04-01 19:27:27 +0200
committer: ptitSeb <sebastien.chev@gmail.com> 2022-04-01 19:27:27 +0200
commit: f2012fc6365c338b977a6e6a230e1d9d7c750d51 (patch)
tree: 584792a0b26327fdde1da550b1b06e93587a2332 /tests
parent: 16f82ba6b3a447fca0d9d1c56098cc1aace10d2c (diff)
download: box64-f2012fc6365c338b977a6e6a230e1d9d7c750d51.tar.gz
box64-f2012fc6365c338b977a6e6a230e1d9d7c750d51.zip
3 files changed, 245 insertions, 7 deletions
diff --git a/tests/ref17.txt b/tests/ref17.txt
index 3d0e67a2..eb67ae5e 100644
--- a/tests/ref17.txt
+++ b/tests/ref17.txt
@@ -357,3 +357,120 @@ psubq(0xffffffffffffffff 0x8000000000000000 , 0x1 0x8000000000000000 ) = 0xfffff
 paddb(0xff 0x80 0x7f 0x0 0x1 0x2 0x3 0x81 0xfe 0x84 0x72 0x52 0xa5 0x0 0xc0 0x32 , 0x0 0x1 0x5 0x15 0x20 0x80 0xff 0x0 0x8 0x7 0x81 0x6 0xa 0xf 0x10 0x1 ) = 0xff 0x81 0x84 0x15 0x21 0x82 0x2 0x81 0x6 0x8b 0xf3 0x58 0xaf 0xf 0xd0 0x33 
 paddw(0xffff 0x8000 0x7fff 0x0 0x1 0x2 0x3 0x8001 , 0x8000 0x7fff 0xffff 0xffff 0x50 0x9000 0xfffe 0x8001 ) = 0x7fff 0xffff 0x7ffe 0xffff 0x51 0x9002 0x1 0x2 
 paddd(0xffffffff 0x80000000 0x7fffffff 0x0 , 0x1 0x80000000 0x5 0xfffffffe ) = 0x0 0x0 0x80000004 0xfffffffe 
+pmovhlps(1 2 3 -4 , 0 -2 -10 0.5 ) = -10 0.5 3 -4 
+unpcklps(1 2 3 -4 , 0 -2 -10 0.5 ) = 1 0 2 -2 
+unpckhps(1 2 3 -4 , 0 -2 -10 0.5 ) = 3 -10 -4 0.5 
+pmovhps(1 2 3 -4 , 0 -2 -10 0.5 ) = 1 2 0 -2 
+psqrtps(1 2 3 -4 ) = 1 1.41421 1.73205 nan 
+psqrtps(0 -2 -10 0.5 ) = 0 nan nan 0.707107 
+psqrtps(inf -inf -inf 1 ) = inf nan nan 1 
+psqrtps(nan -0 nan inf ) = nan -0 nan inf 
+prcpps(nan -0 nan inf ) = nan -inf nan 0 
+andps(1 2 3 -4 , 0 -2 -10 0.5 ) = 0 2 2 0 
+andps(0 -2 -10 0.5 , inf -inf -inf 1 ) = 0 -2 -8 0.5 
+andps(1 2 3 -4 , nan -0 nan inf ) = 1 0 3 4 
+andps(0 -2 -10 0.5 , nan -0 nan inf ) = 0 -0 -8 0.5 
+andps(inf -inf -inf 1 , nan -0 nan inf ) = inf -0 -inf 1 
+andps(nan -0 nan inf , nan -0 nan inf ) = nan -0 nan inf 
+andnps(1 2 3 -4 , 0 -2 -10 0.5 ) = 0 -0 -2.93874e-38 0.5 
+andnps(0 -2 -10 0.5 , inf -inf -inf 1 ) = inf 1 0.25 1.17549e-38 
+andnps(1 2 3 -4 , nan -0 nan inf ) = 3 -0 -1 0.5 
+andnps(0 -2 -10 0.5 , nan -0 nan inf ) = nan 0 0.375 4 
+andnps(inf -inf -inf 1 , nan -0 nan inf ) = 5.87747e-39 0 5.87747e-39 2 
+andnps(nan -0 nan inf , nan -0 nan inf ) = 0 0 0 0 
+orps(1 2 3 -4 , 0 -2 -10 0.5 ) = 1 -2 -14 -inf 
+orps(0 -2 -10 0.5 , inf -inf -inf 1 ) = inf -inf nan 1 
+orps(1 2 3 -4 , nan -0 nan inf ) = nan -2 nan -inf 
+orps(0 -2 -10 0.5 , nan -0 nan inf ) = nan -2 nan inf 
+orps(inf -inf -inf 1 , nan -0 nan inf ) = nan -inf nan inf 
+orps(nan -0 nan inf , nan -0 nan inf ) = nan -0 nan inf 
+xorps(1 2 3 -4 , 0 -2 -10 0.5 ) = 1 -0 -4.11423e-38 -inf 
+xorps(0 -2 -10 0.5 , inf -inf -inf 1 ) = inf 1 0.3125 1.17549e-38 
+xorps(1 2 3 -4 , nan -0 nan inf ) = 3 -2 -1 -0.5 
+xorps(0 -2 -10 0.5 , nan -0 nan inf ) = nan 2 0.4375 4 
+xorps(inf -inf -inf 1 , nan -0 nan inf ) = 5.87747e-39 inf 5.87747e-39 2 
+xorps(nan -0 nan inf , nan -0 nan inf ) = 0 0 0 0 
+addps(1 2 3 -4 , 0 -2 -10 0.5 ) = 1 0 -7 -3.5 
+addps(0 -2 -10 0.5 , inf -inf -inf 1 ) = inf -inf -inf 1.5 
+addps(1 2 3 -4 , nan -0 nan inf ) = nan 2 nan inf 
+addps(0 -2 -10 0.5 , nan -0 nan inf ) = nan -2 nan inf 
+addps(inf -inf -inf 1 , nan -0 nan inf ) = nan -inf nan inf 
+addps(nan -0 nan inf , nan -0 nan inf ) = nan -0 nan inf 
+mulps(1 2 3 -4 , 0 -2 -10 0.5 ) = 0 -4 -30 -2 
+mulps(0 -2 -10 0.5 , inf -inf -inf 1 ) = nan inf inf 0.5 
+mulps(1 2 3 -4 , nan -0 nan inf ) = nan -0 nan -inf 
+mulps(0 -2 -10 0.5 , nan -0 nan inf ) = nan 0 nan inf 
+mulps(inf -inf -inf 1 , nan -0 nan inf ) = nan nan nan inf 
+mulps(nan -0 nan inf , nan -0 nan inf ) = nan 0 nan inf 
+subps(1 2 3 -4 , 0 -2 -10 0.5 ) = 1 4 13 -4.5 
+subps(0 -2 -10 0.5 , inf -inf -inf 1 ) = -inf inf inf -0.5 
+subps(1 2 3 -4 , nan -0 nan inf ) = nan 2 nan -inf 
+subps(0 -2 -10 0.5 , nan -0 nan inf ) = nan -2 nan -inf 
+subps(inf -inf -inf 1 , nan -0 nan inf ) = nan -inf nan -inf 
+subps(nan -0 nan inf , nan -0 nan inf ) = nan 0 nan nan 
+minps(1 2 3 -4 , 0 -2 -10 0.5 ) = 0 -2 -10 -4 
+minps(0 -2 -10 0.5 , inf -inf -inf 1 ) = 0 -inf -inf 0.5 
+minps(1 2 3 -4 , nan -0 nan inf ) = nan -0 nan -4 
+minps(0 -2 -10 0.5 , nan -0 nan inf ) = nan -2 nan 0.5 
+minps(inf -inf -inf 1 , nan -0 nan inf ) = nan -inf nan 1 
+minps(nan -0 nan inf , nan -0 nan inf ) = nan -0 nan inf 
+divps(1 2 3 -4 , 0 -2 -10 0.5 ) = inf -1 -0.3 -8 
+divps(0 -2 -10 0.5 , inf -inf -inf 1 ) = 0 0 0 0.5 
+divps(1 2 3 -4 , nan -0 nan inf ) = nan -inf nan -0 
+divps(0 -2 -10 0.5 , nan -0 nan inf ) = nan inf nan 0 
+divps(inf -inf -inf 1 , nan -0 nan inf ) = nan inf nan 0 
+divps(nan -0 nan inf , nan -0 nan inf ) = nan nan nan nan 
+maxps(1 2 3 -4 , 0 -2 -10 0.5 ) = 1 2 3 0.5 
+maxps(0 -2 -10 0.5 , inf -inf -inf 1 ) = inf -2 -10 1 
+maxps(1 2 3 -4 , nan -0 nan inf ) = nan 2 nan inf 
+maxps(0 -2 -10 0.5 , nan -0 nan inf ) = nan -0 nan inf 
+maxps(inf -inf -inf 1 , nan -0 nan inf ) = nan -0 nan inf 
+maxps(nan -0 nan inf , nan -0 nan inf ) = nan -0 nan inf 
+shufps(1 2 3 -4 , 0 -2 -10 0.5 , 0) = 1 1 0 0 
+shufps(0 -2 -10 0.5 , inf -inf -inf 1 , 0) = 0 0 inf inf 
+shufps(1 2 3 -4 , nan -0 nan inf , 0) = 1 1 nan nan 
+shufps(0 -2 -10 0.5 , nan -0 nan inf , 0) = 0 0 nan nan 
+shufps(inf -inf -inf 1 , nan -0 nan inf , 0) = inf inf nan nan 
+shufps(nan -0 nan inf , nan -0 nan inf , 0) = nan nan nan nan 
+shufps(1 2 3 -4 , 0 -2 -10 0.5 , 21) = 2 2 -2 0 
+shufps(0 -2 -10 0.5 , inf -inf -inf 1 , 21) = -2 -2 -inf inf 
+shufps(1 2 3 -4 , nan -0 nan inf , 21) = 2 2 -0 nan 
+shufps(0 -2 -10 0.5 , nan -0 nan inf , 21) = -2 -2 -0 nan 
+shufps(inf -inf -inf 1 , nan -0 nan inf , 21) = -inf -inf -0 nan 
+shufps(nan -0 nan inf , nan -0 nan inf , 21) = -0 -0 -0 nan 
+shufps(1 2 3 -4 , 0 -2 -10 0.5 , 255) = -4 -4 0.5 0.5 
+shufps(0 -2 -10 0.5 , inf -inf -inf 1 , 255) = 0.5 0.5 1 1 
+shufps(1 2 3 -4 , nan -0 nan inf , 255) = -4 -4 inf inf 
+shufps(0 -2 -10 0.5 , nan -0 nan inf , 255) = 0.5 0.5 inf inf 
+shufps(inf -inf -inf 1 , nan -0 nan inf , 255) = 1 1 inf inf 
+shufps(nan -0 nan inf , nan -0 nan inf , 255) = inf inf inf inf 
+shufps(1 2 3 -4 , 0 -2 -10 0.5 , 2) = 3 1 0 0 
+shufps(0 -2 -10 0.5 , inf -inf -inf 1 , 2) = -10 0 inf inf 
+shufps(1 2 3 -4 , nan -0 nan inf , 2) = 3 1 nan nan 
+shufps(0 -2 -10 0.5 , nan -0 nan inf , 2) = -10 0 nan nan 
+shufps(inf -inf -inf 1 , nan -0 nan inf , 2) = -inf inf nan nan 
+shufps(nan -0 nan inf , nan -0 nan inf , 2) = nan nan nan nan 
+sqrtsd(1 2 , 1 2 ) = 1 2 
+sqrtsd(1 2 , 0 -2 ) = 0 2 
+sqrtsd(1 2 , inf -inf ) = inf 2 
+sqrtsd(1 2 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 2 
+sqrtsd(1 2 , 2 1 ) = 1.41421 2 
+sqrtsd(1 2 , -2 0 ) = 0xfff8000000000000 2 
+sqrtsd(1 2 , -inf inf ) = 0xfff8000000000000 2 
+sqrtsd(1 2 , -0 0x7ff8000000000000 ) = -0 2 
+addsd(1 2 , 1 2 ) = 2 2 
+addsd(1 2 , 0 -2 ) = 1 2 
+addsd(1 2 , inf -inf ) = inf 2 
+addsd(1 2 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 2 
+addsd(1 2 , 2 1 ) = 3 2 
+addsd(1 2 , -2 0 ) = -1 2 
+addsd(1 2 , -inf inf ) = -inf 2 
+addsd(1 2 , -0 0x7ff8000000000000 ) = 1 2 
+mulsd(1 2 , 1 2 ) = 1 2 
+mulsd(1 2 , 0 -2 ) = 0 2 
+mulsd(1 2 , inf -inf ) = inf 2 
+mulsd(1 2 , 0x7ff8000000000000 -0 ) = 0x7ff8000000000000 2 
+mulsd(1 2 , 2 1 ) = 2 2 
+mulsd(1 2 , -2 0 ) = -2 2 
+mulsd(1 2 , -inf inf ) = -inf 2 
+mulsd(1 2 , -0 0x7ff8000000000000 ) = -0 2 
diff --git a/tests/test17 b/tests/test17
index f5cffaeb..f2632c6b 100755
--- a/tests/test17
+++ b/tests/test17
Binary files differ
diff --git a/tests/test17.c b/tests/test17.c
index 454d5543..0627d8c5 100644
--- a/tests/test17.c
+++ b/tests/test17.c
@@ -116,6 +116,16 @@ const v128 a128_pd = {.d64 = { 1.0, 2.0}};
 const v128 b128_pd = {.d64 = { 0.0, -2.0}};
 const v128 c128_pd = {.d64 = { INFINITY, -INFINITY}};
 const v128 d128_pd = {.d64 = { NAN, -0.0}};
+const v128 a128_ps = {.f32 = { 1.0, 2.0, 3.0, -4.0}};
+const v128 b128_ps = {.f32 = { 0.0, -2.0, -10.0, 0.5}};
+const v128 c128_ps = {.f32 = { INFINITY, -INFINITY, -INFINITY, 1.0}};
+const v128 d128_ps = {.f32 = { NAN, -0.0, -NAN, INFINITY}};
+
+v128 reverse_pd(v128 a) {  // returns a copy of `a` with its two double lanes swapped
+    v128 ret;
+    ret.md = _mm_shuffle_pd(a.md, a.md, 1);  // imm 1: low lane <- a's high lane, high lane <- a's low lane
+    return ret;
+}
 
 void print_8(v128 v) {
     for(int i=0; i<16; ++i)
@@ -148,6 +158,7 @@ void print_pd(v128 v) {
         else
             printf("%g ", v.d64[i]);
 }
+#define print_sd print_pd
 
 int main(int argc, const char** argv)
 {
@@ -301,7 +312,90 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
  printf("%s(", #C); print_pd(A1);                   \
  printf(", "); print_pd(A2);                        \
  printf(") = "); print_pd(a128); printf("\n");
+ #define GO2Cpd(A, C, A1, A2, I)                    \
+ a128.md = _mm_##A##_pd(A1.md, A2.md, I);           \
+ printf("%s(", #C); print_pd(A1);                   \
+ printf(", "); print_pd(A2);                        \
+ printf(", %d) = ", I); print_pd(a128); printf("\n");
+ #define GO1isd(A, C, A1)                           \
+ i = _mm_##A##_sd(A1.md);                           \
+ printf("%s(", #C); print_64(A1);                   \
+ printf(") = 0x%x\n", i);
+ #define GO1sd(A, C, A1)                            \
+ a128.md = _mm_##A##_sd(A1.md);                     \
+ printf("%s(", #C); print_sd(A1);                   \
+ printf(") = "); print_sd(a128); printf("\n");
+ #define GO2sd(A, C, A1, A2)                        \
+ a128.md = _mm_##A##_sd(A1.md, A2.md);              \
+ printf("%s(", #C); print_sd(A1);                   \
+ printf(", "); print_sd(A2);                        \
+ printf(") = "); print_sd(a128); printf("\n");
+ #define GO2Csd(A, C, A1, A2, I)                    \
+ a128.md = _mm_##A##_sd(A1.md, A2.md, I);           \
+ printf("%s(", #C); print_sd(A1);                   \
+ printf(", "); print_sd(A2);                        \
+ printf(", %d) = ", I); print_sd(a128); printf("\n");
+ #define GO1ips(A, C, A1)                           \
+ i = _mm_##A##_ps(A1.mf);                           \
+ printf("%s(", #C); print_32(A1);                   \
+ printf(") = 0x%x\n", i);
+ #define GO1ps(A, C, A1)                            \
+ a128.mf = _mm_##A##_ps(A1.mf);                     \
+ printf("%s(", #C); print_ps(A1);                   \
+ printf(") = "); print_ps(a128); printf("\n");
+ #define GO2ps(A, C, A1, A2)                        \
+ a128.mf = _mm_##A##_ps(A1.mf, A2.mf);              \
+ printf("%s(", #C); print_ps(A1);                   \
+ printf(", "); print_ps(A2);                        \
+ printf(") = "); print_ps(a128); printf("\n");
+ #define GO2Cps(A, C, A1, A2, I)                    \
+ a128.mf = _mm_##A##_ps(A1.mf, A2.mf, I);           \
+ printf("%s(", #C); print_ps(A1);                   \
+ printf(", "); print_ps(A2);                        \
+ printf(", %d) = ", I); print_ps(a128); printf("\n");
  
+ #define MULITGO2pd(A, B)       \
+ GO2pd(A, B, a128_pd, b128_pd)  \
+ GO2pd(A, B, b128_pd, c128_pd)  \
+ GO2pd(A, B, a128_pd, d128_pd)  \
+ GO2pd(A, B, b128_pd, d128_pd)  \
+ GO2pd(A, B, c128_pd, d128_pd)  \
+ GO2pd(A, B, d128_pd, d128_pd)
+
+ #define MULITGO2Cpd(A, B, I)       \
+ GO2Cpd(A, B, a128_pd, b128_pd, I)  \
+ GO2Cpd(A, B, b128_pd, c128_pd, I)  \
+ GO2Cpd(A, B, a128_pd, d128_pd, I)  \
+ GO2Cpd(A, B, b128_pd, d128_pd, I)  \
+ GO2Cpd(A, B, c128_pd, d128_pd, I)  \
+ GO2Cpd(A, B, d128_pd, d128_pd, I)
+
+ #define MULITGO2ps(A, B)       \
+ GO2ps(A, B, a128_ps, b128_ps)  \
+ GO2ps(A, B, b128_ps, c128_ps)  \
+ GO2ps(A, B, a128_ps, d128_ps)  \
+ GO2ps(A, B, b128_ps, d128_ps)  \
+ GO2ps(A, B, c128_ps, d128_ps)  \
+ GO2ps(A, B, d128_ps, d128_ps)
+
+ #define MULITGO2Cps(A, B, I)       \
+ GO2Cps(A, B, a128_ps, b128_ps, I)  \
+ GO2Cps(A, B, b128_ps, c128_ps, I)  \
+ GO2Cps(A, B, a128_ps, d128_ps, I)  \
+ GO2Cps(A, B, b128_ps, d128_ps, I)  \
+ GO2Cps(A, B, c128_ps, d128_ps, I)  \
+ GO2Cps(A, B, d128_ps, d128_ps, I)
+
+ #define MULTIGO2sd(A, B)                   \
+ GO2sd(A, B, a128_pd, a128_pd)              \
+ GO2sd(A, B, a128_pd, b128_pd)              \
+ GO2sd(A, B, a128_pd, c128_pd)              \
+ GO2sd(A, B, a128_pd, d128_pd)              \
+ GO2sd(A, B, a128_pd, reverse_pd(a128_pd))  \
+ GO2sd(A, B, a128_pd, reverse_pd(b128_pd))  \
+ GO2sd(A, B, a128_pd, reverse_pd(c128_pd))  \
+ GO2sd(A, B, a128_pd, reverse_pd(d128_pd))
+
 
  GO2(shuffle, 8, pshufb, a128_8, b128_8)
  GO2(hadd, 16, phaddw, a128_16, b128_16)
@@ -349,13 +443,6 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
  GO1pd(sqrt, psqrtpd, b128_pd)
  GO1pd(sqrt, psqrtpd, c128_pd)
  GO1pd(sqrt, psqrtpd, d128_pd)
- #define MULITGO2pd(A, B)       \
- GO2pd(A, B, a128_pd, b128_pd)  \
- GO2pd(A, B, b128_pd, c128_pd)  \
- GO2pd(A, B, a128_pd, d128_pd)  \
- GO2pd(A, B, b128_pd, d128_pd)  \
- GO2pd(A, B, c128_pd, d128_pd)  \
- GO2pd(A, B, d128_pd, d128_pd)
  MULITGO2pd(and, andpd)
  MULITGO2pd(andnot, andnpd)
  MULITGO2pd(or, orpd)
@@ -460,6 +547,40 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
  GO2(add, 8, paddb, a128_8, b128_8)
  GO2(add, 16, paddw, a128_16, b128_16)
  GO2(add, 32, paddd, a128_32, b128_32)
+ GO2ps(movehl, pmovhlps, a128_ps, b128_ps)
+ GO2ps(unpacklo, unpcklps, a128_ps, b128_ps)
+ GO2ps(unpackhi, unpckhps, a128_ps, b128_ps)
+ GO2ps(movelh, pmovhps, a128_ps, b128_ps)
+ GO1ps(sqrt, psqrtps, a128_ps)
+ GO1ps(sqrt, psqrtps, b128_ps)
+ GO1ps(sqrt, psqrtps, c128_ps)
+ GO1ps(sqrt, psqrtps, d128_ps)
+ //GO1ps(rsqrt, prsqrtps, a128_ps)  // difference in precision
+ //GO1ps(rsqrt, prsqrtps, b128_ps)  // same
+ //GO1ps(rsqrt, prsqrtps, c128_ps)  // same
+ //GO1ps(rsqrt, prsqrtps, d128_ps)  // difference in the handling of NAN, (-)0, and INF in Dynarec
+ //GO1ps(rcp, prcpps, a128_ps)      // difference in precision
+ //GO1ps(rcp, prcpps, b128_ps)      // difference in precision
+ //GO1ps(rcp, prcpps, c128_ps)      // difference in precision
+ GO1ps(rcp, prcpps, d128_ps)
+ MULITGO2ps(and, andps)
+ MULITGO2ps(andnot, andnps)
+ MULITGO2ps(or, orps)
+ MULITGO2ps(xor, xorps)
+ MULITGO2ps(add, addps)
+ MULITGO2ps(mul, mulps)
+ MULITGO2ps(sub, subps)
+ MULITGO2ps(min, minps)
+ MULITGO2ps(div, divps)
+ MULITGO2ps(max, maxps)
+// MULITGO2Cps(cmp, cmpps, 0)   // use avx for some reason
+ MULITGO2Cps(shuffle, shufps, 0)
+ MULITGO2Cps(shuffle, shufps, 0x15)
+ MULITGO2Cps(shuffle, shufps, 0xff)
+ MULITGO2Cps(shuffle, shufps, 0x02)
+ MULTIGO2sd(sqrt, sqrtsd)
+ MULTIGO2sd(add, addsd)
+ MULTIGO2sd(mul, mulsd)
 
  return 0;
 }
author	ptitSeb <sebastien.chev@gmail.com>	2022-04-01 19:27:27 +0200
committer	ptitSeb <sebastien.chev@gmail.com>	2022-04-01 19:27:27 +0200
commit	f2012fc6365c338b977a6e6a230e1d9d7c750d51 (patch)
tree	584792a0b26327fdde1da550b1b06e93587a2332 /tests
parent	16f82ba6b3a447fca0d9d1c56098cc1aace10d2c (diff)
download	box64-f2012fc6365c338b977a6e6a230e1d9d7c750d51.tar.gz box64-f2012fc6365c338b977a6e6a230e1d9d7c750d51.zip