about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author ptitSeb <sebastien.chev@gmail.com> 2021-11-07 15:06:55 +0100
committer ptitSeb <sebastien.chev@gmail.com> 2021-11-07 15:06:55 +0100
commit f5c9439f9110fcb4fde3c7db9f68be7c107b17f6 (patch)
tree f0094dbbd242c7bc4526d61c2a1d48717f00324e
parent f73e4193f032713529e26c4537e5fac44147c040 (diff)
download box64-f5c9439f9110fcb4fde3c7db9f68be7c107b17f6.tar.gz
box64-f5c9439f9110fcb4fde3c7db9f68be7c107b17f6.zip
Improved CMPSS/CMPSD opcodes, improved test17 ([DYNAREC] too, and improved MINSS/MAXSS/MINSD/MAXSD too)
-rwxr-xr-x src/dynarec/dynarec_arm64_f20f.c 30
-rwxr-xr-x src/dynarec/dynarec_arm64_f30f.c 28
-rw-r--r-- src/emu/x64runf20f.c 6
-rw-r--r-- src/emu/x64runf30f.c 6
-rw-r--r-- tests/ref17.txt 166
-rwxr-xr-x tests/test17 bin 19976 -> 29264 bytes
-rw-r--r-- tests/test17.c 204
7 files changed, 374 insertions, 66 deletions
diff --git a/src/dynarec/dynarec_arm64_f20f.c b/src/dynarec/dynarec_arm64_f20f.c
index e9a75a7d..e179b2ac 100755
--- a/src/dynarec/dynarec_arm64_f20f.c
+++ b/src/dynarec/dynarec_arm64_f20f.c
@@ -49,7 +49,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
     uint8_t gd, ed;

     uint8_t wback;

     uint8_t u8;

-    uint64_t u64;

+    uint64_t u64, j64;

     int v0, v1;

     int q0;

     int d0, d1;

@@ -214,9 +214,15 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             v0 = sse_get_reg(dyn, ninst, x1, gd);

             GETEX(v1, 0);

             // MINSD: if any input is NaN, or Ex[0]<Gx[0], copy Ex[0] -> Gx[0]

+            #if 0

             d0 = fpu_get_scratch(dyn);

             FMINNMD(d0, v0, v1);    // NaN handling may be slightly different, is that a problem?

             VMOVeD(v0, 0, d0, 0);   // to not erase uper part

+            #else

+            FCMPD(v0, v1);

+            B_NEXT(cLS);    //Less than or equal

+            VMOVeD(v0, 0, v1, 0);   // to not erase uper part

+            #endif

             break;

         case 0x5E:

             INST_NAME("DIVSD Gx, Ex");

@@ -234,9 +240,15 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             v0 = sse_get_reg(dyn, ninst, x1, gd);

             GETEX(v1, 0);

             // MAXSD: if any input is NaN, or Ex[0]>Gx[0], copy Ex[0] -> Gx[0]

+            #if 0

             d0 = fpu_get_scratch(dyn);

             FMAXNMD(d0, v0, v1);    // NaN handling may be slightly different, is that a problem?

             VMOVeD(v0, 0, d0, 0);   // to not erase uper part

+            #else

+            FCMPD(v0, v1);

+            B_NEXT(cGE);    //Greater than or equal

+            VMOVeD(v0, 0, v1, 0);   // to not erase uper part

+            #endif

             break;

 

         case 0x70:

@@ -282,19 +294,15 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETGX(v0);

             GETEX(v1, 1);

             u8 = F8;

-            if((u8&7)==6){

-                FCMPD(v1, v0);

-            } else {

-                FCMPD(v0, v1);

-            }

+            FCMPD(v0, v1);

             switch(u8&7) {

-                case 0: CSETMx(x2, cEQ); CSELx(x2, xZR, x2, cVS); break;   // Equal

-                case 1: CSETMx(x2, cMI); CSELx(x2, xZR, x2, cVS); break;   // Less than

-                case 2: CSETMx(x2, cLE); CSELx(x2, xZR, x2, cVS); break;   // Less or equal

+                case 0: CSETMx(x2, cEQ); break;   // Equal

+                case 1: CSETMx(x2, cCC); break;   // Less than

+                case 2: CSETMx(x2, cLS); break;   // Less or equal

                 case 3: CSETMx(x2, cVS); break;   // NaN

-                case 4: CSETMx(x2, cNE); break;   // Not Equal

+                case 4: CSETMx(x2, cNE); break;   // Not Equal or unordered

                 case 5: CSETMx(x2, cCS); break;   // Greater or equal or unordered

-                case 6: CSETMx(x2, cLT); break;   // Greater or unordered, test inverted, N!=V so unordered or less than (inverted)

+                case 6: CSETMx(x2, cHI); break;   // Greater or unordered, test inverted, N!=V so unordered or less than (inverted)

                 case 7: CSETMx(x2, cVC); break;   // not NaN

             }

             VMOVQDfrom(v0, 0, x2);

diff --git a/src/dynarec/dynarec_arm64_f30f.c b/src/dynarec/dynarec_arm64_f30f.c
index 8c87b790..d47a99c5 100755
--- a/src/dynarec/dynarec_arm64_f30f.c
+++ b/src/dynarec/dynarec_arm64_f30f.c
@@ -259,9 +259,15 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETGX(v0);

             GETEX(v1, 0);

             // MINSS: if any input is NaN, or Ex[0]<Gx[0], copy Ex[0] -> Gx[0]

+            #if 0

             d0 = fpu_get_scratch(dyn);

             FMINNMS(d0, v0, v1);    // NaN handling may be slightly different, is that a problem?

             VMOVeS(v0, 0, d0, 0);   // to not erase uper part

+            #else

+            FCMPS(v0, v1);

+            B_NEXT(cLS);    //Less than or equal

+            VMOVeS(v0, 0, v1, 0);   // to not erase uper part

+            #endif

             break;

         case 0x5E:

             INST_NAME("DIVSS Gx, Ex");

@@ -278,9 +284,15 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETGX(v0);

             GETEX(v1, 0);

             // MAXSS: if any input is NaN, or Ex[0]>Gx[0], copy Ex[0] -> Gx[0]

+            #if 0

             d0 = fpu_get_scratch(dyn);

             FMAXNMS(d0, v0, v1);    // NaN handling may be slightly different, is that a problem?

             VMOVeS(v0, 0, d0, 0);   // to not erase uper part

+            #else

+            FCMPS(v0, v1);

+            B_NEXT(cGE);    //Greater than or equal

+            VMOVeS(v0, 0, v1, 0);   // to not erase uper part

+            #endif

             break;

             

         case 0x6F:

@@ -384,19 +396,15 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETGX(v0);

             GETEX(v1, 1);

             u8 = F8;

-            if((u8&7)==6){

-                FCMPS(v1, v0);

-            } else {

-                FCMPS(v0, v1);

-            }

+            FCMPS(v0, v1);

             switch(u8&7) {

-                case 0: CSETMw(x2, cEQ); CSELw(x2, xZR, x2, cVS); break;   // Equal

-                case 1: CSETMw(x2, cMI); CSELw(x2, xZR, x2, cVS); break;   // Less than

-                case 2: CSETMw(x2, cLE); CSELw(x2, xZR, x2, cVS); break;   // Less or equal

+                case 0: CSETMw(x2, cEQ); break;   // Equal

+                case 1: CSETMw(x2, cCC); break;   // Less than

+                case 2: CSETMw(x2, cLS); break;   // Less or equal

                 case 3: CSETMw(x2, cVS); break;   // NaN

-                case 4: CSETMw(x2, cNE); break;   // Not Equal

+                case 4: CSETMw(x2, cNE); break;   // Not Equal or unordered

                 case 5: CSETMw(x2, cCS); break;   // Greater or equal or unordered

-                case 6: CSETMw(x2, cLT); break;   // Greater or unordered, test inverted, N!=V so unordered or less than (inverted)

+                case 6: CSETMw(x2, cHI); break;   // Greater or unordered, test inverted, N!=V so unordered or less than (inverted)

                 case 7: CSETMw(x2, cVC); break;   // not NaN

             }

             VMOVQSfrom(v0, 0, x2);

diff --git a/src/emu/x64runf20f.c b/src/emu/x64runf20f.c
index 7bf3c99e..eff4ca5f 100644
--- a/src/emu/x64runf20f.c
+++ b/src/emu/x64runf20f.c
@@ -218,10 +218,10 @@ int RunF20F(x64emu_t *emu, rex_t rex)
         tmp8s = 0;

         switch(tmp8u&7) {

             case 0: tmp8s=(GX->d[0] == EX->d[0]); break;

-            case 1: tmp8s=isless(GX->d[0], EX->d[0]); break;

-            case 2: tmp8s=islessequal(GX->d[0], EX->d[0]); break;

+            case 1: tmp8s=isless(GX->d[0], EX->d[0]) && !(isnan(GX->d[0]) || isnan(EX->d[0])); break;

+            case 2: tmp8s=islessequal(GX->d[0], EX->d[0]) && !(isnan(GX->d[0]) || isnan(EX->d[0])); break;

             case 3: tmp8s=isnan(GX->d[0]) || isnan(EX->d[0]); break;

-            case 4: tmp8s=(GX->d[0] != EX->d[0]); break;

+            case 4: tmp8s=isnan(GX->d[0]) || isnan(EX->d[0]) || (GX->d[0] != EX->d[0]); break;

             case 5: tmp8s=isnan(GX->d[0]) || isnan(EX->d[0]) || isgreaterequal(GX->d[0], EX->d[0]); break;

             case 6: tmp8s=isnan(GX->d[0]) || isnan(EX->d[0]) || isgreater(GX->d[0], EX->d[0]); break;

             case 7: tmp8s=!isnan(GX->d[0]) && !isnan(EX->d[0]); break;

diff --git a/src/emu/x64runf30f.c b/src/emu/x64runf30f.c
index 1960238e..7cc7add5 100644
--- a/src/emu/x64runf30f.c
+++ b/src/emu/x64runf30f.c
@@ -318,10 +318,10 @@ int RunF30F(x64emu_t *emu, rex_t rex)
         tmp8s = 0;

         switch(tmp8u&7) {

             case 0: tmp8s=(GX->f[0] == EX->f[0]); break;

-            case 1: tmp8s=isless(GX->f[0], EX->f[0]); break;

-            case 2: tmp8s=islessequal(GX->f[0], EX->f[0]); break;

+            case 1: tmp8s=isless(GX->f[0], EX->f[0]) && !(isnan(GX->f[0]) || isnan(EX->f[0])); break;

+            case 2: tmp8s=islessequal(GX->f[0], EX->f[0]) && !(isnan(GX->f[0]) || isnan(EX->f[0])); break;

             case 3: tmp8s=isnan(GX->f[0]) || isnan(EX->f[0]); break;

-            case 4: tmp8s=(GX->f[0] != EX->f[0]); break;

+            case 4: tmp8s=isnan(GX->f[0]) || isnan(EX->f[0]) || (GX->f[0] != EX->f[0]); break;

             case 5: tmp8s=isnan(GX->f[0]) || isnan(EX->f[0]) || isgreaterequal(GX->f[0], EX->f[0]); break;

             case 6: tmp8s=isnan(GX->f[0]) || isnan(EX->f[0]) || isgreater(GX->f[0], EX->f[0]); break;

             case 7: tmp8s=!isnan(GX->f[0]) && !isnan(EX->f[0]); break;

diff --git a/tests/ref17.txt b/tests/ref17.txt
index 349d50d0..9715adf0 100644
--- a/tests/ref17.txt
+++ b/tests/ref17.txt
@@ -1,12 +1,154 @@
-div 1, 1 => 1 / 0
-div 10, 5 => 2 / 0
-div 10, 3 => 3 / 1
-div 1, 18446744073709551615 => 0 / 1
-div 10, 18446744073709551613 => 0 / 10
-div 18446744073709551606, 18446744073709551613 => 0 / 18446744073709551606
-idiv 1, 1 => 1 / 0
-idiv 10, 5 => 2 / 0
-idiv 10, 3 => 3 / 1
-idiv 1, -1 => -1 / 0
-idiv 10, -3 => -3 / 1
-idiv -10, -3 => 3 / -1
+ucomiss 1.000000, 2.000000 => 0x202
+ucomiss 2.000000, 1.000000 => 0x203
+ucomiss 1.000000, inf => 0x202
+ucomiss inf, 1.000000 => 0x203
+ucomiss 1.000000, -inf => 0x203
+ucomiss -inf, 1.000000 => 0x202
+ucomiss 1.000000, nan => 0x247
+ucomiss nan, 1.000000 => 0x247
+ucomiss 1.000000, 1.000000 => 0x242
+ucomiss 1.000000, 1.000000 => 0x242
+ucomiss inf, inf => 0x242
+ucomiss -inf, inf => 0x202
+ucomiss inf, -inf => 0x203
+ucomiss nan, nan => 0x247
+minss 1, 2 => 1
+minss 2, 1 => 1
+minss -inf, 2 => -inf
+minss 2, -inf => -inf
+minss inf, 2 => 2
+minss 2, inf => 2
+minss nan, 2 => 2
+minss 2, nan => nan
+minss nan, 3.40282e+38 => 3.40282e+38
+minss 3.40282e+38, nan => nan
+minss -inf, 3.40282e+38 => -inf
+minss 3.40282e+38, -inf => -inf
+minss inf, 3.40282e+38 => 3.40282e+38
+minss 3.40282e+38, inf => 3.40282e+38
+maxss 1, 2 => 2
+maxss 2, 1 => 2
+maxss -inf, 2 => 2
+maxss 2, -inf => 2
+maxss inf, 2 => inf
+maxss 2, inf => inf
+maxss nan, 2 => 2
+maxss 2, nan => nan
+maxss nan, 3.40282e+38 => 3.40282e+38
+maxss 3.40282e+38, nan => nan
+maxss -inf, 3.40282e+38 => 3.40282e+38
+maxss 3.40282e+38, -inf => 3.40282e+38
+maxss inf, 3.40282e+38 => inf
+maxss 3.40282e+38, inf => inf
+cmpss 0 1.000000, 2.000000 => 0x0
+cmpss 0 2.000000, 1.000000 => 0x0
+cmpss 0 1.000000, inf => 0x0
+cmpss 0 inf, 1.000000 => 0x0
+cmpss 0 1.000000, -inf => 0x0
+cmpss 0 -inf, 1.000000 => 0x0
+cmpss 0 1.000000, nan => 0x0
+cmpss 0 nan, 1.000000 => 0x0
+cmpss 0 1.000000, 1.000000 => 0xffffffff
+cmpss 0 1.000000, 1.000000 => 0xffffffff
+cmpss 0 inf, inf => 0xffffffff
+cmpss 0 -inf, inf => 0x0
+cmpss 0 inf, -inf => 0x0
+cmpss 0 nan, nan => 0x0
+cmpss 1 1.000000, 2.000000 => 0xffffffff
+cmpss 1 2.000000, 1.000000 => 0x0
+cmpss 1 1.000000, inf => 0xffffffff
+cmpss 1 inf, 1.000000 => 0x0
+cmpss 1 1.000000, -inf => 0x0
+cmpss 1 -inf, 1.000000 => 0xffffffff
+cmpss 1 1.000000, nan => 0x0
+cmpss 1 nan, 1.000000 => 0x0
+cmpss 1 1.000000, 1.000000 => 0x0
+cmpss 1 1.000000, 1.000000 => 0x0
+cmpss 1 inf, inf => 0x0
+cmpss 1 -inf, inf => 0xffffffff
+cmpss 1 inf, -inf => 0x0
+cmpss 1 nan, nan => 0x0
+cmpss 2 1.000000, 2.000000 => 0xffffffff
+cmpss 2 2.000000, 1.000000 => 0x0
+cmpss 2 1.000000, inf => 0xffffffff
+cmpss 2 inf, 1.000000 => 0x0
+cmpss 2 1.000000, -inf => 0x0
+cmpss 2 -inf, 1.000000 => 0xffffffff
+cmpss 2 1.000000, nan => 0x0
+cmpss 2 nan, 1.000000 => 0x0
+cmpss 2 1.000000, 1.000000 => 0xffffffff
+cmpss 2 1.000000, 1.000000 => 0xffffffff
+cmpss 2 inf, inf => 0xffffffff
+cmpss 2 -inf, inf => 0xffffffff
+cmpss 2 inf, -inf => 0x0
+cmpss 2 nan, nan => 0x0
+cmpss 3 1.000000, 2.000000 => 0x0
+cmpss 3 2.000000, 1.000000 => 0x0
+cmpss 3 1.000000, inf => 0x0
+cmpss 3 inf, 1.000000 => 0x0
+cmpss 3 1.000000, -inf => 0x0
+cmpss 3 -inf, 1.000000 => 0x0
+cmpss 3 1.000000, nan => 0xffffffff
+cmpss 3 nan, 1.000000 => 0xffffffff
+cmpss 3 1.000000, 1.000000 => 0x0
+cmpss 3 1.000000, 1.000000 => 0x0
+cmpss 3 inf, inf => 0x0
+cmpss 3 -inf, inf => 0x0
+cmpss 3 inf, -inf => 0x0
+cmpss 3 nan, nan => 0xffffffff
+cmpss 4 1.000000, 2.000000 => 0xffffffff
+cmpss 4 2.000000, 1.000000 => 0xffffffff
+cmpss 4 1.000000, inf => 0xffffffff
+cmpss 4 inf, 1.000000 => 0xffffffff
+cmpss 4 1.000000, -inf => 0xffffffff
+cmpss 4 -inf, 1.000000 => 0xffffffff
+cmpss 4 1.000000, nan => 0xffffffff
+cmpss 4 nan, 1.000000 => 0xffffffff
+cmpss 4 1.000000, 1.000000 => 0x0
+cmpss 4 1.000000, 1.000000 => 0x0
+cmpss 4 inf, inf => 0x0
+cmpss 4 -inf, inf => 0xffffffff
+cmpss 4 inf, -inf => 0xffffffff
+cmpss 4 nan, nan => 0xffffffff
+cmpss 5 1.000000, 2.000000 => 0x0
+cmpss 5 2.000000, 1.000000 => 0xffffffff
+cmpss 5 1.000000, inf => 0x0
+cmpss 5 inf, 1.000000 => 0xffffffff
+cmpss 5 1.000000, -inf => 0xffffffff
+cmpss 5 -inf, 1.000000 => 0x0
+cmpss 5 1.000000, nan => 0xffffffff
+cmpss 5 nan, 1.000000 => 0xffffffff
+cmpss 5 1.000000, 1.000000 => 0xffffffff
+cmpss 5 1.000000, 1.000000 => 0xffffffff
+cmpss 5 inf, inf => 0xffffffff
+cmpss 5 -inf, inf => 0x0
+cmpss 5 inf, -inf => 0xffffffff
+cmpss 5 nan, nan => 0xffffffff
+cmpss 6 1.000000, 2.000000 => 0x0
+cmpss 6 2.000000, 1.000000 => 0xffffffff
+cmpss 6 1.000000, inf => 0x0
+cmpss 6 inf, 1.000000 => 0xffffffff
+cmpss 6 1.000000, -inf => 0xffffffff
+cmpss 6 -inf, 1.000000 => 0x0
+cmpss 6 1.000000, nan => 0xffffffff
+cmpss 6 nan, 1.000000 => 0xffffffff
+cmpss 6 1.000000, 1.000000 => 0x0
+cmpss 6 1.000000, 1.000000 => 0x0
+cmpss 6 inf, inf => 0x0
+cmpss 6 -inf, inf => 0x0
+cmpss 6 inf, -inf => 0xffffffff
+cmpss 6 nan, nan => 0xffffffff
+cmpss 7 1.000000, 2.000000 => 0xffffffff
+cmpss 7 2.000000, 1.000000 => 0xffffffff
+cmpss 7 1.000000, inf => 0xffffffff
+cmpss 7 inf, 1.000000 => 0xffffffff
+cmpss 7 1.000000, -inf => 0xffffffff
+cmpss 7 -inf, 1.000000 => 0xffffffff
+cmpss 7 1.000000, nan => 0x0
+cmpss 7 nan, 1.000000 => 0x0
+cmpss 7 1.000000, 1.000000 => 0xffffffff
+cmpss 7 1.000000, 1.000000 => 0xffffffff
+cmpss 7 inf, inf => 0xffffffff
+cmpss 7 -inf, inf => 0xffffffff
+cmpss 7 inf, -inf => 0xffffffff
+cmpss 7 nan, nan => 0x0
diff --git a/tests/test17 b/tests/test17
index bcfd74f8..e56e6a46 100755
--- a/tests/test17
+++ b/tests/test17
Binary files differ
diff --git a/tests/test17.c b/tests/test17.c
index 799c1368..735e861d 100644
--- a/tests/test17.c
+++ b/tests/test17.c
@@ -6,45 +6,195 @@
 #include <math.h>
 
 #if defined(__x86_64__)
-uint64_t _div_(uint64_t a, uint64_t b, uint64_t *r)
+uint64_t _ucomiss_(float a, float b)
 {
-    uint64_t ret, rem;
+    uint64_t ret;
     asm volatile (
-    "xor %%rdx, %%rdx\n"
-    "div %%rcx\n"
-    "mov %%rdx, %%rbx\n"
-    :"=a" (ret), "=b" (rem):"a" (a), "c" (b):"rdx","cc");
-    *r = rem;
+    "ucomiss %%xmm0, %%xmm1\n"
+    "pushf\n"
+    "pop %%rax"
+    :"=a" (ret)::"xmm0","xmm1","cc");
     return ret;
 }
-uint64_t _idiv_(uint64_t a, uint64_t b, uint64_t *r)
+uint64_t _minss_(float a, float b)
 {
-    uint64_t ret, rem;
+    uint64_t ret;
     asm volatile (
-    "cqo\n"
-    "idiv %%rcx\n"
-    "mov %%rdx, %%rbx\n"
-    :"=a" (ret), "=b" (rem):"a" (a), "c" (b):"rdx","cc");
-    *r = rem;
+    "minss %%xmm1, %%xmm0\n"
+    "movd %%xmm0, %%eax"
+    :"=a" (ret)::"xmm0","xmm1","cc");
     return ret;
 }
+uint64_t _maxss_(float a, float b)
+{
+    uint64_t ret;
+    asm volatile (
+    "maxss %%xmm1, %%xmm0\n"
+    "movd %%xmm0, %%eax"
+    :"=a" (ret)::"xmm0","xmm1","cc");
+    return ret;
+}
+#define CMPSS(A)                        \
+uint64_t _cmpss_##A(float a, float b)   \
+{                                       \
+    uint64_t ret;                       \
+    asm volatile (                      \
+    "cmpss $" #A ", %%xmm1, %%xmm0\n"   \
+    "movd %%xmm0, %%eax"                \
+    :"=a" (ret)::"xmm0","xmm1","cc");   \
+    return ret;                         \
+}
 #else
+uint64_t _ucomiss_(float a, float b)
+{
+    uint32_t ret;
+    asm volatile (
+    "movss %1, %%xmm0\n"
+    "movss %2, %%xmm1\n"
+    "ucomiss %%xmm0, %%xmm1\n"
+    "pushf\n"
+    "pop %%eax"
+    :"=a" (ret):"m"(a), "m"(b):"xmm0", "xmm1", "cc");
+    return ret;
+}
+uint64_t _minss_(float a, float b)
+{
+    uint32_t ret;
+    asm volatile (
+    "movss %1, %%xmm0\n"
+    "movss %2, %%xmm1\n"
+    "minss %%xmm1, %%xmm0\n"
+    "movd %%xmm0, %%eax"
+    :"=a" (ret):"m"(a), "m"(b):"xmm0", "xmm1", "cc");
+    return ret;
+}
+uint64_t _maxss_(float a, float b)
+{
+    uint32_t ret;
+    asm volatile (
+    "movss %1, %%xmm0\n"
+    "movss %2, %%xmm1\n"
+    "maxss %%xmm1, %%xmm0\n"
+    "movd %%xmm0, %%eax"
+    :"=a" (ret):"m"(a), "m"(b):"xmm0", "xmm1", "cc");
+    return ret;
+}
+#define CMPSS(A)                                            \
+uint64_t _cmpss_##A(float a, float b)                       \
+{                                                           \
+    uint32_t ret;                                           \
+    asm volatile (                                          \
+    "movss %1, %%xmm0\n"                                    \
+    "movss %2, %%xmm1\n"                                    \
+    "cmpss $" #A ", %%xmm1, %%xmm0\n"                       \
+    "movd %%xmm0, %%eax"                                    \
+    :"=a" (ret):"m"(a), "m"(b):"xmm0", "xmm1", "cc");       \
+    return ret;                                             \
+}
 #endif
+CMPSS(0)
+CMPSS(1)
+CMPSS(2)
+CMPSS(3)
+CMPSS(4)
+CMPSS(5)
+CMPSS(6)
+CMPSS(7)
 
 int main(int argc, const char** argv)
 {
-  uint64_t datas[][2] = {{1,1},{10,5},{10,3},{1, (uint64_t)-1}, {10, (uint64_t)-3}, {(uint64_t)-10, (uint64_t)-3}};
+ float a, b;
+ uint64_t flags;
+ uint32_t maxf = 0x7f7fffff;
+ uint32_t minf = 0xff7fffff;
+ uint32_t r;
+
+#define GO1(A, N)                                   \
+a = 1.0f; b = 2.0f;                                 \
+flags = A(a, b);                                    \
+printf(N " %f, %f => 0x%lx\n", a, b, flags);        \
+flags = A(b, a);                                    \
+printf(N " %f, %f => 0x%lx\n", b, a, flags);        \
+b = INFINITY;                                       \
+flags = A(a, b);                                    \
+printf(N " %f, %f => 0x%lx\n", a, b, flags);        \
+flags = A(b, a);                                    \
+printf(N " %f, %f => 0x%lx\n", b, a, flags);        \
+b = -INFINITY;                                      \
+flags = A(a, b);                                    \
+printf(N " %f, %f => 0x%lx\n", a, b, flags);        \
+flags = A(b, a);                                    \
+printf(N " %f, %f => 0x%lx\n", b, a, flags);        \
+b = NAN;                                            \
+flags = A(a, b);                                    \
+printf(N " %f, %f => 0x%lx\n", a, b, flags);        \
+flags = A(b, a);                                    \
+printf(N " %f, %f => 0x%lx\n", b, a, flags);        \
+b = a;                                              \
+flags = A(a, b);                                    \
+printf(N " %f, %f => 0x%lx\n", a, b, flags);        \
+flags = A(b, a);                                    \
+printf(N " %f, %f => 0x%lx\n", b, a, flags);        \
+a = b = INFINITY;                                   \
+flags = A(a, b);                                    \
+printf(N " %f, %f => 0x%lx\n", a, b, flags);        \
+a = -INFINITY;                                      \
+flags = A(a, b);                                    \
+printf(N " %f, %f => 0x%lx\n", a, b, flags);        \
+flags = A(b, a);                                    \
+printf(N " %f, %f => 0x%lx\n", b, a, flags);        \
+a = b = NAN;                                        \
+flags = A(a, b);                                    \
+printf(N " %f, %f => 0x%lx\n", a, b, flags);
+
+#define GO2(A, N)                               \
+a = 1.0f; b = 2.0f;                             \
+r = A(a, b);                                    \
+printf(N " %g, %g => %g\n", a, b, *(float*)&r); \
+r = A(b, a);                                    \
+printf(N " %g, %g => %g\n", b, a, *(float*)&r); \
+a = -INFINITY;                                  \
+r = A(a, b);                                    \
+printf(N " %g, %g => %g\n", a, b, *(float*)&r); \
+r = A(b, a);                                    \
+printf(N " %g, %g => %g\n", b, a, *(float*)&r); \
+a = +INFINITY;                                  \
+r = A(a, b);                                    \
+printf(N " %g, %g => %g\n", a, b, *(float*)&r); \
+r = A(b, a);                                    \
+printf(N " %g, %g => %g\n", b, a, *(float*)&r); \
+a = NAN;                                        \
+r = A(a, b);                                    \
+printf(N " %g, %g => %g\n", a, b, *(float*)&r); \
+r = A(b, a);                                    \
+printf(N " %g, %g => %g\n", b, a, *(float*)&r); \
+b = *(float*)&maxf;                             \
+r = A(a, b);                                    \
+printf(N " %g, %g => %g\n", a, b, *(float*)&r); \
+r = A(b, a);                                    \
+printf(N " %g, %g => %g\n", b, a, *(float*)&r); \
+a = -INFINITY;                                  \
+r = A(a, b);                                    \
+printf(N " %g, %g => %g\n", a, b, *(float*)&r); \
+r = A(b, a);                                    \
+printf(N " %g, %g => %g\n", b, a, *(float*)&r); \
+a = +INFINITY;                                  \
+r = A(a, b);                                    \
+printf(N " %g, %g => %g\n", a, b, *(float*)&r); \
+r = A(b, a);                                    \
+printf(N " %g, %g => %g\n", b, a, *(float*)&r);
 
-  int sz = sizeof(datas)/sizeof(datas[0]);
-  for(int i=0; i<sz; ++i) {
-   uint64_t rem = 0;
-   uint64_t d = _div_(datas[i][0], datas[i][1], &rem);
-   printf("div %llu, %llu => %llu / %llu\n", datas[i][0], datas[i][1], d, rem);
- }
-  for(int i=0; i<sz; ++i) {
-   uint64_t rem = 0;
-   uint64_t d = _idiv_(datas[i][0], datas[i][1], &rem);
-   printf("idiv %lld, %lld => %lld / %lld\n", datas[i][0], datas[i][1], d, rem);
- }
-  return 0;
+ GO1(_ucomiss_, "ucomiss")
+ GO2(_minss_, "minss")
+ GO2(_maxss_, "maxss")
+ GO1(_cmpss_0, "cmpss 0")
+ GO1(_cmpss_1, "cmpss 1")
+ GO1(_cmpss_2, "cmpss 2")
+ GO1(_cmpss_3, "cmpss 3")
+ GO1(_cmpss_4, "cmpss 4")
+ GO1(_cmpss_5, "cmpss 5")
+ GO1(_cmpss_6, "cmpss 6")
+ GO1(_cmpss_7, "cmpss 7")
+ 
+ return 0;
 }