author     Hagb (Junyu Guo 郭俊余) <hagb_green@qq.com>    2025-01-08 22:34:15 +0800
committer  GitHub <noreply@github.com>                    2025-01-08 15:34:15 +0100
commit     653a67c8addcb980ce10a27765e582c972f8d69c (patch)
tree       98ac1036f34b2810a1f41a6126827445f7774fd4
parent     b99893d1c3506524103eebc2e4497a8be14cd6d0 (diff)
download   box64-653a67c8addcb980ce10a27765e582c972f8d69c.tar.gz
           box64-653a67c8addcb980ce10a27765e582c972f8d69c.zip
Port rounding of some x87 instructions from Box86 (#2242)
* Port rounding of some x87 instructions from Box86

Ported from https://github.com/ptitSeb/box86/pull/951. The original pull
request and this commit also contain some improvements to the precision of
`F2XM1` and `FYL2XP1`.

* Run fpu_rounding test with dynarec only for ARM64

The rounding handling has been implemented in the dynarec only for ARM64.
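
The precision improvements mentioned above come from computing `F2XM1` and `FYL2XP1` with `expm1` and `log1p` instead of `exp2(x)-1` and `log2(x+1)` (see the emulator changes further down). A minimal standalone C sketch, not part of the commit, illustrating the loss of precision that the change avoids; only standard libm functions are used:

    /* Standalone illustration (not from the commit): for tiny x, exp2(x) is
     * rounded to a value just above 1.0, so most information about 2^x - 1 is
     * already gone before the subtraction, while expm1(M_LN2 * x) computes
     * 2^x - 1 without going through 1 + tiny. Build with -lm. */
    #define _GNU_SOURCE
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double x = 0x1p-40;                 /* a tiny ST0, as F2XM1 might see */
        double naive  = exp2(x) - 1.0;      /* only ~12 significant bits survive */
        double better = expm1(M_LN2 * x);   /* the approach used by the port */
        printf("naive  = %a\nbetter = %a\n", naive, better);
        return 0;
    }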
-rw-r--r--  CMakeLists.txt                             11
-rw-r--r--  docs/USAGE.md                               4
-rw-r--r--  docs/box64.pod                              4
-rw-r--r--  src/core.c                                  2
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_d8.c       49
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_d9.c       48
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_da.c       25
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_dc.c       49
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_de.c       25
-rw-r--r--  src/dynarec/dynarec_native_functions.c     16
-rw-r--r--  src/emu/x64rund8.c                          8
-rw-r--r--  src/emu/x64rund9.c                         47
-rw-r--r--  src/emu/x64runda.c                          7
-rw-r--r--  src/emu/x64rundb.c                          8
-rw-r--r--  src/emu/x64rundc.c                          7
-rw-r--r--  src/emu/x64runde.c                          8
-rw-r--r--  src/emu/x87emu_setround.h                  29
-rw-r--r--  tests/ref31.txt                           810
-rw-r--r--  tests/roundtest.h                         114
-rwxr-xr-x  tests/test31                              bin 0 -> 81928 bytes
-rw-r--r--  tests/test31.c                            114
21 files changed, 1361 insertions, 24 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f15e1921..b59e7bfc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1497,6 +1497,17 @@ add_test(avx_intrinsics ${CMAKE_COMMAND} -D TEST_PROGRAM=${CMAKE_BINARY_DIR}/${B
 
 set_tests_properties(avx_intrinsics PROPERTIES ENVIRONMENT "BOX64_DYNAREC_FASTNAN=0;BOX64_DYNAREC_FASTROUND=0;BOX64_AVX=2")
 
+add_test(fpu_rounding ${CMAKE_COMMAND} -D TEST_PROGRAM=${CMAKE_BINARY_DIR}/${BOX64}
+    -D TEST_ARGS=${CMAKE_SOURCE_DIR}/tests/test31 -D TEST_OUTPUT=tmpfile31.txt
+    -D TEST_REFERENCE=${CMAKE_SOURCE_DIR}/tests/ref31.txt
+    -P ${CMAKE_SOURCE_DIR}/runTest.cmake )
+
+if(ARM_DYNAREC)
+set_tests_properties(fpu_rounding PROPERTIES ENVIRONMENT "BOX64_DYNAREC_FASTROUND=0;BOX64_DYNAREC_TEST=1")
+else()
+set_tests_properties(fpu_rounding PROPERTIES ENVIRONMENT "BOX64_DYNAREC=0")
+endif()
+
 else()
 
 add_test(bootSyscall ${CMAKE_COMMAND} -D TEST_PROGRAM=${CMAKE_BINARY_DIR}/${BOX64}
diff --git a/docs/USAGE.md b/docs/USAGE.md
index e74c11ad..03eef42e 100644
--- a/docs/USAGE.md
+++ b/docs/USAGE.md
@@ -368,8 +368,8 @@ Generate x86 -NAN.
 #### BOX64_DYNAREC_FASTROUND *
 
 Generate precise x86 rounding.
- * 0 : Generate float/double -> int rounding like on x86.
- * 1 : Do not do anything special with edge case Rounding, to go as fast as possible (no INF/NAN/Overflow -> MIN_INT conversion). (default, faster)
+ * 0 : Generate float/double -> int rounding and use current rounding mode for float/double computation like on x86.
+ * 1 : Do not do anything special with edge case Rounding, to go as fast as possible (no INF/NAN/Overflow -> MIN_INT conversion, and no non-default rounding modes). (default, faster)
  * 2 : Everything from 1 plus also fast round of double->float (not taking into account current rounding mode).
 
 #### BOX64_DYNAREC_SAFEFLAGS *
diff --git a/docs/box64.pod b/docs/box64.pod
index e12f7c0a..59813110 100644
--- a/docs/box64.pod
+++ b/docs/box64.pod
@@ -335,8 +335,8 @@ Enable/Disable generation of -NAN
 
 Enable/Disable generation of precise x86 rounding
 
-    * 0 : Generate float/double -> int rounding like on x86
-    * 1 : Don't do anything special with edge case Rounding, to go as fast as possible (no INF/NAN/Overflow -> MIN_INT conversion) (default, faster)
+    * 0 : Generate float/double -> int rounding and use current rounding mode for float/double computation like on x86
+    * 1 : Don't do anything special with edge case Rounding, to go as fast as possible (no INF/NAN/Overflow -> MIN_INT conversion, and no non-default rounding modes) (default, faster)
 
 =item B<BOX64_DYNAREC_SAFEFLAGS>=I<0|1|2>
 
diff --git a/src/core.c b/src/core.c
index a39fa795..c7dc95c3 100644
--- a/src/core.c
+++ b/src/core.c
@@ -847,7 +847,7 @@ void LoadLogEnv()
                 box64_dynarec_fastround = p[0]-'0';
         }
         if(!box64_dynarec_fastround)
-            printf_log(LOG_INFO, "Dynarec will try to generate x86 precise IEEE->int rounding\n");
+            printf_log(LOG_INFO, "Dynarec will try to generate x86 precise IEEE->int rounding and set rounding mode for computation\n");
         else if(box64_dynarec_fastround==2)
             printf_log(LOG_INFO, "Dynarec will generate x86 very imprecise double->float rounding\n");
     }
diff --git a/src/dynarec/arm64/dynarec_arm64_d8.c b/src/dynarec/arm64/dynarec_arm64_d8.c
index e8f002a4..826cad7d 100644
--- a/src/dynarec/arm64/dynarec_arm64_d8.c
+++ b/src/dynarec/arm64/dynarec_arm64_d8.c
@@ -29,6 +29,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
 
     uint8_t nextop = F8;
     uint8_t ed;
+    uint8_t u8;
     int64_t fixedaddress;
     int unscaled;
     int v1, v2;
@@ -51,11 +52,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FADD ST0, STx");
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FADDS(v1, v1, v2);
             } else {
                 FADDD(v1, v1, v2);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         case 0xC8:
         case 0xC9:
@@ -68,11 +73,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FMUL ST0, STx");
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FMULS(v1, v1, v2);
             } else {
                 FMULD(v1, v1, v2);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         case 0xD0:
         case 0xD1:
@@ -122,11 +131,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FSUB ST0, STx");
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FSUBS(v1, v1, v2);
             } else {
                 FSUBD(v1, v1, v2);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         case 0xE8:
         case 0xE9:
@@ -139,11 +152,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FSUBR ST0, STx");
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FSUBS(v1, v2, v1);
             } else {
                 FSUBD(v1, v2, v1);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         case 0xF0:
         case 0xF1:
@@ -156,11 +173,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FDIV ST0, STx");
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FDIVS(v1, v1, v2);
             } else {
                 FDIVD(v1, v1, v2);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         case 0xF8:
         case 0xF9:
@@ -173,11 +194,15 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FDIVR ST0, STx");
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FDIVS(v1, v2, v1);
             } else {
                 FDIVD(v1, v2, v1);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         default:
             DEFAULT;
@@ -190,12 +215,16 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x5, x4);
                 if(ST_IS_F(0)) {
                     FADDS(v1, v1, s0);
                 } else {
                     FCVT_D_S(s0, s0);
                     FADDD(v1, v1, s0);
                 }
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 1:
                 INST_NAME("FMUL ST0, float[ED]");
@@ -203,12 +232,16 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x5, x4);
                 if(ST_IS_F(0)) {
                     FMULS(v1, v1, s0);
                 } else {
                     FCVT_D_S(s0, s0);
                     FMULD(v1, v1, s0);
                 }
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 2:
                 INST_NAME("FCOM ST0, float[ED]");
@@ -245,12 +278,16 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x5, x4);
                 if(ST_IS_F(0)) {
                     FSUBS(v1, v1, s0);
                 } else {
                     FCVT_D_S(s0, s0);
                     FSUBD(v1, v1, s0);
                 }
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 5:
                 INST_NAME("FSUBR ST0, float[ED]");
@@ -258,12 +295,16 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x5, x4);
                 if(ST_IS_F(0)) {
                     FSUBS(v1, s0, v1);
                 } else {
                     FCVT_D_S(s0, s0);
                     FSUBD(v1, s0, v1);
                 }
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 6:
                 INST_NAME("FDIV ST0, float[ED]");
@@ -271,12 +312,16 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x5, x4);
                 if(ST_IS_F(0)) {
                     FDIVS(v1, v1, s0);
                 } else {
                     FCVT_D_S(s0, s0);
                     FDIVD(v1, v1, s0);
                 }
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 7:
                 INST_NAME("FDIVR ST0, float[ED]");
@@ -284,12 +329,16 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x5, x4);
                 if(ST_IS_F(0)) {
                     FDIVS(v1, s0, v1);
                 } else {
                     FCVT_D_S(s0, s0);
                     FDIVD(v1, s0, v1);
                 }
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             default:
                 DEFAULT;
diff --git a/src/dynarec/arm64/dynarec_arm64_d9.c b/src/dynarec/arm64/dynarec_arm64_d9.c
index 378070a0..c1961a01 100644
--- a/src/dynarec/arm64/dynarec_arm64_d9.c
+++ b/src/dynarec/arm64/dynarec_arm64_d9.c
@@ -291,8 +291,12 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             i1 = x87_stackcount(dyn, ninst, x1);
             x87_forget(dyn, ninst, x1, x2, 0);
-            CALL(native_ftan, -1);
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
+            CALL_(native_ftan, -1, box64_dynarec_fastround ? 0 : u8);
             x87_unstackcount(dyn, ninst, x1, i1);
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             if(PK(0)==0xdd && PK(1)==0xd8) {
                 MESSAGE(LOG_DUMP, "Optimized next DD D8 fstp st0, st0, not emitting 1\n");
                 u8 = F8;
@@ -312,7 +316,11 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             i1 = x87_stackcount(dyn, ninst, x1);
             x87_forget(dyn, ninst, x1, x2, 0);
             x87_forget(dyn, ninst, x1, x2, 1);
-            CALL(native_fpatan, -1);
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
+            CALL_(native_fpatan, -1, box64_dynarec_fastround ? 0 : u8);
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             x87_unstackcount(dyn, ninst, x1, i1);
             X87_POP_OR_FAIL(dyn, ninst, x3);
             break;
@@ -418,11 +426,15 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
         case 0xFA:
             INST_NAME("FSQRT");
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FSQRTS(v1, v1);
             } else {
                 FSQRTD(v1, v1);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         case 0xFB:
             INST_NAME("FSINCOS");
@@ -430,7 +442,11 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, 0);
             i1 = x87_stackcount(dyn, ninst, x1);
             x87_forget(dyn, ninst, x1, x2, 1);
-            CALL(native_fsincos, -1);
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
+            CALL_(native_fsincos, -1, box64_dynarec_fastround ? 0 : u8);
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             x87_unstackcount(dyn, ninst, x1, i1);
             break;
         case 0xFC:
@@ -457,7 +473,11 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             i1 = x87_stackcount(dyn, ninst, x1);
             x87_forget(dyn, ninst, x1, x2, 0);
             x87_forget(dyn, ninst, x1, x2, 1);
-            CALL(native_fscale, -1);
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
+            CALL_(native_fscale, -1, box64_dynarec_fastround ? 0 : u8);
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             x87_unstackcount(dyn, ninst, x1, i1);
             break;
         case 0xFE:
@@ -465,7 +485,11 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             i1 = x87_stackcount(dyn, ninst, x1);
             x87_forget(dyn, ninst, x1, x2, 0);
-            CALL(native_fsin, -1);
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
+            CALL_(native_fsin, -1, box64_dynarec_fastround ? 0 : u8);
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             x87_unstackcount(dyn, ninst, x1, i1);
             break;
         case 0xFF:
@@ -473,7 +497,11 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             MESSAGE(LOG_DUMP, "Need Optimization\n");
             i1 = x87_stackcount(dyn, ninst, x1);
             x87_forget(dyn, ninst, x1, x2, 0);
-            CALL(native_fcos, -1);
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
+            CALL_(native_fcos, -1, box64_dynarec_fastround ? 0 : u8);
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             x87_unstackcount(dyn, ninst, x1, i1);
             break;
         default:
@@ -497,7 +525,11 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     s0 = v1;
                 else {
                     s0 = fpu_get_scratch(dyn, ninst);
+                    if(!box64_dynarec_fastround)
+                        u8 = x87_setround(dyn, ninst, x1, x2, x4);
                     FCVT_S_D(s0, v1);
+                    if(!box64_dynarec_fastround)
+                        x87_restoreround(dyn, ninst, u8);
                 }
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VST32(s0, ed, fixedaddress);
@@ -507,7 +539,11 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_F);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 if(!ST_IS_F(0)) {
+                    if(!box64_dynarec_fastround)
+                        u8 = x87_setround(dyn, ninst, x1, x5, x4);
                     FCVT_S_D(v1, v1);
+                    if(!box64_dynarec_fastround)
+                        x87_restoreround(dyn, ninst, u8);
                 }
                 VST32(v1, ed, fixedaddress);
                 X87_POP_OR_FAIL(dyn, ninst, x3);
diff --git a/src/dynarec/arm64/dynarec_arm64_da.c b/src/dynarec/arm64/dynarec_arm64_da.c
index 52965b3f..6e4bb528 100644
--- a/src/dynarec/arm64/dynarec_arm64_da.c
+++ b/src/dynarec/arm64/dynarec_arm64_da.c
@@ -29,6 +29,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
     int64_t j64;
     uint8_t ed;
     uint8_t wback;
+    uint8_t u8;
     int v1, v2;
     int d0;
     int s0;
@@ -148,7 +149,11 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
                 SCVTFDD(v2, v2);    // i64 -> double
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x5, x4);
                 FADDD(v1, v1, v2);
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 1:
                 INST_NAME("FIMUL ST0, Ed");
@@ -158,7 +163,11 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
                 SCVTFDD(v2, v2);    // i64 -> double
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x5, x4);
                 FMULD(v1, v1, v2);
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 2:
                 INST_NAME("FICOM ST0, Ed");
@@ -191,7 +200,11 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
                 SCVTFDD(v2, v2);    // i64 -> double
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x5, x4);
                 FSUBD(v1, v1, v2);
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 5:
                 INST_NAME("FISUBR ST0, Ed");
@@ -201,7 +214,11 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
                 SCVTFDD(v2, v2);    // i64 -> double
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x5, x4);
                 FSUBD(v1, v2, v1);
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 6:
                 INST_NAME("FIDIV ST0, Ed");
@@ -211,7 +228,11 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
                 SCVTFDD(v2, v2);    // i64 -> double
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x5, x4);
                 FDIVD(v1, v1, v2);
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 7:
                 INST_NAME("FIDIVR ST0, Ed");
@@ -221,7 +242,11 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
                 SCVTFDD(v2, v2);    // i64 -> double
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x5, x4);
                 FDIVD(v1, v2, v1);
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
         }
     return addr;
diff --git a/src/dynarec/arm64/dynarec_arm64_dc.c b/src/dynarec/arm64/dynarec_arm64_dc.c
index ee5b2c62..2fbac9d8 100644
--- a/src/dynarec/arm64/dynarec_arm64_dc.c
+++ b/src/dynarec/arm64/dynarec_arm64_dc.c
@@ -29,6 +29,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
 
     uint8_t nextop = F8;
     uint8_t wback;
+    uint8_t u8;
     int64_t fixedaddress;
     int unscaled;
     int v1, v2;
@@ -49,11 +50,15 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FADD STx, ST0");
             v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FADDS(v1, v1, v2);
             } else {
                 FADDD(v1, v1, v2);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         case 0xC8:
         case 0xC9:
@@ -66,11 +71,15 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FMUL STx, ST0");
             v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FMULS(v1, v1, v2);
             } else {
                 FMULD(v1, v1, v2);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         case 0xD0:
         case 0xD1:
@@ -120,11 +129,15 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FSUBR STx, ST0");
             v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FSUBS(v1, v2, v1);
             } else {
                 FSUBD(v1, v2, v1);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         case 0xE8:
         case 0xE9:
@@ -137,11 +150,15 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FSUB STx, ST0");
             v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FSUBS(v1, v1, v2);
             } else {
                 FSUBD(v1, v1, v2);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         case 0xF0:
         case 0xF1:
@@ -154,11 +171,15 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FDIVR STx, ST0");
             v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FDIVS(v1, v2, v1);
             } else {
                 FDIVD(v1, v2, v1);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         case 0xF8:
         case 0xF9:
@@ -171,11 +192,15 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FDIV STx, ST0");
             v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FDIVS(v1, v1, v2);
             } else {
                 FDIVD(v1, v1, v2);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             break;
         default:
             DEFAULT;
@@ -188,7 +213,11 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x2, x4);
                 FADDD(v1, v1, v2);
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 1:
                 INST_NAME("FMUL ST0, double[ED]");
@@ -196,7 +225,11 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x2, x4);
                 FMULD(v1, v1, v2);
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 2:
                 INST_NAME("FCOM ST0, double[ED]");
@@ -223,7 +256,11 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x2, x4);
                 FSUBD(v1, v1, v2);
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 5:
                 INST_NAME("FSUBR ST0, double[ED]");
@@ -231,7 +268,11 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x2, x4);
                 FSUBD(v1, v2, v1);
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 6:
                 INST_NAME("FDIV ST0, double[ED]");
@@ -239,7 +280,11 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x2, x4);
                 FDIVD(v1, v1, v2);
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
             case 7:
                 INST_NAME("FDIVR ST0, double[ED]");
@@ -247,7 +292,11 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
+                if(!box64_dynarec_fastround)
+                    u8 = x87_setround(dyn, ninst, x1, x2, x4);
                 FDIVD(v1, v2, v1);
+                if(!box64_dynarec_fastround)
+                    x87_restoreround(dyn, ninst, u8);
                 break;
         }
     return addr;
diff --git a/src/dynarec/arm64/dynarec_arm64_de.c b/src/dynarec/arm64/dynarec_arm64_de.c
index 7dbbb210..8dcc16c2 100644
--- a/src/dynarec/arm64/dynarec_arm64_de.c
+++ b/src/dynarec/arm64/dynarec_arm64_de.c
@@ -29,6 +29,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
 
     uint8_t nextop = F8;
     uint8_t wback;
+    uint8_t u8;
     int64_t fixedaddress;
     int unscaled;
     int v1, v2;
@@ -49,11 +50,15 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FADDP STx, ST0");
             v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FADDS(v1, v1, v2);
             } else {
                 FADDD(v1, v1, v2);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             X87_POP_OR_FAIL(dyn, ninst, x3);
             break;
         case 0xC8:
@@ -67,11 +72,15 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FMULP STx, ST0");
             v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FMULS(v1, v1, v2);
             } else {
                 FMULD(v1, v1, v2);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             X87_POP_OR_FAIL(dyn, ninst, x3);
             break;
         case 0xD0:
@@ -117,11 +126,15 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FSUBRP STx, ST0");
             v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FSUBS(v1, v2, v1);
             } else {
                 FSUBD(v1, v2, v1);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             X87_POP_OR_FAIL(dyn, ninst, x3);
             break;
         case 0xE8:
@@ -135,11 +148,15 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FSUBP STx, ST0");
             v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FSUBS(v1, v1, v2);
             } else {
                 FSUBD(v1, v1, v2);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             X87_POP_OR_FAIL(dyn, ninst, x3);
             break;
         case 0xF0:
@@ -153,11 +170,15 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FDIVRP STx, ST0");
             v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FDIVS(v1, v2, v1);
             } else {
                 FDIVD(v1, v2, v1);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             X87_POP_OR_FAIL(dyn, ninst, x3);
             break;
         case 0xF8:
@@ -171,11 +192,15 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("FDIVP STx, ST0");
             v2 = x87_get_st(dyn, ninst, x1, x2, 0, X87_COMBINE(0, nextop&7));
             v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7, X87_COMBINE(0, nextop&7));
+            if(!box64_dynarec_fastround)
+                u8 = x87_setround(dyn, ninst, x1, x2, x4);
             if(ST_IS_F(0)) {
                 FDIVS(v1, v1, v2);
             } else {
                 FDIVD(v1, v1, v2);
             }
+            if(!box64_dynarec_fastround)
+                x87_restoreround(dyn, ninst, u8);
             X87_POP_OR_FAIL(dyn, ninst, x3);
             break;
         default:
diff --git a/src/dynarec/dynarec_native_functions.c b/src/dynarec/dynarec_native_functions.c
index 683d8256..18082d82 100644
--- a/src/dynarec/dynarec_native_functions.c
+++ b/src/dynarec/dynarec_native_functions.c
@@ -42,7 +42,7 @@ void native_print_armreg(x64emu_t* emu, uintptr_t reg, uintptr_t n)
 
 void native_f2xm1(x64emu_t* emu)
 {
-    ST0.d = exp2(ST0.d) - 1.0;
+    ST0.d = expm1(LN2 * ST0.d);
 }
 void native_fyl2x(x64emu_t* emu)
 {
@@ -50,11 +50,14 @@ void native_fyl2x(x64emu_t* emu)
 }
 void native_ftan(x64emu_t* emu)
 {
+#pragma STDC FENV_ACCESS ON
+    // seems that glibc's tan doesn't follow the rounding direction mode
     ST0.d = tan(ST0.d);
     emu->sw.f.F87_C2 = 0;
 }
 void native_fpatan(x64emu_t* emu)
 {
+#pragma STDC FENV_ACCESS ON
     ST1.d = atan2(ST1.d, ST0.d);
 }
 void native_fxtract(x64emu_t* emu)
@@ -97,10 +100,12 @@ void native_fprem(x64emu_t* emu)
 }
 void native_fyl2xp1(x64emu_t* emu)
 {
-    ST(1).d = log2(ST0.d + 1.0)*ST(1).d;
+    ST(1).d = log1p(ST0.d)*ST(1).d/LN2;
 }
 void native_fsincos(x64emu_t* emu)
 {
+#pragma STDC FENV_ACCESS ON
+    // seems that glibc's sincos doesn't follow the rounding direction mode
     sincos(ST1.d, &ST1.d, &ST0.d);
     emu->sw.f.F87_C2 = 0;
 }
@@ -110,16 +115,21 @@ void native_frndint(x64emu_t* emu)
 }
 void native_fscale(x64emu_t* emu)
 {
+#pragma STDC FENV_ACCESS ON
     if(ST0.d!=0.0)
-        ST0.d *= exp2(trunc(ST1.d));
+        ST0.d = ldexp(ST0.d, trunc(ST1.d));
 }
 void native_fsin(x64emu_t* emu)
 {
+#pragma STDC FENV_ACCESS ON
+    // seems that glibc's sin doesn't follow the rounding direction mode
     ST0.d = sin(ST0.d);
     emu->sw.f.F87_C2 = 0;
 }
 void native_fcos(x64emu_t* emu)
 {
+#pragma STDC FENV_ACCESS ON
+    // seems that glibc's cos doesn't follow the rounding direction mode
     ST0.d = cos(ST0.d);
     emu->sw.f.F87_C2 = 0;
 }
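The native_fscale change above swaps `ST0.d *= exp2(trunc(ST1.d))` for `ldexp`. A minimal standalone sketch (not from the commit, standard libm only) of why this matters: `exp2()` of a large exponent overflows to infinity even when the final scaled value is representable, while `ldexp()` adjusts the exponent field directly:

    /* Standalone illustration (not from the commit): scaling by a large power
     * of two. exp2(1500) overflows to +inf, so the product is inf, while
     * ldexp() yields the representable result 0x1p+500. Build with -lm. */
    #define _GNU_SOURCE
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double st0 = 0x1p-1000, st1 = 1500.0;
        double via_exp2  = st0 * exp2(trunc(st1));       /* inf */
        double via_ldexp = ldexp(st0, (int)trunc(st1));  /* 0x1p+500 */
        printf("exp2 : %a\nldexp: %a\n", via_exp2, via_ldexp);
        return 0;
    }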
diff --git a/src/emu/x64rund8.c b/src/emu/x64rund8.c
index 000eab21..fd5ccf66 100644
--- a/src/emu/x64rund8.c
+++ b/src/emu/x64rund8.c
@@ -1,4 +1,5 @@
 #define _GNU_SOURCE
+#include <fenv.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -17,6 +18,7 @@
 #include "x64primop.h"
 #include "x64trace.h"
 #include "x87emu_private.h"
+#include "x87emu_setround.h"
 #include "box64context.h"
 #include "bridge.h"
 
@@ -34,6 +36,7 @@ uintptr_t RunD8(x64emu_t *emu, rex_t rex, uintptr_t addr, uintptr_t offs)
     #ifdef TEST_INTERPRETER
     x64emu_t*emu = test->emu;
     #endif
+    int oldround = fpu_setround(emu);
 
     nextop = F8;
     if(MODREG)
@@ -121,6 +124,7 @@ uintptr_t RunD8(x64emu_t *emu, rex_t rex, uintptr_t addr, uintptr_t offs)
             ST0.d = ST(nextop&7).d / ST0.d;
             break;
         default:
+            fesetround(oldround);
             return 0;
     } else
         switch((nextop>>3)&7) {
@@ -190,7 +194,9 @@ uintptr_t RunD8(x64emu_t *emu, rex_t rex, uintptr_t addr, uintptr_t offs)
                 ST0.d = *(float*)ED / ST0.d;
                 break;
             default:
+                fesetround(oldround);
                 return 0;
         }
-   return addr;
+    fesetround(oldround);
+    return addr;
 }
\ No newline at end of file
diff --git a/src/emu/x64rund9.c b/src/emu/x64rund9.c
index 648eb0a7..c7b8d308 100644
--- a/src/emu/x64rund9.c
+++ b/src/emu/x64rund9.c
@@ -1,4 +1,5 @@
 #define _GNU_SOURCE
+#include <fenv.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -17,6 +18,7 @@
 #include "x64primop.h"
 #include "x64trace.h"
 #include "x87emu_private.h"
+#include "x87emu_setround.h"
 #include "box64context.h"
 #include "bridge.h"
 
@@ -33,6 +35,7 @@ uintptr_t RunD9(x64emu_t *emu, rex_t rex, uintptr_t addr, uintptr_t offs)
     uint64_t ll;
     float f;
     reg64_t *oped;
+    int oldround;
     #ifdef TEST_INTERPRETER
     x64emu_t*emu = test->emu;
     #endif
@@ -123,20 +126,30 @@ uintptr_t RunD9(x64emu_t *emu, rex_t rex, uintptr_t addr, uintptr_t offs)
             break;
 
         case 0xF0:  /* F2XM1 */
-            ST0.d = exp2(ST0.d) - 1.0;
+            if (ST0.d == 0)
+                break;
+            // Using expm1 instead of exp2(ST0)-1 avoids losing much precision,
+            // especially when ST0 is close to zero (where subtracting 1 loses precision).
+            // printf("%a, %a\n", LN2 * ST0.d, expm1(LN2 * ST0.d));
+            ST0.d = expm1(LN2 * ST0.d);
+            //    = 2^ST0 - 1 + error. (in math)
             break;
         case 0xF1:  /* FYL2X */
             ST(1).d *= log2(ST0.d);
             fpu_do_pop(emu);
             break;
         case 0xF2:  /* FPTAN */
+            oldround = fpu_setround(emu);
             ST0.d = tan(ST0.d);
+            fesetround(oldround);
            fpu_do_push(emu);
             ST0.d = 1.0;
             emu->sw.f.F87_C2 = 0;
             break;
         case 0xF3:  /* FPATAN */
+            oldround = fpu_setround(emu);
             ST1.d = atan2(ST1.d, ST0.d);
+            fesetround(oldround);
             fpu_do_pop(emu);
             break;
         case 0xF4:  /* FXTRACT */
@@ -209,15 +222,22 @@ uintptr_t RunD9(x64emu_t *emu, rex_t rex, uintptr_t addr, uintptr_t offs)
                 emu->top=(emu->top+1)&7;    // this will probably break a few things
             break;
         case 0xF9:  /* FYL2XP1 */
-            ST(1).d *= log2(ST0.d + 1.0);
+            // Using log1p instead of log2(ST0+1) avoids losing much precision,
+            // especially when ST0 is close to zero (where adding 1 loses precision).
+            ST(1).d = (ST(1).d * log1p(ST0.d)) / M_LN2;
+            //      = ST1 * log2(ST0 + 1) + error. (in math)
             fpu_do_pop(emu);
             break;
         case 0xFA:  /* FSQRT */
+            oldround = fpu_setround(emu);
             ST0.d = sqrt(ST0.d);
+            fesetround(oldround);
             break;
         case 0xFB:  /* FSINCOS */
             fpu_do_push(emu);
+            oldround = fpu_setround(emu);
             sincos(ST1.d, &ST1.d, &ST0.d);
+            fesetround(oldround);
             emu->sw.f.F87_C2 = 0;
             break;
         case 0xFC:  /* FRNDINT */
@@ -225,15 +245,28 @@ uintptr_t RunD9(x64emu_t *emu, rex_t rex, uintptr_t addr, uintptr_t offs)
             break;
         case 0xFD:  /* FSCALE */
             // this could probably be done by just altering the exponant part of the float...
-            if(ST0.d!=0.0)
-                ST0.d *= exp2(trunc(ST1.d));
+            if (ST1.d > INT32_MAX)
+                tmp32s = INT32_MAX;
+            else if (ST1.d < INT32_MIN)
+                tmp32s = INT32_MIN;
+            else
+                tmp32s = ST1.d;
+            if(ST0.d!=0.0) {
+                oldround = fpu_setround(emu);
+                ST0.d = ldexp(ST0.d, tmp32s);
+                fesetround(oldround);
+            }
             break;
         case 0xFE:  /* FSIN */
+            oldround = fpu_setround(emu);
             ST0.d = sin(ST0.d);
+            fesetround(oldround);
             emu->sw.f.F87_C2 = 0;
             break;
         case 0xFF:  /* FCOS */
+            oldround = fpu_setround(emu);
             ST0.d = cos(ST0.d);
+            fesetround(oldround);
             emu->sw.f.F87_C2 = 0;
             break;
 
@@ -257,7 +290,9 @@ uintptr_t RunD9(x64emu_t *emu, rex_t rex, uintptr_t addr, uintptr_t offs)
                 } else {
                     GETE4(0);
                 }
+                oldround = fpu_setround(emu);
                 *(float*)ED = ST0.d;
+                fesetround(oldround);
                 break;
             case 3:     /* FSTP Ed, ST0 */
                 if(offs) {
@@ -265,7 +300,9 @@ uintptr_t RunD9(x64emu_t *emu, rex_t rex, uintptr_t addr, uintptr_t offs)
                 } else {
                     GETE4(0);
                 }
+                oldround = fpu_setround(emu);
                 *(float*)ED = ST0.d;
+                fesetround(oldround);
                 fpu_do_pop(emu);
                 break;
             case 4:     /* FLDENV m */
@@ -309,5 +346,5 @@ uintptr_t RunD9(x64emu_t *emu, rex_t rex, uintptr_t addr, uintptr_t offs)
             default:
                 return 0;
         }
-   return addr;
+    return addr;
 }
\ No newline at end of file
diff --git a/src/emu/x64runda.c b/src/emu/x64runda.c
index aed775f9..056937e7 100644
--- a/src/emu/x64runda.c
+++ b/src/emu/x64runda.c
@@ -1,4 +1,5 @@
 #define _GNU_SOURCE
+#include <fenv.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -17,6 +18,7 @@
 #include "x64primop.h"
 #include "x64trace.h"
 #include "x87emu_private.h"
+#include "x87emu_setround.h"
 #include "box64context.h"
 #include "bridge.h"
 
@@ -94,7 +96,8 @@ uintptr_t RunDA(x64emu_t *emu, rex_t rex, uintptr_t addr)
 
     default:
         return 0;
-    } else
+    } else {
+        int oldround = fpu_setround(emu);
         switch((nextop>>3)&7) {
             case 0:     /* FIADD ST0, Ed int */
                 GETE4(0);
@@ -130,5 +133,7 @@ uintptr_t RunDA(x64emu_t *emu, rex_t rex, uintptr_t addr)
                 ST0.d = (double)ED->sdword[0] / ST0.d;
                 break;
         }
+        fesetround(oldround);
+    }
    return addr;
 }
\ No newline at end of file
diff --git a/src/emu/x64rundb.c b/src/emu/x64rundb.c
index 82ea43ff..b6ba0ff1 100644
--- a/src/emu/x64rundb.c
+++ b/src/emu/x64rundb.c
@@ -1,4 +1,5 @@
 #define _GNU_SOURCE
+#include <fenv.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -17,6 +18,7 @@
 #include "x64primop.h"
 #include "x64trace.h"
 #include "x87emu_private.h"
+#include "x87emu_setround.h"
 #include "box64context.h"
 #include "bridge.h"
 
@@ -35,6 +37,7 @@ uintptr_t RunDB(x64emu_t *emu, rex_t rex, uintptr_t addr)
     x64emu_t*emu = test->emu;
     #endif
 
+    int oldround = fpu_setround(emu);
     nextop = F8;
     if(MODREG)
     switch(nextop) {
@@ -128,6 +131,7 @@ uintptr_t RunDB(x64emu_t *emu, rex_t rex, uintptr_t addr)
         break;
 
     default:
+        fesetround(oldround);
         return 0;
     } else
         switch((nextop>>3)&7) {
@@ -179,7 +183,9 @@ uintptr_t RunDB(x64emu_t *emu, rex_t rex, uintptr_t addr)
                 fpu_do_pop(emu);
                 break;
             default:
+                fesetround(oldround);
                 return 0;
         }
-  return addr;
+    fesetround(oldround);
+    return addr;
 }
diff --git a/src/emu/x64rundc.c b/src/emu/x64rundc.c
index 6d9ff07b..c9ffe738 100644
--- a/src/emu/x64rundc.c
+++ b/src/emu/x64rundc.c
@@ -17,6 +17,7 @@
 #include "x64primop.h"
 #include "x64trace.h"
 #include "x87emu_private.h"
+#include "x87emu_setround.h"
 #include "box64context.h"
 #include "bridge.h"
 
@@ -34,6 +35,7 @@ uintptr_t RunDC(x64emu_t *emu, rex_t rex, uintptr_t addr)
     x64emu_t*emu = test->emu;
     #endif
 
+    int oldround = fpu_setround(emu);
     nextop = F8;
     if(MODREG)
     switch(nextop) {
@@ -119,6 +121,7 @@ uintptr_t RunDC(x64emu_t *emu, rex_t rex, uintptr_t addr)
             ST(nextop&7).d /=  ST0.d;
             break;
         default:
+            fesetround(oldround);
             return 0;
     } else {
             GETE8(0);
@@ -149,8 +152,10 @@ uintptr_t RunDC(x64emu_t *emu, rex_t rex, uintptr_t addr)
                 ST0.d = *(double*)ED / ST0.d;
                 break;
            default:
+                fesetround(oldround);
                 return 0;
         }
     }
-  return addr;
+    fesetround(oldround);
+    return addr;
 }
diff --git a/src/emu/x64runde.c b/src/emu/x64runde.c
index 4cb5e4a2..8b082aa5 100644
--- a/src/emu/x64runde.c
+++ b/src/emu/x64runde.c
@@ -1,4 +1,5 @@
 #define _GNU_SOURCE
+#include <fenv.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -17,6 +18,7 @@
 #include "x64primop.h"
 #include "x64trace.h"
 #include "x87emu_private.h"
+#include "x87emu_setround.h"
 #include "box64context.h"
 #include "bridge.h"
 
@@ -34,6 +36,7 @@ uintptr_t RunDE(x64emu_t *emu, rex_t rex, uintptr_t addr)
     x64emu_t*emu = test->emu;
     #endif
 
+    int oldround = fpu_setround(emu);
     nextop = F8;
     if(MODREG)
     switch(nextop) {
@@ -126,6 +129,7 @@ uintptr_t RunDE(x64emu_t *emu, rex_t rex, uintptr_t addr)
             break;
 
         default:
+            fesetround(oldround);
             return 0;
     } else
         switch((nextop>>3)&7) {
@@ -154,7 +158,9 @@ uintptr_t RunDE(x64emu_t *emu, rex_t rex, uintptr_t addr)
                 ST0.d = (double)EW->sword[0] / ST0.d;
                 break;
            default:
+                fesetround(oldround);
                 return 0;
         }
-  return addr;
+    fesetround(oldround);
+    return addr;
 }
\ No newline at end of file
diff --git a/src/emu/x87emu_setround.h b/src/emu/x87emu_setround.h
new file mode 100644
index 00000000..4791c3ea
--- /dev/null
+++ b/src/emu/x87emu_setround.h
@@ -0,0 +1,29 @@
+#ifndef __SETROUND_H__
+#define __SETROUND_H__
+#pragma STDC FENV_ACCESS ON
+#include <fenv.h>
+#include <stdint.h>
+#include "x64emu.h"
+#include "x64emu_private.h"
+// set the rounding mode to the emulator's one, and return the old one
+static inline int fpu_setround(x64emu_t* emu) {
+    int ret = fegetround();
+    int rounding_direction;
+    switch (emu->cw.f.C87_RD) {
+        case ROUND_Nearest:
+            rounding_direction = FE_TONEAREST;
+            break;
+        case ROUND_Down:
+            rounding_direction = FE_DOWNWARD;
+            break;
+        case ROUND_Up:
+            rounding_direction = FE_UPWARD;
+            break;
+        case ROUND_Chop:
+            rounding_direction = FE_TOWARDZERO;
+            break;
+    }
+    fesetround(rounding_direction);
+    return ret;
+}
+#endif
\ No newline at end of file
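The interpreter changes in this commit all follow the same shape as the helper above: save the host rounding mode, switch to the one requested by the emulated x87 control word, run the operation, restore. A minimal standalone sketch (not from the commit) of that save/compute/restore pattern, using plain fegetround/fesetround in place of fpu_setround:

    /* Standalone illustration (not from the commit) of the save/compute/restore
     * pattern wrapped around each x87 operation. fpu_setround() plays the role
     * of the fegetround()+fesetround() pair, driven by the emulated control word. */
    #define _GNU_SOURCE
    #include <fenv.h>
    #include <math.h>
    #include <stdio.h>
    #pragma STDC FENV_ACCESS ON        /* we touch the FP environment, as x87emu_setround.h does */

    int main(void)
    {
        int oldround = fegetround();   /* what fpu_setround() returns */
        fesetround(FE_DOWNWARD);       /* pretend the guest asked for round-down */
        volatile double two = 2.0;     /* volatile keeps the compiler from folding sqrt at build time */
        double r = sqrt(two);          /* the emulated operation, e.g. FSQRT */
        fesetround(oldround);          /* always restore before returning */
        printf("sqrt(2) rounded down: %a\n", r);
        return 0;
    }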
diff --git a/tests/ref31.txt b/tests/ref31.txt
new file mode 100644
index 00000000..358877e1
--- /dev/null
+++ b/tests/ref31.txt
@@ -0,0 +1,810 @@
+Testing: s = (0x1.123456789abcp2) -> (double)s
+FE_TONEAREST   0x1.123456789abcp+2
+FE_DOWNWARD    0x1.123456789abcp+2
+FE_UPWARD      0x1.123456789abcp+2
+FE_TOWARDZERO  0x1.123456789abcp+2
+
+Testing: s = (0x1.123456789abcp2) -> (float)s
+FE_TONEAREST   0x1.123456p+2
+FE_DOWNWARD    0x1.123456p+2
+FE_UPWARD      0x1.123458p+2
+FE_TOWARDZERO  0x1.123456p+2
+
+Testing: s = (-(0x1.123456789abcp2)) -> (double)s
+FE_TONEAREST   -0x1.123456789abcp+2
+FE_DOWNWARD    -0x1.123456789abcp+2
+FE_UPWARD      -0x1.123456789abcp+2
+FE_TOWARDZERO  -0x1.123456789abcp+2
+
+Testing: s = (-(0x1.123456789abcp2)) -> (float)s
+FE_TONEAREST   -0x1.123456p+2
+FE_DOWNWARD    -0x1.123458p+2
+FE_UPWARD      -0x1.123456p+2
+FE_TOWARDZERO  -0x1.123456p+2
+
+Testing: d = (0x1.123456789abcp512) -> (float)d
+FE_TONEAREST   inf
+FE_DOWNWARD    0x1.fffffep+127
+FE_UPWARD      inf
+FE_TOWARDZERO  0x1.fffffep+127
+
+Testing: s = (0x1.123456789abcp29) -> (double)s
+FE_TONEAREST   0x1.123456789abcp+29
+FE_DOWNWARD    0x1.123456789abcp+29
+FE_UPWARD      0x1.123456789abcp+29
+FE_TOWARDZERO  0x1.123456789abcp+29
+
+Testing: s = (0x1.123456789abcp29) -> (float)s
+FE_TONEAREST   0x1.123456p+29
+FE_DOWNWARD    0x1.123456p+29
+FE_UPWARD      0x1.123458p+29
+FE_TOWARDZERO  0x1.123456p+29
+
+Testing: s = (0x1.123456789abcp29) -> (int16_t)s
+FE_TONEAREST   -32768
+FE_DOWNWARD    -32768
+FE_UPWARD      -32768
+FE_TOWARDZERO  -32768
+
+Testing: s = (0x1.123456789abcp29) -> (int8_t)s
+FE_TONEAREST   0
+FE_DOWNWARD    0
+FE_UPWARD      0
+FE_TOWARDZERO  0
+
+Testing: s = (0x1.123456789abcp29) -> (unsigned short)s
+FE_TONEAREST   35535
+FE_DOWNWARD    35535
+FE_UPWARD      35535
+FE_TOWARDZERO  35535
+
+Testing: s = (0x1.123456789abcp29) -> (unsigned char)s
+FE_TONEAREST   0
+FE_DOWNWARD    0
+FE_UPWARD      0
+FE_TOWARDZERO  0
+
+Testing: s = (-(0x1.123456789abcp29)) -> (double)s
+FE_TONEAREST   -0x1.123456789abcp+29
+FE_DOWNWARD    -0x1.123456789abcp+29
+FE_UPWARD      -0x1.123456789abcp+29
+FE_TOWARDZERO  -0x1.123456789abcp+29
+
+Testing: s = (-(0x1.123456789abcp29)) -> (float)s
+FE_TONEAREST   -0x1.123456p+29
+FE_DOWNWARD    -0x1.123458p+29
+FE_UPWARD      -0x1.123456p+29
+FE_TOWARDZERO  -0x1.123456p+29
+
+Testing: d = (-0x1.123456789abcp30) -> (int32_t)d
+FE_TONEAREST   -1150096798
+FE_DOWNWARD    -1150096798
+FE_UPWARD      -1150096798
+FE_TOWARDZERO  -1150096798
+
+Testing: d = (-0x1.123456789abcp62) -> (int64_t)d
+FE_TONEAREST   -4939628135293321216
+FE_DOWNWARD    -4939628135293321216
+FE_UPWARD      -4939628135293321216
+FE_TOWARDZERO  -4939628135293321216
+
+Testing: s = (0x1.123456789abcp2f) -> (double)s
+FE_TONEAREST   0x1.123456p+2
+FE_DOWNWARD    0x1.123456p+2
+FE_UPWARD      0x1.123458p+2
+FE_TOWARDZERO  0x1.123456p+2
+
+Testing: s = (0x1.123456789abcp2f) -> (float)s
+FE_TONEAREST   0x1.123456p+2
+FE_DOWNWARD    0x1.123456p+2
+FE_UPWARD      0x1.123458p+2
+FE_TOWARDZERO  0x1.123456p+2
+
+Testing: s = (-(0x1.123456789abcp2f)) -> (double)s
+FE_TONEAREST   -0x1.123456p+2
+FE_DOWNWARD    -0x1.123458p+2
+FE_UPWARD      -0x1.123456p+2
+FE_TOWARDZERO  -0x1.123456p+2
+
+Testing: s = (-(0x1.123456789abcp2f)) -> (float)s
+FE_TONEAREST   -0x1.123456p+2
+FE_DOWNWARD    -0x1.123458p+2
+FE_UPWARD      -0x1.123456p+2
+FE_TOWARDZERO  -0x1.123456p+2
+
+Testing: s = (0x1.123456789abcp29f) -> (double)s
+FE_TONEAREST   0x1.123456p+29
+FE_DOWNWARD    0x1.123456p+29
+FE_UPWARD      0x1.123458p+29
+FE_TOWARDZERO  0x1.123456p+29
+
+Testing: s = (0x1.123456789abcp29f) -> (float)s
+FE_TONEAREST   0x1.123456p+29
+FE_DOWNWARD    0x1.123456p+29
+FE_UPWARD      0x1.123458p+29
+FE_TOWARDZERO  0x1.123456p+29
+
+Testing: s = (0x1.123456789abcp29f) -> (int16_t)s
+FE_TONEAREST   -32768
+FE_DOWNWARD    -32768
+FE_UPWARD      -32768
+FE_TOWARDZERO  -32768
+
+Testing: s = (0x1.123456789abcp29f) -> (int8_t)s
+FE_TONEAREST   0
+FE_DOWNWARD    0
+FE_UPWARD      0
+FE_TOWARDZERO  0
+
+Testing: s = (0x1.123456789abcp29f) -> (unsigned short)s
+FE_TONEAREST   35520
+FE_DOWNWARD    35520
+FE_UPWARD      35584
+FE_TOWARDZERO  35520
+
+Testing: s = (0x1.123456789abcp29f) -> (unsigned char)s
+FE_TONEAREST   0
+FE_DOWNWARD    0
+FE_UPWARD      0
+FE_TOWARDZERO  0
+
+Testing: s = (-(0x1.123456789abcp29f)) -> (double)s
+FE_TONEAREST   -0x1.123456p+29
+FE_DOWNWARD    -0x1.123458p+29
+FE_UPWARD      -0x1.123456p+29
+FE_TOWARDZERO  -0x1.123456p+29
+
+Testing: s = (-(0x1.123456789abcp29f)) -> (float)s
+FE_TONEAREST   -0x1.123456p+29
+FE_DOWNWARD    -0x1.123458p+29
+FE_UPWARD      -0x1.123456p+29
+FE_TOWARDZERO  -0x1.123456p+29
+
+Testing: f = -0x1.123456789abcp30f -> (int32_t)f
+FE_TONEAREST   -1150096768
+FE_DOWNWARD    -1150096896
+FE_UPWARD      -1150096768
+FE_TOWARDZERO  -1150096768
+
+Testing: d = -0x1.1234567p0 -> (double)((int)d)
+FE_TONEAREST   -0x1p+0
+FE_DOWNWARD    -0x1p+0
+FE_UPWARD      -0x1p+0
+FE_TOWARDZERO  -0x1p+0
+
+Testing: d = 0x1.9234567p0 -> (double)((int)d)
+FE_TONEAREST   0x1p+0
+FE_DOWNWARD    0x1p+0
+FE_UPWARD      0x1p+0
+FE_TOWARDZERO  0x1p+0
+
+Testing: d = -0x1.9234567p0 -> (double)((int)d)
+FE_TONEAREST   -0x1p+0
+FE_DOWNWARD    -0x1p+0
+FE_UPWARD      -0x1p+0
+FE_TOWARDZERO  -0x1p+0
+
+Testing: d = 0x1.1234567p0 -> (double)((long int)d)
+FE_TONEAREST   0x1p+0
+FE_DOWNWARD    0x1p+0
+FE_UPWARD      0x1p+0
+FE_TOWARDZERO  0x1p+0
+
+Testing: d = -0x1.1234567p0 -> (double)((long int)d)
+FE_TONEAREST   -0x1p+0
+FE_DOWNWARD    -0x1p+0
+FE_UPWARD      -0x1p+0
+FE_TOWARDZERO  -0x1p+0
+
+Testing: d = 0x1.9234567p0 -> (double)((long int)d)
+FE_TONEAREST   0x1p+0
+FE_DOWNWARD    0x1p+0
+FE_UPWARD      0x1p+0
+FE_TOWARDZERO  0x1p+0
+
+Testing: d = -0x1.9234567p0 -> (double)((long int)d)
+FE_TONEAREST   -0x1p+0
+FE_DOWNWARD    -0x1p+0
+FE_UPWARD      -0x1p+0
+FE_TOWARDZERO  -0x1p+0
+
+Testing: (d1 = (1.0), d2 = (0x1.0000000000001p0)) -> d1 + d2
+FE_TONEAREST   0x1p+1
+FE_DOWNWARD    0x1p+1
+FE_UPWARD      0x1.0000000000001p+1
+FE_TOWARDZERO  0x1p+1
+
+Testing: (d1 = -(1.0), d2 = (0x1.0000000000001p0)) -> d1 + d2
+FE_TONEAREST   0x1p-52
+FE_DOWNWARD    0x1p-52
+FE_UPWARD      0x1p-52
+FE_TOWARDZERO  0x1p-52
+
+Testing: (d1 = (1.0), d2 = -(0x1.0000000000001p0)) -> d1 + d2
+FE_TONEAREST   -0x1p-52
+FE_DOWNWARD    -0x1p-52
+FE_UPWARD      -0x1p-52
+FE_TOWARDZERO  -0x1p-52
+
+Testing: (d1 = -(1.0), d2 = -(0x1.0000000000001p0)) -> d1 + d2
+FE_TONEAREST   -0x1p+1
+FE_DOWNWARD    -0x1.0000000000001p+1
+FE_UPWARD      -0x1p+1
+FE_TOWARDZERO  -0x1p+1
+
+Testing: (d1 = (1.0), d2 = (0x1.0000000000001p0)) -> d1 - d2
+FE_TONEAREST   -0x1p-52
+FE_DOWNWARD    -0x1p-52
+FE_UPWARD      -0x1p-52
+FE_TOWARDZERO  -0x1p-52
+
+Testing: (d1 = -(1.0), d2 = (0x1.0000000000001p0)) -> d1 - d2
+FE_TONEAREST   -0x1p+1
+FE_DOWNWARD    -0x1.0000000000001p+1
+FE_UPWARD      -0x1p+1
+FE_TOWARDZERO  -0x1p+1
+
+Testing: (d1 = (1.0), d2 = -(0x1.0000000000001p0)) -> d1 - d2
+FE_TONEAREST   0x1p+1
+FE_DOWNWARD    0x1p+1
+FE_UPWARD      0x1.0000000000001p+1
+FE_TOWARDZERO  0x1p+1
+
+Testing: (d1 = -(1.0), d2 = -(0x1.0000000000001p0)) -> d1 - d2
+FE_TONEAREST   0x1p-52
+FE_DOWNWARD    0x1p-52
+FE_UPWARD      0x1p-52
+FE_TOWARDZERO  0x1p-52
+
+Testing: (d1 = (1.0), d2 = (0x1.0000000000001p0)) -> d2 - d1
+FE_TONEAREST   0x1p-52
+FE_DOWNWARD    0x1p-52
+FE_UPWARD      0x1p-52
+FE_TOWARDZERO  0x1p-52
+
+Testing: (d1 = -(1.0), d2 = (0x1.0000000000001p0)) -> d2 - d1
+FE_TONEAREST   0x1p+1
+FE_DOWNWARD    0x1p+1
+FE_UPWARD      0x1.0000000000001p+1
+FE_TOWARDZERO  0x1p+1
+
+Testing: (d1 = (1.0), d2 = -(0x1.0000000000001p0)) -> d2 - d1
+FE_TONEAREST   -0x1p+1
+FE_DOWNWARD    -0x1.0000000000001p+1
+FE_UPWARD      -0x1p+1
+FE_TOWARDZERO  -0x1p+1
+
+Testing: (d1 = -(1.0), d2 = -(0x1.0000000000001p0)) -> d2 - d1
+FE_TONEAREST   -0x1p-52
+FE_DOWNWARD    -0x1p-52
+FE_UPWARD      -0x1p-52
+FE_TOWARDZERO  -0x1p-52
+
+Testing: (d1 = (1.0), d2 = (0x1.000000000000dp-4)) -> d1 + d2
+FE_TONEAREST   0x1.1000000000001p+0
+FE_DOWNWARD    0x1.1p+0
+FE_UPWARD      0x1.1000000000001p+0
+FE_TOWARDZERO  0x1.1p+0
+
+Testing: (d1 = -(1.0), d2 = (0x1.000000000000dp-4)) -> d1 + d2
+FE_TONEAREST   -0x1.dfffffffffffep-1
+FE_DOWNWARD    -0x1.dffffffffffffp-1
+FE_UPWARD      -0x1.dfffffffffffep-1
+FE_TOWARDZERO  -0x1.dfffffffffffep-1
+
+Testing: (d1 = (1.0), d2 = -(0x1.000000000000dp-4)) -> d1 + d2
+FE_TONEAREST   0x1.dfffffffffffep-1
+FE_DOWNWARD    0x1.dfffffffffffep-1
+FE_UPWARD      0x1.dffffffffffffp-1
+FE_TOWARDZERO  0x1.dfffffffffffep-1
+
+Testing: (d1 = -(1.0), d2 = -(0x1.000000000000dp-4)) -> d1 + d2
+FE_TONEAREST   -0x1.1000000000001p+0
+FE_DOWNWARD    -0x1.1000000000001p+0
+FE_UPWARD      -0x1.1p+0
+FE_TOWARDZERO  -0x1.1p+0
+
+Testing: (d1 = (1.0), d2 = (0x1.000000000000dp-4)) -> d1 - d2
+FE_TONEAREST   0x1.dfffffffffffep-1
+FE_DOWNWARD    0x1.dfffffffffffep-1
+FE_UPWARD      0x1.dffffffffffffp-1
+FE_TOWARDZERO  0x1.dfffffffffffep-1
+
+Testing: (d1 = -(1.0), d2 = (0x1.000000000000dp-4)) -> d1 - d2
+FE_TONEAREST   -0x1.1000000000001p+0
+FE_DOWNWARD    -0x1.1000000000001p+0
+FE_UPWARD      -0x1.1p+0
+FE_TOWARDZERO  -0x1.1p+0
+
+Testing: (d1 = (1.0), d2 = -(0x1.000000000000dp-4)) -> d1 - d2
+FE_TONEAREST   0x1.1000000000001p+0
+FE_DOWNWARD    0x1.1p+0
+FE_UPWARD      0x1.1000000000001p+0
+FE_TOWARDZERO  0x1.1p+0
+
+Testing: (d1 = -(1.0), d2 = -(0x1.000000000000dp-4)) -> d1 - d2
+FE_TONEAREST   -0x1.dfffffffffffep-1
+FE_DOWNWARD    -0x1.dffffffffffffp-1
+FE_UPWARD      -0x1.dfffffffffffep-1
+FE_TOWARDZERO  -0x1.dfffffffffffep-1
+
+Testing: (d1 = (1.0), d2 = (0x1.000000000000dp-4)) -> d2 - d1
+FE_TONEAREST   -0x1.dfffffffffffep-1
+FE_DOWNWARD    -0x1.dffffffffffffp-1
+FE_UPWARD      -0x1.dfffffffffffep-1
+FE_TOWARDZERO  -0x1.dfffffffffffep-1
+
+Testing: (d1 = -(1.0), d2 = (0x1.000000000000dp-4)) -> d2 - d1
+FE_TONEAREST   0x1.1000000000001p+0
+FE_DOWNWARD    0x1.1p+0
+FE_UPWARD      0x1.1000000000001p+0
+FE_TOWARDZERO  0x1.1p+0
+
+Testing: (d1 = (1.0), d2 = -(0x1.000000000000dp-4)) -> d2 - d1
+FE_TONEAREST   -0x1.1000000000001p+0
+FE_DOWNWARD    -0x1.1000000000001p+0
+FE_UPWARD      -0x1.1p+0
+FE_TOWARDZERO  -0x1.1p+0
+
+Testing: (d1 = -(1.0), d2 = -(0x1.000000000000dp-4)) -> d2 - d1
+FE_TONEAREST   0x1.dfffffffffffep-1
+FE_DOWNWARD    0x1.dfffffffffffep-1
+FE_UPWARD      0x1.dffffffffffffp-1
+FE_TOWARDZERO  0x1.dfffffffffffep-1
+
+Testing: (d1 = (0x1.233445566778p0), d2 = (0x1.3456789abcdep0)) -> d1 + d2
+FE_TONEAREST   0x1.2bc55ef8922bp+1
+FE_DOWNWARD    0x1.2bc55ef8922bp+1
+FE_UPWARD      0x1.2bc55ef8922bp+1
+FE_TOWARDZERO  0x1.2bc55ef8922bp+1
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = (0x1.3456789abcdep0)) -> d1 + d2
+FE_TONEAREST   0x1.12233445566p-4
+FE_DOWNWARD    0x1.12233445566p-4
+FE_UPWARD      0x1.12233445566p-4
+FE_TOWARDZERO  0x1.12233445566p-4
+
+Testing: (d1 = (0x1.233445566778p0), d2 = -(0x1.3456789abcdep0)) -> d1 + d2
+FE_TONEAREST   -0x1.12233445566p-4
+FE_DOWNWARD    -0x1.12233445566p-4
+FE_UPWARD      -0x1.12233445566p-4
+FE_TOWARDZERO  -0x1.12233445566p-4
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = -(0x1.3456789abcdep0)) -> d1 + d2
+FE_TONEAREST   -0x1.2bc55ef8922bp+1
+FE_DOWNWARD    -0x1.2bc55ef8922bp+1
+FE_UPWARD      -0x1.2bc55ef8922bp+1
+FE_TOWARDZERO  -0x1.2bc55ef8922bp+1
+
+Testing: (d1 = (0x1.233445566778p0f), d2 = (0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   0x1.5ebd404804dp+0
+FE_DOWNWARD    0x1.5ebd3ddf57ep+0
+FE_UPWARD      0x1.5ebd428e6d5cp+0
+FE_TOWARDZERO  0x1.5ebd3ddf57ep+0
+
+Testing: (d1 = -(0x1.233445566778p0f), d2 = (0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   -0x1.5ebd404804dp+0
+FE_DOWNWARD    -0x1.5ebd404804dp+0
+FE_UPWARD      -0x1.5ebd4025c068p+0
+FE_TOWARDZERO  -0x1.5ebd3ddf57ep+0
+
+Testing: (d1 = (0x1.233445566778p0f), d2 = -(0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   -0x1.5ebd404804dp+0
+FE_DOWNWARD    -0x1.5ebd4025c068p+0
+FE_UPWARD      -0x1.5ebd404804dp+0
+FE_TOWARDZERO  -0x1.5ebd3ddf57ep+0
+
+Testing: (d1 = -(0x1.233445566778p0f), d2 = -(0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   0x1.5ebd404804dp+0
+FE_DOWNWARD    0x1.5ebd428e6d5cp+0
+FE_UPWARD      0x1.5ebd3ddf57ep+0
+FE_TOWARDZERO  0x1.5ebd3ddf57ep+0
+
+Testing: (d1 = (0x1.233445566778p0f), d2 = (0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   0x1.5ebd40f80919p+0
+FE_DOWNWARD    0x1.5ebd3e8f5c27dp+0
+FE_UPWARD      0x1.5ebd40f809191p+0
+FE_TOWARDZERO  0x1.5ebd3e8f5c27dp+0
+
+Testing: (d1 = -(0x1.233445566778p0f), d2 = (0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   -0x1.5ebd40f80919p+0
+FE_DOWNWARD    -0x1.5ebd40f809191p+0
+FE_UPWARD      -0x1.5ebd3e8f5c27dp+0
+FE_TOWARDZERO  -0x1.5ebd3e8f5c27dp+0
+
+Testing: (d1 = (0x1.233445566778p0f), d2 = -(0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   -0x1.5ebd40f80919p+0
+FE_DOWNWARD    -0x1.5ebd3e8f5c27ep+0
+FE_UPWARD      -0x1.5ebd40f80919p+0
+FE_TOWARDZERO  -0x1.5ebd3e8f5c27dp+0
+
+Testing: (d1 = -(0x1.233445566778p0f), d2 = -(0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   0x1.5ebd40f80919p+0
+FE_DOWNWARD    0x1.5ebd40f80919p+0
+FE_UPWARD      0x1.5ebd3e8f5c27ep+0
+FE_TOWARDZERO  0x1.5ebd3e8f5c27dp+0
+
+Testing: (d1 = (0x1.233445566778p0), d2 = (0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   0x1.5ebd402bc44c4p+0
+FE_DOWNWARD    0x1.5ebd402bc44c4p+0
+FE_UPWARD      0x1.5ebd402bc44c5p+0
+FE_TOWARDZERO  0x1.5ebd402bc44c4p+0
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = (0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   -0x1.5ebd402bc44c4p+0
+FE_DOWNWARD    -0x1.5ebd402bc44c5p+0
+FE_UPWARD      -0x1.5ebd402bc44c4p+0
+FE_TOWARDZERO  -0x1.5ebd402bc44c4p+0
+
+Testing: (d1 = (0x1.233445566778p0), d2 = -(0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   -0x1.5ebd402bc44c4p+0
+FE_DOWNWARD    -0x1.5ebd402bc44c5p+0
+FE_UPWARD      -0x1.5ebd402bc44c4p+0
+FE_TOWARDZERO  -0x1.5ebd402bc44c4p+0
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = -(0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   0x1.5ebd402bc44c4p+0
+FE_DOWNWARD    0x1.5ebd402bc44c4p+0
+FE_UPWARD      0x1.5ebd402bc44c5p+0
+FE_TOWARDZERO  0x1.5ebd402bc44c4p+0
+
+Testing: (d1 = (0x1.233445566778p0f), d2 = (0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   0x1.5ebd40f80919p+0
+FE_DOWNWARD    0x1.5ebd3e8f5c27dp+0
+FE_UPWARD      0x1.5ebd40f809191p+0
+FE_TOWARDZERO  0x1.5ebd3e8f5c27dp+0
+
+Testing: (d1 = -(0x1.233445566778p0f), d2 = (0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   -0x1.5ebd40f80919p+0
+FE_DOWNWARD    -0x1.5ebd40f809191p+0
+FE_UPWARD      -0x1.5ebd3e8f5c27dp+0
+FE_TOWARDZERO  -0x1.5ebd3e8f5c27dp+0
+
+Testing: (d1 = (0x1.233445566778p0f), d2 = -(0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   -0x1.5ebd40f80919p+0
+FE_DOWNWARD    -0x1.5ebd3e8f5c27ep+0
+FE_UPWARD      -0x1.5ebd40f80919p+0
+FE_TOWARDZERO  -0x1.5ebd3e8f5c27dp+0
+
+Testing: (d1 = -(0x1.233445566778p0f), d2 = -(0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   0x1.5ebd40f80919p+0
+FE_DOWNWARD    0x1.5ebd40f80919p+0
+FE_UPWARD      0x1.5ebd3e8f5c27ep+0
+FE_TOWARDZERO  0x1.5ebd3e8f5c27dp+0
+
+Testing: (d1 = (0x1.233445566778p0), d2 = (0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   0x1.5ebd3f7bc003ap+0
+FE_DOWNWARD    0x1.5ebd3f7bc003ap+0
+FE_UPWARD      0x1.5ebd41c2288e5p+0
+FE_TOWARDZERO  0x1.5ebd3f7bc003ap+0
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = (0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   -0x1.5ebd3f7bc003ap+0
+FE_DOWNWARD    -0x1.5ebd3f7bc003bp+0
+FE_UPWARD      -0x1.5ebd41c2288e4p+0
+FE_TOWARDZERO  -0x1.5ebd3f7bc003ap+0
+
+Testing: (d1 = (0x1.233445566778p0), d2 = -(0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   -0x1.5ebd3f7bc003ap+0
+FE_DOWNWARD    -0x1.5ebd41c2288e5p+0
+FE_UPWARD      -0x1.5ebd3f7bc003ap+0
+FE_TOWARDZERO  -0x1.5ebd3f7bc003ap+0
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = -(0x1.3456789abcdep0)) -> d1 *d2
+FE_TONEAREST   0x1.5ebd3f7bc003ap+0
+FE_DOWNWARD    0x1.5ebd41c2288e4p+0
+FE_UPWARD      0x1.5ebd3f7bc003bp+0
+FE_TOWARDZERO  0x1.5ebd3f7bc003ap+0
+
+Testing: (d1 = (0x1.233445566778p0f), d2 = (0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   0x1.5ebd404804dp+0
+FE_DOWNWARD    0x1.5ebd3ddf57ep+0
+FE_UPWARD      0x1.5ebd428e6d5cp+0
+FE_TOWARDZERO  0x1.5ebd3ddf57ep+0
+
+Testing: (d1 = -(0x1.233445566778p0f), d2 = (0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   -0x1.5ebd404804dp+0
+FE_DOWNWARD    -0x1.5ebd404804dp+0
+FE_UPWARD      -0x1.5ebd4025c068p+0
+FE_TOWARDZERO  -0x1.5ebd3ddf57ep+0
+
+Testing: (d1 = (0x1.233445566778p0f), d2 = -(0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   -0x1.5ebd404804dp+0
+FE_DOWNWARD    -0x1.5ebd4025c068p+0
+FE_UPWARD      -0x1.5ebd404804dp+0
+FE_TOWARDZERO  -0x1.5ebd3ddf57ep+0
+
+Testing: (d1 = -(0x1.233445566778p0f), d2 = -(0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   0x1.5ebd404804dp+0
+FE_DOWNWARD    0x1.5ebd428e6d5cp+0
+FE_UPWARD      0x1.5ebd3ddf57ep+0
+FE_TOWARDZERO  0x1.5ebd3ddf57ep+0
+
+Testing: (d1 = (0x1.233445566778p0), d2 = (5)) -> d1 *d2
+FE_TONEAREST   0x1.6c0156ac0156p+2
+FE_DOWNWARD    0x1.6c0156ac0156p+2
+FE_UPWARD      0x1.6c0156ac0156p+2
+FE_TOWARDZERO  0x1.6c0156ac0156p+2
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = (5)) -> d1 *d2
+FE_TONEAREST   -0x1.6c0156ac0156p+2
+FE_DOWNWARD    -0x1.6c0156ac0156p+2
+FE_UPWARD      -0x1.6c0156ac0156p+2
+FE_TOWARDZERO  -0x1.6c0156ac0156p+2
+
+Testing: (d1 = (0x1.233445566778p0), d2 = -(5)) -> d1 *d2
+FE_TONEAREST   -0x1.6c0156ac0156p+2
+FE_DOWNWARD    -0x1.6c0156ac0156p+2
+FE_UPWARD      -0x1.6c0156ac0156p+2
+FE_TOWARDZERO  -0x1.6c0156ac0156p+2
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = -(5)) -> d1 *d2
+FE_TONEAREST   0x1.6c0156ac0156p+2
+FE_DOWNWARD    0x1.6c0156ac0156p+2
+FE_UPWARD      0x1.6c0156ac0156p+2
+FE_TOWARDZERO  0x1.6c0156ac0156p+2
+
+Testing: (d1 = (15), d2 = (0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   0x1.2111111111102p+4
+FE_DOWNWARD    0x1.2111111111102p+4
+FE_UPWARD      0x1.2111111111102p+4
+FE_TOWARDZERO  0x1.2111111111102p+4
+
+Testing: (d1 = -(15), d2 = (0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   -0x1.2111111111102p+4
+FE_DOWNWARD    -0x1.2111111111102p+4
+FE_UPWARD      -0x1.2111111111102p+4
+FE_TOWARDZERO  -0x1.2111111111102p+4
+
+Testing: (d1 = (15), d2 = -(0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   -0x1.2111111111102p+4
+FE_DOWNWARD    -0x1.2111111111102p+4
+FE_UPWARD      -0x1.2111111111102p+4
+FE_TOWARDZERO  -0x1.2111111111102p+4
+
+Testing: (d1 = -(15), d2 = -(0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   0x1.2111111111102p+4
+FE_DOWNWARD    0x1.2111111111102p+4
+FE_UPWARD      0x1.2111111111102p+4
+FE_TOWARDZERO  0x1.2111111111102p+4
+
+Testing: (d1 = (0x1.233445566778p0f), d2 = (15)) -> d1 *d2
+FE_TONEAREST   0x1.110101ap+4
+FE_DOWNWARD    0x1.1100ffcp+4
+FE_UPWARD      0x1.110101ap+4
+FE_TOWARDZERO  0x1.1100ffcp+4
+
+Testing: (d1 = -(0x1.233445566778p0f), d2 = (15)) -> d1 *d2
+FE_TONEAREST   -0x1.110101ap+4
+FE_DOWNWARD    -0x1.110101ap+4
+FE_UPWARD      -0x1.1100ffcp+4
+FE_TOWARDZERO  -0x1.1100ffcp+4
+
+Testing: (d1 = (0x1.233445566778p0f), d2 = -(15)) -> d1 *d2
+FE_TONEAREST   -0x1.110101ap+4
+FE_DOWNWARD    -0x1.1100ffcp+4
+FE_UPWARD      -0x1.110101ap+4
+FE_TOWARDZERO  -0x1.1100ffcp+4
+
+Testing: (d1 = -(0x1.233445566778p0f), d2 = -(15)) -> d1 *d2
+FE_TONEAREST   0x1.110101ap+4
+FE_DOWNWARD    0x1.110101ap+4
+FE_UPWARD      0x1.1100ffcp+4
+FE_TOWARDZERO  0x1.1100ffcp+4
+
+Testing: (d1 = (15), d2 = (0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   0x1.2111108p+4
+FE_DOWNWARD    0x1.2111108p+4
+FE_UPWARD      0x1.2111126p+4
+FE_TOWARDZERO  0x1.2111108p+4
+
+Testing: (d1 = -(15), d2 = (0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   -0x1.2111108p+4
+FE_DOWNWARD    -0x1.2111108p+4
+FE_UPWARD      -0x1.2111126p+4
+FE_TOWARDZERO  -0x1.2111108p+4
+
+Testing: (d1 = (15), d2 = -(0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   -0x1.2111108p+4
+FE_DOWNWARD    -0x1.2111126p+4
+FE_UPWARD      -0x1.2111108p+4
+FE_TOWARDZERO  -0x1.2111108p+4
+
+Testing: (d1 = -(15), d2 = -(0x1.3456789abcdep0f)) -> d1 *d2
+FE_TONEAREST   0x1.2111108p+4
+FE_DOWNWARD    0x1.2111126p+4
+FE_UPWARD      0x1.2111108p+4
+FE_TOWARDZERO  0x1.2111108p+4
+
+Testing: (d1 = (0x1.233445566778p0), d2 = (0x1.3456789abcdep0)) -> d1 / d2
+FE_TONEAREST   0x1.e38ca44203ab9p-1
+FE_DOWNWARD    0x1.e38ca44203ab8p-1
+FE_UPWARD      0x1.e38ca44203ab9p-1
+FE_TOWARDZERO  0x1.e38ca44203ab8p-1
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = (0x1.3456789abcdep0)) -> d1 / d2
+FE_TONEAREST   -0x1.e38ca44203ab9p-1
+FE_DOWNWARD    -0x1.e38ca44203ab9p-1
+FE_UPWARD      -0x1.e38ca44203ab8p-1
+FE_TOWARDZERO  -0x1.e38ca44203ab8p-1
+
+Testing: (d1 = (0x1.233445566778p0), d2 = -(0x1.3456789abcdep0)) -> d1 / d2
+FE_TONEAREST   -0x1.e38ca44203ab9p-1
+FE_DOWNWARD    -0x1.e38ca44203ab9p-1
+FE_UPWARD      -0x1.e38ca44203ab8p-1
+FE_TOWARDZERO  -0x1.e38ca44203ab8p-1
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = -(0x1.3456789abcdep0)) -> d1 / d2
+FE_TONEAREST   0x1.e38ca44203ab9p-1
+FE_DOWNWARD    0x1.e38ca44203ab8p-1
+FE_UPWARD      0x1.e38ca44203ab9p-1
+FE_TOWARDZERO  0x1.e38ca44203ab8p-1
+
+Testing: (d1 = (0x1.233445566778p0), d2 = (0x1.3456789abcdep0)) -> d1 / d2
+FE_TONEAREST   0x1.e38ca44203ab9p-1
+FE_DOWNWARD    0x1.e38ca44203ab8p-1
+FE_UPWARD      0x1.e38ca44203ab9p-1
+FE_TOWARDZERO  0x1.e38ca44203ab8p-1
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = (0x1.3456789abcdep0)) -> d1 / d2
+FE_TONEAREST   -0x1.e38ca44203ab9p-1
+FE_DOWNWARD    -0x1.e38ca44203ab9p-1
+FE_UPWARD      -0x1.e38ca44203ab8p-1
+FE_TOWARDZERO  -0x1.e38ca44203ab8p-1
+
+Testing: (d1 = (0x1.233445566778p0), d2 = -(0x1.3456789abcdep0)) -> d1 / d2
+FE_TONEAREST   -0x1.e38ca44203ab9p-1
+FE_DOWNWARD    -0x1.e38ca44203ab9p-1
+FE_UPWARD      -0x1.e38ca44203ab8p-1
+FE_TOWARDZERO  -0x1.e38ca44203ab8p-1
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = -(0x1.3456789abcdep0)) -> d1 / d2
+FE_TONEAREST   0x1.e38ca44203ab9p-1
+FE_DOWNWARD    0x1.e38ca44203ab8p-1
+FE_UPWARD      0x1.e38ca44203ab9p-1
+FE_TOWARDZERO  0x1.e38ca44203ab8p-1
+
+Testing: (d1 = (0x1.233445566778p0), d2 = (0x1.3456789abcdep0f)) -> d1 / d2
+FE_TONEAREST   0x1.e38ca534ae61p-1
+FE_DOWNWARD    0x1.e38ca534ae61p-1
+FE_UPWARD      0x1.e38ca211bd4adp-1
+FE_TOWARDZERO  0x1.e38ca534ae61p-1
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = (0x1.3456789abcdep0f)) -> d1 / d2
+FE_TONEAREST   -0x1.e38ca534ae61p-1
+FE_DOWNWARD    -0x1.e38ca534ae611p-1
+FE_UPWARD      -0x1.e38ca211bd4acp-1
+FE_TOWARDZERO  -0x1.e38ca534ae61p-1
+
+Testing: (d1 = (0x1.233445566778p0), d2 = -(0x1.3456789abcdep0f)) -> d1 / d2
+FE_TONEAREST   -0x1.e38ca534ae61p-1
+FE_DOWNWARD    -0x1.e38ca211bd4adp-1
+FE_UPWARD      -0x1.e38ca534ae61p-1
+FE_TOWARDZERO  -0x1.e38ca534ae61p-1
+
+Testing: (d1 = -(0x1.233445566778p0), d2 = -(0x1.3456789abcdep0f)) -> d1 / d2
+FE_TONEAREST   0x1.e38ca534ae61p-1
+FE_DOWNWARD    0x1.e38ca211bd4acp-1
+FE_UPWARD      0x1.e38ca534ae611p-1
+FE_TOWARDZERO  0x1.e38ca534ae61p-1
+
+Testing: (d1 = (1.0), d2 = (0x1.0000000000001p0)) -> d2 - d1
+FE_TONEAREST   0x1p-52
+FE_DOWNWARD    0x1p-52
+FE_UPWARD      0x1p-52
+FE_TOWARDZERO  0x1p-52
+
+Testing: (d1 = -(1.0), d2 = (0x1.0000000000001p0)) -> d2 - d1
+FE_TONEAREST   0x1p+1
+FE_DOWNWARD    0x1p+1
+FE_UPWARD      0x1.0000000000001p+1
+FE_TOWARDZERO  0x1p+1
+
+Testing: (d1 = (1.0), d2 = -(0x1.0000000000001p0)) -> d2 - d1
+FE_TONEAREST   -0x1p+1
+FE_DOWNWARD    -0x1.0000000000001p+1
+FE_UPWARD      -0x1p+1
+FE_TOWARDZERO  -0x1p+1
+
+Testing: (d1 = -(1.0), d2 = -(0x1.0000000000001p0)) -> d2 - d1
+FE_TONEAREST   -0x1p-52
+FE_DOWNWARD    -0x1p-52
+FE_UPWARD      -0x1p-52
+FE_TOWARDZERO  -0x1p-52
+
+Testing: (d1 = (1.0), d2 = (0x1.000000000000dp-4)) -> d1 + d2
+FE_TONEAREST   0x1.1000000000001p+0
+FE_DOWNWARD    0x1.1p+0
+FE_UPWARD      0x1.1000000000001p+0
+FE_TOWARDZERO  0x1.1p+0
+
+Testing: (d1 = -(1.0), d2 = (0x1.000000000000dp-4)) -> d1 + d2
+FE_TONEAREST   -0x1.dfffffffffffep-1
+FE_DOWNWARD    -0x1.dffffffffffffp-1
+FE_UPWARD      -0x1.dfffffffffffep-1
+FE_TOWARDZERO  -0x1.dfffffffffffep-1
+
+Testing: (d1 = (1.0), d2 = -(0x1.000000000000dp-4)) -> d1 + d2
+FE_TONEAREST   0x1.dfffffffffffep-1
+FE_DOWNWARD    0x1.dfffffffffffep-1
+FE_UPWARD      0x1.dffffffffffffp-1
+FE_TOWARDZERO  0x1.dfffffffffffep-1
+
+Testing: (d1 = -(1.0), d2 = -(0x1.000000000000dp-4)) -> d1 + d2
+FE_TONEAREST   -0x1.1000000000001p+0
+FE_DOWNWARD    -0x1.1000000000001p+0
+FE_UPWARD      -0x1.1p+0
+FE_TOWARDZERO  -0x1.1p+0
+
+Testing: (d1 = (1.0), d2 = (0x1.000000000000dp-4)) -> d1 - d2
+FE_TONEAREST   0x1.dfffffffffffep-1
+FE_DOWNWARD    0x1.dfffffffffffep-1
+FE_UPWARD      0x1.dffffffffffffp-1
+FE_TOWARDZERO  0x1.dfffffffffffep-1
+
+Testing: (d1 = -(1.0), d2 = (0x1.000000000000dp-4)) -> d1 - d2
+FE_TONEAREST   -0x1.1000000000001p+0
+FE_DOWNWARD    -0x1.1000000000001p+0
+FE_UPWARD      -0x1.1p+0
+FE_TOWARDZERO  -0x1.1p+0
+
+Testing: (d1 = (1.0), d2 = -(0x1.000000000000dp-4)) -> d1 - d2
+FE_TONEAREST   0x1.1000000000001p+0
+FE_DOWNWARD    0x1.1p+0
+FE_UPWARD      0x1.1000000000001p+0
+FE_TOWARDZERO  0x1.1p+0
+
+Testing: (d1 = -(1.0), d2 = -(0x1.000000000000dp-4)) -> d1 - d2
+FE_TONEAREST   -0x1.dfffffffffffep-1
+FE_DOWNWARD    -0x1.dffffffffffffp-1
+FE_UPWARD      -0x1.dfffffffffffep-1
+FE_TOWARDZERO  -0x1.dfffffffffffep-1
+
+Testing: (d1 = (1.0), d2 = (0x1.000000000000dp-4)) -> d2 - d1
+FE_TONEAREST   -0x1.dfffffffffffep-1
+FE_DOWNWARD    -0x1.dffffffffffffp-1
+FE_UPWARD      -0x1.dfffffffffffep-1
+FE_TOWARDZERO  -0x1.dfffffffffffep-1
+
+Testing: (d1 = -(1.0), d2 = (0x1.000000000000dp-4)) -> d2 - d1
+FE_TONEAREST   0x1.1000000000001p+0
+FE_DOWNWARD    0x1.1p+0
+FE_UPWARD      0x1.1000000000001p+0
+FE_TOWARDZERO  0x1.1p+0
+
+Testing: (d1 = (1.0), d2 = -(0x1.000000000000dp-4)) -> d2 - d1
+FE_TONEAREST   -0x1.1000000000001p+0
+FE_DOWNWARD    -0x1.1000000000001p+0
+FE_UPWARD      -0x1.1p+0
+FE_TOWARDZERO  -0x1.1p+0
+
+Testing: (d1 = -(1.0), d2 = -(0x1.000000000000dp-4)) -> d2 - d1
+FE_TONEAREST   0x1.dfffffffffffep-1
+FE_DOWNWARD    0x1.dfffffffffffep-1
+FE_UPWARD      0x1.dffffffffffffp-1
+FE_TOWARDZERO  0x1.dfffffffffffep-1
+
+Testing X87 instruction: "FSQRT" (ST0 = 0x1p+2, ST1 = 0x0p+0)
+FE_TONEAREST    ST0 = 0x1p+1
+FE_DOWNWARD     ST0 = 0x1p+1
+FE_UPWARD       ST0 = 0x1p+1
+FE_TOWARDZERO   ST0 = 0x1p+1
+
+Testing X87 instruction: "FSQRT" (ST0 = 0x1.0000000000001p+1, ST1 = 0x0p+0)
+FE_TONEAREST    ST0 = 0x1.6a09e667f3bcdp+0
+FE_DOWNWARD     ST0 = 0x1.6a09e667f3bcdp+0
+FE_UPWARD       ST0 = 0x1.6a09e667f3bcep+0
+FE_TOWARDZERO   ST0 = 0x1.6a09e667f3bcdp+0
+
+Testing X87 instruction: "FSQRT" (ST0 = 0x1.123456789abcp+31, ST1 = 0x0p+0)
+FE_TONEAREST    ST0 = 0x1.76b0aac9e6a5p+15
+FE_DOWNWARD     ST0 = 0x1.76b0aac9e6a4fp+15
+FE_UPWARD       ST0 = 0x1.76b0aac9e6a5p+15
+FE_TOWARDZERO   ST0 = 0x1.76b0aac9e6a4fp+15
+
+Testing X87 instruction: "FSQRT" (ST0 = 0x1.123456789abdp+31, ST1 = 0x0p+0)
+FE_TONEAREST    ST0 = 0x1.76b0aac9e6a5bp+15
+FE_DOWNWARD     ST0 = 0x1.76b0aac9e6a5ap+15
+FE_UPWARD       ST0 = 0x1.76b0aac9e6a5bp+15
+FE_TOWARDZERO   ST0 = 0x1.76b0aac9e6a5ap+15
+
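The reference output above records, for each tested expression, the result under all four IEEE-754 rounding modes. As an illustration of the pattern it exercises, here is a minimal standalone sketch (not part of the commit; the file name and build flags are only suggestions) that reproduces the double-to-float narrowing entries for 0x1.123456789abcp29: only FE_UPWARD rounds the dropped mantissa bits up to 0x1.123458p+29, while the other three modes give 0x1.123456p+29.

/* repro_narrow.c -- standalone sketch, not part of this commit.
 * Suggested build: gcc -O0 -std=c99 -frounding-math repro_narrow.c -o repro_narrow -lm */
#include <fenv.h>
#include <stdio.h>

int main(void) {
  static const int modes[] = {FE_TONEAREST, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO};
  static const char *names[] = {"FE_TONEAREST", "FE_DOWNWARD", "FE_UPWARD", "FE_TOWARDZERO"};
  volatile double d = 0x1.123456789abcp29; /* volatile keeps the conversion at run time */
  for (int i = 0; i < 4; i++) {
    fesetround(modes[i]);
    /* the double -> float narrowing honors the current rounding mode */
    printf("%-15s%a\n", names[i], (double)(float)d);
  }
  return 0;
}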
diff --git a/tests/roundtest.h b/tests/roundtest.h
new file mode 100644
index 00000000..320e22c9
--- /dev/null
+++ b/tests/roundtest.h
@@ -0,0 +1,114 @@
+#pragma STDC FENV_ACCESS ON
+#include <assert.h>
+#include <stdio.h>
+
+#ifdef USE_ASM_ROUNDING
+int fesetround_(int rounding_direction) {
+  uint16_t old_cw;
+  __asm__("FNSTCW %0" : "=m"(old_cw)::);
+  uint16_t new_cw = (old_cw & ~0xc00) | rounding_direction;
+  __asm__("FLDCW %0" ::"m"(new_cw));
+  return old_cw & 0xc00;
+}
+int fegetround_() {
+  uint16_t cw;
+  __asm__("FNSTCW %0" : "=m"(cw)::);
+  return cw & 0xc00;
+}
+#define fesetround fesetround_
+#define fegetround fegetround_
+#define FE_TONEAREST 0
+#define FE_DOWNWARD 0x400
+#define FE_UPWARD 0x800
+#define FE_TOWARDZERO 0xc00
+#else
+#include <fenv.h>
+#endif
+
+#define FE_TONEAREST_INDEX 0
+#define FE_DOWNWARD_INDEX 1
+#define FE_UPWARD_INDEX 2
+#define FE_TOWARDZERO_INDEX 3
+int FE_MODES[] = {FE_TONEAREST, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO};
+char *FE_MODES_STR[] = {
+    "FE_TONEAREST",
+    "FE_DOWNWARD",
+    "FE_UPWARD",
+    "FE_TOWARDZERO",
+};
+
+void assert_round(double *array) {
+  assert(array[FE_DOWNWARD_INDEX] <= array[FE_TONEAREST_INDEX]);
+  assert(array[FE_TONEAREST_INDEX] <= array[FE_UPWARD_INDEX]);
+  if (array[FE_TOWARDZERO_INDEX] < 0)
+    assert(array[FE_TOWARDZERO_INDEX] == array[FE_UPWARD_INDEX]);
+  else if (array[FE_TOWARDZERO_INDEX] > 0)
+    assert(array[FE_TOWARDZERO_INDEX] == array[FE_DOWNWARD_INDEX]);
+  else if (array[FE_TOWARDZERO_INDEX] == 0)
+    assert(array[FE_TOWARDZERO_INDEX] == array[FE_UPWARD_INDEX] ||
+           array[FE_TOWARDZERO_INDEX] == array[FE_DOWNWARD_INDEX]);
+}
+
+#define TEST_(exec, expr, format)                                              \
+  do {                                                                         \
+    if (sizeof(#exec) == 1)                                                    \
+      printf("Testing: %s\n", #expr);                                          \
+    else                                                                       \
+      printf("Testing: %s -> %s\n", #exec, #expr);                             \
+    for (int i = 0; i < sizeof(FE_MODES) / sizeof(FE_MODES[0]); i++) {         \
+      fesetround(FE_MODES[i]);                                                 \
+      exec;                                                                    \
+      printf("%-15s" format "\n", FE_MODES_STR[i], expr);                      \
+      assert(FE_MODES[i] == fegetround());                                     \
+    }                                                                          \
+    printf("\n");                                                              \
+  } while (0)
+
+#define TEST(exec, expr) TEST_(exec, expr, "%a")
+
+#if defined(i386) || defined(__i386__) || defined(__i386) ||                   \
+    defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#define TEST_X87(instruction, st0, st1, deep_change)                           \
+  do {                                                                         \
+    double _st0 = (st0), _st1 = (st1);                                         \
+    double array1[4], array2[4];                                               \
+    double __st0, __st1;                                                       \
+    printf("Testing X87 instruction: %s (ST0 = %a, ST1 = %a)\n", #instruction, \
+           _st0, _st1);                                                        \
+    for (int i = 0; i < sizeof(FE_MODES) / sizeof(FE_MODES[0]); i++) {         \
+      fesetround(FE_MODES[i]);                                                 \
+      __st0 = _st0, __st1 = _st1;                                              \
+      switch (deep_change) {                                                   \
+      case -1: /* the instruction pops */                                      \
+        __asm__(instruction : "+t"(__st0) : "u"(__st1) : "st(1)");             \
+        printf("%-15s ST0 = %a\n", FE_MODES_STR[i], __st0);                    \
+        break;                                                                 \
+      case 0:                                                                  \
+        __asm__(instruction : "+t"(__st0) : "u"(__st1) :);                     \
+        printf("%-15s ST0 = %a\n", FE_MODES_STR[i], __st0);                    \
+        break;                                                                 \
+      case 1: /* the instruction pushes */                                     \
+        __asm__(instruction : "+t"(__st0), "=u"(__st1)::);                     \
+        printf("%-15s ST0 = %a, ST1 = %a\n", FE_MODES_STR[i], __st0, __st1);   \
+        array2[i] = __st1;                                                     \
+      }                                                                        \
+      array1[i] = __st0;                                                       \
+      assert(FE_MODES[i] == fegetround());                                     \
+    }                                                                          \
+    if (deep_change == 1)                                                      \
+      assert_round(array2);                                                    \
+    assert_round(array1);                                                      \
+    printf("\n");                                                              \
+  } while (0)
+#else
+#define TEST_X87(instruction, st0, st1, deep_change)                           \
+  do {                                                                         \
+    double _st0 = (st0), _st1 = (st1);                                         \
+    printf("Cannot test X87 instruction: %s (ST0 = %a, ST1 = %a) because it "  \
+           "is not compiled to x86\n\n",                                       \
+           #instruction, _st0, _st1);                                          \
+  } while (0)
+#endif
+
+#define TEST_X87_1(i, st0) TEST_X87(i, st0, 0.0, 0)
+#define TEST_X87_2(i, st0, st1) TEST_X87(i, st0, st1, -1)
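One iteration of the TEST_() macro above amounts to the hand-expanded sketch below (not part of the commit; shown for TEST(d = 0x1.9234567p0, (double)((int)d)) under FE_DOWNWARD): select a rounding mode, run the setup statement, print the stringified expression with the requested format, and verify that evaluating the expression left the rounding mode untouched.

/* expand_one.c -- hand-expanded sketch of a single TEST_() iteration,
 * not part of this commit. Build it the same way as test31.c. */
#include <assert.h>
#include <fenv.h>
#include <stdio.h>

int main(void) {
  double d;
  fesetround(FE_DOWNWARD);             /* 1. select the rounding mode */
  d = 0x1.9234567p0;                   /* 2. the "exec" argument      */
  printf("%-15s%a\n", "FE_DOWNWARD",
         (double)((int)d));            /* 3. the "expr" argument: the C cast
                                          truncates toward zero, so this prints
                                          0x1p+0 under every rounding mode */
  assert(fegetround() == FE_DOWNWARD); /* 4. "expr" must not clobber the mode */
  return 0;
}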
diff --git a/tests/test31 b/tests/test31
new file mode 100755
index 00000000..432bc57f
--- /dev/null
+++ b/tests/test31
Binary files differ
diff --git a/tests/test31.c b/tests/test31.c
new file mode 100644
index 00000000..379ddec5
--- /dev/null
+++ b/tests/test31.c
@@ -0,0 +1,114 @@
+// Copied from Box86/tests/test26.c (Box64/tests32/test26.c)
+// Build with `gcc -march=core2 -O0 test31.c -o test31 -std=c99 -masm=intel -mfpmath=387 -frounding-math`
+#include <fenv.h>
+#include <float.h>
+#include <inttypes.h>
+#include <stdint.h>
+#define USE_ASM_ROUNDING
+#include "roundtest.h"
+
+#define TEST_CONVERT_(stype, s_)                                               \
+  do {                                                                         \
+    stype s;                                                                   \
+    TEST_(s = (s_), (double)s, "%a");                                          \
+    TEST_(s = (s_), (float)s, "%a");                                           \
+    /* when the value is too large for the target integer type, the result    \
+     * of the conversion is undefined, both in C99 and with FISTP */           \
+    if (INT64_MIN <= s && INT64_MAX <= s)                                      \
+      TEST_(s = (s_), (int64_t)s, "%" PRId64);                                 \
+    if (INT32_MIN <= s && INT32_MAX <= s)                                      \
+      TEST_(s = (s_), (int32_t)s, "%" PRId32);                                 \
+    if (INT16_MIN <= s && INT16_MAX <= s)                                      \
+      TEST_(s = (s_), (int16_t)s, "%" PRId16);                                 \
+    if (INT8_MIN <= s && INT8_MAX <= s)                                        \
+      TEST_(s = (s_), (int8_t)s, "%" PRId8);                                   \
+    if (0 <= s && UINT64_MAX <= s)                                             \
+      TEST_(s = (s_), (uint64_t)s, "%" PRIu64);                                \
+    if (0 <= s && UINT32_MAX <= s)                                             \
+      TEST_(s = (s_), (unsigned int)s, "%" PRIu32);                            \
+    if (0 <= s && UINT16_MAX <= s)                                             \
+      TEST_(s = (s_), (unsigned short)s, "%" PRIu16);                          \
+    if (0 <= s && UINT8_MAX <= s)                                              \
+      TEST_(s = (s_), (unsigned char)s, "%" PRIu8);                            \
+  } while (0)
+
+#define TEST_CONVERT(stype, s_)                                                \
+  do {                                                                         \
+    TEST_CONVERT_(stype, s_);                                                  \
+    TEST_CONVERT_(stype, -(s_));                                               \
+  } while (0)
+
+#define TEST_2NUMBER(d1type, d1_, d2type, d2_, operation)                      \
+  do {                                                                         \
+    d1type d1;                                                                 \
+    d2type d2;                                                                 \
+    TEST((d1 = (d1_), d2 = (d2_)), operation);                                 \
+    TEST((d1 = -(d1_), d2 = (d2_)), operation);                                \
+    TEST((d1 = (d1_), d2 = -(d2_)), operation);                                \
+    TEST((d1 = -(d1_), d2 = -(d2_)), operation);                               \
+  } while (0)
+
+int main() {
+  double d;
+  float f;
+  int64_t i64;
+  TEST_CONVERT(double, 0x1.123456789abcp2); // FISTTP
+  TEST_(d = (0x1.123456789abcp512), (float)d, "%a");
+  TEST_CONVERT(double, 0x1.123456789abcp29);
+  TEST_(d = (-0x1.123456789abcp30), (int32_t)d, "%" PRId32);
+  TEST_(d = (-0x1.123456789abcp62), (int64_t)d, "%" PRId64);
+
+  TEST_CONVERT(float, 0x1.123456789abcp2f);
+  TEST_CONVERT(float, 0x1.123456789abcp29f);
+  TEST_(f = -0x1.123456789abcp30f, (int32_t)f, "%" PRId32);
+  // to be fixed:
+  //TEST_(f = -0x1.123456789abcp62f, (int64_t)f, "%" PRId64);
+  // The direction of rounding when an integer is converted to a floating-point
+  // type that cannot exactly represent the original value is implementation-defined:
+  // https://gcc.gnu.org/onlinedocs/gcc/Floating-point-implementation.html
+  // to be fixed:
+  //TEST_(i64 = INT64_MAX, (double)i64, "%a"); // FILD and FSTP
+  TEST(d = -0x1.1234567p0, (double)((int)d));
+  TEST(d = 0x1.9234567p0, (double)((int)d));
+  TEST(d = -0x1.9234567p0, (double)((int)d));
+
+  TEST(d = 0x1.1234567p0, (double)((long int)d));
+  TEST(d = -0x1.1234567p0, (double)((long int)d));
+  TEST(d = 0x1.9234567p0, (double)((long int)d));
+  TEST(d = -0x1.9234567p0, (double)((long int)d));
+
+  TEST_2NUMBER(double, 1.0, double, 0x1.0000000000001p0, d1 + d2);
+  TEST_2NUMBER(double, 1.0, double, 0x1.0000000000001p0, d1 - d2);
+  TEST_2NUMBER(double, 1.0, double, 0x1.0000000000001p0, d2 - d1);
+  TEST_2NUMBER(double, 1.0, double, 0x1.000000000000dp-4, d1 + d2);
+  TEST_2NUMBER(double, 1.0, double, 0x1.000000000000dp-4, d1 - d2);
+  TEST_2NUMBER(double, 1.0, double, 0x1.000000000000dp-4, d2 - d1);
+
+  TEST_2NUMBER(double, 0x1.233445566778p0, double, 0x1.3456789abcdep0, d1 + d2);
+  TEST_2NUMBER(float, 0x1.233445566778p0f, float, 0x1.3456789abcdep0f, d1 *d2);
+  TEST_2NUMBER(float, 0x1.233445566778p0f, double, 0x1.3456789abcdep0, d1 *d2);
+  TEST_2NUMBER(double, 0x1.233445566778p0, double, 0x1.3456789abcdep0, d1 *d2);
+  TEST_2NUMBER(float, 0x1.233445566778p0f, double, 0x1.3456789abcdep0, d1 *d2);
+  TEST_2NUMBER(double, 0x1.233445566778p0, float, 0x1.3456789abcdep0, d1 *d2);
+  TEST_2NUMBER(float, 0x1.233445566778p0f, float, 0x1.3456789abcdep0f, d1 *d2);
+  TEST_2NUMBER(double, 0x1.233445566778p0, int, 5, d1 *d2);
+  TEST_2NUMBER(int, 15, double, 0x1.3456789abcdep0f, d1 *d2);
+  TEST_2NUMBER(float, 0x1.233445566778p0f, int, 15, d1 *d2);
+  TEST_2NUMBER(int, 15, float, 0x1.3456789abcdep0f, d1 *d2);
+
+  TEST_2NUMBER(double, 0x1.233445566778p0, double, 0x1.3456789abcdep0, d1 / d2);
+  TEST_2NUMBER(double, 0x1.233445566778p0, double, 0x1.3456789abcdep0, d1 / d2);
+  TEST_2NUMBER(double, 0x1.233445566778p0, float, 0x1.3456789abcdep0f, d1 / d2);
+
+  TEST_2NUMBER(double, 1.0, double, 0x1.0000000000001p0, d2 - d1);
+  TEST_2NUMBER(double, 1.0, double, 0x1.000000000000dp-4, d1 + d2);
+  TEST_2NUMBER(double, 1.0, double, 0x1.000000000000dp-4, d1 - d2);
+  TEST_2NUMBER(double, 1.0, double, 0x1.000000000000dp-4, d2 - d1);
+
+  TEST_X87_1("FSQRT", 0x1.0000000000000p2);
+  TEST_X87_1("FSQRT", 0x1.0000000000001p1);
+  TEST_X87_1("FSQRT", 0x1.123456789abcp31);
+  TEST_X87_1("FSQRT", 0x1.123456789abdp31);
+
+  return 0;
+}
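The assert_round() helper in roundtest.h encodes the ordering that any correctly rounded operation must satisfy across the four modes; the FSQRT entries at the end of ref31.txt follow it. Below is a standalone sketch of the same invariant (not part of the commit; file name and build flags are only suggestions), using libm's sqrt, which IEEE 754 requires to be correctly rounded: rounding down never exceeds rounding to nearest, rounding to nearest never exceeds rounding up, and for a positive result rounding toward zero coincides with rounding down.

/* invariant.c -- standalone sketch, not part of this commit.
 * Suggested build: gcc -O0 -std=c99 -frounding-math invariant.c -o invariant -lm */
#include <assert.h>
#include <fenv.h>
#include <math.h>

int main(void) {
  static const int modes[] = {FE_TONEAREST, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO};
  volatile double x = 0x1.123456789abcp31;
  double r[4];
  for (int i = 0; i < 4; i++) {
    fesetround(modes[i]);
    r[i] = sqrt(x); /* correctly rounded in the current rounding mode */
  }
  assert(r[1] <= r[0] && r[0] <= r[2]); /* down <= nearest <= up                */
  assert(r[3] == r[1]);                 /* positive result: toward zero == down */
  return 0;
}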