about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author ptitSeb <sebastien.chev@gmail.com> 2025-04-09 15:19:45 +0200
committer ptitSeb <sebastien.chev@gmail.com> 2025-04-09 15:19:45 +0200
commit 8c991cb6762d3a1384cec16d6e54402ce276ea9e (patch)
tree 3d109e77532ce32822efec69950b7db787368fcc
parent 70770db8bec892e66fa8b6834f1ea4a8f27325b6 (diff)
download box64-8c991cb6762d3a1384cec16d6e54402ce276ea9e.tar.gz
box64-8c991cb6762d3a1384cec16d6e54402ce276ea9e.zip
[DYNAREC] Better handling of self-loop and added CALLRET=2 settings (ARM64 only, RV64 and LA64 todo)
-rw-r--r-- docs/USAGE.md 1
-rw-r--r-- docs/box64.pod 3
-rw-r--r-- docs/gen/usage.json 5
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_00.c 18
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_64.c 12
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_67.c 12
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.h 10
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_pass2.h 2
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_pass3.h 2
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_private.h 3
-rw-r--r-- src/dynarec/dynablock.c 46
-rw-r--r-- src/dynarec/dynablock_private.h 7
-rw-r--r-- src/dynarec/dynarec_arch.h 3
-rw-r--r-- src/dynarec/dynarec_native.c 41
-rw-r--r-- src/dynarec/dynarec_native_pass.c 5
-rw-r--r-- src/dynarec/dynarec_private.h 1
-rw-r--r-- src/dynarec/la64/dynarec_la64_private.h 4
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_private.h 4
-rw-r--r-- src/include/dynablock.h 3
-rw-r--r-- src/include/env.h 2
-rw-r--r-- src/libtools/signals.c 79
-rw-r--r-- src/tools/env.c 2
-rw-r--r-- src/wrapped/wrappedlibc.c 14
-rw-r--r-- system/box64.box64rc 9
24 files changed, 257 insertions, 31 deletions
diff --git a/docs/USAGE.md b/docs/USAGE.md
index 33507f44..2a2256b3 100644
--- a/docs/USAGE.md
+++ b/docs/USAGE.md
@@ -581,6 +581,7 @@ Optimize CALL/RET opcodes.
 
  * 0: Do not optimize CALL/RET, use jump table. [Default]
  * 1: Try to optimize CALL/RET, skipping the jump table when possible. 
+ * 2: Try to optimize CALL/RET, skipping the jump table when possible, adding code to handle return to dirty/modified block. 
 
 ### BOX64_DYNAREC_DF
 
diff --git a/docs/box64.pod b/docs/box64.pod
index 82e7153b..787e99cd 100644
--- a/docs/box64.pod
+++ b/docs/box64.pod
@@ -154,12 +154,13 @@ Detect MonoBleedingEdge and apply conservative settings.
  * 1 : Detect MonoBleedingEdge and apply BOX64_DYNAREC_BIGBLOCK=0 BOX64_DYNAREC_STRONGMEM=1 when detected. [Default]
 
 
-=item B<BOX64_DYNAREC_CALLRET> =I<0|1>
+=item B<BOX64_DYNAREC_CALLRET> =I<0|1|2>
 
 Optimize CALL/RET opcodes.
 
  * 0 : Do not optimize CALL/RET, use jump table. [Default]
  * 1 : Try to optimize CALL/RET, skipping the jump table when possible. 
+ * 2 : Try to optimize CALL/RET, skipping the jump table when possible, adding code to handle return to dirty/modified block. 
 
 
 =item B<BOX64_DYNAREC_DF> =I<0|1>
diff --git a/docs/gen/usage.json b/docs/gen/usage.json
index 63be95e0..d42c4dbf 100644
--- a/docs/gen/usage.json
+++ b/docs/gen/usage.json
@@ -239,6 +239,11 @@
         "key": "1",
         "description": "Try to optimize CALL/RET, skipping the jump table when possible.",
         "default": false
+      },
+      {
+        "key": "2",
+        "description": "Try to optimize CALL/RET, skipping the jump table when possible, adding code to handle return to dirty/modified block.",
+        "default": false
       }
     ]
   },
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c
index 0b828704..0a22ac90 100644
--- a/src/dynarec/arm64/dynarec_arm64_00.c
+++ b/src/dynarec/arm64/dynarec_arm64_00.c
@@ -16,6 +16,7 @@
 #include "bridge.h"
 #include "x64trace.h"
 #include "dynarec_native.h"
+#include "../dynablock_private.h"
 #include "custommem.h"
 #include "alternate.h"
 
@@ -3386,11 +3387,17 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         // Push actual return address
                         if(addr < (dyn->start+dyn->isize)) {
                             // there is a next...
-                            j64 = (dyn->insts)?(dyn->insts[ninst].epilog-(dyn->native_size)):0;
+                            if(BOX64DRENV(dynarec_callret)>1)
+                                j64 = CALLRET_GETRET();
+                            else
+                                j64 = (dyn->insts)?(dyn->insts[ninst].epilog-(dyn->native_size)):0;
                             ADR_S20(x4, j64);
                             MESSAGE(LOG_NONE, "\tCALLRET set return to +%di\n", j64>>2);
                         } else {
-                            j64 = (dyn->insts)?(GETMARK-(dyn->native_size)):0;
+                            if(BOX64DRENV(dynarec_callret)>1)
+                                j64 = CALLRET_GETRET();
+                            else
+                                j64 = (dyn->insts)?(GETMARK-(dyn->native_size)):0;
                             ADR_S20(x4, j64);
                             MESSAGE(LOG_NONE, "\tCALLRET set return to +%di\n", j64>>2);
                         }
@@ -3404,6 +3411,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     else
                         j64 = addr+i32;
                     jump_to_next(dyn, j64, 0, ninst, rex.is32bits);
+                    if(BOX64DRENV(dynarec_callret)>1) CALLRET_RET();
                     if (BOX64DRENV(dynarec_callret) && addr >= (dyn->start + dyn->isize)) {
                         // jumps out of current dynablock...
                         MARK;
@@ -4046,7 +4054,10 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         // Push actual return address
                         if(addr < (dyn->start+dyn->isize)) {
                             // there is a next...
-                            j64 = (dyn->insts)?(dyn->insts[ninst].epilog-(dyn->native_size)):0;
+                            if(BOX64DRENV(dynarec_callret)>1)
+                                j64 = CALLRET_GETRET();
+                            else
+                                j64 = (dyn->insts)?(dyn->insts[ninst].epilog-(dyn->native_size)):0;
                             ADR_S20(x4, j64);
                             MESSAGE(LOG_NONE, "\tCALLRET set return to +%di\n", j64>>2);
                         } else {
@@ -4058,6 +4069,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     }
                     PUSH1z(xRIP);
                     jump_to_next(dyn, 0, ed, ninst, rex.is32bits);
+                    if(BOX64DRENV(dynarec_callret)>1) CALLRET_RET();
                     if (BOX64DRENV(dynarec_callret) && addr >= (dyn->start + dyn->isize)) {
                         // jumps out of current dynablock...
                         MARK;
diff --git a/src/dynarec/arm64/dynarec_arm64_64.c b/src/dynarec/arm64/dynarec_arm64_64.c
index f7265aad..1f9edc20 100644
--- a/src/dynarec/arm64/dynarec_arm64_64.c
+++ b/src/dynarec/arm64/dynarec_arm64_64.c
@@ -14,6 +14,7 @@
 #include "emu/x64run_private.h"
 #include "x64trace.h"
 #include "dynarec_native.h"
+#include "../dynablock_private.h"
 #include "custommem.h"
 
 #include "arm64_printer.h"
@@ -1602,11 +1603,17 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         // Push actual return address
                         if(addr < (dyn->start+dyn->isize)) {
                             // there is a next...
-                            j64 = (dyn->insts)?(dyn->insts[ninst].epilog-(dyn->native_size)):0;
+                            if(BOX64DRENV(dynarec_callret)>1)
+                                j64 = CALLRET_GETRET();
+                            else
+                                j64 = (dyn->insts)?(dyn->insts[ninst].epilog-(dyn->native_size)):0;
                             ADR_S20(x4, j64);
                             MESSAGE(LOG_NONE, "\tCALLRET set return to +%di\n", j64>>2);
                         } else {
-                            j64 = (dyn->insts)?(GETMARK-(dyn->native_size)):0;
+                            if(BOX64DRENV(dynarec_callret)>1)
+                                j64 = CALLRET_GETRET();
+                            else
+                                j64 = (dyn->insts)?(GETMARK-(dyn->native_size)):0;
                             ADR_S20(x4, j64);
                             MESSAGE(LOG_NONE, "\tCALLRET set return to +%di\n", j64>>2);
                         }
@@ -1614,6 +1621,7 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     }
                     PUSH1z(xRIP);
                     jump_to_next(dyn, 0, ed, ninst, rex.is32bits);
+                    if(BOX64DRENV(dynarec_callret)>1) CALLRET_RET();
                     if (BOX64DRENV(dynarec_callret) && addr >= (dyn->start + dyn->isize)) {
                         // jumps out of current dynablock...
                         MARK;
diff --git a/src/dynarec/arm64/dynarec_arm64_67.c b/src/dynarec/arm64/dynarec_arm64_67.c
index 1d95bb6b..3d9a7b35 100644
--- a/src/dynarec/arm64/dynarec_arm64_67.c
+++ b/src/dynarec/arm64/dynarec_arm64_67.c
@@ -13,6 +13,7 @@
 #include "emu/x64run_private.h"

 #include "x64trace.h"

 #include "dynarec_native.h"

+#include "../dynablock_private.h"

 #include "custommem.h"

 

 #include "arm64_printer.h"

@@ -1708,11 +1709,17 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         // Push actual return address

                         if(addr < (dyn->start+dyn->isize)) {

                             // there is a next...

-                            j64 = (dyn->insts)?(dyn->insts[ninst].epilog-(dyn->native_size)):0;

+                            if(BOX64DRENV(dynarec_callret)>1)

+                                j64 = CALLRET_GETRET();

+                            else

+                                j64 = (dyn->insts)?(dyn->insts[ninst].epilog-(dyn->native_size)):0;

                             ADR_S20(x4, j64);

                             MESSAGE(LOG_NONE, "\tCALLRET set return to +%di\n", j64>>2);

                         } else {

-                            j64 = (dyn->insts)?(GETMARK-(dyn->native_size)):0;

+                            if(BOX64DRENV(dynarec_callret)>1)

+                                j64 = CALLRET_GETRET();

+                            else

+                                j64 = (dyn->insts)?(GETMARK-(dyn->native_size)):0;

                             ADR_S20(x4, j64);

                             MESSAGE(LOG_NONE, "\tCALLRET set return to +%di\n", j64>>2);

                         }

@@ -1720,6 +1727,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     }

                     PUSH1z(xRIP);

                     jump_to_next(dyn, 0, ed, ninst, rex.is32bits);

+                    if(BOX64DRENV(dynarec_callret)>1) CALLRET_RET();

                     if(BOX64DRENV(dynarec_callret) && addr >= (dyn->start + dyn->isize)) {

                         // jumps out of current dynablock...

                         MARK;

diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 771e80f5..2e152dfb 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -988,6 +988,16 @@
 #define IF_ALIGNED(A) if (!dyn->insts[ninst].unaligned)
 #endif
 
+#ifndef CALLRET_RET
+#define CALLRET_RET()   NOP
+#endif
+#ifndef CALLRET_GETRET
+#define CALLRET_GETRET()    (dyn->callrets?(dyn->callrets[dyn->callret_size].offs-dyn->native_size):0)
+#endif
+#ifndef CALLRET_LOOP
+#define CALLRET_LOOP()  NOP
+#endif
+
 #define STORE_REG(A)    STRx_U12(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
 #define STP_REGS(A, B)  STPx_S7_offset(x##A, x##B, xEmu, offsetof(x64emu_t, regs[_##A]))
 #define LDP_REGS(A, B)  LDPx_S7_offset(x##A, x##B, xEmu, offsetof(x64emu_t, regs[_##A]))
diff --git a/src/dynarec/arm64/dynarec_arm64_pass2.h b/src/dynarec/arm64/dynarec_arm64_pass2.h
index 0975908e..6fcb5b50 100644
--- a/src/dynarec/arm64/dynarec_arm64_pass2.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass2.h
@@ -17,3 +17,5 @@
 #define INST_NAME(name) 
 #define TABLE64(A, V)   {Table64(dyn, (V), 2); EMIT(0);}
 #define FTABLE64(A, V)  {mmx87_regs_t v = {.d = V}; Table64(dyn, v.q, 2); EMIT(0);}
+#define CALLRET_RET()   do {dyn->callrets[dyn->callret_size].type = 0; dyn->callrets[dyn->callret_size++].offs = dyn->native_size; EMIT(ARCH_NOP); } while(0)
+#define CALLRET_LOOP()   do {dyn->callrets[dyn->callret_size].type = 1; dyn->callrets[dyn->callret_size++].offs = dyn->native_size; EMIT(ARCH_NOP); } while(0)
\ No newline at end of file
diff --git a/src/dynarec/arm64/dynarec_arm64_pass3.h b/src/dynarec/arm64/dynarec_arm64_pass3.h
index b274cabb..a0d79f30 100644
--- a/src/dynarec/arm64/dynarec_arm64_pass3.h
+++ b/src/dynarec/arm64/dynarec_arm64_pass3.h
@@ -23,3 +23,5 @@
 #define INST_NAME(name) inst_name_pass3(dyn, ninst, name, rex)
 #define TABLE64(A, V)   {int val64offset = Table64(dyn, (V), 3); MESSAGE(LOG_DUMP, "  Table64: 0x%lx\n", (V)); LDRx_literal(A, val64offset);}
 #define FTABLE64(A, V)  {mmx87_regs_t v = {.d = V}; int val64offset = Table64(dyn, v.q, 3); MESSAGE(LOG_DUMP, "  FTable64: %g\n", v.d); VLDR64_literal(A, val64offset);}
+#define CALLRET_RET()   do {dyn->callrets[dyn->callret_size].type = 0; dyn->callrets[dyn->callret_size++].offs = dyn->native_size; EMIT(ARCH_NOP); } while(0)
+#define CALLRET_LOOP()   do {dyn->callrets[dyn->callret_size].type = 1; dyn->callrets[dyn->callret_size++].offs = dyn->native_size; EMIT(ARCH_NOP); } while(0)
\ No newline at end of file
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index 740e7e9e..1c5f7008 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -47,6 +47,7 @@ typedef union sse_cache_s {
         uint8_t write:1;
     };
 } sse_cache_t;
+typedef struct callret_s callret_t;
 typedef struct neoncache_s {
     // Neon cache
     neon_cache_t        neoncache[32];
@@ -157,6 +158,8 @@ typedef struct dynarec_arm_s {
     dynablock_t*        dynablock;
     instsize_t*         instsize;
     size_t              insts_size; // size of the instruction size array (calculated)
+    int                 callret_size;   // size of the array
+    callret_t*          callrets;   // array of callret return, with NOP / UDF depending if the block is clean or dirty
     uintptr_t           forward;    // address of the last end of code while testing forward
     uintptr_t           forward_to; // address of the next jump to (to check if everything is ok)
     int32_t             forward_size;   // size at the forward point
diff --git a/src/dynarec/dynablock.c b/src/dynarec/dynablock.c
index c12210b5..f0a8bbd9 100644
--- a/src/dynarec/dynablock.c
+++ b/src/dynarec/dynablock.c
@@ -50,15 +50,24 @@ dynablock_t* InvalidDynablock(dynablock_t* db, int need_lock)
         db->done = 0;
         db->gone = 1;
         uintptr_t db_size = db->x64_size;
+        #ifdef ARCH_NOP
+        if(db->callret_size) {
+            // mark all callrets to UDF
+            for(int i=0; i<db->callret_size; ++i)
+                *(uint32_t*)(db->block+db->callrets[i].offs) = ARCH_NOP;
+            ClearCache(db->block, db->size);
+        }
+        #endif
         if(db_size && my_context) {
             uint32_t n = rb_get(my_context->db_sizes, db_size);
             if(n>1)
                 rb_set(my_context->db_sizes, db_size, db_size+1, n-1);
-            else
+            else {
                 rb_unset(my_context->db_sizes, db_size, db_size+1);
-            if(db_size == my_context->max_db_size) {
-                my_context->max_db_size = rb_get_righter(my_context->db_sizes);
-                dynarec_log(LOG_INFO, "BOX64 Dynarec: lower max_db=%d\n", my_context->max_db_size);
+                if(db_size == my_context->max_db_size) {
+                    my_context->max_db_size = rb_get_righter(my_context->db_sizes);
+                    dynarec_log(LOG_INFO, "BOX64 Dynarec: lower max_db=%d\n", my_context->max_db_size);
+                }
             }
         }
         if(need_lock)
@@ -135,7 +144,14 @@ void MarkDynablock(dynablock_t* db)
                 else
                     db->previous = old;
             }
+        } 
+        #ifdef ARCH_NOP
+        else if(db->callret_size) {
+            // mark all callrets to UDF
+            for(int i=0; i<db->callret_size; ++i)
+                *(uint32_t*)(db->block+db->callrets[i].offs) = ARCH_UDF;
         }
+        #endif
     }
 }
 
@@ -290,8 +306,17 @@ dynablock_t* DBGetBlock(x64emu_t* emu, uintptr_t addr, int create, int is32bits)
             dynarec_log(LOG_DEBUG, "Validating block %p from %p:%p (hash:%X, always_test:%d) for %p\n", db, db->x64_addr, db->x64_addr+db->x64_size-1, db->hash, db->always_test, (void*)addr);
             if(db->always_test)
                 protectDB((uintptr_t)db->x64_addr, db->x64_size);
-            else
+            else {
+                #ifdef ARCH_NOP
+                if(db->callret_size) {
+                    // reset all callrets to NOP
+                    for(int i=0; i<db->callret_size; ++i)
+                        *(uint32_t*)(db->block+db->callrets[i].offs) = ARCH_NOP;
+                    ClearCache(db->block, db->size);
+                }
+                #endif
                 protectDBJumpTable((uintptr_t)db->x64_addr, db->x64_size, db->block, db->jmpnext);
+            }
         }
         if(!need_lock)
             mutex_unlock(&my_context->mutex_dyndump);
@@ -326,8 +351,17 @@ dynablock_t* DBAlternateBlock(x64emu_t* emu, uintptr_t addr, uintptr_t filladdr,
         } else {
             if(db->always_test)
                 protectDB((uintptr_t)db->x64_addr, db->x64_size);
-            else
+            else {
+                #ifdef ARCH_NOP
+                if(db->callret_size) {
+                    // reset all callrets to NOP
+                    for(int i=0; i<db->callret_size; ++i)
+                        *(uint32_t*)(db->block+db->callrets[i].offs) = ARCH_NOP;
+                    ClearCache(db->block, db->size);
+                }
+                #endif
                 protectDBJumpTable((uintptr_t)db->x64_addr, db->x64_size, db->block, db->jmpnext);
+            }
         }
         if(!need_lock)
             mutex_unlock(&my_context->mutex_dyndump);
diff --git a/src/dynarec/dynablock_private.h b/src/dynarec/dynablock_private.h
index b9e5f55d..8e174a63 100644
--- a/src/dynarec/dynablock_private.h
+++ b/src/dynarec/dynablock_private.h
@@ -6,6 +6,11 @@ typedef struct instsize_s {
     unsigned char nat:4;
 } instsize_t;
 
+typedef struct callret_s {
+    uint32_t    offs:31;
+    uint32_t    type:1;
+} callret_t;
+
 typedef struct dynablock_s {
     void*           block;  // block-sizeof(void*) == self
     void*           actual_block;   // the actual start of the block (so block-sizeof(void*))
@@ -23,6 +28,8 @@ typedef struct dynablock_s {
     instsize_t*     instsize;
     void*           arch;       // arch dependant per inst info (can be NULL)
     size_t          arch_size;  // size of of arch dependant infos
+    int             callret_size;   // size of the array
+    callret_t*      callrets;   // array of callret return, with NOP / UDF depending if the block is clean or dirty
     void*           jmpnext;    // a branch jmpnext code when block is marked
 } dynablock_t;
 
diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h
index eaf64fd8..44d767b2 100644
--- a/src/dynarec/dynarec_arch.h
+++ b/src/dynarec/dynarec_arch.h
@@ -32,6 +32,9 @@
 #define ARCH_UNALIGNED(A, B) arch_unaligned(A, B)

 extern uint32_t arm64_crc(void* p, uint32_t len);

 #define ARCH_CRC(A, B)  if(arm64_crc32) return arm64_crc(A, B)

+

+#define ARCH_NOP    0b11010101000000110010000000011111

+#define ARCH_UDF    0xcafe

 #elif defined(LA64)

 

 #define instruction_native_t        instruction_la64_t

diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index 58aa4493..df6ab348 100644
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -504,6 +504,7 @@ static int static_jmps[MAX_INSTS+2];
 static uintptr_t static_next[MAX_INSTS+2];
 static uint64_t static_table64[(MAX_INSTS+3)/4];
 static instruction_native_t static_insts[MAX_INSTS+2] = {0};
+static callret_t static_callrets[MAX_INSTS+2] = {0};
 // TODO: ninst could be a uint16_t instead of an int, that could same some temp. memory
 
 void ClearCache(void* start, size_t len)
@@ -653,7 +654,12 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
         int i = helper.jmps[ii];
         uintptr_t j = helper.insts[i].x64.jmp;
         helper.insts[i].x64.jmp_insts = -1;
-        if(j<start || j>=end || j==helper.insts[i].x64.addr) {
+        #ifndef ARCH_NOP
+        if(j<start || j>=end || j==helper.insts[i].x64.addr)
+        #else
+        if(j<start || j>=end)
+        #endif
+        {
             helper.insts[i].x64.need_after |= X_PEND;
         } else {
             // find jump address instruction
@@ -691,8 +697,12 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
                     helper.insts[k].x64.barrier |= BARRIER_FULL;
                 // special case, loop on itself with some nop in between
                 if(k<i && !helper.insts[i].x64.has_next && is_nops(&helper, helper.insts[k].x64.addr, helper.insts[i].x64.addr-helper.insts[k].x64.addr)) {
+                    #ifndef ARCH_NOP
                     helper.always_test = 1;
                     k = -1;
+                    #else
+                    helper.insts[k].x64.self_loop = 1;
+                    #endif
                 }
                 helper.insts[i].x64.jmp_insts = k;
             }
@@ -737,7 +747,11 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
     for(int ii=0; ii<helper.jmp_sz && !helper.always_test; ++ii) {
         int i = helper.jmps[ii];
         if(helper.insts[i].x64.alive && (helper.insts[i].x64.jmp==helper.insts[i].x64.addr)) {
+            #ifndef ARCH_NOP
             helper.always_test = 1;
+            #else
+            helper.insts[i].x64.self_loop = 1;
+            #endif
         }
     }
     // no need for next anymore
@@ -753,6 +767,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
     }
     
     // pass 2, instruction size
+    helper.callrets = static_callrets;
     native_pass2(&helper, addr, alternate, is32bits, inst_max);
     if(helper.abort) {
         if(BOX64DRENV(dynarec_dump) || BOX64ENV(dynarec_log))dynarec_log(LOG_NONE, "Abort dynablock on pass2\n");
@@ -778,15 +793,17 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
     size_t insts_rsize = (helper.insts_size+2)*sizeof(instsize_t);
     insts_rsize = (insts_rsize+7)&~7;   // round the size...
     size_t arch_size = ARCH_SIZE(&helper);
+    size_t callret_size = helper.callret_size*4;
     // ok, now allocate mapped memory, with executable flag on
-    size_t sz = sizeof(void*) + native_size + helper.table64size*sizeof(uint64_t) + 4*sizeof(void*) + insts_rsize + arch_size;
-    //           dynablock_t*     block (arm insts)            table64               jmpnext code       instsize     arch
+    size_t sz = sizeof(void*) + native_size + helper.table64size*sizeof(uint64_t) + 4*sizeof(void*) + insts_rsize + arch_size + callret_size;
+    //           dynablock_t*     block (arm insts)            table64               jmpnext code       instsize     arch         callrets
     void* actual_p = (void*)AllocDynarecMap(sz);
     void* p = (void*)(((uintptr_t)actual_p) + sizeof(void*));
     void* tablestart = p + native_size;
     void* next = tablestart + helper.table64size*sizeof(uint64_t);
     void* instsize = next + 4*sizeof(void*);
     void* arch = instsize + insts_rsize;
+    void* callrets = arch + arch_size;
     if(actual_p==NULL) {
         dynarec_log(LOG_INFO, "AllocDynarecMap(%p, %zu) failed, canceling block\n", block, sz);
         CancelBlock64(0);
@@ -801,9 +818,13 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
     *(dynablock_t**)actual_p = block;
     helper.table64cap = helper.table64size;
     helper.table64 = (uint64_t*)helper.tablestart;
+    helper.callrets = (callret_t*)callrets;
+    if(callret_size)
+        memcpy(helper.callrets, static_callrets, helper.callret_size*sizeof(callret_t));
+    helper.callret_size = 0;
     // pass 3, emit (log emit native opcode)
     if(BOX64DRENV(dynarec_dump)) {
-        dynarec_log(LOG_NONE, "%s%04d|Emitting %zu bytes for %u %s bytes (native=%zu, table64=%zu, instsize=%zu, arch=%zu)", (BOX64DRENV(dynarec_dump)>1)?"\e[01;36m":"", GetTID(), helper.native_size, helper.isize, is32bits?"x86":"x64", native_size, helper.table64size*sizeof(uint64_t), insts_rsize, arch_size); 
+        dynarec_log(LOG_NONE, "%s%04d|Emitting %zu bytes for %u %s bytes (native=%zu, table64=%zu, instsize=%zu, arch=%zu, callrets=%zu)", (BOX64DRENV(dynarec_dump)>1)?"\e[01;36m":"", GetTID(), helper.native_size, helper.isize, is32bits?"x86":"x64", native_size, helper.table64size*sizeof(uint64_t), insts_rsize, arch_size, callret_size);
         printFunctionAddr(helper.start, " => ");
         dynarec_log(LOG_NONE, "%s\n", (BOX64DRENV(dynarec_dump)>1)?"\e[m":"");
     }
@@ -846,6 +867,8 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
         block->arch = NULL;
         block->arch_size = arch_size;
     }
+    block->callret_size = helper.callret_size;
+    block->callrets = helper.callrets;
     *(dynablock_t**)next = block;
     *(void**)(next+3*sizeof(void*)) = native_next;
     CreateJmpNext(block->jmpnext, next+3*sizeof(void*));
@@ -889,7 +912,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
         printf_log(LOG_NONE, "Warning, insts_size difference in block between pass2 (%zu) and pass3 (%zu), allocated: %zu\n", oldinstsize, helper.insts_size, insts_rsize/sizeof(instsize_t));
     }
     if(!isprotectedDB(addr, end-addr)) {
-        dynarec_log(LOG_DEBUG, "Warning, block unprotected while being processed %p:%ld, marking as need_test\n", block->x64_addr, block->x64_size);
+        dynarec_log(LOG_INFO, "Warning, block unprotected while being processed %p:%ld, marking as need_test\n", block->x64_addr, block->x64_size);
         block->dirty = 1;
         //protectDB(addr, end-addr);
     }
@@ -898,7 +921,13 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
         block->always_test = 1;
     }
     if(block->always_test) {
-        dynarec_log(LOG_DEBUG, "Note: block marked as always dirty %p:%ld\n", block->x64_addr, block->x64_size);
+        dynarec_log(LOG_INFO, "Note: block marked as always dirty %p:%ld\n", block->x64_addr, block->x64_size);
+        #ifdef ARCH_NOP
+        // mark callrets to trigger SIGILL to check clean state
+        if(block->callret_size)
+            for(int i=0; i<block->callret_size; ++i)
+                *(uint32_t*)(block->block+block->callrets[i].offs) = ARCH_UDF;
+        #endif
     }
     current_helper = NULL;
     //block->done = 1;
diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index bf30503c..99cb2449 100644
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -14,6 +14,7 @@
 #include "x64trace.h"
 #include "dynablock.h"
 #include "dynarec_native.h"
+#include "dynablock_private.h"
 #include "custommem.h"
 #include "elfloader.h"
 #include "x64test.h"
@@ -114,6 +115,10 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
         dyn->f.dfnone_here = 0;
         NEW_INST;
         MESSAGE(LOG_DUMP, "New Instruction %s:%p, native:%p\n", is32bits?"x86":"x64",(void*)addr, (void*)dyn->block);
+        #ifdef ARCH_NOP
+        if(dyn->insts[ninst].x64.alive && dyn->insts[ninst].x64.self_loop)
+            CALLRET_LOOP();
+        #endif
         if(!ninst) {
             GOTEST(x1, x2);
         }
diff --git a/src/dynarec/dynarec_private.h b/src/dynarec/dynarec_private.h
index 111e7c74..9e3c55be 100644
--- a/src/dynarec/dynarec_private.h
+++ b/src/dynarec/dynarec_private.h
@@ -41,6 +41,7 @@ typedef struct instruction_x64_s {
     uint8_t     has_next:1;   // does this opcode can continue to the next?
     uint8_t     has_callret:1;    // this instruction have an optimized call setup
     uint8_t     alive:1;    // this opcode gets executed (0 if dead code in that block)
+    uint8_t     self_loop:1;    // this is a landing address for a self-loop (loop on itself with no exit)
     uint8_t     barrier;    // next instruction is a jump point, so no optim allowed
     uint8_t     state_flags;// One of SF_XXX state
     uint8_t     use_flags;  // 0 or combination of X_?F
diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h
index 7c40ca27..0246007e 100644
--- a/src/dynarec/la64/dynarec_la64_private.h
+++ b/src/dynarec/la64/dynarec_la64_private.h
@@ -68,6 +68,8 @@ typedef struct flagcache_s {
     uint8_t             dfnone_here;// defered flags is cleared in this opcode
 } flagcache_t;
 
+typedef struct callret_s callret_t;
+
 typedef struct instruction_la64_s {
     instruction_x64_t   x64;
     uintptr_t           address;    // (start) address of the arm emitted instruction
@@ -134,6 +136,8 @@ typedef struct dynarec_la64_s {
     dynablock_t*         dynablock;
     instsize_t*          instsize;
     size_t               insts_size; // size of the instruction size array (calculated)
+    int                  callret_size;   // size of the array
+    callret_t*           callrets;   // array of callret return, with NOP / UDF depending if the block is clean or dirty
     uintptr_t            forward;    // address of the last end of code while testing forward
     uintptr_t            forward_to; // address of the next jump to (to check if everything is ok)
     int32_t              forward_size;   // size at the forward point
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index 99281462..7519d99c 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -100,6 +100,8 @@ typedef struct flagcache_s {
     uint8_t             dfnone_here;// defered flags is cleared in this opcode
 } flagcache_t;
 
+typedef struct callret_s callret_t;
+
 typedef struct instruction_rv64_s {
     instruction_x64_t   x64;
     uintptr_t           address;    // (start) address of the riscv emitted instruction
@@ -169,6 +171,8 @@ typedef struct dynarec_rv64_s {
     dynablock_t*        dynablock;
     instsize_t*         instsize;
     size_t              insts_size; // size of the instruction size array (calculated)
+    int                 callret_size;   // size of the array
+    callret_t*          callrets;   // array of callret return, with NOP / UDF depending if the block is clean or dirty
     uint8_t             smwrite;    // for strongmem model emulation
     uintptr_t           forward;    // address of the last end of code while testing forward
     uintptr_t           forward_to; // address of the next jump to (to check if everything is ok)
diff --git a/src/include/dynablock.h b/src/include/dynablock.h
index 757ca4ae..b9aeddc0 100644
--- a/src/include/dynablock.h
+++ b/src/include/dynablock.h
@@ -21,4 +21,7 @@ dynablock_t* DBAlternateBlock(x64emu_t* emu, uintptr_t addr, uintptr_t filladdr,
 // for use in signal handler
 void cancelFillBlock(void);
 
+// clear instruction cache on a range
+void ClearCache(void* start, size_t len);
+
 #endif //__DYNABLOCK_H_
\ No newline at end of file
diff --git a/src/include/env.h b/src/include/env.h
index d820931d..3398f3b0 100644
--- a/src/include/env.h
+++ b/src/include/env.h
@@ -38,7 +38,7 @@ extern char* ftrace_name;
     BOOLEAN(BOX64_DYNAREC_ALIGNED_ATOMICS, dynarec_aligned_atomics, 0)  \
     INTEGER(BOX64_DYNAREC_BIGBLOCK, dynarec_bigblock, 2, 0, 3)          \
     BOOLEAN(BOX64_DYNAREC_BLEEDING_EDGE, dynarec_bleeding_edge, 1)      \
-    BOOLEAN(BOX64_DYNAREC_CALLRET, dynarec_callret, 0)                  \
+    INTEGER(BOX64_DYNAREC_CALLRET, dynarec_callret, 0, 0, 2)            \
     BOOLEAN(BOX64_DYNAREC_DF, dynarec_df, 1)                            \
     INTEGER(BOX64_DYNAREC_DIRTY, dynarec_dirty, 0, 0, 2)                \
     BOOLEAN(BOX64_DYNAREC_DIV0, dynarec_div0, 0)                        \
diff --git a/src/libtools/signals.c b/src/libtools/signals.c
index 21e79cab..99ddd41e 100644
--- a/src/libtools/signals.c
+++ b/src/libtools/signals.c
@@ -1616,6 +1616,80 @@ void my_box64signalhandler(int32_t sig, siginfo_t* info, void * ucntx)
             return;
         }
     }
+    #ifdef ARCH_NOP
+    if(sig==SIGILL) {   // SIGILL: check whether the faulting native pc is one of the callret sites recorded for its dynablock
+        db = FindDynablockFromNativeAddress(pc);
+        if(db)
+            x64pc = getX64Address(db, (uintptr_t)pc);   // this will be incorrect in the case of the callret!
+        db_searched = 1;
+        if(db && db->callret_size) {
+            int is_callrets = 0;    // set once pc matches an entry of db->callrets
+            int type_callret = 0;   // type of the matching entry: 0 is handled as "ret", non-zero as "self-loop"
+            for(int i=0; i<db->callret_size && !is_callrets; ++i)
+                if(pc==(db->block+db->callrets[i].offs)) {
+                    is_callrets = 1;
+                    type_callret = db->callrets[i].type;
+                }
+            if(is_callrets) {
+                if(!type_callret) {
+                    // adjust x64pc for "ret" type: take it from the saved native register (presumably the one mapped to x64 RIP - TODO confirm)
+                    #ifdef __aarch64__
+                    x64pc = p->uc_mcontext.regs[27];
+                    #elif defined(LA64)
+                    x64pc = p->uc_mcontext.__gregs[20];
+                    #elif defined(RV64)
+                    x64pc = p->uc_mcontext.__gregs[22];
+                    #endif
+                }
+                // check if block is still valid (the hash is only computed when the block is neither gone nor in a hot page)
+                int is_hotpage = checkInHotPage(x64pc);
+                uint32_t hash = (db->gone || is_hotpage)?0:X31_hash_code(db->x64_addr, db->x64_size);
+                if(!db->gone && !is_hotpage && hash==db->hash) {
+                    dynarec_log(LOG_INFO, "Dynablock (%p, x64addr=%p, always_test=%d) is clean, %s continuing at %p (%p)!\n", db, db->x64_addr, db->always_test, type_callret?"self-loop":"ret from callret", (void*)x64pc, (void*)addr);
+                    // it's good! go next opcode (resume 4 bytes after the faulting native instruction)
+                    #ifdef __aarch64__
+                    p->uc_mcontext.pc+=4;
+                    #elif defined(LA64)
+                    p->uc_mcontext.__pc+=4;
+                    #elif defined(RV64)
+                    p->uc_mcontext.__gregs[REG_PC]+=4;
+                    #endif
+                    if(db->always_test)
+                        protectDB((uintptr_t)db->x64_addr, 1);
+                    else {
+                        if(db->callret_size) {
+                            // mark all callrets to NOP, then clear the instruction cache over the patched block
+                            for(int i=0; i<db->callret_size; ++i)
+                                *(uint32_t*)(db->block+db->callrets[i].offs) = ARCH_NOP;
+                            ClearCache(db->block, db->size);
+                        }
+                        protectDBJumpTable((uintptr_t)db->x64_addr, db->x64_size, db->block, db->jmpnext);
+                    }
+                    return;
+                } else {
+                    // dynablock got dirty! need to get out of it!!!
+                    if(emu->jmpbuf) {
+                        copyUCTXreg2Emu(emu, p, x64pc);
+                        // only copy as it's a return address, so there is just the "epilog" to mimic here on "ret" type. "loop" type need everything
+                        if(type_callret) {
+                            adjustregs(emu);
+                            if(db && db->arch_size)
+                                ARCH_ADJUST(db, emu, p, x64pc);
+                        }
+                        dynarec_log(LOG_INFO, "Dynablock (%p, x64addr=%p) %s, getting out at %s %p (%p)!\n", db, db->x64_addr, is_hotpage?"in HotPage":"dirty", type_callret?"self-loop":"ret from callret", (void*)R_RIP, (void*)addr);  // argument order must follow the %s %s %p %p tail of the format
+                        emu->test.clean = 0;
+                        #ifdef ANDROID
+                        siglongjmp(*(JUMPBUFF*)emu->jmpbuf, 2);
+                        #else
+                        siglongjmp(emu->jmpbuf, 2);
+                        #endif
+                    }
+                    dynarec_log(LOG_INFO, "Warning, Dirty %s (%p for db %p/%p) detected, but jmpbuffer not ready!\n", type_callret?"self-loop":"ret from callret", (void*)addr, db, (void*)db->x64_addr);
+                }
+            }
+        }
+    }
+    #endif
     int Locks = unlockMutex();
     uint32_t prot = getProtection((uintptr_t)addr);
     #ifdef BAD_SIGNAL
@@ -1737,7 +1811,10 @@ dynarec_log(/*LOG_DEBUG*/LOG_INFO, "%04d|Repeated SIGSEGV with Access error on %
             glitch_pc = NULL;
             glitch_addr = NULL;
             glitch_prot = 0;
-        }
+            relockMutex(Locks);
+            unlock_signal();
+            return; // try again
+    }
         if(addr && pc && ((prot&(PROT_READ|PROT_WRITE))==(PROT_READ|PROT_WRITE))) {
             static void* glitch2_pc = NULL;
             static void* glitch2_addr = NULL;
diff --git a/src/tools/env.c b/src/tools/env.c
index f1390fbe..00e116f3 100644
--- a/src/tools/env.c
+++ b/src/tools/env.c
@@ -627,6 +627,7 @@ void RecordEnvMappings(uintptr_t addr, size_t length, int fd)
             if (k != kh_end(box64env_entries))
                 mapping->env = &kh_value(box64env_entries, k);
         }
+        dynarec_log(LOG_INFO, "Mapping %s (%s) in %p-%p\n", fullname, lowercase_filename, (void*)addr, (void*)(addr+length));
     } else
         mapping = kh_value(mapping_entries, k);
 
@@ -659,6 +660,7 @@ void RemoveMapping(uintptr_t addr, size_t length)
             start = end;
         } while(end!=UINTPTR_MAX);
         // no occurence found, delete mapping
+        dynarec_log(LOG_INFO, "Delete Mapping %s (%s) in %p(%p)-%p\n", mapping->fullname, mapping->filename, (void*)addr, (void*)mapping->start, (void*)(addr+length));
         khint_t k = kh_get(mapping_entry, mapping_entries, mapping->filename);
         if(k!=kh_end(mapping_entries))
             kh_del(mapping_entry, mapping_entries, k);
diff --git a/src/wrapped/wrappedlibc.c b/src/wrapped/wrappedlibc.c
index 7eb5aa9e..20ced661 100644
--- a/src/wrapped/wrappedlibc.c
+++ b/src/wrapped/wrappedlibc.c
@@ -3029,13 +3029,6 @@ EXPORT void* my_mmap64(x64emu_t* emu, void *addr, size_t length, int prot, int f
     }
     #endif
     if(ret!=MAP_FAILED) {
-        if((flags&MAP_SHARED) && (fd>0)) {
-            uint32_t flags = fcntl(fd, F_GETFL);
-            if((flags&O_ACCMODE)==O_RDWR) {
-                if((BOX64ENV(log)>=LOG_DEBUG || BOX64ENV(dynarec_log)>=LOG_DEBUG)) {printf_log(LOG_NONE, "Note: Marking the region (%p-%p prot=%x) as NEVERCLEAN because fd have O_RDWR attribute\n", ret, ret+length, prot);}
-                prot |= PROT_NEVERCLEAN;
-            }
-        }
         if(emu && !(flags&MAP_ANONYMOUS) && (fd>0)) {
             DetectUnityPlayer(fd);
             // the last_mmap will allow mmap created by wine, even those that have hole, to be fully tracked as one single mmap
@@ -3044,6 +3037,13 @@ EXPORT void* my_mmap64(x64emu_t* emu, void *addr, size_t length, int prot, int f
             else
                 RecordEnvMappings((uintptr_t)ret, length, fd);
         }
+        if((flags&MAP_SHARED) && (fd>0)) {   // shared mapping backed by an fd: look at how that fd was opened
+            uint32_t fd_flags = fcntl(fd, F_GETFL);   // file status flags of fd, named apart from the outer mmap 'flags' it previously shadowed
+            if((fd_flags&O_ACCMODE)==O_RDWR) {
+                if((BOX64ENV(log)>=LOG_DEBUG || BOX64ENV(dynarec_log)>=LOG_DEBUG)) {printf_log(LOG_NONE, "Note: Marking the region (%p-%p prot=%x) as NEVERCLEAN because fd have O_RDWR attribute\n", ret, ret+length, prot);}
+                prot |= PROT_NEVERCLEAN;
+            }
+        }
         // hack to capture full size of the mmap done by wine
         if(emu && (fd==-1) && (flags==(MAP_PRIVATE|MAP_ANON))) {
             last_mmap_addr = ret;
diff --git a/system/box64.box64rc b/system/box64.box64rc
index 53f49a4a..3d47fdeb 100644
--- a/system/box64.box64rc
+++ b/system/box64.box64rc
@@ -351,7 +351,7 @@ BOX64_DYNAREC_ALIGNED_ATOMICS=1
 [Blacksad.exe]
 BOX64_DYNAREC_STRONGMEM=1
 BOX64_DYNAREC_BIGBLOCK=3
-BOX64_DYNAREC_CALLRET=0
+BOX64_DYNAREC_CALLRET=2
 
 [Battle.net.exe]
 BOX64_DYNAREC_BIGBLOCK=0
@@ -465,7 +465,7 @@ BOX64_DYNAREC_BIGBLOCK=0
 BOX64_DYNAREC_SAFEFLAGS=2
 BOX64_DYNAREC_STRONGMEM=1
 BOX64_DYNAREC_BIGBLOCK=3
-BOX64_DYNAREC_CALLRET=0
+BOX64_DYNAREC_CALLRET=2
 BOX64_SSE_FLUSHTO0=1
 BOX64_DYNAREC_DIRTY=1
 
@@ -734,6 +734,11 @@ BOX64_DYNAREC_BIGBLOCK=3
 BOX64_DYNAREC_CALLRET=1
 BOX64_DYNAREC_SAFEFLAGS=0
 
+[Trials of Innocence.exe]
+BOX64_DYNAREC_BIGBLOCK=3
+BOX64_DYNAREC_CALLRET=2
+BOX64_DYNAREC_DIRTY=0
+
 [TT2.exe]
 BOX64_DYNAREC_STRONGMEM=1
 BOX64_DYNAREC_BIGBLOCK=3