about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author: Leslie Zhai <zhaixiang@loongson.cn> 2025-09-19 14:10:41 +0800
committer: GitHub <noreply@github.com> 2025-09-19 08:10:41 +0200
commit: 5bfb27d3fa867c5d204d5f7e23507e36b87c82f7 (patch)
tree: bcfabe4007a9e9d01a928fc79f42bdde0af83aee /src
parent: 0d355aaf01e0e76968a0b9ea7e4478e9b4228948 (diff)
download: box64-5bfb27d3fa867c5d204d5f7e23507e36b87c82f7.tar.gz
download: box64-5bfb27d3fa867c5d204d5f7e23507e36b87c82f7.zip
[LA64_DYNAREC] Refactor register mapping (#2940)
* [LA64_DYNAREC] Refactor register mapping

* [LA64_DYNAREC] Fix typo

* [LA64_DYNAREC] Remapping xSavedSP to fp ($r22)

* [LA64_DYNAREC] Fix VPCLMULQDQ x3 and x4 issue

* [LA64_DYNAREC] Fix typo

* [LA64_DYNAREC] Fix typo
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/la64/dynarec_la64_00.c 34
-rw-r--r-- src/dynarec/la64/dynarec_la64_0f.c 44
-rw-r--r-- src/dynarec/la64/dynarec_la64_66.c 10
-rw-r--r-- src/dynarec/la64/dynarec_la64_660f.c 24
-rw-r--r-- src/dynarec/la64/dynarec_la64_avx_66_0f38.c 10
-rw-r--r-- src/dynarec/la64/dynarec_la64_avx_66_0f3a.c 4
-rw-r--r-- src/dynarec/la64/dynarec_la64_helper.c 48
-rw-r--r-- src/dynarec/la64/dynarec_la64_helper.h 25
-rw-r--r-- src/dynarec/la64/dynarec_la64_private.h 12
-rw-r--r-- src/dynarec/la64/la64_emitter.h 2
-rw-r--r-- src/dynarec/la64/la64_epilog.S 58
-rw-r--r-- src/dynarec/la64/la64_lock.S 6
-rw-r--r-- src/dynarec/la64/la64_mapping.h 161
-rw-r--r-- src/dynarec/la64/la64_next.S 66
-rw-r--r-- src/dynarec/la64/la64_printer.c 2
-rw-r--r-- src/dynarec/la64/la64_prolog.S 59
-rw-r--r-- src/emu/x64emu_private.h 2
17 files changed, 318 insertions, 249 deletions
diff --git a/src/dynarec/la64/dynarec_la64_00.c b/src/dynarec/la64/dynarec_la64_00.c
index 7faaa01a..e2bc02d6 100644
--- a/src/dynarec/la64/dynarec_la64_00.c
+++ b/src/dynarec/la64/dynarec_la64_00.c
@@ -674,7 +674,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             }
             GETIP(ip, x7);
             STORE_XEMU_CALL();
-            CALL(const_native_priv, -1);
+            CALL(const_native_priv, -1, 0, 0);
             LOAD_XEMU_CALL();
             jump_to_epilog(dyn, 0, xRIP, ninst);
             *need_epilog = 0;
@@ -690,7 +690,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             }
             GETIP(ip, x7);
             STORE_XEMU_CALL();
-            CALL(const_native_priv, -1);
+            CALL(const_native_priv, -1, 0, 0);
             LOAD_XEMU_CALL();
             jump_to_epilog(dyn, 0, xRIP, ninst);
             *need_epilog = 0;
@@ -1671,7 +1671,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     GETEB(x1, 1);
                     u8 = F8;
                     MOV32w(x2, u8);
-                    CALL_(const_rol8, ed, x3);
+                    CALL_(const_rol8, ed, x3, x1, x2);
                     EBBACK();
                     break;
                 case 4:
@@ -1983,7 +1983,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     STORE_XEMU_CALL();
                     ADDI_D(x3, xRIP, 8 + 8 + 2);                        // expected return address
                     ADDI_D(x1, xEmu, (uint32_t)offsetof(x64emu_t, ip)); // setup addr as &emu->ip
-                    CALL_(const_int3, -1, x3);
+                    CALL_(const_int3, -1, x3, x1, 0);
                     LOAD_XEMU_CALL();
                     addr += 8 + 8;
                     BNE_MARK(xRIP, x3);
@@ -2002,7 +2002,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     BEQZ_MARK(x3);
                     GETIP(addr, x7);
                     STORE_XEMU_CALL();
-                    CALL(const_native_int3, -1);
+                    CALL(const_native_int3, -1, 0, 0);
                     LOAD_XEMU_CALL();
                     MARK;
                     jump_to_epilog(dyn, addr, 0, ninst);
@@ -2020,7 +2020,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 GETIP(ip, x7); // priviledged instruction, IP not updated
                 STORE_XEMU_CALL();
                 MOV32w(x1, u8);
-                CALL(const_native_int, -1);
+                CALL(const_native_int, -1, x1, 0);
                 LOAD_XEMU_CALL();
             } else if (u8 == 0x80) {
                 INST_NAME("32bits SYSCALL");
@@ -2028,7 +2028,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 SMEND();
                 GETIP(addr, x7);
                 STORE_XEMU_CALL();
-                CALL_S(const_x86syscall, -1);
+                CALL_S(const_x86syscall, -1, 0);
                 LOAD_XEMU_CALL();
                 TABLE64(x3, addr); // expected return address
                 BNE_MARK(xRIP, x3);
@@ -2046,7 +2046,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 }
                 GETIP(addr, x7);
                 STORE_XEMU_CALL();
-                CALL(const_native_int3, -1);
+                CALL(const_native_int3, -1, 0, 0);
                 LOAD_XEMU_CALL();
                 jump_to_epilog(dyn, 0, xRIP, ninst);
                 *need_epilog = 0;
@@ -2060,7 +2060,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 }
                 GETIP(ip, x7); // priviledged instruction, IP not updated
                 STORE_XEMU_CALL();
-                CALL(const_native_priv, -1);
+                CALL(const_native_priv, -1, 0, 0);
                 LOAD_XEMU_CALL();
                 jump_to_epilog(dyn, 0, xRIP, ninst);
                 *need_epilog = 0;
@@ -2091,7 +2091,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     }
                     MESSAGE(LOG_DUMP, "Need Optimization\n");
                     SETFLAGS(X_OF | X_CF, SF_SET_DF, NAT_FLAGS_NOFUSION);
-                    CALL_(const_rol8, ed, x3);
+                    CALL_(const_rol8, ed, x3, x1, x2);
                     EBBACK();
                     break;
                 case 4:
@@ -2173,7 +2173,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     SETFLAGS(X_OF | X_CF, SF_SET_DF, NAT_FLAGS_NOFUSION);
                     MOV32w(x2, 1);
                     GETEDW(x4, x1, 0);
-                    CALL_(rex.w ? const_rcr64 : const_rcr32, ed, x4);
+                    CALL_(rex.w ? const_rcr64 : const_rcr32, ed, x4, x1, x2);
                     WBACK;
                     if (!wback && !rex.w) ZEROUP(ed);
                     break;
@@ -2413,7 +2413,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         GETIP_(dyn->insts[ninst].natcall, x7); // read the 0xCC already
                         STORE_XEMU_CALL();
                         ADDI_D(x1, xEmu, (uint32_t)offsetof(x64emu_t, ip)); // setup addr as &emu->ip
-                        CALL_S(const_int3, -1);
+                        CALL_S(const_int3, -1, x1);
                         LOAD_XEMU_CALL();
                         MOV64x(x3, dyn->insts[ninst].natcall);
                         ADDI_D(x3, x3, 2 + 8 + 8);
@@ -2545,7 +2545,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             }
             GETIP(ip, x7);
             STORE_XEMU_CALL();
-            CALL(const_native_priv, -1);
+            CALL(const_native_priv, -1, 0, 0);
             LOAD_XEMU_CALL();
             jump_to_epilog(dyn, 0, xRIP, ninst);
             *need_epilog = 0;
@@ -2611,7 +2611,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     MESSAGE(LOG_DUMP, "Need Optimization\n");
                     SETFLAGS(X_ALL, SF_SET_DF, NAT_FLAGS_NOFUSION);
                     GETEB(x1, 0);
-                    CALL(const_div8, -1);
+                    CALL(const_div8, -1, x1, 0);
                     break;
                 default:
                     DEFAULT;
@@ -2721,8 +2721,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         } else {
                             GETEDH(x4, x1, 0); // get edd changed addr, so cannot be called 2 times for same op...
                             BEQ_MARK(xRDX, xZR);
-                            if (ed != x1) { MV(x1, ed); }
-                            CALL(const_div64, -1);
+                            CALL(const_div64, -1, ed, 0);
                             B_NEXT_nocond;
                             MARK;
                             DIV_DU(x2, xRAX, ed);
@@ -2767,8 +2766,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                             BNE_MARK3(x2, xZR);
                             BLT_MARK(xRAX, xZR);
                             MARK3;
-                            if (ed != x1) MV(x1, ed);
-                            CALL(const_idiv64, -1);
+                            CALL(const_idiv64, -1, ed, 0);
                             B_NEXT_nocond;
                             MARK;
                             DIV_D(x2, xRAX, ed);
diff --git a/src/dynarec/la64/dynarec_la64_0f.c b/src/dynarec/la64/dynarec_la64_0f.c
index cb815e9a..6e27e59b 100644
--- a/src/dynarec/la64/dynarec_la64_0f.c
+++ b/src/dynarec/la64/dynarec_la64_0f.c
@@ -88,7 +88,7 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             SMEND();
             GETIP(addr, x7);
             STORE_XEMU_CALL();
-            CALL_S(const_x64syscall, -1);
+            CALL_S(const_x64syscall, -1, 0);
             LOAD_XEMU_CALL();
             TABLE64(x3, addr); // expected return address
             BNE_MARK(xRIP, x3);
@@ -107,7 +107,7 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             }
             GETIP(ip, x7);
             STORE_XEMU_CALL();
-            CALL(const_native_ud, -1);
+            CALL(const_native_ud, -1, 0, 0);
             LOAD_XEMU_CALL();
             jump_to_epilog(dyn, 0, xRIP, ninst);
             *need_epilog = 0;
@@ -420,7 +420,7 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             INST_NAME("RDTSC");
             NOTEST(x1);
             if (box64_rdtsc) {
-                CALL(const_readtsc, x3); // will return the u64 in x3
+                CALL(const_readtsc, x3, 0, 0); // will return the u64 in x3
             } else {
                 RDTIME_D(x3, xZR);
             }
@@ -620,12 +620,10 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         ed = (nextop & 7) + (rex.b << 3);
                         sse_reflect_reg(dyn, ninst, ed);
                         ADDI_D(x2, xEmu, offsetof(x64emu_t, xmm[ed]));
+                        ed = x2;
                     } else {
                         SMREAD();
                         addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 0, 0);
-                        if (ed != x2) {
-                            MV(x2, ed);
-                        }
                     }
                     GETG;
                     sse_forget_reg(dyn, ninst, gd);
@@ -633,22 +631,22 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     sse_reflect_reg(dyn, ninst, 0);
                     switch (u8) {
                         case 0xC8:
-                            CALL(const_sha1nexte, -1);
+                            CALL(const_sha1nexte, -1, x1, ed);
                             break;
                         case 0xC9:
-                            CALL(const_sha1msg1, -1);
+                            CALL(const_sha1msg1, -1, x1, ed);
                             break;
                         case 0xCA:
-                            CALL(const_sha1msg2, -1);
+                            CALL(const_sha1msg2, -1, x1, ed);
                             break;
                         case 0xCB:
-                            CALL(const_sha256rnds2, -1);
+                            CALL(const_sha256rnds2, -1, x1, ed);
                             break;
                         case 0xCC:
-                            CALL(const_sha256msg1, -1);
+                            CALL(const_sha256msg1, -1, x1, ed);
                             break;
                         case 0xCD:
-                            CALL(const_sha256msg2, -1);
+                            CALL(const_sha256msg2, -1, x1, ed);
                             break;
                     }
                     break;
@@ -710,17 +708,17 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         ed = (nextop & 7) + (rex.b << 3);
                         sse_reflect_reg(dyn, ninst, ed);
                         ADDI_D(x2, xEmu, offsetof(x64emu_t, xmm[ed]));
+                        wback = x2;
                     } else {
                         SMREAD();
                         addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 0, 1);
-                        if (wback != x2) MV(x2, wback);
                     }
                     u8 = F8;
                     GETG;
                     sse_forget_reg(dyn, ninst, gd);
                     ADDI_D(x1, xEmu, offsetof(x64emu_t, xmm[gd]));
                     MOV32w(x3, u8);
-                    CALL(const_sha1rnds4, -1);
+                    CALL4(const_sha1rnds4, -1, x1, wback, x3, 0);
                     break;
                 default:
                     DEFAULT;
@@ -1368,8 +1366,7 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         case 0xA2:
             INST_NAME("CPUID");
             NOTEST(x1);
-            MV(A1, xRAX);
-            CALL_(const_cpuid, -1, 0);
+            CALL_(const_cpuid, -1, 0, xRAX, 0);
             // BX and DX are not synchronized durring the call, so need to force the update
             LD_D(xRDX, xEmu, offsetof(x64emu_t, regs[_DX]));
             LD_D(xRBX, xEmu, offsetof(x64emu_t, regs[_BX]));
@@ -1497,8 +1494,7 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                             DEFAULT;
                         } else {
                             addr = geted(dyn, addr, ninst, nextop, &ed, x1, x3, &fixedaddress, rex, NULL, 0, 0);
-                            if (ed != x1) { MV(x1, ed); }
-                            CALL(rex.is32bits ? const_fpu_fxsave32 : const_fpu_fxsave64, -1);
+                            CALL(rex.is32bits ? const_fpu_fxsave32 : const_fpu_fxsave64, -1, ed, 0);
                         }
                         break;
                     case 1:
@@ -1507,8 +1503,7 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         SKIPTEST(x1);
                         fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, x3, &fixedaddress, rex, NULL, 0, 0);
-                        if (ed != x1) { MV(x1, ed); }
-                        CALL(rex.is32bits ? const_fpu_fxrstor32 : const_fpu_fxrstor64, -1);
+                        CALL(rex.is32bits ? const_fpu_fxrstor32 : const_fpu_fxrstor64, -1, ed, 0);
                         break;
                     case 2:
                         INST_NAME("LDMXCSR Md");
@@ -1563,25 +1558,22 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         MESSAGE(LOG_DUMP, "Need Optimization\n");
                         fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                        if (ed != x1) { MV(x1, ed); }
                         MOV32w(x2, rex.w ? 0 : 1);
-                        CALL(const_fpu_xsave, -1);
+                        CALL(const_fpu_xsave, -1, ed, x2);
                         break;
                     case 5:
                         INST_NAME("XRSTOR Ed");
                         MESSAGE(LOG_DUMP, "Need Optimization\n");
                         fpu_purgecache(dyn, ninst, 0, x1, x2, x3);
                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                        if (ed != x1) { MV(x1, ed); }
                         MOV32w(x2, rex.w ? 0 : 1);
-                        CALL(const_fpu_xrstor, -1);
+                        CALL(const_fpu_xrstor, -1, ed, x2);
                         break;
                     case 7:
                         INST_NAME("CLFLUSH Ed");
                         MESSAGE(LOG_DUMP, "Need Optimization?\n");
                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                        if (ed != x1) { MV(x1, ed); }
-                        CALL_(const_native_clflush, -1, 0);
+                        CALL_(const_native_clflush, -1, 0, ed, 0);
                         break;
                     default:
                         DEFAULT;
diff --git a/src/dynarec/la64/dynarec_la64_66.c b/src/dynarec/la64/dynarec_la64_66.c
index cc29e3bc..95ef995d 100644
--- a/src/dynarec/la64/dynarec_la64_66.c
+++ b/src/dynarec/la64/dynarec_la64_66.c
@@ -782,7 +782,7 @@ uintptr_t dynarec64_66(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     GETEW(x1, 1);
                     u8 = F8;
                     MOV32w(x2, u8);
-                    CALL_(const_rol16, x1, x3);
+                    CALL_(const_rol16, x1, x3, x1, x2);
                     EWBACK;
                     break;
                 case 1:
@@ -792,7 +792,7 @@ uintptr_t dynarec64_66(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     GETEW(x1, 1);
                     u8 = F8;
                     MOV32w(x2, u8);
-                    CALL_(const_ror16, x1, x3);
+                    CALL_(const_ror16, x1, x3, x1, x2);
                     EWBACK;
                     break;
                 case 4:
@@ -872,7 +872,7 @@ uintptr_t dynarec64_66(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                     SETFLAGS(X_OF | X_CF, SF_SET_DF, NAT_FLAGS_NOFUSION);
                     if (BOX64DRENV(dynarec_safeflags) > 1) MAYSETFLAGS();
                     GETEW(x1, 1);
-                    CALL_(const_rol16, x1, x3);
+                    CALL_(const_rol16, x1, x3, x1, x2);
                     EWBACK;
                     break;
                 case 5:
@@ -970,7 +970,7 @@ uintptr_t dynarec64_66(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         BNE_MARK3(ed, xZR);
                         GETIP_(ip, x6);
                         STORE_XEMU_CALL();
-                        CALL(const_native_div0, -1);
+                        CALL(const_native_div0, -1, 0, 0);
                         CLEARIP();
                         LOAD_XEMU_CALL();
                         jump_to_epilog(dyn, 0, xRIP, ninst);
@@ -991,7 +991,7 @@ uintptr_t dynarec64_66(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         BNE_MARK3(ed, xZR);
                         GETIP_(ip, x7);
                         STORE_XEMU_CALL();
-                        CALL(const_native_div0, -1);
+                        CALL(const_native_div0, -1, 0, 0);
                         CLEARIP();
                         LOAD_XEMU_CALL();
                         jump_to_epilog(dyn, 0, xRIP, ninst);
diff --git a/src/dynarec/la64/dynarec_la64_660f.c b/src/dynarec/la64/dynarec_la64_660f.c
index ef2a1e3f..2a82cea1 100644
--- a/src/dynarec/la64/dynarec_la64_660f.c
+++ b/src/dynarec/la64/dynarec_la64_660f.c
@@ -797,7 +797,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     }
                     sse_forget_reg(dyn, ninst, gd);
                     MOV32w(x1, gd);
-                    CALL(const_native_aesimc, -1);
+                    CALL(const_native_aesimc, -1, x1, 0);
                     break;
                 case 0xDC:
                     INST_NAME("AESENC Gx, Ex"); // AES-NI
@@ -811,7 +811,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         d0 = -1;
                     sse_forget_reg(dyn, ninst, gd);
                     MOV32w(x1, gd);
-                    CALL(const_native_aese, -1);
+                    CALL(const_native_aese, -1, x1, 0);
                     GETGX(q0, 1);
                     VXOR_V(q0, q0, (d0 != -1) ? d0 : q1);
                     break;
@@ -827,7 +827,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         d0 = -1;
                     sse_forget_reg(dyn, ninst, gd);
                     MOV32w(x1, gd);
-                    CALL(const_native_aeselast, -1);
+                    CALL(const_native_aeselast, -1, x1, 0);
                     GETGX(q0, 1);
                     VXOR_V(q0, q0, (d0 != -1) ? d0 : q1);
                     break;
@@ -843,7 +843,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         d0 = -1;
                     sse_forget_reg(dyn, ninst, gd);
                     MOV32w(x1, gd);
-                    CALL(const_native_aesd, -1);
+                    CALL(const_native_aesd, -1, x1, 0);
                     GETGX(q0, 1);
                     VXOR_V(q0, q0, (d0 != -1) ? d0 : q1);
                     break;
@@ -859,7 +859,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         d0 = -1;
                     sse_forget_reg(dyn, ninst, gd);
                     MOV32w(x1, gd);
-                    CALL(const_native_aesdlast, -1);
+                    CALL(const_native_aesdlast, -1, x1, 0);
                     GETGX(q0, 1);
                     VXOR_V(q0, q0, (d0 != -1) ? d0 : q1);
                     break;
@@ -1256,7 +1256,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     }
                     u8 = F8;
                     MOV32w(x4, u8);
-                    CALL(const_native_pclmul, -1);
+                    CALL4(const_native_pclmul, -1, x1, x2, x3, x4);
                     break;
                 case 0x61:
                     INST_NAME("PCMPESTRI Gx, Ex, Ib");
@@ -1272,15 +1272,15 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         if (ed > 7)
                             sse_reflect_reg(dyn, ninst, ed);
                         ADDI_D(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
+                        ed = x1;
                     } else {
-                        addr = geted(dyn, addr, ninst, nextop, &ed, x1, x5, &fixedaddress, rex, NULL, 0, 1);
-                        if (ed != x1) MV(x1, ed);
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 1);
                     }
                     MV(x2, xRDX);
                     MV(x4, xRAX);
                     u8 = F8;
                     MOV32w(x5, u8);
-                    CALL(const_sse42_compare_string_explicit_len, x1);
+                    CALL6(const_sse42_compare_string_explicit_len, x1, ed, x2, x3, x4, x5, 0);
                     ZEROUP(x1);
                     BNEZ_MARK(x1);
                     MOV32w(xRCX, (u8 & 1) ? 8 : 16);
@@ -1305,13 +1305,13 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         ed = (nextop & 7) + (rex.b << 3);
                         if (ed > 7) sse_reflect_reg(dyn, ninst, ed);
                         ADDI_D(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
+                        ed = x1;
                     } else {
                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 1);
-                        if (ed != x1) MV(x1, ed);
                     }
                     u8 = F8;
                     MOV32w(x3, u8);
-                    CALL(const_sse42_compare_string_implicit_len, x1);
+                    CALL4(const_sse42_compare_string_implicit_len, x1, ed, x2, x3, 0);
                     BNEZ_MARK(x1);
                     MOV32w(xRCX, (u8 & 1) ? 8 : 16);
                     B_NEXT_nocond;
@@ -1344,7 +1344,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     }
                     u8 = F8;
                     MOV32w(x4, u8);
-                    CALL(const_native_aeskeygenassist, -1);
+                    CALL4(const_native_aeskeygenassist, -1, x1, x2, x3, x4);
                     break;
                 default:
                     DEFAULT;
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f38.c b/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
index d2afd48c..0a04c586 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f38.c
@@ -1278,7 +1278,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             }
             avx_forget_reg(dyn, ninst, gd);
             MOV32w(x1, gd);
-            CALL(const_native_aesimc, -1);
+            CALL(const_native_aesimc, -1, x1, 0);
             if (!vex.l) {
                 ST_D(xZR, xEmu, offsetof(x64emu_t, ymm[gd]));
                 ST_D(xZR, xEmu, offsetof(x64emu_t, ymm[gd]) + 8);
@@ -1296,7 +1296,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 d0 = -1;
             avx_forget_reg(dyn, ninst, gd);
             MOV32w(x1, gd);
-            CALL(const_native_aese, -1);
+            CALL(const_native_aese, -1, x1, 0);
             GETGYx(q0, 1);
             VXOR_V(q0, q0, (d0 != -1) ? d0 : q1);
             if (!vex.l) {
@@ -1316,7 +1316,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 d0 = -1;
             avx_forget_reg(dyn, ninst, gd);
             MOV32w(x1, gd);
-            CALL(const_native_aeselast, -1);
+            CALL(const_native_aeselast, -1, x1, 0);
             GETGYx(q0, 1);
             VXOR_V(q0, q0, (d0 != -1) ? d0 : q1);
             if (!vex.l) {
@@ -1336,7 +1336,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 d0 = -1;
             avx_forget_reg(dyn, ninst, gd);
             MOV32w(x1, gd);
-            CALL(const_native_aesd, -1);
+            CALL(const_native_aesd, -1, x1, 0);
             GETGYx(q0, 1);
             VXOR_V(q0, q0, (d0 != -1) ? d0 : q1);
             if (!vex.l) {
@@ -1356,7 +1356,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
                 d0 = -1;
             avx_forget_reg(dyn, ninst, gd);
             MOV32w(x1, gd);
-            CALL(const_native_aesdlast, -1);
+            CALL(const_native_aesdlast, -1, x1, 0);
             GETGYx(q0, 1);
             VXOR_V(q0, q0, (d0 != -1) ? d0 : q1);
             if (!vex.l) {
diff --git a/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c b/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
index c91eb1bc..691b2c1e 100644
--- a/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
+++ b/src/dynarec/la64/dynarec_la64_avx_66_0f3a.c
@@ -737,7 +737,7 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             }
             u8 = F8;
             MOV32w(x4, u8);
-            CALL_(vex.l ? const_native_pclmul_y : const_native_pclmul_x, -1, x3);
+            CALL4_(vex.l ? const_native_pclmul_y : const_native_pclmul_x, -1, x3, x1, x2, x3, x4);
             if (!vex.l) {
                 ST_D(xZR, xEmu, offsetof(x64emu_t, ymm[gd]));
                 ST_D(xZR, xEmu, offsetof(x64emu_t, ymm[gd]) + 8);
@@ -796,7 +796,7 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t i
             }
             u8 = F8;
             MOV32w(x4, u8);
-            CALL(const_native_aeskeygenassist, -1);
+            CALL4(const_native_aeskeygenassist, -1, x1, x2, x3, x4);
             if (!vex.l) {
                 ST_D(xZR, xEmu, offsetof(x64emu_t, ymm[gd]));
                 ST_D(xZR, xEmu, offsetof(x64emu_t, ymm[gd]) + 8);
diff --git a/src/dynarec/la64/dynarec_la64_helper.c b/src/dynarec/la64/dynarec_la64_helper.c
index 049803ce..d283c09f 100644
--- a/src/dynarec/la64/dynarec_la64_helper.c
+++ b/src/dynarec/la64/dynarec_la64_helper.c
@@ -597,7 +597,7 @@ void ret_to_epilog(dynarec_la64_t* dyn, uintptr_t ip, int ninst, rex_t rex)
     MVz(x1, xRIP);
     SMEND();
     if (BOX64DRENV(dynarec_callret)) {
-        // pop the actual return address from RV64 stack
+        // pop the actual return address from LA64 stack
         LD_D(xRA, xSP, 0);    // native addr
         LD_D(x6, xSP, 8);     // x86 addr
         ADDI_D(xSP, xSP, 16); // pop
@@ -627,7 +627,7 @@ void retn_to_epilog(dynarec_la64_t* dyn, uintptr_t ip, int ninst, rex_t rex, int
     MVz(x1, xRIP);
     SMEND();
     if (BOX64DRENV(dynarec_callret)) {
-        // pop the actual return address from RV64 stack
+        // pop the actual return address from LA64 stack
         LD_D(xRA, xSP, 0);    // native addr
         LD_D(x6, xSP, 8);     // x86 addr
         ADDI_D(xSP, xSP, 16); // pop
@@ -691,7 +691,7 @@ void iret_to_epilog(dynarec_la64_t* dyn, uintptr_t ip, int ninst, int is64bits)
     CLEARIP();
 }
 
-void call_c(dynarec_la64_t* dyn, int ninst, la64_consts_t fnc, int reg, int ret, int saveflags, int savereg)
+void call_c(dynarec_la64_t* dyn, int ninst, la64_consts_t fnc, int reg, int ret, int saveflags, int savereg, int arg1, int arg2, int arg3, int arg4, int arg5, int arg6)
 {
     MAYUSE(fnc);
     if (savereg == 0)
@@ -702,39 +702,47 @@ void call_c(dynarec_la64_t* dyn, int ninst, la64_consts_t fnc, int reg, int ret,
     }
     fpu_pushcache(dyn, ninst, reg, 0);
     if (ret != -2) {
-        ADDI_D(xSP, xSP, -16); // RV64 stack needs to be 16byte aligned
-        ST_D(xEmu, xSP, 0);
-        ST_D(savereg, xSP, 8);
-        // $r4..$r20 needs to be saved by caller
-        STORE_REG(RAX);
-        STORE_REG(RCX);
+        ADDI_D(xSP, xSP, -16); // LA64 stack needs to be 16byte aligned
+        ST_D(savereg, xSP, 0);
+        STORE_REG(RDI);
+        STORE_REG(RSI);
         STORE_REG(RDX);
+        STORE_REG(RCX);
+        STORE_REG(R8);
+        STORE_REG(R9);
+        STORE_REG(RAX);
         STORE_REG(RBX);
         STORE_REG(RSP);
         STORE_REG(RBP);
-        STORE_REG(RSI);
-        STORE_REG(RDI);
         ST_D(xRIP, xEmu, offsetof(x64emu_t, ip));
     }
     TABLE64C(reg, fnc);
+    MV(A0, xEmu);
+    if (arg1) MV(A1, arg1);
+    if (arg2) MV(A2, arg2);
+    if (arg3) MV(A3, arg3);
+    if (arg4) MV(A4, arg4);
+    if (arg5) MV(A5, arg5);
+    if (arg6) MV(A6, arg6);
     JIRL(xRA, reg, 0);
     if (ret >= 0) {
-        MV(ret, xEmu);
+        MV(ret, A0);
     }
     if (ret != -2) {
-        LD_D(xEmu, xSP, 0);
-        LD_D(savereg, xSP, 8);
+        LD_D(savereg, xSP, 0);
         ADDI_D(xSP, xSP, 16);
 #define GO(A) \
     if (ret != x##A) { LOAD_REG(A); }
-        GO(RAX);
-        GO(RCX);
+        GO(RDI);
+        GO(RSI);
         GO(RDX);
+        GO(RCX);
+        GO(R8);
+        GO(R9);
+        GO(RAX);
         GO(RBX);
         GO(RSP);
         GO(RBP);
-        GO(RSI);
-        GO(RDI);
         if (ret != xRIP)
             LD_D(xRIP, xEmu, offsetof(x64emu_t, ip));
 #undef GO
@@ -770,7 +778,7 @@ void grab_segdata(dynarec_la64_t* dyn, uintptr_t addr, int ninst, int reg, int s
         CBZ_MARKSEG(t1);
     }
     MOV64x(x1, segment);
-    call_c(dyn, ninst, const_getsegmentbase, t2, reg, 0, xFlags);
+    call_c(dyn, ninst, const_getsegmentbase, t2, reg, 0, xFlags, x1, 0, 0, 0, 0, 0);
     MARKSEG;
     MESSAGE(LOG_DUMP, "----%s Offset\n", (segment == _FS) ? "FS" : "GS");
 }
@@ -1689,7 +1697,7 @@ static void flagsCacheTransform(dynarec_la64_t* dyn, int ninst, int s1)
             j64 = (GETMARKF2) - (dyn->native_size);
             BEQZ(s1, j64);
         }
-        CALL_(const_updateflags, -1, 0);
+        CALL_(const_updateflags, -1, 0, 0, 0);
         MARKF2;
     }
 }
diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h
index ba375b7f..740c6909 100644
--- a/src/dynarec/la64/dynarec_la64_helper.h
+++ b/src/dynarec/la64/dynarec_la64_helper.h
@@ -674,13 +674,16 @@
 
 // CALL will use x6 for the call address. Return value can be put in ret (unless ret is -1)
 // R0 will not be pushed/popd if ret is -2
-#define CALL(F, ret) call_c(dyn, ninst, F, x6, ret, 1, 0)
+#define CALL(F, ret, arg1, arg2)                          call_c(dyn, ninst, F, x6, ret, 1, 0, arg1, arg2, 0, 0, 0, 0)
+#define CALL4(F, ret, arg1, arg2, arg3, arg4)             call_c(dyn, ninst, F, x6, ret, 1, 0, arg1, arg2, arg3, arg4, 0, 0)
+#define CALL6(F, ret, arg1, arg2, arg3, arg4, arg5, arg6) call_c(dyn, ninst, F, x6, ret, 1, 0, arg1, arg2, arg3, arg4, arg5, arg6)
 // CALL_ will use x6 for the call address. Return value can be put in ret (unless ret is -1)
 // R0 will not be pushed/popd if ret is -2
-#define CALL_(F, ret, reg) call_c(dyn, ninst, F, x6, ret, 1, reg)
+#define CALL_(F, ret, reg, arg1, arg2) call_c(dyn, ninst, F, x6, ret, 1, reg, arg1, arg2, 0, 0, 0, 0)
+#define CALL4_(F, ret, reg, arg1, arg2, arg3, arg4) call_c(dyn, ninst, F, x6, ret, 1, reg, arg1, arg2, arg3, arg4, 0, 0)
 // CALL_S will use x6 for the call address. Return value can be put in ret (unless ret is -1)
 // R0 will not be pushed/popd if ret is -2. Flags are not save/restored
-#define CALL_S(F, ret) call_c(dyn, ninst, F, x6, ret, 0, 0)
+#define CALL_S(F, ret, arg1) call_c(dyn, ninst, F, x6, ret, 0, 0, arg1, 0, 0, 0, 0, 0)
 
 #define MARKi(i)    dyn->insts[ninst].mark[i] = dyn->native_size
 #define GETMARKi(i) dyn->insts[ninst].mark[i]
@@ -848,8 +851,9 @@
 
 // Need to also store current value of some register, as they may be used by functions like setjmp
 #define STORE_XEMU_CALL() \
-    STORE_REG(R8);        \
-    STORE_REG(R9);        \
+    STORE_REG(RBX);       \
+    STORE_REG(RSP);       \
+    STORE_REG(RBP);       \
     STORE_REG(R10);       \
     STORE_REG(R11);       \
     STORE_REG(R12);       \
@@ -860,8 +864,9 @@
 #define LOAD_XEMU_CALL()
 
 #define LOAD_XEMU_REM() \
-    LOAD_REG(R8);       \
-    LOAD_REG(R9);       \
+    LOAD_REG(RBX);      \
+    LOAD_REG(RSP);      \
+    LOAD_REG(RBP);      \
     LOAD_REG(R10);      \
     LOAD_REG(R11);      \
     LOAD_REG(R12);      \
@@ -888,7 +893,7 @@
         if (dyn->f.pending == SF_PENDING                       \
             && dyn->insts[ninst].x64.need_after                \
             && !(dyn->insts[ninst].x64.need_after & X_PEND)) { \
-            CALL_(const_updateflags, -1, 0);                   \
+            CALL_(const_updateflags, -1, 0, 0, 0);             \
             dyn->f.pending = SF_SET;                           \
             SET_NODF();                                        \
         }                                                      \
@@ -958,7 +963,7 @@
             j64 = (GETMARKF) - (dyn->native_size);   \
             BEQ(x3, xZR, j64);                       \
         }                                            \
-        CALL_(const_updateflags, -1, 0);             \
+        CALL_(const_updateflags, -1, 0, 0, 0);       \
         MARKF;                                       \
         dyn->f.pending = SF_SET;                     \
         SET_DFOK();                                  \
@@ -1272,7 +1277,7 @@ void jump_to_next(dynarec_la64_t* dyn, uintptr_t ip, int reg, int ninst, int is3
 void ret_to_epilog(dynarec_la64_t* dyn, uintptr_t ip, int ninst, rex_t rex);
 void retn_to_epilog(dynarec_la64_t* dyn, uintptr_t ip, int ninst, rex_t rex, int n);
 void iret_to_epilog(dynarec_la64_t* dyn, uintptr_t ip, int ninst, int is64bits);
-void call_c(dynarec_la64_t* dyn, int ninst, la64_consts_t fnc, int reg, int ret, int saveflags, int save_reg);
+void call_c(dynarec_la64_t* dyn, int ninst, la64_consts_t fnc, int reg, int ret, int saveflags, int save_reg, int arg1, int arg2, int arg3, int arg4, int arg5, int arg6);
 void grab_segdata(dynarec_la64_t* dyn, uintptr_t addr, int ninst, int reg, int segment, int modreg);
 void emit_cmp8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5, int s6);
 void emit_cmp16(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5, int s6);
diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h
index 00f19bc6..120fc14e 100644
--- a/src/dynarec/la64/dynarec_la64_private.h
+++ b/src/dynarec/la64/dynarec_la64_private.h
@@ -179,12 +179,12 @@ int Table64(dynarec_la64_t *dyn, uint64_t val, int pass);  // add a value to tab
 
 void CreateJmpNext(void* addr, void* next);
 
-#define GO_TRACE(A, B, s0) \
-    GETIP(addr, s0);       \
-    MV(A1, xRIP);          \
-    STORE_XEMU_CALL();     \
-    MOV64x(A2, B);         \
-    CALL(const_##A, -1);   \
+#define GO_TRACE(A, B, s0)         \
+    GETIP(addr, s0);               \
+    MV(x1, xRIP);                  \
+    STORE_XEMU_CALL();             \
+    MOV64x(x2, B);                 \
+    CALL(const_##A, -1, x1, x2);   \
     LOAD_XEMU_CALL()
 
 #endif //__DYNAREC_ARM_PRIVATE_H_
diff --git a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h
index 466180f7..7a00c0cc 100644
--- a/src/dynarec/la64/la64_emitter.h
+++ b/src/dynarec/la64/la64_emitter.h
@@ -131,6 +131,8 @@
 #define DMB_ISHLD() DBAR_R_RW()
 #define DMB_ISHST() DBAR_W_RW()
 
+#define BRK(hint) EMIT(type_hint(0b00000000001010100, hint))
+
 // GR[rd] = GR[rj] & GR[rk]
 #define AND(rd, rj, rk) EMIT(type_3R(0b00000000000101001, rk, rj, rd))
 // GR[rd] = GR[rj] | GR[rk]
diff --git a/src/dynarec/la64/la64_epilog.S b/src/dynarec/la64/la64_epilog.S
index ab6a80d6..701b8898 100644
--- a/src/dynarec/la64/la64_epilog.S
+++ b/src/dynarec/la64/la64_epilog.S
@@ -1,3 +1,7 @@
+#define ASM_MAPPING 1
+#include "la64_mapping.h"
+#undef ASM_MAPPING
+
 //la64 epilog for dynarec
 //Save stuff, prepare stack and register
 //called with pointer to emu as 1st parameter
@@ -11,37 +15,37 @@
 
 la64_epilog:
     // update register -> emu
-    st.d   $r12, $r4, (8 * 0)
-    st.d   $r13, $r4, (8 * 1)
-    st.d   $r14, $r4, (8 * 2)
-    st.d   $r15, $r4, (8 * 3)
-    st.d   $r16, $r4, (8 * 4)
-    st.d   $r17, $r4, (8 * 5)
-    st.d   $r18, $r4, (8 * 6)
-    st.d   $r19, $r4, (8 * 7)
-    st.d   $r23, $r4, (8 * 8)
-    st.d   $r24, $r4, (8 * 9)
-    st.d   $r25, $r4, (8 * 10)
-    st.d   $r26, $r4, (8 * 11)
-    st.d   $r27, $r4, (8 * 12)
-    st.d   $r28, $r4, (8 * 13)
-    st.d   $r29, $r4, (8 * 14)
-    st.d   $r30, $r4, (8 * 15)
+    st.d   RAX, Emu, (8 * 0)
+    st.d   RCX, Emu, (8 * 1)
+    st.d   RDX, Emu, (8 * 2)
+    st.d   RBX, Emu, (8 * 3)
+    st.d   RSP, Emu, (8 * 4)
+    st.d   RBP, Emu, (8 * 5)
+    st.d   RSI, Emu, (8 * 6)
+    st.d   RDI, Emu, (8 * 7)
+    st.d   R8,  Emu, (8 * 8)
+    st.d   R9,  Emu, (8 * 9)
+    st.d   R10, Emu, (8 * 10)
+    st.d   R11, Emu, (8 * 11)
+    st.d   R12, Emu, (8 * 12)
+    st.d   R13, Emu, (8 * 13)
+    st.d   R14, Emu, (8 * 14)
+    st.d   R15, Emu, (8 * 15)
     // restore xFlags from LBT.eflags
-    la.global $r12, cpuext
-    ldptr.d   $r12, $r12, 0
-    andi      $r12, $r12, 1
-    beqz      $r12, 1f
-    ori       $r13, $r0, 0b100011010101
-    andn      $r31, $r31, $r13
-    x86mfflag $r13, 0b111111
-    or        $r31, $r31, $r13
+    la.global $r19, cpuext
+    ldptr.d   $r19, $r19, 0
+    andi      $r19, $r19, 1
+    beqz      $r19, 1f
+    ori       $r19, $r0, 0b100011010101
+    andn      Flags, Flags, $r19
+    x86mfflag $r19, 0b111111
+    or        Flags, Flags, $r19
 1:
-    st.d   $r31, $r4, (8 * 16) // xFlags
-    st.d   $r20, $r4, (8 * 17) // put back reg value in emu, including EIP (so $r20 must be EIP now)
+    st.d   Flags, Emu, (8 * 16) // xFlags
+    st.d   RIP,   Emu, (8 * 17) // put back reg value in emu, including EIP (so $r29 must be EIP now)
     // fallback to epilog_fast now, just restoring saved regs
 la64_epilog_fast:
-    addi.d $sp, $r22, 0       // restore save sp from xSavedSP
+    addi.d $sp, SavedSP, 0      // restore save sp from xSavedSP
     // restore all used register
     ld.d   $r1,  $sp, (8 * 0) // load ra
     ld.d   $r22, $sp, (8 * 1) // load fp
diff --git a/src/dynarec/la64/la64_lock.S b/src/dynarec/la64/la64_lock.S
index df7cfd83..fe025261 100644
--- a/src/dynarec/la64/la64_lock.S
+++ b/src/dynarec/la64/la64_lock.S
@@ -1,3 +1,7 @@
+#define ASM_MAPPING 1
+#include "la64_mapping.h"
+#undef ASM_MAPPING
+
 // LA64 lock helper
 // there is 2 part: read and write
 // write return 0 on success, 1 on fail (value has been changed)
@@ -226,4 +230,4 @@ la64_lock_get_d:
 la64_lock_get_dd:
     dbar 0
     ld.d $a0, $a0, 0
-    ret
\ No newline at end of file
+    ret
diff --git a/src/dynarec/la64/la64_mapping.h b/src/dynarec/la64/la64_mapping.h
index c373fe0c..3bb6c1d4 100644
--- a/src/dynarec/la64/la64_mapping.h
+++ b/src/dynarec/la64/la64_mapping.h
@@ -1,76 +1,87 @@
 #ifndef __LA64_MAPPING_H__
 #define __LA64_MAPPING_H__
 
+// LA64 Register Mapping Scheme
+/*****************************************************************************************
+name    alias  mapping      native description              Box64 description       saver
+******************************************************************************************
+r0      zero   native zero  Hard-wired zero                 N/A                     -
+r1      ra     native ra    Return address                  N/A                     Caller
+r2      tp     -            Thread pointer                  N/A                     -
+r3      sp     native sp    Stack pointer                   N/A                     Callee
+r4      a0     RDI          Function argument/return val.   -                       Caller
+r5      a1     RSI          Function argument/return val.   -                       Caller
+r6      a2     RDX          Function argument               -                       Caller
+r7      a3     RCX          Function argument               -                       Caller
+r8      a4     R8           Function argument               -                       Caller
+r9      a5     R9           Function argument               -                       Caller
+r10     a6     RBX          Function argument               -                       Caller
+r11     a7     RSP          Function argument               -                       Caller
+r12     t0     RAX          Temporary                       -                       Caller
+r13     t1     RBP          Temporary                       -                       Caller
+r14     t2     x1           Temporary                       Scratch                 Caller
+r15     t3     x2           Temporary                       Scratch                 Caller
+r16     t4     x3           Temporary                       Scratch                 Caller
+r17     t5     x4           Temporary                       Scratch                 Caller
+r18     t6     x5           Temporary                       Scratch                 Caller
+r19     t7     x6           Temporary                       Scratch                 Caller
+r20     t8     x7           Temporary                       Scratch                 Caller
+r21     rx     -            Reserved                        N/A                     -
+r22     fp     SavedSP      Saved register/frame pointer    -                       Callee
+r23     s0     R10          Saved register                  -                       Callee
+r24     s1     R11          Saved register                  -                       Callee
+r25     s2     R12          Saved register                  -                       Callee
+r26     s3     R13          Saved register                  -                       Callee
+r27     s4     R14          Saved register                  -                       Callee
+r28     s5     R15          Saved register                  -                       Callee
+r29     s6     RIP          Saved register                  -                       Callee
+r30     s7     FLAGS        Saved register                  -                       Callee
+r31     s8     xEmu         Saved register                  The Emu struct          Callee
+******************************************************************************************/
+
+#ifndef ASM_MAPPING
 
-// LA64 ABI
-/*
-Name     Alias     Meaning                         saver
----------------------------------------------------------
-r0       zero      Zero register                   -
-r1       ra        Return address                  Callee
-r2       tp        Thread pointer                  -
-r3       sp        Stack pointer                   Callee
-r4-r5    a0-a1     Function arguments,Return val.  Caller
-r6-r11   a2-a7     Function arguments              Caller
-r12-r20  t0-t8     Temp registers                  Caller
-r21      Reserved  Non-allocatable                 -
-r22      fp/s9     Frame pointer/Static register   Callee
-r23-31   s0-s8     Static registers                Callee
----------------------------------------------------------
-f0-f1    fa0-fa1   Function arguments,Return val.  Caller
-f2-f7    fa2-fa7   Function arguments              Caller
-f8-f23   ft0-ft15  Temp registers                  Caller
-f24-f31  fs0-fs7   Static registers                Callee
-*/
-/*
- LA64 GPR mapping
- There is no 15 registers free, so split the regs in 2 part
- AX..DI : r12-r19
- R8..R15: r23-r30
- flags in r31
- ip in r20
-*/
 // x86 Register mapping
 #define xRAX     12
-#define xRCX     13
-#define xRDX     14
-#define xRBX     15
-#define xRSP     16
-#define xRBP     17
-#define xRSI     18
-#define xRDI     19
-#define xR8      23
-#define xR9      24
-#define xR10     25
-#define xR11     26
-#define xR12     27
-#define xR13     28
-#define xR14     29
-#define xR15     30
-#define xFlags   31
-#define xRIP     20
+#define xRCX     7
+#define xRDX     6
+#define xRBX     10
+#define xRSP     11
+#define xRBP     13
+#define xRSI     5
+#define xRDI     4
+#define xR8      8
+#define xR9      9
+#define xR10     23
+#define xR11     24
+#define xR12     25
+#define xR13     26
+#define xR14     27
+#define xR15     28
+#define xFlags   30
+#define xRIP     29
 #define xSavedSP 22
 
 // convert a x86 register to native according to the register mapping
-#define TO_NAT(A) (xRAX + (A) + (((A) > 7) ? 3 : 0))
+#define TO_NAT(A) (((uint8_t[]) { xRAX, xRCX, xRDX, xRBX, xRSP, xRBP, xRSI, xRDI, xR8, xR9, xR10, xR11, xR12, xR13, xR14, xR15 })[(A)]) // table indexed by x86 register number
 
 // scratch registers
-#define x1 5
-#define x2 6
-#define x3 7
-#define x4 8
-#define x5 9
-#define x6 10
-#define x7 11
-
-// emu is r0
-#define xEmu 4
+#define x1 14
+#define x2 15
+#define x3 16
+#define x4 17
+#define x5 18
+#define x6 19
+#define x7 20
+
+// emu is $r31
+#define xEmu 31
 // LA64 RA
 #define xRA 1
 #define ra  xRA
 // LA64 SP
 #define xSP 3
-// RV64 args
+// LA64 args
 #define A0 4
 #define A1 5
 #define A2 6
@@ -127,4 +138,38 @@ f24-f31  fs0-fs7   Static registers                Callee
 #define FR_U 25
 #define FR_I 24
 
-#endif //__LA64_MAPPING_H__
\ No newline at end of file
+#else
+
+// x86 Register mapping
+#define RAX     $r12
+#define RCX     $r7
+#define RDX     $r6
+#define RBX     $r10
+#define RSP     $r11
+#define RBP     $r13
+#define RSI     $r5
+#define RDI     $r4
+#define R8      $r8
+#define R9      $r9
+#define R10     $r23
+#define R11     $r24
+#define R12     $r25
+#define R13     $r26
+#define R14     $r27
+#define R15     $r28
+#define Flags   $r30
+#define RIP     $r29
+#define Emu     $r31
+#define SavedSP $r22
+
+#ifdef LA64_ABI_1
+
+.macro ret
+    jr  $ra
+.endm
+
+#endif // LA64_ABI_1
+
+#endif // ASM_MAPPING
+
+#endif //__LA64_MAPPING_H__
diff --git a/src/dynarec/la64/la64_next.S b/src/dynarec/la64/la64_next.S
index f98558b0..fd33ad00 100644
--- a/src/dynarec/la64/la64_next.S
+++ b/src/dynarec/la64/la64_next.S
@@ -1,3 +1,7 @@
+#define ASM_MAPPING 1
+#include "la64_mapping.h"
+#undef ASM_MAPPING
+
 //la64 update linker table for dynarec
 //called with pointer to emu as 1st parameter
 //and address of table to as 2nd parameter
@@ -12,41 +16,41 @@
 
     .8byte  0   // NULL pointer before la64_next, for getDB
 la64_next:
-    // emu is a0
-    // IP address is a1
-    addi.d $sp, $sp, -(8 * 12)
-    st.d   $a0, $sp, 0
-    st.d   $a1, $sp, 8
-    st.d   $r11, $sp, 16
-    st.d   $r12, $sp, 24
-    st.d   $r13, $sp, 32
-    st.d   $r14, $sp, 40
-    st.d   $r15, $sp, 48
-    st.d   $r16, $sp, 56
-    st.d   $r17, $sp, 64
-    st.d   $r18, $sp, 72
-    st.d   $r19, $sp, 80
-    st.d   $r20, $sp, 88 // also save r20(rip) to allow change in LinkNext
+    // save the caller-saved x86 registers and RIP on the stack first;
+    // emu is then moved to a0 and the IP address to a1 (see below)
+    addi.d $sp, $sp, -(8 * 11)
+    st.d   RDI, $sp, 0
+    st.d   RSI, $sp, 8
+    st.d   RDX, $sp, 16
+    st.d   RCX, $sp, 24
+    st.d   R8,  $sp, 32
+    st.d   R9,  $sp, 40
+    st.d   RAX, $sp, 48
+    st.d   RBX, $sp, 56
+    st.d   RSP, $sp, 64
+    st.d   RBP, $sp, 72
+    st.d   RIP, $sp, 80 // also save r29(rip) to allow change in LinkNext
 
+    move   $a0, Emu
+    move   $a1, RIP
     move   $a2, $ra      // "from" is in ra, so put in a2
-    addi.d $a3, $sp, 88  // a3 is address to change rip
+    addi.d $a3, $sp, 80  // a3 is address to change rip
     // call the function
     bl LinkNext
     // preserve return value
-    move   $a3, $a0
+    move   $r16, $a0
     // pop regs
-    ld.d   $a0, $sp, 0
-    ld.d   $a1, $sp, 8
-    ld.d   $r11, $sp, 16
-    ld.d   $r12, $sp, 24
-    ld.d   $r13, $sp, 32
-    ld.d   $r14, $sp, 40
-    ld.d   $r15, $sp, 48
-    ld.d   $r16, $sp, 56
-    ld.d   $r17, $sp, 64
-    ld.d   $r18, $sp, 72
-    ld.d   $r19, $sp, 80
-    ld.d   $r20, $sp, 88
-    addi.d $sp,  $sp, (8 * 12)
+    ld.d   RDI, $sp, 0
+    ld.d   RSI, $sp, 8
+    ld.d   RDX, $sp, 16
+    ld.d   RCX, $sp, 24
+    ld.d   R8,  $sp, 32
+    ld.d   R9,  $sp, 40
+    ld.d   RAX, $sp, 48
+    ld.d   RBX, $sp, 56
+    ld.d   RSP, $sp, 64
+    ld.d   RBP, $sp, 72
+    ld.d   RIP, $sp, 80
+    addi.d $sp, $sp, (8 * 11)
     // return offset is jump address
-    jr     $a3
\ No newline at end of file
+    jr     $r16
diff --git a/src/dynarec/la64/la64_printer.c b/src/dynarec/la64/la64_printer.c
index 3b984d32..ba86aacf 100644
--- a/src/dynarec/la64/la64_printer.c
+++ b/src/dynarec/la64/la64_printer.c
@@ -6,7 +6,7 @@
 #include "la64_printer.h"
 #include "debug.h"
 
-static const char* Xt[] = { "xZR", "r1", "r2", "sp", "xEmu", "x1_r5", "x2_r6", "x3_r7", "x4_r8", "x5_r9", "x6_r10", "x7_r11", "xRAX_r12", "xRCX_r13", "xRDX_r14", "xRBX_r15", "xRSP_r16", "xRBP_r17", "xRSI_r18", "xRDI_r19", "xRIP_r20", "r21", "r22", "xR8_r23", "xR9_r24", "xR10_r25", "xR11_r26", "xR12_r27", "xR13_r28", "xR14_r29", "xR15_r30", "xFlags_r31" };
+static const char* Xt[] = { "xZR", "r1", "r2", "sp", "xRDI_r4", "xRSI_r5", "xRDX_r6", "xRCX_r7", "xR8_r8", "xR9_r9", "xRBX_r10", "xRSP_r11", "xRAX_r12", "xRBP_r13", "x1_r14", "x2_r15", "x3_r16", "x4_r17", "x5_r18", "x6_r19", "x7_r20", "r21", "xSavedSP_r22", "xR10_r23", "xR11_r24", "xR12_r25", "xR13_r26", "xR14_r27", "xR15_r28", "xRIP_r29", "xFlags_r30", "xEmu_r31" };
 static const char* Ft[] = { "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", "ft10", "ft11", "ft12", "ft13", "ft14", "ft15", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7" };
 static const char* Vt[] = { "vra0", "vra1", "vra2", "vra3", "vra4", "vra5", "vra6", "vra7", "vrt0", "vrt1", "vrt2", "vrt3", "vrt4", "vrt5", "vrt6", "vrt7", "vrt8", "vrt9", "vrt10", "vrt11", "vrt12", "vrt13", "vrt14", "vrt15", "vrs0", "vrs1", "vrs2", "vrs3", "vrs4", "vrs5", "vrs6", "vrs7" };
 static const char* XVt[] = { "xvra0", "xvra1", "xvra2", "xvra3", "xvra4", "xvra5", "xvra6", "xvra7", "xvrt0", "xvrt1", "xvrt2", "xvrt3", "xvrt4", "xvrt5", "xvrt6", "xvrt7", "xvrt8", "xvrt9", "xvrt10", "xvrt11", "xvrt12", "xvrt13", "xvrt14", "xvrt15", "xvrs0", "xvrs1", "xvrs2", "xvrs3", "xvrs4", "xvrs5", "xvrs6", "xvrs7" };
diff --git a/src/dynarec/la64/la64_prolog.S b/src/dynarec/la64/la64_prolog.S
index fc6bf34c..2cad9457 100644
--- a/src/dynarec/la64/la64_prolog.S
+++ b/src/dynarec/la64/la64_prolog.S
@@ -1,3 +1,7 @@
+#define ASM_MAPPING 1
+#include "la64_mapping.h"
+#undef ASM_MAPPING
+
 //loongarch prologue for dynarec
 //Save stuff, prepare stack and register
 //called with pointer to emu as 1st parameter
@@ -33,37 +37,40 @@ la64_prolog:
     fst.d  $f29, $sp, (8 * 16)
     fst.d  $f30, $sp, (8 * 17)
     fst.d  $f31, $sp, (8 * 18)
+    // save a1 (jump target) in $r16: $a1 ($r5) is overwritten when RSI is loaded below
+    move   $r16,  $a1
     // setup emu -> register
-    ld.d   $r12, $r4, (8 * 0)
-    ld.d   $r13, $r4, (8 * 1)
-    ld.d   $r14, $r4, (8 * 2)
-    ld.d   $r15, $r4, (8 * 3)
-    ld.d   $r16, $r4, (8 * 4)
-    ld.d   $r17, $r4, (8 * 5)
-    ld.d   $r18, $r4, (8 * 6)
-    ld.d   $r19, $r4, (8 * 7)
-    ld.d   $r23, $r4, (8 * 8)
-    ld.d   $r24, $r4, (8 * 9)
-    ld.d   $r25, $r4, (8 * 10)
-    ld.d   $r26, $r4, (8 * 11)
-    ld.d   $r27, $r4, (8 * 12)
-    ld.d   $r28, $r4, (8 * 13)
-    ld.d   $r29, $r4, (8 * 14)
-    ld.d   $r30, $r4, (8 * 15)
-    ld.d   $r31, $r4, (8 * 16)  // xFlags
-    ld.d   $r20, $r4, (8 * 17)  // xRIP
+    move   Emu,   $a0
+    ld.d   RAX,   Emu, (8 * 0)
+    ld.d   RCX,   Emu, (8 * 1)
+    ld.d   RDX,   Emu, (8 * 2)
+    ld.d   RBX,   Emu, (8 * 3)
+    ld.d   RSP,   Emu, (8 * 4)
+    ld.d   RBP,   Emu, (8 * 5)
+    ld.d   RSI,   Emu, (8 * 6)
+    ld.d   RDI,   Emu, (8 * 7)
+    ld.d   R8,    Emu, (8 * 8)
+    ld.d   R9,    Emu, (8 * 9)
+    ld.d   R10,   Emu, (8 * 10)
+    ld.d   R11,   Emu, (8 * 11)
+    ld.d   R12,   Emu, (8 * 12)
+    ld.d   R13,   Emu, (8 * 13)
+    ld.d   R14,   Emu, (8 * 14)
+    ld.d   R15,   Emu, (8 * 15)
+    ld.d   Flags, Emu, (8 * 16)  // xFlags
+    ld.d   RIP,   Emu, (8 * 17)  // xRIP
     // spill xFlags to LBT.eflags
-    la.global $a6, cpuext
-    ldptr.d   $a6, $a6, 0
-    andi      $a6, $a6, 1
-    beqz      $a6, 1f
-    x86mtflag $r31, 0b111111
+    la.global $r19, cpuext
+    ldptr.d   $r19, $r19, 0
+    andi      $r19, $r19, 1
+    beqz      $r19, 1f
+    x86mtflag Flags, 0b111111
 1:
     // push sentinel onto the stack
-    st.d   $r0, $sp, -16
+    st.d   $r0,  $sp, -16
     st.d   $r0,  $sp, -8
     addi.d $sp,  $sp, -16
     // save old sp into xSavedSP
-    addi.d $r22, $sp, 16
+    addi.d SavedSP, $sp, 16
     //jump to function
-    jirl   $r0,  $a1, 0
+    jirl   $r0,  $r16, 0
diff --git a/src/emu/x64emu_private.h b/src/emu/x64emu_private.h
index 15e85fa4..aba2b3ed 100644
--- a/src/emu/x64emu_private.h
+++ b/src/emu/x64emu_private.h
@@ -71,7 +71,7 @@ typedef struct x64emu_s {
     x87control_t cw;
     uint16_t    dummy_cw;   // align...
     mmxcontrol_t mxcsr;
-    #ifdef RV64         // it would be better to use a dedicated register for this like arm64 xSavedSP, but we're running of of free registers.
+    #ifdef RV64         // it would be better to use a dedicated register for this like arm64 xSavedSP, but we're running out of free registers.
     uintptr_t xSPSave;  // sp base value of current dynarec frame, used by call/ret optimization to reset stack when unmatch.
     #endif
     fpu_ld_t    fpu_ld[8]; // for long double emulation / 80bits fld fst