Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/la64/dynarec_la64_00.c                                            100
-rw-r--r--  src/dynarec/la64/dynarec_la64_emit_math.c                                     519
-rw-r--r--  src/dynarec/la64/dynarec_la64_functions.h                                       2
-rw-r--r--  src/dynarec/la64/dynarec_la64_helper.c                                         79
-rw-r--r--  src/dynarec/la64/dynarec_la64_helper.h                                        257
-rw-r--r--  src/dynarec/la64/dynarec_la64_jmpnext.c (renamed from src/dynarec/la64/dynarec_la64_jumpnext.c)    0
-rw-r--r--  src/dynarec/la64/dynarec_la64_pass0.h                                           5
-rw-r--r--  src/dynarec/la64/la64_emitter.h                                                95
-rw-r--r--  src/dynarec/la64/la64_epilog.S                                                  8
-rw-r--r--  src/dynarec/la64/la64_lock.S                                                   78
-rw-r--r--  src/dynarec/la64/la64_lock.h                                                   46
-rw-r--r--  src/dynarec/la64/la64_next.S                                                    8
-rw-r--r--  src/dynarec/la64/la64_printer.c                                                12
-rw-r--r--  src/dynarec/la64/la64_printer.h                                                 2
-rw-r--r--  src/dynarec/la64/la64_prolog.S                                                  4
-rw-r--r--  src/include/dynarec_la64.h (renamed from src/include/dynarec_la464.h)           0
16 files changed, 1122 insertions, 93 deletions
diff --git a/src/dynarec/la64/dynarec_la64_00.c b/src/dynarec/la64/dynarec_la64_00.c
index e464e07c..5eeba868 100644
--- a/src/dynarec/la64/dynarec_la64_00.c
+++ b/src/dynarec/la64/dynarec_la64_00.c
@@ -53,6 +53,106 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
     MAYUSE(cacheupd);
 
     switch (opcode) {
+        case 0x00:
+            INST_NAME("ADD Eb, Gb");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETEB(x1, 0);
+            GETGB(x2);
+            emit_add8(dyn, ninst, x1, x2, x4, x5);
+            EBBACK(x5, 0);
+            break;
+        case 0x01:
+            INST_NAME("ADD Ed, Gd");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED(0);
+            emit_add32(dyn, ninst, rex, ed, gd, x3, x4, x5);
+            WBACK;
+            break;
+        case 0x02:
+            INST_NAME("ADD Gb, Eb");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETEB(x1, 0);
+            GETGB(x2);
+            emit_add8(dyn, ninst, x2, x1, x4, x5);
+            GBBACK(x5);
+            break;
+        case 0x03:
+            INST_NAME("ADD Gd, Ed");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED(0);
+            emit_add32(dyn, ninst, rex, gd, ed, x3, x4, x5);
+            break;
+        case 0x04:
+            INST_NAME("ADD AL, Ib");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            u8 = F8;
+            ANDI(x1, xRAX, 0xff);
+            emit_add8c(dyn, ninst, x1, u8, x3, x4, x5);
+            // ANDI zero-extends its imm12 on LA64, so ~0xff cannot be used
+            // directly: clear the low byte with a shift pair instead
+            SRLI_D(xRAX, xRAX, 8);
+            SLLI_D(xRAX, xRAX, 8);
+            OR(xRAX, xRAX, x1);
+            break;
+        case 0x05:
+            INST_NAME("ADD EAX, Id");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            i64 = F32S;
+            emit_add32c(dyn, ninst, rex, xRAX, i64, x3, x4, x5, x6);
+            break;
+        case 0x28:
+            INST_NAME("SUB Eb, Gb");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETEB(x1, 0);
+            GETGB(x2);
+            emit_sub8(dyn, ninst, x1, x2, x4, x5, x6);
+            EBBACK(x5, 0);
+            break;
+        case 0x29:
+            INST_NAME("SUB Ed, Gd");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED(0);
+            emit_sub32(dyn, ninst, rex, ed, gd, x3, x4, x5);
+            WBACK;
+            break;
+        case 0x2A:
+            INST_NAME("SUB Gb, Eb");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETEB(x1, 0);
+            GETGB(x2);
+            emit_sub8(dyn, ninst, x2, x1, x4, x5, x6);
+            GBBACK(x5);
+            break;
+        case 0x2B:
+            INST_NAME("SUB Gd, Ed");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            nextop = F8;
+            GETGD;
+            GETED(0);
+            emit_sub32(dyn, ninst, rex, gd, ed, x3, x4, x5);
+            break;
+        case 0x2C:
+            INST_NAME("SUB AL, Ib");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            u8 = F8;
+            ANDI(x1, xRAX, 0xff);
+            emit_sub8c(dyn, ninst, x1, u8, x2, x3, x4, x5);
+            // same trick as ADD AL, Ib: ~0xff does not fit ANDI's
+            // zero-extended imm12, so clear the low byte with shifts
+            SRLI_D(xRAX, xRAX, 8);
+            SLLI_D(xRAX, xRAX, 8);
+            OR(xRAX, xRAX, x1);
+            break;
+        case 0x2D:
+            INST_NAME("SUB EAX, Id");
+            SETFLAGS(X_ALL, SF_SET_PENDING);
+            i64 = F32S;
+            emit_sub32c(dyn, ninst, rex, xRAX, i64, x2, x3, x4, x5);
+            break;
         case 0x50:
         case 0x51:
         case 0x52:
diff --git a/src/dynarec/la64/dynarec_la64_emit_math.c b/src/dynarec/la64/dynarec_la64_emit_math.c
new file mode 100644
index 00000000..646606df
--- /dev/null
+++ b/src/dynarec/la64/dynarec_la64_emit_math.c
@@ -0,0 +1,519 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+
+#include "la64_printer.h"
+#include "dynarec_la64_private.h"
+#include "dynarec_la64_functions.h"
+#include "dynarec_la64_helper.h"
+
+// emit ADD32 instruction, from s1, s2, store result in s1 using s3, s4 and s5 as scratch
+void emit_add32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5)
+{
+    CLEAR_FLAGS();
+    IFX(X_PEND)
+    {
+        if (rex.w) {
+            ST_D(s1, xEmu, offsetof(x64emu_t, op1));
+            ST_D(s2, xEmu, offsetof(x64emu_t, op2));
+        } else {
+            ST_W(s1, xEmu, offsetof(x64emu_t, op1));
+            ST_W(s2, xEmu, offsetof(x64emu_t, op2));
+        }
+        SET_DF(s3, rex.w ? d_add64 : d_add32b);
+    }
+    else IFX(X_ALL)
+    {
+        SET_DFNONE();
+    }
+    IFX(X_CF)
+    {
+        if (rex.w) {
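+            // LA64 has no carry flag: add the low 32-bit halves, carry that
+            // into the sum of the high halves, and test bit 32 of the result
+            // to recover the carry-out of the 64-bit add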
+            MOV32w(s3, 0xffffffff); // use a passed-in scratch: x2 may hold wback
+            AND(s5, s3, s1);
+            AND(s4, s3, s2);
+            ADD_D(s5, s5, s4);
+            SRLI_D(s3, s1, 0x20);
+            SRLI_D(s4, s2, 0x20);
+            ADD_D(s4, s4, s3);
+            SRLI_D(s5, s5, 0x20);
+            ADD_D(s5, s5, s4); // hi
+            SRAI_D(s5, s5, 0x20);
+            BEQZ(s5, 8);
+            ORI(xFlags, xFlags, 1 << F_CF);
+        } else {
+            ADD_D(s5, s1, s2);
+            SRLI_D(s5, s5, 0x20);
+            BEQZ(s5, 8);
+            ORI(xFlags, xFlags, 1 << F_CF);
+        }
+    }
+    IFX(X_AF | X_OF)
+    {
+        OR(s3, s1, s2);  // s3 = op1 | op2
+        AND(s4, s1, s2); // s4 = op1 & op2
+    }
+
+    if (rex.w)
+        ADD_D(s1, s1, s2);
+    else
+        ADD_W(s1, s1, s2);
+
+    IFX(X_PEND)
+    {
+        if (rex.w)
+            ST_D(s1, xEmu, offsetof(x64emu_t, res));
+        else
+            ST_W(s1, xEmu, offsetof(x64emu_t, res));
+    }
+    IFX(X_AF | X_OF)
+    {
+        ANDN(s3, s3, s1); // s3 = ~res & (op1 | op2)
+        OR(s3, s3, s4);   // cc = (~res & (op1 | op2)) | (op1 & op2)
+        IFX(X_AF)
+        {
+            ANDI(s4, s3, 0x08); // AF: cc & 0x08
+            BEQZ(s4, 8);
+            ORI(xFlags, xFlags, 1 << F_AF);
+        }
+        IFX(X_OF)
+        {
+            SRLI_D(s3, s3, rex.w ? 62 : 30);
+            SRLI_D(s4, s3, 1);
+            XOR(s3, s3, s4);
+            ANDI(s3, s3, 1); // OF: xor of two MSB's of cc
+            BEQZ(s3, 8);
+            ORI(xFlags, xFlags, 1 << F_OF2);
+        }
+    }
+    IFX(X_SF)
+    {
+        BGE(s1, xZR, 8);
+        ORI(xFlags, xFlags, 1 << F_SF);
+    }
+    if (!rex.w) {
+        ZEROUP(s1);
+    }
+    IFX(X_PF)
+    {
+        emit_pf(dyn, ninst, s1, s3, s4);
+    }
+    IFX(X_ZF)
+    {
+        BNEZ(s1, 8);
+        ORI(xFlags, xFlags, 1 << F_ZF);
+    }
+}
+
+// emit ADD32 instruction, from s1, constant c, store result in s1 using s2, s3, s4 and s5 as scratch
+void emit_add32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s2, int s3, int s4, int s5)
+{
+    CLEAR_FLAGS();
+    if (s1 == xRSP && (!dyn->insts || dyn->insts[ninst].x64.gen_flags == X_PEND)) {
+        // special case when doing math on RSP and only PEND is needed: ignoring it!
+        if (c >= -2048 && c < 2048) {
+            ADDIxw(s1, s1, c);
+        } else {
+            MOV64xw(s2, c);
+            ADDxw(s1, s1, s2);
+        }
+        return;
+    }
+    IFX(X_PEND | X_AF | X_CF | X_OF)
+    {
+        MOV64xw(s2, c);
+    }
+    IFX(X_PEND)
+    {
+        SDxw(s1, xEmu, offsetof(x64emu_t, op1));
+        SDxw(s2, xEmu, offsetof(x64emu_t, op2));
+        SET_DF(s3, rex.w ? d_add64 : d_add32b);
+    }
+    else IFX(X_ALL)
+    {
+        SET_DFNONE();
+    }
+    IFX(X_CF)
+    {
+        if (rex.w) {
+            MOV32w(s3, 0xffffffff); // same as emit_add32: avoid clobbering x2
+            AND(s5, s3, s1);
+            AND(s4, s3, s2);
+            ADD_D(s5, s5, s4);
+            SRLI_D(s3, s1, 0x20);
+            SRLI_D(s4, s2, 0x20);
+            ADD_D(s4, s4, s3);
+            SRLI_D(s5, s5, 0x20);
+            ADD_D(s5, s5, s4); // hi
+            SRAI_D(s5, s5, 0x20);
+            BEQZ(s5, 8);
+            ORI(xFlags, xFlags, 1 << F_CF);
+        } else {
+            ADD_D(s5, s1, s2);
+            SRLI_D(s5, s5, 0x20);
+            BEQZ(s5, 8);
+            ORI(xFlags, xFlags, 1 << F_CF);
+        }
+    }
+    IFX(X_AF | X_OF)
+    {
+        OR(s3, s1, s2);  // s3 = op1 | op2
+        AND(s4, s1, s2); // s4 = op1 & op2
+    }
+
+    if (c >= -2048 && c < 2048) {
+        ADDIxw(s1, s1, c);
+    } else {
+        IFX(X_PEND | X_AF | X_CF | X_OF) { }
+        else
+        {
+            MOV64xw(s2, c);
+        }
+        ADDxw(s1, s1, s2);
+    }
+
+    IFX(X_PEND)
+    {
+        SDxw(s1, xEmu, offsetof(x64emu_t, res));
+    }
+    IFX(X_AF | X_OF)
+    {
+        ANDN(s3, s3, s1); // s3 = ~res & (op1 | op2)
+        OR(s3, s3, s4);   // cc = (~res & (op1 | op2)) | (op1 & op2)
+        IFX(X_AF)
+        {
+            ANDI(s4, s3, 0x08); // AF: cc & 0x08
+            BEQZ(s4, 8);
+            ORI(xFlags, xFlags, 1 << F_AF);
+        }
+        IFX(X_OF)
+        {
+            SRLI_D(s3, s3, rex.w ? 62 : 30);
+            SRLI_D(s4, s3, 1);
+            XOR(s3, s3, s4);
+            ANDI(s3, s3, 1); // OF: xor of two MSB's of cc
+            BEQZ(s3, 8);
+            ORI(xFlags, xFlags, 1 << F_OF2);
+        }
+    }
+    IFX(X_SF)
+    {
+        BGE(s1, xZR, 8);
+        ORI(xFlags, xFlags, 1 << F_SF);
+    }
+    if (!rex.w) {
+        ZEROUP(s1);
+    }
+    IFX(X_PF)
+    {
+        emit_pf(dyn, ninst, s1, s3, s4);
+    }
+    IFX(X_ZF)
+    {
+        BNEZ(s1, 8);
+        ORI(xFlags, xFlags, 1 << F_ZF);
+    }
+}
+
+// emit ADD8 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
+void emit_add8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
+{
+    CLEAR_FLAGS();
+    IFX(X_PEND)
+    {
+        ST_B(s1, xEmu, offsetof(x64emu_t, op1));
+        ST_B(s2, xEmu, offsetof(x64emu_t, op2));
+        SET_DF(s3, d_add8);
+    }
+    else IFX(X_ALL)
+    {
+        SET_DFNONE();
+    }
+    IFX(X_AF | X_OF)
+    {
+        OR(s3, s1, s2);  // s3 = op1 | op2
+        AND(s4, s1, s2); // s4 = op1 & op2
+    }
+    ADD_D(s1, s1, s2);
+
+    IFX(X_AF | X_OF)
+    {
+        ANDN(s3, s3, s1); // s3 = ~res & (op1 | op2)
+        OR(s3, s3, s4);   // cc = (~res & (op1 | op2)) | (op1 & op2)
+        IFX(X_AF)
+        {
+            ANDI(s4, s3, 0x08); // AF: cc & 0x08
+            BEQZ(s4, 8);
+            ORI(xFlags, xFlags, 1 << F_AF);
+        }
+        IFX(X_OF)
+        {
+            SRLI_D(s3, s3, 6);
+            SRLI_D(s4, s3, 1);
+            XOR(s3, s3, s4);
+            ANDI(s3, s3, 1); // OF: xor of two MSB's of cc
+            BEQZ(s3, 8);
+            ORI(xFlags, xFlags, 1 << F_OF2);
+        }
+    }
+    IFX(X_CF)
+    {
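+        // the add was done on the full 64-bit register: the byte's carry-out is bit 8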
+        SRLI_D(s3, s1, 8);
+        BEQZ(s3, 8);
+        ORI(xFlags, xFlags, 1 << F_CF);
+    }
+    IFX(X_PEND)
+    {
+        ST_H(s1, xEmu, offsetof(x64emu_t, res));
+    }
+    ANDI(s1, s1, 0xff);
+    IFX(X_ZF)
+    {
+        BNEZ(s1, 8);
+        ORI(xFlags, xFlags, 1 << F_ZF);
+    }
+    IFX(X_SF)
+    {
+        SRLI_D(s3, s1, 7);
+        BEQZ(s3, 8);
+        ORI(xFlags, xFlags, 1 << F_SF);
+    }
+    IFX(X_PF)
+    {
+        emit_pf(dyn, ninst, s1, s3, s4);
+    }
+}
+
+// emit ADD8 instruction, from s1, const c, store result in s1 using s3 and s4 as scratch
+void emit_add8c(dynarec_la64_t* dyn, int ninst, int s1, int c, int s2, int s3, int s4)
+{
+    CLEAR_FLAGS();
+    IFX(X_PEND)
+    {
+        MOV32w(s4, c & 0xff);
+        ST_B(s1, xEmu, offsetof(x64emu_t, op1));
+        ST_B(s4, xEmu, offsetof(x64emu_t, op2));
+        SET_DF(s3, d_add8);
+    }
+    else IFX(X_ALL)
+    {
+        SET_DFNONE();
+    }
+    IFX(X_AF | X_OF)
+    {
+        IFX(X_PEND) { }
+        else
+        {
+            MOV32w(s4, c & 0xff);
+        }
+        OR(s3, s1, s4);  // s3 = op1 | op2
+        AND(s4, s1, s4); // s4 = op1 & op2
+    }
+    ADDI_D(s1, s1, c);
+
+    IFX(X_AF | X_OF)
+    {
+        ANDN(s3, s3, s1); // s3 = ~res & (op1 | op2)
+        OR(s3, s3, s4);   // cc = (~res & (op1 | op2)) | (op1 & op2)
+        IFX(X_AF)
+        {
+            ANDI(s4, s3, 0x08); // AF: cc & 0x08
+            BEQZ(s4, 8);
+            ORI(xFlags, xFlags, 1 << F_AF);
+        }
+        IFX(X_OF)
+        {
+            SRLI_D(s3, s3, 6);
+            SRLI_D(s4, s3, 1);
+            XOR(s3, s3, s4);
+            ANDI(s3, s3, 1); // OF: xor of two MSB's of cc
+            BEQZ(s3, 8);
+            ORI(xFlags, xFlags, 1 << F_OF2);
+        }
+    }
+    IFX(X_CF)
+    {
+        SRLI_D(s3, s1, 8);
+        BEQZ(s3, 8);
+        ORI(xFlags, xFlags, 1 << F_CF);
+    }
+    IFX(X_PEND)
+    {
+        ST_H(s1, xEmu, offsetof(x64emu_t, res));
+    }
+    ANDI(s1, s1, 0xff);
+    IFX(X_ZF)
+    {
+        BNEZ(s1, 8);
+        ORI(xFlags, xFlags, 1 << F_ZF);
+    }
+    IFX(X_SF)
+    {
+        SRLI_D(s3, s1, 7);
+        BEQZ(s3, 8);
+        ORI(xFlags, xFlags, 1 << F_SF);
+    }
+    IFX(X_PF)
+    {
+        emit_pf(dyn, ninst, s1, s3, s4);
+    }
+}
+
+// emit SUB8 instruction, from s1, s2, store result in s1 using s3, s4 and s5 as scratch
+void emit_sub8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5)
+{
+    CLEAR_FLAGS();
+    IFX(X_PEND) {
+        ST_B(s1, xEmu, offsetof(x64emu_t, op1));
+        ST_B(s2, xEmu, offsetof(x64emu_t, op2));
+        SET_DF(s3, d_sub8);
+    } else IFX(X_ALL) {
+        SET_DFNONE();
+    }
+
+    IFX(X_AF | X_CF | X_OF) {
+        // for later flag calculation
+        NOT(s5, s1);
+    }
+
+    SUB_D(s1, s1, s2);
+    ANDI(s1, s1, 0xff);
+    IFX(X_SF) {
+        SRLI_D(s3, s1, 7);
+        BEQZ(s3, 8);
+        ORI(xFlags, xFlags, 1 << F_SF);
+    }
+    IFX(X_PEND) {
+        ST_B(s1, xEmu, offsetof(x64emu_t, res));
+    }
+    CALC_SUB_FLAGS(s5, s2, s1, s3, s4, 8);
+    IFX(X_ZF) {
+        BNEZ(s1, 8);
+        ORI(xFlags, xFlags, 1 << F_ZF);
+    }
+    IFX(X_PF) {
+        emit_pf(dyn, ninst, s1, s3, s4);
+    }
+}
+
+// emit SUB8 instruction, from s1, constant c, store result in s1 using s2, s3, s4 and s5 as scratch
+void emit_sub8c(dynarec_la64_t* dyn, int ninst, int s1, int c, int s2, int s3, int s4, int s5)
+{
+    MOV32w(s2, c&0xff);
+    emit_sub8(dyn, ninst, s1, s2, s3, s4, s5);
+}
+
+// emit SUB32 instruction, from s1, s2, store result in s1 using s3, s4 and s5 as scratch
+void emit_sub32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5)
+{
+    CLEAR_FLAGS();
+    IFX(X_PEND) {
+        SDxw(s1, xEmu, offsetof(x64emu_t, op1));
+        SDxw(s2, xEmu, offsetof(x64emu_t, op2));
+        SET_DF(s3, rex.w?d_sub64:d_sub32);
+    } else IFX(X_ALL) {
+        SET_DFNONE();
+    }
+
+    IFX(X_AF | X_CF | X_OF) {
+        // for later flag calculation
+        NOT(s5, s1);
+    }
+
+    SUBxw(s1, s1, s2);
+    IFX(X_PEND) {
+        SDxw(s1, xEmu, offsetof(x64emu_t, res));
+    }
+    IFX(X_SF) {
+        BGE(s1, xZR, 8);
+        ORI(xFlags, xFlags, 1 << F_SF);
+    }
+    if (!rex.w) {
+        ZEROUP(s1);
+    }
+    CALC_SUB_FLAGS(s5, s2, s1, s3, s4, rex.w?64:32);
+    IFX(X_ZF) {
+        BNEZ(s1, 8);
+        ORI(xFlags, xFlags, 1 << F_ZF);
+    }
+    IFX(X_PF) {
+        emit_pf(dyn, ninst, s1, s3, s4);
+    }
+}
+
+// emit SUB32 instruction, from s1, constant c, store result in s1 using s2, s3, s4 and s5 as scratch
+void emit_sub32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s2, int s3, int s4, int s5)
+{
+    CLEAR_FLAGS();
+    if(s1==xRSP && (!dyn->insts || dyn->insts[ninst].x64.gen_flags==X_PEND))
+    {
+        // special case when doing math on RSP and only PEND is needed: ignoring it!
+        if (c > -2048 && c <= 2048) {
+            ADDI_D(s1, s1, -c);
+        } else {
+            MOV64xw(s2, c);
+            SUBxw(s1, s1, s2);
+        }
+        return;
+    }
+
+    IFX(X_PEND) {
+        SDxw(s1, xEmu, offsetof(x64emu_t, op1));
+        MOV64xw(s2, c);
+        SDxw(s2, xEmu, offsetof(x64emu_t, op2));
+        SET_DF(s3, rex.w?d_sub64:d_sub32);
+    } else IFX(X_ALL) {
+        SET_DFNONE();
+    }
+
+    IFX(X_AF | X_CF | X_OF) {
+        // for later flag calculation
+        NOT(s5, s1);
+    }
+
+    if (c > -2048 && c <= 2048) {
+        ADDIxw(s1, s1, -c);
+    } else {
+        IFX(X_PEND) {} else {MOV64xw(s2, c);}
+        SUBxw(s1, s1, s2);
+    }
+
+    IFX(X_AF | X_CF | X_OF) {
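+        // CALC_SUB_FLAGS below needs op2 in s2: reload it when neither the
+        // X_PEND store nor the wide-constant path has done so already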
+        IFX(X_PEND) {}
+        else if (c > -2048 && c <= 2048) {
+            MOV64xw(s2, c);
+        }
+    }
+    IFX(X_PEND) {
+        SDxw(s1, xEmu, offsetof(x64emu_t, res));
+    }
+    IFX(X_SF) {
+        BGE(s1, xZR, 8);
+        ORI(xFlags, xFlags, 1 << F_SF);
+    }
+    if (!rex.w) {
+        ZEROUP(s1);
+    }
+    CALC_SUB_FLAGS(s5, s2, s1, s3, s4, rex.w?64:32);
+    IFX(X_ZF) {
+        BNEZ(s1, 8);
+        ORI(xFlags, xFlags, 1 << F_ZF);
+    }
+    IFX(X_PF) {
+        emit_pf(dyn, ninst, s1, s3, s4);
+    }
+}
diff --git a/src/dynarec/la64/dynarec_la64_functions.h b/src/dynarec/la64/dynarec_la64_functions.h
index 5f7b5036..67608783 100644
--- a/src/dynarec/la64/dynarec_la64_functions.h
+++ b/src/dynarec/la64/dynarec_la64_functions.h
@@ -4,7 +4,7 @@
 #include "../dynarec_native_functions.h"
 
 typedef struct x64emu_s x64emu_t;
-typedef struct dynarec_rv64_s dynarec_rv64_t;
+typedef struct dynarec_la64_s dynarec_la64_t;
 
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_la64_t* dyn);
diff --git a/src/dynarec/la64/dynarec_la64_helper.c b/src/dynarec/la64/dynarec_la64_helper.c
index 71e3c1ed..1215cbd8 100644
--- a/src/dynarec/la64/dynarec_la64_helper.c
+++ b/src/dynarec/la64/dynarec_la64_helper.c
@@ -401,6 +401,67 @@ void jump_to_next(dynarec_la64_t* dyn, uintptr_t ip, int reg, int ninst, int is3
 
 void call_c(dynarec_la64_t* dyn, int ninst, void* fnc, int reg, int ret, int saveflags, int savereg)
 {
+    MAYUSE(fnc);
+    if (savereg == 0)
+        savereg = x6;
+    if (saveflags) {
+        FLAGS_ADJUST_TO11(xFlags, xFlags, reg);
+        ST_D(xFlags, xEmu, offsetof(x64emu_t, eflags));
+    }
+    fpu_pushcache(dyn, ninst, reg, 0);
+    if (ret != -2) {
+        ADDI_D(xSP, xSP, -16); // the LA64 stack needs to stay 16-byte aligned
+        ST_D(xEmu, xSP, 0);
+        ST_D(savereg, xSP, 8);
+        // caller-saved registers holding x86 state must be preserved across the call
+        STORE_REG(RAX);
+        STORE_REG(RCX);
+        STORE_REG(RDX);
+        STORE_REG(R12);
+        STORE_REG(R13);
+        STORE_REG(R14);
+        STORE_REG(R15);
+        ST_D(xRIP, xEmu, offsetof(x64emu_t, ip));
+    }
+    TABLE64(reg, (uintptr_t)fnc);
+    JIRL(xRA, reg, 0);
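+    // the C return value comes back in a0, which is also xEmu: copy it out
+    // before xEmu is restored from the stack below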
+    if (ret >= 0) {
+        MV(ret, xEmu);
+    }
+    if (ret != -2) {
+        LD_D(xEmu, xSP, 0);
+        LD_D(savereg, xSP, 8);
+        ADDI_D(xSP, xSP, 16);
+#define GO(A) \
+    if (ret != x##A) { LOAD_REG(A); }
+        GO(RAX);
+        GO(RCX);
+        GO(RDX);
+        GO(R12);
+        GO(R13);
+        GO(R14);
+        GO(R15);
+        if (ret != xRIP)
+            LD_D(xRIP, xEmu, offsetof(x64emu_t, ip));
+#undef GO
+    }
+
+    fpu_popcache(dyn, ninst, reg, 0);
+    if (saveflags) {
+        LD_D(xFlags, xEmu, offsetof(x64emu_t, eflags));
+        FLAGS_ADJUST_FROM11(xFlags, xFlags, reg);
+    }
+    SET_NODF();
+    dyn->last_ip = 0;
+}
+
+void fpu_pushcache(dynarec_la64_t* dyn, int ninst, int s1, int not07)
+{
+    // TODO
+}
+
+void fpu_popcache(dynarec_la64_t* dyn, int ninst, int s1, int not07)
+{
     // TODO
 }
 
@@ -419,6 +480,24 @@ void fpu_unreflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3)
     // TODO
 }
 
+void emit_pf(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4)
+{
+    MAYUSE(dyn);
+    MAYUSE(ninst);
+    // PF: (((emu->x64emu_parity_tab[(res&0xff) / 32] >> ((res&0xff) % 32)) & 1) == 0)
+    MOV64x(s4, (uintptr_t)GetParityTab());
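+    // ((res & 0xff) >> 5) * 4 is the byte offset of the 32-bit word holding
+    // this byte's parity bit; the word is inverted because PF must be set
+    // when the table bit is 0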
+    SRLI_D(s3, s1, 3);
+    ANDI(s3, s3, 28);
+    ADD_D(s4, s4, s3);
+    LD_W(s4, s4, 0);
+    NOT(s4, s4);
+    SRL_W(s4, s4, s1);
+    ANDI(s4, s4, 1);
+
+    BEQZ(s4, 8);
+    ORI(xFlags, xFlags, 1 << F_PF);
+}
+
 void fpu_reset_cache(dynarec_la64_t* dyn, int ninst, int reset_n)
 {
     // TODO
diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h
index deec5656..6073a359 100644
--- a/src/dynarec/la64/dynarec_la64_helper.h
+++ b/src/dynarec/la64/dynarec_la64_helper.h
@@ -82,20 +82,116 @@
 #define LOCK_LOCK (int*)1
 
 // GETGD    get x64 register in gd
-#define GETGD                                                        \
-    do {                                                             \
-        gd = TO_LA64(((nextop & 0x38) >> 3) + (rex.r << 3)); \
-    } while (0);
+#define GETGD gd = TO_LA64(((nextop & 0x38) >> 3) + (rex.r << 3));
+
+// GETED can use r1 for ed, and r2 for wback. wback is 0 if ed is xEAX..xEDI
+#define GETED(D)                                                                                \
+    if (MODREG) {                                                                               \
+        ed = TO_LA64((nextop & 7) + (rex.b << 3));                                             \
+        wback = 0;                                                                              \
+    } else {                                                                                    \
+        SMREAD();                                                                               \
+        addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \
+        if (rex.w)                                                                              \
+            LD_D(x1, wback, fixedaddress);                                                      \
+        else                                                                                    \
+            LD_W(x1, wback, fixedaddress);                                                      \
+        ed = x1;                                                                                \
+    }
+
+// Write back ed in wback (if wback not 0)
+#define WBACK                              \
+    if (wback) {                           \
+        if (rex.w)                         \
+            ST_D(ed, wback, fixedaddress); \
+        else                               \
+            ST_W(ed, wback, fixedaddress); \
+        SMWRITE();                         \
+    }
+
+// GETEB will use i for ed, and can use r3 for wback.
+#define GETEB(i, D)                                                                             \
+    if (MODREG) {                                                                               \
+        if (rex.rex) {                                                                          \
+            wback = TO_LA64((nextop & 7) + (rex.b << 3));                                      \
+            wb2 = 0;                                                                            \
+        } else {                                                                                \
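+            /* no REX: encodings 4..7 address AH,CH,DH,BH, i.e. byte 1 of RAX..RBX */          \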
+            wback = (nextop & 7);                                                               \
+            wb2 = (wback >> 2) * 8;                                                             \
+            wback = TO_LA64((wback & 3));                                                      \
+        }                                                                                       \
+        if (wb2) {                                                                              \
+            MV(i, wback);                                                                       \
+            SRLI_D(i, i, wb2);                                                                  \
+            ANDI(i, i, 0xff);                                                                   \
+        } else                                                                                  \
+            ANDI(i, wback, 0xff);                                                               \
+        wb1 = 0;                                                                                \
+        ed = i;                                                                                 \
+    } else {                                                                                    \
+        SMREAD();                                                                               \
+        addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \
+        LD_BU(i, wback, fixedaddress);                                                          \
+        wb1 = 1;                                                                                \
+        ed = i;                                                                                 \
+    }
 
-// CALL will use x7 for the call address. Return value can be put in ret (unless ret is -1)
+// GETGB will use i for gd
+#define GETGB(i)                                               \
+    if (rex.rex) {                                             \
+        gb1 = TO_LA64(((nextop & 0x38) >> 3) + (rex.r << 3)); \
+        gb2 = 0;                                               \
+    } else {                                                   \
+        gd = (nextop & 0x38) >> 3;                             \
+        gb2 = ((gd & 4) >> 2);                                 \
+        gb1 = TO_LA64((gd & 3));                              \
+    }                                                          \
+    gd = i;                                                    \
+    if (gb2) {                                                 \
+        MV(gd, gb1);                                           \
+        SRLI_D(gd, gd, 8);                                     \
+        ANDI(gd, gd, 0xff);                                    \
+    } else                                                     \
+        ANDI(gd, gb1, 0xff);
+
+// Write gb (gd) back to original register / memory, using s1 as scratch
+#define GBBACK(s1)                        \
+    if (gb2) {                            \
+        MOV64x(s1, 0xffffffffffff00ffLL); \
+        AND(gb1, gb1, s1);                \
+        SLLI_D(s1, gd, 8);                \
+        OR(gb1, gb1, s1);                 \
+    } else {                              \
+        /* ANDI's imm12 is zero-extended: build the full ~0xff mask */ \
+        MOV64x(s1, 0xffffffffffffff00LL); \
+        AND(gb1, gb1, s1);                \
+        OR(gb1, gb1, gd);                 \
+    }
+
+// Write eb (ed) back to original register / memory, using s1 as scratch
+#define EBBACK(s1, c)                     \
+    if (wb1) {                            \
+        ST_B(ed, wback, fixedaddress);    \
+        SMWRITE();                        \
+    } else if (wb2) {                     \
+        MOV64x(s1, 0xffffffffffff00ffLL); \
+        AND(wback, wback, s1);            \
+        if (c) { ANDI(ed, ed, 0xff); }    \
+        SLLI_D(s1, ed, 8);                \
+        OR(wback, wback, s1);             \
+    } else {                              \
+        /* same: ~0xff does not fit ANDI's zero-extended imm12 */ \
+        MOV64x(s1, 0xffffffffffffff00LL); \
+        AND(wback, wback, s1);            \
+        if (c) { ANDI(ed, ed, 0xff); }    \
+        OR(wback, wback, ed);             \
+    }
+
+// CALL will use x6 for the call address. Return value can be put in ret (unless ret is -1)
 // R0 will not be pushed/popped if ret is -2
-#define CALL(F, ret) call_c(dyn, ninst, F, x7, ret, 1, 0)
-// CALL_ will use x7 for the call address. Return value can be put in ret (unless ret is -1)
+#define CALL(F, ret) call_c(dyn, ninst, F, x6, ret, 1, 0)
+// CALL_ will use x6 for the call address. Return value can be put in ret (unless ret is -1)
 // R0 will not be pushed/popped if ret is -2
-#define CALL_(F, ret, reg) call_c(dyn, ninst, F, x7, ret, 1, reg)
-// CALL_S will use x7 for the call address. Return value can be put in ret (unless ret is -1)
+#define CALL_(F, ret, reg) call_c(dyn, ninst, F, x6, ret, 1, reg)
+// CALL_S will use x6 for the call address. Return value can be put in ret (unless ret is -1)
 // R0 will not be pushed/popped if ret is -2. Flags are not saved/restored
-#define CALL_S(F, ret) call_c(dyn, ninst, F, x7, ret, 0, 0)
+#define CALL_S(F, ret) call_c(dyn, ninst, F, x6, ret, 0, 0)
 
 #define MARKi(i)    dyn->insts[ninst].mark[i] = dyn->native_size
 #define GETMARKi(i) dyn->insts[ninst].mark[i]
@@ -118,9 +214,119 @@
 #define MARKLOCK    dyn->insts[ninst].marklock = dyn->native_size
 #define GETMARKLOCK dyn->insts[ninst].marklock
 
+#define IFX(A) if ((dyn->insts[ninst].x64.gen_flags & (A)))
+
+#define STORE_REG(A) ST_D(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
+#define LOAD_REG(A)  LD_D(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
+
+#define SET_DFNONE()                             \
+    if (!dyn->f.dfnone) {                        \
+        ST_W(xZR, xEmu, offsetof(x64emu_t, df)); \
+        dyn->f.dfnone = 1;                       \
+    }
+#define SET_DF(S, N)                           \
+    if ((N) != d_none) {                       \
+        MOV32w(S, (N));                        \
+        ST_W(S, xEmu, offsetof(x64emu_t, df)); \
+        dyn->f.dfnone = 0;                     \
+    } else                                     \
+        SET_DFNONE()
+#define SET_NODF() dyn->f.dfnone = 0
+#define SET_DFOK() dyn->f.dfnone = 1
+
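+// status flags are cleared up front, so the emit_* helpers only ever need
+// to OR individual bits back in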
+#define CLEAR_FLAGS() \
+    IFX(X_ALL) { ANDI(xFlags, xFlags, ~((1UL << F_AF) | (1UL << F_CF) | (1UL << F_OF2) | (1UL << F_ZF) | (1UL << F_SF) | (1UL << F_PF))); }
+
+#define CALC_SUB_FLAGS(op1_, op2, res, scratch1, scratch2, width)     \
+    IFX(X_AF | X_CF | X_OF)                                           \
+    {                                                                 \
+        /* calc borrow chain */                                       \
+        /* bc = (res & (~op1 | op2)) | (~op1 & op2) */                \
+        OR(scratch1, op1_, op2);                                      \
+        AND(scratch2, res, scratch1);                                 \
+        AND(op1_, op1_, op2);                                         \
+        OR(scratch2, scratch2, op1_);                                 \
+        IFX(X_AF)                                                     \
+        {                                                             \
+            /* af = bc & 0x8 */                                       \
+            ANDI(scratch1, scratch2, 8);                              \
+            BEQZ(scratch1, 8);                                        \
+            ORI(xFlags, xFlags, 1 << F_AF);                           \
+        }                                                             \
+        IFX(X_CF)                                                     \
+        {                                                             \
+            /* cf = bc & (1<<(width-1)) */                            \
+            if ((width) == 8) {                                       \
+                ANDI(scratch1, scratch2, 0x80);                       \
+            } else {                                                  \
+                SRLI_D(scratch1, scratch2, (width)-1);                \
+                if (width != 64) ANDI(scratch1, scratch1, 1);         \
+            }                                                         \
+            BEQZ(scratch1, 8);                                        \
+            ORI(xFlags, xFlags, 1 << F_CF);                           \
+        }                                                             \
+        IFX(X_OF)                                                     \
+        {                                                             \
+            /* of = ((bc >> (width-2)) ^ (bc >> (width-1))) & 0x1; */ \
+            SRLI_D(scratch1, scratch2, (width)-2);                    \
+            SRLI_D(scratch2, scratch1, 1);                            \
+            XOR(scratch1, scratch1, scratch2);                        \
+            ANDI(scratch1, scratch1, 1);                              \
+            BEQZ(scratch1, 8);                                        \
+            ORI(xFlags, xFlags, 1 << F_OF2);                          \
+        }                                                             \
+    }
+
+// Adjust the flags bit 11 -> bit 5, result in reg (can be same as flags, but not s1)
+#define FLAGS_ADJUST_FROM11(reg, flags, s1) \
+    ANDI(reg, flags, ~(1 << 5));            \
+    SRLI_D(s1, reg, 11 - 5);                \
+    ANDI(s1, s1, 1 << 5);                   \
+    OR(reg, reg, s1)
+
+// Adjust the xFlags bit 5 -> bit 11, src and dst can be the same (and can be xFlags, but not s1)
+#define FLAGS_ADJUST_TO11(dst, src, s1) \
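+    /* build 0xFFFFF7DF = ~((1 << 11) | (1 << 5)) in s1 */ \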
+    LU12I_W(s1, 0xFFFFF);               \
+    ADDI_W(s1, s1, 0x7DF);              \
+    AND(s1, src, s1);                   \
+    ANDI(dst, src, 1 << 5);             \
+    SLLI_D(dst, dst, 11 - 5);           \
+    OR(dst, dst, s1)
+
 #ifndef READFLAGS
-#define READFLAGS(A)
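+// Materialize deferred flags before an opcode that reads them: if a deferred
+// computation is still pending, call UpdateFlags (skipped when df is none).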
+#define READFLAGS(A)                                \
+    if (((A) != X_PEND && dyn->f.pending != SF_SET) \
+        && (dyn->f.pending != SF_SET_PENDING)) {    \
+        if (dyn->f.pending != SF_PENDING) {         \
+            LD_D(x3, xEmu, offsetof(x64emu_t, df)); \
+            j64 = (GETMARKF) - (dyn->native_size);  \
+            BEQ(x3, xZR, j64);                      \
+        }                                           \
+        CALL_(UpdateFlags, -1, 0);                  \
+        FLAGS_ADJUST_FROM11(xFlags, xFlags, x3);    \
+        MARKF;                                      \
+        dyn->f.pending = SF_SET;                    \
+        SET_DFOK();                                 \
+    }
+#endif
 
+#ifndef SETFLAGS
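+// Declare how the current opcode produces flags: either computed right away
+// (SF_SET) or deferred to a later UpdateFlags call (the *_PENDING variants).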
+#define SETFLAGS(A, B)                                                                                              \
+    if (dyn->f.pending != SF_SET                                                                                    \
+        && ((B) & SF_SUB)                                                                                           \
+        && (dyn->insts[ninst].x64.gen_flags & (~(A))))                                                              \
+        READFLAGS(((dyn->insts[ninst].x64.gen_flags & X_PEND) ? X_ALL : dyn->insts[ninst].x64.gen_flags) & (~(A))); \
+    if (dyn->insts[ninst].x64.gen_flags) switch (B) {                                                               \
+            case SF_SUBSET:                                                                                         \
+            case SF_SET: dyn->f.pending = SF_SET; break;                                                            \
+            case SF_PENDING: dyn->f.pending = SF_PENDING; break;                                                    \
+            case SF_SUBSET_PENDING:                                                                                 \
+            case SF_SET_PENDING:                                                                                    \
+                dyn->f.pending = (dyn->insts[ninst].x64.gen_flags & X_PEND) ? SF_SET_PENDING : SF_SET;              \
+                break;                                                                                              \
+        }                                                                                                           \
+    else                                                                                                            \
+        dyn->f.pending = SF_SET
 #endif
 
 #ifndef BARRIER
@@ -190,7 +396,19 @@ void* la64_next(x64emu_t* emu, uintptr_t addr);
 #define jump_to_epilog STEPNAME(jump_to_epilog)
 #define jump_to_next   STEPNAME(jump_to_next)
 #define call_c         STEPNAME(call_c)
-
+#define emit_add32     STEPNAME(emit_add32)
+#define emit_add32c    STEPNAME(emit_add32c)
+#define emit_add8      STEPNAME(emit_add8)
+#define emit_add8c     STEPNAME(emit_add8c)
+#define emit_sub32     STEPNAME(emit_sub32)
+#define emit_sub32c    STEPNAME(emit_sub32c)
+#define emit_sub8      STEPNAME(emit_sub8)
+#define emit_sub8c     STEPNAME(emit_sub8c)
+
+#define emit_pf STEPNAME(emit_pf)
+
+#define fpu_pushcache       STEPNAME(fpu_pushcache)
+#define fpu_popcache        STEPNAME(fpu_popcache)
 #define fpu_reset_cache     STEPNAME(fpu_reset_cache)
 #define fpu_propagate_stack STEPNAME(fpu_propagate_stack)
 #define fpu_purgecache      STEPNAME(fpu_purgecache)
@@ -207,7 +425,18 @@ uintptr_t geted32(dynarec_la64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop
 void jump_to_epilog(dynarec_la64_t* dyn, uintptr_t ip, int reg, int ninst);
 void jump_to_next(dynarec_la64_t* dyn, uintptr_t ip, int reg, int ninst, int is32bits);
 void call_c(dynarec_la64_t* dyn, int ninst, void* fnc, int reg, int ret, int saveflags, int save_reg);
-
+void emit_add32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5);
+void emit_add32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s2, int s3, int s4, int s5);
+void emit_add8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
+void emit_add8c(dynarec_la64_t* dyn, int ninst, int s1, int32_t c, int s2, int s3, int s4);
+void emit_sub32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5);
+void emit_sub32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s2, int s3, int s4, int s5);
+void emit_sub8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
+void emit_sub8c(dynarec_la64_t* dyn, int ninst, int s1, int32_t c, int s2, int s3, int s4, int s5);
+
+void emit_pf(dynarec_la64_t* dyn, int ninst, int s1, int s3, int s4);
+
+// common coproc helpers
 // reset the cache with n
 void fpu_reset_cache(dynarec_la64_t* dyn, int ninst, int reset_n);
 // propagate stack state
@@ -216,6 +445,8 @@ void fpu_propagate_stack(dynarec_la64_t* dyn, int ninst);
 void fpu_purgecache(dynarec_la64_t* dyn, int ninst, int next, int s1, int s2, int s3);
 void fpu_reflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3);
 void fpu_unreflectcache(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3);
+void fpu_pushcache(dynarec_la64_t* dyn, int ninst, int s1, int not07);
+void fpu_popcache(dynarec_la64_t* dyn, int ninst, int s1, int not07);
 
 uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 
diff --git a/src/dynarec/la64/dynarec_la64_jumpnext.c b/src/dynarec/la64/dynarec_la64_jmpnext.c
index 793eb729..793eb729 100644
--- a/src/dynarec/la64/dynarec_la64_jumpnext.c
+++ b/src/dynarec/la64/dynarec_la64_jmpnext.c
diff --git a/src/dynarec/la64/dynarec_la64_pass0.h b/src/dynarec/la64/dynarec_la64_pass0.h
index 3f6a68ce..f3cca4d7 100644
--- a/src/dynarec/la64/dynarec_la64_pass0.h
+++ b/src/dynarec/la64/dynarec_la64_pass0.h
@@ -9,6 +9,11 @@
     dyn->insts[ninst].x64.use_flags = A; \
     dyn->f.dfnone = 1;                   \
     dyn->f.pending = SF_SET
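+// pass0 only records which flags each opcode defines and in what way; later
+// passes use this to decide which flag computations are really needed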
+#define SETFLAGS(A, B)                     \
+    dyn->insts[ninst].x64.set_flags = A;   \
+    dyn->insts[ninst].x64.state_flags = B; \
+    dyn->f.pending = (B) & SF_SET_PENDING; \
+    dyn->f.dfnone = ((B) & SF_SET) ? 1 : 0;
 #define EMIT(A) dyn->native_size += 4
 #define BARRIER(A)                                 \
     if (A != BARRIER_MAYBE) {                      \
diff --git a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h
index 7ba34204..a1719187 100644
--- a/src/dynarec/la64/la64_emitter.h
+++ b/src/dynarec/la64/la64_emitter.h
@@ -99,6 +99,9 @@ f24-f31  fs0-fs7   Static registers                Callee
 #define wZR     xZR
 #define r0      xZR
 
+// replacement for F_OF internally, using a reserved bit. Use F_OF2 internally, never F_OF directly!
+#define F_OF2 F_res3
+
 // split a 32bits value in 20bits + 12bits, adjusting the upper part if the lower 12bits part is negative
 #define SPLIT20(A) (((A) + 0x800) >> 12)
 #define SPLIT12(A) ((A) & 0xfff)
@@ -195,6 +198,19 @@ f24-f31  fs0-fs7   Static registers                Callee
 // GR[rd] = GR[rj] ^ ZeroExtend(imm12, GRLEN)
 #define XORI(rd, rj, imm12) EMIT(type_2RI12(0b0000001111, imm12, rj, rd))
 
+// tmp = SLL(GR[rj][31:0], GR[rk][4:0])
+// GR[rd] = SignExtend(tmp[31:0], GRLEN)
+#define SLL_W(rd, rj, rk) EMIT(type_3R(0b00000000000101110, rk, rj, rd))
+// tmp = SRL(GR[rj][31:0], GR[rk][4:0])
+// GR[rd] = SignExtend(tmp[31:0], GRLEN)
+#define SRL_W(rd, rj, rk) EMIT(type_3R(0b00000000000101111, rk, rj, rd))
+// tmp = SRA(GR[rj][31:0], GR[rk][4:0])
+// GR[rd] = SignExtend(tmp[31:0], GRLEN)
+#define SRA_W(rd, rj, rk) EMIT(type_3R(0b00000000000110000, rk, rj, rd))
+// tmp = ROTR(GR[rj][31:0], GR[rk][4:0])
+// GR[rd] = SignExtend(tmp[31:0], GRLEN)
+#define ROTR_W(rd, rj, rk) EMIT(type_3R(0b00000000000110110, rk, rj, rd))
+
 // GR[rd] = SLL(GR[rj][63:0], imm6) (Shift Left Logical)
 #define SLLI_D(rd, rj, imm6) EMIT(type_2RI6(0b0000000001000001, imm6, rj, rd))
 // GR[rd] = SRL(GR[rj][63:0], imm6) (Shift Right Logical)
@@ -213,6 +229,25 @@ f24-f31  fs0-fs7   Static registers                Callee
         ADD_D(rd, rs1, scratch);           \
     }
 
+// if GR[rj] == GR[rd]:
+//     PC = PC + SignExtend({imm16, 2'b0}, GRLEN)
+#define BEQ(rj, rd, imm16) EMIT(type_2RI16(0b010110, imm16, rj, rd))
+// if GR[rj] != GR[rd]:
+//     PC = PC + SignExtend({imm16, 2'b0}, GRLEN)
+#define BNE(rj, rd, imm16) EMIT(type_2RI16(0b010111, imm16, rj, rd))
+// if signed(GR[rj]) < signed(GR[rd]):
+//     PC = PC + SignExtend({imm16, 2'b0}, GRLEN)
+#define BLT(rj, rd, imm16) EMIT(type_2RI16(0b011000, imm16, rj, rd))
+// if signed(GR[rj]) >= signed(GR[rd]):
+//     PC = PC + SignExtend({imm16, 2'b0}, GRLEN)
+#define BGE(rj, rd, imm16) EMIT(type_2RI16(0b011001, imm16, rj, rd))
+// if unsigned(GR[rj]) < unsigned(GR[rd]):
+//     PC = PC + SignExtend({imm16, 2'b0}, GRLEN)
+#define BLTU(rj, rd, imm16) EMIT(type_2RI16(0b011010, imm16, rj, rd))
+// if unsigned(GR[rj]) >= unsigned(GR[rd]):
+//     PC = PC + SignExtend({imm16, 2'b0}, GRLEN)
+#define BGEU(rj, rd, imm16) EMIT(type_2RI16(0b011011, imm16, rj, rd))
+
 // if GR[rj] == 0:
 //     PC = PC + SignExtend({imm21, 2'b0}, GRLEN)
 #define BEQZ(rj, imm21) EMIT(type_1RI21(0b010000, (imm21) >> 2, rj))
@@ -305,6 +340,19 @@ f24-f31  fs0-fs7   Static registers                Callee
         LU52I_D(rd, rd, ((uint64_t)(imm64)) >> 52); \
     }
 
+#define MOV64xw(A, B) \
+    if (rex.w) {      \
+        MOV64x(A, B); \
+    } else {          \
+        MOV32w(A, B); \
+    }
+#define MOV64z(A, B)    \
+    if (rex.is32bits) { \
+        MOV32w(A, B);   \
+    } else {            \
+        MOV64x(A, B);   \
+    }
+
 // rd[63:0] = rj[63:0] (pseudo instruction)
 #define MV(rd, rj) ADDI_D(rd, rj, 0)
 // rd = rj (pseudo instruction)
@@ -324,6 +372,53 @@ f24-f31  fs0-fs7   Static registers                Callee
         MV(rd, rj);             \
     }
 
+// rd = ~rs1 (a full bitwise not: XORI would only flip the low 12 bits, since
+// its immediate is zero-extended)
+#define NOT(rd, rs1) NOR(rd, rs1, xZR)
+
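+// the *xw helpers emit the 64-bit form when rex.w is set and the 32-bit form
+// otherwise; the *z helpers key off rex.is32bits (rex comes from the caller's
+// scope)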
+#define ADDIxw(rd, rj, imm12)  \
+    if (rex.w)                 \
+        ADDI_D(rd, rj, imm12); \
+    else                       \
+        ADDI_W(rd, rj, imm12);
+#define ADDIz(rd, rj, imm12)   \
+    if (rex.is32bits)          \
+        ADDI_W(rd, rj, imm12); \
+    else                       \
+        ADDI_D(rd, rj, imm12);
+
+#define ADDxw(rd, rj, rk)  \
+    if (rex.w)             \
+        ADD_D(rd, rj, rk); \
+    else                   \
+        ADD_W(rd, rj, rk);
+#define ADDz(rd, rj, rk)   \
+    if (rex.is32bits)      \
+        ADD_W(rd, rj, rk); \
+    else                   \
+        ADD_D(rd, rj, rk);
+
+#define SDxw(rd, rj, imm12)  \
+    if (rex.w)               \
+        ST_D(rd, rj, imm12); \
+    else                     \
+        ST_W(rd, rj, imm12);
+#define SDz(rd, rj, imm12)   \
+    if (rex.is32bits)        \
+        ST_W(rd, rj, imm12); \
+    else                     \
+        ST_D(rd, rj, imm12);
+
+#define SUBxw(rd, rj, rk)  \
+    if (rex.w)             \
+        SUB_D(rd, rj, rk); \
+    else                   \
+        SUB_W(rd, rj, rk);
+#define SUBz(rd, rj, rk)   \
+    if (rex.is32bits)      \
+        SUB_W(rd, rj, rk); \
+    else                   \
+        SUB_D(rd, rj, rk);
+
 // PUSH / POP reg[0:63]
 #define PUSH1(reg)              \
     do {                        \
diff --git a/src/dynarec/la64/la64_epilog.S b/src/dynarec/la64/la64_epilog.S
index 41eae0cc..bb6977c1 100644
--- a/src/dynarec/la64/la64_epilog.S
+++ b/src/dynarec/la64/la64_epilog.S
@@ -1,4 +1,4 @@
-//la464 epilog for dynarec
+//la64 epilog for dynarec
 //Save stuff, prepare stack and register
 //called with pointer to emu as 1st parameter
 //and address to jump to as 2nd parameter
@@ -6,8 +6,8 @@
 .text
 .align 4
 
-.global la464_epilog
-la464_epilog:
+.global la64_epilog
+la64_epilog:
     //update register -> emu
     st.d   $r12, $r4, (8 * 0)
     st.d   $r13, $r4, (8 * 1)
@@ -27,7 +27,7 @@ la464_epilog:
     st.d   $r30, $r4, (8 * 15)
     st.d   $r31, $r4, (8 * 16) // xFlags
     st.d   $r20, $r4, (8 * 17) // put back reg value in emu, including EIP (so x27 must be EIP now)
-    ld.d   $sp,  $r4, 552      // restore saved sp from emu->xSPSave, see la464_prolog
+    ld.d   $sp,  $r4, 552      // restore saved sp from emu->xSPSave, see la64_prolog
     ld.d   $r11, $sp, -8
     st.d   $r11, $r4, 552
     // vpop {d8-d15}
diff --git a/src/dynarec/la64/la64_lock.S b/src/dynarec/la64/la64_lock.S
index 9a728b14..74c877d4 100644
--- a/src/dynarec/la64/la64_lock.S
+++ b/src/dynarec/la64/la64_lock.S
@@ -5,39 +5,39 @@
 .text
 .align 4
 
-.global la464_lock_xchg_dd
-.global la464_lock_xchg_d
-.global la464_lock_storeifnull
-.global la464_lock_storeifnull_d
-.global la464_lock_storeifref
-.global la464_lock_storeifref_d
-.global la464_lock_storeifref2_d
-.global la464_lock_decifnot0b
-.global la464_lock_storeb
-.global la464_lock_incif0
-.global la464_lock_decifnot0
-.global la464_lock_store
-.global la464_lock_store_dd
-.global la464_lock_get_b
-.global la464_lock_get_d
-.global la464_lock_get_dd
-.global la464_lock_cas_d
-.global la464_lock_cas_dd
-.global la464_lock_cas_dq
-
-la464_lock_xchg_dd:
+.global la64_lock_xchg_dd
+.global la64_lock_xchg_d
+.global la64_lock_storeifnull
+.global la64_lock_storeifnull_d
+.global la64_lock_storeifref
+.global la64_lock_storeifref_d
+.global la64_lock_storeifref2_d
+.global la64_lock_decifnot0b
+.global la64_lock_storeb
+.global la64_lock_incif0
+.global la64_lock_decifnot0
+.global la64_lock_store
+.global la64_lock_store_dd
+.global la64_lock_get_b
+.global la64_lock_get_d
+.global la64_lock_get_dd
+.global la64_lock_cas_d
+.global la64_lock_cas_dd
+.global la64_lock_cas_dq
+
+la64_lock_xchg_dd:
     // address is a0, value is a1, return old value in a0
     amswap_db.d $a2, $a1, $a0
     move        $a0, $a2
     ret
 
-la464_lock_xchg_d:
+la64_lock_xchg_d:
     // address is a0, value is a1, return old value in a0
     amswap_db.w $a2, $a1, $a0
     move        $a0, $a2
     ret
 
-la464_lock_storeifnull:
+la64_lock_storeifnull:
     // address is a0, value is a1, a1 store to a0 only if [a0] is 0. return old [a0] value
     dbar 0
     move $a3, $a1
@@ -48,7 +48,7 @@ la464_lock_storeifnull:
     move $a0, $a2
     ret
 
-la464_lock_storeifnull_d:
+la64_lock_storeifnull_d:
     // address is a0, value is a1, a1 store to a0 only if [a0] is 0. return old [a0] value
     dbar 0
     move $a3, $a1
@@ -59,7 +59,7 @@ la464_lock_storeifnull_d:
     move $a0, $a2
     ret
 
-la464_lock_storeifref:
+la64_lock_storeifref:
     // address is a0, value is a1, a1 store to a0 only if [a0] is a2. return new [a0] value (so a1 or old value)
     dbar 0
     move $a4, $a1
@@ -74,7 +74,7 @@ la464_lock_storeifref:
     move $a0, $a3
     ret
 
-la464_lock_storeifref_d:
+la64_lock_storeifref_d:
     // address is a0, value is a1, a1 store to a0 only if [a0] is a2. return new [a0] value (so a1 or old value)
     dbar 0
     move $a4, $a1  
@@ -89,7 +89,7 @@ la464_lock_storeifref_d:
     move $a0, $a3
     ret
 
-la464_lock_storeifref2_d:
+la64_lock_storeifref2_d:
     // address is a0, value is a1, a1 store to a0 only if [a0] is a2. return old [a0] value
     dbar 0
     move $a4, $a1
@@ -100,17 +100,17 @@ la464_lock_storeifref2_d:
     move $a0, $a3
     ret
 
-la464_lock_decifnot0b:
+la64_lock_decifnot0b:
     dbar       0
     // TODO
     ret
 
-la464_lock_storeb:
+la64_lock_storeb:
     st.b $a1, $a0, 0
     dbar 0
     ret
 
-la464_lock_decifnot0:
+la64_lock_decifnot0:
     dbar   0
     ll.w   $a1, $a0, 0
     beqz   $a1, 20
@@ -121,7 +121,7 @@ la464_lock_decifnot0:
     move   $a0, $a1
     ret
 
-la464_lock_incif0:
+la64_lock_incif0:
     dbar   0
     ll.w   $a1, $a0, 0
     bnez   $a1, 20
@@ -132,32 +132,32 @@ la464_lock_incif0:
     move   $a0, $a1
     ret
 
-la464_lock_store:
+la64_lock_store:
     st.w $a1, $a0, 0
     dbar 0
     ret
 
-la464_lock_store_dd:
+la64_lock_store_dd:
     st.d $a1, $a0, 0
     dbar 0
     ret
 
-la464_lock_get_b:
+la64_lock_get_b:
     dbar 0
     ld.b $a0, $a0, 0
     ret
 
-la464_lock_get_d:
+la64_lock_get_d:
     dbar 0
     ld.w $a0, $a0, 0
     ret
 
-la464_lock_get_dd:
+la64_lock_get_dd:
     dbar 0
     ld.d $a0, $a0, 0
     ret
 
-la464_lock_cas_d:
+la64_lock_cas_d:
     ll.w $a3, $a0, 0
     bne  $a3, $a1, 16
     sc.w $a2, $a0, 0
@@ -166,7 +166,7 @@ la464_lock_cas_d:
     li.d $a0, 1
     ret
 
-la464_lock_cas_dd:
+la64_lock_cas_dd:
     ll.d $a3, $a0, 0
     bne  $a3, $a1, 16
     sc.d $a2, $a0, 0
@@ -175,7 +175,7 @@ la464_lock_cas_dd:
     li.d $a0, 1
     ret
 
-la464_lock_cas_dq:
+la64_lock_cas_dq:
     ll.d $a4, $a0, 0
     bne  $a4, $a2, 20
     sc.d $a1, $a0, 0
diff --git a/src/dynarec/la64/la64_lock.h b/src/dynarec/la64/la64_lock.h
index 1bf9a004..c757e08a 100644
--- a/src/dynarec/la64/la64_lock.h
+++ b/src/dynarec/la64/la64_lock.h
@@ -3,61 +3,61 @@
 #include <stdint.h>
 
 // Atomically exchange value at [p] with val, return old p
-extern uintptr_t la464_lock_xchg_dd(void* p, uintptr_t val);
+extern uintptr_t la64_lock_xchg_dd(void* p, uintptr_t val);
 
 // Atomically exchange value at [p] with val, return old p
-extern uint32_t la464_lock_xchg_d(void* p, uint32_t val);
+extern uint32_t la64_lock_xchg_d(void* p, uint32_t val);
 
 // Atomically store value to [p] only if [p] is NULL. Return old [p] value
-extern uint32_t la464_lock_storeifnull_d(void*p, uint32_t val);
+extern uint32_t la64_lock_storeifnull_d(void*p, uint32_t val);
 
 // Atomically store value to [p] only if [p] is NULL. Return old [p] value
-extern void* la464_lock_storeifnull(void*p, void* val);
+extern void* la64_lock_storeifnull(void*p, void* val);
 
 // Atomically store value to [p] only if [p] is ref. Return new [p] value (so val or old)
-extern void* la464_lock_storeifref(void*p, void* val, void* ref);
+extern void* la64_lock_storeifref(void*p, void* val, void* ref);
 
 // Atomically store value to [p] only if [p] is ref. Return new [p] value (so val or old)
-extern uint32_t la464_lock_storeifref_d(void*p, uint32_t val, uint32_t ref);
+extern uint32_t la64_lock_storeifref_d(void*p, uint32_t val, uint32_t ref);
 
 // Atomically store value to [p] only if [p] is ref. Return new [p] value (so val or old)
-extern uint32_t la464_lock_storeifref2_d(void*p, uint32_t val, uint32_t ref);
+extern uint32_t la64_lock_storeifref2_d(void*p, uint32_t val, uint32_t ref);
 
 // decrement atomically the byte at [p] (but only if p not 0)
-extern void la464_lock_decifnot0b(void*p);
+extern void la64_lock_decifnot0b(void*p);
 
 // atomic store (with memory barrier)
-extern void la464_lock_storeb(void*p, uint8_t b);
+extern void la64_lock_storeb(void*p, uint8_t b);
 
 // increment atomically the int at [p] only if it was 0. Return the old value of [p]
-extern int la464_lock_incif0(void*p);
+extern int la64_lock_incif0(void*p);
 
 // decrement atomically the int at [p] (but only if p not 0)
-extern int la464_lock_decifnot0(void*p);
+extern int la64_lock_decifnot0(void*p);
 
 // atomic store (with memory barrier)
-extern void la464_lock_store(void*p, uint32_t v);
+extern void la64_lock_store(void*p, uint32_t v);
 
 // atomic store (with memory barrier)
-extern void la464_lock_store_dd(void*p, uint64_t v);
+extern void la64_lock_store_dd(void*p, uint64_t v);
 
 // atomic get (with memory barrier)
-extern uint32_t la464_lock_get_b(void* p);
+extern uint32_t la64_lock_get_b(void* p);
 
 // atomic get (with memory barrier)
-extern uint32_t la464_lock_get_d(void* p);
+extern uint32_t la64_lock_get_d(void* p);
 
 // atomic get (with memory barrier)
-extern void* la464_lock_get_dd(void* p);
+extern void* la64_lock_get_dd(void* p);
 
 // Atomically store val at [p] if old [p] is ref. Return 0 if OK, 1 is not. p needs to be aligned
-extern int la464_lock_cas_d(void* p, int32_t ref, int32_t val);
+extern int la64_lock_cas_d(void* p, int32_t ref, int32_t val);
 
 // Atomically store val at [p] if old [p] is ref. Return 0 if OK, 1 is not. p needs to be aligned
-extern int la464_lock_cas_dd(void* p, int64_t ref, int64_t val);
+extern int la64_lock_cas_dd(void* p, int64_t ref, int64_t val);
 
 // (mostly) Atomically store val1 and val2 at [p] if old [p] is ref. Return 0 if OK, 1 is not. p needs to be aligned
-extern int la464_lock_cas_dq(void* p, uint64_t ref, uint64_t val1, uint64_t val2);
+extern int la64_lock_cas_dq(void* p, uint64_t ref, uint64_t val1, uint64_t val2);
 
 // Not defined in assembler but in dynarec_la64_functions
 uint8_t extract_byte(uint32_t val, void* address);
@@ -65,9 +65,9 @@ uint32_t insert_byte(uint32_t val, uint8_t b, void* address);
 uint16_t extract_half(uint32_t val, void* address);
 uint32_t insert_half(uint32_t val, uint16_t h, void* address);
 
-uint8_t la464_lock_xchg_b(void* addr, uint8_t v);
-uint16_t la464_lock_xchg_h(void* addr, uint16_t v);
-int la464_lock_cas_b(void* p, uint8_t ref, uint8_t val);
-int la464_lock_cas_h(void* p, uint16_t ref, uint16_t val);
+uint8_t la64_lock_xchg_b(void* addr, uint8_t v);
+uint16_t la64_lock_xchg_h(void* addr, uint16_t v);
+int la64_lock_cas_b(void* p, uint8_t ref, uint8_t val);
+int la64_lock_cas_h(void* p, uint16_t ref, uint16_t val);
 
 #endif  //__LA64_LOCK__H__
diff --git a/src/dynarec/la64/la64_next.S b/src/dynarec/la64/la64_next.S
index e2c4924d..c69830f3 100644
--- a/src/dynarec/la64/la64_next.S
+++ b/src/dynarec/la64/la64_next.S
@@ -1,4 +1,4 @@
-//la464 update linker table for dynarec
+//la64 update linker table for dynarec
 //called with pointer to emu as 1st parameter
 //and address of table to as 2nd parameter
 //ip is at r12
@@ -8,10 +8,10 @@
 
 .extern LinkNext
 
-.global la464_next
+.global la64_next
 
-    .8byte  0   // NULL pointer before la464_next, for getDB
-la464_next:
+    .8byte  0   // NULL pointer before la64_next, for getDB
+la64_next:
     // emu is a0
     // IP address is a1
     addi.d $sp, $sp, -(8 * 12)
diff --git a/src/dynarec/la64/la64_printer.c b/src/dynarec/la64/la64_printer.c
index 5651ec12..5e1231bd 100644
--- a/src/dynarec/la64/la64_printer.c
+++ b/src/dynarec/la64/la64_printer.c
@@ -3,17 +3,17 @@
 #include <string.h>
 #include <stdio.h>
 
-#include "la464_printer.h"
+#include "la64_printer.h"
 #include "debug.h"
 
 static const char* Xt[] = {"xZR", "r1", "r2", "sp", "xEmu", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "xRAX", "xRCX", "xRDX", "xRBX", "xRSP", "xRBP", "xRSI", "xRDI", "xR8", "r21", "xR9", "xR10", "xR11", "xR12", "xR13", "xR14", "xR15", "xFlags", "xRIP", "r31"};
 
-typedef struct la464_print_s {
+typedef struct la64_print_s {
     int d, j, k, a;
     int i, u;
-} la464_print_t;
+} la64_print_t;
 
-int isMask(uint32_t opcode, const char* mask, la464_print_t *a)
+int isMask(uint32_t opcode, const char* mask, la64_print_t *a)
 {
     if(strlen(mask)!=32) {
        printf_log(LOG_NONE, "Error: printer mask \"%s\" is not len 32 but %ld\n", mask, strlen(mask));
@@ -50,10 +50,10 @@ int64_t signExtend(uint32_t val, int sz)
     return ret;
 }
 
-const char* la464_print(uint32_t opcode, uintptr_t addr)
+const char* la64_print(uint32_t opcode, uintptr_t addr)
 {
     static char buff[200];
-    la464_print_t a;
+    la64_print_t a;
     #define Rd a.d
     #define Rj a.j
     #define Rk a.k
diff --git a/src/dynarec/la64/la64_printer.h b/src/dynarec/la64/la64_printer.h
index 2bc4ded8..bce44339 100644
--- a/src/dynarec/la64/la64_printer.h
+++ b/src/dynarec/la64/la64_printer.h
@@ -1,6 +1,6 @@
 #ifndef _LA64_PRINTER_H_
 #define _LA64_PRINTER_H_
 
-const char* la464_print(uint32_t opcode, uint64_t addr);
+const char* la64_print(uint32_t opcode, uint64_t addr);
 
 #endif //_LA64_PRINTER_H_
diff --git a/src/dynarec/la64/la64_prolog.S b/src/dynarec/la64/la64_prolog.S
index aafe4dc7..b1dd3450 100644
--- a/src/dynarec/la64/la64_prolog.S
+++ b/src/dynarec/la64/la64_prolog.S
@@ -6,8 +6,8 @@
 .text
 .align 4
 
-.global la464_prolog
-la464_prolog:
+.global la64_prolog
+la64_prolog:
     //save all 18 used register
     addi.d $sp,  $sp, -(8 * 19)
     st.d   $r1,  $sp, (8 * 0) //save ra
diff --git a/src/include/dynarec_la464.h b/src/include/dynarec_la64.h
index cf8946f0..cf8946f0 100644
--- a/src/include/dynarec_la464.h
+++ b/src/include/dynarec_la64.h