about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author ptitSeb <sebastien.chev@gmail.com> 2021-03-20 12:45:22 +0100
committer ptitSeb <sebastien.chev@gmail.com> 2021-03-20 12:45:22 +0100
commit b52b3219696b851e96dc8d2e6543e91dbba909ff (patch)
tree 2272f1454640cbbd818ea9ce176bcd19af447778 /src
parent e5b36d2997a44eb10adc1a17591a878200743f5f (diff)
download box64-b52b3219696b851e96dc8d2e6543e91dbba909ff.tar.gz
box64-b52b3219696b851e96dc8d2e6543e91dbba909ff.zip
[DYNAREC] Added F6 opcodes and fixed issue with native call that may destroy some registers
Diffstat (limited to 'src')
-rwxr-xr-xsrc/dynarec/arm64_emitter.h62
-rwxr-xr-xsrc/dynarec/arm64_printer.c27
-rwxr-xr-xsrc/dynarec/dynarec_arm64_00.c86
-rwxr-xr-xsrc/dynarec/dynarec_arm64_emit_math.c91
-rwxr-xr-xsrc/dynarec/dynarec_arm64_helper.c20
-rwxr-xr-xsrc/dynarec/dynarec_arm64_helper.h18
-rwxr-xr-xsrc/dynarec/dynarec_arm64_pass.c4
-rwxr-xr-xsrc/emu/x64primop.c8
8 files changed, 228 insertions, 88 deletions
diff --git a/src/dynarec/arm64_emitter.h b/src/dynarec/arm64_emitter.h
index 091544df..100b0d74 100755
--- a/src/dynarec/arm64_emitter.h
+++ b/src/dynarec/arm64_emitter.h
@@ -159,6 +159,12 @@
 #define CMPSx_REG(Rn, Rm)                   SUBSx_REG(xZR, Rn, Rm)
 #define CMPSw_REG(Rn, Rm)                   SUBSw_REG(wZR, Rn, Rm)
 #define CMPSxw_REG(Rn, Rm)                  SUBSxw_REG(xZR, Rn, Rm)
+#define NEGx_REG(Rd, Rm)                    SUBx_REG(Rd, xZR, Rm)      // Rd = 0 - Rm; no trailing ';' so the macro is a single expression statement
+#define NEGw_REG(Rd, Rm)                    SUBw_REG(Rd, wZR, Rm)
+#define NEGxw_REG(Rd, Rm)                   SUBxw_REG(Rd, xZR, Rm)
+#define NEGSx_REG(Rd, Rm)                   SUBSx_REG(Rd, xZR, Rm)     // NEGS*: same subtraction through the SUBS* (flag-setting) emitters
+#define NEGSw_REG(Rd, Rm)                   SUBSw_REG(Rd, wZR, Rm)
+#define NEGSxw_REG(Rd, Rm)                  SUBSxw_REG(Rd, xZR, Rm)
 
 #define SUBx_U12(Rd, Rn, imm12)     EMIT(ADDSUB_IMM_gen(1, 1, 0, 0b00, (imm12)&0xfff, Rn, Rd))
 #define SUBSx_U12(Rd, Rn, imm12)    EMIT(ADDSUB_IMM_gen(1, 1, 1, 0b00, (imm12)&0xfff, Rn, Rd))
@@ -247,25 +253,25 @@
 // LOAD/STORE PAIR
 #define MEMPAIR_gen(size, L, op2, imm7, Rt2, Rn, Rt)    ((size)<<31 | 0b101<<27 | (op2)<<23 | (L)<<22 | (imm7)<<15 | (Rt2)<<10 | (Rn)<<5 | (Rt))
 
-#define LDPx_S7_postindex(Rt, Rt2, Rn, imm)             EMIT(MEMPAIR_gen(1, 1, 0b01, (imm>>3)&0x7f, Rt2, Rn, Rt))
-#define LDPw_S7_postindex(Rt, Rt2, Rn, imm)             EMIT(MEMPAIR_gen(0, 1, 0b01, (imm>>2)&0x7f, Rt2, Rn, Rt))
-#define LDPxw_S7_postindex(Rt, Rt2, Rn, imm)            EMIT(MEMPAIR_gen(rex.w, 1, 0b01, (imm>>(2+rex.w)), Rt2, Rn, Rt))
-#define LDPx_S7_preindex(Rt, Rt2, Rn, imm)              EMIT(MEMPAIR_gen(1, 1, 0b11, (imm>>3)&0x7f, Rt2, Rn, Rt))
-#define LDPw_S7_preindex(Rt, Rt2, Rn, imm)              EMIT(MEMPAIR_gen(0, 1, 0b11, (imm>>2)&0x7f, Rt2, Rn, Rt))
-#define LDPxw_S7_preindex(Rt, Rt2, Rn, imm)             EMIT(MEMPAIR_gen(rex.w, 1, 0b11, (imm>>(2+rex.w)), Rt2, Rn, Rt))
-#define LDPx_S7_offset(Rt, Rt2, Rn, imm)                EMIT(MEMPAIR_gen(1, 1, 0b10, (imm>>3)&0x7f, Rt2, Rn, Rt))
-#define LDPw_S7_offset(Rt, Rt2, Rn, imm)                EMIT(MEMPAIR_gen(0, 1, 0b10, (imm>>2)&0x7f, Rt2, Rn, Rt))
-#define LDPxw_S7_offset(Rt, Rt2, Rn, imm)               EMIT(MEMPAIR_gen(rex.w, 1, 0b10, (imm>>(2+rex.w)), Rt2, Rn, Rt))
-
-#define STPx_S7_postindex(Rt, Rt2, Rn, imm)             EMIT(MEMPAIR_gen(1, 0, 0b01, (imm>>3)&0x7f, Rt2, Rn, Rt))
-#define STPw_S7_postindex(Rt, Rt2, Rn, imm)             EMIT(MEMPAIR_gen(0, 0, 0b01, (imm>>2)&0x7f, Rt2, Rn, Rt))
-#define STPxw_S7_postindex(Rt, Rt2, Rn, imm)            EMIT(MEMPAIR_gen(rex.w, 0, 0b01, (imm>>(2+rex.w)), Rt2, Rn, Rt))
-#define STPx_S7_preindex(Rt, Rt2, Rn, imm)              EMIT(MEMPAIR_gen(1, 0, 0b11, (imm>>3)&0x7f, Rt2, Rn, Rt))
-#define STPw_S7_preindex(Rt, Rt2, Rn, imm)              EMIT(MEMPAIR_gen(0, 0, 0b11, (imm>>2)&0x7f, Rt2, Rn, Rt))
-#define STPxw_S7_preindex(Rt, Rt2, Rn, imm)             EMIT(MEMPAIR_gen(rex.w, 0, 0b11, (imm>>(2+rex.w)), Rt2, Rn, Rt))
-#define STPx_S7_offset(Rt, Rt2, Rn, imm)                EMIT(MEMPAIR_gen(1, 0, 0b10, (imm>>3)&0x7f, Rt2, Rn, Rt))
-#define STPw_S7_offset(Rt, Rt2, Rn, imm)                EMIT(MEMPAIR_gen(0, 0, 0b10, (imm>>2)&0x7f, Rt2, Rn, Rt))
-#define STPxw_S7_offset(Rt, Rt2, Rn, imm)               EMIT(MEMPAIR_gen(rex.w, 0, 0b10, (imm>>(2+rex.w)), Rt2, Rn, Rt))
+#define LDPx_S7_postindex(Rt, Rt2, Rn, imm)             EMIT(MEMPAIR_gen(1, 1, 0b01, (((uint32_t)(imm))>>3)&0x7f, Rt2, Rn, Rt))
+#define LDPw_S7_postindex(Rt, Rt2, Rn, imm)             EMIT(MEMPAIR_gen(0, 1, 0b01, (((uint32_t)(imm))>>2)&0x7f, Rt2, Rn, Rt))
+#define LDPxw_S7_postindex(Rt, Rt2, Rn, imm)            EMIT(MEMPAIR_gen(rex.w, 1, 0b01, (((uint32_t)(imm))>>(2+rex.w))&0x7f, Rt2, Rn, Rt))   // imm7 masked like the x/w forms, so a negative offset cannot spill into the opcode bits
+#define LDPx_S7_preindex(Rt, Rt2, Rn, imm)              EMIT(MEMPAIR_gen(1, 1, 0b11, (((uint32_t)(imm))>>3)&0x7f, Rt2, Rn, Rt))
+#define LDPw_S7_preindex(Rt, Rt2, Rn, imm)              EMIT(MEMPAIR_gen(0, 1, 0b11, (((uint32_t)(imm))>>2)&0x7f, Rt2, Rn, Rt))
+#define LDPxw_S7_preindex(Rt, Rt2, Rn, imm)             EMIT(MEMPAIR_gen(rex.w, 1, 0b11, (((uint32_t)(imm))>>(2+rex.w))&0x7f, Rt2, Rn, Rt))
+#define LDPx_S7_offset(Rt, Rt2, Rn, imm)                EMIT(MEMPAIR_gen(1, 1, 0b10, (((uint32_t)(imm))>>3)&0x7f, Rt2, Rn, Rt))
+#define LDPw_S7_offset(Rt, Rt2, Rn, imm)                EMIT(MEMPAIR_gen(0, 1, 0b10, (((uint32_t)(imm))>>2)&0x7f, Rt2, Rn, Rt))
+#define LDPxw_S7_offset(Rt, Rt2, Rn, imm)               EMIT(MEMPAIR_gen(rex.w, 1, 0b10, (((uint32_t)(imm))>>(2+rex.w))&0x7f, Rt2, Rn, Rt))
+
+#define STPx_S7_postindex(Rt, Rt2, Rn, imm)             EMIT(MEMPAIR_gen(1, 0, 0b01, (((uint32_t)(imm))>>3)&0x7f, Rt2, Rn, Rt))
+#define STPw_S7_postindex(Rt, Rt2, Rn, imm)             EMIT(MEMPAIR_gen(0, 0, 0b01, (((uint32_t)(imm))>>2)&0x7f, Rt2, Rn, Rt))
+#define STPxw_S7_postindex(Rt, Rt2, Rn, imm)            EMIT(MEMPAIR_gen(rex.w, 0, 0b01, (((uint32_t)(imm))>>(2+rex.w))&0x7f, Rt2, Rn, Rt))   // imm7 masked like the x/w forms, so a negative offset cannot spill into the opcode bits
+#define STPx_S7_preindex(Rt, Rt2, Rn, imm)              EMIT(MEMPAIR_gen(1, 0, 0b11, (((uint32_t)(imm))>>3)&0x7f, Rt2, Rn, Rt))
+#define STPw_S7_preindex(Rt, Rt2, Rn, imm)              EMIT(MEMPAIR_gen(0, 0, 0b11, (((uint32_t)(imm))>>2)&0x7f, Rt2, Rn, Rt))
+#define STPxw_S7_preindex(Rt, Rt2, Rn, imm)             EMIT(MEMPAIR_gen(rex.w, 0, 0b11, (((uint32_t)(imm))>>(2+rex.w))&0x7f, Rt2, Rn, Rt))
+#define STPx_S7_offset(Rt, Rt2, Rn, imm)                EMIT(MEMPAIR_gen(1, 0, 0b10, (((uint32_t)(imm))>>3)&0x7f, Rt2, Rn, Rt))
+#define STPw_S7_offset(Rt, Rt2, Rn, imm)                EMIT(MEMPAIR_gen(0, 0, 0b10, (((uint32_t)(imm))>>2)&0x7f, Rt2, Rn, Rt))
+#define STPxw_S7_offset(Rt, Rt2, Rn, imm)               EMIT(MEMPAIR_gen(rex.w, 0, 0b10, (((uint32_t)(imm))>>(2+rex.w))&0x7f, Rt2, Rn, Rt))
 
 // PUSH / POP helper
 #define POP1(reg)       LDRx_S9_postindex(reg, xRSP, 8)
@@ -457,6 +463,24 @@
 #define LSLw_REG(Rd, Rn, Rm)            EMIT(LS_V_gen(0, Rm, 0b00, Rn, Rd))
 #define LSLxw_REG(Rd, Rn, Rm)           EMIT(LS_V_gen(rex.w, Rm, 0b00, Rn, Rd))
 
+// UMULL / SMULL
+#define MADDL_gen(U, Rm, o0, Ra, Rn, Rd)    (1<<31 | 0b11011<<24 | (U)<<23 | 0b01<<21 | (Rm)<<16 | (o0)<<15 | (Ra)<<10 | (Rn)<<5 | (Rd))
+#define UMADDL(Xd, Wn, Wm, Xa)          EMIT(MADDL_gen(1, Wm, 0, Xa, Wn, Xd))
+#define UMULL(Xd, Wn, Wm)               UMADDL(Xd, Wn, Wm, xZR)
+#define SMADDL(Xd, Wn, Wm, Xa)          EMIT(MADDL_gen(0, Wm, 0, Xa, Wn, Xd))
+#define SMULL(Xd, Wn, Wm)               SMADDL(Xd, Wn, Wm, xZR)
+
+#define MULH_gen(U, Rm, Rn, Rd)         (1<<31 | 0b11011<<24 | (U)<<23 | 0b10<<21 | (Rm)<<16 | 0b11111<<10 | (Rn)<<5 | (Rd))
+#define UMULH(Xd, Xn, Xm)               EMIT(MULH_gen(1, Xm, Xn, Xd))
+#define SMULH(Xd, Xn, Xm)               EMIT(MULH_gen(0, Xm, Xn, Xd))
+
+#define MADD_gen(sf, Rm, Ra, Rn, Rd)    ((sf)<<31 | 0b11011<<24 | (Rm)<<16 | (Ra)<<10 | (Rn)<<5 | (Rd))
+#define MADDx(Rd, Rn, Rm, Ra)           EMIT(MADD_gen(1, Rm, Ra, Rn, Rd))
+#define MADDw(Rd, Rn, Rm, Ra)           EMIT(MADD_gen(0, Rm, Ra, Rn, Rd))
+#define MADDxw(Rd, Rn, Rm, Ra)          EMIT(MADD_gen(rex.w, Rm, Ra, Rn, Rd))
+#define MULx(Rd, Rn, Rm)                MADDx(Rd, Rn, Rm, xZR)
+#define MULw(Rd, Rn, Rm)                MADDw(Rd, Rn, Rm, xZR)
+#define MULxw(Rd, Rn, Rm)               MADDxw(Rd, Rn, Rm, xZR)
 
 
 // MRS
diff --git a/src/dynarec/arm64_printer.c b/src/dynarec/arm64_printer.c
index b54d4b9e..57024b6b 100755
--- a/src/dynarec/arm64_printer.c
+++ b/src/dynarec/arm64_printer.c
@@ -16,8 +16,8 @@ static const char* conds[] = {"cEQ", "cNE", "cCS", "cCC", "cMI", "cPL", "cVS", "
 #define abs(A) (((A)<0)?(-(A)):(A))

 

 typedef struct arm64_print_s {

-    int N, S;

-    int t, n, m, d, t2;

+    int N, S, U;

+    int t, n, m, d, t2, a;

     int f, c, o, h;

     int i, r, s;

     int x, w;

@@ -59,10 +59,12 @@ int isMask(uint32_t opcode, const char* mask, arm64_print_t *a)
             case '1': if(v!=1) return 0; break;

             case 'N': a->N = (a->N<<1) | v; break;

             case 'S': a->S = (a->S<<1) | v; break;

+            case 'U': a->U = (a->U<<1) | v; break;

             case 't': a->t = (a->t<<1) | v; break;

             case '2': a->t2 = (a->t2<<1) | v; break;

             case 'n': a->n = (a->n<<1) | v; break;

             case 'm': a->m = (a->m<<1) | v; break;

+            case 'a': a->a = (a->a<<1) | v; break;

             case 'd': a->d = (a->d<<1) | v; break;

             case 'f': a->f = (a->f<<1) | v; break;

             case 'c': a->c = (a->c<<1) | v; break;

@@ -100,6 +102,7 @@ const char* arm64_print(uint32_t opcode, uintptr_t addr)
     #define Rt2 a.t2

     #define Rm a.m

     #define Rd a.d

+    #define Ra a.a

     #define sf a.f

     #define imm a.i

     #define option a.o

@@ -590,6 +593,26 @@ const char* arm64_print(uint32_t opcode, uintptr_t addr)
         return buff;

     }

 

+    // MULL ADD

+    if(isMask(opcode, "10011011U01mmmmm0aaaaannnnnddddd", &a)) {

+        if(Ra==31)

+            snprintf(buff, sizeof(buff), "%cMULL %s, %s, %s", a.U?'U':'S', Xt[Rd], Wt[Rn], Wt[Rm]);

+        else

+            snprintf(buff, sizeof(buff), "%cMADDL %s, %s, %s, %s", a.U?'U':'S', Xt[Rd], Wt[Rn], Wt[Rm], Xt[Ra]);

+        return buff;

+    }

+    if(isMask(opcode, "10011011U10mmmmm011111nnnnnddddd", &a)) {   // UMULH/SMULH: every operand is a 64-bit X register

+        snprintf(buff, sizeof(buff), "%cMULH %s, %s, %s", a.U?'U':'S', Xt[Rd], Xt[Rn], Xt[Rm]);

+        return buff;

+    }

+    if(isMask(opcode, "f0011011000mmmmm0aaaaannnnnddddd", &a)) {

+        if(Ra==31)

+            snprintf(buff, sizeof(buff), "MUL %s, %s, %s", sf?Xt[Rd]:Wt[Rd], sf?Xt[Rn]:Wt[Rn], sf?Xt[Rm]:Wt[Rm]);

+        else

+            snprintf(buff, sizeof(buff), "MADD %s, %s, %s, %s", sf?Xt[Rd]:Wt[Rd], sf?Xt[Rn]:Wt[Rn], sf?Xt[Rm]:Wt[Rm], sf?Xt[Ra]:Wt[Ra]);

+        return buff;

+    }

+

     snprintf(buff, sizeof(buff), "%08X ???", __builtin_bswap32(opcode));

     return buff;

 }
\ No newline at end of file
diff --git a/src/dynarec/dynarec_arm64_00.c b/src/dynarec/dynarec_arm64_00.c
index ac3e470c..02a67cf1 100755
--- a/src/dynarec/dynarec_arm64_00.c
+++ b/src/dynarec/dynarec_arm64_00.c
@@ -676,9 +676,9 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     x87_forget(dyn, ninst, x3, x4, 0);
                     sse_purge07cache(dyn, ninst, x3);
                     GETIP(ip+1); // read the 0xCC
-                    STORE_XEMU_MINIMUM(xRIP);
+                    STORE_XEMU_CALL(xRIP);
                     CALL_S(x64Int3, -1);
-                    LOAD_XEMU_MINIMUM(xRIP);
+                    LOAD_XEMU_CALL(xRIP);
                     addr+=8+8;
                     TABLE64(x3, addr); // expected return address
                     CMPSx_REG(xRIP, x3);
@@ -1030,9 +1030,9 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     x87_forget(dyn, ninst, x3, x4, 0);
                     sse_purge07cache(dyn, ninst, x3);
                     GETIP_(dyn->insts[ninst].natcall); // read the 0xCC already
-                    STORE_XEMU_MINIMUM(xRIP);
+                    STORE_XEMU_CALL(xRIP);
                     CALL_S(x64Int3, -1);
-                    LOAD_XEMU_MINIMUM(xRIP);
+                    LOAD_XEMU_CALL(xRIP);
                     TABLE64(x3, dyn->insts[ninst].natcall);
                     ADDx_U12(x3, x3, 2+8+8);
                     CMPSx_REG(xRIP, x3);
@@ -1102,6 +1102,84 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             *ok = 0;
             break;
 
+        case 0xF7:
+            nextop = F8;
+            switch((nextop>>3)&7) {
+                case 0:
+                case 1:
+                    INST_NAME("TEST Ed, Id");
+                    SETFLAGS(X_ALL, SF_SET);
+                    GETEDH(x1, 4);
+                    i64 = F32S;
+                    MOV64xw(x2, i64);
+                    emit_test32(dyn, ninst, rex, ed, x2, x3, x4);
+                    break;
+                case 2:
+                    INST_NAME("NOT Ed");
+                    GETED(0);   // NOT has no trailing immediate: use 0 like NEG/MUL/IMUL below (only TEST Ed, Id passes 4)
+                    MVNxw_REG(ed, ed);
+                    WBACK;
+                    break;
+                case 3:
+                    INST_NAME("NEG Ed");
+                    SETFLAGS(X_ALL, SF_SET);
+                    GETED(0);
+                    emit_neg32(dyn, ninst, rex, ed, x3, x4);
+                    WBACK;
+                    break;
+                case 4:
+                    INST_NAME("MUL EAX, Ed");
+                    SETFLAGS(X_ALL, SF_PENDING);
+                    UFLAG_DF(x2, rex.w?d_mul64:d_mul32);
+                    GETED(0);
+                    if(rex.w) {
+                        if(ed==xRDX) gd=x3; else gd=xRDX;
+                        UMULH(gd, xRAX, ed);
+                        MULx(xRAX, xRAX, ed);
+                        if(gd!=xRDX) {MOVx_REG(xRDX, gd);}
+                    } else {
+                        UMULL(xRDX, xRAX, ed);  //64 <- 32x32
+                        MOVw_REG(xRAX, xRDX);
+                        LSRx(xRDX, xRDX, 32);
+                    }
+                    UFLAG_RES(xRAX);
+                    UFLAG_OP1(xRDX);
+                    break;
+                case 5:
+                    INST_NAME("IMUL EAX, Ed");
+                    SETFLAGS(X_ALL, SF_PENDING);
+                    UFLAG_DF(x2, rex.w?d_imul64:d_imul32);
+                    GETED(0);
+                    if(rex.w) {
+                        if(ed==xRDX) gd=x3; else gd=xRDX;
+                        SMULH(gd, xRAX, ed);
+                        MULx(xRAX, xRAX, ed);
+                        if(gd!=xRDX) {MOVx_REG(xRDX, gd);}
+                    } else {
+                        SMULL(xRDX, xRAX, ed);  //64 <- 32x32
+                        MOVw_REG(xRAX, xRDX);
+                        LSRx(xRDX, xRDX, 32);
+                    }
+                    UFLAG_RES(xRAX);
+                    UFLAG_OP1(xRDX);
+                    break;
+                case 6:
+                    INST_NAME("DIV Ed");
+                    SETFLAGS(X_ALL, SF_SET);
+                    GETEDH(x1, 0);
+                    if(ed!=x1) {MOVxw_REG(x1, ed);}
+                    CALL(rex.w?((void*)div64):((void*)div32), -1);
+                    break;
+                case 7:
+                    INST_NAME("IDIV Ed");
+                    SETFLAGS(X_ALL, SF_SET);
+                    GETEDH(x1, 0);
+                    if(ed!=x1) {MOVxw_REG(x1, ed);}
+                    CALL(rex.w?((void*)idiv64):((void*)idiv32), -1);
+                    break;
+            }
+            break;
+        
         case 0xFF:
             nextop = F8;
             switch((nextop>>3)&7) {
diff --git a/src/dynarec/dynarec_arm64_emit_math.c b/src/dynarec/dynarec_arm64_emit_math.c
index d3931ccc..d76ded05 100755
--- a/src/dynarec/dynarec_arm64_emit_math.c
+++ b/src/dynarec/dynarec_arm64_emit_math.c
@@ -1733,52 +1733,51 @@ void emit_sbb8c(dynarec_arm_t* dyn, int ninst, int s1, int c, int s3, int s4, in
 //}
 
 // emit NEG32 instruction, from s1, store result in s1 using s3 and s4 as scratch
-//void emit_neg32(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4)
-//{
-//    IFX(X_PEND) {
-//        STR_IMM9(s1, xEmu, offsetof(x64emu_t, op1));
-//        SET_DF(s3, d_neg32);
-//    } else IFX(X_ALL) {
-//        SET_DFNONE(s3);
-//    }
-//    IFX(X_ZF|X_CF) {
-//        BIC_IMM8(xFlags, xFlags, (1<<F_ZF)|(1<<F_CF), 0);
-//    }
-//    IFX(X_CF) {
-//        TSTS_REG_LSL_IMM5(s1, s1, 0);
-//        ORR_IMM8_COND(cNE, xFlags, xFlags, 1<<F_CF, 0);
-//    }
-//    IFX(X_AF) {
-//        MOV_REG_LSL_IMM5(s3, s1, 0);
-//    }
-//    IFX(X_ZF|X_OF) {
-//        RSBS_IMM8(s1, s1, 0);
-//    } else {
-//        RSB_IMM8(s1, s1, 0);
-//    }
-//    IFX(X_PEND) {
-//        STR_IMM9(s1, xEmu, offsetof(x64emu_t, res));
-//    }
-//    IFX(X_ZF) {
-//        ORR_IMM8_COND(cEQ, xFlags, xFlags, 1<<F_ZF, 0);
-//    }
-//    IFX(X_OF) {
-//        ORR_IMM8_COND(cVS, xFlags, xFlags, 0b10, 0x0b);
-//        BIC_IMM8_COND(cVC, xFlags, xFlags, 0b10, 0x0b);
-//    }
-//    IFX(X_AF) {
-//        ORR_REG_LSL_IMM5(s3, s3, s1, 0);                        // bc = op1 | res
-//        MOV_REG_LSR_IMM5(s4, s3, 3);
-//        BFI(xFlags, s4, F_AF, 1);    // AF: bc & 0x08
-//    }
-//    IFX(X_SF) {
-//        MOV_REG_LSR_IMM5(s3, s1, 31);
-//        BFI(xFlags, s3, F_SF, 1);
-//    }
-//    IFX(X_PF) {
-//        emit_pf(dyn, ninst, s1, s3, s4);
-//    }
-//}
+void emit_neg32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s3, int s4)   // rex.w picks the 64-bit variant (d_neg64, sign bit 63)
+{
+    IFX(X_PEND) {
+        STRxw_U12(s1, xEmu, offsetof(x64emu_t, op1));   // store the operand to emu->op1 before it is negated
+        SET_DF(s3, rex.w?d_neg64:d_neg32);
+    } else IFX(X_ALL) {
+        SET_DFNONE(s3);
+    }
+    IFX(X_CF) {
+        TSTxw_REG(s1, s1);
+        CSETw(s4, cNE);
+        BFIw(xFlags, s4, F_CF, 1);   // CF = (operand != 0)
+    }
+    IFX(X_AF) {
+        MOVxw_REG(s3, s1);   // keep the operand: s1 is overwritten by the negate below
+    }
+    IFX(X_ZF|X_OF) {
+        NEGSxw_REG(s1, s1);   // flag-setting form: the cEQ/cVS tests below read its result
+    } else {
+        NEGxw_REG(s1, s1);
+    }
+    IFX(X_PEND) {
+        STRxw_U12(s1, xEmu, offsetof(x64emu_t, res));   // store the result to emu->res
+    }
+    IFX(X_ZF) {
+        CSETw(s4, cEQ);
+        BFIw(xFlags, s4, F_ZF, 1);
+    }
+    IFX(X_OF) {
+        CSETw(s4, cVS);
+        BFIw(xFlags, s4, F_OF, 1);
+    }
+    IFX(X_AF) {
+        ORRxw_REG(s3, s3, s1);                        // bc = op1 | res
+        LSRxw(s4, s3, 3);
+        BFIw(xFlags, s4, F_AF, 1);    // AF: bc & 0x08
+    }
+    IFX(X_SF) {
+        LSRxw(s3, s1, rex.w?63:31);   // move the top bit of the result down to bit 0
+        BFIw(xFlags, s3, F_SF, 1);
+    }
+    IFX(X_PF) {
+        emit_pf(dyn, ninst, s1, s3, s4);
+    }
+}
 
 // emit NEG16 instruction, from s1, store result in s1 using s3 and s4 as scratch
 //void emit_neg16(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4)
diff --git a/src/dynarec/dynarec_arm64_helper.c b/src/dynarec/dynarec_arm64_helper.c
index 24aca546..efcc1bd4 100755
--- a/src/dynarec/dynarec_arm64_helper.c
+++ b/src/dynarec/dynarec_arm64_helper.c
@@ -333,6 +333,11 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save
         savereg = 7;
     if(ret!=-2) {
         STPx_S7_preindex(xEmu, savereg, xSP, -16);   // ARM64 stack needs to be 16-byte aligned
+        STPx_S7_offset(xRAX, xRCX, xEmu, offsetof(x64emu_t, regs[_AX]));    // x9..x15, x16, x17, x18: these need to be saved by the caller
+        STPx_S7_offset(xRDX, xRBX, xEmu, offsetof(x64emu_t, regs[_DX]));    // but x18 is R8, which is lost, so is it fine not to save it?
+        STPx_S7_offset(xRSP, xRBP, xEmu, offsetof(x64emu_t, regs[_SP]));
+        STPx_S7_offset(xRSI, xRDI, xEmu, offsetof(x64emu_t, regs[_SI]));
+        STRx_U12(xR8, xEmu, offsetof(x64emu_t, regs[_R8]));
     }
     fpu_pushcache(dyn, ninst, reg);
     if(saveflags) {
@@ -346,6 +351,21 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save
     }
     if(ret!=-2) {
         LDPx_S7_postindex(xEmu, savereg, xSP, 16);
+        #define GO(A, B) if(ret==x##A) {                                        \
+            LDRx_U12(x##B, xEmu, offsetof(x64emu_t, regs[_##B]));               \
+        } else if(ret==x##B) {                                                  \
+            LDRx_U12(x##A, xEmu, offsetof(x64emu_t, regs[_##A]));               \
+        } else {                                                                \
+            LDPx_S7_offset(x##A, x##B, xEmu, offsetof(x64emu_t, regs[_##A]));   \
+        }
+        GO(RAX, RCX);
+        GO(RDX, RBX);
+        GO(RSP, RBP);
+        GO(RSI, RDI);
+        #undef GO
+        if(ret!=xR8) {
+            LDRx_U12(xR8, xEmu, offsetof(x64emu_t, regs[_R8]));
+        }
     }
     if(saveflags) {
         LDRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags));
diff --git a/src/dynarec/dynarec_arm64_helper.h b/src/dynarec/dynarec_arm64_helper.h
index b3e08d7d..ac26870e 100755
--- a/src/dynarec/dynarec_arm64_helper.h
+++ b/src/dynarec/dynarec_arm64_helper.h
@@ -382,16 +382,12 @@
     STRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags)); \
     if(A) {STRx_U12(A, xEmu, offsetof(x64emu_t, ip));}
 
-#define LOAD_XEMU_MINIMUM(A)  \
-    LOAD_REG(RAX);         \
-    LOAD_REG(RCX);         \
-    LOAD_REG(RDX);         \
-    LOAD_REG(RBX);         \
-    LOAD_REG(RSP);         \
-    LOAD_REG(RBP);         \
-    LOAD_REG(RSI);         \
-    LOAD_REG(RDI);         \
-    LOAD_REG(R8);          \
+#define STORE_XEMU_CALL(A)  \
+    STORE_REG(R9);          \
+    STRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags)); \
+    if(A) {STRx_U12(A, xEmu, offsetof(x64emu_t, ip));}
+
+#define LOAD_XEMU_CALL(A)  \
     LOAD_REG(R9);          \
     LDRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags)); \
     if(A) {LDRx_U12(A, xEmu, offsetof(x64emu_t, ip)); if(A==xRIP) dyn->last_ip = 0;}
@@ -691,7 +687,7 @@ void emit_adc8c(dynarec_arm_t* dyn, int ninst, int s1, int32_t c, int s3, int s4
 void emit_sbb8c(dynarec_arm_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5);
 //void emit_sbb16(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int s4, int save_s4);
 //void emit_sbb16c(dynarec_arm_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
-//void emit_neg32(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4);
+void emit_neg32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s3, int s4);
 //void emit_neg16(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4);
 //void emit_neg8(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4);
 void emit_shl32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4);
diff --git a/src/dynarec/dynarec_arm64_pass.c b/src/dynarec/dynarec_arm64_pass.c
index e7adb758..8f7af07c 100755
--- a/src/dynarec/dynarec_arm64_pass.c
+++ b/src/dynarec/dynarec_arm64_pass.c
@@ -56,10 +56,10 @@ void arm_pass(dynarec_arm_t* dyn, uintptr_t addr)
                 fpu_reflectcache(dyn, ninst, x1, x2, x3);
                 GETIP(ip);
                 MOVx_REG(x1, xRIP);
-                STORE_XEMU_REGS(xRIP);
+                STORE_XEMU_CALL(xRIP);
                 MOV32w(x2, 1);
                 CALL(PrintTrace, -1);
-                LOAD_XEMU_REGS(xRIP);
+                LOAD_XEMU_CALL(xRIP);
                 MESSAGE(LOG_DUMP, "----------\n");
             }
         }
diff --git a/src/emu/x64primop.c b/src/emu/x64primop.c
index cc9422e7..441518ac 100755
--- a/src/emu/x64primop.c
+++ b/src/emu/x64primop.c
@@ -1384,8 +1384,8 @@ void idiv32(x64emu_t *emu, uint32_t s)
 	SET_FLAG(F_ZF);
 	CONDITIONAL_SET_FLAG(PARITY(mod & 0xff), F_PF);
 
-	R_EAX = (uint32_t)quot;
-	R_EDX = (uint32_t)mod;
+	R_RAX = (uint32_t)quot;
+	R_RDX = (uint32_t)mod;
 }
 
 void idiv64(x64emu_t *emu, uint64_t s)
@@ -1485,8 +1485,8 @@ void div32(x64emu_t *emu, uint32_t s)
 	SET_FLAG(F_ZF);
 	CONDITIONAL_SET_FLAG(PARITY(mod & 0xff), F_PF);
 
-	R_EAX = (uint32_t)div;
-	R_EDX = (uint32_t)mod;
+	R_RAX = (uint32_t)div;
+	R_RDX = (uint32_t)mod;
 }
 
 void div64(x64emu_t *emu, uint64_t s)