about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author ptitSeb <sebastien.chev@gmail.com> 2023-02-16 12:17:42 +0100
committer ptitSeb <sebastien.chev@gmail.com> 2023-02-16 12:17:42 +0100
commit 8f4e8ed1e7b6dff16396a84c5cb23ab72008035a (patch)
tree ba2a8f553261159ab398981b028c9ed98465cbfc /src
parent 0e10ead81adc6756cebc075f8b041a22d08d3c2d (diff)
download box64-8f4e8ed1e7b6dff16396a84c5cb23ab72008035a.tar.gz
box64-8f4e8ed1e7b6dff16396a84c5cb23ab72008035a.zip
[DYNAREC] Various small fixes and improvements on Dynarec
Diffstat (limited to 'src')
-rwxr-xr-x src/dynarec/arm64/arm64_emitter.h 2
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_00.c 10
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_0f.c 97
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_64.c 116
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_66.c 6
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_660f.c 75
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_6664.c 12
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_66f0.c 5
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_67.c 92
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_d9.c 25
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_db.c 43
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_dc.c 16
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_dd.c 16
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_df.c 2
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_emit_logic.c 15
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_emit_math.c 131
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_emit_shift.c 57
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_emit_tests.c 32
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_f0.c 321
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_f20f.c 57
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_f30f.c 64
-rwxr-xr-x src/dynarec/arm64/dynarec_arm64_helper.h 96
-rwxr-xr-x src/dynarec/dynarec_native.c 13
23 files changed, 594 insertions, 709 deletions
diff --git a/src/dynarec/arm64/arm64_emitter.h b/src/dynarec/arm64/arm64_emitter.h
index a119f14c..9f573f8f 100755
--- a/src/dynarec/arm64/arm64_emitter.h
+++ b/src/dynarec/arm64/arm64_emitter.h
@@ -1231,7 +1231,9 @@
 #define SCVTFDx(Dd, Xn)             EMIT(SCVTF_scalar(1, 0b01, 0b00, 0b010, Xn, Dd))
 
 #define SCVTF_vector_scalar(U, sz, Rn, Rd)    (1<<30 | (U)<<29 | 0b11110<<24 | (sz)<<22 | 0b10000<<17 | 0b11101<<12 | 0b10<<10 | (Rn)<<5 | (Rd))
+// Convert Vn from i32 to Vd float
 #define SCVTFSS(Vd, Vn)             EMIT(SCVTF_vector_scalar(0, 0, Vn, Vd))
+// Convert Vn from i64 to Vd double
 #define SCVTFDD(Vd, Vn)             EMIT(SCVTF_vector_scalar(0, 1, Vn, Vd))
 
 #define SCVTF_vector(Q, U, sz, Rn, Rd)      ((Q)<<30 | (U)<<29 | 0b01110<<24 | (sz)<<22 | 0b10000<<17 | 0b11101<<12 | 0b10<<10 | (Rn)<<5 | (Rd))
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c
index 77f0660e..4f327f5b 100755
--- a/src/dynarec/arm64/dynarec_arm64_00.c
+++ b/src/dynarec/arm64/dynarec_arm64_00.c
@@ -108,7 +108,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;
             GETEB(x1, 0);
             GETGB(x2);
-            emit_or8(dyn, ninst, x1, x2, x4, x2);
+            emit_or8(dyn, ninst, x1, x2, x4, x5);
             EBBACK;
             break;
         case 0x09:
@@ -562,7 +562,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             i64 = F32S;
             if(PK(0)==0xC3) {
                 MESSAGE(LOG_DUMP, "PUSH then RET, using indirect\n");
-                TABLE64(x3, ip+1);
+                TABLE64(x3, addr-4);
                 LDRSW_U12(x1, x3, 0);
                 PUSH1(x1);
             } else {
@@ -869,7 +869,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 MARKLOCK;
                 // do the swap with exclusive locking
                 LDAXRB(x1, ed);
-                // do the swap 14 -> strb(ed), 1 -> gd
+                // do the swap 4 -> strb(ed), 1 -> gd
                 STLXRB(x3, x4, ed);
                 CBNZx_MARKLOCK(x3);
                 SMDMB();
@@ -1040,7 +1040,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
         case 0x8F:
             INST_NAME("POP Ed");
             nextop = F8;
-            if((nextop&0xC0)==0xC0) {
+            if(MODREG) {
                 POP1(xRAX+(nextop&7)+(rex.b<<3));
             } else {
                 POP1(x2); // so this can handle POP [ESP] and maybe some variant too
@@ -2303,7 +2303,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     break;
                 case 2:
                     INST_NAME("NOT Ed");
-                    GETED(4);
+                    GETED(0);
                     MVNxw_REG(ed, ed);
                     WBACK;
                     break;
diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c
index ed2a120b..fe039dd7 100755
--- a/src/dynarec/arm64/dynarec_arm64_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_0f.c
@@ -24,47 +24,6 @@
 #include "dynarec_arm64_functions.h"

 #include "dynarec_arm64_helper.h"

 

-#define GETG                            \

-    gd = ((nextop&0x38)>>3)+(rex.r<<3)  \

-

-#define GETGX(a, w)                     \

-    gd = ((nextop&0x38)>>3)+(rex.r<<3); \

-    a = sse_get_reg(dyn, ninst, x1, gd, w)

-

-#define GETGX_empty(a)                          \

-    gd = ((nextop&0x38)>>3)+(rex.r<<3);         \

-    a = sse_get_reg_empty(dyn, ninst, x1, gd)

-

-#define GETEX(a, w, D)                                              \

-    if(MODREG) {                                                    \

-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);  \

-    } else {                                                        \

-        SMREAD();                                                   \

-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, NULL, 0, D); \

-        a = fpu_get_scratch(dyn);                                   \

-        VLDR128_U12(a, ed, fixedaddress);                           \

-    }

-

-#define GETGM(a)                        \

-    gd = ((nextop&0x38)>>3);            \

-    a = mmx_get_reg(dyn, ninst, x1, x2, x3, gd)

-

-#define GETEM(a, D)                                             \

-    if(MODREG) {                                                \

-        a = mmx_get_reg(dyn, ninst, x1, x2, x3, (nextop&7));    \

-    } else {                                                    \

-        SMREAD();                                               \

-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, D); \

-        a = fpu_get_scratch(dyn);                               \

-        VLDR64_U12(a, ed, fixedaddress);                        \

-    }

-

-#define PUTEM(a)                            \

-    if(!MODREG) {                           \

-        VSTR64_U12(a, ed, fixedaddress);    \

-        SMWRITE2();                         \

-    }

-

 uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)

 {

     (void)ip; (void)rep; (void)need_epilog;

@@ -107,6 +66,10 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             GETIP(ip);

             STORE_XEMU_CALL(xRIP);

             CALL(arm_ud, -1);

+            LOAD_XEMU_CALL(xRIP);

+            jump_to_epilog(dyn, 0, xRIP, ninst);

+            *need_epilog = 0;

+            *ok = 0;

             break;

 

         case 0x05:

@@ -174,9 +137,9 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 VMOVQ(v0, v1);

             } else {

                 v0 = sse_get_reg_empty(dyn, ninst, x1, gd);

+                SMREAD();

                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, NULL, 0, 0);

                 VLDR128_U12(v0, ed, fixedaddress);   // no alignment issue with ARMv8 NEON :)

-                SMWRITE2();

             }

             break;

         case 0x11:

@@ -435,10 +398,10 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     if(MODREG) {   // reg <= reg

                         REVxw(xRAX+(nextop&7)+(rex.b<<3), gd);

                     } else {                    // mem <= reg

-                        SMREAD();

                         addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<(2+rex.w), (1<<(2+rex.w))-1, rex, NULL, 0, 0);

                         REVxw(x1, gd);

                         STRxw_U12(x1, ed, fixedaddress);

+                        SMWRITE2();

                     }

                     break;

 

@@ -493,7 +456,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGD;

             MOV32w(gd, 0);

-            if((nextop&0xC0)==0xC0) {

+            if(MODREG) {

                 // EX is an xmm reg

                 GETEX(q0, 0, 0);

                 VMOVQDto(x1, q0, 0);

@@ -721,37 +684,36 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("PUNPCKHBW Gm,Em");

             nextop = F8;

             GETGM(q0);

-            GETEM(q1, 1);

+            GETEM(q1, 0);

             VZIP2_8(q0, q0, q1);

             break;

         case 0x69:

             INST_NAME("PUNPCKHWD Gm,Em");

             nextop = F8;

             GETGM(q0);

-            GETEM(q1, 1);

+            GETEM(q1, 0);

             VZIP2_16(q0, q0, q1);

             break;

         case 0x6A:

             INST_NAME("PUNPCKHDQ Gm,Em");

             nextop = F8;

             GETGM(q0);

-            GETEM(q1, 1);

+            GETEM(q1, 0);

             VZIP2_32(q0, q0, q1);

             break;

         case 0x6B:

             INST_NAME("PACKSSDW Gm,Em");

             nextop = F8;

             GETGM(v0);

+            q0 = fpu_get_scratch(dyn);

+            VMOVeD(q0, 0, v0, 0);

             if(MODREG) {

                 GETEM(v1, 0);

-                q0 = fpu_get_scratch(dyn);

                 VMOVeD(q0, 1, v1, 0);

             } else {

-                q0 = fpu_get_scratch(dyn);

                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, NULL, 0, 0);

                 VLD1_64(q0, 1, ed);

             }

-            VMOVeD(q0, 0, v0, 0);

             SQXTN_16(v0, q0);

             break;

 

@@ -765,6 +727,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 if(rex.w) {

                     FMOVDx(v0, ed);

                 } else {

+                    VEOR(v0, v0, v0);

                     FMOVSw(v0, ed);

                 }

             } else {

@@ -885,9 +848,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         } else if(u8) {

                             VSHR_16(q0, q0, u8);

                         }

-                        if(!MODREG) {

-                            VSTR64_U12(q0, ed, fixedaddress);

-                        }

+                        PUTEM(q0);

                     }

                     break;

                 case 4:

@@ -898,9 +859,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     if(u8) {

                         VSSHR_16(q0, q0, u8);

                     }

-                    if(!MODREG) {

-                        VSTR64_U12(q0, ed, fixedaddress);

-                    }

+                    PUTEM(q0);

                     break;

                 case 6:

                     INST_NAME("PSLLW Em, Ib");

@@ -912,9 +871,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         } else {

                             VSHL_16(q0, q0, u8);

                         }

-                        if(!MODREG) {

-                            VSTR64_U12(q0, ed, fixedaddress);

-                        }

+                        PUTEM(q0);

                     }

                     break;

                 default:

@@ -935,9 +892,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         } else if(u8) {

                             VSHR_32(d0, d0, u8);

                         }

-                        if(!MODREG) {

-                            VSTR64_U12(d0, ed, fixedaddress);

-                        }

+                        PUTEM(d0);

                     }

                     break;

                 case 4:

@@ -948,9 +903,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     if(u8) {

                         VSSHR_32(d0, d0, u8);

                     }

-                    if(!MODREG) {

-                        VSTR64_U12(d0, ed, fixedaddress);

-                    }

+                    PUTEM(d0);

                     break;

                 case 6:

                     INST_NAME("PSLLD Em, Ib");

@@ -962,9 +915,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         } else {

                             VSHL_32(d0, d0, u8);

                         }

-                        if(!MODREG) {

-                            VSTR64_U12(d0, ed, fixedaddress);

-                        }

+                        PUTEM(d0);

                     }

                     break;

                 default:

@@ -1040,9 +991,9 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             if((nextop&0xC0)==0xC0) {

                 ed = xRAX + (nextop&7) + (rex.b<<3);

                 if(rex.w) {

-                    VMOVQDto(ed, v0, 0);

+                    FMOVxD(ed, v0);

                 } else {

-                    VMOVSto(ed, v0, 0);

+                    FMOVwS(ed, v0);

                     MOVxw_REG(ed, ed);

                 }

             } else {

@@ -1852,10 +1803,10 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             break;

 

         case 0xE7:

-            INST_NAME("MOVNTQ Em, Gm"); // Non Temporal par not handled for now

+            INST_NAME("MOVNTQ Em, Gm");

             nextop = F8;

             gd = (nextop&0x38)>>3;

-            if((nextop&0xC0)==0xC0) {

+            if(MODREG) {

                 DEFAULT;

             } else {

                 v0 = mmx_get_reg(dyn, ninst, x1, x2, x3, gd);

@@ -1905,7 +1856,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             gd = ((nextop&0x38)>>3);

             if(MODREG && ((nextop&7))==gd) {

-                // special case for PXOR Gx, Gx

+                // special case for PXOR Gm, Gm

                 q0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd);

                 VEOR(q0, q0, q0);

             } else {

diff --git a/src/dynarec/arm64/dynarec_arm64_64.c b/src/dynarec/arm64/dynarec_arm64_64.c
index b279b9c8..c6dfb44d 100644
--- a/src/dynarec/arm64/dynarec_arm64_64.c
+++ b/src/dynarec/arm64/dynarec_arm64_64.c
@@ -103,8 +103,9 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                             nextop = F8;
                             GETG;
                             if(MODREG) {
+                                ed = (nextop&7)+ (rex.b<<3);
                                 v0 = sse_get_reg(dyn, ninst, x1, gd, 1);
-                                q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 0);
+                                q0 = sse_get_reg(dyn, ninst, x1, ed, 0);
                                 VMOVeS(v0, 0, q0, 0);
                             } else {
                                 grab_segdata(dyn, addr, ninst, x4, seg);
@@ -161,7 +162,8 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                             GETG;
                             v0 = sse_get_reg(dyn, ninst, x1, gd, 0);
                             if(MODREG) {
-                                q0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 1);
+                                ed = (nextop&7)+ (rex.b<<3);
+                                q0 = sse_get_reg(dyn, ninst, x1, ed, 1);
                                 VMOVeS(q0, 0, v0, 0);
                             } else {
                                 grab_segdata(dyn, addr, ninst, x4, seg);
@@ -201,57 +203,69 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
 
 
                 case 0xAF:
-                    INST_NAME("IMUL Gd, Ed");
-                    SETFLAGS(X_ALL, SF_PENDING);
-                    nextop = F8;
-                    grab_segdata(dyn, addr, ninst, x4, seg);
-                    GETGD;
-                    GETEDO(x4, 0);
-                    if(rex.w) {
-                        // 64bits imul
-                        UFLAG_IF {
-                            SMULH(x3, gd, ed);
-                            MULx(gd, gd, ed);
-                            UFLAG_OP1(x3);
-                            UFLAG_RES(gd);
-                            UFLAG_DF(x3, d_imul64);
-                        } else {
-                            MULxw(gd, gd, ed);
-                        }
-                    } else {
-                        // 32bits imul
-                        UFLAG_IF {
-                            SMULL(gd, gd, ed);
-                            UFLAG_RES(gd);
-                            LSRx(x3, gd, 32);
-                            UFLAG_OP1(x3);
-                            UFLAG_DF(x3, d_imul32);
-                            MOVw_REG(gd, gd);
-                        } else {
-                            MULxw(gd, gd, ed);
-                        }
+                    switch(rep) {
+                        case 0:
+                            INST_NAME("IMUL Gd, Ed");
+                            SETFLAGS(X_ALL, SF_PENDING);
+                            nextop = F8;
+                            grab_segdata(dyn, addr, ninst, x4, seg);
+                            GETGD;
+                            GETEDO(x4, 0);
+                            if(rex.w) {
+                                // 64bits imul
+                                UFLAG_IF {
+                                    SMULH(x3, gd, ed);
+                                    MULx(gd, gd, ed);
+                                    UFLAG_OP1(x3);
+                                    UFLAG_RES(gd);
+                                    UFLAG_DF(x3, d_imul64);
+                                } else {
+                                    MULxw(gd, gd, ed);
+                                }
+                            } else {
+                                // 32bits imul
+                                UFLAG_IF {
+                                    SMULL(gd, gd, ed);
+                                    UFLAG_RES(gd);
+                                    LSRx(x3, gd, 32);
+                                    UFLAG_OP1(x3);
+                                    UFLAG_DF(x3, d_imul32);
+                                    MOVw_REG(gd, gd);
+                                } else {
+                                    MULxw(gd, gd, ed);
+                                }
+                            }
+                            break;
+                        default:
+                            DEFAULT;
                     }
                     break;
 
                 case 0xB6:
-                    INST_NAME("MOVZX Gd, Eb");
-                    nextop = F8;
-                    grab_segdata(dyn, addr, ninst, x4, seg);
-                    GETGD;
-                    if(MODREG) {
-                        if(rex.rex) {
-                            eb1 = xRAX+(nextop&7)+(rex.b<<3);
-                            eb2 = 0;                \
-                        } else {
-                            ed = (nextop&7);
-                            eb1 = xRAX+(ed&3);  // Ax, Cx, Dx or Bx
-                            eb2 = (ed&4)>>2;    // L or H
-                        }
-                        UBFXxw(gd, eb1, eb2*8, 8);
-                    } else {
-                        SMREAD();
-                        addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0, 0, rex, NULL, 0, 0);
-                        LDRB_REG(gd, ed, x4);
+                    switch(rep) {
+                        case 0:
+                            INST_NAME("MOVZX Gd, Eb");
+                            nextop = F8;
+                            grab_segdata(dyn, addr, ninst, x4, seg);
+                            GETGD;
+                            if(MODREG) {
+                                if(rex.rex) {
+                                    eb1 = xRAX+(nextop&7)+(rex.b<<3);
+                                    eb2 = 0;                \
+                                } else {
+                                    ed = (nextop&7);
+                                    eb1 = xRAX+(ed&3);  // Ax, Cx, Dx or Bx
+                                    eb2 = (ed&4)>>2;    // L or H
+                                }
+                                UBFXxw(gd, eb1, eb2*8, 8);
+                            } else {
+                                SMREAD();
+                                addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0, 0, rex, NULL, 0, 0);
+                                LDRB_REG(gd, ed, x4);
+                            }
+                            break;
+                        default:
+                            DEFAULT;
                     }
                     break;
 
@@ -326,7 +340,7 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             break;
 
         case 0x66:
-            addr = dynarec64_6664(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);
+            addr = dynarec64_6664(dyn, addr, ip, ninst, rex, seg, ok, need_epilog);
             break;
 
         case 0x80:
@@ -814,7 +828,7 @@ uintptr_t dynarec64_64(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     break;
                 case 2:
                     INST_NAME("NOT Ed");
-                    GETEDO(x6, 4);
+                    GETEDO(x6, 0);
                     MVNxw_REG(ed, ed);
                     WBACKO(x6);
                     break;
diff --git a/src/dynarec/arm64/dynarec_arm64_66.c b/src/dynarec/arm64/dynarec_arm64_66.c
index 493544ea..951518b5 100755
--- a/src/dynarec/arm64/dynarec_arm64_66.c
+++ b/src/dynarec/arm64/dynarec_arm64_66.c
@@ -295,9 +295,11 @@ uintptr_t dynarec64_66(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             break;

 

         case 0x64:

-            addr = dynarec64_6664(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);

+            addr = dynarec64_6664(dyn, addr, ip, ninst, rex, _FS, ok, need_epilog);

+            break;

+        case 0x65:

+            addr = dynarec64_6664(dyn, addr, ip, ninst, rex, _GS, ok, need_epilog);

             break;

-

         case 0x66:

             addr = dynarec64_66(dyn, addr, ip, ninst, rex, rep, ok, need_epilog);

             break;

diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index 5ff39e2b..814c2552 100755
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -22,27 +22,6 @@
 #include "dynarec_arm64_functions.h"

 #include "dynarec_arm64_helper.h"

 

-// Get EX as a quad

-#define GETEX(a, w, D)                                                                                  \

-    if(MODREG) {                                                                                        \

-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \

-    } else {                                                                                            \

-        SMREAD();                                                                                       \

-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, NULL, 0, D);  \

-        a = fpu_get_scratch(dyn);                                                                       \

-        VLDR128_U12(a, ed, fixedaddress);                                                               \

-    }

-

-#define GETG        gd = ((nextop&0x38)>>3)+(rex.r<<3)

-

-#define GETGX(a, w)                     \

-    gd = ((nextop&0x38)>>3)+(rex.r<<3); \

-    a = sse_get_reg(dyn, ninst, x1, gd, w)

-

-#define GETGX_empty(a)                  \

-    gd = ((nextop&0x38)>>3)+(rex.r<<3); \

-    a = sse_get_reg_empty(dyn, ninst, x1, gd)

-

 uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)

 {

     (void)ip; (void)rep; (void)need_epilog;

@@ -684,9 +663,9 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     INST_NAME("PINSRB Gx, ED, Ib");

                     nextop = F8;

                     GETGX(q0, 1);

-                    GETEB(x1, 1);

+                    GETED(1);

                     u8 = F8;

-                    VMOVQBfrom(q0, (u8&15), x1);

+                    VMOVQBfrom(q0, (u8&15), ed);

                     break;

 

                 case 0x22:

@@ -1215,10 +1194,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         } else if(u8) {

                             VSHRQ_16(q0, q0, u8);

                         }

-                        if(!MODREG) {

-                            VSTR128_U12(q0, ed, fixedaddress);

-                            SMWRITE2();

-                        }

+                        PUTEX(q0);

                     }

                     break;

                 case 4:

@@ -1229,10 +1205,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     if(u8) {

                         VSSHRQ_16(q0, q0, u8);

                     }

-                    if(!MODREG) {

-                        VSTR128_U12(q0, ed, fixedaddress);

-                        SMWRITE2();

-                    }

+                    PUTEX(q0);

                     break;

                 case 6:

                     INST_NAME("PSLLW Ex, Ib");

@@ -1244,10 +1217,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         } else {

                             VSHLQ_16(q0, q0, u8);

                         }

-                        if(!MODREG) {

-                            VSTR128_U12(q0, ed, fixedaddress);

-                            SMWRITE2();

-                        }

+                        PUTEX(q0);

                     }

                     break;

                 default:

@@ -1268,10 +1238,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         } else if(u8) {

                             VSHRQ_32(q0, q0, u8);

                         }

-                        if(!MODREG) {

-                            VSTR128_U12(q0, ed, fixedaddress);

-                            SMWRITE2();

-                        }

+                        PUTEX(q0);

                     }

                     break;

                 case 4:

@@ -1282,10 +1249,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     if(u8) {

                         VSSHRQ_32(q0, q0, u8);

                     }

-                    if(!MODREG) {

-                        VSTR128_U12(q0, ed, fixedaddress);

-                        SMWRITE2();

-                    }

+                    PUTEX(q0);

                     break;

                 case 6:

                     INST_NAME("PSLLD Ex, Ib");

@@ -1297,10 +1261,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         } else {

                             VSHLQ_32(q0, q0, u8);

                         }

-                        if(!MODREG) {

-                            VSTR128_U12(q0, ed, fixedaddress);

-                            SMWRITE2();

-                        }

+                        PUTEX(q0);

                     }

                     break;

                 default:

@@ -1320,10 +1281,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         } else if(u8) {

                             VSHRQ_64(q0, q0, u8);

                         }

-                        if(!MODREG) {

-                            VSTR128_U12(q0, ed, fixedaddress);

-                            SMWRITE2();

-                        }

+                        PUTEX(q0);

                     }

                     break;

                 case 3:

@@ -1338,10 +1296,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                             VEORQ(q1, q1, q1);

                             VEXTQ_8(q0, q0, q1, u8);

                         }

-                        if(!MODREG) {

-                            VSTR128_U12(q0, ed, fixedaddress);

-                            SMWRITE2();

-                        }

+                        PUTEX(q0);

                     }

                     break;

                 case 6:

@@ -1354,10 +1309,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         } else {

                             VSHLQ_64(q0, q0, u8);

                         }

-                        if(!MODREG) {

-                            VSTR128_U12(q0, ed, fixedaddress);

-                            SMWRITE2();

-                        }

+                        PUTEX(q0);

                     }

                     break;

                 case 7:

@@ -1372,10 +1324,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                             VEORQ(q1, q1, q1);

                             VEXTQ_8(q0, q1, q0, 16-u8);

                         }

-                        if(!MODREG) {

-                            VSTR128_U12(q0, ed, fixedaddress);

-                            SMWRITE2();

-                        }

+                        PUTEX(q0);

                     }

                     break;

                 default:

diff --git a/src/dynarec/arm64/dynarec_arm64_6664.c b/src/dynarec/arm64/dynarec_arm64_6664.c
index 362dd5de..92abeac6 100644
--- a/src/dynarec/arm64/dynarec_arm64_6664.c
+++ b/src/dynarec/arm64/dynarec_arm64_6664.c
@@ -24,9 +24,9 @@
 
 #define GETG        gd = ((nextop&0x38)>>3)+(rex.r<<3)
 
-uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog)
+uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog)
 {
-    (void)ip; (void)rep; (void)need_epilog;
+    (void)ip; (void)need_epilog;
 
     uint8_t opcode = F8;
     uint8_t nextop;
@@ -64,7 +64,7 @@ uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     if(MODREG) {
                         v1 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 0);
                     } else {
-                        grab_segdata(dyn, addr, ninst, x4, _FS);
+                        grab_segdata(dyn, addr, ninst, x4, seg);
                         SMREAD();
                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, NULL, 0, 0);
                         v1 = fpu_get_scratch(dyn);                                                                       \
@@ -82,7 +82,7 @@ uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     v1 = sse_get_reg_empty(dyn, ninst, x1, (nextop&7) + (rex.b<<3));
                     FMOVD(v1, v0);
                 } else {
-                    grab_segdata(dyn, addr, ninst, x4, _FS);
+                    grab_segdata(dyn, addr, ninst, x4, seg);
                     addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, NULL, 0, 0);
                     VSTR64_REG(v0, ed, x4);
                     SMWRITE();
@@ -108,7 +108,7 @@ uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     }
                 }
             } else {
-                grab_segdata(dyn, addr, ninst, x4, _FS);
+                grab_segdata(dyn, addr, ninst, x4, seg);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0, 0, rex, NULL, 0, 0);
                 if(rex.w) {
                     STRx_REG(gd, ed, x4);
@@ -133,7 +133,7 @@ uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     }
                 }
             } else {                    // mem <= reg
-                grab_segdata(dyn, addr, ninst, x4, _FS);
+                grab_segdata(dyn, addr, ninst, x4, seg);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0, 0, rex, NULL, 0, 0);
                 if(rex.w) {
diff --git a/src/dynarec/arm64/dynarec_arm64_66f0.c b/src/dynarec/arm64/dynarec_arm64_66f0.c
index bfdf24ee..2a436ea1 100644
--- a/src/dynarec/arm64/dynarec_arm64_66f0.c
+++ b/src/dynarec/arm64/dynarec_arm64_66f0.c
@@ -89,7 +89,6 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         ed = xRAX+(nextop&7)+(rex.b<<3);
                         wback = 0;
                         UXTHw(x1, ed);
-                        UFLAG_IF {emit_cmp16(dyn, ninst, x6, x1, x3, x4, x5);}
                         CMPSxw_REG(x6, x1);
                         B_MARK(cNE);
                         BFIx(ed, gd, 0, 16);
@@ -120,7 +119,7 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     }
                     MARK;
                     // Common part (and fallback for EAX != Ed)
-                    UFLAG_IF {emit_cmp32(dyn, ninst, rex, x6, x1, x3, x4, x5);}
+                    UFLAG_IF {emit_cmp16(dyn, ninst, x6, x1, x3, x4, x5);}
                     BFIx(xRAX, x1, 0, 16);
                     SMDMB();
                     break;
@@ -347,9 +346,7 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     if(opcode==0x81) i32 = F16S; else i32 = F8S;
                     if(i32) {
                         MOV32w(x5, i32);
-                        UXTHw(x6, ed);
                         emit_cmp16(dyn, ninst, x6, x5, x3, x4, x6);
-                        BFIx(ed, x6, 0, 16);
                     } else {
                         emit_cmp16_0(dyn, ninst, ed, x3, x4);
                     }
diff --git a/src/dynarec/arm64/dynarec_arm64_67.c b/src/dynarec/arm64/dynarec_arm64_67.c
index 094f68e7..a68cc363 100755
--- a/src/dynarec/arm64/dynarec_arm64_67.c
+++ b/src/dynarec/arm64/dynarec_arm64_67.c
@@ -175,53 +175,67 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 case 0x2E:

                     // no special check...

                 case 0x2F:

-                    if(rep) {

-                        DEFAULT;

-                    } else {

-                        if(opcode==0x2F) {INST_NAME("COMISS Gx, Ex");} else {INST_NAME("UCOMISS Gx, Ex");}

-                        SETFLAGS(X_ALL, SF_SET);

-                        nextop = F8;

-                        GETGX(v0, 0);

-                        if(MODREG) {

-                            s0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 0);

-                        } else {

-                            s0 = fpu_get_scratch(dyn);

-                            SMREAD();

-                            addr = geted32(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, 0);

-                            VLDR32_U12(s0, ed, fixedaddress);

-                        }

-                        FCMPS(v0, s0);

-                        FCOMI(x1, x2);

+                    switch(rep) {

+                        case 0:

+                            if(opcode==0x2F) {INST_NAME("COMISS Gx, Ex");} else {INST_NAME("UCOMISS Gx, Ex");}

+                            SETFLAGS(X_ALL, SF_SET);

+                            nextop = F8;

+                            GETGX(v0, 0);

+                            if(MODREG) {

+                                s0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 0);

+                            } else {

+                                s0 = fpu_get_scratch(dyn);

+                                SMREAD();

+                                addr = geted32(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, 0);

+                                VLDR32_U12(s0, ed, fixedaddress);

+                            }

+                            FCMPS(v0, s0);

+                            FCOMI(x1, x2);

+                            break;

+                        default:

+                            DEFAULT;

                     }

                     break;

 

                 case 0x6F:

-                    INST_NAME("MOVQ Gm, Em");

-                    nextop = F8;

-                    GETGm;

-                    if(MODREG) {

-                        v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop&7); // no rex.b on MMX

-                        v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd);

-                        VMOV(v0, v1);

-                    } else {

-                        v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd);

-                        SMREAD();

-                        addr = geted32(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, 0);

-                        VLDR64_U12(v0, ed, fixedaddress);

+                    switch(rep) {

+                        case 0:

+                            INST_NAME("MOVQ Gm, Em");

+                            nextop = F8;

+                            GETGm;

+                            if(MODREG) {

+                                v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop&7); // no rex.b on MMX

+                                v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd);

+                                VMOV(v0, v1);

+                            } else {

+                                v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd);

+                                SMREAD();

+                                addr = geted32(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, 0);

+                                VLDR64_U12(v0, ed, fixedaddress);

+                            }

+                            break;

+                        default:

+                            DEFAULT;

                     }

                     break;

 

                 case 0x7F:

-                    INST_NAME("MOVQ Em, Gm");

-                    nextop = F8;

-                    GETGM(v0);

-                    if(MODREG) {

-                        v1 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, nextop&7);

-                        VMOV(v1, v0);

-                    } else {

-                        addr = geted32(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, 0);

-                        VSTR64_U12(v0, ed, fixedaddress);

-                        SMWRITE();

+                    switch(rep) {

+                        case 0:

+                            INST_NAME("MOVQ Em, Gm");

+                            nextop = F8;

+                            GETGM(v0);

+                            if(MODREG) {

+                                v1 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, nextop&7);

+                                VMOV(v1, v0);

+                            } else {

+                                addr = geted32(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, 0);

+                                VSTR64_U12(v0, ed, fixedaddress);

+                                SMWRITE();

+                            }

+                            break;

+                        default:

+                            DEFAULT;

                     }

                     break;

 

diff --git a/src/dynarec/arm64/dynarec_arm64_d9.c b/src/dynarec/arm64/dynarec_arm64_d9.c
index ce101f46..25587a9c 100644
--- a/src/dynarec/arm64/dynarec_arm64_d9.c
+++ b/src/dynarec/arm64/dynarec_arm64_d9.c
@@ -281,7 +281,11 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             #else
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
             u8 = x87_setround(dyn, ninst, x1, x2, x3);
-            FRINTID(v1, v1);
+            if(ST_IS_F(0)) {
+                FRINTIS(v1, v1);
+            } else {
+                FRINTID(v1, v1);
+            }
             x87_restoreround(dyn, ninst, u8);
             #endif
             break;
@@ -324,14 +328,10 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 case 0:
                     INST_NAME("FLD ST0, float[ED]");
                     v1 = x87_do_push(dyn, ninst, x1, box64_dynarec_x87double?NEON_CACHE_ST_D:NEON_CACHE_ST_F);
-                    if(ST_IS_F(0))
-                        s0 = v1;
-                    else
-                        s0 = fpu_get_scratch(dyn);
                     addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, 0);
-                    VLDR32_U12(s0, ed, fixedaddress);
+                    VLDR32_U12(v1, ed, fixedaddress);
                     if(!ST_IS_F(0)) {
-                        FCVT_D_S(v1, s0);
+                        FCVT_D_S(v1, v1);
                     }
                     break;
                 case 2:
@@ -349,14 +349,11 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 case 3:
                     INST_NAME("FSTP float[ED], ST0");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_F);
-                    if(ST_IS_F(0))
-                        s0 = v1;
-                    else {
-                        s0 = fpu_get_scratch(dyn);
-                        FCVT_S_D(s0, v1);
-                    }
                     addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, 0);
-                    VSTR32_U12(s0, ed, fixedaddress);
+                    if(!ST_IS_F(0)) {
+                        FCVT_S_D(v1, v1);
+                    }
+                    VSTR32_U12(v1, ed, fixedaddress);
                     x87_do_pop(dyn, ninst, x3);
                     break;
                 case 4:
diff --git a/src/dynarec/arm64/dynarec_arm64_db.c b/src/dynarec/arm64/dynarec_arm64_db.c
index 5f8a1396..b9d71080 100644
--- a/src/dynarec/arm64/dynarec_arm64_db.c
+++ b/src/dynarec/arm64/dynarec_arm64_db.c
@@ -189,24 +189,18 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 case 0:
                     INST_NAME("FILD ST0, Ed");
                     v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D);
-                    s0 = fpu_get_scratch(dyn);
                     addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, 0);
-                    VLDR32_U12(s0, ed, fixedaddress);
-                    SXTL_32(v1, s0);
-                    SCVTFDD(v1, v1);
+                    VLDR32_U12(v1, ed, fixedaddress);
+                    SXTL_32(v1, v1);    // i32 -> i64
+                    SCVTFDD(v1, v1);    // i64 -> double
                     break;
                 case 1:
                     INST_NAME("FISTTP Ed, ST0");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                    if(MODREG) {
-                        ed = xRAX+(nextop&7)+(rex.b<<3);
-                        wback = 0;
-                    } else {
-                        addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, 0);
-                        ed = x1;
-                    }
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, 0);
                     s0 = fpu_get_scratch(dyn);
                     #if 0
+                    ed = x1;
                     FRINT32ZD(s0, v1);
                     FCVTZSwD(ed, s0);
                     WBACK;
@@ -230,15 +224,10 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     INST_NAME("FIST Ed, ST0");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                     u8 = x87_setround(dyn, ninst, x1, x2, x4); // x1 have the modified RPSCR reg
-                    if(MODREG) {
-                        ed = xRAX+(nextop&7)+(rex.b<<3);
-                        wback = 0;
-                    } else {
-                        addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, 0);
-                        ed = x1;
-                    }
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, 0);
                     s0 = fpu_get_scratch(dyn);
                     #if 0
+                    ed = x1;
                     FRINT32XD(s0, v1);
                     FCVTZSwD(ed, s0);
                     WBACK;
@@ -262,15 +251,10 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     INST_NAME("FISTP Ed, ST0");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                     u8 = x87_setround(dyn, ninst, x1, x2, x4); // x1 have the modified RPSCR reg
-                    if(MODREG) {
-                        ed = xRAX+(nextop&7)+(rex.b<<3);
-                        wback = 0;
-                    } else {
-                        addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, 0);
-                        ed = x1;
-                    }
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, 0);
                     s0 = fpu_get_scratch(dyn);
                     #if 0
+                    ed = x1;
                     FRINT32XD(s0, v1);
                     FCVTZSwD(ed, s0);
                     WBACK;
@@ -294,13 +278,18 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 case 5:
                     INST_NAME("FLD tbyte");
                     addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, NULL, 0, 0);
-                    if(PK(0)==0xDB && ((PK(1)>>3)&7)==7) {
+                    if((PK(0)==0xDB && ((PK(1)>>3)&7)==7) || (PK(0)>=0x40 && PK(0)<=0x4f && PK(1)==0xDB && ((PK(2)>>3)&7)==7)) {
                         // the FLD is immediatly followed by an FSTP
                         LDRx_U12(x5, ed, 0);
                         LDRH_U12(x6, ed, 8);
                         // no persistant scratch register, so unrool both instruction here...
                         MESSAGE(LOG_DUMP, "\tHack: FSTP tbyte\n");
-                        nextop = F8;    //0xDB
+                        nextop = F8;    // 0xDB or rex
+                        if(nextop>=0x40 && nextop<=0x4f) {
+                            rex.rex = nextop;
+                            nextop = F8;    //0xDB
+                        } else
+                            rex.rex = 0;
                         nextop = F8;    //modrm
                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, rex, NULL, 0, 0);
                         STRx_U12(x5, ed, 0);
diff --git a/src/dynarec/arm64/dynarec_arm64_dc.c b/src/dynarec/arm64/dynarec_arm64_dc.c
index da4ba97c..c729ed09 100644
--- a/src/dynarec/arm64/dynarec_arm64_dc.c
+++ b/src/dynarec/arm64/dynarec_arm64_dc.c
@@ -182,7 +182,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     INST_NAME("FADD ST0, double[ED]");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                     v2 = fpu_get_scratch(dyn);
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, NULL, 0, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, 0);
                     VLDR64_U12(v2, wback, fixedaddress);
                     FADDD(v1, v1, v2);
                     break;
@@ -190,7 +190,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     INST_NAME("FMUL ST0, double[ED]");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                     v2 = fpu_get_scratch(dyn);
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, NULL, 0, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, 0);
                     VLDR64_U12(v2, wback, fixedaddress);
                     FMULD(v1, v1, v2);
                     break;
@@ -198,7 +198,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     INST_NAME("FCOM ST0, double[ED]");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                     v2 = fpu_get_scratch(dyn);
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, NULL, 0, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, 0);
                     VLDR64_U12(v2, wback, fixedaddress);
                     FCMPD(v1, v2);
                     FCOM(x1, x2, x3);
@@ -207,7 +207,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     INST_NAME("FCOMP ST0, double[ED]");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                     v2 = fpu_get_scratch(dyn);
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, NULL, 0, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, 0);
                     VLDR64_U12(v2, wback, fixedaddress);
                     FCMPD(v1, v2);
                     FCOM(x1, x2, x3);
@@ -217,7 +217,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     INST_NAME("FSUB ST0, double[ED]");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                     v2 = fpu_get_scratch(dyn);
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, NULL, 0, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, 0);
                     VLDR64_U12(v2, wback, fixedaddress);
                     FSUBD(v1, v1, v2);
                     break;
@@ -225,7 +225,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     INST_NAME("FSUBR ST0, double[ED]");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                     v2 = fpu_get_scratch(dyn);
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, NULL, 0, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, 0);
                     VLDR64_U12(v2, wback, fixedaddress);
                     FSUBD(v1, v2, v1);
                     break;
@@ -233,7 +233,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     INST_NAME("FDIV ST0, double[ED]");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                     v2 = fpu_get_scratch(dyn);
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, NULL, 0, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, 0);
                     VLDR64_U12(v2, wback, fixedaddress);
                     FDIVD(v1, v1, v2);
                     break;
@@ -241,7 +241,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     INST_NAME("FDIVR ST0, double[ED]");
                     v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                     v2 = fpu_get_scratch(dyn);
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 3, rex, NULL, 0, 0);
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, 0);
                     VLDR64_U12(v2, wback, fixedaddress);
                     FDIVD(v1, v2, v1);
                     break;
diff --git a/src/dynarec/arm64/dynarec_arm64_dd.c b/src/dynarec/arm64/dynarec_arm64_dd.c
index 686a7dde..dfb17507 100644
--- a/src/dynarec/arm64/dynarec_arm64_dd.c
+++ b/src/dynarec/arm64/dynarec_arm64_dd.c
@@ -33,10 +33,12 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
     int64_t fixedaddress;
     int v1, v2;
     int s0;
+    int64_t j64;
 
     MAYUSE(s0);
     MAYUSE(v2);
     MAYUSE(v1);
+    MAYUSE(j64);
 
     switch(nextop) {
         case 0xC0:
@@ -170,11 +172,21 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     #if 0
                     // those are ARM 8.5 opcode!
                     FRINT64ZD(s0, v1);
+                    FCVTZSxD(x2, s0);
+                    STRx_U12(x2, ed, fixedaddress);
                     #else
-                    FRINTRRD(s0, v1, 3);    // not ideal, might averflow and not set 0x8000000000000000 correctly
-                    #endif
+                    MRS_fpsr(x5);
+                    BFCw(x5, FPSR_IOC, 1);   // reset IOC bit
+                    MSR_fpsr(x5);
+                    FRINTRRD(s0, v1, 3);
                     FCVTZSxD(x2, s0);
                     STRx_U12(x2, ed, fixedaddress);
+                    MRS_fpsr(x5);   // get back FPSR to check the IOC bit
+                    TBZ_MARK3(x5, FPSR_IOC);
+                    ORRx_mask(x5, xZR, 1, 1, 0);    //0x8000000000000000
+                    STRx_U12(x5, ed, fixedaddress); // IOC set: overwrite all 64 bits of the stored result
+                    MARK3;
+                    #endif
                     x87_do_pop(dyn, ninst, x3);
                     break;
                 case 2:
diff --git a/src/dynarec/arm64/dynarec_arm64_df.c b/src/dynarec/arm64/dynarec_arm64_df.c
index 960164e1..2cbcfe4b 100644
--- a/src/dynarec/arm64/dynarec_arm64_df.c
+++ b/src/dynarec/arm64/dynarec_arm64_df.c
@@ -308,7 +308,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     VSTR64_U12(s0, wback, fixedaddress);
                     MRS_fpsr(x5);   // get back FPSR to check the IOC bit
                     TBZ_MARK3(x5, FPSR_IOC);
-                    MOV64x(x5, 0x8000000000000000LL);
+                    ORRx_mask(x5, xZR, 1, 1, 0);    //0x8000000000000000
                     STRx_U12(x5, wback, fixedaddress);
                     MARK3;
                     #endif
diff --git a/src/dynarec/arm64/dynarec_arm64_emit_logic.c b/src/dynarec/arm64/dynarec_arm64_emit_logic.c
index b152ac4d..668713a6 100755
--- a/src/dynarec/arm64/dynarec_arm64_emit_logic.c
+++ b/src/dynarec/arm64/dynarec_arm64_emit_logic.c
@@ -47,7 +47,7 @@ void emit_or32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3,
     }
     IFX(X_SF) {
         LSRxw(s3, s1, (rex.w)?63:31);
-        BFIx(xFlags, s3, F_SF, 1);
+        BFIw(xFlags, s3, F_SF, 1);
     }
     IFX(X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
@@ -78,7 +78,7 @@ void emit_or32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int
     }
     IFX(X_SF) {
         LSRxw(s3, s1, (rex.w)?63:31);
-        BFIx(xFlags, s3, F_SF, 1);
+        BFIw(xFlags, s3, F_SF, 1);
     }
     IFX(X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
@@ -109,7 +109,7 @@ void emit_xor32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
     }
     IFX(X_SF) {
         LSRxw(s3, s1, (rex.w)?63:31);
-        BFIx(xFlags, s3, F_SF, 1);
+        BFIw(xFlags, s3, F_SF, 1);
     }
     IFX(X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
@@ -140,7 +140,7 @@ void emit_xor32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int64_t c, in
     }
     IFX(X_SF) {
         LSRxw(s3, s1, (rex.w)?63:31);
-        BFIx(xFlags, s3, F_SF, 1);
+        BFIw(xFlags, s3, F_SF, 1);
     }
     IFX(X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
@@ -174,7 +174,7 @@ void emit_and32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
     }
     IFX(X_SF) {
         LSRxw(s3, s1, (rex.w)?63:31);
-        BFIx(xFlags, s3, F_SF, 1);
+        BFIw(xFlags, s3, F_SF, 1);
     }
     IFX(X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
@@ -208,7 +208,7 @@ void emit_and32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int64_t c, in
     }
     IFX(X_SF) {
         LSRxw(s3, s1, (rex.w)?63:31);
-        BFIx(xFlags, s3, F_SF, 1);
+        BFIw(xFlags, s3, F_SF, 1);
     }
     IFX(X_PF) {
         emit_pf(dyn, ninst, s1, s3, s4);
@@ -579,9 +579,6 @@ void emit_and16(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     IFX(X_PEND) {
         STRH_U12(s1, xEmu, offsetof(x64emu_t, res));
     }
-    IFX(X_PEND) {
-        STRB_U12(s1, xEmu, offsetof(x64emu_t, res));
-    }
     IFX(X_CF | X_AF | X_OF) {
         MOV32w(s3, (1<<F_CF)|(1<<F_AF)|(1<<F_OF));
         BICw_REG(xFlags, xFlags, s3);
diff --git a/src/dynarec/arm64/dynarec_arm64_emit_math.c b/src/dynarec/arm64/dynarec_arm64_emit_math.c
index 37952444..8d952d50 100755
--- a/src/dynarec/arm64/dynarec_arm64_emit_math.c
+++ b/src/dynarec/arm64/dynarec_arm64_emit_math.c
@@ -157,8 +157,7 @@ void emit_sub32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
         SET_DFNONE(s3);
     }
     IFX(X_AF) {
-        MVNxw_REG(s3, s1);
-        ORRxw_REG(s3, s3, s2);  // s3 = ~op1 | op2
+        ORNxw_REG(s3, s2, s1);  // s3 = ~op1 | op2
         BICxw(s4, s2, s1);      // s4 = ~op1 & op2
     }
     IFX(X_ALL) {
@@ -612,8 +611,7 @@ void emit_sub16(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int s4)
         SET_DFNONE(s3);
     }
     IFX(X_AF|X_OF|X_CF) {
-        MVNw_REG(s3, s1);
-        ORRw_REG(s3, s3, s2);    // s3 = ~op1 | op2
+        ORNw_REG(s3, s2, s1);    // s3 = ~op1 | op2
         BICw_REG(s4, s2, s1);    // s4 = ~op1 & op2
     }
 
@@ -920,7 +918,11 @@ void emit_dec8(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4)
         ANDw_mask(s4, s3, 0, 0);        // s4 = ~op1 & op2
         ORRw_mask(s3, s3, 0, 0);     // s3 = ~op1 | op2
     }
-    SUBSw_U12(s1, s1, 1);
+    IFX(X_ZF) {
+        SUBSw_U12(s1, s1, 1);
+    } else {
+        SUBw_U12(s1, s1, 1);
+    }
     IFX(X_PEND) {
         STRB_U12(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -962,7 +964,11 @@ void emit_dec16(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4)
     IFX(X_AF|X_OF) {
         MVNw_REG(s4, s1);
     }
-    SUBSw_U12(s1, s1, 1);
+    IFX(X_ZF) {
+        SUBSw_U12(s1, s1, 1);
+    } else {
+        SUBw_U12(s1, s1, 1);
+    }
     IFX(X_PEND) {
         STRH_U12(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -1177,54 +1183,7 @@ void emit_adc8c(dynarec_arm_t* dyn, int ninst, int s1, int c, int s3, int s4, in
 {
     MAYUSE(s5);
     MOV32w(s5, c&0xff);
-    IFX(X_PEND) {
-        STRB_U12(s1, xEmu, offsetof(x64emu_t, op1));
-        STRB_U12(s5, xEmu, offsetof(x64emu_t, op2));
-        SET_DF(s4, d_adc8);
-    } else IFX(X_ALL) {
-        SET_DFNONE(s4);
-    }
-    IFX(X_AF | X_OF) {
-        MOVw_REG(s4, s1);
-    }
-    MRS_nzvc(s3);
-    BFIx(s3, xFlags, 29, 1); // set C
-    MSR_nzvc(s3);      // load CC into ARM CF
-    ADCw_REG(s1, s1, s5);
-    IFX(X_PEND) {
-        STRH_U12(s1, xEmu, offsetof(x64emu_t, res));
-    }
-    IFX(X_AF|X_OF) {
-        ORRw_REG(s3, s4, s5);        // s3 = op1 | op2
-        ANDw_REG(s4, s4, s5);        // s4 = op1 & op2
-        BICw_REG(s3, s3, s1);   // s3 = (op1 | op2) & ~ res
-        ORRw_REG(s3, s3, s4);   // s4 = (op1 & op2) | ((op1 | op2) & ~ res)
-        IFX(X_AF) {
-            LSRw(s4, s3, 3);
-            BFIw(xFlags, s4, F_AF, 1);    // AF: bc & 0x08
-        }
-        IFX(X_OF) {
-            LSRw(s4, s3, 6);
-            EORw_REG_LSR(s4, s4, s4, 1);
-            BFIw(xFlags, s4, F_OF, 1);    // OF: ((bc >> 6) ^ ((bc>>6)>>1)) & 1
-        }
-    }
-    IFX(X_CF) {
-        LSRw(s3, s1, 8);
-        BFIw(xFlags, s3, F_CF, 1);
-    }
-    IFX(X_ZF) {
-        ANDSw_mask(s1, s1, 0, 0b000111);    //mask=000000ff
-        CSETw(s3, cEQ);
-        BFIw(xFlags, s3, F_ZF, 1);
-    }
-    IFX(X_SF) {
-        LSRw(s3, s1, 7);
-        BFIw(xFlags, s3, F_SF, 1);
-    }
-    IFX(X_PF) {
-        emit_pf(dyn, ninst, s1, s3, s4);
-    }
+    emit_adc8(dyn, ninst, s1, s5, s3, s4);
 }
 
 // emit ADC16 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -1246,7 +1205,7 @@ void emit_adc16(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int s4)
     MSR_nzvc(s3);      // load CC into ARM CF
     ADCw_REG(s1, s1, s2);
     IFX(X_PEND) {
-        STRH_U12(s1, xEmu, offsetof(x64emu_t, res));
+        STRw_U12(s1, xEmu, offsetof(x64emu_t, res));
     }
     IFX(X_AF|X_OF) {
         ORRw_REG(s3, s4, s2);    // s3 = op1 | op2
@@ -1536,55 +1495,7 @@ void emit_sbb8c(dynarec_arm_t* dyn, int ninst, int s1, int c, int s3, int s4, in
 {
     MAYUSE(s5);
     MOV32w(s5, c&0xff);
-    IFX(X_PEND) {
-        STRB_U12(s1, xEmu, offsetof(x64emu_t, op1));
-        STRB_U12(s5, xEmu, offsetof(x64emu_t, op2));
-        SET_DF(s3, d_sbb8);
-    } else IFX(X_ALL) {
-        SET_DFNONE(s3);
-    }
-    EORw_mask(s4, xFlags, 0, 0);            // invert CC because it's reverted for SUB on ARM
-    MRS_nzvc(s3);
-    BFIx(s3, s4, 29, 1); // set C, bit 29
-    MSR_nzvc(s3);      // load CC into ARM CF
-    IFX(X_AF|X_OF|X_CF) {
-        MVNw_REG(s4, s1);
-    }
-    SBCw_REG(s1, s1, s5);
-    IFX(X_PEND) {
-        STRB_U12(s1, xEmu, offsetof(x64emu_t, res));
-    }
-    IFX(X_AF|X_OF|X_CF) {
-        ORRw_REG(s3, s4, s5);               // s3 = ~op1 | op2
-        ANDw_REG(s4, s4, s5);               // s4 = ~op1 & op2
-        ANDw_REG(s3, s3, s1);               // s3 = (~op1 | op2) & res
-        ORRw_REG(s3, s3, s4);               // s3 = (~op1 & op2) | ((~op1 | op2) & res)
-        IFX(X_CF) {
-            LSRw(s4, s3, 7);
-            BFIw(xFlags, s4, F_CF, 1);    // CF : bc & 0x80
-        }
-        IFX(X_AF) {
-            LSRw(s4, s3, 3);
-            BFIw(xFlags, s4, F_AF, 1);    // AF: bc & 0x08
-        }
-        IFX(X_OF) {
-            LSRw(s4, s3, 6);
-            EORw_REG_LSR(s4, s4, s4, 1);
-            BFIw(xFlags, s4, F_OF, 1);    // OF: ((bc >> 6) ^ ((bc>>6)>>1)) & 1
-        }
-    }
-    IFX(X_ZF) {
-        ANDSw_mask(s1, s1, 0, 0b000111);    //mask=000000ff
-        CSETw(s3, cEQ);
-        BFIw(xFlags, s3, F_ZF, 1);
-    }
-    IFX(X_SF) {
-        LSRw(s3, s1, 7);
-        BFIw(xFlags, s3, F_SF, 1);
-    }
-    IFX(X_PF) {
-        emit_pf(dyn, ninst, s1, s3, s4);
-    }
+    emit_sbb8(dyn, ninst, s1, s5, s3, s4);
 }
 
 // emit SBB16 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch
@@ -1773,7 +1684,11 @@ void emit_neg16(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4)
     IFX(X_AF|X_OF) {
         MOVw_REG(s3, s1);
     }
-    NEGSw_REG(s1, s1);
+    IFX(X_ZF) {
+        NEGSw_REG(s1, s1);
+    } else {
+        NEGw_REG(s1, s1);
+    }
     IFX(X_PEND) {
         STRH_U12(s1, xEmu, offsetof(x64emu_t, res));
     }
@@ -1819,7 +1734,11 @@ void emit_neg8(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4)
     IFX(X_AF|X_OF) {
         MOVw_REG(s3, s1);
     }
-    NEGSw_REG(s1, s1);
+    IFX(X_ZF) {
+        NEGSw_REG(s1, s1);
+    } else {
+        NEGw_REG(s1, s1);
+    }
     IFX(X_PEND) {
         STRB_U12(s1, xEmu, offsetof(x64emu_t, res));
     }
diff --git a/src/dynarec/arm64/dynarec_arm64_emit_shift.c b/src/dynarec/arm64/dynarec_arm64_emit_shift.c
index 51903720..45add1f2 100755
--- a/src/dynarec/arm64/dynarec_arm64_emit_shift.c
+++ b/src/dynarec/arm64/dynarec_arm64_emit_shift.c
@@ -37,14 +37,11 @@ void emit_shl32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
     } else IFX(X_ALL) {
         SET_DFNONE(s4);
     }
-    IFX(F_OF) {
+    IFX(X_OF) {
         CMPSxw_U12(s2, 0);
-        IFX(F_OF) {
-            Bcond(cNE, +8);
-            BFCx(xFlags, F_OF, 1);
-        }
+        Bcond(cNE, +8+((dyn->insts[ninst].x64.gen_flags&X_PEND)?4:0));
+        BFCw(xFlags, F_OF, 1);
         IFX(X_PEND) {
-            Bcond(cNE, +8);
             STRxw_U12(s1, xEmu, offsetof(x64emu_t, res));
         }
         B_NEXT(cEQ);
@@ -81,7 +78,7 @@ void emit_shl32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
 }
 
 // emit SHL32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
-void emit_shl32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, int s3, int s4)
+void emit_shl32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4)
 {
     IFX(X_PEND) {
         MOV32w(s3, c);
@@ -92,8 +89,8 @@ void emit_shl32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, in
         SET_DFNONE(s4);
     }
     if(c==0) {
-        IFX(F_OF) {
-            BFCx(xFlags, F_OF, 1);
+        IFX(X_OF) {
+            BFCw(xFlags, F_OF, 1);
         }
         IFX(X_PEND) {
             STRxw_U12(s1, xEmu, offsetof(x64emu_t, res));
@@ -175,11 +172,7 @@ void emit_shr32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
     IFX(X_OF) {
         CMPSxw_U12(s2, 1);   // if s2==1
             Bcond(cNE, 4+3*4);
-            if(rex.w) {
-                LSRx(s4, s1, 62);
-            } else {
-                LSRw(s4, s1, 30);
-            }
+            LSRxw(s4, s1, rex.w?62:30);
             EORw_REG_LSR(s4, s4, s4, 1);
             BFIw(xFlags, s4, F_OF, 1);
     }
@@ -189,7 +182,7 @@ void emit_shr32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
 }
 
 // emit SHR32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
-void emit_shr32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, int s3, int s4)
+void emit_shr32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4)
 {
     IFX(X_PEND) {
         MOV32w(s3, c);
@@ -206,10 +199,8 @@ void emit_shr32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, in
         return;
     }
     IFX(X_CF) {
-        if(c>1) {
-            LSRxw(s3, s1, c-1);
-        }
-        BFIw(xFlags, (c>1)?s3:s1, 0, 1);
+        LSRxw(s3, s1, c-1);
+        BFIw(xFlags, s3, 0, 1);
     }
     LSRxw(s1, s1, c);
     IFX(X_PEND) {
@@ -237,7 +228,7 @@ void emit_shr32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, in
 }
 
 // emit SAR32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
-void emit_sar32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, int s3, int s4)
+void emit_sar32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4)
 {
     IFX(X_PEND) {
         MOV32w(s3, c);
@@ -254,10 +245,8 @@ void emit_sar32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, in
         return;
     }
     IFX(X_CF) {
-        if(c>1) {
-            ASRxw(s3, s1, c-1);
-        }
-        BFIw(xFlags, (c>1)?s3:s1, 0, 1);
+        ASRxw(s3, s1, c-1);
+        BFIw(xFlags, s3, 0, 1);
     }
     ASRxw(s1, s1, c);
     IFX(X_PEND) {
@@ -278,13 +267,13 @@ void emit_sar32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, in
 }
 
 // emit ROL32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
-void emit_rol32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, int s3, int s4)
+void emit_rol32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4)
 {
     MAYUSE(rex); MAYUSE(s1); MAYUSE(s3); MAYUSE(s4);
     IFX(X_PEND) {
         MOV32w(s3, c);
         STRxw_U12(s3, xEmu, offsetof(x64emu_t, op2));
-        SET_DF(s4, d_rol32);
+        SET_DF(s4, rex.w?d_rol64:d_rol32);
     } else IFX(X_ALL) {
         SET_DFNONE(s4);
     }
@@ -310,7 +299,7 @@ void emit_rol32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, in
 }
 
 // emit ROR32 instruction, from s1 , constant c, store result in s1 using s3 and s4 as scratch
-void emit_ror32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, int s3, int s4)
+void emit_ror32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4)
 {
     MAYUSE(s1); MAYUSE(s3); MAYUSE(s4);
     IFX(X_PEND) {
@@ -344,7 +333,7 @@ void emit_ror32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, in
 }
 
 // emit SHRD32 instruction, from s1, fill s2 , constant c, store result in s1 using s3 and s4 as scratch
-void emit_shrd32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int32_t c, int s3, int s4)
+void emit_shrd32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, uint32_t c, int s3, int s4)
 {
     c&=(rex.w?0x3f:0x1f);
     IFX(X_PEND) {
@@ -363,10 +352,8 @@ void emit_shrd32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int3
         return;
     }
     IFX(X_CF) {
-        if(c>1) {
-            LSRxw(s3, s1, c-1);
-        }
-        BFIw(xFlags, (c>1)?s3:s1, 0, 1);
+        LSRxw(s3, s1, c-1);
+        BFIw(xFlags, s3, 0, 1);
     }
     LSRxw(s3, s1, c);
     ORRxw_REG_LSL(s1, s3, s2, (rex.w?64:32)-c);
@@ -394,7 +381,7 @@ void emit_shrd32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int3
     }
 }
 
-void emit_shld32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int32_t c, int s3, int s4)
+void emit_shld32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, uint32_t c, int s3, int s4)
 {
     c&=(rex.w?0x3f:0x1f);
     IFX(X_PEND) {
@@ -407,8 +394,8 @@ void emit_shld32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int3
         SET_DFNONE(s4);
     }
     if(c==0) {
-        IFX(F_OF) {
-            BFCx(xFlags, F_OF, 1);
+        IFX(X_OF) {
+            BFCw(xFlags, F_OF, 1);
         }
         IFX(X_PEND) {
             STRxw_U12(s1, xEmu, offsetof(x64emu_t, res));
diff --git a/src/dynarec/arm64/dynarec_arm64_emit_tests.c b/src/dynarec/arm64/dynarec_arm64_emit_tests.c
index 301ab2f2..4d032e02 100755
--- a/src/dynarec/arm64/dynarec_arm64_emit_tests.c
+++ b/src/dynarec/arm64/dynarec_arm64_emit_tests.c
@@ -85,19 +85,14 @@ void emit_cmp32_0(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s3, int
     SUBSxw_U12(s3, s1, 0);   // res = s1 - 0
     // and now the tricky ones (and mostly unused), PF and AF
     // bc = (res & (~d | s)) | (~d & s) => is 0 here...
-    IFX(X_OF|X_AF) {
-        MOV32w(s4, (1<<F_OF)|(1<<F_AF));
+    IFX(X_OF|X_AF|X_CF) {
+        MOV32w(s4, (1<<F_OF)|(1<<F_AF)|(1<<F_CF));
         BICw(xFlags, xFlags, s4);
     }
     IFX(X_ZF) {
         CSETw(s4, cEQ);
         BFIw(xFlags, s4, F_ZF, 1);
     }
-    IFX(X_CF) {
-        // inverted carry
-        CSETw(s4, cCC);
-        BFIw(xFlags, s4, F_CF, 1);
-    }
     IFX(X_SF) {
         LSRxw(s3, s1, (rex.w)?63:31);
         BFIw(xFlags, s3, F_SF, 1);
@@ -118,12 +113,15 @@ void emit_cmp16(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     } else {
         SET_DFNONE(s3);
     }
-    SUBw_REG(s5, s1, s2);   // res = s1 - s2
+    IFX(X_ZF) {
+        SUBSw_REG(s5, s1, s2);   // res = s1 - s2
+    } else {
+        SUBw_REG(s5, s1, s2);   // res = s1 - s2
+    }
     IFX_PENDOR0 {
         STRH_U12(s5, xEmu, offsetof(x64emu_t, res));
     }
     IFX(X_ZF) {
-        TSTw_mask(s5, 0, 15);   //mask=0xffff
         CSETw(s3, cEQ);
         BFIw(xFlags, s3, F_ZF, 1);
     }
@@ -316,11 +314,9 @@ void emit_test16(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int s4,
     } else {
         SET_DFNONE(s4);
     }
-    IFX(X_OF) {
-        BFCw(xFlags, F_OF, 1);
-    }
-    IFX(X_CF) {
-        BFCw(xFlags, F_CF, 1);
+    IFX(X_CF | X_AF | X_OF) {
+        MOV32w(s3, (1<<F_CF)|(1<<F_AF)|(1<<F_OF));
+        BICw(xFlags, xFlags, s3);
     }
     ANDSw_REG(s5, s1, s2);   // res = s1 & s2
     IFX_PENDOR0 {
@@ -349,11 +345,9 @@ void emit_test8(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int s4, i
     } else {
         SET_DFNONE(s4);
     }
-    IFX(X_OF) {
-        BFCw(xFlags, F_OF, 1);
-    }
-    IFX(X_CF) {
-        BFCw(xFlags, F_CF, 1);
+    IFX(X_CF | X_AF | X_OF) {
+        MOV32w(s3, (1<<F_CF)|(1<<F_AF)|(1<<F_OF));
+        BICw(xFlags, xFlags, s3);
     }
     ANDSw_REG(s5, s1, s2);   // res = s1 & s2
     IFX_PENDOR0 {
diff --git a/src/dynarec/arm64/dynarec_arm64_f0.c b/src/dynarec/arm64/dynarec_arm64_f0.c
index 8bcf3e41..b38692fe 100644
--- a/src/dynarec/arm64/dynarec_arm64_f0.c
+++ b/src/dynarec/arm64/dynarec_arm64_f0.c
@@ -76,7 +76,7 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0, 0, rex, LOCK_LOCK, 0, 0);
                 MARKLOCK;
                 LDAXRB(x1, wback);
-                emit_add8(dyn, ninst, x1, x2, x4, x3);
+                emit_add8(dyn, ninst, x1, x2, x4, x5);
                 STLXRB(x4, x1, wback);
                 CBNZx_MARKLOCK(x4);
             }
@@ -124,7 +124,7 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0, 0, rex, LOCK_LOCK, 0, 0);
                 MARKLOCK;
                 LDAXRB(x1, wback);
-                emit_or8(dyn, ninst, x1, x2, x4, x3);
+                emit_or8(dyn, ninst, x1, x2, x4, x5);
                 STLXRB(x4, x1, wback);
                 CBNZx_MARKLOCK(x4);
             }
@@ -155,161 +155,184 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             switch(nextop) {
 
                 case 0xB0:
-                    INST_NAME("LOCK CMPXCHG Eb, Gb");
-                    SETFLAGS(X_ALL, SF_SET_PENDING);
-                    nextop = F8;
-                    GETGB(x1);
-                    UBFXx(x6, xRAX, 0, 8);
-                    SMDMB();
-                    if(MODREG) {
-                        if(rex.rex) {
-                            wback = xRAX+(nextop&7)+(rex.b<<3);
-                            wb2 = 0;
-                        } else { 
-                            wback = (nextop&7);
-                            wb2 = (wback>>2)*8;
-                            wback = xRAX+(wback&3);
-                        } 
-                        UBFXx(x2, wback, wb2, 8);
-                        wb1 = 0;
-                        ed = x2;
-                        UFLAG_IF {emit_cmp8(dyn, ninst, x6, ed, x3, x4, x5);}
-                        CMPSxw_REG(x6, x2);
-                        B_MARK2(cNE);
-                        BFIx(wback, x2, wb2, 8);
-                        MOVxw_REG(ed, gd);
-                        MARK2;
-                        BFIx(xRAX, x2, 0, 8);
-                        B_NEXT_nocond;
-                    } else {
-                        addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0, 0, rex, LOCK_LOCK, 0, 0);
-                        // Aligned version
-                        MARKLOCK;
-                        LDAXRB(x2, wback);
-                        CMPSxw_REG(x6, x2);
-                        B_MARK(cNE);
-                        // EAX == Ed
-                        STLXRB(x4, gd, wback);
-                        CBNZx_MARKLOCK(x4);
-                        // done
-                        MARK;
-                        UFLAG_IF {emit_cmp32(dyn, ninst, rex, x6, x2, x3, x4, x5);}
-                        BFIx(xRAX, x2, 0, 8);    // upper par of RAX will be erase on 32bits, no mater what
+                    switch(rep) {   // LOCK CMPXCHG Eb, Gb: only rep==0 is translated, anything else goes to DEFAULT
+                        case 0:
+                            INST_NAME("LOCK CMPXCHG Eb, Gb");
+                            SETFLAGS(X_ALL, SF_SET_PENDING);
+                            nextop = F8;
+                            GETGB(x1);                  // gd: register holding the Gb byte (it is what gets stored below)
+                            UBFXx(x6, xRAX, 0, 8);      // x6 = AL
+                            SMDMB();
+                            if(MODREG) {
+                                if(rex.rex) {
+                                    wback = xRAX+(nextop&7)+(rex.b<<3);
+                                    wb2 = 0;
+                                } else {
+                                    wback = (nextop&7);
+                                    wb2 = (wback>>2)*8;     // bit offset 8 selects the high-byte registers
+                                    wback = xRAX+(wback&3);
+                                }
+                                UBFXx(x2, wback, wb2, 8);   // x2 = Eb
+                                wb1 = 0;
+                                ed = x2;
+                                UFLAG_IF {emit_cmp8(dyn, ninst, x6, ed, x3, x4, x5);}
+                                CMPSxw_REG(x6, x2);
+                                B_MARK2(cNE);
+                                BFIx(wback, gd, wb2, 8);    // AL == Eb: Eb <- Gb, AL is left untouched
+                                B_NEXT_nocond;
+                                MARK2;
+                                BFIx(xRAX, x2, 0, 8);       // AL != Eb: AL <- Eb
+                                B_NEXT_nocond;
+                            } else {
+                                addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0, 0, rex, LOCK_LOCK, 0, 0);
+                                MARKLOCK;
+                                LDAXRB(x2, wback);          // x2 = Eb, exclusive load
+                                CMPSxw_REG(x6, x2);
+                                B_MARK(cNE);
+                                // AL == Eb: try to store Gb, loop back to MARKLOCK if the status register is not 0
+                                STLXRB(x4, gd, wback);
+                                CBNZx_MARKLOCK(x4);
+                                // common tail: 8-bit flags (same helper as the register form), then AL <- Eb
+                                MARK;
+                                UFLAG_IF {emit_cmp8(dyn, ninst, x6, x2, x3, x4, x5);}
+                                BFIx(xRAX, x2, 0, 8);       // no change when AL == Eb
+                            }
+                            SMDMB();
+                            break;
+                        default:
+                            DEFAULT;
+                    }
-                    SMDMB();
                     break;
                 case 0xB1:
-                    INST_NAME("LOCK CMPXCHG Ed, Gd");
-                    SETFLAGS(X_ALL, SF_SET_PENDING);
-                    nextop = F8;
-                    GETGD;
-                    SMDMB();
-                    if(MODREG) {
-                        ed = xRAX+(nextop&7)+(rex.b<<3);
-                        wback = 0;
-                        UFLAG_IF {emit_cmp32(dyn, ninst, rex, xRAX, ed, x3, x4, x5);}
-                        MOVxw_REG(x1, ed);  // save value
-                        CMPSxw_REG(xRAX, x1);
-                        B_MARK2(cNE);
-                        MOVxw_REG(ed, gd);
-                        MARK2;
-                        MOVxw_REG(xRAX, x1);
-                        B_NEXT_nocond;
-                    } else {
-                        addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0, 0, rex, LOCK_LOCK, 0, 0);
-                        TSTx_mask(wback, 1, 0, 1+rex.w);    // mask=3 or 7
-                        B_MARK3(cNE);
-                        // Aligned version
-                        MARKLOCK;
-                        LDAXRxw(x1, wback);
-                        CMPSxw_REG(xRAX, x1);
-                        B_MARK(cNE);
-                        // EAX == Ed
-                        STLXRxw(x4, gd, wback);
-                        CBNZx_MARKLOCK(x4);
-                        // done
-                        B_MARK_nocond;
-                        // Unaligned version
-                        MARK3;
-                        LDRxw_U12(x1, wback, 0);
-                        LDAXRB(x3, wback); // dummy read, to arm the write...
-                        CMPSxw_REG(xRAX, x1);
-                        B_MARK(cNE);
-                        // EAX == Ed
-                        STLXRB(x4, gd, wback);
-                        CBNZx_MARK3(x4);
-                        STRxw_U12(gd, wback, 0);
-                        MARK;
-                        // Common part (and fallback for EAX != Ed)
-                        UFLAG_IF {emit_cmp32(dyn, ninst, rex, xRAX, x1, x3, x4, x5);}
-                        MOVxw_REG(xRAX, x1);    // upper par of RAX will be erase on 32bits, no mater what
+                    switch(rep) {   // LOCK CMPXCHG Ed, Gd: only rep==0 is translated, anything else goes to DEFAULT
+                        case 0:
+                            INST_NAME("LOCK CMPXCHG Ed, Gd");
+                            SETFLAGS(X_ALL, SF_SET_PENDING);
+                            nextop = F8;
+                            GETGD;
+                            SMDMB();
+                            if(MODREG) {
+                                ed = xRAX+(nextop&7)+(rex.b<<3);
+                                wback = 0;
+                                UFLAG_IF {emit_cmp32(dyn, ninst, rex, xRAX, ed, x3, x4, x5);}
+                                MOVxw_REG(x1, ed);  // save the current Ed value
+                                CMPSxw_REG(xRAX, x1);
+                                B_MARK2(cNE);
+                                MOVxw_REG(ed, gd);  // EAX == Ed: Ed <- Gd
+                                MARK2;
+                                MOVxw_REG(xRAX, x1);    // EAX <- saved Ed value (both paths reach this)
+                                B_NEXT_nocond;
+                            } else {
+                                addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0, 0, rex, LOCK_LOCK, 0, 0);
+                                TSTx_mask(wback, 1, 0, 1+rex.w);    // mask=3 or 7: test the low address bits
+                                B_MARK3(cNE);   // any low bit set: take the unaligned version
+                                // Aligned version
+                                MARKLOCK;
+                                LDAXRxw(x1, wback);
+                                CMPSxw_REG(xRAX, x1);
+                                B_MARK(cNE);
+                                // EAX == Ed
+                                STLXRxw(x4, gd, wback);
+                                CBNZx_MARKLOCK(x4);     // status register not 0: loop back to MARKLOCK
+                                // done
+                                B_MARK_nocond;
+                                // Unaligned version
+                                MARK3;
+                                LDRxw_U12(x1, wback, 0);    // plain (non-exclusive) load of the full value
+                                LDAXRB(x3, wback); // dummy read, to arm the write...
+                                CMPSxw_REG(xRAX, x1);
+                                B_MARK(cNE);
+                                // EAX == Ed
+                                STLXRB(x4, gd, wback);
+                                CBNZx_MARK3(x4);        // status register not 0: loop back to MARK3
+                                STRxw_U12(gd, wback, 0);    // then write the full value with a plain store
+                                MARK;
+                                // Common part (and fallback for EAX != Ed)
+                                UFLAG_IF {emit_cmp32(dyn, ninst, rex, xRAX, x1, x3, x4, x5);}
+                                MOVxw_REG(xRAX, x1);    // upper part of RAX will be erased on 32bits, no matter what (NOTE(review): confirm for the EAX == Ed case)
+                            }
+                            SMDMB();
+                            break;
+                        default:
+                            DEFAULT;
+                    }
-                    SMDMB();
                     break;
 
                 case 0xC1:
-                    INST_NAME("LOCK XADD Gd, Ed");
-                    SETFLAGS(X_ALL, SF_SET_PENDING);
-                    nextop = F8;
-                    GETGD;
-                    SMDMB();
-                    if(MODREG) {
-                        ed = xRAX+(nextop&7)+(rex.b<<3);
-                        MOVxw_REG(x1, ed);
-                        MOVxw_REG(ed, gd);
-                        MOVxw_REG(gd, x1);
-                        emit_add32(dyn, ninst, rex, ed, gd, x3, x4);
-                    } else {
-                        addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0, 0, rex, LOCK_LOCK, 0, 0);
-                        TSTx_mask(wback, 1, 0, 1+rex.w);    // mask=3 or 7
-                        B_MARK(cNE);    // unaligned
-                        MARKLOCK;
-                        LDAXRxw(x1, wback);
-                        ADDxw_REG(x4, x1, gd);
-                        STLXRxw(x3, x4, wback);
-                        CBNZx_MARKLOCK(x3);
-                        B_MARK2_nocond;
-                        MARK;
-                        LDRxw_U12(x1, wback, 0);
-                        LDAXRB(x4, wback);
-                        BFIxw(x1, x4, 0, 8);
-                        ADDxw_REG(x4, x1, gd);
-                        STLXRB(x3, x4, wback);
-                        CBNZx_MARK(x3);
-                        STRxw_U12(x4, wback, 0);
-                        MARK2;
-                        IFX(X_ALL|X_PEND) {
-                            MOVxw_REG(x2, x1);
-                            emit_add32(dyn, ninst, rex, x2, gd, x3, x4);
-                        }
-                        MOVxw_REG(gd, x1);
+                    switch(rep) {   // LOCK XADD Gd, Ed: only rep==0 is translated, anything else goes to DEFAULT
+                        case 0:
+                            INST_NAME("LOCK XADD Gd, Ed");
+                            SETFLAGS(X_ALL, SF_SET_PENDING);
+                            nextop = F8;
+                            GETGD;
+                            SMDMB();
+                            if(MODREG) {
+                                ed = xRAX+(nextop&7)+(rex.b<<3);
+                                MOVxw_REG(x1, ed);  // swap ed and gd through x1...
+                                MOVxw_REG(ed, gd);
+                                MOVxw_REG(gd, x1);
+                                emit_add32(dyn, ninst, rex, ed, gd, x3, x4);    // ...then add the swapped operands (x3, x4 passed as scratch)
+                            } else {
+                                addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, 0, 0, rex, LOCK_LOCK, 0, 0);
+                                TSTx_mask(wback, 1, 0, 1+rex.w);    // mask=3 or 7: test the low address bits
+                                B_MARK(cNE);    // unaligned
+                                MARKLOCK;
+                                LDAXRxw(x1, wback);     // x1 = old memory value
+                                ADDxw_REG(x4, x1, gd);  // x4 = old value + Gd
+                                STLXRxw(x3, x4, wback);
+                                CBNZx_MARKLOCK(x3);     // status register not 0: loop back to MARKLOCK
+                                B_MARK2_nocond;
+                                MARK;
+                                LDRxw_U12(x1, wback, 0);    // unaligned: plain load of the full value
+                                LDAXRB(x4, wback);          // exclusive load of the first byte
+                                BFIxw(x1, x4, 0, 8);        // merge that byte into the low 8 bits of x1
+                                ADDxw_REG(x4, x1, gd);
+                                STLXRB(x3, x4, wback);
+                                CBNZx_MARK(x3);             // status register not 0: loop back to MARK
+                                STRxw_U12(x4, wback, 0);    // then write the full sum with a plain store
+                                MARK2;
+                                IFX(X_ALL|X_PEND) {
+                                    MOVxw_REG(x2, x1);      // work on a copy so x1 keeps the old value
+                                    emit_add32(dyn, ninst, rex, x2, gd, x3, x4);
+                                }
+                                MOVxw_REG(gd, x1);      // Gd <- old memory value
+                            }
+                            SMDMB();
+                            break;
+                        default:
+                            DEFAULT;
+                    }
-                    SMDMB();
                     break;
 
                 case 0xC7:
-                    INST_NAME("LOCK CMPXCHG8B Gq, Eq");
-                    SETFLAGS(X_ZF, SF_SUBSET);
-                    nextop = F8;
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x1, &fixedaddress, 0, 0, rex, LOCK_LOCK, 0, 0);
-                    SMDMB();
-                    MARKLOCK;
-                    LDAXPxw(x2, x3, wback);
-                    CMPSxw_REG(xRAX, x2);
-                    B_MARK(cNE);    // EAX != Ed[0]
-                    CMPSxw_REG(xRDX, x3);
-                    B_MARK(cNE);    // EDX != Ed[1]
-                    STLXPxw(x4, xRBX, xRCX, wback);
-                    CBNZx_MARKLOCK(x4);
-                    MOV32w(x1, 1);
-                    B_MARK3_nocond;
-                    MARK;
-                    MOVxw_REG(xRAX, x2);
-                    MOVxw_REG(xRDX, x3);
-                    MOV32w(x1, 0);
-                    MARK3;
-                    SMDMB();
-                    BFIw(xFlags, x1, F_ZF, 1);
+                    switch(rep) {   // LOCK CMPXCHG8B: only rep==0 is translated, anything else goes to DEFAULT
+                        case 0:
+                            INST_NAME("LOCK CMPXCHG8B Gq, Eq");
+                            SETFLAGS(X_ZF, SF_SUBSET);
+                            nextop = F8;
+                            addr = geted(dyn, addr, ninst, nextop, &wback, x1, &fixedaddress, 0, 0, rex, LOCK_LOCK, 0, 0);
+                            SMDMB();
+                            MARKLOCK;
+                            LDAXPxw(x2, x3, wback);     // x2 = Ed[0], x3 = Ed[1]
+                            CMPSxw_REG(xRAX, x2);
+                            B_MARK(cNE);    // EAX != Ed[0]
+                            CMPSxw_REG(xRDX, x3);
+                            B_MARK(cNE);    // EDX != Ed[1]
+                            STLXPxw(x4, xRBX, xRCX, wback);     // both halves match: store the RBX/RCX pair
+                            CBNZx_MARKLOCK(x4);         // status register not 0: loop back to MARKLOCK
+                            MOV32w(x1, 1);              // x1 = value for ZF on the matching path
+                            B_MARK3_nocond;
+                            MARK;
+                            MOVxw_REG(xRAX, x2);        // mismatch: the loaded pair goes to RAX/RDX
+                            MOVxw_REG(xRDX, x3);
+                            MOV32w(x1, 0);              // x1 = value for ZF on the mismatch path
+                            MARK3;
+                            SMDMB();
+                            BFIw(xFlags, x1, F_ZF, 1);  // insert x1 as the ZF bit of xFlags
+                            break;
+                        default:
+                            DEFAULT;
+                    }
                     break;
 
                 default:
@@ -420,7 +443,7 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         wb1 = 1;
                         MARKLOCK;
                         LDAXRB(x1, wback);
-                        emit_adc8c(dyn, ninst, x1, u8, x2, x4, x5);
+                        emit_adc8c(dyn, ninst, x1, u8, x2, x4, x3);
                         STLXRB(x3, x1, wback);
                         CBNZx_MARKLOCK(x3);
                     }
@@ -441,7 +464,7 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         wb1 = 1;
                         MARKLOCK;
                         LDAXRB(x1, wback);
-                        emit_sbb8c(dyn, ninst, x1, u8, x2, x4, x5);
+                        emit_sbb8c(dyn, ninst, x1, u8, x2, x4, x3);
                         STLXRB(x3, x1, wback);
                         CBNZx_MARKLOCK(x3);
                     }
@@ -481,7 +504,7 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         wb1 = 1;
                         MARKLOCK;
                         LDAXRB(x1, wback);
-                        emit_sub8c(dyn, ninst, x1, u8, x2, x4, x5);
+                        emit_sub8c(dyn, ninst, x1, u8, x2, x4, x3);
                         STLXRB(x3, x1, wback);
                         CBNZx_MARKLOCK(x3);
                     }
diff --git a/src/dynarec/arm64/dynarec_arm64_f20f.c b/src/dynarec/arm64/dynarec_arm64_f20f.c
index 153b634f..75553bfc 100755
--- a/src/dynarec/arm64/dynarec_arm64_f20f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f20f.c
@@ -22,29 +22,6 @@
 #include "dynarec_arm64_functions.h"

 #include "dynarec_arm64_helper.h"

 

-// Get Ex as a double, not a quad (warning, x2 get used)

-#define GETEX(a, w, D)                                                                                  \

-    if(MODREG) {                                                                                        \

-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \

-    } else {                                                                                            \

-        SMREAD();                                                                                       \

-        a = fpu_get_scratch(dyn);                                                                       \

-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, D);   \

-        VLDR64_U12(a, ed, fixedaddress);                                                                \

-    }

-

-#define GETG        gd = ((nextop&0x38)>>3)+(rex.r<<3)

-

-#define GETGX(a, w) gd = ((nextop&0x38)>>3)+(rex.r<<3); \

-                    a = sse_get_reg(dyn, ninst, x1, gd, w)

-

-#define GETGX_empty(a)  gd = ((nextop&0x38)>>3)+(rex.r<<3); \

-                        a = sse_get_reg_empty(dyn, ninst, x1, gd)

-

-#define GETGM(a)                        \

-    gd = ((nextop&0x38)>>3);            \

-    a = mmx_get_reg(dyn, ninst, x1, x2, x3, gd)

-

 uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog)

 {

     (void)ip; (void)need_epilog;

@@ -134,7 +111,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("CVTTSD2SI Gd, Ex");

             nextop = F8;

             GETGD;

-            GETEX(q0, 0, 0);

+            GETEXSD(q0, 0, 0);

             if(!box64_dynarec_fastround) {

                 MRS_fpsr(x5);

                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

@@ -145,7 +122,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 MRS_fpsr(x5);   // get back FPSR to check the IOC bit

                 TBZ_NEXT(x5, FPSR_IOC);

                 if(rex.w) {

-                    MOV64x(gd, 0x8000000000000000);

+                    ORRx_mask(gd, xZR, 1, 1, 0);    //0x8000000000000000

                 } else {

                     MOV32w(gd, 0x80000000);

                 }

@@ -155,7 +132,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("CVTSD2SI Gd, Ex");

             nextop = F8;

             GETGD;

-            GETEX(q0, 0, 0);

+            GETEXSD(q0, 0, 0);

             if(!box64_dynarec_fastround) {

                 MRS_fpsr(x5);

                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

@@ -170,7 +147,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 MRS_fpsr(x5);   // get back FPSR to check the IOC bit

                 TBZ_NEXT(x5, FPSR_IOC);

                 if(rex.w) {

-                    MOV64x(gd, 0x8000000000000000);

+                    ORRx_mask(gd, xZR, 1, 1, 0);    //0x8000000000000000

                 } else {

                     MOV32w(gd, 0x80000000);

                 }

@@ -183,7 +160,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             d1 = fpu_get_scratch(dyn);

-            GETEX(d0, 0, 0);

+            GETEXSD(d0, 0, 0);

             if(!box64_dynarec_fastnan) {

                 v1 = fpu_get_scratch(dyn);

                 FCMLTD_0(v1, d0);

@@ -201,7 +178,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(d1, 1);

             v1 = fpu_get_scratch(dyn);

-            GETEX(d0, 0, 0);

+            GETEXSD(d0, 0, 0);

             if(!box64_dynarec_fastnan) {

                 v0 = fpu_get_scratch(dyn);

                 q0 = fpu_get_scratch(dyn);

@@ -223,7 +200,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(d1, 1);

             v1 = fpu_get_scratch(dyn);

-            GETEX(d0, 0, 0);

+            GETEXSD(d0, 0, 0);

             if(!box64_dynarec_fastnan) {

                 v0 = fpu_get_scratch(dyn);

                 q0 = fpu_get_scratch(dyn);

@@ -244,7 +221,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("CVTSD2SS Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            GETEX(d0, 0, 0);

+            GETEXSD(d0, 0, 0);

             d1 = fpu_get_scratch(dyn);

             FCVT_S_D(d1, d0);

             VMOVeS(v0, 0, d1, 0);

@@ -255,7 +232,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(d1, 1);

             v1 = fpu_get_scratch(dyn);

-            GETEX(d0, 0, 0);

+            GETEXSD(d0, 0, 0);

             if(!box64_dynarec_fastnan) {

                 v0 = fpu_get_scratch(dyn);

                 q0 = fpu_get_scratch(dyn);

@@ -276,7 +253,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("MINSD Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            GETEX(v1, 0, 0);

+            GETEXSD(v1, 0, 0);

             // MINSD: if any input is NaN, or Ex[0]<Gx[0], copy Ex[0] -> Gx[0]

             #if 0

             d0 = fpu_get_scratch(dyn);

@@ -293,7 +270,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             d1 = fpu_get_scratch(dyn);

-            GETEX(v1, 0, 0);

+            GETEXSD(v1, 0, 0);

             if(!box64_dynarec_fastnan) {

                 d0 = fpu_get_scratch(dyn);

                 q0 = fpu_get_scratch(dyn);

@@ -314,7 +291,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("MAXSD Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            GETEX(v1, 0, 0);

+            GETEXSD(v1, 0, 0);

             // MAXSD: if any input is NaN, or Ex[0]>Gx[0], copy Ex[0] -> Gx[0]

             #if 0

             d0 = fpu_get_scratch(dyn);

@@ -330,7 +307,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
         case 0x70:

             INST_NAME("PSHUFLW Gx, Ex, Ib");

             nextop = F8;

-            GETEX(v1, 0, 1);

+            GETEX(v1, 0, 1);    // need the full 128 bits of Ex: the high 64 bits are copied to Gx

             GETGX(v0, 1);

 

             u8 = F8;

@@ -368,7 +345,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("CMPSD Gx, Ex, Ib");

             nextop = F8;

             GETGX(v0, 1);

-            GETEX(v1, 0, 1);

+            GETEXSD(v1, 0, 1);

             u8 = F8;

             FCMPD(v0, v1);

             switch(u8&7) {

@@ -388,7 +365,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("ADDSUBPS Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            GETEX(v1, 0, 0);

+            GETEX(v1, 0, 0);    // packed single op on 4 floats: need the full 128 bits of Ex

             q0 = fpu_get_scratch(dyn);

             static float addsubps[4] = {-1.f, 1.f, -1.f, 1.f};

             MAYUSE(addsubps);

@@ -401,14 +378,14 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("MOVDQ2Q Gm, Ex");

             nextop = F8;

             GETGM(v0);

-            GETEX(v1, 0, 0);

+            GETEXSD(v1, 0, 0);

             VMOV(v0, v1);

             break;

 

         case 0xE6:

             INST_NAME("CVTPD2DQ Gx, Ex");

             nextop = F8;

-            GETEX(v1, 0, 0);

+            GETEX(v1, 0, 0);    // converts 2 doubles: need the full 128 bits of Ex

             GETGX_empty(v0);

             u8 = sse_setround(dyn, ninst, x1, x2, x3);

             VFRINTIDQ(v0, v1);

diff --git a/src/dynarec/arm64/dynarec_arm64_f30f.c b/src/dynarec/arm64/dynarec_arm64_f30f.c
index 3615b231..ea057881 100755
--- a/src/dynarec/arm64/dynarec_arm64_f30f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f30f.c
@@ -22,36 +22,6 @@
 #include "dynarec_arm64_functions.h"

 #include "dynarec_arm64_helper.h"

 

-// Get Ex as a single, not a quad (warning, x2 get used)

-#define GETEX(a, w, D)                                                                                  \

-    if(MODREG) {                                                                                        \

-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \

-    } else {                                                                                            \

-        SMREAD();                                                                                       \

-        a = fpu_get_scratch(dyn);                                                                       \

-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, D);   \

-        VLDR32_U12(a, ed, fixedaddress);                                                                \

-    }

-

-// Get EX as a quad

-#define GETEXQ(a, w, D)                                                                                 \

-    if(MODREG) {                                                                                        \

-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \

-    } else {                                                                                            \

-        SMREAD();                                                                                       \

-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, NULL, 0, D);  \

-        a = fpu_get_scratch(dyn);                                                                       \

-        VLDR128_U12(a, ed, fixedaddress);                                                               \

-    }

-

-#define GETG        gd = ((nextop&0x38)>>3)+(rex.r<<3)

-

-#define GETGX(a, w) gd = ((nextop&0x38)>>3)+(rex.r<<3); \

-                    a = sse_get_reg(dyn, ninst, x1, gd, w)

-

-#define GETGX_empty(a)  gd = ((nextop&0x38)>>3)+(rex.r<<3);         \

-                        a = sse_get_reg_empty(dyn, ninst, x1, gd)

-

 uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog)

 {

     (void)ip; (void)need_epilog;

@@ -160,7 +130,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("CVTTSS2SI Gd, Ex");

             nextop = F8;

             GETGD;

-            GETEX(d0, 0, 0);

+            GETEXSS(d0, 0, 0);

             if(!box64_dynarec_fastround) {

                 MRS_fpsr(x5);

                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

@@ -171,7 +141,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 MRS_fpsr(x5);   // get back FPSR to check the IOC bit

                 TBZ_NEXT(x5, FPSR_IOC);

                 if(rex.w) {

-                    MOV64x(gd, 0x8000000000000000);

+                    ORRx_mask(gd, xZR, 1, 1, 0);    //0x8000000000000000

                 } else {

                     MOV32w(gd, 0x80000000);

                 }

@@ -181,7 +151,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("CVTSS2SI Gd, Ex");

             nextop = F8;

             GETGD;

-            GETEX(q0, 0, 0);

+            GETEXSS(q0, 0, 0);

             if(!box64_dynarec_fastround) {

                 MRS_fpsr(x5);

                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

@@ -196,7 +166,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 MRS_fpsr(x5);   // get back FPSR to check the IOC bit

                 TBZ_NEXT(x5, FPSR_IOC);

                 if(rex.w) {

-                    MOV64x(gd, 0x8000000000000000);

+                    ORRx_mask(gd, xZR, 1, 1, 0);    //0x8000000000000000

                 } else {

                     MOV32w(gd, 0x80000000);

                 }

@@ -207,7 +177,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             d1 = fpu_get_scratch(dyn);

-            GETEX(d0, 0, 0);

+            GETEXSS(d0, 0, 0);

             FSQRTS(d1, d0);

             VMOVeS(v0, 0, d1, 0);

             break;

@@ -215,7 +185,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("RSQRTSS Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            GETEX(v1, 0, 0);

+            GETEXSS(v1, 0, 0);

             d0 = fpu_get_scratch(dyn);

             d1 = fpu_get_scratch(dyn);

             // so here: F32: Imm8 = abcd efgh that gives => aBbbbbbc defgh000 00000000 00000000

@@ -232,7 +202,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("RCPSS Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            GETEX(v1, 0, 0);

+            GETEXSS(v1, 0, 0);

             d0 = fpu_get_scratch(dyn);

             FMOVS_8(d0, 0b01110000);    //1.0f

             FDIVS(d0, d0, v1);

@@ -244,7 +214,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             d1 = fpu_get_scratch(dyn);

-            GETEX(d0, 0, 0);

+            GETEXSS(d0, 0, 0);

             FADDS(d1, v0, d0);  // the high part of the vector is erased...

             VMOVeS(v0, 0, d1, 0);

             break;

@@ -253,7 +223,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             d1 = fpu_get_scratch(dyn);

-            GETEX(d0, 0, 0);

+            GETEXSS(d0, 0, 0);

             FMULS(d1, v0, d0);

             VMOVeS(v0, 0, d1, 0);

             break;

@@ -261,7 +231,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("CVTSS2SD Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            GETEX(v1, 0, 0);

+            GETEXSS(v1, 0, 0);

             d0 = fpu_get_scratch(dyn);

             FCVT_D_S(d0, v1);

             VMOVeD(v0, 0, d0, 0);

@@ -269,7 +239,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
         case 0x5B:

             INST_NAME("CVTTPS2DQ Gx, Ex");

             nextop = F8;

-            GETEXQ(d0, 0, 0);

+            GETEX(d0, 0, 0) ;

             GETGX_empty(v0);

             VFCVTZSQS(v0, d0);

             break;

@@ -279,7 +249,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             d1 = fpu_get_scratch(dyn);

-            GETEX(d0, 0, 0);

+            GETEXSS(d0, 0, 0);

             FSUBS(d1, v0, d0);

             VMOVeS(v0, 0, d1, 0);

             break;

@@ -287,7 +257,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("MINSS Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            GETEX(v1, 0, 0);

+            GETEXSS(v1, 0, 0);

             // MINSS: if any input is NaN, or Ex[0]<Gx[0], copy Ex[0] -> Gx[0]

             #if 0

             d0 = fpu_get_scratch(dyn);

@@ -304,7 +274,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             d1 = fpu_get_scratch(dyn);

-            GETEX(d0, 0, 0);

+            GETEXSS(d0, 0, 0);

             FDIVS(d1, v0, d0);

             VMOVeS(v0, 0, d1, 0);

             break;

@@ -312,7 +282,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("MAXSS Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            GETEX(v1, 0, 0);

+            GETEXSS(v1, 0, 0);

             // MAXSS: if any input is NaN, or Ex[0]>Gx[0], copy Ex[0] -> Gx[0]

             #if 0

             d0 = fpu_get_scratch(dyn);

@@ -342,7 +312,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
         case 0x70:

             INST_NAME("PSHUFHW Gx, Ex, Ib");

             nextop = F8;

-            GETEXQ(v1, 0, 1);

+            GETEX(v1, 0, 1) ;

             GETGX(v0, 1);

             u8 = F8;

             // only high part need to be suffled. VTBL only handle 8bits value, so the 16bits suffles need to be changed in 8bits

@@ -423,7 +393,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("CMPSS Gx, Ex, Ib");

             nextop = F8;

             GETGX(v0, 1);

-            GETEX(v1, 0, 1);

+            GETEXSS(v1, 0, 1);

             u8 = F8;

             FCMPS(v0, v1);

             switch(u8&7) {

diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 6e1d3a2d..2cb06c07 100755
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -286,7 +286,7 @@
                     ed = i;                     \
                 } else {                        \
                     SMREAD();                   \
-                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0, 0, rex, NULL, 0, D); \
+                    addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 0xfff, 0, rex, NULL, 0, D); \
                     ADDx_REG(x3, wback, i);     \
                     if(wback!=x3) wback = x3;   \
                     LDRB_U12(i, wback, fixedaddress);      \
@@ -360,6 +360,84 @@
 // Write gb (gd) back to original register / memory
 #define GBBACK   BFIx(gb1, gd, gb2, 8);
 
+// Generic get GD, but reg value in gd (R_RAX is not added)
+#define GETG        gd = ((nextop&0x38)>>3)+(rex.r<<3)
+
+// Get GX as a quad (might use x1)
+#define GETGX(a, w)                     \
+    gd = ((nextop&0x38)>>3)+(rex.r<<3); \
+    a = sse_get_reg(dyn, ninst, x1, gd, w)
+
+// Get an empty GX (use x1)
+#define GETGX_empty(a)                  \
+    gd = ((nextop&0x38)>>3)+(rex.r<<3); \
+    a = sse_get_reg_empty(dyn, ninst, x1, gd)
+
+// Get EX as a quad, (x1 is used)
+#define GETEX(a, w, D)                                                                                  \
+    if(MODREG) {                                                                                        \
+        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \
+    } else {                                                                                            \
+        SMREAD();                                                                                       \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<4, 15, rex, NULL, 0, D);  \
+        a = fpu_get_scratch(dyn);                                                                       \
+        VLDR128_U12(a, ed, fixedaddress);                                                               \
+    }
+
+// Put back EX if it was a memory operand and not an xmm register
+#define PUTEX(a)                                    \
+    if(!MODREG) {                                   \
+        VSTR128_U12(a, ed, fixedaddress);           \
+        SMWRITE2();                                 \
+    }
+
+
+// Get Ex as a double, not a quad (warning, x1 gets used)
+#define GETEXSD(a, w, D)                                                                                \
+    if(MODREG) {                                                                                        \
+        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \
+    } else {                                                                                            \
+        SMREAD();                                                                                       \
+        a = fpu_get_scratch(dyn);                                                                       \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, D);   \
+        VLDR64_U12(a, ed, fixedaddress);                                                                \
+    }
+
+// Get Ex as a single, not a quad (warning, x1 gets used)
+#define GETEXSS(a, w, D)                                                                                \
+    if(MODREG) {                                                                                        \
+        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \
+    } else {                                                                                            \
+        SMREAD();                                                                                       \
+        a = fpu_get_scratch(dyn);                                                                       \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<2, 3, rex, NULL, 0, D);   \
+        VLDR32_U12(a, ed, fixedaddress);                                                                \
+    }
+
+// Get GM, might use x1, x2 and x3
+#define GETGM(a)                        \
+    gd = ((nextop&0x38)>>3);            \
+    a = mmx_get_reg(dyn, ninst, x1, x2, x3, gd)
+
+// Get EM, might use x1, x2 and x3
+#define GETEM(a, D)                                             \
+    if(MODREG) {                                                \
+        a = mmx_get_reg(dyn, ninst, x1, x2, x3, (nextop&7));    \
+    } else {                                                    \
+        SMREAD();                                               \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0xfff<<3, 7, rex, NULL, 0, D); \
+        a = fpu_get_scratch(dyn);                               \
+        VLDR64_U12(a, ed, fixedaddress);                        \
+    }
+
+// Put back EM if it was a memory operand and not an MMX register
+#define PUTEM(a)                            \
+    if(!MODREG) {                           \
+        VSTR64_U12(a, ed, fixedaddress);    \
+        SMWRITE2();                         \
+    }
+
+
 // Get Direction with size Z and based of F_DF flag, on register r ready for LDR/STR fetching
 // F_DF is 1<<10, so 1 ROR 11*2 (so F_OF)
 #define GETDIR(r, A)                \
@@ -976,14 +1054,14 @@ void emit_neg32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s3, int s4
 void emit_neg16(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4);
 void emit_neg8(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4);
 void emit_shl32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4);
-void emit_shl32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, int s3, int s4);
+void emit_shl32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4);
 void emit_shr32(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4);
-void emit_shr32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, int s3, int s4);
-void emit_sar32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, int s3, int s4);
-void emit_rol32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, int s3, int s4);
-void emit_ror32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int32_t c, int s3, int s4);
-void emit_shrd32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int32_t c, int s3, int s4);
-void emit_shld32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, int32_t c, int s3, int s4);
+void emit_shr32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4);
+void emit_sar32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4);
+void emit_rol32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4);
+void emit_ror32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4);
+void emit_shrd32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, uint32_t c, int s3, int s4);
+void emit_shld32c(dynarec_arm_t* dyn, int ninst, rex_t rex, int s1, int s2, uint32_t c, int s3, int s4);
 
 void emit_pf(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4);
 
@@ -1100,7 +1178,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
 uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
-uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
+uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog);
 uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index 506eaf0f..c2cf1387 100755
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -425,6 +425,19 @@ void* CreateEmptyBlock(dynablock_t* block, uintptr_t addr) {
 }
 
 void* FillBlock64(dynablock_t* block, uintptr_t addr) {
+    /*
+        A Block must have this layout:
+
+        0x0000..0x0007  : dynablock_t* : self
+        0x0008..8+4*n   : actual Native instructions, (n is the total number)
+        A ..    A+8*n   : Table64: n 64bits values
+        B ..    B+7     : dynablock_t* : self (as part of JmpNext, that simulates another block)
+        B+8 ..  B+15    : 2 native opcodes for jmpnext (or jmp epilog in case of empty block)
+        B+16 .. B+23    : jmpnext (or jmp_epilog) address
+        B+24 .. B+31    : empty (in case an architecture needs more than 2 opcodes)
+        B+32 .. B+32+sz : instsize (compressed array with each instruction length on x64 and native side)
+
+    */
     if(IsInHotPage(addr)) {
         dynarec_log(LOG_DEBUG, "Cancelling dynarec FillBlock on hotpage for %p\n", (void*)addr);
         return NULL;