author    ptitSeb <sebastien.chev@gmail.com>  2023-10-16 20:20:26 +0200
committer ptitSeb <sebastien.chev@gmail.com>  2023-10-16 20:20:26 +0200
commit    f67d0709b77f292ef3c69de0be67f1f3a76474bb (patch)
tree      46cf31d769881396c46f5d5c3a4ec59fdec23f94 /src
parent    2e66d603d3b0e9243bb00642a38def375b3a1a94 (diff)
download  box64-f67d0709b77f292ef3c69de0be67f1f3a76474bb.tar.gz
          box64-f67d0709b77f292ef3c69de0be67f1f3a76474bb.zip
[ARM64_DYNAREC] Added code generation using the ARMv8.1 Atomics extension for most LOCK-prefixed opcodes
Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/arm64/arm64_emitter.h      | 192
-rw-r--r--  src/dynarec/arm64/arm64_printer.c      |  50
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_00.c   |  36
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_66f0.c | 194
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_f0.c   | 576
5 files changed, 807 insertions, 241 deletions
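
ARMv8.1 "LSE" atomics (LDADD, LDCLR, LDEOR, LDSET, SWP, CAS, CASP, ...) perform a whole
read-modify-write in a single instruction. Before this commit every x86 LOCK-prefixed
operation was lowered to an exclusive-monitor retry loop (LDAXR / STLXR / CBNZ back to
MARKLOCK); with this commit the dynarec emits one LSE instruction instead whenever
arm64_atomics is available. The stand-alone C11 sketch below only illustrates the
difference between the two strategies for a locked 32-bit add; it is not box64 code,
and fetch_add_llsc / fetch_add_lse are names invented for the example.

/* Illustrative sketch only -- not part of box64. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Retry-loop strategy: mirrors the MARKLOCK / LDAXRxw / STLXRxw / CBNZx_MARKLOCK pattern. */
static uint32_t fetch_add_llsc(_Atomic uint32_t *mem, uint32_t val)
{
    uint32_t old;
    do {
        old = atomic_load_explicit(mem, memory_order_acquire);              /* LDAXR        */
    } while (!atomic_compare_exchange_weak_explicit(mem, &old, old + val,
                 memory_order_release, memory_order_relaxed));              /* STLXR + CBNZ */
    return old;   /* old value, later used to (re)compute the x86 flags */
}

/* Single-instruction strategy: what one LDADDALxw(gd, x1, wback) achieves. */
static uint32_t fetch_add_lse(_Atomic uint32_t *mem, uint32_t val)
{
    return atomic_fetch_add_explicit(mem, val, memory_order_acq_rel);       /* LDADDAL */
}

int main(void)
{
    _Atomic uint32_t mem = 40;
    uint32_t old1 = fetch_add_llsc(&mem, 1);
    uint32_t old2 = fetch_add_lse(&mem, 1);
    printf("llsc: old=%u  lse: old=%u  mem=%u\n", old1, old2, (unsigned)atomic_load(&mem));
    return 0;
}
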
diff --git a/src/dynarec/arm64/arm64_emitter.h b/src/dynarec/arm64/arm64_emitter.h
index 094adbe0..9b6cc2e2 100644
--- a/src/dynarec/arm64/arm64_emitter.h
+++ b/src/dynarec/arm64/arm64_emitter.h
@@ -427,6 +427,7 @@
 // Data Memory Barrier
 #define DMB_gen(CRm)                    (0b1101010100<<22 | 0b011<<16 | 0b0011<<12 | (CRm)<<8 | 1<<7 | 0b01<<5 | 0b11111)
 #define DMB_ISH()                       EMIT(DMB_gen(0b1011))
+#define DMB_SY()                        EMIT(DMB_gen(0b1111))
 
 // Break
 #define BRK_gen(imm16)                  (0b11010100<<24 | 0b001<<21 | (((imm16)&0xffff)<<5))
@@ -1938,4 +1939,195 @@
 #define PMULL_128(Rd, Rn, Rm)   EMIT(PMULL_gen(0, 0b11, Rm, Rn, Rd))
 #define PMULL2_128(Rd, Rn, Rm)  EMIT(PMULL_gen(1, 0b11, Rm, Rn, Rd))
 
+// Atomic extension
+#define ATOMIC_gen(size, A, R, Rs, opc, Rn, Rt) ((size)<<30 | 0b111<<27 | (A)<<23 | (R)<<22 | 1<<21 | (Rs)<<16 | (opc)<<12 | (Rn)<<5 | (Rt))
+// Atomic ADD
+#define LDADDxw(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b000, Rn, Rt))
+#define LDADDAxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 1, 0, Rs, 0b000, Rn, Rt))
+#define LDADDALxw(Rs, Rt, Rn)           EMIT(ATOMIC_gen(0b10+rex.w, 1, 1, Rs, 0b000, Rn, Rt))
+#define LDADDLxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b000, Rn, Rt))
+#define STADDxw(Rs, Rn)                 EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b000, Rn, 0b11111))
+#define STADDLxw(Rs, Rn)                EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b000, Rn, 0b11111))
+#define LDADDB(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b000, Rn, Rt))
+#define LDADDAB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 1, 0, Rs, 0b000, Rn, Rt))
+#define LDADDALB(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b00, 1, 1, Rs, 0b000, Rn, Rt))
+#define LDADDLB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b000, Rn, Rt))
+#define STADDB(Rs, Rn)                  EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b000, Rn, 0b11111))
+#define STADDLB(Rs, Rn)                 EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b000, Rn, 0b11111))
+#define LDADDH(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b000, Rn, Rt))
+#define LDADDAH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 1, 0, Rs, 0b000, Rn, Rt))
+#define LDADDALH(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b01, 1, 1, Rs, 0b000, Rn, Rt))
+#define LDADDLH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b000, Rn, Rt))
+#define STADDH(Rs, Rn)                  EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b000, Rn, 0b11111))
+#define STADDLH(Rs, Rn)                 EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b000, Rn, 0b11111))
+// Atomic AND with complement (i.e. BIC)
+#define LDCLRxw(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b001, Rn, Rt))
+#define LDCLRAxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 1, 0, Rs, 0b001, Rn, Rt))
+#define LDCLRALxw(Rs, Rt, Rn)           EMIT(ATOMIC_gen(0b10+rex.w, 1, 1, Rs, 0b001, Rn, Rt))
+#define LDCLRLxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b001, Rn, Rt))
+#define STCLRxw(Rs, Rn)                 EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b001, Rn, 0b11111))
+#define STCLRLxw(Rs, Rn)                EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b001, Rn, 0b11111))
+#define LDCLRB(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b001, Rn, Rt))
+#define LDCLRAB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 1, 0, Rs, 0b001, Rn, Rt))
+#define LDCLRALB(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b00, 1, 1, Rs, 0b001, Rn, Rt))
+#define LDCLRLB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b001, Rn, Rt))
+#define STCLRB(Rs, Rn)                  EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b001, Rn, 0b11111))
+#define STCLRLB(Rs, Rn)                 EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b001, Rn, 0b11111))
+#define LDCLRH(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b001, Rn, Rt))
+#define LDCLRAH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 1, 0, Rs, 0b001, Rn, Rt))
+#define LDCLRALH(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b01, 1, 1, Rs, 0b001, Rn, Rt))
+#define LDCLRLH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b001, Rn, Rt))
+#define STCLRH(Rs, Rn)                  EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b001, Rn, 0b11111))
+#define STCLRLH(Rs, Rn)                 EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b001, Rn, 0b11111))
+// Atomic EOR
+#define LDEORxw(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b010, Rn, Rt))
+#define LDEORAxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 1, 0, Rs, 0b010, Rn, Rt))
+#define LDEORALxw(Rs, Rt, Rn)           EMIT(ATOMIC_gen(0b10+rex.w, 1, 1, Rs, 0b010, Rn, Rt))
+#define LDEORLxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b010, Rn, Rt))
+#define STEORxw(Rs, Rn)                 EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b010, Rn, 0b11111))
+#define STEORLxw(Rs, Rn)                EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b010, Rn, 0b11111))
+#define LDEORB(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b010, Rn, Rt))
+#define LDEORAB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 1, 0, Rs, 0b010, Rn, Rt))
+#define LDEORALB(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b00, 1, 1, Rs, 0b010, Rn, Rt))
+#define LDEORLB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b010, Rn, Rt))
+#define STEORB(Rs, Rn)                  EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b010, Rn, 0b11111))
+#define STEORLB(Rs, Rn)                 EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b010, Rn, 0b11111))
+#define LDEORH(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b010, Rn, Rt))
+#define LDEORAH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 1, 0, Rs, 0b010, Rn, Rt))
+#define LDEORALH(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b01, 1, 1, Rs, 0b010, Rn, Rt))
+#define LDEORLH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b010, Rn, Rt))
+#define STEORH(Rs, Rn)                  EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b010, Rn, 0b11111))
+#define STEORLH(Rs, Rn)                 EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b010, Rn, 0b11111))
+// Atomic OR
+#define LDSETxw(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b011, Rn, Rt))
+#define LDSETAxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 1, 0, Rs, 0b011, Rn, Rt))
+#define LDSETALxw(Rs, Rt, Rn)           EMIT(ATOMIC_gen(0b10+rex.w, 1, 1, Rs, 0b011, Rn, Rt))
+#define LDSETLxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b011, Rn, Rt))
+#define STSETxw(Rs, Rn)                 EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b011, Rn, 0b11111))
+#define STSETLxw(Rs, Rn)                EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b011, Rn, 0b11111))
+#define LDSETB(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b011, Rn, Rt))
+#define LDSETAB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 1, 0, Rs, 0b011, Rn, Rt))
+#define LDSETALB(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b00, 1, 1, Rs, 0b011, Rn, Rt))
+#define LDSETLB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b011, Rn, Rt))
+#define STSETB(Rs, Rn)                  EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b011, Rn, 0b11111))
+#define STSETLB(Rs, Rn)                 EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b011, Rn, 0b11111))
+#define LDSETH(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b011, Rn, Rt))
+#define LDSETAH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 1, 0, Rs, 0b011, Rn, Rt))
+#define LDSETALH(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b01, 1, 1, Rs, 0b011, Rn, Rt))
+#define LDSETLH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b011, Rn, Rt))
+#define STSETH(Rs, Rn)                  EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b011, Rn, 0b11111))
+#define STSETLH(Rs, Rn)                 EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b011, Rn, 0b11111))
+// Atomic Signed Max
+#define LDSMAXxw(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b100, Rn, Rt))
+#define LDSMAXAxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 1, 0, Rs, 0b100, Rn, Rt))
+#define LDSMAXALxw(Rs, Rt, Rn)           EMIT(ATOMIC_gen(0b10+rex.w, 1, 1, Rs, 0b100, Rn, Rt))
+#define LDSMAXLxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b100, Rn, Rt))
+#define STSMAXxw(Rs, Rn)                 EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b100, Rn, 0b11111))
+#define STSMAXLxw(Rs, Rn)                EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b100, Rn, 0b11111))
+#define LDSMAXB(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b100, Rn, Rt))
+#define LDSMAXAB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 1, 0, Rs, 0b100, Rn, Rt))
+#define LDSMAXALB(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b00, 1, 1, Rs, 0b100, Rn, Rt))
+#define LDSMAXLB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b100, Rn, Rt))
+#define STSMAXB(Rs, Rn)                  EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b100, Rn, 0b11111))
+#define STSMAXLB(Rs, Rn)                 EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b100, Rn, 0b11111))
+#define LDSMAXH(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b100, Rn, Rt))
+#define LDSMAXAH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 1, 0, Rs, 0b100, Rn, Rt))
+#define LDSMAXALH(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b01, 1, 1, Rs, 0b100, Rn, Rt))
+#define LDSMAXLH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b100, Rn, Rt))
+#define STSMAXH(Rs, Rn)                  EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b100, Rn, 0b11111))
+#define STSMAXLH(Rs, Rn)                 EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b100, Rn, 0b11111))
+// Atomic Signed Min
+#define LDSMINxw(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b101, Rn, Rt))
+#define LDSMINAxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 1, 0, Rs, 0b101, Rn, Rt))
+#define LDSMINALxw(Rs, Rt, Rn)           EMIT(ATOMIC_gen(0b10+rex.w, 1, 1, Rs, 0b101, Rn, Rt))
+#define LDSMINLxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b101, Rn, Rt))
+#define STSMINxw(Rs, Rn)                 EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b101, Rn, 0b11111))
+#define STSMINLxw(Rs, Rn)                EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b101, Rn, 0b11111))
+#define LDSMINB(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b101, Rn, Rt))
+#define LDSMINAB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 1, 0, Rs, 0b101, Rn, Rt))
+#define LDSMINALB(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b00, 1, 1, Rs, 0b101, Rn, Rt))
+#define LDSMINLB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b101, Rn, Rt))
+#define STSMINB(Rs, Rn)                  EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b101, Rn, 0b11111))
+#define STSMINLB(Rs, Rn)                 EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b101, Rn, 0b11111))
+#define LDSMINH(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b101, Rn, Rt))
+#define LDSMINAH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 1, 0, Rs, 0b101, Rn, Rt))
+#define LDSMINALH(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b01, 1, 1, Rs, 0b101, Rn, Rt))
+#define LDSMINLH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b101, Rn, Rt))
+#define STSMINH(Rs, Rn)                  EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b101, Rn, 0b11111))
+#define STSMINLH(Rs, Rn)                 EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b101, Rn, 0b11111))
+// Atomic Unsigned Max
+#define LDUMAXxw(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b110, Rn, Rt))
+#define LDUMAXAxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 1, 0, Rs, 0b110, Rn, Rt))
+#define LDUMAXALxw(Rs, Rt, Rn)           EMIT(ATOMIC_gen(0b10+rex.w, 1, 1, Rs, 0b110, Rn, Rt))
+#define LDUMAXLxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b110, Rn, Rt))
+#define STUMAXxw(Rs, Rn)                 EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b110, Rn, 0b11111))
+#define STUMAXLxw(Rs, Rn)                EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b110, Rn, 0b11111))
+#define LDUMAXB(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b110, Rn, Rt))
+#define LDUMAXAB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 1, 0, Rs, 0b110, Rn, Rt))
+#define LDUMAXALB(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b00, 1, 1, Rs, 0b110, Rn, Rt))
+#define LDUMAXLB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b110, Rn, Rt))
+#define STUMAXB(Rs, Rn)                  EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b110, Rn, 0b11111))
+#define STUMAXLB(Rs, Rn)                 EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b110, Rn, 0b11111))
+#define LDUMAXH(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b110, Rn, Rt))
+#define LDUMAXAH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 1, 0, Rs, 0b110, Rn, Rt))
+#define LDUMAXALH(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b01, 1, 1, Rs, 0b110, Rn, Rt))
+#define LDUMAXLH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b110, Rn, Rt))
+#define STUMAXH(Rs, Rn)                  EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b110, Rn, 0b11111))
+#define STUMAXLH(Rs, Rn)                 EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b110, Rn, 0b11111))
+// Atomic Unsigned Min
+#define LDUMINxw(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b111, Rn, Rt))
+#define LDUMINAxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 1, 0, Rs, 0b111, Rn, Rt))
+#define LDUMINALxw(Rs, Rt, Rn)           EMIT(ATOMIC_gen(0b10+rex.w, 1, 1, Rs, 0b111, Rn, Rt))
+#define LDUMINLxw(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b111, Rn, Rt))
+#define STUMINxw(Rs, Rn)                 EMIT(ATOMIC_gen(0b10+rex.w, 0, 0, Rs, 0b111, Rn, 0b11111))
+#define STUMINLxw(Rs, Rn)                EMIT(ATOMIC_gen(0b10+rex.w, 0, 1, Rs, 0b111, Rn, 0b11111))
+#define LDUMINB(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b111, Rn, Rt))
+#define LDUMINAB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 1, 0, Rs, 0b111, Rn, Rt))
+#define LDUMINALB(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b00, 1, 1, Rs, 0b111, Rn, Rt))
+#define LDUMINLB(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b111, Rn, Rt))
+#define STUMINB(Rs, Rn)                  EMIT(ATOMIC_gen(0b00, 0, 0, Rs, 0b111, Rn, 0b11111))
+#define STUMINLB(Rs, Rn)                 EMIT(ATOMIC_gen(0b00, 0, 1, Rs, 0b111, Rn, 0b11111))
+#define LDUMINH(Rs, Rt, Rn)              EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b111, Rn, Rt))
+#define LDUMINAH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 1, 0, Rs, 0b111, Rn, Rt))
+#define LDUMINALH(Rs, Rt, Rn)            EMIT(ATOMIC_gen(0b01, 1, 1, Rs, 0b111, Rn, Rt))
+#define LDUMINLH(Rs, Rt, Rn)             EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b111, Rn, Rt))
+#define STUMINH(Rs, Rn)                  EMIT(ATOMIC_gen(0b01, 0, 0, Rs, 0b111, Rn, 0b11111))
+#define STUMINLH(Rs, Rn)                 EMIT(ATOMIC_gen(0b01, 0, 1, Rs, 0b111, Rn, 0b11111))
+
+#define SWAP_gen(size, A, R, Rs, Rn, Rt)    ((size)<<30 | 0b111<<27 | (A)<<23 | (R)<<22 | 1<<21 | (Rs)<<16 | 1<<15 | (Rn)<<5 | (Rt))
+#define SWPxw(Rs, Rt, Rn)               EMIT(SWAP_gen(0b10+rex.w, 0, 0, Rs, Rn, Rt))
+#define SWPAxw(Rs, Rt, Rn)              EMIT(SWAP_gen(0b10+rex.w, 1, 0, Rs, Rn, Rt))
+#define SWPALxw(Rs, Rt, Rn)             EMIT(SWAP_gen(0b10+rex.w, 1, 1, Rs, Rn, Rt))
+#define SWPLxw(Rs, Rt, Rn)              EMIT(SWAP_gen(0b10+rex.w, 0, 1, Rs, Rn, Rt))
+#define SWPB(Rs, Rt, Rn)                EMIT(SWAP_gen(0b00, 0, 0, Rs, Rn, Rt))
+#define SWPAB(Rs, Rt, Rn)               EMIT(SWAP_gen(0b00, 1, 0, Rs, Rn, Rt))
+#define SWPALB(Rs, Rt, Rn)              EMIT(SWAP_gen(0b00, 1, 1, Rs, Rn, Rt))
+#define SWPLB(Rs, Rt, Rn)               EMIT(SWAP_gen(0b00, 0, 1, Rs, Rn, Rt))
+#define SWPH(Rs, Rt, Rn)                EMIT(SWAP_gen(0b01, 0, 0, Rs, Rn, Rt))
+#define SWPAH(Rs, Rt, Rn)               EMIT(SWAP_gen(0b01, 1, 0, Rs, Rn, Rt))
+#define SWPALH(Rs, Rt, Rn)              EMIT(SWAP_gen(0b01, 1, 1, Rs, Rn, Rt))
+#define SWPLH(Rs, Rt, Rn)               EMIT(SWAP_gen(0b01, 0, 1, Rs, Rn, Rt))
+
+#define CAS_gen(size, L, Rs, O0, Rn, Rt)    ((size)<<30 | 0b001000<<24 | 1<<23 | (L)<<22 | 1<<21 | (Rs)<<16 | (O0)<<15 | 0b11111<<10 | (Rn)<<5 | (Rt))
+// Compare and Swap: compare Rs with [Rn], write Rt if equal, return old [Rn] in Rs
+#define CASxw(Rs, Rt, Rn)               EMIT(CAS_gen(0b10+rex.w, 0, Rs, 0, Rn, Rt))
+#define CASAxw(Rs, Rt, Rn)              EMIT(CAS_gen(0b10+rex.w, 1, Rs, 0, Rn, Rt))
+#define CASALxw(Rs, Rt, Rn)             EMIT(CAS_gen(0b10+rex.w, 1, Rs, 1, Rn, Rt))
+#define CASLxw(Rs, Rt, Rn)              EMIT(CAS_gen(0b10+rex.w, 0, Rs, 1, Rn, Rt))
+#define CASB(Rs, Rt, Rn)                EMIT(CAS_gen(0b00, 0, Rs, 0, Rn, Rt))
+#define CASAB(Rs, Rt, Rn)               EMIT(CAS_gen(0b00, 1, Rs, 0, Rn, Rt))
+#define CASALB(Rs, Rt, Rn)              EMIT(CAS_gen(0b00, 1, Rs, 1, Rn, Rt))
+#define CASLB(Rs, Rt, Rn)               EMIT(CAS_gen(0b00, 0, Rs, 1, Rn, Rt))
+#define CASH(Rs, Rt, Rn)                EMIT(CAS_gen(0b01, 0, Rs, 0, Rn, Rt))
+#define CASAH(Rs, Rt, Rn)               EMIT(CAS_gen(0b01, 1, Rs, 0, Rn, Rt))
+#define CASALH(Rs, Rt, Rn)              EMIT(CAS_gen(0b01, 1, Rs, 1, Rn, Rt))
+#define CASLH(Rs, Rt, Rn)               EMIT(CAS_gen(0b01, 0, Rs, 1, Rn, Rt))
+
+#define CASP_gen(size, L, Rs, O0, Rn, Rt)   ((size)<<30 | 0b001000<<24 | 0<<23 | (L)<<22 | 1<<21 | (Rs)<<16 | (O0)<<15 | 0b11111<<10 | (Rn)<<5 | (Rt))
+// Compare and Swap Pair: compare Rs,Rs+1 with [Rn], write Rt,Rt+1 if equal, return old [Rn] in Rs,Rs+1
+#define CASPxw(Rs, Rt, Rn)              EMIT(CASP_gen(0b00+rex.w, 0, Rs, 0, Rn, Rt))
+#define CASPAxw(Rs, Rt, Rn)             EMIT(CASP_gen(0b00+rex.w, 1, Rs, 0, Rn, Rt))
+#define CASPALxw(Rs, Rt, Rn)            EMIT(CASP_gen(0b00+rex.w, 1, Rs, 1, Rn, Rt))
+#define CASPLxw(Rs, Rt, Rn)             EMIT(CASP_gen(0b00+rex.w, 0, Rs, 1, Rn, Rt))
+
 #endif  //__ARM64_EMITTER_H__
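
The ATOMIC_gen layout above packs size into bits 31:30, the acquire/release bits A and R
into 23:22, Rs into 20:16, the operation selector opc into 14:12, Rn into 9:5 and Rt into
4:0. A minimal stand-alone sketch of that packing follows; the ATOMIC_GEN helper is a copy
of the macro for illustration, not part of the emitter, and the expected word is what a
disassembler should report as "ldaddal x1, x2, [x0]".

/* Illustrative sketch only -- re-deriving one encoding from the field layout above. */
#include <stdint.h>
#include <stdio.h>

#define ATOMIC_GEN(size, A, R, Rs, opc, Rn, Rt)                                  \
    ((uint32_t)(size)<<30 | 0b111u<<27 | (uint32_t)(A)<<23 | (uint32_t)(R)<<22 | \
     1u<<21 | (uint32_t)(Rs)<<16 | (uint32_t)(opc)<<12 | (uint32_t)(Rn)<<5 | (uint32_t)(Rt))

int main(void)
{
    /* LDADDAL x1, x2, [x0]: 64-bit (size=0b11), acquire (A=1), release (R=1), opc=000 (ADD) */
    uint32_t op = ATOMIC_GEN(0b11, 1, 1, 1, 0b000, 0, 2);
    printf("ldaddal x1, x2, [x0] -> 0x%08x\n", op);   /* expected: 0xf8e10002 */
    return 0;
}
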
diff --git a/src/dynarec/arm64/arm64_printer.c b/src/dynarec/arm64/arm64_printer.c
index 973255f8..5f7433f3 100644
--- a/src/dynarec/arm64/arm64_printer.c
+++ b/src/dynarec/arm64/arm64_printer.c
@@ -16,7 +16,7 @@ static const char* conds[] = {"cEQ", "cNE", "cCS", "cCC", "cMI", "cPL", "cVS", "
 #define abs(A) (((A)<0)?(-(A)):(A))
 
 typedef struct arm64_print_s {
-    int N, S, U, L, Q;
+    int N, S, U, L, Q, A, R;
     int t, n, m, d, t2, a;
     int f, c, o, h, p;
     int i, r, s;
@@ -60,6 +60,8 @@ int isMask(uint32_t opcode, const char* mask, arm64_print_t *a)
             case 'N': a->N = (a->N<<1) | v; break;
             case 'S': a->S = (a->S<<1) | v; break;
             case 'U': a->U = (a->U<<1) | v; break;
+            case 'A': a->A = (a->A<<1) | v; break;
+            case 'R': a->R = (a->R<<1) | v; break;
             case 'L': a->L = (a->L<<1) | v; break;
             case 'Q': a->Q = (a->Q<<1) | v; break;
             case 't': a->t = (a->t<<1) | v; break;
@@ -1573,6 +1575,52 @@ const char* arm64_print(uint32_t opcode, uintptr_t addr)
         return buff;
     }
 
+    // CASxw
+    if(isMask(opcode, "1f0010001L1ssssso11111nnnnnttttt", &a)) {
+        snprintf(buff, sizeof(buff), "CAS%s%s %s, %s, [%s]", a.L?"A":"", a.o?"L":"", sf?Xt[Rs]:Wt[Rs], sf?Xt[Rt]:Wt[Rt], XtSp[Rn]);
+        return buff;
+    }
+    // CAS B/H
+    if(isMask(opcode, "0f0010001L1ssssso11111nnnnnttttt", &a)) {
+        snprintf(buff, sizeof(buff), "CAS%s%s%s %s, %s, [%s]", a.L?"A":"", a.o?"L":"", sf?"H":"B", Xt[Rs], Xt[Rt], XtSp[Rn]);
+        return buff;
+    }
+    // CASPxw
+    if(isMask(opcode, "0f0010000L1ssssso11111nnnnnttttt", &a)) {
+        snprintf(buff, sizeof(buff), "CASP%s%s %s,%s, %s,%s, [%s]", a.L?"A":"", a.o?"L":"", sf?Xt[Rs]:Wt[Rs], sf?Xt[Rs+1]:Wt[Rs+1], sf?Xt[Rt]:Wt[Rt], sf?Xt[Rt+1]:Wt[Rt+1], XtSp[Rn]);
+        return buff;
+    }
+    // SWPxw
+    if(isMask(opcode, "1f111000AR1sssss100000nnnnnttttt", &a)) {
+        snprintf(buff, sizeof(buff), "SWP%s%s %s, %s, [%s]", a.A?"A":"", a.R?"L":"", sf?Xt[Rs]:Wt[Rs], sf?Xt[Rt]:Wt[Rt], XtSp[Rn]);
+        return buff;
+    }
+    // SWP B/H
+    if(isMask(opcode, "0f111000AR1sssss100000nnnnnttttt", &a)) {
+        snprintf(buff, sizeof(buff), "SWP%s%s%s %s, %s, [%s]", a.A?"A":"", a.R?"L":"", sf?"H":"B", Xt[Rs], Xt[Rt], XtSp[Rn]);
+        return buff;
+    }
+    // LDXXXxw
+    if(isMask(opcode, "1f111000AR1sssss0ooo00nnnnnttttt", &a)) {
+        const char* ops[] = { "ADD", "CLR", "EOR", "SET", "SMAX", "SMIN", "UMAX", "UMIN" };
+        if((Rt == 0b11111) && !a.A) {
+            snprintf(buff, sizeof(buff), "ST%s%s %s, [%s]", ops[a.o], a.R?"L":"", sf?Xt[Rs]:Wt[Rs], XtSp[Rn]);
+        } else {
+            snprintf(buff, sizeof(buff), "LD%s%s%s %s, %s, [%s]", ops[a.o], a.A?"A":"", a.R?"L":"", sf?Xt[Rs]:Wt[Rs], sf?Xt[Rt]:Wt[Rt], XtSp[Rn]);
+        }
+        return buff;
+    }
+    // LDXXX B/H
+    if(isMask(opcode, "0f111000AR1sssss0ooo00nnnnnttttt", &a)) {
+        const char* ops[] = { "ADD", "CLR", "EOR", "SET", "SMAX", "SMIN", "UMAX", "UMIN" };
+        if((Rt == 0b11111) && !a.A) {
+            snprintf(buff, sizeof(buff), "ST%s%s%s %s, [%s]", ops[a.o], a.R?"L":"", sf?"H":"B", Xt[Rs], XtSp[Rn]);
+        } else {
+            snprintf(buff, sizeof(buff), "LD%s%s%s%s %s, %s, [%s]", ops[a.o], a.A?"A":"", a.R?"L":"", sf?"H":"B", Xt[Rs], Xt[Rt], XtSp[Rn]);
+        }
+        return buff;
+    }
+
 
     snprintf(buff, sizeof(buff), "%08X ???", __builtin_bswap32(opcode));
     return buff;
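
The printer matches each opcode against a 32-character mask: '0' and '1' must match the
corresponding bit, and every letter accumulates one bit of a named field ('A' and 'R' are
the two fields added above for the acquire/release bits). The sketch below captures that
idea under the assumption that isMask() walks the mask from bit 31 down to bit 0, as the
visible switch cases suggest; the struct and helper are simplified stand-ins, not box64's
actual types.

/* Illustrative sketch only -- a reduced isMask()-style matcher. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { int A, R, o, s, n, t, f; } fields_t;

static int ismask_sketch(uint32_t opcode, const char *mask, fields_t *a)
{
    if (strlen(mask) != 32) return 0;
    memset(a, 0, sizeof(*a));
    for (int i = 0; i < 32; ++i) {
        int v = (opcode >> (31 - i)) & 1;    /* walk from bit 31 down to bit 0 */
        switch (mask[i]) {
            case '0': if (v) return 0; break;
            case '1': if (!v) return 0; break;
            case 'A': a->A = (a->A<<1) | v; break;
            case 'R': a->R = (a->R<<1) | v; break;
            case 'o': a->o = (a->o<<1) | v; break;
            case 's': a->s = (a->s<<1) | v; break;
            case 'n': a->n = (a->n<<1) | v; break;
            case 't': a->t = (a->t<<1) | v; break;
            case 'f': a->f = (a->f<<1) | v; break;
            default: return 0;
        }
    }
    return 1;
}

int main(void)
{
    fields_t a;
    /* 0xf8e10002 is ldaddal x1, x2, [x0]; match it against the LDXXXxw mask used above. */
    if (ismask_sketch(0xf8e10002u, "1f111000AR1sssss0ooo00nnnnnttttt", &a))
        printf("opc=%d A=%d R=%d Rs=%d Rn=%d Rt=%d\n", a.o, a.A, a.R, a.s, a.n, a.t);
    return 0;
}
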
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c
index d6777252..c42adeaf 100644
--- a/src/dynarec/arm64/dynarec_arm64_00.c
+++ b/src/dynarec/arm64/dynarec_arm64_00.c
@@ -1055,13 +1055,17 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 SMDMB();
                 GETGB(x4);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                MARKLOCK;
-                // do the swap with exclusive locking
-                LDAXRB(x1, ed);
-                // do the swap 4 -> strb(ed), 1 -> gd
-                STLXRB(x3, x4, ed);
-                CBNZx_MARKLOCK(x3);
-                SMDMB();
+                if(arm64_atomics) {
+                    SWPALB(x4, x1, ed);
+                } else {
+                    MARKLOCK;
+                    // do the swap with exclusive locking
+                    LDAXRB(x1, ed);
+                    // do the swap 4 -> strb(ed), 1 -> gd
+                    STLXRB(x3, x4, ed);
+                    CBNZx_MARKLOCK(x3);
+                    SMDMB();
+                }
                 BFIx(gb1, x1, gb2, 8);
             }
             break;
@@ -1080,13 +1084,21 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 SMDMB();
                 TSTx_mask(ed, 1, 0, 1+rex.w);    // mask=3 or 7
                 B_MARK(cNE);
-                MARKLOCK;
-                LDAXRxw(x1, ed);
-                STLXRxw(x3, gd, ed);
-                CBNZx_MARKLOCK(x3);
-                B_MARK2_nocond;
+                if(arm64_atomics) {
+                    SWPALxw(gd, gd, ed);
+                    B_NEXT_nocond;
+                } else {
+                    MARKLOCK;
+                    LDAXRxw(x1, ed);
+                    STLXRxw(x3, gd, ed);
+                    CBNZx_MARKLOCK(x3);
+                    B_MARK2_nocond;
+                }
                 MARK;
                 LDRxw_U12(x1, ed, 0);
+                LDAXRB(x3, ed);
+                STLXRB(x3, gd, ed);
+                CBNZx_MARK(x3);
                 STRxw_U12(gd, ed, 0);
                 MARK2;
                 SMDMB();
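
For XCHG with a memory operand, the new path replaces the whole retry loop with a single
swap: SWPALB stores the new byte and returns the previous one with acquire+release
ordering, so no loop is needed. A rough C11 equivalent of what that computes (illustrative
only, not the emitted code):

/* Illustrative sketch only -- the semantics of the SWPALB fast path for "xchg [mem], r8". */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    _Atomic uint8_t mem = 0x11;
    uint8_t gb = 0x22;
    /* One atomic exchange: roughly SWPALB x4, x1, [ed] in the patch. */
    uint8_t old = atomic_exchange_explicit(&mem, gb, memory_order_acq_rel);
    printf("old=0x%02x mem=0x%02x\n", old, (unsigned)atomic_load(&mem));   /* old=0x11 mem=0x22 */
    return 0;
}
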
diff --git a/src/dynarec/arm64/dynarec_arm64_66f0.c b/src/dynarec/arm64/dynarec_arm64_66f0.c
index fe32cf99..c5faa8da 100644
--- a/src/dynarec/arm64/dynarec_arm64_66f0.c
+++ b/src/dynarec/arm64/dynarec_arm64_66f0.c
@@ -61,13 +61,21 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 BFIx(ed, x6, 0, 16);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                MARKLOCK;
-                LDAXRH(x1, wback);
-                emit_or16(dyn, ninst, x1, x5, x3, x4);
-                STLXRH(x3, x1, wback);
-                CBNZx_MARKLOCK(x3);
+                if(arm64_atomics) {
+                    UFLAG_IF {
+                        LDSETALH(x5, x1, wback);
+                        emit_or16(dyn, ninst, x1, x5, x3, x4);
+                    } else {
+                        STSETLH(x5, wback);
+                    }
+                } else {
+                    MARKLOCK;
+                    LDAXRH(x1, wback);
+                    emit_or16(dyn, ninst, x1, x5, x3, x4);
+                    STLXRH(x3, x1, wback);
+                    CBNZx_MARKLOCK(x3);
+                }
             }
-            SMDMB();
             break;
 
         case 0x0F:
@@ -93,14 +101,18 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         TSTx_mask(wback, 1, 0, 0);    // mask=1
                         B_MARK3(cNE);
                         // Aligned version
-                        MARKLOCK;
-                        LDAXRH(x1, wback);
-                        CMPSw_REG(x6, x1);
-                        B_MARK(cNE);
-                        // EAX == Ed
-                        STLXRH(x4, gd, wback);
-                        CBNZx_MARKLOCK(x4);
-                        // done
+                        if(arm64_atomics) {
+                            CASALH(x6, x1, wback);
+                        } else {
+                            MARKLOCK;
+                            LDAXRH(x1, wback);
+                            CMPSw_REG(x6, x1);
+                            B_MARK(cNE);
+                            // EAX == Ed
+                            STLXRH(x4, gd, wback);
+                            CBNZx_MARKLOCK(x4);
+                            // done
+                        }
                         B_MARK_nocond;
                         // Unaligned version
                         MARK3;
@@ -134,11 +146,15 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         BFIx(ed, x5, 0, 16);
                     } else {
                         addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                        MARKLOCK;
-                        LDAXRH(x1, wback);
-                        ADDxw_REG(x4, x1, x5);
-                        STLXRH(x3, x4, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            LDADDALH(x5, x1, wback);
+                        } else {
+                            MARKLOCK;
+                            LDAXRH(x1, wback);
+                            ADDxw_REG(x4, x1, x5);
+                            STLXRH(x3, x4, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                         IFX(X_ALL|X_PEND) {
                             MOVxw_REG(x2, x1);
                             emit_add16(dyn, ninst, x2, x5, x3, x4);
@@ -199,11 +215,18 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         MOV32w(x5, i32);
                         TSTx_mask(wback, 1, 0, 0);    // mask=1
                         B_MARK(cNE);
-                        MARKLOCK;
-                        LDAXRH(x1, wback);
-                        emit_add16(dyn, ninst, x1, x5, x3, x4);
-                        STLXRH(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            LDADDALH(x5, x1, wback);
+                            UFLAG_IF {
+                                emit_add16(dyn, ninst, x1, x5, x3, x4);    
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRH(x1, wback);
+                            emit_add16(dyn, ninst, x1, x5, x3, x4);
+                            STLXRH(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                         B_NEXT_nocond;
                         MARK;   // unaligned! also, not enough
                         LDRH_U12(x1, wback, 0);
@@ -229,11 +252,20 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, (opcode==0x81)?2:1);
                         if(opcode==0x81) i32 = F16S; else i32 = F8S;
                         MOV32w(x5, i32);
-                        MARKLOCK;
-                        LDAXRH(x1, wback);
-                        emit_or16(dyn, ninst, x1, x5, x3, x4);
-                        STLXRH(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            UFLAG_IF {
+                                LDSETALH(x5, x1, wback);
+                                emit_or16(dyn, ninst, x1, x5, x3, x4);
+                            } else {
+                                STSETLH(x5, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRH(x1, wback);
+                            emit_or16(dyn, ninst, x1, x5, x3, x4);
+                            STLXRH(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 case 2: //ADC
@@ -293,12 +325,23 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     } else {
                         addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, (opcode==0x81)?2:1);
                         if(opcode==0x81) i32 = F16S; else i32 = F8S;
-                        MOV32w(x5, i32);
-                        MARKLOCK;
-                        LDAXRH(x1, wback);
-                        emit_and16(dyn, ninst, x1, x5, x3, x4);
-                        STLXRH(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            MOV32w(x5, ~i32);
+                            UFLAG_IF {
+                                LDCLRALH(x5, x1, wback);
+                                MVNw_REG(x5, x5);
+                                emit_and16(dyn, ninst, x1, x5, x3, x4);
+                            } else {
+                                STCLRLH(x5, wback);
+                            }
+                        } else {
+                            MOV32w(x5, i32);
+                            MARKLOCK;
+                            LDAXRH(x1, wback);
+                            emit_and16(dyn, ninst, x1, x5, x3, x4);
+                            STLXRH(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 case 5: //SUB
@@ -317,11 +360,21 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         MOV32w(x5, i32);
                         TSTx_mask(wback, 1, 0, 0);    // mask=1
                         B_MARK(cNE);
-                        MARKLOCK;
-                        LDAXRH(x1, wback);
-                        emit_sub16(dyn, ninst, x1, x5, x3, x4);
-                        STLXRH(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            NEGw_REG(x4, x5);
+                            UFLAG_IF {
+                                LDADDALH(x4, x1, wback);
+                                emit_sub16(dyn, ninst, x1, x5, x3, x4);
+                            } else {
+                                STADDLH(x4, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRH(x1, wback);
+                            emit_sub16(dyn, ninst, x1, x5, x3, x4);
+                            STLXRH(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                         B_NEXT_nocond;
                         MARK;   // unaligned! also, not enough
                         LDRH_U12(x1, wback, 0);
@@ -347,11 +400,20 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, (opcode==0x81)?2:1);
                         if(opcode==0x81) i32 = F16S; else i32 = F8S;
                         MOV32w(x5, i32);
-                        MARKLOCK;
-                        LDAXRH(x1, wback);
-                        emit_xor16(dyn, ninst, x1, x5, x3, x4);
-                        STLXRH(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            UFLAG_IF {
+                                LDEORALH(x5, x1, wback);
+                                emit_xor16(dyn, ninst, x1, x5, x3, x4);
+                            } else {
+                                STEORLH(x5, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRH(x1, wback);
+                            emit_xor16(dyn, ninst, x1, x5, x3, x4);
+                            STLXRH(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 case 7: //CMP
@@ -387,13 +449,22 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                             BFIx(ed, x6, 0, 16);
                         } else {
                             addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                            MARKLOCK;
-                            LDAXRH(x1, wback);
-                            emit_inc16(dyn, ninst, x1, x3, x4);
-                            STLXRH(x3, x1, wback);
-                            CBNZx_MARKLOCK(x3);
+                            if(arm64_atomics) {
+                                MOV32w(x3, 1);
+                                UFLAG_IF {
+                                    LDADDALH(x3, x1, wback);
+                                    emit_inc16(dyn, ninst, x1, x3, x4);
+                                } else {
+                                    STADDLH(x3, wback);
+                                }
+                            } else {
+                                MARKLOCK;
+                                LDAXRH(x1, wback);
+                                emit_inc16(dyn, ninst, x1, x3, x4);
+                                STLXRH(x3, x1, wback);
+                                CBNZx_MARKLOCK(x3);
+                            }
                         }
-                        SMDMB();
                         break;
                     case 1: //DEC Ew
                         INST_NAME("LOCK DEC Ew");
@@ -406,13 +477,22 @@ uintptr_t dynarec64_66F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                             BFIx(ed, x6, 0, 16);
                         } else {
                             addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                            MARKLOCK;
-                            LDAXRH(x1, wback);
-                            emit_dec16(dyn, ninst, x1, x3, x4);
-                            STLXRH(x3, x1, wback);
-                            CBNZx_MARKLOCK(x3);
+                            if(arm64_atomics) {
+                                MOV32w(x3, -1);
+                                UFLAG_IF {
+                                    LDADDALH(x3, x1, wback);
+                                    emit_dec16(dyn, ninst, x1, x3, x4);
+                                } else {
+                                    STADDLH(x3, wback);
+                                }
+                            } else {
+                                MARKLOCK;
+                                LDAXRH(x1, wback);
+                                emit_dec16(dyn, ninst, x1, x3, x4);
+                                STLXRH(x3, x1, wback);
+                                CBNZx_MARKLOCK(x3);
+                            }
                         }
-                        SMDMB();
                         break;
                     default:
                         DEFAULT;
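
A pattern repeated throughout this file (and in dynarec_arm64_f0.c below): when the x86
flags may be observed, the LD*AL* form is used so the old memory value comes back in a
register and emit_or16 / emit_add16 / ... can recompute EFLAGS from it; when the flags are
known dead (the UFLAG_IF else branch), the store-only ST*L* form suffices. A rough C11
analogue for LOCK OR on a 16-bit operand is sketched below; lock_or16 and need_flags are
invented for the example, and only ZF is modeled.

/* Illustrative sketch only -- the UFLAG_IF split for a locked 16-bit OR. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static void lock_or16(_Atomic uint16_t *mem, uint16_t src, bool need_flags, bool *zf)
{
    if (need_flags) {
        /* LDSETALH: fetch the old value, then recompute flags from it (like emit_or16). */
        uint16_t old = atomic_fetch_or_explicit(mem, src, memory_order_acq_rel);
        *zf = (uint16_t)(old | src) == 0;
    } else {
        /* STSETLH: the old value is not needed, only the release-ordered update. */
        atomic_fetch_or_explicit(mem, src, memory_order_release);
    }
}

int main(void)
{
    _Atomic uint16_t mem = 0x0f00;
    bool zf = false;
    lock_or16(&mem, 0x00f0, true, &zf);
    printf("mem=0x%04x zf=%d\n", (unsigned)atomic_load(&mem), zf);   /* mem=0x0ff0 zf=0 */
    return 0;
}
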
diff --git a/src/dynarec/arm64/dynarec_arm64_f0.c b/src/dynarec/arm64/dynarec_arm64_f0.c
index 9f43f6b8..b51c4f47 100644
--- a/src/dynarec/arm64/dynarec_arm64_f0.c
+++ b/src/dynarec/arm64/dynarec_arm64_f0.c
@@ -70,13 +70,19 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 BFIx(wback, x1, wb2*8, 8);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                MARKLOCK;
-                LDAXRB(x1, wback);
-                emit_add8(dyn, ninst, x1, x2, x4, x5);
-                STLXRB(x4, x1, wback);
-                CBNZx_MARKLOCK(x4);
+                if(arm64_atomics) {
+                    LDADDALB(x2, x1, wback);
+                    UFLAG_IF {
+                        emit_add8(dyn, ninst, x1, x2, x4, x5);    
+                    }
+                } else {
+                    MARKLOCK;
+                    LDAXRB(x1, wback);
+                    emit_add8(dyn, ninst, x1, x2, x4, x5);
+                    STLXRB(x4, x1, wback);
+                    CBNZx_MARKLOCK(x4);
+                }
             }
-            SMDMB();
             break;
         case 0x01:
             INST_NAME("LOCK ADD Ed, Gd");
@@ -89,13 +95,19 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 emit_add32(dyn, ninst, rex, ed, gd, x3, x4);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                MARKLOCK;
-                LDAXRxw(x1, wback);
-                emit_add32(dyn, ninst, rex, x1, gd, x3, x4);
-                STLXRxw(x3, x1, wback);
-                CBNZx_MARKLOCK(x3);
+                if(arm64_atomics) {
+                    LDADDALxw(gd, x1, wback);
+                    UFLAG_IF {
+                        emit_add32(dyn, ninst, rex, x1, gd, x3, x4);    
+                    }
+                } else {
+                    MARKLOCK;
+                    LDAXRxw(x1, wback);
+                    emit_add32(dyn, ninst, rex, x1, gd, x3, x4);
+                    STLXRxw(x3, x1, wback);
+                    CBNZx_MARKLOCK(x3);
+                }
             }
-            SMDMB();
             break;
 
         case 0x08:
@@ -118,13 +130,19 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 BFIx(wback, x1, wb2*8, 8);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                MARKLOCK;
-                LDAXRB(x1, wback);
-                emit_or8(dyn, ninst, x1, x2, x4, x5);
-                STLXRB(x4, x1, wback);
-                CBNZx_MARKLOCK(x4);
+                if(arm64_atomics) {
+                    LDSETALB(x2, x1, wback);
+                    UFLAG_IF {
+                        emit_or8(dyn, ninst, x1, x2, x4, x5);    
+                    }
+                } else {
+                    MARKLOCK;
+                    LDAXRB(x1, wback);
+                    emit_or8(dyn, ninst, x1, x2, x4, x5);
+                    STLXRB(x4, x1, wback);
+                    CBNZx_MARKLOCK(x4);
+                }
             }
-            SMDMB();
             break;
         case 0x09:
             INST_NAME("LOCK OR Ed, Gd");
@@ -137,13 +155,19 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 emit_or32(dyn, ninst, rex, ed, gd, x3, x4);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                MARKLOCK;
-                LDAXRxw(x1, wback);
-                emit_or32(dyn, ninst, rex, x1, gd, x3, x4);
-                STLXRxw(x3, x1, wback);
-                CBNZx_MARKLOCK(x3);
+                if(arm64_atomics) {
+                    LDSETALxw(gd, x1, wback);
+                    UFLAG_IF {
+                        emit_or32(dyn, ninst, rex, x1, gd, x3, x4);    
+                    }
+                } else {
+                    MARKLOCK;
+                    LDAXRxw(x1, wback);
+                    emit_or32(dyn, ninst, rex, x1, gd, x3, x4);
+                    STLXRxw(x3, x1, wback);
+                    CBNZx_MARKLOCK(x3);
+                }
             }
-            SMDMB();
             break;
 
         case 0x0F:
@@ -231,15 +255,19 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                                 B_NEXT_nocond;
                             } else {
                                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                                MARKLOCK;
-                                LDAXRB(x2, wback);
-                                CMPSxw_REG(x6, x2);
-                                B_MARK(cNE);
-                                // EAX == Ed
-                                STLXRB(x4, gd, wback);
-                                CBNZx_MARKLOCK(x4);
-                                // done
-                                MARK;
+                                if(arm64_atomics) {
+                                    CASALB(x6, x2, wback);
+                                } else {
+                                    MARKLOCK;
+                                    LDAXRB(x2, wback);
+                                    CMPSxw_REG(x6, x2);
+                                    B_MARK(cNE);
+                                    // EAX == Ed
+                                    STLXRB(x4, gd, wback);
+                                    CBNZx_MARKLOCK(x4);
+                                    // done
+                                    MARK;
+                                }
                                 UFLAG_IF {emit_cmp8(dyn, ninst, x6, x2, x3, x4, x5);}
                                 BFIx(xRAX, x2, 0, 8);
                             }
@@ -256,7 +284,6 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                             SETFLAGS(X_ALL, SF_SET_PENDING);
                             nextop = F8;
                             GETGD;
-                            SMDMB();
                             if(MODREG) {
                                 ed = xRAX+(nextop&7)+(rex.b<<3);
                                 wback = 0;
@@ -273,15 +300,27 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                                 TSTx_mask(wback, 1, 0, 1+rex.w);    // mask=3 or 7
                                 B_MARK3(cNE);
                                 // Aligned version
-                                MARKLOCK;
-                                LDAXRxw(x1, wback);
-                                CMPSxw_REG(xRAX, x1);
-                                B_MARK(cNE);
-                                // EAX == Ed
-                                STLXRxw(x4, gd, wback);
-                                CBNZx_MARKLOCK(x4);
-                                // done
-                                B_MARK_nocond;
+                                if(arm64_atomics) {
+                                    UFLAG_IF {
+                                        MOVxw_REG(x1, xRAX);
+                                        CASALxw(x1, gd, wback);
+                                        emit_cmp32(dyn, ninst, rex, xRAX, x1, x3, x4, x5);
+                                        MOVxw_REG(xRAX, x1);
+                                    } else {
+                                        CASALxw(xRAX, gd, wback);
+                                    }
+                                    B_NEXT_nocond;
+                                } else {
+                                    MARKLOCK;
+                                    LDAXRxw(x1, wback);
+                                    CMPSxw_REG(xRAX, x1);
+                                    B_MARK(cNE);
+                                    // EAX == Ed
+                                    STLXRxw(x4, gd, wback);
+                                    CBNZx_MARKLOCK(x4);
+                                    // done
+                                    B_MARK_nocond;
+                                }
                                 // Unaligned version
                                 MARK3;
                                 LDRxw_U12(x1, wback, 0);
@@ -292,12 +331,12 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                                 STLXRB(x4, gd, wback);
                                 CBNZx_MARK3(x4);
                                 STRxw_U12(gd, wback, 0);
+                                SMDMB();
                                 MARK;
                                 // Common part (and fallback for EAX != Ed)
                                 UFLAG_IF {emit_cmp32(dyn, ninst, rex, xRAX, x1, x3, x4, x5);}
                                 MOVxw_REG(xRAX, x1);    // upper part of RAX will be erased on 32bits, no matter what
                             }
-                            SMDMB();
                             break;
                         default:
                             DEFAULT;
@@ -358,7 +397,7 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 case 0xC1:
                     switch(rep) {
                         case 0:
-                            INST_NAME("LOCK XADD Gd, Ed");
+                            INST_NAME("LOCK XADD Ed, Gd");
                             SETFLAGS(X_ALL, SF_SET_PENDING);
                             nextop = F8;
                             GETGD;
@@ -373,12 +412,23 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
                                 TSTx_mask(wback, 1, 0, 1+rex.w);    // mask=3 or 7
                                 B_MARK(cNE);    // unaligned
-                                MARKLOCK;
-                                LDAXRxw(x1, wback);
-                                ADDxw_REG(x4, x1, gd);
-                                STLXRxw(x3, x4, wback);
-                                CBNZx_MARKLOCK(x3);
-                                B_MARK2_nocond;
+                                if(arm64_atomics) {
+                                    UFLAG_IF {
+                                        MOVxw_REG(x3, gd);
+                                        LDADDALxw(x3, gd, wback);
+                                        emit_add32(dyn, ninst, rex, x3, gd, x4, x5);
+                                    } else {
+                                        LDADDALxw(gd, gd, wback);
+                                    }
+                                    B_NEXT_nocond;
+                                } else {
+                                    MARKLOCK;
+                                    LDAXRxw(x1, wback);
+                                    ADDxw_REG(x4, x1, gd);
+                                    STLXRxw(x3, x4, wback);
+                                    CBNZx_MARKLOCK(x3);
+                                    B_MARK2_nocond;
+                                }
                                 MARK;
                                 LDRxw_U12(x1, wback, 0);
                                 LDAXRB(x4, wback);
@@ -387,6 +437,7 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                                 STLXRB(x3, x4, wback);
                                 CBNZx_MARK(x3);
                                 STRxw_U12(x4, wback, 0);
+                                SMDMB();
                                 MARK2;
                                 IFX(X_ALL|X_PEND) {
                                     MOVxw_REG(x2, x1);
@@ -394,7 +445,6 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                                 }
                                 MOVxw_REG(gd, x1);
                             }
-                            SMDMB();
                             break;
                         default:
                             DEFAULT;
@@ -408,24 +458,44 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                             SETFLAGS(X_ZF, SF_SUBSET);
                             nextop = F8;
                             addr = geted(dyn, addr, ninst, nextop, &wback, x1, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                            SMDMB();
-                            MARKLOCK;
-                            LDAXPxw(x2, x3, wback);
-                            CMPSxw_REG(xRAX, x2);
-                            B_MARK(cNE);    // EAX != Ed[0]
-                            CMPSxw_REG(xRDX, x3);
-                            B_MARK(cNE);    // EDX != Ed[1]
-                            STLXPxw(x4, xRBX, xRCX, wback);
-                            CBNZx_MARKLOCK(x4);
-                            MOV32w(x1, 1);
-                            B_MARK3_nocond;
-                            MARK;
-                            MOVxw_REG(xRAX, x2);
-                            MOVxw_REG(xRDX, x3);
-                            MOV32w(x1, 0);
+                            if(arm64_atomics) {
+                                MOVx_REG(x2, xRAX);
+                                MOVx_REG(x3, xRDX);
+                                MOVx_REG(x4, xRBX);
+                                MOVx_REG(x5, xRCX);
+                                CASPALxw(x2, x4, wback);
+                                UFLAG_IF {
+                                    CMPSxw_REG(x2, xRAX);
+                                    CSETw(x1, cEQ);
+                                    CMPSxw_REG(x3, xRDX);
+                                    CSETw(x2, cEQ);
+                                    ANDw_REG(x1, x1, x2);
+                                }
+                                B_MARK3_nocond;
+                            } else {
+                                MARKLOCK;
+                                LDAXPxw(x2, x3, wback);
+                                CMPSxw_REG(xRAX, x2);
+                                B_MARK(cNE);    // EAX != Ed[0]
+                                CMPSxw_REG(xRDX, x3);
+                                B_MARK(cNE);    // EDX != Ed[1]
+                                STLXPxw(x4, xRBX, xRCX, wback);
+                                CBNZx_MARKLOCK(x4);
+                                UFLAG_IF {
+                                    MOV32w(x1, 1);
+                                }
+                                B_MARK3_nocond;
+                                MARK;
+                                MOVxw_REG(xRAX, x2);
+                                MOVxw_REG(xRDX, x3);
+                                UFLAG_IF {
+                                    MOV32w(x1, 0);
+                                }
+                            }
                             MARK3;
-                            SMDMB();
-                            BFIw(xFlags, x1, F_ZF, 1);
+                            UFLAG_IF {
+                                BFIw(xFlags, x1, F_ZF, 1);
+                            }
                             break;
                         default:
                             DEFAULT;
@@ -490,19 +560,27 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             SETFLAGS(X_ALL, SF_SET_PENDING);
             nextop = F8;
             GETGD;
-            SMDMB();
             if(MODREG) {
                 ed = xRAX+(nextop&7)+(rex.b<<3);
                 emit_and32(dyn, ninst, rex, ed, gd, x3, x4);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                MARKLOCK;
-                LDAXRxw(x1, wback);
-                emit_and32(dyn, ninst, rex, x1, gd, x3, x4);
-                STLXRxw(x3, x1, wback);
-                CBNZx_MARKLOCK(x3);
+                if(arm64_atomics) {
+                    MVNxw_REG(x1, gd);
+                    UFLAG_IF {
+                        LDCLRALxw(x1, x1, wback);
+                        emit_and32(dyn, ninst, rex, x1, gd, x3, x4);
+                    } else {
+                        STCLRLxw(x1, wback);
+                    }
+                } else {
+                    MARKLOCK;
+                    LDAXRxw(x1, wback);
+                    emit_and32(dyn, ninst, rex, x1, gd, x3, x4);
+                    STLXRxw(x3, x1, wback);
+                    CBNZx_MARKLOCK(x3);
+                }
             }
-            SMDMB();
             break;
 
         case 0x29:
@@ -516,13 +594,22 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 emit_sub32(dyn, ninst, rex, ed, gd, x3, x4);
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                MARKLOCK;
-                LDAXRxw(x1, wback);
-                emit_sub32(dyn, ninst, rex, x1, gd, x3, x4);
-                STLXRxw(x3, x1, wback);
-                CBNZx_MARKLOCK(x3);
+                if(arm64_atomics) {
+                    NEGxw_REG(x1, gd);
+                    UFLAG_IF {
+                        LDADDALxw(x1, x1, wback);
+                        emit_sub32(dyn, ninst, rex, x1, gd, x3, x4);
+                    } else {
+                        STADDLxw(x1, wback);
+                    }
+                } else {
+                    MARKLOCK;
+                    LDAXRxw(x1, wback);
+                    emit_sub32(dyn, ninst, rex, x1, gd, x3, x4);
+                    STLXRxw(x3, x1, wback);
+                    CBNZx_MARKLOCK(x3);
+                }
             }
-            SMDMB();
             break;
 
         case 0x66:
@@ -545,11 +632,22 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         addr = geted(dyn, addr, ninst, nextop, &wback, x5, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 1);
                         u8 = F8;
                         wb1 = 1;
-                        MARKLOCK;
-                        LDAXRB(x1, wback);
-                        emit_add8c(dyn, ninst, x1, u8, x2, x4);
-                        STLXRB(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            MOV32w(x2, u8);
+                            UFLAG_IF {
+                                LDADDALB(x2, x1, wback);
+                                emit_add8(dyn, ninst, x1, x2, x3, x4);
+                            } else {
+                                STADDB(x2, wback);
+                            }
+
+                        } else {
+                            MARKLOCK;
+                            LDAXRB(x1, wback);
+                            emit_add8c(dyn, ninst, x1, u8, x2, x4);
+                            STLXRB(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 case 1: //OR
@@ -565,11 +663,21 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         addr = geted(dyn, addr, ninst, nextop, &wback, x5, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 1);
                         u8 = F8;
                         wb1 = 1;
-                        MARKLOCK;
-                        LDAXRB(x1, wback);
-                        emit_or8c(dyn, ninst, x1, u8, x2, x4);
-                        STLXRB(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            MOV32w(x2, u8);
+                            UFLAG_IF {
+                                LDSETALB(x2, x1, wback);
+                                emit_or8(dyn, ninst, x1, x2, x3, x4);
+                            } else {
+                                STSETLB(x2, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRB(x1, wback);
+                            emit_or8c(dyn, ninst, x1, u8, x2, x4);
+                            STLXRB(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 case 2: //ADC
@@ -627,11 +735,21 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         addr = geted(dyn, addr, ninst, nextop, &wback, x5, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 1);
                         u8 = F8;
                         wb1 = 1;
-                        MARKLOCK;
-                        LDAXRB(x1, wback);
-                        emit_and8c(dyn, ninst, x1, u8, x2, x4);
-                        STLXRB(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            MOV32w(x2, ~u8);
+                            UFLAG_IF {
+                                LDCLRALB(x2, x1, wback);
+                                emit_and8c(dyn, ninst, x1, u8, x2, x4);
+                            } else {
+                                STCLRLB(x2, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRB(x1, wback);
+                            emit_and8c(dyn, ninst, x1, u8, x2, x4);
+                            STLXRB(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 case 5: //SUB
@@ -647,11 +765,21 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         addr = geted(dyn, addr, ninst, nextop, &wback, x5, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 1);
                         u8 = F8;
                         wb1 = 1;
-                        MARKLOCK;
-                        LDAXRB(x1, wback);
-                        emit_sub8c(dyn, ninst, x1, u8, x2, x4, x3);
-                        STLXRB(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            MOV32w(x2, -u8);
+                            UFLAG_IF {
+                                LDADDALB(x2, x1, wback);
+                                emit_sub8c(dyn, ninst, x1, u8, x2, x4, x3);
+                            } else {
+                                STADDLB(x2, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRB(x1, wback);
+                            emit_sub8c(dyn, ninst, x1, u8, x2, x4, x3);
+                            STLXRB(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 case 6: //XOR
@@ -667,11 +795,21 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         addr = geted(dyn, addr, ninst, nextop, &wback, x5, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 1);
                         u8 = F8;
                         wb1 = 1;
-                        MARKLOCK;
-                        LDAXRB(x1, wback);
-                        emit_xor8c(dyn, ninst, x1, u8, x2, x4);
-                        STLXRB(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            MOV32w(x2, u8);
+                            UFLAG_IF {
+                                LDEORALB(x2, x1, wback);
+                                emit_xor8(dyn, ninst, x1, x2, x3, x4);
+                            } else {
+                                STEORLB(x2, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRB(x1, wback);
+                            emit_xor8c(dyn, ninst, x1, u8, x2, x4);
+                            STLXRB(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 case 7: //CMP
@@ -694,7 +832,6 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
         case 0x81:
         case 0x83:
             nextop = F8;
-            SMDMB();
             switch((nextop>>3)&7) {
                 case 0: //ADD
                     if(opcode==0x81) {
@@ -713,12 +850,21 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         if(opcode==0x81) i64 = F32S; else i64 = F8S;
                         TSTx_mask(wback, 1, 0, 1+rex.w);    // mask=3 or 7
                         B_MARK(cNE);
-                        MARKLOCK;
-                        LDAXRxw(x1, wback);
-                        emit_add32c(dyn, ninst, rex, x1, i64, x3, x4, x5);
-                        STLXRxw(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
-                        SMDMB();
+                        if(arm64_atomics) {
+                            MOV64xw(x3, i64);
+                            UFLAG_IF {
+                                LDADDALxw(x3, x1, wback);
+                                emit_add32(dyn, ninst, rex, x1, x3, x4, x5);
+                            } else {
+                                STADDLxw(x3, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRxw(x1, wback);
+                            emit_add32c(dyn, ninst, rex, x1, i64, x3, x4, x5);
+                            STLXRxw(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                         B_NEXT_nocond;
                         MARK;   // unaligned! also, not enough
                         LDRxw_U12(x1, wback, 0);
@@ -728,6 +874,7 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         STLXRB(x3, x1, wback);
                         CBNZx_MARK(x3);
                         STRxw_U12(x1, wback, 0);    // put the whole value
+                        SMDMB();
                     }
                     break;
                 case 1: //OR
@@ -742,11 +889,20 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, (opcode==0x81)?4:1);
                         if(opcode==0x81) i64 = F32S; else i64 = F8S;
                         MOV64xw(x5, i64);
-                        MARKLOCK;
-                        LDAXRxw(x1, wback);
-                        emit_or32(dyn, ninst, rex, x1, x5, x3, x4);
-                        STLXRxw(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            UFLAG_IF {
+                                LDSETALxw(x5, x1, wback);
+                                emit_or32(dyn, ninst, rex, x1, x5, x3, x4);
+                            } else {
+                                STSETLxw(x5, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRxw(x1, wback);
+                            emit_or32(dyn, ninst, rex, x1, x5, x3, x4);
+                            STLXRxw(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 case 2: //ADC
@@ -800,12 +956,23 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     } else {
                         addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, (opcode==0x81)?4:1);
                         if(opcode==0x81) i64 = F32S; else i64 = F8S;
-                        MOV64xw(x5, i64);
-                        MARKLOCK;
-                        LDAXRxw(x1, wback);
-                        emit_and32(dyn, ninst, rex, x1, x5, x3, x4);
-                        STLXRxw(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            MOV64xw(x5, ~i64);
+                            UFLAG_IF {
+                                LDCLRALxw(x5, x1, wback);
+                                MVNxw_REG(x5, x5);
+                                emit_and32(dyn, ninst, rex, x1, x5, x3, x4);
+                            } else {
+                                STCLRLxw(x5, wback);
+                            }
+                        } else {
+                            MOV64xw(x5, i64);
+                            MARKLOCK;
+                            LDAXRxw(x1, wback);
+                            emit_and32(dyn, ninst, rex, x1, x5, x3, x4);
+                            STLXRxw(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 case 5: //SUB
@@ -821,12 +988,22 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         if(opcode==0x81) i64 = F32S; else i64 = F8S;
                         TSTx_mask(wback, 1, 0, 1+rex.w);    // mask=3 or 7
                         B_MARK(cNE);
-                        MARKLOCK;
-                        LDAXRxw(x1, wback);
-                        emit_sub32c(dyn, ninst, rex, x1, i64, x3, x4, x5);
-                        STLXRxw(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
-                        SMDMB();
+                        if(arm64_atomics) {
+                            MOV64xw(x5, -i64);
+                            UFLAG_IF {
+                                LDADDALxw(x5, x1, wback);
+                                NEGxw_REG(x5, x5);
+                                emit_sub32(dyn, ninst, rex, x1, x5, x3, x4);
+                            } else {
+                                STADDLxw(x5, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRxw(x1, wback);
+                            emit_sub32c(dyn, ninst, rex, x1, i64, x3, x4, x5);
+                            STLXRxw(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                         B_NEXT_nocond;
                         MARK;   // unaligned! also, not enough
                         LDRxw_U12(x1, wback, 0);
@@ -836,6 +1013,7 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         STLXRB(x3, x1, wback);
                         CBNZx_MARK(x3);
                         STRxw_U12(x1, wback, 0);    // put the whole value
+                        SMDMB();
                     }
                     break;
                 case 6: //XOR
@@ -850,11 +1028,20 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, (opcode==0x81)?4:1);
                         if(opcode==0x81) i64 = F32S; else i64 = F8S;
                         MOV64xw(x5, i64);
-                        MARKLOCK;
-                        LDAXRxw(x1, wback);
-                        emit_xor32(dyn, ninst, rex, x1, x5, x3, x4);
-                        STLXRxw(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
+                        if(arm64_atomics) {
+                            UFLAG_IF {
+                                LDEORALxw(x5, x1, wback);
+                                emit_xor32(dyn, ninst, rex, x1, x5, x3, x4);
+                            } else {
+                                STEORLxw(x5, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRxw(x1, wback);
+                            emit_xor32(dyn, ninst, rex, x1, x5, x3, x4);
+                            STLXRxw(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 case 7: //CMP
@@ -871,7 +1058,6 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     }
                     break;
             }
-            SMDMB();
             break;
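
Across the immediate forms (0x80/0x81/0x83) the same split repeats: when the flags of the
locked operation are actually consumed, the LD* atomic is emitted so the previous value is
available to recompute them; when they are dead, the cheaper store-only ST* form is used and
nothing comes back. The unaligned fallback (the MARK paths) keeps the exclusive-monitor loop
plus SMDMB either way, since the single-copy atomics require natural alignment. A condensed
sketch of that split (the need_flags parameter is an assumption standing in for UFLAG_IF):

    #include <stdbool.h>
    #include <stdint.h>

    /* LOCK ADD [mem], imm -- with or without flag materialization. */
    static uint64_t lock_add_imm_sketch(uint64_t *mem, uint64_t imm, bool need_flags)
    {
        if (need_flags) {
            uint64_t old = __atomic_fetch_add(mem, imm, __ATOMIC_SEQ_CST); /* LDADDAL */
            return old + imm;        /* emit_add32 derives EFLAGS from this */
        }
        (void)__atomic_fetch_add(mem, imm, __ATOMIC_RELEASE);              /* STADDL */
        return 0;                    /* flags are dead, nothing to report */
    }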
 
         case 0x86:
@@ -897,13 +1083,17 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 SMDMB();
                 GETGB(x4);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                MARKLOCK;
-                // do the swap with exclusive locking
-                LDAXRB(x1, ed);
-                // do the swap 14 -> strb(ed), 1 -> gd
-                STLXRB(x3, x4, ed);
-                CBNZx_MARKLOCK(x3);
-                SMDMB();
+                if(arm64_atomics) {
+                    SWPALB(x4, x1, ed);
+                } else {
+                    MARKLOCK;
+                    // do the swap with exclusive locking
+                    LDAXRB(x1, ed);
+                    // do the swap: x4 -> strb(ed), x1 -> gb
+                    STLXRB(x3, x4, ed);
+                    CBNZx_MARKLOCK(x3);
+                    SMDMB();
+                }
                 BFIx(gb1, x1, gb2, 8);
             }
             break;
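
With LSE, the byte exchange collapses to a single SWPALB: the register operand is stored and
the previous byte comes back in x1 for BFIx to merge into gb, so the exclusive-monitor loop
and the trailing barrier disappear. Roughly, in C (again just a sketch, with the builtin
standing in for SWPALB):

    #include <stdint.h>

    /* LOCK XCHG [mem8], gb -- one swap instruction replaces the LDAXRB/STLXRB loop. */
    static uint8_t xchg8_sketch(uint8_t *mem, uint8_t gb)
    {
        return __atomic_exchange_n(mem, gb, __ATOMIC_SEQ_CST);  /* old byte */
    }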
@@ -922,13 +1112,21 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
                 TSTx_mask(ed, 1, 0, 1+rex.w);    // mask=3 or 7
                 B_MARK(cNE);
-                MARKLOCK;
-                LDAXRxw(x1, ed);
-                STLXRxw(x3, gd, ed);
-                CBNZx_MARKLOCK(x3);
-                B_MARK2_nocond;
+                if(arm64_atomics) {
+                    SWPALxw(gd, gd, ed);
+                    B_NEXT_nocond;
+                } else {
+                    MARKLOCK;
+                    LDAXRxw(x1, ed);
+                    STLXRxw(x3, gd, ed);
+                    CBNZx_MARKLOCK(x3);
+                    B_MARK2_nocond;
+                }
                 MARK;
                 LDRxw_U12(x1, ed, 0);
+                LDAXRB(x3, ed);
+                STLXRB(x3, gd, ed);
+                CBNZx_MARK(x3);
                 STRxw_U12(gd, ed, 0);
                 MARK2;
                 SMDMB();
@@ -986,11 +1184,20 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     } else {
                         addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
                         MARKLOCK;
-                        LDAXRB(x1, wback);
-                        emit_inc8(dyn, ninst, x1, x3, x4);
-                        STLXRB(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
-                        SMDMB();
+                        if(arm64_atomics) {
+                            MOV32w(x3, 1);
+                            UFLAG_IF {
+                                LDADDALB(x3, x1, wback);
+                                emit_inc8(dyn, ninst, x1, x3, x4);
+                            } else {
+                                STADDLB(x3, wback);
+                            }
+                        } else {
+                            LDAXRB(x1, wback);
+                            emit_inc8(dyn, ninst, x1, x3, x4);
+                            STLXRB(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 case 1: //DEC Ed
@@ -1003,12 +1210,21 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         EBBACK;
                     } else {
                         addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
-                        MARKLOCK;
-                        LDAXRB(x1, wback);
-                        emit_dec8(dyn, ninst, x1, x3, x4);
-                        STLXRB(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
-                        SMDMB();
+                        if(arm64_atomics) {
+                            MOV32w(x3, -1);
+                            UFLAG_IF {
+                                LDADDALB(x3, x1, wback);
+                                emit_dec8(dyn, ninst, x1, x3, x4);
+                            } else {
+                                STADDLB(x3, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRB(x1, wback);
+                            emit_dec8(dyn, ninst, x1, x3, x4);
+                            STLXRB(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                     }
                     break;
                 default:
@@ -1030,12 +1246,21 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, NULL, 0, 0, rex, LOCK_LOCK, 0, 0);
                         TSTx_mask(wback, 1, 0, 1+rex.w);    // mask=3 or 7
                         B_MARK(cNE);    // unaligned
-                        MARKLOCK;
-                        LDAXRxw(x1, wback);
-                        emit_inc32(dyn, ninst, rex, x1, x3, x4);
-                        STLXRxw(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
-                        SMDMB();
+                        if(arm64_atomics) {
+                            MOV32w(x3, 1);
+                            UFLAG_IF {
+                                LDADDALxw(x3, x1, wback);
+                                emit_inc32(dyn, ninst, rex, x1, x3, x4);
+                            } else {
+                                STADDLxw(x3, wback);
+                            }
+                        } else {
+                            MARKLOCK;
+                            LDAXRxw(x1, wback);
+                            emit_inc32(dyn, ninst, rex, x1, x3, x4);
+                            STLXRxw(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                         B_NEXT_nocond;
                         MARK;
                         LDRxw_U12(x1, wback, 0);
@@ -1060,11 +1285,20 @@ uintptr_t dynarec64_F0(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         TSTx_mask(wback, 1, 0, 1+rex.w);    // mask=3 or 7
                         B_MARK(cNE);    // unaligned
                         MARKLOCK;
-                        LDAXRxw(x1, wback);
-                        emit_dec32(dyn, ninst, rex, x1, x3, x4);
-                        STLXRxw(x3, x1, wback);
-                        CBNZx_MARKLOCK(x3);
-                        SMDMB();
+                        if(arm64_atomics) {
+                            MOV64xw(x3, -1);
+                            UFLAG_IF {
+                                LDADDALxw(x3, x1, wback);
+                                emit_dec32(dyn, ninst, rex, x1, x3, x4);
+                            } else {
+                                STADDLxw(x3, wback);
+                            }
+                        } else {
+                            LDAXRxw(x1, wback);
+                            emit_dec32(dyn, ninst, rex, x1, x3, x4);
+                            STLXRxw(x3, x1, wback);
+                            CBNZx_MARKLOCK(x3);
+                        }
                         B_NEXT_nocond;
                         MARK;
                         LDRxw_U12(x1, wback, 0);