Diffstat (limited to 'target-arm')
-rw-r--r--  target-arm/cpu-qom.h        |   1
-rw-r--r--  target-arm/cpu.c            |   7
-rw-r--r--  target-arm/cpu.h            | 115
-rw-r--r--  target-arm/cpu64.c          |   1
-rw-r--r--  target-arm/helper.c         | 145
-rw-r--r--  target-arm/helper.h         |   1
-rw-r--r--  target-arm/internals.h      |   6
-rw-r--r--  target-arm/kvm-consts.h     |  49
-rw-r--r--  target-arm/op_helper.c      |  27
-rw-r--r--  target-arm/translate-a64.c  |  98
-rw-r--r--  target-arm/translate.c      |  89
-rw-r--r--  target-arm/translate.h      |  12
12 files changed, 475 insertions(+), 76 deletions(-)
diff --git a/target-arm/cpu-qom.h b/target-arm/cpu-qom.h
index ee4fbb1dad..07f3c9e866 100644
--- a/target-arm/cpu-qom.h
+++ b/target-arm/cpu-qom.h
@@ -148,6 +148,7 @@ typedef struct ARMCPU {
     uint64_t id_aa64isar1;
     uint64_t id_aa64mmfr0;
     uint64_t id_aa64mmfr1;
+    uint32_t dbgdidr;
     uint32_t clidr;
     /* The elements of this array are the CCSIDR values for each cache,
      * in the order L1DCache, L1ICache, L2DCache, L2ICache, etc.
diff --git a/target-arm/cpu.c b/target-arm/cpu.c
index 7cebb76656..8199f32e32 100644
--- a/target-arm/cpu.c
+++ b/target-arm/cpu.c
@@ -640,6 +640,7 @@ static void cortex_a8_initfn(Object *obj)
     cpu->id_isar2 = 0x21232031;
     cpu->id_isar3 = 0x11112131;
     cpu->id_isar4 = 0x00111142;
+    cpu->dbgdidr = 0x15141000;
     cpu->clidr = (1 << 27) | (2 << 24) | 3;
     cpu->ccsidr[0] = 0xe007e01a; /* 16k L1 dcache. */
     cpu->ccsidr[1] = 0x2007e01a; /* 16k L1 icache. */
@@ -712,9 +713,10 @@ static void cortex_a9_initfn(Object *obj)
     cpu->id_isar2 = 0x21232041;
     cpu->id_isar3 = 0x11112131;
     cpu->id_isar4 = 0x00111142;
+    cpu->dbgdidr = 0x35141000;
     cpu->clidr = (1 << 27) | (1 << 24) | 3;
-    cpu->ccsidr[0] = 0xe00fe015; /* 16k L1 dcache. */
-    cpu->ccsidr[1] = 0x200fe015; /* 16k L1 icache. */
+    cpu->ccsidr[0] = 0xe00fe019; /* 16k L1 dcache. */
+    cpu->ccsidr[1] = 0x200fe019; /* 16k L1 icache. */
     define_arm_cp_regs(cpu, cortexa9_cp_reginfo);
 }
 
@@ -773,6 +775,7 @@ static void cortex_a15_initfn(Object *obj)
     cpu->id_isar2 = 0x21232041;
     cpu->id_isar3 = 0x11112131;
     cpu->id_isar4 = 0x10011142;
+    cpu->dbgdidr = 0x3515f021;
     cpu->clidr = 0x0a200023;
     cpu->ccsidr[0] = 0x701fe00a; /* 32K L1 dcache */
     cpu->ccsidr[1] = 0x201fe00a; /* 32K L1 icache */
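
The dbgdidr values added above pack the per-core debug facility description into one word; the new define_debug_regs() in helper.c below reads the breakpoint count from bits [27:24] and the watchpoint count from bits [31:28], both stored as "number minus one" per the architecture. A minimal standalone sketch of that decoding, using the Cortex-A15 value and a local re-implementation of QEMU's extract32() (the real one lives in bitops.h):

    #include <stdio.h>

    /* Local stand-in for QEMU's extract32(). */
    static unsigned extract32(unsigned value, int start, int length)
    {
        return (value >> start) & (~0u >> (32 - length));
    }

    int main(void)
    {
        unsigned dbgdidr = 0x3515f021;  /* Cortex-A15 value from above */
        printf("breakpoints: %u\n", extract32(dbgdidr, 24, 4) + 1); /* 6 */
        printf("watchpoints: %u\n", extract32(dbgdidr, 28, 4) + 1); /* 4 */
        return 0;
    }
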
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 79205ba335..8098b8d357 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -220,6 +220,7 @@ typedef struct CPUARMState {
         uint64_t dbgbcr[16]; /* breakpoint control registers */
         uint64_t dbgwvr[16]; /* watchpoint value registers */
         uint64_t dbgwcr[16]; /* watchpoint control registers */
+        uint64_t mdscr_el1;
         /* If the counter is enabled, this stores the last time the counter
          * was reset. Otherwise it stores the counter value
          */
@@ -411,7 +412,13 @@ int arm_cpu_handle_mmu_fault(CPUState *cpu, vaddr address, int rw,
 #define CPSR_E (1U << 9)
 #define CPSR_IT_2_7 (0xfc00U)
 #define CPSR_GE (0xfU << 16)
-#define CPSR_RESERVED (0xfU << 20)
+#define CPSR_IL (1U << 20)
+/* Note that the RESERVED bits include bit 21, which is PSTATE_SS in
+ * an AArch64 SPSR but RES0 in AArch32 SPSR and CPSR. In QEMU we use
+ * env->uncached_cpsr bit 21 to store PSTATE.SS when executing in AArch32,
+ * where it is live state but not accessible to the AArch32 code.
+ */
+#define CPSR_RESERVED (0x7U << 21)
 #define CPSR_J (1U << 24)
 #define CPSR_IT_0_1 (3U << 25)
 #define CPSR_Q (1U << 27)
@@ -428,7 +435,9 @@ int arm_cpu_handle_mmu_fault(CPUState *cpu, vaddr address, int rw,
 /* Bits writable in user mode.  */
 #define CPSR_USER (CPSR_NZCV | CPSR_Q | CPSR_GE)
 /* Execution state bits.  MRS read as zero, MSR writes ignored.  */
-#define CPSR_EXEC (CPSR_T | CPSR_IT | CPSR_J)
+#define CPSR_EXEC (CPSR_T | CPSR_IT | CPSR_J | CPSR_IL)
+/* Mask of bits which may be set by an exception return copying them from SPSR */
+#define CPSR_ERET_MASK (~CPSR_RESERVED)
 
 #define TTBCR_N      (7U << 0) /* TTBCR.EAE==0 */
 #define TTBCR_T0SZ   (7U << 0) /* TTBCR.EAE==1 */
@@ -1111,6 +1120,66 @@ static inline int cpu_mmu_index (CPUARMState *env)
     return arm_current_pl(env);
 }
 
+/* Return the Exception Level targeted by debug exceptions;
+ * currently always EL1 since we don't implement EL2 or EL3.
+ */
+static inline int arm_debug_target_el(CPUARMState *env)
+{
+    return 1;
+}
+
+static inline bool aa64_generate_debug_exceptions(CPUARMState *env)
+{
+    if (arm_current_pl(env) == arm_debug_target_el(env)) {
+        if ((extract32(env->cp15.mdscr_el1, 13, 1) == 0)
+            || (env->daif & PSTATE_D)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static inline bool aa32_generate_debug_exceptions(CPUARMState *env)
+{
+    if (arm_current_pl(env) == 0 && arm_el_is_aa64(env, 1)) {
+        return aa64_generate_debug_exceptions(env);
+    }
+    return arm_current_pl(env) != 2;
+}
+
+/* Return true if debugging exceptions are currently enabled.
+ * This corresponds to what in ARM ARM pseudocode would be
+ *    if UsingAArch32() then
+ *        return AArch32.GenerateDebugExceptions()
+ *    else
+ *        return AArch64.GenerateDebugExceptions()
+ * We choose to push the if() down into this function for clarity,
+ * since the pseudocode has it at all callsites except for the one in
+ * CheckSoftwareStep(), where it is elided because both branches would
+ * always return the same value.
+ *
+ * Parts of the pseudocode relating to EL2 and EL3 are omitted because we
+ * don't yet implement those exception levels or their associated trap bits.
+ */
+static inline bool arm_generate_debug_exceptions(CPUARMState *env)
+{
+    if (env->aarch64) {
+        return aa64_generate_debug_exceptions(env);
+    } else {
+        return aa32_generate_debug_exceptions(env);
+    }
+}
+
+/* Is single-stepping active? (Note that the "is EL_D AArch64?" check
+ * implicitly means this always returns false in pre-v8 CPUs.)
+ */
+static inline bool arm_singlestep_active(CPUARMState *env)
+{
+    return extract32(env->cp15.mdscr_el1, 0, 1)
+        && arm_el_is_aa64(env, arm_debug_target_el(env))
+        && arm_generate_debug_exceptions(env);
+}
+
 #include "exec/cpu-all.h"
 
 /* Bit usage in the TB flags field: bit 31 indicates whether we are
@@ -1136,12 +1205,20 @@ static inline int cpu_mmu_index (CPUARMState *env)
 #define ARM_TBFLAG_BSWAP_CODE_MASK  (1 << ARM_TBFLAG_BSWAP_CODE_SHIFT)
 #define ARM_TBFLAG_CPACR_FPEN_SHIFT 17
 #define ARM_TBFLAG_CPACR_FPEN_MASK  (1 << ARM_TBFLAG_CPACR_FPEN_SHIFT)
+#define ARM_TBFLAG_SS_ACTIVE_SHIFT 18
+#define ARM_TBFLAG_SS_ACTIVE_MASK (1 << ARM_TBFLAG_SS_ACTIVE_SHIFT)
+#define ARM_TBFLAG_PSTATE_SS_SHIFT 19
+#define ARM_TBFLAG_PSTATE_SS_MASK (1 << ARM_TBFLAG_PSTATE_SS_SHIFT)
 
 /* Bit usage when in AArch64 state */
 #define ARM_TBFLAG_AA64_EL_SHIFT    0
 #define ARM_TBFLAG_AA64_EL_MASK     (0x3 << ARM_TBFLAG_AA64_EL_SHIFT)
 #define ARM_TBFLAG_AA64_FPEN_SHIFT  2
 #define ARM_TBFLAG_AA64_FPEN_MASK   (1 << ARM_TBFLAG_AA64_FPEN_SHIFT)
+#define ARM_TBFLAG_AA64_SS_ACTIVE_SHIFT 3
+#define ARM_TBFLAG_AA64_SS_ACTIVE_MASK (1 << ARM_TBFLAG_AA64_SS_ACTIVE_SHIFT)
+#define ARM_TBFLAG_AA64_PSTATE_SS_SHIFT 4
+#define ARM_TBFLAG_AA64_PSTATE_SS_MASK (1 << ARM_TBFLAG_AA64_PSTATE_SS_SHIFT)
 
 /* some convenience accessor macros */
 #define ARM_TBFLAG_AARCH64_STATE(F) \
@@ -1162,10 +1239,18 @@ static inline int cpu_mmu_index (CPUARMState *env)
     (((F) & ARM_TBFLAG_BSWAP_CODE_MASK) >> ARM_TBFLAG_BSWAP_CODE_SHIFT)
 #define ARM_TBFLAG_CPACR_FPEN(F) \
     (((F) & ARM_TBFLAG_CPACR_FPEN_MASK) >> ARM_TBFLAG_CPACR_FPEN_SHIFT)
+#define ARM_TBFLAG_SS_ACTIVE(F) \
+    (((F) & ARM_TBFLAG_SS_ACTIVE_MASK) >> ARM_TBFLAG_SS_ACTIVE_SHIFT)
+#define ARM_TBFLAG_PSTATE_SS(F) \
+    (((F) & ARM_TBFLAG_PSTATE_SS_MASK) >> ARM_TBFLAG_PSTATE_SS_SHIFT)
 #define ARM_TBFLAG_AA64_EL(F) \
     (((F) & ARM_TBFLAG_AA64_EL_MASK) >> ARM_TBFLAG_AA64_EL_SHIFT)
 #define ARM_TBFLAG_AA64_FPEN(F) \
     (((F) & ARM_TBFLAG_AA64_FPEN_MASK) >> ARM_TBFLAG_AA64_FPEN_SHIFT)
+#define ARM_TBFLAG_AA64_SS_ACTIVE(F) \
+    (((F) & ARM_TBFLAG_AA64_SS_ACTIVE_MASK) >> ARM_TBFLAG_AA64_SS_ACTIVE_SHIFT)
+#define ARM_TBFLAG_AA64_PSTATE_SS(F) \
+    (((F) & ARM_TBFLAG_AA64_PSTATE_SS_MASK) >> ARM_TBFLAG_AA64_PSTATE_SS_SHIFT)
 
 static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
                                         target_ulong *cs_base, int *flags)
@@ -1179,6 +1264,19 @@ static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
         if (fpen == 3 || (fpen == 1 && arm_current_pl(env) != 0)) {
             *flags |= ARM_TBFLAG_AA64_FPEN_MASK;
         }
+        /* The SS_ACTIVE and PSTATE_SS bits correspond to the state machine
+         * states defined in the ARM ARM for software singlestep:
+         *  SS_ACTIVE   PSTATE.SS   State
+         *     0            x       Inactive (the TB flag for SS is always 0)
+         *     1            0       Active-pending
+         *     1            1       Active-not-pending
+         */
+        if (arm_singlestep_active(env)) {
+            *flags |= ARM_TBFLAG_AA64_SS_ACTIVE_MASK;
+            if (env->pstate & PSTATE_SS) {
+                *flags |= ARM_TBFLAG_AA64_PSTATE_SS_MASK;
+            }
+        }
     } else {
         int privmode;
         *pc = env->regs[15];
@@ -1202,6 +1300,19 @@ static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
         if (fpen == 3 || (fpen == 1 && arm_current_pl(env) != 0)) {
             *flags |= ARM_TBFLAG_CPACR_FPEN_MASK;
         }
+        /* The SS_ACTIVE and PSTATE_SS bits correspond to the state machine
+         * states defined in the ARM ARM for software singlestep:
+         *  SS_ACTIVE   PSTATE.SS   State
+         *     0            x       Inactive (the TB flag for SS is always 0)
+         *     1            0       Active-pending
+         *     1            1       Active-not-pending
+         */
+        if (arm_singlestep_active(env)) {
+            *flags |= ARM_TBFLAG_SS_ACTIVE_MASK;
+            if (env->uncached_cpsr & PSTATE_SS) {
+                *flags |= ARM_TBFLAG_PSTATE_SS_MASK;
+            }
+        }
     }
 
     *cs_base = 0;
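
The SS_ACTIVE/PSTATE_SS flag pairs computed above, in both the AArch64 and AArch32 arms of cpu_get_tb_cpu_state(), encode the ARM ARM software-step state machine. A toy illustration of the mapping, using illustrative names rather than QEMU API:

    #include <stdbool.h>
    #include <stdio.h>

    /* Mirrors the comment tables above: the two TB flags name one of the
     * three ARM ARM software-step states.
     */
    static const char *ss_state(bool ss_active, bool pstate_ss)
    {
        if (!ss_active) {
            return "Inactive";
        }
        return pstate_ss ? "Active-not-pending" : "Active-pending";
    }

    int main(void)
    {
        printf("%s\n", ss_state(false, false)); /* Inactive */
        printf("%s\n", ss_state(true, true));   /* Active-not-pending */
        printf("%s\n", ss_state(true, false));  /* Active-pending */
        return 0;
    }
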
diff --git a/target-arm/cpu64.c b/target-arm/cpu64.c
index 8b2081c246..38d2b8445a 100644
--- a/target-arm/cpu64.c
+++ b/target-arm/cpu64.c
@@ -127,6 +127,7 @@ static void aarch64_a57_initfn(Object *obj)
     cpu->id_aa64dfr0 = 0x10305106;
     cpu->id_aa64isar0 = 0x00010000;
     cpu->id_aa64mmfr0 = 0x00001124;
+    cpu->dbgdidr = 0x3516d000;
     cpu->clidr = 0x0a200023;
     cpu->ccsidr[0] = 0x701fe00a; /* 32KB L1 dcache */
     cpu->ccsidr[1] = 0x201fe012; /* 48KB L1 icache */
diff --git a/target-arm/helper.c b/target-arm/helper.c
index f630d96306..2a77c97c7b 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -389,12 +389,6 @@ static void tlbimvaa_write(CPUARMState *env, const ARMCPRegInfo *ri,
 }
 
 static const ARMCPRegInfo cp_reginfo[] = {
-    /* DBGDIDR: just RAZ. In particular this means the "debug architecture
-     * version" bits will read as a reserved value, which should cause
-     * Linux to not try to use the debug hardware.
-     */
-    { .name = "DBGDIDR", .cp = 14, .crn = 0, .crm = 0, .opc1 = 0, .opc2 = 0,
-      .access = PL0_R, .type = ARM_CP_CONST, .resetvalue = 0 },
     { .name = "FCSEIDR", .cp = 15, .crn = 13, .crm = 0, .opc1 = 0, .opc2 = 0,
       .access = PL1_RW, .fieldoffset = offsetof(CPUARMState, cp15.c13_fcse),
       .resetvalue = 0, .writefn = fcse_write, .raw_writefn = raw_write, },
@@ -471,6 +465,13 @@ static const ARMCPRegInfo not_v7_cp_reginfo[] = {
     { .name = "DUMMY", .cp = 15, .crn = 0, .crm = 0, .opc1 = 1, .opc2 = CP_ANY,
       .access = PL1_R, .type = ARM_CP_CONST | ARM_CP_NO_MIGRATE,
       .resetvalue = 0 },
+    /* We don't implement pre-v7 debug but most CPUs had at least a DBGDIDR;
+     * implementing it as RAZ means the "debug architecture version" bits
+     * will read as a reserved value, which should cause Linux to not try
+     * to use the debug hardware.
+     */
+    { .name = "DBGDIDR", .cp = 14, .crn = 0, .crm = 0, .opc1 = 0, .opc2 = 0,
+      .access = PL0_R, .type = ARM_CP_CONST, .resetvalue = 0 },
     REGINFO_SENTINEL
 };
 
@@ -712,13 +713,6 @@ static uint64_t isr_read(CPUARMState *env, const ARMCPRegInfo *ri)
 }
 
 static const ARMCPRegInfo v7_cp_reginfo[] = {
-    /* DBGDRAR, DBGDSAR: always RAZ since we don't implement memory mapped
-     * debug components
-     */
-    { .name = "DBGDRAR", .cp = 14, .crn = 1, .crm = 0, .opc1 = 0, .opc2 = 0,
-      .access = PL0_R, .type = ARM_CP_CONST, .resetvalue = 0 },
-    { .name = "DBGDSAR", .cp = 14, .crn = 2, .crm = 0, .opc1 = 0, .opc2 = 0,
-      .access = PL0_R, .type = ARM_CP_CONST, .resetvalue = 0 },
     /* the old v6 WFI, UNPREDICTABLE in v7 but we choose to NOP */
     { .name = "NOP", .cp = 15, .crn = 7, .crm = 0, .opc1 = 0, .opc2 = 4,
       .access = PL1_W, .type = ARM_CP_NOP },
@@ -1734,11 +1728,6 @@ static const ARMCPRegInfo lpae_cp_reginfo[] = {
     { .name = "AMAIR1", .cp = 15, .crn = 10, .crm = 3, .opc1 = 0, .opc2 = 1,
       .access = PL1_RW, .type = ARM_CP_CONST | ARM_CP_OVERRIDE,
       .resetvalue = 0 },
-    /* 64 bit access versions of the (dummy) debug registers */
-    { .name = "DBGDRAR", .cp = 14, .crm = 1, .opc1 = 0,
-      .access = PL0_R, .type = ARM_CP_CONST|ARM_CP_64BIT, .resetvalue = 0 },
-    { .name = "DBGDSAR", .cp = 14, .crm = 2, .opc1 = 0,
-      .access = PL0_R, .type = ARM_CP_CONST|ARM_CP_64BIT, .resetvalue = 0 },
     { .name = "PAR", .cp = 15, .crm = 7, .opc1 = 0,
       .access = PL1_RW, .type = ARM_CP_64BIT,
       .fieldoffset = offsetof(CPUARMState, cp15.par_el1), .resetvalue = 0 },
@@ -2083,16 +2072,6 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
       .opc1 = 0, .crn = 3, .crm = 0, .opc2 = 0,
       .access = PL1_RW, .fieldoffset = offsetof(CPUARMState, cp15.c3),
       .resetvalue = 0, .writefn = dacr_write, .raw_writefn = raw_write, },
-    /* Dummy implementation of monitor debug system control register:
-     * we don't support debug.
-     */
-    { .name = "MDSCR_EL1", .state = ARM_CP_STATE_AA64,
-      .opc0 = 2, .opc1 = 0, .crn = 0, .crm = 2, .opc2 = 2,
-      .access = PL1_RW, .type = ARM_CP_CONST, .resetvalue = 0 },
-    /* We define a dummy WI OSLAR_EL1, because Linux writes to it. */
-    { .name = "OSLAR_EL1", .state = ARM_CP_STATE_AA64,
-      .opc0 = 2, .opc1 = 0, .crn = 1, .crm = 0, .opc2 = 4,
-      .access = PL1_W, .type = ARM_CP_NOP },
     { .name = "ELR_EL1", .state = ARM_CP_STATE_AA64,
       .type = ARM_CP_NO_MIGRATE,
       .opc0 = 3, .opc1 = 0, .crn = 4, .crm = 0, .opc2 = 1,
@@ -2206,29 +2185,98 @@ static CPAccessResult ctr_el0_access(CPUARMState *env, const ARMCPRegInfo *ri)
     return CP_ACCESS_OK;
 }
 
-static void define_aarch64_debug_regs(ARMCPU *cpu)
+static const ARMCPRegInfo debug_cp_reginfo[] = {
+    /* DBGDRAR, DBGDSAR: always RAZ since we don't implement memory mapped
+     * debug components. The AArch64 version of DBGDRAR is named MDRAR_EL1;
+     * unlike DBGDRAR it is never accessible from EL0.
+     * DBGDSAR is deprecated and must RAZ from v8 anyway, so it has no AArch64
+     * accessor.
+     */
+    { .name = "DBGDRAR", .cp = 14, .crn = 1, .crm = 0, .opc1 = 0, .opc2 = 0,
+      .access = PL0_R, .type = ARM_CP_CONST, .resetvalue = 0 },
+    { .name = "MDRAR_EL1", .state = ARM_CP_STATE_AA64,
+      .opc0 = 2, .opc1 = 0, .crn = 1, .crm = 0, .opc2 = 0,
+      .access = PL1_R, .type = ARM_CP_CONST, .resetvalue = 0 },
+    { .name = "DBGDSAR", .cp = 14, .crn = 2, .crm = 0, .opc1 = 0, .opc2 = 0,
+      .access = PL0_R, .type = ARM_CP_CONST, .resetvalue = 0 },
+    /* Dummy implementation of monitor debug system control register:
+     * we don't support debug. (The 32-bit alias is DBGDSCRext.)
+     */
+    { .name = "MDSCR_EL1", .state = ARM_CP_STATE_BOTH,
+      .cp = 14, .opc0 = 2, .opc1 = 0, .crn = 0, .crm = 2, .opc2 = 2,
+      .access = PL1_RW,
+      .fieldoffset = offsetof(CPUARMState, cp15.mdscr_el1),
+      .resetvalue = 0 },
+    /* We define a dummy WI OSLAR_EL1, because Linux writes to it. */
+    { .name = "OSLAR_EL1", .state = ARM_CP_STATE_BOTH,
+      .cp = 14, .opc0 = 2, .opc1 = 0, .crn = 1, .crm = 0, .opc2 = 4,
+      .access = PL1_W, .type = ARM_CP_NOP },
+    REGINFO_SENTINEL
+};
+
+static const ARMCPRegInfo debug_lpae_cp_reginfo[] = {
+    /* 64 bit access versions of the (dummy) debug registers */
+    { .name = "DBGDRAR", .cp = 14, .crm = 1, .opc1 = 0,
+      .access = PL0_R, .type = ARM_CP_CONST|ARM_CP_64BIT, .resetvalue = 0 },
+    { .name = "DBGDSAR", .cp = 14, .crm = 2, .opc1 = 0,
+      .access = PL0_R, .type = ARM_CP_CONST|ARM_CP_64BIT, .resetvalue = 0 },
+    REGINFO_SENTINEL
+};
+
+static void define_debug_regs(ARMCPU *cpu)
 {
-    /* Define breakpoint and watchpoint registers. These do nothing
-     * but read as written, for now.
+    /* Define v7 and v8 architectural debug registers.
+     * These are just dummy implementations for now.
      */
     int i;
+    int wrps, brps;
+    ARMCPRegInfo dbgdidr = {
+        .name = "DBGDIDR", .cp = 14, .crn = 0, .crm = 0, .opc1 = 0, .opc2 = 0,
+        .access = PL0_R, .type = ARM_CP_CONST, .resetvalue = cpu->dbgdidr,
+    };
 
-    for (i = 0; i < 16; i++) {
+    brps = extract32(cpu->dbgdidr, 24, 4);
+    wrps = extract32(cpu->dbgdidr, 28, 4);
+
+    /* The DBGDIDR and ID_AA64DFR0_EL1 define various properties
+     * of the debug registers such as number of breakpoints;
+     * check that if they both exist then they agree.
+     */
+    if (arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
+        assert(extract32(cpu->id_aa64dfr0, 12, 4) == brps);
+        assert(extract32(cpu->id_aa64dfr0, 20, 4) == wrps);
+    }
+
+    define_one_arm_cp_reg(cpu, &dbgdidr);
+    define_arm_cp_regs(cpu, debug_cp_reginfo);
+
+    if (arm_feature(&cpu->env, ARM_FEATURE_LPAE)) {
+        define_arm_cp_regs(cpu, debug_lpae_cp_reginfo);
+    }
+
+    for (i = 0; i < brps + 1; i++) {
         ARMCPRegInfo dbgregs[] = {
-            { .name = "DBGBVR", .state = ARM_CP_STATE_AA64,
-              .opc0 = 2, .opc1 = 0, .crn = 0, .crm = i, .opc2 = 4,
+            { .name = "DBGBVR", .state = ARM_CP_STATE_BOTH,
+              .cp = 14, .opc0 = 2, .opc1 = 0, .crn = 0, .crm = i, .opc2 = 4,
               .access = PL1_RW,
               .fieldoffset = offsetof(CPUARMState, cp15.dbgbvr[i]) },
-            { .name = "DBGBCR", .state = ARM_CP_STATE_AA64,
-              .opc0 = 2, .opc1 = 0, .crn = 0, .crm = i, .opc2 = 5,
+            { .name = "DBGBCR", .state = ARM_CP_STATE_BOTH,
+              .cp = 14, .opc0 = 2, .opc1 = 0, .crn = 0, .crm = i, .opc2 = 5,
               .access = PL1_RW,
               .fieldoffset = offsetof(CPUARMState, cp15.dbgbcr[i]) },
-            { .name = "DBGWVR", .state = ARM_CP_STATE_AA64,
-              .opc0 = 2, .opc1 = 0, .crn = 0, .crm = i, .opc2 = 6,
+            REGINFO_SENTINEL
+        };
+        define_arm_cp_regs(cpu, dbgregs);
+    }
+
+    for (i = 0; i < wrps + 1; i++) {
+        ARMCPRegInfo dbgregs[] = {
+            { .name = "DBGWVR", .state = ARM_CP_STATE_BOTH,
+              .cp = 14, .opc0 = 2, .opc1 = 0, .crn = 0, .crm = i, .opc2 = 6,
               .access = PL1_RW,
               .fieldoffset = offsetof(CPUARMState, cp15.dbgwvr[i]) },
-            { .name = "DBGWCR", .state = ARM_CP_STATE_AA64,
-              .opc0 = 2, .opc1 = 0, .crn = 0, .crm = i, .opc2 = 7,
+            { .name = "DBGWCR", .state = ARM_CP_STATE_BOTH,
+              .cp = 14, .opc0 = 2, .opc1 = 0, .crn = 0, .crm = i, .opc2 = 7,
               .access = PL1_RW,
               .fieldoffset = offsetof(CPUARMState, cp15.dbgwcr[i]) },
             REGINFO_SENTINEL
@@ -2353,6 +2401,7 @@ void register_cp_regs_for_features(ARMCPU *cpu)
         };
         define_one_arm_cp_reg(cpu, &clidr);
         define_arm_cp_regs(cpu, v7_cp_reginfo);
+        define_debug_regs(cpu);
     } else {
         define_arm_cp_regs(cpu, not_v7_cp_reginfo);
     }
@@ -2426,7 +2475,6 @@ void register_cp_regs_for_features(ARMCPU *cpu)
         define_one_arm_cp_reg(cpu, &rvbar);
         define_arm_cp_regs(cpu, v8_idregs);
         define_arm_cp_regs(cpu, v8_cp_reginfo);
-        define_aarch64_debug_regs(cpu);
     }
     if (arm_feature(env, ARM_FEATURE_EL2)) {
         define_arm_cp_regs(cpu, v8_el2_cp_reginfo);
@@ -2779,9 +2827,11 @@ static void add_cpreg_to_hashtable(ARMCPU *cpu, const ARMCPRegInfo *r,
         /* The AArch32 view of a shared register sees the lower 32 bits
          * of a 64 bit backing field. It is not migratable as the AArch64
          * view handles that. AArch64 also handles reset.
-         * We assume it is a cp15 register.
+         * We assume it is a cp15 register if the .cp field is left unset.
          */
-        r2->cp = 15;
+        if (r2->cp == 0) {
+            r2->cp = 15;
+        }
         r2->type |= ARM_CP_NO_MIGRATE;
         r2->resetfn = arm_cp_reset_ignore;
 #ifdef HOST_WORDS_BIGENDIAN
@@ -2794,8 +2844,11 @@ static void add_cpreg_to_hashtable(ARMCPU *cpu, const ARMCPRegInfo *r,
         /* To allow abbreviation of ARMCPRegInfo
          * definitions, we treat cp == 0 as equivalent to
          * the value for "standard guest-visible sysreg".
+         * STATE_BOTH definitions are also always "standard
+         * sysreg" in their AArch64 view (the .cp value may
+         * be non-zero for the benefit of the AArch32 view).
          */
-        if (r->cp == 0) {
+        if (r->cp == 0 || r->state == ARM_CP_STATE_BOTH) {
             r2->cp = CP_REG_ARM64_SYSREG_CP;
         }
         *key = ENCODE_AA64_CP_REG(r2->cp, r2->crn, crm,
@@ -3499,6 +3552,10 @@ void arm_cpu_do_interrupt(CPUState *cs)
         addr += env->cp15.vbar_el[1];
     }
     switch_mode (env, new_mode);
+    /* For exceptions taken to AArch32 we must clear the SS bit in both
+     * PSTATE and in the old-state value we save to SPSR_<mode>, so zero it now.
+     */
+    env->uncached_cpsr &= ~PSTATE_SS;
     env->spsr = cpsr_read(env);
     /* Clear IT bits.  */
     env->condexec_bits = 0;
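
The asserts added in define_debug_regs() can be spot-checked by hand against the Cortex-A57 numbers from this series (dbgdidr 0x3516d000 in cpu64.c, with id_aa64dfr0 0x10305106 visible as a context line there): both encodings agree on 6 breakpoints and 4 watchpoints. A standalone version of the same check:

    #include <assert.h>

    /* Stand-in for QEMU's extract32(); values copied from cpu64.c below. */
    static unsigned extract32(unsigned value, int start, int length)
    {
        return (value >> start) & (~0u >> (32 - length));
    }

    int main(void)
    {
        unsigned dbgdidr = 0x3516d000;
        unsigned id_aa64dfr0 = 0x10305106;
        assert(extract32(id_aa64dfr0, 12, 4) == extract32(dbgdidr, 24, 4)); /* 5 -> 6 breakpoints */
        assert(extract32(id_aa64dfr0, 20, 4) == extract32(dbgdidr, 28, 4)); /* 3 -> 4 watchpoints */
        return 0;
    }
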
diff --git a/target-arm/helper.h b/target-arm/helper.h
index facfcd2c11..1d7003b70a 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -64,6 +64,7 @@ DEF_HELPER_3(set_cp_reg64, void, env, ptr, i64)
 DEF_HELPER_2(get_cp_reg64, i64, env, ptr)
 
 DEF_HELPER_3(msr_i_pstate, void, env, i32, i32)
+DEF_HELPER_1(clear_pstate_ss, void, env)
 DEF_HELPER_1(exception_return, void, env)
 
 DEF_HELPER_2(get_r13_banked, i32, env, i32)
diff --git a/target-arm/internals.h b/target-arm/internals.h
index 08fa69757d..53c2e3cf3e 100644
--- a/target-arm/internals.h
+++ b/target-arm/internals.h
@@ -290,4 +290,10 @@ static inline uint32_t syn_data_abort(int same_el, int ea, int cm, int s1ptw,
         | (ea << 9) | (cm << 8) | (s1ptw << 7) | (wnr << 6) | fsc;
 }
 
+static inline uint32_t syn_swstep(int same_el, int isv, int ex)
+{
+    return (EC_SOFTWARESTEP << ARM_EL_EC_SHIFT) | (same_el << ARM_EL_EC_SHIFT)
+        | (isv << 24) | (ex << 6) | 0x22;
+}
+
 #endif
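
Assuming QEMU's usual syndrome constants (ARM_EL_EC_SHIFT == 26 and EC_SOFTWARESTEP == 0x32, defined earlier in internals.h but not shown in this hunk), syn_swstep() lays the ESR out as: EC in bits [31:26] (the same_el term bumps 0x32 to 0x33), ISV in bit 24, EX in bit 6, and the 0x22 debug-exception fault status code in bits [5:0]. Two worked values under those assumptions:

    #include <assert.h>

    int main(void)
    {
        /* completed step, not a load-exclusive, taken from a lower EL */
        assert(((0x32u << 26) | (1u << 24) | 0x22) == 0xc9000022);
        /* same-EL variant: the extra bit-26 term turns the EC into 0x33 */
        assert(((0x32u << 26) | (1u << 26) | (1u << 24) | 0x22) == 0xcd000022);
        return 0;
    }
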
diff --git a/target-arm/kvm-consts.h b/target-arm/kvm-consts.h
index 6009a33f10..091c1267d6 100644
--- a/target-arm/kvm-consts.h
+++ b/target-arm/kvm-consts.h
@@ -17,6 +17,7 @@
 #ifdef CONFIG_KVM
 #include "qemu/compiler.h"
 #include <linux/kvm.h>
+#include <linux/psci.h>
 
 #define MISMATCH_CHECK(X, Y) QEMU_BUILD_BUG_ON(X != Y)
 
@@ -38,17 +39,43 @@ MISMATCH_CHECK(CP_REG_SIZE_U64, KVM_REG_SIZE_U64)
 MISMATCH_CHECK(CP_REG_ARM, KVM_REG_ARM)
 MISMATCH_CHECK(CP_REG_ARCH_MASK, KVM_REG_ARCH_MASK)
 
-#define PSCI_FN_BASE 0x95c1ba5e
-#define PSCI_FN(n) (PSCI_FN_BASE + (n))
-#define PSCI_FN_CPU_SUSPEND PSCI_FN(0)
-#define PSCI_FN_CPU_OFF PSCI_FN(1)
-#define PSCI_FN_CPU_ON PSCI_FN(2)
-#define PSCI_FN_MIGRATE PSCI_FN(3)
-
-MISMATCH_CHECK(PSCI_FN_CPU_SUSPEND, KVM_PSCI_FN_CPU_SUSPEND)
-MISMATCH_CHECK(PSCI_FN_CPU_OFF, KVM_PSCI_FN_CPU_OFF)
-MISMATCH_CHECK(PSCI_FN_CPU_ON, KVM_PSCI_FN_CPU_ON)
-MISMATCH_CHECK(PSCI_FN_MIGRATE, KVM_PSCI_FN_MIGRATE)
+#define QEMU_PSCI_0_1_FN_BASE 0x95c1ba5e
+#define QEMU_PSCI_0_1_FN(n) (QEMU_PSCI_0_1_FN_BASE + (n))
+#define QEMU_PSCI_0_1_FN_CPU_SUSPEND QEMU_PSCI_0_1_FN(0)
+#define QEMU_PSCI_0_1_FN_CPU_OFF QEMU_PSCI_0_1_FN(1)
+#define QEMU_PSCI_0_1_FN_CPU_ON QEMU_PSCI_0_1_FN(2)
+#define QEMU_PSCI_0_1_FN_MIGRATE QEMU_PSCI_0_1_FN(3)
+
+MISMATCH_CHECK(QEMU_PSCI_0_1_FN_CPU_SUSPEND, KVM_PSCI_FN_CPU_SUSPEND)
+MISMATCH_CHECK(QEMU_PSCI_0_1_FN_CPU_OFF, KVM_PSCI_FN_CPU_OFF)
+MISMATCH_CHECK(QEMU_PSCI_0_1_FN_CPU_ON, KVM_PSCI_FN_CPU_ON)
+MISMATCH_CHECK(QEMU_PSCI_0_1_FN_MIGRATE, KVM_PSCI_FN_MIGRATE)
+
+#define QEMU_PSCI_0_2_FN_BASE 0x84000000
+#define QEMU_PSCI_0_2_FN(n) (QEMU_PSCI_0_2_FN_BASE + (n))
+
+#define QEMU_PSCI_0_2_64BIT 0x40000000
+#define QEMU_PSCI_0_2_FN64_BASE \
+        (QEMU_PSCI_0_2_FN_BASE + QEMU_PSCI_0_2_64BIT)
+#define QEMU_PSCI_0_2_FN64(n) (QEMU_PSCI_0_2_FN64_BASE + (n))
+
+#define QEMU_PSCI_0_2_FN_CPU_SUSPEND QEMU_PSCI_0_2_FN(1)
+#define QEMU_PSCI_0_2_FN_CPU_OFF QEMU_PSCI_0_2_FN(2)
+#define QEMU_PSCI_0_2_FN_CPU_ON QEMU_PSCI_0_2_FN(3)
+#define QEMU_PSCI_0_2_FN_MIGRATE QEMU_PSCI_0_2_FN(5)
+
+#define QEMU_PSCI_0_2_FN64_CPU_SUSPEND QEMU_PSCI_0_2_FN64(1)
+#define QEMU_PSCI_0_2_FN64_CPU_OFF QEMU_PSCI_0_2_FN64(2)
+#define QEMU_PSCI_0_2_FN64_CPU_ON QEMU_PSCI_0_2_FN64(3)
+#define QEMU_PSCI_0_2_FN64_MIGRATE QEMU_PSCI_0_2_FN64(5)
+
+MISMATCH_CHECK(QEMU_PSCI_0_2_FN_CPU_SUSPEND, PSCI_0_2_FN_CPU_SUSPEND)
+MISMATCH_CHECK(QEMU_PSCI_0_2_FN_CPU_OFF, PSCI_0_2_FN_CPU_OFF)
+MISMATCH_CHECK(QEMU_PSCI_0_2_FN_CPU_ON, PSCI_0_2_FN_CPU_ON)
+MISMATCH_CHECK(QEMU_PSCI_0_2_FN_MIGRATE, PSCI_0_2_FN_MIGRATE)
+MISMATCH_CHECK(QEMU_PSCI_0_2_FN64_CPU_SUSPEND, PSCI_0_2_FN64_CPU_SUSPEND)
+MISMATCH_CHECK(QEMU_PSCI_0_2_FN64_CPU_ON, PSCI_0_2_FN64_CPU_ON)
+MISMATCH_CHECK(QEMU_PSCI_0_2_FN64_MIGRATE, PSCI_0_2_FN64_MIGRATE)
 
 /* Note that KVM uses overlapping values for AArch32 and AArch64
  * target CPU numbers. AArch32 targets:
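
The PSCI 0.2 constants above follow the SMC Calling Convention: 32-bit ("SMC32") standard service calls live at 0x84000000 plus a function number, and the 64-bit ("SMC64") variants additionally set bit 30. Spot checks of the resulting IDs:

    #include <assert.h>

    int main(void)
    {
        assert(0x84000000u + 3 == 0x84000003u);               /* QEMU_PSCI_0_2_FN_CPU_ON */
        assert(0x84000000u + 0x40000000u + 3 == 0xc4000003u); /* QEMU_PSCI_0_2_FN64_CPU_ON */
        assert(0x84000000u + 5 == 0x84000005u);               /* QEMU_PSCI_0_2_FN_MIGRATE */
        return 0;
    }
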
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index 25ad902e04..fe40358c96 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -258,7 +258,7 @@ void HELPER(exception_with_syndrome)(CPUARMState *env, uint32_t excp,
 
 uint32_t HELPER(cpsr_read)(CPUARMState *env)
 {
-    return cpsr_read(env) & ~CPSR_EXEC;
+    return cpsr_read(env) & ~(CPSR_EXEC | CPSR_RESERVED);
 }
 
 void HELPER(cpsr_write)(CPUARMState *env, uint32_t val, uint32_t mask)
@@ -369,6 +369,11 @@ void HELPER(msr_i_pstate)(CPUARMState *env, uint32_t op, uint32_t imm)
     }
 }
 
+void HELPER(clear_pstate_ss)(CPUARMState *env)
+{
+    env->pstate &= ~PSTATE_SS;
+}
+
 void HELPER(exception_return)(CPUARMState *env)
 {
     int cur_el = arm_current_pl(env);
@@ -380,12 +385,26 @@ void HELPER(exception_return)(CPUARMState *env)
 
     env->exclusive_addr = -1;
 
+    /* We must squash the PSTATE.SS bit to zero unless both of the
+     * following hold:
+     *  1. debug exceptions are currently disabled
+     *  2. singlestep will be active in the EL we return to
+     * We check 1 here and 2 after we've done the pstate/cpsr write() to
+     * transition to the EL we're going to.
+     */
+    if (arm_generate_debug_exceptions(env)) {
+        spsr &= ~PSTATE_SS;
+    }
+
     if (spsr & PSTATE_nRW) {
         /* TODO: We currently assume EL1/2/3 are running in AArch64.  */
         env->aarch64 = 0;
         new_el = 0;
         env->uncached_cpsr = 0x10;
         cpsr_write(env, spsr, ~0);
+        if (!arm_singlestep_active(env)) {
+            env->uncached_cpsr &= ~PSTATE_SS;
+        }
         for (i = 0; i < 15; i++) {
             env->regs[i] = env->xregs[i];
         }
@@ -410,6 +429,9 @@ void HELPER(exception_return)(CPUARMState *env)
         }
         env->aarch64 = 1;
         pstate_write(env, spsr);
+        if (!arm_singlestep_active(env)) {
+            env->pstate &= ~PSTATE_SS;
+        }
         aarch64_restore_sp(env, new_el);
         env->pc = env->elr_el[cur_el];
     }
@@ -429,6 +451,9 @@ illegal_return:
     spsr &= PSTATE_NZCV | PSTATE_DAIF;
     spsr |= pstate_read(env) & ~(PSTATE_NZCV | PSTATE_DAIF);
     pstate_write(env, spsr);
+    if (!arm_singlestep_active(env)) {
+        env->pstate &= ~PSTATE_SS;
+    }
 }
 
 /* ??? Flag setting arithmetic is awkward because we need to do comparisons.
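
The two-stage PSTATE.SS squashing in HELPER(exception_return) above can be condensed into a toy model (plain C, not QEMU API): the restored SS bit survives the ERET only if debug exceptions were disabled beforehand and single-step is still active in the destination EL.

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model of the ERET rule: check 1 happens before the PSTATE/CPSR
     * write and check 2 afterwards, but the combined effect is an AND.
     */
    static bool restored_ss(bool spsr_ss, bool dbg_excp_enabled_before,
                            bool ss_active_after)
    {
        if (dbg_excp_enabled_before) {
            spsr_ss = false;               /* squashed up front */
        }
        return spsr_ss && ss_active_after; /* squashed after the write */
    }

    int main(void)
    {
        printf("%d\n", restored_ss(true, false, true));  /* 1: SS survives */
        printf("%d\n", restored_ss(true, true, true));   /* 0: squashed */
        printf("%d\n", restored_ss(true, false, false)); /* 0: squashed */
        return 0;
    }
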
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index f04ca49631..8e66b6c972 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -205,10 +205,39 @@ static void gen_exception_insn(DisasContext *s, int offset, int excp,
     s->is_jmp = DISAS_EXC;
 }
 
+static void gen_ss_advance(DisasContext *s)
+{
+    /* If the singlestep state is Active-not-pending, advance to
+     * Active-pending.
+     */
+    if (s->ss_active) {
+        s->pstate_ss = 0;
+        gen_helper_clear_pstate_ss(cpu_env);
+    }
+}
+
+static void gen_step_complete_exception(DisasContext *s)
+{
+    /* We just completed a step of an insn. Move from Active-not-pending
+     * to Active-pending, and then also take the swstep exception.
+     * This corresponds to making the (IMPDEF) choice to prioritize
+     * swstep exceptions over asynchronous exceptions taken to an exception
+     * level where debug is disabled. This choice has the advantage that
+     * we do not need to maintain internal state corresponding to the
+     * ISV/EX syndrome bits between completion of the step and generation
+     * of the exception, and our syndrome information is always correct.
+     */
+    gen_ss_advance(s);
+    gen_exception(EXCP_UDEF, syn_swstep(s->ss_same_el, 1, s->is_ldex));
+    s->is_jmp = DISAS_EXC;
+}
+
 static inline bool use_goto_tb(DisasContext *s, int n, uint64_t dest)
 {
-    /* No direct tb linking with singlestep or deterministic io */
-    if (s->singlestep_enabled || (s->tb->cflags & CF_LAST_IO)) {
+    /* No direct tb linking with singlestep (either QEMU's or the ARM
+     * debug architecture kind) or deterministic io
+     */
+    if (s->singlestep_enabled || s->ss_active || (s->tb->cflags & CF_LAST_IO)) {
         return false;
     }
 
@@ -232,11 +261,14 @@ static inline void gen_goto_tb(DisasContext *s, int n, uint64_t dest)
         s->is_jmp = DISAS_TB_JUMP;
     } else {
         gen_a64_set_pc_im(dest);
-        if (s->singlestep_enabled) {
+        if (s->ss_active) {
+            gen_step_complete_exception(s);
+        } else if (s->singlestep_enabled) {
             gen_exception_internal(EXCP_DEBUG);
+        } else {
+            tcg_gen_exit_tb(0);
+            s->is_jmp = DISAS_TB_JUMP;
         }
-        tcg_gen_exit_tb(0);
-        s->is_jmp = DISAS_JUMP;
     }
 }
 
@@ -1448,6 +1480,12 @@ static void disas_exc(DisasContext *s, uint32_t insn)
             unallocated_encoding(s);
             break;
         }
+        /* For SVC, HVC and SMC we advance the single-step state
+         * machine before taking the exception. This is architecturally
+         * mandated, to ensure that single-stepping a system call
+         * instruction works properly.
+         */
+        gen_ss_advance(s);
         gen_exception_insn(s, 0, EXCP_SWI, syn_aa64_svc(imm16));
         break;
     case 1:
@@ -1456,7 +1494,7 @@ static void disas_exc(DisasContext *s, uint32_t insn)
             break;
         }
         /* BRK */
-        gen_exception_insn(s, 0, EXCP_BKPT, syn_aa64_bkpt(imm16));
+        gen_exception_insn(s, 4, EXCP_BKPT, syn_aa64_bkpt(imm16));
         break;
     case 2:
         if (op2_ll != 0) {
@@ -1728,6 +1766,7 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn)
 
     if (is_excl) {
         if (!is_store) {
+            s->is_ldex = true;
             gen_load_exclusive(s, rt, rt2, tcg_addr, size, is_pair);
         } else {
             gen_store_exclusive(s, rs, rt, rt2, tcg_addr, size, is_pair);
@@ -10868,6 +10907,26 @@ void gen_intermediate_code_internal_a64(ARMCPU *cpu,
     dc->current_pl = arm_current_pl(env);
     dc->features = env->features;
 
+    /* Single step state. The code-generation logic here is:
+     *  SS_ACTIVE == 0:
+     *   generate code with no special handling for single-stepping (except
+     *   that anything that can make us go to SS_ACTIVE == 1 must end the TB;
+     *   this happens anyway because those changes are all system register or
+     *   PSTATE writes).
+     *  SS_ACTIVE == 1, PSTATE.SS == 1: (active-not-pending)
+     *   emit code for one insn
+     *   emit code to clear PSTATE.SS
+     *   emit code to generate software step exception for completed step
+     *   end TB (as usual for having generated an exception)
+     *  SS_ACTIVE == 1, PSTATE.SS == 0: (active-pending)
+     *   emit code to generate a software step exception
+     *   end the TB
+     */
+    dc->ss_active = ARM_TBFLAG_AA64_SS_ACTIVE(tb->flags);
+    dc->pstate_ss = ARM_TBFLAG_AA64_PSTATE_SS(tb->flags);
+    dc->is_ldex = false;
+    dc->ss_same_el = (arm_debug_target_el(env) == dc->current_pl);
+
     init_tmp_a64_array(dc);
 
     next_page_start = (pc_start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
@@ -10916,6 +10975,23 @@ void gen_intermediate_code_internal_a64(ARMCPU *cpu,
             tcg_gen_debug_insn_start(dc->pc);
         }
 
+        if (dc->ss_active && !dc->pstate_ss) {
+            /* Singlestep state is Active-pending.
+             * If we're in this state at the start of a TB then either
+             *  a) we just took an exception to an EL which is being debugged
+             *     and this is the first insn in the exception handler
+             *  b) debug exceptions were masked and we just unmasked them
+             *     without changing EL (eg by clearing PSTATE.D)
+             * In either case we're going to take a swstep exception in the
+             * "did not step an insn" case, and so the syndrome ISV and EX
+             * bits should be zero.
+             */
+            assert(num_insns == 0);
+            gen_exception(EXCP_UDEF, syn_swstep(dc->ss_same_el, 0, 0));
+            dc->is_jmp = DISAS_EXC;
+            break;
+        }
+
         disas_a64_insn(env, dc);
 
         if (tcg_check_temp_count()) {
@@ -10932,6 +11008,7 @@ void gen_intermediate_code_internal_a64(ARMCPU *cpu,
     } while (!dc->is_jmp && tcg_ctx.gen_opc_ptr < gen_opc_end &&
              !cs->singlestep_enabled &&
              !singlestep &&
+             !dc->ss_active &&
              dc->pc < next_page_start &&
              num_insns < max_insns);
 
@@ -10939,7 +11016,8 @@ void gen_intermediate_code_internal_a64(ARMCPU *cpu,
         gen_io_end();
     }
 
-    if (unlikely(cs->singlestep_enabled) && dc->is_jmp != DISAS_EXC) {
+    if (unlikely(cs->singlestep_enabled || dc->ss_active)
+        && dc->is_jmp != DISAS_EXC) {
         /* Note that this means single stepping WFI doesn't halt the CPU.
          * For conditional branch insns this is harmless unreachable code as
          * gen_goto_tb() has already handled emitting the debug exception
@@ -10949,7 +11027,11 @@ void gen_intermediate_code_internal_a64(ARMCPU *cpu,
         if (dc->is_jmp != DISAS_JUMP) {
             gen_a64_set_pc_im(dc->pc);
         }
-        gen_exception_internal(EXCP_DEBUG);
+        if (cs->singlestep_enabled) {
+            gen_exception_internal(EXCP_DEBUG);
+        } else {
+            gen_step_complete_exception(dc);
+        }
     } else {
         switch (dc->is_jmp) {
         case DISAS_NEXT:
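
Taken together, the changes to gen_intermediate_code_internal_a64() implement the three-case scheme from the block comment above; a toy walk-through (plain C, not QEMU's TCG API) of what a generated TB looks like in each state:

    #include <stdbool.h>
    #include <stdio.h>

    static void translate_tb(bool ss_active, bool pstate_ss, int ninsns)
    {
        if (ss_active && !pstate_ss) {   /* Active-pending */
            printf("emit swstep exception (ISV=0); end TB\n");
            return;
        }
        for (int i = 0; i < ninsns; i++) {
            printf("emit insn %d\n", i);
            if (ss_active) {             /* Active-not-pending: one insn only */
                printf("emit clear of PSTATE.SS; emit swstep exception (ISV=1); end TB\n");
                return;
            }
        }
        printf("end TB normally\n");
    }

    int main(void)
    {
        translate_tb(false, false, 3);  /* ordinary TB with three insns */
        translate_tb(true, true, 3);    /* steps exactly one insn */
        translate_tb(true, false, 3);   /* traps before the first insn */
        return 0;
    }
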
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 40121858d6..2c0b1deaea 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -205,6 +205,33 @@ static void gen_exception(int excp, uint32_t syndrome)
     tcg_temp_free_i32(tcg_excp);
 }
 
+static void gen_ss_advance(DisasContext *s)
+{
+    /* If the singlestep state is Active-not-pending, advance to
+     * Active-pending.
+     */
+    if (s->ss_active) {
+        s->pstate_ss = 0;
+        gen_helper_clear_pstate_ss(cpu_env);
+    }
+}
+
+static void gen_step_complete_exception(DisasContext *s)
+{
+    /* We just completed a step of an insn. Move from Active-not-pending
+     * to Active-pending, and then also take the swstep exception.
+     * This corresponds to making the (IMPDEF) choice to prioritize
+     * swstep exceptions over asynchronous exceptions taken to an exception
+     * level where debug is disabled. This choice has the advantage that
+     * we do not need to maintain internal state corresponding to the
+     * ISV/EX syndrome bits between completion of the step and generation
+     * of the exception, and our syndrome information is always correct.
+     */
+    gen_ss_advance(s);
+    gen_exception(EXCP_UDEF, syn_swstep(s->ss_same_el, 1, s->is_ldex));
+    s->is_jmp = DISAS_EXC;
+}
+
 static void gen_smul_dual(TCGv_i32 a, TCGv_i32 b)
 {
     TCGv_i32 tmp1 = tcg_temp_new_i32();
@@ -3860,7 +3887,7 @@ static inline void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
 
 static inline void gen_jmp (DisasContext *s, uint32_t dest)
 {
-    if (unlikely(s->singlestep_enabled)) {
+    if (unlikely(s->singlestep_enabled || s->ss_active)) {
         /* An indirect jump so that we still trigger the debug exception.  */
         if (s->thumb)
             dest |= 1;
@@ -3908,9 +3935,10 @@ static uint32_t msr_mask(CPUARMState *env, DisasContext *s, int flags, int spsr)
         mask &= ~(CPSR_E | CPSR_GE);
     if (!arm_feature(env, ARM_FEATURE_THUMB2))
         mask &= ~CPSR_IT;
-    /* Mask out execution state bits.  */
-    if (!spsr)
-        mask &= ~CPSR_EXEC;
+    /* Mask out execution state and reserved bits.  */
+    if (!spsr) {
+        mask &= ~(CPSR_EXEC | CPSR_RESERVED);
+    }
     /* Mask out privileged bits.  */
     if (IS_USER(s))
         mask &= CPSR_USER;
@@ -3954,7 +3982,7 @@ static void gen_exception_return(DisasContext *s, TCGv_i32 pc)
     TCGv_i32 tmp;
     store_reg(s, 15, pc);
     tmp = load_cpu_field(spsr);
-    gen_set_cpsr(tmp, 0xffffffff);
+    gen_set_cpsr(tmp, CPSR_ERET_MASK);
     tcg_temp_free_i32(tmp);
     s->is_jmp = DISAS_UPDATE;
 }
@@ -3962,7 +3990,7 @@ static void gen_exception_return(DisasContext *s, TCGv_i32 pc)
 /* Generate a v6 exception return.  Marks both values as dead.  */
 static void gen_rfe(DisasContext *s, TCGv_i32 pc, TCGv_i32 cpsr)
 {
-    gen_set_cpsr(cpsr, 0xffffffff);
+    gen_set_cpsr(cpsr, CPSR_ERET_MASK);
     tcg_temp_free_i32(cpsr);
     store_reg(s, 15, pc);
     s->is_jmp = DISAS_UPDATE;
@@ -7280,6 +7308,8 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
 {
     TCGv_i32 tmp = tcg_temp_new_i32();
 
+    s->is_ldex = true;
+
     switch (size) {
     case 0:
         gen_aa32_ld8u(tmp, addr, get_mem_index(s));
@@ -8836,7 +8866,7 @@ static void disas_arm_insn(CPUARMState * env, DisasContext *s)
                 if ((insn & (1 << 22)) && !user) {
                     /* Restore CPSR from SPSR.  */
                     tmp = load_cpu_field(spsr);
-                    gen_set_cpsr(tmp, 0xffffffff);
+                    gen_set_cpsr(tmp, CPSR_ERET_MASK);
                     tcg_temp_free_i32(tmp);
                     s->is_jmp = DISAS_UPDATE;
                 }
@@ -10916,6 +10946,26 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
     dc->current_pl = arm_current_pl(env);
     dc->features = env->features;
 
+    /* Single step state. The code-generation logic here is:
+     *  SS_ACTIVE == 0:
+     *   generate code with no special handling for single-stepping (except
+     *   that anything that can make us go to SS_ACTIVE == 1 must end the TB;
+     *   this happens anyway because those changes are all system register or
+     *   PSTATE writes).
+     *  SS_ACTIVE == 1, PSTATE.SS == 1: (active-not-pending)
+     *   emit code for one insn
+     *   emit code to clear PSTATE.SS
+     *   emit code to generate software step exception for completed step
+     *   end TB (as usual for having generated an exception)
+     *  SS_ACTIVE == 1, PSTATE.SS == 0: (active-pending)
+     *   emit code to generate a software step exception
+     *   end the TB
+     */
+    dc->ss_active = ARM_TBFLAG_SS_ACTIVE(tb->flags);
+    dc->pstate_ss = ARM_TBFLAG_PSTATE_SS(tb->flags);
+    dc->is_ldex = false;
+    dc->ss_same_el = false; /* Can't be true since EL_d must be AArch64 */
+
     cpu_F0s = tcg_temp_new_i32();
     cpu_F1s = tcg_temp_new_i32();
     cpu_F0d = tcg_temp_new_i64();
@@ -11025,6 +11075,22 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
             tcg_gen_debug_insn_start(dc->pc);
         }
 
+        if (dc->ss_active && !dc->pstate_ss) {
+            /* Singlestep state is Active-pending.
+             * If we're in this state at the start of a TB then either
+             *  a) we just took an exception to an EL which is being debugged
+             *     and this is the first insn in the exception handler
+             *  b) debug exceptions were masked and we just unmasked them
+             *     without changing EL (eg by clearing PSTATE.D)
+             * In either case we're going to take a swstep exception in the
+             * "did not step an insn" case, and so the syndrome ISV and EX
+             * bits should be zero.
+             */
+            assert(num_insns == 0);
+            gen_exception(EXCP_UDEF, syn_swstep(dc->ss_same_el, 0, 0));
+            goto done_generating;
+        }
+
         if (dc->thumb) {
             disas_thumb_insn(env, dc);
             if (dc->condexec_mask) {
@@ -11057,6 +11123,7 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
     } while (!dc->is_jmp && tcg_ctx.gen_opc_ptr < gen_opc_end &&
              !cs->singlestep_enabled &&
              !singlestep &&
+             !dc->ss_active &&
              dc->pc < next_page_start &&
              num_insns < max_insns);
 
@@ -11072,12 +11139,15 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
     /* At this stage dc->condjmp will only be set when the skipped
        instruction was a conditional branch or trap, and the PC has
        already been written.  */
-    if (unlikely(cs->singlestep_enabled)) {
+    if (unlikely(cs->singlestep_enabled || dc->ss_active)) {
         /* Make sure the pc is updated, and raise a debug exception.  */
         if (dc->condjmp) {
             gen_set_condexec(dc);
             if (dc->is_jmp == DISAS_SWI) {
+                gen_ss_advance(dc);
                 gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb));
+            } else if (dc->ss_active) {
+                gen_step_complete_exception(dc);
             } else {
                 gen_exception_internal(EXCP_DEBUG);
             }
@@ -11089,7 +11159,10 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
         }
         gen_set_condexec(dc);
         if (dc->is_jmp == DISAS_SWI && !dc->condjmp) {
+            gen_ss_advance(dc);
             gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb));
+        } else if (dc->ss_active) {
+            gen_step_complete_exception(dc);
         } else {
             /* FIXME: Single stepping a WFI insn will not halt
                the CPU.  */
diff --git a/target-arm/translate.h b/target-arm/translate.h
index 31a0104b58..b90d27514d 100644
--- a/target-arm/translate.h
+++ b/target-arm/translate.h
@@ -40,6 +40,18 @@ typedef struct DisasContext {
      * that it is set at the point where we actually touch the FP regs.
      */
     bool fp_access_checked;
+    /* ARMv8 single-step state (this is distinct from the QEMU gdbstub
+     * single-step support).
+     */
+    bool ss_active;
+    bool pstate_ss;
+    /* True if the insn just emitted was a load-exclusive instruction
+     * (necessary for syndrome information for single step exceptions),
+     * ie A64 LDX*, LDAX*, A32/T32 LDREX*, LDAEX*.
+     */
+    bool is_ldex;
+    /* True if a single-step exception will be taken to the current EL */
+    bool ss_same_el;
 #define TMP_A64_MAX 16
     int tmp_a64_count;
     TCGv_i64 tmp_a64[TMP_A64_MAX];