author     Peter Maydell <peter.maydell@linaro.org>  2017-07-19 14:42:18 +0100
committer  Peter Maydell <peter.maydell@linaro.org>  2017-07-19 14:42:18 +0100
commit     a51568b78ea011e0f1e67664b8b0c6b693f8ee5a
tree       eaffda17128a3b6ad2b56486501a30d8ebb508df
parent     6d60e295ef020759a03b90724d0342012c189ba2
parent     ec2eb22ebb3b36f39755414dcbe4f99c2c0562c9
Merge remote-tracking branch 'remotes/aurel/tags/pull-target-sh4-20170718' into staging
Queued target/sh4 patches

# gpg: Signature made Tue 18 Jul 2017 22:44:25 BST
# gpg:                using RSA key 0xBA9C78061DDD8C9B
# gpg: Good signature from "Aurelien Jarno <aurelien@aurel32.net>"
# gpg:                 aka "Aurelien Jarno <aurelien@jarno.fr>"
# gpg:                 aka "Aurelien Jarno <aurel32@debian.org>"
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 7746 2642 A9EF 94FD 0F77  196D BA9C 7806 1DDD 8C9B

* remotes/aurel/tags/pull-target-sh4-20170718: (31 commits)
  target/sh4: Use tcg_gen_lookup_and_goto_ptr
  target/sh4: Implement fsrra
  target/sh4: Add missing FPSCR.PR == 0 checks
  target/sh4: Implement fpchg
  target/sh4: Introduce CHECK_SH4A
  target/sh4: Introduce CHECK_FPSCR_PR_*
  target/sh4: Tidy misc illegal insn checks
  target/sh4: Unify code for CHECK_FPU_ENABLED
  target/sh4: Unify code for CHECK_PRIVILEGED
  target/sh4: Unify code for CHECK_NOT_DELAY_SLOT
  target/sh4: Simplify 64-bit fp reg-reg move
  target/sh4: Load/store Dr as 64-bit quantities
  target/sh4: Merge DREG into fpr64 routines
  target/sh4: Eliminate unused XREG macro
  target/sh4: Hoist fp register bank selection
  target/sh4: Pass DisasContext to fpr64 routines
  target/sh4: Unify cpu_fregs into FREG
  target/sh4: Hoist register bank selection
  linux-user/sh4: Clean env->flags on signal boundaries
  linux-user/sh4: Notice gUSA regions during signal delivery
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--  linux-user/signal.c      31
-rw-r--r--  target/sh4/cpu.c          2
-rw-r--r--  target/sh4/cpu.h         24
-rw-r--r--  target/sh4/helper.h      13
-rw-r--r--  target/sh4/op_helper.c   93
-rw-r--r--  target/sh4/translate.c  941
6 files changed, 788 insertions, 316 deletions
diff --git a/linux-user/signal.c b/linux-user/signal.c
index 3d18d1b3ee..d68bd26013 100644
--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@@ -3471,6 +3471,30 @@ static abi_ulong get_sigframe(struct target_sigaction *ka,
     return (sp - frame_size) & -8ul;
 }
 
+/* Notice when we're in the middle of a gUSA region and reset.
+   Note that this will only occur for !parallel_cpus, as we will
+   translate such sequences differently in a parallel context.  */
+static void unwind_gusa(CPUSH4State *regs)
+{
+    /* If the stack pointer is sufficiently negative, and we haven't
+       completed the sequence, then reset to the entry to the region.  */
+    /* ??? The SH4 kernel checks for an address above 0xC0000000.
+       However, the page mappings in qemu linux-user aren't as restricted
+       and we wind up with the normal stack mapped above 0xF0000000.
+       That said, there is no reason why the kernel should be allowing
+       a gUSA region that spans 1GB.  Use a tighter check here, for what
+       can actually be enabled by the immediate move.  */
+    if (regs->gregs[15] >= -128u && regs->pc < regs->gregs[0]) {
+        /* Reset the PC to before the gUSA region, as computed from
+           R0 = region end, SP = -(region size), plus one more for the
+           insn that actually initializes SP to the region size.  */
+        regs->pc = regs->gregs[0] + regs->gregs[15] - 2;
+
+        /* Reset the SP to the saved version in R1.  */
+        regs->gregs[15] = regs->gregs[1];
+    }
+}
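+
+/* Worked example with hypothetical values: for a canonical 3-insn
+   region of size 6, the kernel helpers set R0 = region end and
+   R15 = -6 on entry.  If a signal lands mid-region with R0 = 0x1000,
+   we reset PC = 0x1000 + (-6) - 2 = 0xff8, i.e. back to the
+   "mov #-6,r15" insn itself, so the whole sequence restarts.  */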
+
 static void setup_sigcontext(struct target_sigcontext *sc,
                              CPUSH4State *regs, unsigned long mask)
 {
@@ -3525,6 +3549,7 @@ static void restore_sigcontext(CPUSH4State *regs, struct target_sigcontext *sc)
     __get_user(regs->fpul, &sc->sc_fpul);
 
     regs->tra = -1;         /* disable syscall checks */
+    regs->flags &= ~(DELAY_SLOT_MASK | GUSA_MASK);
 }
 
 static void setup_frame(int sig, struct target_sigaction *ka,
@@ -3534,6 +3559,8 @@ static void setup_frame(int sig, struct target_sigaction *ka,
     abi_ulong frame_addr;
     int i;
 
+    unwind_gusa(regs);
+
     frame_addr = get_sigframe(ka, regs->gregs[15], sizeof(*frame));
     trace_user_setup_frame(regs, frame_addr);
     if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
@@ -3566,6 +3593,7 @@ static void setup_frame(int sig, struct target_sigaction *ka,
     regs->gregs[5] = 0;
     regs->gregs[6] = frame_addr += offsetof(typeof(*frame), sc);
     regs->pc = (unsigned long) ka->_sa_handler;
+    regs->flags &= ~(DELAY_SLOT_MASK | GUSA_MASK);
 
     unlock_user_struct(frame, frame_addr, 1);
     return;
@@ -3583,6 +3611,8 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
     abi_ulong frame_addr;
     int i;
 
+    unwind_gusa(regs);
+
     frame_addr = get_sigframe(ka, regs->gregs[15], sizeof(*frame));
     trace_user_setup_rt_frame(regs, frame_addr);
     if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
@@ -3626,6 +3656,7 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
     regs->gregs[5] = frame_addr + offsetof(typeof(*frame), info);
     regs->gregs[6] = frame_addr + offsetof(typeof(*frame), uc);
     regs->pc = (unsigned long) ka->_sa_handler;
+    regs->flags &= ~(DELAY_SLOT_MASK | GUSA_MASK);
 
     unlock_user_struct(frame, frame_addr, 1);
     return;
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
index 9da7e1ed38..8536f6d002 100644
--- a/target/sh4/cpu.c
+++ b/target/sh4/cpu.c
@@ -39,7 +39,7 @@ static void superh_cpu_synchronize_from_tb(CPUState *cs, TranslationBlock *tb)
     SuperHCPU *cpu = SUPERH_CPU(cs);
 
     cpu->env.pc = tb->pc;
-    cpu->env.flags = tb->flags;
+    cpu->env.flags = tb->flags & TB_FLAG_ENVFLAGS_MASK;
 }
 
 static bool superh_cpu_has_work(CPUState *cs)
diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index ffb91687b8..3c47f0de89 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -96,6 +96,21 @@
 #define DELAY_SLOT_CONDITIONAL (1 << 1)
 #define DELAY_SLOT_RTE         (1 << 2)
 
+#define TB_FLAG_PENDING_MOVCA  (1 << 3)
+
+#define GUSA_SHIFT             4
+#ifdef CONFIG_USER_ONLY
+#define GUSA_EXCLUSIVE         (1 << 12)
+#define GUSA_MASK              ((0xff << GUSA_SHIFT) | GUSA_EXCLUSIVE)
+#else
+/* Provide dummy versions of the above to allow tests against tbflags
+   to be elided while avoiding ifdefs.  */
+#define GUSA_EXCLUSIVE         0
+#define GUSA_MASK              0
+#endif
+
+#define TB_FLAG_ENVFLAGS_MASK  (DELAY_SLOT_MASK | GUSA_MASK)
+
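+/* For reference, the resulting tb->flags layout in a user-only build:
+     bits  0-2   DELAY_SLOT_MASK
+     bit   3     TB_FLAG_PENDING_MOVCA
+     bits  4-11  gUSA region size (via GUSA_SHIFT)
+     bit  12     GUSA_EXCLUSIVE
+     bit  15     SR_FD
+     bits 19-21  FPSCR FR/SZ/PR
+     bits 29-30  SR MD/RB
+   (cf. cpu_get_tb_cpu_state below).  */
+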
 typedef struct tlb_t {
     uint32_t vpn;		/* virtual page number */
     uint32_t ppn;		/* physical page number */
@@ -366,8 +381,6 @@ static inline int cpu_ptel_pr (uint32_t ptel)
 #define PTEA_TC        (1 << 3)
 #define cpu_ptea_tc(ptea) (((ptea) & PTEA_TC) >> 3)
 
-#define TB_FLAG_PENDING_MOVCA  (1 << 4)
-
 static inline target_ulong cpu_read_sr(CPUSH4State *env)
 {
     return env->sr | (env->sr_m << SR_M) |
@@ -387,12 +400,13 @@ static inline void cpu_get_tb_cpu_state(CPUSH4State *env, target_ulong *pc,
                                         target_ulong *cs_base, uint32_t *flags)
 {
     *pc = env->pc;
-    *cs_base = 0;
-    *flags = (env->flags & DELAY_SLOT_MASK)                    /* Bits  0- 2 */
+    /* For a gUSA region, notice the end of the region.  */
+    *cs_base = env->flags & GUSA_MASK ? env->gregs[0] : 0;
+    *flags = env->flags /* TB_FLAG_ENVFLAGS_MASK: bits 0-2, 4-12 */
             | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
             | (env->sr & ((1u << SR_MD) | (1u << SR_RB)))      /* Bits 29-30 */
             | (env->sr & (1u << SR_FD))                        /* Bit 15 */
-            | (env->movcal_backup ? TB_FLAG_PENDING_MOVCA : 0); /* Bit 4 */
+            | (env->movcal_backup ? TB_FLAG_PENDING_MOVCA : 0); /* Bit 3 */
 }
 
 #endif /* SH4_CPU_H */
diff --git a/target/sh4/helper.h b/target/sh4/helper.h
index dce859caea..1e768fcbc7 100644
--- a/target/sh4/helper.h
+++ b/target/sh4/helper.h
@@ -6,6 +6,7 @@ DEF_HELPER_1(raise_slot_fpu_disable, noreturn, env)
 DEF_HELPER_1(debug, noreturn, env)
 DEF_HELPER_1(sleep, noreturn, env)
 DEF_HELPER_2(trapa, noreturn, env, i32)
+DEF_HELPER_1(exclusive, noreturn, env)
 
 DEF_HELPER_3(movcal, void, env, i32, i32)
 DEF_HELPER_1(discard_movcal_backup, void, env)
@@ -16,17 +17,15 @@ DEF_HELPER_3(macw, void, env, i32, i32)
 
 DEF_HELPER_2(ld_fpscr, void, env, i32)
 
-DEF_HELPER_FLAGS_1(fabs_FT, TCG_CALL_NO_RWG_SE, f32, f32)
-DEF_HELPER_FLAGS_1(fabs_DT, TCG_CALL_NO_RWG_SE, f64, f64)
 DEF_HELPER_FLAGS_3(fadd_FT, TCG_CALL_NO_WG, f32, env, f32, f32)
 DEF_HELPER_FLAGS_3(fadd_DT, TCG_CALL_NO_WG, f64, env, f64, f64)
 DEF_HELPER_FLAGS_2(fcnvsd_FT_DT, TCG_CALL_NO_WG, f64, env, f32)
 DEF_HELPER_FLAGS_2(fcnvds_DT_FT, TCG_CALL_NO_WG, f32, env, f64)
 
-DEF_HELPER_3(fcmp_eq_FT, void, env, f32, f32)
-DEF_HELPER_3(fcmp_eq_DT, void, env, f64, f64)
-DEF_HELPER_3(fcmp_gt_FT, void, env, f32, f32)
-DEF_HELPER_3(fcmp_gt_DT, void, env, f64, f64)
+DEF_HELPER_FLAGS_3(fcmp_eq_FT, TCG_CALL_NO_WG, i32, env, f32, f32)
+DEF_HELPER_FLAGS_3(fcmp_eq_DT, TCG_CALL_NO_WG, i32, env, f64, f64)
+DEF_HELPER_FLAGS_3(fcmp_gt_FT, TCG_CALL_NO_WG, i32, env, f32, f32)
+DEF_HELPER_FLAGS_3(fcmp_gt_DT, TCG_CALL_NO_WG, i32, env, f64, f64)
 DEF_HELPER_FLAGS_3(fdiv_FT, TCG_CALL_NO_WG, f32, env, f32, f32)
 DEF_HELPER_FLAGS_3(fdiv_DT, TCG_CALL_NO_WG, f64, env, f64, f64)
 DEF_HELPER_FLAGS_2(float_FT, TCG_CALL_NO_WG, f32, env, i32)
@@ -34,11 +33,11 @@ DEF_HELPER_FLAGS_2(float_DT, TCG_CALL_NO_WG, f64, env, i32)
 DEF_HELPER_FLAGS_4(fmac_FT, TCG_CALL_NO_WG, f32, env, f32, f32, f32)
 DEF_HELPER_FLAGS_3(fmul_FT, TCG_CALL_NO_WG, f32, env, f32, f32)
 DEF_HELPER_FLAGS_3(fmul_DT, TCG_CALL_NO_WG, f64, env, f64, f64)
-DEF_HELPER_FLAGS_1(fneg_T, TCG_CALL_NO_RWG_SE, f32, f32)
 DEF_HELPER_FLAGS_3(fsub_FT, TCG_CALL_NO_WG, f32, env, f32, f32)
 DEF_HELPER_FLAGS_3(fsub_DT, TCG_CALL_NO_WG, f64, env, f64, f64)
 DEF_HELPER_FLAGS_2(fsqrt_FT, TCG_CALL_NO_WG, f32, env, f32)
 DEF_HELPER_FLAGS_2(fsqrt_DT, TCG_CALL_NO_WG, f64, env, f64)
+DEF_HELPER_FLAGS_2(fsrra_FT, TCG_CALL_NO_WG, f32, env, f32)
 DEF_HELPER_FLAGS_2(ftrc_FT, TCG_CALL_NO_WG, i32, env, f32)
 DEF_HELPER_FLAGS_2(ftrc_DT, TCG_CALL_NO_WG, i32, env, f64)
 DEF_HELPER_3(fipr, void, env, i32, i32)
diff --git a/target/sh4/op_helper.c b/target/sh4/op_helper.c
index 528a40ac1d..d798f239cf 100644
--- a/target/sh4/op_helper.c
+++ b/target/sh4/op_helper.c
@@ -115,6 +115,12 @@ void helper_trapa(CPUSH4State *env, uint32_t tra)
     raise_exception(env, 0x160, 0);
 }
 
+void helper_exclusive(CPUSH4State *env)
+{
+    /* We do not want cpu_restore_state to run.  */
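+    /* (cpu_loop_exit_atomic raises EXCP_ATOMIC, so the main loop will
+       replay this sequence under the exclusive lock via
+       cpu_exec_step_atomic; the zero retaddr skips any unwinding.)  */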
+    cpu_loop_exit_atomic(ENV_GET_CPU(env), 0);
+}
+
 void helper_movcal(CPUSH4State *env, uint32_t address, uint32_t value)
 {
     if (cpu_sh4_is_cached (env, address))
@@ -219,29 +225,29 @@ static void update_fpscr(CPUSH4State *env, uintptr_t retaddr)
 
     xcpt = get_float_exception_flags(&env->fp_status);
 
-    /* Clear the flag entries */
-    env->fpscr &= ~FPSCR_FLAG_MASK;
+    /* Clear the cause entries */
+    env->fpscr &= ~FPSCR_CAUSE_MASK;
 
     if (unlikely(xcpt)) {
         if (xcpt & float_flag_invalid) {
-            env->fpscr |= FPSCR_FLAG_V;
+            env->fpscr |= FPSCR_CAUSE_V;
         }
         if (xcpt & float_flag_divbyzero) {
-            env->fpscr |= FPSCR_FLAG_Z;
+            env->fpscr |= FPSCR_CAUSE_Z;
         }
         if (xcpt & float_flag_overflow) {
-            env->fpscr |= FPSCR_FLAG_O;
+            env->fpscr |= FPSCR_CAUSE_O;
         }
         if (xcpt & float_flag_underflow) {
-            env->fpscr |= FPSCR_FLAG_U;
+            env->fpscr |= FPSCR_CAUSE_U;
         }
         if (xcpt & float_flag_inexact) {
-            env->fpscr |= FPSCR_FLAG_I;
+            env->fpscr |= FPSCR_CAUSE_I;
         }
 
-        /* Accumulate in cause entries */
-        env->fpscr |= (env->fpscr & FPSCR_FLAG_MASK)
-                      << (FPSCR_CAUSE_SHIFT - FPSCR_FLAG_SHIFT);
+        /* Accumulate in flag entries */
+        env->fpscr |= (env->fpscr & FPSCR_CAUSE_MASK)
+                      >> (FPSCR_CAUSE_SHIFT - FPSCR_FLAG_SHIFT);
 
         /* Generate an exception if enabled */
         cause = (env->fpscr & FPSCR_CAUSE_MASK) >> FPSCR_CAUSE_SHIFT;
@@ -252,16 +258,6 @@ static void update_fpscr(CPUSH4State *env, uintptr_t retaddr)
     }
 }
 
-float32 helper_fabs_FT(float32 t0)
-{
-    return float32_abs(t0);
-}
-
-float64 helper_fabs_DT(float64 t0)
-{
-    return float64_abs(t0);
-}
-
 float32 helper_fadd_FT(CPUSH4State *env, float32 t0, float32 t1)
 {
     set_float_exception_flags(0, &env->fp_status);
@@ -278,56 +274,44 @@ float64 helper_fadd_DT(CPUSH4State *env, float64 t0, float64 t1)
     return t0;
 }
 
-void helper_fcmp_eq_FT(CPUSH4State *env, float32 t0, float32 t1)
+uint32_t helper_fcmp_eq_FT(CPUSH4State *env, float32 t0, float32 t1)
 {
     int relation;
 
     set_float_exception_flags(0, &env->fp_status);
     relation = float32_compare(t0, t1, &env->fp_status);
-    if (unlikely(relation == float_relation_unordered)) {
-        update_fpscr(env, GETPC());
-    } else {
-        env->sr_t = (relation == float_relation_equal);
-    }
+    update_fpscr(env, GETPC());
+    return relation == float_relation_equal;
 }
 
-void helper_fcmp_eq_DT(CPUSH4State *env, float64 t0, float64 t1)
+uint32_t helper_fcmp_eq_DT(CPUSH4State *env, float64 t0, float64 t1)
 {
     int relation;
 
     set_float_exception_flags(0, &env->fp_status);
     relation = float64_compare(t0, t1, &env->fp_status);
-    if (unlikely(relation == float_relation_unordered)) {
-        update_fpscr(env, GETPC());
-    } else {
-        env->sr_t = (relation == float_relation_equal);
-    }
+    update_fpscr(env, GETPC());
+    return relation == float_relation_equal;
 }
 
-void helper_fcmp_gt_FT(CPUSH4State *env, float32 t0, float32 t1)
+uint32_t helper_fcmp_gt_FT(CPUSH4State *env, float32 t0, float32 t1)
 {
     int relation;
 
     set_float_exception_flags(0, &env->fp_status);
     relation = float32_compare(t0, t1, &env->fp_status);
-    if (unlikely(relation == float_relation_unordered)) {
-        update_fpscr(env, GETPC());
-    } else {
-        env->sr_t = (relation == float_relation_greater);
-    }
+    update_fpscr(env, GETPC());
+    return relation == float_relation_greater;
 }
 
-void helper_fcmp_gt_DT(CPUSH4State *env, float64 t0, float64 t1)
+uint32_t helper_fcmp_gt_DT(CPUSH4State *env, float64 t0, float64 t1)
 {
     int relation;
 
     set_float_exception_flags(0, &env->fp_status);
     relation = float64_compare(t0, t1, &env->fp_status);
-    if (unlikely(relation == float_relation_unordered)) {
-        update_fpscr(env, GETPC());
-    } else {
-        env->sr_t = (relation == float_relation_greater);
-    }
+    update_fpscr(env, GETPC());
+    return relation == float_relation_greater;
 }
 
 float64 helper_fcnvsd_FT_DT(CPUSH4State *env, float32 t0)
@@ -406,11 +390,6 @@ float64 helper_fmul_DT(CPUSH4State *env, float64 t0, float64 t1)
     return t0;
 }
 
-float32 helper_fneg_T(float32 t0)
-{
-    return float32_chs(t0);
-}
-
 float32 helper_fsqrt_FT(CPUSH4State *env, float32 t0)
 {
     set_float_exception_flags(0, &env->fp_status);
@@ -427,6 +406,22 @@ float64 helper_fsqrt_DT(CPUSH4State *env, float64 t0)
     return t0;
 }
 
+float32 helper_fsrra_FT(CPUSH4State *env, float32 t0)
+{
+    set_float_exception_flags(0, &env->fp_status);
+    /* "Approximate" 1/sqrt(x) via actual computation.  */
+    t0 = float32_sqrt(t0, &env->fp_status);
+    t0 = float32_div(float32_one, t0, &env->fp_status);
+    /* Since this is supposed to be an approximation, an imprecision
+       exception is required.  One supposes this also follows the usual
+       IEEE rule that other exceptions take precedence.  */
+    if (get_float_exception_flags(&env->fp_status) == 0) {
+        set_float_exception_flags(float_flag_inexact, &env->fp_status);
+    }
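+    /* E.g. even the exact case fsrra(4.0f) == 0.5f reports inexact
+       here, since the result is architecturally an approximation.  */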
+    update_fpscr(env, GETPC());
+    return t0;
+}
+
 float32 helper_fsub_FT(CPUSH4State *env, float32 t0, float32 t1)
 {
     set_float_exception_flags(0, &env->fp_status);
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 8bc132b27b..498bb99dc1 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -41,6 +41,8 @@ typedef struct DisasContext {
     uint32_t envflags;   /* should stay in sync with env->flags using TCG ops */
     int bstate;
     int memidx;
+    int gbank;
+    int fbank;
     uint32_t delayed_pc;
     int singlestep_enabled;
     uint32_t features;
@@ -64,7 +66,7 @@ enum {
 
 /* global register indexes */
 static TCGv_env cpu_env;
-static TCGv cpu_gregs[24];
+static TCGv cpu_gregs[32];
 static TCGv cpu_sr, cpu_sr_m, cpu_sr_q, cpu_sr_t;
 static TCGv cpu_pc, cpu_ssr, cpu_spc, cpu_gbr;
 static TCGv cpu_vbr, cpu_sgr, cpu_dbr, cpu_mach, cpu_macl;
@@ -98,16 +100,19 @@ void sh4_translate_init(void)
         "FPR12_BANK1", "FPR13_BANK1", "FPR14_BANK1", "FPR15_BANK1",
     };
 
-    if (done_init)
+    if (done_init) {
         return;
+    }
 
     cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
     tcg_ctx.tcg_env = cpu_env;
 
-    for (i = 0; i < 24; i++)
+    for (i = 0; i < 24; i++) {
         cpu_gregs[i] = tcg_global_mem_new_i32(cpu_env,
                                               offsetof(CPUSH4State, gregs[i]),
                                               gregnames[i]);
+    }
+    memcpy(cpu_gregs + 24, cpu_gregs + 8, 8 * sizeof(TCGv));
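+    /* Entries 24..31 alias 8..15: with REG(x) = cpu_gregs[(x) ^ gbank]
+       and gbank in {0, 16}, only r0..r7 actually switch banks, while
+       r8..r15 always resolve to the same TCG globals.  */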
 
     cpu_pc = tcg_global_mem_new_i32(cpu_env,
                                     offsetof(CPUSH4State, pc), "PC");
@@ -220,17 +225,22 @@ static inline void gen_save_cpu_state(DisasContext *ctx, bool save_pc)
     if (ctx->delayed_pc != (uint32_t) -1) {
         tcg_gen_movi_i32(cpu_delayed_pc, ctx->delayed_pc);
     }
-    if ((ctx->tbflags & DELAY_SLOT_MASK) != ctx->envflags) {
+    if ((ctx->tbflags & TB_FLAG_ENVFLAGS_MASK) != ctx->envflags) {
         tcg_gen_movi_i32(cpu_flags, ctx->envflags);
     }
 }
 
+static inline bool use_exit_tb(DisasContext *ctx)
+{
+    return (ctx->tbflags & GUSA_EXCLUSIVE) != 0;
+}
+
 static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
 {
-    if (unlikely(ctx->singlestep_enabled)) {
+    /* Use a direct jump if in same page and singlestep not enabled */
+    if (unlikely(ctx->singlestep_enabled || use_exit_tb(ctx))) {
         return false;
     }
-
 #ifndef CONFIG_USER_ONLY
     return (ctx->tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK);
 #else
@@ -241,69 +251,110 @@ static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
 static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
 {
     if (use_goto_tb(ctx, dest)) {
-	/* Use a direct jump if in same page and singlestep not enabled */
         tcg_gen_goto_tb(n);
         tcg_gen_movi_i32(cpu_pc, dest);
         tcg_gen_exit_tb((uintptr_t)ctx->tb + n);
     } else {
         tcg_gen_movi_i32(cpu_pc, dest);
-        if (ctx->singlestep_enabled)
+        if (ctx->singlestep_enabled) {
             gen_helper_debug(cpu_env);
-        tcg_gen_exit_tb(0);
+        } else if (use_exit_tb(ctx)) {
+            tcg_gen_exit_tb(0);
+        } else {
+            tcg_gen_lookup_and_goto_ptr(cpu_pc);
+        }
     }
 }
 
 static void gen_jump(DisasContext * ctx)
 {
-    if (ctx->delayed_pc == (uint32_t) - 1) {
+    if (ctx->delayed_pc == -1) {
	/* Target is not statically known, it comes necessarily from a
	   delayed jump as immediate jumps are conditional jumps */
 	tcg_gen_mov_i32(cpu_pc, cpu_delayed_pc);
         tcg_gen_discard_i32(cpu_delayed_pc);
-	if (ctx->singlestep_enabled)
+        if (ctx->singlestep_enabled) {
             gen_helper_debug(cpu_env);
-	tcg_gen_exit_tb(0);
+        } else if (use_exit_tb(ctx)) {
+            tcg_gen_exit_tb(0);
+        } else {
+            tcg_gen_lookup_and_goto_ptr(cpu_pc);
+        }
     } else {
 	gen_goto_tb(ctx, 0, ctx->delayed_pc);
     }
 }
 
 /* Immediate conditional jump (bt or bf) */
-static void gen_conditional_jump(DisasContext * ctx,
-				 target_ulong ift, target_ulong ifnott)
+static void gen_conditional_jump(DisasContext *ctx, target_ulong dest,
+                                 bool jump_if_true)
 {
     TCGLabel *l1 = gen_new_label();
+    TCGCond cond_not_taken = jump_if_true ? TCG_COND_EQ : TCG_COND_NE;
+
+    if (ctx->tbflags & GUSA_EXCLUSIVE) {
+        /* When in an exclusive region, we must continue to the end.
+           Therefore, exit the region on a taken branch, but otherwise
+           fall through to the next instruction.  */
+        tcg_gen_brcondi_i32(cond_not_taken, cpu_sr_t, 0, l1);
+        tcg_gen_movi_i32(cpu_flags, ctx->envflags & ~GUSA_MASK);
+        /* Note that this won't actually use a goto_tb opcode because we
+           disallow it in use_goto_tb, but it handles exit + singlestep.  */
+        gen_goto_tb(ctx, 0, dest);
+        gen_set_label(l1);
+        return;
+    }
+
     gen_save_cpu_state(ctx, false);
-    tcg_gen_brcondi_i32(TCG_COND_NE, cpu_sr_t, 0, l1);
-    gen_goto_tb(ctx, 0, ifnott);
+    tcg_gen_brcondi_i32(cond_not_taken, cpu_sr_t, 0, l1);
+    gen_goto_tb(ctx, 0, dest);
     gen_set_label(l1);
-    gen_goto_tb(ctx, 1, ift);
+    gen_goto_tb(ctx, 1, ctx->pc + 2);
     ctx->bstate = BS_BRANCH;
 }
 
 /* Delayed conditional jump (bt or bf) */
 static void gen_delayed_conditional_jump(DisasContext * ctx)
 {
-    TCGLabel *l1;
-    TCGv ds;
+    TCGLabel *l1 = gen_new_label();
+    TCGv ds = tcg_temp_new();
 
-    l1 = gen_new_label();
-    ds = tcg_temp_new();
     tcg_gen_mov_i32(ds, cpu_delayed_cond);
     tcg_gen_discard_i32(cpu_delayed_cond);
+
+    if (ctx->tbflags & GUSA_EXCLUSIVE) {
+        /* When in an exclusive region, we must continue to the end.
+           Therefore, exit the region on a taken branch, but otherwise
+           fall through to the next instruction.  */
+        tcg_gen_brcondi_i32(TCG_COND_EQ, ds, 0, l1);
+
+        /* Leave the gUSA region.  */
+        tcg_gen_movi_i32(cpu_flags, ctx->envflags & ~GUSA_MASK);
+        gen_jump(ctx);
+
+        gen_set_label(l1);
+        return;
+    }
+
     tcg_gen_brcondi_i32(TCG_COND_NE, ds, 0, l1);
     gen_goto_tb(ctx, 1, ctx->pc + 2);
     gen_set_label(l1);
     gen_jump(ctx);
 }
 
-static inline void gen_load_fpr64(TCGv_i64 t, int reg)
+static inline void gen_load_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
 {
+    /* We have already signaled illegal instruction for odd Dr.  */
+    tcg_debug_assert((reg & 1) == 0);
+    reg ^= ctx->fbank;
     tcg_gen_concat_i32_i64(t, cpu_fregs[reg + 1], cpu_fregs[reg]);
 }
 
-static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
+static inline void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
 {
+    /* We have already signaled illegal instruction for odd Dr.  */
+    tcg_debug_assert((reg & 1) == 0);
+    reg ^= ctx->fbank;
     tcg_gen_extr_i64_i32(cpu_fregs[reg + 1], cpu_fregs[reg], t);
 }
 
@@ -317,49 +368,40 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 #define B11_8 ((ctx->opcode >> 8) & 0xf)
 #define B15_12 ((ctx->opcode >> 12) & 0xf)
 
-#define REG(x) ((x) < 8 && (ctx->tbflags & (1u << SR_MD))\
-                        && (ctx->tbflags & (1u << SR_RB))\
-                ? (cpu_gregs[x + 16]) : (cpu_gregs[x]))
+#define REG(x)     cpu_gregs[(x) ^ ctx->gbank]
+#define ALTREG(x)  cpu_gregs[(x) ^ ctx->gbank ^ 0x10]
+#define FREG(x)    cpu_fregs[(x) ^ ctx->fbank]
 
-#define ALTREG(x) ((x) < 8 && (!(ctx->tbflags & (1u << SR_MD))\
-                               || !(ctx->tbflags & (1u << SR_RB)))\
-		? (cpu_gregs[x + 16]) : (cpu_gregs[x]))
-
-#define FREG(x) (ctx->tbflags & FPSCR_FR ? (x) ^ 0x10 : (x))
 #define XHACK(x) ((((x) & 1 ) << 4) | ((x) & 0xe))
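+/* XHACK turns the SZ=1 register field into an even FREG index: the
+   encoding's low bit selects an XD pair from the opposite bank, so it
+   is hoisted into the bank-select bit (bit 4).  */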
-#define XREG(x) (ctx->tbflags & FPSCR_FR ? XHACK(x) ^ 0x10 : XHACK(x))
-#define DREG(x) FREG(x) /* Assumes lsb of (x) is always 0 */
 
 #define CHECK_NOT_DELAY_SLOT \
-    if (ctx->envflags & DELAY_SLOT_MASK) {                           \
-        gen_save_cpu_state(ctx, true);                               \
-        gen_helper_raise_slot_illegal_instruction(cpu_env);          \
-        ctx->bstate = BS_EXCP;                                       \
-        return;                                                      \
+    if (ctx->envflags & DELAY_SLOT_MASK) {  \
+        goto do_illegal_slot;               \
+    }
+
+#define CHECK_PRIVILEGED \
+    if (IS_USER(ctx)) {                     \
+        goto do_illegal;                    \
+    }
+
+#define CHECK_FPU_ENABLED \
+    if (ctx->tbflags & (1u << SR_FD)) {     \
+        goto do_fpu_disabled;               \
     }
 
-#define CHECK_PRIVILEGED                                             \
-    if (IS_USER(ctx)) {                                              \
-        gen_save_cpu_state(ctx, true);                               \
-        if (ctx->envflags & DELAY_SLOT_MASK) {                       \
-            gen_helper_raise_slot_illegal_instruction(cpu_env);      \
-        } else {                                                     \
-            gen_helper_raise_illegal_instruction(cpu_env);           \
-        }                                                            \
-        ctx->bstate = BS_EXCP;                                       \
-        return;                                                      \
+#define CHECK_FPSCR_PR_0 \
+    if (ctx->tbflags & FPSCR_PR) {          \
+        goto do_illegal;                    \
     }
 
-#define CHECK_FPU_ENABLED                                            \
-    if (ctx->tbflags & (1u << SR_FD)) {                              \
-        gen_save_cpu_state(ctx, true);                               \
-        if (ctx->envflags & DELAY_SLOT_MASK) {                       \
-            gen_helper_raise_slot_fpu_disable(cpu_env);              \
-        } else {                                                     \
-            gen_helper_raise_fpu_disable(cpu_env);                   \
-        }                                                            \
-        ctx->bstate = BS_EXCP;                                       \
-        return;                                                      \
+#define CHECK_FPSCR_PR_1 \
+    if (!(ctx->tbflags & FPSCR_PR)) {       \
+        goto do_illegal;                    \
+    }
+
+#define CHECK_SH4A \
+    if (!(ctx->features & SH_FEATURE_SH4A)) { \
+        goto do_illegal;                      \
     }
 
 static void _decode_opc(DisasContext * ctx)
@@ -441,13 +483,20 @@ static void _decode_opc(DisasContext * ctx)
         tcg_gen_movi_i32(cpu_sr_t, 1);
 	return;
     case 0xfbfd:		/* frchg */
+        CHECK_FPSCR_PR_0
 	tcg_gen_xori_i32(cpu_fpscr, cpu_fpscr, FPSCR_FR);
 	ctx->bstate = BS_STOP;
 	return;
     case 0xf3fd:		/* fschg */
+        CHECK_FPSCR_PR_0
         tcg_gen_xori_i32(cpu_fpscr, cpu_fpscr, FPSCR_SZ);
 	ctx->bstate = BS_STOP;
 	return;
+    case 0xf7fd:                /* fpchg */
+        CHECK_SH4A
+        tcg_gen_xori_i32(cpu_fpscr, cpu_fpscr, FPSCR_PR);
+        ctx->bstate = BS_STOP;
+        return;
     case 0x0009:		/* nop */
 	return;
     case 0x001b:		/* sleep */
@@ -475,6 +524,15 @@ static void _decode_opc(DisasContext * ctx)
 	}
 	return;
     case 0xe000:		/* mov #imm,Rn */
+#ifdef CONFIG_USER_ONLY
+        /* Detect the start of a gUSA region.  If so, update envflags
+           and end the TB.  This will allow us to see the end of the
+           region (stored in R0) in the next TB.  */
+        if (B11_8 == 15 && B7_0s < 0 && parallel_cpus) {
+            ctx->envflags = deposit32(ctx->envflags, GUSA_SHIFT, 8, B7_0s);
+            ctx->bstate = BS_STOP;
+        }
+#endif
 	tcg_gen_movi_i32(REG(B11_8), B7_0s);
 	return;
     case 0x9000:		/* mov.w @(disp,PC),Rn */
@@ -938,75 +996,66 @@ static void _decode_opc(DisasContext * ctx)
     case 0xf00c: /* fmov {F,D,X}Rm,{F,D,X}Rn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
         if (ctx->tbflags & FPSCR_SZ) {
-	    TCGv_i64 fp = tcg_temp_new_i64();
-	    gen_load_fpr64(fp, XREG(B7_4));
-	    gen_store_fpr64(fp, XREG(B11_8));
-	    tcg_temp_free_i64(fp);
+            int xsrc = XHACK(B7_4);
+            int xdst = XHACK(B11_8);
+            tcg_gen_mov_i32(FREG(xdst), FREG(xsrc));
+            tcg_gen_mov_i32(FREG(xdst + 1), FREG(xsrc + 1));
 	} else {
-	    tcg_gen_mov_i32(cpu_fregs[FREG(B11_8)], cpu_fregs[FREG(B7_4)]);
+            tcg_gen_mov_i32(FREG(B11_8), FREG(B7_4));
 	}
 	return;
     case 0xf00a: /* fmov {F,D,X}Rm,@Rn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
         if (ctx->tbflags & FPSCR_SZ) {
-	    TCGv addr_hi = tcg_temp_new();
-	    int fr = XREG(B7_4);
-	    tcg_gen_addi_i32(addr_hi, REG(B11_8), 4);
-            tcg_gen_qemu_st_i32(cpu_fregs[fr], REG(B11_8),
-                                ctx->memidx, MO_TEUL);
-            tcg_gen_qemu_st_i32(cpu_fregs[fr+1], addr_hi,
-                                ctx->memidx, MO_TEUL);
-	    tcg_temp_free(addr_hi);
+            TCGv_i64 fp = tcg_temp_new_i64();
+            gen_load_fpr64(ctx, fp, XHACK(B7_4));
+            tcg_gen_qemu_st_i64(fp, REG(B11_8), ctx->memidx, MO_TEQ);
+            tcg_temp_free_i64(fp);
 	} else {
-            tcg_gen_qemu_st_i32(cpu_fregs[FREG(B7_4)], REG(B11_8),
-                                ctx->memidx, MO_TEUL);
+            tcg_gen_qemu_st_i32(FREG(B7_4), REG(B11_8), ctx->memidx, MO_TEUL);
 	}
 	return;
     case 0xf008: /* fmov @Rm,{F,D,X}Rn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
         if (ctx->tbflags & FPSCR_SZ) {
-	    TCGv addr_hi = tcg_temp_new();
-	    int fr = XREG(B11_8);
-	    tcg_gen_addi_i32(addr_hi, REG(B7_4), 4);
-            tcg_gen_qemu_ld_i32(cpu_fregs[fr], REG(B7_4), ctx->memidx, MO_TEUL);
-            tcg_gen_qemu_ld_i32(cpu_fregs[fr+1], addr_hi, ctx->memidx, MO_TEUL);
-	    tcg_temp_free(addr_hi);
+            TCGv_i64 fp = tcg_temp_new_i64();
+            tcg_gen_qemu_ld_i64(fp, REG(B7_4), ctx->memidx, MO_TEQ);
+            gen_store_fpr64(ctx, fp, XHACK(B11_8));
+            tcg_temp_free_i64(fp);
 	} else {
-            tcg_gen_qemu_ld_i32(cpu_fregs[FREG(B11_8)], REG(B7_4),
-                                ctx->memidx, MO_TEUL);
+            tcg_gen_qemu_ld_i32(FREG(B11_8), REG(B7_4), ctx->memidx, MO_TEUL);
 	}
 	return;
     case 0xf009: /* fmov @Rm+,{F,D,X}Rn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
         if (ctx->tbflags & FPSCR_SZ) {
-	    TCGv addr_hi = tcg_temp_new();
-	    int fr = XREG(B11_8);
-	    tcg_gen_addi_i32(addr_hi, REG(B7_4), 4);
-            tcg_gen_qemu_ld_i32(cpu_fregs[fr], REG(B7_4), ctx->memidx, MO_TEUL);
-            tcg_gen_qemu_ld_i32(cpu_fregs[fr+1], addr_hi, ctx->memidx, MO_TEUL);
-	    tcg_gen_addi_i32(REG(B7_4), REG(B7_4), 8);
-	    tcg_temp_free(addr_hi);
+            TCGv_i64 fp = tcg_temp_new_i64();
+            tcg_gen_qemu_ld_i64(fp, REG(B7_4), ctx->memidx, MO_TEQ);
+            gen_store_fpr64(ctx, fp, XHACK(B11_8));
+            tcg_temp_free_i64(fp);
+            tcg_gen_addi_i32(REG(B7_4), REG(B7_4), 8);
 	} else {
-            tcg_gen_qemu_ld_i32(cpu_fregs[FREG(B11_8)], REG(B7_4),
-                                ctx->memidx, MO_TEUL);
+            tcg_gen_qemu_ld_i32(FREG(B11_8), REG(B7_4), ctx->memidx, MO_TEUL);
 	    tcg_gen_addi_i32(REG(B7_4), REG(B7_4), 4);
 	}
 	return;
     case 0xf00b: /* fmov {F,D,X}Rm,@-Rn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
-        TCGv addr = tcg_temp_new_i32();
-        tcg_gen_subi_i32(addr, REG(B11_8), 4);
-        if (ctx->tbflags & FPSCR_SZ) {
-	    int fr = XREG(B7_4);
-            tcg_gen_qemu_st_i32(cpu_fregs[fr+1], addr, ctx->memidx, MO_TEUL);
-	    tcg_gen_subi_i32(addr, addr, 4);
-            tcg_gen_qemu_st_i32(cpu_fregs[fr], addr, ctx->memidx, MO_TEUL);
-	} else {
-            tcg_gen_qemu_st_i32(cpu_fregs[FREG(B7_4)], addr,
-                                ctx->memidx, MO_TEUL);
-	}
-        tcg_gen_mov_i32(REG(B11_8), addr);
-        tcg_temp_free(addr);
+        {
+            TCGv addr = tcg_temp_new_i32();
+            if (ctx->tbflags & FPSCR_SZ) {
+                TCGv_i64 fp = tcg_temp_new_i64();
+                gen_load_fpr64(ctx, fp, XHACK(B7_4));
+                tcg_gen_subi_i32(addr, REG(B11_8), 8);
+                tcg_gen_qemu_st_i64(fp, addr, ctx->memidx, MO_TEQ);
+                tcg_temp_free_i64(fp);
+            } else {
+                tcg_gen_subi_i32(addr, REG(B11_8), 4);
+                tcg_gen_qemu_st_i32(FREG(B7_4), addr, ctx->memidx, MO_TEUL);
+            }
+            tcg_gen_mov_i32(REG(B11_8), addr);
+            tcg_temp_free(addr);
+        }
 	return;
     case 0xf006: /* fmov @(R0,Rm),{F,D,X}Rm - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
@@ -1014,15 +1063,12 @@ static void _decode_opc(DisasContext * ctx)
 	    TCGv addr = tcg_temp_new_i32();
 	    tcg_gen_add_i32(addr, REG(B7_4), REG(0));
             if (ctx->tbflags & FPSCR_SZ) {
-		int fr = XREG(B11_8);
-                tcg_gen_qemu_ld_i32(cpu_fregs[fr], addr,
-                                    ctx->memidx, MO_TEUL);
-		tcg_gen_addi_i32(addr, addr, 4);
-                tcg_gen_qemu_ld_i32(cpu_fregs[fr+1], addr,
-                                    ctx->memidx, MO_TEUL);
+                TCGv_i64 fp = tcg_temp_new_i64();
+                tcg_gen_qemu_ld_i64(fp, addr, ctx->memidx, MO_TEQ);
+                gen_store_fpr64(ctx, fp, XHACK(B11_8));
+                tcg_temp_free_i64(fp);
 	    } else {
-                tcg_gen_qemu_ld_i32(cpu_fregs[FREG(B11_8)], addr,
-                                    ctx->memidx, MO_TEUL);
+                tcg_gen_qemu_ld_i32(FREG(B11_8), addr, ctx->memidx, MO_TEUL);
 	    }
 	    tcg_temp_free(addr);
 	}
@@ -1033,15 +1079,12 @@ static void _decode_opc(DisasContext * ctx)
 	    TCGv addr = tcg_temp_new();
 	    tcg_gen_add_i32(addr, REG(B11_8), REG(0));
             if (ctx->tbflags & FPSCR_SZ) {
-		int fr = XREG(B7_4);
-                tcg_gen_qemu_ld_i32(cpu_fregs[fr], addr,
-                                    ctx->memidx, MO_TEUL);
-		tcg_gen_addi_i32(addr, addr, 4);
-                tcg_gen_qemu_ld_i32(cpu_fregs[fr+1], addr,
-                                    ctx->memidx, MO_TEUL);
+                TCGv_i64 fp = tcg_temp_new_i64();
+                gen_load_fpr64(ctx, fp, XHACK(B7_4));
+                tcg_gen_qemu_st_i64(fp, addr, ctx->memidx, MO_TEQ);
+                tcg_temp_free_i64(fp);
 	    } else {
-                tcg_gen_qemu_st_i32(cpu_fregs[FREG(B7_4)], addr,
-                                    ctx->memidx, MO_TEUL);
+                tcg_gen_qemu_st_i32(FREG(B7_4), addr, ctx->memidx, MO_TEUL);
 	    }
 	    tcg_temp_free(addr);
 	}
@@ -1057,12 +1100,13 @@ static void _decode_opc(DisasContext * ctx)
             if (ctx->tbflags & FPSCR_PR) {
                 TCGv_i64 fp0, fp1;
 
-		if (ctx->opcode & 0x0110)
-		    break; /* illegal instruction */
+                if (ctx->opcode & 0x0110) {
+                    goto do_illegal;
+                }
 		fp0 = tcg_temp_new_i64();
 		fp1 = tcg_temp_new_i64();
-		gen_load_fpr64(fp0, DREG(B11_8));
-		gen_load_fpr64(fp1, DREG(B7_4));
+                gen_load_fpr64(ctx, fp0, B11_8);
+                gen_load_fpr64(ctx, fp1, B7_4);
                 switch (ctx->opcode & 0xf00f) {
                 case 0xf000:		/* fadd Rm,Rn */
                     gen_helper_fadd_DT(fp0, cpu_env, fp0, fp1);
@@ -1077,61 +1121,51 @@ static void _decode_opc(DisasContext * ctx)
                     gen_helper_fdiv_DT(fp0, cpu_env, fp0, fp1);
                     break;
                 case 0xf004:		/* fcmp/eq Rm,Rn */
-                    gen_helper_fcmp_eq_DT(cpu_env, fp0, fp1);
+                    gen_helper_fcmp_eq_DT(cpu_sr_t, cpu_env, fp0, fp1);
                     return;
                 case 0xf005:		/* fcmp/gt Rm,Rn */
-                    gen_helper_fcmp_gt_DT(cpu_env, fp0, fp1);
+                    gen_helper_fcmp_gt_DT(cpu_sr_t, cpu_env, fp0, fp1);
                     return;
                 }
-		gen_store_fpr64(fp0, DREG(B11_8));
+                gen_store_fpr64(ctx, fp0, B11_8);
                 tcg_temp_free_i64(fp0);
                 tcg_temp_free_i64(fp1);
 	    } else {
                 switch (ctx->opcode & 0xf00f) {
                 case 0xf000:		/* fadd Rm,Rn */
-                    gen_helper_fadd_FT(cpu_fregs[FREG(B11_8)], cpu_env,
-                                       cpu_fregs[FREG(B11_8)],
-                                       cpu_fregs[FREG(B7_4)]);
+                    gen_helper_fadd_FT(FREG(B11_8), cpu_env,
+                                       FREG(B11_8), FREG(B7_4));
                     break;
                 case 0xf001:		/* fsub Rm,Rn */
-                    gen_helper_fsub_FT(cpu_fregs[FREG(B11_8)], cpu_env,
-                                       cpu_fregs[FREG(B11_8)],
-                                       cpu_fregs[FREG(B7_4)]);
+                    gen_helper_fsub_FT(FREG(B11_8), cpu_env,
+                                       FREG(B11_8), FREG(B7_4));
                     break;
                 case 0xf002:		/* fmul Rm,Rn */
-                    gen_helper_fmul_FT(cpu_fregs[FREG(B11_8)], cpu_env,
-                                       cpu_fregs[FREG(B11_8)],
-                                       cpu_fregs[FREG(B7_4)]);
+                    gen_helper_fmul_FT(FREG(B11_8), cpu_env,
+                                       FREG(B11_8), FREG(B7_4));
                     break;
                 case 0xf003:		/* fdiv Rm,Rn */
-                    gen_helper_fdiv_FT(cpu_fregs[FREG(B11_8)], cpu_env,
-                                       cpu_fregs[FREG(B11_8)],
-                                       cpu_fregs[FREG(B7_4)]);
+                    gen_helper_fdiv_FT(FREG(B11_8), cpu_env,
+                                       FREG(B11_8), FREG(B7_4));
                     break;
                 case 0xf004:		/* fcmp/eq Rm,Rn */
-                    gen_helper_fcmp_eq_FT(cpu_env, cpu_fregs[FREG(B11_8)],
-                                          cpu_fregs[FREG(B7_4)]);
+                    gen_helper_fcmp_eq_FT(cpu_sr_t, cpu_env,
+                                          FREG(B11_8), FREG(B7_4));
                     return;
                 case 0xf005:		/* fcmp/gt Rm,Rn */
-                    gen_helper_fcmp_gt_FT(cpu_env, cpu_fregs[FREG(B11_8)],
-                                          cpu_fregs[FREG(B7_4)]);
+                    gen_helper_fcmp_gt_FT(cpu_sr_t, cpu_env,
+                                          FREG(B11_8), FREG(B7_4));
                     return;
                 }
 	    }
 	}
 	return;
     case 0xf00e: /* fmac FR0,RM,Rn */
-        {
-            CHECK_FPU_ENABLED
-            if (ctx->tbflags & FPSCR_PR) {
-                break; /* illegal instruction */
-            } else {
-                gen_helper_fmac_FT(cpu_fregs[FREG(B11_8)], cpu_env,
-                                   cpu_fregs[FREG(0)], cpu_fregs[FREG(B7_4)],
-                                   cpu_fregs[FREG(B11_8)]);
-                return;
-            }
-        }
+        CHECK_FPU_ENABLED
+        CHECK_FPSCR_PR_0
+        gen_helper_fmac_FT(FREG(B11_8), cpu_env,
+                           FREG(0), FREG(B7_4), FREG(B11_8));
+        return;
     }
 
     switch (ctx->opcode & 0xff00) {
@@ -1153,7 +1187,7 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x8b00:		/* bf label */
 	CHECK_NOT_DELAY_SLOT
-        gen_conditional_jump(ctx, ctx->pc + 2, ctx->pc + 4 + B7_0s * 2);
+        gen_conditional_jump(ctx, ctx->pc + 4 + B7_0s * 2, false);
 	return;
     case 0x8f00:		/* bf/s label */
 	CHECK_NOT_DELAY_SLOT
@@ -1163,7 +1197,7 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x8900:		/* bt label */
 	CHECK_NOT_DELAY_SLOT
-        gen_conditional_jump(ctx, ctx->pc + 4 + B7_0s * 2, ctx->pc + 2);
+        gen_conditional_jump(ctx, ctx->pc + 4 + B7_0s * 2, true);
 	return;
     case 0x8d00:		/* bt/s label */
 	CHECK_NOT_DELAY_SLOT
@@ -1455,7 +1489,7 @@ static void _decode_opc(DisasContext * ctx)
 	LDST(ssr,  0x403e, 0x4037, 0x0032, 0x4033, CHECK_PRIVILEGED)
 	LDST(spc,  0x404e, 0x4047, 0x0042, 0x4043, CHECK_PRIVILEGED)
 	ST(sgr,  0x003a, 0x4032, CHECK_PRIVILEGED)
-	LD(sgr,  0x403a, 0x4036, CHECK_PRIVILEGED if (!(ctx->features & SH_FEATURE_SH4A)) break;)
+        LD(sgr,  0x403a, 0x4036, CHECK_PRIVILEGED CHECK_SH4A)
 	LDST(dbr,  0x40fa, 0x40f6, 0x00fa, 0x40f2, CHECK_PRIVILEGED)
 	LDST(mach, 0x400a, 0x4006, 0x000a, 0x4002, {})
 	LDST(macl, 0x401a, 0x4016, 0x001a, 0x4012, {})
@@ -1505,21 +1539,19 @@ static void _decode_opc(DisasContext * ctx)
         ctx->has_movcal = 1;
 	return;
     case 0x40a9:                /* movua.l @Rm,R0 */
+        CHECK_SH4A
         /* Load non-boundary-aligned data */
-        if (ctx->features & SH_FEATURE_SH4A) {
-            tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx,
-                                MO_TEUL | MO_UNALN);
-            return;
-        }
+        tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx,
+                            MO_TEUL | MO_UNALN);
+        return;
         break;
     case 0x40e9:                /* movua.l @Rm+,R0 */
+        CHECK_SH4A
         /* Load non-boundary-aligned data */
-        if (ctx->features & SH_FEATURE_SH4A) {
-            tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx,
-                                MO_TEUL | MO_UNALN);
-            tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
-            return;
-        }
+        tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx,
+                            MO_TEUL | MO_UNALN);
+        tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
+        return;
         break;
     case 0x0029:		/* movt Rn */
         tcg_gen_mov_i32(REG(B11_8), cpu_sr_t);
@@ -1530,7 +1562,8 @@ static void _decode_opc(DisasContext * ctx)
                If (T == 1) R0 -> (Rn)
                0 -> LDST
         */
-        if (ctx->features & SH_FEATURE_SH4A) {
+        CHECK_SH4A
+        {
             TCGLabel *label = gen_new_label();
             tcg_gen_mov_i32(cpu_sr_t, cpu_ldst);
 	    tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_ldst, 0, label);
@@ -1538,8 +1571,7 @@ static void _decode_opc(DisasContext * ctx)
 	    gen_set_label(label);
 	    tcg_gen_movi_i32(cpu_ldst, 0);
 	    return;
-	} else
-	    break;
+        }
     case 0x0063:
         /* MOVLI.L @Rm,R0
                1 -> LDST
@@ -1547,13 +1579,11 @@ static void _decode_opc(DisasContext * ctx)
                When interrupt/exception
                occurred 0 -> LDST
         */
-	if (ctx->features & SH_FEATURE_SH4A) {
-	    tcg_gen_movi_i32(cpu_ldst, 0);
-            tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TESL);
-	    tcg_gen_movi_i32(cpu_ldst, 1);
-	    return;
-	} else
-	    break;
+        CHECK_SH4A
+        tcg_gen_movi_i32(cpu_ldst, 0);
+        tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TESL);
+        tcg_gen_movi_i32(cpu_ldst, 1);
+        return;
     case 0x0093:		/* ocbi @Rn */
 	{
             gen_helper_ocbi(cpu_env, REG(B11_8));
@@ -1568,20 +1598,15 @@ static void _decode_opc(DisasContext * ctx)
     case 0x0083:		/* pref @Rn */
 	return;
     case 0x00d3:		/* prefi @Rn */
-	if (ctx->features & SH_FEATURE_SH4A)
-	    return;
-	else
-	    break;
+        CHECK_SH4A
+        return;
     case 0x00e3:		/* icbi @Rn */
-	if (ctx->features & SH_FEATURE_SH4A)
-	    return;
-	else
-	    break;
+        CHECK_SH4A
+        return;
     case 0x00ab:		/* synco */
-        if (ctx->features & SH_FEATURE_SH4A) {
-            tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
-            return;
-        }
+        CHECK_SH4A
+        tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+        return;
         break;
     case 0x4024:		/* rotcl Rn */
 	{
@@ -1653,98 +1678,88 @@ static void _decode_opc(DisasContext * ctx)
         return;
     case 0xf00d: /* fsts FPUL,FRn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
-	tcg_gen_mov_i32(cpu_fregs[FREG(B11_8)], cpu_fpul);
+        tcg_gen_mov_i32(FREG(B11_8), cpu_fpul);
 	return;
     case 0xf01d: /* flds FRm,FPUL - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
-	tcg_gen_mov_i32(cpu_fpul, cpu_fregs[FREG(B11_8)]);
+        tcg_gen_mov_i32(cpu_fpul, FREG(B11_8));
 	return;
     case 0xf02d: /* float FPUL,FRn/DRn - FPSCR: R[PR,Enable.I]/W[Cause,Flag] */
 	CHECK_FPU_ENABLED
         if (ctx->tbflags & FPSCR_PR) {
 	    TCGv_i64 fp;
-	    if (ctx->opcode & 0x0100)
-		break; /* illegal instruction */
+            if (ctx->opcode & 0x0100) {
+                goto do_illegal;
+            }
 	    fp = tcg_temp_new_i64();
             gen_helper_float_DT(fp, cpu_env, cpu_fpul);
-	    gen_store_fpr64(fp, DREG(B11_8));
+            gen_store_fpr64(ctx, fp, B11_8);
 	    tcg_temp_free_i64(fp);
 	}
 	else {
-            gen_helper_float_FT(cpu_fregs[FREG(B11_8)], cpu_env, cpu_fpul);
+            gen_helper_float_FT(FREG(B11_8), cpu_env, cpu_fpul);
 	}
 	return;
     case 0xf03d: /* ftrc FRm/DRm,FPUL - FPSCR: R[PR,Enable.V]/W[Cause,Flag] */
 	CHECK_FPU_ENABLED
         if (ctx->tbflags & FPSCR_PR) {
 	    TCGv_i64 fp;
-	    if (ctx->opcode & 0x0100)
-		break; /* illegal instruction */
+            if (ctx->opcode & 0x0100) {
+                goto do_illegal;
+            }
 	    fp = tcg_temp_new_i64();
-	    gen_load_fpr64(fp, DREG(B11_8));
+            gen_load_fpr64(ctx, fp, B11_8);
             gen_helper_ftrc_DT(cpu_fpul, cpu_env, fp);
 	    tcg_temp_free_i64(fp);
 	}
 	else {
-            gen_helper_ftrc_FT(cpu_fpul, cpu_env, cpu_fregs[FREG(B11_8)]);
+            gen_helper_ftrc_FT(cpu_fpul, cpu_env, FREG(B11_8));
 	}
 	return;
     case 0xf04d: /* fneg FRn/DRn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
-	{
-	    gen_helper_fneg_T(cpu_fregs[FREG(B11_8)], cpu_fregs[FREG(B11_8)]);
-	}
+        tcg_gen_xori_i32(FREG(B11_8), FREG(B11_8), 0x80000000);
 	return;
-    case 0xf05d: /* fabs FRn/DRn */
+    case 0xf05d: /* fabs FRn/DRn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
-        if (ctx->tbflags & FPSCR_PR) {
-	    if (ctx->opcode & 0x0100)
-		break; /* illegal instruction */
-	    TCGv_i64 fp = tcg_temp_new_i64();
-	    gen_load_fpr64(fp, DREG(B11_8));
-	    gen_helper_fabs_DT(fp, fp);
-	    gen_store_fpr64(fp, DREG(B11_8));
-	    tcg_temp_free_i64(fp);
-	} else {
-	    gen_helper_fabs_FT(cpu_fregs[FREG(B11_8)], cpu_fregs[FREG(B11_8)]);
-	}
+        tcg_gen_andi_i32(FREG(B11_8), FREG(B11_8), 0x7fffffff);
 	return;
     case 0xf06d: /* fsqrt FRn */
 	CHECK_FPU_ENABLED
         if (ctx->tbflags & FPSCR_PR) {
-	    if (ctx->opcode & 0x0100)
-		break; /* illegal instruction */
+            if (ctx->opcode & 0x0100) {
+                goto do_illegal;
+            }
 	    TCGv_i64 fp = tcg_temp_new_i64();
-	    gen_load_fpr64(fp, DREG(B11_8));
+            gen_load_fpr64(ctx, fp, B11_8);
             gen_helper_fsqrt_DT(fp, cpu_env, fp);
-	    gen_store_fpr64(fp, DREG(B11_8));
+            gen_store_fpr64(ctx, fp, B11_8);
 	    tcg_temp_free_i64(fp);
 	} else {
-            gen_helper_fsqrt_FT(cpu_fregs[FREG(B11_8)], cpu_env,
-                                cpu_fregs[FREG(B11_8)]);
+            gen_helper_fsqrt_FT(FREG(B11_8), cpu_env, FREG(B11_8));
 	}
 	return;
     case 0xf07d: /* fsrra FRn */
 	CHECK_FPU_ENABLED
+        CHECK_FPSCR_PR_0
+        gen_helper_fsrra_FT(FREG(B11_8), cpu_env, FREG(B11_8));
 	break;
     case 0xf08d: /* fldi0 FRn - FPSCR: R[PR] */
 	CHECK_FPU_ENABLED
-        if (!(ctx->tbflags & FPSCR_PR)) {
-	    tcg_gen_movi_i32(cpu_fregs[FREG(B11_8)], 0);
-	}
-	return;
+        CHECK_FPSCR_PR_0
+        tcg_gen_movi_i32(FREG(B11_8), 0);
+        return;
     case 0xf09d: /* fldi1 FRn - FPSCR: R[PR] */
 	CHECK_FPU_ENABLED
-        if (!(ctx->tbflags & FPSCR_PR)) {
-	    tcg_gen_movi_i32(cpu_fregs[FREG(B11_8)], 0x3f800000);
-	}
-	return;
+        CHECK_FPSCR_PR_0
+        tcg_gen_movi_i32(FREG(B11_8), 0x3f800000);
+        return;
     case 0xf0ad: /* fcnvsd FPUL,DRn */
 	CHECK_FPU_ENABLED
 	{
 	    TCGv_i64 fp = tcg_temp_new_i64();
             gen_helper_fcnvsd_FT_DT(fp, cpu_env, cpu_fpul);
-	    gen_store_fpr64(fp, DREG(B11_8));
+            gen_store_fpr64(ctx, fp, B11_8);
 	    tcg_temp_free_i64(fp);
 	}
 	return;
@@ -1752,17 +1767,17 @@ static void _decode_opc(DisasContext * ctx)
 	CHECK_FPU_ENABLED
 	{
 	    TCGv_i64 fp = tcg_temp_new_i64();
-	    gen_load_fpr64(fp, DREG(B11_8));
+            gen_load_fpr64(ctx, fp, B11_8);
             gen_helper_fcnvds_DT_FT(cpu_fpul, cpu_env, fp);
 	    tcg_temp_free_i64(fp);
 	}
 	return;
     case 0xf0ed: /* fipr FVm,FVn */
         CHECK_FPU_ENABLED
-        if ((ctx->tbflags & FPSCR_PR) == 0) {
-            TCGv m, n;
-            m = tcg_const_i32((ctx->opcode >> 8) & 3);
-            n = tcg_const_i32((ctx->opcode >> 10) & 3);
+        CHECK_FPSCR_PR_1
+        {
+            TCGv m = tcg_const_i32((ctx->opcode >> 8) & 3);
+            TCGv n = tcg_const_i32((ctx->opcode >> 10) & 3);
             gen_helper_fipr(cpu_env, m, n);
             tcg_temp_free(m);
             tcg_temp_free(n);
@@ -1771,10 +1786,12 @@ static void _decode_opc(DisasContext * ctx)
         break;
     case 0xf0fd: /* ftrv XMTRX,FVn */
         CHECK_FPU_ENABLED
-        if ((ctx->opcode & 0x0300) == 0x0100 &&
-            (ctx->tbflags & FPSCR_PR) == 0) {
-            TCGv n;
-            n = tcg_const_i32((ctx->opcode >> 10) & 3);
+        CHECK_FPSCR_PR_1
+        {
+            if ((ctx->opcode & 0x0300) != 0x0100) {
+                goto do_illegal;
+            }
+            TCGv n = tcg_const_i32((ctx->opcode >> 10) & 3);
             gen_helper_ftrv(cpu_env, n);
             tcg_temp_free(n);
             return;
@@ -1786,13 +1803,27 @@ static void _decode_opc(DisasContext * ctx)
 	    ctx->opcode, ctx->pc);
     fflush(stderr);
 #endif
-    gen_save_cpu_state(ctx, true);
+ do_illegal:
     if (ctx->envflags & DELAY_SLOT_MASK) {
+ do_illegal_slot:
+        gen_save_cpu_state(ctx, true);
         gen_helper_raise_slot_illegal_instruction(cpu_env);
     } else {
+        gen_save_cpu_state(ctx, true);
         gen_helper_raise_illegal_instruction(cpu_env);
     }
     ctx->bstate = BS_EXCP;
+    return;
+
+ do_fpu_disabled:
+    gen_save_cpu_state(ctx, true);
+    if (ctx->envflags & DELAY_SLOT_MASK) {
+        gen_helper_raise_slot_fpu_disable(cpu_env);
+    } else {
+        gen_helper_raise_fpu_disable(cpu_env);
+    }
+    ctx->bstate = BS_EXCP;
+    return;
 }
 
 static void decode_opc(DisasContext * ctx)
@@ -1804,6 +1835,18 @@ static void decode_opc(DisasContext * ctx)
     if (old_flags & DELAY_SLOT_MASK) {
         /* go out of the delay slot */
         ctx->envflags &= ~DELAY_SLOT_MASK;
+
+        /* When in an exclusive region, we must continue to the end
+           for conditional branches.  */
+        if (ctx->tbflags & GUSA_EXCLUSIVE
+            && old_flags & DELAY_SLOT_CONDITIONAL) {
+            gen_delayed_conditional_jump(ctx);
+            return;
+        }
+        /* Otherwise this is probably an invalid gUSA region.
+           Drop the GUSA bits so the next TB doesn't see them.  */
+        ctx->envflags &= ~GUSA_MASK;
+
         tcg_gen_movi_i32(cpu_flags, ctx->envflags);
         ctx->bstate = BS_BRANCH;
         if (old_flags & DELAY_SLOT_CONDITIONAL) {
@@ -1811,9 +1854,381 @@ static void decode_opc(DisasContext * ctx)
         } else {
             gen_jump(ctx);
 	}
+    }
+}
+
+#ifdef CONFIG_USER_ONLY
+/* For uniprocessors, SH4 uses optimistic restartable atomic sequences.
+   Upon an interrupt, a real kernel would simply notice magic values in
+   the registers and reset the PC to the start of the sequence.
+
+   For QEMU, we cannot do this in quite the same way.  Instead, we notice
+   the normal start of such a sequence (mov #-x,r15).  While we can handle
+   any sequence via cpu_exec_step_atomic, we can recognize the "normal"
+   sequences and transform them into atomic operations as seen by the host.
+*/
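+/* A typical region, e.g. an atomic increment (register choices are
+   illustrative):
+
+       mova   1f,r0        ! r0 = address of the end of the region
+       mov    r15,r1       ! save the real stack pointer
+       mov    #-6,r15      ! r15 = -(region size); the region begins
+       mov.l  @r4,r2       ! load
+       add    #1,r2        ! operate
+       mov.l  r2,@r4       ! store
+    1: mov    r1,r15       ! restore r15; the region ends
+*/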
+static int decode_gusa(DisasContext *ctx, CPUSH4State *env, int *pmax_insns)
+{
+    uint16_t insns[5];
+    int ld_adr, ld_dst, ld_mop;
+    int op_dst, op_src, op_opc;
+    int mv_src, mt_dst, st_src, st_mop;
+    TCGv op_arg;
+
+    uint32_t pc = ctx->pc;
+    uint32_t pc_end = ctx->tb->cs_base;
+    int backup = sextract32(ctx->tbflags, GUSA_SHIFT, 8);
+    int max_insns = (pc_end - pc) / 2;
+    int i;
+
+    if (pc != pc_end + backup || max_insns < 2) {
+        /* This is a malformed gUSA region.  Don't do anything special,
+           since the interpreter is likely to get confused.  */
+        ctx->envflags &= ~GUSA_MASK;
+        return 0;
+    }
 
+    if (ctx->tbflags & GUSA_EXCLUSIVE) {
+        /* Regardless of single-stepping or the end of the page,
+           we must complete execution of the gUSA region while
+           holding the exclusive lock.  */
+        *pmax_insns = max_insns;
+        return 0;
     }
+
+    /* The state machine below will consume only a few insns.
+       If there are more than that in a region, fail now.  */
+    if (max_insns > ARRAY_SIZE(insns)) {
+        goto fail;
+    }
+
+    /* Read all of the insns for the region.  */
+    for (i = 0; i < max_insns; ++i) {
+        insns[i] = cpu_lduw_code(env, pc + i * 2);
+    }
+
+    ld_adr = ld_dst = ld_mop = -1;
+    mv_src = -1;
+    op_dst = op_src = op_opc = -1;
+    mt_dst = -1;
+    st_src = st_mop = -1;
+    TCGV_UNUSED(op_arg);
+    i = 0;
+
+#define NEXT_INSN \
+    do { if (i >= max_insns) goto fail; ctx->opcode = insns[i++]; } while (0)
+
+    /*
+     * Expect a load to begin the region.
+     */
+    NEXT_INSN;
+    switch (ctx->opcode & 0xf00f) {
+    case 0x6000: /* mov.b @Rm,Rn */
+        ld_mop = MO_SB;
+        break;
+    case 0x6001: /* mov.w @Rm,Rn */
+        ld_mop = MO_TESW;
+        break;
+    case 0x6002: /* mov.l @Rm,Rn */
+        ld_mop = MO_TESL;
+        break;
+    default:
+        goto fail;
+    }
+    ld_adr = B7_4;
+    ld_dst = B11_8;
+    if (ld_adr == ld_dst) {
+        goto fail;
+    }
+    /* Unless we see a mov, any two-operand operation must use ld_dst.  */
+    op_dst = ld_dst;
+
+    /*
+     * Expect an optional register move.
+     */
+    NEXT_INSN;
+    switch (ctx->opcode & 0xf00f) {
+    case 0x6003: /* mov Rm,Rn */
+        /* Here we want to recognize ld_dst being saved for later consumption,
+           or for another input register being copied so that ld_dst need not
+           be clobbered during the operation.  */
+        op_dst = B11_8;
+        mv_src = B7_4;
+        if (op_dst == ld_dst) {
+            /* Overwriting the load output.  */
+            goto fail;
+        }
+        if (mv_src != ld_dst) {
+            /* Copying a new input; constrain op_src to match the load.  */
+            op_src = ld_dst;
+        }
+        break;
+
+    default:
+        /* Put back and re-examine as operation.  */
+        --i;
+    }
+
+    /*
+     * Expect the operation.
+     */
+    NEXT_INSN;
+    switch (ctx->opcode & 0xf00f) {
+    case 0x300c: /* add Rm,Rn */
+        op_opc = INDEX_op_add_i32;
+        goto do_reg_op;
+    case 0x2009: /* and Rm,Rn */
+        op_opc = INDEX_op_and_i32;
+        goto do_reg_op;
+    case 0x200a: /* xor Rm,Rn */
+        op_opc = INDEX_op_xor_i32;
+        goto do_reg_op;
+    case 0x200b: /* or Rm,Rn */
+        op_opc = INDEX_op_or_i32;
+    do_reg_op:
+        /* The operation register should be as expected, and the
+           other input cannot depend on the load.  */
+        if (op_dst != B11_8) {
+            goto fail;
+        }
+        if (op_src < 0) {
+            /* Unconstrained input.  */
+            op_src = B7_4;
+        } else if (op_src == B7_4) {
+            /* Constrained input matched load.  All operations are
+               commutative; "swap" them by "moving" the load output
+               to the (implicit) first argument and the move source
+               to the (explicit) second argument.  */
+            op_src = mv_src;
+        } else {
+            goto fail;
+        }
+        op_arg = REG(op_src);
+        break;
+
+    case 0x6007: /* not Rm,Rn */
+        if (ld_dst != B7_4 || mv_src >= 0) {
+            goto fail;
+        }
+        op_dst = B11_8;
+        op_opc = INDEX_op_xor_i32;
+        op_arg = tcg_const_i32(-1);
+        break;
+
+    case 0x7000 ... 0x700f: /* add #imm,Rn */
+        if (op_dst != B11_8 || mv_src >= 0) {
+            goto fail;
+        }
+        op_opc = INDEX_op_add_i32;
+        op_arg = tcg_const_i32(B7_0s);
+        break;
+
+    case 0x3000: /* cmp/eq Rm,Rn */
+        /* Looking for the middle of a compare-and-swap sequence,
+           beginning with the compare.  Operands may appear in either
+           order, but only one may overlap the load.  */
+        if ((ld_dst == B11_8) + (ld_dst == B7_4) != 1 || mv_src >= 0) {
+            goto fail;
+        }
+        op_opc = INDEX_op_setcond_i32;  /* placeholder */
+        op_src = (ld_dst == B11_8 ? B7_4 : B11_8);
+        op_arg = REG(op_src);
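+        /* A typical shape (registers illustrative) is:
+               cmp/eq r2,r5    ! T = (loaded value == expected)
+               bf     1f       ! skip the store if they differ
+               mov.l  r6,@r4   ! store the replacement
+            1:
+           optionally using bf/s with a movt in the delay slot to
+           capture T, as matched below.  */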
+
+        NEXT_INSN;
+        switch (ctx->opcode & 0xff00) {
+        case 0x8b00: /* bf label */
+        case 0x8f00: /* bf/s label */
+            if (pc + (i + 1 + B7_0s) * 2 != pc_end) {
+                goto fail;
+            }
+            if ((ctx->opcode & 0xff00) == 0x8b00) { /* bf label */
+                break;
+            }
+            /* We're looking to unconditionally modify Rn with the
+               result of the comparison, within the delay slot of
+               the branch.  This is used by older gcc.  */
+            NEXT_INSN;
+            if ((ctx->opcode & 0xf0ff) == 0x0029) { /* movt Rn */
+                mt_dst = B11_8;
+            } else {
+                goto fail;
+            }
+            break;
+
+        default:
+            goto fail;
+        }
+        break;
+
+    case 0x2008: /* tst Rm,Rn */
+        /* Looking for a compare-and-swap against zero.  */
+        if (ld_dst != B11_8 || ld_dst != B7_4 || mv_src >= 0) {
+            goto fail;
+        }
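+        /* The shape matched here (registers illustrative) is:
+               mov.l  @r4,r2   ! load
+               tst    r2,r2    ! T = (r2 == 0)
+               bt     1f       ! branch to the end when zero
+               mov.l  r5,@r4   ! store
+            1:  */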
+        op_opc = INDEX_op_setcond_i32;
+        op_arg = tcg_const_i32(0);
+
+        NEXT_INSN;
+        if ((ctx->opcode & 0xff00) != 0x8900 /* bt label */
+            || pc + (i + 1 + B7_0s) * 2 != pc_end) {
+            goto fail;
+        }
+        break;
+
+    default:
+        /* Put back and re-examine as the store.  */
+        --i;
+    }
+
+    /*
+     * Expect the store.
+     */
+    /* The store must be the last insn.  */
+    if (i != max_insns - 1) {
+        goto fail;
+    }
+    NEXT_INSN;
+    switch (ctx->opcode & 0xf00f) {
+    case 0x2000: /* mov.b Rm,@Rn */
+        st_mop = MO_UB;
+        break;
+    case 0x2001: /* mov.w Rm,@Rn */
+        st_mop = MO_UW;
+        break;
+    case 0x2002: /* mov.l Rm,@Rn */
+        st_mop = MO_UL;
+        break;
+    default:
+        goto fail;
+    }
+    /* The store must match the load.  */
+    if (ld_adr != B11_8 || st_mop != (ld_mop & MO_SIZE)) {
+        goto fail;
+    }
+    st_src = B7_4;
+
+#undef NEXT_INSN
+
+    /*
+     * Emit the operation.
+     */
+    tcg_gen_insn_start(pc, ctx->envflags);
+    switch (op_opc) {
+    case -1:
+        /* No operation found.  Look for exchange pattern.  */
+        if (st_src == ld_dst || mv_src >= 0) {
+            goto fail;
+        }
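+        /* E.g. (registers illustrative) "mov.l @r4,r2" followed
+           directly by "mov.l r5,@r4": a plain exchange, valid only
+           while the store source is independent of the loaded value.  */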
+        tcg_gen_atomic_xchg_i32(REG(ld_dst), REG(ld_adr), REG(st_src),
+                                ctx->memidx, ld_mop);
+        break;
+
+    case INDEX_op_add_i32:
+        if (op_dst != st_src) {
+            goto fail;
+        }
+        if (op_dst == ld_dst && st_mop == MO_UL) {
+            tcg_gen_atomic_add_fetch_i32(REG(ld_dst), REG(ld_adr),
+                                         op_arg, ctx->memidx, ld_mop);
+        } else {
+            tcg_gen_atomic_fetch_add_i32(REG(ld_dst), REG(ld_adr),
+                                         op_arg, ctx->memidx, ld_mop);
+            if (op_dst != ld_dst) {
+                /* Note that mop sizes < 4 cannot use add_fetch
+                   because it won't carry into the higher bits.  */
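+                /* E.g. with MO_TESW: load 0x7fff and add 1.  The
+                   guest register ends with 0x00008000, whereas
+                   add_fetch would have returned the sign-extended
+                   16-bit sum, 0xffff8000.  */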
+                tcg_gen_add_i32(REG(op_dst), REG(ld_dst), op_arg);
+            }
+        }
+        break;
+
+    case INDEX_op_and_i32:
+        if (op_dst != st_src) {
+            goto fail;
+        }
+        if (op_dst == ld_dst) {
+            tcg_gen_atomic_and_fetch_i32(REG(ld_dst), REG(ld_adr),
+                                         op_arg, ctx->memidx, ld_mop);
+        } else {
+            tcg_gen_atomic_fetch_and_i32(REG(ld_dst), REG(ld_adr),
+                                         op_arg, ctx->memidx, ld_mop);
+            tcg_gen_and_i32(REG(op_dst), REG(ld_dst), op_arg);
+        }
+        break;
+
+    case INDEX_op_or_i32:
+        if (op_dst != st_src) {
+            goto fail;
+        }
+        if (op_dst == ld_dst) {
+            tcg_gen_atomic_or_fetch_i32(REG(ld_dst), REG(ld_adr),
+                                        op_arg, ctx->memidx, ld_mop);
+        } else {
+            tcg_gen_atomic_fetch_or_i32(REG(ld_dst), REG(ld_adr),
+                                        op_arg, ctx->memidx, ld_mop);
+            tcg_gen_or_i32(REG(op_dst), REG(ld_dst), op_arg);
+        }
+        break;
+
+    case INDEX_op_xor_i32:
+        if (op_dst != st_src) {
+            goto fail;
+        }
+        if (op_dst == ld_dst) {
+            tcg_gen_atomic_xor_fetch_i32(REG(ld_dst), REG(ld_adr),
+                                         op_arg, ctx->memidx, ld_mop);
+        } else {
+            tcg_gen_atomic_fetch_xor_i32(REG(ld_dst), REG(ld_adr),
+                                         op_arg, ctx->memidx, ld_mop);
+            tcg_gen_xor_i32(REG(op_dst), REG(ld_dst), op_arg);
+        }
+        break;
+
+    case INDEX_op_setcond_i32:
+        if (st_src == ld_dst) {
+            goto fail;
+        }
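+        /* cmpxchg stores st_src only when the loaded value equals
+           op_arg, and yields the old value, from which T is then
+           recomputed just as the compare insn would have set it.  */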
+        tcg_gen_atomic_cmpxchg_i32(REG(ld_dst), REG(ld_adr), op_arg,
+                                   REG(st_src), ctx->memidx, ld_mop);
+        tcg_gen_setcond_i32(TCG_COND_EQ, cpu_sr_t, REG(ld_dst), op_arg);
+        if (mt_dst >= 0) {
+            tcg_gen_mov_i32(REG(mt_dst), cpu_sr_t);
+        }
+        break;
+
+    default:
+        g_assert_not_reached();
+    }
+
+    /* If op_src is not a valid register, then op_arg was a constant.  */
+    if (op_src < 0) {
+        tcg_temp_free_i32(op_arg);
+    }
+
+    /* The entire region has been translated.  */
+    ctx->envflags &= ~GUSA_MASK;
+    ctx->pc = pc_end;
+    return max_insns;
+
+ fail:
+    qemu_log_mask(LOG_UNIMP, "Unrecognized gUSA sequence %08x-%08x\n",
+                  pc, pc_end);
+
+    /* Restart with the EXCLUSIVE bit set, within a TB run via
+       cpu_exec_step_atomic holding the exclusive lock.  */
+    tcg_gen_insn_start(pc, ctx->envflags);
+    ctx->envflags |= GUSA_EXCLUSIVE;
+    gen_save_cpu_state(ctx, false);
+    gen_helper_exclusive(cpu_env);
+    ctx->bstate = BS_EXCP;
+
+    /* We're not executing an instruction, but we must report one for the
+       purposes of accounting within the TB.  We might as well report the
+       entire region consumed via ctx->pc so that it's immediately available
+       in the disassembly dump.  */
+    ctx->pc = pc_end;
+    return 1;
 }
+#endif
 
 void gen_intermediate_code(CPUSH4State * env, struct TranslationBlock *tb)
 {
@@ -1827,7 +2242,7 @@ void gen_intermediate_code(CPUSH4State * env, struct TranslationBlock *tb)
     pc_start = tb->pc;
     ctx.pc = pc_start;
     ctx.tbflags = (uint32_t)tb->flags;
-    ctx.envflags = tb->flags & DELAY_SLOT_MASK;
+    ctx.envflags = tb->flags & TB_FLAG_ENVFLAGS_MASK;
     ctx.bstate = BS_NONE;
     ctx.memidx = (ctx.tbflags & (1u << SR_MD)) == 0 ? 1 : 0;
     /* We don't know if the delayed pc came from a dynamic or static branch,
@@ -1837,18 +2252,38 @@ void gen_intermediate_code(CPUSH4State * env, struct TranslationBlock *tb)
     ctx.singlestep_enabled = cs->singlestep_enabled;
     ctx.features = env->features;
     ctx.has_movcal = (ctx.tbflags & TB_FLAG_PENDING_MOVCA);
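+    /* With SR.MD and SR.RB both set, insns access the banked copies
+       of r0-r7, stored 0x10 slots further along in gregs; FPSCR.FR
+       similarly selects the second half of the fp register file.  */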
+    ctx.gbank = ((ctx.tbflags & (1 << SR_MD)) &&
+                 (ctx.tbflags & (1 << SR_RB))) * 0x10;
+    ctx.fbank = ctx.tbflags & FPSCR_FR ? 0x10 : 0;
 
-    num_insns = 0;
     max_insns = tb->cflags & CF_COUNT_MASK;
     if (max_insns == 0) {
         max_insns = CF_COUNT_MASK;
     }
-    if (max_insns > TCG_MAX_INSNS) {
-        max_insns = TCG_MAX_INSNS;
+    max_insns = MIN(max_insns, TCG_MAX_INSNS);
+
+    /* Since the ISA is fixed-width, we can bound the translation by
+       the number of instructions remaining on the page.  */
+    num_insns = -(ctx.pc | TARGET_PAGE_MASK) / 2;
+    max_insns = MIN(max_insns, num_insns);
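+    /* For example, with 4KiB pages and ctx.pc == 0x1000fffc, the
+       bound above is -(0x1000fffc | 0xfffff000) / 2 == 2 insns.  */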
+
+    /* Single stepping means just that.  */
+    if (ctx.singlestep_enabled || singlestep) {
+        max_insns = 1;
     }
 
     gen_tb_start(tb);
-    while (ctx.bstate == BS_NONE && !tcg_op_buf_full()) {
+    num_insns = 0;
+
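+    /* When starting inside a gUSA region, decode_gusa either
+       translates the entire region at once (returning max_insns, so
+       the loop below exits immediately) or emits a call arranging for
+       the sequence to be restarted exclusively, with ctx.bstate set
+       to BS_EXCP.  */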
+#ifdef CONFIG_USER_ONLY
+    if (ctx.tbflags & GUSA_MASK) {
+        num_insns = decode_gusa(&ctx, env, &max_insns);
+    }
+#endif
+
+    while (ctx.bstate == BS_NONE
+           && num_insns < max_insns
+           && !tcg_op_buf_full()) {
         tcg_gen_insn_start(ctx.pc, ctx.envflags);
         num_insns++;
 
@@ -1872,18 +2307,16 @@ void gen_intermediate_code(CPUSH4State * env, struct TranslationBlock *tb)
         ctx.opcode = cpu_lduw_code(env, ctx.pc);
 	decode_opc(&ctx);
 	ctx.pc += 2;
-	if ((ctx.pc & (TARGET_PAGE_SIZE - 1)) == 0)
-	    break;
-        if (cs->singlestep_enabled) {
-	    break;
-        }
-        if (num_insns >= max_insns)
-            break;
-        if (singlestep)
-            break;
     }
-    if (tb->cflags & CF_LAST_IO)
+    if (tb->cflags & CF_LAST_IO) {
         gen_io_end();
+    }
+
+    if (ctx.tbflags & GUSA_EXCLUSIVE) {
+        /* Ending the region of exclusivity.  Clear the bits.  */
+        ctx.envflags &= ~GUSA_MASK;
+    }
+
     if (cs->singlestep_enabled) {
         gen_save_cpu_state(&ctx, true);
         gen_helper_debug(cpu_env);