Diffstat (limited to 'tcg')
-rw-r--r--  tcg/README                 |   43
-rw-r--r--  tcg/aarch64/tcg-target.c   |   51
-rw-r--r--  tcg/aarch64/tcg-target.h   |    2
-rw-r--r--  tcg/arm/tcg-target.c       |  802
-rw-r--r--  tcg/arm/tcg-target.h       |    2
-rw-r--r--  tcg/hppa/tcg-target.c      | 1831
-rw-r--r--  tcg/hppa/tcg-target.h      |  123
-rw-r--r--  tcg/i386/tcg-target.c      |  671
-rw-r--r--  tcg/i386/tcg-target.h      |    2
-rw-r--r--  tcg/ia64/tcg-target.c      |    2
-rw-r--r--  tcg/ia64/tcg-target.h      |    2
-rw-r--r--  tcg/mips/tcg-target.c      |    2
-rw-r--r--  tcg/mips/tcg-target.h      |    2
-rw-r--r--  tcg/optimize.c             |   12
-rw-r--r--  tcg/ppc/tcg-target.c       |  684
-rw-r--r--  tcg/ppc/tcg-target.h       |    2
-rw-r--r--  tcg/ppc64/tcg-target.c     | 1162
-rw-r--r--  tcg/ppc64/tcg-target.h     |    2
-rw-r--r--  tcg/s390/tcg-target.c      |    2
-rw-r--r--  tcg/s390/tcg-target.h      |    2
-rw-r--r--  tcg/sparc/tcg-target.c     |    2
-rw-r--r--  tcg/sparc/tcg-target.h     |    4
-rw-r--r--  tcg/tcg-be-ldst.h          |   90
-rw-r--r--  tcg/tcg-be-null.h          |   43
-rw-r--r--  tcg/tcg-op.h               |  239
-rw-r--r--  tcg/tcg-opc.h              |   96
-rw-r--r--  tcg/tcg.c                  |  338
-rw-r--r--  tcg/tcg.h                  |  166
-rw-r--r--  tcg/tci/tcg-target.c       |    3
-rw-r--r--  tcg/tci/tcg-target.h       |    2
30 files changed, 2283 insertions, 4101 deletions
diff --git a/tcg/README b/tcg/README
index 063aeb95ea..f1782123b7 100644
--- a/tcg/README
+++ b/tcg/README
@@ -412,30 +412,25 @@ current TB was linked to this TB. Otherwise execute the next
 instructions. Only indices 0 and 1 are valid and tcg_gen_goto_tb may be issued
 at most once with each slot index per TB.
 
-* qemu_ld8u t0, t1, flags
-qemu_ld8s t0, t1, flags
-qemu_ld16u t0, t1, flags
-qemu_ld16s t0, t1, flags
-qemu_ld32 t0, t1, flags
-qemu_ld32u t0, t1, flags
-qemu_ld32s t0, t1, flags
-qemu_ld64 t0, t1, flags
-
-Load data at the QEMU CPU address t1 into t0. t1 has the QEMU CPU address
-type. 'flags' contains the QEMU memory index (selects user or kernel access)
-for example.
-
-Note that "qemu_ld32" implies a 32-bit result, while "qemu_ld32u" and
-"qemu_ld32s" imply a 64-bit result appropriately extended from 32 bits.
-
-* qemu_st8 t0, t1, flags
-qemu_st16 t0, t1, flags
-qemu_st32 t0, t1, flags
-qemu_st64 t0, t1, flags
-
-Store the data t0 at the QEMU CPU Address t1. t1 has the QEMU CPU
-address type. 'flags' contains the QEMU memory index (selects user or
-kernel access) for example.
+* qemu_ld_i32/i64 t0, t1, flags, memidx
+* qemu_st_i32/i64 t0, t1, flags, memidx
+
+Load data at the guest address t1 into t0, or store data in t0 at guest
+address t1.  The _i32/_i64 suffix applies only to the size of the input/output
+register t0.  The address t1 is always sized according to the guest,
+and the width of the memory operation is controlled by flags.
+
+Both t0 and t1 may be split into little-endian ordered pairs of registers
+if dealing with 64-bit quantities on a 32-bit host.
+
+The memidx selects the qemu tlb index to use (e.g. user or kernel access).
+The flags are the TCGMemOp bits, selecting the sign, width, and endianness
+of the memory access.
+
+For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
+64-bit memory access specified in flags.
+
+*********
 
 Note 1: Some shortcuts are defined when the last operand is known to be
 a constant (e.g. addi for add, movi for mov).
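
To make the flags encoding above concrete, here is a minimal, self-contained
C sketch.  The MO_* values mirror the TCGMemOp definitions this series adds
to tcg/tcg.h; treat the exact numbers as illustrative rather than
authoritative.

    #include <stdio.h>

    /* Sketch of the TCGMemOp bit layout described in the README text. */
    typedef enum {
        MO_8     = 0,
        MO_16    = 1,
        MO_32    = 2,
        MO_64    = 3,
        MO_SIZE  = 3,                 /* mask: width of the access */
        MO_SIGN  = 4,                 /* sign-extend the loaded value */
        MO_BSWAP = 8,                 /* swap bytes relative to host order */
        MO_SSIZE = MO_SIZE | MO_SIGN,
    } TCGMemOp;

    int main(void)
    {
        /* e.g. a sign-extending, byte-swapped 16-bit load */
        TCGMemOp flags = MO_16 | MO_SIGN | MO_BSWAP;

        printf("width=%d bytes, signed=%d, bswap=%d\n",
               1 << (flags & MO_SIZE), !!(flags & MO_SIGN),
               !!(flags & MO_BSWAP));
        return 0;
    }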
diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 6379df1f68..04d7ae328d 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -10,6 +10,7 @@
  * See the COPYING file in the top-level directory for details.
  */
 
+#include "tcg-be-ldst.h"
 #include "qemu/bitops.h"
 
 #ifndef NDEBUG
@@ -778,22 +779,24 @@ static inline void tcg_out_nop(TCGContext *s)
 }
 
 #ifdef CONFIG_SOFTMMU
-/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
-   int mmu_idx) */
+/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+ *                                     int mmu_idx, uintptr_t ra)
+ */
 static const void * const qemu_ld_helpers[4] = {
-    helper_ldb_mmu,
-    helper_ldw_mmu,
-    helper_ldl_mmu,
-    helper_ldq_mmu,
+    helper_ret_ldub_mmu,
+    helper_ret_lduw_mmu,
+    helper_ret_ldul_mmu,
+    helper_ret_ldq_mmu,
 };
 
-/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
-   uintxx_t val, int mmu_idx) */
+/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
+ *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
+ */
 static const void * const qemu_st_helpers[4] = {
-    helper_stb_mmu,
-    helper_stw_mmu,
-    helper_stl_mmu,
-    helper_stq_mmu,
+    helper_ret_stb_mmu,
+    helper_ret_stw_mmu,
+    helper_ret_stl_mmu,
+    helper_ret_stq_mmu,
 };
 
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
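
For reference, the helper signatures named in the comments above, written
out as a compilable sketch.  CPUArchState is left opaque and target_ulong is
assumed 32-bit here; the real declarations live in QEMU's softmmu headers.
The trailing uintptr_t retaddr is the new part: the fast path now passes its
own return address, so the helper can locate the faulting translation block
directly instead of reconstructing it with GETPC-style tricks.

    #include <stdint.h>

    typedef struct CPUArchState CPUArchState;   /* opaque for this sketch */
    typedef uint32_t target_ulong;              /* assumption: 32-bit guest */

    /* helper_ret_ld_mmu family: value returned, retaddr passed last */
    uint8_t  helper_ret_ldub_mmu(CPUArchState *env, target_ulong addr,
                                 int mmu_idx, uintptr_t retaddr);
    uint64_t helper_ret_ldq_mmu(CPUArchState *env, target_ulong addr,
                                int mmu_idx, uintptr_t retaddr);

    /* helper_ret_st_mmu family: value passed, retaddr passed last */
    void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr,
                            uint8_t val, int mmu_idx, uintptr_t retaddr);
    void helper_ret_stq_mmu(CPUArchState *env, target_ulong addr,
                            uint64_t val, int mmu_idx, uintptr_t retaddr);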
@@ -802,6 +805,7 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
     tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, lb->addrlo_reg);
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_X3, (tcg_target_long)lb->raddr);
     tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
                  (tcg_target_long)qemu_ld_helpers[lb->opc & 3]);
     tcg_out_callr(s, TCG_REG_TMP);
@@ -822,6 +826,7 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, lb->addrlo_reg);
     tcg_out_movr(s, 1, TCG_REG_X2, lb->datalo_reg);
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_X4, (tcg_target_long)lb->raddr);
     tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
                  (tcg_target_long)qemu_st_helpers[lb->opc & 3]);
     tcg_out_callr(s, TCG_REG_TMP);
@@ -830,33 +835,13 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     tcg_out_goto(s, (tcg_target_long)lb->raddr);
 }
 
-void tcg_out_tb_finalize(TCGContext *s)
-{
-    int i;
-    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
-        TCGLabelQemuLdst *label = &s->qemu_ldst_labels[i];
-        if (label->is_ld) {
-            tcg_out_qemu_ld_slow_path(s, label);
-        } else {
-            tcg_out_qemu_st_slow_path(s, label);
-        }
-    }
-}
-
 static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc,
                                 TCGReg data_reg, TCGReg addr_reg,
                                 int mem_index,
                                 uint8_t *raddr, uint8_t *label_ptr)
 {
-    int idx;
-    TCGLabelQemuLdst *label;
-
-    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
-        tcg_abort();
-    }
+    TCGLabelQemuLdst *label = new_ldst_label(s);
 
-    idx = s->nb_qemu_ldst_labels++;
-    label = &s->qemu_ldst_labels[idx];
     label->is_ld = is_ld;
     label->opc = opc;
     label->datalo_reg = data_reg;
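
The boilerplate deleted above — the per-backend tcg_out_tb_finalize loop and
the bounds-checked label allocation now done by new_ldst_label() — moves
into the new shared header tcg/tcg-be-ldst.h (90 lines, per the diffstat).
Reconstructed from the duplicated code each backend deletes in this series,
the shared versions are essentially the following sketch (the exact bodies
in tcg-be-ldst.h may differ slightly):

    /* Allocate a new label record for a qemu_ld/st slow path. */
    static inline TCGLabelQemuLdst *new_ldst_label(TCGContext *s)
    {
        int n = s->nb_qemu_ldst_labels;

        assert(n < TCG_MAX_QEMU_LDST);
        s->nb_qemu_ldst_labels = n + 1;
        return &s->qemu_ldst_labels[n];
    }

    /* Generate TB finalization at the end of block: emit every recorded
       slow path after the fast-path code. */
    void tcg_out_tb_finalize(TCGContext *s)
    {
        int i;

        for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
            TCGLabelQemuLdst *label = &s->qemu_ldst_labels[i];
            if (label->is_ld) {
                tcg_out_qemu_ld_slow_path(s, label);
            } else {
                tcg_out_qemu_st_slow_path(s, label);
            }
        }
    }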
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index d3a1bc2437..82ad919518 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -96,6 +96,8 @@ enum {
     TCG_AREG0 = TCG_REG_X19,
 };
 
+#define TCG_TARGET_HAS_new_ldst         0
+
 static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
 {
     __builtin___clear_cache((char *)start, (char *)stop);
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index eb0e84ce44..e93a4a237b 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -22,6 +22,8 @@
  * THE SOFTWARE.
  */
 
+#include "tcg-be-ldst.h"
+
 /* The __ARM_ARCH define is provided by gcc 4.8.  Construct it otherwise.  */
 #ifndef __ARM_ARCH
 # if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
@@ -175,24 +177,16 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         ct->ct |= TCG_CT_REG;
         tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
 #ifdef CONFIG_SOFTMMU
-        /* r0-r2 will be overwritten when reading the tlb entry,
+        /* r0-r2,lr will be overwritten when reading the tlb entry,
            so don't use these. */
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
-#endif
-        break;
-    case 'L':
-        ct->ct |= TCG_CT_REG;
-        tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
-#ifdef CONFIG_SOFTMMU
-        /* r1 is still needed to load data_reg or data_reg2,
-           so don't use it. */
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
 #endif
         break;
 
-    /* qemu_st address & data_reg */
+    /* qemu_st address & data */
     case 's':
         ct->ct |= TCG_CT_REG;
         tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
@@ -207,6 +201,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         /* Avoid clashes with registers being used for helper args */
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
 #endif
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
 #endif
         break;
 
@@ -320,6 +315,9 @@ typedef enum {
     INSN_STRB_REG  = 0x06400000,
 
     INSN_LDRD_IMM  = 0x004000d0,
+    INSN_LDRD_REG  = 0x000000d0,
+    INSN_STRD_IMM  = 0x004000f0,
+    INSN_STRD_REG  = 0x000000f0,
 } ARMInsn;
 
 #define SHIFT_IMM_LSL(im)	(((im) << 7) | 0x00)
@@ -379,13 +377,17 @@ static inline void tcg_out_b_noaddr(TCGContext *s, int cond)
     /* We pay attention here to not modify the branch target by skipping
       the corresponding bytes. This ensures that caches and memory are
        kept coherent during retranslation. */
-#ifdef HOST_WORDS_BIGENDIAN
-    tcg_out8(s, (cond << 4) | 0x0a);
-    s->code_ptr += 3;
-#else
     s->code_ptr += 3;
     tcg_out8(s, (cond << 4) | 0x0a);
-#endif
+}
+
+static inline void tcg_out_bl_noaddr(TCGContext *s, int cond)
+{
+    /* We pay attention here to not modify the branch target by skipping
+       the corresponding bytes. This ensures that caches and memory are
+       kept coherent during retranslation. */
+    s->code_ptr += 3;
+    tcg_out8(s, (cond << 4) | 0x0b);
 }
 
 static inline void tcg_out_bl(TCGContext *s, int cond, int32_t offset)
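
A note on the noaddr emitters above: on a little-endian host, byte 3 of the
ARM instruction word holds the condition and opcode while bytes 0-2 hold the
branch displacement, so writing only byte 3 leaves a displacement patched in
by a previous translation untouched.  A stand-alone sketch of the trick
(COND_NE's value is the ARM condition-code encoding for "not equal"):

    #include <stdint.h>
    #include <stdio.h>

    #define COND_NE 0x1   /* ARM condition code NE */

    /* Mimics tcg_out_bl_noaddr on a little-endian host: touch only the
       condition/opcode byte, leave the 24-bit displacement alone. */
    static void bl_noaddr(uint8_t *code_ptr, int cond)
    {
        code_ptr[3] = (cond << 4) | 0x0b;   /* 0x0b = BL */
    }

    int main(void)
    {
        uint8_t insn[4] = { 0x12, 0x34, 0x56, 0x00 };  /* stale target */
        bl_noaddr(insn, COND_NE);
        printf("%02x %02x %02x %02x\n", insn[0], insn[1], insn[2], insn[3]);
        /* prints: 12 34 56 1b -- displacement bytes preserved */
        return 0;
    }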
@@ -810,6 +812,30 @@ static inline void tcg_out_st32_r(TCGContext *s, int cond, TCGReg rt,
     tcg_out_memop_r(s, cond, INSN_STR_REG, rt, rn, rm, 1, 1, 0);
 }
 
+static inline void tcg_out_ldrd_8(TCGContext *s, int cond, TCGReg rt,
+                                   TCGReg rn, int imm8)
+{
+    tcg_out_memop_8(s, cond, INSN_LDRD_IMM, rt, rn, imm8, 1, 0);
+}
+
+static inline void tcg_out_ldrd_r(TCGContext *s, int cond, TCGReg rt,
+                                  TCGReg rn, TCGReg rm)
+{
+    tcg_out_memop_r(s, cond, INSN_LDRD_REG, rt, rn, rm, 1, 1, 0);
+}
+
+static inline void tcg_out_strd_8(TCGContext *s, int cond, TCGReg rt,
+                                   TCGReg rn, int imm8)
+{
+    tcg_out_memop_8(s, cond, INSN_STRD_IMM, rt, rn, imm8, 1, 0);
+}
+
+static inline void tcg_out_strd_r(TCGContext *s, int cond, TCGReg rt,
+                                  TCGReg rn, TCGReg rm)
+{
+    tcg_out_memop_r(s, cond, INSN_STRD_REG, rt, rn, rm, 1, 1, 0);
+}
+
 /* Register pre-increment with base writeback.  */
 static inline void tcg_out_ld32_rwb(TCGContext *s, int cond, TCGReg rt,
                                     TCGReg rn, TCGReg rm)
@@ -975,34 +1001,27 @@ static inline void tcg_out_st8(TCGContext *s, int cond,
         tcg_out_st8_12(s, cond, rd, rn, offset);
 }
 
-/* The _goto case is normally between TBs within the same code buffer,
- * and with the code buffer limited to 16MB we shouldn't need the long
- * case.
- *
- * .... except to the prologue that is in its own buffer.
+/* The _goto case is normally between TBs within the same code buffer, and
+ * with the code buffer limited to 16MB we wouldn't need the long case.
+ * But we also use it for the tail-call to the qemu_ld/st helpers,
+ * which does need the long case.
  */
 static inline void tcg_out_goto(TCGContext *s, int cond, uint32_t addr)
 {
-    int32_t val;
+    int32_t disp = addr - (tcg_target_long) s->code_ptr;
 
-    if (addr & 1) {
-        /* goto to a Thumb destination isn't supported */
-        tcg_abort();
+    if ((addr & 1) == 0 && disp - 8 < 0x01fffffd && disp - 8 > -0x01fffffd) {
+        tcg_out_b(s, cond, disp);
+        return;
     }
 
-    val = addr - (tcg_target_long) s->code_ptr;
-    if (val - 8 < 0x01fffffd && val - 8 > -0x01fffffd)
-        tcg_out_b(s, cond, val);
-    else {
-        if (cond == COND_AL) {
-            tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
-            tcg_out32(s, addr);
-        } else {
-            tcg_out_movi32(s, cond, TCG_REG_TMP, val - 8);
-            tcg_out_dat_reg(s, cond, ARITH_ADD,
-                            TCG_REG_PC, TCG_REG_PC,
-                            TCG_REG_TMP, SHIFT_IMM_LSL(0));
+    tcg_out_movi32(s, cond, TCG_REG_TMP, addr);
+    if (use_armv5t_instructions) {
+        tcg_out_bx(s, cond, TCG_REG_TMP);
+    } else {
+        if (addr & 1) {
+            tcg_abort();
         }
+        tcg_out_mov_reg(s, cond, TCG_REG_PC, TCG_REG_TMP);
     }
 }
 
@@ -1057,23 +1076,37 @@ static inline void tcg_out_goto_label(TCGContext *s, int cond, int label_index)
 }
 
 #ifdef CONFIG_SOFTMMU
-
-/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
-   int mmu_idx) */
-static const void * const qemu_ld_helpers[4] = {
-    helper_ldb_mmu,
-    helper_ldw_mmu,
-    helper_ldl_mmu,
-    helper_ldq_mmu,
+/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+ *                                     int mmu_idx, uintptr_t ra)
+ */
+static const void * const qemu_ld_helpers[16] = {
+    [MO_UB]   = helper_ret_ldub_mmu,
+    [MO_SB]   = helper_ret_ldsb_mmu,
+
+    [MO_LEUW] = helper_le_lduw_mmu,
+    [MO_LEUL] = helper_le_ldul_mmu,
+    [MO_LEQ]  = helper_le_ldq_mmu,
+    [MO_LESW] = helper_le_ldsw_mmu,
+    [MO_LESL] = helper_le_ldul_mmu,
+
+    [MO_BEUW] = helper_be_lduw_mmu,
+    [MO_BEUL] = helper_be_ldul_mmu,
+    [MO_BEQ]  = helper_be_ldq_mmu,
+    [MO_BESW] = helper_be_ldsw_mmu,
+    [MO_BESL] = helper_be_ldul_mmu,
 };
 
-/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
-   uintxx_t val, int mmu_idx) */
-static const void * const qemu_st_helpers[4] = {
-    helper_stb_mmu,
-    helper_stw_mmu,
-    helper_stl_mmu,
-    helper_stq_mmu,
+/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
+ *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
+ */
+static const void * const qemu_st_helpers[16] = {
+    [MO_UB]   = helper_ret_stb_mmu,
+    [MO_LEUW] = helper_le_stw_mmu,
+    [MO_LEUL] = helper_le_stl_mmu,
+    [MO_LEQ]  = helper_le_stq_mmu,
+    [MO_BEUW] = helper_be_stw_mmu,
+    [MO_BEUL] = helper_be_stl_mmu,
+    [MO_BEQ]  = helper_be_stq_mmu,
 };
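
The tables above are indexed directly by TCGMemOp bits rather than by a size
code; note that the signed 32-bit load entries reuse the unsigned helpers,
since on this 32-bit host a 32-bit result already fills the register.  A toy
version of the same dispatch pattern, reusing the illustrative MO_* values
from the README note earlier (all function names here are invented for the
sketch):

    #include <stdint.h>
    #include <stdio.h>

    enum { MO_8, MO_16, MO_32, MO_64, MO_SIGN = 4, MO_BSWAP = 8 };

    static uint32_t ld32(void)  { return 0xdeadbeef; }
    static uint32_t ld16s(void) { return (uint32_t)(int16_t)0x8000; }

    /* Designated initializers leave unsupported combinations NULL; the
       backend never indexes those because the op set cannot produce them. */
    static uint32_t (*const ld_helpers[16])(void) = {
        [MO_32]           = ld32,
        [MO_16 | MO_SIGN] = ld16s,
    };

    int main(void)
    {
        printf("%08x\n", ld_helpers[MO_16 | MO_SIGN]());   /* ffff8000 */
        return 0;
    }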
 
 /* Helper routines for marshalling helper function arguments into
@@ -1117,53 +1150,62 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
     if (argreg & 1) {
         argreg++;
     }
-    argreg = tcg_out_arg_reg32(s, argreg, arglo);
-    argreg = tcg_out_arg_reg32(s, argreg, arghi);
-    return argreg;
+    if (use_armv6_instructions && argreg >= 4
+        && (arglo & 1) == 0 && arghi == arglo + 1) {
+        tcg_out_strd_8(s, COND_AL, arglo,
+                       TCG_REG_CALL_STACK, (argreg - 4) * 4);
+        return argreg + 2;
+    } else {
+        argreg = tcg_out_arg_reg32(s, argreg, arglo);
+        argreg = tcg_out_arg_reg32(s, argreg, arghi);
+        return argreg;
+    }
 }
 
 #define TLB_SHIFT	(CPU_TLB_ENTRY_BITS + CPU_TLB_BITS)
 
-/* Load and compare a TLB entry, leaving the flags set.  Leaves R2 pointing
-   to the tlb entry.  Clobbers R1 and TMP.  */
+/* We expect the tlb mask (CPU_TLB_SIZE - 1) to fit an 8-bit immediate.  */
+QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8);
+
+/* We're expecting to use an 8-bit immediate add + 8-bit ldrd offset.
+   Using the offset of the second entry in the last tlb table ensures
+   that we can index all of the elements of the first entry.  */
+QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
+                  > 0xffff);
+
+/* Load and compare a TLB entry, leaving the flags set.  Returns the register
+   containing the addend of the tlb entry.  Clobbers R0, R1, R2, TMP.  */
 
-static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
-                             int s_bits, int tlb_offset)
+static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
+                               TCGMemOp s_bits, int mem_index, bool is_load)
 {
     TCGReg base = TCG_AREG0;
+    int cmp_off =
+        (is_load
+         ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
+         : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
+    int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
 
     /* Should generate something like the following:
-     * pre-v7:
-     *   shr    tmp, addr_reg, #TARGET_PAGE_BITS                  (1)
-     *   add    r2, env, #off & 0xff00
+     *   shr    tmp, addrlo, #TARGET_PAGE_BITS                    (1)
+     *   add    r2, env, #high
      *   and    r0, tmp, #(CPU_TLB_SIZE - 1)                      (2)
      *   add    r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS               (3)
-     *   ldr    r0, [r2, #off & 0xff]!                            (4)
-     *   tst    addr_reg, #s_mask
-     *   cmpeq  r0, tmp, lsl #TARGET_PAGE_BITS                    (5)
-     *
-     * v7 (not implemented yet):
-     *   ubfx   r2, addr_reg, #TARGET_PAGE_BITS, #CPU_TLB_BITS    (1)
-     *   movw   tmp, #~TARGET_PAGE_MASK & ~s_mask
-     *   movw   r0, #off
-     *   add    r2, env, r2, lsl #CPU_TLB_ENTRY_BITS              (2)
-     *   bic    tmp, addr_reg, tmp
-     *   ldr    r0, [r2, r0]!                                     (3)
-     *   cmp    r0, tmp                                           (4)
+     *   ldr    r0, [r2, #cmp]                                    (4)
+     *   tst    addrlo, #s_mask
+     *   ldr    r2, [r2, #add]                                    (5)
+     *   cmpeq  r0, tmp, lsl #TARGET_PAGE_BITS
      */
-#  if CPU_TLB_BITS > 8
-#   error
-#  endif
     tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
                     0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
 
-    /* We assume that the offset is contained within 16 bits.  */
-    assert((tlb_offset & ~0xffff) == 0);
-    if (tlb_offset > 0xff) {
+    /* We checked above that the offset fits within 16 bits.  */
+    if (add_off > 0xfff || (use_armv6_instructions && cmp_off > 0xff)) {
         tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
-                        (24 << 7) | (tlb_offset >> 8));
-        tlb_offset &= 0xff;
+                        (24 << 7) | (cmp_off >> 8));
         base = TCG_REG_R2;
+        add_off -= cmp_off & 0xff00;
+        cmp_off &= 0xff;
     }
 
     tcg_out_dat_imm(s, COND_AL, ARITH_AND,
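
The commented instruction sequence above computes the softmmu TLB slot for
an address.  In plain C, steps (1)-(2) are just an index into a
direct-mapped table.  A stand-alone sketch, with illustrative constants in
place of the real cpu-defs.h values:

    #include <stdint.h>
    #include <stdio.h>

    #define TARGET_PAGE_BITS 12     /* illustrative */
    #define CPU_TLB_BITS     8      /* <= 8, per the QEMU_BUILD_BUG_ON */
    #define CPU_TLB_SIZE     (1 << CPU_TLB_BITS)

    /* Steps (1)-(2) of the commented sequence: page number, masked down
       to a slot in the direct-mapped TLB. */
    static unsigned tlb_slot(uint32_t addr)
    {
        return (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
    }

    int main(void)
    {
        printf("slot for 0x%08x = %u\n", 0x40123456u, tlb_slot(0x40123456u));
        return 0;
    }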
@@ -1175,14 +1217,11 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
        but due to how the pointer needs setting up, ldm isn't useful.
        Base arm5 doesn't have ldrd, but armv5te does.  */
     if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
-        tcg_out_memop_8(s, COND_AL, INSN_LDRD_IMM, TCG_REG_R0,
-                        TCG_REG_R2, tlb_offset, 1, 1);
+        tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
     } else {
-        tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0,
-                         TCG_REG_R2, tlb_offset, 1, 1);
+        tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
         if (TARGET_LONG_BITS == 64) {
-            tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R1,
-                             TCG_REG_R2, 4, 1, 0);
+            tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, cmp_off + 4);
         }
     }
 
@@ -1192,6 +1231,9 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                         0, addrlo, (1 << s_bits) - 1);
     }
 
+    /* Load the tlb addend.  */
+    tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2, add_off);
+
     tcg_out_dat_reg(s, (s_bits ? COND_EQ : COND_AL), ARITH_CMP, 0,
                     TCG_REG_R0, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
 
@@ -1199,31 +1241,26 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
         tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0,
                         TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0));
     }
+
+    return TCG_REG_R2;
 }
 
 /* Record the context of a call to the out of line helper code for the slow
    path for a load or store, so that we can later generate the correct
    helper code.  */
-static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc,
-                                int data_reg, int data_reg2, int addrlo_reg,
-                                int addrhi_reg, int mem_index,
+static void add_qemu_ldst_label(TCGContext *s, int is_ld, TCGMemOp opc,
+                                TCGReg datalo, TCGReg datahi, TCGReg addrlo,
+                                TCGReg addrhi, int mem_index,
                                 uint8_t *raddr, uint8_t *label_ptr)
 {
-    int idx;
-    TCGLabelQemuLdst *label;
-
-    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
-        tcg_abort();
-    }
+    TCGLabelQemuLdst *label = new_ldst_label(s);
 
-    idx = s->nb_qemu_ldst_labels++;
-    label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
     label->is_ld = is_ld;
     label->opc = opc;
-    label->datalo_reg = data_reg;
-    label->datahi_reg = data_reg2;
-    label->addrlo_reg = addrlo_reg;
-    label->addrhi_reg = addrhi_reg;
+    label->datalo_reg = datalo;
+    label->datahi_reg = datahi;
+    label->addrlo_reg = addrlo;
+    label->addrhi_reg = addrhi;
     label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = label_ptr;
@@ -1231,8 +1268,9 @@ static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc,
 
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGReg argreg, data_reg, data_reg2;
-    uint8_t *start;
+    TCGReg argreg, datalo, datahi;
+    TCGMemOp opc = lb->opc;
+    uintptr_t func;
 
     reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
 
@@ -1243,46 +1281,46 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
         argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg);
     }
     argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
-    tcg_out_call(s, (tcg_target_long) qemu_ld_helpers[lb->opc & 3]);
+    argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
 
-    data_reg = lb->datalo_reg;
-    data_reg2 = lb->datahi_reg;
+    /* For armv6 we can use the canonical unsigned helpers and minimize
+       icache usage.  For pre-armv6, use the signed helpers since we do
+       not have a single insn sign-extend.  */
+    if (use_armv6_instructions) {
+        func = (uintptr_t)qemu_ld_helpers[opc & ~MO_SIGN];
+    } else {
+        func = (uintptr_t)qemu_ld_helpers[opc];
+        if (opc & MO_SIGN) {
+            opc = MO_UL;
+        }
+    }
+    tcg_out_call(s, func);
 
-    start = s->code_ptr;
-    switch (lb->opc) {
-    case 0 | 4:
-        tcg_out_ext8s(s, COND_AL, data_reg, TCG_REG_R0);
+    datalo = lb->datalo_reg;
+    datahi = lb->datahi_reg;
+    switch (opc & MO_SSIZE) {
+    case MO_SB:
+        tcg_out_ext8s(s, COND_AL, datalo, TCG_REG_R0);
         break;
-    case 1 | 4:
-        tcg_out_ext16s(s, COND_AL, data_reg, TCG_REG_R0);
+    case MO_SW:
+        tcg_out_ext16s(s, COND_AL, datalo, TCG_REG_R0);
         break;
-    case 0:
-    case 1:
-    case 2:
     default:
-        tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
-        break;
-    case 3:
-        tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
-        tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1);
-        break;
-    }
-
-    /* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between
-       the call and the branch back to straight-line code.  Note that the
-       moves above could be elided by register allocation, nor do we know
-       which code alternative we chose for extension.  */
-    switch (s->code_ptr - start) {
-    case 0:
-        tcg_out_nop(s);
-        /* FALLTHRU */
-    case 4:
-        tcg_out_nop(s);
-        /* FALLTHRU */
-    case 8:
+        tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
+        break;
+    case MO_Q:
+        if (datalo != TCG_REG_R1) {
+            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
+            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
+        } else if (datahi != TCG_REG_R0) {
+            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
+            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
+        } else {
+            tcg_out_mov_reg(s, COND_AL, TCG_REG_TMP, TCG_REG_R0);
+            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
+            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_TMP);
+        }
         break;
-    default:
-        abort();
     }
 
     tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr);
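
The three-way MO_Q case above copies the helper's result pair {R0,R1} into
{datalo,datahi} without losing a value to aliasing; the worst case
(datalo==R1 and datahi==R0) needs the scratch register.  The same logic as a
runnable sketch, with integers standing in for registers:

    #include <stdio.h>

    enum { R0, R1, TMP, R4, R5, NREGS };   /* toy register file */

    static int regs[NREGS];

    static void mov(int dst, int src) { regs[dst] = regs[src]; }

    /* Overlap-safe copy of the result pair {R0,R1} into {lo,hi}. */
    static void move_pair(int lo, int hi)
    {
        if (lo != R1) {            /* writing lo cannot clobber R1 */
            mov(lo, R0);
            mov(hi, R1);
        } else if (hi != R0) {     /* writing hi first cannot clobber R0 */
            mov(hi, R1);
            mov(lo, R0);
        } else {                   /* lo==R1 and hi==R0: swap via TMP */
            mov(TMP, R0);
            mov(hi, R1);
            mov(lo, TMP);
        }
    }

    int main(void)
    {
        regs[R0] = 11;             /* low half of the helper result */
        regs[R1] = 22;             /* high half */
        move_pair(R1, R0);         /* worst case: destinations swapped */
        printf("lo=%d hi=%d\n", regs[R1], regs[R0]);   /* lo=11 hi=22 */
        return 0;
    }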
@@ -1290,7 +1328,8 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGReg argreg, data_reg, data_reg2;
+    TCGReg argreg, datalo, datahi;
+    TCGMemOp opc = lb->opc;
 
     reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
 
@@ -1302,293 +1341,311 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
         argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg);
     }
 
-    data_reg = lb->datalo_reg;
-    data_reg2 = lb->datahi_reg;
-    switch (lb->opc) {
-    case 0:
-        argreg = tcg_out_arg_reg8(s, argreg, data_reg);
+    datalo = lb->datalo_reg;
+    datahi = lb->datahi_reg;
+    switch (opc & MO_SIZE) {
+    case MO_8:
+        argreg = tcg_out_arg_reg8(s, argreg, datalo);
         break;
-    case 1:
-        argreg = tcg_out_arg_reg16(s, argreg, data_reg);
+    case MO_16:
+        argreg = tcg_out_arg_reg16(s, argreg, datalo);
         break;
-    case 2:
-        argreg = tcg_out_arg_reg32(s, argreg, data_reg);
+    case MO_32:
+    default:
+        argreg = tcg_out_arg_reg32(s, argreg, datalo);
         break;
-    case 3:
-        argreg = tcg_out_arg_reg64(s, argreg, data_reg, data_reg2);
+    case MO_64:
+        argreg = tcg_out_arg_reg64(s, argreg, datalo, datahi);
         break;
     }
 
     argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
-    tcg_out_call(s, (tcg_target_long) qemu_st_helpers[lb->opc & 3]);
+    argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
 
-    /* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between
-       the call and the branch back to straight-line code.  */
-    tcg_out_nop(s);
-    tcg_out_nop(s);
-    tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr);
+    /* Tail-call to the helper, which will return to the fast path.  */
+    tcg_out_goto(s, COND_AL, (uintptr_t)qemu_st_helpers[opc]);
 }
 #endif /* SOFTMMU */
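
Putting the store pieces together: the fast path issues its memory op under
COND_EQ, then a conditional BL that both branches to the slow path and
captures the fast-path resume address in LR; the slow path passes that LR
value as the helper's retaddr argument and tail-calls, so the helper's own
return lands back on the fast path.  A sketch of the flow, assuming the
layout implied by the diff:

    /* fast path (tcg_out_qemu_st):
     *     ...tlb compare, flags set...
     *     strdeq  datalo, [addrlo, addend]  ; store executes on TLB hit
     *     blne    slow_path                 ; miss: branch AND set LR
     * resume:
     *     ...next op...
     *
     * slow path (tcg_out_qemu_st_slow_path):
     *     ...marshal env, addr, val, mem_index...
     *     mov     <last arg>, lr            ; LR (= resume) is retaddr
     *     b       helper_xx_st_mmu          ; tail call: the helper's own
     *                                       ; return goes back to 'resume'
     */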
 
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
+static inline void tcg_out_qemu_ld_index(TCGContext *s, TCGMemOp opc,
+                                         TCGReg datalo, TCGReg datahi,
+                                         TCGReg addrlo, TCGReg addend)
 {
-    TCGReg addr_reg, data_reg, data_reg2;
-    bool bswap;
-#ifdef CONFIG_SOFTMMU
-    int mem_index, s_bits;
-    TCGReg addr_reg2;
-    uint8_t *label_ptr;
-#endif
-#ifdef TARGET_WORDS_BIGENDIAN
-    bswap = 1;
-#else
-    bswap = 0;
-#endif
-
-    data_reg = *args++;
-    data_reg2 = (opc == 3 ? *args++ : 0);
-    addr_reg = *args++;
-#ifdef CONFIG_SOFTMMU
-    addr_reg2 = (TARGET_LONG_BITS == 64 ? *args++ : 0);
-    mem_index = *args;
-    s_bits = opc & 3;
-
-    tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
-                     offsetof(CPUArchState, tlb_table[mem_index][0].addr_read));
-
-    label_ptr = s->code_ptr;
-    tcg_out_b_noaddr(s, COND_NE);
-
-    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
-                    offsetof(CPUTLBEntry, addend)
-                    - offsetof(CPUTLBEntry, addr_read));
+    TCGMemOp bswap = opc & MO_BSWAP;
 
-    switch (opc) {
-    case 0:
-        tcg_out_ld8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+    switch (opc & MO_SSIZE) {
+    case MO_UB:
+        tcg_out_ld8_r(s, COND_AL, datalo, addrlo, addend);
         break;
-    case 0 | 4:
-        tcg_out_ld8s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+    case MO_SB:
+        tcg_out_ld8s_r(s, COND_AL, datalo, addrlo, addend);
         break;
-    case 1:
-        tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+    case MO_UW:
+        tcg_out_ld16u_r(s, COND_AL, datalo, addrlo, addend);
         if (bswap) {
-            tcg_out_bswap16(s, COND_AL, data_reg, data_reg);
+            tcg_out_bswap16(s, COND_AL, datalo, datalo);
         }
         break;
-    case 1 | 4:
+    case MO_SW:
         if (bswap) {
-            tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
-            tcg_out_bswap16s(s, COND_AL, data_reg, data_reg);
+            tcg_out_ld16u_r(s, COND_AL, datalo, addrlo, addend);
+            tcg_out_bswap16s(s, COND_AL, datalo, datalo);
         } else {
-            tcg_out_ld16s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_ld16s_r(s, COND_AL, datalo, addrlo, addend);
         }
         break;
-    case 2:
+    case MO_UL:
     default:
-        tcg_out_ld32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld32_r(s, COND_AL, datalo, addrlo, addend);
         if (bswap) {
-            tcg_out_bswap32(s, COND_AL, data_reg, data_reg);
+            tcg_out_bswap32(s, COND_AL, datalo, datalo);
         }
         break;
-    case 3:
-        if (bswap) {
-            tcg_out_ld32_rwb(s, COND_AL, data_reg2, TCG_REG_R1, addr_reg);
-            tcg_out_ld32_12(s, COND_AL, data_reg, TCG_REG_R1, 4);
-            tcg_out_bswap32(s, COND_AL, data_reg2, data_reg2);
-            tcg_out_bswap32(s, COND_AL, data_reg, data_reg);
-        } else {
-            tcg_out_ld32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg);
-            tcg_out_ld32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4);
+    case MO_Q:
+        {
+            TCGReg dl = (bswap ? datahi : datalo);
+            TCGReg dh = (bswap ? datalo : datahi);
+
+            if (use_armv6_instructions && (dl & 1) == 0 && dh == dl + 1) {
+                tcg_out_ldrd_r(s, COND_AL, dl, addrlo, addend);
+            } else if (dl != addend) {
+                tcg_out_ld32_rwb(s, COND_AL, dl, addend, addrlo);
+                tcg_out_ld32_12(s, COND_AL, dh, addend, 4);
+            } else {
+                tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_TMP,
+                                addend, addrlo, SHIFT_IMM_LSL(0));
+                tcg_out_ld32_12(s, COND_AL, dl, TCG_REG_TMP, 0);
+                tcg_out_ld32_12(s, COND_AL, dh, TCG_REG_TMP, 4);
+            }
+            if (bswap) {
+                tcg_out_bswap32(s, COND_AL, dl, dl);
+                tcg_out_bswap32(s, COND_AL, dh, dh);
+            }
         }
         break;
     }
+}
 
-    add_qemu_ldst_label(s, 1, opc, data_reg, data_reg2, addr_reg, addr_reg2,
-                        mem_index, s->code_ptr, label_ptr);
-#else /* !CONFIG_SOFTMMU */
-    if (GUEST_BASE) {
-        uint32_t offset = GUEST_BASE;
-        int i, rot;
-
-        while (offset) {
-            i = ctz32(offset) & ~1;
-            rot = ((32 - i) << 7) & 0xf00;
+static inline void tcg_out_qemu_ld_direct(TCGContext *s, TCGMemOp opc,
+                                          TCGReg datalo, TCGReg datahi,
+                                          TCGReg addrlo)
+{
+    TCGMemOp bswap = opc & MO_BSWAP;
 
-            tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_TMP, addr_reg,
-                            ((offset >> i) & 0xff) | rot);
-            addr_reg = TCG_REG_TMP;
-            offset &= ~(0xff << i);
-        }
-    }
-    switch (opc) {
-    case 0:
-        tcg_out_ld8_12(s, COND_AL, data_reg, addr_reg, 0);
+    switch (opc & MO_SSIZE) {
+    case MO_UB:
+        tcg_out_ld8_12(s, COND_AL, datalo, addrlo, 0);
         break;
-    case 0 | 4:
-        tcg_out_ld8s_8(s, COND_AL, data_reg, addr_reg, 0);
+    case MO_SB:
+        tcg_out_ld8s_8(s, COND_AL, datalo, addrlo, 0);
         break;
-    case 1:
-        tcg_out_ld16u_8(s, COND_AL, data_reg, addr_reg, 0);
+    case MO_UW:
+        tcg_out_ld16u_8(s, COND_AL, datalo, addrlo, 0);
         if (bswap) {
-            tcg_out_bswap16(s, COND_AL, data_reg, data_reg);
+            tcg_out_bswap16(s, COND_AL, datalo, datalo);
         }
         break;
-    case 1 | 4:
+    case MO_SW:
         if (bswap) {
-            tcg_out_ld16u_8(s, COND_AL, data_reg, addr_reg, 0);
-            tcg_out_bswap16s(s, COND_AL, data_reg, data_reg);
+            tcg_out_ld16u_8(s, COND_AL, datalo, addrlo, 0);
+            tcg_out_bswap16s(s, COND_AL, datalo, datalo);
         } else {
-            tcg_out_ld16s_8(s, COND_AL, data_reg, addr_reg, 0);
+            tcg_out_ld16s_8(s, COND_AL, datalo, addrlo, 0);
         }
         break;
-    case 2:
+    case MO_UL:
     default:
-        tcg_out_ld32_12(s, COND_AL, data_reg, addr_reg, 0);
+        tcg_out_ld32_12(s, COND_AL, datalo, addrlo, 0);
         if (bswap) {
-            tcg_out_bswap32(s, COND_AL, data_reg, data_reg);
+            tcg_out_bswap32(s, COND_AL, datalo, datalo);
         }
         break;
-    case 3:
-        /* TODO: use block load -
-         * check that data_reg2 > data_reg or the other way */
-        if (data_reg == addr_reg) {
-            tcg_out_ld32_12(s, COND_AL, data_reg2, addr_reg, bswap ? 0 : 4);
-            tcg_out_ld32_12(s, COND_AL, data_reg, addr_reg, bswap ? 4 : 0);
-        } else {
-            tcg_out_ld32_12(s, COND_AL, data_reg, addr_reg, bswap ? 4 : 0);
-            tcg_out_ld32_12(s, COND_AL, data_reg2, addr_reg, bswap ? 0 : 4);
-        }
-        if (bswap) {
-            tcg_out_bswap32(s, COND_AL, data_reg, data_reg);
-            tcg_out_bswap32(s, COND_AL, data_reg2, data_reg2);
+    case MO_Q:
+        {
+            TCGReg dl = (bswap ? datahi : datalo);
+            TCGReg dh = (bswap ? datalo : datahi);
+
+            if (use_armv6_instructions && (dl & 1) == 0 && dh == dl + 1) {
+                tcg_out_ldrd_8(s, COND_AL, dl, addrlo, 0);
+            } else if (dl == addrlo) {
+                tcg_out_ld32_12(s, COND_AL, dh, addrlo, bswap ? 0 : 4);
+                tcg_out_ld32_12(s, COND_AL, dl, addrlo, bswap ? 4 : 0);
+            } else {
+                tcg_out_ld32_12(s, COND_AL, dl, addrlo, bswap ? 4 : 0);
+                tcg_out_ld32_12(s, COND_AL, dh, addrlo, bswap ? 0 : 4);
+            }
+            if (bswap) {
+                tcg_out_bswap32(s, COND_AL, dl, dl);
+                tcg_out_bswap32(s, COND_AL, dh, dh);
+            }
         }
         break;
     }
-#endif
 }
 
-static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
+static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
 {
-    TCGReg addr_reg, data_reg, data_reg2;
-    bool bswap;
+    TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
+    TCGMemOp opc;
 #ifdef CONFIG_SOFTMMU
-    int mem_index, s_bits;
-    TCGReg addr_reg2;
+    int mem_index;
+    TCGReg addend;
     uint8_t *label_ptr;
 #endif
-#ifdef TARGET_WORDS_BIGENDIAN
-    bswap = 1;
-#else
-    bswap = 0;
-#endif
 
-    data_reg = *args++;
-    data_reg2 = (opc == 3 ? *args++ : 0);
-    addr_reg = *args++;
+    datalo = *args++;
+    datahi = (is64 ? *args++ : 0);
+    addrlo = *args++;
+    addrhi = (TARGET_LONG_BITS == 64 ? *args++ : 0);
+    opc = *args++;
+
 #ifdef CONFIG_SOFTMMU
-    addr_reg2 = (TARGET_LONG_BITS == 64 ? *args++ : 0);
     mem_index = *args;
-    s_bits = opc & 3;
-
-    tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
-                     offsetof(CPUArchState,
-                              tlb_table[mem_index][0].addr_write));
+    addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 1);
 
+    /* This is a conditional BL, used only to load a pointer within this
+       opcode into LR for the slow path.  We will not use the value for a
+       tail call.  */
     label_ptr = s->code_ptr;
-    tcg_out_b_noaddr(s, COND_NE);
+    tcg_out_bl_noaddr(s, COND_NE);
 
-    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
-                    offsetof(CPUTLBEntry, addend)
-                    - offsetof(CPUTLBEntry, addr_write));
+    tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend);
 
-    switch (opc) {
-    case 0:
-        tcg_out_st8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+    add_qemu_ldst_label(s, 1, opc, datalo, datahi, addrlo, addrhi,
+                        mem_index, s->code_ptr, label_ptr);
+#else /* !CONFIG_SOFTMMU */
+    if (GUEST_BASE) {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, GUEST_BASE);
+        tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, TCG_REG_TMP);
+    } else {
+        tcg_out_qemu_ld_direct(s, opc, datalo, datahi, addrlo);
+    }
+#endif
+}
+
+static inline void tcg_out_qemu_st_index(TCGContext *s, int cond, TCGMemOp opc,
+                                         TCGReg datalo, TCGReg datahi,
+                                         TCGReg addrlo, TCGReg addend)
+{
+    TCGMemOp bswap = opc & MO_BSWAP;
+
+    switch (opc & MO_SIZE) {
+    case MO_8:
+        tcg_out_st8_r(s, cond, datalo, addrlo, addend);
         break;
-    case 1:
+    case MO_16:
         if (bswap) {
-            tcg_out_bswap16st(s, COND_AL, TCG_REG_R0, data_reg);
-            tcg_out_st16_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1);
+            tcg_out_bswap16st(s, cond, TCG_REG_R0, datalo);
+            tcg_out_st16_r(s, cond, TCG_REG_R0, addrlo, addend);
         } else {
-            tcg_out_st16_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_st16_r(s, cond, datalo, addrlo, addend);
         }
         break;
-    case 2:
+    case MO_32:
     default:
         if (bswap) {
-            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
-            tcg_out_st32_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1);
+            tcg_out_bswap32(s, cond, TCG_REG_R0, datalo);
+            tcg_out_st32_r(s, cond, TCG_REG_R0, addrlo, addend);
         } else {
-            tcg_out_st32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_st32_r(s, cond, datalo, addrlo, addend);
         }
         break;
-    case 3:
+    case MO_64:
         if (bswap) {
-            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg2);
-            tcg_out_st32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R1, addr_reg);
-            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
-            tcg_out_st32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R1, 4);
+            tcg_out_bswap32(s, cond, TCG_REG_R0, datahi);
+            tcg_out_st32_rwb(s, cond, TCG_REG_R0, addend, addrlo);
+            tcg_out_bswap32(s, cond, TCG_REG_R0, datalo);
+            tcg_out_st32_12(s, cond, TCG_REG_R0, addend, 4);
+        } else if (use_armv6_instructions
+                   && (datalo & 1) == 0 && datahi == datalo + 1) {
+            tcg_out_strd_r(s, cond, datalo, addrlo, addend);
         } else {
-            tcg_out_st32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg);
-            tcg_out_st32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4);
+            tcg_out_st32_rwb(s, cond, datalo, addend, addrlo);
+            tcg_out_st32_12(s, cond, datahi, addend, 4);
         }
         break;
     }
+}
 
-    add_qemu_ldst_label(s, 0, opc, data_reg, data_reg2, addr_reg, addr_reg2,
-                        mem_index, s->code_ptr, label_ptr);
-#else /* !CONFIG_SOFTMMU */
-    if (GUEST_BASE) {
-        uint32_t offset = GUEST_BASE;
-        int i;
-        int rot;
-
-        while (offset) {
-            i = ctz32(offset) & ~1;
-            rot = ((32 - i) << 7) & 0xf00;
-
-            tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R1, addr_reg,
-                            ((offset >> i) & 0xff) | rot);
-            addr_reg = TCG_REG_R1;
-            offset &= ~(0xff << i);
-        }
-    }
-    switch (opc) {
-    case 0:
-        tcg_out_st8_12(s, COND_AL, data_reg, addr_reg, 0);
+static inline void tcg_out_qemu_st_direct(TCGContext *s, TCGMemOp opc,
+                                          TCGReg datalo, TCGReg datahi,
+                                          TCGReg addrlo)
+{
+    TCGMemOp bswap = opc & MO_BSWAP;
+
+    switch (opc & MO_SIZE) {
+    case MO_8:
+        tcg_out_st8_12(s, COND_AL, datalo, addrlo, 0);
         break;
-    case 1:
+    case MO_16:
         if (bswap) {
-            tcg_out_bswap16st(s, COND_AL, TCG_REG_R0, data_reg);
-            tcg_out_st16_8(s, COND_AL, TCG_REG_R0, addr_reg, 0);
+            tcg_out_bswap16st(s, COND_AL, TCG_REG_R0, datalo);
+            tcg_out_st16_8(s, COND_AL, TCG_REG_R0, addrlo, 0);
         } else {
-            tcg_out_st16_8(s, COND_AL, data_reg, addr_reg, 0);
+            tcg_out_st16_8(s, COND_AL, datalo, addrlo, 0);
         }
         break;
-    case 2:
+    case MO_32:
     default:
         if (bswap) {
-            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
-            tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addr_reg, 0);
+            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, datalo);
+            tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addrlo, 0);
         } else {
-            tcg_out_st32_12(s, COND_AL, data_reg, addr_reg, 0);
+            tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
         }
         break;
-    case 3:
-        /* TODO: use block store -
-         * check that data_reg2 > data_reg or the other way */
+    case MO_64:
         if (bswap) {
-            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg2);
-            tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addr_reg, 0);
-            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
-            tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addr_reg, 4);
+            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, datahi);
+            tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addrlo, 0);
+            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, datalo);
+            tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addrlo, 4);
+        } else if (use_armv6_instructions
+                   && (datalo & 1) == 0 && datahi == datalo + 1) {
+            tcg_out_strd_8(s, COND_AL, datalo, addrlo, 0);
         } else {
-            tcg_out_st32_12(s, COND_AL, data_reg, addr_reg, 0);
-            tcg_out_st32_12(s, COND_AL, data_reg2, addr_reg, 4);
+            tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
+            tcg_out_st32_12(s, COND_AL, datahi, addrlo, 4);
         }
         break;
     }
+}
+
+static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
+{
+    TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
+    TCGMemOp opc;
+#ifdef CONFIG_SOFTMMU
+    int mem_index;
+    TCGReg addend;
+    uint8_t *label_ptr;
+#endif
+
+    datalo = *args++;
+    datahi = (is64 ? *args++ : 0);
+    addrlo = *args++;
+    addrhi = (TARGET_LONG_BITS == 64 ? *args++ : 0);
+    opc = *args++;
+
+#ifdef CONFIG_SOFTMMU
+    mem_index = *args;
+    addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 0);
+
+    tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
+
+    /* The conditional call must come last, as we're going to return here.  */
+    label_ptr = s->code_ptr;
+    tcg_out_bl_noaddr(s, COND_NE);
+
+    add_qemu_ldst_label(s, 0, opc, datalo, datahi, addrlo, addrhi,
+                        mem_index, s->code_ptr, label_ptr);
+#else /* !CONFIG_SOFTMMU */
+    if (GUEST_BASE) {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, GUEST_BASE);
+        tcg_out_qemu_st_index(s, COND_AL, opc, datalo,
+                              datahi, addrlo, TCG_REG_TMP);
+    } else {
+        tcg_out_qemu_st_direct(s, opc, datalo, datahi, addrlo);
+    }
 #endif
 }
 
@@ -1857,37 +1914,18 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                         ARITH_MOV, args[0], 0, 0);
         break;
 
-    case INDEX_op_qemu_ld8u:
+    case INDEX_op_qemu_ld_i32:
         tcg_out_qemu_ld(s, args, 0);
         break;
-    case INDEX_op_qemu_ld8s:
-        tcg_out_qemu_ld(s, args, 0 | 4);
-        break;
-    case INDEX_op_qemu_ld16u:
+    case INDEX_op_qemu_ld_i64:
         tcg_out_qemu_ld(s, args, 1);
         break;
-    case INDEX_op_qemu_ld16s:
-        tcg_out_qemu_ld(s, args, 1 | 4);
-        break;
-    case INDEX_op_qemu_ld32:
-        tcg_out_qemu_ld(s, args, 2);
-        break;
-    case INDEX_op_qemu_ld64:
-        tcg_out_qemu_ld(s, args, 3);
-        break;
-
-    case INDEX_op_qemu_st8:
+    case INDEX_op_qemu_st_i32:
         tcg_out_qemu_st(s, args, 0);
         break;
-    case INDEX_op_qemu_st16:
+    case INDEX_op_qemu_st_i64:
         tcg_out_qemu_st(s, args, 1);
         break;
-    case INDEX_op_qemu_st32:
-        tcg_out_qemu_st(s, args, 2);
-        break;
-    case INDEX_op_qemu_st64:
-        tcg_out_qemu_st(s, args, 3);
-        break;
 
     case INDEX_op_bswap16_i32:
         tcg_out_bswap16(s, COND_AL, args[0], args[1]);
@@ -1923,22 +1961,6 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     }
 }
 
-#ifdef CONFIG_SOFTMMU
-/* Generate TB finalization at the end of block.  */
-void tcg_out_tb_finalize(TCGContext *s)
-{
-    int i;
-    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
-        TCGLabelQemuLdst *label = &s->qemu_ldst_labels[i];
-        if (label->is_ld) {
-            tcg_out_qemu_ld_slow_path(s, label);
-        } else {
-            tcg_out_qemu_st_slow_path(s, label);
-        }
-    }
-}
-#endif /* SOFTMMU */
-
 static const TCGTargetOpDef arm_op_defs[] = {
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
@@ -1986,29 +2008,15 @@ static const TCGTargetOpDef arm_op_defs[] = {
     { INDEX_op_setcond2_i32, { "r", "r", "r", "rIN", "rIN" } },
 
 #if TARGET_LONG_BITS == 32
-    { INDEX_op_qemu_ld8u, { "r", "l" } },
-    { INDEX_op_qemu_ld8s, { "r", "l" } },
-    { INDEX_op_qemu_ld16u, { "r", "l" } },
-    { INDEX_op_qemu_ld16s, { "r", "l" } },
-    { INDEX_op_qemu_ld32, { "r", "l" } },
-    { INDEX_op_qemu_ld64, { "L", "L", "l" } },
-
-    { INDEX_op_qemu_st8, { "s", "s" } },
-    { INDEX_op_qemu_st16, { "s", "s" } },
-    { INDEX_op_qemu_st32, { "s", "s" } },
-    { INDEX_op_qemu_st64, { "s", "s", "s" } },
+    { INDEX_op_qemu_ld_i32, { "r", "l" } },
+    { INDEX_op_qemu_ld_i64, { "r", "r", "l" } },
+    { INDEX_op_qemu_st_i32, { "s", "s" } },
+    { INDEX_op_qemu_st_i64, { "s", "s", "s" } },
 #else
-    { INDEX_op_qemu_ld8u, { "r", "l", "l" } },
-    { INDEX_op_qemu_ld8s, { "r", "l", "l" } },
-    { INDEX_op_qemu_ld16u, { "r", "l", "l" } },
-    { INDEX_op_qemu_ld16s, { "r", "l", "l" } },
-    { INDEX_op_qemu_ld32, { "r", "l", "l" } },
-    { INDEX_op_qemu_ld64, { "L", "L", "l", "l" } },
-
-    { INDEX_op_qemu_st8, { "s", "s", "s" } },
-    { INDEX_op_qemu_st16, { "s", "s", "s" } },
-    { INDEX_op_qemu_st32, { "s", "s", "s" } },
-    { INDEX_op_qemu_st64, { "s", "s", "s", "s" } },
+    { INDEX_op_qemu_ld_i32, { "r", "l", "l" } },
+    { INDEX_op_qemu_ld_i64, { "r", "r", "l", "l" } },
+    { INDEX_op_qemu_st_i32, { "s", "s", "s" } },
+    { INDEX_op_qemu_st_i64, { "s", "s", "s", "s" } },
 #endif
 
     { INDEX_op_bswap16_i32, { "r", "r" } },
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 9482bfa993..3746b6e298 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -85,6 +85,8 @@ extern bool use_idiv_instructions;
 #define TCG_TARGET_HAS_div_i32          use_idiv_instructions
 #define TCG_TARGET_HAS_rem_i32          0
 
+#define TCG_TARGET_HAS_new_ldst         1
+
 extern bool tcg_target_deposit_valid(int ofs, int len);
 #define TCG_TARGET_deposit_i32_valid  tcg_target_deposit_valid
 
diff --git a/tcg/hppa/tcg-target.c b/tcg/hppa/tcg-target.c
deleted file mode 100644
index 236b39c31f..0000000000
--- a/tcg/hppa/tcg-target.c
+++ /dev/null
@@ -1,1831 +0,0 @@
-/*
- * Tiny Code Generator for QEMU
- *
- * Copyright (c) 2008 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#if TCG_TARGET_REG_BITS != 32
-#error unsupported
-#endif
-
-#ifndef NDEBUG
-static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
-    "%r0", "%r1", "%rp", "%r3", "%r4", "%r5", "%r6", "%r7",
-    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
-    "%r16", "%r17", "%r18", "%r19", "%r20", "%r21", "%r22", "%r23",
-    "%r24", "%r25", "%r26", "%dp", "%ret0", "%ret1", "%sp", "%r31",
-};
-#endif
-
-/* This is an 8 byte temp slot in the stack frame.  */
-#define STACK_TEMP_OFS -16
-
-#ifdef CONFIG_USE_GUEST_BASE
-#define TCG_GUEST_BASE_REG TCG_REG_R16
-#else
-#define TCG_GUEST_BASE_REG TCG_REG_R0
-#endif
-
-static const int tcg_target_reg_alloc_order[] = {
-    TCG_REG_R4,
-    TCG_REG_R5,
-    TCG_REG_R6,
-    TCG_REG_R7,
-    TCG_REG_R8,
-    TCG_REG_R9,
-    TCG_REG_R10,
-    TCG_REG_R11,
-    TCG_REG_R12,
-    TCG_REG_R13,
-
-    TCG_REG_R17,
-    TCG_REG_R14,
-    TCG_REG_R15,
-    TCG_REG_R16,
-
-    TCG_REG_R26,
-    TCG_REG_R25,
-    TCG_REG_R24,
-    TCG_REG_R23,
-
-    TCG_REG_RET0,
-    TCG_REG_RET1,
-};
-
-static const int tcg_target_call_iarg_regs[4] = {
-    TCG_REG_R26,
-    TCG_REG_R25,
-    TCG_REG_R24,
-    TCG_REG_R23,
-};
-
-static const int tcg_target_call_oarg_regs[2] = {
-    TCG_REG_RET0,
-    TCG_REG_RET1,
-};
-
-/* True iff val fits a signed field of width BITS.  */
-static inline int check_fit_tl(tcg_target_long val, unsigned int bits)
-{
-    return (val << ((sizeof(tcg_target_long) * 8 - bits))
-            >> (sizeof(tcg_target_long) * 8 - bits)) == val;
-}
-
-/* True iff depi can be used to compute (reg | MASK).
-   Accept a bit pattern like:
-      0....01....1
-      1....10....0
-      0..01..10..0
-   Copied from gcc sources.  */
-static inline int or_mask_p(tcg_target_ulong mask)
-{
-    if (mask == 0 || mask == -1) {
-        return 0;
-    }
-    mask += mask & -mask;
-    return (mask & (mask - 1)) == 0;
-}
-
-/* True iff depi or extru can be used to compute (reg & mask).
-   Accept a bit pattern like these:
-      0....01....1
-      1....10....0
-      1..10..01..1
-   Copied from gcc sources.  */
-static inline int and_mask_p(tcg_target_ulong mask)
-{
-    return or_mask_p(~mask);
-}
-
-static int low_sign_ext(int val, int len)
-{
-    return (((val << 1) & ~(-1u << len)) | ((val >> (len - 1)) & 1));
-}
-
-static int reassemble_12(int as12)
-{
-    return (((as12 & 0x800) >> 11) |
-            ((as12 & 0x400) >> 8) |
-            ((as12 & 0x3ff) << 3));
-}
-
-static int reassemble_17(int as17)
-{
-    return (((as17 & 0x10000) >> 16) |
-            ((as17 & 0x0f800) << 5) |
-            ((as17 & 0x00400) >> 8) |
-            ((as17 & 0x003ff) << 3));
-}
-
-static int reassemble_21(int as21)
-{
-    return (((as21 & 0x100000) >> 20) |
-            ((as21 & 0x0ffe00) >> 8) |
-            ((as21 & 0x000180) << 7) |
-            ((as21 & 0x00007c) << 14) |
-            ((as21 & 0x000003) << 12));
-}
-
-/* ??? Bizarrely, there is no PCREL12F relocation type.  I guess all
-   such relocations are simply fully handled by the assembler.  */
-#define R_PARISC_PCREL12F  R_PARISC_NONE
-
-static void patch_reloc(uint8_t *code_ptr, int type,
-                        intptr_t value, intptr_t addend)
-{
-    uint32_t *insn_ptr = (uint32_t *)code_ptr;
-    uint32_t insn = *insn_ptr;
-    intptr_t pcrel;
-
-    value += addend;
-    pcrel = (value - ((intptr_t)code_ptr + 8)) >> 2;
-
-    switch (type) {
-    case R_PARISC_PCREL12F:
-        assert(check_fit_tl(pcrel, 12));
-        /* ??? We assume all patches are forward.  See tcg_out_brcond
-           re setting the NUL bit on the branch and eliding the nop.  */
-        assert(pcrel >= 0);
-        insn &= ~0x1ffdu;
-        insn |= reassemble_12(pcrel);
-        break;
-    case R_PARISC_PCREL17F:
-        assert(check_fit_tl(pcrel, 17));
-        insn &= ~0x1f1ffdu;
-        insn |= reassemble_17(pcrel);
-        break;
-    default:
-        tcg_abort();
-    }
-
-    *insn_ptr = insn;
-}
-
-/* parse target specific constraints */
-static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
-{
-    const char *ct_str;
-
-    ct_str = *pct_str;
-    switch (ct_str[0]) {
-    case 'r':
-        ct->ct |= TCG_CT_REG;
-        tcg_regset_set32(ct->u.regs, 0, 0xffffffff);
-        break;
-    case 'L': /* qemu_ld/st constraint */
-        ct->ct |= TCG_CT_REG;
-        tcg_regset_set32(ct->u.regs, 0, 0xffffffff);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R26);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R25);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R24);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R23);
-        break;
-    case 'Z':
-        ct->ct |= TCG_CT_CONST_0;
-        break;
-    case 'I':
-        ct->ct |= TCG_CT_CONST_S11;
-        break;
-    case 'J':
-        ct->ct |= TCG_CT_CONST_S5;
-	break;
-    case 'K':
-        ct->ct |= TCG_CT_CONST_MS11;
-        break;
-    case 'M':
-        ct->ct |= TCG_CT_CONST_AND;
-        break;
-    case 'O':
-        ct->ct |= TCG_CT_CONST_OR;
-        break;
-    default:
-        return -1;
-    }
-    ct_str++;
-    *pct_str = ct_str;
-    return 0;
-}
-
-/* test if a constant matches the constraint */
-static int tcg_target_const_match(tcg_target_long val,
-                                  const TCGArgConstraint *arg_ct)
-{
-    int ct = arg_ct->ct;
-    if (ct & TCG_CT_CONST) {
-        return 1;
-    } else if (ct & TCG_CT_CONST_0) {
-        return val == 0;
-    } else if (ct & TCG_CT_CONST_S5) {
-        return check_fit_tl(val, 5);
-    } else if (ct & TCG_CT_CONST_S11) {
-        return check_fit_tl(val, 11);
-    } else if (ct & TCG_CT_CONST_MS11) {
-        return check_fit_tl(-val, 11);
-    } else if (ct & TCG_CT_CONST_AND) {
-        return and_mask_p(val);
-    } else if (ct & TCG_CT_CONST_OR) {
-        return or_mask_p(val);
-    }
-    return 0;
-}
-
-#define INSN_OP(x)       ((x) << 26)
-#define INSN_EXT3BR(x)   ((x) << 13)
-#define INSN_EXT3SH(x)   ((x) << 10)
-#define INSN_EXT4(x)     ((x) << 6)
-#define INSN_EXT5(x)     (x)
-#define INSN_EXT6(x)     ((x) << 6)
-#define INSN_EXT7(x)     ((x) << 6)
-#define INSN_EXT8A(x)    ((x) << 6)
-#define INSN_EXT8B(x)    ((x) << 5)
-#define INSN_T(x)        (x)
-#define INSN_R1(x)       ((x) << 16)
-#define INSN_R2(x)       ((x) << 21)
-#define INSN_DEP_LEN(x)  (32 - (x))
-#define INSN_SHDEP_CP(x) ((31 - (x)) << 5)
-#define INSN_SHDEP_P(x)  ((x) << 5)
-#define INSN_COND(x)     ((x) << 13)
-#define INSN_IM11(x)     low_sign_ext(x, 11)
-#define INSN_IM14(x)     low_sign_ext(x, 14)
-#define INSN_IM5(x)      (low_sign_ext(x, 5) << 16)
-
-#define COND_NEVER   0
-#define COND_EQ      1
-#define COND_LT      2
-#define COND_LE      3
-#define COND_LTU     4
-#define COND_LEU     5
-#define COND_SV      6
-#define COND_OD      7
-#define COND_FALSE   8
-
-#define INSN_ADD	(INSN_OP(0x02) | INSN_EXT6(0x18))
-#define INSN_ADDC	(INSN_OP(0x02) | INSN_EXT6(0x1c))
-#define INSN_ADDI	(INSN_OP(0x2d))
-#define INSN_ADDIL	(INSN_OP(0x0a))
-#define INSN_ADDL	(INSN_OP(0x02) | INSN_EXT6(0x28))
-#define INSN_AND	(INSN_OP(0x02) | INSN_EXT6(0x08))
-#define INSN_ANDCM	(INSN_OP(0x02) | INSN_EXT6(0x00))
-#define INSN_COMCLR	(INSN_OP(0x02) | INSN_EXT6(0x22))
-#define INSN_COMICLR	(INSN_OP(0x24))
-#define INSN_DEP	(INSN_OP(0x35) | INSN_EXT3SH(3))
-#define INSN_DEPI	(INSN_OP(0x35) | INSN_EXT3SH(7))
-#define INSN_EXTRS	(INSN_OP(0x34) | INSN_EXT3SH(7))
-#define INSN_EXTRU	(INSN_OP(0x34) | INSN_EXT3SH(6))
-#define INSN_LDIL	(INSN_OP(0x08))
-#define INSN_LDO	(INSN_OP(0x0d))
-#define INSN_MTCTL	(INSN_OP(0x00) | INSN_EXT8B(0xc2))
-#define INSN_OR		(INSN_OP(0x02) | INSN_EXT6(0x09))
-#define INSN_SHD	(INSN_OP(0x34) | INSN_EXT3SH(2))
-#define INSN_SUB	(INSN_OP(0x02) | INSN_EXT6(0x10))
-#define INSN_SUBB	(INSN_OP(0x02) | INSN_EXT6(0x14))
-#define INSN_SUBI	(INSN_OP(0x25))
-#define INSN_VEXTRS	(INSN_OP(0x34) | INSN_EXT3SH(5))
-#define INSN_VEXTRU	(INSN_OP(0x34) | INSN_EXT3SH(4))
-#define INSN_VSHD	(INSN_OP(0x34) | INSN_EXT3SH(0))
-#define INSN_XOR	(INSN_OP(0x02) | INSN_EXT6(0x0a))
-#define INSN_ZDEP	(INSN_OP(0x35) | INSN_EXT3SH(2))
-#define INSN_ZVDEP	(INSN_OP(0x35) | INSN_EXT3SH(0))
-
-#define INSN_BL         (INSN_OP(0x3a) | INSN_EXT3BR(0))
-#define INSN_BL_N       (INSN_OP(0x3a) | INSN_EXT3BR(0) | 2)
-#define INSN_BLR        (INSN_OP(0x3a) | INSN_EXT3BR(2))
-#define INSN_BV         (INSN_OP(0x3a) | INSN_EXT3BR(6))
-#define INSN_BV_N       (INSN_OP(0x3a) | INSN_EXT3BR(6) | 2)
-#define INSN_BLE_SR4    (INSN_OP(0x39) | (1 << 13))
-
-#define INSN_LDB        (INSN_OP(0x10))
-#define INSN_LDH        (INSN_OP(0x11))
-#define INSN_LDW        (INSN_OP(0x12))
-#define INSN_LDWM       (INSN_OP(0x13))
-#define INSN_FLDDS      (INSN_OP(0x0b) | INSN_EXT4(0) | (1 << 12))
-
-#define INSN_LDBX	(INSN_OP(0x03) | INSN_EXT4(0))
-#define INSN_LDHX	(INSN_OP(0x03) | INSN_EXT4(1))
-#define INSN_LDWX       (INSN_OP(0x03) | INSN_EXT4(2))
-
-#define INSN_STB        (INSN_OP(0x18))
-#define INSN_STH        (INSN_OP(0x19))
-#define INSN_STW        (INSN_OP(0x1a))
-#define INSN_STWM       (INSN_OP(0x1b))
-#define INSN_FSTDS      (INSN_OP(0x0b) | INSN_EXT4(8) | (1 << 12))
-
-#define INSN_COMBT      (INSN_OP(0x20))
-#define INSN_COMBF      (INSN_OP(0x22))
-#define INSN_COMIBT     (INSN_OP(0x21))
-#define INSN_COMIBF     (INSN_OP(0x23))
-
-/* supplied by libgcc */
-extern void *__canonicalize_funcptr_for_compare(const void *);
-
-static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
-{
-    /* PA1.1 defines COPY as OR r,0,t; PA2.0 defines COPY as LDO 0(r),t
-       but hppa-dis.c is unaware of this definition */
-    if (ret != arg) {
-        tcg_out32(s, INSN_OR | INSN_T(ret) | INSN_R1(arg)
-                  | INSN_R2(TCG_REG_R0));
-    }
-}
-
-static void tcg_out_movi(TCGContext *s, TCGType type,
-                         TCGReg ret, tcg_target_long arg)
-{
-    if (check_fit_tl(arg, 14)) {
-        tcg_out32(s, INSN_LDO | INSN_R1(ret)
-                  | INSN_R2(TCG_REG_R0) | INSN_IM14(arg));
-    } else {
-        uint32_t hi, lo;
-        hi = arg >> 11;
-        lo = arg & 0x7ff;
-
-        tcg_out32(s, INSN_LDIL | INSN_R2(ret) | reassemble_21(hi));
-        if (lo) {
-            tcg_out32(s, INSN_LDO | INSN_R1(ret)
-                      | INSN_R2(ret) | INSN_IM14(lo));
-        }
-    }
-}
-
-static void tcg_out_ldst(TCGContext *s, int ret, int addr,
-                         tcg_target_long offset, int op)
-{
-    if (!check_fit_tl(offset, 14)) {
-        uint32_t hi, lo, op;
-
-        hi = offset >> 11;
-        lo = offset & 0x7ff;
-
-        if (addr == TCG_REG_R0) {
-            op = INSN_LDIL | INSN_R2(TCG_REG_R1);
-        } else {
-            op = INSN_ADDIL | INSN_R2(addr);
-        }
-        tcg_out32(s, op | reassemble_21(hi));
-
-        addr = TCG_REG_R1;
-	offset = lo;
-    }
-
-    if (ret != addr || offset != 0 || op != INSN_LDO) {
-        tcg_out32(s, op | INSN_R1(ret) | INSN_R2(addr) | INSN_IM14(offset));
-    }
-}
-
-/* This function is required by tcg.c.  */
-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
-                              TCGReg arg1, intptr_t arg2)
-{
-    tcg_out_ldst(s, ret, arg1, arg2, INSN_LDW);
-}
-
-/* This function is required by tcg.c.  */
-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg ret,
-                              TCGReg arg1, intptr_t arg2)
-{
-    tcg_out_ldst(s, ret, arg1, arg2, INSN_STW);
-}
-
-static void tcg_out_ldst_index(TCGContext *s, int data,
-                               int base, int index, int op)
-{
-    tcg_out32(s, op | INSN_T(data) | INSN_R1(index) | INSN_R2(base));
-}
-
-static inline void tcg_out_addi2(TCGContext *s, int ret, int arg1,
-                                 tcg_target_long val)
-{
-    tcg_out_ldst(s, ret, arg1, val, INSN_LDO);
-}
-
-/* This function is required by tcg.c.  */
-static inline void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
-{
-    tcg_out_addi2(s, reg, reg, val);
-}
-
-static inline void tcg_out_arith(TCGContext *s, int t, int r1, int r2, int op)
-{
-    tcg_out32(s, op | INSN_T(t) | INSN_R1(r1) | INSN_R2(r2));
-}
-
-static inline void tcg_out_arithi(TCGContext *s, int t, int r1,
-                                  tcg_target_long val, int op)
-{
-    assert(check_fit_tl(val, 11));
-    tcg_out32(s, op | INSN_R1(t) | INSN_R2(r1) | INSN_IM11(val));
-}
-
-static inline void tcg_out_nop(TCGContext *s)
-{
-    tcg_out_arith(s, TCG_REG_R0, TCG_REG_R0, TCG_REG_R0, INSN_OR);
-}
-
-static inline void tcg_out_mtctl_sar(TCGContext *s, int arg)
-{
-    tcg_out32(s, INSN_MTCTL | INSN_R2(11) | INSN_R1(arg));
-}
-
-/* Extract LEN bits at position OFS from ARG and place in RET.
-   Note that here the bit ordering is reversed from the PA-RISC
-   standard, such that the right-most bit is 0.  */
-static inline void tcg_out_extr(TCGContext *s, int ret, int arg,
-                                unsigned ofs, unsigned len, int sign)
-{
-    assert(ofs < 32 && len <= 32 - ofs);
-    tcg_out32(s, (sign ? INSN_EXTRS : INSN_EXTRU)
-              | INSN_R1(ret) | INSN_R2(arg)
-              | INSN_SHDEP_P(31 - ofs) | INSN_DEP_LEN(len));
-}
-
-/* Likewise with OFS interpreted little-endian.  */
-static inline void tcg_out_dep(TCGContext *s, int ret, int arg,
-                               unsigned ofs, unsigned len)
-{
-    assert(ofs < 32 && len <= 32 - ofs);
-    tcg_out32(s, INSN_DEP | INSN_R2(ret) | INSN_R1(arg)
-              | INSN_SHDEP_CP(31 - ofs) | INSN_DEP_LEN(len));
-}
-
-static inline void tcg_out_depi(TCGContext *s, int ret, int arg,
-                                unsigned ofs, unsigned len)
-{
-    assert(ofs < 32 && len <= 32 - ofs);
-    tcg_out32(s, INSN_DEPI | INSN_R2(ret) | INSN_IM5(arg)
-              | INSN_SHDEP_CP(31 - ofs) | INSN_DEP_LEN(len));
-}
-
-static inline void tcg_out_shd(TCGContext *s, int ret, int hi, int lo,
-                               unsigned count)
-{
-    assert(count < 32);
-    tcg_out32(s, INSN_SHD | INSN_R1(hi) | INSN_R2(lo) | INSN_T(ret)
-              | INSN_SHDEP_CP(count));
-}
-
-static void tcg_out_vshd(TCGContext *s, int ret, int hi, int lo, int creg)
-{
-    tcg_out_mtctl_sar(s, creg);
-    tcg_out32(s, INSN_VSHD | INSN_T(ret) | INSN_R1(hi) | INSN_R2(lo));
-}
-
-static void tcg_out_ori(TCGContext *s, int ret, int arg, tcg_target_ulong m)
-{
-    int bs0, bs1;
-
-    /* Note that the argument is constrained to match or_mask_p.  */
-    for (bs0 = 0; bs0 < 32; bs0++) {
-        if ((m & (1u << bs0)) != 0) {
-            break;
-        }
-    }
-    for (bs1 = bs0; bs1 < 32; bs1++) {
-        if ((m & (1u << bs1)) == 0) {
-            break;
-        }
-    }
-    assert(bs1 == 32 || (1ul << bs1) > m);
-
-    tcg_out_mov(s, TCG_TYPE_I32, ret, arg);
-    tcg_out_depi(s, ret, -1, bs0, bs1 - bs0);
-}
-
-static void tcg_out_andi(TCGContext *s, int ret, int arg, tcg_target_ulong m)
-{
-    int ls0, ls1, ms0;
-
-    /* Note that the argument is constrained to match and_mask_p.  */
-    for (ls0 = 0; ls0 < 32; ls0++) {
-        if ((m & (1u << ls0)) == 0) {
-            break;
-        }
-    }
-    for (ls1 = ls0; ls1 < 32; ls1++) {
-        if ((m & (1u << ls1)) != 0) {
-            break;
-        }
-    }
-    for (ms0 = ls1; ms0 < 32; ms0++) {
-        if ((m & (1u << ms0)) == 0) {
-            break;
-        }
-    }
-    assert (ms0 == 32);
-
-    if (ls1 == 32) {
-        tcg_out_extr(s, ret, arg, 0, ls0, 0);
-    } else {
-        tcg_out_mov(s, TCG_TYPE_I32, ret, arg);
-        tcg_out_depi(s, ret, 0, ls0, ls1 - ls0);
-    }
-}
-
-static inline void tcg_out_ext8s(TCGContext *s, int ret, int arg)
-{
-    tcg_out_extr(s, ret, arg, 0, 8, 1);
-}
-
-static inline void tcg_out_ext16s(TCGContext *s, int ret, int arg)
-{
-    tcg_out_extr(s, ret, arg, 0, 16, 1);
-}
-
-static void tcg_out_shli(TCGContext *s, int ret, int arg, int count)
-{
-    count &= 31;
-    tcg_out32(s, INSN_ZDEP | INSN_R2(ret) | INSN_R1(arg)
-              | INSN_SHDEP_CP(31 - count) | INSN_DEP_LEN(32 - count));
-}
-
-static void tcg_out_shl(TCGContext *s, int ret, int arg, int creg)
-{
-    tcg_out_arithi(s, TCG_REG_R20, creg, 31, INSN_SUBI);
-    tcg_out_mtctl_sar(s, TCG_REG_R20);
-    tcg_out32(s, INSN_ZVDEP | INSN_R2(ret) | INSN_R1(arg) | INSN_DEP_LEN(32));
-}
-
-static void tcg_out_shri(TCGContext *s, int ret, int arg, int count)
-{
-    count &= 31;
-    tcg_out_extr(s, ret, arg, count, 32 - count, 0);
-}
-
-static void tcg_out_shr(TCGContext *s, int ret, int arg, int creg)
-{
-    tcg_out_vshd(s, ret, TCG_REG_R0, arg, creg);
-}
-
-static void tcg_out_sari(TCGContext *s, int ret, int arg, int count)
-{
-    count &= 31;
-    tcg_out_extr(s, ret, arg, count, 32 - count, 1);
-}
-
-static void tcg_out_sar(TCGContext *s, int ret, int arg, int creg)
-{
-    tcg_out_arithi(s, TCG_REG_R20, creg, 31, INSN_SUBI);
-    tcg_out_mtctl_sar(s, TCG_REG_R20);
-    tcg_out32(s, INSN_VEXTRS | INSN_R1(ret) | INSN_R2(arg) | INSN_DEP_LEN(32));
-}
-
-static void tcg_out_rotli(TCGContext *s, int ret, int arg, int count)
-{
-    count &= 31;
-    tcg_out_shd(s, ret, arg, arg, 32 - count);
-}
-
-static void tcg_out_rotl(TCGContext *s, int ret, int arg, int creg)
-{
-    tcg_out_arithi(s, TCG_REG_R20, creg, 32, INSN_SUBI);
-    tcg_out_vshd(s, ret, arg, arg, TCG_REG_R20);
-}
-
-static void tcg_out_rotri(TCGContext *s, int ret, int arg, int count)
-{
-    count &= 31;
-    tcg_out_shd(s, ret, arg, arg, count);
-}
-
-static void tcg_out_rotr(TCGContext *s, int ret, int arg, int creg)
-{
-    tcg_out_vshd(s, ret, arg, arg, creg);
-}
-
-static void tcg_out_bswap16(TCGContext *s, int ret, int arg, int sign)
-{
-    if (ret != arg) {
-        tcg_out_mov(s, TCG_TYPE_I32, ret, arg); /* arg =  xxAB */
-    }
-    tcg_out_dep(s, ret, ret, 16, 8);          /* ret =  xBAB */
-    tcg_out_extr(s, ret, ret, 8, 16, sign);   /* ret =  ..BA */
-}
-
-static void tcg_out_bswap32(TCGContext *s, int ret, int arg, int temp)
-{
-                                          /* arg =  ABCD */
-    tcg_out_rotri(s, temp, arg, 16);      /* temp = CDAB */
-    tcg_out_dep(s, temp, temp, 16, 8);    /* temp = CBAB */
-    tcg_out_shd(s, ret, arg, temp, 8);    /* ret =  DCBA */
-}
-
-static void tcg_out_call(TCGContext *s, const void *func)
-{
-    tcg_target_long val, hi, lo, disp;
-
-    val = (uint32_t)__canonicalize_funcptr_for_compare(func);
-    disp = (val - ((tcg_target_long)s->code_ptr + 8)) >> 2;
-
-    if (check_fit_tl(disp, 17)) {
-        tcg_out32(s, INSN_BL_N | INSN_R2(TCG_REG_RP) | reassemble_17(disp));
-    } else {
-        hi = val >> 11;
-        lo = val & 0x7ff;
-
-        tcg_out32(s, INSN_LDIL | INSN_R2(TCG_REG_R20) | reassemble_21(hi));
-        tcg_out32(s, INSN_BLE_SR4 | INSN_R2(TCG_REG_R20)
-                  | reassemble_17(lo >> 2));
-        tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_RP, TCG_REG_R31);
-    }
-}
-
-static void tcg_out_xmpyu(TCGContext *s, int retl, int reth,
-                          int arg1, int arg2)
-{
-    /* Store both words into the stack for copy to the FPU.  */
-    tcg_out_ldst(s, arg1, TCG_REG_CALL_STACK, STACK_TEMP_OFS, INSN_STW);
-    tcg_out_ldst(s, arg2, TCG_REG_CALL_STACK, STACK_TEMP_OFS + 4, INSN_STW);
-
-    /* Load both words into the FPU at the same time.  We get away
-       with this because we can address the left and right half of the
-       FPU registers individually once loaded.  */
-    /* fldds stack_temp(sp),fr22 */
-    tcg_out32(s, INSN_FLDDS | INSN_R2(TCG_REG_CALL_STACK)
-              | INSN_IM5(STACK_TEMP_OFS) | INSN_T(22));
-
-    /* xmpyu fr22r,fr22,fr22 */
-    tcg_out32(s, 0x3ad64796);
-
-    /* Store the 64-bit result back into the stack.  */
-    /* fstds stack_temp(sp),fr22 */
-    tcg_out32(s, INSN_FSTDS | INSN_R2(TCG_REG_CALL_STACK)
-              | INSN_IM5(STACK_TEMP_OFS) | INSN_T(22));
-
-    /* Load the pieces of the result that the caller requested.  */
-    if (reth) {
-        tcg_out_ldst(s, reth, TCG_REG_CALL_STACK, STACK_TEMP_OFS, INSN_LDW);
-    }
-    if (retl) {
-        tcg_out_ldst(s, retl, TCG_REG_CALL_STACK, STACK_TEMP_OFS + 4,
-                     INSN_LDW);
-    }
-}
-
-static void tcg_out_add2(TCGContext *s, int destl, int desth,
-                         int al, int ah, int bl, int bh, int blconst)
-{
-    int tmp = (destl == ah || destl == bh ? TCG_REG_R20 : destl);
-
-    if (blconst) {
-        tcg_out_arithi(s, tmp, al, bl, INSN_ADDI);
-    } else {
-        tcg_out_arith(s, tmp, al, bl, INSN_ADD);
-    }
-    tcg_out_arith(s, desth, ah, bh, INSN_ADDC);
-
-    tcg_out_mov(s, TCG_TYPE_I32, destl, tmp);
-}
-
-static void tcg_out_sub2(TCGContext *s, int destl, int desth, int al, int ah,
-                         int bl, int bh, int alconst, int blconst)
-{
-    int tmp = (destl == ah || destl == bh ? TCG_REG_R20 : destl);
-
-    if (alconst) {
-        if (blconst) {
-            tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R20, bl);
-            bl = TCG_REG_R20;
-        }
-        tcg_out_arithi(s, tmp, bl, al, INSN_SUBI);
-    } else if (blconst) {
-        tcg_out_arithi(s, tmp, al, -bl, INSN_ADDI);
-    } else {
-        tcg_out_arith(s, tmp, al, bl, INSN_SUB);
-    }
-    tcg_out_arith(s, desth, ah, bh, INSN_SUBB);
-
-    tcg_out_mov(s, TCG_TYPE_I32, destl, tmp);
-}
-
-static void tcg_out_branch(TCGContext *s, int label_index, int nul)
-{
-    TCGLabel *l = &s->labels[label_index];
-    uint32_t op = nul ? INSN_BL_N : INSN_BL;
-
-    if (l->has_value) {
-        tcg_target_long val = l->u.value;
-
-        val -= (tcg_target_long)s->code_ptr + 8;
-        val >>= 2;
-        assert(check_fit_tl(val, 17));
-
-        tcg_out32(s, op | reassemble_17(val));
-    } else {
-        /* We need to keep the offset unchanged for retranslation.  */
-        uint32_t old_insn = *(uint32_t *)s->code_ptr;
-
-        tcg_out_reloc(s, s->code_ptr, R_PARISC_PCREL17F, label_index, 0);
-        tcg_out32(s, op | (old_insn & 0x1f1ffdu));
-    }
-}
-
-static const uint8_t tcg_cond_to_cmp_cond[] =
-{
-    [TCG_COND_EQ] = COND_EQ,
-    [TCG_COND_NE] = COND_EQ | COND_FALSE,
-    [TCG_COND_LT] = COND_LT,
-    [TCG_COND_GE] = COND_LT | COND_FALSE,
-    [TCG_COND_LE] = COND_LE,
-    [TCG_COND_GT] = COND_LE | COND_FALSE,
-    [TCG_COND_LTU] = COND_LTU,
-    [TCG_COND_GEU] = COND_LTU | COND_FALSE,
-    [TCG_COND_LEU] = COND_LEU,
-    [TCG_COND_GTU] = COND_LEU | COND_FALSE,
-};
-
-static void tcg_out_brcond(TCGContext *s, int cond, TCGArg c1,
-                           TCGArg c2, int c2const, int label_index)
-{
-    TCGLabel *l = &s->labels[label_index];
-    int op, pacond;
-
-    /* Note that COMIB operates as if the immediate is the first
-       operand.  We model brcond with the immediate in the second
-       to better match what targets are likely to give us.  For
-       consistency, model COMB with reversed operands as well.  */
-    pacond = tcg_cond_to_cmp_cond[tcg_swap_cond(cond)];
-
-    if (c2const) {
-        op = (pacond & COND_FALSE ? INSN_COMIBF : INSN_COMIBT);
-        op |= INSN_IM5(c2);
-    } else {
-        op = (pacond & COND_FALSE ? INSN_COMBF : INSN_COMBT);
-        op |= INSN_R1(c2);
-    }
-    op |= INSN_R2(c1);
-    op |= INSN_COND(pacond & 7);
-
-    if (l->has_value) {
-        tcg_target_long val = l->u.value;
-
-        val -= (tcg_target_long)s->code_ptr + 8;
-        val >>= 2;
-        assert(check_fit_tl(val, 12));
-
-        /* ??? Assume that all branches to defined labels are backward.
-           Which means that if the nul bit is set, the delay slot is
-           executed if the branch is taken, and not executed in fallthru.  */
-        tcg_out32(s, op | reassemble_12(val));
-        tcg_out_nop(s);
-    } else {
-        /* We need to keep the offset unchanged for retranslation.  */
-        uint32_t old_insn = *(uint32_t *)s->code_ptr;
-
-        tcg_out_reloc(s, s->code_ptr, R_PARISC_PCREL12F, label_index, 0);
-        /* ??? Assume that all branches to undefined labels are forward.
-           Which means that if the nul bit is set, the delay slot is
-           not executed if the branch is taken, which is what we want.  */
-        tcg_out32(s, op | 2 | (old_insn & 0x1ffdu));
-    }
-}
-
-static void tcg_out_comclr(TCGContext *s, int cond, TCGArg ret,
-                           TCGArg c1, TCGArg c2, int c2const)
-{
-    int op, pacond;
-
-    /* Note that COMICLR operates as if the immediate is the first
-       operand.  We model setcond with the immediate in the second
-       to better match what targets are likely to give us.  For
-       consistency, model COMCLR with reversed operands as well.  */
-    pacond = tcg_cond_to_cmp_cond[tcg_swap_cond(cond)];
-
-    if (c2const) {
-        op = INSN_COMICLR | INSN_R2(c1) | INSN_R1(ret) | INSN_IM11(c2);
-    } else {
-        op = INSN_COMCLR | INSN_R2(c1) | INSN_R1(c2) | INSN_T(ret);
-    }
-    op |= INSN_COND(pacond & 7);
-    op |= pacond & COND_FALSE ? 1 << 12 : 0;
-
-    tcg_out32(s, op);
-}
-
-static void tcg_out_brcond2(TCGContext *s, int cond, TCGArg al, TCGArg ah,
-                            TCGArg bl, int blconst, TCGArg bh, int bhconst,
-                            int label_index)
-{
-    switch (cond) {
-    case TCG_COND_EQ:
-        tcg_out_comclr(s, TCG_COND_NE, TCG_REG_R0, al, bl, blconst);
-        tcg_out_brcond(s, TCG_COND_EQ, ah, bh, bhconst, label_index);
-        break;
-    case TCG_COND_NE:
-        tcg_out_brcond(s, TCG_COND_NE, al, bl, blconst, label_index);
-        tcg_out_brcond(s, TCG_COND_NE, ah, bh, bhconst, label_index);
-        break;
-    default:
-        tcg_out_brcond(s, tcg_high_cond(cond), ah, bh, bhconst, label_index);
-        tcg_out_comclr(s, TCG_COND_NE, TCG_REG_R0, ah, bh, bhconst);
-        tcg_out_brcond(s, tcg_unsigned_cond(cond),
-                       al, bl, blconst, label_index);
-        break;
-    }
-}
-
-static void tcg_out_setcond(TCGContext *s, int cond, TCGArg ret,
-                            TCGArg c1, TCGArg c2, int c2const)
-{
-    tcg_out_comclr(s, tcg_invert_cond(cond), ret, c1, c2, c2const);
-    tcg_out_movi(s, TCG_TYPE_I32, ret, 1);
-}
-
-static void tcg_out_setcond2(TCGContext *s, int cond, TCGArg ret,
-                             TCGArg al, TCGArg ah, TCGArg bl, int blconst,
-                             TCGArg bh, int bhconst)
-{
-    int scratch = TCG_REG_R20;
-
-    /* Note that the low parts are fully consumed before scratch is set.  */
-    if (ret != ah && (bhconst || ret != bh)) {
-        scratch = ret;
-    }
-
-    switch (cond) {
-    case TCG_COND_EQ:
-    case TCG_COND_NE:
-        tcg_out_setcond(s, cond, scratch, al, bl, blconst);
-        tcg_out_comclr(s, TCG_COND_EQ, TCG_REG_R0, ah, bh, bhconst);
-        tcg_out_movi(s, TCG_TYPE_I32, scratch, cond == TCG_COND_NE);
-        break;
-
-    case TCG_COND_GE:
-    case TCG_COND_GEU:
-    case TCG_COND_LT:
-    case TCG_COND_LTU:
-        /* Optimize compares with low part zero.  */
-        if (bl == 0) {
-            tcg_out_setcond(s, cond, ret, ah, bh, bhconst);
-            return;
-        }
-        /* FALLTHRU */
-
-    case TCG_COND_LE:
-    case TCG_COND_LEU:
-    case TCG_COND_GT:
-    case TCG_COND_GTU:
-        /* <= : ah < bh | (ah == bh && al <= bl) */
-        tcg_out_setcond(s, tcg_unsigned_cond(cond), scratch, al, bl, blconst);
-        tcg_out_comclr(s, TCG_COND_EQ, TCG_REG_R0, ah, bh, bhconst);
-        tcg_out_movi(s, TCG_TYPE_I32, scratch, 0);
-        tcg_out_comclr(s, tcg_invert_cond(tcg_high_cond(cond)),
-                       TCG_REG_R0, ah, bh, bhconst);
-        tcg_out_movi(s, TCG_TYPE_I32, scratch, 1);
-        break;
-
-    default:
-        tcg_abort();
-    }
-
-    tcg_out_mov(s, TCG_TYPE_I32, ret, scratch);
-}
-
-static void tcg_out_movcond(TCGContext *s, int cond, TCGArg ret,
-                            TCGArg c1, TCGArg c2, int c2const,
-                            TCGArg v1, int v1const)
-{
-    tcg_out_comclr(s, tcg_invert_cond(cond), TCG_REG_R0, c1, c2, c2const);
-    if (v1const) {
-        tcg_out_movi(s, TCG_TYPE_I32, ret, v1);
-    } else {
-        tcg_out_mov(s, TCG_TYPE_I32, ret, v1);
-    }
-}
-
-#if defined(CONFIG_SOFTMMU)
-/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
-   int mmu_idx) */
-static const void * const qemu_ld_helpers[4] = {
-    helper_ldb_mmu,
-    helper_ldw_mmu,
-    helper_ldl_mmu,
-    helper_ldq_mmu,
-};
-
-/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
-   uintxx_t val, int mmu_idx) */
-static const void * const qemu_st_helpers[4] = {
-    helper_stb_mmu,
-    helper_stw_mmu,
-    helper_stl_mmu,
-    helper_stq_mmu,
-};
-
-/* Load and compare a TLB entry, and branch if TLB miss.  OFFSET is set to
-   the offset of the first ADDR_READ or ADDR_WRITE member of the appropriate
-   TLB for the memory index.  The return value is the offset from ENV
-   contained in R1 afterward (to be used when loading ADDEND); if the
-   return value is 0, R1 is not used.  */
-
-static int tcg_out_tlb_read(TCGContext *s, int r0, int r1, int addrlo,
-                            int addrhi, int s_bits, int lab_miss, int offset)
-{
-    int ret;
-
-    /* Extracting the index into the TLB.  The "normal C operation" is
-          r1 = addr_reg >> TARGET_PAGE_BITS;
-          r1 &= CPU_TLB_SIZE - 1;
-          r1 <<= CPU_TLB_ENTRY_BITS;
-       What this does is extract CPU_TLB_BITS beginning at TARGET_PAGE_BITS
-       and place them at CPU_TLB_ENTRY_BITS.  We can combine the first two
-       operations with an EXTRU.  Unfortunately, the current value of
-       CPU_TLB_ENTRY_BITS is > 3, so we can't merge that shift with the
-       add that follows.  */
-    tcg_out_extr(s, r1, addrlo, TARGET_PAGE_BITS, CPU_TLB_BITS, 0);
-    tcg_out_shli(s, r1, r1, CPU_TLB_ENTRY_BITS);
-    tcg_out_arith(s, r1, r1, TCG_AREG0, INSN_ADDL);
-
-    /* Make sure that both the addr_{read,write} and addend can be
-       read with a 14-bit offset from the same base register.  */
-    if (check_fit_tl(offset + CPU_TLB_SIZE, 14)) {
-        ret = 0;
-    } else {
-        ret = (offset + 0x400) & ~0x7ff;
-        offset = ret - offset;
-        tcg_out_addi2(s, TCG_REG_R1, r1, ret);
-        r1 = TCG_REG_R1;
-    }
-
-    /* Load the entry from the computed slot.  */
-    if (TARGET_LONG_BITS == 64) {
-        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R23, r1, offset);
-        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, r1, offset + 4);
-    } else {
-        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, r1, offset);
-    }
-
-    /* Compute the value that ought to appear in the TLB for a hit, namely,
-       the page of the address.  We include the low N bits of the address
-       to catch unaligned accesses and force them onto the slow path.  Do
-       this computation after having issued the load from the TLB slot to
-       give the load time to complete.  */
-    tcg_out_andi(s, r0, addrlo, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
-
-    /* If not equal, jump to lab_miss. */
-    if (TARGET_LONG_BITS == 64) {
-        tcg_out_brcond2(s, TCG_COND_NE, TCG_REG_R20, TCG_REG_R23,
-                        r0, 0, addrhi, 0, lab_miss);
-    } else {
-        tcg_out_brcond(s, TCG_COND_NE, TCG_REG_R20, r0, 0, lab_miss);
-    }
-
-    return ret;
-}
-
-static int tcg_out_arg_reg32(TCGContext *s, int argno, TCGArg v, bool vconst)
-{
-    if (argno < 4) {
-        if (vconst) {
-            tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[argno], v);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[argno], v);
-        }
-    } else {
-        if (vconst && v != 0) {
-            tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R20, v);
-            v = TCG_REG_R20;
-        }
-        tcg_out_st(s, TCG_TYPE_I32, v, TCG_REG_CALL_STACK,
-                   TCG_TARGET_CALL_STACK_OFFSET - ((argno - 3) * 4));
-    }
-    return argno + 1;
-}
-
-static int tcg_out_arg_reg64(TCGContext *s, int argno, TCGArg vl, TCGArg vh)
-{
-    /* 64-bit arguments must go in even reg pairs and stack slots.  */
-    if (argno & 1) {
-        argno++;
-    }
-    argno = tcg_out_arg_reg32(s, argno, vl, false);
-    argno = tcg_out_arg_reg32(s, argno, vh, false);
-    return argno;
-}
-#endif
-
-static void tcg_out_qemu_ld_direct(TCGContext *s, int datalo_reg, int datahi_reg,
-                                   int addr_reg, int addend_reg, int opc)
-{
-#ifdef TARGET_WORDS_BIGENDIAN
-    const int bswap = 0;
-#else
-    const int bswap = 1;
-#endif
-
-    switch (opc) {
-    case 0:
-        tcg_out_ldst_index(s, datalo_reg, addr_reg, addend_reg, INSN_LDBX);
-        break;
-    case 0 | 4:
-        tcg_out_ldst_index(s, datalo_reg, addr_reg, addend_reg, INSN_LDBX);
-        tcg_out_ext8s(s, datalo_reg, datalo_reg);
-        break;
-    case 1:
-        tcg_out_ldst_index(s, datalo_reg, addr_reg, addend_reg, INSN_LDHX);
-        if (bswap) {
-            tcg_out_bswap16(s, datalo_reg, datalo_reg, 0);
-        }
-        break;
-    case 1 | 4:
-        tcg_out_ldst_index(s, datalo_reg, addr_reg, addend_reg, INSN_LDHX);
-        if (bswap) {
-            tcg_out_bswap16(s, datalo_reg, datalo_reg, 1);
-        } else {
-            tcg_out_ext16s(s, datalo_reg, datalo_reg);
-        }
-        break;
-    case 2:
-        tcg_out_ldst_index(s, datalo_reg, addr_reg, addend_reg, INSN_LDWX);
-        if (bswap) {
-            tcg_out_bswap32(s, datalo_reg, datalo_reg, TCG_REG_R20);
-        }
-        break;
-    case 3:
-        if (bswap) {
-            int t = datahi_reg;
-            datahi_reg = datalo_reg;
-            datalo_reg = t;
-        }
-        /* We can't access the low-part with a reg+reg addressing mode,
-           so perform the addition now and use reg_ofs addressing mode.  */
-        if (addend_reg != TCG_REG_R0) {
-            tcg_out_arith(s, TCG_REG_R20, addr_reg, addend_reg, INSN_ADD);
-            addr_reg = TCG_REG_R20;
-	}
-        /* Make sure not to clobber the base register.  */
-        if (datahi_reg == addr_reg) {
-            tcg_out_ldst(s, datalo_reg, addr_reg, 4, INSN_LDW);
-            tcg_out_ldst(s, datahi_reg, addr_reg, 0, INSN_LDW);
-        } else {
-            tcg_out_ldst(s, datahi_reg, addr_reg, 0, INSN_LDW);
-            tcg_out_ldst(s, datalo_reg, addr_reg, 4, INSN_LDW);
-        }
-        if (bswap) {
-            tcg_out_bswap32(s, datalo_reg, datalo_reg, TCG_REG_R20);
-            tcg_out_bswap32(s, datahi_reg, datahi_reg, TCG_REG_R20);
-        }
-        break;
-    default:
-        tcg_abort();
-    }
-}
-
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
-{
-    int datalo_reg = *args++;
-    /* Note that datahi_reg is only used for 64-bit loads.  */
-    int datahi_reg = (opc == 3 ? *args++ : TCG_REG_R0);
-    int addrlo_reg = *args++;
-
-#if defined(CONFIG_SOFTMMU)
-    /* Note that addrhi_reg is only used for 64-bit guests.  */
-    int addrhi_reg = (TARGET_LONG_BITS == 64 ? *args++ : TCG_REG_R0);
-    int mem_index = *args;
-    int lab1, lab2, argno, offset;
-
-    lab1 = gen_new_label();
-    lab2 = gen_new_label();
-
-    offset = offsetof(CPUArchState, tlb_table[mem_index][0].addr_read);
-    offset = tcg_out_tlb_read(s, TCG_REG_R26, TCG_REG_R25, addrlo_reg,
-                              addrhi_reg, opc & 3, lab1, offset);
-
-    /* TLB Hit.  */
-    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20,
-               (offset ? TCG_REG_R1 : TCG_REG_R25),
-               offsetof(CPUArchState, tlb_table[mem_index][0].addend) - offset);
-    tcg_out_qemu_ld_direct(s, datalo_reg, datahi_reg, addrlo_reg,
-                           TCG_REG_R20, opc);
-    tcg_out_branch(s, lab2, 1);
-
-    /* TLB Miss.  */
-    /* label1: */
-    tcg_out_label(s, lab1, s->code_ptr);
-
-    argno = 0;
-    argno = tcg_out_arg_reg32(s, argno, TCG_AREG0, false);
-    if (TARGET_LONG_BITS == 64) {
-        argno = tcg_out_arg_reg64(s, argno, addrlo_reg, addrhi_reg);
-    } else {
-        argno = tcg_out_arg_reg32(s, argno, addrlo_reg, false);
-    }
-    argno = tcg_out_arg_reg32(s, argno, mem_index, true);
-
-    tcg_out_call(s, qemu_ld_helpers[opc & 3]);
-
-    switch (opc) {
-    case 0:
-        tcg_out_andi(s, datalo_reg, TCG_REG_RET0, 0xff);
-        break;
-    case 0 | 4:
-        tcg_out_ext8s(s, datalo_reg, TCG_REG_RET0);
-        break;
-    case 1:
-        tcg_out_andi(s, datalo_reg, TCG_REG_RET0, 0xffff);
-        break;
-    case 1 | 4:
-        tcg_out_ext16s(s, datalo_reg, TCG_REG_RET0);
-        break;
-    case 2:
-    case 2 | 4:
-        tcg_out_mov(s, TCG_TYPE_I32, datalo_reg, TCG_REG_RET0);
-        break;
-    case 3:
-        tcg_out_mov(s, TCG_TYPE_I32, datahi_reg, TCG_REG_RET0);
-        tcg_out_mov(s, TCG_TYPE_I32, datalo_reg, TCG_REG_RET1);
-        break;
-    default:
-        tcg_abort();
-    }
-
-    /* label2: */
-    tcg_out_label(s, lab2, s->code_ptr);
-#else
-    tcg_out_qemu_ld_direct(s, datalo_reg, datahi_reg, addrlo_reg,
-                           (GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_R0), opc);
-#endif
-}
-
-static void tcg_out_qemu_st_direct(TCGContext *s, int datalo_reg,
-                                   int datahi_reg, int addr_reg, int opc)
-{
-#ifdef TARGET_WORDS_BIGENDIAN
-    const int bswap = 0;
-#else
-    const int bswap = 1;
-#endif
-
-    switch (opc) {
-    case 0:
-        tcg_out_ldst(s, datalo_reg, addr_reg, 0, INSN_STB);
-        break;
-    case 1:
-        if (bswap) {
-            tcg_out_bswap16(s, TCG_REG_R20, datalo_reg, 0);
-            datalo_reg = TCG_REG_R20;
-        }
-        tcg_out_ldst(s, datalo_reg, addr_reg, 0, INSN_STH);
-        break;
-    case 2:
-        if (bswap) {
-            tcg_out_bswap32(s, TCG_REG_R20, datalo_reg, TCG_REG_R20);
-            datalo_reg = TCG_REG_R20;
-        }
-        tcg_out_ldst(s, datalo_reg, addr_reg, 0, INSN_STW);
-        break;
-    case 3:
-        if (bswap) {
-            tcg_out_bswap32(s, TCG_REG_R20, datalo_reg, TCG_REG_R20);
-            tcg_out_bswap32(s, TCG_REG_R23, datahi_reg, TCG_REG_R23);
-            datahi_reg = TCG_REG_R20;
-            datalo_reg = TCG_REG_R23;
-        }
-        tcg_out_ldst(s, datahi_reg, addr_reg, 0, INSN_STW);
-        tcg_out_ldst(s, datalo_reg, addr_reg, 4, INSN_STW);
-        break;
-    default:
-        tcg_abort();
-    }
-
-}
-
-static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
-{
-    int datalo_reg = *args++;
-    /* Note that datahi_reg is only used for 64-bit loads.  */
-    int datahi_reg = (opc == 3 ? *args++ : TCG_REG_R0);
-    int addrlo_reg = *args++;
-
-#if defined(CONFIG_SOFTMMU)
-    /* Note that addrhi_reg is only used for 64-bit guests.  */
-    int addrhi_reg = (TARGET_LONG_BITS == 64 ? *args++ : TCG_REG_R0);
-    int mem_index = *args;
-    int lab1, lab2, argno, next, offset;
-
-    lab1 = gen_new_label();
-    lab2 = gen_new_label();
-
-    offset = offsetof(CPUArchState, tlb_table[mem_index][0].addr_write);
-    offset = tcg_out_tlb_read(s, TCG_REG_R26, TCG_REG_R25, addrlo_reg,
-                              addrhi_reg, opc, lab1, offset);
-
-    /* TLB Hit.  */
-    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20,
-               (offset ? TCG_REG_R1 : TCG_REG_R25),
-               offsetof(CPUArchState, tlb_table[mem_index][0].addend) - offset);
-
-    /* There are no indexed stores, so we must do this addition explitly.
-       Careful to avoid R20, which is used for the bswaps to follow.  */
-    tcg_out_arith(s, TCG_REG_R31, addrlo_reg, TCG_REG_R20, INSN_ADDL);
-    tcg_out_qemu_st_direct(s, datalo_reg, datahi_reg, TCG_REG_R31, opc);
-    tcg_out_branch(s, lab2, 1);
-
-    /* TLB Miss.  */
-    /* label1: */
-    tcg_out_label(s, lab1, s->code_ptr);
-
-    argno = 0;
-    argno = tcg_out_arg_reg32(s, argno, TCG_AREG0, false);
-    if (TARGET_LONG_BITS == 64) {
-        argno = tcg_out_arg_reg64(s, argno, addrlo_reg, addrhi_reg);
-    } else {
-        argno = tcg_out_arg_reg32(s, argno, addrlo_reg, false);
-    }
-
-    next = (argno < 4 ? tcg_target_call_iarg_regs[argno] : TCG_REG_R20);
-    switch(opc) {
-    case 0:
-        tcg_out_andi(s, next, datalo_reg, 0xff);
-        argno = tcg_out_arg_reg32(s, argno, next, false);
-        break;
-    case 1:
-        tcg_out_andi(s, next, datalo_reg, 0xffff);
-        argno = tcg_out_arg_reg32(s, argno, next, false);
-        break;
-    case 2:
-        argno = tcg_out_arg_reg32(s, argno, datalo_reg, false);
-        break;
-    case 3:
-        argno = tcg_out_arg_reg64(s, argno, datalo_reg, datahi_reg);
-        break;
-    default:
-        tcg_abort();
-    }
-    argno = tcg_out_arg_reg32(s, argno, mem_index, true);
-
-    tcg_out_call(s, qemu_st_helpers[opc]);
-
-    /* label2: */
-    tcg_out_label(s, lab2, s->code_ptr);
-#else
-    /* There are no indexed stores, so if GUEST_BASE is set we must do
-       the add explicitly.  Careful to avoid R20, which is used for the
-       bswaps to follow.  */
-    if (GUEST_BASE != 0) {
-        tcg_out_arith(s, TCG_REG_R31, addrlo_reg,
-                      TCG_GUEST_BASE_REG, INSN_ADDL);
-        addrlo_reg = TCG_REG_R31;
-    }
-    tcg_out_qemu_st_direct(s, datalo_reg, datahi_reg, addrlo_reg, opc);
-#endif
-}
-
-static void tcg_out_exit_tb(TCGContext *s, TCGArg arg)
-{
-    if (!check_fit_tl(arg, 14)) {
-        uint32_t hi, lo;
-        hi = arg & ~0x7ff;
-        lo = arg & 0x7ff;
-        if (lo) {
-            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RET0, hi);
-            tcg_out32(s, INSN_BV | INSN_R2(TCG_REG_R18));
-            tcg_out_addi(s, TCG_REG_RET0, lo);
-            return;
-        }
-        arg = hi;
-    }
-    tcg_out32(s, INSN_BV | INSN_R2(TCG_REG_R18));
-    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RET0, arg);
-}
-
-static void tcg_out_goto_tb(TCGContext *s, TCGArg arg)
-{
-    if (s->tb_jmp_offset) {
-        /* direct jump method */
-        fprintf(stderr, "goto_tb direct\n");
-        tcg_abort();
-    } else {
-        /* indirect jump method */
-        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, TCG_REG_R0,
-                   (tcg_target_long)(s->tb_next + arg));
-        tcg_out32(s, INSN_BV_N | INSN_R2(TCG_REG_R20));
-    }
-    s->tb_next_offset[arg] = s->code_ptr - s->code_buf;
-}
-
-static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
-                              const int *const_args)
-{
-    switch (opc) {
-    case INDEX_op_exit_tb:
-        tcg_out_exit_tb(s, args[0]);
-        break;
-    case INDEX_op_goto_tb:
-        tcg_out_goto_tb(s, args[0]);
-        break;
-
-    case INDEX_op_call:
-        if (const_args[0]) {
-            tcg_out_call(s, (void *)args[0]);
-        } else {
-            /* ??? FIXME: the value in the register in args[0] is almost
-               certainly a procedure descriptor, not a code address.  We
-               probably need to use the millicode $$dyncall routine.  */
-            tcg_abort();
-        }
-        break;
-
-    case INDEX_op_br:
-        tcg_out_branch(s, args[0], 1);
-        break;
-
-    case INDEX_op_movi_i32:
-        tcg_out_movi(s, TCG_TYPE_I32, args[0], (uint32_t)args[1]);
-        break;
-
-    case INDEX_op_ld8u_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDB);
-        break;
-    case INDEX_op_ld8s_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDB);
-        tcg_out_ext8s(s, args[0], args[0]);
-        break;
-    case INDEX_op_ld16u_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDH);
-        break;
-    case INDEX_op_ld16s_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDH);
-        tcg_out_ext16s(s, args[0], args[0]);
-        break;
-    case INDEX_op_ld_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDW);
-        break;
-
-    case INDEX_op_st8_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], INSN_STB);
-        break;
-    case INDEX_op_st16_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], INSN_STH);
-        break;
-    case INDEX_op_st_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], INSN_STW);
-        break;
-
-    case INDEX_op_add_i32:
-        if (const_args[2]) {
-            tcg_out_addi2(s, args[0], args[1], args[2]);
-        } else {
-            tcg_out_arith(s, args[0], args[1], args[2], INSN_ADDL);
-        }
-        break;
-
-    case INDEX_op_sub_i32:
-        if (const_args[1]) {
-            if (const_args[2]) {
-                tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1] - args[2]);
-            } else {
-                /* Recall that SUBI is a reversed subtract.  */
-                tcg_out_arithi(s, args[0], args[2], args[1], INSN_SUBI);
-            }
-        } else if (const_args[2]) {
-            tcg_out_addi2(s, args[0], args[1], -args[2]);
-        } else {
-            tcg_out_arith(s, args[0], args[1], args[2], INSN_SUB);
-        }
-        break;
-
-    case INDEX_op_and_i32:
-        if (const_args[2]) {
-            tcg_out_andi(s, args[0], args[1], args[2]);
-        } else {
-            tcg_out_arith(s, args[0], args[1], args[2], INSN_AND);
-        }
-        break;
-
-    case INDEX_op_or_i32:
-        if (const_args[2]) {
-            tcg_out_ori(s, args[0], args[1], args[2]);
-        } else {
-            tcg_out_arith(s, args[0], args[1], args[2], INSN_OR);
-        }
-        break;
-
-    case INDEX_op_xor_i32:
-        tcg_out_arith(s, args[0], args[1], args[2], INSN_XOR);
-        break;
-
-    case INDEX_op_andc_i32:
-        if (const_args[2]) {
-            tcg_out_andi(s, args[0], args[1], ~args[2]);
-        } else {
-            tcg_out_arith(s, args[0], args[1], args[2], INSN_ANDCM);
-        }
-        break;
-
-    case INDEX_op_shl_i32:
-        if (const_args[2]) {
-            tcg_out_shli(s, args[0], args[1], args[2]);
-        } else {
-            tcg_out_shl(s, args[0], args[1], args[2]);
-        }
-        break;
-
-    case INDEX_op_shr_i32:
-        if (const_args[2]) {
-            tcg_out_shri(s, args[0], args[1], args[2]);
-        } else {
-            tcg_out_shr(s, args[0], args[1], args[2]);
-        }
-        break;
-
-    case INDEX_op_sar_i32:
-        if (const_args[2]) {
-            tcg_out_sari(s, args[0], args[1], args[2]);
-        } else {
-            tcg_out_sar(s, args[0], args[1], args[2]);
-        }
-        break;
-
-    case INDEX_op_rotl_i32:
-        if (const_args[2]) {
-            tcg_out_rotli(s, args[0], args[1], args[2]);
-        } else {
-            tcg_out_rotl(s, args[0], args[1], args[2]);
-        }
-        break;
-
-    case INDEX_op_rotr_i32:
-        if (const_args[2]) {
-            tcg_out_rotri(s, args[0], args[1], args[2]);
-        } else {
-            tcg_out_rotr(s, args[0], args[1], args[2]);
-        }
-        break;
-
-    case INDEX_op_mul_i32:
-        tcg_out_xmpyu(s, args[0], TCG_REG_R0, args[1], args[2]);
-        break;
-    case INDEX_op_mulu2_i32:
-        tcg_out_xmpyu(s, args[0], args[1], args[2], args[3]);
-        break;
-
-    case INDEX_op_bswap16_i32:
-        tcg_out_bswap16(s, args[0], args[1], 0);
-        break;
-    case INDEX_op_bswap32_i32:
-        tcg_out_bswap32(s, args[0], args[1], TCG_REG_R20);
-        break;
-
-    case INDEX_op_not_i32:
-        tcg_out_arithi(s, args[0], args[1], -1, INSN_SUBI);
-        break;
-    case INDEX_op_ext8s_i32:
-        tcg_out_ext8s(s, args[0], args[1]);
-        break;
-    case INDEX_op_ext16s_i32:
-        tcg_out_ext16s(s, args[0], args[1]);
-        break;
-
-    case INDEX_op_brcond_i32:
-        tcg_out_brcond(s, args[2], args[0], args[1], const_args[1], args[3]);
-        break;
-    case INDEX_op_brcond2_i32:
-        tcg_out_brcond2(s, args[4], args[0], args[1],
-                        args[2], const_args[2],
-                        args[3], const_args[3], args[5]);
-        break;
-
-    case INDEX_op_setcond_i32:
-        tcg_out_setcond(s, args[3], args[0], args[1], args[2], const_args[2]);
-        break;
-    case INDEX_op_setcond2_i32:
-        tcg_out_setcond2(s, args[5], args[0], args[1], args[2],
-                         args[3], const_args[3], args[4], const_args[4]);
-        break;
-
-    case INDEX_op_movcond_i32:
-        tcg_out_movcond(s, args[5], args[0], args[1], args[2], const_args[2],
-                        args[3], const_args[3]);
-        break;
-
-    case INDEX_op_add2_i32:
-        tcg_out_add2(s, args[0], args[1], args[2], args[3],
-                     args[4], args[5], const_args[4]);
-        break;
-
-    case INDEX_op_sub2_i32:
-        tcg_out_sub2(s, args[0], args[1], args[2], args[3],
-                     args[4], args[5], const_args[2], const_args[4]);
-        break;
-
-    case INDEX_op_deposit_i32:
-        if (const_args[2]) {
-            tcg_out_depi(s, args[0], args[2], args[3], args[4]);
-        } else {
-            tcg_out_dep(s, args[0], args[2], args[3], args[4]);
-        }
-        break;
-
-    case INDEX_op_qemu_ld8u:
-        tcg_out_qemu_ld(s, args, 0);
-        break;
-    case INDEX_op_qemu_ld8s:
-        tcg_out_qemu_ld(s, args, 0 | 4);
-        break;
-    case INDEX_op_qemu_ld16u:
-        tcg_out_qemu_ld(s, args, 1);
-        break;
-    case INDEX_op_qemu_ld16s:
-        tcg_out_qemu_ld(s, args, 1 | 4);
-        break;
-    case INDEX_op_qemu_ld32:
-        tcg_out_qemu_ld(s, args, 2);
-        break;
-    case INDEX_op_qemu_ld64:
-        tcg_out_qemu_ld(s, args, 3);
-        break;
-
-    case INDEX_op_qemu_st8:
-        tcg_out_qemu_st(s, args, 0);
-        break;
-    case INDEX_op_qemu_st16:
-        tcg_out_qemu_st(s, args, 1);
-        break;
-    case INDEX_op_qemu_st32:
-        tcg_out_qemu_st(s, args, 2);
-        break;
-    case INDEX_op_qemu_st64:
-        tcg_out_qemu_st(s, args, 3);
-        break;
-
-    default:
-        fprintf(stderr, "unknown opcode 0x%x\n", opc);
-        tcg_abort();
-    }
-}
-
-static const TCGTargetOpDef hppa_op_defs[] = {
-    { INDEX_op_exit_tb, { } },
-    { INDEX_op_goto_tb, { } },
-
-    { INDEX_op_call, { "ri" } },
-    { INDEX_op_br, { } },
-
-    { INDEX_op_mov_i32, { "r", "r" } },
-    { INDEX_op_movi_i32, { "r" } },
-
-    { INDEX_op_ld8u_i32, { "r", "r" } },
-    { INDEX_op_ld8s_i32, { "r", "r" } },
-    { INDEX_op_ld16u_i32, { "r", "r" } },
-    { INDEX_op_ld16s_i32, { "r", "r" } },
-    { INDEX_op_ld_i32, { "r", "r" } },
-    { INDEX_op_st8_i32, { "rZ", "r" } },
-    { INDEX_op_st16_i32, { "rZ", "r" } },
-    { INDEX_op_st_i32, { "rZ", "r" } },
-
-    { INDEX_op_add_i32, { "r", "rZ", "ri" } },
-    { INDEX_op_sub_i32, { "r", "rI", "ri" } },
-    { INDEX_op_and_i32, { "r", "rZ", "rM" } },
-    { INDEX_op_or_i32, { "r", "rZ", "rO" } },
-    { INDEX_op_xor_i32, { "r", "rZ", "rZ" } },
-    /* Note that the second argument will be inverted, which means
-       we want a constant whose inversion matches M, and that O = ~M.
-       See the implementation of and_mask_p.  */
-    { INDEX_op_andc_i32, { "r", "rZ", "rO" } },
-
-    { INDEX_op_mul_i32, { "r", "r", "r" } },
-    { INDEX_op_mulu2_i32, { "r", "r", "r", "r" } },
-
-    { INDEX_op_shl_i32, { "r", "r", "ri" } },
-    { INDEX_op_shr_i32, { "r", "r", "ri" } },
-    { INDEX_op_sar_i32, { "r", "r", "ri" } },
-    { INDEX_op_rotl_i32, { "r", "r", "ri" } },
-    { INDEX_op_rotr_i32, { "r", "r", "ri" } },
-
-    { INDEX_op_bswap16_i32, { "r", "r" } },
-    { INDEX_op_bswap32_i32, { "r", "r" } },
-    { INDEX_op_not_i32, { "r", "r" } },
-
-    { INDEX_op_ext8s_i32, { "r", "r" } },
-    { INDEX_op_ext16s_i32, { "r", "r" } },
-
-    { INDEX_op_brcond_i32, { "rZ", "rJ" } },
-    { INDEX_op_brcond2_i32,  { "rZ", "rZ", "rJ", "rJ" } },
-
-    { INDEX_op_setcond_i32, { "r", "rZ", "rI" } },
-    { INDEX_op_setcond2_i32, { "r", "rZ", "rZ", "rI", "rI" } },
-
-    /* ??? We can actually support a signed 14-bit arg3, but we
-       only have existing constraints for a signed 11-bit.  */
-    { INDEX_op_movcond_i32, { "r", "rZ", "rI", "rI", "0" } },
-
-    { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rI", "rZ" } },
-    { INDEX_op_sub2_i32, { "r", "r", "rI", "rZ", "rK", "rZ" } },
-
-    { INDEX_op_deposit_i32, { "r", "0", "rJ" } },
-
-#if TARGET_LONG_BITS == 32
-    { INDEX_op_qemu_ld8u, { "r", "L" } },
-    { INDEX_op_qemu_ld8s, { "r", "L" } },
-    { INDEX_op_qemu_ld16u, { "r", "L" } },
-    { INDEX_op_qemu_ld16s, { "r", "L" } },
-    { INDEX_op_qemu_ld32, { "r", "L" } },
-    { INDEX_op_qemu_ld64, { "r", "r", "L" } },
-
-    { INDEX_op_qemu_st8, { "LZ", "L" } },
-    { INDEX_op_qemu_st16, { "LZ", "L" } },
-    { INDEX_op_qemu_st32, { "LZ", "L" } },
-    { INDEX_op_qemu_st64, { "LZ", "LZ", "L" } },
-#else
-    { INDEX_op_qemu_ld8u, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld8s, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld16u, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld16s, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld32, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld64, { "r", "r", "L", "L" } },
-
-    { INDEX_op_qemu_st8, { "LZ", "L", "L" } },
-    { INDEX_op_qemu_st16, { "LZ", "L", "L" } },
-    { INDEX_op_qemu_st32, { "LZ", "L", "L" } },
-    { INDEX_op_qemu_st64, { "LZ", "LZ", "L", "L" } },
-#endif
-    { -1 },
-};
-
-static int tcg_target_callee_save_regs[] = {
-    /* R2, the return address register, is saved specially
-       in the caller's frame.  */
-    /* R3, the frame pointer, is not currently modified.  */
-    TCG_REG_R4,
-    TCG_REG_R5,
-    TCG_REG_R6,
-    TCG_REG_R7,
-    TCG_REG_R8,
-    TCG_REG_R9,
-    TCG_REG_R10,
-    TCG_REG_R11,
-    TCG_REG_R12,
-    TCG_REG_R13,
-    TCG_REG_R14,
-    TCG_REG_R15,
-    TCG_REG_R16,
-    TCG_REG_R17, /* R17 is the global env.  */
-    TCG_REG_R18
-};
-
-#define FRAME_SIZE ((-TCG_TARGET_CALL_STACK_OFFSET \
-                     + TCG_TARGET_STATIC_CALL_ARGS_SIZE \
-                     + ARRAY_SIZE(tcg_target_callee_save_regs) * 4 \
-                     + CPU_TEMP_BUF_NLONGS * sizeof(long) \
-                     + TCG_TARGET_STACK_ALIGN - 1) \
-                    & -TCG_TARGET_STACK_ALIGN)
-
-static void tcg_target_qemu_prologue(TCGContext *s)
-{
-    int frame_size, i;
-
-    frame_size = FRAME_SIZE;
-
-    /* The return address is stored in the caller's frame.  */
-    tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_RP, TCG_REG_CALL_STACK, -20);
-
-    /* Allocate stack frame, saving the first register at the same time.  */
-    tcg_out_ldst(s, tcg_target_callee_save_regs[0],
-                 TCG_REG_CALL_STACK, frame_size, INSN_STWM);
-
-    /* Save all callee saved registers.  */
-    for (i = 1; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
-        tcg_out_st(s, TCG_TYPE_PTR, tcg_target_callee_save_regs[i],
-                   TCG_REG_CALL_STACK, -frame_size + i * 4);
-    }
-
-    /* Record the location of the TCG temps.  */
-    tcg_set_frame(s, TCG_REG_CALL_STACK, -frame_size + i * 4,
-                  CPU_TEMP_BUF_NLONGS * sizeof(long));
-
-#ifdef CONFIG_USE_GUEST_BASE
-    if (GUEST_BASE != 0) {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, GUEST_BASE);
-        tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
-    }
-#endif
-
-    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
-
-    /* Jump to TB, and adjust R18 to be the return address.  */
-    tcg_out32(s, INSN_BLE_SR4 | INSN_R2(tcg_target_call_iarg_regs[1]));
-    tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R18, TCG_REG_R31);
-
-    /* Restore callee saved registers.  */
-    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_RP, TCG_REG_CALL_STACK,
-               -frame_size - 20);
-    for (i = 1; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
-        tcg_out_ld(s, TCG_TYPE_PTR, tcg_target_callee_save_regs[i],
-                   TCG_REG_CALL_STACK, -frame_size + i * 4);
-    }
-
-    /* Deallocate stack frame and return.  */
-    tcg_out32(s, INSN_BV | INSN_R2(TCG_REG_RP));
-    tcg_out_ldst(s, tcg_target_callee_save_regs[0],
-                 TCG_REG_CALL_STACK, -frame_size, INSN_LDWM);
-}
-
-static void tcg_target_init(TCGContext *s)
-{
-    tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffffffff);
-
-    tcg_regset_clear(tcg_target_call_clobber_regs);
-    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R20);
-    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R21);
-    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R22);
-    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R23);
-    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R24);
-    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R25);
-    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R26);
-    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RET0);
-    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RET1);
-
-    tcg_regset_clear(s->reserved_regs);
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_R0);  /* hardwired to zero */
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_R1);  /* addil target */
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_RP);  /* link register */
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_R3);  /* frame pointer */
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_R18); /* return pointer */
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_R19); /* clobbered w/o pic */
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_R20); /* reserved */
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_DP);  /* data pointer */
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);  /* stack pointer */
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_R31); /* ble link reg */
-
-    tcg_add_target_add_op_defs(hppa_op_defs);
-}
-
-typedef struct {
-    DebugFrameCIE cie;
-    DebugFrameFDEHeader fde;
-    uint8_t fde_def_cfa[4];
-    uint8_t fde_ret_ofs[3];
-    uint8_t fde_reg_ofs[ARRAY_SIZE(tcg_target_callee_save_regs) * 2];
-} DebugFrame;
-
-#define ELF_HOST_MACHINE  EM_PARISC
-#define ELF_HOST_FLAGS    EFA_PARISC_1_1
-
-/* ??? BFD (and thus GDB) wants very much to distinguish between HPUX
-   and other extensions.  We don't really care, but if we don't set this
-   to *something* then the object file won't be properly matched.  */
-#define ELF_OSABI         ELFOSABI_LINUX
-
-static DebugFrame debug_frame = {
-    .cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
-    .cie.id = -1,
-    .cie.version = 1,
-    .cie.code_align = 1,
-    .cie.data_align = 1,
-    .cie.return_column = 2,
-
-    /* Total FDE size does not include the "len" member.  */
-    .fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, fde.cie_offset),
-
-    .fde_def_cfa = {
-        0x12, 30,                       /* DW_CFA_def_cfa_sf sp, ... */
-        (-FRAME_SIZE & 0x7f) | 0x80,     /* ... sleb128 -FRAME_SIZE */
-        (-FRAME_SIZE >> 7) & 0x7f
-    },
-    .fde_ret_ofs = {
-        0x11, 2, (-20 / 4) & 0x7f       /* DW_CFA_offset_extended_sf r2, 20 */
-    },
-    .fde_reg_ofs = {
-        /* This must match the ordering in tcg_target_callee_save_regs.  */
-        0x80 + 4, 0,                    /* DW_CFA_offset r4, 0 */
-        0x80 + 5, 4,                    /* DW_CFA_offset r5, 4 */
-        0x80 + 6, 8,                    /* DW_CFA_offset r6, 8 */
-        0x80 + 7, 12,                    /* ... */
-        0x80 + 8, 16,
-        0x80 + 9, 20,
-        0x80 + 10, 24,
-        0x80 + 11, 28,
-        0x80 + 12, 32,
-        0x80 + 13, 36,
-        0x80 + 14, 40,
-        0x80 + 15, 44,
-        0x80 + 16, 48,
-        0x80 + 17, 52,
-        0x80 + 18, 56,
-    }
-};
-
-void tcg_register_jit(void *buf, size_t buf_size)
-{
-    debug_frame.fde.func_start = (tcg_target_long) buf;
-    debug_frame.fde.func_len = buf_size;
-
-    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
-}
diff --git a/tcg/hppa/tcg-target.h b/tcg/hppa/tcg-target.h
deleted file mode 100644
index 122edce7a7..0000000000
--- a/tcg/hppa/tcg-target.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Tiny Code Generator for QEMU
- *
- * Copyright (c) 2008 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#ifndef TCG_TARGET_HPPA
-#define TCG_TARGET_HPPA 1
-
-#define TCG_TARGET_WORDS_BIGENDIAN
-
-#define TCG_TARGET_NB_REGS 32
-
-typedef enum {
-    TCG_REG_R0 = 0,
-    TCG_REG_R1,
-    TCG_REG_RP,
-    TCG_REG_R3,
-    TCG_REG_R4,
-    TCG_REG_R5,
-    TCG_REG_R6,
-    TCG_REG_R7,
-    TCG_REG_R8,
-    TCG_REG_R9,
-    TCG_REG_R10,
-    TCG_REG_R11,
-    TCG_REG_R12,
-    TCG_REG_R13,
-    TCG_REG_R14,
-    TCG_REG_R15,
-    TCG_REG_R16,
-    TCG_REG_R17,
-    TCG_REG_R18,
-    TCG_REG_R19,
-    TCG_REG_R20,
-    TCG_REG_R21,
-    TCG_REG_R22,
-    TCG_REG_R23,
-    TCG_REG_R24,
-    TCG_REG_R25,
-    TCG_REG_R26,
-    TCG_REG_DP,
-    TCG_REG_RET0,
-    TCG_REG_RET1,
-    TCG_REG_SP,
-    TCG_REG_R31,
-} TCGReg;
-
-#define TCG_CT_CONST_0    0x0100
-#define TCG_CT_CONST_S5   0x0200
-#define TCG_CT_CONST_S11  0x0400
-#define TCG_CT_CONST_MS11 0x0800
-#define TCG_CT_CONST_AND  0x1000
-#define TCG_CT_CONST_OR   0x2000
-
-/* used for function call generation */
-#define TCG_REG_CALL_STACK TCG_REG_SP
-#define TCG_TARGET_STACK_ALIGN 64
-#define TCG_TARGET_CALL_STACK_OFFSET -48
-#define TCG_TARGET_STATIC_CALL_ARGS_SIZE 8*4
-#define TCG_TARGET_CALL_ALIGN_ARGS 1
-#define TCG_TARGET_STACK_GROWSUP
-
-/* optional instructions */
-#define TCG_TARGET_HAS_div_i32          0
-#define TCG_TARGET_HAS_rem_i32          0
-#define TCG_TARGET_HAS_rot_i32          1
-#define TCG_TARGET_HAS_ext8s_i32        1
-#define TCG_TARGET_HAS_ext16s_i32       1
-#define TCG_TARGET_HAS_bswap16_i32      1
-#define TCG_TARGET_HAS_bswap32_i32      1
-#define TCG_TARGET_HAS_not_i32          1
-#define TCG_TARGET_HAS_andc_i32         1
-#define TCG_TARGET_HAS_orc_i32          0
-#define TCG_TARGET_HAS_eqv_i32          0
-#define TCG_TARGET_HAS_nand_i32         0
-#define TCG_TARGET_HAS_nor_i32          0
-#define TCG_TARGET_HAS_deposit_i32      1
-#define TCG_TARGET_HAS_movcond_i32      1
-#define TCG_TARGET_HAS_muls2_i32        0
-#define TCG_TARGET_HAS_muluh_i32        0
-#define TCG_TARGET_HAS_mulsh_i32        0
-
-/* optional instructions automatically implemented */
-#define TCG_TARGET_HAS_neg_i32          0 /* sub rd, 0, rs */
-#define TCG_TARGET_HAS_ext8u_i32        0 /* and rd, rs, 0xff */
-#define TCG_TARGET_HAS_ext16u_i32       0 /* and rd, rs, 0xffff */
-
-#define TCG_AREG0 TCG_REG_R17
-
-
-static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
-{
-    start &= ~31;
-    while (start <= stop) {
-        asm volatile ("fdc 0(%0)\n\t"
-                      "sync\n\t"
-                      "fic 0(%%sr4, %0)\n\t"
-                      "sync"
-                      : : "r"(start) : "memory");
-        start += 32;
-    }
-}
-
-#endif
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index c1f07415ab..7ac8e45485 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -22,6 +22,8 @@
  * THE SOFTWARE.
  */
 
+#include "tcg-be-ldst.h"
+
 #ifndef NDEBUG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 #if TCG_TARGET_REG_BITS == 64
@@ -1024,39 +1026,33 @@ static void tcg_out_jmp(TCGContext *s, uintptr_t dest)
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_ld_helpers[4] = {
-    helper_ret_ldub_mmu,
-    helper_ret_lduw_mmu,
-    helper_ret_ldul_mmu,
-    helper_ret_ldq_mmu,
+static const void * const qemu_ld_helpers[16] = {
+    [MO_UB]   = helper_ret_ldub_mmu,
+    [MO_LEUW] = helper_le_lduw_mmu,
+    [MO_LEUL] = helper_le_ldul_mmu,
+    [MO_LEQ]  = helper_le_ldq_mmu,
+    [MO_BEUW] = helper_be_lduw_mmu,
+    [MO_BEUL] = helper_be_ldul_mmu,
+    [MO_BEQ]  = helper_be_ldq_mmu,
 };
 
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
  *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_st_helpers[4] = {
-    helper_ret_stb_mmu,
-    helper_ret_stw_mmu,
-    helper_ret_stl_mmu,
-    helper_ret_stq_mmu,
+static const void * const qemu_st_helpers[16] = {
+    [MO_UB]   = helper_ret_stb_mmu,
+    [MO_LEUW] = helper_le_stw_mmu,
+    [MO_LEUL] = helper_le_stl_mmu,
+    [MO_LEQ]  = helper_le_stq_mmu,
+    [MO_BEUW] = helper_be_stw_mmu,
+    [MO_BEUL] = helper_be_stl_mmu,
+    [MO_BEQ]  = helper_be_stq_mmu,
 };
 
-static void add_qemu_ldst_label(TCGContext *s,
-                                int is_ld,
-                                int opc,
-                                int data_reg,
-                                int data_reg2,
-                                int addrlo_reg,
-                                int addrhi_reg,
-                                int mem_index,
-                                uint8_t *raddr,
-                                uint8_t **label_ptr);
-
 /* Perform the TLB load and compare.
 
    Inputs:
-   ADDRLO_IDX contains the index into ARGS of the low part of the
-   address; the high part of the address is at ADDR_LOW_IDX+1.
+   ADDRLO and ADDRHI contain the low and high part of the address.
 
    MEM_INDEX and S_BITS are the memory context and log2 size of the load.
 
@@ -1074,14 +1070,12 @@ static void add_qemu_ldst_label(TCGContext *s,
 
    First argument register is clobbered.  */
 
-static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
-                                    int mem_index, int s_bits,
-                                    const TCGArg *args,
+static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
+                                    int mem_index, TCGMemOp s_bits,
                                     uint8_t **label_ptr, int which)
 {
-    const int addrlo = args[addrlo_idx];
-    const int r0 = TCG_REG_L0;
-    const int r1 = TCG_REG_L1;
+    const TCGReg r0 = TCG_REG_L0;
+    const TCGReg r1 = TCG_REG_L1;
     TCGType ttype = TCG_TYPE_I32;
     TCGType htype = TCG_TYPE_I32;
     int trexw = 0, hrexw = 0;
@@ -1130,7 +1124,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
 
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
         /* cmp 4(r0), addrhi */
-        tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r0, 4);
+        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
 
         /* jne slow_path */
         tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
@@ -1144,6 +1138,182 @@ static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
                          offsetof(CPUTLBEntry, addend) - which);
 }
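For reference, a C rendering of the comparison the emitted fast path makes
against one TLB entry; field and constant names follow QEMU's cpu-defs.h,
but the sizes here are placeholders:

    #include <stdint.h>

    #define TARGET_PAGE_BITS 12              /* placeholder */
    #define CPU_TLB_BITS     8               /* placeholder */
    #define CPU_TLB_SIZE     (1 << CPU_TLB_BITS)

    typedef struct { uintptr_t addr_read, addr_write, addend; } TLBSketch;

    /* Return a host pointer on a TLB hit, or 0 to signal the slow path.
       The generated code additionally folds the low s_bits alignment
       bits into the comparison, which this sketch omits. */
    static uintptr_t tlb_hit_sketch(TLBSketch *table, uintptr_t addr)
    {
        TLBSketch *e = &table[(addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1)];
        if ((addr >> TARGET_PAGE_BITS << TARGET_PAGE_BITS) == e->addr_read) {
            return addr + e->addend;         /* hit: direct host access */
        }
        return 0;                            /* miss: out-of-line helper */
    }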
+
+/*
+ * Record the context of a call to the out-of-line helper code for the slow
+ * path of a load or store, so that we can later generate the correct helper
+ * code.
+ */
+static void add_qemu_ldst_label(TCGContext *s, int is_ld, TCGMemOp opc,
+                                TCGReg datalo, TCGReg datahi,
+                                TCGReg addrlo, TCGReg addrhi,
+                                int mem_index, uint8_t *raddr,
+                                uint8_t **label_ptr)
+{
+    TCGLabelQemuLdst *label = new_ldst_label(s);
+
+    label->is_ld = is_ld;
+    label->opc = opc;
+    label->datalo_reg = datalo;
+    label->datahi_reg = datahi;
+    label->addrlo_reg = addrlo;
+    label->addrhi_reg = addrhi;
+    label->mem_index = mem_index;
+    label->raddr = raddr;
+    label->label_ptr[0] = label_ptr[0];
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        label->label_ptr[1] = label_ptr[1];
+    }
+}
+
+/*
+ * Generate code for the slow path of a load at the end of the block
+ */
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    TCGMemOp opc = l->opc;
+    TCGReg data_reg;
+    uint8_t **label_ptr = &l->label_ptr[0];
+
+    /* resolve label address */
+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+    }
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        int ofs = 0;
+
+        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
+        ofs += 4;
+
+        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
+        ofs += 4;
+
+        if (TARGET_LONG_BITS == 64) {
+            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
+            ofs += 4;
+        }
+
+        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
+        ofs += 4;
+
+        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, (uintptr_t)l->raddr);
+    } else {
+        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+        /* The second argument is already loaded with addrlo.  */
+        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2],
+                     l->mem_index);
+        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
+                     (uintptr_t)l->raddr);
+    }
+
+    tcg_out_calli(s, (uintptr_t)qemu_ld_helpers[opc & ~MO_SIGN]);
+
+    data_reg = l->datalo_reg;
+    switch (opc & MO_SSIZE) {
+    case MO_SB:
+        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
+        break;
+    case MO_SW:
+        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
+        break;
+#if TCG_TARGET_REG_BITS == 64
+    case MO_SL:
+        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
+        break;
+#endif
+    case MO_UB:
+    case MO_UW:
+        /* Note that the helpers have zero-extended to tcg_target_long.  */
+    case MO_UL:
+        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+        break;
+    case MO_Q:
+        if (TCG_TARGET_REG_BITS == 64) {
+            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
+        } else if (data_reg == TCG_REG_EDX) {
+            /* xchg %edx, %eax */
+            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
+            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
+        }
+        break;
+    default:
+        tcg_abort();
+    }
+
+    /* Jump back to the code following the qemu_ld.  */
+    tcg_out_jmp(s, (uintptr_t)l->raddr);
+}
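The two stores through label_ptr[] at the top of the slow path patch the
32-bit displacements of the forward jne branches emitted by
tcg_out_tlb_load.  A hedged sketch of that fixup, using memcpy rather
than the direct cast above to sidestep alignment concerns:

    #include <stdint.h>
    #include <string.h>

    /* Point a rel32 branch at `target`, where `field` is the address of
       the 4 displacement bytes.  x86 displacements are relative to the
       end of those bytes, hence the -4. */
    static void patch_rel32(uint8_t *field, const uint8_t *target)
    {
        uint32_t disp = (uint32_t)(target - field - 4);
        memcpy(field, &disp, 4);
    }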
+
+/*
+ * Generate code for the slow path of a store at the end of the block
+ */
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    TCGMemOp opc = l->opc;
+    TCGMemOp s_bits = opc & MO_SIZE;
+    uint8_t **label_ptr = &l->label_ptr[0];
+    TCGReg retaddr;
+
+    /* resolve label address */
+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+    }
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        int ofs = 0;
+
+        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
+        ofs += 4;
+
+        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
+        ofs += 4;
+
+        if (TARGET_LONG_BITS == 64) {
+            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
+            ofs += 4;
+        }
+
+        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
+        ofs += 4;
+
+        if (s_bits == MO_64) {
+            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
+            ofs += 4;
+        }
+
+        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
+        ofs += 4;
+
+        retaddr = TCG_REG_EAX;
+        tcg_out_movi(s, TCG_TYPE_I32, retaddr, (uintptr_t)l->raddr);
+        tcg_out_st(s, TCG_TYPE_I32, retaddr, TCG_REG_ESP, ofs);
+    } else {
+        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+        /* The second argument is already loaded with addrlo.  */
+        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
+                    tcg_target_call_iarg_regs[2], l->datalo_reg);
+        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3],
+                     l->mem_index);
+
+        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
+            retaddr = tcg_target_call_iarg_regs[4];
+            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
+        } else {
+            retaddr = TCG_REG_RAX;
+            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
+            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, 0);
+        }
+    }
+
+    /* "Tail call" to the helper, with the return address back inline.  */
+    tcg_out_push(s, retaddr);
+    tcg_out_jmp(s, (uintptr_t)qemu_st_helpers[opc]);
+}
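The load slow path above must also re-narrow the helper's return value,
since the helpers zero-extend to the host register width.  A sketch of
that post-processing in portable C, assuming the TCGMemOp layout noted
earlier:

    #include <stdint.h>

    /* Narrow or sign-extend a zero-extended helper result according to
       the MO_SIZE/MO_SIGN bits (layout assumed, not normative). */
    static uint64_t finish_load(uint64_t ret, int memop)
    {
        int bits = 8 << (memop & 3);                 /* MO_SIZE */
        if (bits == 64) {
            return ret;
        }
        if (memop & 4) {                             /* MO_SIGN */
            return (uint64_t)((int64_t)(ret << (64 - bits)) >> (64 - bits));
        }
        return ret & ((UINT64_C(1) << bits) - 1);
    }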
 #elif defined(__x86_64__) && defined(__linux__)
 # include <asm/prctl.h>
 # include <sys/prctl.h>
@@ -1162,28 +1332,26 @@ static inline void setup_guest_base_seg(void)
 static inline void setup_guest_base_seg(void) { }
 #endif /* SOFTMMU */
 
-static void tcg_out_qemu_ld_direct(TCGContext *s, int datalo, int datahi,
-                                   int base, intptr_t ofs, int seg, int sizeop)
+static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
+                                   TCGReg base, intptr_t ofs, int seg,
+                                   TCGMemOp memop)
 {
-#ifdef TARGET_WORDS_BIGENDIAN
-    const int bswap = 1;
-#else
-    const int bswap = 0;
-#endif
-    switch (sizeop) {
-    case 0:
+    const TCGMemOp bswap = memop & MO_BSWAP;
+
+    switch (memop & MO_SSIZE) {
+    case MO_UB:
         tcg_out_modrm_offset(s, OPC_MOVZBL + seg, datalo, base, ofs);
         break;
-    case 0 | 4:
+    case MO_SB:
         tcg_out_modrm_offset(s, OPC_MOVSBL + P_REXW + seg, datalo, base, ofs);
         break;
-    case 1:
+    case MO_UW:
         tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
         if (bswap) {
             tcg_out_rolw_8(s, datalo);
         }
         break;
-    case 1 | 4:
+    case MO_SW:
         if (bswap) {
             tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
             tcg_out_rolw_8(s, datalo);
@@ -1193,14 +1361,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, int datalo, int datahi,
                                  datalo, base, ofs);
         }
         break;
-    case 2:
+    case MO_UL:
         tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
         if (bswap) {
             tcg_out_bswap32(s, datalo);
         }
         break;
 #if TCG_TARGET_REG_BITS == 64
-    case 2 | 4:
+    case MO_SL:
         if (bswap) {
             tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
             tcg_out_bswap32(s, datalo);
@@ -1210,7 +1378,7 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, int datalo, int datahi,
         }
         break;
 #endif
-    case 3:
+    case MO_Q:
         if (TCG_TARGET_REG_BITS == 64) {
             tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
                                  datalo, base, ofs);
@@ -1248,48 +1416,40 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, int datalo, int datahi,
 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
    EAX. It will be useful once fixed registers globals are less
    common. */
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
-                            int opc)
+static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
 {
-    int data_reg, data_reg2 = 0;
-    int addrlo_idx;
+    TCGReg datalo, datahi, addrlo;
+    TCGReg addrhi __attribute__((unused));
+    TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
-    int mem_index, s_bits;
+    int mem_index;
+    TCGMemOp s_bits;
     uint8_t *label_ptr[2];
 #endif
 
-    data_reg = args[0];
-    addrlo_idx = 1;
-    if (TCG_TARGET_REG_BITS == 32 && opc == 3) {
-        data_reg2 = args[1];
-        addrlo_idx = 2;
-    }
+    datalo = *args++;
+    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
+    addrlo = *args++;
+    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
+    opc = *args++;
 
 #if defined(CONFIG_SOFTMMU)
-    mem_index = args[addrlo_idx + 1 + (TARGET_LONG_BITS > TCG_TARGET_REG_BITS)];
-    s_bits = opc & 3;
+    mem_index = *args++;
+    s_bits = opc & MO_SIZE;
 
-    tcg_out_tlb_load(s, addrlo_idx, mem_index, s_bits, args,
+    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, s_bits,
                      label_ptr, offsetof(CPUTLBEntry, addr_read));
 
     /* TLB Hit.  */
-    tcg_out_qemu_ld_direct(s, data_reg, data_reg2, TCG_REG_L1, 0, 0, opc);
+    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
 
     /* Record the current context of a load into ldst label */
-    add_qemu_ldst_label(s,
-                        1,
-                        opc,
-                        data_reg,
-                        data_reg2,
-                        args[addrlo_idx],
-                        args[addrlo_idx + 1],
-                        mem_index,
-                        s->code_ptr,
-                        label_ptr);
+    add_qemu_ldst_label(s, 1, opc, datalo, datahi, addrlo, addrhi,
+                        mem_index, s->code_ptr, label_ptr);
 #else
     {
         int32_t offset = GUEST_BASE;
-        int base = args[addrlo_idx];
+        TCGReg base = addrlo;
         int seg = 0;
 
         /* ??? We assume all operations have left us with register contents
@@ -1307,32 +1467,35 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
             offset = 0;
         }
 
-        tcg_out_qemu_ld_direct(s, data_reg, data_reg2, base, offset, seg, opc);
+        tcg_out_qemu_ld_direct(s, datalo, datahi, base, offset, seg, opc);
     }
 #endif
 }
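The unified opcode flattens its operands into args[]; which slots exist
depends on the host register width and the guest address width.  A sketch
of the decode logic used above, with the configuration made explicit:

    #include <stdbool.h>
    #include <stdint.h>

    typedef struct {
        uintptr_t datalo, datahi, addrlo, addrhi, memop, mem_index;
    } LdstArgsSketch;

    /* Consume args[] the way tcg_out_qemu_ld/st do: optional operands
       are present only in the configurations that need them. */
    static LdstArgsSketch decode_ldst_args(const uintptr_t *args, bool is64,
                                           bool host32, bool guest_wider,
                                           bool softmmu)
    {
        LdstArgsSketch a = {0};
        a.datalo = *args++;
        a.datahi = (host32 && is64) ? *args++ : 0;
        a.addrlo = *args++;
        a.addrhi = guest_wider ? *args++ : 0;
        a.memop  = *args++;
        a.mem_index = softmmu ? *args++ : 0;
        return a;
    }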
 
-static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi,
-                                   int base, intptr_t ofs, int seg,
-                                   int sizeop)
+static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
+                                   TCGReg base, intptr_t ofs, int seg,
+                                   TCGMemOp memop)
 {
-#ifdef TARGET_WORDS_BIGENDIAN
-    const int bswap = 1;
-#else
-    const int bswap = 0;
-#endif
+    const TCGMemOp bswap = memop & MO_BSWAP;
+
     /* ??? Ideally we wouldn't need a scratch register.  For user-only,
        we could perform the bswap twice to restore the original value
        instead of moving to the scratch.  But as it is, the L constraint
        means that TCG_REG_L0 is definitely free here.  */
-    const int scratch = TCG_REG_L0;
+    const TCGReg scratch = TCG_REG_L0;
 
-    switch (sizeop) {
-    case 0:
+    switch (memop & MO_SIZE) {
+    case MO_8:
+        /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
+           Use the scratch register if necessary.  */
+        if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
+            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
+            datalo = scratch;
+        }
         tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
                              datalo, base, ofs);
         break;
-    case 1:
+    case MO_16:
         if (bswap) {
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
             tcg_out_rolw_8(s, scratch);
@@ -1341,7 +1504,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi,
         tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
                              datalo, base, ofs);
         break;
-    case 2:
+    case MO_32:
         if (bswap) {
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
             tcg_out_bswap32(s, scratch);
@@ -1349,7 +1512,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi,
         }
         tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
         break;
-    case 3:
+    case MO_64:
         if (TCG_TARGET_REG_BITS == 64) {
             if (bswap) {
                 tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
@@ -1375,48 +1538,40 @@ static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi,
     }
 }
 
-static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
-                            int opc)
+static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 {
-    int data_reg, data_reg2 = 0;
-    int addrlo_idx;
+    TCGReg datalo, datahi, addrlo;
+    TCGReg addrhi __attribute__((unused));
+    TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
-    int mem_index, s_bits;
+    int mem_index;
+    TCGMemOp s_bits;
     uint8_t *label_ptr[2];
 #endif
 
-    data_reg = args[0];
-    addrlo_idx = 1;
-    if (TCG_TARGET_REG_BITS == 32 && opc == 3) {
-        data_reg2 = args[1];
-        addrlo_idx = 2;
-    }
+    datalo = *args++;
+    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
+    addrlo = *args++;
+    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
+    opc = *args++;
 
 #if defined(CONFIG_SOFTMMU)
-    mem_index = args[addrlo_idx + 1 + (TARGET_LONG_BITS > TCG_TARGET_REG_BITS)];
-    s_bits = opc;
+    mem_index = *args++;
+    s_bits = opc & MO_SIZE;
 
-    tcg_out_tlb_load(s, addrlo_idx, mem_index, s_bits, args,
+    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, s_bits,
                      label_ptr, offsetof(CPUTLBEntry, addr_write));
 
     /* TLB Hit.  */
-    tcg_out_qemu_st_direct(s, data_reg, data_reg2, TCG_REG_L1, 0, 0, opc);
+    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
 
     /* Record the current context of a store into ldst label */
-    add_qemu_ldst_label(s,
-                        0,
-                        opc,
-                        data_reg,
-                        data_reg2,
-                        args[addrlo_idx],
-                        args[addrlo_idx + 1],
-                        mem_index,
-                        s->code_ptr,
-                        label_ptr);
+    add_qemu_ldst_label(s, 0, opc, datalo, datahi, addrlo, addrhi,
+                        mem_index, s->code_ptr, label_ptr);
 #else
     {
         int32_t offset = GUEST_BASE;
-        int base = args[addrlo_idx];
+        TCGReg base = addrlo;
         int seg = 0;
 
         /* ??? We assume all operations have left us with register contents
@@ -1434,221 +1589,11 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
             offset = 0;
         }
 
-        tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, seg, opc);
-    }
-#endif
-}
-
-#if defined(CONFIG_SOFTMMU)
-/*
- * Record the context of a call to the out of line helper code for the slow path
- * for a load or store, so that we can later generate the correct helper code
- */
-static void add_qemu_ldst_label(TCGContext *s,
-                                int is_ld,
-                                int opc,
-                                int data_reg,
-                                int data_reg2,
-                                int addrlo_reg,
-                                int addrhi_reg,
-                                int mem_index,
-                                uint8_t *raddr,
-                                uint8_t **label_ptr)
-{
-    int idx;
-    TCGLabelQemuLdst *label;
-
-    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
-        tcg_abort();
-    }
-
-    idx = s->nb_qemu_ldst_labels++;
-    label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
-    label->is_ld = is_ld;
-    label->opc = opc;
-    label->datalo_reg = data_reg;
-    label->datahi_reg = data_reg2;
-    label->addrlo_reg = addrlo_reg;
-    label->addrhi_reg = addrhi_reg;
-    label->mem_index = mem_index;
-    label->raddr = raddr;
-    label->label_ptr[0] = label_ptr[0];
-    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        label->label_ptr[1] = label_ptr[1];
-    }
-}
-
-/*
- * Generate code for the slow path for a load at the end of block
- */
-static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-    int opc = l->opc;
-    int s_bits = opc & 3;
-    TCGReg data_reg;
-    uint8_t **label_ptr = &l->label_ptr[0];
-
-    /* resolve label address */
-    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
-    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
-    }
-
-    if (TCG_TARGET_REG_BITS == 32) {
-        int ofs = 0;
-
-        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
-        ofs += 4;
-
-        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
-        ofs += 4;
-
-        if (TARGET_LONG_BITS == 64) {
-            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
-            ofs += 4;
-        }
-
-        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
-        ofs += 4;
-
-        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, (uintptr_t)l->raddr);
-    } else {
-        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
-        /* The second argument is already loaded with addrlo.  */
-        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2],
-                     l->mem_index);
-        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
-                     (uintptr_t)l->raddr);
+        tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
     }
-
-    tcg_out_calli(s, (uintptr_t)qemu_ld_helpers[s_bits]);
-
-    data_reg = l->datalo_reg;
-    switch(opc) {
-    case 0 | 4:
-        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-    case 1 | 4:
-        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-#if TCG_TARGET_REG_BITS == 64
-    case 2 | 4:
-        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
-        break;
 #endif
-    case 0:
-    case 1:
-        /* Note that the helpers have zero-extended to tcg_target_long.  */
-    case 2:
-        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-        break;
-    case 3:
-        if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
-        } else if (data_reg == TCG_REG_EDX) {
-            /* xchg %edx, %eax */
-            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
-            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
-        }
-        break;
-    default:
-        tcg_abort();
-    }
-
-    /* Jump to the code corresponding to next IR of qemu_st */
-    tcg_out_jmp(s, (uintptr_t)l->raddr);
 }
 
-/*
- * Generate code for the slow path for a store at the end of block
- */
-static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-    int opc = l->opc;
-    int s_bits = opc & 3;
-    uint8_t **label_ptr = &l->label_ptr[0];
-    TCGReg retaddr;
-
-    /* resolve label address */
-    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
-    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
-    }
-
-    if (TCG_TARGET_REG_BITS == 32) {
-        int ofs = 0;
-
-        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
-        ofs += 4;
-
-        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
-        ofs += 4;
-
-        if (TARGET_LONG_BITS == 64) {
-            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
-            ofs += 4;
-        }
-
-        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
-        ofs += 4;
-
-        if (opc == 3) {
-            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
-            ofs += 4;
-        }
-
-        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
-        ofs += 4;
-
-        retaddr = TCG_REG_EAX;
-        tcg_out_movi(s, TCG_TYPE_I32, retaddr, (uintptr_t)l->raddr);
-        tcg_out_st(s, TCG_TYPE_I32, retaddr, TCG_REG_ESP, ofs);
-    } else {
-        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
-        /* The second argument is already loaded with addrlo.  */
-        tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
-                    tcg_target_call_iarg_regs[2], l->datalo_reg);
-        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3],
-                     l->mem_index);
-
-        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
-            retaddr = tcg_target_call_iarg_regs[4];
-            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
-        } else {
-            retaddr = TCG_REG_RAX;
-            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
-            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, 0);
-        }
-    }
-
-    /* "Tail call" to the helper, with the return address back inline.  */
-    tcg_out_push(s, retaddr);
-    tcg_out_jmp(s, (uintptr_t)qemu_st_helpers[s_bits]);
-}
-
-/*
- * Generate TB finalization at the end of block
- */
-void tcg_out_tb_finalize(TCGContext *s)
-{
-    int i;
-    TCGLabelQemuLdst *label;
-
-    /* qemu_ld/st slow paths */
-    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
-        label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i];
-        if (label->is_ld) {
-            tcg_out_qemu_ld_slow_path(s, label);
-        } else {
-            tcg_out_qemu_st_slow_path(s, label);
-        }
-    }
-}
-#endif  /* CONFIG_SOFTMMU */
-
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {
@@ -1874,40 +1819,18 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_ext16u(s, args[0], args[1]);
         break;
 
-    case INDEX_op_qemu_ld8u:
+    case INDEX_op_qemu_ld_i32:
         tcg_out_qemu_ld(s, args, 0);
         break;
-    case INDEX_op_qemu_ld8s:
-        tcg_out_qemu_ld(s, args, 0 | 4);
-        break;
-    case INDEX_op_qemu_ld16u:
+    case INDEX_op_qemu_ld_i64:
         tcg_out_qemu_ld(s, args, 1);
         break;
-    case INDEX_op_qemu_ld16s:
-        tcg_out_qemu_ld(s, args, 1 | 4);
-        break;
-#if TCG_TARGET_REG_BITS == 64
-    case INDEX_op_qemu_ld32u:
-#endif
-    case INDEX_op_qemu_ld32:
-        tcg_out_qemu_ld(s, args, 2);
-        break;
-    case INDEX_op_qemu_ld64:
-        tcg_out_qemu_ld(s, args, 3);
-        break;
-
-    case INDEX_op_qemu_st8:
+    case INDEX_op_qemu_st_i32:
         tcg_out_qemu_st(s, args, 0);
         break;
-    case INDEX_op_qemu_st16:
+    case INDEX_op_qemu_st_i64:
         tcg_out_qemu_st(s, args, 1);
         break;
-    case INDEX_op_qemu_st32:
-        tcg_out_qemu_st(s, args, 2);
-        break;
-    case INDEX_op_qemu_st64:
-        tcg_out_qemu_st(s, args, 3);
-        break;
 
     OP_32_64(mulu2):
         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
@@ -1966,9 +1889,6 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
             tcg_out_st(s, TCG_TYPE_I64, args[0], args[1], args[2]);
         }
         break;
-    case INDEX_op_qemu_ld32s:
-        tcg_out_qemu_ld(s, args, 2 | 4);
-        break;
 
     case INDEX_op_brcond_i64:
         tcg_out_brcond64(s, args[2], args[0], args[1], const_args[1],
@@ -2133,43 +2053,20 @@ static const TCGTargetOpDef x86_op_defs[] = {
 #endif
 
 #if TCG_TARGET_REG_BITS == 64
-    { INDEX_op_qemu_ld8u, { "r", "L" } },
-    { INDEX_op_qemu_ld8s, { "r", "L" } },
-    { INDEX_op_qemu_ld16u, { "r", "L" } },
-    { INDEX_op_qemu_ld16s, { "r", "L" } },
-    { INDEX_op_qemu_ld32, { "r", "L" } },
-    { INDEX_op_qemu_ld32u, { "r", "L" } },
-    { INDEX_op_qemu_ld32s, { "r", "L" } },
-    { INDEX_op_qemu_ld64, { "r", "L" } },
-
-    { INDEX_op_qemu_st8, { "L", "L" } },
-    { INDEX_op_qemu_st16, { "L", "L" } },
-    { INDEX_op_qemu_st32, { "L", "L" } },
-    { INDEX_op_qemu_st64, { "L", "L" } },
+    { INDEX_op_qemu_ld_i32, { "r", "L" } },
+    { INDEX_op_qemu_st_i32, { "L", "L" } },
+    { INDEX_op_qemu_ld_i64, { "r", "L" } },
+    { INDEX_op_qemu_st_i64, { "L", "L" } },
 #elif TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
-    { INDEX_op_qemu_ld8u, { "r", "L" } },
-    { INDEX_op_qemu_ld8s, { "r", "L" } },
-    { INDEX_op_qemu_ld16u, { "r", "L" } },
-    { INDEX_op_qemu_ld16s, { "r", "L" } },
-    { INDEX_op_qemu_ld32, { "r", "L" } },
-    { INDEX_op_qemu_ld64, { "r", "r", "L" } },
-
-    { INDEX_op_qemu_st8, { "cb", "L" } },
-    { INDEX_op_qemu_st16, { "L", "L" } },
-    { INDEX_op_qemu_st32, { "L", "L" } },
-    { INDEX_op_qemu_st64, { "L", "L", "L" } },
+    { INDEX_op_qemu_ld_i32, { "r", "L" } },
+    { INDEX_op_qemu_st_i32, { "L", "L" } },
+    { INDEX_op_qemu_ld_i64, { "r", "r", "L" } },
+    { INDEX_op_qemu_st_i64, { "L", "L", "L" } },
 #else
-    { INDEX_op_qemu_ld8u, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld8s, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld16u, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld16s, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld32, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld64, { "r", "r", "L", "L" } },
-
-    { INDEX_op_qemu_st8, { "cb", "L", "L" } },
-    { INDEX_op_qemu_st16, { "L", "L", "L" } },
-    { INDEX_op_qemu_st32, { "L", "L", "L" } },
-    { INDEX_op_qemu_st64, { "L", "L", "L", "L" } },
+    { INDEX_op_qemu_ld_i32, { "r", "L", "L" } },
+    { INDEX_op_qemu_st_i32, { "L", "L", "L" } },
+    { INDEX_op_qemu_ld_i64, { "r", "r", "L", "L" } },
+    { INDEX_op_qemu_st_i64, { "L", "L", "L", "L" } },
 #endif
     { -1 },
 };
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index d32d7ef6f0..92c0fcd36d 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -130,6 +130,8 @@ typedef enum {
 #define TCG_TARGET_HAS_mulsh_i64        0
 #endif
 
+#define TCG_TARGET_HAS_new_ldst         1
+
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
     (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
      ((ofs) == 0 && (len) == 16))
diff --git a/tcg/ia64/tcg-target.c b/tcg/ia64/tcg-target.c
index cd4f1ae1db..0656d3907a 100644
--- a/tcg/ia64/tcg-target.c
+++ b/tcg/ia64/tcg-target.c
@@ -23,6 +23,8 @@
  * THE SOFTWARE.
  */
 
+#include "tcg-be-null.h"
+
 /*
  * Register definitions
  */
diff --git a/tcg/ia64/tcg-target.h b/tcg/ia64/tcg-target.h
index 4330c9cdd3..c90038aae5 100644
--- a/tcg/ia64/tcg-target.h
+++ b/tcg/ia64/tcg-target.h
@@ -151,6 +151,8 @@ typedef enum {
 #define TCG_TARGET_HAS_mulsh_i32        0
 #define TCG_TARGET_HAS_mulsh_i64        0
 
+#define TCG_TARGET_HAS_new_ldst         0
+
 #define TCG_TARGET_deposit_i32_valid(ofs, len) ((len) <= 16)
 #define TCG_TARGET_deposit_i64_valid(ofs, len) ((len) <= 16)
 
diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index 5f0a65b4ea..40551cdcb5 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -24,6 +24,8 @@
  * THE SOFTWARE.
  */
 
+#include "tcg-be-null.h"
+
 #if defined(TCG_TARGET_WORDS_BIGENDIAN) == defined(TARGET_WORDS_BIGENDIAN)
 # define TCG_NEED_BSWAP 0
 #else
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index c37252269f..683c6af8b9 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -122,6 +122,8 @@ extern bool use_mips32r2_instructions;
 #define TCG_TARGET_HAS_ext16s_i32       use_mips32r2_instructions
 #define TCG_TARGET_HAS_rot_i32          use_mips32r2_instructions
 
+#define TCG_TARGET_HAS_new_ldst         0
+
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_neg_i32          0 /* sub  rd, zero, rt   */
 #define TCG_TARGET_HAS_ext8u_i32        0 /* andi rt, rs, 0xff   */
diff --git a/tcg/optimize.c b/tcg/optimize.c
index b29bf25b67..89e2d6a3b3 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -238,20 +238,16 @@ static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
         return (int64_t)x >> (int64_t)y;
 
     case INDEX_op_rotr_i32:
-        x = ((uint32_t)x << (32 - y)) | ((uint32_t)x >> y);
-        return x;
+        return ror32(x, y);
 
     case INDEX_op_rotr_i64:
-        x = ((uint64_t)x << (64 - y)) | ((uint64_t)x >> y);
-        return x;
+        return ror64(x, y);
 
     case INDEX_op_rotl_i32:
-        x = ((uint32_t)x << y) | ((uint32_t)x >> (32 - y));
-        return x;
+        return rol32(x, y);
 
     case INDEX_op_rotl_i64:
-        x = ((uint64_t)x << y) | ((uint64_t)x >> (64 - y));
-        return x;
+        return rol64(x, y);
 
     CASE_OP_32_64(not):
         return ~x;
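The open-coded rotates are replaced with the helpers from qemu/bitops.h.
Equivalent definitions, with the shift count masked so a rotation by zero
stays well-defined (bitops.h may spell these slightly differently):

    #include <stdint.h>

    static inline uint32_t rol32(uint32_t word, unsigned shift)
    {
        return (word << (shift & 31)) | (word >> (-shift & 31));
    }

    static inline uint32_t ror32(uint32_t word, unsigned shift)
    {
        return (word >> (shift & 31)) | (word << (-shift & 31));
    }

The 64-bit variants are analogous with a 63 mask.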
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index 25955563b8..dc2c2df890 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -22,6 +22,8 @@
  * THE SOFTWARE.
  */
 
+#include "tcg-be-ldst.h"
+
 static uint8_t *tb_ret_addr;
 
 #if defined _CALL_DARWIN || defined __APPLE__
@@ -450,7 +452,9 @@ static const uint32_t tcg_to_bc[] = {
 
 static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 {
-    tcg_out32 (s, OR | SAB (arg, ret, arg));
+    if (ret != arg) {
+        tcg_out32(s, OR | SAB(arg, ret, arg));
+    }
 }
 
 static void tcg_out_movi(TCGContext *s, TCGType type,
@@ -490,7 +494,8 @@ static void tcg_out_b (TCGContext *s, int mask, tcg_target_long target)
     }
 }
 
-static void tcg_out_call (TCGContext *s, tcg_target_long arg, int const_arg)
+static void tcg_out_call (TCGContext *s, tcg_target_long arg, int const_arg,
+                          int lk)
 {
 #ifdef _CALL_AIX
     int reg;
@@ -504,14 +509,14 @@ static void tcg_out_call (TCGContext *s, tcg_target_long arg, int const_arg)
     tcg_out32 (s, LWZ | RT (0) | RA (reg));
     tcg_out32 (s, MTSPR | RA (0) | CTR);
     tcg_out32 (s, LWZ | RT (2) | RA (reg) | 4);
-    tcg_out32 (s, BCCTR | BO_ALWAYS | LK);
+    tcg_out32 (s, BCCTR | BO_ALWAYS | lk);
 #else
     if (const_arg) {
-        tcg_out_b (s, LK, arg);
+        tcg_out_b (s, lk, arg);
     }
     else {
         tcg_out32 (s, MTSPR | RS (arg) | LR);
-        tcg_out32 (s, BCLR | BO_ALWAYS | LK);
+        tcg_out32 (s, BCLR | BO_ALWAYS | lk);
     }
 #endif
 }
@@ -520,7 +525,7 @@ static void tcg_out_call (TCGContext *s, tcg_target_long arg, int const_arg)
 
 static void add_qemu_ldst_label (TCGContext *s,
                                  int is_ld,
-                                 int opc,
+                                 TCGMemOp opc,
                                  int data_reg,
                                  int data_reg2,
                                  int addrlo_reg,
@@ -529,15 +534,8 @@ static void add_qemu_ldst_label (TCGContext *s,
                                  uint8_t *raddr,
                                  uint8_t *label_ptr)
 {
-    int idx;
-    TCGLabelQemuLdst *label;
-
-    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
-        tcg_abort();
-    }
+    TCGLabelQemuLdst *label = new_ldst_label(s);
 
-    idx = s->nb_qemu_ldst_labels++;
-    label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
     label->is_ld = is_ld;
     label->opc = opc;
     label->datalo_reg = data_reg;
@@ -549,442 +547,347 @@ static void add_qemu_ldst_label (TCGContext *s,
     label->label_ptr[0] = label_ptr;
 }
 
-/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
-   int mmu_idx) */
-static const void * const qemu_ld_helpers[4] = {
-    helper_ldb_mmu,
-    helper_ldw_mmu,
-    helper_ldl_mmu,
-    helper_ldq_mmu,
+/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+ *                                     int mmu_idx, uintptr_t ra)
+ */
+static const void * const qemu_ld_helpers[16] = {
+    [MO_UB]   = helper_ret_ldub_mmu,
+    [MO_LEUW] = helper_le_lduw_mmu,
+    [MO_LEUL] = helper_le_ldul_mmu,
+    [MO_LEQ]  = helper_le_ldq_mmu,
+    [MO_BEUW] = helper_be_lduw_mmu,
+    [MO_BEUL] = helper_be_ldul_mmu,
+    [MO_BEQ]  = helper_be_ldq_mmu,
 };
 
-/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
-   uintxx_t val, int mmu_idx) */
-static const void * const qemu_st_helpers[4] = {
-    helper_stb_mmu,
-    helper_stw_mmu,
-    helper_stl_mmu,
-    helper_stq_mmu,
+/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
+ *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
+ */
+static const void * const qemu_st_helpers[16] = {
+    [MO_UB]   = helper_ret_stb_mmu,
+    [MO_LEUW] = helper_le_stw_mmu,
+    [MO_LEUL] = helper_le_stl_mmu,
+    [MO_LEQ]  = helper_le_stq_mmu,
+    [MO_BEUW] = helper_be_stw_mmu,
+    [MO_BEUL] = helper_be_stl_mmu,
+    [MO_BEQ]  = helper_be_stq_mmu,
 };
 
-static void *ld_trampolines[4];
-static void *st_trampolines[4];
+static void *ld_trampolines[16];
+static void *st_trampolines[16];
+
+/* Perform the TLB load and compare.  Branches to the slow path, placing the
+   address of the branch in *LABEL_PTR.  Loads the addend of the TLB into R0.
+   Clobbers R1 and R2.  */
 
-static void tcg_out_tlb_check (TCGContext *s, int r0, int r1, int r2,
-                               int addr_reg, int addr_reg2, int s_bits,
-                               int offset1, int offset2, uint8_t **label_ptr)
+static void tcg_out_tlb_check(TCGContext *s, TCGReg r0, TCGReg r1, TCGReg r2,
+                              TCGReg addrlo, TCGReg addrhi, TCGMemOp s_bits,
+                              int mem_index, int is_load, uint8_t **label_ptr)
 {
+    int cmp_off =
+        (is_load
+         ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
+         : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
+    int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
     uint16_t retranst;
+    TCGReg base = TCG_AREG0;
+
+    /* Extract the page index, shifted into place for tlb index.  */
+    tcg_out32(s, (RLWINM
+                  | RA(r0)
+                  | RS(addrlo)
+                  | SH(32 - (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS))
+                  | MB(32 - (CPU_TLB_BITS + CPU_TLB_ENTRY_BITS))
+                  | ME(31 - CPU_TLB_ENTRY_BITS)));
+
+    /* Compensate for very large offsets.  */
+    if (add_off >= 0x8000) {
+        /* Most target envs are smaller than 32k; none are larger than 64k.
+           Simplify the logic here merely to offset by 0x7ff0, giving us a
+           range just shy of 64k.  Check this assumption.  */
+        QEMU_BUILD_BUG_ON(offsetof(CPUArchState,
+                                   tlb_table[NB_MMU_MODES - 1][1])
+                          > 0x7ff0 + 0x7fff);
+        tcg_out32(s, ADDI | RT(r1) | RA(base) | 0x7ff0);
+        base = r1;
+        cmp_off -= 0x7ff0;
+        add_off -= 0x7ff0;
+    }
 
-    tcg_out32 (s, (RLWINM
-                   | RA (r0)
-                   | RS (addr_reg)
-                   | SH (32 - (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS))
-                   | MB (32 - (CPU_TLB_BITS + CPU_TLB_ENTRY_BITS))
-                   | ME (31 - CPU_TLB_ENTRY_BITS)
-                   )
-        );
-    tcg_out32 (s, ADD | RT (r0) | RA (r0) | RB (TCG_AREG0));
-    tcg_out32 (s, (LWZU
-                   | RT (r1)
-                   | RA (r0)
-                   | offset1
-                   )
-        );
-    tcg_out32 (s, (RLWINM
-                   | RA (r2)
-                   | RS (addr_reg)
-                   | SH (0)
-                   | MB ((32 - s_bits) & 31)
-                   | ME (31 - TARGET_PAGE_BITS)
-                   )
-        );
+    /* Clear the non-page, non-alignment bits from the address.  */
+    tcg_out32(s, (RLWINM
+                  | RA(r2)
+                  | RS(addrlo)
+                  | SH(0)
+                  | MB((32 - s_bits) & 31)
+                  | ME(31 - TARGET_PAGE_BITS)));
 
-    tcg_out32 (s, CMP | BF (7) | RA (r2) | RB (r1));
-#if TARGET_LONG_BITS == 64
-    tcg_out32 (s, LWZ | RT (r1) | RA (r0) | 4);
-    tcg_out32 (s, CMP | BF (6) | RA (addr_reg2) | RB (r1));
-    tcg_out32 (s, CRAND | BT (7, CR_EQ) | BA (6, CR_EQ) | BB (7, CR_EQ));
-#endif
-    *label_ptr = s->code_ptr;
-    retranst = ((uint16_t *) s->code_ptr)[1] & ~3;
-    tcg_out32 (s, BC | BI (7, CR_EQ) | retranst | BO_COND_FALSE);
+    tcg_out32(s, ADD | RT(r0) | RA(r0) | RB(base));
+    base = r0;
 
-    /* r0 now contains &env->tlb_table[mem_index][index].addr_x */
-    tcg_out32 (s, (LWZ
-                   | RT (r0)
-                   | RA (r0)
-                   | offset2
-                   )
-        );
-    /* r0 = env->tlb_table[mem_index][index].addend */
-    tcg_out32 (s, ADD | RT (r0) | RA (r0) | RB (addr_reg));
-    /* r0 = env->tlb_table[mem_index][index].addend + addr */
+    /* Load the tlb comparator.  */
+    tcg_out32(s, LWZ | RT(r1) | RA(base) | (cmp_off & 0xffff));
+
+    tcg_out32(s, CMP | BF(7) | RA(r2) | RB(r1));
+
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out32(s, LWZ | RT(r1) | RA(base) | ((cmp_off + 4) & 0xffff));
+    }
+
+    /* Load the tlb addend for use on the fast path.
+       Do this asap to minimize load delay.  */
+    tcg_out32(s, LWZ | RT(r0) | RA(base) | (add_off & 0xffff));
+
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out32(s, CMP | BF(6) | RA(addrhi) | RB(r1));
+        tcg_out32(s, CRAND | BT(7, CR_EQ) | BA(6, CR_EQ) | BB(7, CR_EQ));
+    }
 
+    /* Use a conditional branch-and-link so that we load a pointer to
+       somewhere within the current opcode, for passing on to the helper.
+       This address cannot be used for a tail call, but it's shorter
+       than forming an address from scratch.  */
+    *label_ptr = s->code_ptr;
+    retranst = ((uint16_t *) s->code_ptr)[1] & ~3;
+    tcg_out32(s, BC | BI(7, CR_EQ) | retranst | BO_COND_FALSE | LK);
 }
 #endif
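The rlwinm at the top of tcg_out_tlb_check is a single rotate-and-mask
that yields the byte offset of the TLB entry.  In C terms, with
placeholder constants standing in for the guest-dependent values:

    #include <stdint.h>

    #define TARGET_PAGE_BITS   12   /* placeholder */
    #define CPU_TLB_BITS        8   /* placeholder */
    #define CPU_TLB_ENTRY_BITS  4   /* log2(sizeof(CPUTLBEntry)), assumed */

    /* Byte offset of the TLB entry covering addr, as the rlwinm computes
       it: shift out the page offset, mask to the table size, scale by
       the entry size. */
    static uint32_t tlb_entry_offset(uint32_t addr)
    {
        uint32_t index = (addr >> TARGET_PAGE_BITS) & ((1u << CPU_TLB_BITS) - 1);
        return index << CPU_TLB_ENTRY_BITS;
    }

Note also the conditional branch-and-link: because the miss branch sets
LK, a TLB miss leaves an address inside the current opcode in LR, which
the slow paths recover with MFSPR as the return-address argument for the
helpers.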
 
-static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc)
+static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
 {
-    int addr_reg, data_reg, data_reg2, r0, r1, rbase, bswap;
+    TCGReg addrlo, datalo, datahi, rbase, addrhi __attribute__((unused));
+    TCGMemOp opc, bswap;
 #ifdef CONFIG_SOFTMMU
-    int mem_index, s_bits, r2, addr_reg2;
+    int mem_index;
     uint8_t *label_ptr;
 #endif
 
-    data_reg = *args++;
-    if (opc == 3)
-        data_reg2 = *args++;
-    else
-        data_reg2 = 0;
-    addr_reg = *args++;
+    datalo = *args++;
+    datahi = (is64 ? *args++ : 0);
+    addrlo = *args++;
+    addrhi = (TARGET_LONG_BITS == 64 ? *args++ : 0);
+    opc = *args++;
+    bswap = opc & MO_BSWAP;
 
 #ifdef CONFIG_SOFTMMU
-#if TARGET_LONG_BITS == 64
-    addr_reg2 = *args++;
-#else
-    addr_reg2 = 0;
-#endif
     mem_index = *args;
-    s_bits = opc & 3;
-    r0 = 3;
-    r1 = 4;
-    r2 = 0;
-    rbase = 0;
-
-    tcg_out_tlb_check (
-        s, r0, r1, r2, addr_reg, addr_reg2, s_bits,
-        offsetof (CPUArchState, tlb_table[mem_index][0].addr_read),
-        offsetof (CPUTLBEntry, addend) - offsetof (CPUTLBEntry, addr_read),
-        &label_ptr
-        );
+    tcg_out_tlb_check(s, TCG_REG_R3, TCG_REG_R4, TCG_REG_R0, addrlo,
+                      addrhi, opc & MO_SIZE, mem_index, 1, &label_ptr);
+    rbase = TCG_REG_R3;
 #else  /* !CONFIG_SOFTMMU */
-    r0 = addr_reg;
-    r1 = 3;
     rbase = GUEST_BASE ? TCG_GUEST_BASE_REG : 0;
 #endif
 
-#ifdef TARGET_WORDS_BIGENDIAN
-    bswap = 0;
-#else
-    bswap = 1;
-#endif
-
-    switch (opc) {
+    switch (opc & MO_SSIZE) {
     default:
-    case 0:
-        tcg_out32 (s, LBZX | TAB (data_reg, rbase, r0));
+    case MO_UB:
+        tcg_out32(s, LBZX | TAB(datalo, rbase, addrlo));
         break;
-    case 0|4:
-        tcg_out32 (s, LBZX | TAB (data_reg, rbase, r0));
-        tcg_out32 (s, EXTSB | RA (data_reg) | RS (data_reg));
+    case MO_SB:
+        tcg_out32(s, LBZX | TAB(datalo, rbase, addrlo));
+        tcg_out32(s, EXTSB | RA(datalo) | RS(datalo));
         break;
-    case 1:
-        if (bswap)
-            tcg_out32 (s, LHBRX | TAB (data_reg, rbase, r0));
-        else
-            tcg_out32 (s, LHZX | TAB (data_reg, rbase, r0));
+    case MO_UW:
+        tcg_out32(s, (bswap ? LHBRX : LHZX) | TAB(datalo, rbase, addrlo));
         break;
-    case 1|4:
+    case MO_SW:
         if (bswap) {
-            tcg_out32 (s, LHBRX | TAB (data_reg, rbase, r0));
-            tcg_out32 (s, EXTSH | RA (data_reg) | RS (data_reg));
+            tcg_out32(s, LHBRX | TAB(datalo, rbase, addrlo));
+            tcg_out32(s, EXTSH | RA(datalo) | RS(datalo));
+        } else {
+            tcg_out32(s, LHAX | TAB(datalo, rbase, addrlo));
         }
-        else tcg_out32 (s, LHAX | TAB (data_reg, rbase, r0));
         break;
-    case 2:
-        if (bswap)
-            tcg_out32 (s, LWBRX | TAB (data_reg, rbase, r0));
-        else
-            tcg_out32 (s, LWZX | TAB (data_reg, rbase, r0));
+    case MO_UL:
+        tcg_out32(s, (bswap ? LWBRX : LWZX) | TAB(datalo, rbase, addrlo));
         break;
-    case 3:
+    case MO_Q:
         if (bswap) {
-            tcg_out32 (s, ADDI | RT (r1) | RA (r0) | 4);
-            tcg_out32 (s, LWBRX | TAB (data_reg, rbase, r0));
-            tcg_out32 (s, LWBRX | TAB (data_reg2, rbase, r1));
-        }
-        else {
-#ifdef CONFIG_USE_GUEST_BASE
-            tcg_out32 (s, ADDI | RT (r1) | RA (r0) | 4);
-            tcg_out32 (s, LWZX | TAB (data_reg2, rbase, r0));
-            tcg_out32 (s, LWZX | TAB (data_reg, rbase, r1));
-#else
-            if (r0 == data_reg2) {
-                tcg_out32 (s, LWZ | RT (0) | RA (r0));
-                tcg_out32 (s, LWZ | RT (data_reg) | RA (r0) | 4);
-                tcg_out_mov (s, TCG_TYPE_I32, data_reg2, 0);
-            }
-            else {
-                tcg_out32 (s, LWZ | RT (data_reg2) | RA (r0));
-                tcg_out32 (s, LWZ | RT (data_reg) | RA (r0) | 4);
-            }
-#endif
+            tcg_out32(s, ADDI | RT(TCG_REG_R0) | RA(addrlo) | 4);
+            tcg_out32(s, LWBRX | TAB(datalo, rbase, addrlo));
+            tcg_out32(s, LWBRX | TAB(datahi, rbase, TCG_REG_R0));
+        } else if (rbase != 0) {
+            tcg_out32(s, ADDI | RT(TCG_REG_R0) | RA(addrlo) | 4);
+            tcg_out32(s, LWZX | TAB(datahi, rbase, addrlo));
+            tcg_out32(s, LWZX | TAB(datalo, rbase, TCG_REG_R0));
+        } else if (addrlo == datahi) {
+            tcg_out32(s, LWZ | RT(datalo) | RA(addrlo) | 4);
+            tcg_out32(s, LWZ | RT(datahi) | RA(addrlo));
+        } else {
+            tcg_out32(s, LWZ | RT(datahi) | RA(addrlo));
+            tcg_out32(s, LWZ | RT(datalo) | RA(addrlo) | 4);
         }
         break;
     }
 #ifdef CONFIG_SOFTMMU
-    add_qemu_ldst_label (s,
-                         1,
-                         opc,
-                         data_reg,
-                         data_reg2,
-                         addr_reg,
-                         addr_reg2,
-                         mem_index,
-                         s->code_ptr,
-                         label_ptr);
+    add_qemu_ldst_label(s, 1, opc, datalo, datahi, addrlo,
+                        addrhi, mem_index, s->code_ptr, label_ptr);
 #endif
 }
 
-static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc)
+static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 {
-    int addr_reg, r0, r1, data_reg, data_reg2, bswap, rbase;
+    TCGReg addrlo, datalo, datahi, rbase, addrhi __attribute__((unused));
+    TCGMemOp opc, bswap, s_bits;
 #ifdef CONFIG_SOFTMMU
-    int mem_index, r2, addr_reg2;
+    int mem_index;
     uint8_t *label_ptr;
 #endif
 
-    data_reg = *args++;
-    if (opc == 3)
-        data_reg2 = *args++;
-    else
-        data_reg2 = 0;
-    addr_reg = *args++;
+    datalo = *args++;
+    datahi = (is64 ? *args++ : 0);
+    addrlo = *args++;
+    addrhi = (TARGET_LONG_BITS == 64 ? *args++ : 0);
+    opc = *args++;
+    bswap = opc & MO_BSWAP;
+    s_bits = opc & MO_SIZE;
 
 #ifdef CONFIG_SOFTMMU
-#if TARGET_LONG_BITS == 64
-    addr_reg2 = *args++;
-#else
-    addr_reg2 = 0;
-#endif
     mem_index = *args;
-    r0 = 3;
-    r1 = 4;
-    r2 = 0;
-    rbase = 0;
-
-    tcg_out_tlb_check (
-        s, r0, r1, r2, addr_reg, addr_reg2, opc & 3,
-        offsetof (CPUArchState, tlb_table[mem_index][0].addr_write),
-        offsetof (CPUTLBEntry, addend) - offsetof (CPUTLBEntry, addr_write),
-        &label_ptr
-        );
+    tcg_out_tlb_check(s, TCG_REG_R3, TCG_REG_R4, TCG_REG_R0, addrlo,
+                      addrhi, s_bits, mem_index, 0, &label_ptr);
+    rbase = TCG_REG_R3;
 #else  /* !CONFIG_SOFTMMU */
-    r0 = addr_reg;
-    r1 = 3;
     rbase = GUEST_BASE ? TCG_GUEST_BASE_REG : 0;
 #endif
 
-#ifdef TARGET_WORDS_BIGENDIAN
-    bswap = 0;
-#else
-    bswap = 1;
-#endif
-    switch (opc) {
-    case 0:
-        tcg_out32 (s, STBX | SAB (data_reg, rbase, r0));
+    switch (s_bits) {
+    case MO_8:
+        tcg_out32(s, STBX | SAB(datalo, rbase, addrlo));
         break;
-    case 1:
-        if (bswap)
-            tcg_out32 (s, STHBRX | SAB (data_reg, rbase, r0));
-        else
-            tcg_out32 (s, STHX | SAB (data_reg, rbase, r0));
+    case MO_16:
+        tcg_out32(s, (bswap ? STHBRX : STHX) | SAB(datalo, rbase, addrlo));
         break;
-    case 2:
-        if (bswap)
-            tcg_out32 (s, STWBRX | SAB (data_reg, rbase, r0));
-        else
-            tcg_out32 (s, STWX | SAB (data_reg, rbase, r0));
+    case MO_32:
+    default:
+        tcg_out32(s, (bswap ? STWBRX : STWX) | SAB(datalo, rbase, addrlo));
         break;
-    case 3:
+    case MO_64:
         if (bswap) {
-            tcg_out32 (s, ADDI | RT (r1) | RA (r0) | 4);
-            tcg_out32 (s, STWBRX | SAB (data_reg,  rbase, r0));
-            tcg_out32 (s, STWBRX | SAB (data_reg2, rbase, r1));
-        }
-        else {
-#ifdef CONFIG_USE_GUEST_BASE
-            tcg_out32 (s, STWX | SAB (data_reg2, rbase, r0));
-            tcg_out32 (s, ADDI | RT (r1) | RA (r0) | 4);
-            tcg_out32 (s, STWX | SAB (data_reg,  rbase, r1));
-#else
-            tcg_out32 (s, STW | RS (data_reg2) | RA (r0));
-            tcg_out32 (s, STW | RS (data_reg) | RA (r0) | 4);
-#endif
+            tcg_out32(s, ADDI | RT(TCG_REG_R0) | RA(addrlo) | 4);
+            tcg_out32(s, STWBRX | SAB(datalo, rbase, addrlo));
+            tcg_out32(s, STWBRX | SAB(datahi, rbase, TCG_REG_R0));
+        } else if (rbase != 0) {
+            tcg_out32(s, ADDI | RT(TCG_REG_R0) | RA(addrlo) | 4);
+            tcg_out32(s, STWX | SAB(datahi, rbase, addrlo));
+            tcg_out32(s, STWX | SAB(datalo, rbase, TCG_REG_R0));
+        } else {
+            tcg_out32(s, STW | RS(datahi) | RA(addrlo));
+            tcg_out32(s, STW | RS(datalo) | RA(addrlo) | 4);
         }
         break;
     }
 
 #ifdef CONFIG_SOFTMMU
-    add_qemu_ldst_label (s,
-                         0,
-                         opc,
-                         data_reg,
-                         data_reg2,
-                         addr_reg,
-                         addr_reg2,
-                         mem_index,
-                         s->code_ptr,
-                         label_ptr);
+    add_qemu_ldst_label(s, 0, opc, datalo, datahi, addrlo, addrhi,
+                        mem_index, s->code_ptr, label_ptr);
 #endif
 }
 
 #if defined(CONFIG_SOFTMMU)
-static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-    int s_bits;
-    int ir;
-    int opc = label->opc;
-    int mem_index = label->mem_index;
-    int data_reg = label->datalo_reg;
-    int data_reg2 = label->datahi_reg;
-    int addr_reg = label->addrlo_reg;
-    uint8_t *raddr = label->raddr;
-    uint8_t **label_ptr = &label->label_ptr[0];
-
-    s_bits = opc & 3;
-
-    /* resolve label address */
-    reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr);
-
-    /* slow path */
-    ir = 4;
-#if TARGET_LONG_BITS == 32
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
-#else
+    TCGReg ir, datalo, datahi;
+    TCGMemOp opc = l->opc;
+
+    reloc_pc14(l->label_ptr[0], (uintptr_t)s->code_ptr);
+
+    ir = TCG_REG_R4;
+    if (TARGET_LONG_BITS == 32) {
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, l->addrlo_reg);
+    } else {
 #ifdef TCG_TARGET_CALL_ALIGN_ARGS
-    ir |= 1;
-#endif
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, label->addrhi_reg);
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
+        ir |= 1;
 #endif
-    tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
-    tcg_out_call (s, (tcg_target_long) ld_trampolines[s_bits], 1);
-    tcg_out32 (s, (tcg_target_long) raddr);
-    switch (opc) {
-    case 0|4:
-        tcg_out32 (s, EXTSB | RA (data_reg) | RS (3));
-        break;
-    case 1|4:
-        tcg_out32 (s, EXTSH | RA (data_reg) | RS (3));
-        break;
-    case 0:
-    case 1:
-    case 2:
-        if (data_reg != 3)
-            tcg_out_mov (s, TCG_TYPE_I32, data_reg, 3);
-        break;
-    case 3:
-        if (data_reg == 3) {
-            if (data_reg2 == 4) {
-                tcg_out_mov (s, TCG_TYPE_I32, 0, 4);
-                tcg_out_mov (s, TCG_TYPE_I32, 4, 3);
-                tcg_out_mov (s, TCG_TYPE_I32, 3, 0);
-            }
-            else {
-                tcg_out_mov (s, TCG_TYPE_I32, data_reg2, 3);
-                tcg_out_mov (s, TCG_TYPE_I32, 3, 4);
-            }
-        }
-        else {
-            if (data_reg != 4) tcg_out_mov (s, TCG_TYPE_I32, data_reg, 4);
-            if (data_reg2 != 3) tcg_out_mov (s, TCG_TYPE_I32, data_reg2, 3);
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, l->addrhi_reg);
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, l->addrlo_reg);
+    }
+    tcg_out_movi(s, TCG_TYPE_I32, ir++, l->mem_index);
+    tcg_out32(s, MFSPR | RT(ir++) | LR);
+    tcg_out_b(s, LK, (uintptr_t)ld_trampolines[opc & ~MO_SIGN]);
+
+    datalo = l->datalo_reg;
+    switch (opc & MO_SSIZE) {
+    case MO_SB:
+        tcg_out32(s, EXTSB | RA(datalo) | RS(TCG_REG_R3));
+        break;
+    case MO_SW:
+        tcg_out32(s, EXTSH | RA(datalo) | RS(TCG_REG_R3));
+        break;
+    default:
+        tcg_out_mov(s, TCG_TYPE_I32, datalo, TCG_REG_R3);
+        break;
+    case MO_Q:
+        datahi = l->datahi_reg;
+        if (datalo != TCG_REG_R3) {
+            tcg_out_mov(s, TCG_TYPE_I32, datalo, TCG_REG_R4);
+            tcg_out_mov(s, TCG_TYPE_I32, datahi, TCG_REG_R3);
+        } else if (datahi != TCG_REG_R4) {
+            tcg_out_mov(s, TCG_TYPE_I32, datahi, TCG_REG_R3);
+            tcg_out_mov(s, TCG_TYPE_I32, datalo, TCG_REG_R4);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R0, TCG_REG_R4);
+            tcg_out_mov(s, TCG_TYPE_I32, datahi, TCG_REG_R3);
+            tcg_out_mov(s, TCG_TYPE_I32, datalo, TCG_REG_R0);
         }
         break;
     }
-    /* Jump to the code corresponding to next IR of qemu_st */
-    tcg_out_b (s, 0, (tcg_target_long) raddr);
+    tcg_out_b(s, 0, (uintptr_t)l->raddr);
 }
 
-static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-    int ir;
-    int opc = label->opc;
-    int mem_index = label->mem_index;
-    int data_reg = label->datalo_reg;
-    int data_reg2 = label->datahi_reg;
-    int addr_reg = label->addrlo_reg;
-    uint8_t *raddr = label->raddr;
-    uint8_t **label_ptr = &label->label_ptr[0];
-
-    /* resolve label address */
-    reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr);
-
-    /* slow path */
-    ir = 4;
-#if TARGET_LONG_BITS == 32
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
-#else
+    TCGReg ir, datalo;
+    TCGMemOp opc = l->opc;
+
+    reloc_pc14(l->label_ptr[0], (uintptr_t)s->code_ptr);
+
+    ir = TCG_REG_R4;
+    if (TARGET_LONG_BITS == 32) {
+        tcg_out_mov (s, TCG_TYPE_I32, ir++, l->addrlo_reg);
+    } else {
 #ifdef TCG_TARGET_CALL_ALIGN_ARGS
-    ir |= 1;
-#endif
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, label->addrhi_reg);
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
+        ir |= 1;
 #endif
+        tcg_out_mov (s, TCG_TYPE_I32, ir++, l->addrhi_reg);
+        tcg_out_mov (s, TCG_TYPE_I32, ir++, l->addrlo_reg);
+    }
 
-    switch (opc) {
-    case 0:
-        tcg_out32 (s, (RLWINM
-                       | RA (ir)
-                       | RS (data_reg)
-                       | SH (0)
-                       | MB (24)
-                       | ME (31)));
+    datalo = l->datalo_reg;
+    switch (opc & MO_SIZE) {
+    case MO_8:
+        tcg_out32(s, (RLWINM | RA (ir) | RS (datalo)
+                      | SH (0) | MB (24) | ME (31)));
         break;
-    case 1:
-        tcg_out32 (s, (RLWINM
-                       | RA (ir)
-                       | RS (data_reg)
-                       | SH (0)
-                       | MB (16)
-                       | ME (31)));
+    case MO_16:
+        tcg_out32(s, (RLWINM | RA (ir) | RS (datalo)
+                      | SH (0) | MB (16) | ME (31)));
         break;
-    case 2:
-        tcg_out_mov (s, TCG_TYPE_I32, ir, data_reg);
+    default:
+        tcg_out_mov(s, TCG_TYPE_I32, ir, datalo);
         break;
-    case 3:
+    case MO_64:
 #ifdef TCG_TARGET_CALL_ALIGN_ARGS
         ir |= 1;
 #endif
-        tcg_out_mov (s, TCG_TYPE_I32, ir++, data_reg2);
-        tcg_out_mov (s, TCG_TYPE_I32, ir, data_reg);
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, l->datahi_reg);
+        tcg_out_mov(s, TCG_TYPE_I32, ir, datalo);
         break;
     }
     ir++;
 
-    tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
-    tcg_out_call (s, (tcg_target_long) st_trampolines[opc], 1);
-    tcg_out32 (s, (tcg_target_long) raddr);
-    tcg_out_b (s, 0, (tcg_target_long) raddr);
-}
-
-void tcg_out_tb_finalize(TCGContext *s)
-{
-    int i;
-    TCGLabelQemuLdst *label;
-
-    /* qemu_ld/st slow paths */
-    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
-        label = (TCGLabelQemuLdst *) &s->qemu_ldst_labels[i];
-        if (label->is_ld) {
-            tcg_out_qemu_ld_slow_path (s, label);
-        }
-        else {
-            tcg_out_qemu_st_slow_path (s, label);
-        }
-    }
+    tcg_out_movi(s, TCG_TYPE_I32, ir++, l->mem_index);
+    tcg_out32(s, MFSPR | RT(ir++) | LR);
+    tcg_out_b(s, LK, (uintptr_t)st_trampolines[opc]);
+    tcg_out_b(s, 0, (uintptr_t)l->raddr);
 }
 #endif
 
 #ifdef CONFIG_SOFTMMU
 static void emit_ldst_trampoline (TCGContext *s, const void *ptr)
 {
-    tcg_out32 (s, MFSPR | RT (3) | LR);
-    tcg_out32 (s, ADDI | RT (3) | RA (3) | 4);
-    tcg_out32 (s, MTSPR | RS (3) | LR);
     tcg_out_mov (s, TCG_TYPE_I32, 3, TCG_AREG0);
-    tcg_out_b (s, 0, (tcg_target_long) ptr);
+    tcg_out_call (s, (tcg_target_long) ptr, 1, 0);
 }
 #endif
 
@@ -1050,12 +953,15 @@ static void tcg_target_qemu_prologue (TCGContext *s)
     tcg_out32 (s, BCLR | BO_ALWAYS);
 
 #ifdef CONFIG_SOFTMMU
-    for (i = 0; i < 4; ++i) {
-        ld_trampolines[i] = s->code_ptr;
-        emit_ldst_trampoline (s, qemu_ld_helpers[i]);
-
-        st_trampolines[i] = s->code_ptr;
-        emit_ldst_trampoline (s, qemu_st_helpers[i]);
+    for (i = 0; i < 16; ++i) {
+        if (qemu_ld_helpers[i]) {
+            ld_trampolines[i] = s->code_ptr;
+            emit_ldst_trampoline(s, qemu_ld_helpers[i]);
+        }
+        if (qemu_st_helpers[i]) {
+            st_trampolines[i] = s->code_ptr;
+            emit_ldst_trampoline(s, qemu_st_helpers[i]);
+        }
     }
 #endif
 }
@@ -1493,7 +1399,7 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         }
         break;
     case INDEX_op_call:
-        tcg_out_call (s, args[0], const_args[0]);
+        tcg_out_call (s, args[0], const_args[0], LK);
         break;
     case INDEX_op_movi_i32:
         tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]);
@@ -1800,36 +1706,18 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         tcg_out32 (s, NOR | SAB (args[1], args[0], args[1]));
         break;
 
-    case INDEX_op_qemu_ld8u:
+    case INDEX_op_qemu_ld_i32:
         tcg_out_qemu_ld(s, args, 0);
         break;
-    case INDEX_op_qemu_ld8s:
-        tcg_out_qemu_ld(s, args, 0 | 4);
-        break;
-    case INDEX_op_qemu_ld16u:
+    case INDEX_op_qemu_ld_i64:
         tcg_out_qemu_ld(s, args, 1);
         break;
-    case INDEX_op_qemu_ld16s:
-        tcg_out_qemu_ld(s, args, 1 | 4);
-        break;
-    case INDEX_op_qemu_ld32:
-        tcg_out_qemu_ld(s, args, 2);
-        break;
-    case INDEX_op_qemu_ld64:
-        tcg_out_qemu_ld(s, args, 3);
-        break;
-    case INDEX_op_qemu_st8:
+    case INDEX_op_qemu_st_i32:
         tcg_out_qemu_st(s, args, 0);
         break;
-    case INDEX_op_qemu_st16:
+    case INDEX_op_qemu_st_i64:
         tcg_out_qemu_st(s, args, 1);
         break;
-    case INDEX_op_qemu_st32:
-        tcg_out_qemu_st(s, args, 2);
-        break;
-    case INDEX_op_qemu_st64:
-        tcg_out_qemu_st(s, args, 3);
-        break;
 
     case INDEX_op_ext8s_i32:
         tcg_out32 (s, EXTSB | RS (args[1]) | RA (args[0]));
@@ -2013,29 +1901,15 @@ static const TCGTargetOpDef ppc_op_defs[] = {
     { INDEX_op_bswap32_i32, { "r", "r" } },
 
 #if TARGET_LONG_BITS == 32
-    { INDEX_op_qemu_ld8u, { "r", "L" } },
-    { INDEX_op_qemu_ld8s, { "r", "L" } },
-    { INDEX_op_qemu_ld16u, { "r", "L" } },
-    { INDEX_op_qemu_ld16s, { "r", "L" } },
-    { INDEX_op_qemu_ld32, { "r", "L" } },
-    { INDEX_op_qemu_ld64, { "r", "r", "L" } },
-
-    { INDEX_op_qemu_st8, { "K", "K" } },
-    { INDEX_op_qemu_st16, { "K", "K" } },
-    { INDEX_op_qemu_st32, { "K", "K" } },
-    { INDEX_op_qemu_st64, { "M", "M", "M" } },
+    { INDEX_op_qemu_ld_i32, { "r", "L" } },
+    { INDEX_op_qemu_ld_i64, { "L", "L", "L" } },
+    { INDEX_op_qemu_st_i32, { "K", "K" } },
+    { INDEX_op_qemu_st_i64, { "M", "M", "M" } },
 #else
-    { INDEX_op_qemu_ld8u, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld8s, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld16u, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld16s, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld32, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld64, { "r", "L", "L", "L" } },
-
-    { INDEX_op_qemu_st8, { "K", "K", "K" } },
-    { INDEX_op_qemu_st16, { "K", "K", "K" } },
-    { INDEX_op_qemu_st32, { "K", "K", "K" } },
-    { INDEX_op_qemu_st64, { "M", "M", "M", "M" } },
+    { INDEX_op_qemu_ld_i32, { "r", "L", "L" } },
+    { INDEX_op_qemu_ld_i64, { "L", "L", "L", "L" } },
+    { INDEX_op_qemu_st_i32, { "K", "K", "K" } },
+    { INDEX_op_qemu_st_i64, { "M", "M", "M", "M" } },
 #endif
 
     { INDEX_op_ext8s_i32, { "r", "r" } },
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index c9f8ff5206..e3395e301c 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -99,6 +99,8 @@ typedef enum {
 #define TCG_TARGET_HAS_muluh_i32        0
 #define TCG_TARGET_HAS_mulsh_i32        0
 
+#define TCG_TARGET_HAS_new_ldst         1
+
 #define TCG_AREG0 TCG_REG_R27
 
 #define tcg_qemu_tb_exec(env, tb_ptr) \
diff --git a/tcg/ppc64/tcg-target.c b/tcg/ppc64/tcg-target.c
index 0bd1e0ce8c..6109d862db 100644
--- a/tcg/ppc64/tcg-target.c
+++ b/tcg/ppc64/tcg-target.c
@@ -22,6 +22,8 @@
  * THE SOFTWARE.
  */
 
+#include "tcg-be-ldst.h"
+
 #define TCG_CT_CONST_S16  0x100
 #define TCG_CT_CONST_U16  0x200
 #define TCG_CT_CONST_S32  0x400
@@ -31,13 +33,11 @@
 
 static uint8_t *tb_ret_addr;
 
-#define FAST_PATH
-
 #if TARGET_LONG_BITS == 32
-#define LD_ADDR LWZU
+#define LD_ADDR LWZ
 #define CMP_L 0
 #else
-#define LD_ADDR LDU
+#define LD_ADDR LD
 #define CMP_L (1<<21)
 #endif
 
@@ -99,7 +99,7 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 #endif
 
 static const int tcg_target_reg_alloc_order[] = {
-    TCG_REG_R14,
+    TCG_REG_R14,  /* call saved registers */
     TCG_REG_R15,
     TCG_REG_R16,
     TCG_REG_R17,
@@ -109,29 +109,24 @@ static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_R21,
     TCG_REG_R22,
     TCG_REG_R23,
+    TCG_REG_R24,
+    TCG_REG_R25,
+    TCG_REG_R26,
+    TCG_REG_R27,
     TCG_REG_R28,
     TCG_REG_R29,
     TCG_REG_R30,
     TCG_REG_R31,
-#ifdef __APPLE__
-    TCG_REG_R2,
-#endif
-    TCG_REG_R3,
-    TCG_REG_R4,
-    TCG_REG_R5,
-    TCG_REG_R6,
-    TCG_REG_R7,
-    TCG_REG_R8,
-    TCG_REG_R9,
-    TCG_REG_R10,
-#ifndef __APPLE__
+    TCG_REG_R12,  /* call clobbered, non-arguments */
     TCG_REG_R11,
-#endif
-    TCG_REG_R12,
-    TCG_REG_R24,
-    TCG_REG_R25,
-    TCG_REG_R26,
-    TCG_REG_R27
+    TCG_REG_R10,  /* call clobbered, arguments */
+    TCG_REG_R9,
+    TCG_REG_R8,
+    TCG_REG_R7,
+    TCG_REG_R6,
+    TCG_REG_R5,
+    TCG_REG_R4,
+    TCG_REG_R3,
 };
 
 static const int tcg_target_call_iarg_regs[] = {
@@ -173,58 +168,74 @@ static const int tcg_target_callee_save_regs[] = {
     TCG_REG_R31
 };
 
-static uint32_t reloc_pc24_val (void *pc, tcg_target_long target)
+static inline bool in_range_b(tcg_target_long target)
+{
+    return target == sextract64(target, 0, 26);
+}
+
+static uint32_t reloc_pc24_val(void *pc, tcg_target_long target)
 {
     tcg_target_long disp;
 
-    disp = target - (tcg_target_long) pc;
-    if ((disp << 38) >> 38 != disp)
-        tcg_abort ();
+    disp = target - (tcg_target_long)pc;
+    assert(in_range_b(disp));
 
     return disp & 0x3fffffc;
 }
 
-static void reloc_pc24 (void *pc, tcg_target_long target)
+static void reloc_pc24(void *pc, tcg_target_long target)
 {
-    *(uint32_t *) pc = (*(uint32_t *) pc & ~0x3fffffc)
-        | reloc_pc24_val (pc, target);
+    *(uint32_t *)pc = (*(uint32_t *)pc & ~0x3fffffc)
+        | reloc_pc24_val(pc, target);
 }
 
-static uint16_t reloc_pc14_val (void *pc, tcg_target_long target)
+static uint16_t reloc_pc14_val(void *pc, tcg_target_long target)
 {
     tcg_target_long disp;
 
-    disp = target - (tcg_target_long) pc;
-    if (disp != (int16_t) disp)
-        tcg_abort ();
+    disp = target - (tcg_target_long)pc;
+    if (disp != (int16_t) disp) {
+        tcg_abort();
+    }
 
     return disp & 0xfffc;
 }
 
-static void reloc_pc14 (void *pc, tcg_target_long target)
+static void reloc_pc14(void *pc, tcg_target_long target)
 {
-    *(uint32_t *) pc = (*(uint32_t *) pc & ~0xfffc)
-        | reloc_pc14_val (pc, target);
+    *(uint32_t *)pc = (*(uint32_t *)pc & ~0xfffc) | reloc_pc14_val(pc, target);
 }
 
-static void patch_reloc (uint8_t *code_ptr, int type,
-                         intptr_t value, intptr_t addend)
+static inline void tcg_out_b_noaddr(TCGContext *s, int insn)
+{
+    unsigned retrans = *(uint32_t *)s->code_ptr & 0x3fffffc;
+    tcg_out32(s, insn | retrans);
+}
+
+static inline void tcg_out_bc_noaddr(TCGContext *s, int insn)
+{
+    unsigned retrans = *(uint32_t *)s->code_ptr & 0xfffc;
+    tcg_out32(s, insn | retrans);
+}
+
+static void patch_reloc(uint8_t *code_ptr, int type,
+                        intptr_t value, intptr_t addend)
 {
     value += addend;
     switch (type) {
     case R_PPC_REL14:
-        reloc_pc14 (code_ptr, value);
+        reloc_pc14(code_ptr, value);
         break;
     case R_PPC_REL24:
-        reloc_pc24 (code_ptr, value);
+        reloc_pc24(code_ptr, value);
         break;
     default:
-        tcg_abort ();
+        tcg_abort();
     }
 }
 
 /* parse target specific constraints */
-static int target_parse_constraint (TCGArgConstraint *ct, const char **pct_str)
+static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
     const char *ct_str;
 
@@ -232,29 +243,29 @@ static int target_parse_constraint (TCGArgConstraint *ct, const char **pct_str)
     switch (ct_str[0]) {
     case 'A': case 'B': case 'C': case 'D':
         ct->ct |= TCG_CT_REG;
-        tcg_regset_set_reg (ct->u.regs, 3 + ct_str[0] - 'A');
+        tcg_regset_set_reg(ct->u.regs, 3 + ct_str[0] - 'A');
         break;
     case 'r':
         ct->ct |= TCG_CT_REG;
-        tcg_regset_set32 (ct->u.regs, 0, 0xffffffff);
+        tcg_regset_set32(ct->u.regs, 0, 0xffffffff);
         break;
     case 'L':                   /* qemu_ld constraint */
         ct->ct |= TCG_CT_REG;
-        tcg_regset_set32 (ct->u.regs, 0, 0xffffffff);
-        tcg_regset_reset_reg (ct->u.regs, TCG_REG_R3);
+        tcg_regset_set32(ct->u.regs, 0, 0xffffffff);
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
 #ifdef CONFIG_SOFTMMU
-        tcg_regset_reset_reg (ct->u.regs, TCG_REG_R4);
-        tcg_regset_reset_reg (ct->u.regs, TCG_REG_R5);
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R4);
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R5);
 #endif
         break;
     case 'S':                   /* qemu_st constraint */
         ct->ct |= TCG_CT_REG;
-        tcg_regset_set32 (ct->u.regs, 0, 0xffffffff);
-        tcg_regset_reset_reg (ct->u.regs, TCG_REG_R3);
+        tcg_regset_set32(ct->u.regs, 0, 0xffffffff);
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
 #ifdef CONFIG_SOFTMMU
-        tcg_regset_reset_reg (ct->u.regs, TCG_REG_R4);
-        tcg_regset_reset_reg (ct->u.regs, TCG_REG_R5);
-        tcg_regset_reset_reg (ct->u.regs, TCG_REG_R6);
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R4);
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R5);
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R6);
 #endif
         break;
     case 'I':
@@ -284,8 +295,8 @@ static int target_parse_constraint (TCGArgConstraint *ct, const char **pct_str)
 }
 
 /* test if a constant matches the constraint */
-static int tcg_target_const_match (tcg_target_long val,
-                                   const TCGArgConstraint *arg_ct)
+static int tcg_target_const_match(tcg_target_long val,
+                                  const TCGArgConstraint *arg_ct)
 {
     int ct = arg_ct->ct;
     if (ct & TCG_CT_CONST) {
@@ -425,7 +436,7 @@ static int tcg_target_const_match (tcg_target_long val,
 #define STHX   XO31(407)
 #define STWX   XO31(151)
 
-#define SPR(a,b) ((((a)<<5)|(b))<<11)
+#define SPR(a, b) ((((a)<<5)|(b))<<11)
 #define LR     SPR(8, 0)
 #define CTR    SPR(9, 0)
 
@@ -439,7 +450,7 @@ static int tcg_target_const_match (tcg_target_long val,
 #define SRADI  XO31(413<<1)
 
 #define TW     XO31( 4)
-#define TRAP   (TW | TO (31))
+#define TRAP   (TW | TO(31))
 
 #define RT(r) ((r)<<21)
 #define RS(r) ((r)<<21)
@@ -467,9 +478,9 @@ static int tcg_target_const_match (tcg_target_long val,
 #define BB(n, c) (((c)+((n)*4))<<11)
 #define BC_(n, c) (((c)+((n)*4))<<6)
 
-#define BO_COND_TRUE  BO (12)
-#define BO_COND_FALSE BO ( 4)
-#define BO_ALWAYS     BO (20)
+#define BO_COND_TRUE  BO(12)
+#define BO_COND_FALSE BO( 4)
+#define BO_ALWAYS     BO(20)
 
 enum {
     CR_LT,
@@ -479,16 +490,16 @@ enum {
 };
 
 static const uint32_t tcg_to_bc[] = {
-    [TCG_COND_EQ]  = BC | BI (7, CR_EQ) | BO_COND_TRUE,
-    [TCG_COND_NE]  = BC | BI (7, CR_EQ) | BO_COND_FALSE,
-    [TCG_COND_LT]  = BC | BI (7, CR_LT) | BO_COND_TRUE,
-    [TCG_COND_GE]  = BC | BI (7, CR_LT) | BO_COND_FALSE,
-    [TCG_COND_LE]  = BC | BI (7, CR_GT) | BO_COND_FALSE,
-    [TCG_COND_GT]  = BC | BI (7, CR_GT) | BO_COND_TRUE,
-    [TCG_COND_LTU] = BC | BI (7, CR_LT) | BO_COND_TRUE,
-    [TCG_COND_GEU] = BC | BI (7, CR_LT) | BO_COND_FALSE,
-    [TCG_COND_LEU] = BC | BI (7, CR_GT) | BO_COND_FALSE,
-    [TCG_COND_GTU] = BC | BI (7, CR_GT) | BO_COND_TRUE,
+    [TCG_COND_EQ]  = BC | BI(7, CR_EQ) | BO_COND_TRUE,
+    [TCG_COND_NE]  = BC | BI(7, CR_EQ) | BO_COND_FALSE,
+    [TCG_COND_LT]  = BC | BI(7, CR_LT) | BO_COND_TRUE,
+    [TCG_COND_GE]  = BC | BI(7, CR_LT) | BO_COND_FALSE,
+    [TCG_COND_LE]  = BC | BI(7, CR_GT) | BO_COND_FALSE,
+    [TCG_COND_GT]  = BC | BI(7, CR_GT) | BO_COND_TRUE,
+    [TCG_COND_LTU] = BC | BI(7, CR_LT) | BO_COND_TRUE,
+    [TCG_COND_GEU] = BC | BI(7, CR_LT) | BO_COND_FALSE,
+    [TCG_COND_LEU] = BC | BI(7, CR_GT) | BO_COND_FALSE,
+    [TCG_COND_GTU] = BC | BI(7, CR_GT) | BO_COND_TRUE,
 };
 
 /* The low bit here is set if the RA and RB fields must be inverted.  */
@@ -508,15 +519,17 @@ static const uint32_t tcg_to_isel[] = {
 static inline void tcg_out_mov(TCGContext *s, TCGType type,
                                TCGReg ret, TCGReg arg)
 {
-    tcg_out32 (s, OR | SAB (arg, ret, arg));
+    if (ret != arg) {
+        tcg_out32(s, OR | SAB(arg, ret, arg));
+    }
 }
 
 static inline void tcg_out_rld(TCGContext *s, int op, TCGReg ra, TCGReg rs,
                                int sh, int mb)
 {
-    sh = SH (sh & 0x1f) | (((sh >> 5) & 1) << 1);
-    mb = MB64 ((mb >> 5) | ((mb << 1) & 0x3f));
-    tcg_out32 (s, op | RA (ra) | RS (rs) | sh | mb);
+    sh = SH(sh & 0x1f) | (((sh >> 5) & 1) << 1);
+    mb = MB64((mb >> 5) | ((mb << 1) & 0x3f));
+    tcg_out32(s, op | RA(ra) | RS(rs) | sh | mb);
 }
 
 static inline void tcg_out_rlw(TCGContext *s, int op, TCGReg ra, TCGReg rs,
@@ -636,8 +649,8 @@ static void tcg_out_andi32(TCGContext *s, TCGReg dst, TCGReg src, uint32_t c)
     } else if (mask_operand(c, &mb, &me)) {
         tcg_out_rlw(s, RLWINM, dst, src, 0, mb, me);
     } else {
-        tcg_out_movi(s, TCG_TYPE_I32, 0, c);
-        tcg_out32(s, AND | SAB(src, dst, 0));
+        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R0, c);
+        tcg_out32(s, AND | SAB(src, dst, TCG_REG_R0));
     }
 }
 
@@ -658,8 +671,8 @@ static void tcg_out_andi64(TCGContext *s, TCGReg dst, TCGReg src, uint64_t c)
             tcg_out_rld(s, RLDICL, dst, src, 0, mb);
         }
     } else {
-        tcg_out_movi(s, TCG_TYPE_I64, 0, c);
-        tcg_out32(s, AND | SAB(src, dst, 0));
+        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_R0, c);
+        tcg_out32(s, AND | SAB(src, dst, TCG_REG_R0));
     }
 }
 
@@ -686,405 +699,487 @@ static void tcg_out_xori32(TCGContext *s, TCGReg dst, TCGReg src, uint32_t c)
     tcg_out_zori32(s, dst, src, c, XORI, XORIS);
 }
 
-static void tcg_out_b (TCGContext *s, int mask, tcg_target_long target)
+static void tcg_out_b(TCGContext *s, int mask, tcg_target_long target)
 {
     tcg_target_long disp;
 
-    disp = target - (tcg_target_long) s->code_ptr;
-    if ((disp << 38) >> 38 == disp)
-        tcg_out32 (s, B | (disp & 0x3fffffc) | mask);
-    else {
-        tcg_out_movi (s, TCG_TYPE_I64, 0, (tcg_target_long) target);
-        tcg_out32 (s, MTSPR | RS (0) | CTR);
-        tcg_out32 (s, BCCTR | BO_ALWAYS | mask);
+    disp = target - (tcg_target_long)s->code_ptr;
+    if (in_range_b(disp)) {
+        tcg_out32(s, B | (disp & 0x3fffffc) | mask);
+    } else {
+        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_R0, (tcg_target_long)target);
+        tcg_out32(s, MTSPR | RS(TCG_REG_R0) | CTR);
+        tcg_out32(s, BCCTR | BO_ALWAYS | mask);
     }
 }
 
-static void tcg_out_call (TCGContext *s, tcg_target_long arg, int const_arg)
+static void tcg_out_call(TCGContext *s, tcg_target_long arg, int const_arg)
 {
 #ifdef __APPLE__
     if (const_arg) {
-        tcg_out_b (s, LK, arg);
-    }
-    else {
-        tcg_out32 (s, MTSPR | RS (arg) | LR);
-        tcg_out32 (s, BCLR | BO_ALWAYS | LK);
+        tcg_out_b(s, LK, arg);
+    } else {
+        tcg_out32(s, MTSPR | RS(arg) | LR);
+        tcg_out32(s, BCLR | BO_ALWAYS | LK);
     }
 #else
-    int reg;
+    TCGReg reg = arg;
+    int ofs = 0;
 
     if (const_arg) {
-        reg = 2;
-        tcg_out_movi (s, TCG_TYPE_I64, reg, arg);
+        /* Look through the descriptor.  If the branch is in range, and we
+           don't have to spend too much effort on building the toc.  */
+        intptr_t tgt = ((intptr_t *)arg)[0];
+        intptr_t toc = ((intptr_t *)arg)[1];
+        intptr_t diff = tgt - (intptr_t)s->code_ptr;
+
+        if (in_range_b(diff) && toc == (uint32_t)toc) {
+            tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_R2, toc);
+            tcg_out_b(s, LK, tgt);
+            return;
+        }
+
+        /* Fold the low bits of the constant into the addresses below.  */
+        ofs = (int16_t)arg;
+        if (ofs + 8 < 0x8000) {
+            arg -= ofs;
+        } else {
+            ofs = 0;
+        }
+        reg = TCG_REG_R2;
+        tcg_out_movi(s, TCG_TYPE_I64, reg, arg);
     }
-    else reg = arg;
 
-    tcg_out32 (s, LD | RT (0) | RA (reg));
-    tcg_out32 (s, MTSPR | RA (0) | CTR);
-    tcg_out32 (s, LD | RT (11) | RA (reg) | 16);
-    tcg_out32 (s, LD | RT (2) | RA (reg) | 8);
-    tcg_out32 (s, BCCTR | BO_ALWAYS | LK);
+    tcg_out32(s, LD | TAI(TCG_REG_R0, reg, ofs));
+    tcg_out32(s, MTSPR | RA(TCG_REG_R0) | CTR);
+    tcg_out32(s, LD | TAI(TCG_REG_R2, reg, ofs + 8));
+    tcg_out32(s, BCCTR | BO_ALWAYS | LK);
 #endif
 }
 
-static void tcg_out_ldst(TCGContext *s, TCGReg ret, TCGReg addr,
-                         int offset, int op1, int op2)
+static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
+                             TCGReg base, tcg_target_long offset)
 {
-    if (offset == (int16_t) offset) {
-        tcg_out32(s, op1 | TAI(ret, addr, offset));
-    } else {
-        tcg_out_movi(s, TCG_TYPE_I64, 0, offset);
-        tcg_out32(s, op2 | TAB(ret, addr, 0));
+    tcg_target_long orig = offset, l0, l1, extra = 0, align = 0;
+    TCGReg rs = TCG_REG_R2;
+
+    assert(rt != TCG_REG_R2 && base != TCG_REG_R2);
+
+    switch (opi) {
+    case LD: case LWA:
+        align = 3;
+        /* FALLTHRU */
+    default:
+        if (rt != TCG_REG_R0) {
+            rs = rt;
+        }
+        break;
+    case STD:
+        align = 3;
+        break;
+    case STB: case STH: case STW:
+        break;
     }
-}
 
-static void tcg_out_ldsta(TCGContext *s, TCGReg ret, TCGReg addr,
-                          int offset, int op1, int op2)
-{
-    if (offset == (int16_t) (offset & ~3)) {
-        tcg_out32(s, op1 | TAI(ret, addr, offset));
-    } else {
-        tcg_out_movi(s, TCG_TYPE_I64, 0, offset);
-        tcg_out32(s, op2 | TAB(ret, addr, 0));
+    /* For unaligned, or very large offsets, use the indexed form.  */
+    if (offset & align || offset != (int32_t)offset) {
+        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_R2, orig);
+        tcg_out32(s, opx | TAB(rt, base, TCG_REG_R2));
+        return;
+    }
+
+    l0 = (int16_t)offset;
+    offset = (offset - l0) >> 16;
+    l1 = (int16_t)offset;
+
+    if (l1 < 0 && orig >= 0) {
+        extra = 0x4000;
+        l1 = (int16_t)(offset - 0x4000);
+    }
+    if (l1) {
+        tcg_out32(s, ADDIS | TAI(rs, base, l1));
+        base = rs;
+    }
+    if (extra) {
+        tcg_out32(s, ADDIS | TAI(rs, base, extra));
+        base = rs;
+    }
+    if (opi != ADDI || base != rt || l0 != 0) {
+        tcg_out32(s, opi | TAI(rt, base, l0));
     }
 }
 
+static const uint32_t qemu_ldx_opc[16] = {
+    [MO_UB] = LBZX,
+    [MO_UW] = LHZX,
+    [MO_UL] = LWZX,
+    [MO_Q]  = LDX,
+    [MO_SW] = LHAX,
+    [MO_SL] = LWAX,
+    [MO_BSWAP | MO_UB] = LBZX,
+    [MO_BSWAP | MO_UW] = LHBRX,
+    [MO_BSWAP | MO_UL] = LWBRX,
+    [MO_BSWAP | MO_Q]  = LDBRX,
+};
+
+static const uint32_t qemu_stx_opc[16] = {
+    [MO_UB] = STBX,
+    [MO_UW] = STHX,
+    [MO_UL] = STWX,
+    [MO_Q]  = STDX,
+    [MO_BSWAP | MO_UB] = STBX,
+    [MO_BSWAP | MO_UW] = STHBRX,
+    [MO_BSWAP | MO_UL] = STWBRX,
+    [MO_BSWAP | MO_Q]  = STDBRX,
+};
+
+static const uint32_t qemu_exts_opc[4] = {
+    EXTSB, EXTSH, EXTSW, 0
+};
+
 #if defined (CONFIG_SOFTMMU)
 /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
-   int mmu_idx) */
-static const void * const qemu_ld_helpers[4] = {
-    helper_ldb_mmu,
-    helper_ldw_mmu,
-    helper_ldl_mmu,
-    helper_ldq_mmu,
+ *                                 int mmu_idx, uintptr_t ra)
+ */
+static const void * const qemu_ld_helpers[16] = {
+    [MO_UB]   = helper_ret_ldub_mmu,
+    [MO_LEUW] = helper_le_lduw_mmu,
+    [MO_LEUL] = helper_le_ldul_mmu,
+    [MO_LEQ]  = helper_le_ldq_mmu,
+    [MO_BEUW] = helper_be_lduw_mmu,
+    [MO_BEUL] = helper_be_ldul_mmu,
+    [MO_BEQ]  = helper_be_ldq_mmu,
 };
 
 /* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
-   uintxx_t val, int mmu_idx) */
-static const void * const qemu_st_helpers[4] = {
-    helper_stb_mmu,
-    helper_stw_mmu,
-    helper_stl_mmu,
-    helper_stq_mmu,
+ *                                 uintxx_t val, int mmu_idx, uintptr_t ra)
+ */
+static const void * const qemu_st_helpers[16] = {
+    [MO_UB]   = helper_ret_stb_mmu,
+    [MO_LEUW] = helper_le_stw_mmu,
+    [MO_LEUL] = helper_le_stl_mmu,
+    [MO_LEQ]  = helper_le_stq_mmu,
+    [MO_BEUW] = helper_be_stw_mmu,
+    [MO_BEUL] = helper_be_stl_mmu,
+    [MO_BEQ]  = helper_be_stq_mmu,
 };
 
-static void tcg_out_tlb_read(TCGContext *s, TCGReg r0, TCGReg r1, TCGReg r2,
-                             TCGReg addr_reg, int s_bits, int offset)
+/* Perform the TLB load and compare.  Places the result of the comparison
+   in CR7, loads the addend of the TLB into R3, and returns the register
+   containing the guest address (zero-extended into R4).  Clobbers R0 and R2. */
+
+static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp s_bits, TCGReg addr_reg,
+                               int mem_index, bool is_read)
 {
-#if TARGET_LONG_BITS == 32
-    tcg_out_ext32u(s, addr_reg, addr_reg);
-
-    tcg_out_rlw(s, RLWINM, r0, addr_reg,
-                32 - (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS),
-                32 - (CPU_TLB_BITS + CPU_TLB_ENTRY_BITS),
-                31 - CPU_TLB_ENTRY_BITS);
-    tcg_out32(s, ADD | TAB(r0, r0, TCG_AREG0));
-    tcg_out32(s, LWZU | TAI(r1, r0, offset));
-    tcg_out_rlw(s, RLWINM, r2, addr_reg, 0,
-                (32 - s_bits) & 31, 31 - TARGET_PAGE_BITS);
-#else
-    tcg_out_rld (s, RLDICL, r0, addr_reg,
-                 64 - TARGET_PAGE_BITS,
-                 64 - CPU_TLB_BITS);
-    tcg_out_shli64(s, r0, r0, CPU_TLB_ENTRY_BITS);
+    int cmp_off
+        = (is_read
+           ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
+           : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
+    int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
+    TCGReg base = TCG_AREG0;
+
+    /* Extract the page index, shifted into place for tlb index.  */
+    if (TARGET_LONG_BITS == 32) {
+        /* Zero-extend the address into a place helpful for further use.  */
+        tcg_out_ext32u(s, TCG_REG_R4, addr_reg);
+        addr_reg = TCG_REG_R4;
+    } else {
+        tcg_out_rld(s, RLDICL, TCG_REG_R3, addr_reg,
+                    64 - TARGET_PAGE_BITS, 64 - CPU_TLB_BITS);
+    }
 
-    tcg_out32(s, ADD | TAB(r0, r0, TCG_AREG0));
-    tcg_out32(s, LD_ADDR | TAI(r1, r0, offset));
+    /* Compensate for very large offsets.  */
+    if (add_off >= 0x8000) {
+        /* Most target env are smaller than 32k; none are larger than 64k.
+           Simplify the logic here merely to offset by 0x7ff0, giving us a
+           range just shy of 64k.  Check this assumption.  */
+        QEMU_BUILD_BUG_ON(offsetof(CPUArchState,
+                                   tlb_table[NB_MMU_MODES - 1][1])
+                          > 0x7ff0 + 0x7fff);
+        tcg_out32(s, ADDI | TAI(TCG_REG_R2, base, 0x7ff0));
+        base = TCG_REG_R2;
+        cmp_off -= 0x7ff0;
+        add_off -= 0x7ff0;
+    }
 
-    if (!s_bits) {
-        tcg_out_rld (s, RLDICR, r2, addr_reg, 0, 63 - TARGET_PAGE_BITS);
+    /* Extraction and shifting, part 2.  */
+    if (TARGET_LONG_BITS == 32) {
+        tcg_out_rlw(s, RLWINM, TCG_REG_R3, addr_reg,
+                    32 - (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS),
+                    32 - (CPU_TLB_BITS + CPU_TLB_ENTRY_BITS),
+                    31 - CPU_TLB_ENTRY_BITS);
+    } else {
+        tcg_out_shli64(s, TCG_REG_R3, TCG_REG_R3, CPU_TLB_ENTRY_BITS);
     }
-    else {
-        tcg_out_rld (s, RLDICL, r2, addr_reg,
-                     64 - TARGET_PAGE_BITS,
-                     TARGET_PAGE_BITS - s_bits);
-        tcg_out_rld (s, RLDICL, r2, r2, TARGET_PAGE_BITS, 0);
+
+    tcg_out32(s, ADD | TAB(TCG_REG_R3, TCG_REG_R3, base));
+
+    /* Load the tlb comparator.  */
+    tcg_out32(s, LD_ADDR | TAI(TCG_REG_R2, TCG_REG_R3, cmp_off));
+
+    /* Load the TLB addend for use on the fast path.  Do this asap
+       to minimize any load use delay.  */
+    tcg_out32(s, LD | TAI(TCG_REG_R3, TCG_REG_R3, add_off));
+
+    /* Clear the non-page, non-alignment bits from the address.  */
+    if (TARGET_LONG_BITS == 32) {
+        tcg_out_rlw(s, RLWINM, TCG_REG_R0, addr_reg, 0,
+                    (32 - s_bits) & 31, 31 - TARGET_PAGE_BITS);
+    } else if (!s_bits) {
+        tcg_out_rld(s, RLDICR, TCG_REG_R0, addr_reg, 0, 63 - TARGET_PAGE_BITS);
+    } else {
+        tcg_out_rld(s, RLDICL, TCG_REG_R0, addr_reg,
+                    64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - s_bits);
+        tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
     }
-#endif
-}
-#endif
 
-static const uint32_t qemu_ldx_opc[8] = {
-#ifdef TARGET_WORDS_BIGENDIAN
-    LBZX, LHZX, LWZX, LDX,
-    0,    LHAX, LWAX, LDX
-#else
-    LBZX, LHBRX, LWBRX, LDBRX,
-    0,    0,     0,     LDBRX,
-#endif
-};
+    tcg_out32(s, CMP | BF(7) | RA(TCG_REG_R0) | RB(TCG_REG_R2) | CMP_L);
 
-static const uint32_t qemu_stx_opc[4] = {
-#ifdef TARGET_WORDS_BIGENDIAN
-    STBX, STHX, STWX, STDX
-#else
-    STBX, STHBRX, STWBRX, STDBRX,
-#endif
-};
+    return addr_reg;
+}
 
-static const uint32_t qemu_exts_opc[4] = {
-    EXTSB, EXTSH, EXTSW, 0
-};
+/* Record the context of a call to the out of line helper code for the slow
+   path for a load or store, so that we can later generate the correct
+   helper code.  */
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
+                                int data_reg, int addr_reg, int mem_index,
+                                uint8_t *raddr, uint8_t *label_ptr)
+{
+    TCGLabelQemuLdst *label = new_ldst_label(s);
+
+    label->is_ld = is_ld;
+    label->opc = opc;
+    label->datalo_reg = data_reg;
+    label->addrlo_reg = addr_reg;
+    label->mem_index = mem_index;
+    label->raddr = raddr;
+    label->label_ptr[0] = label_ptr;
+}
 
-static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc)
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGReg addr_reg, data_reg, r0, r1, rbase;
-    uint32_t insn, s_bits;
-#ifdef CONFIG_SOFTMMU
-    TCGReg r2, ir;
-    int mem_index;
-    void *label1_ptr, *label2_ptr;
-#endif
+    TCGMemOp opc = lb->opc;
 
-    data_reg = *args++;
-    addr_reg = *args++;
-    s_bits = opc & 3;
+    reloc_pc14(lb->label_ptr[0], (uintptr_t)s->code_ptr);
 
-#ifdef CONFIG_SOFTMMU
-    mem_index = *args;
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_AREG0);
 
-    r0 = 3;
-    r1 = 4;
-    r2 = 0;
-    rbase = 0;
+    /* If the address needed to be zero-extended, we'll have already
+       placed it in R4.  The only remaining case is 64-bit guest.  */
+    tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_R4, lb->addrlo_reg);
 
-    tcg_out_tlb_read (s, r0, r1, r2, addr_reg, s_bits,
-                      offsetof (CPUArchState, tlb_table[mem_index][0].addr_read));
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R5, lb->mem_index);
+    tcg_out32(s, MFSPR | RT(TCG_REG_R6) | LR);
 
-    tcg_out32 (s, CMP | BF (7) | RA (r2) | RB (r1) | CMP_L);
+    tcg_out_call(s, (tcg_target_long)qemu_ld_helpers[opc & ~MO_SIGN], 1);
 
-    label1_ptr = s->code_ptr;
-#ifdef FAST_PATH
-    tcg_out32 (s, BC | BI (7, CR_EQ) | BO_COND_TRUE);
-#endif
+    if (opc & MO_SIGN) {
+        uint32_t insn = qemu_exts_opc[opc & MO_SIZE];
+        tcg_out32(s, insn | RA(lb->datalo_reg) | RS(TCG_REG_R3));
+    } else {
+        tcg_out_mov(s, TCG_TYPE_I64, lb->datalo_reg, TCG_REG_R3);
+    }
 
-    /* slow path */
-    ir = 3;
-    tcg_out_mov (s, TCG_TYPE_I64, ir++, TCG_AREG0);
-    tcg_out_mov (s, TCG_TYPE_I64, ir++, addr_reg);
-    tcg_out_movi (s, TCG_TYPE_I64, ir++, mem_index);
+    tcg_out_b(s, 0, (uintptr_t)lb->raddr);
+}
 
-    tcg_out_call (s, (tcg_target_long) qemu_ld_helpers[s_bits], 1);
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+{
+    TCGMemOp opc = lb->opc;
+    TCGMemOp s_bits = opc & MO_SIZE;
 
-    if (opc & 4) {
-        insn = qemu_exts_opc[s_bits];
-        tcg_out32(s, insn | RA(data_reg) | RS(3));
-    } else if (data_reg != 3) {
-        tcg_out_mov(s, TCG_TYPE_I64, data_reg, 3);
-    }
-    label2_ptr = s->code_ptr;
-    tcg_out32 (s, B);
+    reloc_pc14(lb->label_ptr[0], (uintptr_t)s->code_ptr);
+
+    tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_R3, TCG_AREG0);
+
+    /* If the address needed to be zero-extended, we'll have already
+       placed it in R4.  The only remaining case is 64-bit guest.  */
+    tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_R4, lb->addrlo_reg);
+
+    tcg_out_rld(s, RLDICL, TCG_REG_R5, lb->datalo_reg,
+                0, 64 - (1 << (3 + s_bits)));
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R6, lb->mem_index);
+    tcg_out32(s, MFSPR | RT(TCG_REG_R7) | LR);
 
-    /* label1: fast path */
-#ifdef FAST_PATH
-    reloc_pc14 (label1_ptr, (tcg_target_long) s->code_ptr);
+    tcg_out_call(s, (tcg_target_long)qemu_st_helpers[opc], 1);
+
+    tcg_out_b(s, 0, (uintptr_t)lb->raddr);
+}
+#endif /* SOFTMMU */
+
+static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
+                            TCGMemOp opc, int mem_index)
+{
+    TCGReg rbase;
+    uint32_t insn;
+    TCGMemOp s_bits = opc & MO_SIZE;
+#ifdef CONFIG_SOFTMMU
+    void *label_ptr;
 #endif
 
-    /* r0 now contains &env->tlb_table[mem_index][index].addr_read */
-    tcg_out32(s, LD | TAI(r0, r0,
-                          offsetof(CPUTLBEntry, addend)
-                          - offsetof(CPUTLBEntry, addr_read)));
-    /* r0 = env->tlb_table[mem_index][index].addend */
-    tcg_out32(s, ADD | TAB(r0, r0, addr_reg));
-    /* r0 = env->tlb_table[mem_index][index].addend + addr */
+#ifdef CONFIG_SOFTMMU
+    addr_reg = tcg_out_tlb_read(s, s_bits, addr_reg, mem_index, true);
 
+    /* Load a pointer into the current opcode w/conditional branch-link. */
+    label_ptr = s->code_ptr;
+    tcg_out_bc_noaddr(s, BC | BI(7, CR_EQ) | BO_COND_FALSE | LK);
+
+    rbase = TCG_REG_R3;
 #else  /* !CONFIG_SOFTMMU */
-#if TARGET_LONG_BITS == 32
-    tcg_out_ext32u(s, addr_reg, addr_reg);
-#endif
-    r0 = addr_reg;
-    r1 = 3;
     rbase = GUEST_BASE ? TCG_GUEST_BASE_REG : 0;
+    if (TARGET_LONG_BITS == 32) {
+        tcg_out_ext32u(s, TCG_REG_R2, addr_reg);
+        addr_reg = TCG_REG_R2;
+    }
 #endif
 
     insn = qemu_ldx_opc[opc];
     if (!HAVE_ISA_2_06 && insn == LDBRX) {
-        tcg_out32(s, ADDI | TAI(r1, r0, 4));
-        tcg_out32(s, LWBRX | TAB(data_reg, rbase, r0));
-        tcg_out32(s, LWBRX | TAB(      r1, rbase, r1));
-        tcg_out_rld(s, RLDIMI, data_reg, r1, 32, 0);
+        tcg_out32(s, ADDI | TAI(TCG_REG_R0, addr_reg, 4));
+        tcg_out32(s, LWBRX | TAB(data_reg, rbase, addr_reg));
+        tcg_out32(s, LWBRX | TAB(TCG_REG_R0, rbase, TCG_REG_R0));
+        tcg_out_rld(s, RLDIMI, data_reg, TCG_REG_R0, 32, 0);
     } else if (insn) {
-        tcg_out32(s, insn | TAB(data_reg, rbase, r0));
+        tcg_out32(s, insn | TAB(data_reg, rbase, addr_reg));
     } else {
-        insn = qemu_ldx_opc[s_bits];
-        tcg_out32(s, insn | TAB(data_reg, rbase, r0));
+        insn = qemu_ldx_opc[opc & (MO_SIZE | MO_BSWAP)];
+        tcg_out32(s, insn | TAB(data_reg, rbase, addr_reg));
         insn = qemu_exts_opc[s_bits];
-        tcg_out32 (s, insn | RA(data_reg) | RS(data_reg));
+        tcg_out32(s, insn | RA(data_reg) | RS(data_reg));
     }
 
 #ifdef CONFIG_SOFTMMU
-    reloc_pc24 (label2_ptr, (tcg_target_long) s->code_ptr);
+    add_qemu_ldst_label(s, true, opc, data_reg, addr_reg, mem_index,
+                        s->code_ptr, label_ptr);
 #endif
 }
 
-static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc)
+static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
+                            TCGMemOp opc, int mem_index)
 {
-    TCGReg addr_reg, r0, r1, rbase, data_reg;
+    TCGReg rbase;
     uint32_t insn;
 #ifdef CONFIG_SOFTMMU
-    TCGReg r2, ir;
-    int mem_index;
-    void *label1_ptr, *label2_ptr;
+    void *label_ptr;
 #endif
 
-    data_reg = *args++;
-    addr_reg = *args++;
-
 #ifdef CONFIG_SOFTMMU
-    mem_index = *args;
-
-    r0 = 3;
-    r1 = 4;
-    r2 = 0;
-    rbase = 0;
+    addr_reg = tcg_out_tlb_read(s, opc & MO_SIZE, addr_reg, mem_index, false);
 
-    tcg_out_tlb_read (s, r0, r1, r2, addr_reg, opc,
-                      offsetof (CPUArchState, tlb_table[mem_index][0].addr_write));
-
-    tcg_out32 (s, CMP | BF (7) | RA (r2) | RB (r1) | CMP_L);
-
-    label1_ptr = s->code_ptr;
-#ifdef FAST_PATH
-    tcg_out32 (s, BC | BI (7, CR_EQ) | BO_COND_TRUE);
-#endif
-
-    /* slow path */
-    ir = 3;
-    tcg_out_mov (s, TCG_TYPE_I64, ir++, TCG_AREG0);
-    tcg_out_mov (s, TCG_TYPE_I64, ir++, addr_reg);
-    tcg_out_rld (s, RLDICL, ir++, data_reg, 0, 64 - (1 << (3 + opc)));
-    tcg_out_movi (s, TCG_TYPE_I64, ir++, mem_index);
-
-    tcg_out_call (s, (tcg_target_long) qemu_st_helpers[opc], 1);
-
-    label2_ptr = s->code_ptr;
-    tcg_out32 (s, B);
-
-    /* label1: fast path */
-#ifdef FAST_PATH
-    reloc_pc14 (label1_ptr, (tcg_target_long) s->code_ptr);
-#endif
-
-    tcg_out32 (s, (LD
-                   | RT (r0)
-                   | RA (r0)
-                   | (offsetof (CPUTLBEntry, addend)
-                      - offsetof (CPUTLBEntry, addr_write))
-                   ));
-    /* r0 = env->tlb_table[mem_index][index].addend */
-    tcg_out32(s, ADD | TAB(r0, r0, addr_reg));
-    /* r0 = env->tlb_table[mem_index][index].addend + addr */
+    /* Load a pointer into the current opcode w/conditional branch-link. */
+    label_ptr = s->code_ptr;
+    tcg_out_bc_noaddr(s, BC | BI(7, CR_EQ) | BO_COND_FALSE | LK);
 
+    rbase = TCG_REG_R3;
 #else  /* !CONFIG_SOFTMMU */
-#if TARGET_LONG_BITS == 32
-    tcg_out_ext32u(s, addr_reg, addr_reg);
-#endif
-    r1 = 3;
-    r0 = addr_reg;
     rbase = GUEST_BASE ? TCG_GUEST_BASE_REG : 0;
+    if (TARGET_LONG_BITS == 32) {
+        tcg_out_ext32u(s, TCG_REG_R2, addr_reg);
+        addr_reg = TCG_REG_R2;
+    }
 #endif
 
     insn = qemu_stx_opc[opc];
     if (!HAVE_ISA_2_06 && insn == STDBRX) {
-        tcg_out32(s, STWBRX | SAB(data_reg, rbase, r0));
-        tcg_out32(s, ADDI | TAI(r1, r0, 4));
-        tcg_out_shri64(s, 0, data_reg, 32);
-        tcg_out32(s, STWBRX | SAB(0, rbase, r1));
+        tcg_out32(s, STWBRX | SAB(data_reg, rbase, addr_reg));
+        tcg_out32(s, ADDI | TAI(TCG_REG_R2, addr_reg, 4));
+        tcg_out_shri64(s, TCG_REG_R0, data_reg, 32);
+        tcg_out32(s, STWBRX | SAB(TCG_REG_R0, rbase, TCG_REG_R2));
     } else {
-        tcg_out32(s, insn | SAB(data_reg, rbase, r0));
+        tcg_out32(s, insn | SAB(data_reg, rbase, addr_reg));
     }
 
 #ifdef CONFIG_SOFTMMU
-    reloc_pc24 (label2_ptr, (tcg_target_long) s->code_ptr);
+    add_qemu_ldst_label(s, false, opc, data_reg, addr_reg, mem_index,
+                        s->code_ptr, label_ptr);
 #endif
 }
 
-static void tcg_target_qemu_prologue (TCGContext *s)
+#define FRAME_SIZE ((int) \
+    ((8                     /* back chain */              \
+      + 8                   /* CR */                      \
+      + 8                   /* LR */                      \
+      + 8                   /* compiler doubleword */     \
+      + 8                   /* link editor doubleword */  \
+      + 8                   /* TOC save area */           \
+      + TCG_STATIC_CALL_ARGS_SIZE                         \
+      + CPU_TEMP_BUF_NLONGS * sizeof(long)                \
+      + ARRAY_SIZE(tcg_target_callee_save_regs) * 8       \
+      + 15) & ~15))
+
+#define REG_SAVE_BOT (FRAME_SIZE - ARRAY_SIZE(tcg_target_callee_save_regs) * 8)
+
+static void tcg_target_qemu_prologue(TCGContext *s)
 {
-    int i, frame_size;
-#ifndef __APPLE__
-    uint64_t addr;
-#endif
+    int i;
 
-    frame_size = 0
-        + 8                     /* back chain */
-        + 8                     /* CR */
-        + 8                     /* LR */
-        + 8                     /* compiler doubleword */
-        + 8                     /* link editor doubleword */
-        + 8                     /* TOC save area */
-        + TCG_STATIC_CALL_ARGS_SIZE
-        + ARRAY_SIZE (tcg_target_callee_save_regs) * 8
-        + CPU_TEMP_BUF_NLONGS * sizeof(long)
-        ;
-    frame_size = (frame_size + 15) & ~15;
-
-    tcg_set_frame (s, TCG_REG_CALL_STACK, frame_size
-                   - CPU_TEMP_BUF_NLONGS * sizeof (long),
-                   CPU_TEMP_BUF_NLONGS * sizeof (long));
+    tcg_set_frame(s, TCG_REG_CALL_STACK,
+                  REG_SAVE_BOT - CPU_TEMP_BUF_NLONGS * sizeof(long),
+                  CPU_TEMP_BUF_NLONGS * sizeof(long));
 
 #ifndef __APPLE__
     /* First emit adhoc function descriptor */
-    addr = (uint64_t) s->code_ptr + 24;
-    tcg_out32 (s, addr >> 32); tcg_out32 (s, addr); /* entry point */
+    tcg_out64(s, (uint64_t)s->code_ptr + 24); /* entry point */
     s->code_ptr += 16;          /* skip TOC and environment pointer */
 #endif
 
     /* Prologue */
-    tcg_out32 (s, MFSPR | RT (0) | LR);
-    tcg_out32 (s, STDU | RS (1) | RA (1) | (-frame_size & 0xffff));
-    for (i = 0; i < ARRAY_SIZE (tcg_target_callee_save_regs); ++i)
-        tcg_out32 (s, (STD
-                       | RS (tcg_target_callee_save_regs[i])
-                       | RA (1)
-                       | (i * 8 + 48 + TCG_STATIC_CALL_ARGS_SIZE)
-                       )
-            );
-    tcg_out32 (s, STD | RS (0) | RA (1) | (frame_size + 16));
+    tcg_out32(s, MFSPR | RT(TCG_REG_R0) | LR);
+    tcg_out32(s, STDU | SAI(TCG_REG_R1, TCG_REG_R1, -FRAME_SIZE));
+    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); ++i) {
+        tcg_out32(s, STD | SAI(tcg_target_callee_save_regs[i], 1, 
+                               REG_SAVE_BOT + i * 8));
+    }
+    tcg_out32(s, STD | SAI(TCG_REG_R0, TCG_REG_R1, FRAME_SIZE + 16));
 
 #ifdef CONFIG_USE_GUEST_BASE
     if (GUEST_BASE) {
-        tcg_out_movi (s, TCG_TYPE_I64, TCG_GUEST_BASE_REG, GUEST_BASE);
-        tcg_regset_set_reg (s->reserved_regs, TCG_GUEST_BASE_REG);
+        tcg_out_movi(s, TCG_TYPE_I64, TCG_GUEST_BASE_REG, GUEST_BASE);
+        tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
     }
 #endif
 
-    tcg_out_mov (s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
-    tcg_out32 (s, MTSPR | RS (tcg_target_call_iarg_regs[1]) | CTR);
-    tcg_out32 (s, BCCTR | BO_ALWAYS);
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
+    tcg_out32(s, MTSPR | RS(tcg_target_call_iarg_regs[1]) | CTR);
+    tcg_out32(s, BCCTR | BO_ALWAYS);
 
     /* Epilogue */
     tb_ret_addr = s->code_ptr;
 
-    for (i = 0; i < ARRAY_SIZE (tcg_target_callee_save_regs); ++i)
-        tcg_out32 (s, (LD
-                       | RT (tcg_target_callee_save_regs[i])
-                       | RA (1)
-                       | (i * 8 + 48 + TCG_STATIC_CALL_ARGS_SIZE)
-                       )
-            );
-    tcg_out32(s, LD | TAI(0, 1, frame_size + 16));
-    tcg_out32(s, MTSPR | RS(0) | LR);
-    tcg_out32(s, ADDI | TAI(1, 1, frame_size));
+    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); ++i) {
+        tcg_out32(s, LD | TAI(tcg_target_callee_save_regs[i], TCG_REG_R1,
+                              REG_SAVE_BOT + i * 8));
+    }
+    tcg_out32(s, LD | TAI(TCG_REG_R0, TCG_REG_R1, FRAME_SIZE + 16));
+    tcg_out32(s, MTSPR | RS(TCG_REG_R0) | LR);
+    tcg_out32(s, ADDI | TAI(TCG_REG_R1, TCG_REG_R1, FRAME_SIZE));
     tcg_out32(s, BCLR | BO_ALWAYS);
 }
 
-static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
-                       intptr_t arg2)
+static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
+                              TCGReg arg1, intptr_t arg2)
 {
-    if (type == TCG_TYPE_I32)
-        tcg_out_ldst (s, ret, arg1, arg2, LWZ, LWZX);
-    else
-        tcg_out_ldsta (s, ret, arg1, arg2, LD, LDX);
+    int opi, opx;
+
+    if (type == TCG_TYPE_I32) {
+        opi = LWZ, opx = LWZX;
+    } else {
+        opi = LD, opx = LDX;
+    }
+    tcg_out_mem_long(s, opi, opx, ret, arg1, arg2);
 }
 
-static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1,
-                       intptr_t arg2)
+static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
+                              TCGReg arg1, intptr_t arg2)
 {
-    if (type == TCG_TYPE_I32)
-        tcg_out_ldst (s, arg, arg1, arg2, STW, STWX);
-    else
-        tcg_out_ldsta (s, arg, arg1, arg2, STD, STDX);
+    int opi, opx;
+
+    if (type == TCG_TYPE_I32) {
+        opi = STW, opx = STWX;
+    } else {
+        opi = STD, opx = STDX;
+    }
+    tcg_out_mem_long(s, opi, opx, arg, arg1, arg2);
 }
 
 static void tcg_out_cmp(TCGContext *s, int cond, TCGArg arg1, TCGArg arg2,
@@ -1106,8 +1201,7 @@ static void tcg_out_cmp(TCGContext *s, int cond, TCGArg arg1, TCGArg arg2,
                 op = CMPI;
                 imm = 1;
                 break;
-            }
-            else if ((uint16_t) arg2 == arg2) {
+            } else if ((uint16_t) arg2 == arg2) {
                 op = CMPLI;
                 imm = 1;
                 break;
@@ -1148,7 +1242,7 @@ static void tcg_out_cmp(TCGContext *s, int cond, TCGArg arg1, TCGArg arg2,
         break;
 
     default:
-        tcg_abort ();
+        tcg_abort();
     }
     op |= BF(cr) | ((type == TCG_TYPE_I64) << 21);
 
@@ -1156,8 +1250,8 @@ static void tcg_out_cmp(TCGContext *s, int cond, TCGArg arg1, TCGArg arg2,
         tcg_out32(s, op | RA(arg1) | (arg2 & 0xffff));
     } else {
         if (const_arg2) {
-            tcg_out_movi(s, type, 0, arg2);
-            arg2 = 0;
+            tcg_out_movi(s, type, TCG_REG_R0, arg2);
+            arg2 = TCG_REG_R0;
         }
         tcg_out32(s, op | RA(arg1) | RB(arg2));
     }
@@ -1178,8 +1272,8 @@ static void tcg_out_setcond_ne0(TCGContext *s, TCGReg dst, TCGReg src)
         tcg_out32(s, ADDIC | TAI(dst, src, -1));
         tcg_out32(s, SUBFE | TAB(dst, dst, src));
     } else {
-        tcg_out32(s, ADDIC | TAI(0, src, -1));
-        tcg_out32(s, SUBFE | TAB(dst, 0, src));
+        tcg_out32(s, ADDIC | TAI(TCG_REG_R0, src, -1));
+        tcg_out32(s, SUBFE | TAB(dst, TCG_REG_R0, src));
     }
 }
 
@@ -1292,13 +1386,13 @@ static void tcg_out_setcond(TCGContext *s, TCGType type, TCGCond cond,
     case TCG_COND_GE:
     case TCG_COND_GEU:
         sh = 31;
-        crop = CRNOR | BT (7, CR_EQ) | BA (7, CR_LT) | BB (7, CR_LT);
+        crop = CRNOR | BT(7, CR_EQ) | BA(7, CR_LT) | BB(7, CR_LT);
         goto crtest;
 
     case TCG_COND_LE:
     case TCG_COND_LEU:
         sh = 31;
-        crop = CRNOR | BT (7, CR_EQ) | BA (7, CR_GT) | BB (7, CR_GT);
+        crop = CRNOR | BT(7, CR_EQ) | BA(7, CR_GT) | BB(7, CR_GT);
     crtest:
         tcg_out_cmp(s, cond, arg1, arg2, const_arg2, 7, type);
         if (crop) {
@@ -1309,22 +1403,19 @@ static void tcg_out_setcond(TCGContext *s, TCGType type, TCGCond cond,
         break;
 
     default:
-        tcg_abort ();
+        tcg_abort();
     }
 }
 
-static void tcg_out_bc (TCGContext *s, int bc, int label_index)
+static void tcg_out_bc(TCGContext *s, int bc, int label_index)
 {
     TCGLabel *l = &s->labels[label_index];
 
-    if (l->has_value)
-        tcg_out32 (s, bc | reloc_pc14_val (s->code_ptr, l->u.value));
-    else {
-        uint16_t val = *(uint16_t *) &s->code_ptr[2];
-
-        /* Thanks to Andrzej Zaborowski */
-        tcg_out32 (s, bc | (val & 0xfffc));
-        tcg_out_reloc (s, s->code_ptr - 4, R_PPC_REL14, label_index, 0);
+    if (l->has_value) {
+        tcg_out32(s, bc | reloc_pc14_val(s->code_ptr, l->u.value));
+    } else {
+        tcg_out_reloc(s, s->code_ptr, R_PPC_REL14, label_index, 0);
+        tcg_out_bc_noaddr(s, bc);
     }
 }
 
@@ -1360,7 +1451,7 @@ static void tcg_out_movcond(TCGContext *s, TCGType type, TCGCond cond,
         }
         /* V1 == 0 is handled by isel; V2 == 0 must be handled by hand.  */
         if (v2 == 0) {
-            tcg_out_movi(s, type, 0, 0);
+            tcg_out_movi(s, type, TCG_REG_R0, 0);
         }
         tcg_out32(s, isel | TAB(dest, v1, v2));
     } else {
@@ -1384,37 +1475,36 @@ static void tcg_out_movcond(TCGContext *s, TCGType type, TCGCond cond,
     }
 }
 
-void ppc_tb_set_jmp_target (unsigned long jmp_addr, unsigned long addr)
+void ppc_tb_set_jmp_target(unsigned long jmp_addr, unsigned long addr)
 {
     TCGContext s;
     unsigned long patch_size;
 
     s.code_ptr = (uint8_t *) jmp_addr;
-    tcg_out_b (&s, 0, addr);
+    tcg_out_b(&s, 0, addr);
     patch_size = s.code_ptr - (uint8_t *) jmp_addr;
-    flush_icache_range (jmp_addr, jmp_addr + patch_size);
+    flush_icache_range(jmp_addr, jmp_addr + patch_size);
 }
 
-static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
-                        const int *const_args)
+static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
+                       const int *const_args)
 {
     TCGArg a0, a1, a2;
     int c;
 
     switch (opc) {
     case INDEX_op_exit_tb:
-        tcg_out_movi (s, TCG_TYPE_I64, TCG_REG_R3, args[0]);
-        tcg_out_b (s, 0, (tcg_target_long) tb_ret_addr);
+        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_R3, args[0]);
+        tcg_out_b(s, 0, (tcg_target_long)tb_ret_addr);
         break;
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
-            /* direct jump method */
-
+            /* Direct jump method.  */
             s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
             s->code_ptr += 28;
-        }
-        else {
-            tcg_abort ();
+        } else {
+            /* Indirect jump method.  */
+            tcg_abort();
         }
         s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
         break;
@@ -1423,83 +1513,70 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
             TCGLabel *l = &s->labels[args[0]];
 
             if (l->has_value) {
-                tcg_out_b (s, 0, l->u.value);
-            }
-            else {
-                uint32_t val = *(uint32_t *) s->code_ptr;
-
-                /* Thanks to Andrzej Zaborowski */
-                tcg_out32 (s, B | (val & 0x3fffffc));
-                tcg_out_reloc (s, s->code_ptr - 4, R_PPC_REL24, args[0], 0);
+                tcg_out_b(s, 0, l->u.value);
+            } else {
+                tcg_out_reloc(s, s->code_ptr, R_PPC_REL24, args[0], 0);
+                tcg_out_b_noaddr(s, B);
             }
         }
         break;
     case INDEX_op_call:
-        tcg_out_call (s, args[0], const_args[0]);
+        tcg_out_call(s, args[0], const_args[0]);
         break;
     case INDEX_op_movi_i32:
-        tcg_out_movi (s, TCG_TYPE_I32, args[0], args[1]);
+        tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]);
         break;
     case INDEX_op_movi_i64:
-        tcg_out_movi (s, TCG_TYPE_I64, args[0], args[1]);
+        tcg_out_movi(s, TCG_TYPE_I64, args[0], args[1]);
         break;
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8u_i64:
-        tcg_out_ldst (s, args[0], args[1], args[2], LBZ, LBZX);
+        tcg_out_mem_long(s, LBZ, LBZX, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld8s_i32:
     case INDEX_op_ld8s_i64:
-        tcg_out_ldst (s, args[0], args[1], args[2], LBZ, LBZX);
-        tcg_out32 (s, EXTSB | RS (args[0]) | RA (args[0]));
+        tcg_out_mem_long(s, LBZ, LBZX, args[0], args[1], args[2]);
+        tcg_out32(s, EXTSB | RS(args[0]) | RA(args[0]));
         break;
     case INDEX_op_ld16u_i32:
     case INDEX_op_ld16u_i64:
-        tcg_out_ldst (s, args[0], args[1], args[2], LHZ, LHZX);
+        tcg_out_mem_long(s, LHZ, LHZX, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld16s_i32:
     case INDEX_op_ld16s_i64:
-        tcg_out_ldst (s, args[0], args[1], args[2], LHA, LHAX);
+        tcg_out_mem_long(s, LHA, LHAX, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld_i32:
     case INDEX_op_ld32u_i64:
-        tcg_out_ldst (s, args[0], args[1], args[2], LWZ, LWZX);
+        tcg_out_mem_long(s, LWZ, LWZX, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld32s_i64:
-        tcg_out_ldsta (s, args[0], args[1], args[2], LWA, LWAX);
+        tcg_out_mem_long(s, LWA, LWAX, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld_i64:
-        tcg_out_ldsta (s, args[0], args[1], args[2], LD, LDX);
+        tcg_out_mem_long(s, LD, LDX, args[0], args[1], args[2]);
         break;
     case INDEX_op_st8_i32:
     case INDEX_op_st8_i64:
-        tcg_out_ldst (s, args[0], args[1], args[2], STB, STBX);
+        tcg_out_mem_long(s, STB, STBX, args[0], args[1], args[2]);
         break;
     case INDEX_op_st16_i32:
     case INDEX_op_st16_i64:
-        tcg_out_ldst (s, args[0], args[1], args[2], STH, STHX);
+        tcg_out_mem_long(s, STH, STHX, args[0], args[1], args[2]);
         break;
     case INDEX_op_st_i32:
     case INDEX_op_st32_i64:
-        tcg_out_ldst (s, args[0], args[1], args[2], STW, STWX);
+        tcg_out_mem_long(s, STW, STWX, args[0], args[1], args[2]);
         break;
     case INDEX_op_st_i64:
-        tcg_out_ldsta (s, args[0], args[1], args[2], STD, STDX);
+        tcg_out_mem_long(s, STD, STDX, args[0], args[1], args[2]);
         break;
 
     case INDEX_op_add_i32:
         a0 = args[0], a1 = args[1], a2 = args[2];
         if (const_args[2]) {
-            int32_t l, h;
         do_addi_32:
-            l = (int16_t)a2;
-            h = a2 - l;
-            if (h) {
-                tcg_out32(s, ADDIS | TAI(a0, a1, h >> 16));
-                a1 = a0;
-            }
-            if (l || a0 != a1) {
-                tcg_out32(s, ADDI | TAI(a0, a1, l));
-            }
+            tcg_out_mem_long(s, ADDI, ADD, a0, a1, (int32_t)a2);
         } else {
             tcg_out32(s, ADD | TAB(a0, a1, a2));
         }
@@ -1607,32 +1684,33 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
         break;
 
     case INDEX_op_div_i32:
-        tcg_out32 (s, DIVW | TAB (args[0], args[1], args[2]));
+        tcg_out32(s, DIVW | TAB(args[0], args[1], args[2]));
         break;
 
     case INDEX_op_divu_i32:
-        tcg_out32 (s, DIVWU | TAB (args[0], args[1], args[2]));
+        tcg_out32(s, DIVWU | TAB(args[0], args[1], args[2]));
         break;
 
     case INDEX_op_shl_i32:
         if (const_args[2]) {
             tcg_out_rlw(s, RLWINM, args[0], args[1], args[2], 0, 31 - args[2]);
         } else {
-            tcg_out32 (s, SLW | SAB (args[1], args[0], args[2]));
+            tcg_out32(s, SLW | SAB(args[1], args[0], args[2]));
         }
         break;
     case INDEX_op_shr_i32:
         if (const_args[2]) {
             tcg_out_rlw(s, RLWINM, args[0], args[1], 32 - args[2], args[2], 31);
         } else {
-            tcg_out32 (s, SRW | SAB (args[1], args[0], args[2]));
+            tcg_out32(s, SRW | SAB(args[1], args[0], args[2]));
         }
         break;
     case INDEX_op_sar_i32:
-        if (const_args[2])
-            tcg_out32 (s, SRAWI | RS (args[1]) | RA (args[0]) | SH (args[2]));
-        else
-            tcg_out32 (s, SRAW | SAB (args[1], args[0], args[2]));
+        if (const_args[2]) {
+            tcg_out32(s, SRAWI | RS(args[1]) | RA(args[0]) | SH(args[2]));
+        } else {
+            tcg_out32(s, SRAW | SAB(args[1], args[0], args[2]));
+        }
         break;
     case INDEX_op_rotl_i32:
         if (const_args[2]) {
@@ -1646,8 +1724,8 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
         if (const_args[2]) {
             tcg_out_rlw(s, RLWINM, args[0], args[1], 32 - args[2], 0, 31);
         } else {
-            tcg_out32(s, SUBFIC | TAI(0, args[2], 32));
-            tcg_out32(s, RLWNM | SAB(args[1], args[0], 0)
+            tcg_out32(s, SUBFIC | TAI(TCG_REG_R0, args[2], 32));
+            tcg_out32(s, RLWNM | SAB(args[1], args[0], TCG_REG_R0)
                          | MB(0) | ME(31));
         }
         break;
@@ -1664,43 +1742,19 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
 
     case INDEX_op_neg_i32:
     case INDEX_op_neg_i64:
-        tcg_out32 (s, NEG | RT (args[0]) | RA (args[1]));
+        tcg_out32(s, NEG | RT(args[0]) | RA(args[1]));
         break;
 
     case INDEX_op_not_i32:
     case INDEX_op_not_i64:
-        tcg_out32 (s, NOR | SAB (args[1], args[0], args[1]));
+        tcg_out32(s, NOR | SAB(args[1], args[0], args[1]));
         break;
 
     case INDEX_op_add_i64:
         a0 = args[0], a1 = args[1], a2 = args[2];
         if (const_args[2]) {
-            int32_t l0, h1, h2;
         do_addi_64:
-            /* We can always split any 32-bit signed constant into 3 pieces.
-               Note the positive 0x80000000 coming from the sub_i64 path,
-               handled with the same code we need for eg 0x7fff8000.  */
-            assert(a2 == (int32_t)a2 || a2 == 0x80000000);
-            l0 = (int16_t)a2;
-            h1 = a2 - l0;
-            h2 = 0;
-            if (h1 < 0 && (int64_t)a2 > 0) {
-                h2 = 0x40000000;
-                h1 = a2 - h2 - l0;
-            }
-            assert((TCGArg)h2 + h1 + l0 == a2);
-
-            if (h2) {
-                tcg_out32(s, ADDIS | TAI(a0, a1, h2 >> 16));
-                a1 = a0;
-            }
-            if (h1) {
-                tcg_out32(s, ADDIS | TAI(a0, a1, h1 >> 16));
-                a1 = a0;
-            }
-            if (l0 || a0 != a1) {
-                tcg_out32(s, ADDI | TAI(a0, a1, l0));
-            }
+            tcg_out_mem_long(s, ADDI, ADD, a0, a1, a2);
         } else {
             tcg_out32(s, ADD | TAB(a0, a1, a2));
         }
@@ -1722,24 +1776,26 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
         break;
 
     case INDEX_op_shl_i64:
-        if (const_args[2])
+        if (const_args[2]) {
             tcg_out_shli64(s, args[0], args[1], args[2]);
-        else
-            tcg_out32 (s, SLD | SAB (args[1], args[0], args[2]));
+        } else {
+            tcg_out32(s, SLD | SAB(args[1], args[0], args[2]));
+        }
         break;
     case INDEX_op_shr_i64:
-        if (const_args[2])
+        if (const_args[2]) {
             tcg_out_shri64(s, args[0], args[1], args[2]);
-        else
-            tcg_out32 (s, SRD | SAB (args[1], args[0], args[2]));
+        } else {
+            tcg_out32(s, SRD | SAB(args[1], args[0], args[2]));
+        }
         break;
     case INDEX_op_sar_i64:
         if (const_args[2]) {
-            int sh = SH (args[2] & 0x1f) | (((args[2] >> 5) & 1) << 1);
-            tcg_out32 (s, SRADI | RA (args[0]) | RS (args[1]) | sh);
+            int sh = SH(args[2] & 0x1f) | (((args[2] >> 5) & 1) << 1);
+            tcg_out32(s, SRADI | RA(args[0]) | RS(args[1]) | sh);
+        } else {
+            tcg_out32(s, SRAD | SAB(args[1], args[0], args[2]));
         }
-        else
-            tcg_out32 (s, SRAD | SAB (args[1], args[0], args[2]));
         break;
     case INDEX_op_rotl_i64:
         if (const_args[2]) {
@@ -1752,8 +1808,8 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
         if (const_args[2]) {
             tcg_out_rld(s, RLDICL, args[0], args[1], 64 - args[2], 0);
         } else {
-            tcg_out32(s, SUBFIC | TAI(0, args[2], 64));
-            tcg_out32(s, RLDCL | SAB(args[1], args[0], 0) | MB64(0));
+            tcg_out32(s, SUBFIC | TAI(TCG_REG_R0, args[2], 64));
+            tcg_out32(s, RLDCL | SAB(args[1], args[0], TCG_REG_R0) | MB64(0));
         }
         break;
 
@@ -1766,45 +1822,19 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
         }
         break;
     case INDEX_op_div_i64:
-        tcg_out32 (s, DIVD | TAB (args[0], args[1], args[2]));
+        tcg_out32(s, DIVD | TAB(args[0], args[1], args[2]));
         break;
     case INDEX_op_divu_i64:
-        tcg_out32 (s, DIVDU | TAB (args[0], args[1], args[2]));
+        tcg_out32(s, DIVDU | TAB(args[0], args[1], args[2]));
         break;
 
-    case INDEX_op_qemu_ld8u:
-        tcg_out_qemu_ld (s, args, 0);
+    case INDEX_op_qemu_ld_i32:
+    case INDEX_op_qemu_ld_i64:
+        tcg_out_qemu_ld(s, args[0], args[1], args[2], args[3]);
         break;
-    case INDEX_op_qemu_ld8s:
-        tcg_out_qemu_ld (s, args, 0 | 4);
-        break;
-    case INDEX_op_qemu_ld16u:
-        tcg_out_qemu_ld (s, args, 1);
-        break;
-    case INDEX_op_qemu_ld16s:
-        tcg_out_qemu_ld (s, args, 1 | 4);
-        break;
-    case INDEX_op_qemu_ld32:
-    case INDEX_op_qemu_ld32u:
-        tcg_out_qemu_ld (s, args, 2);
-        break;
-    case INDEX_op_qemu_ld32s:
-        tcg_out_qemu_ld (s, args, 2 | 4);
-        break;
-    case INDEX_op_qemu_ld64:
-        tcg_out_qemu_ld (s, args, 3);
-        break;
-    case INDEX_op_qemu_st8:
-        tcg_out_qemu_st (s, args, 0);
-        break;
-    case INDEX_op_qemu_st16:
-        tcg_out_qemu_st (s, args, 1);
-        break;
-    case INDEX_op_qemu_st32:
-        tcg_out_qemu_st (s, args, 2);
-        break;
-    case INDEX_op_qemu_st64:
-        tcg_out_qemu_st (s, args, 3);
+    case INDEX_op_qemu_st_i32:
+    case INDEX_op_qemu_st_i64:
+        tcg_out_qemu_st(s, args[0], args[1], args[2], args[3]);
         break;
 
     case INDEX_op_ext8s_i32:
@@ -1819,16 +1849,16 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
         c = EXTSW;
         goto gen_ext;
     gen_ext:
-        tcg_out32 (s, c | RS (args[1]) | RA (args[0]));
+        tcg_out32(s, c | RS(args[1]) | RA(args[0]));
         break;
 
     case INDEX_op_setcond_i32:
-        tcg_out_setcond (s, TCG_TYPE_I32, args[3], args[0], args[1], args[2],
-                         const_args[2]);
+        tcg_out_setcond(s, TCG_TYPE_I32, args[3], args[0], args[1], args[2],
+                        const_args[2]);
         break;
     case INDEX_op_setcond_i64:
-        tcg_out_setcond (s, TCG_TYPE_I64, args[3], args[0], args[1], args[2],
-                         const_args[2]);
+        tcg_out_setcond(s, TCG_TYPE_I64, args[3], args[0], args[1], args[2],
+                        const_args[2]);
         break;
 
     case INDEX_op_bswap16_i32:
@@ -1870,9 +1900,9 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
         break;
 
     case INDEX_op_bswap64_i64:
-        a0 = args[0], a1 = args[1], a2 = 0;
+        a0 = args[0], a1 = args[1], a2 = TCG_REG_R0;
         if (a0 == a1) {
-            a0 = 0;
+            a0 = TCG_REG_R0;
             a2 = a1;
         }
 
@@ -1980,8 +2010,8 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
         break;
 
     default:
-        tcg_dump_ops (s);
-        tcg_abort ();
+        tcg_dump_ops(s);
+        tcg_abort();
     }
 }
 
@@ -2067,19 +2097,10 @@ static const TCGTargetOpDef ppc_op_defs[] = {
     { INDEX_op_neg_i64, { "r", "r" } },
     { INDEX_op_not_i64, { "r", "r" } },
 
-    { INDEX_op_qemu_ld8u, { "r", "L" } },
-    { INDEX_op_qemu_ld8s, { "r", "L" } },
-    { INDEX_op_qemu_ld16u, { "r", "L" } },
-    { INDEX_op_qemu_ld16s, { "r", "L" } },
-    { INDEX_op_qemu_ld32, { "r", "L" } },
-    { INDEX_op_qemu_ld32u, { "r", "L" } },
-    { INDEX_op_qemu_ld32s, { "r", "L" } },
-    { INDEX_op_qemu_ld64, { "r", "L" } },
-
-    { INDEX_op_qemu_st8, { "S", "S" } },
-    { INDEX_op_qemu_st16, { "S", "S" } },
-    { INDEX_op_qemu_st32, { "S", "S" } },
-    { INDEX_op_qemu_st64, { "S", "S" } },
+    { INDEX_op_qemu_ld_i32, { "r", "L" } },
+    { INDEX_op_qemu_ld_i64, { "r", "L" } },
+    { INDEX_op_qemu_st_i32, { "S", "S" } },
+    { INDEX_op_qemu_st_i64, { "S", "S" } },
 
     { INDEX_op_ext8s_i32, { "r", "r" } },
     { INDEX_op_ext16s_i32, { "r", "r" } },
@@ -2109,7 +2130,7 @@ static const TCGTargetOpDef ppc_op_defs[] = {
     { -1 },
 };
 
-static void tcg_target_init (TCGContext *s)
+static void tcg_target_init(TCGContext *s)
 {
 #ifdef CONFIG_GETAUXVAL
     unsigned long hwcap = getauxval(AT_HWCAP);
@@ -2118,13 +2139,11 @@ static void tcg_target_init (TCGContext *s)
     }
 #endif
 
-    tcg_regset_set32 (tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffffffff);
-    tcg_regset_set32 (tcg_target_available_regs[TCG_TYPE_I64], 0, 0xffffffff);
-    tcg_regset_set32 (tcg_target_call_clobber_regs, 0,
+    tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffffffff);
+    tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0, 0xffffffff);
+    tcg_regset_set32(tcg_target_call_clobber_regs, 0,
                      (1 << TCG_REG_R0) |
-#ifdef __APPLE__
                      (1 << TCG_REG_R2) |
-#endif
                      (1 << TCG_REG_R3) |
                      (1 << TCG_REG_R4) |
                      (1 << TCG_REG_R5) |
@@ -2134,16 +2153,65 @@ static void tcg_target_init (TCGContext *s)
                      (1 << TCG_REG_R9) |
                      (1 << TCG_REG_R10) |
                      (1 << TCG_REG_R11) |
-                     (1 << TCG_REG_R12)
-        );
+                     (1 << TCG_REG_R12));
 
-    tcg_regset_clear (s->reserved_regs);
-    tcg_regset_set_reg (s->reserved_regs, TCG_REG_R0);
-    tcg_regset_set_reg (s->reserved_regs, TCG_REG_R1);
-#ifndef __APPLE__
-    tcg_regset_set_reg (s->reserved_regs, TCG_REG_R2);
+    tcg_regset_clear(s->reserved_regs);
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_R0); /* tcg temp */
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_R1); /* stack pointer */
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_R2); /* mem temp */
+#ifdef __APPLE__
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_R11); /* ??? */
 #endif
-    tcg_regset_set_reg (s->reserved_regs, TCG_REG_R13);
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_R13); /* thread pointer */
+
+    tcg_add_target_add_op_defs(ppc_op_defs);
+}
+
+typedef struct {
+    DebugFrameCIE cie;
+    DebugFrameFDEHeader fde;
+    uint8_t fde_def_cfa[4];
+    uint8_t fde_reg_ofs[ARRAY_SIZE(tcg_target_callee_save_regs) * 2 + 3];
+} DebugFrame;
+
+/* We're expecting a 2 byte uleb128 encoded value.  */
+QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
+
+#define ELF_HOST_MACHINE EM_PPC64
+
+static DebugFrame debug_frame = {
+    .cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
+    .cie.id = -1,
+    .cie.version = 1,
+    .cie.code_align = 1,
+    .cie.data_align = 0x78,             /* sleb128 -8 */
+    .cie.return_column = 65,
+
+    /* Total FDE size does not include the "len" member.  */
+    .fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, fde.cie_offset),
+
+    .fde_def_cfa = {
+        12, 1,                          /* DW_CFA_def_cfa r1, ... */
+        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
+        (FRAME_SIZE >> 7)
+    },
+    .fde_reg_ofs = {
+        0x11, 65, 0x7e,                 /* DW_CFA_offset_extended_sf, lr, 16 */
+    }
+};
+
+void tcg_register_jit(void *buf, size_t buf_size)
+{
+    uint8_t *p = &debug_frame.fde_reg_ofs[3];
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); ++i, p += 2) {
+        p[0] = 0x80 + tcg_target_callee_save_regs[i];
+        p[1] = (FRAME_SIZE - (REG_SAVE_BOT + i * 8)) / 8;
+    }
+
+    debug_frame.fde.func_start = (tcg_target_long) buf;
+    debug_frame.fde.func_len = buf_size;
 
-    tcg_add_target_add_op_defs (ppc_op_defs);
+    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
 }
diff --git a/tcg/ppc64/tcg-target.h b/tcg/ppc64/tcg-target.h
index fa4b9da093..7ee50b6c6c 100644
--- a/tcg/ppc64/tcg-target.h
+++ b/tcg/ppc64/tcg-target.h
@@ -123,6 +123,8 @@ typedef enum {
 #define TCG_TARGET_HAS_muluh_i64        1
 #define TCG_TARGET_HAS_mulsh_i64        1
 
+#define TCG_TARGET_HAS_new_ldst         1
+
 #define TCG_AREG0 TCG_REG_R27
 
 #define TCG_TARGET_EXTEND_ARGS 1
diff --git a/tcg/s390/tcg-target.c b/tcg/s390/tcg-target.c
index 1b44aeee96..0a4f3be0e9 100644
--- a/tcg/s390/tcg-target.c
+++ b/tcg/s390/tcg-target.c
@@ -24,6 +24,8 @@
  * THE SOFTWARE.
  */
 
+#include "tcg-be-null.h"
+
 /* We only support generating code for 64-bit mode.  */
 #if TCG_TARGET_REG_BITS != 64
 #error "unsupported code generation mode"
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
index 6142fb26a2..10adb778c7 100644
--- a/tcg/s390/tcg-target.h
+++ b/tcg/s390/tcg-target.h
@@ -99,6 +99,8 @@ typedef enum TCGReg {
 #define TCG_TARGET_HAS_muluh_i64        0
 #define TCG_TARGET_HAS_mulsh_i64        0
 
+#define TCG_TARGET_HAS_new_ldst         0
+
 extern bool tcg_target_deposit_valid(int ofs, int len);
 #define TCG_TARGET_deposit_i32_valid  tcg_target_deposit_valid
 #define TCG_TARGET_deposit_i64_valid  tcg_target_deposit_valid
diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index 9574954ac4..cbd1c91779 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -22,6 +22,8 @@
  * THE SOFTWARE.
  */
 
+#include "tcg-be-null.h"
+
 #ifndef NDEBUG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
     "%g0",
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index 2edf858733..00f3a1848b 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -148,12 +148,14 @@ typedef enum {
 #define TCG_TARGET_HAS_mulsh_i64        0
 #endif
 
+#define TCG_TARGET_HAS_new_ldst         0
+
 #define TCG_AREG0 TCG_REG_I0
 
 static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
 {
     uintptr_t p;
-    for (p = start & -8; p < (stop + 7) & -8; p += 8) {
+    for (p = start & -8; p < ((stop + 7) & -8); p += 8) {
         __asm__ __volatile__("flush\t%0" : : "r" (p));
     }
 }
diff --git a/tcg/tcg-be-ldst.h b/tcg/tcg-be-ldst.h
new file mode 100644
index 0000000000..284db0c70d
--- /dev/null
+++ b/tcg/tcg-be-ldst.h
@@ -0,0 +1,90 @@
+/*
+ * TCG Backend Data: load-store optimization only.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifdef CONFIG_SOFTMMU
+#define TCG_MAX_QEMU_LDST       640
+
+typedef struct TCGLabelQemuLdst {
+    int is_ld:1;            /* qemu_ld: 1, qemu_st: 0 */
+    TCGMemOp opc:4;
+    TCGReg addrlo_reg;      /* reg index for low word of guest virtual addr */
+    TCGReg addrhi_reg;      /* reg index for high word of guest virtual addr */
+    TCGReg datalo_reg;      /* reg index for low word to be loaded or stored */
+    TCGReg datahi_reg;      /* reg index for high word to be loaded or stored */
+    int mem_index;          /* soft MMU memory index */
+    uint8_t *raddr;         /* gen code addr of the next IR of qemu_ld/st IR */
+    uint8_t *label_ptr[2];  /* label pointers to be updated */
+} TCGLabelQemuLdst;
+
+typedef struct TCGBackendData {
+    int nb_ldst_labels;
+    TCGLabelQemuLdst ldst_labels[TCG_MAX_QEMU_LDST];
+} TCGBackendData;
+
+
+/*
+ * Initialize TB backend data at the beginning of the TB.
+ */
+
+static inline void tcg_out_tb_init(TCGContext *s)
+{
+    s->be->nb_ldst_labels = 0;
+}
+
+/*
+ * Generate TB finalization at the end of block
+ */
+
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l);
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l);
+
+static void tcg_out_tb_finalize(TCGContext *s)
+{
+    TCGLabelQemuLdst *lb = s->be->ldst_labels;
+    int i, n = s->be->nb_ldst_labels;
+
+    /* qemu_ld/st slow paths */
+    for (i = 0; i < n; i++) {
+        if (lb[i].is_ld) {
+            tcg_out_qemu_ld_slow_path(s, lb + i);
+        } else {
+            tcg_out_qemu_st_slow_path(s, lb + i);
+        }
+    }
+}
+
+/*
+ * Allocate a new TCGLabelQemuLdst entry.
+ */
+
+static inline TCGLabelQemuLdst *new_ldst_label(TCGContext *s)
+{
+    TCGBackendData *be = s->be;
+    int n = be->nb_ldst_labels;
+
+    assert(n < TCG_MAX_QEMU_LDST);
+    be->nb_ldst_labels = n + 1;
+    return &be->ldst_labels[n];
+}
+#else
+#include "tcg-be-null.h"
+#endif /* CONFIG_SOFTMMU */
diff --git a/tcg/tcg-be-null.h b/tcg/tcg-be-null.h
new file mode 100644
index 0000000000..74c57d5a6c
--- /dev/null
+++ b/tcg/tcg-be-null.h
@@ -0,0 +1,43 @@
+/*
+ * TCG Backend Data: No backend data
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+typedef struct TCGBackendData {
+    /* Empty */
+    char dummy;
+} TCGBackendData;
+
+
+/*
+ * Initialize TB backend data at the beginning of the TB.
+ */
+
+static inline void tcg_out_tb_init(TCGContext *s)
+{
+}
+
+/*
+ * Generate TB finalization at the end of block
+ */
+
+static inline void tcg_out_tb_finalize(TCGContext *s)
+{
+}
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index bb30a7cf39..7eabf22f01 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -137,24 +137,6 @@ static inline void tcg_gen_ldst_op_i64(TCGOpcode opc, TCGv_i64 val,
     *tcg_ctx.gen_opparam_ptr++ = offset;
 }
 
-static inline void tcg_gen_qemu_ldst_op_i64_i32(TCGOpcode opc, TCGv_i64 val,
-                                                TCGv_i32 addr, TCGArg mem_index)
-{
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(val);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(addr);
-    *tcg_ctx.gen_opparam_ptr++ = mem_index;
-}
-
-static inline void tcg_gen_qemu_ldst_op_i64_i64(TCGOpcode opc, TCGv_i64 val,
-                                                TCGv_i64 addr, TCGArg mem_index)
-{
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(val);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(addr);
-    *tcg_ctx.gen_opparam_ptr++ = mem_index;
-}
-
 static inline void tcg_gen_op4_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
                                    TCGv_i32 arg3, TCGv_i32 arg4)
 {
@@ -361,6 +343,21 @@ static inline void tcg_gen_op6ii_i64(TCGOpcode opc, TCGv_i64 arg1,
     *tcg_ctx.gen_opparam_ptr++ = arg6;
 }
 
+static inline void tcg_add_param_i32(TCGv_i32 val)
+{
+    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(val);
+}
+
+static inline void tcg_add_param_i64(TCGv_i64 val)
+{
+#if TCG_TARGET_REG_BITS == 32
+    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(TCGV_LOW(val));
+    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(TCGV_HIGH(val));
+#else
+    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(val);
+#endif
+}
+
 static inline void gen_set_label(int n)
 {
     tcg_gen_op1i(INDEX_op_set_label, n);
@@ -2600,11 +2597,12 @@ static inline void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh,
 #define tcg_global_mem_new tcg_global_mem_new_i32
 #define tcg_temp_local_new() tcg_temp_local_new_i32()
 #define tcg_temp_free tcg_temp_free_i32
-#define tcg_gen_qemu_ldst_op tcg_gen_op3i_i32
-#define tcg_gen_qemu_ldst_op_i64 tcg_gen_qemu_ldst_op_i64_i32
 #define TCGV_UNUSED(x) TCGV_UNUSED_I32(x)
 #define TCGV_IS_UNUSED(x) TCGV_IS_UNUSED_I32(x)
 #define TCGV_EQUAL(a, b) TCGV_EQUAL_I32(a, b)
+#define tcg_add_param_tl tcg_add_param_i32
+#define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i32
+#define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i32
 #else
 #define TCGv TCGv_i64
 #define tcg_temp_new() tcg_temp_new_i64()
@@ -2612,11 +2610,12 @@ static inline void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh,
 #define tcg_global_mem_new tcg_global_mem_new_i64
 #define tcg_temp_local_new() tcg_temp_local_new_i64()
 #define tcg_temp_free tcg_temp_free_i64
-#define tcg_gen_qemu_ldst_op tcg_gen_op3i_i64
-#define tcg_gen_qemu_ldst_op_i64 tcg_gen_qemu_ldst_op_i64_i64
 #define TCGV_UNUSED(x) TCGV_UNUSED_I64(x)
 #define TCGV_IS_UNUSED(x) TCGV_IS_UNUSED_I64(x)
 #define TCGV_EQUAL(a, b) TCGV_EQUAL_I64(a, b)
+#define tcg_add_param_tl tcg_add_param_i64
+#define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i64
+#define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i64
 #endif
 
 /* debug info: write the PC of the corresponding QEMU CPU instruction */
@@ -2648,197 +2647,67 @@ static inline void tcg_gen_goto_tb(unsigned idx)
     tcg_gen_op1i(INDEX_op_goto_tb, idx);
 }
 
-#if TCG_TARGET_REG_BITS == 32
-static inline void tcg_gen_qemu_ld8u(TCGv ret, TCGv addr, int mem_index)
-{
-#if TARGET_LONG_BITS == 32
-    tcg_gen_op3i_i32(INDEX_op_qemu_ld8u, ret, addr, mem_index);
-#else
-    tcg_gen_op4i_i32(INDEX_op_qemu_ld8u, TCGV_LOW(ret), TCGV_LOW(addr),
-                     TCGV_HIGH(addr), mem_index);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-#endif
-}
-
-static inline void tcg_gen_qemu_ld8s(TCGv ret, TCGv addr, int mem_index)
-{
-#if TARGET_LONG_BITS == 32
-    tcg_gen_op3i_i32(INDEX_op_qemu_ld8s, ret, addr, mem_index);
-#else
-    tcg_gen_op4i_i32(INDEX_op_qemu_ld8s, TCGV_LOW(ret), TCGV_LOW(addr),
-                     TCGV_HIGH(addr), mem_index);
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-#endif
-}
 
-static inline void tcg_gen_qemu_ld16u(TCGv ret, TCGv addr, int mem_index)
-{
-#if TARGET_LONG_BITS == 32
-    tcg_gen_op3i_i32(INDEX_op_qemu_ld16u, ret, addr, mem_index);
-#else
-    tcg_gen_op4i_i32(INDEX_op_qemu_ld16u, TCGV_LOW(ret), TCGV_LOW(addr),
-                     TCGV_HIGH(addr), mem_index);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-#endif
-}
-
-static inline void tcg_gen_qemu_ld16s(TCGv ret, TCGv addr, int mem_index)
-{
-#if TARGET_LONG_BITS == 32
-    tcg_gen_op3i_i32(INDEX_op_qemu_ld16s, ret, addr, mem_index);
-#else
-    tcg_gen_op4i_i32(INDEX_op_qemu_ld16s, TCGV_LOW(ret), TCGV_LOW(addr),
-                     TCGV_HIGH(addr), mem_index);
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-#endif
-}
-
-static inline void tcg_gen_qemu_ld32u(TCGv ret, TCGv addr, int mem_index)
-{
-#if TARGET_LONG_BITS == 32
-    tcg_gen_op3i_i32(INDEX_op_qemu_ld32, ret, addr, mem_index);
-#else
-    tcg_gen_op4i_i32(INDEX_op_qemu_ld32, TCGV_LOW(ret), TCGV_LOW(addr),
-                     TCGV_HIGH(addr), mem_index);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-#endif
-}
-
-static inline void tcg_gen_qemu_ld32s(TCGv ret, TCGv addr, int mem_index)
-{
-#if TARGET_LONG_BITS == 32
-    tcg_gen_op3i_i32(INDEX_op_qemu_ld32, ret, addr, mem_index);
-#else
-    tcg_gen_op4i_i32(INDEX_op_qemu_ld32, TCGV_LOW(ret), TCGV_LOW(addr),
-                     TCGV_HIGH(addr), mem_index);
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-#endif
-}
-
-static inline void tcg_gen_qemu_ld64(TCGv_i64 ret, TCGv addr, int mem_index)
-{
-#if TARGET_LONG_BITS == 32
-    tcg_gen_op4i_i32(INDEX_op_qemu_ld64, TCGV_LOW(ret), TCGV_HIGH(ret), addr, mem_index);
-#else
-    tcg_gen_op5i_i32(INDEX_op_qemu_ld64, TCGV_LOW(ret), TCGV_HIGH(ret),
-                     TCGV_LOW(addr), TCGV_HIGH(addr), mem_index);
-#endif
-}
-
-static inline void tcg_gen_qemu_st8(TCGv arg, TCGv addr, int mem_index)
-{
-#if TARGET_LONG_BITS == 32
-    tcg_gen_op3i_i32(INDEX_op_qemu_st8, arg, addr, mem_index);
-#else
-    tcg_gen_op4i_i32(INDEX_op_qemu_st8, TCGV_LOW(arg), TCGV_LOW(addr),
-                     TCGV_HIGH(addr), mem_index);
-#endif
-}
-
-static inline void tcg_gen_qemu_st16(TCGv arg, TCGv addr, int mem_index)
-{
-#if TARGET_LONG_BITS == 32
-    tcg_gen_op3i_i32(INDEX_op_qemu_st16, arg, addr, mem_index);
-#else
-    tcg_gen_op4i_i32(INDEX_op_qemu_st16, TCGV_LOW(arg), TCGV_LOW(addr),
-                     TCGV_HIGH(addr), mem_index);
-#endif
-}
-
-static inline void tcg_gen_qemu_st32(TCGv arg, TCGv addr, int mem_index)
-{
-#if TARGET_LONG_BITS == 32
-    tcg_gen_op3i_i32(INDEX_op_qemu_st32, arg, addr, mem_index);
-#else
-    tcg_gen_op4i_i32(INDEX_op_qemu_st32, TCGV_LOW(arg), TCGV_LOW(addr),
-                     TCGV_HIGH(addr), mem_index);
-#endif
-}
-
-static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
-{
-#if TARGET_LONG_BITS == 32
-    tcg_gen_op4i_i32(INDEX_op_qemu_st64, TCGV_LOW(arg), TCGV_HIGH(arg), addr,
-                     mem_index);
-#else
-    tcg_gen_op5i_i32(INDEX_op_qemu_st64, TCGV_LOW(arg), TCGV_HIGH(arg),
-                     TCGV_LOW(addr), TCGV_HIGH(addr), mem_index);
-#endif
-}
-
-#define tcg_gen_ld_ptr(R, A, O) tcg_gen_ld_i32(TCGV_PTR_TO_NAT(R), (A), (O))
-#define tcg_gen_discard_ptr(A) tcg_gen_discard_i32(TCGV_PTR_TO_NAT(A))
-
-#else /* TCG_TARGET_REG_BITS == 32 */
+void tcg_gen_qemu_ld_i32(TCGv_i32, TCGv, TCGArg, TCGMemOp);
+void tcg_gen_qemu_st_i32(TCGv_i32, TCGv, TCGArg, TCGMemOp);
+void tcg_gen_qemu_ld_i64(TCGv_i64, TCGv, TCGArg, TCGMemOp);
+void tcg_gen_qemu_st_i64(TCGv_i64, TCGv, TCGArg, TCGMemOp);
 
 static inline void tcg_gen_qemu_ld8u(TCGv ret, TCGv addr, int mem_index)
 {
-    tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld8u, ret, addr, mem_index);
+    tcg_gen_qemu_ld_tl(ret, addr, mem_index, MO_UB);
 }
 
 static inline void tcg_gen_qemu_ld8s(TCGv ret, TCGv addr, int mem_index)
 {
-    tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld8s, ret, addr, mem_index);
+    tcg_gen_qemu_ld_tl(ret, addr, mem_index, MO_SB);
 }
 
 static inline void tcg_gen_qemu_ld16u(TCGv ret, TCGv addr, int mem_index)
 {
-    tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld16u, ret, addr, mem_index);
+    tcg_gen_qemu_ld_tl(ret, addr, mem_index, MO_TEUW);
 }
 
 static inline void tcg_gen_qemu_ld16s(TCGv ret, TCGv addr, int mem_index)
 {
-    tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld16s, ret, addr, mem_index);
+    tcg_gen_qemu_ld_tl(ret, addr, mem_index, MO_TESW);
 }
 
 static inline void tcg_gen_qemu_ld32u(TCGv ret, TCGv addr, int mem_index)
 {
-#if TARGET_LONG_BITS == 32
-    tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld32, ret, addr, mem_index);
-#else
-    tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld32u, ret, addr, mem_index);
-#endif
+    tcg_gen_qemu_ld_tl(ret, addr, mem_index, MO_TEUL);
 }
 
 static inline void tcg_gen_qemu_ld32s(TCGv ret, TCGv addr, int mem_index)
 {
-#if TARGET_LONG_BITS == 32
-    tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld32, ret, addr, mem_index);
-#else
-    tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld32s, ret, addr, mem_index);
-#endif
+    tcg_gen_qemu_ld_tl(ret, addr, mem_index, MO_TESL);
 }
 
 static inline void tcg_gen_qemu_ld64(TCGv_i64 ret, TCGv addr, int mem_index)
 {
-    tcg_gen_qemu_ldst_op_i64(INDEX_op_qemu_ld64, ret, addr, mem_index);
+    tcg_gen_qemu_ld_i64(ret, addr, mem_index, MO_TEQ);
 }
 
 static inline void tcg_gen_qemu_st8(TCGv arg, TCGv addr, int mem_index)
 {
-    tcg_gen_qemu_ldst_op(INDEX_op_qemu_st8, arg, addr, mem_index);
+    tcg_gen_qemu_st_tl(arg, addr, mem_index, MO_UB);
 }
 
 static inline void tcg_gen_qemu_st16(TCGv arg, TCGv addr, int mem_index)
 {
-    tcg_gen_qemu_ldst_op(INDEX_op_qemu_st16, arg, addr, mem_index);
+    tcg_gen_qemu_st_tl(arg, addr, mem_index, MO_TEUW);
 }
 
 static inline void tcg_gen_qemu_st32(TCGv arg, TCGv addr, int mem_index)
 {
-    tcg_gen_qemu_ldst_op(INDEX_op_qemu_st32, arg, addr, mem_index);
+    tcg_gen_qemu_st_tl(arg, addr, mem_index, MO_TEUL);
 }
 
 static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
 {
-    tcg_gen_qemu_ldst_op_i64(INDEX_op_qemu_st64, arg, addr, mem_index);
+    tcg_gen_qemu_st_i64(arg, addr, mem_index, MO_TEQ);
 }
 
-#define tcg_gen_ld_ptr(R, A, O) tcg_gen_ld_i64(TCGV_PTR_TO_NAT(R), (A), (O))
-#define tcg_gen_discard_ptr(A) tcg_gen_discard_i64(TCGV_PTR_TO_NAT(A))
-
-#endif /* TCG_TARGET_REG_BITS != 32 */
-
 #if TARGET_LONG_BITS == 64
 #define tcg_gen_movi_tl tcg_gen_movi_i64
 #define tcg_gen_mov_tl tcg_gen_mov_i64
@@ -2997,17 +2866,25 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
 #endif
 
 #if TCG_TARGET_REG_BITS == 32
-#define tcg_gen_add_ptr(R, A, B) tcg_gen_add_i32(TCGV_PTR_TO_NAT(R), \
-                                               TCGV_PTR_TO_NAT(A), \
-                                               TCGV_PTR_TO_NAT(B))
-#define tcg_gen_addi_ptr(R, A, B) tcg_gen_addi_i32(TCGV_PTR_TO_NAT(R), \
-                                                 TCGV_PTR_TO_NAT(A), (B))
-#define tcg_gen_ext_i32_ptr(R, A) tcg_gen_mov_i32(TCGV_PTR_TO_NAT(R), (A))
-#else /* TCG_TARGET_REG_BITS == 32 */
-#define tcg_gen_add_ptr(R, A, B) tcg_gen_add_i64(TCGV_PTR_TO_NAT(R), \
-                                               TCGV_PTR_TO_NAT(A), \
-                                               TCGV_PTR_TO_NAT(B))
-#define tcg_gen_addi_ptr(R, A, B) tcg_gen_addi_i64(TCGV_PTR_TO_NAT(R),   \
-                                                 TCGV_PTR_TO_NAT(A), (B))
-#define tcg_gen_ext_i32_ptr(R, A) tcg_gen_ext_i32_i64(TCGV_PTR_TO_NAT(R), (A))
-#endif /* TCG_TARGET_REG_BITS != 32 */
+# define tcg_gen_ld_ptr(R, A, O) \
+    tcg_gen_ld_i32(TCGV_PTR_TO_NAT(R), (A), (O))
+# define tcg_gen_discard_ptr(A) \
+    tcg_gen_discard_i32(TCGV_PTR_TO_NAT(A))
+# define tcg_gen_add_ptr(R, A, B) \
+    tcg_gen_add_i32(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), TCGV_PTR_TO_NAT(B))
+# define tcg_gen_addi_ptr(R, A, B) \
+    tcg_gen_addi_i32(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), (B))
+# define tcg_gen_ext_i32_ptr(R, A) \
+    tcg_gen_mov_i32(TCGV_PTR_TO_NAT(R), (A))
+#else
+# define tcg_gen_ld_ptr(R, A, O) \
+    tcg_gen_ld_i64(TCGV_PTR_TO_NAT(R), (A), (O))
+# define tcg_gen_discard_ptr(A) \
+    tcg_gen_discard_i64(TCGV_PTR_TO_NAT(A))
+# define tcg_gen_add_ptr(R, A, B) \
+    tcg_gen_add_i64(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), TCGV_PTR_TO_NAT(B))
+# define tcg_gen_addi_ptr(R, A, B) \
+    tcg_gen_addi_i64(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), (B))
+# define tcg_gen_ext_i32_ptr(R, A) \
+    tcg_gen_ext_i32_i64(TCGV_PTR_TO_NAT(R), (A))
+#endif /* TCG_TARGET_REG_BITS == 32 */
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index a75c29d518..d71707d9bb 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -180,79 +180,107 @@ DEF(debug_insn_start, 0, 0, 1, TCG_OPF_NOT_PRESENT)
 #endif
 DEF(exit_tb, 0, 0, 1, TCG_OPF_BB_END)
 DEF(goto_tb, 0, 0, 1, TCG_OPF_BB_END)
-/* Note: even if TARGET_LONG_BITS is not defined, the INDEX_op
-   constants must be defined */
+
+#define IMPL_NEW_LDST \
+    (TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS \
+     | IMPL(TCG_TARGET_HAS_new_ldst))
+
+#if TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
+DEF(qemu_ld_i32, 1, 1, 2, IMPL_NEW_LDST)
+DEF(qemu_st_i32, 0, 2, 2, IMPL_NEW_LDST)
+# if TCG_TARGET_REG_BITS == 64
+DEF(qemu_ld_i64, 1, 1, 2, IMPL_NEW_LDST | TCG_OPF_64BIT)
+DEF(qemu_st_i64, 0, 2, 2, IMPL_NEW_LDST | TCG_OPF_64BIT)
+# else
+DEF(qemu_ld_i64, 2, 1, 2, IMPL_NEW_LDST | TCG_OPF_64BIT)
+DEF(qemu_st_i64, 0, 3, 2, IMPL_NEW_LDST | TCG_OPF_64BIT)
+# endif
+#else
+DEF(qemu_ld_i32, 1, 2, 2, IMPL_NEW_LDST)
+DEF(qemu_st_i32, 0, 3, 2, IMPL_NEW_LDST)
+DEF(qemu_ld_i64, 2, 2, 2, IMPL_NEW_LDST | TCG_OPF_64BIT)
+DEF(qemu_st_i64, 0, 4, 2, IMPL_NEW_LDST | TCG_OPF_64BIT)
+#endif
+
+#undef IMPL_NEW_LDST
+
+#define IMPL_OLD_LDST \
+    (TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS \
+     | IMPL(!TCG_TARGET_HAS_new_ldst))
+
 #if TCG_TARGET_REG_BITS == 32
 #if TARGET_LONG_BITS == 32
-DEF(qemu_ld8u, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld8u, 1, 1, 1, IMPL_OLD_LDST)
 #else
-DEF(qemu_ld8u, 1, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld8u, 1, 2, 1, IMPL_OLD_LDST)
 #endif
 #if TARGET_LONG_BITS == 32
-DEF(qemu_ld8s, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld8s, 1, 1, 1, IMPL_OLD_LDST)
 #else
-DEF(qemu_ld8s, 1, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld8s, 1, 2, 1, IMPL_OLD_LDST)
 #endif
 #if TARGET_LONG_BITS == 32
-DEF(qemu_ld16u, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld16u, 1, 1, 1, IMPL_OLD_LDST)
 #else
-DEF(qemu_ld16u, 1, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld16u, 1, 2, 1, IMPL_OLD_LDST)
 #endif
 #if TARGET_LONG_BITS == 32
-DEF(qemu_ld16s, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld16s, 1, 1, 1, IMPL_OLD_LDST)
 #else
-DEF(qemu_ld16s, 1, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld16s, 1, 2, 1, IMPL_OLD_LDST)
 #endif
 #if TARGET_LONG_BITS == 32
-DEF(qemu_ld32, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld32, 1, 1, 1, IMPL_OLD_LDST)
 #else
-DEF(qemu_ld32, 1, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld32, 1, 2, 1, IMPL_OLD_LDST)
 #endif
 #if TARGET_LONG_BITS == 32
-DEF(qemu_ld64, 2, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld64, 2, 1, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
 #else
-DEF(qemu_ld64, 2, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld64, 2, 2, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
 #endif
 
 #if TARGET_LONG_BITS == 32
-DEF(qemu_st8, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_st8, 0, 2, 1, IMPL_OLD_LDST)
 #else
-DEF(qemu_st8, 0, 3, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_st8, 0, 3, 1, IMPL_OLD_LDST)
 #endif
 #if TARGET_LONG_BITS == 32
-DEF(qemu_st16, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_st16, 0, 2, 1, IMPL_OLD_LDST)
 #else
-DEF(qemu_st16, 0, 3, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_st16, 0, 3, 1, IMPL_OLD_LDST)
 #endif
 #if TARGET_LONG_BITS == 32
-DEF(qemu_st32, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_st32, 0, 2, 1, IMPL_OLD_LDST)
 #else
-DEF(qemu_st32, 0, 3, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_st32, 0, 3, 1, IMPL_OLD_LDST)
 #endif
 #if TARGET_LONG_BITS == 32
-DEF(qemu_st64, 0, 3, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_st64, 0, 3, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
 #else
-DEF(qemu_st64, 0, 4, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_st64, 0, 4, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
 #endif
 
 #else /* TCG_TARGET_REG_BITS == 32 */
 
-DEF(qemu_ld8u, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_ld8s, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_ld16u, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_ld16s, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_ld32, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_ld32u, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_ld32s, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_ld64, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_ld8u, 1, 1, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
+DEF(qemu_ld8s, 1, 1, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
+DEF(qemu_ld16u, 1, 1, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
+DEF(qemu_ld16s, 1, 1, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
+DEF(qemu_ld32, 1, 1, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
+DEF(qemu_ld32u, 1, 1, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
+DEF(qemu_ld32s, 1, 1, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
+DEF(qemu_ld64, 1, 1, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
 
-DEF(qemu_st8, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_st16, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_st32, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_st64, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
+DEF(qemu_st8, 0, 2, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
+DEF(qemu_st16, 0, 2, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
+DEF(qemu_st32, 0, 2, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
+DEF(qemu_st64, 0, 2, 1, IMPL_OLD_LDST | TCG_OPF_64BIT)
 
 #endif /* TCG_TARGET_REG_BITS != 32 */
 
+#undef IMPL_OLD_LDST
+
 #undef IMPL
 #undef IMPL64
 #undef DEF
diff --git a/tcg/tcg.c b/tcg/tcg.c
index fd7fb6b85e..66d3f3de80 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -103,6 +103,9 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1,
                        intptr_t arg2);
 static int tcg_target_const_match(tcg_target_long val,
                                   const TCGArgConstraint *arg_ct);
+static void tcg_out_tb_init(TCGContext *s);
+static void tcg_out_tb_finalize(TCGContext *s);
+
 
 TCGOpDef tcg_op_defs[] = {
 #define DEF(s, oargs, iargs, cargs, flags) { #s, oargs, iargs, cargs, iargs + oargs + cargs, flags },
@@ -254,12 +257,41 @@ void tcg_pool_reset(TCGContext *s)
     s->pool_current = NULL;
 }
 
+#include "helper.h"
+
+typedef struct TCGHelperInfo {
+    void *func;
+    const char *name;
+} TCGHelperInfo;
+
+static const TCGHelperInfo all_helpers[] = {
+#define GEN_HELPER 2
+#include "helper.h"
+
+    /* Include tcg-runtime.c functions.  */
+    { tcg_helper_div_i32, "div_i32" },
+    { tcg_helper_rem_i32, "rem_i32" },
+    { tcg_helper_divu_i32, "divu_i32" },
+    { tcg_helper_remu_i32, "remu_i32" },
+
+    { tcg_helper_shl_i64, "shl_i64" },
+    { tcg_helper_shr_i64, "shr_i64" },
+    { tcg_helper_sar_i64, "sar_i64" },
+    { tcg_helper_div_i64, "div_i64" },
+    { tcg_helper_rem_i64, "rem_i64" },
+    { tcg_helper_divu_i64, "divu_i64" },
+    { tcg_helper_remu_i64, "remu_i64" },
+    { tcg_helper_mulsh_i64, "mulsh_i64" },
+    { tcg_helper_muluh_i64, "muluh_i64" },
+};
+
 void tcg_context_init(TCGContext *s)
 {
-    int op, total_args, n;
+    int op, total_args, n, i;
     TCGOpDef *def;
     TCGArgConstraint *args_ct;
     int *sorted_args;
+    GHashTable *helper_table;
 
     memset(s, 0, sizeof(*s));
     s->nb_globals = 0;
@@ -284,7 +316,16 @@ void tcg_context_init(TCGContext *s)
         sorted_args += n;
         args_ct += n;
     }
-    
+
+    /* Register helpers.  */
+    /* Use g_direct_hash/equal for direct pointer comparisons on func.  */
+    s->helpers = helper_table = g_hash_table_new(NULL, NULL);
+
+    for (i = 0; i < ARRAY_SIZE(all_helpers); ++i) {
+        g_hash_table_insert(helper_table, (gpointer)all_helpers[i].func,
+                            (gpointer)all_helpers[i].name);
+    }
+
     tcg_target_init(s);
 }
 
@@ -332,13 +373,7 @@ void tcg_func_start(TCGContext *s)
     s->gen_opc_ptr = s->gen_opc_buf;
     s->gen_opparam_ptr = s->gen_opparam_buf;
 
-#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
-    /* Initialize qemu_ld/st labels to assist code generation at the end of TB
-       for TLB miss cases at the end of TB */
-    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) *
-                                     TCG_MAX_QEMU_LDST);
-    s->nb_qemu_ldst_labels = 0;
-#endif
+    s->be = tcg_malloc(sizeof(TCGBackendData));
 }
 
 static inline void tcg_temp_alloc(TCGContext *s, int n)
@@ -620,25 +655,6 @@ int tcg_check_temp_count(void)
 }
 #endif
 
-void tcg_register_helper(void *func, const char *name)
-{
-    TCGContext *s = &tcg_ctx;
-    int n;
-    if ((s->nb_helpers + 1) > s->allocated_helpers) {
-        n = s->allocated_helpers;
-        if (n == 0) {
-            n = 4;
-        } else {
-            n *= 2;
-        }
-        s->helpers = realloc(s->helpers, n * sizeof(TCGHelperInfo));
-        s->allocated_helpers = n;
-    }
-    s->helpers[s->nb_helpers].func = (uintptr_t)func;
-    s->helpers[s->nb_helpers].name = name;
-    s->nb_helpers++;
-}
-
 /* Note: we convert the 64 bit args to 32 bit and do some alignment
    and endian swap. Maybe it would be better to do the alignment
    and endian swap in tcg_reg_alloc_call(). */
@@ -795,6 +811,188 @@ void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1,
 }
 #endif
 
+static inline TCGMemOp tcg_canonicalize_memop(TCGMemOp op, bool is64, bool st)
+{
+    switch (op & MO_SIZE) {
+    case MO_8:
+        op &= ~MO_BSWAP;
+        break;
+    case MO_16:
+        break;
+    case MO_32:
+        if (!is64) {
+            op &= ~MO_SIGN;
+        }
+        break;
+    case MO_64:
+        if (!is64) {
+            tcg_abort();
+        }
+        break;
+    }
+    if (st) {
+        op &= ~MO_SIGN;
+    }
+    return op;
+}
+
+static const TCGOpcode old_ld_opc[8] = {
+    [MO_UB] = INDEX_op_qemu_ld8u,
+    [MO_SB] = INDEX_op_qemu_ld8s,
+    [MO_UW] = INDEX_op_qemu_ld16u,
+    [MO_SW] = INDEX_op_qemu_ld16s,
+#if TCG_TARGET_REG_BITS == 32
+    [MO_UL] = INDEX_op_qemu_ld32,
+    [MO_SL] = INDEX_op_qemu_ld32,
+#else
+    [MO_UL] = INDEX_op_qemu_ld32u,
+    [MO_SL] = INDEX_op_qemu_ld32s,
+#endif
+    [MO_Q]  = INDEX_op_qemu_ld64,
+};
+
+static const TCGOpcode old_st_opc[4] = {
+    [MO_UB] = INDEX_op_qemu_st8,
+    [MO_UW] = INDEX_op_qemu_st16,
+    [MO_UL] = INDEX_op_qemu_st32,
+    [MO_Q]  = INDEX_op_qemu_st64,
+};
+
+void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
+{
+    memop = tcg_canonicalize_memop(memop, 0, 0);
+
+    if (TCG_TARGET_HAS_new_ldst) {
+        *tcg_ctx.gen_opc_ptr++ = INDEX_op_qemu_ld_i32;
+        tcg_add_param_i32(val);
+        tcg_add_param_tl(addr);
+        *tcg_ctx.gen_opparam_ptr++ = memop;
+        *tcg_ctx.gen_opparam_ptr++ = idx;
+        return;
+    }
+
+    /* The old opcodes only support target-endian memory operations.  */
+    assert((memop & MO_BSWAP) == MO_TE || (memop & MO_SIZE) == MO_8);
+    assert(old_ld_opc[memop & MO_SSIZE] != 0);
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        *tcg_ctx.gen_opc_ptr++ = old_ld_opc[memop & MO_SSIZE];
+        tcg_add_param_i32(val);
+        tcg_add_param_tl(addr);
+        *tcg_ctx.gen_opparam_ptr++ = idx;
+    } else {
+        TCGv_i64 val64 = tcg_temp_new_i64();
+
+        *tcg_ctx.gen_opc_ptr++ = old_ld_opc[memop & MO_SSIZE];
+        tcg_add_param_i64(val64);
+        tcg_add_param_tl(addr);
+        *tcg_ctx.gen_opparam_ptr++ = idx;
+
+        tcg_gen_trunc_i64_i32(val, val64);
+        tcg_temp_free_i64(val64);
+    }
+}
+
+void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
+{
+    memop = tcg_canonicalize_memop(memop, 0, 1);
+
+    if (TCG_TARGET_HAS_new_ldst) {
+        *tcg_ctx.gen_opc_ptr++ = INDEX_op_qemu_st_i32;
+        tcg_add_param_i32(val);
+        tcg_add_param_tl(addr);
+        *tcg_ctx.gen_opparam_ptr++ = memop;
+        *tcg_ctx.gen_opparam_ptr++ = idx;
+        return;
+    }
+
+    /* The old opcodes only support target-endian memory operations.  */
+    assert((memop & MO_BSWAP) == MO_TE || (memop & MO_SIZE) == MO_8);
+    assert(old_st_opc[memop & MO_SIZE] != 0);
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        *tcg_ctx.gen_opc_ptr++ = old_st_opc[memop & MO_SIZE];
+        tcg_add_param_i32(val);
+        tcg_add_param_tl(addr);
+        *tcg_ctx.gen_opparam_ptr++ = idx;
+    } else {
+        TCGv_i64 val64 = tcg_temp_new_i64();
+
+        tcg_gen_extu_i32_i64(val64, val);
+
+        *tcg_ctx.gen_opc_ptr++ = old_st_opc[memop & MO_SIZE];
+        tcg_add_param_i64(val64);
+        tcg_add_param_tl(addr);
+        *tcg_ctx.gen_opparam_ptr++ = idx;
+
+        tcg_temp_free_i64(val64);
+    }
+}
+
+void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
+{
+    memop = tcg_canonicalize_memop(memop, 1, 0);
+
+#if TCG_TARGET_REG_BITS == 32
+    if ((memop & MO_SIZE) < MO_64) {
+        tcg_gen_qemu_ld_i32(TCGV_LOW(val), addr, idx, memop);
+        if (memop & MO_SIGN) {
+            tcg_gen_sari_i32(TCGV_HIGH(val), TCGV_LOW(val), 31);
+        } else {
+            tcg_gen_movi_i32(TCGV_HIGH(val), 0);
+        }
+        return;
+    }
+#endif
+
+    if (TCG_TARGET_HAS_new_ldst) {
+        *tcg_ctx.gen_opc_ptr++ = INDEX_op_qemu_ld_i64;
+        tcg_add_param_i64(val);
+        tcg_add_param_tl(addr);
+        *tcg_ctx.gen_opparam_ptr++ = memop;
+        *tcg_ctx.gen_opparam_ptr++ = idx;
+        return;
+    }
+
+    /* The old opcodes only support target-endian memory operations.  */
+    assert((memop & MO_BSWAP) == MO_TE || (memop & MO_SIZE) == MO_8);
+    assert(old_ld_opc[memop & MO_SSIZE] != 0);
+
+    *tcg_ctx.gen_opc_ptr++ = old_ld_opc[memop & MO_SSIZE];
+    tcg_add_param_i64(val);
+    tcg_add_param_tl(addr);
+    *tcg_ctx.gen_opparam_ptr++ = idx;
+}
+
+void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
+{
+    memop = tcg_canonicalize_memop(memop, 1, 1);
+
+#if TCG_TARGET_REG_BITS == 32
+    if ((memop & MO_SIZE) < MO_64) {
+        tcg_gen_qemu_st_i32(TCGV_LOW(val), addr, idx, memop);
+        return;
+    }
+#endif
+
+    if (TCG_TARGET_HAS_new_ldst) {
+        *tcg_ctx.gen_opc_ptr++ = INDEX_op_qemu_st_i64;
+        tcg_add_param_i64(val);
+        tcg_add_param_tl(addr);
+        *tcg_ctx.gen_opparam_ptr++ = memop;
+        *tcg_ctx.gen_opparam_ptr++ = idx;
+        return;
+    }
+
+    /* The old opcodes only support target-endian memory operations.  */
+    assert((memop & MO_BSWAP) == MO_TE || (memop & MO_SIZE) == MO_8);
+    assert(old_st_opc[memop & MO_SIZE] != 0);
+
+    *tcg_ctx.gen_opc_ptr++ = old_st_opc[memop & MO_SIZE];
+    tcg_add_param_i64(val);
+    tcg_add_param_tl(addr);
+    *tcg_ctx.gen_opparam_ptr++ = idx;
+}
 
 static void tcg_reg_alloc_start(TCGContext *s)
 {
@@ -851,47 +1049,14 @@ char *tcg_get_arg_str_i64(TCGContext *s, char *buf, int buf_size, TCGv_i64 arg)
     return tcg_get_arg_str_idx(s, buf, buf_size, GET_TCGV_I64(arg));
 }
 
-static int helper_cmp(const void *p1, const void *p2)
+/* Find helper name.  */
+static inline const char *tcg_find_helper(TCGContext *s, uintptr_t val)
 {
-    const TCGHelperInfo *th1 = p1;
-    const TCGHelperInfo *th2 = p2;
-    if (th1->func < th2->func)
-        return -1;
-    else if (th1->func == th2->func)
-        return 0;
-    else
-        return 1;
-}
-
-/* find helper definition (Note: A hash table would be better) */
-static TCGHelperInfo *tcg_find_helper(TCGContext *s, uintptr_t val)
-{
-    int m, m_min, m_max;
-    TCGHelperInfo *th;
-    uintptr_t v;
-
-    if (unlikely(!s->helpers_sorted)) {
-        qsort(s->helpers, s->nb_helpers, sizeof(TCGHelperInfo), 
-              helper_cmp);
-        s->helpers_sorted = 1;
+    const char *ret = NULL;
+    if (s->helpers) {
+        ret = g_hash_table_lookup(s->helpers, (gpointer)val);
     }
-
-    /* binary search */
-    m_min = 0;
-    m_max = s->nb_helpers - 1;
-    while (m_min <= m_max) {
-        m = (m_min + m_max) >> 1;
-        th = &s->helpers[m];
-        v = th->func;
-        if (v == val)
-            return th;
-        else if (val < v) {
-            m_max = m - 1;
-        } else {
-            m_min = m + 1;
-        }
-    }
-    return NULL;
+    return ret;
 }
 
 static const char * const cond_name[] =
@@ -910,6 +1075,22 @@ static const char * const cond_name[] =
     [TCG_COND_GTU] = "gtu"
 };
 
+static const char * const ldst_name[] =
+{
+    [MO_UB]   = "ub",
+    [MO_SB]   = "sb",
+    [MO_LEUW] = "leuw",
+    [MO_LESW] = "lesw",
+    [MO_LEUL] = "leul",
+    [MO_LESL] = "lesl",
+    [MO_LEQ]  = "leq",
+    [MO_BEUW] = "beuw",
+    [MO_BESW] = "besw",
+    [MO_BEUL] = "beul",
+    [MO_BESL] = "besl",
+    [MO_BEQ]  = "beq",
+};
+
 void tcg_dump_ops(TCGContext *s)
 {
     const uint16_t *opc_ptr;
@@ -976,7 +1157,7 @@ void tcg_dump_ops(TCGContext *s)
             }
         } else if (c == INDEX_op_movi_i32 || c == INDEX_op_movi_i64) {
             tcg_target_ulong val;
-            TCGHelperInfo *th;
+            const char *name;
 
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
@@ -984,9 +1165,9 @@ void tcg_dump_ops(TCGContext *s)
             qemu_log(" %s %s,$", def->name,
                      tcg_get_arg_str_idx(s, buf, sizeof(buf), args[0]));
             val = args[1];
-            th = tcg_find_helper(s, val);
-            if (th) {
-                qemu_log("%s", th->name);
+            name = tcg_find_helper(s, val);
+            if (name) {
+                qemu_log("%s", name);
             } else {
                 if (c == INDEX_op_movi_i32) {
                     qemu_log("0x%x", (uint32_t)val);
@@ -1038,6 +1219,17 @@ void tcg_dump_ops(TCGContext *s)
                 }
                 i = 1;
                 break;
+            case INDEX_op_qemu_ld_i32:
+            case INDEX_op_qemu_st_i32:
+            case INDEX_op_qemu_ld_i64:
+            case INDEX_op_qemu_st_i64:
+                if (args[k] < ARRAY_SIZE(ldst_name) && ldst_name[args[k]]) {
+                    qemu_log(",%s", ldst_name[args[k++]]);
+                } else {
+                    qemu_log(",$0x%" TCG_PRIlx, args[k++]);
+                }
+                i = 1;
+                break;
             default:
                 i = 0;
                 break;
@@ -2311,6 +2503,8 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
     s->code_buf = gen_code_buf;
     s->code_ptr = gen_code_buf;
 
+    tcg_out_tb_init(s);
+
     args = s->gen_opparam_buf;
     op_index = 0;
 
@@ -2384,10 +2578,8 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
 #endif
     }
  the_end:
-#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
     /* Generate TB finalization at the end of block */
     tcg_out_tb_finalize(s);
-#endif
     return -1;
 }
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 902c751d26..0d9bd293b5 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -197,6 +197,60 @@ typedef enum TCGType {
 #endif
 } TCGType;
 
+/* Constants for qemu_ld and qemu_st for the Memory Operation field.  */
+typedef enum TCGMemOp {
+    MO_8     = 0,
+    MO_16    = 1,
+    MO_32    = 2,
+    MO_64    = 3,
+    MO_SIZE  = 3,   /* Mask for the above.  */
+
+    MO_SIGN  = 4,   /* Sign-extended, otherwise zero-extended.  */
+
+    MO_BSWAP = 8,   /* Host reverse endian.  */
+#ifdef HOST_WORDS_BIGENDIAN
+    MO_LE    = MO_BSWAP,
+    MO_BE    = 0,
+#else
+    MO_LE    = 0,
+    MO_BE    = MO_BSWAP,
+#endif
+#ifdef TARGET_WORDS_BIGENDIAN
+    MO_TE    = MO_BE,
+#else
+    MO_TE    = MO_LE,
+#endif
+
+    /* Combinations of the above, for ease of use.  */
+    MO_UB    = MO_8,
+    MO_UW    = MO_16,
+    MO_UL    = MO_32,
+    MO_SB    = MO_SIGN | MO_8,
+    MO_SW    = MO_SIGN | MO_16,
+    MO_SL    = MO_SIGN | MO_32,
+    MO_Q     = MO_64,
+
+    MO_LEUW  = MO_LE | MO_UW,
+    MO_LEUL  = MO_LE | MO_UL,
+    MO_LESW  = MO_LE | MO_SW,
+    MO_LESL  = MO_LE | MO_SL,
+    MO_LEQ   = MO_LE | MO_Q,
+
+    MO_BEUW  = MO_BE | MO_UW,
+    MO_BEUL  = MO_BE | MO_UL,
+    MO_BESW  = MO_BE | MO_SW,
+    MO_BESL  = MO_BE | MO_SL,
+    MO_BEQ   = MO_BE | MO_Q,
+
+    MO_TEUW  = MO_TE | MO_UW,
+    MO_TEUL  = MO_TE | MO_UL,
+    MO_TESW  = MO_TE | MO_SW,
+    MO_TESL  = MO_TE | MO_SL,
+    MO_TEQ   = MO_TE | MO_Q,
+
+    MO_SSIZE = MO_SIZE | MO_SIGN,
+} TCGMemOp;
+
 typedef tcg_target_ulong TCGArg;
 
 /* Define a type and accessor macros for variables.  Using a struct is
@@ -211,24 +265,6 @@ typedef tcg_target_ulong TCGArg;
    are aliases for target_ulong and host pointer sized values respectively.
  */
 
-#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
-/* Macros/structures for qemu_ld/st IR code optimization:
-   TCG_MAX_HELPER_LABELS is defined as same as OPC_BUF_SIZE in exec-all.h. */
-#define TCG_MAX_QEMU_LDST       640
-
-typedef struct TCGLabelQemuLdst {
-    int is_ld:1;            /* qemu_ld: 1, qemu_st: 0 */
-    int opc:4;
-    int addrlo_reg;         /* reg index for low word of guest virtual addr */
-    int addrhi_reg;         /* reg index for high word of guest virtual addr */
-    int datalo_reg;         /* reg index for low word to be loaded or stored */
-    int datahi_reg;         /* reg index for high word to be loaded or stored */
-    int mem_index;          /* soft MMU memory index */
-    uint8_t *raddr;         /* gen code addr of the next IR of qemu_ld/st IR */
-    uint8_t *label_ptr[2];  /* label pointers to be updated */
-} TCGLabelQemuLdst;
-#endif
-
 #ifdef CONFIG_DEBUG_TCG
 #define DEBUG_TCGV 1
 #endif
@@ -405,11 +441,6 @@ typedef struct TCGTemp {
     const char *name;
 } TCGTemp;
 
-typedef struct TCGHelperInfo {
-    uintptr_t func;
-    const char *name;
-} TCGHelperInfo;
-
 typedef struct TCGContext TCGContext;
 
 struct TCGContext {
@@ -447,10 +478,7 @@ struct TCGContext {
     uint8_t *code_ptr;
     TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
 
-    TCGHelperInfo *helpers;
-    int nb_helpers;
-    int allocated_helpers;
-    int helpers_sorted;
+    GHashTable *helpers;
 
 #ifdef CONFIG_PROFILER
     /* profiling info */
@@ -496,12 +524,8 @@ struct TCGContext {
 
     TBContext tb_ctx;
 
-#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
-    /* labels info for qemu_ld/st IRs
-       The labels help to generate TLB miss case codes at the end of TB */
-    TCGLabelQemuLdst *qemu_ldst_labels;
-    int nb_qemu_ldst_labels;
-#endif
+    /* The TCGBackendData structure is private to tcg-target.c.  */
+    struct TCGBackendData *be;
 };
 
 extern TCGContext tcg_ctx;
@@ -680,8 +704,6 @@ TCGArg *tcg_optimize(TCGContext *s, uint16_t *tcg_opc_ptr, TCGArg *args,
                      TCGOpDef *tcg_op_def);
 
 /* only used for debugging purposes */
-void tcg_register_helper(void *func, const char *name);
-const char *tcg_helper_get_name(TCGContext *s, void *func);
 void tcg_dump_ops(TCGContext *s);
 
 void dump_ops(const uint16_t *opc_buf, const TCGArg *opparam_buf);
@@ -745,11 +767,6 @@ TCGv_i64 tcg_const_local_i64(int64_t val);
 
 void tcg_register_jit(void *buf, size_t buf_size);
 
-#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
-/* Generate TB finalization at the end of block */
-void tcg_out_tb_finalize(TCGContext *s);
-#endif
-
 /*
  * Memory helpers that will be used by TCG generated code.
  */
@@ -757,29 +774,66 @@ void tcg_out_tb_finalize(TCGContext *s);
 /* Value zero-extended to tcg register size.  */
 tcg_target_ulong helper_ret_ldub_mmu(CPUArchState *env, target_ulong addr,
                                      int mmu_idx, uintptr_t retaddr);
-tcg_target_ulong helper_ret_lduw_mmu(CPUArchState *env, target_ulong addr,
-                                     int mmu_idx, uintptr_t retaddr);
-tcg_target_ulong helper_ret_ldul_mmu(CPUArchState *env, target_ulong addr,
-                                     int mmu_idx, uintptr_t retaddr);
-uint64_t helper_ret_ldq_mmu(CPUArchState *env, target_ulong addr,
-                            int mmu_idx, uintptr_t retaddr);
+tcg_target_ulong helper_le_lduw_mmu(CPUArchState *env, target_ulong addr,
+                                    int mmu_idx, uintptr_t retaddr);
+tcg_target_ulong helper_le_ldul_mmu(CPUArchState *env, target_ulong addr,
+                                    int mmu_idx, uintptr_t retaddr);
+uint64_t helper_le_ldq_mmu(CPUArchState *env, target_ulong addr,
+                           int mmu_idx, uintptr_t retaddr);
+tcg_target_ulong helper_be_lduw_mmu(CPUArchState *env, target_ulong addr,
+                                    int mmu_idx, uintptr_t retaddr);
+tcg_target_ulong helper_be_ldul_mmu(CPUArchState *env, target_ulong addr,
+                                    int mmu_idx, uintptr_t retaddr);
+uint64_t helper_be_ldq_mmu(CPUArchState *env, target_ulong addr,
+                           int mmu_idx, uintptr_t retaddr);
 
 /* Value sign-extended to tcg register size.  */
 tcg_target_ulong helper_ret_ldsb_mmu(CPUArchState *env, target_ulong addr,
                                      int mmu_idx, uintptr_t retaddr);
-tcg_target_ulong helper_ret_ldsw_mmu(CPUArchState *env, target_ulong addr,
-                                     int mmu_idx, uintptr_t retaddr);
-tcg_target_ulong helper_ret_ldsl_mmu(CPUArchState *env, target_ulong addr,
-                                     int mmu_idx, uintptr_t retaddr);
+tcg_target_ulong helper_le_ldsw_mmu(CPUArchState *env, target_ulong addr,
+                                    int mmu_idx, uintptr_t retaddr);
+tcg_target_ulong helper_le_ldsl_mmu(CPUArchState *env, target_ulong addr,
+                                    int mmu_idx, uintptr_t retaddr);
+tcg_target_ulong helper_be_ldsw_mmu(CPUArchState *env, target_ulong addr,
+                                    int mmu_idx, uintptr_t retaddr);
+tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
+                                    int mmu_idx, uintptr_t retaddr);
 
 void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
                         int mmu_idx, uintptr_t retaddr);
-void helper_ret_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
-                        int mmu_idx, uintptr_t retaddr);
-void helper_ret_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
-                        int mmu_idx, uintptr_t retaddr);
-void helper_ret_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
-                        int mmu_idx, uintptr_t retaddr);
+void helper_le_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
+                       int mmu_idx, uintptr_t retaddr);
+void helper_le_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
+                       int mmu_idx, uintptr_t retaddr);
+void helper_le_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
+                       int mmu_idx, uintptr_t retaddr);
+void helper_be_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
+                       int mmu_idx, uintptr_t retaddr);
+void helper_be_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
+                       int mmu_idx, uintptr_t retaddr);
+void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
+                       int mmu_idx, uintptr_t retaddr);
+
+/* Temporary aliases until backends are converted.  */
+#ifdef TARGET_WORDS_BIGENDIAN
+# define helper_ret_ldsw_mmu  helper_be_ldsw_mmu
+# define helper_ret_lduw_mmu  helper_be_lduw_mmu
+# define helper_ret_ldsl_mmu  helper_be_ldsl_mmu
+# define helper_ret_ldul_mmu  helper_be_ldul_mmu
+# define helper_ret_ldq_mmu   helper_be_ldq_mmu
+# define helper_ret_stw_mmu   helper_be_stw_mmu
+# define helper_ret_stl_mmu   helper_be_stl_mmu
+# define helper_ret_stq_mmu   helper_be_stq_mmu
+#else
+# define helper_ret_ldsw_mmu  helper_le_ldsw_mmu
+# define helper_ret_lduw_mmu  helper_le_lduw_mmu
+# define helper_ret_ldsl_mmu  helper_le_ldsl_mmu
+# define helper_ret_ldul_mmu  helper_le_ldul_mmu
+# define helper_ret_ldq_mmu   helper_le_ldq_mmu
+# define helper_ret_stw_mmu   helper_le_stw_mmu
+# define helper_ret_stl_mmu   helper_le_stl_mmu
+# define helper_ret_stq_mmu   helper_le_stq_mmu
+#endif
 
 uint8_t helper_ldb_mmu(CPUArchState *env, target_ulong addr, int mmu_idx);
 uint16_t helper_ldw_mmu(CPUArchState *env, target_ulong addr, int mmu_idx);
diff --git a/tcg/tci/tcg-target.c b/tcg/tci/tcg-target.c
index 233ab3bf35..fc80704de8 100644
--- a/tcg/tci/tcg-target.c
+++ b/tcg/tci/tcg-target.c
@@ -22,6 +22,8 @@
  * THE SOFTWARE.
  */
 
+#include "tcg-be-null.h"
+
 /* TODO list:
  * - See TODO comments in code.
  */
@@ -670,7 +672,6 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_shl_i64:
     case INDEX_op_shr_i64:
     case INDEX_op_sar_i64:
-        /* TODO: Implementation of rotl_i64, rotr_i64 missing in tci.c. */
     case INDEX_op_rotl_i64:     /* Optional (TCG_TARGET_HAS_rot_i64). */
     case INDEX_op_rotr_i64:     /* Optional (TCG_TARGET_HAS_rot_i64). */
         tcg_out_r(s, args[0]);
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index c2ecfbe047..6e1da8c007 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -120,6 +120,8 @@
 #define TCG_TARGET_HAS_mulsh_i64        0
 #endif /* TCG_TARGET_REG_BITS == 64 */
 
+#define TCG_TARGET_HAS_new_ldst         0
+
 /* Number of registers available.
    For 32 bit hosts, we need more than 8 registers (call arguments). */
 /* #define TCG_TARGET_NB_REGS 8 */