summary refs log tree commit diff stats
path: root/tcg
diff options
context:
space:
mode:
Diffstat (limited to 'tcg')
-rw-r--r-- tcg/aarch64/tcg-target.c 22
-rw-r--r-- tcg/arm/tcg-target.c 22
-rw-r--r-- tcg/i386/tcg-target.c 38
-rw-r--r-- tcg/mips/tcg-target.c 1849
-rw-r--r-- tcg/mips/tcg-target.h 14
-rw-r--r-- tcg/optimize.c 244
-rw-r--r-- tcg/s390/tcg-target.c 22
-rw-r--r-- tcg/sparc/tcg-target.c 22
-rw-r--r-- tcg/tcg-op.h 169
-rw-r--r-- tcg/tcg-runtime.h 30
-rw-r--r-- tcg/tcg.c 71
-rw-r--r-- tcg/tcg.h 6
-rw-r--r-- tcg/tci/tcg-target.c 3
13 files changed, 1248 insertions, 1264 deletions
diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 77bb6d97de..56dae66a3f 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -1809,24 +1809,23 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 }
 
 typedef struct {
-    DebugFrameCIE cie;
-    DebugFrameFDEHeader fde;
+    DebugFrameHeader h;
     uint8_t fde_def_cfa[4];
     uint8_t fde_reg_ofs[24];
 } DebugFrame;
 
 #define ELF_HOST_MACHINE EM_AARCH64
 
-static DebugFrame debug_frame = {
-    .cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
-    .cie.id = -1,
-    .cie.version = 1,
-    .cie.code_align = 1,
-    .cie.data_align = 0x78,             /* sleb128 -8 */
-    .cie.return_column = TCG_REG_LR,
+static const DebugFrame debug_frame = {
+    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
+    .h.cie.id = -1,
+    .h.cie.version = 1,
+    .h.cie.code_align = 1,
+    .h.cie.data_align = 0x78,             /* sleb128 -8 */
+    .h.cie.return_column = TCG_REG_LR,
 
     /* Total FDE size does not include the "len" member.  */
-    .fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, fde.cie_offset),
+    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
 
     .fde_def_cfa = {
         12, TCG_REG_SP,                 /* DW_CFA_def_cfa sp, ... */
@@ -1851,8 +1850,5 @@ static DebugFrame debug_frame = {
 
 void tcg_register_jit(void *buf, size_t buf_size)
 {
-    debug_frame.fde.func_start = (intptr_t)buf;
-    debug_frame.fde.func_len = buf_size;
-
     tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
 }
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index 538ca2aed0..e40301c78b 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -2077,8 +2077,7 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 }
 
 typedef struct {
-    DebugFrameCIE cie;
-    DebugFrameFDEHeader fde;
+    DebugFrameHeader h;
     uint8_t fde_def_cfa[4];
     uint8_t fde_reg_ofs[18];
 } DebugFrame;
@@ -2088,16 +2087,16 @@ typedef struct {
 /* We're expecting a 2 byte uleb128 encoded value.  */
 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
 
-static DebugFrame debug_frame = {
-    .cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
-    .cie.id = -1,
-    .cie.version = 1,
-    .cie.code_align = 1,
-    .cie.data_align = 0x7c,             /* sleb128 -4 */
-    .cie.return_column = 14,
+static const DebugFrame debug_frame = {
+    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
+    .h.cie.id = -1,
+    .h.cie.version = 1,
+    .h.cie.code_align = 1,
+    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
+    .h.cie.return_column = 14,
 
     /* Total FDE size does not include the "len" member.  */
-    .fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, fde.cie_offset),
+    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
 
     .fde_def_cfa = {
         12, 13,                         /* DW_CFA_def_cfa sp, ... */
@@ -2120,8 +2119,5 @@ static DebugFrame debug_frame = {
 
 void tcg_register_jit(void *buf, size_t buf_size)
 {
-    debug_frame.fde.func_start = (tcg_target_long) buf;
-    debug_frame.fde.func_len = buf_size;
-
     tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
 }
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index a373073ff8..d9102335f9 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -2341,8 +2341,7 @@ static void tcg_target_init(TCGContext *s)
 }
 
 typedef struct {
-    DebugFrameCIE cie;
-    DebugFrameFDEHeader fde;
+    DebugFrameHeader h;
     uint8_t fde_def_cfa[4];
     uint8_t fde_reg_ofs[14];
 } DebugFrame;
@@ -2354,16 +2353,16 @@ QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
     /* Host machine without ELF. */
 #elif TCG_TARGET_REG_BITS == 64
 #define ELF_HOST_MACHINE EM_X86_64
-static DebugFrame debug_frame = {
-    .cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
-    .cie.id = -1,
-    .cie.version = 1,
-    .cie.code_align = 1,
-    .cie.data_align = 0x78,             /* sleb128 -8 */
-    .cie.return_column = 16,
+static const DebugFrame debug_frame = {
+    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
+    .h.cie.id = -1,
+    .h.cie.version = 1,
+    .h.cie.code_align = 1,
+    .h.cie.data_align = 0x78,             /* sleb128 -8 */
+    .h.cie.return_column = 16,
 
     /* Total FDE size does not include the "len" member.  */
-    .fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, fde.cie_offset),
+    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
 
     .fde_def_cfa = {
         12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
@@ -2383,16 +2382,16 @@ static DebugFrame debug_frame = {
 };
 #else
 #define ELF_HOST_MACHINE EM_386
-static DebugFrame debug_frame = {
-    .cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
-    .cie.id = -1,
-    .cie.version = 1,
-    .cie.code_align = 1,
-    .cie.data_align = 0x7c,             /* sleb128 -4 */
-    .cie.return_column = 8,
+static const DebugFrame debug_frame = {
+    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
+    .h.cie.id = -1,
+    .h.cie.version = 1,
+    .h.cie.code_align = 1,
+    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
+    .h.cie.return_column = 8,
 
     /* Total FDE size does not include the "len" member.  */
-    .fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, fde.cie_offset),
+    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
 
     .fde_def_cfa = {
         12, 4,                          /* DW_CFA_def_cfa %esp, ... */
@@ -2413,9 +2412,6 @@ static DebugFrame debug_frame = {
 #if defined(ELF_HOST_MACHINE)
 void tcg_register_jit(void *buf, size_t buf_size)
 {
-    debug_frame.fde.func_start = (uintptr_t)buf;
-    debug_frame.fde.func_len = buf_size;
-
     tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
 }
 #endif
diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index 0ae495c586..8855d5039d 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -24,14 +24,17 @@
  * THE SOFTWARE.
  */
 
-#include "tcg-be-null.h"
+#include "tcg-be-ldst.h"
 
-#if defined(HOST_WORDS_BIGENDIAN) == defined(TARGET_WORDS_BIGENDIAN)
-# define TCG_NEED_BSWAP 0
+#ifdef HOST_WORDS_BIGENDIAN
+# define MIPS_BE  1
 #else
-# define TCG_NEED_BSWAP 1
+# define MIPS_BE  0
 #endif
 
+#define LO_OFF    (MIPS_BE * 4)
+#define HI_OFF    (4 - LO_OFF)
+
 #ifndef NDEBUG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
     "zero",
@@ -64,13 +67,17 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
     "k1",
     "gp",
     "sp",
-    "fp",
+    "s8",
     "ra",
 };
 #endif
 
+#define TCG_TMP0  TCG_REG_AT
+#define TCG_TMP1  TCG_REG_T9
+
 /* check if we really need so many registers :P */
 static const TCGReg tcg_target_reg_alloc_order[] = {
+    /* Call saved registers.  */
     TCG_REG_S0,
     TCG_REG_S1,
     TCG_REG_S2,
@@ -79,6 +86,10 @@ static const TCGReg tcg_target_reg_alloc_order[] = {
     TCG_REG_S5,
     TCG_REG_S6,
     TCG_REG_S7,
+    TCG_REG_S8,
+
+    /* Call clobbered registers.  */
+    TCG_REG_T0,
     TCG_REG_T1,
     TCG_REG_T2,
     TCG_REG_T3,
@@ -88,12 +99,14 @@ static const TCGReg tcg_target_reg_alloc_order[] = {
     TCG_REG_T7,
     TCG_REG_T8,
     TCG_REG_T9,
-    TCG_REG_A0,
-    TCG_REG_A1,
-    TCG_REG_A2,
-    TCG_REG_A3,
+    TCG_REG_V1,
     TCG_REG_V0,
-    TCG_REG_V1
+
+    /* Argument registers, opposite order of allocation.  */
+    TCG_REG_A3,
+    TCG_REG_A2,
+    TCG_REG_A1,
+    TCG_REG_A0,
 };
 
 static const TCGReg tcg_target_call_iarg_regs[4] = {
@@ -142,6 +155,17 @@ static void patch_reloc(tcg_insn_unit *code_ptr, int type,
     reloc_pc16(code_ptr, (tcg_insn_unit *)value);
 }
 
+#define TCG_CT_CONST_ZERO 0x100
+#define TCG_CT_CONST_U16  0x200    /* Unsigned 16-bit: 0 - 0xffff.  */
+#define TCG_CT_CONST_S16  0x400    /* Signed 16-bit: -32768 - 32767 */
+#define TCG_CT_CONST_P2M1 0x800    /* Power of 2 minus 1.  */
+#define TCG_CT_CONST_N16  0x1000   /* "Negatable" 16-bit: -32767 - 32767 */
+
+static inline bool is_p2m1(tcg_target_long val)
+{
+    return val && ((val + 1) & val) == 0;
+}
+
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
@@ -161,11 +185,11 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
     case 'l': /* qemu_ld input arg constraint */
         ct->ct |= TCG_CT_REG;
         tcg_regset_set(ct->u.regs, 0xffffffff);
-#if defined(CONFIG_SOFTMMU)
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
-# if (TARGET_LONG_BITS == 64)
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_A2);
-# endif
+#if defined(CONFIG_SOFTMMU)
+        if (TARGET_LONG_BITS == 64) {
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A2);
+        }
 #endif
         break;
     case 'S': /* qemu_st constraint */
@@ -173,13 +197,12 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         tcg_regset_set(ct->u.regs, 0xffffffff);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
 #if defined(CONFIG_SOFTMMU)
-# if (TARGET_LONG_BITS == 32)
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_A1);
-# endif
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_A2);
-# if TARGET_LONG_BITS == 64
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_A3);
-# endif
+        if (TARGET_LONG_BITS == 32) {
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A1);
+        } else {
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A2);
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A3);
+        }
 #endif
         break;
     case 'I':
@@ -188,6 +211,12 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
     case 'J':
         ct->ct |= TCG_CT_CONST_S16;
         break;
+    case 'K':
+        ct->ct |= TCG_CT_CONST_P2M1;
+        break;
+    case 'N':
+        ct->ct |= TCG_CT_CONST_N16;
+        break;
     case 'Z':
         /* We are cheating a bit here, using the fact that the register
            ZERO is also the register number 0. Hence there is no need
@@ -208,20 +237,27 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 {
     int ct;
     ct = arg_ct->ct;
-    if (ct & TCG_CT_CONST)
+    if (ct & TCG_CT_CONST) {
+        return 1;
+    } else if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
         return 1;
-    else if ((ct & TCG_CT_CONST_ZERO) && val == 0)
+    } else if ((ct & TCG_CT_CONST_U16) && val == (uint16_t)val) {
         return 1;
-    else if ((ct & TCG_CT_CONST_U16) && val == (uint16_t)val)
+    } else if ((ct & TCG_CT_CONST_S16) && val == (int16_t)val) {
         return 1;
-    else if ((ct & TCG_CT_CONST_S16) && val == (int16_t)val)
+    } else if ((ct & TCG_CT_CONST_N16) && val >= -32767 && val <= 32767) {
         return 1;
-    else
-        return 0;
+    } else if ((ct & TCG_CT_CONST_P2M1)
+               && use_mips32r2_instructions && is_p2m1(val)) {
+        return 1;
+    }
+    return 0;
 }
 
 /* instruction opcodes */
-enum {
+typedef enum {
+    OPC_J        = 0x02 << 26,
+    OPC_JAL      = 0x03 << 26,
     OPC_BEQ      = 0x04 << 26,
     OPC_BNE      = 0x05 << 26,
     OPC_BLEZ     = 0x06 << 26,
@@ -279,16 +315,17 @@ enum {
     OPC_MUL      = OPC_SPECIAL2 | 0x002,
 
     OPC_SPECIAL3 = 0x1f << 26,
+    OPC_EXT      = OPC_SPECIAL3 | 0x000,
     OPC_INS      = OPC_SPECIAL3 | 0x004,
     OPC_WSBH     = OPC_SPECIAL3 | 0x0a0,
     OPC_SEB      = OPC_SPECIAL3 | 0x420,
     OPC_SEH      = OPC_SPECIAL3 | 0x620,
-};
+} MIPSInsn;
 
 /*
  * Type reg
  */
-static inline void tcg_out_opc_reg(TCGContext *s, int opc,
+static inline void tcg_out_opc_reg(TCGContext *s, MIPSInsn opc,
                                    TCGReg rd, TCGReg rs, TCGReg rt)
 {
     int32_t inst;
@@ -303,7 +340,7 @@ static inline void tcg_out_opc_reg(TCGContext *s, int opc,
 /*
  * Type immediate
  */
-static inline void tcg_out_opc_imm(TCGContext *s, int opc,
+static inline void tcg_out_opc_imm(TCGContext *s, MIPSInsn opc,
                                    TCGReg rt, TCGReg rs, TCGArg imm)
 {
     int32_t inst;
@@ -316,9 +353,25 @@ static inline void tcg_out_opc_imm(TCGContext *s, int opc,
 }
 
 /*
+ * Type bitfield
+ */
+static inline void tcg_out_opc_bf(TCGContext *s, MIPSInsn opc, TCGReg rt,
+                                  TCGReg rs, int msb, int lsb)
+{
+    int32_t inst;
+
+    inst = opc;
+    inst |= (rs & 0x1F) << 21;
+    inst |= (rt & 0x1F) << 16;
+    inst |= (msb & 0x1F) << 11;
+    inst |= (lsb & 0x1F) << 6;
+    tcg_out32(s, inst);
+}
+
+/*
  * Type branch
  */
-static inline void tcg_out_opc_br(TCGContext *s, int opc,
+static inline void tcg_out_opc_br(TCGContext *s, MIPSInsn opc,
                                   TCGReg rt, TCGReg rs)
 {
     /* We pay attention here to not modify the branch target by reading
@@ -332,7 +385,7 @@ static inline void tcg_out_opc_br(TCGContext *s, int opc,
 /*
  * Type sa
  */
-static inline void tcg_out_opc_sa(TCGContext *s, int opc,
+static inline void tcg_out_opc_sa(TCGContext *s, MIPSInsn opc,
                                   TCGReg rd, TCGReg rt, TCGArg sa)
 {
     int32_t inst;
@@ -345,6 +398,29 @@ static inline void tcg_out_opc_sa(TCGContext *s, int opc,
 
 }
 
+/*
+ * Type jump.
+ * Returns true if the branch was in range and the insn was emitted.
+ */
+static bool tcg_out_opc_jmp(TCGContext *s, MIPSInsn opc, void *target)
+{
+    uintptr_t dest = (uintptr_t)target;
+    uintptr_t from = (uintptr_t)s->code_ptr + 4;
+    int32_t inst;
+
+    /* The pc-region branch happens within the 256MB region of
+       the delay slot (thus the +4).  */
+    if ((from ^ dest) & -(1 << 28)) {
+        return false;
+    }
+    assert((dest & 3) == 0);
+
+    inst = opc;
+    inst |= (dest >> 2) & 0x3ffffff;
+    tcg_out32(s, inst);
+    return true;
+}
+
 static inline void tcg_out_nop(TCGContext *s)
 {
     tcg_out32(s, 0);
@@ -367,8 +443,10 @@ static inline void tcg_out_movi(TCGContext *s, TCGType type,
     } else if (arg == (uint16_t)arg) {
         tcg_out_opc_imm(s, OPC_ORI, reg, TCG_REG_ZERO, arg);
     } else {
-        tcg_out_opc_imm(s, OPC_LUI, reg, 0, arg >> 16);
-        tcg_out_opc_imm(s, OPC_ORI, reg, reg, arg & 0xffff);
+        tcg_out_opc_imm(s, OPC_LUI, reg, TCG_REG_ZERO, arg >> 16);
+        if (arg & 0xffff) {
+            tcg_out_opc_imm(s, OPC_ORI, reg, reg, arg & 0xffff);
+        }
     }
 }
 
@@ -378,14 +456,14 @@ static inline void tcg_out_bswap16(TCGContext *s, TCGReg ret, TCGReg arg)
         tcg_out_opc_reg(s, OPC_WSBH, ret, 0, arg);
     } else {
         /* ret and arg can't be register at */
-        if (ret == TCG_REG_AT || arg == TCG_REG_AT) {
+        if (ret == TCG_TMP0 || arg == TCG_TMP0) {
             tcg_abort();
         }
 
-        tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 8);
+        tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, arg, 8);
         tcg_out_opc_sa(s, OPC_SLL, ret, arg, 8);
         tcg_out_opc_imm(s, OPC_ANDI, ret, ret, 0xff00);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
     }
 }
 
@@ -396,14 +474,14 @@ static inline void tcg_out_bswap16s(TCGContext *s, TCGReg ret, TCGReg arg)
         tcg_out_opc_reg(s, OPC_SEH, ret, 0, ret);
     } else {
         /* ret and arg can't be register at */
-        if (ret == TCG_REG_AT || arg == TCG_REG_AT) {
+        if (ret == TCG_TMP0 || arg == TCG_TMP0) {
             tcg_abort();
         }
 
-        tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 8);
+        tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, arg, 8);
         tcg_out_opc_sa(s, OPC_SLL, ret, arg, 24);
         tcg_out_opc_sa(s, OPC_SRA, ret, ret, 16);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
     }
 }
 
@@ -414,22 +492,22 @@ static inline void tcg_out_bswap32(TCGContext *s, TCGReg ret, TCGReg arg)
         tcg_out_opc_sa(s, OPC_ROTR, ret, ret, 16);
     } else {
         /* ret and arg must be different and can't be register at */
-        if (ret == arg || ret == TCG_REG_AT || arg == TCG_REG_AT) {
+        if (ret == arg || ret == TCG_TMP0 || arg == TCG_TMP0) {
             tcg_abort();
         }
 
         tcg_out_opc_sa(s, OPC_SLL, ret, arg, 24);
 
-        tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 24);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+        tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, arg, 24);
+        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
 
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_AT, arg, 0xff00);
-        tcg_out_opc_sa(s, OPC_SLL, TCG_REG_AT, TCG_REG_AT, 8);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, arg, 0xff00);
+        tcg_out_opc_sa(s, OPC_SLL, TCG_TMP0, TCG_TMP0, 8);
+        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
 
-        tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 8);
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_AT, TCG_REG_AT, 0xff00);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+        tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, arg, 8);
+        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, TCG_TMP0, 0xff00);
+        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
     }
 }
 
@@ -453,16 +531,18 @@ static inline void tcg_out_ext16s(TCGContext *s, TCGReg ret, TCGReg arg)
     }
 }
 
-static inline void tcg_out_ldst(TCGContext *s, int opc, TCGArg arg,
-                                TCGReg arg1, TCGArg arg2)
+static void tcg_out_ldst(TCGContext *s, MIPSInsn opc, TCGReg data,
+                         TCGReg addr, intptr_t ofs)
 {
-    if (arg2 == (int16_t) arg2) {
-        tcg_out_opc_imm(s, opc, arg, arg1, arg2);
-    } else {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_AT, arg2);
-        tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_AT, TCG_REG_AT, arg1);
-        tcg_out_opc_imm(s, opc, arg, TCG_REG_AT, 0);
+    int16_t lo = ofs;
+    if (ofs != lo) {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, ofs - lo);
+        if (addr != TCG_REG_ZERO) {
+            tcg_out_opc_reg(s, OPC_ADDU, TCG_TMP0, TCG_TMP0, addr);
+        }
+        addr = TCG_TMP0;
     }
+    tcg_out_opc_imm(s, opc, data, addr, lo);
 }
 
 static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
@@ -482,1051 +562,1008 @@ static inline void tcg_out_addi(TCGContext *s, TCGReg reg, TCGArg val)
     if (val == (int16_t)val) {
         tcg_out_opc_imm(s, OPC_ADDIU, reg, reg, val);
     } else {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_AT, val);
-        tcg_out_opc_reg(s, OPC_ADDU, reg, reg, TCG_REG_AT);
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, val);
+        tcg_out_opc_reg(s, OPC_ADDU, reg, reg, TCG_TMP0);
     }
 }
 
-/* Helper routines for marshalling helper function arguments into
- * the correct registers and stack.
- * arg_num is where we want to put this argument, and is updated to be ready
- * for the next call. arg is the argument itself. Note that arg_num 0..3 is
- * real registers, 4+ on stack.
- *
- * We provide routines for arguments which are: immediate, 32 bit
- * value in register, 16 and 8 bit values in register (which must be zero
- * extended before use) and 64 bit value in a lo:hi register pair.
- */
-#define DEFINE_TCG_OUT_CALL_IARG(NAME, ARGPARAM)                               \
-    static inline void NAME(TCGContext *s, int *arg_num, ARGPARAM)             \
-    {                                                                          \
-    if (*arg_num < 4) {                                                        \
-        DEFINE_TCG_OUT_CALL_IARG_GET_ARG(tcg_target_call_iarg_regs[*arg_num]); \
-    } else {                                                                   \
-        DEFINE_TCG_OUT_CALL_IARG_GET_ARG(TCG_REG_AT);                          \
-        tcg_out_st(s, TCG_TYPE_I32, TCG_REG_AT, TCG_REG_SP, 4 * (*arg_num));   \
-    }                                                                          \
-    (*arg_num)++;                                                              \
-}
-#define DEFINE_TCG_OUT_CALL_IARG_GET_ARG(A) \
-    tcg_out_opc_imm(s, OPC_ANDI, A, arg, 0xff);
-DEFINE_TCG_OUT_CALL_IARG(tcg_out_call_iarg_reg8, TCGReg arg)
-#undef DEFINE_TCG_OUT_CALL_IARG_GET_ARG
-#define DEFINE_TCG_OUT_CALL_IARG_GET_ARG(A) \
-    tcg_out_opc_imm(s, OPC_ANDI, A, arg, 0xffff);
-DEFINE_TCG_OUT_CALL_IARG(tcg_out_call_iarg_reg16, TCGReg arg)
-#undef DEFINE_TCG_OUT_CALL_IARG_GET_ARG
-#define DEFINE_TCG_OUT_CALL_IARG_GET_ARG(A) \
-    tcg_out_movi(s, TCG_TYPE_I32, A, arg);
-DEFINE_TCG_OUT_CALL_IARG(tcg_out_call_iarg_imm32, TCGArg arg)
-#undef DEFINE_TCG_OUT_CALL_IARG_GET_ARG
-
-/* We don't use the macro for this one to avoid an unnecessary reg-reg
-   move when storing to the stack. */
-static inline void tcg_out_call_iarg_reg32(TCGContext *s, int *arg_num,
-                                           TCGReg arg)
-{
-    if (*arg_num < 4) {
-        tcg_out_mov(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[*arg_num], arg);
-    } else {
-        tcg_out_st(s, TCG_TYPE_I32, arg, TCG_REG_SP, 4 * (*arg_num));
-    }
-    (*arg_num)++;
-}
-
-static inline void tcg_out_call_iarg_reg64(TCGContext *s, int *arg_num,
-                                           TCGReg arg_low, TCGReg arg_high)
-{
-    (*arg_num) = (*arg_num + 1) & ~1;
-
-#if defined(HOST_WORDS_BIGENDIAN)
-    tcg_out_call_iarg_reg32(s, arg_num, arg_high);
-    tcg_out_call_iarg_reg32(s, arg_num, arg_low);
-#else
-    tcg_out_call_iarg_reg32(s, arg_num, arg_low);
-    tcg_out_call_iarg_reg32(s, arg_num, arg_high);
-#endif
-}
+/* Bit 0 set if inversion required; bit 1 set if swapping required.  */
+#define MIPS_CMP_INV  1
+#define MIPS_CMP_SWAP 2
+
+static const uint8_t mips_cmp_map[16] = {
+    [TCG_COND_LT]  = 0,
+    [TCG_COND_LTU] = 0,
+    [TCG_COND_GE]  = MIPS_CMP_INV,
+    [TCG_COND_GEU] = MIPS_CMP_INV,
+    [TCG_COND_LE]  = MIPS_CMP_INV | MIPS_CMP_SWAP,
+    [TCG_COND_LEU] = MIPS_CMP_INV | MIPS_CMP_SWAP,
+    [TCG_COND_GT]  = MIPS_CMP_SWAP,
+    [TCG_COND_GTU] = MIPS_CMP_SWAP,
+};
 
-static void tcg_out_brcond(TCGContext *s, TCGCond cond, TCGArg arg1,
-                           TCGArg arg2, int label_index)
+static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
+                            TCGReg arg1, TCGReg arg2)
 {
-    TCGLabel *l = &s->labels[label_index];
+    MIPSInsn s_opc = OPC_SLTU;
+    int cmp_map;
 
     switch (cond) {
     case TCG_COND_EQ:
-        tcg_out_opc_br(s, OPC_BEQ, arg1, arg2);
+        if (arg2 != 0) {
+            tcg_out_opc_reg(s, OPC_XOR, ret, arg1, arg2);
+            arg1 = ret;
+        }
+        tcg_out_opc_imm(s, OPC_SLTIU, ret, arg1, 1);
         break;
+
     case TCG_COND_NE:
-        tcg_out_opc_br(s, OPC_BNE, arg1, arg2);
-        break;
-    case TCG_COND_LT:
-        if (arg2 == 0) {
-            tcg_out_opc_br(s, OPC_BLTZ, 0, arg1);
-        } else {
-            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
-            tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
+        if (arg2 != 0) {
+            tcg_out_opc_reg(s, OPC_XOR, ret, arg1, arg2);
+            arg1 = ret;
         }
+        tcg_out_opc_reg(s, OPC_SLTU, ret, TCG_REG_ZERO, arg1);
         break;
-    case TCG_COND_LTU:
-        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg1, arg2);
-        tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
-        break;
+
+    case TCG_COND_LT:
     case TCG_COND_GE:
-        if (arg2 == 0) {
-            tcg_out_opc_br(s, OPC_BGEZ, 0, arg1);
-        } else {
-            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
-            tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
-        }
-        break;
-    case TCG_COND_GEU:
-        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg1, arg2);
-        tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
-        break;
     case TCG_COND_LE:
-        if (arg2 == 0) {
-            tcg_out_opc_br(s, OPC_BLEZ, 0, arg1);
-        } else {
-            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
-            tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
-        }
-        break;
-    case TCG_COND_LEU:
-        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg2, arg1);
-        tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
-        break;
     case TCG_COND_GT:
-        if (arg2 == 0) {
-            tcg_out_opc_br(s, OPC_BGTZ, 0, arg1);
-        } else {
-            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
-            tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
-        }
-        break;
+        s_opc = OPC_SLT;
+        /* FALLTHRU */
+
+    case TCG_COND_LTU:
+    case TCG_COND_GEU:
+    case TCG_COND_LEU:
     case TCG_COND_GTU:
-        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg2, arg1);
-        tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
-        break;
-    default:
-        tcg_abort();
+        cmp_map = mips_cmp_map[cond];
+        if (cmp_map & MIPS_CMP_SWAP) {
+            TCGReg t = arg1;
+            arg1 = arg2;
+            arg2 = t;
+        }
+        tcg_out_opc_reg(s, s_opc, ret, arg1, arg2);
+        if (cmp_map & MIPS_CMP_INV) {
+            tcg_out_opc_imm(s, OPC_XORI, ret, ret, 1);
+        }
         break;
-    }
-    if (l->has_value) {
-        reloc_pc16(s->code_ptr - 1, l->u.value_ptr);
-    } else {
-        tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC16, label_index, 0);
-    }
-    tcg_out_nop(s);
+
+     default:
+         tcg_abort();
+         break;
+     }
 }
 
-/* XXX: we implement it at the target level to avoid having to
-   handle cross basic blocks temporaries */
-static void tcg_out_brcond2(TCGContext *s, TCGCond cond, TCGArg arg1,
-                            TCGArg arg2, TCGArg arg3, TCGArg arg4,
-                            int label_index)
+static void tcg_out_brcond(TCGContext *s, TCGCond cond, TCGReg arg1,
+                           TCGReg arg2, int label_index)
 {
-    tcg_insn_unit *label_ptr;
+    static const MIPSInsn b_zero[16] = {
+        [TCG_COND_LT] = OPC_BLTZ,
+        [TCG_COND_GT] = OPC_BGTZ,
+        [TCG_COND_LE] = OPC_BLEZ,
+        [TCG_COND_GE] = OPC_BGEZ,
+    };
+
+    TCGLabel *l;
+    MIPSInsn s_opc = OPC_SLTU;
+    MIPSInsn b_opc;
+    int cmp_map;
 
-    switch(cond) {
-    case TCG_COND_NE:
-        tcg_out_brcond(s, TCG_COND_NE, arg2, arg4, label_index);
-        tcg_out_brcond(s, TCG_COND_NE, arg1, arg3, label_index);
-        return;
+    switch (cond) {
     case TCG_COND_EQ:
+        b_opc = OPC_BEQ;
         break;
-    case TCG_COND_LT:
-    case TCG_COND_LE:
-        tcg_out_brcond(s, TCG_COND_LT, arg2, arg4, label_index);
+    case TCG_COND_NE:
+        b_opc = OPC_BNE;
         break;
+
+    case TCG_COND_LT:
     case TCG_COND_GT:
+    case TCG_COND_LE:
     case TCG_COND_GE:
-        tcg_out_brcond(s, TCG_COND_GT, arg2, arg4, label_index);
-        break;
-    case TCG_COND_LTU:
-    case TCG_COND_LEU:
-        tcg_out_brcond(s, TCG_COND_LTU, arg2, arg4, label_index);
-        break;
-    case TCG_COND_GTU:
-    case TCG_COND_GEU:
-        tcg_out_brcond(s, TCG_COND_GTU, arg2, arg4, label_index);
-        break;
-    default:
-        tcg_abort();
-    }
-
-    label_ptr = s->code_ptr;
-    tcg_out_opc_br(s, OPC_BNE, arg2, arg4);
-    tcg_out_nop(s);
+        if (arg2 == 0) {
+            b_opc = b_zero[cond];
+            arg2 = arg1;
+            arg1 = 0;
+            break;
+        }
+        s_opc = OPC_SLT;
+        /* FALLTHRU */
 
-    switch(cond) {
-    case TCG_COND_EQ:
-        tcg_out_brcond(s, TCG_COND_EQ, arg1, arg3, label_index);
-        break;
-    case TCG_COND_LT:
     case TCG_COND_LTU:
-        tcg_out_brcond(s, TCG_COND_LTU, arg1, arg3, label_index);
-        break;
-    case TCG_COND_LE:
-    case TCG_COND_LEU:
-        tcg_out_brcond(s, TCG_COND_LEU, arg1, arg3, label_index);
-        break;
-    case TCG_COND_GT:
     case TCG_COND_GTU:
-        tcg_out_brcond(s, TCG_COND_GTU, arg1, arg3, label_index);
-        break;
-    case TCG_COND_GE:
+    case TCG_COND_LEU:
     case TCG_COND_GEU:
-        tcg_out_brcond(s, TCG_COND_GEU, arg1, arg3, label_index);
+        cmp_map = mips_cmp_map[cond];
+        if (cmp_map & MIPS_CMP_SWAP) {
+            TCGReg t = arg1;
+            arg1 = arg2;
+            arg2 = t;
+        }
+        tcg_out_opc_reg(s, s_opc, TCG_TMP0, arg1, arg2);
+        b_opc = (cmp_map & MIPS_CMP_INV ? OPC_BEQ : OPC_BNE);
+        arg1 = TCG_TMP0;
+        arg2 = TCG_REG_ZERO;
         break;
+
     default:
         tcg_abort();
+        break;
     }
 
-    reloc_pc16(label_ptr, s->code_ptr);
+    tcg_out_opc_br(s, b_opc, arg1, arg2);
+    l = &s->labels[label_index];
+    if (l->has_value) {
+        reloc_pc16(s->code_ptr - 1, l->u.value_ptr);
+    } else {
+        tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC16, label_index, 0);
+    }
+    tcg_out_nop(s);
 }
 
-static void tcg_out_movcond(TCGContext *s, TCGCond cond, TCGReg ret,
-                            TCGArg c1, TCGArg c2, TCGArg v)
+static TCGReg tcg_out_reduce_eq2(TCGContext *s, TCGReg tmp0, TCGReg tmp1,
+                                 TCGReg al, TCGReg ah,
+                                 TCGReg bl, TCGReg bh)
 {
-    switch (cond) {
-    case TCG_COND_EQ:
-        if (c1 == 0) {
-            tcg_out_opc_reg(s, OPC_MOVZ, ret, v, c2);
-        } else if (c2 == 0) {
-            tcg_out_opc_reg(s, OPC_MOVZ, ret, v, c1);
+    /* Merge highpart comparison into AH.  */
+    if (bh != 0) {
+        if (ah != 0) {
+            tcg_out_opc_reg(s, OPC_XOR, tmp0, ah, bh);
+            ah = tmp0;
         } else {
-            tcg_out_opc_reg(s, OPC_XOR, TCG_REG_AT, c1, c2);
-            tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
+            ah = bh;
         }
-        break;
-    case TCG_COND_NE:
-        if (c1 == 0) {
-            tcg_out_opc_reg(s, OPC_MOVN, ret, v, c2);
-        } else if (c2 == 0) {
-            tcg_out_opc_reg(s, OPC_MOVN, ret, v, c1);
+    }
+    /* Merge lowpart comparison into AL.  */
+    if (bl != 0) {
+        if (al != 0) {
+            tcg_out_opc_reg(s, OPC_XOR, tmp1, al, bl);
+            al = tmp1;
         } else {
-            tcg_out_opc_reg(s, OPC_XOR, TCG_REG_AT, c1, c2);
-            tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
+            al = bl;
+        }
+    }
+    /* Merge high and low part comparisons into AL.  */
+    if (ah != 0) {
+        if (al != 0) {
+            tcg_out_opc_reg(s, OPC_OR, tmp0, ah, al);
+            al = tmp0;
+        } else {
+            al = ah;
         }
-        break;
-    case TCG_COND_LT:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, c1, c2);
-        tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
-        break;
-    case TCG_COND_LTU:
-        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, c1, c2);
-        tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
-        break;
-    case TCG_COND_GE:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, c1, c2);
-        tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
-        break;
-    case TCG_COND_GEU:
-        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, c1, c2);
-        tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
-        break;
-    case TCG_COND_LE:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, c2, c1);
-        tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
-        break;
-    case TCG_COND_LEU:
-        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, c2, c1);
-        tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
-        break;
-    case TCG_COND_GT:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, c2, c1);
-        tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
-        break;
-    case TCG_COND_GTU:
-        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, c2, c1);
-        tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
-        break;
-    default:
-        tcg_abort();
-        break;
     }
+    return al;
 }
 
-static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
-                            TCGArg arg1, TCGArg arg2)
+static void tcg_out_setcond2(TCGContext *s, TCGCond cond, TCGReg ret,
+                             TCGReg al, TCGReg ah, TCGReg bl, TCGReg bh)
 {
+    TCGReg tmp0 = TCG_TMP0;
+    TCGReg tmp1 = ret;
+
+    assert(ret != TCG_TMP0);
+    if (ret == ah || ret == bh) {
+        assert(ret != TCG_TMP1);
+        tmp1 = TCG_TMP1;
+    }
+
     switch (cond) {
     case TCG_COND_EQ:
-        if (arg1 == 0) {
-            tcg_out_opc_imm(s, OPC_SLTIU, ret, arg2, 1);
-        } else if (arg2 == 0) {
-            tcg_out_opc_imm(s, OPC_SLTIU, ret, arg1, 1);
-        } else {
-            tcg_out_opc_reg(s, OPC_XOR, ret, arg1, arg2);
-            tcg_out_opc_imm(s, OPC_SLTIU, ret, ret, 1);
-        }
-        break;
     case TCG_COND_NE:
-        if (arg1 == 0) {
-            tcg_out_opc_reg(s, OPC_SLTU, ret, TCG_REG_ZERO, arg2);
-        } else if (arg2 == 0) {
-            tcg_out_opc_reg(s, OPC_SLTU, ret, TCG_REG_ZERO, arg1);
-        } else {
-            tcg_out_opc_reg(s, OPC_XOR, ret, arg1, arg2);
-            tcg_out_opc_reg(s, OPC_SLTU, ret, TCG_REG_ZERO, ret);
-        }
-        break;
-    case TCG_COND_LT:
-        tcg_out_opc_reg(s, OPC_SLT, ret, arg1, arg2);
-        break;
-    case TCG_COND_LTU:
-        tcg_out_opc_reg(s, OPC_SLTU, ret, arg1, arg2);
-        break;
-    case TCG_COND_GE:
-        tcg_out_opc_reg(s, OPC_SLT, ret, arg1, arg2);
-        tcg_out_opc_imm(s, OPC_XORI, ret, ret, 1);
-        break;
-    case TCG_COND_GEU:
-        tcg_out_opc_reg(s, OPC_SLTU, ret, arg1, arg2);
-        tcg_out_opc_imm(s, OPC_XORI, ret, ret, 1);
+        tmp1 = tcg_out_reduce_eq2(s, tmp0, tmp1, al, ah, bl, bh);
+        tcg_out_setcond(s, cond, ret, tmp1, TCG_REG_ZERO);
         break;
-    case TCG_COND_LE:
-        tcg_out_opc_reg(s, OPC_SLT, ret, arg2, arg1);
-        tcg_out_opc_imm(s, OPC_XORI, ret, ret, 1);
-        break;
-    case TCG_COND_LEU:
-        tcg_out_opc_reg(s, OPC_SLTU, ret, arg2, arg1);
-        tcg_out_opc_imm(s, OPC_XORI, ret, ret, 1);
-        break;
-    case TCG_COND_GT:
-        tcg_out_opc_reg(s, OPC_SLT, ret, arg2, arg1);
+
+    default:
+        tcg_out_setcond(s, TCG_COND_EQ, tmp0, ah, bh);
+        tcg_out_setcond(s, tcg_unsigned_cond(cond), tmp1, al, bl);
+        tcg_out_opc_reg(s, OPC_AND, tmp1, tmp1, tmp0);
+        tcg_out_setcond(s, tcg_high_cond(cond), tmp0, ah, bh);
+        tcg_out_opc_reg(s, OPC_OR, ret, tmp1, tmp0);
         break;
-    case TCG_COND_GTU:
-        tcg_out_opc_reg(s, OPC_SLTU, ret, arg2, arg1);
+    }
+}
+
+static void tcg_out_brcond2(TCGContext *s, TCGCond cond, TCGReg al, TCGReg ah,
+                            TCGReg bl, TCGReg bh, int label_index)
+{
+    TCGCond b_cond = TCG_COND_NE;
+    TCGReg tmp = TCG_TMP1;
+
+    /* With branches, we emit between 4 and 9 insns with 2 or 3 branches.
+       With setcond, we emit between 3 and 10 insns and only 1 branch,
+       which ought to get better branch prediction.  */
+     switch (cond) {
+     case TCG_COND_EQ:
+     case TCG_COND_NE:
+        b_cond = cond;
+        tmp = tcg_out_reduce_eq2(s, TCG_TMP0, TCG_TMP1, al, ah, bl, bh);
         break;
+
     default:
-        tcg_abort();
+        /* Minimize code size by preferring a compare not requiring INV.  */
+        if (mips_cmp_map[cond] & MIPS_CMP_INV) {
+            cond = tcg_invert_cond(cond);
+            b_cond = TCG_COND_EQ;
+        }
+        tcg_out_setcond2(s, cond, tmp, al, ah, bl, bh);
         break;
     }
+
+    tcg_out_brcond(s, b_cond, tmp, TCG_REG_ZERO, label_index);
 }
 
-/* XXX: we implement it at the target level to avoid having to
-   handle cross basic blocks temporaries */
-static void tcg_out_setcond2(TCGContext *s, TCGCond cond, TCGReg ret,
-                             TCGArg arg1, TCGArg arg2, TCGArg arg3, TCGArg arg4)
+static void tcg_out_movcond(TCGContext *s, TCGCond cond, TCGReg ret,
+                            TCGReg c1, TCGReg c2, TCGReg v)
 {
+    MIPSInsn m_opc = OPC_MOVN;
+
     switch (cond) {
     case TCG_COND_EQ:
-        tcg_out_setcond(s, TCG_COND_EQ, TCG_REG_AT, arg2, arg4);
-        tcg_out_setcond(s, TCG_COND_EQ, TCG_REG_T0, arg1, arg3);
-        tcg_out_opc_reg(s, OPC_AND, ret, TCG_REG_AT, TCG_REG_T0);
-        return;
+        m_opc = OPC_MOVZ;
+        /* FALLTHRU */
     case TCG_COND_NE:
-        tcg_out_setcond(s, TCG_COND_NE, TCG_REG_AT, arg2, arg4);
-        tcg_out_setcond(s, TCG_COND_NE, TCG_REG_T0, arg1, arg3);
-        tcg_out_opc_reg(s, OPC_OR, ret, TCG_REG_AT, TCG_REG_T0);
-        return;
-    case TCG_COND_LT:
-    case TCG_COND_LE:
-        tcg_out_setcond(s, TCG_COND_LT, TCG_REG_AT, arg2, arg4);
-        break;
-    case TCG_COND_GT:
-    case TCG_COND_GE:
-        tcg_out_setcond(s, TCG_COND_GT, TCG_REG_AT, arg2, arg4);
-        break;
-    case TCG_COND_LTU:
-    case TCG_COND_LEU:
-        tcg_out_setcond(s, TCG_COND_LTU, TCG_REG_AT, arg2, arg4);
-        break;
-    case TCG_COND_GTU:
-    case TCG_COND_GEU:
-        tcg_out_setcond(s, TCG_COND_GTU, TCG_REG_AT, arg2, arg4);
+        if (c2 != 0) {
+            tcg_out_opc_reg(s, OPC_XOR, TCG_TMP0, c1, c2);
+            c1 = TCG_TMP0;
+        }
         break;
+
     default:
-        tcg_abort();
+        /* Minimize code size by preferring a compare not requiring INV.  */
+        if (mips_cmp_map[cond] & MIPS_CMP_INV) {
+            cond = tcg_invert_cond(cond);
+            m_opc = OPC_MOVZ;
+        }
+        tcg_out_setcond(s, cond, TCG_TMP0, c1, c2);
+        c1 = TCG_TMP0;
         break;
     }
 
-    tcg_out_setcond(s, TCG_COND_EQ, TCG_REG_T0, arg2, arg4);
+    tcg_out_opc_reg(s, m_opc, ret, v, c1);
+}
 
-    switch(cond) {
-    case TCG_COND_LT:
-    case TCG_COND_LTU:
-        tcg_out_setcond(s, TCG_COND_LTU, ret, arg1, arg3);
-        break;
-    case TCG_COND_LE:
-    case TCG_COND_LEU:
-        tcg_out_setcond(s, TCG_COND_LEU, ret, arg1, arg3);
-        break;
-    case TCG_COND_GT:
-    case TCG_COND_GTU:
-        tcg_out_setcond(s, TCG_COND_GTU, ret, arg1, arg3);
-        break;
-    case TCG_COND_GE:
-    case TCG_COND_GEU:
-        tcg_out_setcond(s, TCG_COND_GEU, ret, arg1, arg3);
-        break;
-    default:
-        tcg_abort();
+static void tcg_out_call_int(TCGContext *s, tcg_insn_unit *arg, bool tail)
+{
+    /* Note that the ABI requires the called function's address to be
+       loaded into T9, even if a direct branch is in range.  */
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T9, (uintptr_t)arg);
+
+    /* But do try a direct branch, allowing the cpu better insn prefetch.  */
+    if (tail) {
+        if (!tcg_out_opc_jmp(s, OPC_J, arg)) {
+            tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_T9, 0);
+        }
+    } else {
+        if (!tcg_out_opc_jmp(s, OPC_JAL, arg)) {
+            tcg_out_opc_reg(s, OPC_JALR, TCG_REG_RA, TCG_REG_T9, 0);
+        }
     }
+}
 
-    tcg_out_opc_reg(s, OPC_AND, ret, ret, TCG_REG_T0);
-    tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+static void tcg_out_call(TCGContext *s, tcg_insn_unit *arg)
+{
+    tcg_out_call_int(s, arg, false);
+    tcg_out_nop(s);
 }
 
 #if defined(CONFIG_SOFTMMU)
-/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
-   int mmu_idx) */
-static const void * const qemu_ld_helpers[4] = {
-    helper_ldb_mmu,
-    helper_ldw_mmu,
-    helper_ldl_mmu,
-    helper_ldq_mmu,
+static void * const qemu_ld_helpers[16] = {
+    [MO_UB]   = helper_ret_ldub_mmu,
+    [MO_SB]   = helper_ret_ldsb_mmu,
+    [MO_LEUW] = helper_le_lduw_mmu,
+    [MO_LESW] = helper_le_ldsw_mmu,
+    [MO_LEUL] = helper_le_ldul_mmu,
+    [MO_LEQ]  = helper_le_ldq_mmu,
+    [MO_BEUW] = helper_be_lduw_mmu,
+    [MO_BESW] = helper_be_ldsw_mmu,
+    [MO_BEUL] = helper_be_ldul_mmu,
+    [MO_BEQ]  = helper_be_ldq_mmu,
 };
 
-/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
-   uintxx_t val, int mmu_idx) */
-static const void * const qemu_st_helpers[4] = {
-    helper_stb_mmu,
-    helper_stw_mmu,
-    helper_stl_mmu,
-    helper_stq_mmu,
+static void * const qemu_st_helpers[16] = {
+    [MO_UB]   = helper_ret_stb_mmu,
+    [MO_LEUW] = helper_le_stw_mmu,
+    [MO_LEUL] = helper_le_stl_mmu,
+    [MO_LEQ]  = helper_le_stq_mmu,
+    [MO_BEUW] = helper_be_stw_mmu,
+    [MO_BEUL] = helper_be_stl_mmu,
+    [MO_BEQ]  = helper_be_stq_mmu,
 };
-#endif
 
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
-                            int opc)
+/* Helper routines for marshalling helper function arguments into
+ * the correct registers and stack.
+ * I is where we want to put this argument, and is updated and returned
+ * for the next call. ARG is the argument itself.
+ *
+ * We provide routines for arguments which are: immediate, 32 bit
+ * value in register, 16 and 8 bit values in register (which must be zero
+ * extended before use) and 64 bit value in a lo:hi register pair.
+ */
+
+static int tcg_out_call_iarg_reg(TCGContext *s, int i, TCGReg arg)
 {
-    TCGReg addr_regl, data_regl, data_regh, data_reg1, data_reg2;
-#if defined(CONFIG_SOFTMMU)
-    tcg_insn_unit *label1_ptr, *label2_ptr;
-    int arg_num;
-    int mem_index, s_bits;
-    int addr_meml;
-# if TARGET_LONG_BITS == 64
-    tcg_insn_unit *label3_ptr;
-    TCGReg addr_regh;
-    int addr_memh;
-# endif
-#endif
-    data_regl = *args++;
-    if (opc == 3)
-        data_regh = *args++;
-    else
-        data_regh = 0;
-    addr_regl = *args++;
-#if defined(CONFIG_SOFTMMU)
-# if TARGET_LONG_BITS == 64
-    addr_regh = *args++;
-#  if defined(HOST_WORDS_BIGENDIAN)
-    addr_memh = 0;
-    addr_meml = 4;
-#  else
-    addr_memh = 4;
-    addr_meml = 0;
-#  endif
-# else
-    addr_meml = 0;
-# endif
-    mem_index = *args;
-    s_bits = opc & 3;
-#endif
+    if (i < ARRAY_SIZE(tcg_target_call_iarg_regs)) {
+        tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[i], arg);
+    } else {
+        tcg_out_st(s, TCG_TYPE_REG, arg, TCG_REG_SP, 4 * i);
+    }
+    return i + 1;
+}
 
-    if (opc == 3) {
-#if defined(HOST_WORDS_BIGENDIAN)
-        data_reg1 = data_regh;
-        data_reg2 = data_regl;
-#else
-        data_reg1 = data_regl;
-        data_reg2 = data_regh;
-#endif
+static int tcg_out_call_iarg_reg8(TCGContext *s, int i, TCGReg arg)
+{
+    TCGReg tmp = TCG_TMP0;
+    if (i < ARRAY_SIZE(tcg_target_call_iarg_regs)) {
+        tmp = tcg_target_call_iarg_regs[i];
+    }
+    tcg_out_opc_imm(s, OPC_ANDI, tmp, arg, 0xff);
+    return tcg_out_call_iarg_reg(s, i, tmp);
+}
+
+static int tcg_out_call_iarg_reg16(TCGContext *s, int i, TCGReg arg)
+{
+    TCGReg tmp = TCG_TMP0;
+    if (i < ARRAY_SIZE(tcg_target_call_iarg_regs)) {
+        tmp = tcg_target_call_iarg_regs[i];
+    }
+    tcg_out_opc_imm(s, OPC_ANDI, tmp, arg, 0xffff);
+    return tcg_out_call_iarg_reg(s, i, tmp);
+}
+
+static int tcg_out_call_iarg_imm(TCGContext *s, int i, TCGArg arg)
+{
+    TCGReg tmp = TCG_TMP0;
+    if (arg == 0) {
+        tmp = TCG_REG_ZERO;
     } else {
-        data_reg1 = data_regl;
-        data_reg2 = 0;
+        if (i < ARRAY_SIZE(tcg_target_call_iarg_regs)) {
+            tmp = tcg_target_call_iarg_regs[i];
+        }
+        tcg_out_movi(s, TCG_TYPE_REG, tmp, arg);
     }
-#if defined(CONFIG_SOFTMMU)
-    tcg_out_opc_sa(s, OPC_SRL, TCG_REG_A0, addr_regl, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
+    return tcg_out_call_iarg_reg(s, i, tmp);
+}
+
+static int tcg_out_call_iarg_reg2(TCGContext *s, int i, TCGReg al, TCGReg ah)
+{
+    i = (i + 1) & ~1;
+    i = tcg_out_call_iarg_reg(s, i, (MIPS_BE ? ah : al));
+    i = tcg_out_call_iarg_reg(s, i, (MIPS_BE ? al : ah));
+    return i;
+}
+
+/* Perform the tlb comparison operation.  The complete host address is
+   placed in BASE.  Clobbers AT, T0, A0.  */
+static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
+                             TCGReg addrh, int mem_index, TCGMemOp s_bits,
+                             tcg_insn_unit *label_ptr[2], bool is_load)
+{
+    int cmp_off
+        = (is_load
+           ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
+           : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
+    int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
+
+    tcg_out_opc_sa(s, OPC_SRL, TCG_REG_A0, addrl,
+                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0,
+                    (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
     tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_A0, TCG_REG_A0, TCG_AREG0);
-    tcg_out_opc_imm(s, OPC_LW, TCG_REG_AT, TCG_REG_A0,
-                    offsetof(CPUArchState, tlb_table[mem_index][0].addr_read) + addr_meml);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_T0, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
-    tcg_out_opc_reg(s, OPC_AND, TCG_REG_T0, TCG_REG_T0, addr_regl);
-
-# if TARGET_LONG_BITS == 64
-    label3_ptr = s->code_ptr;
-    tcg_out_opc_br(s, OPC_BNE, TCG_REG_T0, TCG_REG_AT);
-    tcg_out_nop(s);
 
-    tcg_out_opc_imm(s, OPC_LW, TCG_REG_AT, TCG_REG_A0,
-                    offsetof(CPUArchState, tlb_table[mem_index][0].addr_read) + addr_memh);
+    /* Compensate for very large offsets.  */
+    if (add_off >= 0x8000) {
+        /* Most target env are smaller than 32k; none are larger than 64k.
+           Simplify the logic here merely to offset by 0x7ff0, giving us a
+           range just shy of 64k.  Check this assumption.  */
+        QEMU_BUILD_BUG_ON(offsetof(CPUArchState,
+                                   tlb_table[NB_MMU_MODES - 1][1])
+                          > 0x7ff0 + 0x7fff);
+        tcg_out_opc_imm(s, OPC_ADDIU, TCG_REG_A0, TCG_REG_A0, 0x7ff0);
+        cmp_off -= 0x7ff0;
+        add_off -= 0x7ff0;
+    }
 
-    label1_ptr = s->code_ptr;
-    tcg_out_opc_br(s, OPC_BEQ, addr_regh, TCG_REG_AT);
-    tcg_out_nop(s);
+    /* Load the tlb comparator.  */
+    tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, TCG_REG_A0, cmp_off + LO_OFF);
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_opc_imm(s, OPC_LW, base, TCG_REG_A0, cmp_off + HI_OFF);
+    }
 
-    reloc_pc16(label3_ptr, s->code_ptr);
-# else
-    label1_ptr = s->code_ptr;
-    tcg_out_opc_br(s, OPC_BEQ, TCG_REG_T0, TCG_REG_AT);
-    tcg_out_nop(s);
-# endif
-
-    /* slow path */
-    arg_num = 0;
-    tcg_out_call_iarg_reg32(s, &arg_num, TCG_AREG0);
-# if TARGET_LONG_BITS == 64
-    tcg_out_call_iarg_reg64(s, &arg_num, addr_regl, addr_regh);
-# else
-    tcg_out_call_iarg_reg32(s, &arg_num, addr_regl);
-# endif
-    tcg_out_call_iarg_imm32(s, &arg_num, mem_index);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_T9, (tcg_target_long)qemu_ld_helpers[s_bits]);
-    tcg_out_opc_reg(s, OPC_JALR, TCG_REG_RA, TCG_REG_T9, 0);
-    tcg_out_nop(s);
+    /* Mask the page bits, keeping the alignment bits to compare against.
+       In between, load the tlb addend for the fast path.  */
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1,
+                 TARGET_PAGE_MASK | ((1 << s_bits) - 1));
+    tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0, add_off);
+    tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrl);
 
-    switch(opc) {
-    case 0:
-        tcg_out_opc_imm(s, OPC_ANDI, data_reg1, TCG_REG_V0, 0xff);
-        break;
-    case 0 | 4:
-        tcg_out_ext8s(s, data_reg1, TCG_REG_V0);
-        break;
-    case 1:
-        tcg_out_opc_imm(s, OPC_ANDI, data_reg1, TCG_REG_V0, 0xffff);
-        break;
-    case 1 | 4:
-        tcg_out_ext16s(s, data_reg1, TCG_REG_V0);
-        break;
-    case 2:
-        tcg_out_mov(s, TCG_TYPE_I32, data_reg1, TCG_REG_V0);
-        break;
-    case 3:
-        tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_V1);
-        tcg_out_mov(s, TCG_TYPE_I32, data_reg1, TCG_REG_V0);
-        break;
-    default:
-        tcg_abort();
+    label_ptr[0] = s->code_ptr;
+    tcg_out_opc_br(s, OPC_BNE, TCG_TMP1, TCG_TMP0);
+
+    if (TARGET_LONG_BITS == 64) {
+        /* delay slot */
+        tcg_out_nop(s);
+
+        label_ptr[1] = s->code_ptr;
+        tcg_out_opc_br(s, OPC_BNE, addrh, base);
     }
 
-    label2_ptr = s->code_ptr;
+    /* delay slot */
+    tcg_out_opc_reg(s, OPC_ADDU, base, TCG_REG_A0, addrl);
+}
+
+static void add_qemu_ldst_label(TCGContext *s, int is_ld, TCGMemOp opc,
+                                TCGReg datalo, TCGReg datahi,
+                                TCGReg addrlo, TCGReg addrhi,
+                                int mem_index, void *raddr,
+                                tcg_insn_unit *label_ptr[2])
+{
+    TCGLabelQemuLdst *label = new_ldst_label(s);
+
+    label->is_ld = is_ld;
+    label->opc = opc;
+    label->datalo_reg = datalo;
+    label->datahi_reg = datahi;
+    label->addrlo_reg = addrlo;
+    label->addrhi_reg = addrhi;
+    label->mem_index = mem_index;
+    label->raddr = raddr;
+    label->label_ptr[0] = label_ptr[0];
+    if (TARGET_LONG_BITS == 64) {
+        label->label_ptr[1] = label_ptr[1];
+    }
+}
+
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    TCGMemOp opc = l->opc;
+    TCGReg v0;
+    int i;
+
+    /* resolve label address */
+    reloc_pc16(l->label_ptr[0], s->code_ptr);
+    if (TARGET_LONG_BITS == 64) {
+        reloc_pc16(l->label_ptr[1], s->code_ptr);
+    }
+
+    i = 1;
+    if (TARGET_LONG_BITS == 64) {
+        i = tcg_out_call_iarg_reg2(s, i, l->addrlo_reg, l->addrhi_reg);
+    } else {
+        i = tcg_out_call_iarg_reg(s, i, l->addrlo_reg);
+    }
+    i = tcg_out_call_iarg_imm(s, i, l->mem_index);
+    i = tcg_out_call_iarg_imm(s, i, (intptr_t)l->raddr);
+    tcg_out_call_int(s, qemu_ld_helpers[opc], false);
+    /* delay slot */
+    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+
+    v0 = l->datalo_reg;
+    if ((opc & MO_SIZE) == MO_64) {
+        /* We eliminated V0 from the possible output registers, so it
+           cannot be clobbered here.  So we must move V1 first.  */
+        if (MIPS_BE) {
+            tcg_out_mov(s, TCG_TYPE_I32, v0, TCG_REG_V1);
+            v0 = l->datahi_reg;
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_V1);
+        }
+    }
+
+    reloc_pc16(s->code_ptr, l->raddr);
     tcg_out_opc_br(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO);
-    tcg_out_nop(s);
+    /* delay slot */
+    tcg_out_mov(s, TCG_TYPE_REG, v0, TCG_REG_V0);
+}
 
-    /* label1: fast path */
-    reloc_pc16(label1_ptr, s->code_ptr);
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    TCGMemOp opc = l->opc;
+    TCGMemOp s_bits = opc & MO_SIZE;
+    int i;
+
+    /* resolve label address */
+    reloc_pc16(l->label_ptr[0], s->code_ptr);
+    if (TARGET_LONG_BITS == 64) {
+        reloc_pc16(l->label_ptr[1], s->code_ptr);
+    }
 
-    tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0,
-                    offsetof(CPUArchState, tlb_table[mem_index][0].addend));
-    tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_V0, TCG_REG_A0, addr_regl);
-#else
-    if (GUEST_BASE == (int16_t)GUEST_BASE) {
-        tcg_out_opc_imm(s, OPC_ADDIU, TCG_REG_V0, addr_regl, GUEST_BASE);
+    i = 1;
+    if (TARGET_LONG_BITS == 64) {
+        i = tcg_out_call_iarg_reg2(s, i, l->addrlo_reg, l->addrhi_reg);
     } else {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_V0, GUEST_BASE);
-        tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_V0, TCG_REG_V0, addr_regl);
+        i = tcg_out_call_iarg_reg(s, i, l->addrlo_reg);
+    }
+    switch (s_bits) {
+    case MO_8:
+        i = tcg_out_call_iarg_reg8(s, i, l->datalo_reg);
+        break;
+    case MO_16:
+        i = tcg_out_call_iarg_reg16(s, i, l->datalo_reg);
+        break;
+    case MO_32:
+        i = tcg_out_call_iarg_reg(s, i, l->datalo_reg);
+        break;
+    case MO_64:
+        i = tcg_out_call_iarg_reg2(s, i, l->datalo_reg, l->datahi_reg);
+        break;
+    default:
+        tcg_abort();
     }
+    i = tcg_out_call_iarg_imm(s, i, l->mem_index);
+
+    /* Tail call to the store helper.  Thus force the return address
+       computation to take place in the return address register.  */
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (intptr_t)l->raddr);
+    i = tcg_out_call_iarg_reg(s, i, TCG_REG_RA);
+    tcg_out_call_int(s, qemu_st_helpers[opc], true);
+    /* delay slot */
+    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+}
 #endif
 
-    switch(opc) {
-    case 0:
-        tcg_out_opc_imm(s, OPC_LBU, data_reg1, TCG_REG_V0, 0);
+static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
+                                   TCGReg base, TCGMemOp opc)
+{
+    switch (opc) {
+    case MO_UB:
+        tcg_out_opc_imm(s, OPC_LBU, datalo, base, 0);
         break;
-    case 0 | 4:
-        tcg_out_opc_imm(s, OPC_LB, data_reg1, TCG_REG_V0, 0);
+    case MO_SB:
+        tcg_out_opc_imm(s, OPC_LB, datalo, base, 0);
         break;
-    case 1:
-        if (TCG_NEED_BSWAP) {
-            tcg_out_opc_imm(s, OPC_LHU, TCG_REG_T0, TCG_REG_V0, 0);
-            tcg_out_bswap16(s, data_reg1, TCG_REG_T0);
-        } else {
-            tcg_out_opc_imm(s, OPC_LHU, data_reg1, TCG_REG_V0, 0);
-        }
+    case MO_UW | MO_BSWAP:
+        tcg_out_opc_imm(s, OPC_LHU, TCG_TMP1, base, 0);
+        tcg_out_bswap16(s, datalo, TCG_TMP1);
         break;
-    case 1 | 4:
-        if (TCG_NEED_BSWAP) {
-            tcg_out_opc_imm(s, OPC_LHU, TCG_REG_T0, TCG_REG_V0, 0);
-            tcg_out_bswap16s(s, data_reg1, TCG_REG_T0);
-        } else {
-            tcg_out_opc_imm(s, OPC_LH, data_reg1, TCG_REG_V0, 0);
-        }
+    case MO_UW:
+        tcg_out_opc_imm(s, OPC_LHU, datalo, base, 0);
         break;
-    case 2:
-        if (TCG_NEED_BSWAP) {
-            tcg_out_opc_imm(s, OPC_LW, TCG_REG_T0, TCG_REG_V0, 0);
-            tcg_out_bswap32(s, data_reg1, TCG_REG_T0);
-        } else {
-            tcg_out_opc_imm(s, OPC_LW, data_reg1, TCG_REG_V0, 0);
-        }
+    case MO_SW | MO_BSWAP:
+        tcg_out_opc_imm(s, OPC_LHU, TCG_TMP1, base, 0);
+        tcg_out_bswap16s(s, datalo, TCG_TMP1);
         break;
-    case 3:
-        if (TCG_NEED_BSWAP) {
-            tcg_out_opc_imm(s, OPC_LW, TCG_REG_T0, TCG_REG_V0, 4);
-            tcg_out_bswap32(s, data_reg1, TCG_REG_T0);
-            tcg_out_opc_imm(s, OPC_LW, TCG_REG_T0, TCG_REG_V0, 0);
-            tcg_out_bswap32(s, data_reg2, TCG_REG_T0);
-        } else {
-            tcg_out_opc_imm(s, OPC_LW, data_reg1, TCG_REG_V0, 0);
-            tcg_out_opc_imm(s, OPC_LW, data_reg2, TCG_REG_V0, 4);
-        }
+    case MO_SW:
+        tcg_out_opc_imm(s, OPC_LH, datalo, base, 0);
+        break;
+    case MO_UL | MO_BSWAP:
+        tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, 0);
+        tcg_out_bswap32(s, datalo, TCG_TMP1);
+        break;
+    case MO_UL:
+        tcg_out_opc_imm(s, OPC_LW, datalo, base, 0);
+        break;
+    case MO_Q | MO_BSWAP:
+        tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, HI_OFF);
+        tcg_out_bswap32(s, datalo, TCG_TMP1);
+        tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, LO_OFF);
+        tcg_out_bswap32(s, datahi, TCG_TMP1);
+        break;
+    case MO_Q:
+        tcg_out_opc_imm(s, OPC_LW, datalo, base, LO_OFF);
+        tcg_out_opc_imm(s, OPC_LW, datahi, base, HI_OFF);
         break;
     default:
         tcg_abort();
     }
-
-#if defined(CONFIG_SOFTMMU)
-    reloc_pc16(label2_ptr, s->code_ptr);
-#endif
 }
 
-static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
-                            int opc)
+static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
 {
-    TCGReg addr_regl, data_regl, data_regh, data_reg1, data_reg2;
+    TCGReg addr_regl, addr_regh __attribute__((unused));
+    TCGReg data_regl, data_regh;
+    TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
-    tcg_insn_unit *label1_ptr, *label2_ptr;
-    int arg_num;
-    int mem_index, s_bits;
-    int addr_meml;
-#endif
-#if TARGET_LONG_BITS == 64
-# if defined(CONFIG_SOFTMMU)
-    tcg_insn_unit *label3_ptr;
-    TCGReg addr_regh;
-    int addr_memh;
-# endif
+    tcg_insn_unit *label_ptr[2];
+    int mem_index;
+    TCGMemOp s_bits;
 #endif
+    /* Note that we've eliminated V0 from the output registers,
+       so we won't overwrite the base register during loading.  */
+    TCGReg base = TCG_REG_V0;
+
     data_regl = *args++;
-    if (opc == 3) {
-        data_regh = *args++;
-    } else {
-        data_regh = 0;
-    }
+    data_regh = (is_64 ? *args++ : 0);
     addr_regl = *args++;
+    addr_regh = (TARGET_LONG_BITS == 64 ? *args++ : 0);
+    opc = *args++;
+
 #if defined(CONFIG_SOFTMMU)
-# if TARGET_LONG_BITS == 64
-    addr_regh = *args++;
-#  if defined(HOST_WORDS_BIGENDIAN)
-    addr_memh = 0;
-    addr_meml = 4;
-#  else
-    addr_memh = 4;
-    addr_meml = 0;
-#  endif
-# else
-    addr_meml = 0;
-# endif
     mem_index = *args;
-    s_bits = opc;
-#endif
+    s_bits = opc & MO_SIZE;
 
-    if (opc == 3) {
-#if defined(HOST_WORDS_BIGENDIAN)
-        data_reg1 = data_regh;
-        data_reg2 = data_regl;
+    tcg_out_tlb_load(s, base, addr_regl, addr_regh, mem_index,
+                     s_bits, label_ptr, 1);
+    tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc);
+    add_qemu_ldst_label(s, 1, opc, data_regl, data_regh, addr_regl, addr_regh,
+                        mem_index, s->code_ptr, label_ptr);
 #else
-        data_reg1 = data_regl;
-        data_reg2 = data_regh;
-#endif
+    if (GUEST_BASE == 0 && data_regl != addr_regl) {
+        base = addr_regl;
+    } else if (GUEST_BASE == (int16_t)GUEST_BASE) {
+        tcg_out_opc_imm(s, OPC_ADDIU, base, addr_regl, GUEST_BASE);
     } else {
-        data_reg1 = data_regl;
-        data_reg2 = 0;
+        tcg_out_movi(s, TCG_TYPE_PTR, base, GUEST_BASE);
+        tcg_out_opc_reg(s, OPC_ADDU, base, base, addr_regl);
     }
+    tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc);
+#endif
+}
 
-#if defined(CONFIG_SOFTMMU)
-    tcg_out_opc_sa(s, OPC_SRL, TCG_REG_A0, addr_regl, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
-    tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_A0, TCG_REG_A0, TCG_AREG0);
-    tcg_out_opc_imm(s, OPC_LW, TCG_REG_AT, TCG_REG_A0,
-                    offsetof(CPUArchState, tlb_table[mem_index][0].addr_write) + addr_meml);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_T0, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
-    tcg_out_opc_reg(s, OPC_AND, TCG_REG_T0, TCG_REG_T0, addr_regl);
-
-# if TARGET_LONG_BITS == 64
-    label3_ptr = s->code_ptr;
-    tcg_out_opc_br(s, OPC_BNE, TCG_REG_T0, TCG_REG_AT);
-    tcg_out_nop(s);
-
-    tcg_out_opc_imm(s, OPC_LW, TCG_REG_AT, TCG_REG_A0,
-                    offsetof(CPUArchState, tlb_table[mem_index][0].addr_write) + addr_memh);
-
-    label1_ptr = s->code_ptr;
-    tcg_out_opc_br(s, OPC_BEQ, addr_regh, TCG_REG_AT);
-    tcg_out_nop(s);
-
-    reloc_pc16(label3_ptr, s->code_ptr);
-# else
-    label1_ptr = s->code_ptr;
-    tcg_out_opc_br(s, OPC_BEQ, TCG_REG_T0, TCG_REG_AT);
-    tcg_out_nop(s);
-# endif
+static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
+                                   TCGReg base, TCGMemOp opc)
+{
+    switch (opc) {
+    case MO_8:
+        tcg_out_opc_imm(s, OPC_SB, datalo, base, 0);
+        break;
 
-    /* slow path */
-    arg_num = 0;
-    tcg_out_call_iarg_reg32(s, &arg_num, TCG_AREG0);
-# if TARGET_LONG_BITS == 64
-    tcg_out_call_iarg_reg64(s, &arg_num, addr_regl, addr_regh);
-# else
-    tcg_out_call_iarg_reg32(s, &arg_num, addr_regl);
-# endif
-    switch(opc) {
-    case 0:
-        tcg_out_call_iarg_reg8(s, &arg_num, data_regl);
+    case MO_16 | MO_BSWAP:
+        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, datalo, 0xffff);
+        tcg_out_bswap16(s, TCG_TMP1, TCG_TMP1);
+        datalo = TCG_TMP1;
+        /* FALLTHRU */
+    case MO_16:
+        tcg_out_opc_imm(s, OPC_SH, datalo, base, 0);
         break;
-    case 1:
-        tcg_out_call_iarg_reg16(s, &arg_num, data_regl);
+
+    case MO_32 | MO_BSWAP:
+        tcg_out_bswap32(s, TCG_TMP1, datalo);
+        datalo = TCG_TMP1;
+        /* FALLTHRU */
+    case MO_32:
+        tcg_out_opc_imm(s, OPC_SW, datalo, base, 0);
         break;
-    case 2:
-        tcg_out_call_iarg_reg32(s, &arg_num, data_regl);
+
+    case MO_64 | MO_BSWAP:
+        tcg_out_bswap32(s, TCG_TMP1, datalo);
+        tcg_out_opc_imm(s, OPC_SW, TCG_TMP1, base, HI_OFF);
+        tcg_out_bswap32(s, TCG_TMP1, datahi);
+        tcg_out_opc_imm(s, OPC_SW, TCG_TMP1, base, LO_OFF);
         break;
-    case 3:
-        tcg_out_call_iarg_reg64(s, &arg_num, data_regl, data_regh);
+    case MO_64:
+        tcg_out_opc_imm(s, OPC_SW, datalo, base, LO_OFF);
+        tcg_out_opc_imm(s, OPC_SW, datahi, base, HI_OFF);
         break;
+
     default:
         tcg_abort();
     }
-    tcg_out_call_iarg_imm32(s, &arg_num, mem_index);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_T9, (tcg_target_long)qemu_st_helpers[s_bits]);
-    tcg_out_opc_reg(s, OPC_JALR, TCG_REG_RA, TCG_REG_T9, 0);
-    tcg_out_nop(s);
-
-    label2_ptr = s->code_ptr;
-    tcg_out_opc_br(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO);
-    tcg_out_nop(s);
+}
 
-    /* label1: fast path */
-    reloc_pc16(label1_ptr, s->code_ptr);
+static void tcg_out_addsub2(TCGContext *s, TCGReg rl, TCGReg rh, TCGReg al,
+                            TCGReg ah, TCGArg bl, TCGArg bh, bool cbl,
+                            bool cbh, bool is_sub)
+{
+    TCGReg th = TCG_TMP1;
+
+    /* If we have a negative constant such that negating it would
+       make the high part zero, we can (usually) eliminate one insn.  */
+    if (cbl && cbh && bh == -1 && bl != 0) {
+        bl = -bl;
+        bh = 0;
+        is_sub = !is_sub;
+    }
 
-    tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0,
-                    offsetof(CPUArchState, tlb_table[mem_index][0].addend));
-    tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_A0, TCG_REG_A0, addr_regl);
-#else
-    if (GUEST_BASE == (int16_t)GUEST_BASE) {
-        tcg_out_opc_imm(s, OPC_ADDIU, TCG_REG_A0, addr_regl, GUEST_BASE);
+    /* By operating on the high part first, we get to use the final
+       carry operation to move back from the temporary.  */
+    if (!cbh) {
+        tcg_out_opc_reg(s, (is_sub ? OPC_SUBU : OPC_ADDU), th, ah, bh);
+    } else if (bh != 0 || ah == rl) {
+        tcg_out_opc_imm(s, OPC_ADDIU, th, ah, (is_sub ? -bh : bh));
     } else {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, GUEST_BASE);
-        tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_A0, TCG_REG_A0, addr_regl);
+        th = ah;
     }
 
-#endif
-
-    switch(opc) {
-    case 0:
-        tcg_out_opc_imm(s, OPC_SB, data_reg1, TCG_REG_A0, 0);
-        break;
-    case 1:
-        if (TCG_NEED_BSWAP) {
-            tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_T0, data_reg1, 0xffff);
-            tcg_out_bswap16(s, TCG_REG_T0, TCG_REG_T0);
-            tcg_out_opc_imm(s, OPC_SH, TCG_REG_T0, TCG_REG_A0, 0);
+    /* Note that tcg optimization should eliminate the bl == 0 case.  */
+    if (is_sub) {
+        if (cbl) {
+            tcg_out_opc_imm(s, OPC_SLTIU, TCG_TMP0, al, bl);
+            tcg_out_opc_imm(s, OPC_ADDIU, rl, al, -bl);
         } else {
-            tcg_out_opc_imm(s, OPC_SH, data_reg1, TCG_REG_A0, 0);
+            tcg_out_opc_reg(s, OPC_SLTU, TCG_TMP0, al, bl);
+            tcg_out_opc_reg(s, OPC_SUBU, rl, al, bl);
         }
-        break;
-    case 2:
-        if (TCG_NEED_BSWAP) {
-            tcg_out_bswap32(s, TCG_REG_T0, data_reg1);
-            tcg_out_opc_imm(s, OPC_SW, TCG_REG_T0, TCG_REG_A0, 0);
-        } else {
-            tcg_out_opc_imm(s, OPC_SW, data_reg1, TCG_REG_A0, 0);
-        }
-        break;
-    case 3:
-        if (TCG_NEED_BSWAP) {
-            tcg_out_bswap32(s, TCG_REG_T0, data_reg2);
-            tcg_out_opc_imm(s, OPC_SW, TCG_REG_T0, TCG_REG_A0, 0);
-            tcg_out_bswap32(s, TCG_REG_T0, data_reg1);
-            tcg_out_opc_imm(s, OPC_SW, TCG_REG_T0, TCG_REG_A0, 4);
+        tcg_out_opc_reg(s, OPC_SUBU, rh, th, TCG_TMP0);
+    } else {
+        if (cbl) {
+            tcg_out_opc_imm(s, OPC_ADDIU, rl, al, bl);
+            tcg_out_opc_imm(s, OPC_SLTIU, TCG_TMP0, rl, bl);
         } else {
-            tcg_out_opc_imm(s, OPC_SW, data_reg1, TCG_REG_A0, 0);
-            tcg_out_opc_imm(s, OPC_SW, data_reg2, TCG_REG_A0, 4);
+            tcg_out_opc_reg(s, OPC_ADDU, rl, al, bl);
+            tcg_out_opc_reg(s, OPC_SLTU, TCG_TMP0, rl, (rl == bl ? al : bl));
         }
-        break;
-    default:
-        tcg_abort();
+        tcg_out_opc_reg(s, OPC_ADDU, rh, th, TCG_TMP0);
     }
+}
 
+static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
+{
+    TCGReg addr_regl, addr_regh __attribute__((unused));
+    TCGReg data_regl, data_regh, base;
+    TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
-    reloc_pc16(label2_ptr, s->code_ptr);
+    tcg_insn_unit *label_ptr[2];
+    int mem_index;
+    TCGMemOp s_bits;
 #endif
-}
 
-static void tcg_out_call(TCGContext *s, tcg_insn_unit *target)
-{
-    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T9, (intptr_t)target);
-    tcg_out_opc_reg(s, OPC_JALR, TCG_REG_RA, TCG_REG_T9, 0);
-    tcg_out_nop(s);
+    data_regl = *args++;
+    data_regh = (is_64 ? *args++ : 0);
+    addr_regl = *args++;
+    addr_regh = (TARGET_LONG_BITS == 64 ? *args++ : 0);
+    opc = *args++;
+
+#if defined(CONFIG_SOFTMMU)
+    mem_index = *args;
+    s_bits = opc & 3;
+
+    /* Note that we eliminated the helper's address argument,
+       so we can reuse that for the base.  */
+    base = (TARGET_LONG_BITS == 32 ? TCG_REG_A1 : TCG_REG_A2);
+    tcg_out_tlb_load(s, base, addr_regl, addr_regh, mem_index,
+                     s_bits, label_ptr, 1);
+    tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
+    add_qemu_ldst_label(s, 0, opc, data_regl, data_regh, addr_regl, addr_regh,
+                        mem_index, s->code_ptr, label_ptr);
+#else
+    if (GUEST_BASE == 0) {
+        base = addr_regl;
+    } else {
+        base = TCG_REG_A0;
+        if (GUEST_BASE == (int16_t)GUEST_BASE) {
+            tcg_out_opc_imm(s, OPC_ADDIU, base, addr_regl, GUEST_BASE);
+        } else {
+            tcg_out_movi(s, TCG_TYPE_PTR, base, GUEST_BASE);
+            tcg_out_opc_reg(s, OPC_ADDU, base, base, addr_regl);
+        }
+    }
+    tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
+#endif
 }
 
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {
-    switch(opc) {
+    MIPSInsn i1, i2;
+    TCGArg a0, a1, a2;
+    int c2;
+
+    a0 = args[0];
+    a1 = args[1];
+    a2 = args[2];
+    c2 = const_args[2];
+
+    switch (opc) {
     case INDEX_op_exit_tb:
-        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_V0, args[0]);
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_AT, (uintptr_t)tb_ret_addr);
-        tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_AT, 0);
-        tcg_out_nop(s);
+        {
+            TCGReg b0 = TCG_REG_ZERO;
+
+            if (a0 & ~0xffff) {
+                tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_V0, a0 & ~0xffff);
+                b0 = TCG_REG_V0;
+            }
+            if (!tcg_out_opc_jmp(s, OPC_J, tb_ret_addr)) {
+                tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0,
+                             (uintptr_t)tb_ret_addr);
+                tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
+            }
+            tcg_out_opc_imm(s, OPC_ORI, TCG_REG_V0, b0, a0 & 0xffff);
+        }
         break;
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
             /* direct jump method */
-            tcg_abort();
+            s->tb_jmp_offset[a0] = tcg_current_code_size(s);
+            /* Avoid clobbering the address during retranslation.  */
+            tcg_out32(s, OPC_J | (*(uint32_t *)s->code_ptr & 0x3ffffff));
         } else {
             /* indirect jump method */
-            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_AT,
-                         (uintptr_t)(s->tb_next + args[0]));
-            tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_AT, TCG_REG_AT, 0);
-            tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_AT, 0);
+            tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_REG_ZERO,
+                       (uintptr_t)(s->tb_next + a0));
+            tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
         }
         tcg_out_nop(s);
-        s->tb_next_offset[args[0]] = tcg_current_code_size(s);
+        s->tb_next_offset[a0] = tcg_current_code_size(s);
         break;
     case INDEX_op_br:
-        tcg_out_brcond(s, TCG_COND_EQ, TCG_REG_ZERO, TCG_REG_ZERO, args[0]);
+        tcg_out_brcond(s, TCG_COND_EQ, TCG_REG_ZERO, TCG_REG_ZERO, a0);
         break;
 
     case INDEX_op_ld8u_i32:
-        tcg_out_ldst(s, OPC_LBU, args[0], args[1], args[2]);
-        break;
+        i1 = OPC_LBU;
+        goto do_ldst;
     case INDEX_op_ld8s_i32:
-        tcg_out_ldst(s, OPC_LB, args[0], args[1], args[2]);
-        break;
+        i1 = OPC_LB;
+        goto do_ldst;
     case INDEX_op_ld16u_i32:
-        tcg_out_ldst(s, OPC_LHU, args[0], args[1], args[2]);
-        break;
+        i1 = OPC_LHU;
+        goto do_ldst;
     case INDEX_op_ld16s_i32:
-        tcg_out_ldst(s, OPC_LH, args[0], args[1], args[2]);
-        break;
+        i1 = OPC_LH;
+        goto do_ldst;
     case INDEX_op_ld_i32:
-        tcg_out_ldst(s, OPC_LW, args[0], args[1], args[2]);
-        break;
+        i1 = OPC_LW;
+        goto do_ldst;
     case INDEX_op_st8_i32:
-        tcg_out_ldst(s, OPC_SB, args[0], args[1], args[2]);
-        break;
+        i1 = OPC_SB;
+        goto do_ldst;
     case INDEX_op_st16_i32:
-        tcg_out_ldst(s, OPC_SH, args[0], args[1], args[2]);
-        break;
+        i1 = OPC_SH;
+        goto do_ldst;
     case INDEX_op_st_i32:
-        tcg_out_ldst(s, OPC_SW, args[0], args[1], args[2]);
+        i1 = OPC_SW;
+    do_ldst:
+        tcg_out_ldst(s, i1, a0, a1, a2);
         break;
 
     case INDEX_op_add_i32:
-        if (const_args[2]) {
-            tcg_out_opc_imm(s, OPC_ADDIU, args[0], args[1], args[2]);
-        } else {
-            tcg_out_opc_reg(s, OPC_ADDU, args[0], args[1], args[2]);
-        }
-        break;
-    case INDEX_op_add2_i32:
-        if (const_args[4]) {
-            tcg_out_opc_imm(s, OPC_ADDIU, TCG_REG_AT, args[2], args[4]);
-        } else {
-            tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_AT, args[2], args[4]);
-        }
-        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_T0, TCG_REG_AT, args[2]);
-        if (const_args[5]) {
-            tcg_out_opc_imm(s, OPC_ADDIU, args[1], args[3], args[5]);
-        } else {
-             tcg_out_opc_reg(s, OPC_ADDU, args[1], args[3], args[5]);
+        i1 = OPC_ADDU, i2 = OPC_ADDIU;
+        goto do_binary;
+    case INDEX_op_or_i32:
+        i1 = OPC_OR, i2 = OPC_ORI;
+        goto do_binary;
+    case INDEX_op_xor_i32:
+        i1 = OPC_XOR, i2 = OPC_XORI;
+    do_binary:
+        if (c2) {
+            tcg_out_opc_imm(s, i2, a0, a1, a2);
+            break;
         }
-        tcg_out_opc_reg(s, OPC_ADDU, args[1], args[1], TCG_REG_T0);
-        tcg_out_mov(s, TCG_TYPE_I32, args[0], TCG_REG_AT);
+    do_binaryv:
+        tcg_out_opc_reg(s, i1, a0, a1, a2);
         break;
+
     case INDEX_op_sub_i32:
-        if (const_args[2]) {
-            tcg_out_opc_imm(s, OPC_ADDIU, args[0], args[1], -args[2]);
-        } else {
-            tcg_out_opc_reg(s, OPC_SUBU, args[0], args[1], args[2]);
+        if (c2) {
+            tcg_out_opc_imm(s, OPC_ADDIU, a0, a1, -a2);
+            break;
         }
-        break;
-    case INDEX_op_sub2_i32:
-        if (const_args[4]) {
-            tcg_out_opc_imm(s, OPC_ADDIU, TCG_REG_AT, args[2], -args[4]);
-        } else {
-            tcg_out_opc_reg(s, OPC_SUBU, TCG_REG_AT, args[2], args[4]);
-        }
-        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_T0, args[2], TCG_REG_AT);
-        if (const_args[5]) {
-            tcg_out_opc_imm(s, OPC_ADDIU, args[1], args[3], -args[5]);
-        } else {
-             tcg_out_opc_reg(s, OPC_SUBU, args[1], args[3], args[5]);
+        i1 = OPC_SUBU;
+        goto do_binary;
+    case INDEX_op_and_i32:
+        if (c2 && a2 != (uint16_t)a2) {
+            int msb = ctz32(~a2) - 1;
+            assert(use_mips32r2_instructions);
+            assert(is_p2m1(a2));
+            tcg_out_opc_bf(s, OPC_EXT, a0, a1, msb, 0);
+            break;
         }
-        tcg_out_opc_reg(s, OPC_SUBU, args[1], args[1], TCG_REG_T0);
-        tcg_out_mov(s, TCG_TYPE_I32, args[0], TCG_REG_AT);
-        break;
+        i1 = OPC_AND, i2 = OPC_ANDI;
+        goto do_binary;
+    case INDEX_op_nor_i32:
+        i1 = OPC_NOR;
+        goto do_binaryv;
+
     case INDEX_op_mul_i32:
         if (use_mips32_instructions) {
-            tcg_out_opc_reg(s, OPC_MUL, args[0], args[1], args[2]);
-        } else {
-            tcg_out_opc_reg(s, OPC_MULT, 0, args[1], args[2]);
-            tcg_out_opc_reg(s, OPC_MFLO, args[0], 0, 0);
+            tcg_out_opc_reg(s, OPC_MUL, a0, a1, a2);
+            break;
         }
-        break;
-    case INDEX_op_muls2_i32:
-        tcg_out_opc_reg(s, OPC_MULT, 0, args[2], args[3]);
-        tcg_out_opc_reg(s, OPC_MFLO, args[0], 0, 0);
-        tcg_out_opc_reg(s, OPC_MFHI, args[1], 0, 0);
-        break;
-    case INDEX_op_mulu2_i32:
-        tcg_out_opc_reg(s, OPC_MULTU, 0, args[2], args[3]);
-        tcg_out_opc_reg(s, OPC_MFLO, args[0], 0, 0);
-        tcg_out_opc_reg(s, OPC_MFHI, args[1], 0, 0);
-        break;
+        i1 = OPC_MULT, i2 = OPC_MFLO;
+        goto do_hilo1;
     case INDEX_op_mulsh_i32:
-        tcg_out_opc_reg(s, OPC_MULT, 0, args[1], args[2]);
-        tcg_out_opc_reg(s, OPC_MFHI, args[0], 0, 0);
-        break;
+        i1 = OPC_MULT, i2 = OPC_MFHI;
+        goto do_hilo1;
     case INDEX_op_muluh_i32:
-        tcg_out_opc_reg(s, OPC_MULTU, 0, args[1], args[2]);
-        tcg_out_opc_reg(s, OPC_MFHI, args[0], 0, 0);
-        break;
+        i1 = OPC_MULTU, i2 = OPC_MFHI;
+        goto do_hilo1;
     case INDEX_op_div_i32:
-        tcg_out_opc_reg(s, OPC_DIV, 0, args[1], args[2]);
-        tcg_out_opc_reg(s, OPC_MFLO, args[0], 0, 0);
-        break;
+        i1 = OPC_DIV, i2 = OPC_MFLO;
+        goto do_hilo1;
     case INDEX_op_divu_i32:
-        tcg_out_opc_reg(s, OPC_DIVU, 0, args[1], args[2]);
-        tcg_out_opc_reg(s, OPC_MFLO, args[0], 0, 0);
-        break;
+        i1 = OPC_DIVU, i2 = OPC_MFLO;
+        goto do_hilo1;
     case INDEX_op_rem_i32:
-        tcg_out_opc_reg(s, OPC_DIV, 0, args[1], args[2]);
-        tcg_out_opc_reg(s, OPC_MFHI, args[0], 0, 0);
-        break;
+        i1 = OPC_DIV, i2 = OPC_MFHI;
+        goto do_hilo1;
     case INDEX_op_remu_i32:
-        tcg_out_opc_reg(s, OPC_DIVU, 0, args[1], args[2]);
-        tcg_out_opc_reg(s, OPC_MFHI, args[0], 0, 0);
+        i1 = OPC_DIVU, i2 = OPC_MFHI;
+    do_hilo1:
+        tcg_out_opc_reg(s, i1, 0, a1, a2);
+        tcg_out_opc_reg(s, i2, a0, 0, 0);
         break;
 
-    case INDEX_op_and_i32:
-        if (const_args[2]) {
-            tcg_out_opc_imm(s, OPC_ANDI, args[0], args[1], args[2]);
-        } else {
-            tcg_out_opc_reg(s, OPC_AND, args[0], args[1], args[2]);
-        }
-        break;
-    case INDEX_op_or_i32:
-        if (const_args[2]) {
-            tcg_out_opc_imm(s, OPC_ORI, args[0], args[1], args[2]);
-        } else {
-            tcg_out_opc_reg(s, OPC_OR, args[0], args[1], args[2]);
-        }
-        break;
-    case INDEX_op_nor_i32:
-        tcg_out_opc_reg(s, OPC_NOR, args[0], args[1], args[2]);
+    case INDEX_op_muls2_i32:
+        i1 = OPC_MULT;
+        goto do_hilo2;
+    case INDEX_op_mulu2_i32:
+        i1 = OPC_MULTU;
+    do_hilo2:
+        tcg_out_opc_reg(s, i1, 0, a2, args[3]);
+        tcg_out_opc_reg(s, OPC_MFLO, a0, 0, 0);
+        tcg_out_opc_reg(s, OPC_MFHI, a1, 0, 0);
         break;
+
     case INDEX_op_not_i32:
-        tcg_out_opc_reg(s, OPC_NOR, args[0], TCG_REG_ZERO, args[1]);
-        break;
-    case INDEX_op_xor_i32:
-        if (const_args[2]) {
-            tcg_out_opc_imm(s, OPC_XORI, args[0], args[1], args[2]);
-        } else {
-            tcg_out_opc_reg(s, OPC_XOR, args[0], args[1], args[2]);
-        }
+        i1 = OPC_NOR;
+        goto do_unary;
+    case INDEX_op_bswap16_i32:
+        i1 = OPC_WSBH;
+        goto do_unary;
+    case INDEX_op_ext8s_i32:
+        i1 = OPC_SEB;
+        goto do_unary;
+    case INDEX_op_ext16s_i32:
+        i1 = OPC_SEH;
+    do_unary:
+        tcg_out_opc_reg(s, i1, a0, TCG_REG_ZERO, a1);
         break;
 
     case INDEX_op_sar_i32:
-        if (const_args[2]) {
-            tcg_out_opc_sa(s, OPC_SRA, args[0], args[1], args[2]);
-        } else {
-            tcg_out_opc_reg(s, OPC_SRAV, args[0], args[2], args[1]);
-        }
-        break;
+        i1 = OPC_SRAV, i2 = OPC_SRA;
+        goto do_shift;
     case INDEX_op_shl_i32:
-        if (const_args[2]) {
-            tcg_out_opc_sa(s, OPC_SLL, args[0], args[1], args[2]);
-        } else {
-            tcg_out_opc_reg(s, OPC_SLLV, args[0], args[2], args[1]);
-        }
-        break;
+        i1 = OPC_SLLV, i2 = OPC_SLL;
+        goto do_shift;
     case INDEX_op_shr_i32:
-        if (const_args[2]) {
-            tcg_out_opc_sa(s, OPC_SRL, args[0], args[1], args[2]);
+        i1 = OPC_SRLV, i2 = OPC_SRL;
+        goto do_shift;
+    case INDEX_op_rotr_i32:
+        i1 = OPC_ROTRV, i2 = OPC_ROTR;
+    do_shift:
+        if (c2) {
+            tcg_out_opc_sa(s, i2, a0, a1, a2);
         } else {
-            tcg_out_opc_reg(s, OPC_SRLV, args[0], args[2], args[1]);
+            tcg_out_opc_reg(s, i1, a0, a2, a1);
         }
         break;
     case INDEX_op_rotl_i32:
-        if (const_args[2]) {
-            tcg_out_opc_sa(s, OPC_ROTR, args[0], args[1], 0x20 - args[2]);
-        } else {
-            tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_AT, 32);
-            tcg_out_opc_reg(s, OPC_SUBU, TCG_REG_AT, TCG_REG_AT, args[2]);
-            tcg_out_opc_reg(s, OPC_ROTRV, args[0], TCG_REG_AT, args[1]);
-        }
-        break;
-    case INDEX_op_rotr_i32:
-        if (const_args[2]) {
-            tcg_out_opc_sa(s, OPC_ROTR, args[0], args[1], args[2]);
+        if (c2) {
+            tcg_out_opc_sa(s, OPC_ROTR, a0, a1, 32 - a2);
         } else {
-            tcg_out_opc_reg(s, OPC_ROTRV, args[0], args[2], args[1]);
+            tcg_out_opc_reg(s, OPC_SUBU, TCG_TMP0, TCG_REG_ZERO, a2);
+            tcg_out_opc_reg(s, OPC_ROTRV, a0, TCG_TMP0, a1);
         }
         break;
 
-    case INDEX_op_bswap16_i32:
-        tcg_out_opc_reg(s, OPC_WSBH, args[0], 0, args[1]);
-        break;
     case INDEX_op_bswap32_i32:
-        tcg_out_opc_reg(s, OPC_WSBH, args[0], 0, args[1]);
-        tcg_out_opc_sa(s, OPC_ROTR, args[0], args[0], 16);
-        break;
-
-    case INDEX_op_ext8s_i32:
-        tcg_out_opc_reg(s, OPC_SEB, args[0], 0, args[1]);
-        break;
-    case INDEX_op_ext16s_i32:
-        tcg_out_opc_reg(s, OPC_SEH, args[0], 0, args[1]);
+        tcg_out_opc_reg(s, OPC_WSBH, a0, 0, a1);
+        tcg_out_opc_sa(s, OPC_ROTR, a0, a0, 16);
         break;
 
     case INDEX_op_deposit_i32:
-        tcg_out_opc_imm(s, OPC_INS, args[0], args[2],
-                        ((args[3] + args[4] - 1) << 11) | (args[3] << 6));
+        tcg_out_opc_bf(s, OPC_INS, a0, a2, args[3] + args[4] - 1, args[3]);
         break;
 
     case INDEX_op_brcond_i32:
-        tcg_out_brcond(s, args[2], args[0], args[1], args[3]);
+        tcg_out_brcond(s, a2, a0, a1, args[3]);
         break;
     case INDEX_op_brcond2_i32:
-        tcg_out_brcond2(s, args[4], args[0], args[1], args[2], args[3], args[5]);
+        tcg_out_brcond2(s, args[4], a0, a1, a2, args[3], args[5]);
         break;
 
     case INDEX_op_movcond_i32:
-        tcg_out_movcond(s, args[5], args[0], args[1], args[2], args[3]);
+        tcg_out_movcond(s, args[5], a0, a1, a2, args[3]);
         break;
 
     case INDEX_op_setcond_i32:
-        tcg_out_setcond(s, args[3], args[0], args[1], args[2]);
+        tcg_out_setcond(s, args[3], a0, a1, a2);
         break;
     case INDEX_op_setcond2_i32:
-        tcg_out_setcond2(s, args[5], args[0], args[1], args[2], args[3], args[4]);
+        tcg_out_setcond2(s, args[5], a0, a1, a2, args[3], args[4]);
         break;
 
-    case INDEX_op_qemu_ld8u:
-        tcg_out_qemu_ld(s, args, 0);
-        break;
-    case INDEX_op_qemu_ld8s:
-        tcg_out_qemu_ld(s, args, 0 | 4);
-        break;
-    case INDEX_op_qemu_ld16u:
-        tcg_out_qemu_ld(s, args, 1);
+    case INDEX_op_qemu_ld_i32:
+        tcg_out_qemu_ld(s, args, false);
         break;
-    case INDEX_op_qemu_ld16s:
-        tcg_out_qemu_ld(s, args, 1 | 4);
+    case INDEX_op_qemu_ld_i64:
+        tcg_out_qemu_ld(s, args, true);
         break;
-    case INDEX_op_qemu_ld32:
-        tcg_out_qemu_ld(s, args, 2);
+    case INDEX_op_qemu_st_i32:
+        tcg_out_qemu_st(s, args, false);
         break;
-    case INDEX_op_qemu_ld64:
-        tcg_out_qemu_ld(s, args, 3);
+    case INDEX_op_qemu_st_i64:
+        tcg_out_qemu_st(s, args, true);
         break;
-    case INDEX_op_qemu_st8:
-        tcg_out_qemu_st(s, args, 0);
-        break;
-    case INDEX_op_qemu_st16:
-        tcg_out_qemu_st(s, args, 1);
-        break;
-    case INDEX_op_qemu_st32:
-        tcg_out_qemu_st(s, args, 2);
+
+    case INDEX_op_add2_i32:
+        tcg_out_addsub2(s, a0, a1, a2, args[3], args[4], args[5],
+                        const_args[4], const_args[5], false);
         break;
-    case INDEX_op_qemu_st64:
-        tcg_out_qemu_st(s, args, 3);
+    case INDEX_op_sub2_i32:
+        tcg_out_addsub2(s, a0, a1, a2, args[3], args[4], args[5],
+                        const_args[4], const_args[5], true);
         break;
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
@@ -1561,9 +1598,9 @@ static const TCGTargetOpDef mips_op_defs[] = {
     { INDEX_op_divu_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_rem_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_remu_i32, { "r", "rZ", "rZ" } },
-    { INDEX_op_sub_i32, { "r", "rZ", "rJ" } },
+    { INDEX_op_sub_i32, { "r", "rZ", "rN" } },
 
-    { INDEX_op_and_i32, { "r", "rZ", "rI" } },
+    { INDEX_op_and_i32, { "r", "rZ", "rIK" } },
     { INDEX_op_nor_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_not_i32, { "r", "rZ" } },
     { INDEX_op_or_i32, { "r", "rZ", "rIZ" } },
@@ -1588,34 +1625,20 @@ static const TCGTargetOpDef mips_op_defs[] = {
     { INDEX_op_setcond_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_setcond2_i32, { "r", "rZ", "rZ", "rZ", "rZ" } },
 
-    { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rJ", "rJ" } },
-    { INDEX_op_sub2_i32, { "r", "r", "rZ", "rZ", "rJ", "rJ" } },
+    { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rN", "rN" } },
+    { INDEX_op_sub2_i32, { "r", "r", "rZ", "rZ", "rN", "rN" } },
     { INDEX_op_brcond2_i32, { "rZ", "rZ", "rZ", "rZ" } },
 
 #if TARGET_LONG_BITS == 32
-    { INDEX_op_qemu_ld8u, { "L", "lZ" } },
-    { INDEX_op_qemu_ld8s, { "L", "lZ" } },
-    { INDEX_op_qemu_ld16u, { "L", "lZ" } },
-    { INDEX_op_qemu_ld16s, { "L", "lZ" } },
-    { INDEX_op_qemu_ld32, { "L", "lZ" } },
-    { INDEX_op_qemu_ld64, { "L", "L", "lZ" } },
-
-    { INDEX_op_qemu_st8, { "SZ", "SZ" } },
-    { INDEX_op_qemu_st16, { "SZ", "SZ" } },
-    { INDEX_op_qemu_st32, { "SZ", "SZ" } },
-    { INDEX_op_qemu_st64, { "SZ", "SZ", "SZ" } },
+    { INDEX_op_qemu_ld_i32, { "L", "lZ" } },
+    { INDEX_op_qemu_st_i32, { "SZ", "SZ" } },
+    { INDEX_op_qemu_ld_i64, { "L", "L", "lZ" } },
+    { INDEX_op_qemu_st_i64, { "SZ", "SZ", "SZ" } },
 #else
-    { INDEX_op_qemu_ld8u, { "L", "lZ", "lZ" } },
-    { INDEX_op_qemu_ld8s, { "L", "lZ", "lZ" } },
-    { INDEX_op_qemu_ld16u, { "L", "lZ", "lZ" } },
-    { INDEX_op_qemu_ld16s, { "L", "lZ", "lZ" } },
-    { INDEX_op_qemu_ld32, { "L", "lZ", "lZ" } },
-    { INDEX_op_qemu_ld64, { "L", "L", "lZ", "lZ" } },
-
-    { INDEX_op_qemu_st8, { "SZ", "SZ", "SZ" } },
-    { INDEX_op_qemu_st16, { "SZ", "SZ", "SZ" } },
-    { INDEX_op_qemu_st32, { "SZ", "SZ", "SZ" } },
-    { INDEX_op_qemu_st64, { "SZ", "SZ", "SZ", "SZ" } },
+    { INDEX_op_qemu_ld_i32, { "L", "lZ", "lZ" } },
+    { INDEX_op_qemu_st_i32, { "SZ", "SZ", "SZ" } },
+    { INDEX_op_qemu_ld_i64, { "L", "L", "lZ", "lZ" } },
+    { INDEX_op_qemu_st_i64, { "SZ", "SZ", "SZ", "SZ" } },
 #endif
     { -1 },
 };
@@ -1629,7 +1652,7 @@ static int tcg_target_callee_save_regs[] = {
     TCG_REG_S5,
     TCG_REG_S6,
     TCG_REG_S7,
-    TCG_REG_FP,
+    TCG_REG_S8,
     TCG_REG_RA,       /* should be last for ABI compliance */
 };
 
@@ -1761,6 +1784,7 @@ static void tcg_target_init(TCGContext *s)
                    (1 << TCG_REG_A1) |
                    (1 << TCG_REG_A2) |
                    (1 << TCG_REG_A3) |
+                   (1 << TCG_REG_T0) |
                    (1 << TCG_REG_T1) |
                    (1 << TCG_REG_T2) |
                    (1 << TCG_REG_T3) |
@@ -1775,11 +1799,18 @@ static void tcg_target_init(TCGContext *s)
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_ZERO); /* zero register */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_K0);   /* kernel use only */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_K1);   /* kernel use only */
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_AT);   /* internal use */
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_T0);   /* internal use */
+    tcg_regset_set_reg(s->reserved_regs, TCG_TMP0);     /* internal use */
+    tcg_regset_set_reg(s->reserved_regs, TCG_TMP1);     /* internal use */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_RA);   /* return address */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);   /* stack pointer */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_GP);   /* global pointer */
 
     tcg_add_target_add_op_defs(mips_op_defs);
 }
+
+void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
+{
+    uint32_t *ptr = (uint32_t *)jmp_addr;
+    *ptr = deposit32(*ptr, 0, 26, addr >> 2);
+    flush_icache_range(jmp_addr, jmp_addr + 4);
+}
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index c6d2267d77..b5face8b4d 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -60,16 +60,14 @@ typedef enum {
     TCG_REG_K1,
     TCG_REG_GP,
     TCG_REG_SP,
-    TCG_REG_FP,
+    TCG_REG_S8,
     TCG_REG_RA,
-} TCGReg;
 
-#define TCG_CT_CONST_ZERO 0x100
-#define TCG_CT_CONST_U16  0x200
-#define TCG_CT_CONST_S16  0x400
+    TCG_REG_CALL_STACK = TCG_REG_SP,
+    TCG_AREG0 = TCG_REG_S0,
+} TCGReg;
 
 /* used for function call generation */
-#define TCG_REG_CALL_STACK TCG_REG_SP
 #define TCG_TARGET_STACK_ALIGN 8
 #define TCG_TARGET_CALL_STACK_OFFSET 16
 #define TCG_TARGET_CALL_ALIGN_ARGS 1
@@ -120,15 +118,13 @@ extern bool use_mips32r2_instructions;
 #define TCG_TARGET_HAS_ext16s_i32       use_mips32r2_instructions
 #define TCG_TARGET_HAS_rot_i32          use_mips32r2_instructions
 
-#define TCG_TARGET_HAS_new_ldst         0
+#define TCG_TARGET_HAS_new_ldst         1
 
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_neg_i32          0 /* sub  rd, zero, rt   */
 #define TCG_TARGET_HAS_ext8u_i32        0 /* andi rt, rs, 0xff   */
 #define TCG_TARGET_HAS_ext16u_i32       0 /* andi rt, rs, 0xffff */
 
-#define TCG_AREG0 TCG_REG_S0
-
 #ifdef __OpenBSD__
 #include <machine/sysarch.h>
 #else
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 3a504a1961..77da2f942a 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -83,6 +83,20 @@ static int op_bits(TCGOpcode op)
     return def->flags & TCG_OPF_64BIT ? 64 : 32;
 }
 
+static TCGOpcode op_to_mov(TCGOpcode op)
+{
+    switch (op_bits(op)) {
+    case 32:
+        return INDEX_op_mov_i32;
+    case 64:
+        return INDEX_op_mov_i64;
+    default:
+        fprintf(stderr, "op_to_mov: unexpected return value of "
+                "function op_bits.\n");
+        tcg_abort();
+    }
+}
+
 static TCGOpcode op_to_movi(TCGOpcode op)
 {
     switch (op_bits(op)) {
@@ -148,11 +162,22 @@ static bool temps_are_copies(TCGArg arg1, TCGArg arg2)
     return false;
 }
 
-static void tcg_opt_gen_mov(TCGContext *s, TCGArg *gen_args,
-                            TCGArg dst, TCGArg src)
+static void tcg_opt_gen_mov(TCGContext *s, int op_index, TCGArg *gen_args,
+                            TCGOpcode old_op, TCGArg dst, TCGArg src)
 {
+    TCGOpcode new_op = op_to_mov(old_op);
+    tcg_target_ulong mask;
+
+    s->gen_opc_buf[op_index] = new_op;
+
     reset_temp(dst);
-    temps[dst].mask = temps[src].mask;
+    mask = temps[src].mask;
+    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
+        /* High bits of the destination are now garbage.  */
+        mask |= ~0xffffffffull;
+    }
+    temps[dst].mask = mask;
+
     assert(temps[src].state != TCG_TEMP_CONST);
 
     if (s->temps[src].type == s->temps[dst].type) {
@@ -172,30 +197,28 @@ static void tcg_opt_gen_mov(TCGContext *s, TCGArg *gen_args,
     gen_args[1] = src;
 }
 
-static void tcg_opt_gen_movi(TCGArg *gen_args, TCGArg dst, TCGArg val)
+static void tcg_opt_gen_movi(TCGContext *s, int op_index, TCGArg *gen_args,
+                             TCGOpcode old_op, TCGArg dst, TCGArg val)
 {
+    TCGOpcode new_op = op_to_movi(old_op);
+    tcg_target_ulong mask;
+
+    s->gen_opc_buf[op_index] = new_op;
+
     reset_temp(dst);
     temps[dst].state = TCG_TEMP_CONST;
     temps[dst].val = val;
-    temps[dst].mask = val;
+    mask = val;
+    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_movi_i32) {
+        /* High bits of the destination are now garbage.  */
+        mask |= ~0xffffffffull;
+    }
+    temps[dst].mask = mask;
+
     gen_args[0] = dst;
     gen_args[1] = val;
 }
 
-static TCGOpcode op_to_mov(TCGOpcode op)
-{
-    switch (op_bits(op)) {
-    case 32:
-        return INDEX_op_mov_i32;
-    case 64:
-        return INDEX_op_mov_i64;
-    default:
-        fprintf(stderr, "op_to_mov: unexpected return value of "
-                "function op_bits.\n");
-        tcg_abort();
-    }
-}
-
 static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
 {
     uint64_t l64, h64;
@@ -530,7 +553,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
     for (op_index = 0; op_index < nb_ops; op_index++) {
         TCGOpcode op = s->gen_opc_buf[op_index];
         const TCGOpDef *def = &tcg_op_defs[op];
-        tcg_target_ulong mask, affected;
+        tcg_target_ulong mask, partmask, affected;
         int nb_oargs, nb_iargs, nb_args, i;
         TCGArg tmp;
 
@@ -619,8 +642,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(rotr):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[1]].val == 0) {
-                s->gen_opc_buf[op_index] = op_to_movi(op);
-                tcg_opt_gen_movi(gen_args, args[0], 0);
+                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
                 args += 3;
                 gen_args += 2;
                 continue;
@@ -749,8 +771,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             if (temps_are_copies(args[0], args[1])) {
                 s->gen_opc_buf[op_index] = INDEX_op_nop;
             } else {
-                s->gen_opc_buf[op_index] = op_to_mov(op);
-                tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
+                tcg_opt_gen_mov(s, op_index, gen_args, op, args[0], args[1]);
                 gen_args += 2;
             }
             args += 3;
@@ -859,6 +880,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             break;
 
         CASE_OP_32_64(setcond):
+        case INDEX_op_setcond2_i32:
             mask = 1;
             break;
 
@@ -894,16 +916,20 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             break;
         }
 
-        /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit
-           results */
+        /* 32-bit ops (non 64-bit ops and non load/store ops) generate
+           32-bit results.  For the result-is-zero test below, we can
+           ignore high bits, but for further optimizations we need to
+           record that the high bits contain garbage.  */
+        partmask = mask;
         if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) {
-            mask &= 0xffffffffu;
+            mask |= ~(tcg_target_ulong)0xffffffffu;
+            partmask &= 0xffffffffu;
+            affected &= 0xffffffffu;
         }
 
-        if (mask == 0) {
+        if (partmask == 0) {
             assert(nb_oargs == 1);
-            s->gen_opc_buf[op_index] = op_to_movi(op);
-            tcg_opt_gen_movi(gen_args, args[0], 0);
+            tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
             args += nb_args;
             gen_args += 2;
             continue;
@@ -913,12 +939,11 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             if (temps_are_copies(args[0], args[1])) {
                 s->gen_opc_buf[op_index] = INDEX_op_nop;
             } else if (temps[args[1]].state != TCG_TEMP_CONST) {
-                s->gen_opc_buf[op_index] = op_to_mov(op);
-                tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
+                tcg_opt_gen_mov(s, op_index, gen_args, op, args[0], args[1]);
                 gen_args += 2;
             } else {
-                s->gen_opc_buf[op_index] = op_to_movi(op);
-                tcg_opt_gen_movi(gen_args, args[0], temps[args[1]].val);
+                tcg_opt_gen_movi(s, op_index, gen_args, op,
+                                 args[0], temps[args[1]].val);
                 gen_args += 2;
             }
             args += nb_args;
@@ -933,8 +958,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(mulsh):
             if ((temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0)) {
-                s->gen_opc_buf[op_index] = op_to_movi(op);
-                tcg_opt_gen_movi(gen_args, args[0], 0);
+                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
                 args += 3;
                 gen_args += 2;
                 continue;
@@ -952,8 +976,8 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                 if (temps_are_copies(args[0], args[1])) {
                     s->gen_opc_buf[op_index] = INDEX_op_nop;
                 } else {
-                    s->gen_opc_buf[op_index] = op_to_mov(op);
-                    tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
+                    tcg_opt_gen_mov(s, op_index, gen_args, op,
+                                    args[0], args[1]);
                     gen_args += 2;
                 }
                 args += 3;
@@ -970,8 +994,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(sub):
         CASE_OP_32_64(xor):
             if (temps_are_copies(args[1], args[2])) {
-                s->gen_opc_buf[op_index] = op_to_movi(op);
-                tcg_opt_gen_movi(gen_args, args[0], 0);
+                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
                 gen_args += 2;
                 args += 3;
                 continue;
@@ -992,19 +1015,17 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                 break;
             }
             if (temps[args[1]].state != TCG_TEMP_CONST) {
-                tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
+                tcg_opt_gen_mov(s, op_index, gen_args, op, args[0], args[1]);
                 gen_args += 2;
                 args += 2;
                 break;
             }
             /* Source argument is constant.  Rewrite the operation and
                let movi case handle it. */
-            op = op_to_movi(op);
-            s->gen_opc_buf[op_index] = op;
             args[1] = temps[args[1]].val;
             /* fallthrough */
         CASE_OP_32_64(movi):
-            tcg_opt_gen_movi(gen_args, args[0], args[1]);
+            tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], args[1]);
             gen_args += 2;
             args += 2;
             break;
@@ -1018,9 +1039,8 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         case INDEX_op_ext32s_i64:
         case INDEX_op_ext32u_i64:
             if (temps[args[1]].state == TCG_TEMP_CONST) {
-                s->gen_opc_buf[op_index] = op_to_movi(op);
                 tmp = do_constant_folding(op, temps[args[1]].val, 0);
-                tcg_opt_gen_movi(gen_args, args[0], tmp);
+                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
                 gen_args += 2;
                 args += 2;
                 break;
@@ -1029,9 +1049,8 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
 
         case INDEX_op_trunc_shr_i32:
             if (temps[args[1]].state == TCG_TEMP_CONST) {
-                s->gen_opc_buf[op_index] = op_to_movi(op);
                 tmp = do_constant_folding(op, temps[args[1]].val, args[2]);
-                tcg_opt_gen_movi(gen_args, args[0], tmp);
+                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
                 gen_args += 2;
                 args += 3;
                 break;
@@ -1062,10 +1081,9 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(remu):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[2]].state == TCG_TEMP_CONST) {
-                s->gen_opc_buf[op_index] = op_to_movi(op);
                 tmp = do_constant_folding(op, temps[args[1]].val,
                                           temps[args[2]].val);
-                tcg_opt_gen_movi(gen_args, args[0], tmp);
+                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
                 gen_args += 2;
                 args += 3;
                 break;
@@ -1075,10 +1093,9 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(deposit):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[2]].state == TCG_TEMP_CONST) {
-                s->gen_opc_buf[op_index] = op_to_movi(op);
                 tmp = deposit64(temps[args[1]].val, args[3], args[4],
                                 temps[args[2]].val);
-                tcg_opt_gen_movi(gen_args, args[0], tmp);
+                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
                 gen_args += 2;
                 args += 5;
                 break;
@@ -1088,8 +1105,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(setcond):
             tmp = do_constant_folding_cond(op, args[1], args[2], args[3]);
             if (tmp != 2) {
-                s->gen_opc_buf[op_index] = op_to_movi(op);
-                tcg_opt_gen_movi(gen_args, args[0], tmp);
+                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
                 gen_args += 2;
                 args += 4;
                 break;
@@ -1118,12 +1134,12 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                 if (temps_are_copies(args[0], args[4-tmp])) {
                     s->gen_opc_buf[op_index] = INDEX_op_nop;
                 } else if (temps[args[4-tmp]].state == TCG_TEMP_CONST) {
-                    s->gen_opc_buf[op_index] = op_to_movi(op);
-                    tcg_opt_gen_movi(gen_args, args[0], temps[args[4-tmp]].val);
+                    tcg_opt_gen_movi(s, op_index, gen_args, op,
+                                     args[0], temps[args[4-tmp]].val);
                     gen_args += 2;
                 } else {
-                    s->gen_opc_buf[op_index] = op_to_mov(op);
-                    tcg_opt_gen_mov(s, gen_args, args[0], args[4-tmp]);
+                    tcg_opt_gen_mov(s, op_index, gen_args, op,
+                                    args[0], args[4-tmp]);
                     gen_args += 2;
                 }
                 args += 6;
@@ -1156,10 +1172,10 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
 
                 rl = args[0];
                 rh = args[1];
-                s->gen_opc_buf[op_index] = INDEX_op_movi_i32;
-                s->gen_opc_buf[++op_index] = INDEX_op_movi_i32;
-                tcg_opt_gen_movi(&gen_args[0], rl, (uint32_t)a);
-                tcg_opt_gen_movi(&gen_args[2], rh, (uint32_t)(a >> 32));
+                tcg_opt_gen_movi(s, op_index, &gen_args[0],
+                                 op, rl, (uint32_t)a);
+                tcg_opt_gen_movi(s, ++op_index, &gen_args[2],
+                                 op, rh, (uint32_t)(a >> 32));
                 gen_args += 4;
                 args += 6;
                 break;
@@ -1179,10 +1195,10 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
 
                 rl = args[0];
                 rh = args[1];
-                s->gen_opc_buf[op_index] = INDEX_op_movi_i32;
-                s->gen_opc_buf[++op_index] = INDEX_op_movi_i32;
-                tcg_opt_gen_movi(&gen_args[0], rl, (uint32_t)r);
-                tcg_opt_gen_movi(&gen_args[2], rh, (uint32_t)(r >> 32));
+                tcg_opt_gen_movi(s, op_index, &gen_args[0],
+                                 op, rl, (uint32_t)r);
+                tcg_opt_gen_movi(s, ++op_index, &gen_args[2],
+                                 op, rh, (uint32_t)(r >> 32));
                 gen_args += 4;
                 args += 4;
                 break;
@@ -1193,11 +1209,13 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             tmp = do_constant_folding_cond2(&args[0], &args[2], args[4]);
             if (tmp != 2) {
                 if (tmp) {
+            do_brcond_true:
                     reset_all_temps(nb_temps);
                     s->gen_opc_buf[op_index] = INDEX_op_br;
                     gen_args[0] = args[5];
                     gen_args += 1;
                 } else {
+            do_brcond_false:
                     s->gen_opc_buf[op_index] = INDEX_op_nop;
                 }
             } else if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
@@ -1207,6 +1225,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                        && temps[args[3]].val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
+            do_brcond_high:
                 reset_all_temps(nb_temps);
                 s->gen_opc_buf[op_index] = INDEX_op_brcond_i32;
                 gen_args[0] = args[1];
@@ -1214,6 +1233,49 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                 gen_args[2] = args[4];
                 gen_args[3] = args[5];
                 gen_args += 4;
+            } else if (args[4] == TCG_COND_EQ) {
+                /* Simplify EQ comparisons where one of the pairs
+                   can be simplified.  */
+                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                               args[0], args[2], TCG_COND_EQ);
+                if (tmp == 0) {
+                    goto do_brcond_false;
+                } else if (tmp == 1) {
+                    goto do_brcond_high;
+                }
+                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                               args[1], args[3], TCG_COND_EQ);
+                if (tmp == 0) {
+                    goto do_brcond_false;
+                } else if (tmp != 1) {
+                    goto do_default;
+                }
+            do_brcond_low:
+                reset_all_temps(nb_temps);
+                s->gen_opc_buf[op_index] = INDEX_op_brcond_i32;
+                gen_args[0] = args[0];
+                gen_args[1] = args[2];
+                gen_args[2] = args[4];
+                gen_args[3] = args[5];
+                gen_args += 4;
+            } else if (args[4] == TCG_COND_NE) {
+                /* Simplify NE comparisons where one of the pairs
+                   can be simplified.  */
+                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                               args[0], args[2], TCG_COND_NE);
+                if (tmp == 0) {
+                    goto do_brcond_high;
+                } else if (tmp == 1) {
+                    goto do_brcond_true;
+                }
+                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                               args[1], args[3], TCG_COND_NE);
+                if (tmp == 0) {
+                    goto do_brcond_low;
+                } else if (tmp == 1) {
+                    goto do_brcond_true;
+                }
+                goto do_default;
             } else {
                 goto do_default;
             }
@@ -1223,8 +1285,8 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         case INDEX_op_setcond2_i32:
             tmp = do_constant_folding_cond2(&args[1], &args[3], args[5]);
             if (tmp != 2) {
-                s->gen_opc_buf[op_index] = INDEX_op_movi_i32;
-                tcg_opt_gen_movi(gen_args, args[0], tmp);
+            do_setcond_const:
+                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
                 gen_args += 2;
             } else if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
                        && temps[args[3]].state == TCG_TEMP_CONST
@@ -1233,13 +1295,59 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                        && temps[args[4]].val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
+            do_setcond_high:
                 s->gen_opc_buf[op_index] = INDEX_op_setcond_i32;
                 reset_temp(args[0]);
+                temps[args[0]].mask = 1;
                 gen_args[0] = args[0];
                 gen_args[1] = args[2];
                 gen_args[2] = args[4];
                 gen_args[3] = args[5];
                 gen_args += 4;
+            } else if (args[5] == TCG_COND_EQ) {
+                /* Simplify EQ comparisons where one of the pairs
+                   can be simplified.  */
+                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                               args[1], args[3], TCG_COND_EQ);
+                if (tmp == 0) {
+                    goto do_setcond_const;
+                } else if (tmp == 1) {
+                    goto do_setcond_high;
+                }
+                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                               args[2], args[4], TCG_COND_EQ);
+                if (tmp == 0) {
+                    goto do_setcond_high;
+                } else if (tmp != 1) {
+                    goto do_default;
+                }
+            do_setcond_low:
+                reset_temp(args[0]);
+                temps[args[0]].mask = 1;
+                s->gen_opc_buf[op_index] = INDEX_op_setcond_i32;
+                gen_args[0] = args[0];
+                gen_args[1] = args[1];
+                gen_args[2] = args[3];
+                gen_args[3] = args[5];
+                gen_args += 4;
+            } else if (args[5] == TCG_COND_NE) {
+                /* Simplify NE comparisons where one of the pairs
+                   can be simplified.  */
+                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                               args[1], args[3], TCG_COND_NE);
+                if (tmp == 0) {
+                    goto do_setcond_high;
+                } else if (tmp == 1) {
+                    goto do_setcond_const;
+                }
+                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                               args[2], args[4], TCG_COND_NE);
+                if (tmp == 0) {
+                    goto do_setcond_low;
+                } else if (tmp == 1) {
+                    goto do_setcond_const;
+                }
+                goto do_default;
             } else {
                 goto do_default;
             }
diff --git a/tcg/s390/tcg-target.c b/tcg/s390/tcg-target.c
index 07164e544d..63e9c82cb3 100644
--- a/tcg/s390/tcg-target.c
+++ b/tcg/s390/tcg-target.c
@@ -2344,8 +2344,7 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 }
 
 typedef struct {
-    DebugFrameCIE cie;
-    DebugFrameFDEHeader fde;
+    DebugFrameHeader h;
     uint8_t fde_def_cfa[4];
     uint8_t fde_reg_ofs[18];
 } DebugFrame;
@@ -2355,16 +2354,16 @@ QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
 
 #define ELF_HOST_MACHINE  EM_S390
 
-static DebugFrame debug_frame = {
-    .cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
-    .cie.id = -1,
-    .cie.version = 1,
-    .cie.code_align = 1,
-    .cie.data_align = 8,                /* sleb128 8 */
-    .cie.return_column = TCG_REG_R14,
+static const DebugFrame debug_frame = {
+    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
+    .h.cie.id = -1,
+    .h.cie.version = 1,
+    .h.cie.code_align = 1,
+    .h.cie.data_align = 8,                /* sleb128 8 */
+    .h.cie.return_column = TCG_REG_R14,
 
     /* Total FDE size does not include the "len" member.  */
-    .fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, fde.cie_offset),
+    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
 
     .fde_def_cfa = {
         12, TCG_REG_CALL_STACK,         /* DW_CFA_def_cfa %r15, ... */
@@ -2386,8 +2385,5 @@ static DebugFrame debug_frame = {
 
 void tcg_register_jit(void *buf, size_t buf_size)
 {
-    debug_frame.fde.func_start = (uintptr_t)buf;
-    debug_frame.fde.func_len = buf_size;
-
     tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
 }
diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index 17ff5773ad..40f2ec1027 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -1499,23 +1499,22 @@ static void tcg_target_init(TCGContext *s)
 #endif
 
 typedef struct {
-    DebugFrameCIE cie;
-    DebugFrameFDEHeader fde;
+    DebugFrameHeader h;
     uint8_t fde_def_cfa[SPARC64 ? 4 : 2];
     uint8_t fde_win_save;
     uint8_t fde_ret_save[3];
 } DebugFrame;
 
-static DebugFrame debug_frame = {
-    .cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
-    .cie.id = -1,
-    .cie.version = 1,
-    .cie.code_align = 1,
-    .cie.data_align = -sizeof(void *) & 0x7f,
-    .cie.return_column = 15,            /* o7 */
+static const DebugFrame debug_frame = {
+    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
+    .h.cie.id = -1,
+    .h.cie.version = 1,
+    .h.cie.code_align = 1,
+    .h.cie.data_align = -sizeof(void *) & 0x7f,
+    .h.cie.return_column = 15,            /* o7 */
 
     /* Total FDE size does not include the "len" member.  */
-    .fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, fde.cie_offset),
+    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
 
     .fde_def_cfa = {
 #if SPARC64
@@ -1531,9 +1530,6 @@ static DebugFrame debug_frame = {
 
 void tcg_register_jit(void *buf, size_t buf_size)
 {
-    debug_frame.fde.func_start = (uintptr_t)buf;
-    debug_frame.fde.func_len = buf_size;
-
     tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
 }
 
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index bdd0139482..719533ac39 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -22,6 +22,8 @@
  * THE SOFTWARE.
  */
 #include "tcg.h"
+#include "exec/helper-proto.h"
+#include "exec/helper-gen.h"
 
 int gen_new_label(void);
 
@@ -379,47 +381,6 @@ static inline void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg)
     tcg_gen_op2i_i32(INDEX_op_movi_i32, ret, arg);
 }
 
-/* A version of dh_sizemask from def-helper.h that doesn't rely on
-   preprocessor magic.  */
-static inline int tcg_gen_sizemask(int n, int is_64bit, int is_signed)
-{
-    return (is_64bit << n*2) | (is_signed << (n*2 + 1));
-}
-
-/* helper calls */
-static inline void tcg_gen_helperN(void *func, int flags, int sizemask,
-                                   TCGArg ret, int nargs, TCGArg *args)
-{
-    tcg_gen_callN(&tcg_ctx, func, flags, sizemask, ret, nargs, args);
-}
-
-/* Note: Both tcg_gen_helper32() and tcg_gen_helper64() are currently
-   reserved for helpers in tcg-runtime.c. These helpers all do not read
-   globals and do not have side effects, hence the call to tcg_gen_callN()
-   with TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_SIDE_EFFECTS. This may need
-   to be adjusted if these functions start to be used with other helpers. */
-static inline void tcg_gen_helper32(void *func, int sizemask, TCGv_i32 ret,
-                                    TCGv_i32 a, TCGv_i32 b)
-{
-    TCGArg args[2];
-    args[0] = GET_TCGV_I32(a);
-    args[1] = GET_TCGV_I32(b);
-    tcg_gen_callN(&tcg_ctx, func,
-                  TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_SIDE_EFFECTS,
-                  sizemask, GET_TCGV_I32(ret), 2, args);
-}
-
-static inline void tcg_gen_helper64(void *func, int sizemask, TCGv_i64 ret,
-                                    TCGv_i64 a, TCGv_i64 b)
-{
-    TCGArg args[2];
-    args[0] = GET_TCGV_I64(a);
-    args[1] = GET_TCGV_I64(b);
-    tcg_gen_callN(&tcg_ctx, func,
-                  TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_SIDE_EFFECTS,
-                  sizemask, GET_TCGV_I64(ret), 2, args);
-}
-
 /* 32 bit ops */
 
 static inline void tcg_gen_ld8u_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset)
@@ -707,12 +668,7 @@ static inline void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
         tcg_gen_op5_i32(INDEX_op_div2_i32, ret, t0, arg1, t0, arg2);
         tcg_temp_free_i32(t0);
     } else {
-        int sizemask = 0;
-        /* Return value and both arguments are 32-bit and signed.  */
-        sizemask |= tcg_gen_sizemask(0, 0, 1);
-        sizemask |= tcg_gen_sizemask(1, 0, 1);
-        sizemask |= tcg_gen_sizemask(2, 0, 1);
-        tcg_gen_helper32(tcg_helper_div_i32, sizemask, ret, arg1, arg2);
+        gen_helper_div_i32(ret, arg1, arg2);
     }
 }
 
@@ -732,12 +688,7 @@ static inline void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
         tcg_gen_op5_i32(INDEX_op_div2_i32, t0, ret, arg1, t0, arg2);
         tcg_temp_free_i32(t0);
     } else {
-        int sizemask = 0;
-        /* Return value and both arguments are 32-bit and signed.  */
-        sizemask |= tcg_gen_sizemask(0, 0, 1);
-        sizemask |= tcg_gen_sizemask(1, 0, 1);
-        sizemask |= tcg_gen_sizemask(2, 0, 1);
-        tcg_gen_helper32(tcg_helper_rem_i32, sizemask, ret, arg1, arg2);
+        gen_helper_rem_i32(ret, arg1, arg2);
     }
 }
 
@@ -751,12 +702,7 @@ static inline void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
         tcg_gen_op5_i32(INDEX_op_divu2_i32, ret, t0, arg1, t0, arg2);
         tcg_temp_free_i32(t0);
     } else {
-        int sizemask = 0;
-        /* Return value and both arguments are 32-bit and unsigned.  */
-        sizemask |= tcg_gen_sizemask(0, 0, 0);
-        sizemask |= tcg_gen_sizemask(1, 0, 0);
-        sizemask |= tcg_gen_sizemask(2, 0, 0);
-        tcg_gen_helper32(tcg_helper_divu_i32, sizemask, ret, arg1, arg2);
+        gen_helper_divu_i32(ret, arg1, arg2);
     }
 }
 
@@ -776,12 +722,7 @@ static inline void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
         tcg_gen_op5_i32(INDEX_op_divu2_i32, t0, ret, arg1, t0, arg2);
         tcg_temp_free_i32(t0);
     } else {
-        int sizemask = 0;
-        /* Return value and both arguments are 32-bit and unsigned.  */
-        sizemask |= tcg_gen_sizemask(0, 0, 0);
-        sizemask |= tcg_gen_sizemask(1, 0, 0);
-        sizemask |= tcg_gen_sizemask(2, 0, 0);
-        tcg_gen_helper32(tcg_helper_remu_i32, sizemask, ret, arg1, arg2);
+        gen_helper_remu_i32(ret, arg1, arg2);
     }
 }
 
@@ -945,13 +886,7 @@ static inline void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
    specific code (x86) */
 static inline void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    int sizemask = 0;
-    /* Return value and both arguments are 64-bit and signed.  */
-    sizemask |= tcg_gen_sizemask(0, 1, 1);
-    sizemask |= tcg_gen_sizemask(1, 1, 1);
-    sizemask |= tcg_gen_sizemask(2, 1, 1);
-
-    tcg_gen_helper64(tcg_helper_shl_i64, sizemask, ret, arg1, arg2);
+    gen_helper_shl_i64(ret, arg1, arg2);
 }
 
 static inline void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
@@ -961,13 +896,7 @@ static inline void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
 
 static inline void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    int sizemask = 0;
-    /* Return value and both arguments are 64-bit and signed.  */
-    sizemask |= tcg_gen_sizemask(0, 1, 1);
-    sizemask |= tcg_gen_sizemask(1, 1, 1);
-    sizemask |= tcg_gen_sizemask(2, 1, 1);
-
-    tcg_gen_helper64(tcg_helper_shr_i64, sizemask, ret, arg1, arg2);
+    gen_helper_shr_i64(ret, arg1, arg2);
 }
 
 static inline void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
@@ -977,13 +906,7 @@ static inline void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
 
 static inline void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    int sizemask = 0;
-    /* Return value and both arguments are 64-bit and signed.  */
-    sizemask |= tcg_gen_sizemask(0, 1, 1);
-    sizemask |= tcg_gen_sizemask(1, 1, 1);
-    sizemask |= tcg_gen_sizemask(2, 1, 1);
-
-    tcg_gen_helper64(tcg_helper_sar_i64, sizemask, ret, arg1, arg2);
+    gen_helper_sar_i64(ret, arg1, arg2);
 }
 
 static inline void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
@@ -1051,46 +974,22 @@ static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 
 static inline void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    int sizemask = 0;
-    /* Return value and both arguments are 64-bit and signed.  */
-    sizemask |= tcg_gen_sizemask(0, 1, 1);
-    sizemask |= tcg_gen_sizemask(1, 1, 1);
-    sizemask |= tcg_gen_sizemask(2, 1, 1);
-
-    tcg_gen_helper64(tcg_helper_div_i64, sizemask, ret, arg1, arg2);
+    gen_helper_div_i64(ret, arg1, arg2);
 }
 
 static inline void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    int sizemask = 0;
-    /* Return value and both arguments are 64-bit and signed.  */
-    sizemask |= tcg_gen_sizemask(0, 1, 1);
-    sizemask |= tcg_gen_sizemask(1, 1, 1);
-    sizemask |= tcg_gen_sizemask(2, 1, 1);
-
-    tcg_gen_helper64(tcg_helper_rem_i64, sizemask, ret, arg1, arg2);
+    gen_helper_rem_i64(ret, arg1, arg2);
 }
 
 static inline void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    int sizemask = 0;
-    /* Return value and both arguments are 64-bit and unsigned.  */
-    sizemask |= tcg_gen_sizemask(0, 1, 0);
-    sizemask |= tcg_gen_sizemask(1, 1, 0);
-    sizemask |= tcg_gen_sizemask(2, 1, 0);
-
-    tcg_gen_helper64(tcg_helper_divu_i64, sizemask, ret, arg1, arg2);
+    gen_helper_divu_i64(ret, arg1, arg2);
 }
 
 static inline void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    int sizemask = 0;
-    /* Return value and both arguments are 64-bit and unsigned.  */
-    sizemask |= tcg_gen_sizemask(0, 1, 0);
-    sizemask |= tcg_gen_sizemask(1, 1, 0);
-    sizemask |= tcg_gen_sizemask(2, 1, 0);
-
-    tcg_gen_helper64(tcg_helper_remu_i64, sizemask, ret, arg1, arg2);
+    gen_helper_remu_i64(ret, arg1, arg2);
 }
 
 #else
@@ -1357,12 +1256,7 @@ static inline void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
         tcg_gen_op5_i64(INDEX_op_div2_i64, ret, t0, arg1, t0, arg2);
         tcg_temp_free_i64(t0);
     } else {
-        int sizemask = 0;
-        /* Return value and both arguments are 64-bit and signed.  */
-        sizemask |= tcg_gen_sizemask(0, 1, 1);
-        sizemask |= tcg_gen_sizemask(1, 1, 1);
-        sizemask |= tcg_gen_sizemask(2, 1, 1);
-        tcg_gen_helper64(tcg_helper_div_i64, sizemask, ret, arg1, arg2);
+        gen_helper_div_i64(ret, arg1, arg2);
     }
 }
 
@@ -1382,12 +1276,7 @@ static inline void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
         tcg_gen_op5_i64(INDEX_op_div2_i64, t0, ret, arg1, t0, arg2);
         tcg_temp_free_i64(t0);
     } else {
-        int sizemask = 0;
-        /* Return value and both arguments are 64-bit and signed.  */
-        sizemask |= tcg_gen_sizemask(0, 1, 1);
-        sizemask |= tcg_gen_sizemask(1, 1, 1);
-        sizemask |= tcg_gen_sizemask(2, 1, 1);
-        tcg_gen_helper64(tcg_helper_rem_i64, sizemask, ret, arg1, arg2);
+        gen_helper_rem_i64(ret, arg1, arg2);
     }
 }
 
@@ -1401,12 +1290,7 @@ static inline void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
         tcg_gen_op5_i64(INDEX_op_divu2_i64, ret, t0, arg1, t0, arg2);
         tcg_temp_free_i64(t0);
     } else {
-        int sizemask = 0;
-        /* Return value and both arguments are 64-bit and unsigned.  */
-        sizemask |= tcg_gen_sizemask(0, 1, 0);
-        sizemask |= tcg_gen_sizemask(1, 1, 0);
-        sizemask |= tcg_gen_sizemask(2, 1, 0);
-        tcg_gen_helper64(tcg_helper_divu_i64, sizemask, ret, arg1, arg2);
+        gen_helper_divu_i64(ret, arg1, arg2);
     }
 }
 
@@ -1426,12 +1310,7 @@ static inline void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
         tcg_gen_op5_i64(INDEX_op_divu2_i64, t0, ret, arg1, t0, arg2);
         tcg_temp_free_i64(t0);
     } else {
-        int sizemask = 0;
-        /* Return value and both arguments are 64-bit and unsigned.  */
-        sizemask |= tcg_gen_sizemask(0, 1, 0);
-        sizemask |= tcg_gen_sizemask(1, 1, 0);
-        sizemask |= tcg_gen_sizemask(2, 1, 0);
-        tcg_gen_helper64(tcg_helper_remu_i64, sizemask, ret, arg1, arg2);
+        gen_helper_remu_i64(ret, arg1, arg2);
     }
 }
 #endif /* TCG_TARGET_REG_BITS == 32 */
@@ -2530,13 +2409,8 @@ static inline void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh,
         tcg_temp_free_i64(t);
     } else {
         TCGv_i64 t0 = tcg_temp_new_i64();
-        int sizemask = 0;
-        /* Return value and both arguments are 64-bit and unsigned.  */
-        sizemask |= tcg_gen_sizemask(0, 1, 0);
-        sizemask |= tcg_gen_sizemask(1, 1, 0);
-        sizemask |= tcg_gen_sizemask(2, 1, 0);
         tcg_gen_mul_i64(t0, arg1, arg2);
-        tcg_gen_helper64(tcg_helper_muluh_i64, sizemask, rh, arg1, arg2);
+        gen_helper_muluh_i64(rh, arg1, arg2);
         tcg_gen_mov_i64(rl, t0);
         tcg_temp_free_i64(t0);
     }
@@ -2575,13 +2449,8 @@ static inline void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh,
         tcg_temp_free_i64(t3);
     } else {
         TCGv_i64 t0 = tcg_temp_new_i64();
-        int sizemask = 0;
-        /* Return value and both arguments are 64-bit and signed.  */
-        sizemask |= tcg_gen_sizemask(0, 1, 1);
-        sizemask |= tcg_gen_sizemask(1, 1, 1);
-        sizemask |= tcg_gen_sizemask(2, 1, 1);
         tcg_gen_mul_i64(t0, arg1, arg2);
-        tcg_gen_helper64(tcg_helper_mulsh_i64, sizemask, rh, arg1, arg2);
+        gen_helper_mulsh_i64(rh, arg1, arg2);
         tcg_gen_mov_i64(rl, t0);
         tcg_temp_free_i64(t0);
     }
diff --git a/tcg/tcg-runtime.h b/tcg/tcg-runtime.h
index a1ebef9f9c..23a0c37711 100644
--- a/tcg/tcg-runtime.h
+++ b/tcg/tcg-runtime.h
@@ -1,20 +1,16 @@
-#ifndef TCG_RUNTIME_H
-#define TCG_RUNTIME_H
+DEF_HELPER_FLAGS_2(div_i32, TCG_CALL_NO_RWG_SE, s32, s32, s32)
+DEF_HELPER_FLAGS_2(rem_i32, TCG_CALL_NO_RWG_SE, s32, s32, s32)
+DEF_HELPER_FLAGS_2(divu_i32, TCG_CALL_NO_RWG_SE, i32, i32, i32)
+DEF_HELPER_FLAGS_2(remu_i32, TCG_CALL_NO_RWG_SE, i32, i32, i32)
 
-/* tcg-runtime.c */
-int32_t tcg_helper_div_i32(int32_t arg1, int32_t arg2);
-int32_t tcg_helper_rem_i32(int32_t arg1, int32_t arg2);
-uint32_t tcg_helper_divu_i32(uint32_t arg1, uint32_t arg2);
-uint32_t tcg_helper_remu_i32(uint32_t arg1, uint32_t arg2);
+DEF_HELPER_FLAGS_2(div_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
+DEF_HELPER_FLAGS_2(rem_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
+DEF_HELPER_FLAGS_2(divu_i64, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(remu_i64, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 
-int64_t tcg_helper_shl_i64(int64_t arg1, int64_t arg2);
-int64_t tcg_helper_shr_i64(int64_t arg1, int64_t arg2);
-int64_t tcg_helper_sar_i64(int64_t arg1, int64_t arg2);
-int64_t tcg_helper_div_i64(int64_t arg1, int64_t arg2);
-int64_t tcg_helper_rem_i64(int64_t arg1, int64_t arg2);
-int64_t tcg_helper_mulsh_i64(int64_t arg1, int64_t arg2);
-uint64_t tcg_helper_divu_i64(uint64_t arg1, uint64_t arg2);
-uint64_t tcg_helper_remu_i64(uint64_t arg1, uint64_t arg2);
-uint64_t tcg_helper_muluh_i64(uint64_t arg1, uint64_t arg2);
+DEF_HELPER_FLAGS_2(shl_i64, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(shr_i64, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(sar_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
 
-#endif
+DEF_HELPER_FLAGS_2(mulsh_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
+DEF_HELPER_FLAGS_2(muluh_i64, TCG_CALL_NO_RWG_SE, i64, i64, i64)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index ea8aa70c16..2c5732da17 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -86,8 +86,14 @@ typedef struct QEMU_PACKED {
     uintptr_t func_len;
 } DebugFrameFDEHeader;
 
+typedef struct QEMU_PACKED {
+    DebugFrameCIE cie;
+    DebugFrameFDEHeader fde;
+} DebugFrameHeader;
+
 static void tcg_register_jit_int(void *buf, size_t size,
-                                 void *debug_frame, size_t debug_frame_size)
+                                 const void *debug_frame,
+                                 size_t debug_frame_size)
     __attribute__((unused));
 
 /* Forward declarations for functions declared and used in tcg-target.c. */
@@ -307,32 +313,17 @@ void tcg_pool_reset(TCGContext *s)
     s->pool_current = NULL;
 }
 
-#include "helper.h"
-
 typedef struct TCGHelperInfo {
     void *func;
     const char *name;
+    unsigned flags;
+    unsigned sizemask;
 } TCGHelperInfo;
 
+#include "exec/helper-proto.h"
+
 static const TCGHelperInfo all_helpers[] = {
-#define GEN_HELPER 2
-#include "helper.h"
-
-    /* Include tcg-runtime.c functions.  */
-    { tcg_helper_div_i32, "div_i32" },
-    { tcg_helper_rem_i32, "rem_i32" },
-    { tcg_helper_divu_i32, "divu_i32" },
-    { tcg_helper_remu_i32, "remu_i32" },
-
-    { tcg_helper_shl_i64, "shl_i64" },
-    { tcg_helper_shr_i64, "shr_i64" },
-    { tcg_helper_sar_i64, "sar_i64" },
-    { tcg_helper_div_i64, "div_i64" },
-    { tcg_helper_rem_i64, "rem_i64" },
-    { tcg_helper_divu_i64, "divu_i64" },
-    { tcg_helper_remu_i64, "remu_i64" },
-    { tcg_helper_mulsh_i64, "mulsh_i64" },
-    { tcg_helper_muluh_i64, "muluh_i64" },
+#include "exec/helper-tcg.h"
 };
 
 void tcg_context_init(TCGContext *s)
@@ -373,7 +364,7 @@ void tcg_context_init(TCGContext *s)
 
     for (i = 0; i < ARRAY_SIZE(all_helpers); ++i) {
         g_hash_table_insert(helper_table, (gpointer)all_helpers[i].func,
-                            (gpointer)all_helpers[i].name);
+                            (gpointer)&all_helpers[i]);
     }
 
     tcg_target_init(s);
@@ -706,13 +697,17 @@ int tcg_check_temp_count(void)
 /* Note: we convert the 64 bit args to 32 bit and do some alignment
    and endian swap. Maybe it would be better to do the alignment
    and endian swap in tcg_reg_alloc_call(). */
-void tcg_gen_callN(TCGContext *s, void *func, unsigned int flags,
-                   int sizemask, TCGArg ret, int nargs, TCGArg *args)
+void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret,
+                   int nargs, TCGArg *args)
 {
-    int i;
-    int real_args;
-    int nb_rets;
+    int i, real_args, nb_rets;
+    unsigned sizemask, flags;
     TCGArg *nparam;
+    TCGHelperInfo *info;
+
+    info = g_hash_table_lookup(s->helpers, (gpointer)func);
+    flags = info->flags;
+    sizemask = info->sizemask;
 
 #if defined(__sparc__) && !defined(__arch64__) \
     && !defined(CONFIG_TCG_INTERPRETER)
@@ -798,9 +793,8 @@ void tcg_gen_callN(TCGContext *s, void *func, unsigned int flags,
     }
     real_args = 0;
     for (i = 0; i < nargs; i++) {
-#if TCG_TARGET_REG_BITS < 64
         int is_64bit = sizemask & (1 << (i+1)*2);
-        if (is_64bit) {
+        if (TCG_TARGET_REG_BITS < 64 && is_64bit) {
 #ifdef TCG_TARGET_CALL_ALIGN_ARGS
             /* some targets want aligned 64 bit args */
             if (real_args & 1) {
@@ -828,7 +822,6 @@ void tcg_gen_callN(TCGContext *s, void *func, unsigned int flags,
             real_args += 2;
             continue;
         }
-#endif /* TCG_TARGET_REG_BITS < 64 */
 
         *s->gen_opparam_ptr++ = args[i];
         real_args++;
@@ -1166,7 +1159,10 @@ static inline const char *tcg_find_helper(TCGContext *s, uintptr_t val)
 {
     const char *ret = NULL;
     if (s->helpers) {
-        ret = g_hash_table_lookup(s->helpers, (gpointer)val);
+        TCGHelperInfo *info = g_hash_table_lookup(s->helpers, (gpointer)val);
+        if (info) {
+            ret = info->name;
+        }
     }
     return ret;
 }
@@ -2787,7 +2783,8 @@ static int find_string(const char *strtab, const char *str)
 }
 
 static void tcg_register_jit_int(void *buf_ptr, size_t buf_size,
-                                 void *debug_frame, size_t debug_frame_size)
+                                 const void *debug_frame,
+                                 size_t debug_frame_size)
 {
     struct __attribute__((packed)) DebugInfo {
         uint32_t  len;
@@ -2925,10 +2922,10 @@ static void tcg_register_jit_int(void *buf_ptr, size_t buf_size,
 
     uintptr_t buf = (uintptr_t)buf_ptr;
     size_t img_size = sizeof(struct ElfImage) + debug_frame_size;
+    DebugFrameHeader *dfh;
 
     img = g_malloc(img_size);
     *img = img_template;
-    memcpy(img + 1, debug_frame, debug_frame_size);
 
     img->phdr.p_vaddr = buf;
     img->phdr.p_paddr = buf;
@@ -2956,6 +2953,11 @@ static void tcg_register_jit_int(void *buf_ptr, size_t buf_size,
     img->di.fn_low_pc = buf;
     img->di.fn_high_pc = buf + buf_size;
 
+    dfh = (DebugFrameHeader *)(img + 1);
+    memcpy(dfh, debug_frame, debug_frame_size);
+    dfh->fde.func_start = buf;
+    dfh->fde.func_len = buf_size;
+
 #ifdef DEBUG_JIT
     /* Enable this block to be able to debug the ELF image file creation.
        One can use readelf, objdump, or other inspection utilities.  */
@@ -2983,7 +2985,8 @@ static void tcg_register_jit_int(void *buf_ptr, size_t buf_size,
    and implement the internal function we declared earlier.  */
 
 static void tcg_register_jit_int(void *buf, size_t size,
-                                 void *debug_frame, size_t debug_frame_size)
+                                 const void *debug_frame,
+                                 size_t debug_frame_size)
 {
 }
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index fbc93101cf..2efa333166 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -54,8 +54,6 @@ typedef uint64_t tcg_target_ulong;
 #error unsupported
 #endif
 
-#include "tcg-runtime.h"
-
 #if TCG_TARGET_NB_REGS <= 32
 typedef uint32_t TCGRegSet;
 #elif TCG_TARGET_NB_REGS <= 64
@@ -725,8 +723,8 @@ void tcg_add_target_add_op_defs(const TCGTargetOpDef *tdefs);
 #define tcg_temp_free_ptr(T) tcg_temp_free_i64(TCGV_PTR_TO_NAT(T))
 #endif
 
-void tcg_gen_callN(TCGContext *s, void *func, unsigned int flags,
-                   int sizemask, TCGArg ret, int nargs, TCGArg *args);
+void tcg_gen_callN(TCGContext *s, void *func,
+                   TCGArg ret, int nargs, TCGArg *args);
 
 void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1,
                         int c, int right, int arith);
diff --git a/tcg/tci/tcg-target.c b/tcg/tci/tcg-target.c
index 9b39231c15..375e590d2b 100644
--- a/tcg/tci/tcg-target.c
+++ b/tcg/tci/tcg-target.c
@@ -544,7 +544,10 @@ static void tcg_out_movi(TCGContext *s, TCGType type,
 
 static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *arg)
 {
+    uint8_t *old_code_ptr = s->code_ptr;
+    tcg_out_op_t(s, INDEX_op_call);
     tcg_out_ri(s, 1, (uintptr_t)arg);
+    old_code_ptr[1] = s->code_ptr - old_code_ptr;
 }
 
 static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,