10 files changed, 126 insertions, 130 deletions
diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index b3be6f3177..fe44ad709c 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -1004,7 +1004,7 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     tcg_out_mov(s, TARGET_LONG_BITS == 64, TCG_REG_X1, lb->addrlo_reg);
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, oi);
     tcg_out_adr(s, TCG_REG_X3, lb->raddr);
-    tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
+    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
     if (opc & MO_SIGN) {
         tcg_out_sxt(s, lb->type, size, lb->datalo_reg, TCG_REG_X0);
     } else {
@@ -1027,7 +1027,7 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     tcg_out_mov(s, size == MO_64, TCG_REG_X2, lb->datalo_reg);
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, oi);
     tcg_out_adr(s, TCG_REG_X4, lb->raddr);
-    tcg_out_call(s, qemu_st_helpers[opc]);
+    tcg_out_call(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
     tcg_out_goto(s, lb->raddr);
 }
 
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index 06a8064a9f..ae2ec7a922 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1260,9 +1260,9 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
        icache usage.  For pre-armv6, use the signed helpers since we do
        not have a single insn sign-extend.  */
     if (use_armv6_instructions) {
-        func = qemu_ld_helpers[opc & ~MO_SIGN];
+        func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)];
     } else {
-        func = qemu_ld_helpers[opc];
+        func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)];
         if (opc & MO_SIGN) {
             opc = MO_UL;
         }
@@ -1337,7 +1337,7 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
 
     /* Tail-call to the helper, which will return to the fast path.  */
-    tcg_out_goto(s, COND_AL, qemu_st_helpers[opc]);
+    tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 }
 #endif /* SOFTMMU */
 
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 2e4bf52aae..ff4d9cfec7 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -1307,7 +1307,7 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
                      (uintptr_t)l->raddr);
     }
 
-    tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
+    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
     data_reg = l->datalo_reg;
     switch (opc & MO_SSIZE) {
@@ -1413,7 +1413,7 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 
     /* "Tail call" to the helper, with the return address back inline.  */
     tcg_out_push(s, retaddr);
-    tcg_out_jmp(s, qemu_st_helpers[opc]);
+    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 }
 #elif defined(__x86_64__) && defined(__linux__)
 # include <asm/prctl.h>
diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index f64c89c3c0..f643eca3df 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -1031,7 +1031,7 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
     }
     i = tcg_out_call_iarg_imm(s, i, oi);
     i = tcg_out_call_iarg_imm(s, i, (intptr_t)l->raddr);
-    tcg_out_call_int(s, qemu_ld_helpers[opc], false);
+    tcg_out_call_int(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)], false);
     /* delay slot */
     tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 
@@ -1094,7 +1094,7 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
        computation to take place in the return address register.  */
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (intptr_t)l->raddr);
     i = tcg_out_call_iarg_reg(s, i, TCG_REG_RA);
-    tcg_out_call_int(s, qemu_st_helpers[opc], true);
+    tcg_out_call_int(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)], true);
     /* delay slot */
     tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 }
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 585f1ed7bb..0f6f7008da 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -193,10 +193,42 @@ static bool temps_are_copies(TCGArg arg1, TCGArg arg2)
     return false;
 }
 
+static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg *args,
+                             TCGArg dst, TCGArg val)
+{
+    TCGOpcode new_op = op_to_movi(op->opc);
+    tcg_target_ulong mask;
+
+    op->opc = new_op;
+
+    reset_temp(dst);
+    temps[dst].state = TCG_TEMP_CONST;
+    temps[dst].val = val;
+    mask = val;
+    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
+        /* High bits of the destination are now garbage.  */
+        mask |= ~0xffffffffull;
+    }
+    temps[dst].mask = mask;
+
+    args[0] = dst;
+    args[1] = val;
+}
+
 static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg *args,
-                            TCGOpcode old_op, TCGArg dst, TCGArg src)
+                            TCGArg dst, TCGArg src)
 {
-    TCGOpcode new_op = op_to_mov(old_op);
+    if (temps_are_copies(dst, src)) {
+        tcg_op_remove(s, op);
+        return;
+    }
+
+    if (temps[src].state == TCG_TEMP_CONST) {
+        tcg_opt_gen_movi(s, op, args, dst, temps[src].val);
+        return;
+    }
+
+    TCGOpcode new_op = op_to_mov(op->opc);
     tcg_target_ulong mask;
 
     op->opc = new_op;
@@ -228,28 +260,6 @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg *args,
     args[1] = src;
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg *args,
-                             TCGOpcode old_op, TCGArg dst, TCGArg val)
-{
-    TCGOpcode new_op = op_to_movi(old_op);
-    tcg_target_ulong mask;
-
-    op->opc = new_op;
-
-    reset_temp(dst);
-    temps[dst].state = TCG_TEMP_CONST;
-    temps[dst].val = val;
-    mask = val;
-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
-        /* High bits of the destination are now garbage.  */
-        mask |= ~0xffffffffull;
-    }
-    temps[dst].mask = mask;
-
-    args[0] = dst;
-    args[1] = val;
-}
-
 static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
 {
     uint64_t l64, h64;
@@ -564,7 +574,7 @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
 }
 
 /* Propagate constants and copies, fold constant expressions. */
-static void tcg_constant_folding(TCGContext *s)
+void tcg_optimize(TCGContext *s)
 {
     int oi, oi_next, nb_temps, nb_globals;
 
@@ -670,7 +680,7 @@ static void tcg_constant_folding(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[1]].val == 0) {
-                tcg_opt_gen_movi(s, op, args, opc, args[0], 0);
+                tcg_opt_gen_movi(s, op, args, args[0], 0);
                 continue;
             }
             break;
@@ -775,7 +785,8 @@ static void tcg_constant_folding(TCGContext *s)
             if (temps[args[1]].state != TCG_TEMP_CONST
                 && temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0) {
-                goto do_mov3;
+                tcg_opt_gen_mov(s, op, args, args[0], args[1]);
+                continue;
             }
             break;
         CASE_OP_32_64(and):
@@ -784,16 +795,10 @@ static void tcg_constant_folding(TCGContext *s)
             if (temps[args[1]].state != TCG_TEMP_CONST
                 && temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == -1) {
-                goto do_mov3;
+                tcg_opt_gen_mov(s, op, args, args[0], args[1]);
+                continue;
             }
             break;
-        do_mov3:
-            if (temps_are_copies(args[0], args[1])) {
-                tcg_op_remove(s, op);
-            } else {
-                tcg_opt_gen_mov(s, op, args, opc, args[0], args[1]);
-            }
-            continue;
         default:
             break;
         }
@@ -942,19 +947,12 @@ static void tcg_constant_folding(TCGContext *s)
 
         if (partmask == 0) {
             assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, op, args, opc, args[0], 0);
+            tcg_opt_gen_movi(s, op, args, args[0], 0);
             continue;
         }
         if (affected == 0) {
             assert(nb_oargs == 1);
-            if (temps_are_copies(args[0], args[1])) {
-                tcg_op_remove(s, op);
-            } else if (temps[args[1]].state != TCG_TEMP_CONST) {
-                tcg_opt_gen_mov(s, op, args, opc, args[0], args[1]);
-            } else {
-                tcg_opt_gen_movi(s, op, args, opc,
-                                 args[0], temps[args[1]].val);
-            }
+            tcg_opt_gen_mov(s, op, args, args[0], args[1]);
             continue;
         }
 
@@ -966,7 +964,7 @@ static void tcg_constant_folding(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if ((temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0)) {
-                tcg_opt_gen_movi(s, op, args, opc, args[0], 0);
+                tcg_opt_gen_movi(s, op, args, args[0], 0);
                 continue;
             }
             break;
@@ -979,14 +977,7 @@ static void tcg_constant_folding(TCGContext *s)
         CASE_OP_32_64(or):
         CASE_OP_32_64(and):
             if (temps_are_copies(args[1], args[2])) {
-                if (temps_are_copies(args[0], args[1])) {
-                    tcg_op_remove(s, op);
-                } else if (temps[args[1]].state != TCG_TEMP_CONST) {
-                    tcg_opt_gen_mov(s, op, args, opc, args[0], args[1]);
-                } else {
-                    tcg_opt_gen_movi(s, op, args, opc,
-                                     args[0], temps[args[1]].val);
-                }
+                tcg_opt_gen_mov(s, op, args, args[0], args[1]);
                 continue;
             }
             break;
@@ -1000,7 +991,7 @@ static void tcg_constant_folding(TCGContext *s)
         CASE_OP_32_64(sub):
         CASE_OP_32_64(xor):
             if (temps_are_copies(args[1], args[2])) {
-                tcg_opt_gen_movi(s, op, args, opc, args[0], 0);
+                tcg_opt_gen_movi(s, op, args, args[0], 0);
                 continue;
             }
             break;
@@ -1013,20 +1004,10 @@ static void tcg_constant_folding(TCGContext *s)
            allocator where needed and possible.  Also detect copies. */
         switch (opc) {
         CASE_OP_32_64(mov):
-            if (temps_are_copies(args[0], args[1])) {
-                tcg_op_remove(s, op);
-                break;
-            }
-            if (temps[args[1]].state != TCG_TEMP_CONST) {
-                tcg_opt_gen_mov(s, op, args, opc, args[0], args[1]);
-                break;
-            }
-            /* Source argument is constant.  Rewrite the operation and
-               let movi case handle it. */
-            args[1] = temps[args[1]].val;
-            /* fallthrough */
+            tcg_opt_gen_mov(s, op, args, args[0], args[1]);
+            break;
         CASE_OP_32_64(movi):
-            tcg_opt_gen_movi(s, op, args, opc, args[0], args[1]);
+            tcg_opt_gen_movi(s, op, args, args[0], args[1]);
             break;
 
         CASE_OP_32_64(not):
@@ -1039,7 +1020,7 @@ static void tcg_constant_folding(TCGContext *s)
         case INDEX_op_ext32u_i64:
             if (temps[args[1]].state == TCG_TEMP_CONST) {
                 tmp = do_constant_folding(opc, temps[args[1]].val, 0);
-                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
+                tcg_opt_gen_movi(s, op, args, args[0], tmp);
                 break;
             }
             goto do_default;
@@ -1047,7 +1028,7 @@ static void tcg_constant_folding(TCGContext *s)
         case INDEX_op_trunc_shr_i32:
             if (temps[args[1]].state == TCG_TEMP_CONST) {
                 tmp = do_constant_folding(opc, temps[args[1]].val, args[2]);
-                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
+                tcg_opt_gen_movi(s, op, args, args[0], tmp);
                 break;
             }
             goto do_default;
@@ -1078,7 +1059,7 @@ static void tcg_constant_folding(TCGContext *s)
                 && temps[args[2]].state == TCG_TEMP_CONST) {
                 tmp = do_constant_folding(opc, temps[args[1]].val,
                                           temps[args[2]].val);
-                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
+                tcg_opt_gen_movi(s, op, args, args[0], tmp);
                 break;
             }
             goto do_default;
@@ -1088,7 +1069,7 @@ static void tcg_constant_folding(TCGContext *s)
                 && temps[args[2]].state == TCG_TEMP_CONST) {
                 tmp = deposit64(temps[args[1]].val, args[3], args[4],
                                 temps[args[2]].val);
-                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
+                tcg_opt_gen_movi(s, op, args, args[0], tmp);
                 break;
             }
             goto do_default;
@@ -1096,7 +1077,7 @@ static void tcg_constant_folding(TCGContext *s)
         CASE_OP_32_64(setcond):
             tmp = do_constant_folding_cond(opc, args[1], args[2], args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
+                tcg_opt_gen_movi(s, op, args, args[0], tmp);
                 break;
             }
             goto do_default;
@@ -1118,14 +1099,7 @@ static void tcg_constant_folding(TCGContext *s)
         CASE_OP_32_64(movcond):
             tmp = do_constant_folding_cond(opc, args[1], args[2], args[5]);
             if (tmp != 2) {
-                if (temps_are_copies(args[0], args[4-tmp])) {
-                    tcg_op_remove(s, op);
-                } else if (temps[args[4-tmp]].state == TCG_TEMP_CONST) {
-                    tcg_opt_gen_movi(s, op, args, opc,
-                                     args[0], temps[args[4-tmp]].val);
-                } else {
-                    tcg_opt_gen_mov(s, op, args, opc, args[0], args[4-tmp]);
-                }
+                tcg_opt_gen_mov(s, op, args, args[0], args[4-tmp]);
                 break;
             }
             goto do_default;
@@ -1154,8 +1128,8 @@ static void tcg_constant_folding(TCGContext *s)
 
                 rl = args[0];
                 rh = args[1];
-                tcg_opt_gen_movi(s, op, args, opc, rl, (uint32_t)a);
-                tcg_opt_gen_movi(s, op2, args2, opc, rh, (uint32_t)(a >> 32));
+                tcg_opt_gen_movi(s, op, args, rl, (uint32_t)a);
+                tcg_opt_gen_movi(s, op2, args2, rh, (uint32_t)(a >> 32));
 
                 /* We've done all we need to do with the movi.  Skip it.  */
                 oi_next = op2->next;
@@ -1175,8 +1149,8 @@ static void tcg_constant_folding(TCGContext *s)
 
                 rl = args[0];
                 rh = args[1];
-                tcg_opt_gen_movi(s, op, args, opc, rl, (uint32_t)r);
-                tcg_opt_gen_movi(s, op2, args2, opc, rh, (uint32_t)(r >> 32));
+                tcg_opt_gen_movi(s, op, args, rl, (uint32_t)r);
+                tcg_opt_gen_movi(s, op2, args2, rh, (uint32_t)(r >> 32));
 
                 /* We've done all we need to do with the movi.  Skip it.  */
                 oi_next = op2->next;
@@ -1260,7 +1234,7 @@ static void tcg_constant_folding(TCGContext *s)
             tmp = do_constant_folding_cond2(&args[1], &args[3], args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
+                tcg_opt_gen_movi(s, op, args, args[0], tmp);
             } else if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
                        && temps[args[3]].state == TCG_TEMP_CONST
                        && temps[args[4]].state == TCG_TEMP_CONST
@@ -1354,8 +1328,3 @@ static void tcg_constant_folding(TCGContext *s)
         }
     }
 }
-
-void tcg_optimize(TCGContext *s)
-{
-    tcg_constant_folding(s);
-}
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index d49c7d925f..2b6eafa03c 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -1495,7 +1495,7 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     tcg_out_movi(s, TCG_TYPE_I32, arg++, oi);
     tcg_out32(s, MFSPR | RT(arg) | LR);
 
-    tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
+    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
     lo = lb->datalo_reg;
     hi = lb->datahi_reg;
@@ -1565,7 +1565,7 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     tcg_out_movi(s, TCG_TYPE_I32, arg++, oi);
     tcg_out32(s, MFSPR | RT(arg) | LR);
 
-    tcg_out_call(s, qemu_st_helpers[opc]);
+    tcg_out_call(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
     tcg_out_b(s, 0, lb->raddr);
 }
@@ -1624,7 +1624,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
             tcg_out32(s, LWZ | TAI(datalo, addrlo, 4));
         }
     } else {
-        uint32_t insn = qemu_ldx_opc[opc];
+        uint32_t insn = qemu_ldx_opc[opc & (MO_BSWAP | MO_SSIZE)];
         if (!HAVE_ISA_2_06 && insn == LDBRX) {
             tcg_out32(s, ADDI | TAI(TCG_REG_R0, addrlo, 4));
             tcg_out32(s, LWBRX | TAB(datalo, rbase, addrlo));
@@ -1696,7 +1696,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
             tcg_out32(s, STW | TAI(datalo, addrlo, 4));
         }
     } else {
-        uint32_t insn = qemu_stx_opc[opc];
+        uint32_t insn = qemu_stx_opc[opc & (MO_BSWAP | MO_SIZE)];
         if (!HAVE_ISA_2_06 && insn == STDBRX) {
             tcg_out32(s, STWBRX | SAB(datalo, rbase, addrlo));
             tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, addrlo, 4));
diff --git a/tcg/s390/tcg-target.c b/tcg/s390/tcg-target.c
index 46dedc9f82..669fafe24f 100644
--- a/tcg/s390/tcg-target.c
+++ b/tcg/s390/tcg-target.c
@@ -1573,7 +1573,7 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     }
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R4, oi);
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R5, (uintptr_t)lb->raddr);
-    tcg_out_call(s, qemu_ld_helpers[opc]);
+    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)]);
     tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_R2);
 
     tgen_gotoi(s, S390_CC_ALWAYS, lb->raddr);
@@ -1610,7 +1610,7 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     }
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R5, oi);
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R6, (uintptr_t)lb->raddr);
-    tcg_out_call(s, qemu_st_helpers[opc]);
+    tcg_out_call(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
     tgen_gotoi(s, S390_CC_ALWAYS, lb->raddr);
 }
diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index c1794a33ed..1a870a81d7 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -1075,12 +1075,11 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
     TCGMemOp memop = get_memop(oi);
 #ifdef CONFIG_SOFTMMU
     unsigned memi = get_mmuidx(oi);
-    TCGMemOp s_bits = memop & MO_SIZE;
     TCGReg addrz, param;
     tcg_insn_unit *func;
     tcg_insn_unit *label_ptr;
 
-    addrz = tcg_out_tlb_load(s, addr, memi, s_bits,
+    addrz = tcg_out_tlb_load(s, addr, memi, memop & MO_SIZE,
                              offsetof(CPUTLBEntry, addr_read));
 
     /* The fast path is exactly one insn.  Thus we can perform the
@@ -1092,7 +1091,8 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
     tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT
                   | (TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC), 0);
     /* delay slot */
-    tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1, qemu_ld_opc[memop]);
+    tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1,
+                    qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
 
     /* TLB Miss.  */
 
@@ -1105,10 +1105,10 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
 
     /* We use the helpers to extend SB and SW data, leaving the case
        of SL needing explicit extending below.  */
-    if ((memop & ~MO_BSWAP) == MO_SL) {
-        func = qemu_ld_trampoline[memop & ~MO_SIGN];
+    if ((memop & MO_SSIZE) == MO_SL) {
+        func = qemu_ld_trampoline[memop & (MO_BSWAP | MO_SIZE)];
     } else {
-        func = qemu_ld_trampoline[memop];
+        func = qemu_ld_trampoline[memop & (MO_BSWAP | MO_SSIZE)];
     }
     assert(func != NULL);
     tcg_out_call_nodelay(s, func);
@@ -1119,13 +1119,13 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
        Which complicates things for sparcv8plus.  */
     if (SPARC64) {
         /* We let the helper sign-extend SB and SW, but leave SL for here.  */
-        if (is_64 && (memop & ~MO_BSWAP) == MO_SL) {
+        if (is_64 && (memop & MO_SSIZE) == MO_SL) {
             tcg_out_arithi(s, data, TCG_REG_O0, 0, SHIFT_SRA);
         } else {
             tcg_out_mov(s, TCG_TYPE_REG, data, TCG_REG_O0);
         }
     } else {
-        if (s_bits == MO_64) {
+        if ((memop & MO_SIZE) == MO_64) {
             tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O0, 32, SHIFT_SLLX);
             tcg_out_arithi(s, TCG_REG_O1, TCG_REG_O1, 0, SHIFT_SRL);
             tcg_out_arith(s, data, TCG_REG_O0, TCG_REG_O1, ARITH_OR);
@@ -1147,7 +1147,7 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
     }
     tcg_out_ldst_rr(s, data, addr,
                     (GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_G0),
-                    qemu_ld_opc[memop]);
+                    qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
 #endif /* CONFIG_SOFTMMU */
 }
 
@@ -1157,12 +1157,11 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
     TCGMemOp memop = get_memop(oi);
 #ifdef CONFIG_SOFTMMU
     unsigned memi = get_mmuidx(oi);
-    TCGMemOp s_bits = memop & MO_SIZE;
     TCGReg addrz, param;
     tcg_insn_unit *func;
     tcg_insn_unit *label_ptr;
 
-    addrz = tcg_out_tlb_load(s, addr, memi, s_bits,
+    addrz = tcg_out_tlb_load(s, addr, memi, memop & MO_SIZE,
                              offsetof(CPUTLBEntry, addr_write));
 
     /* The fast path is exactly one insn.  Thus we can perform the entire
@@ -1172,7 +1171,8 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
     tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT
                   | (TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC), 0);
     /* delay slot */
-    tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1, qemu_st_opc[memop]);
+    tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1,
+                    qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
 
     /* TLB Miss.  */
 
@@ -1182,13 +1182,13 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
         param++;
     }
     tcg_out_mov(s, TCG_TYPE_REG, param++, addr);
-    if (!SPARC64 && s_bits == MO_64) {
+    if (!SPARC64 && (memop & MO_SIZE) == MO_64) {
         /* Skip the high-part; we'll perform the extract in the trampoline.  */
         param++;
     }
     tcg_out_mov(s, TCG_TYPE_REG, param++, data);
 
-    func = qemu_st_trampoline[memop];
+    func = qemu_st_trampoline[memop & (MO_BSWAP | MO_SIZE)];
     assert(func != NULL);
     tcg_out_call_nodelay(s, func);
     /* delay slot */
@@ -1202,7 +1202,7 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
     }
     tcg_out_ldst_rr(s, data, addr,
                     (GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_G0),
-                    qemu_st_opc[memop]);
+                    qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
 #endif /* CONFIG_SOFTMMU */
 }
 
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 8b43bbb122..7e088b1f28 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1076,10 +1076,19 @@ void tcg_dump_ops(TCGContext *s)
                     TCGMemOp op = get_memop(oi);
                     unsigned ix = get_mmuidx(oi);
 
-                    if (op < ARRAY_SIZE(ldst_name) && ldst_name[op]) {
-                        qemu_log(",%s,%u", ldst_name[op], ix);
-                    } else {
+                    if (op & ~(MO_AMASK | MO_BSWAP | MO_SSIZE)) {
                         qemu_log(",$0x%x,%u", op, ix);
+                    } else {
+                        const char *s_al = "", *s_op;
+                        if (op & MO_AMASK) {
+                            if ((op & MO_AMASK) == MO_ALIGN) {
+                                s_al = "al+";
+                            } else {
+                                s_al = "un+";
+                            }
+                        }
+                        s_op = ldst_name[op & (MO_BSWAP | MO_SSIZE)];
+                        qemu_log(",%s%s,%u", s_al, s_op, ix);
                     }
                     i = 1;
                 }
@@ -1378,16 +1387,20 @@ static void tcg_liveness_analysis(TCGContext *s)
                         memset(dead_temps, 1, s->nb_globals);
                     }
 
-                    /* input args are live */
+                    /* record arguments that die in this helper */
                     for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
                         arg = args[i];
                         if (arg != TCG_CALL_DUMMY_ARG) {
                             if (dead_temps[arg]) {
                                 dead_args |= (1 << i);
                             }
-                            dead_temps[arg] = 0;
                         }
                     }
+                    /* input arguments are live for preceeding opcodes */
+                    for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+                        arg = args[i];
+                        dead_temps[arg] = 0;
+                    }
                     s->op_dead_args[oi] = dead_args;
                     s->op_sync_args[oi] = sync_args;
                 }
@@ -1522,12 +1535,16 @@ static void tcg_liveness_analysis(TCGContext *s)
                     memset(mem_temps, 1, s->nb_globals);
                 }
 
-                /* input args are live */
+                /* record arguments that die in this opcode */
                 for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
                     arg = args[i];
                     if (dead_temps[arg]) {
                         dead_args |= (1 << i);
                     }
+                }
+                /* input arguments are live for preceeding opcodes */
+                for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+                    arg = args[i];
                     dead_temps[arg] = 0;
                 }
                 s->op_dead_args[oi] = dead_args;
@@ -1998,6 +2015,16 @@ static void tcg_reg_alloc_op(TCGContext *s,
                 if (!IS_DEAD_ARG(i)) {
                     goto allocate_in_reg;
                 }
+                /* check if the current register has already been allocated
+                   for another input aliased to an output */
+                int k2, i2;
+                for (k2 = 0 ; k2 < k ; k2++) {
+                    i2 = def->sorted_args[nb_oargs + k2];
+                    if ((def->args_ct[i2].ct & TCG_CT_IALIAS) &&
+                        (new_args[i2] == ts->reg)) {
+                        goto allocate_in_reg;
+                    }
+                }
             }
         }
         reg = ts->reg;
diff --git a/tci.c b/tci.c
index a14717d0d5..84449489d2 100644
--- a/tci.c
+++ b/tci.c
@@ -1107,7 +1107,7 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr)
             t0 = *tb_ptr++;
             taddr = tci_read_ulong(&tb_ptr);
             oi = tci_read_i(&tb_ptr);
-            switch (get_memop(oi)) {
+            switch (get_memop(oi) & (MO_BSWAP | MO_SSIZE)) {
             case MO_UB:
                 tmp32 = qemu_ld_ub;
                 break;
@@ -1144,7 +1144,7 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr)
             }
             taddr = tci_read_ulong(&tb_ptr);
             oi = tci_read_i(&tb_ptr);
-            switch (get_memop(oi)) {
+            switch (get_memop(oi) & (MO_BSWAP | MO_SSIZE)) {
             case MO_UB:
                 tmp64 = qemu_ld_ub;
                 break;
@@ -1193,7 +1193,7 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr)
             t0 = tci_read_r(&tb_ptr);
             taddr = tci_read_ulong(&tb_ptr);
             oi = tci_read_i(&tb_ptr);
-            switch (get_memop(oi)) {
+            switch (get_memop(oi) & (MO_BSWAP | MO_SIZE)) {
             case MO_UB:
                 qemu_st_b(t0);
                 break;
@@ -1217,7 +1217,7 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr)
             tmp64 = tci_read_r64(&tb_ptr);
             taddr = tci_read_ulong(&tb_ptr);
             oi = tci_read_i(&tb_ptr);
-            switch (get_memop(oi)) {
+            switch (get_memop(oi) & (MO_BSWAP | MO_SIZE)) {
             case MO_UB:
                 qemu_st_b(tmp64);
                 break;