summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2014-02-20 15:02:06 +0000
committerPeter Maydell <peter.maydell@linaro.org>2014-02-20 15:02:07 +0000
commit3d2bb5cc81ca52dcff854172625a3bb33987495c (patch)
tree464c866b0d1337bb232669cbdb7591880926ff20
parent61e8a923646903d76a6d952019716b417d42eedc (diff)
parent6399ab3325b7d4f77441c8a00fa9dae98bb0ac43 (diff)
downloadfocaccia-qemu-3d2bb5cc81ca52dcff854172625a3bb33987495c.tar.gz
focaccia-qemu-3d2bb5cc81ca52dcff854172625a3bb33987495c.zip
Merge remote-tracking branch 'remotes/rth/tcg-next' into staging
* remotes/rth/tcg-next:
  tcg/i386: Use SHLX/SHRX/SARX instructions
  tcg/i386: Use ANDN instruction
  tcg/i386: Add tcg_out_vex_modrm
  tcg/i386: Move TCG_CT_CONST_* to tcg-target.c
  disas/i386: Disassemble ANDN/SHLX/SHRX/SHAX
  tcg/optimize: Add more identity simplifications
  tcg/optimize: Optmize ANDC X,Y,Y to MOV X,0
  tcg/optimize: Simply some logical ops to NOT
  tcg/optimize: Handle known-zeros masks for ANDC
  tcg/optimize: add known-zero bits compute for load ops
  tcg/optimize: improve known-zero bits for 32-bit ops
  tcg/optimize: fix known-zero bits optimization
  tcg/optimize: fix known-zero bits for right shift ops
  tcg-arm: The shift count of op_rotl_i32 is in args[2] not args[1].
  TCG: Fix 32-bit host allocation typo

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--disas/i386.c146
-rw-r--r--tcg/arm/tcg-target.c2
-rw-r--r--tcg/i386/tcg-target.c156
-rw-r--r--tcg/i386/tcg-target.h9
-rw-r--r--tcg/optimize.c165
-rw-r--r--tcg/tcg.c2
6 files changed, 414 insertions, 66 deletions
diff --git a/disas/i386.c b/disas/i386.c
index 044e02c032..00ceca9c51 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -171,6 +171,7 @@ static void print_operand_value (char *buf, size_t bufsize, int hex, bfd_vma dis
 static void print_displacement (char *, bfd_vma);
 static void OP_E (int, int);
 static void OP_G (int, int);
+static void OP_vvvv (int, int);
 static bfd_vma get64 (void);
 static bfd_signed_vma get32 (void);
 static bfd_signed_vma get32s (void);
@@ -264,6 +265,9 @@ static int rex_used;
    current instruction.  */
 static int used_prefixes;
 
+/* The VEX.vvvv register, unencoded.  */
+static int vex_reg;
+
 /* Flags stored in PREFIXES.  */
 #define PREFIX_REPZ 1
 #define PREFIX_REPNZ 2
@@ -278,6 +282,10 @@ static int used_prefixes;
 #define PREFIX_ADDR 0x400
 #define PREFIX_FWAIT 0x800
 
+#define PREFIX_VEX_0F    0x1000
+#define PREFIX_VEX_0F38  0x2000
+#define PREFIX_VEX_0F3A  0x4000
+
 /* Make sure that bytes from INFO->PRIVATE_DATA->BUFFER (inclusive)
    to ADDR (exclusive) are valid.  Returns 1 for success, longjmps
    on error.  */
@@ -323,6 +331,7 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr)
 
 #define XX { NULL, 0 }
 
+#define Bv { OP_vvvv, v_mode }
 #define Eb { OP_E, b_mode }
 #define Ev { OP_E, v_mode }
 #define Ed { OP_E, d_mode }
@@ -671,7 +680,8 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr)
 #define PREGRP102 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 102 } }
 #define PREGRP103 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 103 } }
 #define PREGRP104 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 104 } }
-
+#define PREGRP105 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 105 } }
+#define PREGRP106 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 106 } }
 
 #define X86_64_0  NULL, { { NULL, X86_64_SPECIAL }, { NULL, 0 } }
 #define X86_64_1  NULL, { { NULL, X86_64_SPECIAL }, { NULL, 1 } }
@@ -1449,7 +1459,7 @@ static const unsigned char threebyte_0x38_uses_DATA_prefix[256] = {
   /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
   /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1, /* df */
   /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
-  /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
+  /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
   /*       -------------------------------        */
   /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
 };
@@ -1473,7 +1483,7 @@ static const unsigned char threebyte_0x38_uses_REPNZ_prefix[256] = {
   /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
   /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */
   /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
-  /* f0 */ 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
+  /* f0 */ 1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
   /*       -------------------------------        */
   /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
 };
@@ -1497,7 +1507,7 @@ static const unsigned char threebyte_0x38_uses_REPZ_prefix[256] = {
   /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
   /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */
   /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
-  /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
+  /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
   /*       -------------------------------        */
   /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
 };
@@ -2774,6 +2784,22 @@ static const struct dis386 prefix_user_table[][4] = {
     { "(bad)",	{ XX } },
   },
 
+  /* PREGRP105 */
+  {
+    { "andnS",	{ Gv, Bv, Ev } },
+    { "(bad)",	{ XX } },
+    { "(bad)",	{ XX } },
+    { "(bad)",	{ XX } },
+  },
+
+  /* PREGRP106 */
+  {
+    { "bextrS",	{ Gv, Ev, Bv } },
+    { "sarxS",	{ Gv, Ev, Bv } },
+    { "shlxS",	{ Gv, Ev, Bv } },
+    { "shrxS",	{ Gv, Ev, Bv } },
+  },
+
 };
 
 static const struct dis386 x86_64_table[][2] = {
@@ -3071,12 +3097,12 @@ static const struct dis386 three_byte_table[][256] = {
     /* f0 */
     { PREGRP87 },
     { PREGRP88 },
+    { PREGRP105 },
     { "(bad)", { XX } },
     { "(bad)", { XX } },
     { "(bad)", { XX } },
     { "(bad)", { XX } },
-    { "(bad)", { XX } },
-    { "(bad)", { XX } },
+    { PREGRP106 },
     /* f8 */
     { "(bad)", { XX } },
     { "(bad)", { XX } },
@@ -3477,6 +3503,74 @@ ckprefix (void)
     }
 }
 
+static void
+ckvexprefix (void)
+{
+    int op, vex2, vex3, newrex = 0, newpfx = prefixes;
+
+    if (address_mode == mode_16bit) {
+        return;
+    }
+
+    fetch_data(the_info, codep + 1);
+    op = *codep;
+
+    if (op != 0xc4 && op != 0xc5) {
+        return;
+    }
+
+    fetch_data(the_info, codep + 2);
+    vex2 = codep[1];
+
+    if (address_mode == mode_32bit && (vex2 & 0xc0) != 0xc0) {
+        return;
+    }
+
+    if (op == 0xc4) {
+        /* Three byte VEX prefix.  */
+        fetch_data(the_info, codep + 3);
+        vex3 = codep[2];
+
+        newrex |= (vex2 & 0x80 ? 0 : REX_R);
+        newrex |= (vex2 & 0x40 ? 0 : REX_X);
+        newrex |= (vex2 & 0x20 ? 0 : REX_B);
+        newrex |= (vex3 & 0x80 ? REX_W : 0);
+        switch (vex2 & 0x1f) {      /* VEX.m-mmmm */
+        case 1:
+            newpfx |= PREFIX_VEX_0F;
+            break;
+        case 2:
+            newpfx |= PREFIX_VEX_0F | PREFIX_VEX_0F38;
+            break;
+        case 3:
+            newpfx |= PREFIX_VEX_0F | PREFIX_VEX_0F3A;
+            break;
+        }
+        vex2 = vex3;
+        codep += 3;
+    } else {
+        /* Two byte VEX prefix.  */
+        newrex |= (vex2 & 0x80 ? 0 : REX_R);
+        codep += 2;
+    }
+
+    vex_reg = (~vex2 >> 3) & 15;     /* VEX.vvvv */
+    switch (vex2 & 3) {              /* VEX.pp */
+    case 1:
+        newpfx |= PREFIX_DATA;     /* 0x66 */
+        break;
+    case 2:
+        newpfx |= PREFIX_REPZ;     /* 0xf3 */
+        break;
+    case 3:
+        newpfx |= PREFIX_REPNZ;    /* 0xf2 */
+        break;
+    }
+
+    rex = newrex;
+    prefixes = newpfx;
+}
+
 /* Return the name of the prefix byte PREF, or NULL if PREF is not a
    prefix byte.  */
 
@@ -3598,6 +3692,7 @@ print_insn (bfd_vma pc, disassemble_info *info)
   const char *p;
   struct dis_private priv;
   unsigned char op;
+  unsigned char threebyte;
 
   if (info->mach == bfd_mach_x86_64_intel_syntax
       || info->mach == bfd_mach_x86_64)
@@ -3752,6 +3847,7 @@ print_insn (bfd_vma pc, disassemble_info *info)
 
   obufp = obuf;
   ckprefix ();
+  ckvexprefix ();
 
   insn_codep = codep;
   sizeflag = priv.orig_sizeflag;
@@ -3775,18 +3871,29 @@ print_insn (bfd_vma pc, disassemble_info *info)
     }
 
   op = 0;
+  if (prefixes & PREFIX_VEX_0F)
+    {
+      used_prefixes |= PREFIX_VEX_0F | PREFIX_VEX_0F38 | PREFIX_VEX_0F3A;
+      if (prefixes & PREFIX_VEX_0F38)
+        threebyte = 0x38;
+      else if (prefixes & PREFIX_VEX_0F3A)
+        threebyte = 0x3a;
+      else
+        threebyte = *codep++;
+      goto vex_opcode;
+    }
   if (*codep == 0x0f)
     {
-      unsigned char threebyte;
       fetch_data(info, codep + 2);
-      threebyte = *++codep;
+      threebyte = codep[1];
+      codep += 2;
+    vex_opcode:
       dp = &dis386_twobyte[threebyte];
-      need_modrm = twobyte_has_modrm[*codep];
-      uses_DATA_prefix = twobyte_uses_DATA_prefix[*codep];
-      uses_REPNZ_prefix = twobyte_uses_REPNZ_prefix[*codep];
-      uses_REPZ_prefix = twobyte_uses_REPZ_prefix[*codep];
-      uses_LOCK_prefix = (*codep & ~0x02) == 0x20;
-      codep++;
+      need_modrm = twobyte_has_modrm[threebyte];
+      uses_DATA_prefix = twobyte_uses_DATA_prefix[threebyte];
+      uses_REPNZ_prefix = twobyte_uses_REPNZ_prefix[threebyte];
+      uses_REPZ_prefix = twobyte_uses_REPZ_prefix[threebyte];
+      uses_LOCK_prefix = (threebyte & ~0x02) == 0x20;
       if (dp->name == NULL && dp->op[0].bytemode == IS_3BYTE_OPCODE)
 	{
           fetch_data(info, codep + 2);
@@ -5291,6 +5398,17 @@ OP_G (int bytemode, int sizeflag)
     }
 }
 
+static void
+OP_vvvv (int bytemode, int sizeflags)
+{
+    USED_REX (REX_W);
+    if (rex & REX_W) {
+        oappend(names64[vex_reg]);
+    } else {
+        oappend(names32[vex_reg]);
+    }
+}
+
 static bfd_vma
 get64 (void)
 {
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index 82658a170c..c8884b31f4 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1866,7 +1866,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                             SHIFT_IMM_ROR((0x20 - args[2]) & 0x1f) :
                             SHIFT_IMM_LSL(0));
         } else {
-            tcg_out_dat_imm(s, COND_AL, ARITH_RSB, TCG_REG_TMP, args[1], 0x20);
+            tcg_out_dat_imm(s, COND_AL, ARITH_RSB, TCG_REG_TMP, args[2], 0x20);
             tcg_out_dat_reg(s, COND_AL, ARITH_MOV, args[0], 0, args[1],
                             SHIFT_REG_ROR(TCG_REG_TMP));
         }
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 5d4cf9386e..fef1717418 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -88,6 +88,11 @@ static const int tcg_target_call_oarg_regs[] = {
 #endif
 };
 
+/* Constants we accept.  */
+#define TCG_CT_CONST_S32 0x100
+#define TCG_CT_CONST_U32 0x200
+#define TCG_CT_CONST_I32 0x400
+
 /* Registers used with L constraint, which are the first argument 
    registers on x86_64, and two random call clobbered registers on
    i386. */
@@ -124,6 +129,16 @@ static bool have_movbe;
 # define have_movbe 0
 #endif
 
+/* We need this symbol in tcg-target.h, and we can't properly conditionalize
+   it there.  Therefore we always define the variable.  */
+bool have_bmi1;
+
+#if defined(CONFIG_CPUID_H) && defined(bit_BMI2)
+static bool have_bmi2;
+#else
+# define have_bmi2 0
+#endif
+
 static uint8_t *tb_ret_addr;
 
 static void patch_reloc(uint8_t *code_ptr, int type,
@@ -166,6 +181,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
         break;
     case 'c':
+    case_c:
         ct->ct |= TCG_CT_REG;
         tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
         break;
@@ -194,6 +210,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         tcg_regset_set32(ct->u.regs, 0, 0xf);
         break;
     case 'r':
+    case_r:
         ct->ct |= TCG_CT_REG;
         if (TCG_TARGET_REG_BITS == 64) {
             tcg_regset_set32(ct->u.regs, 0, 0xffff);
@@ -201,6 +218,13 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
             tcg_regset_set32(ct->u.regs, 0, 0xff);
         }
         break;
+    case 'C':
+        /* With SHRX et al, we need not use ECX as shift count register.  */
+        if (have_bmi2) {
+            goto case_r;
+        } else {
+            goto case_c;
+        }
 
         /* qemu_ld/st address constraint */
     case 'L':
@@ -220,6 +244,9 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
     case 'Z':
         ct->ct |= TCG_CT_CONST_U32;
         break;
+    case 'I':
+        ct->ct |= TCG_CT_CONST_I32;
+        break;
 
     default:
         return -1;
@@ -243,6 +270,9 @@ static inline int tcg_target_const_match(tcg_target_long val,
     if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
         return 1;
     }
+    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
+        return 1;
+    }
     return 0;
 }
 
@@ -268,10 +298,13 @@ static inline int tcg_target_const_match(tcg_target_long val,
 # define P_REXB_RM	0
 # define P_GS           0
 #endif
+#define P_SIMDF3        0x10000         /* 0xf3 opcode prefix */
+#define P_SIMDF2        0x20000         /* 0xf2 opcode prefix */
 
 #define OPC_ARITH_EvIz	(0x81)
 #define OPC_ARITH_EvIb	(0x83)
 #define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
+#define OPC_ANDN        (0xf2 | P_EXT38)
 #define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
 #define OPC_BSWAP	(0xc8 | P_EXT)
 #define OPC_CALL_Jz	(0xe8)
@@ -309,6 +342,9 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_SHIFT_1	(0xd1)
 #define OPC_SHIFT_Ib	(0xc1)
 #define OPC_SHIFT_cl	(0xd3)
+#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
+#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
+#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 #define OPC_TESTL	(0x85)
 #define OPC_XCHG_ax_r32	(0x90)
 
@@ -398,9 +434,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 
     rex = 0;
     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
-    rex |= (r & 8) >> 1;		/* REX.R */
-    rex |= (x & 8) >> 2;		/* REX.X */
-    rex |= (rm & 8) >> 3;		/* REX.B */
+    rex |= (r & 8) >> 1;                /* REX.R */
+    rex |= (x & 8) >> 2;                /* REX.X */
+    rex |= (rm & 8) >> 3;               /* REX.B */
 
     /* P_REXB_{R,RM} indicates that the given register is the low byte.
        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
@@ -449,6 +485,48 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 }
 
+static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+{
+    int tmp;
+
+    if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
+        /* Three byte VEX prefix.  */
+        tcg_out8(s, 0xc4);
+
+        /* VEX.m-mmmm */
+        if (opc & P_EXT38) {
+            tmp = 2;
+        } else if (opc & P_EXT) {
+            tmp = 1;
+        } else {
+            tcg_abort();
+        }
+        tmp |= 0x40;                       /* VEX.X */
+        tmp |= (r & 8 ? 0 : 0x80);         /* VEX.R */
+        tmp |= (rm & 8 ? 0 : 0x20);        /* VEX.B */
+        tcg_out8(s, tmp);
+
+        tmp = (opc & P_REXW ? 0x80 : 0);   /* VEX.W */
+    } else {
+        /* Two byte VEX prefix.  */
+        tcg_out8(s, 0xc5);
+
+        tmp = (r & 8 ? 0 : 0x80);          /* VEX.R */
+    }
+    /* VEX.pp */
+    if (opc & P_DATA16) {
+        tmp |= 1;                          /* 0x66 */
+    } else if (opc & P_SIMDF3) {
+        tmp |= 2;                          /* 0xf3 */
+    } else if (opc & P_SIMDF2) {
+        tmp |= 3;                          /* 0xf2 */
+    }
+    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
+    tcg_out8(s, tmp);
+    tcg_out8(s, opc);
+    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
+}
+
 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
    We handle either RM and INDEX missing with a negative value.  In 64-bit
    mode for absolute addresses, ~RM is the size of the immediate operand
@@ -1638,7 +1716,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {
-    int c, rexw = 0;
+    int c, vexop, rexw = 0;
 
 #if TCG_TARGET_REG_BITS == 64
 # define OP_32_64(x) \
@@ -1774,6 +1852,16 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    OP_32_64(andc):
+        if (const_args[2]) {
+            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32,
+                        args[0], args[1]);
+            tgen_arithi(s, ARITH_AND + rexw, args[0], ~args[2], 0);
+        } else {
+            tcg_out_vex_modrm(s, OPC_ANDN + rexw, args[0], args[2], args[1]);
+        }
+        break;
+
     OP_32_64(mul):
         if (const_args[2]) {
             int32_t val;
@@ -1799,19 +1887,28 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     OP_32_64(shl):
         c = SHIFT_SHL;
-        goto gen_shift;
+        vexop = OPC_SHLX;
+        goto gen_shift_maybe_vex;
     OP_32_64(shr):
         c = SHIFT_SHR;
-        goto gen_shift;
+        vexop = OPC_SHRX;
+        goto gen_shift_maybe_vex;
     OP_32_64(sar):
         c = SHIFT_SAR;
-        goto gen_shift;
+        vexop = OPC_SARX;
+        goto gen_shift_maybe_vex;
     OP_32_64(rotl):
         c = SHIFT_ROL;
         goto gen_shift;
     OP_32_64(rotr):
         c = SHIFT_ROR;
         goto gen_shift;
+    gen_shift_maybe_vex:
+        if (have_bmi2 && !const_args[2]) {
+            tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]);
+            break;
+        }
+        /* FALLTHRU */
     gen_shift:
         if (const_args[2]) {
             tcg_out_shifti(s, c + rexw, args[0], args[2]);
@@ -2002,10 +2099,11 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_and_i32, { "r", "0", "ri" } },
     { INDEX_op_or_i32, { "r", "0", "ri" } },
     { INDEX_op_xor_i32, { "r", "0", "ri" } },
+    { INDEX_op_andc_i32, { "r", "r", "ri" } },
 
-    { INDEX_op_shl_i32, { "r", "0", "ci" } },
-    { INDEX_op_shr_i32, { "r", "0", "ci" } },
-    { INDEX_op_sar_i32, { "r", "0", "ci" } },
+    { INDEX_op_shl_i32, { "r", "0", "Ci" } },
+    { INDEX_op_shr_i32, { "r", "0", "Ci" } },
+    { INDEX_op_sar_i32, { "r", "0", "Ci" } },
     { INDEX_op_rotl_i32, { "r", "0", "ci" } },
     { INDEX_op_rotr_i32, { "r", "0", "ci" } },
 
@@ -2059,10 +2157,11 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_and_i64, { "r", "0", "reZ" } },
     { INDEX_op_or_i64, { "r", "0", "re" } },
     { INDEX_op_xor_i64, { "r", "0", "re" } },
+    { INDEX_op_andc_i64, { "r", "r", "rI" } },
 
-    { INDEX_op_shl_i64, { "r", "0", "ci" } },
-    { INDEX_op_shr_i64, { "r", "0", "ci" } },
-    { INDEX_op_sar_i64, { "r", "0", "ci" } },
+    { INDEX_op_shl_i64, { "r", "0", "Ci" } },
+    { INDEX_op_shr_i64, { "r", "0", "Ci" } },
+    { INDEX_op_sar_i64, { "r", "0", "Ci" } },
     { INDEX_op_rotl_i64, { "r", "0", "ci" } },
     { INDEX_op_rotr_i64, { "r", "0", "ci" } },
 
@@ -2196,25 +2295,34 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 
 static void tcg_target_init(TCGContext *s)
 {
-#if !(defined(have_cmov) && defined(have_movbe))
-    {
-        unsigned a, b, c, d;
-        int ret = __get_cpuid(1, &a, &b, &c, &d);
+    unsigned a, b, c, d;
+    int max = __get_cpuid_max(0, 0);
 
-# ifndef have_cmov
+    if (max >= 1) {
+        __cpuid(1, a, b, c, d);
+#ifndef have_cmov
         /* For 32-bit, 99% certainty that we're running on hardware that
            supports cmov, but we still need to check.  In case cmov is not
            available, we'll use a small forward branch.  */
-        have_cmov = ret && (d & bit_CMOV);
-# endif
-
-# ifndef have_movbe
+        have_cmov = (d & bit_CMOV) != 0;
+#endif
+#ifndef have_movbe
         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
            need to probe for it.  */
-        have_movbe = ret && (c & bit_MOVBE);
-# endif
+        have_movbe = (c & bit_MOVBE) != 0;
+#endif
     }
+
+    if (max >= 7) {
+        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
+        __cpuid_count(7, 0, a, b, c, d);
+#ifdef bit_BMI
+        have_bmi1 = (b & bit_BMI) != 0;
 #endif
+#ifndef have_bmi2
+        have_bmi2 = (b & bit_BMI2) != 0;
+#endif
+    }
 
     if (TCG_TARGET_REG_BITS == 64) {
         tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 92c0fcd36d..bdf2222452 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -64,9 +64,6 @@ typedef enum {
     TCG_REG_RDI = TCG_REG_EDI,
 } TCGReg;
 
-#define TCG_CT_CONST_S32 0x100
-#define TCG_CT_CONST_U32 0x200
-
 /* used for function call generation */
 #define TCG_REG_CALL_STACK TCG_REG_ESP 
 #define TCG_TARGET_STACK_ALIGN 16
@@ -76,6 +73,8 @@ typedef enum {
 #define TCG_TARGET_CALL_STACK_OFFSET 0
 #endif
 
+extern bool have_bmi1;
+
 /* optional instructions */
 #define TCG_TARGET_HAS_div2_i32         1
 #define TCG_TARGET_HAS_rot_i32          1
@@ -87,7 +86,7 @@ typedef enum {
 #define TCG_TARGET_HAS_bswap32_i32      1
 #define TCG_TARGET_HAS_neg_i32          1
 #define TCG_TARGET_HAS_not_i32          1
-#define TCG_TARGET_HAS_andc_i32         0
+#define TCG_TARGET_HAS_andc_i32         have_bmi1
 #define TCG_TARGET_HAS_orc_i32          0
 #define TCG_TARGET_HAS_eqv_i32          0
 #define TCG_TARGET_HAS_nand_i32         0
@@ -115,7 +114,7 @@ typedef enum {
 #define TCG_TARGET_HAS_bswap64_i64      1
 #define TCG_TARGET_HAS_neg_i64          1
 #define TCG_TARGET_HAS_not_i64          1
-#define TCG_TARGET_HAS_andc_i64         0
+#define TCG_TARGET_HAS_andc_i64         have_bmi1
 #define TCG_TARGET_HAS_orc_i64          0
 #define TCG_TARGET_HAS_eqv_i64          0
 #define TCG_TARGET_HAS_nand_i64         0
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 89e2d6a3b3..7777743e88 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -655,11 +655,68 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                 }
             }
             break;
+        CASE_OP_32_64(xor):
+        CASE_OP_32_64(nand):
+            if (temps[args[1]].state != TCG_TEMP_CONST
+                && temps[args[2]].state == TCG_TEMP_CONST
+                && temps[args[2]].val == -1) {
+                i = 1;
+                goto try_not;
+            }
+            break;
+        CASE_OP_32_64(nor):
+            if (temps[args[1]].state != TCG_TEMP_CONST
+                && temps[args[2]].state == TCG_TEMP_CONST
+                && temps[args[2]].val == 0) {
+                i = 1;
+                goto try_not;
+            }
+            break;
+        CASE_OP_32_64(andc):
+            if (temps[args[2]].state != TCG_TEMP_CONST
+                && temps[args[1]].state == TCG_TEMP_CONST
+                && temps[args[1]].val == -1) {
+                i = 2;
+                goto try_not;
+            }
+            break;
+        CASE_OP_32_64(orc):
+        CASE_OP_32_64(eqv):
+            if (temps[args[2]].state != TCG_TEMP_CONST
+                && temps[args[1]].state == TCG_TEMP_CONST
+                && temps[args[1]].val == 0) {
+                i = 2;
+                goto try_not;
+            }
+            break;
+        try_not:
+            {
+                TCGOpcode not_op;
+                bool have_not;
+
+                if (def->flags & TCG_OPF_64BIT) {
+                    not_op = INDEX_op_not_i64;
+                    have_not = TCG_TARGET_HAS_not_i64;
+                } else {
+                    not_op = INDEX_op_not_i32;
+                    have_not = TCG_TARGET_HAS_not_i32;
+                }
+                if (!have_not) {
+                    break;
+                }
+                s->gen_opc_buf[op_index] = not_op;
+                reset_temp(args[0]);
+                gen_args[0] = args[0];
+                gen_args[1] = args[i];
+                args += 3;
+                gen_args += 2;
+                continue;
+            }
         default:
             break;
         }
 
-        /* Simplify expression for "op r, a, 0 => mov r, a" cases */
+        /* Simplify expression for "op r, a, const => mov r, a" cases */
         switch (op) {
         CASE_OP_32_64(add):
         CASE_OP_32_64(sub):
@@ -670,28 +727,38 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(rotr):
         CASE_OP_32_64(or):
         CASE_OP_32_64(xor):
-            if (temps[args[1]].state == TCG_TEMP_CONST) {
-                /* Proceed with possible constant folding. */
-                break;
-            }
-            if (temps[args[2]].state == TCG_TEMP_CONST
+        CASE_OP_32_64(andc):
+            if (temps[args[1]].state != TCG_TEMP_CONST
+                && temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0) {
-                if (temps_are_copies(args[0], args[1])) {
-                    s->gen_opc_buf[op_index] = INDEX_op_nop;
-                } else {
-                    s->gen_opc_buf[op_index] = op_to_mov(op);
-                    tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
-                    gen_args += 2;
-                }
-                args += 3;
-                continue;
+                goto do_mov3;
+            }
+            break;
+        CASE_OP_32_64(and):
+        CASE_OP_32_64(orc):
+        CASE_OP_32_64(eqv):
+            if (temps[args[1]].state != TCG_TEMP_CONST
+                && temps[args[2]].state == TCG_TEMP_CONST
+                && temps[args[2]].val == -1) {
+                goto do_mov3;
             }
             break;
+        do_mov3:
+            if (temps_are_copies(args[0], args[1])) {
+                s->gen_opc_buf[op_index] = INDEX_op_nop;
+            } else {
+                s->gen_opc_buf[op_index] = op_to_mov(op);
+                tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
+                gen_args += 2;
+            }
+            args += 3;
+            continue;
         default:
             break;
         }
 
-        /* Simplify using known-zero bits */
+        /* Simplify using known-zero bits. Currently only ops with a single
+           output argument is supported. */
         mask = -1;
         affected = -1;
         switch (op) {
@@ -726,16 +793,36 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             mask = temps[args[1]].mask & mask;
             break;
 
-        CASE_OP_32_64(sar):
+        CASE_OP_32_64(andc):
+            /* Known-zeros does not imply known-ones.  Therefore unless
+               args[2] is constant, we can't infer anything from it.  */
             if (temps[args[2]].state == TCG_TEMP_CONST) {
-                mask = ((tcg_target_long)temps[args[1]].mask
-                        >> temps[args[2]].val);
+                mask = ~temps[args[2]].mask;
+                goto and_const;
             }
+            /* But we certainly know nothing outside args[1] may be set. */
+            mask = temps[args[1]].mask;
             break;
 
-        CASE_OP_32_64(shr):
+        case INDEX_op_sar_i32:
             if (temps[args[2]].state == TCG_TEMP_CONST) {
-                mask = temps[args[1]].mask >> temps[args[2]].val;
+                mask = (int32_t)temps[args[1]].mask >> temps[args[2]].val;
+            }
+            break;
+        case INDEX_op_sar_i64:
+            if (temps[args[2]].state == TCG_TEMP_CONST) {
+                mask = (int64_t)temps[args[1]].mask >> temps[args[2]].val;
+            }
+            break;
+
+        case INDEX_op_shr_i32:
+            if (temps[args[2]].state == TCG_TEMP_CONST) {
+                mask = (uint32_t)temps[args[1]].mask >> temps[args[2]].val;
+            }
+            break;
+        case INDEX_op_shr_i64:
+            if (temps[args[2]].state == TCG_TEMP_CONST) {
+                mask = (uint64_t)temps[args[1]].mask >> temps[args[2]].val;
             }
             break;
 
@@ -769,10 +856,40 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             mask = temps[args[3]].mask | temps[args[4]].mask;
             break;
 
+        CASE_OP_32_64(ld8u):
+        case INDEX_op_qemu_ld8u:
+            mask = 0xff;
+            break;
+        CASE_OP_32_64(ld16u):
+        case INDEX_op_qemu_ld16u:
+            mask = 0xffff;
+            break;
+        case INDEX_op_ld32u_i64:
+#if TCG_TARGET_REG_BITS == 64
+        case INDEX_op_qemu_ld32u:
+#endif
+            mask = 0xffffffffu;
+            break;
+
+        CASE_OP_32_64(qemu_ld):
+            {
+                TCGMemOp mop = args[def->nb_oargs + def->nb_iargs];
+                if (!(mop & MO_SIGN)) {
+                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
+                }
+            }
+            break;
+
         default:
             break;
         }
 
+        /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit
+           results */
+        if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) {
+            mask &= 0xffffffffu;
+        }
+
         if (mask == 0) {
             assert(def->nb_oargs == 1);
             s->gen_opc_buf[op_index] = op_to_movi(op);
@@ -839,6 +956,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
 
         /* Simplify expression for "op r, a, a => movi r, 0" cases */
         switch (op) {
+        CASE_OP_32_64(andc):
         CASE_OP_32_64(sub):
         CASE_OP_32_64(xor):
             if (temps_are_copies(args[1], args[2])) {
@@ -1140,6 +1258,11 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             } else {
                 for (i = 0; i < def->nb_oargs; i++) {
                     reset_temp(args[i]);
+                    /* Save the corresponding known-zero bits mask for the
+                       first output argument (only one supported so far). */
+                    if (i == 0) {
+                        temps[args[i]].mask = mask;
+                    }
                 }
             }
             for (i = 0; i < def->nb_args; i++) {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index acd02b99b6..ffc851e0c6 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -526,7 +526,7 @@ static inline int tcg_temp_new_internal(TCGType type, int temp_local)
             ts->temp_local = temp_local;
             ts->name = NULL;
             ts++;
-            ts->base_type = TCG_TYPE_I32;
+            ts->base_type = type;
             ts->type = TCG_TYPE_I32;
             ts->temp_allocated = 1;
             ts->temp_local = temp_local;