Diffstat (limited to 'tcg/i386/tcg-target.inc.c')
-rw-r--r--  tcg/i386/tcg-target.inc.c  |  36 +++++++++++++++++++++++++++---------
1 file changed, 27 insertions(+), 9 deletions(-)
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 6f8cdca756..cf7536bd52 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -686,6 +686,18 @@ static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
     }
 }
 
+static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
+{
+    /* Given the strength of x86 memory ordering, we only need to care
+       about store-load ordering.  Experimentally, "lock orl $0,0(%esp)"
+       is faster than "mfence", so don't bother with the SSE instruction.  */
+    if (a0 & TCG_MO_ST_LD) {
+        tcg_out8(s, 0xf0);
+        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
+        tcg_out8(s, 0);
+    }
+}
+
 static inline void tcg_out_push(TCGContext *s, int reg)
 {
     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
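The three tcg_out calls in tcg_out_mb() emit the LOCK prefix (0xf0), an
OR-with-imm8 to 0(%esp), and the immediate byte 0 -- i.e. the
"lock orl $0,0(%esp)" named in the comment: a locked no-op read-modify-write
of the stack slot, which acts as a full barrier on x86.  A minimal host-side
sketch of the same barrier, assuming GCC-style inline assembly (illustration
only, not part of this patch):

    /* Hypothetical helper, not QEMU code: the StoreLoad barrier that
       tcg_out_mb() generates, expressed as inline assembly.  A dummy
       stack variable keeps the snippet valid in both 32- and 64-bit
       mode; the LOCK prefix turns the no-op OR into a full barrier,
       and the "memory" clobber stops compiler reordering.  */
    static inline void x86_store_load_barrier(void)
    {
        int dummy = 0;
        __asm__ __volatile__("lock; orl $0, %0"
                             : "+m"(dummy) : : "memory", "cc");
    }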
@@ -1202,7 +1214,10 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     TCGType ttype = TCG_TYPE_I32;
     TCGType tlbtype = TCG_TYPE_I32;
     int trexw = 0, hrexw = 0, tlbrexw = 0;
-    int a_bits = get_alignment_bits(opc);
+    unsigned a_bits = get_alignment_bits(opc);
+    unsigned s_bits = opc & MO_SIZE;
+    unsigned a_mask = (1 << a_bits) - 1;
+    unsigned s_mask = (1 << s_bits) - 1;
     target_ulong tlb_mask;
 
     if (TCG_TARGET_REG_BITS == 64) {
@@ -1220,17 +1235,15 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     }
 
     tcg_out_mov(s, tlbtype, r0, addrlo);
-    if (a_bits >= 0) {
-        /* A byte access or an alignment check required */
+    /* If the required alignment is at least as large as the access, simply
+       copy the address and mask.  For smaller alignments, check that the
+       complete access does not cross a page boundary.  */
+    if (a_bits >= s_bits) {
         tcg_out_mov(s, ttype, r1, addrlo);
-        tlb_mask = TARGET_PAGE_MASK | ((1 << a_bits) - 1);
     } else {
-        /* For unaligned access check that we don't cross pages using
-           the page address of the last byte.  */
-        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo,
-                             (1 << (opc & MO_SIZE)) - 1);
-        tlb_mask = TARGET_PAGE_MASK;
+        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
     }
+    tlb_mask = TARGET_PAGE_MASK | a_mask;
 
     tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
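To make the new mask arithmetic concrete, here is a worked example with
assumed values (a 4-byte MO_32 load requiring only 2-byte alignment, with
4 KiB target pages; the numbers are illustrative, not from the patch):

    /* Assumed values: s_bits = 2 (4-byte access), a_bits = 1 (2-byte
       alignment required), TARGET_PAGE_MASK = 0xfffff000.  */
    unsigned a_mask = (1 << 1) - 1;     /* 0x1 */
    unsigned s_mask = (1 << 2) - 1;     /* 0x3 */

    /* a_bits < s_bits, so the LEA adds s_mask - a_mask = 2: if an
       address that passes the 2-byte alignment check still has its
       last byte on the same page, the whole 4-byte access does.  */

    /* tlb_mask = 0xfffff000 | 0x1 = 0xfffff001: keeping the low
       alignment bit in the TLB compare means a misaligned address
       can never match the entry, forcing it onto the slow path.  */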
@@ -2130,6 +2143,9 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    case INDEX_op_mb:
+        tcg_out_mb(s, args[0]);
+        break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
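For context, the flags tested in tcg_out_mb() arrive through the generic
barrier op that this new case dispatches.  A hypothetical front-end call
site might look like the line below (tcg_gen_mb() and the TCG_MO_*/TCG_BAR_*
constants belong to the common TCG barrier support this backend patch
implements; the usage is assumed, not taken from this patch):

    /* Request a sequentially consistent full barrier.  On i386, only
       the TCG_MO_ST_LD bit causes tcg_out_mb() to emit any code; the
       remaining orderings are already guaranteed by the hardware
       memory model.  */
    tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);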
@@ -2195,6 +2211,8 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_add2_i32, { "r", "r", "0", "1", "ri", "ri" } },
     { INDEX_op_sub2_i32, { "r", "r", "0", "1", "ri", "ri" } },
 
+    { INDEX_op_mb, { } },
+
 #if TCG_TARGET_REG_BITS == 32
     { INDEX_op_brcond2_i32, { "r", "r", "ri", "ri" } },
     { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },