about summary refs log tree commit diff stats
path: root/src/miasm/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'src/miasm/arch/x86')
-rw-r--r--src/miasm/arch/x86/__init__.py1
-rw-r--r--src/miasm/arch/x86/arch.py4878
-rw-r--r--src/miasm/arch/x86/ctype.py137
-rw-r--r--src/miasm/arch/x86/disasm.py30
-rw-r--r--src/miasm/arch/x86/jit.py296
-rw-r--r--src/miasm/arch/x86/lifter_model_call.py80
-rw-r--r--src/miasm/arch/x86/regs.py454
-rw-r--r--src/miasm/arch/x86/sem.py6065
8 files changed, 11941 insertions, 0 deletions
diff --git a/src/miasm/arch/x86/__init__.py b/src/miasm/arch/x86/__init__.py
new file mode 100644
index 00000000..bbad893b
--- /dev/null
+++ b/src/miasm/arch/x86/__init__.py
@@ -0,0 +1 @@
+__all__ = ["arch", "disasm", "regs", "sem"]
diff --git a/src/miasm/arch/x86/arch.py b/src/miasm/arch/x86/arch.py
new file mode 100644
index 00000000..5464a779
--- /dev/null
+++ b/src/miasm/arch/x86/arch.py
@@ -0,0 +1,4878 @@
+#-*- coding:utf-8 -*-
+
+from __future__ import print_function
+from builtins import range
+import re
+
+from future.utils import viewitems
+
+from miasm.core import utils
+from miasm.expression.expression import *
+from pyparsing import *
+from miasm.core.cpu import *
+from collections import defaultdict
+import miasm.arch.x86.regs as regs_module
+from miasm.arch.x86.regs import *
+from miasm.core.asm_ast import AstNode, AstInt, AstId, AstMem, AstOp
+from miasm.ir.ir import color_expr_html
+from miasm.core.utils import BRACKET_O, BRACKET_C
+
+
# Module-level logger for the x86 architecture front-end; WARN by default
# so decoding noise stays quiet unless explicitly raised by the caller.
log = logging.getLogger("x86_arch")
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter("[%(levelname)-8s]: %(message)s"))
log.addHandler(console_handler)
log.setLevel(logging.WARN)
+
# Mnemonics whose branch is taken only when a condition holds: the Jcc
# family plus the JCXZ/JECXZ/JRCXZ "count register is zero" jumps.
conditional_branch = ["JO", "JNO", "JB", "JAE",
                      "JZ", "JNZ", "JBE", "JA",
                      "JS", "JNS", "JPE", "JNP",
                      #"L", "NL", "NG", "G"]
                      "JL", "JGE", "JLE", "JG",
                      "JCXZ", "JECXZ", "JRCXZ"]

# Mnemonics that always transfer control.
unconditional_branch = ['JMP', 'JMPF']

# Field-name constants used by the instruction encodings below: f_isad tags
# an addressing form, f_sNN/f_uNN signed/unsigned immediates of NN bits,
# f_imm a generic immediate.
f_isad = "AD"
f_s08 = "S08"
f_u08 = "U08"
f_s16 = "S16"
f_u16 = "U16"
f_s32 = "S32"
f_u32 = "U32"
f_s64 = "S64"
f_u64 = "U64"
f_imm = 'IMM'

# Immediate field name -> width in bits.
f_imm2size = {f_s08: 8, f_s16: 16, f_s32: 32, f_s64: 64,
              f_u08: 8, f_u16: 16, f_u32: 32, f_u64: 64}
+
+
# General-purpose register banks indexed by operand width (bits).
size2gpregs = {8: gpregs08, 16: gpregs16,
               32: gpregs32, 64: gpregs64}


# 64-bit mode: rewrite partial/legacy register names as slices of the full
# 64-bit registers (e.g. AL -> RAX[0:8], EAX -> RAX[0:32]) so that semantics
# operate on one canonical register set.
replace_regs64 = {
    AL: RAX[:8], CL: RCX[:8], DL: RDX[:8], BL: RBX[:8],
    AH: RAX[8:16], CH: RCX[8:16], DH: RDX[8:16], BH: RBX[8:16],
    SPL: RSP[0:8], BPL: RBP[0:8], SIL: RSI[0:8], DIL: RDI[0:8],
    R8B: R8[0:8], R9B: R9[0:8], R10B: R10[0:8], R11B: R11[0:8],
    R12B: R12[0:8], R13B: R13[0:8], R14B: R14[0:8], R15B: R15[0:8],

    AX: RAX[:16], CX: RCX[:16], DX: RDX[:16], BX: RBX[:16],
    SP: RSP[:16], BP: RBP[:16], SI: RSI[:16], DI: RDI[:16],
    R8W:  R8[:16], R9W:  R9[:16], R10W: R10[:16], R11W: R11[:16],
    R12W: R12[:16], R13W: R13[:16], R14W: R14[:16], R15W: R15[:16],

    EAX: RAX[:32], ECX: RCX[:32], EDX: RDX[:32], EBX: RBX[:32],
    ESP: RSP[:32], EBP: RBP[:32], ESI: RSI[:32], EDI: RDI[:32],
    R8D: R8[:32], R9D: R9[:32], R10D: R10[:32], R11D: R11[:32],
    R12D: R12[:32], R13D: R13[:32], R14D: R14[:32], R15D: R15[:32],

    IP: RIP[:16], EIP: RIP[:32],

    # x87 stack aliases: textual 'ST'/'ST(i)' names map to float_stX registers.
    ExprId("ST", 64): float_st0,
    ExprId("ST(0)", 64): float_st0,
    ExprId("ST(1)", 64): float_st1,
    ExprId("ST(2)", 64): float_st2,
    ExprId("ST(3)", 64): float_st3,
    ExprId("ST(4)", 64): float_st4,
    ExprId("ST(5)", 64): float_st5,
    ExprId("ST(6)", 64): float_st6,
    ExprId("ST(7)", 64): float_st7,

}
+
# 32-bit mode: rewrite 8/16-bit sub-registers as slices of the 32-bit ones
# (e.g. AL -> EAX[0:8], AX -> EAX[0:16]).
replace_regs32 = {
    AL: EAX[:8],   CL: ECX[:8],   DL: EDX[:8],   BL: EBX[:8],
    AH: EAX[8:16], CH: ECX[8:16], DH: EDX[8:16], BH: EBX[8:16],

    AX: EAX[:16], CX: ECX[:16], DX: EDX[:16], BX: EBX[:16],
    SP: ESP[:16], BP: EBP[:16], SI: ESI[:16], DI: EDI[:16],

    IP: EIP[:16],


    # x87 stack aliases: textual 'ST'/'ST(i)' names map to float_stX registers.
    ExprId("ST", 64): float_st0,
    ExprId("ST(0)", 64): float_st0,
    ExprId("ST(1)", 64): float_st1,
    ExprId("ST(2)", 64): float_st2,
    ExprId("ST(3)", 64): float_st3,
    ExprId("ST(4)", 64): float_st4,
    ExprId("ST(5)", 64): float_st5,
    ExprId("ST(6)", 64): float_st6,
    ExprId("ST(7)", 64): float_st7,

}
+
# 16-bit mode: rewrite 8-bit sub-registers as slices of the 16-bit ones.
# NOTE(review): the 16-bit entries map registers to full-width slices of
# themselves (AX -> AX[:16]) — presumably kept for uniformity with the
# other tables; confirm against callers of replace_regs.
replace_regs16 = {
    AL: AX[:8],   CL: CX[:8],   DL: DX[:8],   BL: BX[:8],
    AH: AX[8:16], CH: CX[8:16], DH: DX[8:16], BH: BX[8:16],

    AX: AX[:16],  CX: CX[:16],  DX: DX[:16],  BX: BX[:16],
    SP: SP[:16],  BP: BP[:16],  SI: SI[:16],  DI: DI[:16],


    # x87 stack aliases: textual 'ST'/'ST(i)' names map to float_stX registers.
    ExprId("ST", 64): float_st0,
    ExprId("ST(0)", 64): float_st0,
    ExprId("ST(1)", 64): float_st1,
    ExprId("ST(2)", 64): float_st2,
    ExprId("ST(3)", 64): float_st3,
    ExprId("ST(4)", 64): float_st4,
    ExprId("ST(5)", 64): float_st5,
    ExprId("ST(6)", 64): float_st6,
    ExprId("ST(7)", 64): float_st7,

}

# Sub-register canonicalization table selected by execution mode (bits).
replace_regs = {16: replace_regs16,
                32: replace_regs32,
                64: replace_regs64}
+
+
# Segment register <-> g2 prefix-group encoding used during (dis)assembly.
segm2enc = {CS: 1, SS: 2, DS: 3, ES: 4, FS: 5, GS: 6}
enc2segm = dict((value, key) for key, value in viewitems(segm2enc))

segm_info = reg_info_dct(enc2segm)



# Control registers CR0-CR7 indexed by their encoding value.
enc2crx = {
    0: cr0,
    1: cr1,
    2: cr2,
    3: cr3,
    4: cr4,
    5: cr5,
    6: cr6,
    7: cr7,
}

crx_info = reg_info_dct(enc2crx)


# Debug registers DR0-DR7 indexed by their encoding value.
enc2drx = {
    0: dr0,
    1: dr1,
    2: dr2,
    3: dr3,
    4: dr4,
    5: dr5,
    6: dr6,
    7: dr7,
}

drx_info = reg_info_dct(enc2drx)
+
+
+
# parser helper ###########
# Punctuation tokens: matched in the input but suppressed from parse results.
PLUS = Suppress("+")
MULT = Suppress("*")

COLON = Suppress(":")


LBRACK = Suppress("[")
RBRACK = Suppress("]")


# Parser accepting any general-purpose, MMX, XMM or bound register name.
gpreg = (
    gpregs08.parser |
    gpregs08_64.parser |
    gpregs16.parser |
    gpregs32.parser |
    gpregs64.parser |
    gpregs_xmm.parser |
    gpregs_mm.parser |
    gpregs_bnd.parser
)
+
+
def is_op_segm(expr):
    """Return True when *expr* is an ExprOp whose operator is 'segm'."""
    return expr.is_op('segm')

def is_mem_segm(expr):
    """Return True when *expr* is an ExprMem whose pointer is a 'segm' op."""
    if not expr.is_mem():
        return False
    return is_op_segm(expr.ptr)
+
+
def cb_deref_segmoff(tokens):
    """Parse action for 'segment:offset': wrap both parts in a 'segm' op."""
    assert len(tokens) == 2
    return AstOp('segm', tokens[0], tokens[1])


def cb_deref_base_expr(tokens):
    """Parse action for a bracketed address: unwrap the single AST node."""
    tokens = tokens[0]
    assert isinstance(tokens, AstNode)
    addr = tokens
    return addr


# '[expr]' memory address.
deref_mem_ad = (LBRACK + base_expr + RBRACK).setParseAction(cb_deref_base_expr)

# 'expr:expr' far pointer (segment:offset).
deref_ptr = (base_expr + COLON + base_expr).setParseAction(cb_deref_segmoff)
+
+
# 'PTR' and 'FAR' are structural keywords, dropped from parse results.
PTR = Suppress('PTR')

FAR = Suppress('FAR')


# Memory-size keywords, kept in the parse results so the callback can size
# the resulting memory access.
BYTE = Literal('BYTE')
WORD = Literal('WORD')
DWORD = Literal('DWORD')
QWORD = Literal('QWORD')
TBYTE = Literal('TBYTE')
XMMWORD = Literal('XMMWORD')

# Size keyword -> access width in bits, and its inverse for printing.
MEMPREFIX2SIZE = {'BYTE': 8, 'WORD': 16, 'DWORD': 32,
                  'QWORD': 64, 'TBYTE': 80, 'XMMWORD': 128}

SIZE2MEMPREFIX = dict((value, key) for key, value in viewitems(MEMPREFIX2SIZE))

def cb_deref_mem(tokens):
    """Parse action for 'SIZE PTR [segm:] [addr]': build a sized AstMem."""
    if len(tokens) == 2:
        s, ptr = tokens
        assert isinstance(ptr, AstNode)
        return AstMem(ptr, MEMPREFIX2SIZE[s])
    elif len(tokens) == 3:
        s, segm, ptr = tokens
        return AstMem(AstOp('segm', segm, ptr), MEMPREFIX2SIZE[s])
    raise ValueError('len(tokens) > 3')

mem_size = (BYTE | DWORD | QWORD | WORD | TBYTE | XMMWORD)
deref_mem = (mem_size + PTR + Optional((base_expr + COLON))+ deref_mem_ad).setParseAction(cb_deref_mem)


# Register-or-memory operand (ModRM r/m style): any register, or a sized
# memory dereference.
rmarg = (
    gpregs08.parser |
    gpregs08_64.parser |
    gpregs16.parser |
    gpregs32.parser |
    gpregs64.parser |
    gpregs_mm.parser |
    gpregs_xmm.parser |
    gpregs_bnd.parser
)

rmarg |= deref_mem


# 'FAR SIZE PTR [...]' memory operand.
mem_far = FAR + deref_mem


# Either the fixed CL register or an immediate (shift/rotate count forms).
cl_or_imm = r08_ecx.parser
cl_or_imm |= base_expr
+
+
class x86_arg(m_arg):
    """Base class for x86 operands: turns parsed AST nodes into expressions."""

    def asm_ast_to_expr(self, value, loc_db, size_hint=None, fixed_size=None):
        """Recursively convert an assembly AST node to a miasm expression.

        value      -- AstId/AstOp/AstInt/AstMem node produced by the parser
        loc_db     -- location database used to resolve or create symbols
        size_hint  -- preferred size in bits (defaults to the parent mode)
        fixed_size -- set accumulating sizes imposed by registers/memory
        Returns the expression, or None when the node cannot be translated.
        """
        if size_hint is None:
            size_hint = self.parent.mode
        if fixed_size is None:
            fixed_size = set()
        if isinstance(value, AstId):
            if value.name in all_regs_ids_byname:
                reg = all_regs_ids_byname[value.name]
                fixed_size.add(reg.size)
                return reg
            if isinstance(value.name, ExprId):
                fixed_size.add(value.name.size)
                return value.name
            # Size keywords and FAR are structural tokens, not operands.
            if value.name in MEMPREFIX2SIZE:
                return None
            if value.name in ["FAR"]:
                return None

            loc_key = loc_db.get_or_create_name_location(value.name)
            return ExprLoc(loc_key, size_hint)
        if isinstance(value, AstOp):
            # First pass to retrieve fixed_size
            if value.op == "segm":
                segm = self.asm_ast_to_expr(value.args[0], loc_db)
                ptr = self.asm_ast_to_expr(value.args[1], loc_db, None, fixed_size)
                return ExprOp('segm', segm, ptr)
            args = [self.asm_ast_to_expr(arg, loc_db, None, fixed_size) for arg in value.args]
            if len(fixed_size) == 0:
                # No fixed size
                pass
            elif len(fixed_size) == 1:
                # One fixed size, regen all
                size = list(fixed_size)[0]
                args = [self.asm_ast_to_expr(arg, loc_db, size, fixed_size) for arg in value.args]
            else:
                raise ValueError("Size conflict")
            if None in args:
                return None
            return ExprOp(value.op, *args)
        if isinstance(value, AstInt):
            # NOTE(review): widens the hint when the constant does not fit;
            # a value of exactly 1 << size_hint is NOT widened and would
            # truncate in ExprInt — confirm this boundary is intended.
            if 1 << size_hint < value.value:
                size_hint *= 2
            return ExprInt(value.value, size_hint)
        if isinstance(value, AstMem):
            fixed_size.add(value.size)
            # Pointer sizes are independent of the operand size: fresh set.
            ptr = self.asm_ast_to_expr(value.ptr, loc_db, None, set())
            if ptr is None:
                return None
            return ExprMem(ptr, value.size)
        return None
+
class r_al(reg_noarg, x86_arg):
    # Implicit fixed operand: 8-bit AL.
    reg_info = r08_eax
    parser = reg_info.parser


class r_ax(reg_noarg, x86_arg):
    # Implicit fixed operand: 16-bit AX.
    reg_info = r16_eax
    parser = reg_info.parser


class r_dx(reg_noarg, x86_arg):
    # Implicit fixed operand: 16-bit DX.
    reg_info = r16_edx
    parser = reg_info.parser


class r_eax(reg_noarg, x86_arg):
    # Implicit fixed operand: 32-bit EAX.
    reg_info = r32_eax
    parser = reg_info.parser


class r_rax(reg_noarg, x86_arg):
    # Implicit fixed operand: 64-bit RAX.
    reg_info = r64_eax
    parser = reg_info.parser


class r_cl(reg_noarg, x86_arg):
    # Implicit fixed operand: 8-bit CL.
    reg_info = r08_ecx
    parser = reg_info.parser
+
+
# A size-override prefix in 16/32-bit modes flips to the other width.
invmode = {16: 32, 32: 16}
+
+
def opmode_prefix(mode):
    """Operand size (bits) for a (size, opmode, admode) mode triple.

    In 16/32-bit modes the 0x66 prefix flips the width; in 64-bit mode it
    selects 16 bits, otherwise 32. Other sizes are unsupported.
    """
    size, opmode, _admode = mode
    if size == 64:
        return 16 if opmode else 32
    if size in (16, 32):
        return invmode[size] if opmode else size
    raise NotImplementedError('not fully functional')
+
+
def admode_prefix(mode):
    """Address size (bits) for a (size, opmode, admode) mode triple.

    In 16/32-bit modes the 0x67 prefix flips the width; 64-bit mode always
    addresses with 64 bits here. Other sizes are unsupported.
    """
    size, _opmode, admode = mode
    if size == 64:
        return 64
    if size in (16, 32):
        return invmode[size] if admode else size
    raise NotImplementedError('not fully functional')
+
+
def v_opmode_info(size, opmode, rex_w, stk):
    """Effective operand size in bits.

    16/32-bit modes: the opmode prefix flips the width. 64-bit mode applies,
    in priority order: REX.W forces 64, then the opmode prefix forces 16,
    then a stack-style encoding forces 64, otherwise 32.
    """
    if size == 64:
        if rex_w == 1:
            return 64
        if opmode == 1:
            return 16
        return 64 if stk else 32
    if size in (16, 32):
        return invmode[size] if opmode else size
+
+
def v_opmode(p):
    """Effective operand size (bits) of decoded instruction object *p*."""
    return v_opmode_info(p.mode, p.opmode, p.rex_w.value, hasattr(p, 'stk'))
+
+
def v_admode_info(size, admode):
    """Effective address size in bits.

    16/32-bit modes: the admode prefix flips the width. 64-bit mode: the
    prefix selects 32-bit addressing, otherwise 64.
    """
    if size == 64:
        return 32 if admode == 1 else 64
    if size in (16, 32):
        return invmode[size] if admode else size
+
+
def v_admode(p):
    """Effective address size (bits) of decoded instruction object *p*."""
    return v_admode_info(p.mode, p.admode)
+
+
def offsize(p):
    """Offset width in bits: 16 when the opmode prefix is set, else the mode width."""
    return 16 if p.opmode else p.mode
+
+
def get_prefix(s):
    """Split a leading whitespace-delimited token off *s*.

    Returns (token, rest) when *s* contains a token followed by whitespace,
    otherwise (None, s) unchanged.
    """
    match = re.search(r'(\S+)(\s+)', s)
    if match is None:
        return None, s
    token, ws = match.groups()
    return token, s[len(token) + len(ws):]
+
+
# String-operation mnemonics that honour the REP/REPE/REPNE prefixes.
repeat_mn = ["INS", "OUTS",
             "MOVSB", "MOVSW", "MOVSD", "MOVSQ",
             "SCASB", "SCASW", "SCASD", "SCASQ",
             "LODSB", "LODSW", "LODSD", "LODSQ",
             "STOSB", "STOSW", "STOSD", "STOSQ",
             "CMPSB", "CMPSW", "CMPSD", "CMPSQ",
             ]
+
+
class group(object):
    """Mutable holder for one prefix-group value (used for g1 and g2)."""

    def __init__(self):
        # None until a prefix assigns a value during (dis)assembly.
        self.value = None
+
class additional_info(object):
    """Side-band decode state carried alongside an instruction_x86."""

    def __init__(self):
        self.except_on_instr = False
        # Prefix groups: g1 (LOCK/REP family), g2 (segment override).
        self.g1 = group()
        self.g2 = group()
        self.vopmode = None
        # True when the encoding uses stack-style operand sizing.
        self.stk = False
        # Effective operand/address sizes resolved at decode time.
        self.v_opmode = None
        self.v_admode = None
        # Mandatory prefix bytes of the encoding (e.g. b"\x66"), if any.
        self.prefixed = b''
+
+
class instruction_x86(instruction):
    """Concrete x86 instruction.

    Provides control-flow predicates (dstflow/breakflow/splitflow), label
    resolution for relative destinations, and prefix-aware text/HTML
    rendering of instructions and operands.
    """
    __slots__ = []

    def __init__(self, *args, **kargs):
        super(instruction_x86, self).__init__(*args, **kargs)

    def v_opmode(self):
        """Effective operand size (bits) recorded at decode time."""
        return self.additional_info.v_opmode

    def v_admode(self):
        """Effective address size (bits) recorded at decode time."""
        return self.additional_info.v_admode

    def dstflow(self):
        """True when the first argument is a control-flow destination."""
        if self.name in conditional_branch + unconditional_branch:
            return True
        if self.name.startswith('LOOP'):
            return True
        return self.name in ['CALL']

    def dstflow2label(self, loc_db):
        """Replace a relative integer destination with a location label."""
        # REP-prefixed string instructions branch to themselves: no label.
        if self.additional_info.g1.value & 14 and self.name in repeat_mn:
            return
        expr = self.args[0]
        if not expr.is_int():
            return
        # Relative destination: rebase on the instruction offset, wrapped
        # to the expression width.
        addr = (int(expr) + int(self.offset)) & int(expr.mask)
        loc_key = loc_db.get_or_create_offset_location(addr)
        self.args[0] = ExprLoc(loc_key, expr.size)

    def breakflow(self):
        """True when the instruction ends a basic block."""
        if self.name in conditional_branch + unconditional_branch:
            return True
        if self.name.startswith('LOOP'):
            return True
        if self.name.startswith('RET'):
            return True
        if self.name.startswith('INT'):
            return True
        if self.name.startswith('SYS'):
            return True
        return self.name in ['CALL', 'HLT', 'IRET', 'IRETD', 'IRETQ', 'ICEBP', 'UD2']

    def splitflow(self):
        """True when execution may also fall through to the next instruction."""
        if self.name in conditional_branch:
            return True
        if self.name in unconditional_branch:
            return False
        if self.name.startswith('LOOP'):
            return True
        if self.name.startswith('INT'):
            return True
        if self.name.startswith('SYS'):
            return True
        return self.name in ['CALL']

    def setdstflow(self, a):
        """No-op: x86 destinations are kept in the argument list."""
        return

    def is_subcall(self):
        """True for call-style transfers (CALL)."""
        return self.name in ['CALL']

    def getdstflow(self, loc_db):
        """Return the possible flow destinations as a list of expressions."""
        # REP-prefixed string instructions loop back onto themselves.
        if self.additional_info.g1.value & 14 and self.name in repeat_mn:
            addr = int(self.offset)
            loc_key = loc_db.get_or_create_offset_location(addr)
            return [ExprLoc(loc_key, self.v_opmode())]
        return [self.args[0]]

    def get_symbol_size(self, symbol, loc_db):
        """Symbols are resolved at the current mode width."""
        return self.mode

    def fixDstOffset(self):
        """Rewrite an absolute destination as relative to this instruction.

        Raises ValueError when the instruction offset is still unresolved.
        """
        expr = self.args[0]
        if self.offset is None:
            # Fixed: the original formatted an undefined name 'l' here,
            # masking the intended ValueError with a NameError.
            raise ValueError('symbol not resolved %s' % self.name)
        if not isinstance(expr, ExprInt):
            log.warning('dynamic dst %r', expr)
            return
        self.args[0] = ExprInt(int(expr) - self.offset, self.mode)

    def get_info(self, c):
        """Copy decode-time prefix/size information from candidate *c*."""
        self.additional_info.g1.value = c.g1.value
        self.additional_info.g2.value = c.g2.value
        self.additional_info.stk = hasattr(c, 'stk')
        self.additional_info.v_opmode = c.v_opmode()
        self.additional_info.v_admode = c.v_admode()
        self.additional_info.prefix = c.prefix
        self.additional_info.prefixed = getattr(c, "prefixed", b"")

    def __str__(self):
        return self.to_string()

    def to_string(self, loc_db=None):
        """Render the instruction, prepending explicit LOCK/REP* prefixes.

        A REP*/REPE prefix is not printed when it is already a mandatory
        prefix of the encoding itself.
        """
        o = super(instruction_x86, self).to_string(loc_db)
        if self.additional_info.g1.value & 1:
            o = "LOCK %s" % o
        if self.additional_info.g1.value & 2:
            if getattr(self.additional_info.prefixed, 'default', b"") != b"\xF2":
                o = "REPNE %s" % o
        if self.additional_info.g1.value & 8:
            if getattr(self.additional_info.prefixed, 'default', b"") != b"\xF3":
                o = "REP %s" % o
        elif self.additional_info.g1.value & 4:
            if getattr(self.additional_info.prefixed, 'default', b"") != b"\xF3":
                o = "REPE %s" % o
        return o

    def to_html(self, loc_db=None):
        """HTML rendering; same prefix logic as to_string, with coloring."""
        o = super(instruction_x86, self).to_html(loc_db)
        if self.additional_info.g1.value & 1:
            text = utils.set_html_text_color("LOCK", utils.COLOR_MNEMO)
            o = "%s %s" % (text, o)
        if self.additional_info.g1.value & 2:
            if getattr(self.additional_info.prefixed, 'default', b"") != b"\xF2":
                text = utils.set_html_text_color("REPNE", utils.COLOR_MNEMO)
                o = "%s %s" % (text, o)
        if self.additional_info.g1.value & 8:
            if getattr(self.additional_info.prefixed, 'default', b"") != b"\xF3":
                text = utils.set_html_text_color("REP", utils.COLOR_MNEMO)
                o = "%s %s" % (text, o)
        elif self.additional_info.g1.value & 4:
            if getattr(self.additional_info.prefixed, 'default', b"") != b"\xF3":
                text = utils.set_html_text_color("REPE", utils.COLOR_MNEMO)
                o = "%s %s" % (text, o)
        return o

    def get_args_expr(self):
        """Arguments with sub-registers rewritten as mode-dependent slices."""
        return [a.replace_expr(replace_regs[self.mode]) for a in self.args]

    @staticmethod
    def arg2str(expr, index=None, loc_db=None):
        """Render one operand expression as assembly text."""
        if expr.is_id() or expr.is_int():
            o = str(expr)
        elif expr.is_loc():
            if loc_db is not None:
                o = loc_db.pretty_str(expr.loc_key)
            else:
                o = str(expr)
        elif ((isinstance(expr, ExprOp) and expr.op == 'far' and
               isinstance(expr.args[0], ExprMem)) or
              isinstance(expr, ExprMem)):
            # Memory operand, possibly FAR and/or segment-prefixed.
            if isinstance(expr, ExprOp):
                prefix, expr = "FAR ", expr.args[0]
            else:
                prefix = ""
            sz = SIZE2MEMPREFIX[expr.size]
            segm = ""
            if is_mem_segm(expr):
                segm = "%s:" % expr.ptr.args[0]
                expr = expr.ptr.args[1]
            else:
                expr = expr.ptr
            if isinstance(expr, ExprOp):
                # Drop the parentheses miasm puts around operator expressions.
                s = str(expr).replace('(', '').replace(')', '')
            else:
                s = str(expr)
            o = prefix + sz + ' PTR ' + str(segm) + '[%s]' % s
        elif isinstance(expr, ExprOp) and expr.op == 'segm':
            o = "%s:%s" % (expr.args[0], expr.args[1])
        else:
            raise ValueError('check this %r' % expr)
        return "%s" % o

    @staticmethod
    def arg2html(expr, index=None, loc_db=None):
        """Render one operand expression as colored HTML."""
        if expr.is_id() or expr.is_int() or expr.is_loc():
            o = color_expr_html(expr, loc_db)
        elif ((isinstance(expr, ExprOp) and expr.op == 'far' and
               isinstance(expr.args[0], ExprMem)) or
              isinstance(expr, ExprMem)):
            # Memory operand, possibly FAR and/or segment-prefixed.
            if isinstance(expr, ExprOp):
                prefix, expr = "FAR ", expr.args[0]
            else:
                prefix = ""
            sz = SIZE2MEMPREFIX[expr.size]
            sz = '<font color="%s">%s</font>' % (utils.COLOR_MEM, sz)
            segm = ""
            if is_mem_segm(expr):
                segm = "%s:" % expr.ptr.args[0]
                expr = expr.ptr.args[1]
            else:
                expr = expr.ptr
            # Both ExprOp and other pointers render the same way here.
            s = color_expr_html(expr, loc_db)
            o = prefix + sz + ' PTR ' + str(segm) + BRACKET_O + str(s) + BRACKET_C
        elif isinstance(expr, ExprOp) and expr.op == 'segm':
            o = "%s:%s" % (
                color_expr_html(expr.args[0], loc_db),
                color_expr_html(expr.args[1], loc_db)
            )
        else:
            raise ValueError('check this %r' % expr)
        return "%s" % o
+
+
+
+class mn_x86(cls_mn):
+    name = "x86"
+    prefix_op_size = False
+    prefix_ad_size = False
+    regs = regs_module
+    all_mn = []
+    all_mn_mode = defaultdict(list)
+    all_mn_name = defaultdict(list)
+    all_mn_inst = defaultdict(list)
+    bintree = {}
+    num = 0
+    delayslot = 0
+    pc = {16: IP, 32: EIP, 64: RIP}
+    sp = {16: SP, 32: ESP, 64: RSP}
+    instruction = instruction_x86
+    max_instruction_len = 15
+
+    @classmethod
+    def getpc(cls, attrib):
+        return cls.pc[attrib]
+
+    @classmethod
+    def getsp(cls, attrib):
+        return cls.sp[attrib]
+
+    def v_opmode(self):
+        if hasattr(self, 'stk'):
+            stk = 1
+        else:
+            stk = 0
+        return v_opmode_info(self.mode, self.opmode, self.rex_w.value, stk)
+
+    def v_admode(self):
+        size, opmode, admode = self.mode, self.opmode, self.admode
+        if size in [16, 32]:
+            if admode:
+                return invmode[size]
+            else:
+                return size
+        elif size == 64:
+            if admode == 1:
+                return 32
+            return 64
+
+    def additional_info(self):
+        info = additional_info()
+        info.g1.value = self.g1.value
+        info.g2.value = self.g2.value
+        info.stk = hasattr(self, 'stk')
+        info.v_opmode = self.v_opmode()
+        info.prefixed = b""
+        if hasattr(self, 'prefixed'):
+            info.prefixed = self.prefixed.default
+        return info
+
+    @classmethod
+    def check_mnemo(cls, fields):
+        pass
+
+    @classmethod
+    def getmn(cls, name):
+        return name.upper()
+
+    @classmethod
+    def mod_fields(cls, fields):
+        prefix = [d_g1, d_g2, d_rex_p, d_rex_w, d_rex_r, d_rex_x, d_rex_b, d_vex, d_vex_l, d_vex_p, d_vex_v, d_vex_m]
+        return prefix + fields
+
+    @classmethod
+    def gen_modes(cls, subcls, name, bases, dct, fields):
+        dct['mode'] = None
+        return [(subcls, name, bases, dct, fields)]
+
+    @classmethod
+    def fromstring(cls, text, loc_db, mode):
+        pref = 0
+        prefix, new_s = get_prefix(text)
+        if prefix == "LOCK":
+            pref |= 1
+            text = new_s
+        elif prefix == "REPNE" or prefix == "REPNZ":
+            pref |= 2
+            text = new_s
+        elif prefix == "REPE" or prefix == "REPZ":
+            pref |= 4
+            text = new_s
+        elif prefix == "REP":
+            pref |= 8
+            text = new_s
+        c = super(mn_x86, cls).fromstring(text, loc_db, mode)
+        c.additional_info.g1.value = pref
+        return c
+
+    @classmethod
+    def pre_dis(cls, v, mode, offset):
+        offset_o = offset
+        pre_dis_info = {'opmode': 0,
+                        'admode': 0,
+                        'g1': 0,
+                        'g2': 0,
+                        'rex_p': 0,
+                        'rex_w': 0,
+                        'rex_r': 0,
+                        'rex_x': 0,
+                        'rex_b': 0,
+                        'vex_l': 0,
+                        'vex_p': 0,
+                        'vex_v': 0,
+                        'vex_m': 0,
+                        'vex' : 0,
+                        'prefix': b"",
+                        'prefixed': b"",
+                        }
+        while True:
+            c = v.getbytes(offset)
+            if c == b'\x66':
+                pre_dis_info['opmode'] = 1
+            elif c == b'\x67':
+                pre_dis_info['admode'] = 1
+            elif c == b'\xf0':
+                pre_dis_info['g1'] = 1
+            elif c == b'\xf2':
+                pre_dis_info['g1'] = 2
+            elif c == b'\xf3':
+                pre_dis_info['g1'] = 12
+
+            elif c == b'\x2e':
+                pre_dis_info['g2'] = 1
+            elif c == b'\x36':
+                pre_dis_info['g2'] = 2
+            elif c == b'\x3e':
+                pre_dis_info['g2'] = 3
+            elif c == b'\x26':
+                pre_dis_info['g2'] = 4
+            elif c == b'\x64':
+                pre_dis_info['g2'] = 5
+            elif c == b'\x65':
+                pre_dis_info['g2'] = 6
+
+            else:
+                break
+            pre_dis_info['prefix'] += c
+            offset += 1
+        vex3_prefix = b'\xc4'
+        vex2_prefix = b'\xc5'
+        rex_prefixes = b'@ABCDEFGHIJKLMNO'
+        if mode == 64 and c in rex_prefixes:
+            while c in rex_prefixes:
+                # multiple REX prefixes case - use last REX prefix
+                x = ord(c)
+                offset += 1
+                c = v.getbytes(offset)
+            pre_dis_info['rex_p'] = 1
+            pre_dis_info['rex_w'] = (x >> 3) & 1
+            pre_dis_info['rex_r'] = (x >> 2) & 1
+            pre_dis_info['rex_x'] = (x >> 1) & 1
+            pre_dis_info['rex_b'] = (x >> 0) & 1
+        elif mode == 64 and c == vex3_prefix:
+            offset += 1
+            c = ord(v.getbytes(offset))
+            pre_dis_info['vex'] = 1
+            pre_dis_info['rex_r'] = ((c >> 7) ^ 1) & 1
+            pre_dis_info['rex_x'] = ((c >> 6) ^ 1) & 1
+            pre_dis_info['rex_b'] = ((c >> 5) ^ 1) & 1
+            pre_dis_info['vex_m'] = (c & int('0b11111', 2))
+
+            offset += 1
+            c = ord(v.getbytes(offset))
+            pre_dis_info['rex_w'] = (c >> 7) & 1
+            pre_dis_info['vex_v'] = ((c >> 3) ^ 15) & 15
+            pre_dis_info['vex_l'] = (c >> 2) & 1
+            pre_dis_info['vex_p'] = c & int('0b11', 2)
+            offset += 1
+
+            if pre_dis_info['vex_p'] == 1:
+                pre_dis_info['opmode'] = 1
+            elif pre_dis_info['vex_p'] == 3:
+                pre_dis_info['g1'] = 2
+            elif pre_dis_info['vex_p'] == 2:
+                pre_dis_info['g1'] = 12
+
+        elif mode == 64 and c == vex2_prefix and v.getlen() > 2:
+            offset += 1
+            c = ord(v.getbytes(offset))
+            pre_dis_info['vex'] = 1
+            pre_dis_info['rex_r'] = ((c >> 7) ^ 1) & 1
+            pre_dis_info['rex_x'] = 0
+            pre_dis_info['rex_b'] = 0
+            pre_dis_info['rex_w'] = 0
+            pre_dis_info['vex_l'] = (c >> 2) & 1
+            pre_dis_info['vex_p'] = c & int('0b11', 2)
+            offset += 1
+
+            if pre_dis_info['vex_p'] == 1:
+                pre_dis_info['opmode'] = 1
+            elif pre_dis_info['vex_p'] == 3:
+                pre_dis_info['g1'] = 2
+            elif pre_dis_info['vex_p'] == 2:
+                pre_dis_info['g1'] = 12
+
+        elif pre_dis_info.get('g1', None) == 12 and c in [b'\xa6', b'\xa7', b'\xae', b'\xaf']:
+            pre_dis_info['g1'] = 4
+        return pre_dis_info, v, mode, offset, offset - offset_o
+
+    @classmethod
+    def get_cls_instance(cls, cc, mode, infos=None):
+        for opmode in [0, 1]:
+            for admode in [0, 1]:
+                c = cc()
+                c.init_class()
+
+                c.reset_class()
+                c.add_pre_dis_info()
+                c.dup_info(infos)
+                c.mode = mode
+                c.opmode = opmode
+                c.admode = admode
+
+                if not hasattr(c, 'stk') and hasattr(c, "fopmode") and c.fopmode.mode == 64:
+                    c.rex_w.value = 1
+                yield c
+
+    def post_dis(self):
+        if self.g2.value:
+            for a in self.args:
+                if not isinstance(a.expr, ExprMem):
+                    continue
+                m = a.expr
+                a.expr = ExprMem(
+                    ExprOp('segm', enc2segm[self.g2.value], m.ptr), m.size)
+        return self
+
+    def dup_info(self, infos):
+        if infos is not None:
+            self.g1.value = infos.g1.value
+            self.g2.value = infos.g2.value
+
+    def reset_class(self):
+        super(mn_x86, self).reset_class()
+        if hasattr(self, "opmode"):
+            del(self.opmode)
+        if hasattr(self, "admode"):
+            del(self.admode)
+
+    def add_pre_dis_info(self, pre_dis_info=None):
+        if pre_dis_info is None:
+            return True
+        if hasattr(self, "prefixed") and self.prefixed.default == b"\x66":
+            pre_dis_info['opmode'] = 0
+        self.opmode = pre_dis_info['opmode']
+        self.admode = pre_dis_info['admode']
+
+        if hasattr(self, 'no_xmm_pref') and\
+                pre_dis_info['prefix'] and\
+                pre_dis_info['prefix'][-1] in b'\x66\xf2\xf3':
+            return False
+        if (hasattr(self, "prefixed") and
+            not pre_dis_info['prefix'].endswith(self.prefixed.default)):
+            return False
+        if (self.rex_w.value is not None and
+            self.rex_w.value != pre_dis_info['rex_w']):
+            return False
+        else:
+            self.rex_w.value = pre_dis_info['rex_w']
+        self.rex_r.value = pre_dis_info['rex_r']
+        self.rex_b.value = pre_dis_info['rex_b']
+        self.rex_x.value = pre_dis_info['rex_x']
+        self.rex_p.value = pre_dis_info['rex_p']
+
+        self.vex.value = pre_dis_info['vex']
+        self.vex_l.value = pre_dis_info['vex_l']
+        self.vex_p.value = pre_dis_info['vex_p']
+        self.vex_v.value = pre_dis_info['vex_v']
+        self.vex_m.value = pre_dis_info['vex_m']
+
+        if hasattr(self, 'no_rex') and\
+           (self.rex_r.value or self.rex_b.value or
+            self.rex_x.value or self.rex_p.value):
+            return False
+
+        if self.vex.value == 0 and (hasattr(self, 'pref_0f')
+                                    or hasattr(self, 'pref_0f38')
+                                    or hasattr(self, 'pref_0f3a')):
+            return False
+
+        if hasattr(self, 'no_rep') and b'\xf3' in pre_dis_info['prefix']:
+            return False
+
+        if self.vex_m.value == 1 and not hasattr(self, 'pref_0f'):
+            return False
+        if self.vex_m.value == 2 and not hasattr(self, 'pref_0f38'):
+            return False
+        if self.vex_m.value == 3 and not hasattr(self, 'pref_0f3a'):
+            return False
+
+        self.g1.value = pre_dis_info['g1']
+        self.g2.value = pre_dis_info['g2']
+        self.prefix = pre_dis_info['prefix']
+        return True
+
+    def post_asm(self, v):
+        return v
+
+
+    def gen_prefix(self):
+        v = b""
+        rex = 0x40
+        if self.g1.value is None:
+            self.g1.value = 0
+        if self.g2.value is None:
+            self.g2.value = 0
+
+        if self.rex_w.value:
+            rex |= 0x8
+        if self.rex_r.value:
+            rex |= 0x4
+        if self.rex_x.value:
+            rex |= 0x2
+        if self.rex_b.value:
+            rex |= 0x1
+        if (rex != 0x40 and not self.vex.value) or self.rex_p.value == 1:
+            v = utils.int_to_byte(rex) + v
+            if hasattr(self, 'no_rex'):
+                return None
+
+        vex_byte1 = 0xc4
+        vex_byte2 = 0x00
+        vex_byte3 = 0x00
+
+        m_prefix = [field.fname for field in self.fields_order if 'pref_0f' in field.fname]
+        if m_prefix:
+            if m_prefix[0] == 'pref_0f':
+                vex_byte2 |= 0x01
+            elif m_prefix[0] == 'pref_0f38':
+                vex_byte2 |= 0x02
+            elif m_prefix[0] == 'pref_0f3a':
+                vex_byte2 |= 0x03
+
+        # TODO: L and p 
+        if m_prefix and m_prefix[0] == 'pref_0f' and not self.rex_w.value and not self.rex_b.value and ((hasattr(self, 'mod') and self.mod.value == 3) or not self.rex_x.value): # VEX2
+            print("test")
+            vex_version = 2
+            vex_byte1 = 0x00
+            vex_byte2 = 0xc5
+
+            if not hasattr(self, 'reg') or not self.rex_r.value:
+                vex_byte3 |= 0x80
+
+        else:
+            vex_version = 3
+            if not hasattr(self, 'reg') or not self.rex_r.value:
+                vex_byte2 |= 0x80
+            if (hasattr(self, 'mod') and self.mod.value == 3) or not self.rex_x.value:
+                vex_byte2 |= 0x40
+            if not self.rex_b.value:
+                vex_byte2 |= 0x20
+
+            if self.rex_w.value:
+                vex_byte3 |= 0x80
+
+        if self.vex.value == 1:
+            vex_byte3 |= ((15 - self.vex_v.value) << 3)
+            vex = (vex_byte1 << 16) | (vex_byte2 << 8) | vex_byte3
+            v = vex.to_bytes(vex_version, 'big') + v
+
+        if hasattr(self, 'prefixed'):
+            v = self.prefixed.default + v
+
+        if self.g1.value & 1:
+            v = b"\xf0" + v
+        if self.g1.value & 2:
+            if hasattr(self, 'no_xmm_pref'):
+                return None
+            v = b"\xf2" + v
+        if self.g1.value & 12:
+            if hasattr(self, 'no_xmm_pref'):
+                return None
+            v = b"\xf3" + v
+        if self.g2.value:
+            v = {
+                1: b'\x2e',
+                2: b'\x36',
+                3: b'\x3e',
+                4: b'\x26',
+                5: b'\x64',
+                6: b'\x65'
+            }[self.g2.value] + v
+        # mode prefix
+        if hasattr(self, "admode") and self.admode:
+            v = b"\x67" + v
+
+        if hasattr(self, "opmode") and self.opmode:
+            if hasattr(self, 'no_xmm_pref'):
+                return None
+            v = b"\x66" + v
+        return v
+
+    def encodefields(self, decoded):
+        v = super(mn_x86, self).encodefields(decoded)
+        prefix = self.gen_prefix()
+        if prefix is None:
+            return None
+        return prefix + v
+
    def getnextflow(self, loc_db):
        # Deliberately unimplemented for x86.
        raise NotImplementedError('not fully functional')
+
+    def ir_pre_instruction(self):
+        return [ExprAssign(mRIP[self.mode],
+            ExprInt(self.offset + self.l, mRIP[self.mode].size))]
+
+    @classmethod
+    def filter_asm_candidates(cls, instr, candidates):
+
+        cand_same_mode = []
+        cand_diff_mode = []
+        out = []
+        for c, v in candidates:
+            if (hasattr(c, 'no_xmm_pref') and
+                (c.g1.value & 2 or c.g1.value & 4 or c.g1.value & 8 or c.opmode)):
+                continue
+            if hasattr(c, "fopmode") and v_opmode(c) != c.fopmode.mode:
+                continue
+            if hasattr(c, "fadmode") and v_admode(c) != c.fadmode.mode:
+                continue
+            # relative dstflow must not have opmode set
+            # (assign IP instead of EIP for instance)
+            if (instr.dstflow() and
+                instr.name not in ["JCXZ", "JECXZ", "JRCXZ"] and
+                len(instr.args) == 1 and
+                    isinstance(instr.args[0], ExprInt) and c.opmode):
+                continue
+
+            out.append((c, v))
+        candidates = out
+        for c, v in candidates:
+            if v_opmode(c) == instr.mode:
+                cand_same_mode += v
+        for c, v in candidates:
+            if v_opmode(c) != instr.mode:
+                cand_diff_mode += v
+        cand_same_mode.sort(key=len)
+        cand_diff_mode.sort(key=len)
+        return cand_same_mode + cand_diff_mode
+
+
class bs_modname_size(bs_divert):
    """Expand a mnemonic template into per-operand-size named variants,
    pinning the rex_w field for each 64-bit variant."""
    prio = 1

    def divert(self, i, candidates):
        out = []
        for candidate in candidates:
            cls, name, bases, dct, fields = candidate
            fopmode = opmode_prefix(
                (dct['mode'], dct['opmode'], dct['admode']))
            mode = dct['mode']
            size, opmode, admode = dct['mode'], dct['opmode'], dct['admode']
            # no mode64 exinstance in name means no 64bit version of mnemo
            if mode == 64:
                if mode in self.args['name']:
                    # Variant with rex_w forced to 1 (64-bit operand size).
                    nfields = fields[:]
                    f, i = getfieldindexby_name(nfields, 'rex_w')
                    f = bs("1", l=0, cls=(bs_fbit,), fname="rex_w")
                    osize = v_opmode_info(size, opmode, 1, 0)
                    nfields[i] = f
                    nfields = nfields[:-1]
                    ndct = dict(dct)
                    if osize in self.args['name']:
                        ndct['name'] = self.args['name'][osize]
                        out.append((cls, ndct['name'], bases, ndct, nfields))

                    # Variant with rex_w forced to 0.
                    nfields = fields[:]
                    nfields = nfields[:-1]
                    f, i = getfieldindexby_name(nfields, 'rex_w')
                    f = bs("0", l=0, cls=(bs_fbit,), fname="rex_w")
                    osize = v_opmode_info(size, opmode, 0, 0)
                    nfields[i] = f
                    ndct = dict(dct)
                    if osize in self.args['name']:
                        ndct['name'] = self.args['name'][osize]
                        out.append((cls, ndct['name'], bases, ndct, nfields))
            else:
                # 16/32-bit modes: keep the name matching the effective opmode.
                l = opmode_prefix((dct['mode'], dct['opmode'], dct['admode']))
                osize = v_opmode_info(size, opmode, None, 0)
                nfields = fields[:-1]
                ndct = dict(dct)
                if osize in self.args['name']:
                    ndct['name'] = self.args['name'][osize]
                    out.append((cls, ndct['name'], bases, ndct, nfields))
        return out
+
+
class bs_modname_jecx(bs_divert):
    """Rename the JxCXZ mnemonic according to mode and address-size prefix."""
    prio = 1

    def divert(self, i, candidates):
        # (mode, address-size prefix present) -> mnemonic
        names = {
            (64, True): "JECXZ", (64, False): "JRCXZ",
            (32, True): "JCXZ",  (32, False): "JECXZ",
            (16, True): "JECXZ", (16, False): "JCXZ",
        }
        out = []
        for cls, name, bases, dct, fields in candidates:
            key = (dct['mode'], bool(dct['admode']))
            if key not in names:
                raise ValueError('unhandled mode')
            ndct = dict(dct)
            ndct['name'] = names[key]
            out.append((cls, ndct['name'], bases, ndct, fields[:-1]))
        return out
+
+
class bs_modname_mode(bs_divert):
    """Pick the mnemonic variant matching the effective operand size."""
    prio = 1

    def divert(self, i, candidates):
        out = []
        for cls, name, bases, dct, fields in candidates:
            mode = dct['mode']
            osize = v_opmode_info(mode, dct['opmode'], None, 0)
            ndct = dict(dct)
            # 64-bit mode and 32-bit effective opmode keep the mode-keyed
            # name; everything else falls back to the 16-bit variant.
            if mode == 64 or osize == 32:
                ndct['name'] = self.args['name'][mode]
            else:
                ndct['name'] = self.args['name'][16]
            out.append((cls, ndct['name'], bases, ndct, fields[:-1]))
        return out
+
+
class x86_imm(imm_noarg):
    """Immediate operand; raw field bytes go through swap_uint both ways."""
    parser = base_expr

    def decodeval(self, v):
        return swap_uint(self.l, v)

    def encodeval(self, v):
        return swap_uint(self.l, v)
+
+
class x86_imm_fix_08(imm_noarg):
    """Immediate operand hard-wired to a single value (self.ival)."""
    parser = base_expr
    intsize = 8
    intmask = (1 << intsize) - 1

    def decodeval(self, v):
        # The encoded bits are ignored: the operand is always `ival`.
        return self.ival

    def encode(self):
        # Only the fixed value assembles, and it occupies no encoding bits.
        if self.expr2int(self.expr) != self.ival:
            return False
        self.value = 0
        return True
+
+
class x86_08(x86_imm):
    # 8-bit immediate.
    intsize = 8
    intmask = (1 << intsize) - 1
+
+
class x86_16(x86_imm):
    # 16-bit immediate.
    intsize = 16
    intmask = (1 << intsize) - 1
+
+
class x86_32(x86_imm):
    # 32-bit immediate.
    intsize = 32
    intmask = (1 << intsize) - 1
+
+
class x86_64(x86_imm):
    # 64-bit immediate.
    intsize = 64
    intmask = (1 << intsize) - 1
+
+
class x86_08_ne(x86_imm):
    """8-bit displacement: decoded sign-extended to admode, never encoded
    here (the modrm encoder emits the bytes)."""
    intsize = 8
    intmask = (1 << intsize) - 1

    def encode(self):
        return True

    def decode(self, v):
        admode = self.parent.v_admode()
        raw = swap_uint(self.l, v)
        self.expr = ExprInt(sign_ext(raw, self.intsize, admode), admode)
        return True
+
+
class x86_16_ne(x86_08_ne):
    # 16-bit variant of the non-encoded displacement.
    intsize = 16
    intmask = (1 << intsize) - 1
+
+
class x86_32_ne(x86_08_ne):
    # 32-bit variant of the non-encoded displacement.
    intsize = 32
    intmask = (1 << intsize) - 1
+
+
class x86_64_ne(x86_08_ne):
    # 64-bit variant of the non-encoded displacement.
    intsize = 64
    intmask = (1 << intsize) - 1
+
+
class x86_s08to16(x86_imm):
    """8-bit immediate sign-extended to a 16-bit operand."""
    in_size = 8
    out_size = 16

    def myexpr(self, x):
        return ExprInt(x, 16)

    def int2expr(self, v):
        return self.myexpr(v)

    def expr2int(self, e):
        # Accept only plain integers that fit the encoded field width.
        if not isinstance(e, ExprInt):
            return None
        v = int(e)
        if v & ~((1 << self.l) - 1) != 0:
            return None
        return v

    def decode(self, v):
        v = v & self.lmask
        v = self.decodeval(v)
        if self.parent.v_opmode() == 64:
            # 64-bit operand mode: extend all the way to 64 bits.
            self.expr = ExprInt(sign_ext(v, self.in_size, 64), 64)
        else:
            if (1 << (self.l - 1)) & v:
                # Negative value: sign-extend to the output size.
                v = sign_ext(v, self.l, self.out_size)
            self.expr = self.myexpr(v)
        return True

    def encode(self):
        # The value must round-trip through sign extension of its low
        # in_size bits; in 64-bit mode we may retry with REX.W set.
        if not isinstance(self.expr, ExprInt):
            return False
        v = int(self.expr)
        opmode = self.parent.v_opmode()

        out_size = self.out_size
        if opmode != self.out_size:
            if opmode == 32 and self.out_size == 64:
                out_size = opmode
                if v == sign_ext(
                    int(v & ((1 << self.in_size) - 1)), self.in_size, out_size):
                    pass
                else:
                    # test with rex_w
                    self.parent.rex_w.value = 1
                    opmode = self.parent.v_opmode()
                    out_size = opmode
                    if (v != sign_ext(
                        int(v & ((1 << self.in_size) - 1)),
                        self.in_size, out_size)):
                        return False
        if v != sign_ext(
            int(v & ((1 << self.in_size) - 1)), self.in_size, out_size):
            return False
        v = self.encodeval(v)
        self.value = (v & 0xffffffff) & self.lmask
        return True

    def decodeval(self, v):
        return swap_uint(self.l, v)

    def encodeval(self, v):
        # Signed variant: keep the two's-complement byte pattern.
        return swap_sint(self.l, v)
+
+
class x86_s08to32(x86_s08to16):
    """8-bit immediate sign-extended to 32 bits (64 when REX.W is set)."""
    in_size = 8
    out_size = 32

    def myexpr(self, x):
        return ExprInt(x, 32)

    def decode(self, v):
        raw = self.decodeval(v & self.lmask)
        size = 64 if self.parent.rex_w.value == 1 else 32
        self.expr = ExprInt(sign_ext(raw, self.in_size, size), size)
        return True
+
+
class x86_s08to64(x86_s08to32):
    # 8-bit immediate sign-extended to 64 bits.
    in_size = 8
    out_size = 64

    def myexpr(self, x):
        return ExprInt(x, 64)
+
+
class x86_s32to64(x86_s08to32):
    # 32-bit immediate sign-extended to 64 bits.
    in_size = 32
    out_size = 64

    def myexpr(self, x):
        return ExprInt(x, 64)
+
+
class bs_eax(x86_arg):
    """Implicit accumulator-family operand (register index 0); the actual
    register (8-bit vs opmode-sized) follows the w8 field."""
    reg_info = r_eax_all
    rindex = 0
    parser = reg_info.parser

    def decode(self, v):
        # w8 == 0 selects the 8-bit register; otherwise the opmode-sized one.
        p = self.parent
        expr = None
        if hasattr(p, 'w8') and p.w8.value == 0:
            expr = regs08_expr[self.rindex]
        else:
            expr = size2gpregs[p.v_opmode()].expr[self.rindex]
        self.expr = expr
        return True

    def encode(self):
        # The operand occupies no bits: just check the register matches,
        # fixing up w8/rex_w so the rest of the encoding agrees.
        self.value = 0
        p = self.parent
        expr = self.expr
        osize = p.v_opmode()
        if hasattr(p, 'w8'):
            if p.w8.value is None:
                # XXX TODO: priority in w8 erase?
                if expr.size == 8:
                    p.w8.value = 0
                else:
                    p.w8.value = 1
        if hasattr(p, 'w8') and p.w8.value == 0:
            return expr == regs08_expr[self.rindex]
        elif p.mode in [16, 32]:
            return expr == size2gpregs[osize].expr[self.rindex]
        elif p.mode == 64:
            # 64-bit register form needs REX.W; the opmode form does not.
            if expr == size2gpregs[64].expr[self.rindex]:
                p.rex_w.value = 1
                return True
            elif expr == size2gpregs[osize].expr[self.rindex]:
                return True
            return False
        return False
+
class bs_seg(x86_arg):
    """Implicit fixed-register operand: always reg_info.expr[0], no bits."""
    reg_info = r_eax_all
    rindex = 0
    parser = reg_info.parser

    def decode(self, v):
        self.expr = self.reg_info.expr[0]
        return True

    def encode(self):
        self.value = 0
        return self.expr == self.reg_info.expr[0]
+
+
class bs_edx(bs_eax):
    # Implicit DX-family operand (register index 2).
    reg_info = r_edx_all
    rindex = 2
    parser = reg_info.parser
+
+
class bs_st(bs_eax):
    # Implicit x87 ST operand.
    reg_info = r_st_all
    rindex = 0
    parser = reg_info.parser
+
+
class bs_cs(bs_seg):
    # Implicit CS segment operand.
    reg_info = r_cs_all
    rindex = 0
    parser = reg_info.parser
+
+
class bs_ds(bs_seg):
    # Implicit DS segment operand.
    reg_info = r_ds_all
    rindex = 0
    parser = reg_info.parser
+
+
class bs_es(bs_seg):
    # Implicit ES segment operand.
    reg_info = r_es_all
    rindex = 0
    parser = reg_info.parser
+
+
class bs_ss(bs_seg):
    # Implicit SS segment operand.
    reg_info = r_ss_all
    rindex = 0
    parser = reg_info.parser
+
+
class bs_fs(bs_seg):
    # Implicit FS segment operand.
    reg_info = r_fs_all
    rindex = 0
    parser = reg_info.parser
+
+
class bs_gs(bs_seg):
    # Implicit GS segment operand.
    reg_info = r_gs_all
    rindex = 0
    parser = reg_info.parser
+
+
class x86_reg_st(reg_noarg, x86_arg):
    # Explicit x87 ST(i) register operand.
    reg_info = r_st_all
    parser = reg_info.parser
+
+
class bs_sib_scale(bs_divert):
    """Instantiate the SIB scale field only where a SIB byte is present."""
    bsname = "sib_scale"

    def divert(self, i, candidates):
        out = []
        for cls, name, bases, dct, fields in candidates:
            # A SIB byte exists only outside 16-bit addressing, when
            # rm == 100 and the operand is a memory form (mod != 11).
            has_sib = (admode_prefix(
                (dct['mode'], dct['opmode'], dct['admode'])) != 16 and
                dct.get('rm') == 0b100 and
                'mod' in dct and dct['mod'] != 0b11)

            nfields = fields[:]
            if has_sib:
                nfields[i] = bs(**dict(self.args))
            else:
                nfields[i] = None
            ndct = dict(dct)
            ndct[self.args['fname']] = None
            out.append((cls, ndct['name'], bases, ndct, nfields))
        return out
+
+
class bs_sib_index(bs_sib_scale):
    # Same presence logic as the scale field.
    pass
+
+
class bs_sib_base(bs_sib_scale):
    # Same presence logic as the scale field.
    pass
+
+
class bs_disp(bs_divert):
    """Instantiate the displacement field according to mod/rm and admode."""

    def divert(self, i, candidates):
        out = []
        for cls, name, bases, dct, fields in candidates:
            mod = dct.get('mod')
            rm = dct.get('rm')
            # Disp-only form: mod=00/rm=110 with 16-bit addressing (disp16),
            # mod=00/rm=101 otherwise (disp32). mod=10 uses the same width.
            if admode_prefix(
                (dct['mode'], dct['opmode'], dct['admode'])) == 16:
                only_rm, wide = 0b110, (16, x86_16_ne)
            else:
                only_rm, wide = 0b101, (32, x86_32_ne)

            if (mod == 0b00 and rm == only_rm) or mod == 0b10:
                length, fcls = wide
            elif mod == 0b01:
                # mod=01 is always an 8-bit displacement.
                length, fcls = 8, x86_08_ne
            else:
                length, fcls = None, None

            ndct = dict(dct)
            nfields = fields[:]
            if fcls is None:
                # No displacement for this mod/rm combination.
                nfields[i] = None
                ndct[self.args['fname']] = None
            else:
                nfields[i] = bs(l=length, cls=(fcls,), fname=self.args['fname'])
                ndct[self.args['fname']] = True
            out.append((cls, ndct['name'], bases, ndct, nfields))
        return out
+
+
def getmodrm(c):
    """Split a ModRM byte into its (mod, reg, rm) bit fields (2/3/3)."""
    mod = (c >> 6) & 3
    reg = (c >> 3) & 7
    rm = c & 7
    return mod, reg, rm
+
+
def setmodrm(mod, re, rm):
    """Pack (mod, reg, rm) back into a ModRM byte, masking each field."""
    byte = (mod & 3) << 6
    byte |= (re & 7) << 3
    byte |= rm & 7
    return byte
+
+
def sib(c):
    """Split a SIB byte into (scale, index, base).

    Bug fix: this previously delegated to the undefined name `modrm`,
    raising NameError on any call. The SIB byte uses the same 2/3/3 bit
    layout as ModRM, so split it directly.
    """
    return (c >> 6) & 3, (c >> 3) & 7, c & 7

# Filled in by gen_modrm_form() below.
db_afs_64 = []
sib_64_s08_ebp = []
+
+
def gen_modrm_form():
    """Precompute modrm (and nested SIB) decode tables per addressing mode.

    Returns (byte2modrm, modrm2byte):
    - byte2modrm[size][byte] is a {field: value} description of the
      addressing form, or a nested SIB table (indexed by the SIB byte)
      for the rm=100 forms;
    - modrm2byte[size][description] lists the encodings producing it.
    Also fills the module-level db_afs_64 / sib_64_s08_ebp tables.
    """
    global db_afs_64, sib_64_s08_ebp
    ebp = 5

    # SIB tables for 32-bit addressing, one entry per possible SIB byte.
    sib_s08_ebp = [{f_isad: True} for i in range(0x100)]
    sib_u32_ebp = [{f_isad: True} for i in range(0x100)]
    sib_u32 = [{f_isad: True} for i in range(0x100)]

    # 64-bit SIB tables are further indexed by the REX.X and REX.B bits.
    sib_u64 = []
    for rex_x in range(2):
        o = []
        for rex_b in range(2):
            x = [{f_isad: True} for i in range(0x100)]
            o.append(x)
        sib_u64.append(o)

    sib_u64_ebp = []
    for rex_x in range(2):
        o = []
        for rex_b in range(2):
            x = [{f_isad: True} for i in range(0x100)]
            o.append(x)
        sib_u64_ebp.append(o)

    sib_64_s08_ebp = []
    for rex_x in range(2):
        o = []
        for rex_b in range(2):
            x = [{f_isad: True} for i in range(0x100)]
            o.append(x)
        sib_64_s08_ebp.append(o)

    for sib_rez in [sib_s08_ebp,
                    sib_u32_ebp,
                    sib_u32,
                    sib_64_s08_ebp,
                    sib_u64_ebp,
                    sib_u64,
                    ]:
        for index in range(0x100):
            ss, i, b = getmodrm(index)

            # NOTE(review): these table-identity tests use `==` (deep list
            # equality), not `is`. On the very first index of the
            # sib_64_s08_ebp pass, sib_u64_ebp is still pristine and
            # compares equal, so the sib_u64_ebp branch runs instead —
            # consider `is` here; confirm intended behavior before changing.
            if b == 0b101:
                # base == 101: displacement replaces (or joins) the base reg.
                if sib_rez == sib_s08_ebp:
                    sib_rez[index][f_imm] = f_s08
                    sib_rez[index][ebp] = 1
                elif sib_rez == sib_u32_ebp:
                    sib_rez[index][f_imm] = f_u32
                    sib_rez[index][ebp] = 1
                elif sib_rez == sib_u32:
                    sib_rez[index][f_imm] = f_u32
                elif sib_rez == sib_u64_ebp:
                    for rex_b in range(2):
                        for rex_x in range(2):
                            sib_rez[rex_x][rex_b][index][f_imm] = f_u32
                            sib_rez[rex_x][rex_b][index][ebp + 8 * rex_b] = 1
                elif sib_rez == sib_u64:
                    for rex_b in range(2):
                        for rex_x in range(2):
                            sib_rez[rex_x][rex_b][index][f_imm] = f_u32
                elif sib_rez == sib_64_s08_ebp:
                    for rex_b in range(2):
                        for rex_x in range(2):
                            sib_rez[rex_x][rex_b][index][f_imm] = f_s08
                            sib_rez[rex_x][rex_b][index][ebp + 8 * rex_b] = 1

            else:
                if sib_rez == sib_s08_ebp:
                    sib_rez[index][b] = 1
                    sib_rez[index][f_imm] = f_s08
                elif sib_rez == sib_u32_ebp:
                    sib_rez[index][b] = 1
                    sib_rez[index][f_imm] = f_u32
                elif sib_rez == sib_u32:
                    sib_rez[index][b] = 1
                elif sib_rez == sib_u64_ebp:
                    for rex_b in range(2):
                        for rex_x in range(2):
                            sib_rez[rex_x][rex_b][index][b + 8 * rex_b] = 1
                            sib_rez[rex_x][rex_b][index][f_imm] = f_u32
                elif sib_rez == sib_u64:
                    for rex_b in range(2):
                        for rex_x in range(2):
                            sib_rez[rex_x][rex_b][index][b + 8 * rex_b] = 1
                elif sib_rez == sib_64_s08_ebp:
                    for rex_b in range(2):
                        for rex_x in range(2):
                            sib_rez[rex_x][rex_b][index][f_imm] = f_s08
                            sib_rez[rex_x][rex_b][index][b + 8 * rex_b] = 1

            # index == 100 means "no index register" in 32-bit addressing.
            if i == 0b100 and sib_rez in [sib_s08_ebp, sib_u32_ebp, sib_u32]:
                continue

            # Add the scaled index-register contribution.
            if sib_rez in [sib_s08_ebp, sib_u32_ebp, sib_u32]:
                tmp = i
                if not tmp in sib_rez[index]:
                    sib_rez[index][tmp] = 0  # 1 << ss
                sib_rez[index][tmp] += 1 << ss
            else:
                for rex_b in range(2):
                    for rex_x in range(2):
                        tmp = i + 8 * rex_x
                        if i == 0b100 and rex_x == 0:
                            continue
                        if not tmp in sib_rez[rex_x][rex_b][index]:
                            sib_rez[rex_x][rex_b][index][tmp] = 0  # 1 << ss
                        sib_rez[rex_x][rex_b][index][tmp] += 1 << ss

    # 32bit
    db_afs_32 = [None for i in range(0x100)]
    for i in range(0x100):
        index = i
        mod, re, rm = getmodrm(i)

        if mod == 0b00:
            if rm == 0b100:
                db_afs_32[index] = sib_u32
            elif rm == 0b101:
                # Displacement-only form.
                db_afs_32[index] = {f_isad: True, f_imm: f_u32}
            else:
                db_afs_32[index] = {f_isad: True, rm: 1}
        elif mod == 0b01:
            if rm == 0b100:
                db_afs_32[index] = sib_s08_ebp
                continue
            tmp = {f_isad: True, rm: 1, f_imm: f_s08}
            db_afs_32[index] = tmp

        elif mod == 0b10:
            if rm == 0b100:
                db_afs_32[index] = sib_u32_ebp
            else:
                db_afs_32[index] = {f_isad: True, rm: 1, f_imm: f_u32}
        elif mod == 0b11:
            # Register-direct operand.
            db_afs_32[index] = {f_isad: False, rm: 1}

    # 64bit
    # Index layout: bit 9 = REX.X, bit 8 = REX.B, low byte = modrm byte.
    db_afs_64 = [None for i in range(0x400)]
    for i in range(0x400):
        index = i
        rex_x = (index >> 9) & 1
        rex_b = (index >> 8) & 1
        mod, re, rm = getmodrm(i & 0xff)

        if mod == 0b00:
            if rm == 0b100:
                db_afs_64[i] = sib_u64[rex_x][rex_b]
            elif rm == 0b101:
                # RIP-relative form: pseudo-register index 16.
                db_afs_64[i] = {f_isad: True, f_imm: f_u32, 16: 1}
            else:
                db_afs_64[i] = {f_isad: True, rm + 8 * rex_b: 1}
        elif mod == 0b01:
            if rm == 0b100:
                db_afs_64[i] = sib_64_s08_ebp[rex_x][rex_b]
                continue
            tmp = {f_isad: True, rm + 8 * rex_b: 1, f_imm: f_s08}
            db_afs_64[i] = tmp

        elif mod == 0b10:
            if rm == 0b100:
                db_afs_64[i] = sib_u64_ebp[rex_x][rex_b]
            else:
                db_afs_64[i] = {f_isad: True, rm + 8 * rex_b: 1, f_imm: f_u32}
        elif mod == 0b11:
            db_afs_64[i] = {f_isad: False, rm + 8 * rex_b: 1}

    # 16bit
    db_afs_16 = [None for i in range(0x100)]
    _si = 6
    _di = 7
    _bx = 3
    _bp = 5
    for i in range(0x100):
        index = i
        mod, re, rm = getmodrm(i)

        if mod == 0b00:
            if rm == 0b100:
                db_afs_16[index] = {f_isad: True, _si: 1}
            elif rm == 0b101:
                db_afs_16[index] = {f_isad: True, _di: 1}
            elif rm == 0b110:
                db_afs_16[index] = {
                    f_isad: True, f_imm: f_u16}  # {f_isad:True,_bp:1}
            elif rm == 0b111:
                db_afs_16[index] = {f_isad: True, _bx: 1}
            else:
                db_afs_16[index] = {f_isad: True,
                         [_si, _di][rm % 2]: 1,
                    [_bx, _bp][(rm >> 1) % 2]: 1}
        elif mod in [0b01, 0b10]:
            # mod=01 takes a signed 8-bit disp, mod=10 an unsigned 16-bit.
            if mod == 0b01:
                my_imm = f_s08
            else:
                my_imm = f_u16

            if rm == 0b100:
                db_afs_16[index] = {f_isad: True, _si: 1, f_imm: my_imm}
            elif rm == 0b101:
                db_afs_16[index] = {f_isad: True, _di: 1, f_imm: my_imm}
            elif rm == 0b110:
                db_afs_16[index] = {f_isad: True, _bp: 1, f_imm: my_imm}
            elif rm == 0b111:
                db_afs_16[index] = {f_isad: True, _bx: 1, f_imm: my_imm}
            else:
                db_afs_16[index] = {f_isad: True,
                         [_si, _di][rm % 2]: 1,
                    [_bx, _bp][(rm >> 1) % 2]: 1,
                    f_imm: my_imm}

        elif mod == 0b11:
            db_afs_16[index] = {f_isad: False, rm: 1}

    byte2modrm = {}
    byte2modrm[16] = db_afs_16
    byte2modrm[32] = db_afs_32
    byte2modrm[64] = db_afs_64

    # Build the reverse mapping used by the assembler.
    modrm2byte = {16: defaultdict(list),
                  32: defaultdict(list),
                  64: defaultdict(list),
                  }
    for size, db_afs in viewitems(byte2modrm):
        for i, modrm in enumerate(db_afs):
            if not isinstance(modrm, list):
                # We only need sort for determinism
                modrm = tuple(sorted(viewitems(modrm), key=str))
                modrm2byte[size][modrm].append(i)
                continue
            for j, modrm_f in enumerate(modrm):
                # We only need sort for determinism
                modrm_f = tuple(sorted(viewitems(modrm_f), key=str))
                modrm2byte[size][modrm_f].append((i, j))

    return byte2modrm, modrm2byte

byte2modrm, modrm2byte = gen_modrm_form()
+
+
# Walk an address expression, filling `o` with {reg_index: scale} terms;
# the return value is the displacement (ExprInt) if any, else None.
def exprfindmod(e, o=None):
    """Decompose a reg*scale + disp sum; mutates `o`, returns displacement.

    Raises ValueError on multiple displacements or unsupported operators.
    """
    if o is None:
        o = {}
    if isinstance(e, ExprInt):
        return e
    if isinstance(e, ExprId):
        # Plain register term: scale 1.
        i = size2gpregs[e.size].expr.index(e)
        o[i] = 1
        return None
    elif isinstance(e, ExprOp):
        out = None
        if e.op == '+':
            for a in e.args:
                r = exprfindmod(a, o)
                if out and r:
                    raise ValueError('multiple displacement!')
                out = r
            return out
        elif e.op == "*":
            # reg * scale term.
            mul = int(e.args[1])
            a = e.args[0]
            i = size2gpregs[a.size].expr.index(a)
            o[i] = mul
        else:
            raise ValueError('bad op')
    return None
+
def test_addr_size(ptr, size):
    """True when `ptr` fits an address of `size` bits (value or width)."""
    if isinstance(ptr, ExprInt):
        # Constant addresses just have to fit in the range.
        return int(ptr) < (1 << size)
    return ptr.size == size
+
# Operand size -> vector register bank (MMX for 64-bit, XMM for 128-bit).
SIZE2XMMREG = {64:gpregs_mm,
               128:gpregs_xmm}
# Same mapping for the BND bank (128-bit); 64-bit still selects MMX.
SIZE2BNDREG = {64:gpregs_mm,
               128:gpregs_bnd}
+
def parse_mem(expr, parent, w8, sx=0, xmm=0, mm=0, bnd=0):
    """Encode the memory operand `expr` into candidate modrm dictionaries.

    Returns (candidates, segment, status):
    - candidates: list of {field: value} modrm descriptions, one per
      viable displacement encoding, or None on failure;
    - segment: the segment-override register, or None;
    - status: False when `expr` cannot be encoded in the current mode.

    May propagate ValueError from exprfindmod on malformed addresses.

    Cleaned up: removed dead code — the initial `dct_expr`/`f_isad`
    assignments (overwritten by the later rebind), the unused `ad_size`,
    and the unreachable `else` branch after `disp` is forced non-None.
    """
    opmode = parent.v_opmode()
    # A segment override with a constant base cannot be expressed here.
    if is_mem_segm(expr) and expr.ptr.args[0].is_int():
        return None, None, False

    if is_mem_segm(expr):
        segm = expr.ptr.args[0]
        ptr = expr.ptr.args[1]
    else:
        segm = None
        ptr = expr.ptr

    admode = parent.v_admode()
    if not test_addr_size(ptr, admode):
        return None, None, False

    if (w8 == 1 and expr.size != opmode and not sx and
        not (hasattr(parent, 'sd') or hasattr(parent, 'wd'))):
        return None, None, False

    if hasattr(parent, 'wd'):
        # wd marker: 16-bit memory sets the field, 32-bit leaves it clear.
        if expr.size == 16:
            parent.wd.value = 1
        elif expr.size == 32:
            pass
        else:
            return None, None, False

    # 32-bit pointer in 64-bit mode requires the address-size prefix.
    if (not isinstance(ptr, ExprInt) and
        parent.mode == 64 and
        ptr.size == 32 and
        parent.admode != 1):
        return None, None, False
    dct_expr = {f_isad: True}
    disp = exprfindmod(ptr, dct_expr)
    if disp is None:
        # No explicit displacement: encode an explicit zero.
        disp = ExprInt(0, 32)
    out = []
    # Keep every displacement encoding whose sign/size round-trips the value.
    for signed, encoding, cast_size in [(True, f_s08, 8),
                                        (True, f_s16, 16),
                                        (True, f_s32, 32),
                                        (False, f_u08, 8),
                                        (False, f_u16, 16),
                                        (False, f_u32, 32)]:
        value = ExprInt(int(disp), cast_size)
        if admode < value.size:
            if signed:
                if int(disp) != sign_ext(int(value), admode, disp.size):
                    continue
            else:
                if int(disp) != int(value):
                    continue
        else:
            if int(disp) != sign_ext(int(value), value.size, admode):
                continue
        candidate = dict(dct_expr)
        candidate[f_imm] = (encoding, value)
        out.append(candidate)
    return out, segm, True
+
def expr2modrm(expr, parent, w8, sx=0, xmm=0, mm=0, bnd=0):
    """Encode a register or memory operand into candidate modrm dicts.

    Same return convention as parse_mem: (candidates, segment, status).
    `w8` == 0 selects the 8-bit register file, `sx` forces a 16/32-bit
    operand size, and `xmm`/`mm`/`bnd` select the SSE/MMX/BND banks.

    Fixed: the memory case of the vector branch now forwards `bnd` to
    parse_mem (it was silently dropped, unlike the call at the bottom).
    Removed the later in-function xmm/mm/bnd register checks, which were
    unreachable: the early `mm or xmm or bnd` branch returns on all paths.
    """
    dct_expr = {f_isad : False}

    if mm or xmm or bnd:
        # Bank-specific size checks: MMX regs are 64-bit, XMM/BND 128-bit.
        if mm and expr.size != 64:
            return None, None, False
        elif xmm and expr.size != 128:
            return None, None, False
        elif bnd and expr.size != 128:
            return None, None, False

        if isinstance(expr, ExprId):
            if bnd:
                size2reg = SIZE2BNDREG
            else:
                size2reg = SIZE2XMMREG
            selreg = size2reg[expr.size]
            if not expr in selreg.expr:
                return None, None, False
            i = selreg.expr.index(expr)
            dct_expr[i] = 1
            return [dct_expr], None, True
        else:
            return parse_mem(expr, parent, w8, sx, xmm, mm, bnd)

    elif expr.size == 64 and expr not in gpregs_mm.expr:
        # Plain 64-bit GP operand: needs REX.W unless an sd/wd/stk marker
        # on the encoding already implies the size.
        if hasattr(parent, 'sd'):
            parent.sd.value = 1
        elif hasattr(parent, 'wd'):
            pass
        elif hasattr(parent, 'stk'):
            pass
        else:
            parent.rex_w.value = 1
    opmode = parent.v_opmode()
    if sx == 1:
        opmode = 16
    if sx == 2:
        opmode = 32
    if expr.size == 8 and w8 != 0:
        return None, None, False

    if w8 == 0 and expr.size != 8:
        return None, None, False

    if not isinstance(expr, ExprMem):
        dct_expr[f_isad] = False
        if w8 == 0:
            # 8-bit register; SPL/BPL/SIL/DIL need rex_p in 64-bit mode.
            if parent.mode == 64 and expr in gpregs08_64.expr:
                r = gpregs08_64
                parent.rex_p.value = 1
            else:
                parent.rex_p.value = 0
                parent.rex_x.value = 0
                r = size2gpregs[8]
            if not expr in r.expr:
                return None, None, False
            i = r.expr.index(expr)
            dct_expr[i] = 1
            return [dct_expr], None, True
        if opmode != expr.size:
            return None, None, False
        if not expr in size2gpregs[opmode].expr:
            return None, None, False
        i = size2gpregs[opmode].expr.index(expr)
        if i > 7:
            # R8..R15 only exist in 64-bit mode.
            if parent.mode != 64:
                return None, None, False
        dct_expr[i] = 1
        return [dct_expr], None, True
    return parse_mem(expr, parent, w8, sx, xmm, mm, bnd)
+
def modrm2expr(modrm, parent, w8, sx=0, xmm=0, mm=0, bnd=0):
    """Rebuild the operand expression described by a modrm dictionary.

    @modrm: dict mapping register indexes to scale factors, plus the
            f_isad/f_imm markers produced by the modrm decoding tables
    @parent: instruction object (gives opmode/admode, REX bits, disp field)
    @w8: width bit; 0 forces an 8 bit operand
    @sx: forced operand size selector (1 -> 16 bit, 2 -> 32 bit)
    @xmm, @mm, @bnd: select the XMM/MM/BND register families

    Return an ExprId (register form) or ExprMem (memory form), or None
    when a displacement is required but not decoded yet.
    """
    o = []
    if not modrm[f_isad]:
        # Register form: exactly one register index must be set to 1
        modrm_k = [key for key, value in viewitems(modrm) if value == 1]
        if len(modrm_k) != 1:
            raise ValueError('strange reg encoding %r' % modrm)
        modrm_k = modrm_k[0]
        if w8 == 0:
            opmode = 8
        elif sx == 1:
            opmode = 16
        elif sx == 2:
            opmode = 32
        else:
            opmode = parent.v_opmode()
        if xmm:
            expr = gpregs_xmm.expr[modrm_k]
        elif mm:
            expr = gpregs_mm.expr[modrm_k]
        elif bnd:
            expr = gpregs_bnd.expr[modrm_k]
        elif opmode == 8 and (parent.v_opmode() == 64 or parent.rex_p.value == 1):
            # REX present: 8 bit indexes map to the gpregs08_64 bank
            expr = gpregs08_64.expr[modrm_k]
        else:
            expr = size2gpregs[opmode].expr[modrm_k]
        return expr
    # Memory form: sum the scaled base/index registers plus displacement
    admode = parent.v_admode()
    opmode = parent.v_opmode()
    for modrm_k, scale in viewitems(modrm):
        if isinstance(modrm_k, int):
            expr = size2gpregs[admode].expr[modrm_k]
            if scale != 1:
                expr = ExprInt(scale, admode) * expr
            o.append(expr)
    if f_imm in modrm:
        if parent.disp.value is None:
            # Displacement required but not available yet
            return None
        o.append(ExprInt(int(parent.disp.expr), admode))
    if len(o) == 1:
        expr = o[0]
    else:
        expr = ExprOp('+', *o)
    # Memory access width follows w8/sx, overridden by the vector families
    if w8 == 0:
        opmode = 8
    elif sx == 1:
        opmode = 16
    elif sx == 2:
        opmode = 32
    if xmm:
        opmode = 128
    elif mm:
        opmode = 64
    elif bnd:
        opmode = 128

    expr = ExprMem(expr, size=opmode)
    return expr
+
+
class x86_rm_arg(x86_arg):
    """Generic modrm-encoded operand (register or memory form)."""
    parser = rmarg

    def fromstring(self, text, loc_db, parser_result=None):
        # Delegate parsing to the generic x86_arg machinery; keep the
        # (start, stop) span contract, (None, None) on failure.
        start, stop = super(x86_rm_arg, self).fromstring(text, loc_db, parser_result)
        p = self.parent
        if start is None:
            return None, None
        return start, stop

    def get_modrm(self):
        """Rebuild the modrm description dict from the decoded mod/rm
        (and, when announced, SIB) fields via the byte2modrm tables.
        Returns False when a needed SIB byte is missing."""
        p = self.parent
        admode = p.v_admode()

        if not admode in [16, 32, 64]:
            raise ValueError('strange admode %r', admode)
        v = setmodrm(p.mod.value, 0, p.rm.value)
        # REX.B / REX.X are folded into the table key above bit 7
        v |= p.rex_b.value << 8
        v |= p.rex_x.value << 9
        if p.mode == 64:
            # XXXx to check
            admode = 64

        xx = byte2modrm[admode][v]
        if isinstance(xx, list):
            # A list entry means a SIB byte refines the addressing form
            if not p.sib_scale:
                return False
            v = setmodrm(p.sib_scale.value,
                         p.sib_index.value,
                         p.sib_base.value)
            xx = xx[v]
        return xx

    def decode(self, v):
        # Decode as a full-width (w8=1) register-or-memory operand
        p = self.parent
        xx = self.get_modrm()
        self.expr = modrm2expr(xx, p, 1)
        return self.expr is not None

    def gen_cand(self, v_cand, admode):
        """Generator yielding True for each candidate (mod, rm, sib, disp)
        field assignment that encodes one of the modrm dicts in @v_cand."""
        if not admode in modrm2byte:
            # XXX TODO: 64bit
            return
        if not v_cand:
            return

        p = self.parent
        # Remember REX bits already fixed by other operands
        o_rex_x = p.rex_x.value
        o_rex_b = p.rex_b.value
        # add candidate without 0 imm
        new_v_cand = []
        moddd = False
        for v in v_cand:
            new_v_cand.append(v)
            if f_imm in v and int(v[f_imm][1]) == 0:
                v = dict(v)
                del(v[f_imm])
                new_v_cand.append(v)
                moddd = True

        v_cand = new_v_cand

        out_c = []
        for v in v_cand:
            disp = None
            # patch value in modrm
            if f_imm in v:
                size, disp = v[f_imm]
                disp = int(disp)

                v[f_imm] = size
            vo = v
            # We only need sort for determinism
            v = tuple(sorted(viewitems(v), key=str))
            admode = 64 if p.mode == 64 else admode
            if not v in modrm2byte[admode]:
                continue
            xx = modrm2byte[admode][v]

            # default case
            for x in xx:
                if type(x) == tuple:
                    modrm, sib = x
                else:
                    modrm = x
                    sib = None

                # 16 bit cannot have sib
                if sib is not None and admode == 16:
                    continue
                rex = modrm >> 8  # 0# XXX HACK REM temporary REX modrm>>8
                if rex and admode != 64:
                    continue

                p.rex_x.value = (rex >> 1) & 1
                p.rex_b.value = rex & 1

                # Candidate must agree with REX bits fixed beforehand
                if o_rex_x is not None and p.rex_x.value != o_rex_x:
                    continue
                if o_rex_b is not None and p.rex_b.value != o_rex_b:
                    continue

                # NOTE: `re` shadows the regex module inside this function
                mod, re, rm = getmodrm(modrm)
                # check re on parent
                if hasattr(p, 'reg') and re != p.reg.value:
                    continue

                if sib is not None:
                    s_scale, s_index, s_base = getmodrm(sib)
                else:
                    s_scale, s_index, s_base = None, None, None

                p.mod.value = mod
                p.rm.value = rm
                p.sib_scale.value = s_scale
                p.sib_index.value = s_index
                p.sib_base.value = s_base
                p.disp.value = disp
                if disp is not None:
                    p.disp.l = f_imm2size[vo[f_imm]]

                yield True

        return

    def encode(self):
        """Generator yielding True for each valid encoding of self.expr."""
        if isinstance(self.expr, ExprInt):
            return
        p = self.parent
        admode = p.v_admode()
        mode = self.expr.size
        v_cand, segm, ok = expr2modrm(self.expr, p, 1)
        if segm:
            # Segment override prefix
            p.g2.value = segm2enc[segm]
        for x in self.gen_cand(v_cand, admode):
            yield x
+
class x86_rm_mem(x86_rm_arg):
    """Modrm operand restricted to memory forms (registers rejected)."""

    def fromstring(self, text, loc_db, parser_result=None):
        # Parse through the generic modrm handling, then reject anything
        # which did not resolve to a memory access.
        self.expr = None
        span = super(x86_rm_mem, self).fromstring(text, loc_db, parser_result)
        if isinstance(self.expr, ExprMem):
            return span
        return None, None
+
+
class x86_rm_mem_far(x86_rm_arg):
    """Modrm memory operand wrapped in a 'far' pointer marker
    (far JMP/CALL forms)."""
    parser = mem_far

    def fromstring(self, text, loc_db, parser_result=None):
        self.expr = None
        start, stop = super(x86_rm_mem_far, self).fromstring(text, loc_db, parser_result)
        if not isinstance(self.expr, ExprMem):
            return None, None
        # Tag the parsed memory access as a far pointer
        self.expr = ExprOp('far', self.expr)
        return start, stop

    def decode(self, v):
        ret = super(x86_rm_mem_far, self).decode(v)
        if not ret:
            return ret
        # Consistency fix: use ExprMem directly (was m2_expr.ExprMem, which
        # relied on `m2_expr` leaking in through a star import; same class).
        if isinstance(self.expr, ExprMem):
            self.expr = ExprOp('far', self.expr)
        return True

    def encode(self):
        # Only 'far'-wrapped memory operands can be encoded here
        if not (isinstance(self.expr, ExprOp) and
                self.expr.op == 'far'):
            return

        expr = self.expr.args[0]
        if isinstance(expr, ExprInt):
            return
        p = self.parent
        admode = p.v_admode()
        mode = expr.size
        v_cand, segm, ok = expr2modrm(expr, p, 1)
        if segm:
            p.g2.value = segm2enc[segm]
        for x in self.gen_cand(v_cand, admode):
            yield x
+
class x86_rm_w8(x86_rm_arg):
    """Modrm operand whose width is driven by the instruction's w8 bit."""

    def decode(self, v):
        parent = self.parent
        self.expr = modrm2expr(self.get_modrm(), parent, parent.w8.value)
        return self.expr is not None

    def encode(self):
        if isinstance(self.expr, ExprInt):
            return
        parent = self.parent
        if parent.w8.value is None:
            # Infer the w8 bit from the operand size: 8 bit -> 0, else 1
            parent.w8.value = 0 if self.expr.size == 8 else 1

        v_cand, segm, ok = expr2modrm(self.expr, parent, parent.w8.value)
        if segm:
            parent.g2.value = segm2enc[segm]
        for cand in self.gen_cand(v_cand, parent.v_admode()):
            yield cand
+
+
class x86_rm_sx(x86_rm_arg):
    """Modrm operand handled with the 16 bit forced size (sx=1)."""

    def decode(self, v):
        parent = self.parent
        self.expr = modrm2expr(self.get_modrm(), parent, parent.w8.value, 1)
        return self.expr is not None

    def encode(self):
        if isinstance(self.expr, ExprInt):
            return
        parent = self.parent
        if parent.w8.value is None:
            # Infer the w8 bit from the operand size
            parent.w8.value = 0 if self.expr.size == 8 else 1
        v_cand, segm, ok = expr2modrm(self.expr, parent, parent.w8.value, 1)
        if segm:
            parent.g2.value = segm2enc[segm]
        for cand in self.gen_cand(v_cand, parent.v_admode()):
            yield cand
+
+
class x86_rm_sxd(x86_rm_arg):
    """Modrm operand handled with the 32 bit forced size (sx=2)."""

    def decode(self, v):
        parent = self.parent
        self.expr = modrm2expr(self.get_modrm(), parent, 1, 2)
        return self.expr is not None

    def encode(self):
        if isinstance(self.expr, ExprInt):
            return
        parent = self.parent
        v_cand, segm, ok = expr2modrm(self.expr, parent, 1, 2)
        if segm:
            parent.g2.value = segm2enc[segm]
        for cand in self.gen_cand(v_cand, parent.v_admode()):
            yield cand
+
+
class x86_rm_sd(x86_rm_arg):
    """Memory operand whose access size toggles between 32 bit and
    out_size according to the instruction's sd bit."""
    out_size = 64

    def get_s_value(self):
        return self.parent.sd.value

    def set_s_value(self, value):
        self.parent.sd.value = value

    def decode(self, v):
        parent = self.parent
        expr = modrm2expr(self.get_modrm(), parent, 1)
        if not isinstance(expr, ExprMem):
            return False
        # sd bit selects the access width: 0 -> 32 bit, 1 -> out_size
        width = 32 if self.get_s_value() == 0 else self.out_size
        self.expr = ExprMem(expr.ptr, width)
        return self.expr is not None

    def encode(self):
        if isinstance(self.expr, ExprInt):
            return
        parent = self.parent
        if self.expr.size not in [32, 64]:
            return
        self.set_s_value(0)
        v_cand, segm, ok = expr2modrm(self.expr, parent, 1)
        for cand in self.gen_cand(v_cand, parent.v_admode()):
            yield cand
+
+
class x86_rm_wd(x86_rm_sd):
    """Variant of x86_rm_sd driven by the wd bit, with a 16 bit form."""
    out_size = 16

    def get_s_value(self):
        return self.parent.wd.value

    def set_s_value(self, value):
        self.parent.wd.value = value

    def encode(self):
        if isinstance(self.expr, ExprInt):
            return
        parent = self.parent
        parent.wd.value = 0
        v_cand, segm, ok = expr2modrm(self.expr, parent, 1)
        for cand in self.gen_cand(v_cand, parent.v_admode()):
            yield cand
+
+
class x86_rm_08(x86_rm_arg):
    """Modrm operand decoded as 8 bit; memory accesses narrowed to msize."""
    msize = 8

    def decode(self, v):
        parent = self.parent
        expr = modrm2expr(self.get_modrm(), parent, 0)
        if not isinstance(expr, ExprMem):
            # Register (or unresolved) form is accepted as-is
            self.expr = expr
            return True
        self.expr = ExprMem(expr.ptr, self.msize)
        return self.expr is not None

    def encode(self):
        if isinstance(self.expr, ExprInt):
            return
        parent = self.parent
        v_cand, segm, ok = expr2modrm(self.expr, parent, 0, 0, 0, 0)
        for cand in self.gen_cand(v_cand, parent.v_admode()):
            yield cand
+
class x86_rm_reg_m08(x86_rm_arg):
    """Register or memory operand; memory accesses are narrowed to msize."""
    msize = 8

    def decode(self, v):
        ret = x86_rm_arg.decode(self, v)
        if not ret:
            return ret
        if isinstance(self.expr, ExprMem):
            # Narrow the decoded memory access to the class width
            self.expr = ExprMem(self.expr.ptr, self.msize)
            return self.expr is not None
        return True

    def encode(self):
        if isinstance(self.expr, ExprInt):
            return
        parent = self.parent
        expr = self.expr
        if isinstance(expr, ExprMem):
            # Encoding always goes through a 32 bit view of the slot
            expr = ExprMem(expr.ptr, 32)
        v_cand, segm, ok = expr2modrm(expr, parent, 1, 0, 0, 0)
        for cand in self.gen_cand(v_cand, parent.v_admode()):
            yield cand
+
class x86_rm_reg_m16(x86_rm_reg_m08):
    """Register or memory operand; memory accesses narrowed to 16 bit."""
    msize = 16
+
class x86_rm_m64(x86_rm_arg):
    """Memory-only modrm operand forced to a 64 bit access."""
    msize = 64

    def decode(self, v):
        # Reject register forms: this operand must be a memory access
        p = self.parent
        xx = self.get_modrm()
        expr = modrm2expr(xx, p, 1)
        if not isinstance(expr, ExprMem):
            return False
        self.expr = ExprMem(expr.ptr, self.msize)
        return self.expr is not None

    def encode(self):
        if isinstance(self.expr, ExprInt):
            return
        p = self.parent
        # NOTE(review): encode passes w8=0 and mm=1 to expr2modrm while
        # decode used w8=1 and no mm flag -- looks asymmetric; confirm this
        # is intentional for the instructions using this operand class.
        v_cand, segm, ok = expr2modrm(self.expr, p, 0, 0, 0, 1)
        for x in self.gen_cand(v_cand, p.v_admode()):
            yield x
+
+
class x86_rm_m80(x86_rm_m64):
    """Memory-only operand with an 80 bit access (x87 extended forms)."""
    msize = 80

    def encode(self):
        if isinstance(self.expr, ExprInt):
            return
        # Only an exact msize memory access can be encoded
        if not isinstance(self.expr, ExprMem) or self.expr.size != self.msize:
            return
        p = self.parent
        mode = p.mode
        if mode == 64:
            mode = 32
        # Re-wrap the access at the encoding width before modrm generation
        # (note: mutates self.expr)
        self.expr = ExprMem(self.expr.ptr, mode)
        v_cand, segm, ok = expr2modrm(self.expr, p, 1)
        for x in self.gen_cand(v_cand, p.v_admode()):
            yield x
+
+
class x86_rm_m08(x86_rm_arg):
    """Modrm operand decoded with w8=0 (8 bit register or memory form)."""
    msize = 8

    def decode(self, v):
        p = self.parent
        xx = self.get_modrm()
        self.expr = modrm2expr(xx, p, 0)
        return self.expr is not None

    def encode(self):
        # Only 8 bit operands can be encoded by this form
        if self.expr.size != 8:
            return
        p = self.parent
        # (removed dead local `mode = p.mode` -- it was never used)
        v_cand, segm, ok = expr2modrm(self.expr, p, 0)
        for x in self.gen_cand(v_cand, p.v_admode()):
            yield x
+
+
class x86_rm_m16(x86_rm_m80):
    """Memory-only operand with a 16 bit access (inherits m80's encode)."""
    msize = 16
+
+
class x86_rm_mm(x86_rm_m80):
    """MM/XMM/BND register or memory operand.

    Subclasses select the register family through is_mm/is_xmm/is_bnd and
    the memory access width through msize (None means register-only).
    """
    msize = 64
    is_mm = True
    is_xmm = False
    is_bnd = False

    def decode(self, v):
        p = self.parent
        xx = self.get_modrm()
        expr = modrm2expr(xx, p, 0, 0, self.is_xmm, self.is_mm, self.is_bnd)
        if isinstance(expr, ExprMem):
            # msize None means "register only": reject memory forms
            if self.msize is None:
                return False
            if expr.size != self.msize:
                expr = ExprMem(expr.ptr, self.msize)
        self.expr = expr
        return True

    def encode(self):
        expr = self.expr
        if isinstance(expr, ExprInt):
            return
        if isinstance(expr, ExprMem) and expr.size != self.msize:
            return
        p = self.parent
        # (removed dead computation of a `mode` local -- it was never used)
        # Memory operands are encoded at the register family's full width
        if isinstance(expr, ExprMem):
            if self.is_xmm:
                expr = ExprMem(expr.ptr, 128)
            elif self.is_mm:
                expr = ExprMem(expr.ptr, 64)

        v_cand, segm, ok = expr2modrm(expr, p, 0, 0, self.is_xmm, self.is_mm,
                                      self.is_bnd)
        for x in self.gen_cand(v_cand, p.v_admode()):
            yield x
+
+
class x86_rm_mm_m64(x86_rm_mm):
    """MM register or 64 bit memory operand."""
    msize = 64
    is_mm = True
    is_xmm = False
+
class x86_rm_xmm(x86_rm_mm):
    """XMM register or 128 bit memory operand."""
    msize = 128
    is_mm = False
    is_xmm = True
+
+
class x86_rm_xmm_m32(x86_rm_mm):
    """XMM register or 32 bit memory operand (scalar single forms)."""
    msize = 32
    is_mm = False
    is_xmm = True
+
class x86_rm_xmm_m64(x86_rm_mm):
    """XMM register or 64 bit memory operand (scalar double forms)."""
    msize = 64
    is_mm = False
    is_xmm = True
+
class x86_rm_xmm_m128(x86_rm_mm):
    """XMM register or 128 bit memory operand."""
    msize = 128
    is_mm = False
    is_xmm = True
+
+
class x86_rm_xmm_reg(x86_rm_mm):
    """XMM register-only operand (msize None rejects memory forms)."""
    msize = None
    is_mm = False
    is_xmm = True
+
class x86_rm_mm_reg(x86_rm_mm):
    """MM register-only operand (msize None rejects memory forms)."""
    msize = None
    is_mm = True
    is_xmm = False
+
+
class x86_rm_bnd(x86_rm_mm):
    """BND register or 128 bit memory operand."""
    msize = 128
    is_mm = False
    is_xmm = False
    is_bnd = True
+
+
class x86_rm_bnd_reg(x86_rm_mm):
    """BND register-only operand (msize None rejects memory forms)."""
    msize = None
    is_mm = False
    is_xmm = False
    is_bnd = True
+
+
class x86_rm_bnd_m64(x86_rm_mm):
    """BND register or 64 bit memory operand."""
    msize = 64
    is_mm = False
    is_xmm = False
    is_bnd = True
+
+
class x86_rm_bnd_m128(x86_rm_mm):
    """BND register or 128 bit memory operand."""
    msize = 128
    is_mm = False
    is_xmm = False
    is_bnd = True
+
+
class x86_rm_reg_noarg(object):
    """General purpose register encoded in the modrm reg field (mixin)."""
    prio = default_prio + 1

    parser = gpreg

    def fromstring(self, text, loc_db, parser_result=None):
        # Default w8 to 1 (wide) unless the instruction carries an sx field
        if not hasattr(self.parent, 'sx') and hasattr(self.parent, "w8"):
            self.parent.w8.value = 1
        if parser_result:
            result, start, stop = parser_result[self.parser]
            if result == [None]:
                return None, None
            self.expr = result
            if self.expr.size == 8:
                # An 8 bit register needs a w8 field (and forbids sx)
                if hasattr(self.parent, 'sx') or not hasattr(self.parent, 'w8'):
                    return None, None
                self.parent.w8.value = 0
            return start, stop
        try:
            result, start, stop = next(self.parser.scanString(text))
        except StopIteration:
            return None, None
        expr = self.asm_ast_to_expr(result[0], loc_db)
        if expr is None:
            return None, None

        self.expr = expr
        # NOTE(review): this branch tests size == 0 where the equivalent
        # branch above tests size == 8 -- looks inconsistent; confirm intent.
        if self.expr.size == 0:
            if hasattr(self.parent, 'sx') or not hasattr(self.parent, 'w8'):
                return None, None
            self.parent.w8.value = 0

        return start, stop

    def getrexsize(self):
        # REX extension bit used by this argument (REX.R here; subclasses
        # override to use REX.B)
        return self.parent.rex_r.value

    def setrexsize(self, v):
        self.parent.rex_r.value = v

    def decode(self, v):
        v = v & self.lmask
        p = self.parent
        opmode = p.v_opmode()
        if not hasattr(p, 'sx') and (hasattr(p, 'w8') and p.w8.value == 0):
            opmode = 8
        r = size2gpregs[opmode]
        if p.mode == 64 and self.getrexsize():
            # REX extension bit selects the upper register bank
            v |= 0x8
        if p.v_opmode() == 64 or p.rex_p.value == 1:
            if not hasattr(p, 'sx') and (hasattr(p, 'w8') and p.w8.value == 0):
                # 8 bit registers with REX present use the gpregs08_64 bank
                r = gpregs08_64
            elif p.rex_r.value == 1:
                v |= 8
        self.expr = r.expr[v]
        return True

    def encode(self):
        if not isinstance(self.expr, ExprId):
            return False
        # A 64 bit GP register selects REX.W (unless stk handles sizing)
        if self.expr in gpregs64.expr and not hasattr(self.parent, 'stk'):
            self.parent.rex_w.value = 1
        opmode = self.parent.v_opmode()
        if not hasattr(self.parent, 'sx') and hasattr(self.parent, 'w8'):
            self.parent.w8.value = 1
        if self.expr.size == 8:
            if hasattr(self.parent, 'sx') or not hasattr(self.parent, 'w8'):
                return False
            self.parent.w8.value = 0
            opmode = 8
        r = size2gpregs[opmode]
        if self.expr in r.expr:
            i = r.expr.index(self.expr)
        elif (opmode == 8 and self.parent.mode == 64 and
            self.expr in gpregs08_64.expr):
            i = gpregs08_64.expr.index(self.expr)
            self.parent.rex_p.value = 1
        else:
            log.debug("cannot encode reg %r", self.expr)
            return False
        if self.parent.v_opmode() == 64:
            if i > 7:
                # Indexes above 7 are carried by the REX extension bit
                self.setrexsize(1)
                i -= 8
        elif self.parent.mode == 64 and i > 7:
            i -= 8
            self.setrexsize(1)
        self.value = i
        return True
+
+
class x86_rm_reg_mm(x86_rm_reg_noarg, x86_arg):
    """Register-only argument over the register bank given by selreg."""
    selreg = gpregs_mm

    def decode(self, v):
        # In 64 bit mode the REX bit extends the register index by 8
        if self.parent.mode == 64 and self.getrexsize():
            v |= 0x8
        self.expr = self.selreg.expr[v]
        return True

    def encode(self):
        if not isinstance(self.expr, ExprId):
            return False
        if self.expr not in self.selreg.expr:
            return False
        idx = self.selreg.expr.index(self.expr)
        if self.parent.mode == 64 and idx > 7:
            # Upper registers are reached through the REX extension bit
            idx -= 8
            self.setrexsize(1)
        self.value = idx
        return True
+
class x86_rm_reg_xmm(x86_rm_reg_mm):
    """Register-only argument over the XMM register bank."""
    selreg = gpregs_xmm
+
class x86_rm_reg_bnd(x86_rm_reg_mm):
    """Register-only argument over the BND register bank."""
    selreg = gpregs_bnd
+
class x86_rm_reg(x86_rm_reg_noarg, x86_arg):
    """General purpose register argument encoded in the modrm reg field."""
    pass
+
+
class x86_reg(x86_rm_reg):
    """Register argument whose REX extension bit is REX.B."""

    def getrexsize(self):
        return self.parent.rex_b.value

    def setrexsize(self, v):
        self.parent.rex_b.value = v
+
class x86_vex_reg(x86_rm_reg):
    """Register argument carried in the VEX.vvvv field."""

    def decode(self, v):
        p = self.parent
        self.expr = size2gpregs[v_opmode(p)].expr[p.vex_v.value]
        return self.expr is not None

    def encode(self):
        opmode = self.parent.mode
        size = self.expr.size

        # REX.W selects the 64 bit operand size in long mode
        if opmode == 64 and size == 64:
            self.parent.rex_w.value = 1
        else:
            self.parent.rex_w.value = 0

        r = size2gpregs[size]
        if self.expr not in r.expr:
            # Was a NameError (unbound `i`) when the register was not in
            # the selected bank; fail the candidate cleanly instead.
            return False
        i = r.expr.index(self.expr)

        self.parent.vex_v.value = i
        self.parent.vex.value = 1
        return True
+
+
class x86_reg_modrm(x86_rm_reg):
    """Register argument whose REX extension bit is REX.R."""

    def getrexsize(self):
        return self.parent.rex_r.value

    def setrexsize(self, v):
        self.parent.rex_r.value = v
+
+
+
class x86_reg_noarg(x86_rm_reg_noarg):
    """Opcode-embedded register mixin; REX extension bit is REX.B."""

    def getrexsize(self):
        return self.parent.rex_b.value

    def setrexsize(self, v):
        self.parent.rex_b.value = v
+
+
class x86_rm_segm(reg_noarg, x86_arg):
    """Segment register operand."""
    prio = default_prio + 1
    reg_info = segmreg
    parser = reg_info.parser
+
+
class x86_rm_cr(reg_noarg, x86_arg):
    """Control register operand."""
    prio = default_prio + 1
    reg_info = crregs
    parser = reg_info.parser
+
+
class x86_rm_dr(reg_noarg, x86_arg):
    """Debug register operand."""
    prio = default_prio + 1
    reg_info = drregs
    parser = reg_info.parser
+
+
class x86_rm_flt(reg_noarg, x86_arg):
    """x87 floating point stack register operand."""
    prio = default_prio + 1
    reg_info = fltregs
    parser = reg_info.parser
+
+
class bs_fbit(bsi):
    """Bit field whose value is consumed during prefix pre-decoding."""

    def decode(self, v):
        # value already decoded in pre_dis_info
        return True
+
+
class bs_cl1(bsi, x86_arg):
    """Shift-count argument which is either the constant 1 or the CL
    register, selected by a single opcode bit."""
    parser = cl_or_imm

    def decode(self, v):
        # Bit set -> CL register, otherwise the implicit constant 1
        self.expr = regs08_expr[1] if v == 1 else ExprInt(1, 8)
        return True

    def encode(self):
        if self.expr == regs08_expr[1]:
            self.value = 1
            return True
        if isinstance(self.expr, ExprInt) and int(self.expr) == 1:
            self.value = 0
            return True
        return False
+
+
def sib_cond(cls, mode, v):
    """Return the SIB field length (cls.ll) when the decoded modrm fields
    in @v announce a SIB byte, else None.

    A SIB byte only exists in 32/64 bit addressing (never 16 bit), for
    memory forms (mod != 0b11) with rm == 0b100.
    """
    if admode_prefix((mode, v["opmode"], v["admode"])) == 16:
        return None
    if v['mod'] == 0b11:
        return None
    if v['rm'] == 0b100:
        return cls.ll
    return None
    # (an unreachable trailing `return v['rm'] == 0b100` was removed)
+
+
class bs_cond_scale(bs_cond):
    """Conditional SIB scale field, present only when a SIB byte is."""
    # flen must return the field length (bits) or None when absent
    ll = 2

    @classmethod
    def flen(cls, mode, v):
        return sib_cond(cls, mode, v)

    def encode(self):
        if self.value is not None:
            return super(bs_cond_scale, self).encode()
        # Field absent: encode it with zero length
        self.value = 0
        self.l = 0
        return True

    def decode(self, v):
        self.value = v
        return True
+
+
class bs_cond_index(bs_cond_scale):
    """Conditional SIB index field (3 bits), present only with a SIB byte."""
    ll = 3

    @classmethod
    def flen(cls, mode, v):
        return sib_cond(cls, mode, v)
+
+
class bs_cond_disp(bs_cond):
    """Conditional displacement field of a modrm/SIB addressing form."""
    # cond must return field len

    @classmethod
    def flen(cls, mode, v):
        # Displacement size in bits as a function of mod/rm (and sib_base),
        # following the 16 bit and 32/64 bit modrm tables; None when absent.
        if admode_prefix((mode, v['opmode'], v['admode'])) == 16:
            if v['mod'] == 0b00:
                if v['rm'] == 0b110:
                    # [disp16] direct addressing form
                    return 16
                else:
                    return None
            elif v['mod'] == 0b01:
                return 8
            elif v['mod'] == 0b10:
                return 16
            return None
        # 32, 64
        if 'sib_base' in v and v['sib_base'] == 0b101:
            # SIB with base 0b101: displacement size depends on mod
            if v['mod'] == 0b00:
                return 32
            elif v['mod'] == 0b01:
                return 8
            elif v['mod'] == 0b10:
                return 32
            else:
                return None

        if v['mod'] == 0b00:
            if v['rm'] == 0b101:
                # [disp32] direct addressing form
                return 32
            else:
                return None
        elif v['mod'] == 0b01:
            return 8
        elif v['mod'] == 0b10:
            return 32
        else:
            return None

    def encode(self):
        if self.value is None:
            # No displacement: encode a zero-length field
            self.value = 0
            self.l = 0
            return True
        self.value = swap_uint(self.l, self.value)
        return True

    def decode(self, v):
        # Sign-extend the raw displacement to the address size
        admode = self.parent.v_admode()
        v = swap_uint(self.l, v)
        self.value = v
        v = sign_ext(v, self.l, admode)
        v = ExprInt(v, admode)
        self.expr = v
        return True
+
+
class bs_cond_imm(bs_cond_scale, x86_arg):
    """Immediate operand whose encoded length depends on the instruction
    fields (w8, se, operand mode), capped at max_size bits."""
    parser = base_expr
    max_size = 32

    def fromstring(self, text, loc_db, parser_result=None):
        if parser_result:
            expr, start, stop = parser_result[self.parser]
        else:
            try:
                expr, start, stop = next(self.parser.scanString(text))
            except StopIteration:
                expr = None
        self.expr = expr

        # Wrap literal values to the first argument's size, or to the
        # operand mode for single-argument instructions
        if len(self.parent.args) > 1:
            l = self.parent.args[0].expr.size
        else:
            l = self.parent.v_opmode()
        if isinstance(self.expr, ExprInt):
            v = int(self.expr)
            mask = ((1 << l) - 1)
            self.expr = ExprInt(v & mask, l)

        if self.expr is None:
            log.debug('cannot fromstring int %r', text)
            return None, None
        return start, stop

    @classmethod
    def flen(cls, mode, v):
        # w8 == 0 -> 8 bit immediate; se == 1 -> sign-extended 8 bit;
        # otherwise the operand size capped at max_size
        if 'w8' not in v or v['w8'] == 1:
            if 'se' in v and v['se'] == 1:
                return 8
            else:
                osize = v_opmode_info(mode, v['opmode'], v['rex_w'], 0)
                osize = min(osize, cls.max_size)
                return osize
        return 8

    def getmaxlen(self):
        return 32

    def encode(self):
        """Generator yielding True for each valid encoding; may try a
        sign-extended 8 bit form before the full-width one."""
        if not isinstance(self.expr, ExprInt):
            return
        arg0_expr = self.parent.args[0].expr
        self.parent.rex_w.value = 0
        # special case for push
        if len(self.parent.args) == 1:
            v = int(self.expr)
            l = self.parent.v_opmode()
            l = min(l, self.max_size)

            self.l = l
            mask = ((1 << self.l) - 1)
            if v != sign_ext(v & mask, self.l, l):
                return
            self.value = swap_uint(self.l, v & ((1 << self.l) - 1))
            yield True
            return

        # assume 2 args; use first arg to guess op size
        if arg0_expr.size == 64:
            self.parent.rex_w.value = 1

        l = self.parent.v_opmode()
        v = int(self.expr)
        if arg0_expr.size == 8:
            if not hasattr(self.parent, 'w8'):
                return
            self.parent.w8.value = 0
            l = 8
            if hasattr(self.parent, 'se'):
                self.parent.se.value = 0
        elif hasattr(self.parent, 'se'):
            if hasattr(self.parent, 'w8'):
                self.parent.w8.value = 1
            # try to generate signed extended version
            if v == sign_ext(v & 0xFF, 8, arg0_expr.size):
                self.parent.se.value = 1
                self.l = 8
                self.value = v & 0xFF
                yield True
            self.parent.se.value = 0
        else:
            if hasattr(self.parent, 'w8'):
                self.parent.w8.value = 1
        if l == 64:
            # Immediates are capped at getmaxlen() bits even in 64 bit mode
            self.l = self.getmaxlen()
        else:
            self.l = l

        mask = ((1 << self.l) - 1)
        if v != sign_ext(v & mask, self.l, l):
            return
        self.value = swap_uint(self.l, v & ((1 << self.l) - 1))
        yield True

    def decode(self, v):
        # swap_uint handles the byte order of the encoded immediate; the
        # value is then sign-extended to the effective operand size
        opmode = self.parent.v_opmode()
        v = swap_uint(self.l, v)
        self.value = v
        l_out = opmode
        if hasattr(self.parent, 'w8') and self.parent.w8.value == 0:
            l_out = 8
        v = sign_ext(v, self.l, l_out)
        self.expr = ExprInt(v, l_out)
        return True
+
+
class bs_cond_imm64(bs_cond_imm):
    """Immediate operand which may reach 64 bits (e.g. MOV r64, imm64)."""
    max_size = 64

    def getmaxlen(self):
        return 64

    @classmethod
    def flen(cls, mode, v):
        # Byte form when the w8 bit selects it, or when sign-extension
        # (se) is requested; otherwise the full operand size.
        if 'w8' in v and v['w8'] != 1:
            return 8
        if v.get('se') == 1:
            return 8
        return v_opmode_info(mode, v['opmode'], v['rex_w'], 0)
+
+
class bs_rel_off(bs_cond_imm):
    """PC-relative branch offset operand."""
    parser = base_expr

    def fromstring(self, text, loc_db, parser_result=None):
        if parser_result:
            expr, start, stop = parser_result[self.parser]
        else:
            try:
                expr, start, stop = next(self.parser.scanString(text))
            except StopIteration:
                # No parse: fail like the sibling operand classes instead
                # of raising NameError on unbound start/stop below.
                self.expr = None
                return None, None
        self.expr = expr
        l = self.parent.mode
        if isinstance(self.expr, ExprInt):
            # Wrap the literal offset to the instruction mode width
            v = int(self.expr)
            mask = ((1 << l) - 1)
            self.expr = ExprInt(v & mask, l)
        return start, stop

    @classmethod
    def flen(cls, mode, v):
        # 16 bit offset in 16 bit operand mode, 32 bit otherwise
        osize = v_opmode_info(mode, v['opmode'], v['rex_w'], 0)
        if osize == 16:
            return 16
        else:
            return 32

    def encode(self):
        if not isinstance(self.expr, ExprInt):
            return
        if self.l == 0:
            self.l = self.parent.v_opmode()
        l = offsize(self.parent)
        prefix = self.parent.gen_prefix()
        # Check the prefix before using it (the original tested None only
        # after len(prefix), which made the check unreachable).
        if prefix is None:
            return
        parent_len = len(prefix) * 8 + self.parent.l + self.l
        assert(parent_len % 8 == 0)

        # The offset is encoded relative to the end of the instruction
        v = int(self.expr) - parent_len // 8
        mask = ((1 << self.l) - 1)
        if self.l > l:
            return
        if v != sign_ext(v & mask, self.l, l):
            return
        self.value = swap_uint(self.l, v & ((1 << self.l) - 1))
        yield True

    def decode(self, v):
        v = swap_uint(self.l, v)
        size = offsize(self.parent)
        v = sign_ext(v, self.l, size)
        # Rebase on the instruction length: offset from instruction start
        v += self.parent.l
        self.expr = ExprInt(v, size)
        return True
+
class bs_s08(bs_rel_off):
    """8-bit sign-extended absolute immediate (no PC-relative rebase)."""
    parser = base_expr

    @classmethod
    def flen(cls, mode, v):
        """Field is always one byte."""
        return 8

    def encode(self):
        """Encode the immediate if it fits sign-extended in self.l bits."""
        if not isinstance(self.expr, ExprInt):
            return
        # (dead local `arg0_expr = self.parent.args[0].expr` removed)
        if self.l != 0:
            l = self.l
        else:
            l = self.parent.v_opmode()
            self.l = l
        l = offsize(self.parent)
        v = int(self.expr)
        mask = ((1 << self.l) - 1)
        if self.l > l:
            return
        # Only encodable if the value survives sign extension back to l bits.
        if v != sign_ext(v & mask, self.l, l):
            return
        self.value = swap_uint(self.l, v & ((1 << self.l) - 1))
        yield True

    def decode(self, v):
        """Decode @v as a sign-extended immediate (contrast bs_rel_off:
        no `parent.l` rebase here)."""
        v = swap_uint(self.l, v)
        size = offsize(self.parent)
        v = sign_ext(v, self.l, size)
        self.expr = ExprInt(v, size)
        return True
+
+
class bs_rel_off08(bs_rel_off):
    """One-byte relative offset (short jumps, loops, jcxz...)."""

    @classmethod
    def flen(cls, mode, v):
        # Fixed single-byte field, independent of the operand size.
        return 8
+
class bs_moff(bsi):
    """Offset half of a far `seg:off` immediate (ptr16:16/32 forms).

    The segment half is handled by bs_msegoff, exposed on the parent as
    the "mseg" field.
    """

    @classmethod
    def flen(cls, mode, v):
        # Offset width follows the effective operand size: 16 or 32 bits.
        osize = v_opmode_info(mode, v['opmode'], v['rex_w'], 0)
        if osize == 16:
            return 16
        else:
            return 32

    def encode(self):
        # Only encodable when the parent parsed a segm(seg, off) operand
        # with a constant offset.
        if not hasattr(self.parent, "mseg"):
            return
        m = self.parent.mseg.expr
        if not (isinstance(m, ExprOp) and m.op == 'segm'):
            return
        if not isinstance(m.args[1], ExprInt):
            return
        l = self.parent.v_opmode()
        if l == 16:
            self.l = 16
        else:
            self.l = 32
        v = int(m.args[1])
        mask = ((1 << self.l) - 1)
        # Reject values that do not survive sign extension to opmode width.
        if v != sign_ext(v & mask, self.l, l):
            return
        self.value = swap_uint(self.l, v & ((1 << self.l) - 1))
        yield True

    def decode(self, v):
        opmode = self.parent.v_opmode()
        # This far-immediate form does not exist in 64-bit mode.
        if opmode == 64:
            return False
        v = swap_uint(self.l, v)
        self.value = v
        v = sign_ext(v, self.l, opmode)
        self.expr = ExprInt(v, opmode)
        return True
+
+
class bs_movoff(x86_arg):
    """Absolute memory-offset operand (MOV accumulator <-> moffs forms)."""
    parser = deref_mem

    def fromstring(self, text, loc_db, parser_result=None):
        """Parse a memory dereference; return (start, stop) or (None, None)."""
        if parser_result:
            e, start, stop = parser_result[self.parser]
            if e is None:
                return None, None
            if not isinstance(e, ExprMem):
                return None, None
            self.expr = e
            if self.expr is None:
                return None, None
            return start, stop
        try:
            v, start, stop = next(self.parser.scanString(text))
        except StopIteration:
            return None, None
        # Bug fix: the original tested the unbound name `e` here (NameError);
        # the parsed expression is v[0], as used below.
        if not isinstance(v[0], ExprMem):
            return None, None
        self.expr = v[0]
        if self.expr is None:
            log.debug('cannot fromstring int %r', text)
            return None, None
        return start, stop

    @classmethod
    def flen(cls, mode, v):
        """Field length follows the address size (64-bit mode: 32 bits
        when an address-size override is present, else 64)."""
        if mode == 64:
            if v['admode']:
                return 32
            else:
                return 64
        asize = v_admode_info(mode, v['admode'])
        return asize

    def encode(self):
        """Encode the absolute address; only constant pointers qualify."""
        p = self.parent
        if not isinstance(self.expr, ExprMem) or not isinstance(self.expr.ptr, ExprInt):
            return
        self.l = p.v_admode()
        v = int(self.expr.ptr)
        mask = ((1 << self.l) - 1)
        if v != mask & v:
            return
        self.value = swap_uint(self.l, v & ((1 << self.l) - 1))
        yield True

    def decode(self, v):
        """Rebuild the ExprMem; access size follows w8/opmode."""
        if self.parent.mode == 64:
            if self.parent.admode == 1:
                l = 32
            else:
                l = 64
        else:
            l = self.parent.v_admode()
        v = swap_uint(self.l, v)
        self.value = v
        v = sign_ext(v, self.l, l)
        v = ExprInt(v, l)
        size = self.parent.v_opmode()
        if self.parent.w8.value == 0:
            # Byte variant of the opcode: 8-bit memory access.
            size = 8
        self.expr = ExprMem(v, size)
        return True
+
+
class bs_msegoff(x86_arg):
    """Segment-selector half of a far `segm(seg, off)` immediate."""
    parser = deref_ptr

    def fromstring(self, text, loc_db, parser_result=None):
        """Parse a seg:off pointer; return (start, stop) or (None, None)."""
        if parser_result:
            e, start, stop = parser_result[self.parser]
            if e is None:
                return None, None
            self.expr = e
            if self.expr is None:
                return None, None
            return start, stop
        try:
            v, start, stop = next(self.parser.scanString(text))
        except StopIteration:
            return None, None
        self.expr = v[0]
        if self.expr is None:
            log.debug('cannot fromstring int %r', text)
            return None, None
        return start, stop

    def encode(self):
        """Encode the constant selector of a segm(seg, off) expression."""
        if not (isinstance(self.expr, ExprOp) and self.expr.op == 'segm'):
            return
        if not isinstance(self.expr.args[0], ExprInt):
            return
        if not isinstance(self.expr.args[1], ExprInt):
            return
        l = self.parent.v_opmode()
        v = int(self.expr.args[0])
        mask = ((1 << self.l) - 1)
        if v != sign_ext(v & mask, self.l, l):
            return
        self.value = swap_uint(self.l, v & ((1 << self.l) - 1))
        yield True

    def decode(self, v):
        """Rebuild segm(selector, offset) from the 16-bit selector @v.

        (Dead local `opmode = self.parent.v_opmode()` removed: the decoded
        selector is always 16 bits wide.)
        """
        v = swap_uint(self.l, v)
        self.value = v
        v = ExprInt(v, 16)
        self.expr = ExprOp('segm', v, self.parent.off.expr)
        return True
+
+
# 0-length "virtual" fields: they consume no opcode bits but carry state
# decoded from (or emitted into) the instruction prefixes.

# REX prefix bits (64-bit mode register/size extensions).
d_rex_p = bs(l=0, cls=(bs_fbit,), fname="rex_p")
d_rex_w = bs(l=0, cls=(bs_fbit,), fname="rex_w")
d_rex_r = bs(l=0, cls=(bs_fbit,), fname="rex_r")
d_rex_x = bs(l=0, cls=(bs_fbit,), fname="rex_x")
d_rex_b = bs(l=0, cls=(bs_fbit,), fname="rex_b")

# VEX prefix bits.
d_vex = bs(l=0, cls=(bs_fbit,), fname="vex")
d_vex_l = bs(l=0, cls=(bs_fbit,), fname="vex_l")
d_vex_p = bs(l=0, cls=(bs_fbit,), fname="vex_p")
d_vex_v = bs(l=0, cls=(bs_fbit,), fname="vex_v")
d_vex_m = bs(l=0, cls=(bs_fbit,), fname="vex_m")

# Opcode escape markers (0x0f / 0x0f38 / 0x0f3a maps).
pref_0f = bs(l=0, fname="pref_0f")
pref_0f38 = bs(l=0, fname="pref_0f38")
pref_0f3a = bs(l=0, fname="pref_0f3a")

# Prefix group 1/2 presence flags.
d_g1 = bs(l=0, cls=(bs_fbit,), fname="g1")
d_g2 = bs(l=0, cls=(bs_fbit,), fname="g2")


# 1-bit field handled by bs_cl1 — presumably the shift-count selector
# (CL vs immediate); confirm against the bs_cl1 class definition.
d_cl1 = bs(l=1, cls=(bs_cl1,), fname="cl1")


# Opcode modifier bits: w8 selects byte vs full-size operation, se marks
# the sign-extended immediate form.
w8 = bs(l=1, fname="w8")
se = bs(l=1, fname="se")

# Sign-extension markers used by the movsx/movsxd encodings below.
sx = bs(l=0, fname="sx")
sxd = bs(l=0, fname="sx")


# SSE/MMX register-context flags.
xmmreg = bs(l=0, fname="xmmreg")
mmreg = bs(l=0, fname="mmreg")

# Mandatory SSE prefixes (matched against the decoded prefix bytes).
pref_f2 = bs(l=0, fname="prefixed", default=b"\xf2")
pref_f3 = bs(l=0, fname="prefixed", default=b"\xf3")
pref_66 = bs(l=0, fname="prefixed", default=b"\x66")
no_xmm_pref = bs(l=0, fname="no_xmm_pref")

no_rex = bs(l=0, fname="no_rex")
no_rep = bs(l=0, fname="no_rep")

# SIB byte fields: scale | index | base.
sib_scale = bs(l=2, cls=(bs_cond_scale,), fname = "sib_scale")
sib_index = bs(l=3, cls=(bs_cond_index,), fname = "sib_index")
sib_base = bs(l=3, cls=(bs_cond_index,), fname = "sib_base")

# Optional displacement following mod/rm.
disp = bs(l=0, cls=(bs_cond_disp,), fname = "disp")

# Immediate fields of various widths.
s08 = bs(l=8, cls=(bs_s08, ))

u08 = bs(l=8, cls=(x86_08, x86_arg))
u07 = bs(l=7, cls=(x86_08, x86_arg))
u16 = bs(l=16, cls=(x86_16, x86_arg))
u32 = bs(l=32, cls=(x86_32, x86_arg))
s3264 = bs(l=32, cls=(x86_s32to64, x86_arg))

# Implicit constant 3 operand (used by the 0xCC INT encoding below).
u08_3 = bs(l=0, cls=(x86_imm_fix_08, x86_arg), ival = 3)

# Fixed /0../7 values of the mod/rm 'reg' field (opcode extensions).
d0 = bs("000", fname='reg')
d1 = bs("001", fname='reg')
d2 = bs("010", fname='reg')
d3 = bs("011", fname='reg')
d4 = bs("100", fname='reg')
d5 = bs("101", fname='reg')
d6 = bs("110", fname='reg')
d7 = bs("111", fname='reg')

# x87 operand-size selector bits (used by the float encodings below).
sd = bs(l=1, fname="sd")
wd = bs(l=1, fname="wd")

# Flag set on call/jmp/leave-style encodings below (fname "stk").
stk = bs(l=0, fname="stk")
+
+
class field_size(object):
    """Map an operand size (opmode) to a field length."""
    prio = default_prio

    def __init__(self, d=None):
        # Guard against the shared-mutable-default pitfall.
        self.d = {} if d is None else d

    def get(self, opm, adm=None):
        """Return the size registered for @opm; @adm is accepted but unused."""
        return self.d[opm]
+
class bs_mem(object):
    """'mod' field constraint accepting only memory forms (mod != 0b11)."""

    def encode(self):
        # mod == 0b11 selects a register-direct operand, which we reject.
        reg_direct = 0b11
        return self.value != reg_direct

    def decode(self, v):
        self.value = v
        return v != 0b11
+
class bs_reg(object):
    """'mod' field constraint accepting only register forms (mod == 0b11)."""

    def encode(self):
        # Only the register-direct form (0b11) is valid.
        reg_direct = 0b11
        return self.value == reg_direct

    def decode(self, v):
        self.value = v
        return v == 0b11
+
# NOTE(review): this first d_imm64 binding is dead code — it is shadowed
# by the bs_cond_imm64-backed definition a few lines below.
d_imm64 = bs(l=0, fname="imm64")

# Implicit register operands.
d_eax = bs(l=0, cls=(bs_eax, ), fname='eax')
d_edx = bs(l=0, cls=(bs_edx, ), fname='edx')
d_st = bs(l=0, cls=(x86_reg_st, ), fname='st')
d_imm = bs(l=0, cls=(bs_cond_imm,), fname="imm")
d_imm64 = bs(l=0, cls=(bs_cond_imm64,), fname="imm")
d_ax = bs(l=0, cls=(r_ax, ), fname='ax')
d_dx = bs(l=0, cls=(r_dx, ), fname='dx')
d_cl = bs(l=0, cls=(r_cl, ), fname='cl')

# Implicit segment register operands.
d_cs = bs(l=0, cls=(bs_cs, ), fname='cs')
d_ds = bs(l=0, cls=(bs_ds, ), fname='ds')
d_es = bs(l=0, cls=(bs_es, ), fname='es')
d_ss = bs(l=0, cls=(bs_ss, ), fname='ss')
d_fs = bs(l=0, cls=(bs_fs, ), fname='fs')
d_gs = bs(l=0, cls=(bs_gs, ), fname='gs')

# Offset must be decoded in last position to have final instruction len
rel_off = bs(l=0, cls=(bs_rel_off,), fname="off", order=-1)
# Offset must be decoded in last position to have final instruction len
rel_off08 = bs(l=8, cls=(bs_rel_off08,), fname="off", order=-1)
moff = bs(l=0, cls=(bs_moff,), fname="off")
msegoff = bs(l=16, cls=(bs_msegoff,), fname="mseg")
movoff = bs(l=0, cls=(bs_movoff,), fname="off")
# mod/rm 'mod' field, optionally constrained to memory or register forms.
mod = bs(l=2, fname="mod")
mod_mem = bs(l=2, cls=(bs_mem,), fname="mod")
mod_reg = bs(l=2, cls=(bs_reg,), fname="mod")

# mod/rm 'reg' field variants.
rmreg = bs(l=3, cls=(x86_rm_reg, ), order =1, fname = "reg")
reg = bs(l=3, cls=(x86_reg, ), order =1, fname = "reg")

reg_modrm = bs(l=3, cls=(x86_reg_modrm, ), order =1, fname = "reg")

vex_reg = bs(l=0, cls=(x86_vex_reg, ), order =1, fname = "vex_reg")

regnoarg = bs(l=3, default_val="000", order=1, fname="reg")
segm = bs(l=3, cls=(x86_rm_segm, ), order =1, fname = "reg")
crreg = bs(l=3, cls=(x86_rm_cr, ), order =1, fname = "reg")
drreg = bs(l=3, cls=(x86_rm_dr, ), order =1, fname = "reg")


mm_reg = bs(l=3, cls=(x86_rm_reg_mm, ), order =1, fname = "reg")
xmm_reg = bs(l=3, cls=(x86_rm_reg_xmm, ), order =1, fname = "reg")
bnd_reg = bs(l=3, cls=(x86_rm_reg_bnd, ), order =1, fname = "reg")


fltreg = bs(l=3, cls=(x86_rm_flt, ), order =1, fname = "reg")

rm = bs(l=3, fname="rm")

# mod/rm 'rm' operand variants; the suffix encodes the operand kind/size.
rm_arg = bs(l=0, cls=(x86_rm_arg,), fname='rmarg')
rm_arg_w8 = bs(l=0, cls=(x86_rm_w8,), fname='rmarg')
rm_arg_sx = bs(l=0, cls=(x86_rm_sx,), fname='rmarg')
rm_arg_sxd = bs(l=0, cls=(x86_rm_sxd,), fname='rmarg')
rm_arg_sd = bs(l=0, cls=(x86_rm_sd,), fname='rmarg')
rm_arg_wd = bs(l=0, cls=(x86_rm_wd,), fname='rmarg')
rm_arg_08 = bs(l=0, cls=(x86_rm_08,), fname='rmarg')
rm_arg_reg_m08 = bs(l=0, cls=(x86_rm_reg_m08,), fname='rmarg')
rm_arg_reg_m16 = bs(l=0, cls=(x86_rm_reg_m16,), fname='rmarg')
rm_arg_m08 = bs(l=0, cls=(x86_rm_m08,), fname='rmarg')
rm_arg_m64 = bs(l=0, cls=(x86_rm_m64,), fname='rmarg')
rm_arg_m80 = bs(l=0, cls=(x86_rm_m80,), fname='rmarg')
rm_arg_m16 = bs(l=0, cls=(x86_rm_m16,), fname='rmarg')

rm_mem = bs(l=0, cls=(x86_rm_mem,), fname='rmarg')
rm_mem_far = bs(l=0, cls=(x86_rm_mem_far,), fname='rmarg')

rm_arg_mm = bs(l=0, cls=(x86_rm_mm,), fname='rmarg')
rm_arg_mm_m64 = bs(l=0, cls=(x86_rm_mm_m64,), fname='rmarg')
rm_arg_mm_reg = bs(l=0, cls=(x86_rm_mm_reg,), fname='rmarg')

rm_arg_xmm = bs(l=0, cls=(x86_rm_xmm,), fname='rmarg')
rm_arg_xmm_m32 = bs(l=0, cls=(x86_rm_xmm_m32,), fname='rmarg')
rm_arg_xmm_m64 = bs(l=0, cls=(x86_rm_xmm_m64,), fname='rmarg')
rm_arg_xmm_m128 = bs(l=0, cls=(x86_rm_xmm_m128,), fname='rmarg')
rm_arg_xmm_reg = bs(l=0, cls=(x86_rm_xmm_reg,), fname='rmarg')

rm_arg_bnd = bs(l=0, cls=(x86_rm_bnd,), fname='rmarg')
rm_arg_bnd_m64 = bs(l=0, cls=(x86_rm_bnd_m64,), fname='rmarg')
rm_arg_bnd_m128 = bs(l=0, cls=(x86_rm_bnd_m128,), fname='rmarg')
rm_arg_bnd_reg = bs(l=0, cls=(x86_rm_bnd_reg,), fname='rmarg')


# Direction bit: selects which of the two operands is source/destination.
swapargs = bs_swapargs(l=1, fname="swap", mn_mod=list(range(1 << 1)))
+
+
class bs_op_mode(bsi):
    """Match only when the effective operand size equals this field's
    `mode` attribute."""

    def decode(self, v):
        # Accept the encoding only under the expected operand size.
        return self.parent.v_opmode() == self.mode
+
+
class bs_ad_mode(bsi):
    """Match only when the effective address size equals this field's
    `mode` attribute."""

    def decode(self, v):
        # Accept the encoding only under the expected address size.
        return self.parent.v_admode() == self.mode
+
+
class bs_op_mode_no64(bsi):
    """Operand-size selector for encodings that are invalid in 64-bit mode."""

    def encode(self):
        # Never encodable when assembling for 64-bit mode.
        if self.parent.mode == 64:
            return False
        return super(bs_op_mode_no64, self).encode()

    def decode(self, v):
        if self.parent.mode == 64:
            return False
        return self.parent.v_opmode() == self.mode
+
+
class bs_op_mode64(bsi):
    """Field that only matches when assembling/disassembling 64-bit mode."""

    def encode(self):
        if self.parent.mode != 64:
            return False
        return super(bs_op_mode64, self).encode()

    def decode(self, v):
        # Collapse the original if/return pair into one comparison.
        return self.parent.mode == 64
+
class bs_op_modeno64(bsi):
    """Field that only matches outside 64-bit mode."""

    def encode(self):
        if self.parent.mode == 64:
            return False
        return super(bs_op_modeno64, self).encode()

    def decode(self, v):
        # Collapse the original if/return pair into one comparison.
        return self.parent.mode != 64
+
+
+
# Operand-size selectors: pick the encoding matching the effective opmode.
bs_opmode16 = bs(l=0, cls=(bs_op_mode,), mode = 16, fname="fopmode")
bs_opmode32 = bs(l=0, cls=(bs_op_mode,), mode = 32, fname="fopmode")
bs_opmode64 = bs(l=0, cls=(bs_op_mode,), mode = 64, fname="fopmode")


# Address-size selectors (e.g. jcxz/jecxz/jrcxz below).
bs_admode16 = bs(l=0, cls=(bs_ad_mode,), mode = 16, fname="fadmode")
bs_admode32 = bs(l=0, cls=(bs_ad_mode,), mode = 32, fname="fadmode")
bs_admode64 = bs(l=0, cls=(bs_ad_mode,), mode = 64, fname="fadmode")

# Operand-size selectors for encodings invalid in 64-bit mode.
bs_opmode16_no64 = bs(l=0, cls=(bs_op_mode_no64,), mode = 16, fname="fopmode")
bs_opmode32_no64 = bs(l=0, cls=(bs_op_mode_no64,), mode = 32, fname="fopmode")

# Mode gates: restrict an encoding to (non-)64-bit mode.
bs_mode64 = bs(l=0, cls=(bs_op_mode64,))
bs_modeno64 = bs(l=0, cls=(bs_op_modeno64,))


# Condition-code suffixes for Jcc/SETcc/CMOVcc mnemonic generation.
cond_list = ["O", "NO", "B", "AE",
             "Z", "NZ", "BE", "A",
             "S", "NS", "PE", "NP",
             #"L", "NL", "NG", "G"]
             "L", "GE", "LE", "G"]
cond = bs_mod_name(l=4, fname='cond', mn_mod=cond_list)
+
+
def rmmod(r, rm_arg_x=rm_arg, modrm=mod):
    """Build the canonical mod/rm field list: the modrm byte fields, the
    optional SIB byte fields, the displacement and the rm operand.

    The defaults deliberately reuse the shared module-level field
    singletons (rm_arg, mod).
    """
    fields = [modrm, r, rm]
    fields.extend([sib_scale, sib_index, sib_base, disp])
    fields.append(rm_arg_x)
    return fields
+
+#
+# mode | reg | rm #
+#
+
+#
+# scale | index | base #
+#
+
+#
+# Prefix | REX prefix | Opcode | mod/rm | sib | displacement | immediate #
+#
+
+
def addop(name, fields, args=None, alias=False):
    """Register an x86 mnemonic by creating a new subclass of mn_x86.

    @name: mnemonic name (also used as the class name)
    @fields: bit-field list describing the encoding
    @args: optional explicit argument ordering
    @alias: mark the encoding as an alias of another one
    """
    attributes = {"fields": fields, "alias": alias}
    if args is not None:
        attributes["args"] = args
    # The mn_x86 metaclass registers the generated type in the
    # encoding/decoding tables as a side effect.
    type(name, (mn_x86,), attributes)
+"""
+class ia32_aaa(mn_x86):
+    fields = [bs8(0x37)]
+"""
+addop("aaa", [bs8(0x37)])
+addop("aas", [bs8(0x3F)])
+addop("aad", [bs8(0xd5), u08])
+addop("aam", [bs8(0xd4), u08])
+
# ALU ops come in three encodings each: accumulator+imm, modrm+imm
# (opcode extension /n in the reg field), and reg<->modrm with a
# direction (swapargs) bit.
addop("adc", [bs("0001010"), w8, d_eax, d_imm])
addop("adc", [bs("100000"), se, w8] + rmmod(d2, rm_arg_w8) + [d_imm])
addop("adc", [bs("000100"), swapargs, w8] +
      rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])

addop("add", [bs("0000010"), w8, d_eax, d_imm])
addop("add", [bs("100000"), se, w8] + rmmod(d0, rm_arg_w8) + [d_imm])
addop("add", [bs("000000"), swapargs, w8] +
      rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])

addop("and", [bs("0010010"), w8, d_eax, d_imm])
addop("and", [bs("100000"), se, w8] + rmmod(d4, rm_arg_w8) + [d_imm])
addop("and", [bs("001000"), swapargs, w8] +
      rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])
+
# MPX BNDMOV: bound registers are 64 bits outside 64-bit mode, 128 bits
# in 64-bit mode; 0x1a loads, 0x1b stores.
addop("bndmov", [bs8(0x0f), bs8(0x1a), pref_66, bs_modeno64] +
      rmmod(bnd_reg, rm_arg_bnd_m64), [bnd_reg, rm_arg_bnd_m64])
addop("bndmov", [bs8(0x0f), bs8(0x1a), pref_66, bs_mode64] +
      rmmod(bnd_reg, rm_arg_bnd_m128), [bnd_reg, rm_arg_bnd_m128])
addop("bndmov", [bs8(0x0f), bs8(0x1b), pref_66, bs_modeno64] +
      rmmod(bnd_reg, rm_arg_bnd_m64), [rm_arg_bnd_m64, bnd_reg])
addop("bndmov", [bs8(0x0f), bs8(0x1b), pref_66, bs_mode64] +
      rmmod(bnd_reg, rm_arg_bnd_m128), [rm_arg_bnd_m128, bnd_reg])
+
addop("bsf", [bs8(0x0f), bs8(0xbc), no_rep] + rmmod(rmreg))
# Consistency fix: use the rmmod() helper instead of spelling out its
# exact expansion (mod, reg, rm, sib, disp, rm_arg) by hand.
addop("bsr", [bs8(0x0f), bs8(0xbd)] + rmmod(rmreg))
+
addop("bswap", [bs8(0x0f), bs('11001'), reg])

# Bit test ops: reg form, and imm8 form via 0x0f 0xba opcode extension.
addop("bt", [bs8(0x0f), bs8(0xa3)] + rmmod(rmreg), [rm_arg, rmreg])
addop("bt", [bs8(0x0f), bs8(0xba)] + rmmod(d4) + [u08])
addop("btc", [bs8(0x0f), bs8(0xbb)] + rmmod(rmreg), [rm_arg, rmreg])
addop("btc", [bs8(0x0f), bs8(0xba)] + rmmod(d7) + [u08])


addop("btr", [bs8(0x0f), bs8(0xb3)] + rmmod(rmreg), [rm_arg, rmreg])
addop("btr", [bs8(0x0f), bs8(0xba)] + rmmod(d6) + [u08])
addop("bts", [bs8(0x0f), bs8(0xab)] + rmmod(rmreg), [rm_arg, rmreg])
addop("bts", [bs8(0x0f), bs8(0xba)] + rmmod(d5) + [u08])

# CALL: near relative, near indirect, far indirect, far immediate (no64).
addop("call", [bs8(0xe8), rel_off])
addop("call", [bs8(0xff), stk] + rmmod(d2))
addop("call", [bs8(0xff), stk] + rmmod(d3, rm_arg_x=rm_mem_far, modrm=mod_mem))
addop("call", [bs8(0x9a), bs_modeno64, moff, msegoff])
+
# Same opcode, mnemonic chosen by operand size.
addop("cbw", [bs8(0x98), bs_opmode16])
addop("cwde", [bs8(0x98), bs_opmode32])
addop("cdqe", [bs8(0x98), bs_opmode64])

addop("clc", [bs8(0xf8)])
addop("cld", [bs8(0xfc)])
addop("cli", [bs8(0xfa)])
addop("clts", [bs8(0x0f), bs8(0x06)])
addop("cmc", [bs8(0xf5)])

# CMOVcc: condition encoded in the low 4 opcode bits (cond field).
addop("cmov", [bs8(0x0f), bs('0100'), cond] + rmmod(rmreg))

addop("cmp", [bs("0011110"), w8, d_eax, d_imm])
addop("cmp", [bs("100000"), se, w8] + rmmod(d7, rm_arg_w8) + [d_imm])
addop("cmp", [bs("001110"), swapargs, w8] +
      rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])


addop("cmpsb", [bs8(0xa6)])
addop("cmpsw", [bs8(0xa7), bs_opmode16])
addop("cmpsd", [bs8(0xa7), bs_opmode32])
addop("cmpsq", [bs8(0xa7), bs_opmode64])

addop("cmpxchg", [bs8(0x0f), bs('1011000'), w8]
      + rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])
addop("cmpxchg8b", [bs8(0x0f), bs8(0xc7), bs_opmode16] + rmmod(d1, rm_arg_m64))
addop("cmpxchg8b", [bs8(0x0f), bs8(0xc7), bs_opmode32] + rmmod(d1, rm_arg_m64))
addop("cmpxchg16b", [bs8(0x0f), bs8(0xc7), bs_opmode64] + rmmod(d1, rm_arg_xmm_m128))

# XXX TODO CMPXCHG8/16

addop("comiss", [bs8(0x0f), bs8(0x2f), no_xmm_pref] +
      rmmod(xmm_reg, rm_arg_xmm_m32), [xmm_reg, rm_arg_xmm_m32])
addop("comisd", [bs8(0x0f), bs8(0x2f), pref_66] +
      rmmod(xmm_reg, rm_arg_xmm_m64), [xmm_reg, rm_arg_xmm_m64])

addop("cpuid", [bs8(0x0f), bs8(0xa2)])

# Same opcode, mnemonic chosen by operand size.
addop("cwd", [bs8(0x99), bs_opmode16])
addop("cdq", [bs8(0x99), bs_opmode32])
addop("cqo", [bs8(0x99), bs_opmode64])


addop("daa", [bs8(0x27)])
addop("das", [bs8(0x2f)])
addop("dec", [bs('1111111'), w8] + rmmod(d1, rm_arg_w8))
addop("dec", [bs('01001'), reg, bs_modeno64])
addop("div", [bs('1111011'), w8] + rmmod(d6, rm_arg_w8))
addop("enter", [bs8(0xc8), u16, u08])
+
# float ##### x87 FPU instructions (plus a few BMI ops interleaved below)
addop("fwait", [bs8(0x9b)])

addop("f2xm1", [bs8(0xd9), bs8(0xf0)])
addop("fabs", [bs8(0xd9), bs8(0xe1)])

addop("fadd", [bs("11011"), sd, bs("00")] + rmmod(d0, rm_arg_sd))
addop("fadd", [bs("11011"), swapargs, bs("00"),
      bs("11000"), d_st, fltreg], [d_st, fltreg])
addop("faddp", [bs8(0xde), bs("11000"), fltreg, d_st])
addop("fiadd", [bs("11011"), wd, bs("10")] + rmmod(d0, rm_arg_wd))

addop("fbld", [bs8(0xdf)] + rmmod(d4, rm_arg_m80))
addop("fbldp", [bs8(0xdf)] + rmmod(d6, rm_arg_m80))
addop("fchs", [bs8(0xd9), bs8(0xe0)])
# addop("fclex", [bs8(0x9b), bs8(0xdb), bs8(0xe2)])
addop("fnclex", [bs8(0xdb), bs8(0xe2)])

addop("fcmovb", [bs8(0xda), bs("11000"), d_st, fltreg])
addop("fcmove", [bs8(0xda), bs("11001"), d_st, fltreg])
addop("fcmovbe", [bs8(0xda), bs("11010"), d_st, fltreg])
addop("fcmovu", [bs8(0xda), bs("11011"), d_st, fltreg])
addop("fcmovnb", [bs8(0xdb), bs("11000"), d_st, fltreg])
addop("fcmovne", [bs8(0xdb), bs("11001"), d_st, fltreg])
addop("fcmovnbe", [bs8(0xdb), bs("11010"), d_st, fltreg])
addop("fcmovnu", [bs8(0xdb), bs("11011"), d_st, fltreg])

addop("fcom", [bs("11011"), sd, bs("00")] + rmmod(d2, rm_arg_sd))
addop("fcom", [bs("11011"), swapargs, bs("00"),
      bs("11010"), d_st, fltreg], [d_st, fltreg])
addop("fcomp", [bs("11011"), sd, bs("00")] + rmmod(d3, rm_arg_sd))
addop("fcomp",
      [bs("11011"), swapargs, bs("00"), bs("11011"),
      d_st, fltreg], [d_st, fltreg])
addop("fcompp", [bs8(0xde), bs8(0xd9)])

addop("fcomi", [bs8(0xdb), bs("11110"), d_st, fltreg])
addop("fcomip", [bs8(0xdf), bs("11110"), d_st, fltreg])
addop("fucomi", [bs8(0xdb), bs("11101"), d_st, fltreg])
addop("fucomip", [bs8(0xdf), bs("11101"), d_st, fltreg])

addop("fcos", [bs8(0xd9), bs8(0xff)])
addop("fdecstp", [bs8(0xd9), bs8(0xf6)])


addop("fdiv", [bs("11011"), sd, bs("00")] + rmmod(d6, rm_arg_sd))
addop("fdiv", [bs8(0xd8), bs("11110"), d_st, fltreg])
addop("fdiv", [bs8(0xdc), bs("11111"), fltreg, d_st])
addop("fdivp", [bs8(0xde), bs("11111"), fltreg, d_st])
addop("fidiv", [bs("11011"), wd, bs("10")] + rmmod(d6, rm_arg_wd))

addop("fdivr", [bs("11011"), sd, bs("00")] + rmmod(d7, rm_arg_sd))
addop("fdivr", [bs8(0xd8), bs("11111"), d_st, fltreg])
addop("fdivr", [bs8(0xdc), bs("11110"), fltreg, d_st])
addop("fdivrp", [bs8(0xde), bs("11110"), fltreg, d_st])
addop("fidivr", [bs("11011"), wd, bs("10")] + rmmod(d7, rm_arg_wd))

addop("ffree", [bs8(0xdd), bs("11000"), fltreg])
addop("ficom", [bs("11011"), wd, bs("10")] + rmmod(d2, rm_arg_wd))
addop("ficomp", [bs("11011"), wd, bs("10")] + rmmod(d3, rm_arg_wd))
addop("fild", [bs("11011"), wd, bs("11")] + rmmod(d0, rm_arg_wd))
addop("fild", [bs8(0xdf)] + rmmod(d5, rm_arg_m64))

addop("fincstp", [bs8(0xd9), bs8(0xf7)])

# BMI1/BMI2 bit-manipulation instructions (VEX-encoded, 0f38 map).
addop("blsi", [pref_0f38, bs8(0xf3), vex_reg] + rmmod(bs("011"), rm_arg), [vex_reg, rm_arg])
addop("andn", [pref_0f38, bs8(0xf2), vex_reg] + rmmod(rmreg, rm_arg), [rmreg, vex_reg, rm_arg])
addop("bextr", [pref_0f38, bs8(0xf7), vex_reg] + rmmod(rmreg, rm_arg), [rmreg, rm_arg, vex_reg])
addop("blsmsk", [pref_0f38, bs8(0xf3), vex_reg] + rmmod(bs("010"), rm_arg), [vex_reg, rm_arg])
addop("blsr", [pref_0f38, bs8(0xf3), vex_reg] + rmmod(bs("001"), rm_arg), [vex_reg, rm_arg])
addop("bzhi", [pref_0f38, bs8(0xf5), vex_reg] + rmmod(rmreg, rm_arg), [rmreg, rm_arg, vex_reg])
addop("tzcnt", [bs8(0x0f), bs8(0xbc), pref_f3] + rmmod(rmreg, rm_arg), [rmreg, rm_arg])

# addop("finit", [bs8(0x9b), bs8(0xdb), bs8(0xe3)])
addop("fninit", [bs8(0xdb), bs8(0xe3)])
+
addop("fist", [bs("11011"), wd, bs("11")] + rmmod(d2, rm_arg_wd))
addop("fistp", [bs("11011"), wd, bs("11")] + rmmod(d3, rm_arg_wd))
addop("fistp", [bs8(0xdf)] + rmmod(d7, rm_arg_m64))

addop("fisttp", [bs("11011"), wd, bs("11")] + rmmod(d1, rm_arg_wd))
addop("fisttp", [bs8(0xdd)] + rmmod(d1, rm_arg_m64))

addop("fld", [bs("11011"), sd, bs("01")] + rmmod(d0, rm_arg_sd))
addop("fld", [bs8(0xdb)] + rmmod(d5, rm_arg_m80))
addop("fld", [bs8(0xd9), bs("11000"), fltreg])

# x87 constant loads.
addop("fld1", [bs8(0xd9), bs8(0xe8)])
addop("fldl2t", [bs8(0xd9), bs8(0xe9)])
addop("fldl2e", [bs8(0xd9), bs8(0xea)])
addop("fldpi", [bs8(0xd9), bs8(0xeb)])
addop("fldlg2", [bs8(0xd9), bs8(0xec)])
addop("fldln2", [bs8(0xd9), bs8(0xed)])
addop("fldz", [bs8(0xd9), bs8(0xee)])

addop("fldcw", [bs8(0xd9)] + rmmod(d5, rm_arg_m16))
addop("fldenv", [bs8(0xd9)] + rmmod(d4, rm_arg_m80))  # XXX TODO: m14?

addop("fmul", [bs("11011"), sd, bs("00")] + rmmod(d1, rm_arg_sd))
addop("fmul", [bs("11011"), swapargs, bs("00"),
      bs("11001"), d_st, fltreg], [d_st, fltreg])
addop("fmulp", [bs8(0xde), bs("11001"), fltreg, d_st])
addop("fimul", [bs("11011"), wd, bs("10")] + rmmod(d1, rm_arg_wd))

addop("fnop", [bs8(0xd9), bs8(0xd0)])
addop("fpatan", [bs8(0xd9), bs8(0xf3)])
addop("fprem", [bs8(0xd9), bs8(0xf8)])
addop("fprem1", [bs8(0xd9), bs8(0xf5)])
addop("fptan", [bs8(0xd9), bs8(0xf2)])
addop("frndint", [bs8(0xd9), bs8(0xfc)])
addop("frstor", [bs8(0xdd)] + rmmod(d4, rm_arg_m80))  # XXX TODO: m94 ?
# addop("fsave", [bs8(0x9b), bs8(0xdd)] + rmmod(d6, rm_arg_m80)) # XXX
# TODO: m94 ?
addop("fnsave", [bs8(0xdd)] + rmmod(d6, rm_arg_m80))  # XXX TODO: m94 ?

addop("fscale", [bs8(0xd9), bs8(0xfd)])
addop("fsin", [bs8(0xd9), bs8(0xfe)])
addop("fsincos", [bs8(0xd9), bs8(0xfb)])
addop("fsqrt", [bs8(0xd9), bs8(0xfa)])

addop("fst", [bs("11011"), sd, bs("01")] + rmmod(d2, rm_arg_sd))
addop("fst", [bs8(0xdd), bs("11010"), fltreg])
addop("fstp", [bs("11011"), sd, bs("01")] + rmmod(d3, rm_arg_sd))
addop("fstp", [bs8(0xdb)] + rmmod(d7, rm_arg_m80))
addop("fstp", [bs8(0xdd), bs("11011"), fltreg])

# addop("fstcw", [bs8(0x9b), bs8(0xd9)] + rmmod(d7, rm_arg_m16))
addop("fnstcw", [bs8(0xd9)] + rmmod(d7, rm_arg_m16))
# addop("fstenv", [bs8(0x9b), bs8(0xd9)] + rmmod(d6, rm_arg_m80)) # XXX
# TODO: m14?
addop("fnstenv", [bs8(0xd9)] + rmmod(d6, rm_arg_m80))  # XXX TODO: m14?
# addop("fstsw", [bs8(0x9b), bs8(0xdd)] + rmmod(d7, rm_arg_m16))
addop("fnstsw", [bs8(0xdd)] + rmmod(d7, rm_arg_m16))
# addop("fstsw", [bs8(0x9b), bs8(0xdf), bs8(0xe0), d_ax])
addop("fnstsw", [bs8(0xdf), bs8(0xe0), d_ax])

addop("fsub", [bs("11011"), sd, bs("00")] + rmmod(d4, rm_arg_sd))
addop("fsub", [bs8(0xd8), bs("11100"), d_st, fltreg])
addop("fsub", [bs8(0xdc), bs("11101"), fltreg, d_st])
addop("fsubp", [bs8(0xde), bs("11101"), fltreg, d_st])
addop("fisub", [bs("11011"), wd, bs("10")] + rmmod(d4, rm_arg_wd))

addop("fsubr", [bs("11011"), sd, bs("00")] + rmmod(d5, rm_arg_sd))
addop("fsubr", [bs8(0xd8), bs("11101"), d_st, fltreg])
addop("fsubr", [bs8(0xdc), bs("11100"), fltreg, d_st])
addop("fsubrp", [bs8(0xde), bs("11100"), fltreg, d_st])
addop("fisubr", [bs("11011"), wd, bs("10")] + rmmod(d5, rm_arg_wd))
addop("ftst", [bs8(0xd9), bs8(0xe4)])


addop("fucom", [bs8(0xdd), bs("11100"), fltreg])
addop("fucomp", [bs8(0xdd), bs("11101"), fltreg])
addop("fucompp", [bs8(0xda), bs8(0xe9)])

addop("fxam", [bs8(0xd9), bs8(0xe5)])
addop("fxch", [bs8(0xd9), bs("11001"), fltreg])
addop("fxrstor", [bs8(0x0f), bs8(0xae)]
      + rmmod(d1, rm_arg_m80))  # XXX TODO m512
addop("fxsave", [bs8(0x0f), bs8(0xae)]
      + rmmod(d0, rm_arg_m80))  # XXX TODO m512
addop("stmxcsr", [bs8(0x0f), bs8(0xae)] + rmmod(d3))
addop("ldmxcsr", [bs8(0x0f), bs8(0xae)] + rmmod(d2))

addop("fxtract", [bs8(0xd9), bs8(0xf4)])
addop("fyl2x", [bs8(0xd9), bs8(0xf1)])
addop("fyl2xp1", [bs8(0xd9), bs8(0xf9)])

addop("hlt", [bs8(0xf4)])
addop("icebp", [bs8(0xf1)])
+
addop("idiv", [bs('1111011'), w8] + rmmod(d7, rm_arg_w8))

# IMUL: one-, two- and three-operand (with immediate) forms.
addop("imul", [bs('1111011'), w8] + rmmod(d5, rm_arg_w8))
addop("imul", [bs8(0x0f), bs8(0xaf)] + rmmod(rmreg))

addop("imul", [bs("011010"), se, bs('1')] + rmmod(rmreg) + [d_imm])

addop("in", [bs("1110010"), w8, d_eax, u08])
addop("in", [bs("1110110"), w8, d_eax, d_edx])

addop("inc", [bs('1111111'), w8] + rmmod(d0, rm_arg_w8))
addop("inc", [bs('01000'), reg, bs_modeno64])

addop("insb", [bs8(0x6c)])
addop("insw", [bs8(0x6d), bs_opmode16])
addop("insd", [bs8(0x6d), bs_opmode32])
addop("insd", [bs8(0x6d), bs_opmode64])

# 0xCC decodes as INT 0x3 via the implicit u08_3 operand.
addop("int", [bs8(0xcc), u08_3])
addop("int", [bs8(0xcd), u08])
addop("into", [bs8(0xce)])
addop("invd", [bs8(0x0f), bs8(0x08)])
addop("invlpg", [bs8(0x0f), bs8(0x01)] + rmmod(d7))

addop("iret", [bs8(0xcf), bs_opmode16])
addop("iretd", [bs8(0xcf), bs_opmode32])
addop("iretq", [bs8(0xcf), bs_opmode64])

# Jcc short form: condition in the low 4 opcode bits.
addop("j", [bs('0111'), cond, rel_off08])

# JCXZ/JECXZ/JRCXZ share 0xe3; mnemonic chosen by address size.
addop("jcxz", [bs8(0xe3), rel_off08, bs_admode16])
addop("jecxz", [bs8(0xe3), rel_off08, bs_admode32])
addop("jrcxz", [bs8(0xe3), rel_off08, bs_admode64])

# Jcc near form (0x0f 0x8x).
addop("j", [bs8(0x0f), bs('1000'), cond, rel_off])
addop("jmp", [bs8(0xeb), rel_off08])
addop("jmp", [bs8(0xe9), rel_off])
# TODO XXX replace stk force64?
addop("jmp", [bs8(0xff), stk] + rmmod(d4))
addop("jmp", [bs8(0xea), bs_modeno64, moff, msegoff])

addop("jmp", [bs8(0xff)] + rmmod(d5, rm_arg_x=rm_mem_far, modrm=mod_mem))

addop("lahf", [bs8(0x9f)])
addop("lar", [bs8(0x0f), bs8(0x02)] + rmmod(rmreg))

# LEA and far-pointer loads only accept memory operands (mod_mem).
addop("lea", [bs8(0x8d)] + rmmod(rmreg, rm_arg_x=rm_mem, modrm=mod_mem))
addop("les", [bs8(0xc4)] + rmmod(rmreg, rm_arg_x=rm_mem, modrm=mod_mem))
addop("lds", [bs8(0xc5)] + rmmod(rmreg, rm_arg_x=rm_mem, modrm=mod_mem))
addop("lss", [bs8(0x0f), bs8(0xb2)] + rmmod(rmreg, rm_arg_x=rm_mem, modrm=mod_mem))
addop("lfs", [bs8(0x0f), bs8(0xb4)] + rmmod(rmreg, rm_arg_x=rm_mem, modrm=mod_mem))
addop("lgs", [bs8(0x0f), bs8(0xb5)] + rmmod(rmreg, rm_arg_x=rm_mem, modrm=mod_mem))

addop("lgdt", [bs8(0x0f), bs8(0x01)] + rmmod(d2, modrm=mod_mem))
addop("lidt", [bs8(0x0f), bs8(0x01)] + rmmod(d3, modrm=mod_mem))

addop("lfence", [bs8(0x0f), bs8(0xae), bs8(0xe8), no_xmm_pref])
addop("mfence", [bs8(0x0f), bs8(0xae), bs8(0xf0)])
addop("sfence", [bs8(0x0f), bs8(0xae), bs8(0xf8)])

addop("leave", [bs8(0xc9), stk])

addop("lodsb", [bs8(0xac)])
addop("lodsw", [bs8(0xad), bs_opmode16])
addop("lodsd", [bs8(0xad), bs_opmode32])
addop("lodsq", [bs8(0xad), bs_opmode64])

addop("loop", [bs8(0xe2), rel_off08])
addop("loope", [bs8(0xe1), rel_off08])
addop("loopne", [bs8(0xe0), rel_off08])
addop("lsl", [bs8(0x0f), bs8(0x03)] + rmmod(rmreg))
addop("monitor", [bs8(0x0f), bs8(0x01), bs8(0xc8)])
+
# MOV family: reg/modrm, segment, moffs, imm-to-reg, imm-to-modrm,
# and control/debug register forms.
addop("mov", [bs("100010"), swapargs, w8] +
      rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])
addop("mov", [bs("100011"), swapargs, bs('0')] + rmmod(segm), [rm_arg, segm])
addop("mov", [bs("101000"), swapargs, w8, d_eax, movoff], [d_eax, movoff])
addop("mov", [bs("1011"), w8, reg, d_imm64])
addop("mov", [bs("1100011"), w8] + rmmod(d0, rm_arg_w8) + [d_imm])
addop("mov", [bs8(0x0f), bs("001000"), swapargs, bs('0')]
      + rmmod(crreg), [rm_arg, crreg])
addop("mov", [bs8(0x0f), bs("001000"), swapargs, bs('1')]
      + rmmod(drreg), [rm_arg, drreg])
addop("movsb", [bs8(0xa4)])
addop("movsw", [bs8(0xa5), bs_opmode16])
addop("movsd", [bs8(0xa5), bs_opmode32])
addop("movsq", [bs8(0xa5), bs_opmode64])

addop("movsx", [bs8(0x0f), bs("1011111"), w8, sx] + rmmod(rmreg, rm_arg_sx))
addop("movsxd", [bs8(0x63), sxd, bs_mode64] + rmmod(rmreg, rm_arg_sxd))

# SSE moves: the mandatory prefix (none/F2/F3/66) selects the variant.
addop("movups", [bs8(0x0f), bs("0001000"), swapargs, no_xmm_pref] +
      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
addop("movsd", [bs8(0x0f), bs("0001000"), swapargs, pref_f2]
      + rmmod(xmm_reg, rm_arg_xmm_m64), [xmm_reg, rm_arg_xmm_m64])
addop("movss", [bs8(0x0f), bs("0001000"), swapargs, pref_f3] +
      rmmod(xmm_reg, rm_arg_xmm_m32), [xmm_reg, rm_arg_xmm_m32])
addop("movupd", [bs8(0x0f), bs8(0x10), pref_66] + rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
addop("movupd", [bs8(0x0f), bs8(0x11), pref_66] + rmmod(xmm_reg, rm_arg_xmm), [rm_arg_xmm, xmm_reg])


addop("movd", [bs8(0x0f), bs('011'), swapargs, bs('1110'), no_xmm_pref] +
      rmmod(mm_reg, rm_arg), [mm_reg, rm_arg])
addop("movd", [bs8(0x0f), bs('011'), swapargs, bs('1110'), pref_66, bs_opmode32] +
      rmmod(xmm_reg, rm_arg), [xmm_reg, rm_arg])
addop("movq", [bs8(0x0f), bs('011'), swapargs, bs('1110'), pref_66, bs_opmode64] +
      rmmod(xmm_reg, rm_arg), [xmm_reg, rm_arg])

addop("movq", [bs8(0x0f), bs('011'), swapargs, bs('1111'), no_xmm_pref] +
      rmmod(mm_reg, rm_arg_mm_m64), [mm_reg, rm_arg_mm_m64])

addop("movq", [bs8(0x0f), bs8(0x7e), pref_f3] +
      rmmod(xmm_reg, rm_arg_xmm_m64), [xmm_reg, rm_arg_xmm_m64])
addop("movq", [bs8(0x0f), bs8(0xd6), pref_66] +
      rmmod(xmm_reg, rm_arg_xmm_m64), [rm_arg_xmm_m64, xmm_reg])

addop("movmskps", [bs8(0x0f), bs8(0x50), no_xmm_pref] +
      rmmod(reg_modrm, rm_arg_xmm_reg))
addop("movmskpd", [bs8(0x0f), bs8(0x50), pref_66] +
      rmmod(reg_modrm, rm_arg_xmm_reg))

addop("movnti", [bs8(0x0f), bs8(0xc3)] + rmmod(rmreg), [rm_arg, rmreg])

# Scalar SSE arithmetic: F3 prefix = single precision, F2 = double.
addop("addss", [bs8(0x0f), bs8(0x58), pref_f3] + rmmod(xmm_reg, rm_arg_xmm_m32))
addop("addsd", [bs8(0x0f), bs8(0x58), pref_f2] + rmmod(xmm_reg, rm_arg_xmm_m64))

addop("subss", [bs8(0x0f), bs8(0x5c), pref_f3] + rmmod(xmm_reg, rm_arg_xmm_m32))
addop("subsd", [bs8(0x0f), bs8(0x5c), pref_f2] + rmmod(xmm_reg, rm_arg_xmm_m64))

addop("mulss", [bs8(0x0f), bs8(0x59), pref_f3] + rmmod(xmm_reg, rm_arg_xmm_m32))
addop("mulsd", [bs8(0x0f), bs8(0x59), pref_f2] + rmmod(xmm_reg, rm_arg_xmm_m64))

addop("divss", [bs8(0x0f), bs8(0x5e), pref_f3] + rmmod(xmm_reg, rm_arg_xmm_m32))
addop("divsd", [bs8(0x0f), bs8(0x5e), pref_f2] + rmmod(xmm_reg, rm_arg_xmm_m64))

addop("roundss", [bs8(0x0f), bs8(0x3a), bs8(0x0a), pref_66] +
      rmmod(xmm_reg, rm_arg_xmm_m32) + [u08])
addop("roundsd", [bs8(0x0f), bs8(0x3a), bs8(0x0b), pref_66] +
      rmmod(xmm_reg, rm_arg_xmm_m64) + [u08])

addop("pminsw", [bs8(0x0f), bs8(0xea), no_xmm_pref] + rmmod(mm_reg, rm_arg_mm))
addop("pminsw", [bs8(0x0f), bs8(0xea), pref_66] + rmmod(xmm_reg, rm_arg_xmm))

addop("ucomiss", [bs8(0x0f), bs8(0x2e), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm_m32))
addop("ucomisd", [bs8(0x0f), bs8(0x2e), pref_66] + rmmod(xmm_reg, rm_arg_xmm_m64))


addop("movzx", [bs8(0x0f), bs("1011011"), w8, sx] + rmmod(rmreg, rm_arg_sx))
addop("mul", [bs('1111011'), w8] + rmmod(d4, rm_arg_w8))

addop("neg", [bs('1111011'), w8] + rmmod(d3, rm_arg_w8))
# Multi-byte NOP accepts any /0../7 reg-field value.
addop("nop", [bs8(0x0f), bs8(0x1f)] + rmmod(d0, rm_arg))  # XXX TODO m512
addop("nop", [bs8(0x0f), bs8(0x1f)] + rmmod(d1, rm_arg))  # XXX TODO m512
addop("nop", [bs8(0x0f), bs8(0x1f)] + rmmod(d2, rm_arg))  # XXX TODO m512
addop("nop", [bs8(0x0f), bs8(0x1f)] + rmmod(d3, rm_arg))  # XXX TODO m512
addop("nop", [bs8(0x0f), bs8(0x1f)] + rmmod(d4, rm_arg))  # XXX TODO m512
addop("nop", [bs8(0x0f), bs8(0x1f)] + rmmod(d5, rm_arg))  # XXX TODO m512
addop("nop", [bs8(0x0f), bs8(0x1f)] + rmmod(d6, rm_arg))  # XXX TODO m512
addop("nop", [bs8(0x0f), bs8(0x1f)] + rmmod(d7, rm_arg))  # XXX TODO m512
addop("not", [bs('1111011'), w8] + rmmod(d2, rm_arg_w8))
addop("or", [bs("0000110"), w8, d_eax, d_imm])
addop("or", [bs("100000"), se, w8] + rmmod(d1, rm_arg_w8) + [d_imm])
addop("or", [bs("000010"), swapargs, w8] +
      rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])
addop("out", [bs("1110011"), w8, u08, d_eax])
addop("out", [bs("1110111"), w8, d_edx, d_eax])

addop("outsb", [bs8(0x6e)])
addop("outsw", [bs8(0x6f), bs_opmode16])
addop("outsd", [bs8(0x6f), bs_opmode32])
addop("outsd", [bs8(0x6f), bs_opmode64])

addop("setalc", [bs8(0xD6)])

# addop("pause", [bs8(0xf3), bs8(0x90)])
+
+addop("popw", [bs8(0x8f), stk, bs_opmode16] + rmmod(d0))
+addop("popw", [bs("01011"), stk, reg, bs_opmode16])
+addop("popw", [bs8(0x1f), stk, d_ds, bs_opmode16])
+addop("popw", [bs8(0x07), stk, d_es, bs_opmode16])
+addop("popw", [bs8(0x17), stk, d_ss, bs_opmode16])
+addop("popw", [bs8(0x0f), stk, bs8(0xa1), d_fs, bs_opmode16])
+addop("popw", [bs8(0x0f), stk, bs8(0xa9), d_gs, bs_opmode16])
+
+addop("pop", [bs8(0x8f), stk, bs_opmode32] + rmmod(d0))
+addop("pop", [bs("01011"), stk, reg, bs_opmode32])
+addop("pop", [bs8(0x1f), stk, d_ds, bs_opmode32])
+addop("pop", [bs8(0x07), stk, d_es, bs_opmode32])
+addop("pop", [bs8(0x17), stk, d_ss, bs_opmode32])
+addop("pop", [bs8(0x0f), stk, bs8(0xa1), d_fs, bs_opmode32])
+addop("pop", [bs8(0x0f), stk, bs8(0xa9), d_gs, bs_opmode32])
+
+addop("pop", [bs8(0x8f), stk, bs_opmode64] + rmmod(d0))
+addop("pop", [bs("01011"), stk, reg, bs_opmode64])
+addop("pop", [bs8(0x1f), stk, d_ds, bs_opmode64])
+addop("pop", [bs8(0x07), stk, d_es, bs_opmode64])
+addop("pop", [bs8(0x17), stk, d_ss, bs_opmode64])
+addop("pop", [bs8(0x0f), stk, bs8(0xa1), d_fs, bs_opmode64])
+addop("pop", [bs8(0x0f), stk, bs8(0xa9), d_gs, bs_opmode64])
+
+
+addop("popa", [bs8(0x61), stk, bs_opmode16])
+addop("popad", [bs8(0x61), stk, bs_opmode32])
+
+addop("popfw", [bs8(0x9d), stk, bs_opmode16])
+addop("popfd", [bs8(0x9d), stk, bs_opmode32])
+addop("popfq", [bs8(0x9d), stk, bs_opmode64])
+
+addop("prefetch0", [bs8(0x0f), bs8(0x18)] + rmmod(d1, rm_arg_m08))
+addop("prefetch1", [bs8(0x0f), bs8(0x18)] + rmmod(d2, rm_arg_m08))
+addop("prefetch2", [bs8(0x0f), bs8(0x18)] + rmmod(d3, rm_arg_m08))
+addop("prefetchnta", [bs8(0x0f), bs8(0x18)] + rmmod(d0, rm_arg_m08))
+addop("prefetchw", [bs8(0x0f), bs8(0x0d)] + rmmod(d1, rm_arg_m08))
+
+addop("pushw", [bs8(0xff), stk, bs_opmode16] + rmmod(d6))
+addop("pushw", [bs("01010"), stk, reg, bs_opmode16])
+addop("pushw", [bs8(0x6a), s08, stk, bs_opmode16])
+addop("pushw", [bs8(0x68), d_imm, stk, bs_opmode16])
+addop("pushw", [bs8(0x0e), stk, d_cs, bs_opmode16])
+addop("pushw", [bs8(0x16), stk, d_ss, bs_opmode16])
+addop("pushw", [bs8(0x1e), stk, d_ds, bs_opmode16])
+addop("pushw", [bs8(0x06), stk, d_es, bs_opmode16])
+addop("pushw", [bs8(0x0f), stk, bs8(0xa0), d_fs, bs_opmode16])
+addop("pushw", [bs8(0x0f), stk, bs8(0xa8), d_gs, bs_opmode16])
+
+addop("push", [bs8(0xff), stk, bs_opmode32] + rmmod(d6))
+addop("push", [bs("01010"), stk, reg, bs_opmode32])
+addop("push", [bs8(0x6a), s08, stk, bs_opmode32])
+addop("push", [bs8(0x68), d_imm, stk, bs_opmode32])
+addop("push", [bs8(0x0e), stk, d_cs, bs_opmode32])
+addop("push", [bs8(0x16), stk, d_ss, bs_opmode32])
+addop("push", [bs8(0x1e), stk, d_ds, bs_opmode32])
+addop("push", [bs8(0x06), stk, d_es, bs_opmode32])
+addop("push", [bs8(0x0f), stk, bs8(0xa0), d_fs, bs_opmode32])
+addop("push", [bs8(0x0f), stk, bs8(0xa8), d_gs, bs_opmode32])
+
+addop("push", [bs8(0xff), stk, bs_opmode64] + rmmod(d6))
+addop("push", [bs("01010"), stk, reg, bs_opmode64])
+addop("push", [bs8(0x6a), s08, stk, bs_opmode64])
+addop("push", [bs8(0x68), d_imm, stk, bs_opmode64])
+addop("push", [bs8(0x0e), stk, d_cs, bs_opmode64])
+addop("push", [bs8(0x16), stk, d_ss, bs_opmode64])
+addop("push", [bs8(0x1e), stk, d_ds, bs_opmode64])
+addop("push", [bs8(0x06), stk, d_es, bs_opmode64])
+addop("push", [bs8(0x0f), stk, bs8(0xa0), d_fs, bs_opmode64])
+addop("push", [bs8(0x0f), stk, bs8(0xa8), d_gs, bs_opmode64])
+
+addop("pusha", [bs8(0x60), stk, bs_opmode16_no64])
+addop("pushad", [bs8(0x60), stk, bs_opmode32_no64])
+
+
+addop("pushfw", [bs8(0x9c), stk, bs_opmode16])
+addop("pushfd", [bs8(0x9c), stk, bs_opmode32])
+addop("pushfq", [bs8(0x9c), stk, bs_opmode64])
+
+addop("rcl", [bs('110100'), d_cl1, w8] +
+      rmmod(d2, rm_arg_w8), [rm_arg_w8, d_cl1])
+addop("rcl", [bs('1100000'), w8] + rmmod(d2, rm_arg_w8) + [u08])
+addop("rcr", [bs('110100'), d_cl1, w8] +
+      rmmod(d3, rm_arg_w8), [rm_arg_w8, d_cl1])
+addop("rcr", [bs('1100000'), w8] + rmmod(d3, rm_arg_w8) + [u08])
+addop("rol", [bs('110100'), d_cl1, w8]
+      + rmmod(d0, rm_arg_w8), [rm_arg_w8, d_cl1])
+addop("rol", [bs('1100000'), w8] + rmmod(d0, rm_arg_w8) + [u08])
+addop("ror", [bs('110100'), d_cl1, w8]
+      + rmmod(d1, rm_arg_w8), [rm_arg_w8, d_cl1])
+addop("ror", [bs('1100000'), w8] + rmmod(d1, rm_arg_w8) + [u08])
+
+addop("rdmsr", [bs8(0x0f), bs8(0x32)])
+addop("rdpmc", [bs8(0x0f), bs8(0x33)])
+addop("rdtsc", [bs8(0x0f), bs8(0x31)])
+addop("ret", [bs8(0xc3), stk])
+addop("ret", [bs8(0xc2), stk, u16])
+addop("retf", [bs8(0xcb), stk])
+addop("retf", [bs8(0xca), stk, u16])
+
+addop("rsm", [bs8(0x0f), bs8(0xaa)])
+addop("sahf", [bs8(0x9e)])
+
+# XXX typo in doc: /4 instead of /6
+addop("sal", [bs('110100'), d_cl1, w8] +
+      rmmod(d6, rm_arg_w8), [rm_arg_w8, d_cl1])
+addop("sal", [bs('1100000'), w8] + rmmod(d6, rm_arg_w8) + [u08])
+addop("sar", [bs('110100'), d_cl1, w8] +
+      rmmod(d7, rm_arg_w8), [rm_arg_w8, d_cl1])
+addop("sar", [bs('1100000'), w8] + rmmod(d7, rm_arg_w8) + [u08])
+
+addop("scasb", [bs8(0xae)])
+addop("scasw", [bs8(0xaf), bs_opmode16])
+addop("scasd", [bs8(0xaf), bs_opmode32])
+addop("scasq", [bs8(0xaf), bs_opmode64])
+
+addop("shl", [bs('110100'), d_cl1, w8]
+      + rmmod(d4, rm_arg_w8), [rm_arg_w8, d_cl1])
+addop("shl", [bs('1100000'), w8] + rmmod(d4, rm_arg_w8) + [u08])
+addop("shr", [bs('110100'), d_cl1, w8]
+      + rmmod(d5, rm_arg_w8), [rm_arg_w8, d_cl1])
+addop("shr", [bs('1100000'), w8] + rmmod(d5, rm_arg_w8) + [u08])
+
+addop("sbb", [bs("0001110"), w8, d_eax, d_imm])
+addop("sbb", [bs("100000"), se, w8] + rmmod(d3, rm_arg_w8) + [d_imm])
+addop("sbb", [bs("000110"), swapargs, w8] +
+      rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])
+
+addop("set", [bs8(0x0f), bs('1001'), cond] + rmmod(regnoarg, rm_arg_08))
+addop("sgdt", [bs8(0x0f), bs8(0x01)] + rmmod(d0, modrm=mod_mem))
+addop("shld", [bs8(0x0f), bs8(0xa4)] +
+      rmmod(rmreg) + [u08], [rm_arg, rmreg, u08])
+addop("shld", [bs8(0x0f), bs8(0xa5)] +
+      rmmod(rmreg) + [d_cl], [rm_arg, rmreg, d_cl])
+addop("shrd", [bs8(0x0f), bs8(0xac)] +
+      rmmod(rmreg) + [u08], [rm_arg, rmreg, u08])
+addop("shrd", [bs8(0x0f), bs8(0xad)] +
+      rmmod(rmreg) + [d_cl], [rm_arg, rmreg, d_cl])
+addop("sidt", [bs8(0x0f), bs8(0x01)] + rmmod(d1, modrm=mod_mem))
+addop("sldt", [bs8(0x0f), bs8(0x00)] + rmmod(d0, rm_arg_x=rm_arg_reg_m16))
+addop("smsw", [bs8(0x0f), bs8(0x01)] + rmmod(d4))
+addop("stc", [bs8(0xf9)])
+addop("std", [bs8(0xfd)])
+addop("sti", [bs8(0xfb)])
+addop("stosb", [bs8(0xaa)])
+addop("stosw", [bs8(0xab), bs_opmode16])
+addop("stosd", [bs8(0xab), bs_opmode32])
+addop("stosq", [bs8(0xab), bs_opmode64])
+
+addop("str", [bs8(0x0f), bs8(0x00)] + rmmod(d1))
+
+addop("sub", [bs("0010110"), w8, d_eax, d_imm])
+addop("sub", [bs("100000"), se, w8] + rmmod(d5, rm_arg_w8) + [d_imm])
+addop("sub", [bs("001010"), swapargs, w8] +
+      rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])
+
+addop("syscall", [bs8(0x0f), bs8(0x05)])
+addop("sysenter", [bs8(0x0f), bs8(0x34)])
+addop("sysexit", [bs8(0x0f), bs8(0x35)])
+addop("sysret", [bs8(0x0f), bs8(0x07)])
+addop("test", [bs("1010100"), w8, d_eax, d_imm])
+addop("test", [bs("1111011"), w8] + rmmod(d0, rm_arg_w8) + [d_imm])
+addop("test", [bs("1000010"), w8] +
+      rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])
+addop("ud2", [bs8(0x0f), bs8(0x0b)])
+addop("verr", [bs8(0x0f), bs8(0x00)] + rmmod(d4))
+addop("verw", [bs8(0x0f), bs8(0x00)] + rmmod(d5))
+addop("wbinvd", [bs8(0x0f), bs8(0x09)])
+addop("wrmsr", [bs8(0x0f), bs8(0x30)])
+addop("xadd", [bs8(0x0f), bs("1100000"), w8]
+      + rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])
+
+addop("nop", [bs8(0x90), no_rex], alias=True)
+
+addop("xchg", [bs('10010'), d_eax, reg])
+addop("xchg", [bs('1000011'), w8] +
+      rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])
+addop("xlat", [bs8(0xd7)])
+
+
+addop("xor", [bs("0011010"), w8, d_eax, d_imm])
+addop("xor", [bs("100000"), se, w8] + rmmod(d6, rm_arg_w8) + [d_imm])
+addop("xor", [bs("001100"), swapargs, w8] +
+      rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])
+
+
+addop("xgetbv", [bs8(0x0f), bs8(0x01), bs8(0xd0)])
+
+
+
+#### MMX/SSE/AVX operations
+#### Categories are the same as here: https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+####
+
+### Arithmetic (integers)
+###
+
+## Move
+# SSE
+addop("movapd", [bs8(0x0f), bs("0010100"), swapargs]
+      + rmmod(xmm_reg, rm_arg_xmm) + [bs_opmode16], [xmm_reg, rm_arg_xmm])
+addop("movaps", [bs8(0x0f), bs("0010100"), swapargs]
+      + rmmod(xmm_reg, rm_arg_xmm_m128) + [bs_opmode32], [xmm_reg, rm_arg_xmm_m128])
+addop("movaps", [bs8(0x0f), bs("0010100"), swapargs]
+      + rmmod(xmm_reg, rm_arg_xmm_m128) + [bs_opmode64], [xmm_reg, rm_arg_xmm_m128])
+addop("movdqu", [bs8(0x0f), bs("011"), swapargs, bs("1111"), pref_f3]
+      + rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+addop("movdqa", [bs8(0x0f), bs("011"), swapargs, bs("1111"), pref_66]
+      + rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+addop("movhpd", [bs8(0x0f), bs("0001011"), swapargs, pref_66] +
+      rmmod(xmm_reg, rm_arg_m64), [xmm_reg, rm_arg_m64])
+addop("movhps", [bs8(0x0f), bs("0001011"), swapargs, no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_m64), [xmm_reg, rm_arg_m64])
+addop("movlpd", [bs8(0x0f), bs("0001001"), swapargs, pref_66] +
+      rmmod(xmm_reg, rm_arg_m64), [xmm_reg, rm_arg_m64])
+addop("movlps", [bs8(0x0f), bs("0001001"), swapargs, no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_m64), [xmm_reg, rm_arg_m64])
+
+addop("movhlps", [bs8(0x0f), bs8(0x12), no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_xmm_reg), [xmm_reg, rm_arg_xmm_reg])
+addop("movlhps", [bs8(0x0f), bs8(0x16), no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_xmm_reg), [xmm_reg, rm_arg_xmm_reg])
+
+addop("movdq2q", [bs8(0x0f), bs8(0xd6), pref_f2] +
+      rmmod(mm_reg, rm_arg_xmm_reg), [mm_reg, rm_arg_xmm_reg])
+addop("movq2dq", [bs8(0x0f), bs8(0xd6), pref_f3] +
+      rmmod(xmm_reg, rm_arg_mm))
+
+## Additions
+# SSE
+addop("paddb", [bs8(0x0f), bs8(0xfc), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+addop("paddw", [bs8(0x0f), bs8(0xfd), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+addop("paddd", [bs8(0x0f), bs8(0xfe), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+addop("paddq", [bs8(0x0f), bs8(0xd4), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+addop("paddb", [bs8(0x0f), bs8(0xfc), no_xmm_pref] + rmmod(mm_reg, rm_arg_mm))
+addop("paddw", [bs8(0x0f), bs8(0xfd), no_xmm_pref] + rmmod(mm_reg, rm_arg_mm))
+addop("paddd", [bs8(0x0f), bs8(0xfe), no_xmm_pref] + rmmod(mm_reg, rm_arg_mm))
+addop("paddq", [bs8(0x0f), bs8(0xd4), no_xmm_pref] + rmmod(mm_reg, rm_arg_mm))
+
+## Subtractions
+# SSE
+addop("psubb", [bs8(0x0f), bs8(0xf8), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+addop("psubw", [bs8(0x0f), bs8(0xf9), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+addop("psubd", [bs8(0x0f), bs8(0xfa), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+addop("psubq", [bs8(0x0f), bs8(0xfb), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+addop("psubb", [bs8(0x0f), bs8(0xf8), no_xmm_pref] + rmmod(mm_reg, rm_arg_mm))
+addop("psubw", [bs8(0x0f), bs8(0xf9), no_xmm_pref] + rmmod(mm_reg, rm_arg_mm))
+addop("psubd", [bs8(0x0f), bs8(0xfa), no_xmm_pref] + rmmod(mm_reg, rm_arg_mm))
+addop("psubq", [bs8(0x0f), bs8(0xfb), no_xmm_pref] + rmmod(mm_reg, rm_arg_mm))
+
+### Arithmetic (floating-point)
+###
+
+## Additions
+# SSE
+addop("addps", [bs8(0x0f), bs8(0x58), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm))
+addop("addpd", [bs8(0x0f), bs8(0x58), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+## Subtractions
+# SSE
+addop("subps", [bs8(0x0f), bs8(0x5c), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm))
+addop("subpd", [bs8(0x0f), bs8(0x5c), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+## Multiplications
+# SSE
+addop("mulps", [bs8(0x0f), bs8(0x59), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm))
+addop("mulpd", [bs8(0x0f), bs8(0x59), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+## Divisions
+# SSE
+addop("divps", [bs8(0x0f), bs8(0x5e), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm))
+addop("divpd", [bs8(0x0f), bs8(0x5e), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+### Logical (floating-point)
+###
+
+## XOR
+addop("xorps", [bs8(0x0f), bs8(0x57), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm))
+addop("xorpd", [bs8(0x0f), bs8(0x57), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+## AND
+addop("andps", [bs8(0x0f), bs8(0x54), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm))
+addop("andpd", [bs8(0x0f), bs8(0x54), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+addop("andnps", [bs8(0x0f), bs8(0x55), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm))
+addop("andnpd", [bs8(0x0f), bs8(0x55), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+## OR
+addop("orps", [bs8(0x0f), bs8(0x56), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm))
+addop("orpd", [bs8(0x0f), bs8(0x56), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+## AND
+# MMX
+addop("pand", [bs8(0x0f), bs8(0xdb), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+# SSE
+addop("pand", [bs8(0x0f), bs8(0xdb), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+## ANDN
+# MMX
+addop("pandn", [bs8(0x0f), bs8(0xdf), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+# SSE
+addop("pandn", [bs8(0x0f), bs8(0xdf), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+## OR
+# MMX
+addop("por", [bs8(0x0f), bs8(0xeb), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+# SSE
+addop("por", [bs8(0x0f), bs8(0xeb), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+## XOR
+# MMX
+addop("pxor", [bs8(0x0f), bs8(0xef), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+# SSE
+addop("pxor", [bs8(0x0f), bs8(0xef), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+### Comparisons (floating-point)
+###
+addop("minps", [bs8(0x0f), bs8(0x5d), no_xmm_pref] + rmmod(xmm_reg,
+                                                           rm_arg_xmm_m128))
+addop("minss", [bs8(0x0f), bs8(0x5d), pref_f3] + rmmod(xmm_reg,
+                                                       rm_arg_xmm_m32))
+addop("minpd", [bs8(0x0f), bs8(0x5d), pref_66] + rmmod(xmm_reg,
+                                                       rm_arg_xmm_m128))
+addop("minsd", [bs8(0x0f), bs8(0x5d), pref_f2] + rmmod(xmm_reg,
+                                                       rm_arg_xmm_m64))
+addop("maxps", [bs8(0x0f), bs8(0x5f), no_xmm_pref] + rmmod(xmm_reg,
+                                                           rm_arg_xmm_m128))
+addop("maxpd", [bs8(0x0f), bs8(0x5f), pref_66] + rmmod(xmm_reg,
+                                                       rm_arg_xmm_m128))
+addop("maxsd", [bs8(0x0f), bs8(0x5f), pref_f2] + rmmod(xmm_reg, rm_arg_xmm_m64))
+addop("maxss", [bs8(0x0f), bs8(0x5f), pref_f3] + rmmod(xmm_reg, rm_arg_xmm_m32))
+
+for cond_name, value in [
+        ("eq", 0x00),
+        ("lt", 0x01),
+        ("le", 0x02),
+        ("unord", 0x03),
+        ("neq", 0x04),
+        ("nlt", 0x05),
+        ("nle", 0x06),
+        ("ord", 0x07),
+]:
+    addop("cmp%sps" % cond_name, [bs8(0x0f), bs8(0xc2), no_xmm_pref] +
+          rmmod(xmm_reg, rm_arg_xmm_m64) + [bs8(value)])
+    addop("cmp%spd" % cond_name, [bs8(0x0f), bs8(0xc2), pref_66] +
+          rmmod(xmm_reg, rm_arg_xmm_m64) + [bs8(value)])
+    addop("cmp%sss" % cond_name, [bs8(0x0f), bs8(0xc2), pref_f3] +
+          rmmod(xmm_reg, rm_arg_xmm_m32) + [bs8(value)])
+    addop("cmp%ssd" % cond_name, [bs8(0x0f), bs8(0xc2), pref_f2] +
+          rmmod(xmm_reg, rm_arg_xmm_m32) + [bs8(value)])
+
+
+
+addop("pshufb", [bs8(0x0f), bs8(0x38), bs8(0x00), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("pshufb", [bs8(0x0f), bs8(0x38), bs8(0x00), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+addop("pshufd", [bs8(0x0f), bs8(0x70), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128) + [u08])
+addop("pshuflw", [bs8(0x0f), bs8(0x70), pref_f2] +
+      rmmod(xmm_reg, rm_arg_xmm_m128) + [u08])
+addop("pshufhw", [bs8(0x0f), bs8(0x70), pref_f3] +
+      rmmod(xmm_reg, rm_arg_xmm_m128) + [u08])
+
+
+### Convert
+### SS = single precision
+### SD = double precision
+###
+
+## SS -> SD
+##
+
+addop("cvtdq2pd", [bs8(0x0f), bs8(0xe6), pref_f3]
+      + rmmod(xmm_reg, rm_arg_xmm_m64))
+addop("cvtdq2ps", [bs8(0x0f), bs8(0x5b), no_xmm_pref]
+      + rmmod(xmm_reg, rm_arg_xmm))
+addop("cvtpd2dq", [bs8(0x0f), bs8(0xe6), pref_f2]
+      + rmmod(xmm_reg, rm_arg_xmm))
+addop("cvtpd2pi", [bs8(0x0f), bs8(0x2d), pref_66]
+      + rmmod(mm_reg, rm_arg_xmm))
+addop("cvtpd2ps", [bs8(0x0f), bs8(0x5a), pref_66]
+      + rmmod(xmm_reg, rm_arg_xmm))
+addop("cvtpi2pd", [bs8(0x0f), bs8(0x2a), pref_66]
+      + rmmod(xmm_reg, rm_arg_mm_m64))
+addop("cvtpi2ps", [bs8(0x0f), bs8(0x2a), no_xmm_pref]
+      + rmmod(xmm_reg, rm_arg_mm_m64))
+addop("cvtps2dq", [bs8(0x0f), bs8(0x5b), pref_66]
+      + rmmod(xmm_reg, rm_arg_xmm))
+addop("cvtps2pd", [bs8(0x0f), bs8(0x5a), no_xmm_pref]
+      + rmmod(xmm_reg, rm_arg_xmm_m64))
+addop("cvtps2pi", [bs8(0x0f), bs8(0x2d), no_xmm_pref]
+      + rmmod(mm_reg, rm_arg_xmm_m64))
+addop("cvtsd2si", [bs8(0x0f), bs8(0x2d), pref_f2]
+      + rmmod(reg, rm_arg_xmm_m64))
+addop("cvtsd2ss", [bs8(0x0f), bs8(0x5a), pref_f2]
+      + rmmod(xmm_reg, rm_arg_xmm_m64))
+addop("cvtsi2sd", [bs8(0x0f), bs8(0x2a), pref_f2]
+      + rmmod(xmm_reg, rm_arg))
+addop("cvtsi2ss", [bs8(0x0f), bs8(0x2a), xmmreg, pref_f3]
+      + rmmod(xmm_reg, rm_arg))
+addop("cvtss2sd", [bs8(0x0f), bs8(0x5a), pref_f3]
+      + rmmod(xmm_reg, rm_arg_xmm_m32))
+addop("cvtss2si", [bs8(0x0f), bs8(0x2d), pref_f3]
+      + rmmod(rmreg, rm_arg_xmm_m32))
+addop("cvttpd2pi",[bs8(0x0f), bs8(0x2c), pref_66]
+      + rmmod(mm_reg, rm_arg_xmm))
+addop("cvttpd2dq",[bs8(0x0f), bs8(0xe6), pref_66]
+      + rmmod(xmm_reg, rm_arg_xmm))
+addop("cvttps2dq",[bs8(0x0f), bs8(0x5b), pref_f3]
+      + rmmod(xmm_reg, rm_arg_xmm))
+addop("cvttps2pi",[bs8(0x0f), bs8(0x2c), no_xmm_pref]
+      + rmmod(mm_reg, rm_arg_xmm_m64))
+addop("cvttsd2si",[bs8(0x0f), bs8(0x2c), pref_f2]
+      + rmmod(reg, rm_arg_xmm_m64))
+addop("cvttss2si",[bs8(0x0f), bs8(0x2c), pref_f3]
+      + rmmod(reg, rm_arg_xmm_m32))
+
+addop("palignr", [bs8(0x0f), bs8(0x73), bs8(0x0f), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64) + [u08], [mm_reg, rm_arg_mm_m64, u08])
+addop("palignr", [bs8(0x0f), bs8(0x3a), bs8(0x0f), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128) + [u08], [xmm_reg, rm_arg_xmm_m128, u08])
+
+addop("psrlq", [bs8(0x0f), bs8(0x73), no_xmm_pref] +
+      rmmod(d2, rm_arg_mm) + [u08], [rm_arg_mm, u08])
+addop("psrlq", [bs8(0x0f), bs8(0x73), pref_66] +
+      rmmod(d2, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("psrlq", [bs8(0x0f), bs8(0xd3), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+addop("psrlq", [bs8(0x0f), bs8(0xd3), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+
+addop("psrld", [bs8(0x0f), bs8(0x72), no_xmm_pref] +
+      rmmod(d2, rm_arg_mm) + [u08], [rm_arg_mm, u08])
+addop("psrld", [bs8(0x0f), bs8(0x72), pref_66] +
+      rmmod(d2, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("psrld", [bs8(0x0f), bs8(0xd2), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+addop("psrld", [bs8(0x0f), bs8(0xd2), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+addop("psrldq", [bs8(0x0f), bs8(0x73), pref_66] +
+      rmmod(d3, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("psrlw", [bs8(0x0f), bs8(0x71), no_xmm_pref] +
+      rmmod(d2, rm_arg_mm) + [u08], [rm_arg_mm, u08])
+addop("psrlw", [bs8(0x0f), bs8(0x71), pref_66] +
+      rmmod(d2, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("psrlw", [bs8(0x0f), bs8(0xd1), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64), [mm_reg, rm_arg_mm_m64])
+addop("psrlw", [bs8(0x0f), bs8(0xd1), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128), [xmm_reg, rm_arg_xmm_m128])
+
+addop("psraw", [bs8(0x0f), bs8(0xe1), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64), [mm_reg, rm_arg_mm_m64])
+addop("psraw", [bs8(0x0f), bs8(0xe1), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128), [xmm_reg, rm_arg_xmm_m128])
+
+addop("psraw", [bs8(0x0f), bs8(0x71), no_xmm_pref] +
+      rmmod(d4, rm_arg_mm_m64) + [u08], [rm_arg_mm_m64, u08])
+addop("psraw", [bs8(0x0f), bs8(0x71), pref_66] +
+      rmmod(d4, rm_arg_xmm_m128) + [u08], [rm_arg_xmm_m128, u08])
+
+addop("psrad", [bs8(0x0f), bs8(0xe2), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64), [mm_reg, rm_arg_mm_m64])
+addop("psrad", [bs8(0x0f), bs8(0xe2), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128), [xmm_reg, rm_arg_xmm_m128])
+
+addop("psrad", [bs8(0x0f), bs8(0x72), no_xmm_pref] +
+      rmmod(d4, rm_arg_mm_m64) + [u08], [rm_arg_mm_m64, u08])
+addop("psrad", [bs8(0x0f), bs8(0x72), pref_66] +
+      rmmod(d4, rm_arg_xmm_m128) + [u08], [rm_arg_xmm_m128, u08])
+
+
+addop("psllq", [bs8(0x0f), bs8(0x73), no_xmm_pref] +
+      rmmod(d6, rm_arg_mm) + [u08], [rm_arg_mm, u08])
+addop("psllq", [bs8(0x0f), bs8(0x73), pref_66] +
+      rmmod(d6, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("psllq", [bs8(0x0f), bs8(0xf3), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+addop("psllq", [bs8(0x0f), bs8(0xf3), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+
+addop("pslld", [bs8(0x0f), bs8(0x72), no_xmm_pref] +
+      rmmod(d6, rm_arg_mm) + [u08], [rm_arg_mm, u08])
+addop("pslld", [bs8(0x0f), bs8(0x72), pref_66] +
+      rmmod(d6, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("pslld", [bs8(0x0f), bs8(0xf2), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+addop("pslld", [bs8(0x0f), bs8(0xf2), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+
+addop("psllw", [bs8(0x0f), bs8(0x71), no_xmm_pref] +
+      rmmod(d6, rm_arg_mm) + [u08], [rm_arg_mm, u08])
+addop("psllw", [bs8(0x0f), bs8(0x71), pref_66] +
+      rmmod(d6, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("psllw", [bs8(0x0f), bs8(0xf1), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+addop("psllw", [bs8(0x0f), bs8(0xf1), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+addop("pslldq", [bs8(0x0f), bs8(0x73), pref_66] +
+      rmmod(d7, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+
+addop("pmaxub", [bs8(0x0f), bs8(0xde), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pmaxub", [bs8(0x0f), bs8(0xde), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pmaxuw", [bs8(0x0f), bs8(0x38), bs8(0x3e), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pmaxud", [bs8(0x0f), bs8(0x38), bs8(0x3f), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pmaxsw", [bs8(0x0f), bs8(0xee), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("pmaxsw", [bs8(0x0f), bs8(0xee), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+
+addop("pminub", [bs8(0x0f), bs8(0xda), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pminub", [bs8(0x0f), bs8(0xda), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pminuw", [bs8(0x0f), bs8(0x38), bs8(0x3a), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pminud", [bs8(0x0f), bs8(0x38), bs8(0x3b), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+
+addop("pcmpeqb", [bs8(0x0f), bs8(0x74), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pcmpeqb", [bs8(0x0f), bs8(0x74), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pcmpeqw", [bs8(0x0f), bs8(0x75), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pcmpeqw", [bs8(0x0f), bs8(0x75), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pcmpeqd", [bs8(0x0f), bs8(0x76), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pcmpeqd", [bs8(0x0f), bs8(0x76), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pcmpgtb", [bs8(0x0f), bs8(0x64), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pcmpgtb", [bs8(0x0f), bs8(0x64), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pcmpgtw", [bs8(0x0f), bs8(0x65), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pcmpgtw", [bs8(0x0f), bs8(0x65), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pcmpgtd", [bs8(0x0f), bs8(0x66), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pcmpgtd", [bs8(0x0f), bs8(0x66), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pcmpeqq", [bs8(0x0f), bs8(0x38), bs8(0x29), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+addop("pcmpgtq", [bs8(0x0f), bs8(0x38), bs8(0x37), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpckhbw", [bs8(0x0f), bs8(0x68), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("punpckhbw", [bs8(0x0f), bs8(0x68), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpckhwd", [bs8(0x0f), bs8(0x69), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("punpckhwd", [bs8(0x0f), bs8(0x69), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpckhdq", [bs8(0x0f), bs8(0x6a), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("punpckhdq", [bs8(0x0f), bs8(0x6a), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpckhqdq", [bs8(0x0f), bs8(0x6d), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+
+
+addop("punpcklbw", [bs8(0x0f), bs8(0x60), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("punpcklbw", [bs8(0x0f), bs8(0x60), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpcklwd", [bs8(0x0f), bs8(0x61), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("punpcklwd", [bs8(0x0f), bs8(0x61), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpckldq", [bs8(0x0f), bs8(0x62), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("punpckldq", [bs8(0x0f), bs8(0x62), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpcklqdq", [bs8(0x0f), bs8(0x6c), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+
+addop("unpckhps", [bs8(0x0f), bs8(0x15), no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_xmm))
+addop("unpckhpd", [bs8(0x0f), bs8(0x15), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+
+addop("unpcklps", [bs8(0x0f), bs8(0x14), no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_xmm))
+addop("unpcklpd", [bs8(0x0f), bs8(0x14), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+
+
+addop("pinsrb", [bs8(0x0f), bs8(0x3a), bs8(0x20), pref_66] +
+      rmmod(xmm_reg, rm_arg_reg_m08) + [u08])
+addop("pinsrd", [bs8(0x0f), bs8(0x3a), bs8(0x22), pref_66, bs_opmode32] +
+      rmmod(xmm_reg, rm_arg) + [u08])
+addop("pinsrq", [bs8(0x0f), bs8(0x3a), bs8(0x22), pref_66] +
+      rmmod(xmm_reg, rm_arg_m64) + [bs_opmode64] + [u08])
+
+addop("pinsrw", [bs8(0x0f), bs8(0xc4), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_reg_m16) + [u08])
+addop("pinsrw", [bs8(0x0f), bs8(0xc4), pref_66] +
+      rmmod(xmm_reg, rm_arg_reg_m16) + [u08])
+
+
+addop("pextrb", [bs8(0x0f), bs8(0x3a), bs8(0x14), pref_66] +
+      rmmod(xmm_reg, rm_arg_reg_m08) + [u08], [rm_arg_reg_m08, xmm_reg, u08])
+addop("pextrd", [bs8(0x0f), bs8(0x3a), bs8(0x16), pref_66, bs_opmode32] +
+      rmmod(xmm_reg, rm_arg) + [u08], [rm_arg, xmm_reg, u08])
+addop("pextrq", [bs8(0x0f), bs8(0x3a), bs8(0x16), pref_66] +
+      rmmod(xmm_reg, rm_arg_m64) + [bs_opmode64] + [u08], [rm_arg_m64, xmm_reg, u08])
+
+
+addop("pextrw", [bs8(0x0f), bs8(0x3a), bs8(0x15), pref_66] +
+      rmmod(xmm_reg, rm_arg_reg_m16) + [u08], [rm_arg_reg_m16, xmm_reg, u08])
+addop("pextrw", [bs8(0x0f), bs8(0xc5), no_xmm_pref] +
+      rmmod(rmreg, rm_arg_mm) + [u08], [rmreg, rm_arg_mm, u08])
+addop("pextrw", [bs8(0x0f), bs8(0xc5), pref_66] +
+      rmmod(rmreg, rm_arg_xmm) + [u08], [rmreg, rm_arg_xmm, u08])
+
+
+addop("sqrtpd", [bs8(0x0f), bs8(0x51), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+addop("sqrtps", [bs8(0x0f), bs8(0x51), no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_xmm))
+addop("sqrtsd", [bs8(0x0f), bs8(0x51), pref_f2] +
+      rmmod(xmm_reg, rm_arg_xmm_m64))
+addop("sqrtss", [bs8(0x0f), bs8(0x51), pref_f3] +
+      rmmod(xmm_reg, rm_arg_xmm_m32))
+
+addop("pmovmskb", [bs8(0x0f), bs8(0xd7), no_xmm_pref] +
+      rmmod(reg_modrm, rm_arg_mm_reg))
+addop("pmovmskb", [bs8(0x0f), bs8(0xd7), pref_66] +
+      rmmod(reg_modrm, rm_arg_xmm_reg))
+
+addop("shufps", [bs8(0x0f), bs8(0xc6), no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_xmm) + [u08])
+addop("shufpd", [bs8(0x0f), bs8(0xc6), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm) + [u08])
+
+addop("aesenc", [bs8(0x0f), bs8(0x38), bs8(0xdc), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+addop("aesdec", [bs8(0x0f), bs8(0x38), bs8(0xde), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+addop("aesenclast", [bs8(0x0f), bs8(0x38), bs8(0xdd), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+addop("aesdeclast", [bs8(0x0f), bs8(0x38), bs8(0xdf), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
+addop("packsswb", [bs8(0x0f), bs8(0x63), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("packsswb", [bs8(0x0f), bs8(0x63), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+addop("packssdw", [bs8(0x0f), bs8(0x6b), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("packssdw", [bs8(0x0f), bs8(0x6b), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+
+addop("packuswb", [bs8(0x0f), bs8(0x67), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("packuswb", [bs8(0x0f), bs8(0x67), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+
+addop("pmullw", [bs8(0x0f), bs8(0xd5), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("pmullw", [bs8(0x0f), bs8(0xd5), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+addop("pmulhuw", [bs8(0x0f), bs8(0xe4), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("pmulhuw", [bs8(0x0f), bs8(0xe4), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+addop("pmulhw", [bs8(0x0f), bs8(0xe5), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("pmulhw", [bs8(0x0f), bs8(0xe5), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+addop("pmuludq", [bs8(0x0f), bs8(0xf4), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("pmuludq", [bs8(0x0f), bs8(0xf4), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+
+
+addop("psubusb", [bs8(0x0f), bs8(0xd8), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("psubusb", [bs8(0x0f), bs8(0xd8), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+addop("psubusw", [bs8(0x0f), bs8(0xd9), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("psubusw", [bs8(0x0f), bs8(0xd9), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+addop("psubsb", [bs8(0x0f), bs8(0xe8), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("psubsb", [bs8(0x0f), bs8(0xe8), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+addop("psubsw", [bs8(0x0f), bs8(0xe9), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("psubsw", [bs8(0x0f), bs8(0xe9), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+
+
+addop("paddusb", [bs8(0x0f), bs8(0xdc), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("paddusb", [bs8(0x0f), bs8(0xdc), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+addop("paddusw", [bs8(0x0f), bs8(0xdd), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("paddusw", [bs8(0x0f), bs8(0xdd), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+addop("paddsb", [bs8(0x0f), bs8(0xec), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("paddsb", [bs8(0x0f), bs8(0xec), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+addop("paddsw", [bs8(0x0f), bs8(0xed), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("paddsw", [bs8(0x0f), bs8(0xed), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+
+addop("pmaddwd", [bs8(0x0f), bs8(0xf5), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("pmaddwd", [bs8(0x0f), bs8(0xf5), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+
+addop("psadbw", [bs8(0x0f), bs8(0xf6), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("psadbw", [bs8(0x0f), bs8(0xf6), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+
+addop("pavgb", [bs8(0x0f), bs8(0xe0), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("pavgb", [bs8(0x0f), bs8(0xe0), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+addop("pavgw", [bs8(0x0f), bs8(0xe3), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64))
+addop("pavgw", [bs8(0x0f), bs8(0xe3), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128))
+
+addop("maskmovq", [bs8(0x0f), bs8(0xf7), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_reg))
+addop("maskmovdqu", [bs8(0x0f), bs8(0xf7), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_reg))
+
+addop("emms", [bs8(0x0f), bs8(0x77)])
+
+addop("incssp", [pref_f3, bs8(0x0f), bs8(0xae)] + rmmod(d5))
+addop("rdssp", [pref_f3, bs8(0x0f), bs8(0x1e)] + rmmod(d1, modrm=mod_reg))
+addop("saveprevssp", [pref_f3, bs8(0x0f), bs8(0x01), bs8(0xea)])
+addop("rstorssp", [pref_f3, bs8(0x0f), bs8(0x01)] + rmmod(d5, rm_arg_xmm, modrm=mod_mem))
+addop("wrss", [bs8(0x0f), bs8(0x38), bs8(0xf6)] + rmmod(rmreg, rm_arg), [rm_arg, rmreg])
+addop("wruss", [pref_66, bs8(0x0f), bs8(0x38), bs8(0xf5)] + rmmod(rmreg, rm_arg), [rm_arg, rmreg])
+addop("setssbsy", [pref_f3, bs8(0x0f), bs8(0x01), bs8(0xe8)])
+addop("clrssbsy", [pref_f3, bs8(0x0f), bs8(0xae)] + rmmod(d6, rm_arg_xmm))
+addop("endbr64", [pref_f3, bs8(0x0f), bs8(0x1e), bs8(0xfa)])
+addop("endbr32", [pref_f3, bs8(0x0f), bs8(0x1e), bs8(0xfb)])
+
+mn_x86.bintree = factor_one_bit(mn_x86.bintree)
+# mn_x86.bintree = factor_fields_all(mn_x86.bintree)
+"""
+mod reg r/m
+ XX XXX XXX
+
+"""
+
+
+def print_size(e):  # debug helper: print an expression together with its bit size
+    print(e, e.size)
+    return e  # pass-through so it can be inlined in expression pipelines
diff --git a/src/miasm/arch/x86/ctype.py b/src/miasm/arch/x86/ctype.py
new file mode 100644
index 00000000..2a61689a
--- /dev/null
+++ b/src/miasm/arch/x86/ctype.py
@@ -0,0 +1,137 @@
+from miasm.core.objc import CLeafTypes, ObjCDecl, PADDING_TYPE_NAME
+from miasm.core.ctypesmngr import CTypeId, CTypePtr
+
+
class CTypeAMD64_unk(CLeafTypes):
    """C leaf type sizes/alignments for the x86_64 architecture.

    Maps parsed C type identifiers to their ObjCDecl (name, size, align);
    ``long`` and ``void *`` are 8 bytes wide (LP64 model).
    """

    obj_pad = ObjCDecl(PADDING_TYPE_NAME, 1, 1) # __padding__ is size 1/align 1

    obj_char = ObjCDecl("char", 1, 1)
    obj_short = ObjCDecl("short", 2, 2)
    obj_int = ObjCDecl("int", 4, 4)
    obj_long = ObjCDecl("long", 8, 8)

    obj_uchar = ObjCDecl("uchar", 1, 1)
    obj_ushort = ObjCDecl("ushort", 2, 2)
    obj_uint = ObjCDecl("uint", 4, 4)
    obj_ulong = ObjCDecl("ulong", 8, 8)

    obj_void = ObjCDecl("void", 1, 1)

    obj_enum = ObjCDecl("enum", 4, 4)

    obj_float = ObjCDecl("float", 4, 4)
    obj_double = ObjCDecl("double", 8, 8)
    obj_ldouble = ObjCDecl("ldouble", 16, 16)

    def __init__(self):
        # (type words, declaration) pairs, expanded into CTypeId keys below
        entries = [
            ((PADDING_TYPE_NAME,), self.obj_pad),

            (('char',), self.obj_char),
            (('short',), self.obj_short),
            (('int',), self.obj_int),
            (('void',), self.obj_void),
            (('long',), self.obj_long),
            (('float',), self.obj_float),
            (('double',), self.obj_double),

            (('signed', 'char'), self.obj_char),
            (('unsigned', 'char'), self.obj_uchar),

            (('short', 'int'), self.obj_short),
            (('signed', 'short'), self.obj_short),
            (('signed', 'short', 'int'), self.obj_short),
            (('unsigned', 'short'), self.obj_ushort),
            (('unsigned', 'short', 'int'), self.obj_ushort),

            (('unsigned',), self.obj_uint),
            (('unsigned', 'int'), self.obj_uint),
            (('signed', 'int'), self.obj_int),

            (('long', 'int'), self.obj_long),
            (('long', 'long'), self.obj_long),
            (('long', 'long', 'int'), self.obj_long),
            (('signed', 'long', 'long'), self.obj_long),
            (('unsigned', 'long', 'long'), self.obj_ulong),
            (('signed', 'long', 'long', 'int'), self.obj_long),
            (('unsigned', 'long', 'long', 'int'), self.obj_ulong),

            (('signed', 'long'), self.obj_long),
            (('unsigned', 'long'), self.obj_ulong),
            (('signed', 'long', 'int'), self.obj_long),
            (('unsigned', 'long', 'int'), self.obj_ulong),

            (('long', 'double'), self.obj_ldouble),
        ]
        self.types = {CTypeId(*words): decl for words, decl in entries}
        # void* is pointer-sized: 8 bytes on AMD64
        self.types[CTypePtr(CTypeId('void'))] = self.obj_ulong
+
+
+
+
+
class CTypeX86_unk(CLeafTypes):
    """C leaf type sizes/alignments for the x86_32 architecture.

    Maps parsed C type identifiers to their ObjCDecl (name, size, align);
    ``long`` and ``void *`` are 4 bytes wide (ILP32 model).
    """

    obj_pad = ObjCDecl(PADDING_TYPE_NAME, 1, 1) # __padding__ is size 1/align 1

    obj_char = ObjCDecl("char", 1, 1)
    obj_short = ObjCDecl("short", 2, 2)
    obj_int = ObjCDecl("int", 4, 4)
    obj_long = ObjCDecl("long", 4, 4)

    obj_uchar = ObjCDecl("uchar", 1, 1)
    obj_ushort = ObjCDecl("ushort", 2, 2)
    obj_uint = ObjCDecl("uint", 4, 4)
    obj_ulong = ObjCDecl("ulong", 4, 4)

    obj_void = ObjCDecl("void", 1, 1)

    obj_enum = ObjCDecl("enum", 4, 4)

    obj_float = ObjCDecl("float", 4, 4)
    obj_double = ObjCDecl("double", 8, 8)
    obj_ldouble = ObjCDecl("ldouble", 16, 16)

    def __init__(self):
        # (type words, declaration) pairs, expanded into CTypeId keys below
        entries = [
            ((PADDING_TYPE_NAME,), self.obj_pad),

            (('char',), self.obj_char),
            (('short',), self.obj_short),
            (('int',), self.obj_int),
            (('void',), self.obj_void),
            (('long',), self.obj_long),
            (('float',), self.obj_float),
            (('double',), self.obj_double),

            (('signed', 'char'), self.obj_char),
            (('unsigned', 'char'), self.obj_uchar),

            (('short', 'int'), self.obj_short),
            (('signed', 'short'), self.obj_short),
            (('signed', 'short', 'int'), self.obj_short),
            (('unsigned', 'short'), self.obj_ushort),
            (('unsigned', 'short', 'int'), self.obj_ushort),

            (('unsigned',), self.obj_uint),
            (('unsigned', 'int'), self.obj_uint),
            (('signed', 'int'), self.obj_int),

            (('long', 'int'), self.obj_long),
            (('long', 'long'), self.obj_long),
            (('long', 'long', 'int'), self.obj_long),
            (('signed', 'long', 'long'), self.obj_long),
            (('unsigned', 'long', 'long'), self.obj_ulong),
            (('signed', 'long', 'long', 'int'), self.obj_long),
            (('unsigned', 'long', 'long', 'int'), self.obj_ulong),

            (('signed', 'long'), self.obj_long),
            (('unsigned', 'long'), self.obj_ulong),
            (('signed', 'long', 'int'), self.obj_long),
            (('unsigned', 'long', 'int'), self.obj_ulong),

            (('long', 'double'), self.obj_ldouble),
        ]
        self.types = {CTypeId(*words): decl for words, decl in entries}
        # void* is pointer-sized: 4 bytes on x86_32
        self.types[CTypePtr(CTypeId('void'))] = self.obj_uint
diff --git a/src/miasm/arch/x86/disasm.py b/src/miasm/arch/x86/disasm.py
new file mode 100644
index 00000000..49b7158a
--- /dev/null
+++ b/src/miasm/arch/x86/disasm.py
@@ -0,0 +1,30 @@
+from miasm.core.asmblock import disasmEngine
+from miasm.arch.x86.arch import mn_x86
+
+
# Callbacks run after each disassembled block; append to this list to hook
# the x86 disassembly engine.
cb_x86_funcs = []


def cb_x86_disasm(mdis, cur_block, offset_to_dis):
    """Invoke every registered x86 disassembly callback, in registration order."""
    for callback in cb_x86_funcs:
        callback(mdis, cur_block, offset_to_dis)
+
+
class dis_x86(disasmEngine):
    """Base x86 disassembly engine.

    Subclasses set ``attrib`` to the operating mode (16/32/64 bits).
    """
    attrib = None

    def __init__(self, bs=None, **kwargs):
        super(dis_x86, self).__init__(mn_x86, self.attrib, bs, **kwargs)
        # Dispatch per-block hooks through the module-level callback list
        self.dis_block_callback = cb_x86_disasm
+
+
class dis_x86_16(dis_x86):
    """x86 disassembly engine in 16-bit mode."""
    attrib = 16


class dis_x86_32(dis_x86):
    """x86 disassembly engine in 32-bit mode."""
    attrib = 32


class dis_x86_64(dis_x86):
    """x86 disassembly engine in 64-bit mode."""
    attrib = 64
diff --git a/src/miasm/arch/x86/jit.py b/src/miasm/arch/x86/jit.py
new file mode 100644
index 00000000..a90dec07
--- /dev/null
+++ b/src/miasm/arch/x86/jit.py
@@ -0,0 +1,296 @@
+from builtins import range
+import logging
+
+from miasm.jitter.jitload import Jitter, named_arguments
+from miasm.arch.x86.sem import Lifter_X86_16, Lifter_X86_32, Lifter_X86_64
+from miasm.jitter.codegen import CGen
+from miasm.ir.translators.C import TranslatorC
+
# Module logger; silenced by default (CRITICAL), raise the level to debug the jitter
log = logging.getLogger('jit_x86')
hnd = logging.StreamHandler()
hnd.setFormatter(logging.Formatter("[%(levelname)-8s]: %(message)s"))
log.addHandler(hnd)
log.setLevel(logging.CRITICAL)
+
+
class x86_32_CGen(CGen):
    """C code generator for jitted x86 blocks (32-bit register dump)."""

    def __init__(self, lifter):
        self.lifter = lifter
        # PC is the 64-bit RIP even here: the jitters fix registers to
        # 64-bit mode (see lifterbloc_fix_regs_for_mode below).
        self.PC = self.lifter.arch.regs.RIP
        self.translator = TranslatorC(self.lifter.loc_db)
        self.init_arch_C()

    def gen_post_code(self, attrib, pc_value):
        """Return C statements emitted after each instruction (register dump)."""
        if not attrib.log_regs:
            return []
        # Update PC for dump_gpregs, then dump the general purpose registers
        return [
            "%s = %s;" % (self.C_PC, pc_value),
            'dump_gpregs_32(jitcpu->cpu);',
        ]
+
class x86_64_CGen(x86_32_CGen):
    """C code generator for jitted x86 blocks: 64-bit register dump variant."""

    def gen_post_code(self, attrib, pc_value):
        """Return C statements emitted after each instruction (register dump)."""
        if not attrib.log_regs:
            return []
        # Update PC for dump_gpregs, then dump the general purpose registers
        return [
            "%s = %s;" % (self.C_PC, pc_value),
            'dump_gpregs_64(jitcpu->cpu);',
        ]
+
class jitter_x86_16(Jitter):
    """Jitter for 16-bit x86 code."""

    C_Gen = x86_32_CGen

    def __init__(self, loc_db, *args, **kwargs):
        Jitter.__init__(self, Lifter_X86_16(loc_db), *args, **kwargs)
        self.vm.set_little_endian()
        self.lifter.do_stk_segm = False
        self.orig_irbloc_fix_regs_for_mode = self.lifter.irbloc_fix_regs_for_mode
        self.lifter.irbloc_fix_regs_for_mode = self.lifterbloc_fix_regs_for_mode

    def lifterbloc_fix_regs_for_mode(self, irblock, attrib=64):
        # Registers are always fixed for 64-bit mode whatever @attrib is
        # (same behaviour as the 32-bit jitter)
        return self.orig_irbloc_fix_regs_for_mode(irblock, 64)

    def push_uint16_t(self, value):
        """Push 16-bit @value on the stack (SP moves by one 2-byte slot)."""
        self.cpu.SP -= self.lifter.sp.size // 8
        self.vm.set_u16(self.cpu.SP, value)

    def pop_uint16_t(self):
        """Pop and return the 16-bit value at SP."""
        value = self.vm.get_u16(self.cpu.SP)
        self.cpu.SP += self.lifter.sp.size // 8
        return value

    def get_stack_arg(self, index):
        """Return the @index-th 16-bit argument on the stack.

        Stack slots are 2 bytes wide in 16-bit mode, matching
        push_uint16_t/pop_uint16_t, so the stride is 2 (the previous
        4-byte stride skipped every other slot).
        """
        return self.vm.get_u16(self.cpu.SP + 2 * index)

    def init_run(self, *args, **kwargs):
        Jitter.init_run(self, *args, **kwargs)
        # Keep the architectural IP in sync with the jitter pc
        self.cpu.IP = self.pc
+
+
class jitter_x86_32(Jitter):
    """Jitter for 32-bit x86 code.

    Implements the stdcall, cdecl, System V and fastcall calling
    conventions on top of the generic Jitter API.
    """

    C_Gen = x86_32_CGen
    # fastcall passes the first two arguments in ECX/EDX
    args_regs_fastcall = ['ECX', 'EDX']

    def __init__(self, loc_db, *args, **kwargs):
        Jitter.__init__(self, Lifter_X86_32(loc_db), *args, **kwargs)
        self.vm.set_little_endian()
        self.lifter.do_stk_segm = False

        self.orig_irbloc_fix_regs_for_mode = self.lifter.irbloc_fix_regs_for_mode
        self.lifter.irbloc_fix_regs_for_mode = self.lifterbloc_fix_regs_for_mode

    def lifterbloc_fix_regs_for_mode(self, irblock, attrib=64):
        # Registers are always fixed for 64-bit mode whatever @attrib is
        return self.orig_irbloc_fix_regs_for_mode(irblock, 64)

    def push_uint16_t(self, value):
        """Push 16-bit @value; ESP still moves by a full stack slot."""
        self.cpu.ESP -= self.lifter.sp.size // 8
        self.vm.set_u16(self.cpu.ESP, value)

    def pop_uint16_t(self):
        """Pop and return a 16-bit value; ESP moves by a full stack slot."""
        value = self.vm.get_u16(self.cpu.ESP)
        self.cpu.ESP += self.lifter.sp.size // 8
        return value

    def push_uint32_t(self, value):
        """Push 32-bit @value on the stack."""
        self.cpu.ESP -= self.lifter.sp.size // 8
        self.vm.set_u32(self.cpu.ESP, value)

    def pop_uint32_t(self):
        """Pop and return the 32-bit value at ESP."""
        value = self.vm.get_u32(self.cpu.ESP)
        self.cpu.ESP += self.lifter.sp.size // 8
        return value

    def get_stack_arg(self, index):
        """Return the @index-th 32-bit argument on the stack (4-byte slots)."""
        return self.vm.get_u32(self.cpu.ESP + 4 * index)

    def init_run(self, *args, **kwargs):
        Jitter.init_run(self, *args, **kwargs)
        # Keep the architectural EIP in sync with the jitter pc
        self.cpu.EIP = self.pc

    # calling conventions

    # stdcall
    @named_arguments
    def func_args_stdcall(self, n_args):
        """Return (return address, arguments) for a stdcall entry point.

        stdcall is callee-clean, so arguments are popped off the stack.
        """
        ret_ad = self.pop_uint32_t()
        args = [self.pop_uint32_t() for _ in range(n_args)]
        return ret_ad, args

    def func_ret_stdcall(self, ret_addr, ret_value1=None, ret_value2=None):
        """Emulate a stdcall return: jump to @ret_addr, results in EAX/EDX."""
        self.pc = self.cpu.EIP = ret_addr
        if ret_value1 is not None:
            self.cpu.EAX = ret_value1
        if ret_value2 is not None:
            self.cpu.EDX = ret_value2

    def func_prepare_stdcall(self, ret_addr, *args):
        """Push @args right-to-left then the return address @ret_addr."""
        for arg in reversed(args):
            self.push_uint32_t(arg)
        self.push_uint32_t(ret_addr)

    get_arg_n_stdcall = get_stack_arg

    # cdecl
    @named_arguments
    def func_args_cdecl(self, n_args):
        """Return (return address, arguments) for a cdecl entry point.

        cdecl is caller-clean: arguments are read in place, only the
        return address is popped.
        """
        ret_ad = self.pop_uint32_t()
        args = [self.get_stack_arg(i) for i in range(n_args)]
        return ret_ad, args

    def func_ret_cdecl(self, ret_addr, ret_value1=None, ret_value2=None):
        """Emulate a cdecl return: jump to @ret_addr, results in EAX/EDX."""
        self.pc = self.cpu.EIP = ret_addr
        if ret_value1 is not None:
            self.cpu.EAX = ret_value1
        if ret_value2 is not None:
            self.cpu.EDX = ret_value2

    get_arg_n_cdecl = get_stack_arg

    # System V (i386 ABI is caller-clean, like cdecl)
    func_args_systemv = func_args_cdecl
    func_ret_systemv = func_ret_cdecl
    func_prepare_systemv = func_prepare_stdcall
    get_arg_n_systemv = get_stack_arg


    # fastcall
    @named_arguments
    def func_args_fastcall(self, n_args):
        """Return (return address, arguments) for a fastcall entry point."""
        ret_ad = self.pop_uint32_t()
        args = [self.get_arg_n_fastcall(i) for i in range(n_args)]
        return ret_ad, args

    def func_prepare_fastcall(self, ret_addr, *args):
        """Load register args into ECX/EDX, push the rest and @ret_addr."""
        args_regs = self.args_regs_fastcall
        for i in range(min(len(args), len(args_regs))):
            setattr(self.cpu, args_regs[i], args[i])
        remaining_args = args[len(args_regs):]
        for arg in reversed(remaining_args):
            self.push_uint32_t(arg)
        self.push_uint32_t(ret_addr)

    def get_arg_n_fastcall(self, index):
        """Return the @index-th fastcall argument (ECX, EDX, then stack)."""
        args_regs = self.args_regs_fastcall
        if index < len(args_regs):
            return getattr(self.cpu, args_regs[index])
        return self.get_stack_arg(index - len(args_regs))

    def syscall_args_systemv(self, n_args):
        """Return the first @n_args i386 Linux syscall arguments.

        Documentation: http://man7.org/linux/man-pages/man2/syscall.2.html
        arg order (i386): ebx ecx edx esi edi ebp
        """
        args = [self.cpu.EBX, self.cpu.ECX, self.cpu.EDX, self.cpu.ESI,
                self.cpu.EDI, self.cpu.EBP][:n_args]
        return args

    def syscall_ret_systemv(self, value):
        """Store the syscall return @value in EAX (i386 Linux convention)."""
        self.cpu.EAX = value
+
+
class jitter_x86_64(Jitter):
    """Jitter for 64-bit x86 code.

    Implements the Microsoft x64 (stdcall/cdecl aliases) and System V
    AMD64 calling conventions on top of the generic Jitter API.
    """

    C_Gen = x86_64_CGen
    # Register argument order for each supported convention
    args_regs_systemv = ['RDI', 'RSI', 'RDX', 'RCX', 'R8', 'R9']
    args_regs_stdcall = ['RCX', 'RDX', 'R8', 'R9']

    def __init__(self, loc_db, *args, **kwargs):
        Jitter.__init__(self, Lifter_X86_64(loc_db), *args, **kwargs)
        self.vm.set_little_endian()
        self.lifter.do_stk_segm = False

        self.orig_irbloc_fix_regs_for_mode = self.lifter.irbloc_fix_regs_for_mode
        self.lifter.irbloc_fix_regs_for_mode = self.lifterbloc_fix_regs_for_mode

    def lifterbloc_fix_regs_for_mode(self, irblock, attrib=64):
        # Registers are always fixed for 64-bit mode whatever @attrib is
        return self.orig_irbloc_fix_regs_for_mode(irblock, 64)

    def push_uint64_t(self, value):
        """Push 64-bit @value on the stack (8-byte slot)."""
        self.cpu.RSP -= self.lifter.sp.size // 8
        self.vm.set_u64(self.cpu.RSP, value)

    def pop_uint64_t(self):
        """Pop and return the 64-bit value at RSP."""
        value = self.vm.get_u64(self.cpu.RSP)
        self.cpu.RSP += self.lifter.sp.size // 8
        return value

    def get_stack_arg(self, index):
        """Return the @index-th 64-bit argument on the stack (8-byte slots)."""
        return self.vm.get_u64(self.cpu.RSP + 8 * index)

    def init_run(self, *args, **kwargs):
        Jitter.init_run(self, *args, **kwargs)
        # Keep the architectural RIP in sync with the jitter pc
        self.cpu.RIP = self.pc

    # calling conventions

    # stdcall
    @named_arguments
    def func_args_stdcall(self, n_args):
        """Return (return address, arguments): RCX/RDX/R8/R9 then stack."""
        args_regs = self.args_regs_stdcall
        ret_ad = self.pop_uint64_t()
        args = []
        for i in range(min(n_args, 4)):
            args.append(self.cpu.get_gpreg()[args_regs[i]])
        for i in range(max(0, n_args - 4)):
            # Take into account the shadow registers on the stack
            # (Microsoft 64bit stdcall ABI)
            # => Skip the first 4 stack parameters
            args.append(self.get_stack_arg(4 + i))
        return ret_ad, args

    def func_prepare_stdcall(self, ret_addr, *args):
        """Load register args, push extra args right-to-left, push @ret_addr."""
        args_regs = self.args_regs_stdcall
        for i in range(min(len(args), len(args_regs))):
            setattr(self.cpu, args_regs[i], args[i])
        remaining_args = args[len(args_regs):]
        for arg in reversed(remaining_args):
            self.push_uint64_t(arg)
        self.push_uint64_t(ret_addr)

    def func_ret_stdcall(self, ret_addr, ret_value=None):
        """Emulate a return: jump to @ret_addr, result (if any) in RAX."""
        self.pc = self.cpu.RIP = ret_addr
        if ret_value is not None:
            self.cpu.RAX = ret_value
        return True

    # cdecl (same register convention in 64-bit mode)
    func_args_cdecl = func_args_stdcall
    func_ret_cdecl = func_ret_stdcall
    func_prepare_cdecl = func_prepare_stdcall

    # System V

    def get_arg_n_systemv(self, index):
        """Return the @index-th System V argument (6 registers, then stack)."""
        args_regs = self.args_regs_systemv
        if index < len(args_regs):
            return getattr(self.cpu, args_regs[index])
        return self.get_stack_arg(index - len(args_regs))

    @named_arguments
    def func_args_systemv(self, n_args):
        """Return (return address, arguments) per the System V AMD64 ABI."""
        ret_ad = self.pop_uint64_t()
        args = [self.get_arg_n_systemv(index) for index in range(n_args)]
        return ret_ad, args

    func_ret_systemv = func_ret_cdecl

    def func_prepare_systemv(self, ret_addr, *args):
        """Load register args and push the rest for a System V call.

        NOTE(review): @ret_addr is pushed *before* the remaining stack
        arguments, so extra args end up below the return address —
        confirm this is intended for calls with more than 6 arguments.
        """
        args_regs = self.args_regs_systemv
        self.push_uint64_t(ret_addr)
        for i in range(min(len(args), len(args_regs))):
            setattr(self.cpu, args_regs[i], args[i])
        remaining_args = args[len(args_regs):]
        for arg in reversed(remaining_args):
            self.push_uint64_t(arg)

    def syscall_args_systemv(self, n_args):
        # x86_64 Linux syscall args: rdi rsi rdx r10 r8 r9
        args = [self.cpu.RDI, self.cpu.RSI, self.cpu.RDX, self.cpu.R10,
                self.cpu.R8, self.cpu.R9][:n_args]
        return args

    def syscall_ret_systemv(self, value):
        """Store the syscall return @value in RAX."""
        self.cpu.RAX = value
diff --git a/src/miasm/arch/x86/lifter_model_call.py b/src/miasm/arch/x86/lifter_model_call.py
new file mode 100644
index 00000000..e75f8c69
--- /dev/null
+++ b/src/miasm/arch/x86/lifter_model_call.py
@@ -0,0 +1,80 @@
+#-*- coding:utf-8 -*-
+
+from miasm.expression.expression import ExprAssign, ExprOp
+from miasm.ir.ir import AssignBlock
+from miasm.ir.analysis import LifterModelCall
+from miasm.arch.x86.sem import Lifter_X86_16, Lifter_X86_32, Lifter_X86_64
+
+
class LifterModelCall_x86_16(Lifter_X86_16, LifterModelCall):
    """Lifter with call modelization for 16-bit x86; return value in AX."""

    def __init__(self, loc_db):
        Lifter_X86_16.__init__(self, loc_db)
        self.ret_reg = self.arch.regs.AX

    def get_out_regs(self, _):
        # Registers considered live-out of a function: return reg + stack pointer
        return set([self.ret_reg, self.sp])
+
class LifterModelCall_x86_32(Lifter_X86_32, LifterModelCall_x86_16):
    """Lifter with call modelization for 32-bit x86 (ILP32 type sizes)."""

    def __init__(self, loc_db):
        Lifter_X86_32.__init__(self, loc_db)
        self.ret_reg = self.arch.regs.EAX

    def sizeof_char(self):
        """Size of C ``char`` in bits."""
        return 8

    def sizeof_short(self):
        """Size of C ``short`` in bits."""
        return 16

    def sizeof_int(self):
        """Size of C ``int`` in bits."""
        return 32

    def sizeof_long(self):
        """Size of C ``long`` in bits."""
        return 32

    def sizeof_pointer(self):
        """Size of a C pointer in bits."""
        return 32
+
+
class LifterModelCall_x86_64(Lifter_X86_64, LifterModelCall_x86_16):
    """Lifter with call modelization for 64-bit x86 (LP64 type sizes)."""

    def __init__(self, loc_db):
        Lifter_X86_64.__init__(self, loc_db)
        self.ret_reg = self.arch.regs.RAX

    def call_effects(self, ad, instr):
        """Model a call to @ad as an abstract operator.

        The result (in the return register) depends on the target, the
        stack pointer and the RCX/RDX/R8/R9 register arguments; the stack
        pointer is updated through a dedicated operator.
        """
        call_assignblk = AssignBlock(
            [
                ExprAssign(
                    self.ret_reg,
                    ExprOp(
                        'call_func_ret',
                        ad,
                        self.sp,
                        self.arch.regs.RCX,
                        self.arch.regs.RDX,
                        self.arch.regs.R8,
                        self.arch.regs.R9,
                    )
                ),
                ExprAssign(self.sp, ExprOp('call_func_stack', ad, self.sp)),
            ],
            instr
        )
        return [call_assignblk], []

    def sizeof_char(self):
        """Size of C ``char`` in bits."""
        return 8

    def sizeof_short(self):
        """Size of C ``short`` in bits."""
        return 16

    def sizeof_int(self):
        """Size of C ``int`` in bits."""
        return 32

    def sizeof_long(self):
        """Size of C ``long`` in bits."""
        return 64

    def sizeof_pointer(self):
        """Size of a C pointer in bits."""
        return 64
diff --git a/src/miasm/arch/x86/regs.py b/src/miasm/arch/x86/regs.py
new file mode 100644
index 00000000..dc0b9264
--- /dev/null
+++ b/src/miasm/arch/x86/regs.py
@@ -0,0 +1,454 @@
+from builtins import range
+from miasm.expression.expression import ExprId
+from miasm.core.cpu import reg_info
+
+
# Instruction pointers, one per operating mode
IP = ExprId('IP', 16)
EIP = ExprId('EIP', 32)
RIP = ExprId('RIP', 64)
# Pseudo-registers used by the emulation engine to signal exceptions/interrupts
exception_flags = ExprId('exception_flags', 32)
interrupt_num = ExprId('interrupt_num', 8)
+
+# GP
+
+
# 8-bit registers without a REX prefix (high-byte forms AH..BH are encodable)
regs08_str = ["AL", "CL", "DL", "BL", "AH", "CH", "DH", "BH"] + \
    ["R%dB" % (i + 8) for i in range(8)]
regs08_expr = [ExprId(x, 8) for x in regs08_str]

# 8-bit registers with a REX prefix (SPL..DIL replace the high-byte forms)
regs08_64_str = ["AL", "CL", "DL", "BL", "SPL", "BPL", "SIL", "DIL"] + \
    ["R%dB" % (i + 8) for i in range(8)]
regs08_64_expr = [ExprId(x, 8) for x in regs08_64_str]


regs16_str = ["AX", "CX", "DX", "BX", "SP", "BP", "SI", "DI"] + \
    ["R%dW" % (i + 8) for i in range(8)]
regs16_expr = [ExprId(x, 16) for x in regs16_str]

regs32_str = ["EAX", "ECX", "EDX", "EBX", "ESP", "EBP", "ESI", "EDI"] + \
    ["R%dD" % (i + 8) for i in range(8)] + ["EIP"]
regs32_expr = [ExprId(x, 32) for x in regs32_str]

regs64_str = ["RAX", "RCX", "RDX", "RBX", "RSP", "RBP", "RSI", "RDI",
              "R8", "R9", "R10", "R11", "R12", "R13", "R14", "R15",
              "RIP"]
regs64_expr = [ExprId(x, 64) for x in regs64_str]


regs_xmm_str = ["XMM%d" % i for i in range(16)]
regs_xmm_expr = [ExprId(x, 128) for x in regs_xmm_str]

# NOTE(review): 16 MM names are declared although x86 only has MM0-MM7 —
# presumably for encoding-table convenience; confirm.
regs_mm_str = ["MM%d" % i for i in range(16)]
regs_mm_expr = [ExprId(x, 64) for x in regs_mm_str]

# MPX bound registers
regs_bnd_str = ["BND%d" % i for i in range(4)]
regs_bnd_expr = [ExprId(x, 128) for x in regs_bnd_str]

# reg_info groups used by the instruction encoder/decoder
gpregs08 = reg_info(regs08_str, regs08_expr)
gpregs08_64 = reg_info(regs08_64_str, regs08_64_expr)
gpregs16 = reg_info(regs16_str, regs16_expr)
gpregs32 = reg_info(regs32_str, regs32_expr)
gpregs64 = reg_info(regs64_str, regs64_expr)

gpregs_xmm = reg_info(regs_xmm_str, regs_xmm_expr)
gpregs_mm = reg_info(regs_mm_str, regs_mm_expr)
gpregs_bnd = reg_info(regs_bnd_str, regs_bnd_expr)
+
# Single-register infos for instructions with implicit A/C/D operands
r08_eax = reg_info([regs08_str[0]], [regs08_expr[0]])
r16_eax = reg_info([regs16_str[0]], [regs16_expr[0]])
r32_eax = reg_info([regs32_str[0]], [regs32_expr[0]])
r64_eax = reg_info([regs64_str[0]], [regs64_expr[0]])

r08_ecx = reg_info([regs08_str[1]], [regs08_expr[1]])

# Accumulator / data register across every operand size
r_eax_all = reg_info(
    [regs08_str[0], regs16_str[0], regs32_str[0], regs64_str[0]],
    [regs08_expr[0], regs16_expr[0], regs32_expr[0], regs64_expr[0]])
r_edx_all = reg_info(
    [regs08_str[2], regs16_str[2], regs32_str[2], regs64_str[2]],
    [regs08_expr[2], regs16_expr[2], regs32_expr[2], regs64_expr[2]])

r16_edx = reg_info([regs16_str[2]], [regs16_expr[2]])


# Segment selector registers
selectr_str = ["ES", "CS", "SS", "DS", "FS", "GS"]
selectr_expr = [ExprId(x, 16) for x in selectr_str]
segmreg = reg_info(selectr_str, selectr_expr)

# Control registers
crregs32_str = ["CR%d" % i for i in range(8)]
crregs32_expr = [ExprId(x, 32) for x in crregs32_str]
crregs = reg_info(crregs32_str, crregs32_expr)


# Debug registers
drregs32_str = ["DR%d" % i for i in range(8)]
drregs32_expr = [ExprId(x, 32) for x in drregs32_str]
drregs = reg_info(drregs32_str, drregs32_expr)


# x87 FPU stack registers ST(0)..ST(7)
fltregs32_str = ["ST(%d)" % i for i in range(8)]
fltregs32_expr = [ExprId(x, 64) for x in fltregs32_str]
fltregs = reg_info(fltregs32_str, fltregs32_expr)

r_st_all = reg_info(['ST'],
                    [ExprId('ST', 64)])

r_cs_all = reg_info(['CS'],
                    [ExprId('CS', 16)])
r_ds_all = reg_info(['DS'],
                    [ExprId('DS', 16)])
r_es_all = reg_info(['ES'],
                    [ExprId('ES', 16)])
r_ss_all = reg_info(['SS'],
                    [ExprId('SS', 16)])
r_fs_all = reg_info(['FS'],
                    [ExprId('FS', 16)])
r_gs_all = reg_info(['GS'],
                    [ExprId('GS', 16)])
+
+
# Convenient module-level aliases into the register tables above

# 8-bit registers
AL = regs08_expr[0]
CL = regs08_expr[1]
DL = regs08_expr[2]
BL = regs08_expr[3]
AH = regs08_expr[4]
CH = regs08_expr[5]
DH = regs08_expr[6]
BH = regs08_expr[7]
R8B = regs08_expr[8]
R9B = regs08_expr[9]
R10B = regs08_expr[10]
R11B = regs08_expr[11]
R12B = regs08_expr[12]
R13B = regs08_expr[13]
R14B = regs08_expr[14]
R15B = regs08_expr[15]

# REX-only low-byte registers
SPL = regs08_64_expr[4]
BPL = regs08_64_expr[5]
SIL = regs08_64_expr[6]
DIL = regs08_64_expr[7]


# 16-bit registers
AX = regs16_expr[0]
CX = regs16_expr[1]
DX = regs16_expr[2]
BX = regs16_expr[3]
SP = regs16_expr[4]
BP = regs16_expr[5]
SI = regs16_expr[6]
DI = regs16_expr[7]
R8W = regs16_expr[8]
R9W = regs16_expr[9]
R10W = regs16_expr[10]
R11W = regs16_expr[11]
R12W = regs16_expr[12]
R13W = regs16_expr[13]
R14W = regs16_expr[14]
R15W = regs16_expr[15]


# 32-bit registers
EAX = regs32_expr[0]
ECX = regs32_expr[1]
EDX = regs32_expr[2]
EBX = regs32_expr[3]
ESP = regs32_expr[4]
EBP = regs32_expr[5]
ESI = regs32_expr[6]
EDI = regs32_expr[7]
R8D = regs32_expr[8]
R9D = regs32_expr[9]
R10D = regs32_expr[10]
R11D = regs32_expr[11]
R12D = regs32_expr[12]
R13D = regs32_expr[13]
R14D = regs32_expr[14]
R15D = regs32_expr[15]


# 64-bit registers
RAX = regs64_expr[0]
RCX = regs64_expr[1]
RDX = regs64_expr[2]
RBX = regs64_expr[3]
RSP = regs64_expr[4]
RBP = regs64_expr[5]
RSI = regs64_expr[6]
RDI = regs64_expr[7]
R8 = regs64_expr[8]
R9 = regs64_expr[9]
R10 = regs64_expr[10]
R11 = regs64_expr[11]
R12 = regs64_expr[12]
R13 = regs64_expr[13]
R14 = regs64_expr[14]
R15 = regs64_expr[15]
+
+
# Name strings for the EFLAGS bits ('i_f'/'i_d' avoid clashing with Python's
# `if` keyword and the `id` builtin)
reg_zf = 'zf'
reg_nf = 'nf'
reg_pf = 'pf'
reg_of = 'of'
reg_cf = 'cf'
reg_tf = 'tf'
reg_if = 'i_f'
reg_df = 'df'
reg_af = 'af'
reg_iopl = 'iopl_f'
reg_nt = 'nt'
reg_rf = 'rf'
reg_vm = 'vm'
reg_ac = 'ac'
reg_vif = 'vif'
reg_vip = 'vip'
reg_id = 'i_d'


# Segment register names
reg_es = "ES"
reg_cs = "CS"
reg_ss = "SS"
reg_ds = "DS"
reg_fs = "FS"
reg_gs = "GS"

# Debug register names
reg_dr0 = 'DR0'
reg_dr1 = 'DR1'
reg_dr2 = 'DR2'
reg_dr3 = 'DR3'
reg_dr4 = 'DR4'
reg_dr5 = 'DR5'
reg_dr6 = 'DR6'
reg_dr7 = 'DR7'

# Control register names
reg_cr0 = 'CR0'
reg_cr1 = 'CR1'
reg_cr2 = 'CR2'
reg_cr3 = 'CR3'
reg_cr4 = 'CR4'
reg_cr5 = 'CR5'
reg_cr6 = 'CR6'
reg_cr7 = 'CR7'

# MMX register names
reg_mm0 = 'MM0'
reg_mm1 = 'MM1'
reg_mm2 = 'MM2'
reg_mm3 = 'MM3'
reg_mm4 = 'MM4'
reg_mm5 = 'MM5'
reg_mm6 = 'MM6'
reg_mm7 = 'MM7'

# Time stamp counter (RDTSC)
reg_tsc = "tsc"

# x87 FPU status/control state names
reg_float_c0 = 'float_c0'
reg_float_c1 = 'float_c1'
reg_float_c2 = 'float_c2'
reg_float_c3 = 'float_c3'
reg_float_stack_ptr = "float_stack_ptr"
reg_float_control = 'reg_float_control'
reg_float_eip = 'reg_float_eip'
reg_float_cs = 'reg_float_cs'
reg_float_address = 'reg_float_address'
reg_float_ds = 'reg_float_ds'
+
# Debug registers
dr0 = ExprId(reg_dr0, 32)
dr1 = ExprId(reg_dr1, 32)
dr2 = ExprId(reg_dr2, 32)
dr3 = ExprId(reg_dr3, 32)
dr4 = ExprId(reg_dr4, 32)
dr5 = ExprId(reg_dr5, 32)
dr6 = ExprId(reg_dr6, 32)
dr7 = ExprId(reg_dr7, 32)

# Control registers
cr0 = ExprId(reg_cr0, 32)
cr1 = ExprId(reg_cr1, 32)
cr2 = ExprId(reg_cr2, 32)
cr3 = ExprId(reg_cr3, 32)
cr4 = ExprId(reg_cr4, 32)
cr5 = ExprId(reg_cr5, 32)
cr6 = ExprId(reg_cr6, 32)
cr7 = ExprId(reg_cr7, 32)

# MMX registers
mm0 = ExprId(reg_mm0, 64)
mm1 = ExprId(reg_mm1, 64)
mm2 = ExprId(reg_mm2, 64)
mm3 = ExprId(reg_mm3, 64)
mm4 = ExprId(reg_mm4, 64)
mm5 = ExprId(reg_mm5, 64)
mm6 = ExprId(reg_mm6, 64)
mm7 = ExprId(reg_mm7, 64)

# SSE registers
XMM0 = regs_xmm_expr[0]
XMM1 = regs_xmm_expr[1]
XMM2 = regs_xmm_expr[2]
XMM3 = regs_xmm_expr[3]
XMM4 = regs_xmm_expr[4]
XMM5 = regs_xmm_expr[5]
XMM6 = regs_xmm_expr[6]
XMM7 = regs_xmm_expr[7]
XMM8 = regs_xmm_expr[8]
XMM9 = regs_xmm_expr[9]
XMM10 = regs_xmm_expr[10]
XMM11 = regs_xmm_expr[11]
XMM12 = regs_xmm_expr[12]
XMM13 = regs_xmm_expr[13]
XMM14 = regs_xmm_expr[14]
XMM15 = regs_xmm_expr[15]

# EFLAGS bits, modeled as individual 1-bit registers (iopl is 2 bits)
# tmp1= ExprId(reg_tmp1)
zf = ExprId(reg_zf, size=1)
nf = ExprId(reg_nf, size=1)
pf = ExprId(reg_pf, size=1)
of = ExprId(reg_of, size=1)
cf = ExprId(reg_cf, size=1)
tf = ExprId(reg_tf, size=1)
i_f = ExprId(reg_if, size=1)
df = ExprId(reg_df, size=1)
af = ExprId(reg_af, size=1)
iopl = ExprId(reg_iopl, size=2)
nt = ExprId(reg_nt, size=1)
rf = ExprId(reg_rf, size=1)
vm = ExprId(reg_vm, size=1)
ac = ExprId(reg_ac, size=1)
vif = ExprId(reg_vif, size=1)
vip = ExprId(reg_vip, size=1)
i_d = ExprId(reg_id, size=1)

# Segment selectors
ES = ExprId(reg_es, size=16)
CS = ExprId(reg_cs, size=16)
SS = ExprId(reg_ss, size=16)
DS = ExprId(reg_ds, size=16)
FS = ExprId(reg_fs, size=16)
GS = ExprId(reg_gs, size=16)

tsc = ExprId(reg_tsc, size=64)

# x87 FPU status/control state
float_c0 = ExprId(reg_float_c0, size=1)
float_c1 = ExprId(reg_float_c1, size=1)
float_c2 = ExprId(reg_float_c2, size=1)
float_c3 = ExprId(reg_float_c3, size=1)
float_stack_ptr = ExprId(reg_float_stack_ptr, size=3)
float_control = ExprId(reg_float_control, 16)
float_eip = ExprId(reg_float_eip, 32)
float_cs = ExprId(reg_float_cs, size=16)
float_address = ExprId(reg_float_address, 32)
float_ds = ExprId(reg_float_ds, size=16)

# Flat (non-stack) views of the x87 registers, as 64-bit doubles
float_st0 = ExprId("float_st0", 64)
float_st1 = ExprId("float_st1", 64)
float_st2 = ExprId("float_st2", 64)
float_st3 = ExprId("float_st3", 64)
float_st4 = ExprId("float_st4", 64)
float_st5 = ExprId("float_st5", 64)
float_st6 = ExprId("float_st6", 64)
float_st7 = ExprId("float_st7", 64)


float_list = [float_st0, float_st1, float_st2, float_st3,
              float_st4, float_st5, float_st6, float_st7]

# Substitution map: ST(i) stack registers -> flat float_st* registers
float_replace = {fltregs32_expr[i]: float_list[i] for i in range(8)}
float_replace[r_st_all.expr[0]] = float_st0
+
+
# Symbolic "initial value" registers, used as entry-state placeholders
EAX_init = ExprId('EAX_init', 32)
EBX_init = ExprId('EBX_init', 32)
ECX_init = ExprId('ECX_init', 32)
EDX_init = ExprId('EDX_init', 32)
ESI_init = ExprId('ESI_init', 32)
EDI_init = ExprId('EDI_init', 32)
ESP_init = ExprId('ESP_init', 32)
EBP_init = ExprId('EBP_init', 32)


RAX_init = ExprId('RAX_init', 64)
RBX_init = ExprId('RBX_init', 64)
RCX_init = ExprId('RCX_init', 64)
RDX_init = ExprId('RDX_init', 64)
RSI_init = ExprId('RSI_init', 64)
RDI_init = ExprId('RDI_init', 64)
RSP_init = ExprId('RSP_init', 64)
RBP_init = ExprId('RBP_init', 64)


# Every register id, including size aliases (AL/AX/EAX/RAX...)
all_regs_ids = [
    AL, CL, DL, BL, AH, CH, DH, BH,
    R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B,
    SPL, BPL, SIL, DIL,
    AX, CX, DX, BX, SP, BP, SI, DI,
    R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W,
    IP,
    EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
    R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D,
    EIP,

    RAX, RBX, RCX, RDX, RSP, RBP, RIP, RSI, RDI,
    R8, R9, R10, R11, R12, R13, R14, R15,
    zf, nf, pf, of, cf, af, df,
    tf, i_f, iopl, nt, rf, vm, ac, vif, vip, i_d,
    float_control, float_eip, float_cs, float_address, float_ds,
    tsc,
    ES, CS, SS, DS, FS, GS,
    float_st0, float_st1, float_st2, float_st3,
    float_st4, float_st5, float_st6, float_st7,
    float_c0, float_c1, float_c2, float_c3,
    cr0, cr3,
    dr0, dr1, dr2, dr3, dr4, dr5, dr6, dr7,
    float_stack_ptr,
    mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7,

    XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
    XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,


    exception_flags, interrupt_num,
] + fltregs32_expr

# Canonical (widest-form) registers only: sub-register aliases are excluded
all_regs_ids_no_alias = [
    RAX, RBX, RCX, RDX, RSP, RBP, RIP, RSI, RDI,
    R8, R9, R10, R11, R12, R13, R14, R15,
    zf, nf, pf, of, cf, af, df,
    tf, i_f, iopl, nt, rf, vm, ac, vif, vip, i_d,
    float_control, float_eip, float_cs, float_address, float_ds,
    tsc,
    ES, CS, SS, DS, FS, GS,
    float_st0, float_st1, float_st2, float_st3,
    float_st4, float_st5, float_st6, float_st7,
    float_c0, float_c1, float_c2, float_c3,
    cr0, cr3,
    dr0, dr1, dr2, dr3, dr4, dr5, dr6, dr7,
    float_stack_ptr,
    mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7,
    XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
    XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,


    exception_flags, interrupt_num,
] + fltregs32_expr

# Per-mode register sets: GP regs of that width + everything from zf onward
# (flags, FPU, system regs) + the mode's instruction pointer
attrib_to_regs = {
    16: regs16_expr + all_regs_ids_no_alias[all_regs_ids_no_alias.index(zf):] + [IP],
    32: regs32_expr + all_regs_ids_no_alias[all_regs_ids_no_alias.index(zf):] + [EIP],
    64: all_regs_ids_no_alias,
}

all_regs_ids_byname = dict([(x.name, x) for x in all_regs_ids])

# One "<name>_init" placeholder per register, in the same order
all_regs_ids_init = [ExprId("%s_init" % x.name, x.size) for x in all_regs_ids]

regs_init = {}
for i, r in enumerate(all_regs_ids):
    regs_init[r] = all_regs_ids_init[i]

regs_flt_expr = [float_st0, float_st1, float_st2, float_st3,
                 float_st4, float_st5, float_st6, float_st7,
                 ]

# Mode-indexed views of the main registers (attrib -> register of that width)
mRAX = {16: AX, 32: EAX, 64: RAX}
mRBX = {16: BX, 32: EBX, 64: RBX}
mRCX = {16: CX, 32: ECX, 64: RCX}
mRDX = {16: DX, 32: EDX, 64: RDX}
mRSI = {16: SI, 32: ESI, 64: RSI}
mRDI = {16: DI, 32: EDI, 64: RDI}
mRBP = {16: BP, 32: EBP, 64: RBP}
mRSP = {16: SP, 32: ESP, 64: RSP}
mRIP = {16: IP, 32: EIP, 64: RIP}
diff --git a/src/miasm/arch/x86/sem.py b/src/miasm/arch/x86/sem.py
new file mode 100644
index 00000000..d19290b6
--- /dev/null
+++ b/src/miasm/arch/x86/sem.py
@@ -0,0 +1,6065 @@
+#
+# Copyright (C) 2011 EADS France, Fabrice Desclaux <fabrice.desclaux@eads.net>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+from builtins import range
+
+from future.utils import viewitems
+
+import logging
+import miasm.expression.expression as m2_expr
+from miasm.expression.simplifications import expr_simp
+from miasm.arch.x86.regs import *
+from miasm.arch.x86.arch import mn_x86, repeat_mn, replace_regs, is_mem_segm
+from miasm.ir.ir import Lifter, IRBlock, AssignBlock
+from miasm.core.sembuilder import SemBuilder
+from miasm.jitter.csts import EXCEPT_DIV_BY_ZERO, EXCEPT_ILLEGAL_INSN, \
+    EXCEPT_PRIV_INSN, EXCEPT_SOFT_BP, EXCEPT_INT_XX, EXCEPT_INT_1, \
+    EXCEPT_SYSCALL
+import math
+import struct
+
+
# Module logger for the x86 semantics, with its own console handler.
LOG_X86_SEM = logging.getLogger("x86_sem")
CONSOLE_HANDLER = logging.StreamHandler()
CONSOLE_HANDLER.setFormatter(logging.Formatter("[%(levelname)-8s]: %(message)s"))
LOG_X86_SEM.addHandler(CONSOLE_HANDLER)
LOG_X86_SEM.setLevel(logging.WARNING)


# SemBuilder context
# Names made visible inside @sbuild.parse-decorated semantic functions.
ctx = {'mRAX': mRAX,
       'mRBX': mRBX,
       'mRCX': mRCX,
       'mRDX': mRDX,
       'zf': zf,
       }
sbuild = SemBuilder(ctx)
+
+
+
+"""
+http://www.emulators.com/docs/nx11_flags.htm
+
+CF(A+B) = (((A XOR B) XOR D) < 0) XOR (((A XOR D) AND NOT (A XOR B)) < 0)
+CF(A-B) = (((A XOR B) XOR D) < 0) XOR (((A XOR D) AND (A XOR B)) < 0)
+
+OF(A+B) = ((A XOR D) AND NOT (A XOR B)) < 0
+OF(A-B) = ((A XOR D) AND (A XOR B)) < 0
+"""
+
+
+# XXX TODO make default check against 0 or not 0 (same eq as in C)
def update_flag_zf_eq(a, b):
    """zf <- FLAG_EQ_CMP(@a, @b) (symbolic a == b)."""
    assignment = m2_expr.ExprAssign(zf, m2_expr.ExprOp("FLAG_EQ_CMP", a, b))
    return [assignment]


def update_flag_zf(a):
    """zf <- 1 when @a is zero, 0 otherwise."""
    zero = m2_expr.ExprInt(0, zf.size)
    one = m2_expr.ExprInt(1, zf.size)
    return [m2_expr.ExprAssign(zf, m2_expr.ExprCond(a, zero, one))]


def update_flag_nf(arg):
    """nf <- sign of @arg (expressed as FLAG_SIGN_SUB against 0)."""
    zero = m2_expr.ExprInt(0, arg.size)
    return [m2_expr.ExprAssign(nf, m2_expr.ExprOp("FLAG_SIGN_SUB", arg, zero))]


def update_flag_pf(a):
    """pf <- parity of the low byte of @a."""
    low_byte = a & m2_expr.ExprInt(0xFF, a.size)
    return [m2_expr.ExprAssign(pf, m2_expr.ExprOp('parity', low_byte))]


def update_flag_af(op1, op2, res):
    """af <- carry out of bit 3 (nibble carry) of @op1 op @op2 -> @res."""
    nibble_carry = (op1 ^ op2 ^ res)[4:5]
    return [m2_expr.ExprAssign(af, nibble_carry)]


def update_flag_znp(a):
    """Update zf, nf and pf from @a."""
    return update_flag_zf(a) + update_flag_nf(a) + update_flag_pf(a)


def update_flag_np(result):
    """Update nf and pf from @result."""
    return update_flag_nf(result) + update_flag_pf(result)


def null_flag_co():
    """Clear both of and cf."""
    return [
        m2_expr.ExprAssign(of, m2_expr.ExprInt(0, of.size)),
        m2_expr.ExprAssign(cf, m2_expr.ExprInt(0, cf.size)),
    ]


def update_flag_arith(a):
    """Update the common arithmetic flags (zf, nf, pf) from @a."""
    return update_flag_znp(a)


def update_flag_zfaddwc_eq(arg1, arg2, arg3):
    """zf for an add-with-carry: FLAG_EQ_ADDWC(@arg1, @arg2, @arg3)."""
    flag = m2_expr.ExprOp("FLAG_EQ_ADDWC", arg1, arg2, arg3)
    return [m2_expr.ExprAssign(zf, flag)]


def update_flag_zfsubwc_eq(arg1, arg2, arg3):
    """zf for a sub-with-borrow: FLAG_EQ_SUBWC(@arg1, @arg2, @arg3)."""
    flag = m2_expr.ExprOp("FLAG_EQ_SUBWC", arg1, arg2, arg3)
    return [m2_expr.ExprAssign(zf, flag)]


def update_flag_arith_add_znp(arg1, arg2):
    """
    Compute znp flags for (arg1 + arg2)
    """
    # zf/nf are expressed through the subtraction against -arg2
    flags = update_flag_zf_eq(arg1, -arg2)
    flags.append(
        m2_expr.ExprAssign(nf, m2_expr.ExprOp("FLAG_SIGN_SUB", arg1, -arg2))
    )
    flags += update_flag_pf(arg1 + arg2)
    return flags


def update_flag_arith_addwc_znp(arg1, arg2, arg3):
    """
    Compute znp flags for (arg1 + arg2 + cf)
    """
    flags = update_flag_zfaddwc_eq(arg1, arg2, arg3)
    flags.append(
        m2_expr.ExprAssign(nf, m2_expr.ExprOp("FLAG_SIGN_ADDWC", arg1, arg2, arg3))
    )
    flags += update_flag_pf(arg1 + arg2 + arg3.zeroExtend(arg2.size))
    return flags


def update_flag_arith_sub_znp(arg1, arg2):
    """
    Compute znp flags for (arg1 - arg2)
    """
    flags = update_flag_zf_eq(arg1, arg2)
    flags.append(
        m2_expr.ExprAssign(nf, m2_expr.ExprOp("FLAG_SIGN_SUB", arg1, arg2))
    )
    flags += update_flag_pf(arg1 - arg2)
    return flags


def update_flag_arith_subwc_znp(arg1, arg2, arg3):
    """
    Compute znp flags for (arg1 - (arg2 + cf))
    """
    flags = update_flag_zfsubwc_eq(arg1, arg2, arg3)
    flags.append(
        m2_expr.ExprAssign(nf, m2_expr.ExprOp("FLAG_SIGN_SUBWC", arg1, arg2, arg3))
    )
    flags += update_flag_pf(arg1 - (arg2 + arg3.zeroExtend(arg2.size)))
    return flags
+
+
def check_ops_msb(a, b, c):
    """Ensure the three operand sizes are non-zero and identical.

    Raises ValueError when any size is falsy or the sizes differ.
    """
    if not (a and b and c and a == b == c):
        raise ValueError('bad ops size %s %s %s' % (a, b, c))
+
+
def arith_flag(a, b, c):
    """Check that @a, @b, @c share one size, then return their msb exprs."""
    check_ops_msb(a.size, b.size, c.size)
    return a.msb(), b.msb(), c.msb()
+
+# checked: ok for adc add because b & c before +cf
+
+
def update_flag_add_cf(op1, op2, res):
    """Compute cf for @res = @op1 + @op2."""
    flag = m2_expr.ExprOp("FLAG_ADD_CF", op1, op2)
    return [m2_expr.ExprAssign(cf, flag)]


def update_flag_add_of(op1, op2, res):
    """Compute of for @res = @op1 + @op2."""
    flag = m2_expr.ExprOp("FLAG_ADD_OF", op1, op2)
    return [m2_expr.ExprAssign(of, flag)]


# checked: ok for sbb add because b & c before +cf
def update_flag_sub_cf(op1, op2, res):
    """Compute cf for @res = @op1 - @op2."""
    flag = m2_expr.ExprOp("FLAG_SUB_CF", op1, op2)
    return [m2_expr.ExprAssign(cf, flag)]


def update_flag_sub_of(op1, op2, res):
    """Compute of for @res = @op1 - @op2."""
    flag = m2_expr.ExprOp("FLAG_SUB_OF", op1, op2)
    return [m2_expr.ExprAssign(of, flag)]
+
+
def update_flag_addwc_cf(op1, op2, op3):
    """Compute cf for @op1 + @op2 + @op3 (add with carry)."""
    flag = m2_expr.ExprOp("FLAG_ADDWC_CF", op1, op2, op3)
    return [m2_expr.ExprAssign(cf, flag)]


def update_flag_addwc_of(op1, op2, op3):
    """Compute of for @op1 + @op2 + @op3 (add with carry)."""
    flag = m2_expr.ExprOp("FLAG_ADDWC_OF", op1, op2, op3)
    return [m2_expr.ExprAssign(of, flag)]


def update_flag_subwc_cf(op1, op2, op3):
    """Compute cf for @op1 - (@op2 + @op3) (sub with borrow)."""
    flag = m2_expr.ExprOp("FLAG_SUBWC_CF", op1, op2, op3)
    return [m2_expr.ExprAssign(cf, flag)]


def update_flag_subwc_of(op1, op2, op3):
    """Compute of for @op1 - (@op2 + @op3) (sub with borrow)."""
    flag = m2_expr.ExprOp("FLAG_SUBWC_OF", op1, op2, op3)
    return [m2_expr.ExprAssign(of, flag)]
+
+
+
+
def update_flag_arith_add_co(x, y, z):
    """cf and of for the addition @x + @y = @z."""
    return update_flag_add_cf(x, y, z) + update_flag_add_of(x, y, z)


def update_flag_arith_sub_co(x, y, z):
    """cf and of for the subtraction @x - @y = @z."""
    return update_flag_sub_cf(x, y, z) + update_flag_sub_of(x, y, z)


def update_flag_arith_addwc_co(arg1, arg2, arg3):
    """cf and of for @arg1 + @arg2 + carry @arg3."""
    return (update_flag_addwc_cf(arg1, arg2, arg3) +
            update_flag_addwc_of(arg1, arg2, arg3))


def update_flag_arith_subwc_co(arg1, arg2, arg3):
    """cf and of for @arg1 - (@arg2 + borrow @arg3)."""
    return (update_flag_subwc_cf(arg1, arg2, arg3) +
            update_flag_subwc_of(arg1, arg2, arg3))
+
+
+
def set_float_cs_eip(instr):
    """Record the current CS:EIP of @instr into the x87 environment."""
    # XXX TODO check float updt
    return [
        m2_expr.ExprAssign(
            float_eip, m2_expr.ExprInt(instr.offset, float_eip.size)
        ),
        m2_expr.ExprAssign(float_cs, CS),
    ]
+
+
def mode2addrsize(mode):
    """Return the address size (in bits) used by a given @mode.

    16-bit and 32-bit modes both use 32-bit addressing here; 64-bit mode
    uses 64-bit addressing.

    @mode: instruction mode, one of 16, 32 or 64
    Raises RuntimeError for any other mode.
    """
    mode2size = {16: 32, 32: 32, 64: 64}
    if mode not in mode2size:
        # Bug fix: the message was previously passed as two arguments
        # ("Unknown size %s", mode), so it was never interpolated.
        raise RuntimeError("Unknown size %s" % mode)
    return mode2size[mode]
+
+
def instr2addrsize(instr):
    """Address size (bits) for @instr, derived from its mode."""
    return mode2addrsize(instr.mode)


def expraddr(mode, ptr):
    """Zero-extend @ptr to the address width of @mode."""
    return ptr.zeroExtend(mode2addrsize(mode))


def fix_mem_args_size(instr, *args):
    """Return @args with every memory pointer resized for @instr's mode."""
    fixed = []
    for arg in args:
        if not arg.is_mem():
            fixed.append(arg)
            continue
        ptr = arg.ptr
        if ptr.is_op('segm'):
            # Keep the segment selector, resize only the offset part
            ptr = m2_expr.ExprOp(
                'segm', ptr.args[0], expraddr(instr.mode, ptr.args[1])
            )
        else:
            ptr = expraddr(instr.mode, ptr)
        fixed.append(m2_expr.ExprMem(ptr, arg.size))
    return fixed


def mem2double(instr, arg):
    """
    Add float conversion if argument is an ExprMem
    @arg: argument to transform
    """
    if not isinstance(arg, m2_expr.ExprMem):
        return arg
    if arg.size > 64:
        # TODO: move to 80 bits
        arg = m2_expr.ExprMem(expraddr(instr.mode, arg.ptr), size=64)
    return m2_expr.ExprOp('sint_to_fp', arg.signExtend(64))


def float_implicit_st0(arg1, arg2):
    """
    Generate full float operators if one argument is implicit (float_st0)
    """
    if arg2 is None:
        # Single-operand form: the implicit destination is st0
        return float_st0, arg1
    return arg1, arg2
+
+
def gen_jcc(ir, instr, cond, dst, jmp_if):
    """
    Macro to generate jcc semantic
    @ir: ir instance
    @instr: instruction
    @cond: condition of the jcc
    @dst: the destination if jcc is taken
    @jmp_if: jump if/notif cond
    """

    e = []
    meip = mRIP[ir.IRDst.size]
    loc_next = ir.get_next_loc_key(instr)
    loc_next_expr = m2_expr.ExprLoc(loc_next, dst.size)

    # Order the two destinations so the ExprCond below selects @dst
    # exactly when @cond matches the @jmp_if polarity.
    if jmp_if:
        dstA, dstB = dst, loc_next_expr
    else:
        dstA, dstB = loc_next_expr, dst
    mn_dst = m2_expr.ExprCond(cond,
                              dstA.zeroExtend(ir.IRDst.size),
                              dstB.zeroExtend(ir.IRDst.size))
    # Update both the architectural instruction pointer and the IR
    # destination with the same conditional target.
    e.append(m2_expr.ExprAssign(meip, mn_dst))
    e.append(m2_expr.ExprAssign(ir.IRDst, mn_dst))
    return e, []
+
+
def gen_fcmov(ir, instr, cond, arg1, arg2, mov_if):
    """Generate fcmov
    @ir: ir instance
    @instr: instruction instance
    @cond: condition
    @mov_if: invert condition if False"""

    loc_do, loc_do_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_skip = ir.get_next_loc_key(instr)
    loc_skip_expr = m2_expr.ExprLoc(loc_skip, ir.IRDst.size)
    # Choose which branch of the ExprCond performs the move
    if mov_if:
        dstA, dstB = loc_do_expr, loc_skip_expr
    else:
        dstA, dstB = loc_skip_expr, loc_do_expr
    e = []
    # 'do' block: perform the float move then fall through to next instr
    e_do, extra_irs = [m2_expr.ExprAssign(arg1, arg2)], []
    e_do.append(m2_expr.ExprAssign(ir.IRDst, loc_skip_expr))
    e.append(m2_expr.ExprAssign(ir.IRDst, m2_expr.ExprCond(cond, dstA, dstB)))
    return e, [IRBlock(ir.loc_db, loc_do, [AssignBlock(e_do, instr)])]
+
+
def gen_cmov(ir, instr, cond, dst, src, mov_if):
    """Generate cmov
    @ir: ir instance
    @instr: instruction instance
    @cond: condition
    @mov_if: invert condition if False"""

    loc_do, loc_do_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_skip = ir.get_next_loc_key(instr)
    loc_skip_expr = m2_expr.ExprLoc(loc_skip, ir.IRDst.size)
    # Choose which branch of the ExprCond performs the move
    if mov_if:
        dstA, dstB = loc_do_expr, loc_skip_expr
    else:
        dstA, dstB = loc_skip_expr, loc_do_expr
    e = []
    if instr.mode == 64:
        # Force destination set in order to zero high bit orders
        # In 64 bit:
        # cmovz eax, ebx
        # if zf == 0 => high part of RAX is set to zero
        e.append(m2_expr.ExprAssign(dst, dst))
    # 'do' block: reuse the regular mov semantic then fall through
    e_do, extra_irs = mov(ir, instr, dst, src)
    e_do.append(m2_expr.ExprAssign(ir.IRDst, loc_skip_expr))
    e.append(m2_expr.ExprAssign(ir.IRDst, m2_expr.ExprCond(cond, dstA, dstB)))
    return e, [IRBlock(ir.loc_db, loc_do, [AssignBlock(e_do, instr)])]
+
+
def mov(_, instr, dst, src):
    """MOV semantics; resizes @src when a segment register is involved."""
    segment_regs = [ES, CS, SS, DS, FS, GS]
    if dst in segment_regs:
        # Moving into a selector only keeps the selector's width
        src = src[:dst.size]
    if src in segment_regs:
        # Moving a selector out zero-extends it to the destination width
        src = src.zeroExtend(dst.size)
    return [m2_expr.ExprAssign(dst, src)], []


def movq(_, instr, dst, src):
    """MOVQ semantics: zero-extend or truncate @src to fit @dst."""
    if dst.size >= src.size:
        value = src.zeroExtend(dst.size)
    else:
        value = src[:dst.size]
    return [m2_expr.ExprAssign(dst, value)], []
+
+
@sbuild.parse
def xchg(arg1, arg2):
    # SemBuilder assignments are parallel: both right-hand sides are read
    # before any write, so this swaps arg1 and arg2 without a temporary.
    arg1 = arg2
    arg2 = arg1
+
+
+
def movzx(_, instr, dst, src):
    """MOVZX: move with zero extension."""
    return [m2_expr.ExprAssign(dst, src.zeroExtend(dst.size))], []


def movsx(_, instr, dst, src):
    """MOVSX: move with sign extension."""
    return [m2_expr.ExprAssign(dst, src.signExtend(dst.size))], []


def lea(_, instr, dst, src):
    """LEA: load the effective address of @src into @dst."""
    ptr = src.ptr
    if is_mem_segm(src):
        # Do not use segmentation here
        ptr = ptr.args[1]
    if ptr.size > dst.size:
        ptr = ptr[:dst.size]
    return [m2_expr.ExprAssign(dst, ptr.zeroExtend(dst.size))], []
+
+
def add(_, instr, dst, src):
    """ADD: dst <- dst + src, updating zf/nf/pf, cf/of and af."""
    result = dst + src
    e = update_flag_arith_add_znp(dst, src)
    e += update_flag_arith_add_co(dst, src, result)
    e += update_flag_af(dst, src, result)
    e.append(m2_expr.ExprAssign(dst, result))
    return e, []


def xadd(_, instr, dst, src):
    """XADD: exchange then add (src <- old dst, dst <- dst + src)."""
    result = dst + src
    e = update_flag_arith_add_znp(dst, src)
    e += update_flag_arith_add_co(src, dst, result)
    e += update_flag_af(dst, src, result)
    if dst != src:
        # Parallel assignments: src receives the pre-add value of dst
        e.append(m2_expr.ExprAssign(src, dst))
    e.append(m2_expr.ExprAssign(dst, result))
    return e, []


def adc(_, instr, dst, src):
    """ADC: dst <- dst + src + cf, updating flags."""
    result = dst + (src + cf.zeroExtend(src.size))
    e = update_flag_arith_addwc_znp(dst, src, cf)
    e += update_flag_arith_addwc_co(dst, src, cf)
    e += update_flag_af(dst, src, result)
    e.append(m2_expr.ExprAssign(dst, result))
    return e, []


def sub(_, instr, dst, src):
    """SUB: dst <- dst - src, updating flags."""
    result = dst - src
    e = update_flag_arith_sub_znp(dst, src)
    e += update_flag_arith_sub_co(dst, src, result)
    e += update_flag_af(dst, src, result)
    e.append(m2_expr.ExprAssign(dst, result))
    return e, []


def sbb(_, instr, dst, src):
    """SBB: dst <- dst - (src + cf), updating flags."""
    result = dst - (src + cf.zeroExtend(src.size))
    e = update_flag_arith_subwc_znp(dst, src, cf)
    e += update_flag_af(dst, src, result)
    e += update_flag_arith_subwc_co(dst, src, cf)
    e.append(m2_expr.ExprAssign(dst, result))
    return e, []


def neg(_, instr, src):
    """NEG: src <- 0 - src, with full subtraction flag updates."""
    zero = m2_expr.ExprInt(0, src.size)
    result = zero - src
    e = update_flag_arith_sub_znp(zero, src)
    e += update_flag_arith_sub_co(zero, src, result)
    e += update_flag_af(zero, src, result)
    e.append(m2_expr.ExprAssign(src, result))
    return e, []
+
+
def l_not(_, instr, dst):
    """NOT: bitwise complement; no flags are affected."""
    return [m2_expr.ExprAssign(dst, ~dst)], []


def l_cmp(_, instr, dst, src):
    """CMP: compute the flags of dst - src without storing the result."""
    result = dst - src
    e = update_flag_arith_sub_znp(dst, src)
    e += update_flag_arith_sub_co(dst, src, result)
    e += update_flag_af(dst, src, result)
    return e, []


def xor(_, instr, dst, src):
    """XOR: dst <- dst ^ src; of/cf cleared, zf/nf/pf updated."""
    result = dst ^ src
    e = [m2_expr.ExprAssign(zf, m2_expr.ExprOp('FLAG_EQ_CMP', dst, src))]
    e += update_flag_np(result)
    e += null_flag_co()
    e.append(m2_expr.ExprAssign(dst, result))
    return e, []


def pxor(_, instr, dst, src):
    """PXOR: SIMD xor; flags untouched."""
    return [m2_expr.ExprAssign(dst, dst ^ src)], []


def l_or(_, instr, dst, src):
    """OR: dst <- dst | src; of/cf cleared, zf/nf/pf updated."""
    result = dst | src
    e = [m2_expr.ExprAssign(zf, m2_expr.ExprOp('FLAG_EQ', dst | src))]
    e += update_flag_np(result)
    e += null_flag_co()
    e.append(m2_expr.ExprAssign(dst, result))
    return e, []


def l_and(_, instr, dst, src):
    """AND: dst <- dst & src; of/cf cleared, zf/nf/pf updated."""
    result = dst & src
    e = [m2_expr.ExprAssign(zf, m2_expr.ExprOp('FLAG_EQ_AND', dst, src))]
    e += update_flag_np(result)
    e += null_flag_co()
    e.append(m2_expr.ExprAssign(dst, result))
    return e, []


def l_test(_, instr, dst, src):
    """TEST: compute the flags of dst & src without storing it."""
    result = dst & src
    zero = m2_expr.ExprInt(0, result.size)
    e = [m2_expr.ExprAssign(zf, m2_expr.ExprOp('FLAG_EQ_CMP', result, zero))]
    e.append(
        m2_expr.ExprAssign(nf, m2_expr.ExprOp("FLAG_SIGN_SUB", result, zero))
    )
    e += update_flag_pf(result)
    e += null_flag_co()
    return e, []
+
+
def get_shift(dst, src):
    """Normalize a shift amount: resize @src to @dst then mask it.

    x86 masks shift counts to 6 bits for 64-bit operands, 5 bits otherwise.
    """
    if isinstance(src, m2_expr.ExprInt):
        amount = m2_expr.ExprInt(int(src), dst.size)
    else:
        amount = src.zeroExtend(dst.size)
    mask = 63 if dst.size == 64 else 31
    return expr_simp(amount & m2_expr.ExprInt(mask, amount.size))
+
+
def _rotate_tpl(ir, instr, dst, src, op, left=False):
    '''Template to generate a rotater with operation @op
    A temporary basic block is generated to handle 0-rotate
    (a rotate count of 0 must leave all flags unchanged)
    @op: operation to execute
    @left (optional): indicates a left rotate if set, default is False
    '''
    # Compute results
    shifter = get_shift(dst, src)
    res = m2_expr.ExprOp(op, dst, shifter)

    # CF is computed with 1-less round than `res`
    new_cf = m2_expr.ExprOp(
        op, dst, shifter - m2_expr.ExprInt(1, size=shifter.size))
    new_cf = new_cf.msb() if left else new_cf[:1]

    # OF is defined only for @b == 1
    new_of = m2_expr.ExprCond(src - m2_expr.ExprInt(1, size=src.size),
                              m2_expr.ExprInt(0, size=of.size),
                              res.msb() ^ new_cf if left else (dst ^ res).msb())

    # Build basic blocks
    # e_do: performed only when the (masked) count is non-zero
    e_do = [m2_expr.ExprAssign(cf, new_cf),
            m2_expr.ExprAssign(of, new_of),
            m2_expr.ExprAssign(dst, res)
            ]
    e = []
    if instr.mode == 64:
        # Force destination set in order to zero high bit orders
        # In 64 bit:
        # rol eax, cl
        # if cl == 0 => high part of RAX is set to zero
        e.append(m2_expr.ExprAssign(dst, dst))
    # Don't generate conditional shifter on constant
    if isinstance(shifter, m2_expr.ExprInt):
        if int(shifter) != 0:
            return (e_do, [])
        else:
            return (e, [])
    # Dynamic count: branch to the 'do' block only when shifter != 0
    loc_do, loc_do_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_skip = ir.get_next_loc_key(instr)
    loc_skip_expr = m2_expr.ExprLoc(loc_skip, ir.IRDst.size)
    e_do.append(m2_expr.ExprAssign(ir.IRDst, loc_skip_expr))
    e.append(m2_expr.ExprAssign(
        ir.IRDst, m2_expr.ExprCond(shifter, loc_do_expr, loc_skip_expr)))
    return (e, [IRBlock(ir.loc_db, loc_do, [AssignBlock(e_do, instr)])])
+
+
def l_rol(ir, instr, dst, src):
    """ROL: rotate @dst left by @src."""
    return _rotate_tpl(ir, instr, dst, src, '<<<', left=True)


def l_ror(ir, instr, dst, src):
    """ROR: rotate @dst right by @src."""
    return _rotate_tpl(ir, instr, dst, src, '>>>')
+
+
def rotate_with_carry_tpl(ir, instr, op, dst, src):
    """Template for RCL/RCR: rotate @dst through the carry flag.

    @op: '<<<' for RCL, '>>>' for RCR
    The rotate is done on the (dst.size + 1)-bit value {dst, cf}; a
    temporary basic block handles the 0-count case (flags unchanged).
    """
    # Compute results
    shifter = get_shift(dst, src).zeroExtend(dst.size + 1)
    result = m2_expr.ExprOp(op, m2_expr.ExprCompose(dst, cf), shifter)

    # Bit dst.size of the widened result is the rotated-out carry
    new_cf = result[dst.size:dst.size +1]
    new_dst = result[:dst.size]

    result_trunc = result[:dst.size]
    if op == '<<<':
        of_value = result_trunc.msb() ^ new_cf
    else:
        of_value = (dst ^ result_trunc).msb()
    # OF is defined only for @b == 1
    new_of = m2_expr.ExprCond(src - m2_expr.ExprInt(1, size=src.size),
                              m2_expr.ExprInt(0, size=of.size),
                              of_value)


    # Build basic blocks
    # e_do: performed only when the (masked) count is non-zero
    e_do = [m2_expr.ExprAssign(cf, new_cf),
            m2_expr.ExprAssign(of, new_of),
            m2_expr.ExprAssign(dst, new_dst)
            ]
    e = [m2_expr.ExprAssign(dst, dst)]
    # Don't generate conditional shifter on constant
    if isinstance(shifter, m2_expr.ExprInt):
        if int(shifter) != 0:
            return (e_do, [])
        else:
            return (e, [])
    # Dynamic count: branch to the 'do' block only when shifter != 0
    loc_do, loc_do_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_skip = ir.get_next_loc_key(instr)
    loc_skip_expr = m2_expr.ExprLoc(loc_skip, ir.IRDst.size)
    e_do.append(m2_expr.ExprAssign(ir.IRDst, loc_skip_expr))
    e.append(m2_expr.ExprAssign(
        ir.IRDst, m2_expr.ExprCond(shifter, loc_do_expr, loc_skip_expr)))
    return (e, [IRBlock(ir.loc_db, loc_do, [AssignBlock(e_do, instr)])])
+
def rcl(ir, instr, dst, src):
    """RCL: rotate left through carry."""
    return rotate_with_carry_tpl(ir, instr, '<<<', dst, src)


def rcr(ir, instr, dst, src):
    """RCR: rotate right through carry."""
    return rotate_with_carry_tpl(ir, instr, '>>>', dst, src)
+
+
def _shift_tpl(op, ir, instr, a, b, c=None, op_inv=None, left=False,
               custom_of=None):
    """Template to generate a shifter with operation @op
    A temporary basic block is generated to handle 0-shift
    (a shift count of 0 must leave all flags unchanged)
    @op: operation to execute
    @c (optional): if set, instruction has a bit provider
    @op_inv (optional): opposite operation of @op. Must be provided if @c
    @left (optional): indicates a left shift if set, default is False
    @custom_of (optional): if set, override the computed value of OF
    """
    # With a bit provider (SHLD/SHRD), @c holds the count and @b the bits
    if c is not None:
        shifter = get_shift(a, c)
    else:
        shifter = get_shift(a, b)

    res = m2_expr.ExprOp(op, a, shifter)
    # CF is the last bit shifted out: shift by (count - 1) and take the edge
    cf_from_dst = m2_expr.ExprOp(op, a,
                                 (shifter - m2_expr.ExprInt(1, a.size)))
    cf_from_dst = cf_from_dst.msb() if left else cf_from_dst[:1]

    new_cf = cf_from_dst
    i1 = m2_expr.ExprInt(1, size=a.size)
    if c is not None:
        # There is a source for new bits
        isize = m2_expr.ExprInt(a.size, size=a.size)
        mask = m2_expr.ExprOp(op_inv, i1, (isize - shifter)) - i1

        # An overflow can occur; emulate the 'undefined behavior'
        # Overflow behavior if (shift / size % 2)
        base_cond_overflow = shifter if left else (
            shifter - m2_expr.ExprInt(1, size=shifter.size))
        cond_overflow = base_cond_overflow & m2_expr.ExprInt(a.size, shifter.size)
        if left:
            # Overflow occurs one round before right
            mask = m2_expr.ExprCond(cond_overflow, mask, ~mask)
        else:
            mask = m2_expr.ExprCond(cond_overflow, ~mask, mask)

        # Build res with dst and src
        res = ((m2_expr.ExprOp(op, a, shifter) & mask) |
               (m2_expr.ExprOp(op_inv, b, (isize - shifter)) & ~mask))

        # Overflow case: cf comes from src (bit number shifter % size)
        cf_from_src = m2_expr.ExprOp(op, b,
                                     (shifter.zeroExtend(b.size) &
                                      m2_expr.ExprInt(a.size - 1, b.size)) - i1)
        cf_from_src = cf_from_src.msb() if left else cf_from_src[:1]
        new_cf = m2_expr.ExprCond(cond_overflow, cf_from_src, cf_from_dst)

    # Overflow flag, only defined when shifter is equal to 1
    if custom_of is None:
        value_of = a.msb() ^ a[-2:-1] if left else b[:1] ^ a.msb()
    else:
        value_of = custom_of

    # Build basic blocks
    # e_do: performed only when the (masked) count is non-zero
    e_do = [
        m2_expr.ExprAssign(cf, new_cf),
        m2_expr.ExprAssign(of, m2_expr.ExprCond(shifter - i1,
                                             m2_expr.ExprInt(0, of.size),
                                             value_of)),
        m2_expr.ExprAssign(a, res),
    ]
    e_do += update_flag_znp(res)
    e = []
    if instr.mode == 64:
        # Force destination set in order to zero high bit orders
        # In 64 bit:
        # shr eax, cl
        # if cl == 0 => high part of RAX is set to zero
        e.append(m2_expr.ExprAssign(a, a))
    # Don't generate conditional shifter on constant
    if isinstance(shifter, m2_expr.ExprInt):
        if int(shifter) != 0:
            return (e_do, [])
        else:
            return (e, [])
    # Dynamic count: branch to the 'do' block only when shifter != 0
    loc_do, loc_do_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_skip = ir.get_next_loc_key(instr)
    loc_skip_expr = m2_expr.ExprLoc(loc_skip, ir.IRDst.size)
    e_do.append(m2_expr.ExprAssign(ir.IRDst, loc_skip_expr))
    e.append(m2_expr.ExprAssign(ir.IRDst, m2_expr.ExprCond(shifter, loc_do_expr,
                                                        loc_skip_expr)))
    return e, [IRBlock(ir.loc_db, loc_do, [AssignBlock(e_do, instr)])]
+
+
def sar(ir, instr, dst, src):
    """SAR: arithmetic right shift; OF is forced to 0 (count == 1 case)."""
    zero_of = m2_expr.ExprInt(0, size=of.size)
    return _shift_tpl("a>>", ir, instr, dst, src, custom_of=zero_of)


def shr(ir, instr, dst, src):
    """SHR: logical right shift; OF is the original msb (count == 1 case)."""
    return _shift_tpl(">>", ir, instr, dst, src, custom_of=dst.msb())


def shrd(ir, instr, dst, src1, src2):
    """SHRD: double-precision right shift, new bits supplied by @src1."""
    return _shift_tpl(">>>", ir, instr, dst, src1, src2, "<<<")


def shl(ir, instr, dst, src):
    """SHL: logical left shift."""
    return _shift_tpl("<<", ir, instr, dst, src, left=True)


def shld(ir, instr, dst, src1, src2):
    """SHLD: double-precision left shift, new bits supplied by @src1."""
    return _shift_tpl("<<<", ir, instr, dst, src1, src2, ">>>", left=True)
+
+
+# XXX todo ###
def cmc(_, instr):
    """CMC: complement the carry flag."""
    flipped = m2_expr.ExprCond(cf, m2_expr.ExprInt(0, cf.size),
                               m2_expr.ExprInt(1, cf.size))
    return [m2_expr.ExprAssign(cf, flipped)], []


def clc(_, instr):
    """CLC: clear the carry flag."""
    return [m2_expr.ExprAssign(cf, m2_expr.ExprInt(0, cf.size))], []


def stc(_, instr):
    """STC: set the carry flag."""
    return [m2_expr.ExprAssign(cf, m2_expr.ExprInt(1, cf.size))], []


def cld(_, instr):
    """CLD: clear the direction flag."""
    return [m2_expr.ExprAssign(df, m2_expr.ExprInt(0, df.size))], []


def std(_, instr):
    """STD: set the direction flag."""
    return [m2_expr.ExprAssign(df, m2_expr.ExprInt(1, df.size))], []


def cli(_, instr):
    """CLI: clear the interrupt flag."""
    return [m2_expr.ExprAssign(i_f, m2_expr.ExprInt(0, i_f.size))], []


def sti(_, instr):
    """STI: modeled as privileged; raises EXCEPT_PRIV_INSN instead of
    setting the interrupt flag."""
    priv = m2_expr.ExprInt(EXCEPT_PRIV_INSN, 32)
    return [m2_expr.ExprAssign(exception_flags, priv)], []
+
+
def inc(_, instr, dst):
    """INC: dst <- dst + 1; updates every arithmetic flag except cf."""
    one = m2_expr.ExprInt(1, dst.size)
    result = dst + one
    e = update_flag_arith_add_znp(dst, one)
    e += update_flag_af(dst, one, result)
    e += update_flag_add_of(dst, one, result)
    e.append(m2_expr.ExprAssign(dst, result))
    return e, []


def dec(_, instr, dst):
    """DEC: dst <- dst - 1; updates every arithmetic flag except cf."""
    one = m2_expr.ExprInt(1, dst.size)
    result = dst - one
    e = update_flag_arith_sub_znp(dst, one)
    e += update_flag_af(dst, one, result)
    e += update_flag_sub_of(dst, one, result)
    e.append(m2_expr.ExprAssign(dst, result))
    return e, []
+
+
def push_gen(ir, instr, src, size):
    """Push @src (zero-extended to @size bits) onto the stack."""
    if size not in [16, 32, 64]:
        raise ValueError('bad size stacker!')
    if src.size < size:
        src = src.zeroExtend(size)

    sp = mRSP[instr.mode]
    new_sp = sp - m2_expr.ExprInt(src.size // 8, sp.size)
    e = [m2_expr.ExprAssign(sp, new_sp)]
    if ir.do_stk_segm:
        # Apply SS segmentation to the store address only
        new_sp = ir.gen_segm_expr(SS, new_sp)
    e.append(m2_expr.ExprAssign(ir.ExprMem(new_sp, src.size), src))
    return e, []


def push(ir, instr, src):
    """PUSH using the instruction's natural operand size."""
    return push_gen(ir, instr, src, instr.mode)


def pushw(ir, instr, src):
    """16-bit PUSH."""
    return push_gen(ir, instr, src, 16)
+
+
def pop_gen(ir, instr, src, size):
    """Pop the stack top into @src (@size: 16, 32 or 64 bits).

    Handles the POP SP/ESP/RSP and `pop [esp]` special cases below.
    """
    e = []
    if not size in [16, 32, 64]:
        raise ValueError('bad size stacker!')

    sp = mRSP[instr.mode]
    new_sp = sp + m2_expr.ExprInt(src.size // 8, sp.size)
    # Don't generate SP/ESP/RSP incrementation on POP SP/ESP/RSP
    # (the popped value must fully overwrite the stack pointer)
    if not (src in mRSP.values()):
        e.append(m2_expr.ExprAssign(sp, new_sp))
    # XXX FIX XXX for pop [esp]
    # The destination address is computed with the incremented SP
    if isinstance(src, m2_expr.ExprMem):
        src = expr_simp(src.replace_expr({sp: new_sp}))
    # The value is read from the pre-increment stack top
    result = sp
    if ir.do_stk_segm:
        result = ir.gen_segm_expr(SS, result)

    e.append(m2_expr.ExprAssign(src, ir.ExprMem(result, src.size)))
    return e, []


def pop(ir, instr, src):
    """POP using the instruction's natural operand size."""
    return pop_gen(ir, instr, src, instr.mode)


def popw(ir, instr, src):
    """16-bit POP."""
    return pop_gen(ir, instr, src, 16)
+
+
+def sete(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_EQ", zf).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
+def setnz(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_EQ", ~zf).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
+def setl(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_S<", nf, of).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
+def setg(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_S>", nf, of, zf).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
+def setge(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_S>=", nf, of).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
+def seta(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_U>", cf, zf).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
+def setae(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_U>=", cf).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
+def setb(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_U<", cf).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
+def setbe(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_U<=", cf, zf).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
+def setns(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_NEG", ~nf).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
+def sets(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_NEG", nf).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
+def seto(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            of.zeroExtend(dst.size)
+        )
+    )
+    return e, []
+
+
+def setp(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            pf.zeroExtend(dst.size)
+        )
+    )
+    return e, []
+
+
+def setnp(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprCond(
+                pf,
+                m2_expr.ExprInt(0, dst.size),
+                m2_expr.ExprInt(1, dst.size)
+            )
+        )
+    )
+    return e, []
+
+
+def setle(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_S<=", nf, of, zf).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
+def setna(_, instr, dst):
+    e = []
+    e.append(
+        m2_expr.ExprAssign(
+            dst,
+            m2_expr.ExprOp("CC_U<=", cf, zf).zeroExtend(dst.size),
+        )
+    )
+    return e, []
+
+
def setnbe(_, instr, dst):
    """SETNBE: set @dst to 1 on unsigned above (CF=0 and ZF=0)."""
    cond = m2_expr.ExprOp("CC_U>", cf, zf)
    return [m2_expr.ExprAssign(dst, cond.zeroExtend(dst.size))], []
+
+
def setno(_, instr, dst):
    """SETNO: set @dst to 1 when OF is clear, else 0."""
    value = m2_expr.ExprCond(
        of,
        m2_expr.ExprInt(0, dst.size),
        m2_expr.ExprInt(1, dst.size),
    )
    return [m2_expr.ExprAssign(dst, value)], []
+
+
def setnb(_, instr, dst):
    """SETNB: set @dst to 1 on unsigned not-below (CF=0)."""
    cond = m2_expr.ExprOp("CC_U>=", cf)
    return [m2_expr.ExprAssign(dst, cond.zeroExtend(dst.size))], []
+
+
def setalc(_, instr):
    """SALC (undocumented): AL = 0xFF when CF is set, else 0x00."""
    al = mRAX[instr.mode][0:8]
    value = m2_expr.ExprCond(
        cf,
        m2_expr.ExprInt(0xff, al.size),
        m2_expr.ExprInt(0, al.size),
    )
    return [m2_expr.ExprAssign(al, value)], []
+
+
def bswap(_, instr, dst):
    """BSWAP: reverse the byte order of @dst (32 or 64 bits).

    The 16-bit form is architecturally undefined; this model yields 0,
    matching observed hardware behavior.
    """
    if dst.size == 16:
        result = m2_expr.ExprInt(0, 16)
    elif dst.size in (32, 64):
        nbytes = dst.size // 8
        # Compose bytes highest-first so the result is byte-reversed.
        result = m2_expr.ExprCompose(
            *(dst[idx * 8:(idx + 1) * 8] for idx in reversed(range(nbytes)))
        )
    else:
        raise ValueError('the size DOES matter')
    return [m2_expr.ExprAssign(dst, result)], []
+
+
def cmps(ir, instr, size):
    """CMPS: compare the @size-bit memory operands at DS:[E/RSI] and
    ES:[E/RDI], setting flags like CMP, then advance (DF=0) or rewind
    (DF=1) both pointers by @size // 8 bytes.

    The pointer update is emitted as two extra IR blocks selected on DF.
    """
    loc_df_0, loc_df_0_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_df_1, loc_df_1_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_next_expr = m2_expr.ExprLoc(ir.get_next_loc_key(instr), ir.IRDst.size)

    src1 = mRSI[instr.mode][:instr.v_admode()]
    src2 = mRDI[instr.mode][:instr.v_admode()]

    if ir.do_str_segm:
        if instr.additional_info.g2.value:
            raise NotImplementedError("add segm support")
        src1_sgm = ir.gen_segm_expr(DS, src1)
        src2_sgm = ir.gen_segm_expr(ES, src2)
    else:
        src1_sgm = src1
        src2_sgm = src2

    offset = m2_expr.ExprInt(size // 8, src1.size)

    # Reuse CMP semantics for the flag computation.
    e, _ = l_cmp(ir, instr,
                 ir.ExprMem(src1_sgm, size),
                 ir.ExprMem(src2_sgm, size))


    # DF=0: both pointers move forward.
    e0 = []
    e0.append(m2_expr.ExprAssign(src1, src1 + offset))
    e0.append(m2_expr.ExprAssign(src2, src2 + offset))
    e0.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    e0 = IRBlock(ir.loc_db, loc_df_0, [AssignBlock(e0, instr)])

    # DF=1: both pointers move backward.
    e1 = []
    e1.append(m2_expr.ExprAssign(src1, src1 - offset))
    e1.append(m2_expr.ExprAssign(src2, src2 - offset))
    e1.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    e1 = IRBlock(ir.loc_db, loc_df_1, [AssignBlock(e1, instr)])

    e.append(m2_expr.ExprAssign(ir.IRDst,
                             m2_expr.ExprCond(df, loc_df_1_expr, loc_df_0_expr)))
    return e, [e0, e1]
+
+
def scas(ir, instr, size):
    """SCAS: compare the accumulator (@size-bit slice of rAX) against the
    memory operand at ES:[E/RDI], setting flags like CMP, then advance or
    rewind the pointer by @size // 8 bytes depending on DF.
    """
    loc_df_0, loc_df_0_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_df_1, loc_df_1_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_next_expr = m2_expr.ExprLoc(ir.get_next_loc_key(instr), ir.IRDst.size)

    src = mRDI[instr.mode][:instr.v_admode()]

    if ir.do_str_segm:
        if instr.additional_info.g2.value:
            raise NotImplementedError("add segm support")
        src_sgm = ir.gen_segm_expr(ES, src)

    else:
        src_sgm = src

    offset = m2_expr.ExprInt(size // 8, src.size)
    # Reuse CMP semantics for the flag computation.
    e, extra = l_cmp(ir, instr,
                     mRAX[instr.mode][:size],
                     ir.ExprMem(src_sgm, size))

    # DF=0: pointer moves forward.
    e0 = []
    e0.append(m2_expr.ExprAssign(src, src + offset))

    e0.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    e0 = IRBlock(ir.loc_db, loc_df_0, [AssignBlock(e0, instr)])

    # DF=1: pointer moves backward.
    e1 = []
    e1.append(m2_expr.ExprAssign(src, src - offset))
    e1.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    e1 = IRBlock(ir.loc_db, loc_df_1, [AssignBlock(e1, instr)])

    e.append(m2_expr.ExprAssign(ir.IRDst,
                             m2_expr.ExprCond(df, loc_df_1_expr, loc_df_0_expr)))

    return e, [e0, e1]
+
+
def compose_eflag(s=32):
    """Build the (E)FLAGS image as an ExprCompose of the flag registers.

    @s: image width, 16 or 32 (raises ValueError otherwise).
    Bit layout follows the x86 EFLAGS register: bit 1 is always 1,
    bits 3 and 5 are always 0; the 32-bit form adds RF/VM/AC/VIF/VIP/ID
    and zero-pads the top 10 bits.
    """
    args = []

    # Low 16 bits: CF, 1, PF, 0, AF, 0, ZF, NF, TF, IF, DF, OF, IOPL(2).
    args = [cf, m2_expr.ExprInt(1, 1), pf, m2_expr.ExprInt(0, 1), af,
            m2_expr.ExprInt(0, 1), zf, nf, tf, i_f, df, of, iopl]

    if s == 32:
        args += [nt, m2_expr.ExprInt(0, 1), rf, vm, ac, vif, vip, i_d]
    elif s == 16:
        args += [nt, m2_expr.ExprInt(0, 1)]
    else:
        raise ValueError('unk size')
    if s == 32:
        # Reserved high bits (22-31) are zero.
        args.append(m2_expr.ExprInt(0, 10))
    return m2_expr.ExprCompose(*args)
+
+
def pushfd(ir, instr):
    """PUSHFD: push the 32-bit EFLAGS image onto the stack."""
    eflags = compose_eflag()
    return push(ir, instr, eflags)
+
+
def pushfq(ir, instr):
    """PUSHFQ: push the EFLAGS image zero-extended to 64 bits."""
    rflags = compose_eflag().zeroExtend(64)
    return push(ir, instr, rflags)
+
+
def pushfw(ir, instr):
    """PUSHF (16-bit): push the 16-bit FLAGS image onto the stack."""
    flags = compose_eflag(16)
    return pushw(ir, instr, flags)
+
+
def popfd(ir, instr):
    """POPFD: load the flag registers from the 32-bit dword at [rSP],
    then release it from the stack (rSP += mode // 8).

    Popping an image with TF (bit 8) set raises EXCEPT_SOFT_BP so the jitter
    can emulate single-step behavior.
    """
    tmp = ir.ExprMem(mRSP[instr.mode], 32)
    e = []
    e.append(m2_expr.ExprAssign(cf, m2_expr.ExprSlice(tmp, 0, 1)))
    e.append(m2_expr.ExprAssign(pf, m2_expr.ExprSlice(tmp, 2, 3)))
    e.append(m2_expr.ExprAssign(af, m2_expr.ExprSlice(tmp, 4, 5)))
    e.append(m2_expr.ExprAssign(zf, m2_expr.ExprSlice(tmp, 6, 7)))
    e.append(m2_expr.ExprAssign(nf, m2_expr.ExprSlice(tmp, 7, 8)))
    e.append(m2_expr.ExprAssign(tf, m2_expr.ExprSlice(tmp, 8, 9)))
    e.append(m2_expr.ExprAssign(i_f, m2_expr.ExprSlice(tmp, 9, 10)))
    e.append(m2_expr.ExprAssign(df, m2_expr.ExprSlice(tmp, 10, 11)))
    e.append(m2_expr.ExprAssign(of, m2_expr.ExprSlice(tmp, 11, 12)))
    e.append(m2_expr.ExprAssign(iopl, m2_expr.ExprSlice(tmp, 12, 14)))
    e.append(m2_expr.ExprAssign(nt, m2_expr.ExprSlice(tmp, 14, 15)))
    e.append(m2_expr.ExprAssign(rf, m2_expr.ExprSlice(tmp, 16, 17)))
    e.append(m2_expr.ExprAssign(vm, m2_expr.ExprSlice(tmp, 17, 18)))
    e.append(m2_expr.ExprAssign(ac, m2_expr.ExprSlice(tmp, 18, 19)))
    e.append(m2_expr.ExprAssign(vif, m2_expr.ExprSlice(tmp, 19, 20)))
    e.append(m2_expr.ExprAssign(vip, m2_expr.ExprSlice(tmp, 20, 21)))
    e.append(m2_expr.ExprAssign(i_d, m2_expr.ExprSlice(tmp, 21, 22)))
    e.append(m2_expr.ExprAssign(mRSP[instr.mode],
                             mRSP[instr.mode] + m2_expr.ExprInt(instr.mode // 8, mRSP[instr.mode].size)))
    # TF set in the popped image triggers a soft breakpoint exception.
    e.append(m2_expr.ExprAssign(exception_flags,
                             m2_expr.ExprCond(m2_expr.ExprSlice(tmp, 8, 9),
                                              m2_expr.ExprInt(
                                                  EXCEPT_SOFT_BP, 32),
                                              exception_flags
                                              )
                             )
             )
    return e, []
+
+
def _tpl_eflags(tmp):
    """Extract eflags from @tmp
    @tmp: Expr instance with a size >= 16
    """
    layout = ((0, cf), (2, pf), (4, af), (6, zf), (7, nf),
              (8, tf), (9, i_f), (10, df), (11, of),
              (12, iopl), (14, nt))
    assignments = []
    for base, dest in layout:
        assignments.append(
            m2_expr.ExprAssign(dest, tmp[base:base + dest.size]))
    return assignments
+
+
def popfw(ir, instr):
    """POPF (16-bit): load the flags from the word at [rSP], then rSP += 2."""
    stack_ptr = mRSP[instr.mode]
    flags_word = ir.ExprMem(stack_ptr, 16)
    e = _tpl_eflags(flags_word)
    e.append(m2_expr.ExprAssign(
        stack_ptr, stack_ptr + m2_expr.ExprInt(2, stack_ptr.size)))
    return e, []
+
# Registers handled by PUSHA/PUSHAD and POPA/POPAD, in architectural
# push order: (E)AX, (E)CX, (E)DX, (E)BX, (E)SP, (E)BP, (E)SI, (E)DI.
pa_regs = [
    mRAX, mRCX,
    mRDX, mRBX,
    mRSP, mRBP,
    mRSI, mRDI
]
+
+
def pusha_gen(ir, instr, size):
    """PUSHA/PUSHAD helper: push the eight @size-bit pa_regs registers,
    then lower the stack pointer past all of them.
    """
    e = []
    cur_sp = mRSP[instr.mode]
    for i, reg in enumerate(pa_regs):
        stk_ptr = cur_sp + m2_expr.ExprInt(-(size // 8) * (i + 1), instr.mode)
        e.append(m2_expr.ExprAssign(ir.ExprMem(stk_ptr, size), reg[size]))
    # stk_ptr deliberately leaks from the loop: after the last iteration it
    # points below all eight slots, which is the final stack pointer value.
    e.append(m2_expr.ExprAssign(cur_sp, stk_ptr))
    return e, []
+
+
def pusha(ir, instr):
    """PUSHA: push the eight 16-bit general-purpose registers."""
    return pusha_gen(ir, instr, 16)
+
+
def pushad(ir, instr):
    """PUSHAD: push the eight 32-bit general-purpose registers."""
    return pusha_gen(ir, instr, 32)
+
+
def popa_gen(ir, instr, size):
    """POPA/POPAD helper: pop the eight @size-bit pa_regs registers in
    reverse push order, skipping the stored (E)SP value, then raise the
    stack pointer past all eight slots.
    """
    e = []
    cur_sp = mRSP[instr.mode]
    for i, reg in enumerate(reversed(pa_regs)):
        if reg == mRSP:
            # The saved stack pointer is discarded, but its slot still
            # counts: i keeps advancing so offsets stay aligned.
            continue
        stk_ptr = cur_sp + m2_expr.ExprInt((size // 8) * i, instr.mode)
        e.append(m2_expr.ExprAssign(reg[size], ir.ExprMem(stk_ptr, size)))

    # i leaks from the loop as 7, so i + 1 == 8 releases all eight slots.
    stk_ptr = cur_sp + m2_expr.ExprInt((size // 8) * (i + 1), instr.mode)
    e.append(m2_expr.ExprAssign(cur_sp, stk_ptr))

    return e, []
+
+
def popa(ir, instr):
    """POPA: pop the eight 16-bit general-purpose registers."""
    return popa_gen(ir, instr, 16)
+
+
def popad(ir, instr):
    """POPAD: pop the eight 32-bit general-purpose registers."""
    return popa_gen(ir, instr, 32)
+
+
def call(ir, instr, dst):
    """CALL: push the return address and transfer control to @dst.

    Near calls push the next instruction address and jump to @dst.
    Far calls (dst is a 'segm' or 'far' ExprOp) also save and reload CS.
    """
    e = []
    # opmode, admode = instr.opmode, instr.admode
    s = dst.size
    meip = mRIP[ir.IRDst.size]
    opmode, admode = s, instr.v_admode()
    myesp = mRSP[instr.mode][:opmode]
    n = m2_expr.ExprLoc(ir.get_next_loc_key(instr), ir.IRDst.size)

    if isinstance(dst, m2_expr.ExprOp):
        if dst.op == "segm":
            # Far call segm:addr
            if instr.mode not in [16, 32]:
                raise RuntimeError('not supported')
            segm = dst.args[0]
            base = dst.args[1]
            m1 = segm.zeroExtend(CS.size)
            m2 = base.zeroExtend(meip.size)
        elif dst.op == "far":
            # Far call far [eax]: offset then selector are read from memory.
            addr = dst.args[0].ptr
            m1 = ir.ExprMem(addr, CS.size)
            m2 = ir.ExprMem(addr + m2_expr.ExprInt(2, addr.size), meip.size)
        else:
            raise RuntimeError("bad call operator")

        e.append(m2_expr.ExprAssign(CS, m1))
        e.append(m2_expr.ExprAssign(meip, m2))

        e.append(m2_expr.ExprAssign(ir.IRDst, m2))

        # Push CS then the return offset, then commit the new stack pointer.
        c = myesp + m2_expr.ExprInt(-s // 8, s)
        e.append(m2_expr.ExprAssign(ir.ExprMem(c, size=s).zeroExtend(s),
                                 CS.zeroExtend(s)))

        c = myesp + m2_expr.ExprInt((-2 * s) // 8, s)
        e.append(m2_expr.ExprAssign(ir.ExprMem(c, size=s).zeroExtend(s),
                                 meip.zeroExtend(s)))

        c = myesp + m2_expr.ExprInt((-2 * s) // 8, s)
        e.append(m2_expr.ExprAssign(myesp, c))
        return e, []

    # Near call: push return address, jump to dst.
    c = myesp + m2_expr.ExprInt(-s // 8, s)
    e.append(m2_expr.ExprAssign(myesp, c))
    if ir.do_stk_segm:
        c = ir.gen_segm_expr(SS, c)

    e.append(m2_expr.ExprAssign(ir.ExprMem(c, size=s), n))
    e.append(m2_expr.ExprAssign(meip, dst.zeroExtend(ir.IRDst.size)))
    e.append(m2_expr.ExprAssign(ir.IRDst, dst.zeroExtend(ir.IRDst.size)))
    return e, []
+
+
def ret(ir, instr, src=None):
    """RET (near): pop the return address into (R/E)IP.

    @src: optional immediate count of extra stack bytes to release
    (RET imm16 form).
    """
    e = []
    meip = mRIP[ir.IRDst.size]
    size, admode = instr.v_opmode(), instr.v_admode()
    myesp = mRSP[instr.mode][:size]

    if src is None:
        value = (myesp + (m2_expr.ExprInt(size // 8, size)))
    else:
        src = m2_expr.ExprInt(int(src), size)
        value = (myesp + (m2_expr.ExprInt(size // 8, size) + src))

    e.append(m2_expr.ExprAssign(myesp, value))
    result = myesp
    if ir.do_stk_segm:
        result = ir.gen_segm_expr(SS, result)

    # The return address is read from the pre-adjustment stack pointer
    # (assignments in an AssignBlock are parallel).
    e.append(m2_expr.ExprAssign(meip, ir.ExprMem(
        result, size=size).zeroExtend(size)))
    e.append(m2_expr.ExprAssign(ir.IRDst,
                             ir.ExprMem(result, size=size).zeroExtend(size)))
    return e, []
+
+
def retf(ir, instr, src=None):
    """RETF (far): pop the return offset into (R/E)IP and the selector
    into CS.

    @src: optional immediate count of extra stack bytes to release.
    """
    e = []
    meip = mRIP[ir.IRDst.size]
    size, admode = instr.v_opmode(), instr.v_admode()
    if src is None:
        src = m2_expr.ExprInt(0, instr.mode)
    myesp = mRSP[instr.mode][:size]

    src = src.zeroExtend(size)

    result = myesp
    if ir.do_stk_segm:
        result = ir.gen_segm_expr(SS, result)

    # Offset is at [SP] ...
    e.append(m2_expr.ExprAssign(meip, ir.ExprMem(
        result, size=size).zeroExtend(size)))
    e.append(m2_expr.ExprAssign(ir.IRDst,
                             ir.ExprMem(result, size=size).zeroExtend(size)))
    # e.append(m2_expr.ExprAssign(meip, ir.ExprMem(c, size = s)))
    # ... and the CS selector is one slot above it.
    result = myesp + m2_expr.ExprInt(size // 8, size)
    if ir.do_stk_segm:
        result = ir.gen_segm_expr(SS, result)

    e.append(m2_expr.ExprAssign(CS, ir.ExprMem(result, size=16)))

    # Release both slots plus any immediate byte count.
    value = myesp + (m2_expr.ExprInt((2 * size) // 8, size) + src)
    e.append(m2_expr.ExprAssign(myesp, value))
    return e, []
+
+
def leave(ir, instr):
    """LEAVE: tear down the stack frame — rBP = [rBP]; rSP = rBP + wordsize.

    Both assignments read the pre-instruction rBP (AssignBlock semantics
    are parallel), matching the architectural MOV rSP, rBP / POP rBP pair.
    """
    size = instr.mode
    frame_ptr = mRBP[size]
    stack_ptr = mRSP[size]
    return [
        m2_expr.ExprAssign(frame_ptr, ir.ExprMem(frame_ptr, size=size)),
        m2_expr.ExprAssign(stack_ptr,
                           m2_expr.ExprInt(size // 8, size) + frame_ptr),
    ], []
+
+
def enter(ir, instr, src1, src2):
    """ENTER: build a stack frame — push rBP, set rBP, reserve @src1 bytes.

    @src1: immediate number of bytes of local storage.
    @src2: nesting level — ignored by this model (only level 0 semantics
    are implemented).
    """
    size, admode = instr.v_opmode(), instr.v_admode()
    myesp = mRSP[instr.mode][:size]
    myebp = mRBP[instr.mode][:size]

    src1 = src1.zeroExtend(size)

    e = []
    esp_tmp = myesp - m2_expr.ExprInt(size // 8, size)
    e.append(m2_expr.ExprAssign(ir.ExprMem(esp_tmp, size=size),
                             myebp))
    e.append(m2_expr.ExprAssign(myebp, esp_tmp))
    e.append(m2_expr.ExprAssign(myesp,
                             myesp - (src1 + m2_expr.ExprInt(size // 8, size))))
    return e, []
+
+
def jmp(ir, instr, dst):
    """JMP: unconditional transfer to @dst.

    Far jumps (dst is a 'segm' or 'far' ExprOp) also reload CS; any other
    @dst is a plain near jump.
    """
    e = []
    meip = mRIP[ir.IRDst.size]

    if isinstance(dst, m2_expr.ExprOp):
        if dst.op == "segm":
            # Far jmp segm:addr
            segm = dst.args[0]
            base = dst.args[1]
            m1 = segm.zeroExtend(CS.size)
            m2 = base.zeroExtend(meip.size)
        elif dst.op == "far":
            # Far jmp far [eax]: offset then selector are read from memory.
            addr = dst.args[0].ptr
            m1 = ir.ExprMem(addr, CS.size)
            m2 = ir.ExprMem(addr + m2_expr.ExprInt(2, addr.size), meip.size)
        else:
            raise RuntimeError("bad jmp operator")

        e.append(m2_expr.ExprAssign(CS, m1))
        e.append(m2_expr.ExprAssign(meip, m2))
        e.append(m2_expr.ExprAssign(ir.IRDst, m2))

    else:
        # Classic jmp
        e.append(m2_expr.ExprAssign(meip, dst))
        e.append(m2_expr.ExprAssign(ir.IRDst, dst))

        if isinstance(dst, m2_expr.ExprMem):
            dst = meip
    return e, []
+
+
def jz(ir, instr, dst):
    """JZ: jump to @dst when ZF is set."""
    cond = m2_expr.ExprOp("CC_EQ", zf)
    return gen_jcc(ir, instr, cond, dst, True)
+
+
def jcxz(ir, instr, dst):
    """JCXZ: jump to @dst when CX is zero."""
    counter = mRCX[instr.mode][:16]
    return gen_jcc(ir, instr, counter, dst, False)
+
+
def jecxz(ir, instr, dst):
    """JECXZ: jump to @dst when ECX is zero."""
    counter = mRCX[instr.mode][:32]
    return gen_jcc(ir, instr, counter, dst, False)
+
+
def jrcxz(ir, instr, dst):
    """JRCXZ: jump to @dst when RCX is zero."""
    counter = mRCX[instr.mode]
    return gen_jcc(ir, instr, counter, dst, False)
+
+
def jnz(ir, instr, dst):
    """JNZ: jump to @dst when ZF is clear."""
    cond = m2_expr.ExprOp("CC_EQ", zf)
    return gen_jcc(ir, instr, cond, dst, False)
+
+
+
def jp(ir, instr, dst):
    """JP/JPE: jump to @dst when PF is set."""
    return gen_jcc(ir, instr, pf, dst, True)
+
+
def jnp(ir, instr, dst):
    """JNP/JPO: jump to @dst when PF is clear."""
    return gen_jcc(ir, instr, pf, dst, False)
+
+
def ja(ir, instr, dst):
    """JA: jump on unsigned above (CF=0 and ZF=0)."""
    cond = m2_expr.ExprOp("CC_U>", cf, zf)
    return gen_jcc(ir, instr, cond, dst, True)
+
+
def jae(ir, instr, dst):
    """JAE: jump on unsigned above-or-equal (CF=0)."""
    cond = m2_expr.ExprOp("CC_U>=", cf)
    return gen_jcc(ir, instr, cond, dst, True)
+
+
def jb(ir, instr, dst):
    """JB: jump on unsigned below (CF=1)."""
    cond = m2_expr.ExprOp("CC_U<", cf)
    return gen_jcc(ir, instr, cond, dst, True)
+
+
def jbe(ir, instr, dst):
    """JBE: jump on unsigned below-or-equal (CF=1 or ZF=1)."""
    cond = m2_expr.ExprOp("CC_U<=", cf, zf)
    return gen_jcc(ir, instr, cond, dst, True)
+
+
def jge(ir, instr, dst):
    """JGE: jump on signed greater-or-equal (NF == OF)."""
    cond = m2_expr.ExprOp("CC_S>=", nf, of)
    return gen_jcc(ir, instr, cond, dst, True)
+
+
def jg(ir, instr, dst):
    """JG: jump on signed greater (ZF=0 and NF == OF)."""
    cond = m2_expr.ExprOp("CC_S>", nf, of, zf)
    return gen_jcc(ir, instr, cond, dst, True)
+
+
def jl(ir, instr, dst):
    """JL: jump on signed less (NF != OF)."""
    cond = m2_expr.ExprOp("CC_S<", nf, of)
    return gen_jcc(ir, instr, cond, dst, True)
+
+
def jle(ir, instr, dst):
    """JLE: jump on signed less-or-equal (ZF=1 or NF != OF)."""
    cond = m2_expr.ExprOp("CC_S<=", nf, of, zf)
    return gen_jcc(ir, instr, cond, dst, True)
+
+
+
def js(ir, instr, dst):
    """JS: jump when NF (sign flag) is set."""
    cond = m2_expr.ExprOp("CC_NEG", nf)
    return gen_jcc(ir, instr, cond, dst, True)
+
+
+
def jns(ir, instr, dst):
    """JNS: jump when NF (sign flag) is clear."""
    cond = m2_expr.ExprOp("CC_NEG", nf)
    return gen_jcc(ir, instr, cond, dst, False)
+
+
def jo(ir, instr, dst):
    """JO: jump when OF is set."""
    return gen_jcc(ir, instr, of, dst, True)
+
+
def jno(ir, instr, dst):
    """JNO: jump when OF is clear."""
    return gen_jcc(ir, instr, of, dst, False)
+
+
def loop(ir, instr, dst):
    """LOOP: decrement rCX (address-size width) and jump to @dst while the
    decremented counter is non-zero; fall through to the next instruction
    otherwise. Flags are untouched.
    """
    e = []
    meip = mRIP[ir.IRDst.size]
    admode = instr.v_admode()
    myecx = mRCX[instr.mode][:admode]

    n = m2_expr.ExprLoc(ir.get_next_loc_key(instr), ir.IRDst.size)
    c = myecx - m2_expr.ExprInt(1, myecx.size)
    # Branch on the decremented value: non-zero -> dst, zero -> fallthrough.
    dst_o = m2_expr.ExprCond(c,
                             dst.zeroExtend(ir.IRDst.size),
                             n.zeroExtend(ir.IRDst.size))
    e.append(m2_expr.ExprAssign(myecx, c))
    e.append(m2_expr.ExprAssign(meip, dst_o))
    e.append(m2_expr.ExprAssign(ir.IRDst, dst_o))
    return e, []
+
+
def loopne(ir, instr, dst):
    """LOOPNE/LOOPNZ: decrement rCX and jump to @dst while the decremented
    counter is non-zero AND ZF is clear.
    """
    e = []
    meip = mRIP[ir.IRDst.size]
    admode = instr.v_admode()
    myecx = mRCX[instr.mode][:admode]

    n = m2_expr.ExprLoc(ir.get_next_loc_key(instr), ir.IRDst.size)

    # 1-bit condition: decremented counter non-zero ...
    c = m2_expr.ExprCond(myecx - m2_expr.ExprInt(1, size=myecx.size),
                         m2_expr.ExprInt(1, 1),
                         m2_expr.ExprInt(0, 1))
    # ... and ZF clear.
    c &= zf ^ m2_expr.ExprInt(1, 1)

    e.append(m2_expr.ExprAssign(myecx, myecx - m2_expr.ExprInt(1, myecx.size)))
    dst_o = m2_expr.ExprCond(c,
                             dst.zeroExtend(ir.IRDst.size),
                             n.zeroExtend(ir.IRDst.size))
    e.append(m2_expr.ExprAssign(meip, dst_o))
    e.append(m2_expr.ExprAssign(ir.IRDst, dst_o))
    return e, []
+
+
def loope(ir, instr, dst):
    """LOOPE/LOOPZ: decrement rCX and jump to @dst while the decremented
    counter is non-zero AND ZF is set.
    """
    e = []
    meip = mRIP[ir.IRDst.size]
    admode = instr.v_admode()
    myecx = mRCX[instr.mode][:admode]

    n = m2_expr.ExprLoc(ir.get_next_loc_key(instr), ir.IRDst.size)
    # 1-bit condition: decremented counter non-zero and ZF set.
    c = m2_expr.ExprCond(myecx - m2_expr.ExprInt(1, size=myecx.size),
                         m2_expr.ExprInt(1, 1),
                         m2_expr.ExprInt(0, 1))
    c &= zf
    e.append(m2_expr.ExprAssign(myecx, myecx - m2_expr.ExprInt(1, myecx.size)))
    dst_o = m2_expr.ExprCond(c,
                             dst.zeroExtend(ir.IRDst.size),
                             n.zeroExtend(ir.IRDst.size))
    e.append(m2_expr.ExprAssign(meip, dst_o))
    e.append(m2_expr.ExprAssign(ir.IRDst, dst_o))
    return e, []
+
+# XXX size to do; eflag
+
+
def div(ir, instr, src1):
    """DIV: unsigned divide of the double-width accumulator by @src1.

    8-bit: AX / src1 -> AL=quotient, AH=remainder.
    16/32/64-bit: rDX:rAX / src1 -> rAX=quotient, rDX=remainder.
    A zero divisor routes to an extra IR block raising EXCEPT_DIV_BY_ZERO.
    """
    e = []
    size = src1.size
    if size == 8:
        src2 = mRAX[instr.mode][:16]
    elif size in [16, 32, 64]:
        s1, s2 = mRDX[size], mRAX[size]
        src2 = m2_expr.ExprCompose(s2, s1)
    else:
        raise ValueError('div arg not impl', src1)

    c_d = m2_expr.ExprOp('udiv', src2, src1.zeroExtend(src2.size))
    c_r = m2_expr.ExprOp('umod', src2, src1.zeroExtend(src2.size))

    # if 8 bit div, only ax is assigned
    if size == 8:
        e.append(m2_expr.ExprAssign(src2, m2_expr.ExprCompose(c_d[:8], c_r[:8])))
    else:
        e.append(m2_expr.ExprAssign(s1, c_r[:size]))
        e.append(m2_expr.ExprAssign(s2, c_d[:size]))

    loc_div, loc_div_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_except, loc_except_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_next = ir.get_next_loc_key(instr)
    loc_next_expr = m2_expr.ExprLoc(loc_next, ir.IRDst.size)

    # Normal path: perform the division, continue to the next instruction.
    do_div = []
    do_div += e
    do_div.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    blk_div = IRBlock(ir.loc_db, loc_div, [AssignBlock(do_div, instr)])

    # Exception path: divisor is zero.
    do_except = []
    do_except.append(m2_expr.ExprAssign(exception_flags, m2_expr.ExprInt(
        EXCEPT_DIV_BY_ZERO, exception_flags.size)))
    do_except.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    blk_except = IRBlock(ir.loc_db, loc_except, [AssignBlock(do_except, instr)])

    # Dispatch on the divisor value.
    e = []
    e.append(m2_expr.ExprAssign(ir.IRDst,
                             m2_expr.ExprCond(src1, loc_div_expr, loc_except_expr)))

    return e, [blk_div, blk_except]
+
+
+# XXX size to do; eflag
+
def idiv(ir, instr, src1):
    """IDIV: signed divide of the double-width accumulator by @src1.

    8-bit: AX / src1 -> AL=quotient, AH=remainder.
    16/32/64-bit: rDX:rAX / src1 -> rAX=quotient, rDX=remainder.
    A zero divisor routes to an extra IR block raising EXCEPT_DIV_BY_ZERO.
    """
    e = []
    size = src1.size

    if size == 8:
        src2 = mRAX[instr.mode][:16]
    elif size in [16, 32, 64]:
        s1, s2 = mRDX[size], mRAX[size]
        src2 = m2_expr.ExprCompose(s2, s1)
    else:
        raise ValueError('div arg not impl', src1)

    c_d = m2_expr.ExprOp('sdiv', src2, src1.signExtend(src2.size))
    c_r = m2_expr.ExprOp('smod', src2, src1.signExtend(src2.size))

    # if 8 bit div, only ax is assigned
    if size == 8:
        e.append(m2_expr.ExprAssign(src2, m2_expr.ExprCompose(c_d[:8], c_r[:8])))
    else:
        e.append(m2_expr.ExprAssign(s1, c_r[:size]))
        e.append(m2_expr.ExprAssign(s2, c_d[:size]))

    loc_div, loc_div_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_except, loc_except_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_next = ir.get_next_loc_key(instr)
    loc_next_expr = m2_expr.ExprLoc(loc_next, ir.IRDst.size)

    # Normal path: perform the division, continue to the next instruction.
    do_div = []
    do_div += e
    do_div.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    blk_div = IRBlock(ir.loc_db, loc_div, [AssignBlock(do_div, instr)])

    # Exception path: divisor is zero.
    do_except = []
    do_except.append(m2_expr.ExprAssign(exception_flags, m2_expr.ExprInt(
        EXCEPT_DIV_BY_ZERO, exception_flags.size)))
    do_except.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    blk_except = IRBlock(ir.loc_db, loc_except, [AssignBlock(do_except, instr)])

    # Dispatch on the divisor value.
    e = []
    e.append(m2_expr.ExprAssign(ir.IRDst,
                             m2_expr.ExprCond(src1, loc_div_expr, loc_except_expr)))

    return e, [blk_div, blk_except]
+
+
+# XXX size to do; eflag
+
+
def mul(_, instr, src1):
    """MUL: unsigned multiply of the accumulator by @src1.

    8-bit: AX = AL * src1.
    16/32/64-bit: rDX:rAX = rAX * src1.
    OF and CF are set when the upper half of the product is non-zero.
    """
    e = []
    size = src1.size
    if src1.size in [16, 32, 64]:
        result = m2_expr.ExprOp('*',
                                mRAX[size].zeroExtend(size * 2),
                                src1.zeroExtend(size * 2))
        e.append(m2_expr.ExprAssign(mRAX[size], result[:size]))
        e.append(m2_expr.ExprAssign(mRDX[size], result[size:size * 2]))

    elif src1.size == 8:
        result = m2_expr.ExprOp('*',
                                mRAX[instr.mode][:8].zeroExtend(16),
                                src1.zeroExtend(16))
        e.append(m2_expr.ExprAssign(mRAX[instr.mode][:16], result))
    else:
        raise ValueError('unknow size')

    # CF = OF = (high half of the product != 0).
    e.append(m2_expr.ExprAssign(of, m2_expr.ExprCond(result[size:size * 2],
                                                  m2_expr.ExprInt(1, 1),
                                                  m2_expr.ExprInt(0, 1))))
    e.append(m2_expr.ExprAssign(cf, m2_expr.ExprCond(result[size:size * 2],
                                                  m2_expr.ExprInt(1, 1),
                                                  m2_expr.ExprInt(0, 1))))

    return e, []
+
+
def imul(_, instr, src1, src2=None, src3=None):
    """IMUL: signed multiply, covering the three encoding forms.

    One operand: rDX:rAX = rAX * src1 (AX = AL * src1 for 8-bit).
    Two operands: src1 = src1 * src2.
    Three operands: src1 = src2 * src3.
    CF = OF = 1 when the full product does not fit in the destination
    (i.e. it differs from the sign-extended truncated result).
    """
    e = []
    size = src1.size
    if src2 is None:
        if size in [16, 32, 64]:
            result = m2_expr.ExprOp('*',
                                    mRAX[size].signExtend(size * 2),
                                    src1.signExtend(size * 2))
            e.append(m2_expr.ExprAssign(mRAX[size], result[:size]))
            e.append(m2_expr.ExprAssign(mRDX[size], result[size:size * 2]))
        elif size == 8:
            dst = mRAX[instr.mode][:16]
            result = m2_expr.ExprOp('*',
                                    mRAX[instr.mode][:8].signExtend(16),
                                    src1.signExtend(16))

            e.append(m2_expr.ExprAssign(dst, result))
        # Overflow test: full product != sign-extension of its low half.
        value = m2_expr.ExprCond(result - result[:size].signExtend(size * 2),
                                 m2_expr.ExprInt(1, 1),
                                 m2_expr.ExprInt(0, 1))
        e.append(m2_expr.ExprAssign(cf, value))
        value = m2_expr.ExprCond(result - result[:size].signExtend(size * 2),
                                 m2_expr.ExprInt(1, 1),
                                 m2_expr.ExprInt(0, 1))
        e.append(m2_expr.ExprAssign(of, value))

    else:
        if src3 is None:
            # Two-operand form: normalize to the three-operand shape.
            src3 = src2
            src2 = src1
        result = m2_expr.ExprOp('*',
                                src2.signExtend(size * 2),
                                src3.signExtend(size * 2))
        e.append(m2_expr.ExprAssign(src1, result[:size]))

        # Overflow test: full product != sign-extension of its low half.
        value = m2_expr.ExprCond(result - result[:size].signExtend(size * 2),
                                 m2_expr.ExprInt(1, 1),
                                 m2_expr.ExprInt(0, 1))
        e.append(m2_expr.ExprAssign(cf, value))
        value = m2_expr.ExprCond(result - result[:size].signExtend(size * 2),
                                 m2_expr.ExprInt(1, 1),
                                 m2_expr.ExprInt(0, 1))
        e.append(m2_expr.ExprAssign(of, value))
    return e, []
+
+
def cbw(_, instr):
    """CBW: sign-extend AL into AX."""
    opmode = instr.v_opmode()
    al = mRAX[opmode][:8]
    ax = mRAX[opmode][:16]
    return [m2_expr.ExprAssign(ax, al.signExtend(16))], []
+
+
def cwde(_, instr):
    """CWDE: sign-extend AX into EAX."""
    opmode = instr.v_opmode()
    ax = mRAX[opmode][:16]
    eax = mRAX[opmode][:32]
    return [m2_expr.ExprAssign(eax, ax.signExtend(32))], []
+
+
def cdqe(_, instr):
    """CDQE: sign-extend EAX into RAX (64-bit mode only)."""
    eax = mRAX[instr.mode][:32]
    rax = mRAX[instr.mode][:64]
    return [m2_expr.ExprAssign(rax, eax.signExtend(64))], []
+
+
def cwd(_, instr):
    """CWD: sign-extend AX into DX:AX."""
    ax = mRAX[instr.mode][:16]
    dx = mRDX[instr.mode][:16]
    extended = ax.signExtend(32)
    return [
        m2_expr.ExprAssign(ax, extended[:16]),
        m2_expr.ExprAssign(dx, extended[16:32]),
    ], []
+
+
def cdq(_, instr):
    """CDQ: sign-extend EAX into EDX:EAX (only EDX is written; EAX keeps
    its value)."""
    opmode = instr.v_opmode()
    extended = mRAX[opmode].signExtend(64)
    return [m2_expr.ExprAssign(mRDX[opmode], extended[32:64])], []
+
+
def cqo(_, instr):
    """CQO: sign-extend RAX into RDX:RAX (64-bit mode only)."""
    rax = mRAX[instr.mode][:64]
    rdx = mRDX[instr.mode][:64]
    extended = rax.signExtend(128)
    return [
        m2_expr.ExprAssign(rax, extended[:64]),
        m2_expr.ExprAssign(rdx, extended[64:128]),
    ], []
+
+
def stos(ir, instr, size):
    """STOS: store the @size-bit accumulator slice at ES:[E/RDI], then
    advance (DF=0) or rewind (DF=1) the pointer by @size // 8 bytes via
    two extra IR blocks.
    """
    loc_df_0, loc_df_0_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_df_1, loc_df_1_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_next_expr = m2_expr.ExprLoc(ir.get_next_loc_key(instr), ir.IRDst.size)

    addr_o = mRDI[instr.mode][:instr.v_admode()]
    addr = addr_o
    addr_p = addr + m2_expr.ExprInt(size // 8, addr.size)
    addr_m = addr - m2_expr.ExprInt(size // 8, addr.size)
    if ir.do_str_segm:
        mss = ES
        if instr.additional_info.g2.value:
            raise NotImplementedError("add segm support")
        addr = ir.gen_segm_expr(mss, addr)


    b = mRAX[instr.mode][:size]

    # DF=0: pointer moves forward.
    e0 = []
    e0.append(m2_expr.ExprAssign(addr_o, addr_p))
    e0.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    e0 = IRBlock(ir.loc_db, loc_df_0, [AssignBlock(e0, instr)])

    # DF=1: pointer moves backward.
    e1 = []
    e1.append(m2_expr.ExprAssign(addr_o, addr_m))
    e1.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    e1 = IRBlock(ir.loc_db, loc_df_1, [AssignBlock(e1, instr)])

    e = []
    e.append(m2_expr.ExprAssign(ir.ExprMem(addr, size), b))
    e.append(m2_expr.ExprAssign(ir.IRDst,
                             m2_expr.ExprCond(df, loc_df_1_expr, loc_df_0_expr)))
    return e, [e0, e1]
+
+
def lods(ir, instr, size):
    """LODS: load the @size-bit value at DS:[E/RSI] into the accumulator,
    then advance (DF=0) or rewind (DF=1) the pointer by @size // 8 bytes.

    In 64-bit mode a 32-bit load zero-extends into the full RAX, matching
    the architectural write-to-32-bit-register behavior.
    """
    loc_df_0, loc_df_0_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_df_1, loc_df_1_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_next_expr = m2_expr.ExprLoc(ir.get_next_loc_key(instr), ir.IRDst.size)
    e = []

    addr_o = mRSI[instr.mode][:instr.v_admode()]
    addr = addr_o
    addr_p = addr + m2_expr.ExprInt(size // 8, addr.size)
    addr_m = addr - m2_expr.ExprInt(size // 8, addr.size)
    if ir.do_str_segm:
        mss = DS
        if instr.additional_info.g2.value:
            raise NotImplementedError("add segm support")
        addr = ir.gen_segm_expr(mss, addr)


    b = mRAX[instr.mode][:size]

    # DF=0: pointer moves forward.
    e0 = []
    e0.append(m2_expr.ExprAssign(addr_o, addr_p))
    e0.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    e0 = IRBlock(ir.loc_db, loc_df_0, [AssignBlock(e0, instr)])

    # DF=1: pointer moves backward.
    e1 = []
    e1.append(m2_expr.ExprAssign(addr_o, addr_m))
    e1.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    e1 = IRBlock(ir.loc_db, loc_df_1, [AssignBlock(e1, instr)])

    e = []
    if instr.mode == 64 and b.size == 32:
        e.append(m2_expr.ExprAssign(mRAX[instr.mode],
                                 ir.ExprMem(addr, size).zeroExtend(64)))
    else:
        e.append(m2_expr.ExprAssign(b, ir.ExprMem(addr, size)))

    e.append(m2_expr.ExprAssign(ir.IRDst,
                             m2_expr.ExprCond(df, loc_df_1_expr, loc_df_0_expr)))
    return e, [e0, e1]
+
+
def movs(ir, instr, size):
    """MOVS: copy a @size-bit value from DS:[E/RSI] to ES:[E/RDI], then
    advance (DF=0) or rewind (DF=1) both pointers by @size // 8 bytes.
    """
    loc_df_0, loc_df_0_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_df_1, loc_df_1_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_next_expr = m2_expr.ExprLoc(ir.get_next_loc_key(instr), ir.IRDst.size)

    dst = mRDI[instr.mode][:instr.v_admode()]
    src = mRSI[instr.mode][:instr.v_admode()]

    e = []
    if ir.do_str_segm:
        if instr.additional_info.g2.value:
            raise NotImplementedError("add segm support")
        src_sgm = ir.gen_segm_expr(DS, src)
        dst_sgm = ir.gen_segm_expr(ES, dst)

    else:
        src_sgm = src
        dst_sgm = dst

    offset = m2_expr.ExprInt(size // 8, src.size)

    e.append(m2_expr.ExprAssign(ir.ExprMem(dst_sgm, size),
                             ir.ExprMem(src_sgm, size)))

    # DF=0: both pointers move forward.
    e0 = []
    e0.append(m2_expr.ExprAssign(src, src + offset))
    e0.append(m2_expr.ExprAssign(dst, dst + offset))
    e0.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    e0 = IRBlock(ir.loc_db, loc_df_0, [AssignBlock(e0, instr)])

    # DF=1: both pointers move backward.
    e1 = []
    e1.append(m2_expr.ExprAssign(src, src - offset))
    e1.append(m2_expr.ExprAssign(dst, dst - offset))
    e1.append(m2_expr.ExprAssign(ir.IRDst, loc_next_expr))
    e1 = IRBlock(ir.loc_db, loc_df_1, [AssignBlock(e1, instr)])

    e.append(m2_expr.ExprAssign(ir.IRDst,
                             m2_expr.ExprCond(df, loc_df_1_expr, loc_df_0_expr)))
    return e, [e0, e1]
+
+
def movsd(_, instr, dst, src):
    """SSE MOVSD: move a scalar 64-bit value between XMM registers and
    memory.

    reg <- reg: only the low 64 bits move, upper destination bits kept.
    mem <- reg: store the low 64 bits of the source register.
    reg <- mem: load 64 bits and zero the destination's upper bits.
    """
    # 64 bits access
    if dst.is_id() and src.is_id():
        src = src[:64]
        dst = dst[:64]
    elif dst.is_mem() and src.is_id():
        dst = m2_expr.ExprMem(dst.ptr, 64)
        src = src[:64]
    else:
        src = m2_expr.ExprMem(src.ptr, 64)
        # Erase dst high bits
        src = src.zeroExtend(dst.size)
    return [m2_expr.ExprAssign(dst, src)], []
+
+
def movsd_dispatch(ir, instr, dst=None, src=None):
    """Disambiguate the MOVSD mnemonic: with no operands it is the 32-bit
    string move; with operands it is the SSE scalar-double move."""
    if dst is None and src is None:
        return movs(ir, instr, 32)
    return movsd(ir, instr, dst, src)
+
+
def float_prev(flt, popcount=1):
    """Return the float-stack register located @popcount slots before @flt.

    @flt: float-stack register (element of float_list)
    @popcount: number of slots to step back (default 1)
    Returns None when @flt is not a float-stack register, or when stepping
    back @popcount slots would fall off the stack (e.g. FSTP ST(0), where
    the value is simply dropped).
    """
    # Idiomatic membership test (was: `not flt in float_list`).
    if flt not in float_list:
        return None
    i = float_list.index(flt)
    if i < popcount:
        # Drop value (ex: FSTP ST(0))
        return None
    return float_list[i - popcount]
+
+
def float_pop(avoid_flt=None, popcount=1):
    """
    Generate floatpop semantic (@popcount times), avoiding the avoid_flt@ float
    @avoid_flt: float avoided in the generated semantic
    @popcount: pop count
    """
    # Translate avoid_flt to its post-pop position so the shift below
    # skips the register the caller is about to overwrite.
    avoid_flt = float_prev(avoid_flt, popcount)
    e = []
    # Shift every remaining stack slot down by popcount.
    for i in range(8 - popcount):
        if avoid_flt != float_list[i]:
            e.append(m2_expr.ExprAssign(float_list[i],
                                     float_list[i + popcount]))
    # Vacated top slots are refilled with +0.0.
    fill_value = m2_expr.ExprOp("sint_to_fp", m2_expr.ExprInt(0, 64))
    for i in range(8 - popcount, 8):
        e.append(m2_expr.ExprAssign(float_list[i],
                                 fill_value))
    e.append(
        m2_expr.ExprAssign(float_stack_ptr,
                        float_stack_ptr - m2_expr.ExprInt(popcount, 3)))
    return e
+
+# XXX TODO
+
+
def fcom(_, instr, dst=None, src=None):
    """FCOM: compare @dst with @src, setting the C0/C1/C2/C3 flags."""
    if dst is None and src is None:
        dst, src = float_st0, float_st1
    elif src is None:
        src = mem2double(instr, dst)
        dst = float_st0

    e = [
        m2_expr.ExprAssign(flag, m2_expr.ExprOp(op, dst, src))
        for flag, op in ((float_c0, 'fcom_c0'), (float_c1, 'fcom_c1'),
                         (float_c2, 'fcom_c2'), (float_c3, 'fcom_c3'))
    ]
    e += set_float_cs_eip(instr)
    return e, []
+
+
def ftst(_, instr):
    """FTST: compare ST0 against +0.0 via the C0..C3 condition flags."""
    zero = m2_expr.ExprOp('sint_to_fp', m2_expr.ExprInt(0, 64))
    e = [
        m2_expr.ExprAssign(flag, m2_expr.ExprOp(op, float_st0, zero))
        for flag, op in ((float_c0, 'fcom_c0'), (float_c1, 'fcom_c1'),
                         (float_c2, 'fcom_c2'), (float_c3, 'fcom_c3'))
    ]
    e += set_float_cs_eip(instr)
    return e, []
+
+
def fxam(ir, instr):
    """
    FXAM: classify ST0 into C3/C2/C0 and put its sign bit in C1.

    NaN:
        C3, C2, C0 = 001;
    Normal:
        C3, C2, C0 = 010;
    Infinity:
        C3, C2, C0 = 011;
    Zero:
        C3, C2, C0 = 100;
    Empty:
        C3, C2, C0 = 101;
    Denormal:
        C3, C2, C0 = 110;

    C1 = sign bit of ST; (* 0 for positive, 1 for negative *)
    """
    dst = float_st0

    # Empty not handled
    locs = {}
    for name in ["NaN", "Normal", "Infinity", "Zero", "Denormal"]:
        locs[name] = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_next = ir.get_next_loc_key(instr)
    loc_next_expr = m2_expr.ExprLoc(loc_next, ir.IRDst.size)

    # if Denormal:
    #     if zero:
    #         do_zero
    #     else:
    #         do_denormal
    # else:
    #     if Nan:
    #         do_nan
    #     else:
    #         if infinity:
    #             do_infinity
    #         else:
    #             do_normal

    # Decision tree implemented as nested conditional jumps on IRDst
    irdst = m2_expr.ExprCond(
        m2_expr.expr_is_IEEE754_denormal(dst),
        m2_expr.ExprCond(m2_expr.expr_is_IEEE754_zero(dst),
                 locs["Zero"][1],
                 locs["Denormal"][1],
        ),
        m2_expr.ExprCond(m2_expr.expr_is_NaN(dst),
                 locs["NaN"][1],
                 m2_expr.ExprCond(m2_expr.expr_is_infinite(dst),
                          locs["Infinity"][1],
                          locs["Normal"][1],
                 )
        )
    )
    base = [m2_expr.ExprAssign(ir.IRDst, irdst),
         m2_expr.ExprAssign(float_c1, dst.msb())
    ]
    base += set_float_cs_eip(instr)

    # One IR block per classification outcome; each sets C0/C2/C3 per the
    # table in the docstring then falls through to the next instruction
    out = [
        IRBlock(ir.loc_db, locs["Zero"][0], [AssignBlock({
            float_c0: m2_expr.ExprInt(0, float_c0.size),
            float_c2: m2_expr.ExprInt(0, float_c2.size),
            float_c3: m2_expr.ExprInt(1, float_c3.size),
            ir.IRDst: loc_next_expr,
        }, instr)]),
        IRBlock(ir.loc_db, locs["Denormal"][0], [AssignBlock({
            float_c0: m2_expr.ExprInt(0, float_c0.size),
            float_c2: m2_expr.ExprInt(1, float_c2.size),
            float_c3: m2_expr.ExprInt(1, float_c3.size),
            ir.IRDst: loc_next_expr,
        }, instr)]),
        IRBlock(ir.loc_db, locs["NaN"][0], [AssignBlock({
            float_c0: m2_expr.ExprInt(1, float_c0.size),
            float_c2: m2_expr.ExprInt(0, float_c2.size),
            float_c3: m2_expr.ExprInt(0, float_c3.size),
            ir.IRDst: loc_next_expr,
        }, instr)]),
        IRBlock(ir.loc_db, locs["Infinity"][0], [AssignBlock({
            float_c0: m2_expr.ExprInt(1, float_c0.size),
            float_c2: m2_expr.ExprInt(1, float_c2.size),
            float_c3: m2_expr.ExprInt(0, float_c3.size),
            ir.IRDst: loc_next_expr,
        }, instr)]),
        IRBlock(ir.loc_db, locs["Normal"][0], [AssignBlock({
            float_c0: m2_expr.ExprInt(0, float_c0.size),
            float_c2: m2_expr.ExprInt(1, float_c2.size),
            float_c3: m2_expr.ExprInt(0, float_c3.size),
            ir.IRDst: loc_next_expr,
        }, instr)]),
    ]
    return base, out
+
+
def ficom(_, instr, dst, src=None):
    """FICOM: compare ST0 with an integer operand through C0..C3."""
    dst, src = float_implicit_st0(dst, src)
    src_ext = src.zeroExtend(dst.size)
    e = [
        m2_expr.ExprAssign(flag, m2_expr.ExprOp(op, dst, src_ext))
        for flag, op in ((float_c0, 'fcom_c0'), (float_c1, 'fcom_c1'),
                         (float_c2, 'fcom_c2'), (float_c3, 'fcom_c3'))
    ]
    e += set_float_cs_eip(instr)
    return e, []
+
+
def fcomi(_, instr, dst=None, src=None):
    """FCOMI: FPU compare writing directly into EFLAGS (ZF/PF/CF)."""
    # TODO unordered float
    if dst is None and src is None:
        dst, src = float_st0, float_st1
    elif src is None:
        dst, src = float_st0, dst

    e = [
        m2_expr.ExprAssign(cf, m2_expr.ExprOp('fcom_c0', dst, src)),
        m2_expr.ExprAssign(pf, m2_expr.ExprOp('fcom_c2', dst, src)),
        m2_expr.ExprAssign(zf, m2_expr.ExprOp('fcom_c3', dst, src)),
        # OF/SF/AF are cleared by this instruction
        m2_expr.ExprAssign(of, m2_expr.ExprInt(0, 1)),
        m2_expr.ExprAssign(nf, m2_expr.ExprInt(0, 1)),
        m2_expr.ExprAssign(af, m2_expr.ExprInt(0, 1)),
    ]
    e += set_float_cs_eip(instr)
    return e, []
+
+
def fcomip(ir, instr, dst=None, src=None):
    """FCOMIP: FCOMI then pop the float stack."""
    e, extra = fcomi(ir, instr, dst, src)
    return e + float_pop() + set_float_cs_eip(instr), extra
+
+
def fucomi(ir, instr, dst=None, src=None):
    """FUCOMI: unordered compare; currently modeled exactly like FCOMI."""
    # TODO unordered float
    return fcomi(ir, instr, dst, src)
+
+
def fucomip(ir, instr, dst=None, src=None):
    """FUCOMIP: unordered compare-and-pop; currently modeled like FCOMIP."""
    # TODO unordered float
    return fcomip(ir, instr, dst, src)
+
+
def fcomp(ir, instr, dst=None, src=None):
    """FCOMP: FCOM then pop the float stack."""
    e, extra = fcom(ir, instr, dst, src)
    return e + float_pop() + set_float_cs_eip(instr), extra
+
+
def fcompp(ir, instr, dst=None, src=None):
    """FCOMPP: FCOM then pop the float stack twice."""
    e, extra = fcom(ir, instr, dst, src)
    return e + float_pop(popcount=2) + set_float_cs_eip(instr), extra
+
+
def ficomp(ir, instr, dst, src=None):
    """FICOMP: FICOM then pop the float stack."""
    e, extra = ficom(ir, instr, dst, src)
    return e + float_pop() + set_float_cs_eip(instr), extra
+
+
def fucom(ir, instr, dst=None, src=None):
    """FUCOM: unordered compare; currently modeled exactly like FCOM."""
    # TODO unordered float
    return fcom(ir, instr, dst, src)
+
+
def fucomp(ir, instr, dst=None, src=None):
    """FUCOMP: unordered compare-and-pop; currently modeled like FCOMP."""
    # TODO unordered float
    return fcomp(ir, instr, dst, src)
+
+
def fucompp(ir, instr, dst=None, src=None):
    """FUCOMPP: unordered compare, double pop; modeled like FCOMPP."""
    # TODO unordered float
    return fcompp(ir, instr, dst, src)
+
+
def comiss(_, instr, dst, src):
    """COMISS: compare low single-precision operands into ZF/PF/CF."""
    # TODO unordered float

    e = []

    # NOTE(review): 'sint_to_fp' converts the raw 32-bit pattern as a signed
    # integer; for a true float compare this would presumably need a bit
    # reinterpretation instead -- confirm against the jit's fcom_* helpers.
    dst = m2_expr.ExprOp('sint_to_fp', dst[:32])
    src = m2_expr.ExprOp('sint_to_fp', src[:32])

    e.append(m2_expr.ExprAssign(cf, m2_expr.ExprOp('fcom_c0', dst, src)))
    e.append(m2_expr.ExprAssign(pf, m2_expr.ExprOp('fcom_c2', dst, src)))
    e.append(m2_expr.ExprAssign(zf, m2_expr.ExprOp('fcom_c3', dst, src)))

    # OF/SF/AF are always cleared
    e.append(m2_expr.ExprAssign(of, m2_expr.ExprInt(0, 1)))
    e.append(m2_expr.ExprAssign(nf, m2_expr.ExprInt(0, 1)))
    e.append(m2_expr.ExprAssign(af, m2_expr.ExprInt(0, 1)))

    e += set_float_cs_eip(instr)
    return e, []
+
+
def comisd(_, instr, dst, src):
    """COMISD: compare low double-precision operands into ZF/PF/CF."""
    # TODO unordered float

    e = []

    # NOTE(review): same concern as comiss -- 'sint_to_fp' treats the raw
    # 64-bit pattern as a signed integer rather than reinterpreting bits.
    dst = m2_expr.ExprOp('sint_to_fp', dst[:64])
    src = m2_expr.ExprOp('sint_to_fp', src[:64])

    e.append(m2_expr.ExprAssign(cf, m2_expr.ExprOp('fcom_c0', dst, src)))
    e.append(m2_expr.ExprAssign(pf, m2_expr.ExprOp('fcom_c2', dst, src)))
    e.append(m2_expr.ExprAssign(zf, m2_expr.ExprOp('fcom_c3', dst, src)))

    # OF/SF/AF are always cleared
    e.append(m2_expr.ExprAssign(of, m2_expr.ExprInt(0, 1)))
    e.append(m2_expr.ExprAssign(nf, m2_expr.ExprInt(0, 1)))
    e.append(m2_expr.ExprAssign(af, m2_expr.ExprInt(0, 1)))

    e += set_float_cs_eip(instr)
    return e, []
+
+
def fld(_, instr, src):
    """FLD: push @src onto the x87 register stack (float32 is widened)."""
    if src.size == 32:
        src = m2_expr.ExprOp("fpconvert_fp64", src)
    if isinstance(src, m2_expr.ExprMem) and src.size > 64:
        raise NotImplementedError('convert from 80bits')

    # Shift ST(i-1) into ST(i) for i = 7..1, then write the new ST0
    e = [
        m2_expr.ExprAssign(float_list[i], float_list[i - 1])
        for i in range(7, 0, -1)
    ]
    e.append(m2_expr.ExprAssign(float_st0, src))
    e.append(
        m2_expr.ExprAssign(float_stack_ptr,
                           float_stack_ptr + m2_expr.ExprInt(1, 3)))

    e += set_float_cs_eip(instr)
    return e, []
+
+
def fst(_, instr, dst):
    """FST: store ST0 into @dst (float32 destinations are down-converted)."""
    if isinstance(dst, m2_expr.ExprMem) and dst.size > 64:
        raise NotImplementedError('convert to 80bits')

    value = float_st0
    if dst.size == 32:
        value = m2_expr.ExprOp("fpconvert_fp32", value)
    return [m2_expr.ExprAssign(dst, value)] + set_float_cs_eip(instr), []
+
+
def fstp(ir, instr, dst):
    """FSTP: store ST0 to @dst, then pop the float stack."""
    e = []

    if isinstance(dst, m2_expr.ExprMem) and dst.size > 64:
        raise NotImplementedError('convert to 80bits')

    if isinstance(dst, m2_expr.ExprMem):
        src = float_st0
        if dst.size == 32:
            # Single-precision memory destination: down-convert
            src = m2_expr.ExprOp("fpconvert_fp32", src)
        e.append(m2_expr.ExprAssign(dst, src))
    else:
        src = float_st0
        # Register destination: write to the slot above so that after the
        # pop (which avoids that slot) the value lands where expected.
        if float_list.index(dst) > 1:
            # a = st0 -> st0 is dropped
            # a = st1 -> st0 = st0, useless
            e.append(m2_expr.ExprAssign(float_prev(dst), src))

    e += set_float_cs_eip(instr)
    e += float_pop(dst)
    return e, []
+
+
def fist(_, instr, dst):
    """FIST: store ST0 as a signed integer of @dst's size."""
    converted = m2_expr.ExprOp('fp_to_sint%d' % dst.size, float_st0)
    e = [m2_expr.ExprAssign(dst, converted)]
    e += set_float_cs_eip(instr)
    return e, []
+
+
def fistp(ir, instr, dst):
    """FISTP: FIST then pop the float stack."""
    e, extra = fist(ir, instr, dst)
    return e + float_pop(dst), extra
+
+
def fisttp(_, instr, dst):
    """FISTTP: store ST0 as an integer using truncation, then pop."""
    truncated = m2_expr.ExprOp('fpround_towardszero', float_st0)
    e = [m2_expr.ExprAssign(
        dst,
        m2_expr.ExprOp('fp_to_sint%d' % dst.size, truncated))]
    e += set_float_cs_eip(instr)
    e += float_pop(dst)
    return e, []
+
+
def fild(ir, instr, src):
    """FILD: push a signed integer, converted to double, onto the stack."""
    fp_value = m2_expr.ExprOp('sint_to_fp', src.signExtend(64))
    e = set_float_cs_eip(instr)
    e_fld, extra = fld(ir, instr, fp_value)
    return e + e_fld, extra
+
+
def fldz(ir, instr):
    """FLDZ: push +0.0 onto the x87 stack."""
    return fld(ir, instr, m2_expr.ExprOp('sint_to_fp', m2_expr.ExprInt(0, 64)))
+
+
def fld1(ir, instr):
    """FLD1: push +1.0 onto the x87 stack."""
    return fld(ir, instr, m2_expr.ExprOp('sint_to_fp', m2_expr.ExprInt(1, 64)))
+
+
def fldl2t(ir, instr):
    """FLDL2T: push log2(10) onto the x87 stack.

    The constant is packed to its IEEE754 double bit pattern and must be
    reinterpreted with 'mem_64_to_double' (as fldln2/fldl2e/fldlg2 do).
    The previous 'sint_to_fp' converted the bit pattern itself to a float,
    yielding a huge value instead of log2(10).
    """
    value_f = math.log(10) / math.log(2)
    value = struct.unpack('Q', struct.pack('d', value_f))[0]
    return fld(ir, instr, m2_expr.ExprOp('mem_64_to_double',
                                         m2_expr.ExprInt(value, 64)))
+
+
def fldpi(ir, instr):
    """FLDPI: push pi onto the x87 stack.

    Uses 'mem_64_to_double' to reinterpret the packed IEEE754 bit pattern,
    consistent with fldln2/fldl2e/fldlg2; the previous 'sint_to_fp' would
    have converted the bit pattern as an integer instead of loading pi.
    """
    value_f = math.pi
    value = struct.unpack('Q', struct.pack('d', value_f))[0]
    return fld(ir, instr, m2_expr.ExprOp('mem_64_to_double',
                                         m2_expr.ExprInt(value, 64)))
+
+
def fldln2(ir, instr):
    """FLDLN2: push ln(2); the double bit pattern is reinterpreted."""
    bits = struct.unpack('Q', struct.pack('d', math.log(2)))[0]
    return fld(ir, instr, m2_expr.ExprOp('mem_64_to_double',
                                         m2_expr.ExprInt(bits, 64)))
+
+
def fldl2e(ir, instr):
    """FLDL2E: push log2(e) == 1/ln(2)."""
    bits = struct.unpack('Q', struct.pack('d', 1 / math.log(2)))[0]
    return fld(ir, instr, m2_expr.ExprOp('mem_64_to_double',
                                         m2_expr.ExprInt(bits, 64)))
+
+
def fldlg2(ir, instr):
    """FLDLG2: push log10(2)."""
    bits = struct.unpack('Q', struct.pack('d', math.log10(2)))[0]
    return fld(ir, instr, m2_expr.ExprOp('mem_64_to_double',
                                         m2_expr.ExprInt(bits, 64)))
+
+
def fadd(_, instr, dst, src=None):
    """FADD: dst <- dst + src (x87 addition; ST0 defaults apply)."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fadd', dst, mem2double(instr, src))
    return [m2_expr.ExprAssign(dst, result)] + set_float_cs_eip(instr), []
+
+
def fiadd(_, instr, dst, src=None):
    """FIADD: dst <- dst + (integer) src."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fiadd', dst, mem2double(instr, src))
    return [m2_expr.ExprAssign(dst, result)] + set_float_cs_eip(instr), []
+
+
def fisub(_, instr, dst, src=None):
    """FISUB: dst <- dst - (integer) src."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fisub', dst, mem2double(instr, src))
    return [m2_expr.ExprAssign(dst, result)] + set_float_cs_eip(instr), []
+
+
def fisubr(_, instr, dst, src=None):
    """FISUBR: dst <- (integer) src - dst (reversed subtraction)."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fisub', mem2double(instr, src), dst)
    return [m2_expr.ExprAssign(dst, result)] + set_float_cs_eip(instr), []
+
+
def fpatan(_, instr):
    """FPATAN: compute atan from ST0/ST1, leave the result in the new ST0.

    The result is written to the slot above ST1; float_pop avoids that slot
    so after the pop the value is the new ST0.
    """
    e = []
    a = float_st1
    e.append(m2_expr.ExprAssign(float_prev(a),
                             m2_expr.ExprOp('fpatan', float_st0, float_st1)))
    e += set_float_cs_eip(instr)
    e += float_pop(a)
    return e, []
+
+
def fprem(_, instr):
    """FPREM: ST0 <- partial remainder of ST0 / ST1.

    C0/C3/C1 receive bits 2/1/0 of the truncated quotient; C2 = 0 reports
    the reduction as always complete.
    """
    e = []
    e.append(
        m2_expr.ExprAssign(float_st0, m2_expr.ExprOp('fprem', float_st0, float_st1)))
    # Remaining bits (ex: used in argument reduction in tan)
    quotient = m2_expr.ExprOp('fp_to_sint32', m2_expr.ExprOp('fpround_towardszero', m2_expr.ExprOp('fdiv', float_st0, float_st1)))
    e += [m2_expr.ExprAssign(float_c0, quotient[2:3]),
          m2_expr.ExprAssign(float_c3, quotient[1:2]),
          m2_expr.ExprAssign(float_c1, quotient[0:1]),
          # Consider the reduction is always completed
          m2_expr.ExprAssign(float_c2, m2_expr.ExprInt(0, 1)),
          ]
    e += set_float_cs_eip(instr)
    return e, []
+
+
def fprem1(_, instr):
    """FPREM1: IEEE partial remainder of ST0 by ST1."""
    remainder = m2_expr.ExprOp('fprem1', float_st0, float_st1)
    return [m2_expr.ExprAssign(float_st0, remainder)] + set_float_cs_eip(instr), []
+
+
def faddp(_, instr, dst, src=None):
    """FADDP: add, store into the slot above @dst, then pop the stack."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fadd', dst, mem2double(instr, src))
    e = [m2_expr.ExprAssign(float_prev(dst), result)]
    e += set_float_cs_eip(instr)
    e += float_pop(dst)
    return e, []
+
+
def fninit(_, instr):
    """FNINIT: initialize the FPU.

    NOTE(review): only the FPU cs/eip bookkeeping is emitted here; resetting
    the control/status/tag words is not modeled.
    """
    e = []
    e += set_float_cs_eip(instr)
    return e, []
+
+
def fyl2x(_, instr):
    """FYL2X: ST1 * log2(ST0), result in the new ST0 after the pop.

    As in fpatan, the result is written to the slot above ST1 and the pop
    avoids that slot.
    """
    e = []
    a = float_st1
    e.append(
        m2_expr.ExprAssign(float_prev(a), m2_expr.ExprOp('fyl2x', float_st0, float_st1)))
    e += set_float_cs_eip(instr)
    e += float_pop(a)
    return e, []
+
+
def fnstenv(ir, instr, dst):
    """FNSTENV: store the x87 environment to memory at @dst.

    Slot i lives at offset (size // 8) * i: 0 control word, 1 status word,
    2 tag word (not written, see XXX), 3 fpu ip, 4 fpu cs, 5 data address,
    6 data segment.
    """
    e = []
    # XXX TODO tag word, ...
    status_word = m2_expr.ExprCompose(m2_expr.ExprInt(0, 8),
                                      float_c0, float_c1, float_c2,
                                      float_stack_ptr, float_c3,
                                      m2_expr.ExprInt(0, 1))

    s = instr.mode
    # The behaviour in 64bit is identical to 32 bit
    # This will truncate addresses
    size = min(32, s)
    # Slot 0: control word
    ad = ir.ExprMem(dst.ptr, size=16)
    e.append(m2_expr.ExprAssign(ad, float_control))
    # Slot 1: status word
    ad = ir.ExprMem(
        dst.ptr + m2_expr.ExprInt(
            (size // 8) * 1,
            dst.ptr.size
        ),
        size=16
    )
    e.append(m2_expr.ExprAssign(ad, status_word))
    # Slot 3: FPU instruction pointer
    ad = ir.ExprMem(
        dst.ptr + m2_expr.ExprInt(
            (size // 8) * 3,
            dst.ptr.size
        ),
        size=size
    )
    e.append(m2_expr.ExprAssign(ad, float_eip[:size]))
    # Slot 4: FPU code segment
    ad = ir.ExprMem(
        dst.ptr + m2_expr.ExprInt(
            (size // 8) * 4,
            dst.ptr.size
        ),
        size=16
    )
    e.append(m2_expr.ExprAssign(ad, float_cs))
    # Slot 5: FPU data address
    ad = ir.ExprMem(
        dst.ptr + m2_expr.ExprInt(
            (size // 8) * 5,
            dst.ptr.size
        ),
        size=size
    )
    e.append(m2_expr.ExprAssign(ad, float_address[:size]))
    # Slot 6: FPU data segment
    ad = ir.ExprMem(
        dst.ptr + m2_expr.ExprInt(
            (size // 8) * 6,
            dst.ptr.size
        ),
        size=16
    )
    e.append(m2_expr.ExprAssign(ad, float_ds))
    return e, []
+
+
def fldenv(ir, instr, src):
    """FLDENV: load the x87 environment from memory at @src.

    Mirrors fnstenv's layout: slot i is read from offset (size // 8) * i.
    Fix: offsets were previously computed as `size // (8 * offset)` (e.g.
    32 // 24 == 1), which does not match where fnstenv stores each slot;
    they must be `(size // 8) * offset` (e.g. 12).
    """
    e = []
    # Inspired from fnstenv (same TODOs / issues)

    s = instr.mode
    # The behaviour in 64bit is identical to 32 bit
    # This will truncate addresses
    size = min(32, s)

    # Float control
    ad = ir.ExprMem(src.ptr, size=16)
    e.append(m2_expr.ExprAssign(float_control, ad))

    # Status word (slot 1)
    ad = ir.ExprMem(
        src.ptr + m2_expr.ExprInt(
            (size // 8) * 1,
            size=src.ptr.size
        ),
        size=16
    )
    e += [
        m2_expr.ExprAssign(x, y) for x, y in ((float_c0, ad[8:9]),
                                              (float_c1, ad[9:10]),
                                              (float_c2, ad[10:11]),
                                              (float_stack_ptr, ad[11:14]),
                                              (float_c3, ad[14:15]))
    ]

    # EIP, CS, Address, DS (slots 3..6)
    for offset, target in (
            (3, float_eip[:size]),
            (4, float_cs),
            (5, float_address[:size]),
            (6, float_ds)
    ):
        ad = ir.ExprMem(
            src.ptr + m2_expr.ExprInt(
                (size // 8) * offset,
                size=src.ptr.size
            ),
            size=target.size
        )
        e.append(m2_expr.ExprAssign(target, ad))

    return e, []
+
+
def fsub(_, instr, dst, src=None):
    """FSUB: dst <- dst - src."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fsub', dst, mem2double(instr, src))
    return [m2_expr.ExprAssign(dst, result)] + set_float_cs_eip(instr), []
+
+
def fsubp(_, instr, dst, src=None):
    """FSUBP: subtract, store into the slot above @dst, then pop."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fsub', dst, mem2double(instr, src))
    e = [m2_expr.ExprAssign(float_prev(dst), result)]
    e += set_float_cs_eip(instr)
    e += float_pop(dst)
    return e, []
+
+
def fsubr(_, instr, dst, src=None):
    """FSUBR: dst <- src - dst (reversed subtraction)."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fsub', mem2double(instr, src), dst)
    return [m2_expr.ExprAssign(dst, result)] + set_float_cs_eip(instr), []
+
+
def fsubrp(_, instr, dst, src=None):
    """FSUBRP: reversed subtract, store above @dst, then pop."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fsub', mem2double(instr, src), dst)
    e = [m2_expr.ExprAssign(float_prev(dst), result)]
    e += set_float_cs_eip(instr)
    e += float_pop(dst)
    return e, []
+
+
def fmul(_, instr, dst, src=None):
    """FMUL: dst <- dst * src."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fmul', dst, mem2double(instr, src))
    return [m2_expr.ExprAssign(dst, result)] + set_float_cs_eip(instr), []
+
+
def fimul(_, instr, dst, src=None):
    """FIMUL: dst <- dst * (integer) src."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fimul', dst, mem2double(instr, src))
    return [m2_expr.ExprAssign(dst, result)] + set_float_cs_eip(instr), []
+
+
def fdiv(_, instr, dst, src=None):
    """FDIV: dst <- dst / src."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fdiv', dst, mem2double(instr, src))
    return [m2_expr.ExprAssign(dst, result)] + set_float_cs_eip(instr), []
+
+
def fdivr(_, instr, dst, src=None):
    """FDIVR: dst <- src / dst (reversed division)."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fdiv', mem2double(instr, src), dst)
    return [m2_expr.ExprAssign(dst, result)] + set_float_cs_eip(instr), []
+
+
def fdivrp(_, instr, dst, src=None):
    """FDIVRP: reversed divide, store above @dst, then pop."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fdiv', mem2double(instr, src), dst)
    e = [m2_expr.ExprAssign(float_prev(dst), result)]
    e += set_float_cs_eip(instr)
    e += float_pop(dst)
    return e, []
+
+
def fidiv(_, instr, dst, src=None):
    """FIDIV: dst <- dst / (integer) src."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fidiv', dst, mem2double(instr, src))
    return [m2_expr.ExprAssign(dst, result)] + set_float_cs_eip(instr), []
+
+
def fidivr(_, instr, dst, src=None):
    """FIDIVR: dst <- (integer) src / dst (reversed)."""
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fidiv', mem2double(instr, src), dst)
    return [m2_expr.ExprAssign(dst, result)] + set_float_cs_eip(instr), []
+
+
def fdivp(_, instr, dst, src=None):
    """FDIVP: divide, store above @dst, then pop."""
    # Invalid emulation
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fdiv', dst, mem2double(instr, src))
    e = [m2_expr.ExprAssign(float_prev(dst), result)]
    e += set_float_cs_eip(instr)
    e += float_pop(dst)
    return e, []
+
+
def fmulp(_, instr, dst, src=None):
    """FMULP: multiply, store above @dst, then pop."""
    # Invalid emulation
    dst, src = float_implicit_st0(dst, src)
    result = m2_expr.ExprOp('fmul', dst, mem2double(instr, src))
    e = [m2_expr.ExprAssign(float_prev(dst), result)]
    e += set_float_cs_eip(instr)
    e += float_pop(dst)
    return e, []
+
+
def ftan(_, instr, src):
    """FTAN: ST0 <- tan(src)."""
    value = mem2double(instr, src)
    e = [m2_expr.ExprAssign(float_st0, m2_expr.ExprOp('ftan', value))]
    e += set_float_cs_eip(instr)
    return e, []
+
+
def fxch(_, instr, src):
    """FXCH: exchange ST0 and @src.

    Both assignments read pre-instruction values (the returned expressions
    form one parallel assignment block), so this is a genuine swap.
    """
    e = []
    src = mem2double(instr, src)
    e.append(m2_expr.ExprAssign(float_st0, src))
    e.append(m2_expr.ExprAssign(src, float_st0))
    e += set_float_cs_eip(instr)
    return e, []
+
+
def fptan(_, instr):
    """FPTAN: ST0 <- tan(ST0), then push +1.0."""
    # Shift ST(i-1) into ST(i) for i = 7..2
    e = [
        m2_expr.ExprAssign(float_list[i], float_list[i - 1])
        for i in range(7, 1, -1)
    ]
    e.append(m2_expr.ExprAssign(float_st1, m2_expr.ExprOp('ftan', float_st0)))
    e.append(
        m2_expr.ExprAssign(
            float_st0,
            m2_expr.ExprOp('sint_to_fp', m2_expr.ExprInt(1, 64))
        )
    )
    e.append(
        m2_expr.ExprAssign(float_stack_ptr,
                           float_stack_ptr + m2_expr.ExprInt(1, 3)))
    return e, []
+
+
def frndint(_, instr):
    """FRNDINT: round ST0 to an integral value."""
    rounded = m2_expr.ExprOp('frndint', float_st0)
    return [m2_expr.ExprAssign(float_st0, rounded)] + set_float_cs_eip(instr), []
+
+
def fsin(_, instr):
    """FSIN: ST0 <- sin(ST0)."""
    result = m2_expr.ExprOp('fsin', float_st0)
    return [m2_expr.ExprAssign(float_st0, result)] + set_float_cs_eip(instr), []
+
+
def fcos(_, instr):
    """FCOS: ST0 <- cos(ST0)."""
    result = m2_expr.ExprOp('fcos', float_st0)
    return [m2_expr.ExprAssign(float_st0, result)] + set_float_cs_eip(instr), []
+
+
def fsincos(_, instr):
    """FSINCOS: push; ST1 <- sin(old ST0), ST0 <- cos(old ST0)."""
    # Shift ST(i-1) into ST(i) for i = 7..2
    e = [
        m2_expr.ExprAssign(float_list[i], float_list[i - 1])
        for i in range(7, 1, -1)
    ]
    e.append(m2_expr.ExprAssign(float_st1, m2_expr.ExprOp('fsin', float_st0)))
    e.append(m2_expr.ExprAssign(float_st0, m2_expr.ExprOp('fcos', float_st0)))
    e.append(
        m2_expr.ExprAssign(float_stack_ptr,
                           float_stack_ptr + m2_expr.ExprInt(1, 3)))
    return e, []
+
+
def fscale(_, instr):
    """FSCALE: scale ST0 by a power of two derived from ST1."""
    scaled = m2_expr.ExprOp('fscale', float_st0, float_st1)
    return [m2_expr.ExprAssign(float_st0, scaled)] + set_float_cs_eip(instr), []
+
+
def f2xm1(_, instr):
    """F2XM1: ST0 <- 2^ST0 - 1."""
    result = m2_expr.ExprOp('f2xm1', float_st0)
    return [m2_expr.ExprAssign(float_st0, result)] + set_float_cs_eip(instr), []
+
+
def fchs(_, instr):
    """FCHS: negate ST0."""
    result = m2_expr.ExprOp('fchs', float_st0)
    return [m2_expr.ExprAssign(float_st0, result)] + set_float_cs_eip(instr), []
+
+
def fsqrt(_, instr):
    """FSQRT: ST0 <- sqrt(ST0)."""
    result = m2_expr.ExprOp('fsqrt', float_st0)
    return [m2_expr.ExprAssign(float_st0, result)] + set_float_cs_eip(instr), []
+
+
def fabs(_, instr):
    """FABS: ST0 <- |ST0|."""
    result = m2_expr.ExprOp('fabs', float_st0)
    return [m2_expr.ExprAssign(float_st0, result)] + set_float_cs_eip(instr), []
+
+
def fnstsw(_, instr, dst):
    """FNSTSW: assemble the FPU status word into @dst."""
    status = m2_expr.ExprCompose(
        m2_expr.ExprInt(0, 8),   # exception flags, not modeled -> 0
        float_c0,
        float_c1,
        float_c2,
        float_stack_ptr,
        float_c3,
        m2_expr.ExprInt(0, 1),   # B: FPU is not busy -> 0
    )
    return [m2_expr.ExprAssign(dst, status)], []
+
+
def fnstcw(_, instr, dst):
    """FNSTCW: store the FPU control word into @dst."""
    return [m2_expr.ExprAssign(dst, float_control)], []
+
+
def fldcw(_, instr, src):
    """FLDCW: load the FPU control word from @src."""
    return [m2_expr.ExprAssign(float_control, src)], []
+
+
def fwait(_, instr):
    """FWAIT/WAIT: exception synchronization only; modeled as a no-op."""
    return [], []
+
+
def fcmovb(ir, instr, arg1, arg2):
    # FCMOVB: copy arg2 into arg1 when CF is set (below)
    return gen_fcmov(ir, instr, cf, arg1, arg2, True)
+
+
def fcmove(ir, instr, arg1, arg2):
    # FCMOVE: copy arg2 into arg1 when ZF is set (equal)
    return gen_fcmov(ir, instr, zf, arg1, arg2, True)
+
+
def fcmovbe(ir, instr, arg1, arg2):
    # FCMOVBE: copy arg2 into arg1 when CF or ZF is set (below or equal)
    return gen_fcmov(ir, instr, cf | zf, arg1, arg2, True)
+
+
def fcmovu(ir, instr, arg1, arg2):
    # FCMOVU: copy arg2 into arg1 when PF is set (unordered)
    return gen_fcmov(ir, instr, pf, arg1, arg2, True)
+
+
def fcmovnb(ir, instr, arg1, arg2):
    # FCMOVNB: copy arg2 into arg1 when CF is clear (not below)
    return gen_fcmov(ir, instr, cf, arg1, arg2, False)
+
+
def fcmovne(ir, instr, arg1, arg2):
    # FCMOVNE: copy arg2 into arg1 when ZF is clear (not equal)
    return gen_fcmov(ir, instr, zf, arg1, arg2, False)
+
+
def fcmovnbe(ir, instr, arg1, arg2):
    # FCMOVNBE: copy arg2 into arg1 when neither CF nor ZF is set
    return gen_fcmov(ir, instr, cf | zf, arg1, arg2, False)
+
+
def fcmovnu(ir, instr, arg1, arg2):
    # FCMOVNU: copy arg2 into arg1 when PF is clear (not unordered)
    return gen_fcmov(ir, instr, pf, arg1, arg2, False)
+
+
def nop(_, instr, a=None):
    """NOP (including multi-byte forms): no semantic effect."""
    return [], []
+
+
def prefetch0(_, instr, src=None):
    """PREFETCHT0 cache hint: no architectural effect, modeled as a no-op."""
    # see 4-198 on this documentation
    # https://www-ssl.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
    return [], []
+
+
def prefetch1(_, instr, src=None):
    """PREFETCHT1 cache hint: no architectural effect, modeled as a no-op."""
    # see 4-198 on this documentation
    # https://www-ssl.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
    return [], []
+
+
def prefetch2(_, instr, src=None):
    """PREFETCHT2 cache hint: no architectural effect, modeled as a no-op."""
    # see 4-198 on this documentation
    # https://www-ssl.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
    return [], []
+
+
def prefetchw(_, instr, src=None):
    """PREFETCHW cache hint: no architectural effect, modeled as a no-op."""
    # see 4-201 on this documentation
    # https://www-ssl.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
    return [], []
+
def prefetchnta(_, instr, src=None):
    """PREFETCHNTA cache hint: no architectural effect, modeled as a no-op."""
    # see 4-201 on this documentation
    # https://www-ssl.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
    return [], []
+
+
def lfence(_, instr, src=None):
    """LFENCE: memory ordering only; no effect on the sequential IR model."""
    # see 3-485 on this documentation
    # https://www-ssl.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
    return [], []
+
+
def mfence(_, instr, src=None):
    """MFENCE: memory ordering only; no effect on the sequential IR model."""
    # see 3-516 on this documentation
    # https://www-ssl.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
    return [], []
+
+
def sfence(_, instr, src=None):
    """SFENCE: memory ordering only; no effect on the sequential IR model."""
    # see 3-356 on this documentation
    # https://www-ssl.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
    return [], []
+
+
def ud2(_, instr, src=None):
    """UD2: raise an illegal-instruction exception."""
    flag = m2_expr.ExprInt(EXCEPT_ILLEGAL_INSN, exception_flags.size)
    return [m2_expr.ExprAssign(exception_flags, flag)], []
+
+
def hlt(_, instr):
    """HLT: privileged instruction; signal it through exception_flags.

    Uses exception_flags.size instead of a hard-coded 32, consistent with
    ud2 above.
    """
    flag = m2_expr.ExprInt(EXCEPT_PRIV_INSN, exception_flags.size)
    return [m2_expr.ExprAssign(exception_flags, flag)], []
+
+
def rdtsc(_, instr):
    """RDTSC: tick the virtual timestamp counter and split it into EDX:EAX."""
    return [
        m2_expr.ExprAssign(tsc, tsc + m2_expr.ExprInt(1, 64)),
        m2_expr.ExprAssign(mRAX[32], tsc[:32]),
        m2_expr.ExprAssign(mRDX[32], tsc[32:]),
    ], []
+
+
def daa(_, instr):
    """DAA: decimal-adjust AL after a BCD addition.

    Per the Intel SDM pseudo-code: the low nibble is corrected with +6 when
    (AL & 0xF) > 9 or AF is set; the high nibble with +0x60 when the
    original AL > 0x99 or CF is set. The final CF is written unconditionally
    by the second step (1 or 0), so no intermediate carry needs tracking --
    the previous code computed an unused intermediate carry (dead locals
    cond2/cf_c1), removed here; DAS below genuinely needs it.
    """
    e = []
    r_al = mRAX[instr.mode][:8]

    # AF <- (AL & 0xF) > 9 or AF
    cond1 = m2_expr.expr_is_unsigned_greater(r_al[:4], m2_expr.ExprInt(0x9, 4)) | af
    e.append(m2_expr.ExprAssign(af, cond1))

    # CF <- 1 iff AL > 0x99 or CF; else 0
    cond3 = m2_expr.expr_is_unsigned_greater(r_al, m2_expr.ExprInt(0x99, 8)) | cf
    new_cf = m2_expr.ExprCond(cond3,
                              m2_expr.ExprInt(1, 1),
                              m2_expr.ExprInt(0, 1))
    e.append(m2_expr.ExprAssign(cf, new_cf))

    # First adjustment: AL += 6 when the low nibble overflowed
    al_c1 = m2_expr.ExprCond(cond1,
                             r_al + m2_expr.ExprInt(6, 8),
                             r_al)

    # Second adjustment: AL += 0x60 when the high nibble overflowed
    new_al = m2_expr.ExprCond(cond3,
                              al_c1 + m2_expr.ExprInt(0x60, 8),
                              al_c1)
    e.append(m2_expr.ExprAssign(r_al, new_al))
    e += update_flag_znp(new_al)
    return e, []
+
+
def das(_, instr):
    """DAS: decimal-adjust AL after a BCD subtraction (Intel SDM pseudo-code)."""
    e = []
    r_al = mRAX[instr.mode][:8]

    # AF <- (AL & 0xF) > 9 or AF
    cond1 = m2_expr.expr_is_unsigned_greater(r_al[:4], m2_expr.ExprInt(0x9, 4)) | af
    e.append(m2_expr.ExprAssign(af, cond1))

    cond2 = m2_expr.expr_is_unsigned_greater(m2_expr.ExprInt(6, 8), r_al)
    cond3 = m2_expr.expr_is_unsigned_greater(r_al, m2_expr.ExprInt(0x99, 8)) | cf

    # Borrow produced by the first (-6) adjustment: CF or (AL < 6)
    cf_c1 = m2_expr.ExprCond(cond1,
                             cf | (cond2),
                             m2_expr.ExprInt(0, 1))
    # The second step forces CF to 1; otherwise keep the first-step borrow
    new_cf = m2_expr.ExprCond(cond3,
                              m2_expr.ExprInt(1, 1),
                              cf_c1)
    e.append(m2_expr.ExprAssign(cf, new_cf))

    # First adjustment: AL -= 6 when the low nibble underflowed
    al_c1 = m2_expr.ExprCond(cond1,
                             r_al - m2_expr.ExprInt(6, 8),
                             r_al)

    # Second adjustment: AL -= 0x60 when the high nibble underflowed
    new_al = m2_expr.ExprCond(cond3,
                              al_c1 - m2_expr.ExprInt(0x60, 8),
                              al_c1)
    e.append(m2_expr.ExprAssign(r_al, new_al))
    e += update_flag_znp(new_al)
    return e, []
+
+
def aam(ir, instr, src):
    """AAM imm8: AH = AL / imm8, AL = AL % imm8 (arithmetic flags updated
    from the new register value, AF cleared).  A zero immediate raises the
    divide-by-zero exception instead."""
    assert src.is_int()
    if int(src) == 0:
        # Division by zero: flag the #DE exception for the runtime.
        flag = m2_expr.ExprInt(EXCEPT_DIV_BY_ZERO, exception_flags.size)
        return [m2_expr.ExprAssign(exception_flags, flag)], []

    al = mRAX[instr.mode][0:8]
    new_eax = m2_expr.ExprCompose(
        m2_expr.ExprOp("umod", al, src),
        m2_expr.ExprOp("udiv", al, src),
        mRAX[instr.mode][16:]
    )
    e = [m2_expr.ExprAssign(mRAX[instr.mode], new_eax)]
    e += update_flag_arith(new_eax)
    e.append(m2_expr.ExprAssign(af, m2_expr.ExprInt(0, 1)))
    return e, []
+
+
def aad(_, instr, src):
    """AAD imm8: AL = (AL + AH * imm8) & 0xFF, AH = 0.  Arithmetic flags
    are updated from the new register value and AF is cleared."""
    al = mRAX[instr.mode][0:8]
    ah = mRAX[instr.mode][8:16]
    low_byte = (al + (ah * src)) & m2_expr.ExprInt(0xFF, 8)
    new_eax = m2_expr.ExprCompose(
        low_byte,
        m2_expr.ExprInt(0, 8),
        mRAX[instr.mode][16:]
    )
    e = [m2_expr.ExprAssign(mRAX[instr.mode], new_eax)]
    e += update_flag_arith(new_eax)
    e.append(m2_expr.ExprAssign(af, m2_expr.ExprInt(0, 1)))
    return e, []
+
+
def _tpl_aaa(_, instr, op):
    """Templating for aaa, aas with operation @op
    @op: operation to apply ("+" for AAA, "-" for AAS)

    When (AL & 0xF) > 9 or AF is set, AX is adjusted by 0x106 (added or
    subtracted) and masked with 0xFF0F (keeping only the low BCD digit of
    AL), and AF/CF are set; otherwise AX is untouched and AF/CF cleared.

    Note: the previous version also bound r_ah (AH slice), which was never
    used; the dead local has been removed.
    """
    e = []
    r_al = mRAX[instr.mode][:8]
    r_ax = mRAX[instr.mode][:16]
    i0 = m2_expr.ExprInt(0, 1)
    i1 = m2_expr.ExprInt(1, 1)
    # cond: if (al & 0xf) > 9 OR af == 1
    # "(al & 0xf) - 9" is > 0 iff it is non-zero and did not borrow (msb clear).
    cond = (r_al & m2_expr.ExprInt(0xf, 8)) - m2_expr.ExprInt(9, 8)
    cond = ~cond.msb() & m2_expr.ExprCond(cond, i1, i0)
    cond |= af & i1

    to_add = m2_expr.ExprInt(0x106, size=r_ax.size)
    if op == "-":
        # Avoid ExprOp("-", A, B), should be ExprOp("+", A, ExprOp("-", B))
        first_part = r_ax - to_add
    else:
        first_part = m2_expr.ExprOp(op, r_ax, to_add)
    # Keep AH and the low BCD digit of AL only.
    new_ax = first_part & m2_expr.ExprInt(0xff0f,
                                          size=r_ax.size)
    # set AL
    e.append(m2_expr.ExprAssign(r_ax, m2_expr.ExprCond(cond, new_ax, r_ax)))
    e.append(m2_expr.ExprAssign(af, cond))
    e.append(m2_expr.ExprAssign(cf, cond))
    return e, []


def aaa(ir, instr):
    """AAA: ASCII-adjust AX after addition."""
    return _tpl_aaa(ir, instr, "+")


def aas(ir, instr):
    """AAS: ASCII-adjust AX after subtraction."""
    return _tpl_aaa(ir, instr, "-")
+
+
def bsr_bsf(ir, instr, dst, src, op_func):
    """
    IF SRC == 0
        ZF = 1
        DEST is left unchanged
    ELSE
        ZF = 0
        DEST = @op_func(SRC)

    Implemented with a runtime branch: the current IR block dispatches on
    @src, and the two generated sub-blocks both fall through to the
    instruction's successor.
    """
    # Two helper blocks plus the fall-through destination.
    loc_src_null, loc_src_null_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_src_not_null, loc_src_not_null_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
    loc_next = ir.get_next_loc_key(instr)
    loc_next_expr = m2_expr.ExprLoc(loc_next, ir.IRDst.size)

    aff_dst = m2_expr.ExprAssign(ir.IRDst, loc_next_expr)
    # Dispatch on src: non-zero -> compute branch, zero -> ZF-only branch.
    e = [m2_expr.ExprAssign(ir.IRDst, m2_expr.ExprCond(src,
                                                    loc_src_not_null_expr,
                                                    loc_src_null_expr))]
    e_src_null = []
    e_src_null.append(m2_expr.ExprAssign(zf, m2_expr.ExprInt(1, zf.size)))
    # XXX destination is undefined
    e_src_null.append(aff_dst)

    e_src_not_null = []
    e_src_not_null.append(m2_expr.ExprAssign(zf, m2_expr.ExprInt(0, zf.size)))
    e_src_not_null.append(m2_expr.ExprAssign(dst, op_func(src)))
    e_src_not_null.append(aff_dst)

    return e, [IRBlock(ir.loc_db, loc_src_null, [AssignBlock(e_src_null, instr)]),
               IRBlock(ir.loc_db, loc_src_not_null, [AssignBlock(e_src_not_null, instr)])]
+
+
def bsf(ir, instr, dst, src):
    # BSF: index of the least significant set bit (count of trailing zeros).
    return bsr_bsf(ir, instr, dst, src,
                   lambda src: m2_expr.ExprOp("cnttrailzeros", src))


def bsr(ir, instr, dst, src):
    # BSR: index of the most significant set bit,
    # i.e. (size - 1) - count_of_leading_zeros.
    return bsr_bsf(
        ir, instr, dst, src,
        lambda src: m2_expr.ExprInt(src.size - 1, src.size) - m2_expr.ExprOp("cntleadzeros", src)
    )
+
+
def arpl(_, instr, dst, src):
    # ARPL is not modeled: raise an exception flag at runtime.
    # NOTE(review): 1 << 7 is a literal here instead of a named EXCEPT_*
    # constant -- confirm which exception bit this corresponds to.
    e = []
    e.append(m2_expr.ExprAssign(exception_flags, m2_expr.ExprInt(1 << 7, 32)))
    return e, []


def ins(_, instr, size):
    # INS (I/O string input) is not modeled: same exception flag as arpl.
    # NOTE(review): literal 1 << 7, see remark on arpl.
    e = []
    e.append(m2_expr.ExprAssign(exception_flags, m2_expr.ExprInt(1 << 7, 32)))
    return e, []
+
+
def sidt(ir, instr, dst):
    """SIDT: store a *fixed dummy* IDT base/limit to the 32-bit memory
    operand @dst (no real descriptor table is modeled; a warning is
    emitted)."""
    e = []
    if not isinstance(dst, m2_expr.ExprMem) or dst.size != 32:
        raise ValueError('not exprmem 32bit instance!!')
    ptr = dst.ptr
    LOG_X86_SEM.warning("DEFAULT SIDT ADDRESS %s!!", dst)
    # Dummy 32-bit base followed by a dummy 16-bit limit.
    e.append(m2_expr.ExprAssign(ir.ExprMem(ptr, 32),
                             m2_expr.ExprInt(0xe40007ff, 32)))
    e.append(
        m2_expr.ExprAssign(ir.ExprMem(ptr + m2_expr.ExprInt(4, ptr.size), 16),
                        m2_expr.ExprInt(0x8245, 16)))
    return e, []


def sldt(_, instr, dst):
    # SLDT: the LDT selector is not modeled; store 0 and warn.
    LOG_X86_SEM.warning("DEFAULT SLDT ADDRESS %s!!", dst)
    e = [m2_expr.ExprAssign(dst, m2_expr.ExprInt(0, dst.size))]
    return e, []
+
+
# Conditional moves (CMOVcc).  Most conditions are wrapped in CC_* pseudo
# operators so later passes can recognize high-level condition codes; the
# final boolean argument of gen_cmov selects whether the move happens when
# the condition evaluates true or false.

def cmovz(ir, instr, dst, src):
    # Move if equal (ZF set).
    return gen_cmov(ir, instr, m2_expr.ExprOp("CC_EQ", zf), dst, src, True)


def cmovnz(ir, instr, dst, src):
    # Move if not equal (ZF clear).
    return gen_cmov(ir, instr, m2_expr.ExprOp("CC_EQ", zf), dst, src, False)


def cmovpe(ir, instr, dst, src):
    # Move if parity even (PF set); raw flag, no CC_* wrapper.
    return gen_cmov(ir, instr, pf, dst, src, True)


def cmovnp(ir, instr, dst, src):
    # Move if parity odd (PF clear).
    return gen_cmov(ir, instr, pf, dst, src, False)


def cmovge(ir, instr, dst, src):
    # Move if signed greater-or-equal.
    return gen_cmov(ir, instr, m2_expr.ExprOp("CC_S>=", nf, of), dst, src, True)


def cmovg(ir, instr, dst, src):
    # Move if signed greater.
    return gen_cmov(ir, instr, m2_expr.ExprOp("CC_S>", nf, of, zf), dst, src, True)


def cmovl(ir, instr, dst, src):
    # Move if signed lower.
    return gen_cmov(ir, instr, m2_expr.ExprOp("CC_S<", nf, of), dst, src, True)


def cmovle(ir, instr, dst, src):
    # Move if signed lower-or-equal.
    return gen_cmov(ir, instr, m2_expr.ExprOp("CC_S<=", nf, of, zf), dst, src, True)


def cmova(ir, instr, dst, src):
    # Move if unsigned above.
    return gen_cmov(ir, instr, m2_expr.ExprOp("CC_U>", cf, zf), dst, src, True)


def cmovae(ir, instr, dst, src):
    # Move if unsigned above-or-equal (CF clear).
    return gen_cmov(ir, instr, m2_expr.ExprOp("CC_U>=", cf), dst, src, True)


def cmovbe(ir, instr, dst, src):
    # Move if unsigned below-or-equal.
    return gen_cmov(ir, instr, m2_expr.ExprOp("CC_U<=", cf, zf), dst, src, True)


def cmovb(ir, instr, dst, src):
    # Move if unsigned below (CF set).
    return gen_cmov(ir, instr, m2_expr.ExprOp("CC_U<", cf), dst, src, True)


def cmovo(ir, instr, dst, src):
    # Move if overflow (OF set); raw flag, no CC_* wrapper.
    return gen_cmov(ir, instr, of, dst, src, True)


def cmovno(ir, instr, dst, src):
    # Move if no overflow (OF clear).
    return gen_cmov(ir, instr, of, dst, src, False)


def cmovs(ir, instr, dst, src):
    # Move if sign (NF set).
    return gen_cmov(ir, instr, m2_expr.ExprOp("CC_NEG", nf), dst, src, True)


def cmovns(ir, instr, dst, src):
    # Move if no sign (NF clear).
    return gen_cmov(ir, instr, m2_expr.ExprOp("CC_NEG", nf), dst, src, False)
+
+
def icebp(_, instr):
    # ICEBP (0xF1): modeled as a software breakpoint exception.
    e = []
    e.append(m2_expr.ExprAssign(exception_flags,
                             m2_expr.ExprInt(EXCEPT_SOFT_BP, 32)))
    return e, []


def l_int(_, instr, src):
    """INT imm8: map the interrupt number to the matching exception flag
    (INT 1 and INT 3 have dedicated flags) and record the number in
    interrupt_num for the runtime handler."""
    e = []
    assert src.is_int()
    value = int(src)
    if value == 1:
        except_int = EXCEPT_INT_1
    elif value == 3:
        except_int = EXCEPT_SOFT_BP
    else:
        except_int = EXCEPT_INT_XX
    e.append(m2_expr.ExprAssign(exception_flags,
                             m2_expr.ExprInt(except_int, 32)))
    e.append(m2_expr.ExprAssign(interrupt_num, src))
    return e, []


def l_sysenter(_, instr):
    # SYSENTER is not emulated: flag a privileged-instruction exception.
    e = []
    e.append(m2_expr.ExprAssign(exception_flags,
                             m2_expr.ExprInt(EXCEPT_PRIV_INSN, 32)))
    return e, []


def l_syscall(_, instr):
    # SYSCALL: flag the dedicated syscall exception for the runtime.
    e = []
    e.append(m2_expr.ExprAssign(exception_flags,
                             m2_expr.ExprInt(EXCEPT_SYSCALL, 32)))
    return e, []


def l_out(_, instr, src1, src2):
    # OUT: I/O port access is privileged; not emulated.
    e = []
    e.append(m2_expr.ExprAssign(exception_flags,
                             m2_expr.ExprInt(EXCEPT_PRIV_INSN, 32)))
    return e, []


def l_outs(_, instr, size):
    # OUTS: I/O string output is privileged; not emulated.
    e = []
    e.append(m2_expr.ExprAssign(exception_flags,
                             m2_expr.ExprInt(EXCEPT_PRIV_INSN, 32)))
    return e, []
+
+# XXX actually, xlat performs al = (ds:[e]bx + ZeroExtend(al))
+
+
def xlat(ir, instr):
    """XLAT: AL = mem8[rBX + zero-extended AL].  The DS segment override
    is not modeled."""
    table = mRBX[instr.mode]
    entry_index = mRAX[instr.mode][0:8].zeroExtend(table.size)
    value = ir.ExprMem(table + entry_index, 8)
    return [m2_expr.ExprAssign(mRAX[instr.mode][0:8], value)], []
+
+
def cpuid(_, instr):
    """CPUID: model each output register as the opaque operation
    x86_cpuid(EAX, n), with n = 0..3 for EAX/EBX/ECX/EDX respectively.
    The leaf argument is always the pre-instruction EAX."""
    leaf = mRAX[instr.mode]
    e = []
    for index, reg in enumerate((mRAX, mRBX, mRCX, mRDX)):
        value = m2_expr.ExprOp(
            'x86_cpuid', leaf, m2_expr.ExprInt(index, instr.mode))
        e.append(m2_expr.ExprAssign(reg[instr.mode], value))
    return e, []
+
+
def bittest_get(ir, instr, src, index):
    """Resolve the (location, bit offset) pair used by BT/BTS/BTR/BTC.

    For a register operand the bit index is simply taken modulo the
    operand size.  For a memory operand the bit-base addressing of the x86
    BT family applies: the effective address is displaced by the byte
    offset of the word containing the bit (aligned to the operand size),
    and the bit offset is reduced modulo the operand size.  Segmented
    addresses are unwrapped and re-wrapped around the displacement.
    """
    index = index.zeroExtend(src.size)
    if isinstance(src, m2_expr.ExprMem):
        # log2(operand size) and the matching byte-alignment mask bits.
        b_mask = {16: 4, 32: 5, 64: 6}
        b_decal = {16: 1, 32: 3, 64: 7}
        ptr = src.ptr
        segm = is_mem_segm(src)
        if segm:
            # Keep only the offset part; the segment is reapplied below.
            ptr = ptr.args[1]

        # Bit position inside the addressed word: index mod operand size.
        off_bit = index.zeroExtend(
            src.size) & m2_expr.ExprInt((1 << b_mask[src.size]) - 1,
                                        src.size)
        # Byte displacement of the containing word, aligned down to the
        # operand size (low address bits masked off via b_decal).
        off_byte = ((index.zeroExtend(ptr.size) >> m2_expr.ExprInt(3, ptr.size)) &
                    m2_expr.ExprInt(((1 << src.size) - 1) ^ b_decal[src.size], ptr.size))

        addr = ptr + off_byte
        if segm:
            addr = ir.gen_segm_expr(src.ptr.args[0], addr)

        d = ir.ExprMem(addr, src.size)
    else:
        # Register operand: bit index wraps modulo the register size.
        off_bit = m2_expr.ExprOp(
            '&', index, m2_expr.ExprInt(src.size - 1, src.size))
        d = src
    return d, off_bit
+
+
def bt(ir, instr, src, index):
    """BT: copy the bit of @src selected by @index into CF.  The operand
    itself is left untouched."""
    index = index.zeroExtend(src.size)
    target, off_bit = bittest_get(ir, instr, src, index)
    selected = (target >> off_bit)[:1]
    return [m2_expr.ExprAssign(cf, selected)], []
+
+
def btc(ir, instr, src, index):
    """BTC: CF = selected bit of @src, then complement that bit in place."""
    target, off_bit = bittest_get(ir, instr, src, index)
    bit_mask = m2_expr.ExprInt(1, src.size) << off_bit
    return [m2_expr.ExprAssign(cf, (target >> off_bit)[:1]),
            m2_expr.ExprAssign(target, target ^ bit_mask)], []


def bts(ir, instr, src, index):
    """BTS: CF = selected bit of @src, then set that bit in place."""
    target, off_bit = bittest_get(ir, instr, src, index)
    bit_mask = m2_expr.ExprInt(1, src.size) << off_bit
    return [m2_expr.ExprAssign(cf, (target >> off_bit)[:1]),
            m2_expr.ExprAssign(target, target | bit_mask)], []


def btr(ir, instr, src, index):
    """BTR: CF = selected bit of @src, then clear that bit in place."""
    target, off_bit = bittest_get(ir, instr, src, index)
    clear_mask = ~(m2_expr.ExprInt(1, src.size) << off_bit)
    return [m2_expr.ExprAssign(cf, (target >> off_bit)[:1]),
            m2_expr.ExprAssign(target, target & clear_mask)], []
+
+
def into(_, instr):
    # INTO: the overflow trap is not modeled; emit no IR effect.
    return [], []


def l_in(_, instr, src1, src2):
    # IN: I/O port access is privileged; flag the exception instead.
    e = []
    e.append(m2_expr.ExprAssign(exception_flags,
                             m2_expr.ExprInt(EXCEPT_PRIV_INSN, 32)))
    return e, []
+
+
# The three cmpxchg variants below are written in the sbuild DSL: the
# function body is parsed and compiled to IR, with the python 'if' turned
# into an ExprCond.  Only '#' comments are used here on purpose -- a
# docstring would become part of the parsed AST.

@sbuild.parse
def cmpxchg(arg1, arg2):
    # CMPXCHG: compare the accumulator (AL/AX/EAX/RAX slice) with arg1;
    # on equality ZF=1 and arg1 receives arg2, otherwise ZF=0 and the
    # accumulator receives arg1.
    accumulator = mRAX[instr.v_opmode()][:arg1.size]
    if (accumulator - arg1):
        zf = i1(0)
        accumulator = arg1
    else:
        zf = i1(1)
        arg1 = arg2


@sbuild.parse
def cmpxchg8b(arg1):
    # CMPXCHG8B: compare EDX:EAX with the 64-bit operand; on equality
    # ZF=1 and the operand receives ECX:EBX, otherwise ZF=0 and EDX:EAX
    # is loaded from the operand.
    accumulator = {mRAX[32], mRDX[32]}
    if accumulator - arg1:
        zf = i1(0)
        mRAX[32] = arg1[:32]
        mRDX[32] = arg1[32:]
    else:
        zf = i1(1)
        arg1 = {mRBX[32], mRCX[32]}


@sbuild.parse
def cmpxchg16b(arg1):
    # CMPXCHG16B: same protocol as cmpxchg8b on RDX:RAX / RCX:RBX with a
    # 128-bit operand.
    accumulator = {mRAX[64], mRDX[64]}
    if accumulator - arg1:
        zf = i1(0)
        mRAX[64] = arg1[:64]
        mRDX[64] = arg1[64:]
    else:
        zf = i1(1)
        arg1 = {mRBX[64], mRCX[64]}
+
+
def _load_far_ptr(ir, dst, src, seg):
    """Shared body of LDS/LES/LSS/LFS/LGS: load the offset part of a far
    pointer from @src into @dst, then the 16-bit segment selector stored
    immediately after it into @seg."""
    e = [m2_expr.ExprAssign(dst, ir.ExprMem(src.ptr, size=dst.size))]
    selector = ir.ExprMem(
        src.ptr + m2_expr.ExprInt(dst.size // 8, src.ptr.size),
        size=16
    )
    e.append(m2_expr.ExprAssign(seg, selector))
    return e, []


def lds(ir, instr, dst, src):
    """LDS: load far pointer into DS:@dst."""
    return _load_far_ptr(ir, dst, src, DS)


def les(ir, instr, dst, src):
    """LES: load far pointer into ES:@dst."""
    return _load_far_ptr(ir, dst, src, ES)


def lss(ir, instr, dst, src):
    """LSS: load far pointer into SS:@dst."""
    return _load_far_ptr(ir, dst, src, SS)


def lfs(ir, instr, dst, src):
    """LFS: load far pointer into FS:@dst."""
    return _load_far_ptr(ir, dst, src, FS)


def lgs(ir, instr, dst, src):
    """LGS: load far pointer into GS:@dst."""
    return _load_far_ptr(ir, dst, src, GS)
+
+
def lahf(_, instr):
    """LAHF: load AH from the flags, using the EFLAGS low-byte layout:
    bit0=CF, bit1=1, bit2=PF, bit3=0, bit4=AF, bit5=0, bit6=ZF, bit7=SF."""
    e = []
    args = [cf, m2_expr.ExprInt(1, 1), pf, m2_expr.ExprInt(0, 1), af,
            m2_expr.ExprInt(0, 1), zf, nf]
    e.append(
        m2_expr.ExprAssign(mRAX[instr.mode][8:16], m2_expr.ExprCompose(*args)))
    return e, []


def sahf(_, instr):
    """SAHF: store AH into the flags (inverse bit layout of lahf); the
    reserved bits 1/3/5 are ignored."""
    tmp = mRAX[instr.mode][8:16]
    e = []
    e.append(m2_expr.ExprAssign(cf, tmp[0:1]))
    e.append(m2_expr.ExprAssign(pf, tmp[2:3]))
    e.append(m2_expr.ExprAssign(af, tmp[4:5]))
    e.append(m2_expr.ExprAssign(zf, tmp[6:7]))
    e.append(m2_expr.ExprAssign(nf, tmp[7:8]))
    return e, []
+
+
def lar(_, instr, dst, src):
    # LAR: modeled with opaque access_segment/access_segment_ok operations;
    # ZF reports whether the access rights could be loaded.
    e = []
    e.append(m2_expr.ExprAssign(dst, m2_expr.ExprOp('access_segment', src)))
    e.append(m2_expr.ExprAssign(zf, m2_expr.ExprOp('access_segment_ok', src)))
    return e, []


def lsl(_, instr, dst, src):
    # LSL: modeled with opaque load_segment_limit(_ok) operations;
    # ZF reports whether the limit could be loaded.
    e = []
    e.append(m2_expr.ExprAssign(dst, m2_expr.ExprOp('load_segment_limit', src)))
    e.append(m2_expr.ExprAssign(zf, m2_expr.ExprOp('load_segment_limit_ok', src)))
    return e, []
+
+
def fclex(_, instr):
    # FCLEX: clearing the FPU exception state is not modeled.
    # XXX TODO
    return [], []


def fnclex(_, instr):
    # FNCLEX: same as fclex, no-wait form.
    # XXX TODO
    return [], []


def l_str(_, instr, dst):
    # STR: task-register selector modeled as an opaque operation of a
    # constant (no real TR state is kept).
    e = []
    e.append(m2_expr.ExprAssign(dst, m2_expr.ExprOp('load_tr_segment_selector',
                                                 m2_expr.ExprInt(0, 32))))
    return e, []
+
+
def movd(_, instr, dst, src):
    """MOVD: 32-bit move between a GP register/memory and an MMX or XMM
    register.  Writing to an MMX/XMM register zero-extends the 32-bit
    value to the full register; writing elsewhere keeps the low 32 bits
    of the source."""
    if dst in regs_mm_expr:
        value = m2_expr.ExprCompose(src, m2_expr.ExprInt(0, 32))
    elif dst in regs_xmm_expr:
        value = m2_expr.ExprCompose(src, m2_expr.ExprInt(0, 96))
    else:
        value = src[:32]
    return [m2_expr.ExprAssign(dst, value)], []
+
+
def movdqu(_, instr, dst, src):
    # MOVDQU: unaligned 128-bit move, plain copy.
    # XXX TODO alignment check
    return [m2_expr.ExprAssign(dst, src)], []


def movapd(_, instr, dst, src):
    # MOVAPD: aligned 128-bit move; the required alignment fault is not
    # modeled, so this is also a plain copy.
    # XXX TODO alignment check
    return [m2_expr.ExprAssign(dst, src)], []
+
+
def andps(_, instr, dst, src):
    """Packed bitwise AND: dst &= src.  No flags are modified."""
    return [m2_expr.ExprAssign(dst, dst & src)], []


def andnps(_, instr, dst, src):
    """Packed AND-NOT: dst = (~dst) & src, with NOT written as XOR against
    the full-width mask.  No flags are modified."""
    inverted = dst ^ dst.mask
    return [m2_expr.ExprAssign(dst, inverted & src)], []


def orps(_, instr, dst, src):
    """Packed bitwise OR: dst |= src.  No flags are modified."""
    return [m2_expr.ExprAssign(dst, dst | src)], []


def xorps(_, instr, dst, src):
    """Packed bitwise XOR: dst ^= src.  No flags are modified."""
    return [m2_expr.ExprAssign(dst, dst ^ src)], []
+
+
def rdmsr(ir, instr):
    """RDMSR is privileged and not emulated: flag EXCEPT_PRIV_INSN."""
    flag = m2_expr.ExprInt(EXCEPT_PRIV_INSN, 32)
    return [m2_expr.ExprAssign(exception_flags, flag)], []


def wrmsr(ir, instr):
    """WRMSR is privileged and not emulated: flag EXCEPT_PRIV_INSN."""
    flag = m2_expr.ExprInt(EXCEPT_PRIV_INSN, 32)
    return [m2_expr.ExprAssign(exception_flags, flag)], []
+
+# MMX/SSE/AVX operations
+#
+
def vec_op_clip(op, size, callback=None):
    """
    Build a semantic function applying @op to the low @size bits of the
    operands only (scalar SSE forms: the upper bits of dst are untouched).
    @op: operator name; '-' is special-cased to Expr subtraction
    @size: width in bits of the affected low part
    @callback: optional post-processing applied to the raw result
    """
    def vec_op_clip_instr(ir, instr, dst, src):
        lhs = dst[:size]
        rhs = src[:size]
        if op == '-':
            result = lhs - rhs
        else:
            result = m2_expr.ExprOp(op, lhs, rhs)
        if callback is not None:
            result = callback(result)
        return [m2_expr.ExprAssign(dst[:size], result)], []
    return vec_op_clip_instr
+
+# Generic vertical operation
+
+
def vec_vertical_sem(op, elt_size, reg_size, dst, src, apply_on_output):
    """Build the lane-by-lane ("vertical") combination of @dst and @src:
    apply @op independently on each @elt_size-bit lane, feed every lane
    result through @apply_on_output, and recompose the @reg_size-bit
    value.  '-' is special-cased to Expr subtraction."""
    assert reg_size % elt_size == 0
    bounds = [(i * elt_size, (i + 1) * elt_size)
              for i in range(reg_size // elt_size)]
    if op == '-':
        lanes = [apply_on_output((dst[lo:hi] - src[lo:hi]))
                 for lo, hi in bounds]
    else:
        lanes = [apply_on_output(m2_expr.ExprOp(op, dst[lo:hi], src[lo:hi]))
                 for lo, hi in bounds]

    return m2_expr.ExprCompose(*lanes)
+
+
def __vec_vertical_instr_gen(op, elt_size, sem, apply_on_output):
    # Wrap a lane-wise semantic builder @sem into an instruction-level
    # semantic function.  A memory source is re-read at the destination
    # width so the lane slicing inside @sem stays in bounds.
    def vec_instr(ir, instr, dst, src):
        e = []
        if isinstance(src, m2_expr.ExprMem):
            src = ir.ExprMem(src.ptr, dst.size)
        reg_size = dst.size
        e.append(m2_expr.ExprAssign(dst, sem(op, elt_size, reg_size, dst, src,
                                          apply_on_output)))
        return e, []
    return vec_instr


def vec_vertical_instr(op, elt_size, apply_on_output=lambda x: x):
    # Convenience wrapper: vertical (lane-by-lane) instruction semantics
    # with an optional per-lane post-processing callback.
    return __vec_vertical_instr_gen(op, elt_size, vec_vertical_sem,
                                    apply_on_output)
+
+
def _keep_mul_high(expr, signed=False):
    """Turn a 2-operand '*' @expr into the HIGH half of the full-width
    product: both factors are widened to twice their size ((un)signed
    extension per @signed), multiplied, and the top half kept."""
    assert expr.is_op("*") and len(expr.args) == 2

    double_size = expr.size * 2
    if signed:
        factors = [arg.signExtend(double_size) for arg in expr.args]
    else:
        factors = [arg.zeroExtend(double_size) for arg in expr.args]
    return m2_expr.ExprOp("*", *factors)[expr.size:]
+
# Op, signed => associated comparison
_min_max_func = {
    ("min", False): m2_expr.expr_is_unsigned_lower,
    ("min", True): m2_expr.expr_is_signed_lower,
    ("max", False): m2_expr.expr_is_unsigned_greater,
    ("max", True): m2_expr.expr_is_signed_greater,
}


def _min_max(expr, signed):
    """Lower a 2-operand 'min'/'max' pseudo-op into an ExprCond selecting
    between its arguments with the matching (un)signed comparison."""
    assert (expr.is_op("min") or expr.is_op("max")) and len(expr.args) == 2
    first, second = expr.args
    prefer_second = _min_max_func[(expr.op, signed)](second, first)
    return m2_expr.ExprCond(prefer_second, second, first)
+
def _float_min_max(expr):
    """Lower a 2-operand 'fmin'/'fmax' pseudo-op with x86 MINSS/MAXSS NaN
    behavior: any NaN operand (and the equal-operands case) yields SRC2."""
    assert (expr.is_op("fmin") or expr.is_op("fmax")) and len(expr.args) == 2
    src1 = expr.args[0]
    src2 = expr.args[1]
    if expr.is_op("fmin"):
        comp = m2_expr.expr_is_float_lower(src1, src2)
    elif expr.is_op("fmax"):
        comp = m2_expr.expr_is_float_lower(src2, src1)

    # x86 documentation (for MIN):
    # IF ((SRC1 = 0.0) and (SRC2 = 0.0)) THEN DEST <-SRC2;
    # ELSE IF (SRC1 = SNaN) THEN DEST <-SRC2; FI;
    # ELSE IF (SRC2 = SNaN) THEN DEST <-SRC2; FI;
    # ELSE IF (SRC1 < SRC2) THEN DEST <-SRC1;
    # ELSE DEST<-SRC2;
    #
    # But this includes the NaN output of "SRC1 < SRC2"
    # Associated text is more detailed, and this is the version impl here
    return m2_expr.ExprCond(
        m2_expr.expr_is_sNaN(src2), src2,
        m2_expr.ExprCond(
            m2_expr.expr_is_NaN(src2) | m2_expr.expr_is_NaN(src1), src2,
            m2_expr.ExprCond(comp, src1, src2)
        )
    )
+
+
# Integer arithmetic
#

# Additions
#

# SSE lane-wise additions (8/16/32/64-bit lanes).
paddb = vec_vertical_instr('+', 8)
paddw = vec_vertical_instr('+', 16)
paddd = vec_vertical_instr('+', 32)
paddq = vec_vertical_instr('+', 64)

# Subtractions
#

# SSE lane-wise subtractions.
psubb = vec_vertical_instr('-', 8)
psubw = vec_vertical_instr('-', 16)
psubd = vec_vertical_instr('-', 32)
psubq = vec_vertical_instr('-', 64)

# Multiplications
#

# SSE lane-wise multiplications.  The "mull*" entries keep the low half of
# each product; the "mulh*" entries keep the high half of the widened
# product (unsigned or signed, via _keep_mul_high).
pmullb = vec_vertical_instr('*', 8)
pmullw = vec_vertical_instr('*', 16)
pmulld = vec_vertical_instr('*', 32)
pmullq = vec_vertical_instr('*', 64)
pmulhub = vec_vertical_instr('*', 8, _keep_mul_high)
pmulhuw = vec_vertical_instr('*', 16, _keep_mul_high)
pmulhud = vec_vertical_instr('*', 32, _keep_mul_high)
pmulhuq = vec_vertical_instr('*', 64, _keep_mul_high)
pmulhb = vec_vertical_instr('*', 8, lambda x: _keep_mul_high(x, signed=True))
pmulhw = vec_vertical_instr('*', 16, lambda x: _keep_mul_high(x, signed=True))
pmulhd = vec_vertical_instr('*', 32, lambda x: _keep_mul_high(x, signed=True))
pmulhq = vec_vertical_instr('*', 64, lambda x: _keep_mul_high(x, signed=True))
+
def pmuludq(ir, instr, dst, src):
    """PMULUDQ: unsigned multiply of the low 32 bits of each 64-bit lane,
    producing full 64-bit products (one lane for MMX, two for XMM)."""
    def lane_product(lo):
        # src_lane * dst_lane, both zero-extended to 64 bits.
        return src[lo:lo + 32].zeroExtend(64) * dst[lo:lo + 32].zeroExtend(64)

    if dst.size == 64:
        return [m2_expr.ExprAssign(dst, lane_product(0))], []
    if dst.size == 128:
        return [m2_expr.ExprAssign(dst[:64], lane_product(0)),
                m2_expr.ExprAssign(dst[64:], lane_product(64))], []
    raise RuntimeError("Unsupported size %d" % dst.size)
+
+# Mix
+#
+
+# SSE
def pmaddwd(ir, instr, dst, src):
    """PMADDWD: for each 32-bit lane, multiply the two pairs of signed
    16-bit words of @src and @dst and add the two 32-bit products."""
    lanes = []
    for lane_start in range(0, dst.size, 32):
        lo1, hi1 = lane_start, lane_start + 16
        lo2, hi2 = lane_start + 16, lane_start + 32
        product1 = src[lo1:hi1].signExtend(32) * dst[lo1:hi1].signExtend(32)
        product2 = src[lo2:hi2].signExtend(32) * dst[lo2:hi2].signExtend(32)
        lanes.append(product1 + product2)
    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(*lanes))], []
+
+
def _absolute(expr):
    """Return abs(@expr) in two's complement: negate (XOR with the mask
    plus one) when the sign bit is set, keep the value otherwise."""
    negated = (expr ^ expr.mask) + m2_expr.ExprInt(1, expr.size)
    return m2_expr.ExprCond(expr.msb(), negated, expr)
+
+
def psadbw(ir, instr, dst, src):
    """PSADBW: sum of absolute differences of packed unsigned bytes.

    For every 64-bit lane, the eight per-byte absolute differences are
    accumulated into the low 16 bits of the lane; the remaining 48 bits
    of each lane are zeroed."""
    sizedst = 16
    sizesrc = 8
    out_dst = []
    for start in range(0, dst.size, 64):
        out = []
        for src_start in range(0, 64, sizesrc):
            beg = start + src_start
            end = beg + sizesrc
            # Not clear in the doc equations, but in the text, src and dst are:
            # "8 unsigned byte integers"
            out.append(_absolute(dst[beg: end].zeroExtend(sizedst) - src[beg: end].zeroExtend(sizedst)))
        out_dst.append(m2_expr.ExprOp("+", *out))
        # Zero the upper 48 bits of this 64-bit lane.
        out_dst.append(m2_expr.ExprInt(0, 64 - sizedst))

    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(*out_dst))], []
+
def _average(expr):
    """Unsigned rounding average of a 2-operand 'avg' pseudo-op:
    (a + b + 1) >> 1, computed on twice the width so the carry survives,
    then truncated back to the original size."""
    assert expr.is_op("avg") and len(expr.args) == 2

    wide = expr.size * 2
    lhs = expr.args[0].zeroExtend(wide)
    rhs = expr.args[1].zeroExtend(wide)
    one = m2_expr.ExprInt(1, wide)
    return ((lhs + rhs + one) >> one)[:expr.size]

# Rounded averages of packed unsigned bytes / words.
pavgb = vec_vertical_instr('avg', 8, _average)
pavgw = vec_vertical_instr('avg', 16, _average)
+
# Comparisons
#

# SSE lane-wise integer min/max; the _min_max callback picks the signed or
# unsigned comparison per entry.
pminsw = vec_vertical_instr('min', 16, lambda x: _min_max(x, signed=True))
pminub = vec_vertical_instr('min', 8, lambda x: _min_max(x, signed=False))
pminuw = vec_vertical_instr('min', 16, lambda x: _min_max(x, signed=False))
pminud = vec_vertical_instr('min', 32, lambda x: _min_max(x, signed=False))
pmaxub = vec_vertical_instr('max', 8, lambda x: _min_max(x, signed=False))
pmaxuw = vec_vertical_instr('max', 16, lambda x: _min_max(x, signed=False))
pmaxud = vec_vertical_instr('max', 32, lambda x: _min_max(x, signed=False))
pmaxsw = vec_vertical_instr('max', 16, lambda x: _min_max(x, signed=True))
+
# Floating-point arithmetic
#

# SSE float ops: *ss/*sd are scalar (low 32/64 bits only, via vec_op_clip),
# *ps/*pd are packed lane-wise (via vec_vertical_instr).
addss = vec_op_clip('fadd', 32)
addsd = vec_op_clip('fadd', 64)
addps = vec_vertical_instr('fadd', 32)
addpd = vec_vertical_instr('fadd', 64)
subss = vec_op_clip('fsub', 32)
subsd = vec_op_clip('fsub', 64)
subps = vec_vertical_instr('fsub', 32)
subpd = vec_vertical_instr('fsub', 64)
mulss = vec_op_clip('fmul', 32)
mulsd = vec_op_clip('fmul', 64)
mulps = vec_vertical_instr('fmul', 32)
mulpd = vec_vertical_instr('fmul', 64)
divss = vec_op_clip('fdiv', 32)
divsd = vec_op_clip('fdiv', 64)
divps = vec_vertical_instr('fdiv', 32)
divpd = vec_vertical_instr('fdiv', 64)

# Comparisons (floating-point)

# Float min/max with x86 NaN behavior handled by _float_min_max.
minps = vec_vertical_instr('fmin', 32, _float_min_max)
minpd = vec_vertical_instr('fmin', 64, _float_min_max)
minss = vec_op_clip('fmin', 32, _float_min_max)
minsd = vec_op_clip('fmin', 64, _float_min_max)
maxps = vec_vertical_instr('fmax', 32, _float_min_max)
maxpd = vec_vertical_instr('fmax', 64, _float_min_max)
maxss = vec_op_clip('fmax', 32, _float_min_max)
maxsd = vec_op_clip('fmax', 64, _float_min_max)
+
def _float_compare_to_mask(expr):
    """Lower a packed-float comparison pseudo-op to a CMPPS-style mask.

    Returns a 1-bit predicate sign-extended to @expr's size, i.e. an
    all-ones mask when the predicate holds and all-zeros otherwise.
    'unord'/'ord' only test the operands for NaN; the *fu predicates
    compute the float comparison and override the result when either
    operand is NaN (False for ==/</<=, True for their negations).

    Raises:
        ValueError: on an unhandled comparison operator (the previous
        version died with a confusing NameError on `to_ext`).
    """
    if expr.op == 'unord':
        to_ext = m2_expr.expr_is_NaN(expr.args[0]) | m2_expr.expr_is_NaN(expr.args[1])
    elif expr.op == 'ord':
        to_ext = ~m2_expr.expr_is_NaN(expr.args[0]) & ~m2_expr.expr_is_NaN(expr.args[1])
    else:
        if expr.op == '==fu':
            to_ext = m2_expr.expr_is_float_equal(expr.args[0], expr.args[1])
            on_NaN = m2_expr.ExprInt(0, 1)
        elif expr.op == '<fu':
            to_ext = m2_expr.expr_is_float_lower(expr.args[0], expr.args[1])
            on_NaN = m2_expr.ExprInt(0, 1)
        elif expr.op == '<=fu':
            to_ext = (m2_expr.expr_is_float_equal(expr.args[0], expr.args[1]) |
                      m2_expr.expr_is_float_lower(expr.args[0], expr.args[1]))
            on_NaN = m2_expr.ExprInt(0, 1)
        elif expr.op == '!=fu':
            to_ext = ~m2_expr.expr_is_float_equal(expr.args[0], expr.args[1])
            on_NaN = m2_expr.ExprInt(1, 1)
        elif expr.op == '!<fu':
            to_ext = ~m2_expr.expr_is_float_lower(expr.args[0], expr.args[1])
            on_NaN = m2_expr.ExprInt(1, 1)
        elif expr.op == '!<=fu':
            to_ext = ~(m2_expr.expr_is_float_equal(expr.args[0], expr.args[1]) |
                       m2_expr.expr_is_float_lower(expr.args[0], expr.args[1]))
            on_NaN = m2_expr.ExprInt(1, 1)
        else:
            raise ValueError("Unsupported float comparison %r" % expr.op)

        # A NaN in either operand forces the predicate's unordered result.
        to_ext = m2_expr.ExprCond(
            m2_expr.expr_is_NaN(expr.args[0]) | m2_expr.expr_is_NaN(expr.args[1]),
            on_NaN,
            to_ext
        )
    return to_ext.signExtend(expr.size)
+
# CMPcc{PS,PD,SS,SD} family: each lane is compared with the given
# predicate and replaced by an all-ones/all-zeros mask through
# _float_compare_to_mask.
cmpeqps = vec_vertical_instr('==fu', 32, lambda x: _float_compare_to_mask(x))
cmpeqpd = vec_vertical_instr('==fu', 64, lambda x: _float_compare_to_mask(x))
cmpeqss = vec_op_clip('==fu', 32, lambda x: _float_compare_to_mask(x))
cmpeqsd = vec_op_clip('==fu', 64, lambda x: _float_compare_to_mask(x))
cmpltps = vec_vertical_instr('<fu', 32, lambda x: _float_compare_to_mask(x))
cmpltpd = vec_vertical_instr('<fu', 64, lambda x: _float_compare_to_mask(x))
cmpltss = vec_op_clip('<fu', 32, lambda x: _float_compare_to_mask(x))
cmpltsd = vec_op_clip('<fu', 64, lambda x: _float_compare_to_mask(x))
cmpleps = vec_vertical_instr('<=fu', 32, lambda x: _float_compare_to_mask(x))
cmplepd = vec_vertical_instr('<=fu', 64, lambda x: _float_compare_to_mask(x))
cmpless = vec_op_clip('<=fu', 32, lambda x: _float_compare_to_mask(x))
cmplesd = vec_op_clip('<=fu', 64, lambda x: _float_compare_to_mask(x))
cmpunordps = vec_vertical_instr('unord', 32, lambda x: _float_compare_to_mask(x))
cmpunordpd = vec_vertical_instr('unord', 64, lambda x: _float_compare_to_mask(x))
cmpunordss = vec_op_clip('unord', 32, lambda x: _float_compare_to_mask(x))
cmpunordsd = vec_op_clip('unord', 64, lambda x: _float_compare_to_mask(x))
cmpneqps = vec_vertical_instr('!=fu', 32, lambda x: _float_compare_to_mask(x))
cmpneqpd = vec_vertical_instr('!=fu', 64, lambda x: _float_compare_to_mask(x))
cmpneqss = vec_op_clip('!=fu', 32, lambda x: _float_compare_to_mask(x))
cmpneqsd = vec_op_clip('!=fu', 64, lambda x: _float_compare_to_mask(x))
cmpnltps = vec_vertical_instr('!<fu', 32, lambda x: _float_compare_to_mask(x))
cmpnltpd = vec_vertical_instr('!<fu', 64, lambda x: _float_compare_to_mask(x))
cmpnltss = vec_op_clip('!<fu', 32, lambda x: _float_compare_to_mask(x))
cmpnltsd = vec_op_clip('!<fu', 64, lambda x: _float_compare_to_mask(x))
cmpnleps = vec_vertical_instr('!<=fu', 32, lambda x: _float_compare_to_mask(x))
cmpnlepd = vec_vertical_instr('!<=fu', 64, lambda x: _float_compare_to_mask(x))
cmpnless = vec_op_clip('!<=fu', 32, lambda x: _float_compare_to_mask(x))
cmpnlesd = vec_op_clip('!<=fu', 64, lambda x: _float_compare_to_mask(x))
cmpordps = vec_vertical_instr('ord', 32, lambda x: _float_compare_to_mask(x))
cmpordpd = vec_vertical_instr('ord', 64, lambda x: _float_compare_to_mask(x))
cmpordss = vec_op_clip('ord', 32, lambda x: _float_compare_to_mask(x))
cmpordsd = vec_op_clip('ord', 64, lambda x: _float_compare_to_mask(x))
+
+# Logical (floating-point)
+#
+
+# MMX/SSE/AVX
+
+
def pand(_, instr, dst, src):
    """PAND: bitwise AND of MMX/SSE operands.  No flags are modified."""
    return [m2_expr.ExprAssign(dst, dst & src)], []


def pandn(_, instr, dst, src):
    """PANDN: dst = (~dst) & src, with NOT expressed as XOR against the
    full-width mask.  No flags are modified."""
    return [m2_expr.ExprAssign(dst, (dst ^ dst.mask) & src)], []


def por(_, instr, dst, src):
    """POR: bitwise OR of MMX/SSE operands.  No flags are modified."""
    return [m2_expr.ExprAssign(dst, dst | src)], []
+
+
def cvtdq2pd(_, instr, dst, src):
    """CVTDQ2PD: convert the two low packed int32 of @src into two packed
    float64 lanes of @dst."""
    e = []
    for src_lo, dst_lo in ((0, 0), (32, 64)):
        converted = m2_expr.ExprOp(
            'sint_to_fp',
            src[src_lo:src_lo + 32].signExtend(64)
        )
        e.append(m2_expr.ExprAssign(dst[dst_lo:dst_lo + 64], converted))
    return e, []
+
+
def cvtdq2ps(_, instr, dst, src):
    """CVTDQ2PS: convert four packed int32 lanes to float32 lanes."""
    e = []
    for lo in range(0, 128, 32):
        e.append(m2_expr.ExprAssign(
            dst[lo:lo + 32],
            m2_expr.ExprOp('sint_to_fp', src[lo:lo + 32])
        ))
    return e, []
+
+
def cvtpd2dq(_, instr, dst, src):
    """CVTPD2DQ: convert two float64 lanes to int32; the upper 64 bits of
    @dst are zeroed."""
    e = [
        m2_expr.ExprAssign(dst[:32], m2_expr.ExprOp('fp_to_sint32', src[:64])),
        m2_expr.ExprAssign(dst[32:64], m2_expr.ExprOp('fp_to_sint32', src[64:128])),
        m2_expr.ExprAssign(dst[64:128], m2_expr.ExprInt(0, 64)),
    ]
    return e, []
+
+
def cvtpd2pi(_, instr, dst, src):
    """CVTPD2PI: two fp64 of @src -> two int32 in the MMX register @dst."""
    out = [m2_expr.ExprAssign(dst[lane * 32:lane * 32 + 32],
                              m2_expr.ExprOp('fp_to_sint32',
                                             src[lane * 64:lane * 64 + 64]))
           for lane in range(2)]
    return out, []
+
+
def cvtpd2ps(_, instr, dst, src):
    """CVTPD2PS: narrow two fp64 of @src to fp32; upper qword of @dst zeroed."""
    out = [m2_expr.ExprAssign(dst[lane * 32:lane * 32 + 32],
                              m2_expr.ExprOp('fpconvert_fp32',
                                             src[lane * 64:lane * 64 + 64]))
           for lane in range(2)]
    out.append(m2_expr.ExprAssign(dst[64:128], m2_expr.ExprInt(0, 64)))
    return out, []
+
+
def cvtpi2pd(_, instr, dst, src):
    """CVTPI2PD: two int32 of the MMX @src -> two fp64 lanes of @dst."""
    out = []
    for lane in range(2):
        as_int64 = src[lane * 32:lane * 32 + 32].signExtend(64)
        out.append(m2_expr.ExprAssign(
            dst[lane * 64:lane * 64 + 64],
            m2_expr.ExprOp('sint_to_fp', as_int64)))
    return out, []
+
+
def cvtpi2ps(_, instr, dst, src):
    """CVTPI2PS: two int32 of @src -> two fp32 in the low half of @dst."""
    out = [m2_expr.ExprAssign(dst[pos:pos + 32],
                              m2_expr.ExprOp('sint_to_fp', src[pos:pos + 32]))
           for pos in (0, 32)]
    return out, []
+
+
def cvtps2dq(_, instr, dst, src):
    """CVTPS2DQ: convert four packed fp32 of @src to four int32 in @dst."""
    out = [m2_expr.ExprAssign(dst[pos:pos + 32],
                              m2_expr.ExprOp('fp_to_sint32', src[pos:pos + 32]))
           for pos in range(0, 128, 32)]
    return out, []
+
+
def cvtps2pd(_, instr, dst, src):
    """CVTPS2PD: widen the two low fp32 of @src to two fp64 lanes of @dst."""
    out = [m2_expr.ExprAssign(dst[lane * 64:lane * 64 + 64],
                              m2_expr.ExprOp('fpconvert_fp64',
                                             src[lane * 32:lane * 32 + 32]))
           for lane in range(2)]
    return out, []
+
+
def cvtps2pi(_, instr, dst, src):
    """CVTPS2PI: two low fp32 of @src -> two int32 in the MMX register @dst."""
    out = [m2_expr.ExprAssign(dst[pos:pos + 32],
                              m2_expr.ExprOp('fp_to_sint32', src[pos:pos + 32]))
           for pos in (0, 32)]
    return out, []
+
+
def cvtsd2si(_, instr, dst, src):
    """CVTSD2SI: low fp64 of @src -> signed int32 in the low part of @dst."""
    conv = m2_expr.ExprOp('fp_to_sint32', src[:64])
    return [m2_expr.ExprAssign(dst[:32], conv)], []
+
+
def cvtsd2ss(_, instr, dst, src):
    """CVTSD2SS: narrow the low fp64 of @src to fp32 in the low part of @dst."""
    conv = m2_expr.ExprOp('fpconvert_fp32', src[:64])
    return [m2_expr.ExprAssign(dst[:32], conv)], []
+
+
def cvtsi2sd(_, instr, dst, src):
    """CVTSI2SD: signed int32 of @src -> fp64 in the low part of @dst."""
    conv = m2_expr.ExprOp('sint_to_fp', src[:32].signExtend(64))
    return [m2_expr.ExprAssign(dst[:64], conv)], []
+
+
def cvtsi2ss(_, instr, dst, src):
    """CVTSI2SS: signed int32 of @src -> fp32 in the low part of @dst."""
    conv = m2_expr.ExprOp('sint_to_fp', src[:32])
    return [m2_expr.ExprAssign(dst[:32], conv)], []
+
+
def cvtss2sd(_, instr, dst, src):
    """CVTSS2SD: widen the low fp32 of @src to fp64 in the low part of @dst."""
    conv = m2_expr.ExprOp('fpconvert_fp64', src[:32])
    return [m2_expr.ExprAssign(dst[:64], conv)], []
+
+
def cvtss2si(_, instr, dst, src):
    """CVTSS2SI: low fp32 of @src -> signed int32 in the low part of @dst."""
    conv = m2_expr.ExprOp('fp_to_sint32', src[:32])
    return [m2_expr.ExprAssign(dst[:32], conv)], []
+
+
def _cvtt_tpl(dst, src, numbers, double):
    """Shared template for the CVTT* (convert with truncation) family.

    @numbers: list of lane indices to convert
    @double: True when source lanes are fp64 (the CVTT*D2* forms)
    Returns the list of 32-bit destination-lane assignments.
    """
    assignments = []
    for lane in numbers:
        if double:
            # fp64 sources are first narrowed to fp32 so that the
            # truncating integer conversion below can be applied uniformly
            value = m2_expr.ExprOp('fpconvert_fp32',
                                   src[lane * 64:lane * 64 + 64])
        else:
            value = src[lane * 32:lane * 32 + 32]
        truncated = m2_expr.ExprOp('fpround_towardszero', value)
        assignments.append(m2_expr.ExprAssign(
            dst[lane * 32:lane * 32 + 32],
            m2_expr.ExprOp('fp_to_sint32', truncated)))
    return assignments
+
def cvttpd2pi(_, instr, dst, src):
    # CVTTPD2PI: two fp64 -> two int32 with truncation (MMX destination)
    return _cvtt_tpl(dst, src, [0, 1], double=True), []

def cvttpd2dq(_, instr, dst, src):
    # CVTTPD2DQ: two fp64 -> two int32 with truncation, upper qword zeroed
    e = _cvtt_tpl(dst, src, [0, 1], double=True)
    e.append(m2_expr.ExprAssign(dst[64:128], m2_expr.ExprInt(0, 64)))
    return e, []

def cvttsd2si(_, instr, dst, src):
    # CVTTSD2SI: low fp64 -> int32 with truncation
    return _cvtt_tpl(dst, src, [0], double=True), []

def cvttps2dq(_, instr, dst, src):
    # CVTTPS2DQ: four fp32 -> four int32 with truncation
    return _cvtt_tpl(dst, src, [0, 1, 2, 3], double=False), []

def cvttps2pi(_, instr, dst, src):
    # CVTTPS2PI: two low fp32 -> two int32 with truncation (MMX destination)
    return _cvtt_tpl(dst, src, [0, 1], double=False), []

def cvttss2si(_, instr, dst, src):
    # CVTTSS2SI: low fp32 -> int32 with truncation
    return _cvtt_tpl(dst, src, [0], double=False), []
+
def movss(_, instr, dst, src):
    """MOVSS: move a scalar fp32.

    Semantics depend on the operand encoding:
    - xmm <- xmm: only the low 32 bits of @dst are replaced
    - mem <- xmm: the low 32 bits of @src are stored
    - xmm <- mem: the 32-bit load is zero-extended over the whole register
    """
    e = []
    if not isinstance(dst, m2_expr.ExprMem) and not isinstance(src, m2_expr.ExprMem):
        # Source and Destination xmm
        e.append(m2_expr.ExprAssign(dst[:32], src[:32]))
    elif not isinstance(src, m2_expr.ExprMem) and isinstance(dst, m2_expr.ExprMem):
        # Source XMM Destination Mem
        e.append(m2_expr.ExprAssign(dst, src[:32]))
    else:
        # Source Mem Destination XMM: upper 96 bits are cleared
        e.append(m2_expr.ExprAssign(
            dst, m2_expr.ExprCompose(src, m2_expr.ExprInt(0, 96))))
    return e, []
+
+
def ucomiss(_, instr, src1, src2):
    """UCOMISS: unordered compare of two scalar fp32 values.

    ZF/PF/CF are computed through dedicated 'ucomiss_*' pseudo-ops;
    OF/AF/SF are cleared.
    """
    e = []
    for flag, op_name in ((zf, 'ucomiss_zf'),
                          (pf, 'ucomiss_pf'),
                          (cf, 'ucomiss_cf')):
        e.append(m2_expr.ExprAssign(
            flag, m2_expr.ExprOp(op_name, src1[:32], src2[:32])))
    for flag in (of, af, nf):
        e.append(m2_expr.ExprAssign(flag, m2_expr.ExprInt(0, 1)))
    return e, []
+
def ucomisd(_, instr, src1, src2):
    """UCOMISD: unordered compare of two scalar fp64 values.

    ZF/PF/CF are computed through dedicated 'ucomisd_*' pseudo-ops;
    OF/AF/SF are cleared.
    """
    e = []
    for flag, op_name in ((zf, 'ucomisd_zf'),
                          (pf, 'ucomisd_pf'),
                          (cf, 'ucomisd_cf')):
        e.append(m2_expr.ExprAssign(
            flag, m2_expr.ExprOp(op_name, src1[:64], src2[:64])))
    for flag in (of, af, nf):
        e.append(m2_expr.ExprAssign(flag, m2_expr.ExprInt(0, 1)))
    return e, []
+
def blsi(_, instr, dst, src):
    """BLSI (BMI1): isolate the lowest set bit of @src into @dst.

    dst = (-src) & src; ZF/SF follow the result, OF is cleared and
    CF is set iff @src is non-zero.
    """
    e = []

    # (0 - src) & src keeps only the lowest set bit (arithmetic negation
    # is intended here, unlike ANDN which uses bitwise NOT)
    arg1 = m2_expr.ExprInt(0, src.size)
    neg_src = arg1 - src
    result = neg_src & src

    e += update_flag_zf(result)
    e += update_flag_nf(result)
    e.append(m2_expr.ExprAssign(of, m2_expr.ExprInt(0, of.size)))

    # CF = (src != 0)
    e.append(m2_expr.ExprAssign(cf, m2_expr.ExprCond(src,
                                                     m2_expr.ExprInt(1, 1),
                                                     m2_expr.ExprInt(0, 1))))

    e.append(m2_expr.ExprAssign(dst, result))
    return e, []
+
def andn(_, instr, dst, src1, src2):
    """ANDN (BMI1): @dst = (NOT @src1) AND @src2.

    SF and ZF are set according to the result; OF and CF are cleared.
    """
    e = []

    # Bitwise complement of src1.  The previous implementation used the
    # arithmetic negation (0 - src1), which differs from NOT src1 by one:
    # e.g. ANDN(1, 3) must yield ~1 & 3 == 2, but (0 - 1) & 3 gave 3.
    result = (src1 ^ src1.mask) & src2

    e += update_flag_zf(result)
    e += update_flag_nf(result)
    e.append(m2_expr.ExprAssign(of, m2_expr.ExprInt(0, of.size)))
    e.append(m2_expr.ExprAssign(cf, m2_expr.ExprInt(0, cf.size)))

    e.append(m2_expr.ExprAssign(dst, result))
    return e, []
+
def bextr(_, instr, dst, src1, src2):
    """BEXTR (BMI1): extract a bit field from @src1.

    Start position is @src2[7:0], field length is @src2[15:8]; the field is
    right-aligned and zero-extended into @dst.
    NOTE(review): the hardware instruction also sets ZF from the result and
    clears OF/CF; flags are not modeled here — confirm whether that is
    intentional.
    """
    e = []

    # TODO: change zero extension to 512 bits when AVX is supported
    # Computations are done on 256 bits so that shifts by up to 255
    # positions cannot wrap around the operand size
    start = (src2 & m2_expr.ExprInt(0xFF, src2.size)).zeroExtend(256)
    length = ((src2 & m2_expr.ExprInt(0xFF00, src2.size)) >> m2_expr.ExprInt(8, src2.size)).zeroExtend(256)

    tmp = src1.zeroExtend(256) >> start
    # Mask of `length` low bits (all-ones shifted right by 256 - length)
    mask = m2_expr.ExprInt(0, 256).mask >> (m2_expr.ExprInt(256, 256) - length)

    tmp = tmp & mask
    result = tmp[:dst.size]

    e.append(m2_expr.ExprAssign(dst, result))
    return e, []
+
def blsmsk(_, instr, dst, src):
    """BLSMSK (BMI1): @dst = mask up to (and including) the lowest set bit.

    dst = src ^ (src - 1); SF follows the result, OF and ZF are cleared,
    CF is set iff @src is zero.
    """
    e = []

    tmp = src - m2_expr.ExprInt(1, src.size)
    result = src ^ tmp

    e += update_flag_nf(result)
    e.append(m2_expr.ExprAssign(of, m2_expr.ExprInt(0, of.size)))
    # The result is never zero, so ZF is unconditionally cleared
    e.append(m2_expr.ExprAssign(zf, m2_expr.ExprInt(0, zf.size)))

    # CF = (src == 0)
    e.append(m2_expr.ExprAssign(cf, m2_expr.ExprCond(src,
                                                     m2_expr.ExprInt(0, 1),
                                                     m2_expr.ExprInt(1, 1))))

    e.append(m2_expr.ExprAssign(dst, result))
    return e, []
+
def blsr(_, instr, dst, src):
    """BLSR (BMI1): reset (clear) the lowest set bit of @src into @dst.

    dst = src & (src - 1); ZF/SF follow the result, OF is cleared and
    CF is set iff @src is zero.
    """
    e = []

    tmp = src - m2_expr.ExprInt(1, src.size)
    result = tmp & src

    e += update_flag_zf(result)
    e += update_flag_nf(result)
    e.append(m2_expr.ExprAssign(of, m2_expr.ExprInt(0, of.size)))

    # CF = (src == 0)
    e.append(m2_expr.ExprAssign(cf, m2_expr.ExprCond(src,
                                                     m2_expr.ExprInt(0, 1),
                                                     m2_expr.ExprInt(1, 1))))

    e.append(m2_expr.ExprAssign(dst, result))
    return e, []
+
def tzcnt(ir, instr, dst, src):
    """TZCNT (BMI1): count trailing zero bits of @src into @dst.

    A zero @src yields the operand size; CF is set in that case.
    ZF follows the result.
    """
    e = []

    operand_size = m2_expr.ExprInt(dst.size, dst.size)

    # src == 0 => result is the operand size, otherwise count trailing zeros
    result = m2_expr.ExprCond(src, m2_expr.ExprOp("cnttrailzeros", src), operand_size)

    # CF = (result == operand size), i.e. src was zero
    e.append(m2_expr.ExprAssign(cf, m2_expr.ExprCond(m2_expr.ExprOp("FLAG_EQ_CMP", result, operand_size),
                                                     m2_expr.ExprInt(1, 1),
                                                     m2_expr.ExprInt(0, 1))))

    e += update_flag_zf(result)
    e.append(m2_expr.ExprAssign(dst, result))
    return e, []
+
def bzhi(_, instr, dst, src1, src2):
    """BZHI (BMI2): zero the bits of @src1 from position @src2[7:0] upward.

    When the index is >= the operand size, @src1 is copied unmodified and
    CF is set.  ZF/SF follow the result and OF is cleared.
    """
    e = []

    operand_size = m2_expr.ExprInt(dst.size, dst.size)
    # Intel SDM: the start index N is SRC2[7:0] — eight bits.  The previous
    # code only kept seven (src2[:7]), mishandling index values >= 128
    # (e.g. src2 == 128 masked dst to bit 0 instead of copying src1).
    index = src2[:8].zeroExtend(dst.size)
    # Mask of the low (index + 1) bits... shifted so that bits below `index`
    # survive
    mask = m2_expr.ExprInt(0, dst.size).mask >> (operand_size
                                                 - index
                                                 - m2_expr.ExprInt(1, dst.size))

    # index < operand_size => mask applies; otherwise src1 is kept whole
    result = m2_expr.ExprCond(m2_expr.ExprOp("FLAG_SIGN_SUB", index, operand_size),
                              src1 & mask, src1)


    # CF = (index > operand_size - 1)
    operand_size_dec = operand_size - m2_expr.ExprInt(1, dst.size)
    e.append(m2_expr.ExprAssign(cf, m2_expr.ExprCond(m2_expr.ExprOp("FLAG_SIGN_SUB", operand_size_dec, index),
                                                     m2_expr.ExprInt(1, 1),
                                                     m2_expr.ExprInt(0, 1))))

    e += update_flag_zf(result)
    e += update_flag_nf(result)
    e.append(m2_expr.ExprAssign(of, m2_expr.ExprInt(0, of.size)))
    e.append(m2_expr.ExprAssign(dst, result))
    return e, []
+
def pshufb(_, instr, dst, src):
    """PSHUFB: shuffle the bytes of @dst according to the byte indices in @src.

    For each byte of @src: if its MSB is set the destination byte is zeroed,
    otherwise the low 3 (MMX) or 4 (XMM) bits select the source byte of @dst.
    """
    e = []
    # Number of index bits per byte selector depends on the register width
    if dst.size == 64:
        bit_l = 3
    elif dst.size == 128:
        bit_l = 4
    else:
        raise NotImplementedError("bad size")
    for i in range(0, src.size, 8):
        # Byte index scaled to a bit offset (<< 3 == * 8)
        index = src[
            i:i + bit_l].zeroExtend(dst.size) << m2_expr.ExprInt(3, dst.size)
        value = (dst >> index)[:8]
        # MSB of the selector byte forces the result byte to zero
        e.append(m2_expr.ExprAssign(dst[i:i + 8],
                                 m2_expr.ExprCond(src[i + 7:i + 8],
                                                  m2_expr.ExprInt(0, 8),
                                                  value)))
    return e, []
+
+
def pshufd(_, instr, dst, src, imm):
    """PSHUFD: fill each dword of @dst with the @src dword selected by the
    matching 2-bit field of @imm."""
    order = int(imm)
    # Each 2-bit field selects one of the four source dwords; with a 128-bit
    # source the selected slice always fits
    positions = (((order >> (2 * lane)) & 3) * 32 for lane in range(4))
    selected = [src[pos:pos + 32] for pos in positions]
    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(*selected))], []
+
+
def pshuflw(_, instr, dst, src, imm):
    """PSHUFLW: shuffle the four low words of @src per @imm; high qword copied."""
    order = int(imm)
    positions = (((order >> (2 * lane)) & 3) * 16 for lane in range(4))
    selected = [src[pos:pos + 16] for pos in positions]
    selected.append(src[64:])
    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(*selected))], []
+
+
def pshufhw(_, instr, dst, src, imm):
    """PSHUFHW: shuffle the four high words of @src per @imm; low qword copied."""
    order = int(imm)
    selected = [src[:64]]
    for lane in range(4):
        pos = ((order >> (2 * lane)) & 3) * 16
        selected.append(src[pos + 64:pos + 80])
    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(*selected))], []
+
+
def ps_rl_ll(ir, instr, dst, src, op, size):
    """Common helper for the packed shift instructions (PSRL*/PSLL*/PSRA*).

    Applies the shift @op to every @size-bit lane of @dst, using the shift
    count taken from @src.  Counts >= @size saturate to @size (which makes
    logical shifts produce zero lanes).
    """
    mask = {16: 0xF,
            32: 0x1F,
            64: 0x3F}[size]
    mask = m2_expr.ExprInt(mask, dst.size)

    # Saturate the counter to 2**size
    count = src.zeroExtend(dst.size)
    count = m2_expr.ExprCond(count & expr_simp(~mask),
                             m2_expr.ExprInt(size, dst.size), # saturation
                             count, # count < 2**size
    )
    count = count[:size]
    # Pre-simplify constant counts so the per-lane expressions stay compact
    if src.is_int():
        count = expr_simp(count)

    out = []
    for i in range(0, dst.size, size):
        out.append(m2_expr.ExprOp(op, dst[i:i + size], count))
    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(*out))], []
+
+
def psrlw(ir, instr, dst, src):
    # PSRLW: logical right shift of each 16-bit lane
    return ps_rl_ll(ir, instr, dst, src, ">>", 16)


def psrld(ir, instr, dst, src):
    # PSRLD: logical right shift of each 32-bit lane
    return ps_rl_ll(ir, instr, dst, src, ">>", 32)


def psrlq(ir, instr, dst, src):
    # PSRLQ: logical right shift of each 64-bit lane
    return ps_rl_ll(ir, instr, dst, src, ">>", 64)


def psllw(ir, instr, dst, src):
    # PSLLW: left shift of each 16-bit lane
    return ps_rl_ll(ir, instr, dst, src, "<<", 16)


def pslld(ir, instr, dst, src):
    # PSLLD: left shift of each 32-bit lane
    return ps_rl_ll(ir, instr, dst, src, "<<",  32)


def psllq(ir, instr, dst, src):
    # PSLLQ: left shift of each 64-bit lane
    return ps_rl_ll(ir, instr, dst, src, "<<",  64)


def psraw(ir, instr, dst, src):
    # PSRAW: arithmetic right shift of each 16-bit lane
    return ps_rl_ll(ir, instr, dst, src, "a>>", 16)


def psrad(ir, instr, dst, src):
    # PSRAD: arithmetic right shift of each 32-bit lane
    return ps_rl_ll(ir, instr, dst, src, "a>>", 32)
+
+
def pslldq(_, instr, dst, src):
    """PSLLDQ: shift the whole register @dst left by imm @src *bytes*.

    Counts above 15 clear the register entirely.
    """
    assert src.is_int()
    count = int(src)
    # (removed an unused local `e = []` left over in the original)
    if count > 15:
        return [m2_expr.ExprAssign(dst, m2_expr.ExprInt(0, dst.size))], []
    else:
        return [m2_expr.ExprAssign(dst, dst << m2_expr.ExprInt(8 * count, dst.size))], []
+
+
def psrldq(_, instr, dst, src):
    """PSRLDQ: shift the whole register @dst right by imm @src *bytes*.

    Counts above 15 clear the register entirely.
    """
    assert src.is_int()
    nb_bytes = int(src)
    if nb_bytes > 15:
        result = m2_expr.ExprInt(0, dst.size)
    else:
        result = dst >> m2_expr.ExprInt(8 * nb_bytes, dst.size)
    return [m2_expr.ExprAssign(dst, result)], []
+
+
def iret(ir, instr):
    """IRET implementation
    XXX: only support "no-privilege change"
    """
    size = instr.v_opmode()
    # Reuse RETF semantics to pop IP and CS (with a size//8 extra pop)
    exprs, _ = retf(ir, instr, m2_expr.ExprInt(size // 8, size=size))
    # Stack pointer after the two pops, where EFLAGS is read from
    tmp = mRSP[instr.mode][:size] + m2_expr.ExprInt((2 * size) // 8, size=size)
    exprs += _tpl_eflags(tmp)
    return exprs, []
+
+
def pcmpeq(_, instr, dst, src, size):
    """Per-lane equality compare: each @size-bit lane of @dst becomes all-ones
    when equal to the matching lane of @src, all-zeros otherwise."""
    out = []
    for start in range(0, dst.size, size):
        lane_dst = dst[start:start + size]
        lane_src = src[start:start + size]
        out.append(m2_expr.ExprAssign(
            lane_dst,
            m2_expr.ExprCond(m2_expr.expr_is_equal(lane_dst, lane_src),
                             m2_expr.ExprInt(-1, size),
                             m2_expr.ExprInt(0, size))))
    return out, []
+
+
def pcmpgt(_, instr, dst, src, size):
    """Per-lane signed greater-than compare: each @size-bit lane of @dst
    becomes all-ones when strictly greater (signed) than the matching lane
    of @src, all-zeros otherwise."""
    out = []
    for start in range(0, dst.size, size):
        lane_dst = dst[start:start + size]
        lane_src = src[start:start + size]
        out.append(m2_expr.ExprAssign(
            lane_dst,
            m2_expr.ExprCond(m2_expr.expr_is_signed_greater(lane_dst, lane_src),
                             m2_expr.ExprInt(-1, size),
                             m2_expr.ExprInt(0, size))))
    return out, []
+
+
def pcmpeqb(ir, instr, dst, src):
    # PCMPEQB: per-byte equality mask
    return pcmpeq(ir, instr, dst, src, 8)

def pcmpeqw(ir, instr, dst, src):
    # PCMPEQW: per-word equality mask
    return pcmpeq(ir, instr, dst, src, 16)

def pcmpeqd(ir, instr, dst, src):
    # PCMPEQD: per-dword equality mask
    return pcmpeq(ir, instr, dst, src, 32)

def pcmpeqq(ir, instr, dst, src):
    # PCMPEQQ: per-qword equality mask
    return pcmpeq(ir, instr, dst, src, 64)
+
+
+
+
def pcmpgtb(ir, instr, dst, src):
    # PCMPGTB: per-byte signed greater-than mask
    return pcmpgt(ir, instr, dst, src, 8)

def pcmpgtw(ir, instr, dst, src):
    # PCMPGTW: per-word signed greater-than mask
    return pcmpgt(ir, instr, dst, src, 16)

def pcmpgtd(ir, instr, dst, src):
    # PCMPGTD: per-dword signed greater-than mask
    return pcmpgt(ir, instr, dst, src, 32)

def pcmpgtq(ir, instr, dst, src):
    # PCMPGTQ: per-qword signed greater-than mask
    return pcmpgt(ir, instr, dst, src, 64)
+
+
+
def punpck(_, instr, dst, src, size, off):
    """Interleave @size-bit lanes of @dst and @src, starting at bit @off
    (0 for the PUNPCKL* forms, half the register for PUNPCKH*)."""
    slices = []
    for i in range(dst.size // (2 * size)):
        base = size * i + off
        slices.append(dst[base:base + size])
        slices.append(src[base:base + size])
    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(*slices))], []
+
+
def punpckhbw(ir, instr, dst, src):
    # PUNPCKHBW: interleave the high bytes of dst and src
    return punpck(ir, instr, dst, src, 8, dst.size // 2)


def punpckhwd(ir, instr, dst, src):
    # PUNPCKHWD: interleave the high words of dst and src
    return punpck(ir, instr, dst, src, 16, dst.size // 2)


def punpckhdq(ir, instr, dst, src):
    # PUNPCKHDQ: interleave the high dwords of dst and src
    return punpck(ir, instr, dst, src, 32, dst.size // 2)


def punpckhqdq(ir, instr, dst, src):
    # PUNPCKHQDQ: interleave the high qwords of dst and src
    return punpck(ir, instr, dst, src, 64, dst.size // 2)


def punpcklbw(ir, instr, dst, src):
    # PUNPCKLBW: interleave the low bytes of dst and src
    return punpck(ir, instr, dst, src, 8, 0)


def punpcklwd(ir, instr, dst, src):
    # PUNPCKLWD: interleave the low words of dst and src
    return punpck(ir, instr, dst, src, 16, 0)


def punpckldq(ir, instr, dst, src):
    # PUNPCKLDQ: interleave the low dwords of dst and src
    return punpck(ir, instr, dst, src, 32, 0)


def punpcklqdq(ir, instr, dst, src):
    # PUNPCKLQDQ: interleave the low qwords of dst and src
    return punpck(ir, instr, dst, src, 64, 0)
+
+
def pinsr(_, instr, dst, src, imm, size):
    """Insert the low @size bits of @src into the lane of @dst selected by
    the masked immediate @imm (PINSR* family)."""
    # Number of selectable lanes depends on lane width
    index_masks = {8: 0xF, 16: 0x7, 32: 0x3, 64: 0x1}
    sel = (int(imm) & index_masks[size]) * size
    return [m2_expr.ExprAssign(dst[sel:sel + size], src[:size])], []
+
+
def pinsrb(ir, instr, dst, src, imm):
    # PINSRB: insert a byte
    return pinsr(ir, instr, dst, src, imm, 8)


def pinsrw(ir, instr, dst, src, imm):
    # PINSRW: insert a word
    return pinsr(ir, instr, dst, src, imm, 16)


def pinsrd(ir, instr, dst, src, imm):
    # PINSRD: insert a dword
    return pinsr(ir, instr, dst, src, imm, 32)


def pinsrq(ir, instr, dst, src, imm):
    # PINSRQ: insert a qword
    return pinsr(ir, instr, dst, src, imm, 64)
+
+
def pextr(_, instr, dst, src, imm, size):
    """Extract the @size-bit lane of @src selected by the masked immediate
    @imm, zero-extended into @dst (PEXTR* family)."""
    index_masks = {8: 0xF, 16: 0x7, 32: 0x3, 64: 0x1}
    sel = (int(imm) & index_masks[size]) * size
    lane = src[sel:sel + size]
    return [m2_expr.ExprAssign(dst, lane.zeroExtend(dst.size))], []
+
+
def pextrb(ir, instr, dst, src, imm):
    # PEXTRB: extract a byte
    return pextr(ir, instr, dst, src, imm, 8)


def pextrw(ir, instr, dst, src, imm):
    # PEXTRW: extract a word
    return pextr(ir, instr, dst, src, imm, 16)


def pextrd(ir, instr, dst, src, imm):
    # PEXTRD: extract a dword
    return pextr(ir, instr, dst, src, imm, 32)


def pextrq(ir, instr, dst, src, imm):
    # PEXTRQ: extract a qword
    return pextr(ir, instr, dst, src, imm, 64)
+
+
def unpckhps(_, instr, dst, src):
    """UNPCKHPS: interleave the two high dwords of @dst and @src."""
    interleaved = m2_expr.ExprCompose(dst[64:96], src[64:96],
                                      dst[96:128], src[96:128])
    return [m2_expr.ExprAssign(dst, interleaved)], []
+
+
def unpckhpd(_, instr, dst, src):
    """UNPCKHPD: combine the high qwords of @dst and @src."""
    combined = m2_expr.ExprCompose(dst[64:128], src[64:128])
    return [m2_expr.ExprAssign(dst, combined)], []
+
+
def unpcklps(_, instr, dst, src):
    """UNPCKLPS: interleave the two low dwords of @dst and @src."""
    interleaved = m2_expr.ExprCompose(dst[0:32], src[0:32],
                                      dst[32:64], src[32:64])
    return [m2_expr.ExprAssign(dst, interleaved)], []
+
+
def unpcklpd(_, instr, dst, src):
    """UNPCKLPD: combine the low qwords of @dst and @src."""
    combined = m2_expr.ExprCompose(dst[0:64], src[0:64])
    return [m2_expr.ExprAssign(dst, combined)], []
+
+
def movlpd(_, instr, dst, src):
    """MOVLPD: move the low 64 bits of @src into the low 64 bits of @dst."""
    return [m2_expr.ExprAssign(dst[:64], src[:64])], []
+
+
def movlps(_, instr, dst, src):
    """MOVLPS: move the low 64 bits of @src into the low 64 bits of @dst."""
    return [m2_expr.ExprAssign(dst[:64], src[:64])], []
+
+
def movhpd(_, instr, dst, src):
    """MOVHPD: move 64 bits between memory and the high qword of an XMM.

    Two encodings: xmm <- m64 (load into the high qword) and
    m64 <- xmm (store the high qword).
    """
    e = []
    if src.size == 64:
        # xmm <- m64: load into the high half
        e.append(m2_expr.ExprAssign(dst[64:128], src))
    elif dst.size == 64:
        # m64 <- xmm: store the high half
        e.append(m2_expr.ExprAssign(dst, src[64:128]))
    else:
        raise RuntimeError("bad encoding!")
    return e, []
+
+
def movlhps(_, instr, dst, src):
    """MOVLHPS: copy the low qword of @src into the high qword of @dst."""
    return [m2_expr.ExprAssign(dst[64:128], src[:64])], []
+
+
def movhlps(_, instr, dst, src):
    """MOVHLPS: copy the high qword of @src into the low qword of @dst."""
    return [m2_expr.ExprAssign(dst[:64], src[64:128])], []
+
+
def movdq2q(_, instr, dst, src):
    """MOVDQ2Q: move the low qword of the XMM @src into the MMX @dst."""
    return [m2_expr.ExprAssign(dst, src[:64])], []
+
+
def movq2dq(_, instr, dst, src):
    """MOVQ2DQ: move the MMX @src into the low qword of @dst, upper zeroed."""
    low = src[:64]
    return [m2_expr.ExprAssign(dst, low.zeroExtend(dst.size))], []
+
+
def sqrt_gen(_, instr, dst, src, size):
    """Packed square root: apply 'fsqrt' to every @size-bit lane of @src."""
    lanes = [m2_expr.ExprOp('fsqrt', src[start:start + size])
             for start in range(0, src.size, size)]
    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(*lanes))], []
+
+
def sqrtpd(ir, instr, dst, src):
    # SQRTPD: packed fp64 square root
    return sqrt_gen(ir, instr, dst, src, 64)


def sqrtps(ir, instr, dst, src):
    # SQRTPS: packed fp32 square root
    return sqrt_gen(ir, instr, dst, src, 32)
+
+
def sqrtsd(_, instr, dst, src):
    """SQRTSD: scalar fp64 square root of the low qword of @src."""
    root = m2_expr.ExprOp('fsqrt', src[:64])
    return [m2_expr.ExprAssign(dst[:64], root)], []
+
+
def sqrtss(_, instr, dst, src):
    """SQRTSS: scalar fp32 square root of the low dword of @src."""
    root = m2_expr.ExprOp('fsqrt', src[:32])
    return [m2_expr.ExprAssign(dst[:32], root)], []
+
+
def pmovmskb(_, instr, dst, src):
    """PMOVMSKB: gather the MSB of each byte of @src into @dst, zero-extended."""
    msbs = [src[8 * i + 7:8 * (i + 1)] for i in range(src.size // 8)]
    packed = m2_expr.ExprCompose(*msbs)
    return [m2_expr.ExprAssign(dst, packed.zeroExtend(dst.size))], []
+
+
def smsw(ir, instr, dst):
    """SMSW: store the machine status word (CR0 low bits) into @dst.

    Modeled with a fixed constant rather than real CR0 state; the 0x80050033
    value corresponds to the typical CR0 content hard-coded here.
    """
    e = []
    LOG_X86_SEM.warning("DEFAULT SMSW %s!!", str(dst))
    e.append(m2_expr.ExprAssign(dst, m2_expr.ExprInt(0x80050033, 32)[:dst.size]))
    return e, []
+
+
def bndmov(ir, instr, dst, src):
    """BNDMOV (MPX): modeled as a NOP."""
    # Implemented as a NOP, because BND side effects are not yet supported
    return [], []
+
def palignr(ir, instr, dst, src, imm):
    """PALIGNR: concatenate @dst:@src, byte-shift right by @imm, keep the
    low @dst.size bits."""
    # dst.src >> imm * 8 [:dst.size]

    shift = int(imm) * 8
    if shift == 0:
        # No shift: result is src unchanged
        result = src
    elif shift == src.size:
        # Shift by a full register: result is dst unchanged
        result = dst
    elif shift > src.size:
        # Only high-part (dst) bytes remain, shifted down
        result = dst >> m2_expr.ExprInt(shift - src.size, dst.size)
    else:
        # shift < src.size: low bytes come from src, high bytes from dst
        result = m2_expr.ExprCompose(
            src[shift:],
            dst[:shift],
        )

    return [m2_expr.ExprAssign(dst, result)], []
+
+
def _signed_to_signed_saturation(expr, dst_size):
    """Saturate the expr @expr for @dst_size bit
    Signed saturation return MAX_INT / MIN_INT or value depending on the value

    @expr must be wider than @dst_size; the extra bits let the comparisons
    below detect overflow of the narrower signed range.
    """
    assert expr.size > dst_size

    # 2**(dst_size-1): magnitude of the signed minimum
    median = 1 << (dst_size - 1)

    min_int = m2_expr.ExprInt(- median, dst_size)
    max_int = m2_expr.ExprInt(median - 1, dst_size)

    # Bounds sign-extended to the source width for the comparisons
    test_min_int = min_int.signExtend(expr.size)
    test_max_int = max_int.signExtend(expr.size)

    value = expr[:dst_size]

    # expr <=s MIN  -> MIN; expr <s MAX (and > MIN) -> value; else -> MAX
    return m2_expr.ExprCond(
        m2_expr.ExprOp(
            m2_expr.TOK_INF_EQUAL_SIGNED,
            expr,
            test_min_int
        ),
        min_int,
        m2_expr.ExprCond(
            m2_expr.ExprOp(
                m2_expr.TOK_INF_SIGNED,
                expr,
                test_max_int
            ),
            value,
            max_int
        )
    )
+
+
def _signed_to_unsigned_saturation(expr, dst_size):
    """Saturate the expr @expr for @dst_size bit
    Unsigned saturation return MAX_INT or value depending on the value

    @expr is interpreted as signed: negative values clamp to 0, values above
    the unsigned maximum clamp to all-ones.
    """
    assert expr.size > dst_size

    zero = m2_expr.ExprInt(0, dst_size)
    test_zero = m2_expr.ExprInt(0, expr.size)

    # All-ones: the unsigned maximum of dst_size bits
    max_int = m2_expr.ExprInt(-1, dst_size)
    test_max_int = max_int.zeroExtend(expr.size)

    value = expr[:dst_size]

    # expr <=s 0 -> 0; expr <s MAX (and > 0) -> value; else -> MAX
    return m2_expr.ExprCond(
        m2_expr.ExprOp(
            m2_expr.TOK_INF_EQUAL_SIGNED,
            expr,
            test_zero
        ),
        zero,
        m2_expr.ExprCond(
            m2_expr.ExprOp(
                m2_expr.TOK_INF_SIGNED,
                expr,
                test_max_int
            ),
            value,
            max_int
        )
    )
+
+
+
def packsswb(ir, instr, dst, src):
    """PACKSSWB: narrow each int16 lane of @dst then @src to int8 using
    signed saturation."""
    lanes = [
        _signed_to_signed_saturation(operand[start:start + 16], 8)
        for operand in (dst, src)
        for start in range(0, dst.size, 16)
    ]
    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(*lanes))], []
+
+
def packssdw(ir, instr, dst, src):
    """PACKSSDW: narrow each int32 lane of @dst then @src to int16 using
    signed saturation."""
    lanes = [
        _signed_to_signed_saturation(operand[start:start + 32], 16)
        for operand in (dst, src)
        for start in range(0, dst.size, 32)
    ]
    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(*lanes))], []
+
+
def packuswb(ir, instr, dst, src):
    """PACKUSWB: narrow each int16 lane of @dst then @src to uint8 using
    unsigned saturation."""
    lanes = [
        _signed_to_unsigned_saturation(operand[start:start + 16], 8)
        for operand in (dst, src)
        for start in range(0, dst.size, 16)
    ]
    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(*lanes))], []
+
+
def _saturation_sub_unsigned(expr):
    """Rewrite an 'a + (-b)' lane expression as an unsigned saturating
    subtraction (used by PSUBUS*)."""
    assert expr.is_op("+") and len(expr.args) == 2 and expr.args[-1].is_op("-")

    # Compute the soustraction on one more bit to be able to distinguish cases:
    # 0x48 - 0xd7 in 8 bit, should saturate
    arg1 = expr.args[0].zeroExtend(expr.size + 1)
    arg2 = expr.args[1].args[0].zeroExtend(expr.size + 1)
    return _signed_to_unsigned_saturation(arg1 - arg2, expr.size)
+
def _saturation_sub_signed(expr):
    """Rewrite an 'a + (-b)' lane expression as a signed saturating
    subtraction (used by PSUBS*)."""
    assert expr.is_op("+") and len(expr.args) == 2 and expr.args[-1].is_op("-")

    # Compute the subtraction on two more bits, see _saturation_sub_unsigned
    arg1 = expr.args[0].signExtend(expr.size + 2)
    arg2 = expr.args[1].args[0].signExtend(expr.size + 2)
    return _signed_to_signed_saturation(arg1 - arg2, expr.size)
+
def _saturation_add(expr):
    """Rewrite an 'a + b' lane expression as an unsigned saturating addition
    (used by PADDUS*)."""
    assert expr.is_op("+") and len(expr.args) == 2

    # Compute the addition on one more bit to be able to distinguish cases:
    # 0x48 + 0xd7 in 8 bit, should saturate

    arg1 = expr.args[0].zeroExtend(expr.size + 1)
    arg2 = expr.args[1].zeroExtend(expr.size + 1)

    # We can also use _signed_to_unsigned_saturation with two additional bits (to
    # distinguish minus and overflow case)
    # The resulting expression being more complicated with an impossible case
    # (signed=True), we rewrite the rule here

    # Carry out of the widened sum => clamp to all-ones, else keep the sum
    return m2_expr.ExprCond((arg1 + arg2).msb(), m2_expr.ExprInt(-1, expr.size),
                            expr)
+
def _saturation_add_signed(expr):
    """Rewrite an 'a + b' lane expression as a signed saturating addition
    (used by PADDS*)."""
    assert expr.is_op("+") and len(expr.args) == 2

    # Compute the subtraction on two more bits, see _saturation_add_unsigned

    arg1 = expr.args[0].signExtend(expr.size + 2)
    arg2 = expr.args[1].signExtend(expr.size + 2)

    return _signed_to_signed_saturation(arg1 + arg2, expr.size)
+
+
# Saturate SSE operations
# Vertical (lane-wise) add/sub with the saturation rewrites above applied
# to each lane expression: *US* forms saturate unsigned, *S* forms signed.

psubusb = vec_vertical_instr('-', 8, _saturation_sub_unsigned)
psubusw = vec_vertical_instr('-', 16, _saturation_sub_unsigned)
paddusb = vec_vertical_instr('+', 8, _saturation_add)
paddusw = vec_vertical_instr('+', 16, _saturation_add)
psubsb = vec_vertical_instr('-', 8, _saturation_sub_signed)
psubsw = vec_vertical_instr('-', 16, _saturation_sub_signed)
paddsb = vec_vertical_instr('+', 8, _saturation_add_signed)
paddsw = vec_vertical_instr('+', 16, _saturation_add_signed)
+
+
+# Others SSE operations
+
def maskmovq(ir, instr, src, mask):
    """MASKMOVQ: store the bytes of @src whose matching @mask byte has its
    MSB set, to the address held in DI/EDI/RDI.

    Implemented as a chain of extra IR blocks: one conditional "check" block
    per byte, each optionally branching to a "write" block that performs a
    single-byte store before falling through to the next check.
    """
    loc_next = ir.get_next_loc_key(instr)
    loc_next_expr = m2_expr.ExprLoc(loc_next, ir.IRDst.size)
    blks = []

    # For each possibility, check if a write is necessary
    check_labels = [m2_expr.ExprLoc(ir.loc_db.add_location(), ir.IRDst.size)
                    for _ in range(0, mask.size, 8)]
    # If the write has to be done, do it (otherwise, nothing happen)
    write_labels = [m2_expr.ExprLoc(ir.loc_db.add_location(), ir.IRDst.size)
                    for _ in range(0, mask.size, 8)]

    # Build check blocks: test the MSB of each mask byte
    for i, start in enumerate(range(0, mask.size, 8)):
        bit = mask[start + 7: start + 8]
        cur_label = check_labels[i]
        next_check_label = check_labels[i + 1] if (i + 1) < len(check_labels) else loc_next_expr
        write_label = write_labels[i]
        check = m2_expr.ExprAssign(ir.IRDst,
                                m2_expr.ExprCond(bit,
                                                 write_label,
                                                 next_check_label))
        blks.append(IRBlock(ir.loc_db, cur_label.loc_key, [AssignBlock([check], instr)]))

    # Build write blocks
    dst_addr = mRDI[instr.mode]
    for i, start in enumerate(range(0, mask.size, 8)):
        cur_label = write_labels[i]
        next_check_label = check_labels[i + 1] if (i + 1) < len(check_labels) else loc_next_expr
        write_addr = dst_addr + m2_expr.ExprInt(i, dst_addr.size)

        # @8[DI/EDI/RDI + i] = src[byte i]
        write_mem = m2_expr.ExprAssign(m2_expr.ExprMem(write_addr, 8),
                                    src[start: start + 8])
        jump = m2_expr.ExprAssign(ir.IRDst, next_check_label)
        blks.append(IRBlock(ir.loc_db, cur_label.loc_key, [AssignBlock([write_mem, jump], instr)]))

    # If mask is null, bypass all
    e = [m2_expr.ExprAssign(ir.IRDst, m2_expr.ExprCond(mask,
                                                    check_labels[0],
                                                    loc_next_expr))]
    return e, blks
+
+
# The following instructions are modeled as NOPs: EMMS (MMX state reset)
# and the CET instructions (shadow stack INCSSP/RDSSP/..., and the
# ENDBR32/ENDBR64 indirect-branch markers) have no effect in this model.

def emms(ir, instr):
    # Implemented as a NOP
    return [], []

def incssp(ir, instr, dst):
    # Implemented as a NOP
    return [], []

def rdssp(ir, instr, dst):
    # Implemented as a NOP
    return [], []

def saveprevssp(ir, instr):
    # Implemented as a NOP
    return [], []

def rstorssp(ir, instr, dst):
    # Implemented as a NOP
    return [], []

def wrss(ir, instr, src, dst):
    # Implemented as a NOP
    return [], []

def wruss(ir, instr, src, dst):
    # Implemented as a NOP
    return [], []

def setssbsy(ir, instr):
    # Implemented as a NOP
    return [], []

def clrssbsy(ir, instr, dst):
    # Implemented as a NOP
    return [], []

def endbr64(ir, instr):
    # Implemented as a NOP
    return [], []

def endbr32(ir, instr):
    # Implemented as a NOP
    return [], []
+
# Common value without too many option, 0x1fa0
STMXCSR_VALUE = 0x1fa0
def stmxcsr(ir, instr, dst):
    # STMXCSR: store a fixed default MXCSR value (real MXCSR is not modeled)
    return [m2_expr.ExprAssign(dst, m2_expr.ExprInt(STMXCSR_VALUE, dst.size))], []

def ldmxcsr(ir, instr, dst):
    # LDMXCSR: MXCSR is not modeled, so loading it is a NOP
    # Implemented as a NOP
    return [], []
+
+
def _select4(src, control):
    """Return the 32-bit lane of @src selected by the 2-bit @control.

    @control is already resolved (was an immediate).
    """
    # Implementation inspired from Intel Intrinsics Guide
    lanes = {
        0: src[:32],
        1: src[32:64],
        2: src[64:96],
        3: src[96:],
    }
    try:
        return lanes[control]
    except KeyError:
        raise ValueError("Control must be on 2 bits")
+
+
def shufps(ir, instr, dst, src, imm8):
    """SHUFPS: low two result dwords are picked from @dst, high two from
    @src; each lane is chosen by a 2-bit field of @imm8."""
    control = int(imm8)
    picked = []
    for lane in range(4):
        operand = dst if lane < 2 else src
        picked.append(_select4(operand, (control >> (lane * 2)) & 3))
    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(*picked))], []
+
+
def shufpd(ir, instr, dst, src, imm8):
    """SHUFPD: pick the low result qword from @dst and the high one from
    @src, each selected (low/high) by one bit of @imm8."""
    control = int(imm8)
    low = dst[64:] if control & 1 else dst[:64]
    high = src[64:] if control & 2 else src[:64]
    return [m2_expr.ExprAssign(dst, m2_expr.ExprCompose(low, high))], []
+
def movmskps(ir, instr, dst, src):
    """MOVMSKPS: pack the sign bits of the four fp32 lanes of @src into @dst."""
    signs = [src[pos + 31:pos + 32] for pos in range(0, 128, 32)]
    sign_bits = m2_expr.ExprCompose(*signs)
    return [m2_expr.ExprAssign(dst, sign_bits.zeroExtend(dst.size))], []
+
def movmskpd(ir, instr, dst, src):
    """MOVMSKPD: pack the sign bits of the two fp64 lanes of @src into @dst."""
    signs = [src[pos + 63:pos + 64] for pos in range(0, 128, 64)]
    sign_bits = m2_expr.ExprCompose(*signs)
    return [m2_expr.ExprAssign(dst, sign_bits.zeroExtend(dst.size))], []
+
def _roundscalar(ir, inst, dst, src, imm8, double):
    """Round the scalar low float of @src into the low part of @dst.

    @double: True for the 64-bit form (ROUNDSD), False for 32-bit (ROUNDSS).
    Bit 2 of @imm8 selects MXCSR-driven rounding; otherwise the two low
    bits of @imm8 encode the rounding mode.
    """
    size = 64 if double else 32
    ctl = int(imm8)
    if ctl & 0x4:
        # MXCSR rounding config: assumed to be round to nearest, ties to even
        op_name = 'fpround_towardsnearest'
    else:
        # Rounding mechanism encoded in the immediate
        op_name = {
            0x0: 'fpround_towardsnearest',
            0x1: 'fpround_down',
            0x2: 'fpround_up',
            0x3: 'fpround_towardszero',
        }[ctl & 0x3]
    rounded = m2_expr.ExprOp(op_name, src[:size])
    return [m2_expr.ExprAssign(dst[:size], rounded)], []
+
def roundss(ir, inst, dst, src, imm8):
    """ROUNDSS: round the low single-precision float of @src into @dst."""
    return _roundscalar(ir, inst, dst, src, imm8, False)
+
def roundsd(ir, inst, dst, src, imm8):
    """ROUNDSD: round the low double-precision float of @src into @dst."""
    return _roundscalar(ir, inst, dst, src, imm8, True)
+
def fxsave(_ir, _instr, _src):
    """FXSAVE: FPU/SSE state saving is not modeled yet; no-op."""
    return [], []
+
def fxrstor(_ir, _instr, _dst):
    """FXRSTOR: FPU/SSE state restore is not modeled yet; no-op."""
    return [], []
+
+
# Dispatch table: lowercase mnemonic -> semantic function.
# Each entry is invoked as func(lifter, instr, *args) and returns a tuple
# (list of ExprAssign, list of extra IRBlocks). Several mnemonics map to a
# shared implementation when their semantics coincide (e.g. condition-code
# aliases such as 'sete'/'setz').
mnemo_func = {'mov': mov,
              'xchg': xchg,
              'movzx': movzx,
              'movsx': movsx,
              'movsxd': movsx,
              'lea': lea,
              'add': add,
              'xadd': xadd,
              'adc': adc,
              'sub': sub,
              'sbb': sbb,
              'neg': neg,
              'not': l_not,
              'cmp': l_cmp,
              'xor': xor,
              'pxor': pxor,
              'or': l_or,
              'and': l_and,
              'test': l_test,
              'rol': l_rol,
              'ror': l_ror,
              'rcl': rcl,
              'rcr': rcr,
              'sar': sar,
              'shr': shr,
              'sal': shl,
              'shl': shl,
              'shld': shld,
              'cmc': cmc,
              'clc': clc,
              'stc': stc,
              'cld': cld,
              'std': std,
              'cli': cli,
              'sti': sti,
              'bsf': bsf,
              'bsr': bsr,
              'inc': inc,
              'dec': dec,
              'push': push,
              'pushw': pushw,
              'pop': pop,
              'popw': popw,
              'sete': sete,
              'setnz': setnz,
              'setl': setl,
              'setg': setg,
              'setge': setge,
              'seta': seta,
              'setae': setae,
              'setb': setb,
              'setbe': setbe,
              'setns': setns,
              'sets': sets,
              'seto': seto,
              'setp': setp,
              'setpe': setp,
              'setnp': setnp,
              'setpo': setnp,
              'setle': setle,
              'setng': setle,
              'setna': setna,
              'setnbe': setnbe,
              'setno': setno,
              'setnc': setnb,
              'setz': sete,
              'setne': setnz,
              'setnb': setae,
              'setnae': setb,
              'setc': setb,
              'setnge': setl,
              'setnl': setge,
              'setnle': setg,
              'setalc': setalc,
              'bswap': bswap,
              # String instructions: the size suffix selects the element width
              'cmpsb': lambda ir, instr: cmps(ir, instr, 8),
              'cmpsw': lambda ir, instr: cmps(ir, instr, 16),
              'cmpsd': lambda ir, instr: cmps(ir, instr, 32),
              'cmpsq': lambda ir, instr: cmps(ir, instr, 64),
              'scasb': lambda ir, instr: scas(ir, instr, 8),
              'scasw': lambda ir, instr: scas(ir, instr, 16),
              'scasd': lambda ir, instr: scas(ir, instr, 32),
              'scasq': lambda ir, instr: scas(ir, instr, 64),
              'pushfd': pushfd,
              'pushfq': pushfq,
              'pushfw': pushfw,
              'popfd': popfd,
              # NOTE(review): POPFQ reuses the 32-bit popfd implementation --
              # confirm 64-bit flag restore is handled inside popfd
              'popfq': popfd,
              'popfw': popfw,
              'pusha': pusha,
              'pushad': pushad,
              'popad': popad,
              'popa': popa,
              'call': call,
              'ret': ret,
              'retf': retf,
              'iret': iret,
              'iretd': iret,
              'leave': leave,
              'enter': enter,
              'jmp': jmp,
              'jz': jz,
              'je': jz,
              'jcxz': jcxz,
              'jecxz': jecxz,
              'jrcxz': jrcxz,
              'jnz': jnz,
              'jp': jp,
              'jpe': jp,
              'jnp': jnp,
              'ja': ja,
              'jae': jae,
              'jb': jb,
              'jbe': jbe,
              'jg': jg,
              'jge': jge,
              'jl': jl,
              'jle': jle,
              'js': js,
              'jns': jns,
              'jo': jo,
              'jno': jno,
              'loop': loop,
              'loopne': loopne,
              'loope': loope,
              'div': div,
              'mul': mul,
              'imul': imul,
              'idiv': idiv,

              # Sign extensions of accumulator (AL/AX/EAX)
              'cbw': cbw,
              'cwde': cwde,
              'cdqe': cdqe,

              # Sign extensions into the DX/EDX/RDX half
              'cwd': cwd,
              'cdq': cdq,
              'cqo': cqo,

              # BCD adjustments
              'daa': daa,
              'das': das,
              'aam': aam,
              'aad': aad,
              'aaa': aaa,
              'aas': aas,
              'shrd': shrd,
              'stosb': lambda ir, instr: stos(ir, instr, 8),
              'stosw': lambda ir, instr: stos(ir, instr, 16),
              'stosd': lambda ir, instr: stos(ir, instr, 32),
              'stosq': lambda ir, instr: stos(ir, instr, 64),

              'lodsb': lambda ir, instr: lods(ir, instr, 8),
              'lodsw': lambda ir, instr: lods(ir, instr, 16),
              'lodsd': lambda ir, instr: lods(ir, instr, 32),
              'lodsq': lambda ir, instr: lods(ir, instr, 64),

              'movsb': lambda ir, instr: movs(ir, instr, 8),
              'movsw': lambda ir, instr: movs(ir, instr, 16),
              # MOVSD is ambiguous: string move vs SSE scalar move; the
              # dispatcher picks based on the operand count
              'movsd': movsd_dispatch,
              'movsq': lambda ir, instr: movs(ir, instr, 64),
              'fcomp': fcomp,
              'fcompp': fcompp,
              'ficomp': ficomp,
              'fucom': fucom,
              'fucomp': fucomp,
              'fucompp': fucompp,
              'comiss': comiss,
              'comisd': comisd,
              'nop': nop,
              'ud2': ud2,
              'prefetch0': prefetch0,
              'prefetch1': prefetch1,
              'prefetch2': prefetch2,
              'prefetchw': prefetchw,
              'prefetchnta': prefetchnta,
              'lfence': lfence,
              'mfence': mfence,
              'sfence': sfence,
              'fnop': nop,  # XXX
              'hlt': hlt,
              'rdtsc': rdtsc,
              'fst': fst,
              'fstp': fstp,
              'fist': fist,
              'fistp': fistp,
              'fisttp': fisttp,
              'fld': fld,
              'fldz': fldz,
              'fld1': fld1,
              'fldl2t': fldl2t,
              'fldpi': fldpi,
              'fldln2': fldln2,
              'fldl2e': fldl2e,
              'fldlg2': fldlg2,
              'fild': fild,
              'fadd': fadd,
              'fiadd': fiadd,
              'fisub': fisub,
              'fisubr': fisubr,
              'fpatan': fpatan,
              'fprem': fprem,
              'fprem1': fprem1,
              'fninit': fninit,
              'fyl2x': fyl2x,
              'faddp': faddp,
              'fsub': fsub,
              'fsubp': fsubp,
              'fsubr': fsubr,
              'fsubrp': fsubrp,
              'fmul': fmul,
              'fimul': fimul,
              'fmulp': fmulp,
              'fdiv': fdiv,
              'fdivr': fdivr,
              'fdivrp': fdivrp,
              'fidiv': fidiv,
              'fidivr': fidivr,
              'fdivp': fdivp,
              'fxch': fxch,
              'fptan': fptan,
              'frndint': frndint,
              'fsin': fsin,
              'fcos': fcos,
              'fsincos': fsincos,
              'fscale': fscale,
              'f2xm1': f2xm1,
              'fchs': fchs,
              'fsqrt': fsqrt,
              'fabs': fabs,
              'fnstsw': fnstsw,
              'fnstcw': fnstcw,
              'fldcw': fldcw,
              'fwait': fwait,
              'fcmovb':   fcmovb,
              'fcmove':   fcmove,
              'fcmovbe':  fcmovbe,
              'fcmovu':   fcmovu,
              'fcmovnb':  fcmovnb,
              'fcmovne':  fcmovne,
              'fcmovnbe': fcmovnbe,
              'fcmovnu':  fcmovnu,
              'fnstenv': fnstenv,
              'fldenv': fldenv,
              'sidt': sidt,
              'sldt': sldt,
              'arpl': arpl,
              'cmovz': cmovz,
              'cmove': cmovz,
              'cmovnz': cmovnz,
              'cmovpe': cmovpe,
              'cmovnp': cmovnp,
              'cmovge': cmovge,
              'cmovnl': cmovge,
              'cmovg': cmovg,
              'cmovl': cmovl,
              'cmova': cmova,
              'cmovae': cmovae,
              'cmovbe': cmovbe,
              'cmovb': cmovb,
              'cmovnge': cmovl,
              'cmovle': cmovle,
              'cmovng': cmovle,
              'cmovo': cmovo,
              'cmovno': cmovno,
              'cmovs': cmovs,
              'cmovns': cmovns,
              'icebp': icebp,
              'int': l_int,
              'xlat': xlat,
              'bt': bt,
              'cpuid': cpuid,
              'fcom': fcom,
              'ftst': ftst,
              'fxam': fxam,
              'ficom': ficom,
              'fcomi': fcomi,
              'fcomip': fcomip,
              'fucomi': fucomi,
              'fucomip': fucomip,
              'insb': lambda ir, instr: ins(ir, instr, 8),
              'insw': lambda ir, instr: ins(ir, instr, 16),
              'insd': lambda ir, instr: ins(ir, instr, 32),
              'btc': btc,
              'bts': bts,
              'btr': btr,
              'into': into,
              'in': l_in,
              'outsb': lambda ir, instr: l_outs(ir, instr, 8),
              'outsw': lambda ir, instr: l_outs(ir, instr, 16),
              'outsd': lambda ir, instr: l_outs(ir, instr, 32),

              'out': l_out,
              "sysenter": l_sysenter,
              "syscall": l_syscall,
              "cmpxchg": cmpxchg,
              "cmpxchg8b": cmpxchg8b,
              "lds": lds,
              "les": les,
              "lss": lss,
              "lfs": lfs,
              "lgs": lgs,
              "lahf": lahf,
              "sahf": sahf,
              "lar": lar,
              "lsl": lsl,
              "fclex": fclex,
              "fnclex": fnclex,
              "str": l_str,
              "movd": movd,
              "movdqu": movdqu,
              "movdqa": movdqu,  # alignment check not modeled
              "movapd": movapd,  # XXX TODO alignment check
              "movupd": movapd,  # XXX TODO alignment check
              "movaps": movapd,  # XXX TODO alignment check
              "movups": movapd,  # XXX TODO alignment check
              "andps": andps,
              "andpd": andps,
              "andnps": andnps,
              "andnpd": andnps,
              "orps": orps,
              "orpd": orps,
              "xorps": xorps,
              "xorpd": xorps,

              "movq": movq,

              "pminsw": pminsw,
              "cvtdq2pd": cvtdq2pd,
              "cvtdq2ps": cvtdq2ps,
              "cvtpd2dq": cvtpd2dq,
              "cvtpd2pi": cvtpd2pi,
              "cvtpd2ps": cvtpd2ps,
              "cvtpi2pd": cvtpi2pd,
              "cvtpi2ps": cvtpi2ps,
              "cvtps2dq": cvtps2dq,
              "cvtps2pd": cvtps2pd,
              "cvtps2pi": cvtps2pi,
              "cvtsd2si": cvtsd2si,
              "cvtsd2ss": cvtsd2ss,
              "cvtsi2sd": cvtsi2sd,
              "cvtsi2ss": cvtsi2ss,
              "cvtss2sd": cvtss2sd,
              "cvtss2si": cvtss2si,
              "cvttpd2pi": cvttpd2pi,
              "cvttpd2dq": cvttpd2dq,
              "cvttps2dq": cvttps2dq,
              "cvttps2pi": cvttps2pi,
              "cvttsd2si": cvttsd2si,
              "cvttss2si": cvttss2si,


              "bndmov": bndmov,




              "movss": movss,

              "ucomiss": ucomiss,
              "ucomisd": ucomisd,

              # BMI operations
              "blsi": blsi,
              "andn": andn,
              "bextr": bextr,
              "blsmsk": blsmsk,
              "blsr": blsr,
              "tzcnt": tzcnt,
              "bzhi": bzhi,

              #
              # MMX/AVX/SSE operations

              # Arithmetic (integers)
              #

              # Additions
              # SSE
              "paddb": paddb,
              "paddw": paddw,
              "paddd": paddd,
              "paddq": paddq,

              # Subtractions
              # SSE
              "psubb": psubb,
              "psubw": psubw,
              "psubd": psubd,
              "psubq": psubq,

              # Multiplications
              # SSE
              "pmullb": pmullb,
              "pmullw": pmullw,
              "pmulld": pmulld,
              "pmullq": pmullq,
              "pmulhub": pmulhub,
              "pmulhuw": pmulhuw,
              "pmulhud": pmulhud,
              "pmulhuq": pmulhuq,
              "pmulhb": pmulhb,
              "pmulhw": pmulhw,
              "pmulhd": pmulhd,
              "pmulhq": pmulhq,
              "pmuludq": pmuludq,

              # Mix
              # SSE
              "pmaddwd": pmaddwd,
              "psadbw": psadbw,
              "pavgb": pavgb,
              "pavgw": pavgw,

              # Arithmetic (floating-point)
              #

              # Additions
              # SSE
              "addss": addss,
              "addsd": addsd,
              "addps": addps,
              "addpd": addpd,

              # Subtractions
              # SSE
              "subss": subss,
              "subsd": subsd,
              "subps": subps,
              "subpd": subpd,

              # Multiplications
              # SSE
              "mulss": mulss,
              "mulsd": mulsd,
              "mulps": mulps,
              "mulpd": mulpd,

              # Divisions
              # SSE
              "divss": divss,
              "divsd": divsd,
              "divps": divps,
              "divpd": divpd,

              # Rounding
              "roundss": roundss,
              "roundsd": roundsd,

              # Comparisons (floating-point)
              #
              "minps": minps,
              "minpd": minpd,
              "minss": minss,
              "minsd": minsd,
              "maxps": maxps,
              "maxpd": maxpd,
              "maxss": maxss,
              "maxsd": maxsd,
              "cmpeqps": cmpeqps,
              "cmpeqpd": cmpeqpd,
              "cmpeqss": cmpeqss,
              "cmpeqsd": cmpeqsd,
              "cmpltps": cmpltps,
              "cmpltpd": cmpltpd,
              "cmpltss": cmpltss,
              "cmpltsd": cmpltsd,
              "cmpleps": cmpleps,
              "cmplepd": cmplepd,
              "cmpless": cmpless,
              "cmplesd": cmplesd,
              "cmpunordps": cmpunordps,
              "cmpunordpd": cmpunordpd,
              "cmpunordss": cmpunordss,
              "cmpunordsd": cmpunordsd,
              "cmpneqps": cmpneqps,
              "cmpneqpd": cmpneqpd,
              "cmpneqss": cmpneqss,
              "cmpneqsd": cmpneqsd,
              "cmpnltps": cmpnltps,
              "cmpnltpd": cmpnltpd,
              "cmpnltss": cmpnltss,
              "cmpnltsd": cmpnltsd,
              "cmpnleps": cmpnleps,
              "cmpnlepd": cmpnlepd,
              "cmpnless": cmpnless,
              "cmpnlesd": cmpnlesd,
              "cmpordps": cmpordps,
              "cmpordpd": cmpordpd,
              "cmpordss": cmpordss,
              "cmpordsd": cmpordsd,

              # Logical (floating-point)
              #

              "pand": pand,
              "pandn": pandn,
              "por": por,

              "rdmsr": rdmsr,
              "wrmsr": wrmsr,
              "pshufb": pshufb,
              "pshufd": pshufd,
              "pshuflw": pshuflw,
              "pshufhw": pshufhw,

              "psrlw": psrlw,
              "psrld": psrld,
              "psrlq": psrlq,
              "psllw": psllw,
              "pslld": pslld,
              "psllq": psllq,
              "pslldq": pslldq,
              "psrldq": psrldq,
              "psraw": psraw,
              "psrad": psrad,

              "palignr": palignr,

              "pmaxub": pmaxub,
              "pmaxuw": pmaxuw,
              "pmaxud": pmaxud,
              "pmaxsw": pmaxsw,

              "pminub": pminub,
              "pminuw": pminuw,
              "pminud": pminud,

              "pcmpeqb": pcmpeqb,
              "pcmpeqw": pcmpeqw,
              "pcmpeqd": pcmpeqd,
              "pcmpeqq": pcmpeqq,

              "pcmpgtb": pcmpgtb,
              "pcmpgtw": pcmpgtw,
              "pcmpgtd": pcmpgtd,
              "pcmpgtq": pcmpgtq,

              "punpckhbw": punpckhbw,
              "punpckhwd": punpckhwd,
              "punpckhdq": punpckhdq,
              "punpckhqdq": punpckhqdq,


              "punpcklbw": punpcklbw,
              "punpcklwd": punpcklwd,
              "punpckldq": punpckldq,
              "punpcklqdq": punpcklqdq,

              "pinsrb": pinsrb,
              "pinsrw": pinsrw,
              "pinsrd": pinsrd,
              "pinsrq": pinsrq,

              "pextrb": pextrb,
              "pextrw": pextrw,
              "pextrd": pextrd,
              "pextrq": pextrq,

              "unpckhps": unpckhps,
              "unpckhpd": unpckhpd,
              "unpcklps": unpcklps,
              "unpcklpd": unpcklpd,

              "movlpd": movlpd,
              "movlps": movlps,
              "movhpd": movhpd,
              "movhps": movhpd,
              "movlhps": movlhps,
              "movhlps": movhlps,
              "movdq2q": movdq2q,
              "movq2dq": movq2dq,

              "sqrtpd": sqrtpd,
              "sqrtps": sqrtps,
              "sqrtsd": sqrtsd,
              "sqrtss": sqrtss,

              "pmovmskb": pmovmskb,

              "packsswb": packsswb,
              "packssdw": packssdw,
              "packuswb": packuswb,

              "psubusb": psubusb,
              "psubusw": psubusw,
              "paddusb": paddusb,
              "paddusw": paddusw,
              "psubsb": psubsb,
              "psubsw": psubsw,
              "paddsb": paddsb,
              "paddsw": paddsw,

              "smsw": smsw,
              "maskmovq": maskmovq,
              # NOTE(review): MASKMOVDQU reuses the 64-bit maskmovq handler --
              # confirm the 128-bit width is handled by the implementation
              "maskmovdqu": maskmovq,
              "emms": emms,
              "shufps": shufps,
              "shufpd": shufpd,
              "movmskps": movmskps,
              "movmskpd": movmskpd,
              "stmxcsr": stmxcsr,
              "ldmxcsr": ldmxcsr,

              # CET (Control-flow Enforcement Technology)
              "incssp": incssp,
              "rdssp": rdssp,
              "saveprevssp": saveprevssp,
              "rstorssp": rstorssp,
              "wrss": wrss,
              "wruss": wruss,
              "setssbsy": setssbsy,
              "clrssbsy": clrssbsy,
              "endbr64": endbr64,
              "endbr32": endbr32,
              "fxsave": fxsave,
              "fxrstor": fxrstor,
              }
+
+
class Lifter_X86_16(Lifter):
    """Lift x86 16-bit instructions to Miasm IR.

    Also serves as the base class for the 32- and 64-bit lifters below,
    which override the registers, sizes and (for 64-bit) the RIP fixup.
    """

    def __init__(self, loc_db):
        Lifter.__init__(self, mn_x86, 16, loc_db)
        # Segmentation handling is off by default; callers can enable these
        # flags to make memory accesses segment-relative in get_ir()
        self.do_stk_segm = False
        self.do_ds_segm = False
        self.do_str_segm = False
        self.do_all_segm = False
        self.pc = IP
        self.sp = SP
        self.IRDst = m2_expr.ExprId('IRDst', 16)
        # Size of memory pointer access in IR
        # 16 bit mode memory accesses may be greater than 16 bits
        # 32 bit size may be enough
        self.addrsize = 32

    def mod_pc(self, instr, instr_ir, extra_ir):
        # PC fixup hook: a no-op here, overridden by the 64-bit lifter to
        # replace RIP with the next instruction address
        pass

    def ExprMem(self, ptr, size):
        """Generate a memory access to @ptr
        The ptr is resized to a fixed size self.addrsize

        @ptr: Expr instance to the memory address
        @size: size of the memory"""

        return m2_expr.ExprMem(expraddr(self.addrsize, ptr), size)

    def gen_segm_expr(self, selector, addr):
        """Build a segmented pointer 'segm(@selector, @addr)', with @addr
        zero-extended to self.addrsize."""
        ptr = m2_expr.ExprOp(
            'segm',
            selector,
            addr.zeroExtend(self.addrsize)
        )

        return ptr

    def get_ir(self, instr):
        """Return (assignments, extra_irblocks) implementing @instr.

        Applies segment-override prefixes to memory operands, dispatches to
        mnemo_func, then (if the instruction carries a REP/REPE/REPNE
        prefix) wraps the semantics in an IR loop driven by CX/ECX/RCX.
        """
        args = instr.args[:]
        args = [arg.replace_expr(float_replace) for arg in args]
        args = fix_mem_args_size(instr, *args)
        my_ss = None
        if self.do_ds_segm:
            my_ss = DS
        # g2 prefix encodes a segment override; map it to the selector
        if self.do_all_segm and instr.additional_info.g2.value:
            my_ss = {1: CS, 2: SS, 3: DS, 4: ES, 5: FS, 6: GS}[
                instr.additional_info.g2.value]
        if my_ss is not None:
            # Rewrite non-segmented memory operands as segment-relative
            for i, a in enumerate(args):
                if a.is_mem() and not is_mem_segm(a):
                    args[i] = self.ExprMem(m2_expr.ExprOp('segm', my_ss,
                                                          a.ptr), a.size)

        if not instr.name.lower() in mnemo_func:
            raise NotImplementedError(
                "Mnemonic %s not implemented" % instr.name)

        instr_ir, extra_ir = mnemo_func[
            instr.name.lower()](self, instr, *args)
        self.mod_pc(instr, instr_ir, extra_ir)
        instr.additional_info.except_on_instr = False
        # g1 & 14 covers the repeat prefixes (REPNE=2, REP/REPE=4/8, see
        # the zf tests below); without one, return the plain semantics
        if instr.additional_info.g1.value & 14 == 0 or \
                not instr.name in repeat_mn:
            return instr_ir, extra_ir
        # Two-operand MOVSD is the SSE scalar move, not the string op:
        # never expand it into a REP loop
        if instr.name == "MOVSD" and len(instr.args) == 2:
            return instr_ir, extra_ir

        instr.additional_info.except_on_instr = True
        admode = instr.v_admode()
        c_reg = mRCX[instr.mode][:admode]

        zf_val = None
        # set if zf is tested (cmps, scas)
        for e in instr_ir:  # +[updt_c]:
            if e.dst == zf:
                zf_val = e.src

        # 1 iff the counter becomes zero after this iteration's decrement
        cond_dec = m2_expr.ExprCond(c_reg - m2_expr.ExprInt(1, c_reg.size),
                                    m2_expr.ExprInt(0, 1), m2_expr.ExprInt(1, 1))
        # end condition
        if zf_val is None:
            c_cond = cond_dec
        elif instr.additional_info.g1.value & 2:  # REPNE and REPNZ
            c_cond = cond_dec | zf
        elif instr.additional_info.g1.value & 12:  # REPE, REP and REPZ
            c_cond = cond_dec | (zf ^ m2_expr.ExprInt(1, 1))

        # gen while
        # loc_do: one loop body iteration; loc_end: counter test/decrement;
        # loc_skip: fall-through once the loop terminates
        loc_do, loc_do_expr = self.gen_loc_key_and_expr(self.IRDst.size)
        loc_end, loc_end_expr = self.gen_loc_key_and_expr(self.IRDst.size)
        loc_skip = self.get_next_loc_key(instr)
        loc_skip_expr = m2_expr.ExprLoc(loc_skip, self.IRDst.size)
        loc_next = self.get_next_loc_key(instr)
        loc_next_expr = m2_expr.ExprLoc(loc_next, self.IRDst.size)

        # Redirect the body's "next instruction" jumps to the loop check
        fix_next_loc = {loc_next_expr: loc_end_expr}
        new_extra_ir = [irblock.modify_exprs(mod_src=lambda expr: expr.replace_expr(fix_next_loc))
                        for irblock in extra_ir]

        cond_bloc = []
        cond_bloc.append(m2_expr.ExprAssign(c_reg,
                                         c_reg - m2_expr.ExprInt(1,
                                                                 c_reg.size)))
        cond_bloc.append(m2_expr.ExprAssign(self.IRDst, m2_expr.ExprCond(c_cond,
                                                                      loc_skip_expr,
                                                                      loc_do_expr)))
        cond_bloc = IRBlock(self.loc_db, loc_end, [AssignBlock(cond_bloc, instr)])
        e_do = instr_ir

        c = IRBlock(self.loc_db, loc_do, [AssignBlock(e_do, instr)])
        # Entry: skip the loop entirely when the counter starts at zero
        e_n = [m2_expr.ExprAssign(self.IRDst, m2_expr.ExprCond(c_reg, loc_do_expr,
                                                            loc_skip_expr))]
        return e_n, [cond_bloc, c] + new_extra_ir

    def expr_fix_regs_for_mode(self, e, mode=64):
        """Rewrite sub-registers in @e to mode-canonical registers."""
        return e.replace_expr(replace_regs[mode])

    def expraff_fix_regs_for_mode(self, e, mode=64):
        """Apply expr_fix_regs_for_mode to both sides of assignment @e."""
        dst = self.expr_fix_regs_for_mode(e.dst, mode)
        src = self.expr_fix_regs_for_mode(e.src, mode)
        return m2_expr.ExprAssign(dst, src)

    def irbloc_fix_regs_for_mode(self, irblock, mode=64):
        """Return a copy of @irblock with registers fixed for @mode."""
        irs = []
        for assignblk in irblock:
            new_assignblk = dict(assignblk)
            for dst, src in viewitems(assignblk):
                del new_assignblk[dst]
                # Special case for 64 bits:
                # If destination is a 32 bit reg, zero extend the 64 bit reg
                if mode == 64:
                    if (isinstance(dst, m2_expr.ExprId) and
                            dst.size == 32 and
                            dst in replace_regs[64]):
                        src = src.zeroExtend(64)
                        dst = replace_regs[64][dst].arg
                dst = self.expr_fix_regs_for_mode(dst, mode)
                src = self.expr_fix_regs_for_mode(src, mode)
                new_assignblk[dst] = src
            irs.append(AssignBlock(new_assignblk, assignblk.instr))
        return IRBlock(self.loc_db, irblock.loc_key, irs)
+
+
class Lifter_X86_32(Lifter_X86_16):
    """Lifter for 32-bit x86: EIP/ESP and 32-bit IR destination/addressing."""

    def __init__(self, loc_db):
        Lifter.__init__(self, mn_x86, 32, loc_db)
        # Architectural program counter / stack pointer and IR sizes
        self.pc = EIP
        self.sp = ESP
        self.IRDst = m2_expr.ExprId('IRDst', 32)
        self.addrsize = 32
        # Segmentation handling is disabled by default
        self.do_stk_segm = False
        self.do_ds_segm = False
        self.do_str_segm = False
        self.do_all_segm = False
+
+
+class Lifter_X86_64(Lifter_X86_16):
+
+    def __init__(self, loc_db):
+        Lifter.__init__(self, mn_x86, 64, loc_db)
+        self.do_stk_segm = False
+        self.do_ds_segm = False
+        self.do_str_segm = False
+        self.do_all_segm = False
+        self.pc = RIP
+        self.sp = RSP
+        self.IRDst = m2_expr.ExprId('IRDst', 64)
+        self.addrsize = 64
+
+    def mod_pc(self, instr, instr_ir, extra_ir):
+        # fix RIP for 64 bit
+        pc_fixed = {self.pc: m2_expr.ExprInt(instr.offset + instr.l, 64)}
+
+        for i, expr in enumerate(instr_ir):
+            dst, src = expr.dst, expr.src
+            if dst != self.pc:
+                dst = dst.replace_expr(pc_fixed)
+            src = src.replace_expr(pc_fixed)
+            instr_ir[i] = m2_expr.ExprAssign(dst, src)
+
+        for idx, irblock in enumerate(extra_ir):
+            extra_ir[idx] = irblock.modify_exprs(lambda expr: expr.replace_expr(pc_fixed) \
+                                                 if expr != self.pc else expr,
+                                                 lambda expr: expr.replace_expr(pc_fixed))