about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorCamille Mougey <commial@gmail.com>2016-01-08 14:07:24 +0100
committerCamille Mougey <commial@gmail.com>2016-01-08 14:07:24 +0100
commitc86694812fcd935a7310e7ce96018d973c464f56 (patch)
tree9bb4d742967a9b1123ae586f2b4c26b0eff02e39
parentee96ee5e19f6c6712a871a19c07eb89c35e3dce5 (diff)
parent55c24ef7ad982a700fcf1f3ae4296cf99019fbb5 (diff)
downloadmiasm-c86694812fcd935a7310e7ce96018d973c464f56.tar.gz
miasm-c86694812fcd935a7310e7ce96018d973c464f56.zip
Merge pull request #297 from serpilliere/add_sse_instr
Add sse instr
-rw-r--r--example/jitter/x86_32.py1
-rw-r--r--miasm2/arch/x86/arch.py315
-rw-r--r--miasm2/arch/x86/sem.py1127
-rw-r--r--miasm2/jitter/arch/JitCore_x86.c37
-rw-r--r--test/arch/x86/arch.py283
-rw-r--r--test/arch/x86/unit/mn_pcmpeq.py64
-rw-r--r--test/arch/x86/unit/mn_pextr.py25
-rw-r--r--test/arch/x86/unit/mn_pinsr.py25
-rw-r--r--test/arch/x86/unit/mn_pmaxu.py25
-rw-r--r--test/arch/x86/unit/mn_pminu.py25
-rw-r--r--test/arch/x86/unit/mn_pmovmskb.py26
-rw-r--r--test/arch/x86/unit/mn_pshufb.py25
-rw-r--r--test/arch/x86/unit/mn_psrl_psll.py55
-rw-r--r--test/arch/x86/unit/mn_punpck.py124
-rw-r--r--test/test_all.py9
15 files changed, 1893 insertions, 273 deletions
diff --git a/example/jitter/x86_32.py b/example/jitter/x86_32.py
index 1b2aa012..2eee1742 100644
--- a/example/jitter/x86_32.py
+++ b/example/jitter/x86_32.py
@@ -38,4 +38,3 @@ myjit.add_breakpoint(0x1337beef, code_sentinelle)
 
 myjit.init_run(run_addr)
 myjit.continue_run()
-del(myjit)
diff --git a/miasm2/arch/x86/arch.py b/miasm2/arch/x86/arch.py
index 74ac0939..36e2e3b7 100644
--- a/miasm2/arch/x86/arch.py
+++ b/miasm2/arch/x86/arch.py
@@ -2151,6 +2151,54 @@ class x86_rm_wd(x86_rm_sd):
             yield x
 
 
+class x86_rm_08(x86_rm_arg):
+    msize = 8
+
+    def decode(self, v):
+        p = self.parent
+        xx = self.get_modrm()
+        expr = modrm2expr(xx, p, 0)
+        if not isinstance(expr, ExprMem):
+            self.expr = expr
+            return True
+        self.expr = ExprMem(expr.arg, self.msize)
+        return self.expr is not None
+
+    def encode(self):
+        if isinstance(self.expr, ExprInt):
+            raise StopIteration
+        p = self.parent
+        v_cand, segm, ok = expr2modrm(self.expr, p, 0, 0, 0, 0)
+        for x in self.gen_cand(v_cand, p.v_admode()):
+            yield x
+
+class x86_rm_reg_m08(x86_rm_arg):
+    msize = 8
+
+    def decode(self, v):
+        ret = x86_rm_arg.decode(self, v)
+        if not ret:
+            return ret
+        if not isinstance(self.expr, ExprMem):
+            return True
+        self.expr = ExprMem(self.expr.arg, self.msize)
+        return self.expr is not None
+
+    def encode(self):
+        if isinstance(self.expr, ExprInt):
+            raise StopIteration
+        p = self.parent
+        if isinstance(self.expr, ExprMem):
+            expr = ExprMem(self.expr.arg, 32)
+        else:
+            expr = self.expr
+        v_cand, segm, ok = expr2modrm(expr, p, 1, 0, 0, 0)
+        for x in self.gen_cand(v_cand, p.v_admode()):
+            yield x
+
+class x86_rm_reg_m16(x86_rm_reg_m08):
+    msize = 16
+
 class x86_rm_m64(x86_rm_arg):
     msize = 64
 
@@ -2222,8 +2270,11 @@ class x86_rm_mm(x86_rm_m80):
         p = self.parent
         xx = self.get_modrm()
         expr = modrm2expr(xx, p, 0, 0, self.is_xmm, self.is_mm)
-        if isinstance(expr, ExprMem) and expr.size != self.msize:
-            expr = ExprMem(expr.arg, self.msize)
+        if isinstance(expr, ExprMem):
+            if self.msize is None:
+                return False
+            if expr.size != self.msize:
+                expr = ExprMem(expr.arg, self.msize)
         self.expr = expr
         return True
 
@@ -2271,6 +2322,16 @@ class x86_rm_xmm_m64(x86_rm_mm):
     is_xmm = True
 
 
+class x86_rm_xmm_reg(x86_rm_mm):
+    msize = None
+    is_mm = False
+    is_xmm = True
+
+class x86_rm_mm_reg(x86_rm_mm):
+    msize = None
+    is_mm = True
+    is_xmm = False
+
 class x86_rm_reg_noarg(object):
     prio = default_prio + 1
 
@@ -3068,17 +3129,22 @@ rm_arg_sx = bs(l=0, cls=(x86_rm_sx,), fname='rmarg')
 rm_arg_sxd = bs(l=0, cls=(x86_rm_sxd,), fname='rmarg')
 rm_arg_sd = bs(l=0, cls=(x86_rm_sd,), fname='rmarg')
 rm_arg_wd = bs(l=0, cls=(x86_rm_wd,), fname='rmarg')
+rm_arg_08 = bs(l=0, cls=(x86_rm_08,), fname='rmarg')
+rm_arg_reg_m08 = bs(l=0, cls=(x86_rm_reg_m08,), fname='rmarg')
+rm_arg_reg_m16 = bs(l=0, cls=(x86_rm_reg_m16,), fname='rmarg')
+rm_arg_m08 = bs(l=0, cls=(x86_rm_m08,), fname='rmarg')
 rm_arg_m64 = bs(l=0, cls=(x86_rm_m64,), fname='rmarg')
 rm_arg_m80 = bs(l=0, cls=(x86_rm_m80,), fname='rmarg')
-rm_arg_m08 = bs(l=0, cls=(x86_rm_m08,), fname='rmarg')
 rm_arg_m16 = bs(l=0, cls=(x86_rm_m16,), fname='rmarg')
 
 rm_arg_mm = bs(l=0, cls=(x86_rm_mm,), fname='rmarg')
 rm_arg_mm_m64 = bs(l=0, cls=(x86_rm_mm_m64,), fname='rmarg')
+rm_arg_mm_reg = bs(l=0, cls=(x86_rm_mm_reg,), fname='rmarg')
 
 rm_arg_xmm = bs(l=0, cls=(x86_rm_xmm,), fname='rmarg')
 rm_arg_xmm_m32 = bs(l=0, cls=(x86_rm_xmm_m32,), fname='rmarg')
 rm_arg_xmm_m64 = bs(l=0, cls=(x86_rm_xmm_m64,), fname='rmarg')
+rm_arg_xmm_reg = bs(l=0, cls=(x86_rm_xmm_reg,), fname='rmarg')
 
 swapargs = bs_swapargs(l=1, fname="swap", mn_mod=range(1 << 1))
 
@@ -3725,7 +3791,7 @@ addop("sbb", [bs("100000"), se, w8] + rmmod(d3, rm_arg_w8) + [d_imm])
 addop("sbb", [bs("000110"), swapargs, w8] +
       rmmod(rmreg, rm_arg_w8), [rm_arg_w8, rmreg])
 
-addop("set", [bs8(0x0f), bs('1001'), cond] + rmmod(regnoarg, rm_arg_m08))
+addop("set", [bs8(0x0f), bs('1001'), cond] + rmmod(regnoarg, rm_arg_08))
 addop("sgdt", [bs8(0x0f), bs8(0x01)] + rmmod(d0, modrm=mod_mem))
 addop("shld", [bs8(0x0f), bs8(0xa4)] +
       rmmod(rmreg) + [u08], [rm_arg, rmreg, u08])
@@ -3807,6 +3873,22 @@ addop("movdqu", [bs8(0x0f), bs("011"), swapargs, bs("1111"), pref_f3]
 addop("movdqa", [bs8(0x0f), bs("011"), swapargs, bs("1111"), pref_66]
       + rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
 
+addop("movhpd", [bs8(0x0f), bs("0001011"), swapargs, pref_66] +
+      rmmod(xmm_reg, rm_arg_m64), [xmm_reg, rm_arg_m64])
+addop("movhps", [bs8(0x0f), bs("0001011"), swapargs, no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_m64), [xmm_reg, rm_arg_m64])
+addop("movlpd", [bs8(0x0f), bs("0001001"), swapargs, pref_66] +
+      rmmod(xmm_reg, rm_arg_m64), [xmm_reg, rm_arg_m64])
+addop("movlps", [bs8(0x0f), bs("0001001"), swapargs, no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_m64), [xmm_reg, rm_arg_m64])
+
+addop("movhlps", [bs8(0x0f), bs8(0x12), no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_xmm_reg), [xmm_reg, rm_arg_xmm_reg])
+addop("movlhps", [bs8(0x0f), bs8(0x16), no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_xmm_reg), [xmm_reg, rm_arg_xmm_reg])
+
+addop("movdq2q", [bs8(0x0f), bs8(0xd6), pref_f2] +
+      rmmod(mm_reg, rm_arg_xmm_reg), [mm_reg, rm_arg_xmm_reg])
 
 
 ## Additions
@@ -3867,6 +3949,9 @@ addop("xorpd", [bs8(0x0f), bs8(0x57), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
 addop("andps", [bs8(0x0f), bs8(0x54), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm))
 addop("andpd", [bs8(0x0f), bs8(0x54), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
 
+addop("andnps", [bs8(0x0f), bs8(0x55), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm))
+addop("andnpd", [bs8(0x0f), bs8(0x55), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
+
 ## OR
 addop("orps", [bs8(0x0f), bs8(0x56), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm))
 addop("orpd", [bs8(0x0f), bs8(0x56), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
@@ -3879,6 +3964,14 @@ addop("pand", [bs8(0x0f), bs8(0xdb), no_xmm_pref] +
 addop("pand", [bs8(0x0f), bs8(0xdb), pref_66] +
       rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
 
+## ANDN
+# MMX
+addop("pandn", [bs8(0x0f), bs8(0xdf), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+# SSE
+addop("pandn", [bs8(0x0f), bs8(0xdf), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
 ## OR
 # MMX
 addop("por", [bs8(0x0f), bs8(0xeb), no_xmm_pref] +
@@ -3895,6 +3988,15 @@ addop("pxor", [bs8(0x0f), bs8(0xef), no_xmm_pref] +
 addop("pxor", [bs8(0x0f), bs8(0xef), pref_66] +
       rmmod(xmm_reg, rm_arg_xmm))
 
+addop("pshufb", [bs8(0x0f), bs8(0x38), bs8(0x00), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pshufb", [bs8(0x0f), bs8(0x38), bs8(0x00), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+addop("pshufd", [bs8(0x0f), bs8(0x70), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm) + [u08])
+
+
+
 ### Convert
 ### SS = single precision
 ### SD = double precision
@@ -3948,6 +4050,211 @@ addop("cvttsd2si",[bs8(0x0f), bs8(0x2c), pref_f2]
 addop("cvttss2si",[bs8(0x0f), bs8(0x2c), pref_f3]
       + rmmod(reg, rm_arg_xmm_m32))
 
+addop("psrlq", [bs8(0x0f), bs8(0x73), no_xmm_pref] +
+      rmmod(d2, rm_arg_mm) + [u08], [rm_arg_mm, u08])
+addop("psrlq", [bs8(0x0f), bs8(0x73), pref_66] +
+      rmmod(d2, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("psrlq", [bs8(0x0f), bs8(0xd3), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+addop("psrlq", [bs8(0x0f), bs8(0xd3), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+
+addop("psrld", [bs8(0x0f), bs8(0x72), no_xmm_pref] +
+      rmmod(d2, rm_arg_mm) + [u08], [rm_arg_mm, u08])
+addop("psrld", [bs8(0x0f), bs8(0x72), pref_66] +
+      rmmod(d2, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("psrld", [bs8(0x0f), bs8(0xd2), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+addop("psrld", [bs8(0x0f), bs8(0xd2), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+
+addop("psrlw", [bs8(0x0f), bs8(0x71), no_xmm_pref] +
+      rmmod(d2, rm_arg_mm) + [u08], [rm_arg_mm, u08])
+addop("psrlw", [bs8(0x0f), bs8(0x71), pref_66] +
+      rmmod(d2, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("psrlw", [bs8(0x0f), bs8(0xd1), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+addop("psrlw", [bs8(0x0f), bs8(0xd1), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+
+
+addop("psllq", [bs8(0x0f), bs8(0x73), no_xmm_pref] +
+      rmmod(d6, rm_arg_mm) + [u08], [rm_arg_mm, u08])
+addop("psllq", [bs8(0x0f), bs8(0x73), pref_66] +
+      rmmod(d6, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("psllq", [bs8(0x0f), bs8(0xf3), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+addop("psllq", [bs8(0x0f), bs8(0xf3), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+
+addop("pslld", [bs8(0x0f), bs8(0x72), no_xmm_pref] +
+      rmmod(d6, rm_arg_mm) + [u08], [rm_arg_mm, u08])
+addop("pslld", [bs8(0x0f), bs8(0x72), pref_66] +
+      rmmod(d6, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("pslld", [bs8(0x0f), bs8(0xf2), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+addop("pslld", [bs8(0x0f), bs8(0xf2), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+
+addop("psllw", [bs8(0x0f), bs8(0x71), no_xmm_pref] +
+      rmmod(d6, rm_arg_mm) + [u08], [rm_arg_mm, u08])
+addop("psllw", [bs8(0x0f), bs8(0x71), pref_66] +
+      rmmod(d6, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
+
+addop("psllw", [bs8(0x0f), bs8(0xf1), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+addop("psllw", [bs8(0x0f), bs8(0xf1), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+
+
+addop("pmaxub", [bs8(0x0f), bs8(0xde), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pmaxub", [bs8(0x0f), bs8(0xde), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pmaxuw", [bs8(0x0f), bs8(0x38), bs8(0x3e), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pmaxud", [bs8(0x0f), bs8(0x38), bs8(0x3f), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+
+addop("pminub", [bs8(0x0f), bs8(0xda), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pminub", [bs8(0x0f), bs8(0xda), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pminuw", [bs8(0x0f), bs8(0x38), bs8(0x3a), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pminud", [bs8(0x0f), bs8(0x38), bs8(0x3b), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+
+addop("pcmpeqb", [bs8(0x0f), bs8(0x74), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pcmpeqb", [bs8(0x0f), bs8(0x74), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pcmpeqw", [bs8(0x0f), bs8(0x75), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pcmpeqw", [bs8(0x0f), bs8(0x75), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("pcmpeqd", [bs8(0x0f), bs8(0x76), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("pcmpeqd", [bs8(0x0f), bs8(0x76), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+
+addop("punpckhbw", [bs8(0x0f), bs8(0x68), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("punpckhbw", [bs8(0x0f), bs8(0x68), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpckhwd", [bs8(0x0f), bs8(0x69), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("punpckhwd", [bs8(0x0f), bs8(0x69), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpckhdq", [bs8(0x0f), bs8(0x6a), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("punpckhdq", [bs8(0x0f), bs8(0x6a), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpckhqdq", [bs8(0x0f), bs8(0x6d), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+
+
+addop("punpcklbw", [bs8(0x0f), bs8(0x60), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("punpcklbw", [bs8(0x0f), bs8(0x60), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpcklwd", [bs8(0x0f), bs8(0x61), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("punpcklwd", [bs8(0x0f), bs8(0x61), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpckldq", [bs8(0x0f), bs8(0x62), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm))
+addop("punpckldq", [bs8(0x0f), bs8(0x62), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+addop("punpcklqdq", [bs8(0x0f), bs8(0x6c), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+
+addop("unpckhps", [bs8(0x0f), bs8(0x15), no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_xmm))
+addop("unpckhpd", [bs8(0x0f), bs8(0x15), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+
+addop("unpcklps", [bs8(0x0f), bs8(0x14), no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_xmm))
+addop("unpcklpd", [bs8(0x0f), bs8(0x14), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+
+
+
+addop("pinsrb", [bs8(0x0f), bs8(0x3a), bs8(0x20), pref_66] +
+      rmmod(xmm_reg, rm_arg_reg_m08) + [u08])
+addop("pinsrd", [bs8(0x0f), bs8(0x3a), bs8(0x22), pref_66, bs_opmode32] +
+      rmmod(xmm_reg, rm_arg) + [u08])
+addop("pinsrq", [bs8(0x0f), bs8(0x3a), bs8(0x22), pref_66] +
+      rmmod(xmm_reg, rm_arg_m64) + [bs_opmode64] + [u08])
+
+addop("pinsrw", [bs8(0x0f), bs8(0xc4), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_reg_m16) + [u08])
+addop("pinsrw", [bs8(0x0f), bs8(0xc4), pref_66] +
+      rmmod(xmm_reg, rm_arg_reg_m16) + [u08])
+
+
+addop("pextrb", [bs8(0x0f), bs8(0x3a), bs8(0x14), pref_66] +
+      rmmod(xmm_reg, rm_arg_reg_m08) + [u08], [rm_arg_reg_m08, xmm_reg, u08])
+addop("pextrd", [bs8(0x0f), bs8(0x3a), bs8(0x16), pref_66, bs_opmode32] +
+      rmmod(xmm_reg, rm_arg) + [u08], [rm_arg, xmm_reg, u08])
+addop("pextrq", [bs8(0x0f), bs8(0x3a), bs8(0x16), pref_66] +
+      rmmod(xmm_reg, rm_arg_m64) + [bs_opmode64] + [u08], [rm_arg_m64, xmm_reg, u08])
+
+
+addop("pextrw", [bs8(0x0f), bs8(0x3a), bs8(0x15), pref_66] +
+      rmmod(xmm_reg, rm_arg_m16) + [u08], [rm_arg_m16, xmm_reg, u08])
+#addop("pextrw", [bs8(0x0f), bs8(0x3a), bs8(0x15), no_xmm_pref] +
+#      rmmod(mm_reg, rm_arg_m16) + [u08], [rm_arg_m16, mm_reg, u08])
+
+addop("pextrw", [bs8(0x0f), bs8(0xc5), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_reg_m16) + [u08], [rm_arg_reg_m16, mm_reg, u08])
+addop("pextrw", [bs8(0x0f), bs8(0xc5), pref_66] +
+      rmmod(xmm_reg, rm_arg_reg_m16) + [u08], [rm_arg_reg_m16, xmm_reg, u08])
+
+
+addop("sqrtpd", [bs8(0x0f), bs8(0x51), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm))
+addop("sqrtps", [bs8(0x0f), bs8(0x51), no_xmm_pref] +
+      rmmod(xmm_reg, rm_arg_xmm))
+addop("sqrtsd", [bs8(0x0f), bs8(0x51), pref_f2] +
+      rmmod(xmm_reg, rm_arg_xmm_m64))
+addop("sqrtss", [bs8(0x0f), bs8(0x51), pref_f3] +
+      rmmod(xmm_reg, rm_arg_xmm_m32))
+
+addop("pmovmskb", [bs8(0x0f), bs8(0xd7), no_xmm_pref] +
+      rmmod(reg, rm_arg_mm_reg))
+addop("pmovmskb", [bs8(0x0f), bs8(0xd7), pref_66] +
+      rmmod(reg, rm_arg_xmm_reg))
+
 
 mn_x86.bintree = factor_one_bit(mn_x86.bintree)
 # mn_x86.bintree = factor_fields_all(mn_x86.bintree)
diff --git a/miasm2/arch/x86/sem.py b/miasm2/arch/x86/sem.py
index 802b6283..ff27937f 100644
--- a/miasm2/arch/x86/sem.py
+++ b/miasm2/arch/x86/sem.py
@@ -65,13 +65,13 @@ OF(A-B) = ((A XOR D) AND (A XOR B)) < 0
 """
 
 
-
 # XXX TODO make default check against 0 or not 0 (same eq as in C)
 
 
 def update_flag_zf(a):
-    return [m2_expr.ExprAff(zf, m2_expr.ExprCond(a, m2_expr.ExprInt_from(zf, 0),
-                                                 m2_expr.ExprInt_from(zf, 1)))]
+    return [m2_expr.ExprAff(
+        zf, m2_expr.ExprCond(a, m2_expr.ExprInt_from(zf, 0),
+                             m2_expr.ExprInt_from(zf, 1)))]
 
 
 def update_flag_nf(a):
@@ -123,6 +123,7 @@ def arith_flag(a, b, c):
 
 # checked: ok for adc add because b & c before +cf
 
+
 def update_flag_add_cf(op1, op2, res):
     "Compute cf in @res = @op1 + @op2"
     ret = (((op1 ^ op2) ^ res) ^ ((op1 ^ res) & (~(op1 ^ op2)))).msb()
@@ -172,6 +173,7 @@ def set_float_cs_eip(instr):
     e.append(m2_expr.ExprAff(float_cs, CS))
     return e
 
+
 def mem2double(arg):
     """
     Add float convertion if argument is an ExprMem
@@ -185,6 +187,7 @@ def mem2double(arg):
     else:
         return arg
 
+
 def float_implicit_st0(arg1, arg2):
     """
     Generate full float operators if one argument is implicit (float_st0)
@@ -279,6 +282,7 @@ def movzx(ir, instr, a, b):
     e = [m2_expr.ExprAff(a, b.zeroExtend(a.size))]
     return e, []
 
+
 def movsx(ir, instr, a, b):
     e = [m2_expr.ExprAff(a, b.signExtend(a.size))]
     return e, []
@@ -317,7 +321,7 @@ def adc(ir, instr, a, b):
     e = []
     c = a + (b + m2_expr.ExprCompose([(m2_expr.ExprInt(0, a.size - 1),
                                        1, a.size),
-                              (cf, 0, 1)]))
+                                      (cf, 0, 1)]))
     e += update_flag_arith(c)
     e += update_flag_af(a, b, c)
     e += update_flag_add(a, b, c)
@@ -341,7 +345,7 @@ def sbb(ir, instr, a, b):
     e = []
     c = a - (b + m2_expr.ExprCompose([(m2_expr.ExprInt(0, a.size - 1),
                                        1, a.size),
-                              (cf, 0, 1)]))
+                                      (cf, 0, 1)]))
     e += update_flag_arith(c)
     e += update_flag_af(a, b, c)
     e += update_flag_sub(a, b, c)
@@ -391,6 +395,7 @@ def pxor(ir, instr, a, b):
     e.append(m2_expr.ExprAff(a, c))
     return e, []
 
+
 def l_or(ir, instr, a, b):
     e = []
     c = a | b
@@ -414,7 +419,6 @@ def l_test(ir, instr, a, b):
     return e, []
 
 
-
 def get_shift(a, b):
     # b.size must match a
     b = b.zeroExtend(a.size)
@@ -425,6 +429,7 @@ def get_shift(a, b):
     shift = expr_simp(shift)
     return shift
 
+
 def _rotate_tpl(ir, instr, a, b, op, left=False, include_cf=False):
     """Template to generate a rotater with operation @op
     A temporary basic block is generated to handle 0-rotate
@@ -515,7 +520,8 @@ def _shift_tpl(op, ir, instr, a, b, c=None, op_inv=None, left=False,
 
         # An overflow can occured, emulate the 'undefined behavior'
         # Overflow behavior if (shift / size % 2)
-        base_cond_overflow = c if left else (c - m2_expr.ExprInt(1, size=c.size))
+        base_cond_overflow = c if left else (
+            c - m2_expr.ExprInt(1, size=c.size))
         cond_overflow = base_cond_overflow & m2_expr.ExprInt(a.size, c.size)
         if left:
             # Overflow occurs one round before right
@@ -584,9 +590,9 @@ def shrd_cl(ir, instr, a, b):
     c = (a >> shifter) | (b << (m2_expr.ExprInt_from(a, a.size) - shifter))
     new_cf = (a >> (shifter - m2_expr.ExprInt_from(a, 1)))[:1]
     e.append(m2_expr.ExprAff(cf, m2_expr.ExprCond(shifter,
-                                  new_cf,
-                                  cf)
-                     )
+                                                  new_cf,
+                                                  cf)
+                             )
              )
     e.append(m2_expr.ExprAff(of, a.msb()))
     e += update_flag_znp(c)
@@ -604,9 +610,9 @@ def sal(ir, instr, a, b):
     c = m2_expr.ExprOp('a<<', a, shifter)
     new_cf = (a >> (m2_expr.ExprInt_from(a, a.size) - shifter))[:1]
     e.append(m2_expr.ExprAff(cf, m2_expr.ExprCond(shifter,
-                                  new_cf,
-                                  cf)
-                     )
+                                                  new_cf,
+                                                  cf)
+                             )
              )
     e += update_flag_znp(c)
     e.append(m2_expr.ExprAff(of, c.msb() ^ new_cf))
@@ -674,6 +680,7 @@ def inc(ir, instr, a):
     e.append(m2_expr.ExprAff(a, c))
     return e, []
 
+
 def dec(ir, instr, a):
     e = []
     b = m2_expr.ExprInt_from(a, -1)
@@ -705,9 +712,11 @@ def push_gen(ir, instr, a, size):
     e.append(m2_expr.ExprAff(m2_expr.ExprMem(new_sp, size), a))
     return e, []
 
+
 def push(ir, instr, a):
     return push_gen(ir, instr, a, instr.mode)
 
+
 def pushw(ir, instr, a):
     return push_gen(ir, instr, a, 16)
 
@@ -731,32 +740,37 @@ def pop_gen(ir, instr, a, size):
     e.append(m2_expr.ExprAff(a, m2_expr.ExprMem(c, a.size)))
     return e, []
 
+
 def pop(ir, instr, a):
     return pop_gen(ir, instr, a, instr.mode)
 
+
 def popw(ir, instr, a):
     return pop_gen(ir, instr, a, 16)
 
 
 def sete(ir, instr, a):
     e = []
-    e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(zf, m2_expr.ExprInt_from(a, 1),
-                                                 m2_expr.ExprInt_from(a, 0))))
+    e.append(
+        m2_expr.ExprAff(a, m2_expr.ExprCond(zf, m2_expr.ExprInt_from(a, 1),
+                                            m2_expr.ExprInt_from(a, 0))))
     return e, []
 
 
 def setnz(ir, instr, a):
     e = []
-    e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(zf, m2_expr.ExprInt_from(a, 0),
-                                                 m2_expr.ExprInt_from(a, 1))))
+    e.append(
+        m2_expr.ExprAff(a, m2_expr.ExprCond(zf, m2_expr.ExprInt_from(a, 0),
+                                            m2_expr.ExprInt_from(a, 1))))
     return e, []
 
 
 def setl(ir, instr, a):
     e = []
     e.append(
-        m2_expr.ExprAff(a, m2_expr.ExprCond(nf - of, m2_expr.ExprInt_from(a, 1),
-                                            m2_expr.ExprInt_from(a, 0))))
+        m2_expr.ExprAff(
+            a, m2_expr.ExprCond(nf - of, m2_expr.ExprInt_from(a, 1),
+                                m2_expr.ExprInt_from(a, 0))))
     return e, []
 
 
@@ -772,75 +786,83 @@ def setg(ir, instr, a):
 def setge(ir, instr, a):
     e = []
     e.append(
-        m2_expr.ExprAff(a, m2_expr.ExprCond(nf - of, m2_expr.ExprInt_from(a, 0),
-                                            m2_expr.ExprInt_from(a, 1))))
+        m2_expr.ExprAff(
+            a, m2_expr.ExprCond(nf - of, m2_expr.ExprInt_from(a, 0),
+                                m2_expr.ExprInt_from(a, 1))))
     return e, []
 
 
 def seta(ir, instr, a):
     e = []
     e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(cf | zf,
-                                 m2_expr.ExprInt_from(a, 0),
-                                 m2_expr.ExprInt_from(a, 1))))
+                                                 m2_expr.ExprInt_from(a, 0),
+                                                 m2_expr.ExprInt_from(a, 1))))
 
     return e, []
 
 
 def setae(ir, instr, a):
     e = []
-    e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(cf, m2_expr.ExprInt_from(a, 0),
-                                                 m2_expr.ExprInt_from(a, 1))))
+    e.append(
+        m2_expr.ExprAff(a, m2_expr.ExprCond(cf, m2_expr.ExprInt_from(a, 0),
+                                            m2_expr.ExprInt_from(a, 1))))
     return e, []
 
 
 def setb(ir, instr, a):
     e = []
-    e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(cf, m2_expr.ExprInt_from(a, 1),
-                                                 m2_expr.ExprInt_from(a, 0))))
+    e.append(
+        m2_expr.ExprAff(a, m2_expr.ExprCond(cf, m2_expr.ExprInt_from(a, 1),
+                                            m2_expr.ExprInt_from(a, 0))))
     return e, []
 
 
 def setbe(ir, instr, a):
     e = []
     e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(cf | zf,
-                                 m2_expr.ExprInt_from(a, 1),
-                                 m2_expr.ExprInt_from(a, 0)))
+                                                 m2_expr.ExprInt_from(a, 1),
+                                                 m2_expr.ExprInt_from(a, 0)))
              )
     return e, []
 
 
 def setns(ir, instr, a):
     e = []
-    e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(nf, m2_expr.ExprInt_from(a, 0),
-                                                 m2_expr.ExprInt_from(a, 1))))
+    e.append(
+        m2_expr.ExprAff(a, m2_expr.ExprCond(nf, m2_expr.ExprInt_from(a, 0),
+                                            m2_expr.ExprInt_from(a, 1))))
     return e, []
 
 
 def sets(ir, instr, a):
     e = []
-    e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(nf, m2_expr.ExprInt_from(a, 1),
-                                                 m2_expr.ExprInt_from(a, 0))))
+    e.append(
+        m2_expr.ExprAff(a, m2_expr.ExprCond(nf, m2_expr.ExprInt_from(a, 1),
+                                            m2_expr.ExprInt_from(a, 0))))
     return e, []
 
 
 def seto(ir, instr, a):
     e = []
-    e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(of, m2_expr.ExprInt_from(a, 1),
-                                                 m2_expr.ExprInt_from(a, 0))))
+    e.append(
+        m2_expr.ExprAff(a, m2_expr.ExprCond(of, m2_expr.ExprInt_from(a, 1),
+                                            m2_expr.ExprInt_from(a, 0))))
     return e, []
 
 
 def setp(ir, instr, a):
     e = []
-    e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(pf, m2_expr.ExprInt_from(a, 1),
-                                                 m2_expr.ExprInt_from(a, 0))))
+    e.append(
+        m2_expr.ExprAff(a, m2_expr.ExprCond(pf, m2_expr.ExprInt_from(a, 1),
+                                            m2_expr.ExprInt_from(a, 0))))
     return e, []
 
 
 def setnp(ir, instr, a):
     e = []
-    e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(pf, m2_expr.ExprInt_from(a, 0),
-                                                 m2_expr.ExprInt_from(a, 1))))
+    e.append(
+        m2_expr.ExprAff(a, m2_expr.ExprCond(pf, m2_expr.ExprInt_from(a, 0),
+                                            m2_expr.ExprInt_from(a, 1))))
     return e, []
 
 
@@ -865,23 +887,25 @@ def setna(ir, instr, a):
 def setnbe(ir, instr, a):
     e = []
     e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(cf | zf,
-                                 m2_expr.ExprInt_from(a, 0),
-                                 m2_expr.ExprInt_from(a, 1)))
+                                                 m2_expr.ExprInt_from(a, 0),
+                                                 m2_expr.ExprInt_from(a, 1)))
              )
     return e, []
 
 
 def setno(ir, instr, a):
     e = []
-    e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(of, m2_expr.ExprInt_from(a, 0),
-                                                 m2_expr.ExprInt_from(a, 1))))
+    e.append(
+        m2_expr.ExprAff(a, m2_expr.ExprCond(of, m2_expr.ExprInt_from(a, 0),
+                                            m2_expr.ExprInt_from(a, 1))))
     return e, []
 
 
 def setnb(ir, instr, a):
     e = []
-    e.append(m2_expr.ExprAff(a, m2_expr.ExprCond(cf, m2_expr.ExprInt_from(a, 0),
-                                                 m2_expr.ExprInt_from(a, 1))))
+    e.append(
+        m2_expr.ExprAff(a, m2_expr.ExprCond(cf, m2_expr.ExprInt_from(a, 0),
+                                            m2_expr.ExprInt_from(a, 1))))
     return e, []
 
 
@@ -898,24 +922,24 @@ def bswap(ir, instr, a):
     e = []
     if a.size == 16:
         c = m2_expr.ExprCompose([(a[:8],        8, 16),
-                         (a[8:16],      0,  8),
-                         ])
+                                 (a[8:16],      0,  8),
+                                 ])
     elif a.size == 32:
         c = m2_expr.ExprCompose([(a[:8],      24, 32),
-                         (a[8:16],    16, 24),
-                         (a[16:24],   8, 16),
-                         (a[24:32],   0, 8),
-                         ])
+                                 (a[8:16],    16, 24),
+                                 (a[16:24],   8, 16),
+                                 (a[24:32],   0, 8),
+                                 ])
     elif a.size == 64:
         c = m2_expr.ExprCompose([(a[:8],      56, 64),
-                         (a[8:16],    48, 56),
-                         (a[16:24],   40, 48),
-                         (a[24:32],   32, 40),
-                         (a[32:40],   24, 32),
-                         (a[40:48],   16, 24),
-                         (a[48:56],    8, 16),
-                         (a[56:64],    0, 8),
-                         ])
+                                 (a[8:16],    48, 56),
+                                 (a[16:24],   40, 48),
+                                 (a[24:32],   32, 40),
+                                 (a[32:40],   24, 32),
+                                 (a[40:48],   16, 24),
+                                 (a[48:56],    8, 16),
+                                 (a[56:64],    0, 8),
+                                 ])
     else:
         raise ValueError('the size DOES matter')
     e.append(m2_expr.ExprAff(a, c))
@@ -1008,9 +1032,11 @@ def compose_eflag(s=32):
 def pushfd(ir, instr):
     return push(ir, instr, compose_eflag())
 
+
 def pushfq(ir, instr):
     return push(ir, instr, compose_eflag().zeroExtend(64))
 
+
 def pushfw(ir, instr):
     return pushw(ir, instr, compose_eflag(16))
 
@@ -1036,13 +1062,14 @@ def popfd(ir, instr):
     e.append(m2_expr.ExprAff(vip, m2_expr.ExprSlice(tmp, 20, 21)))
     e.append(m2_expr.ExprAff(i_d, m2_expr.ExprSlice(tmp, 21, 22)))
     e.append(m2_expr.ExprAff(mRSP[instr.mode],
-                             mRSP[instr.mode] + m2_expr.ExprInt_from(mRSP[instr.mode], instr.mode/8)))
+                             mRSP[instr.mode] + m2_expr.ExprInt_from(mRSP[instr.mode], instr.mode / 8)))
     e.append(m2_expr.ExprAff(exception_flags,
-                     m2_expr.ExprCond(m2_expr.ExprSlice(tmp, 8, 9),
-                              m2_expr.ExprInt32(EXCEPT_SOFT_BP),
-                              exception_flags
-                              )
-                     )
+                             m2_expr.ExprCond(m2_expr.ExprSlice(tmp, 8, 9),
+                                              m2_expr.ExprInt32(
+                                                  EXCEPT_SOFT_BP),
+                                              exception_flags
+                                              )
+                             )
              )
     return e, []
 
@@ -1060,7 +1087,8 @@ def _tpl_eflags(tmp):
 def popfw(ir, instr):
     tmp = m2_expr.ExprMem(mRSP[instr.mode])
     e = _tpl_eflags(tmp)
-    e.append(m2_expr.ExprAff(mRSP[instr.mode], mRSP[instr.mode] + m2_expr.ExprInt(2, mRSP[instr.mode].size)))
+    e.append(
+        m2_expr.ExprAff(mRSP[instr.mode], mRSP[instr.mode] + m2_expr.ExprInt(2, mRSP[instr.mode].size)))
     return e, []
 
 
@@ -1118,7 +1146,6 @@ def call(ir, instr, dst):
     myesp = mRSP[instr.mode][:opmode]
     n = m2_expr.ExprId(ir.get_next_label(instr), ir.IRDst.size)
 
-
     if (isinstance(dst, m2_expr.ExprOp) and dst.op == "segm"):
         # call far
         if instr.mode != 16:
@@ -1132,19 +1159,18 @@ def call(ir, instr, dst):
 
         e.append(m2_expr.ExprAff(ir.IRDst, m2))
 
-        c = myesp + m2_expr.ExprInt(-s/8, s)
+        c = myesp + m2_expr.ExprInt(-s / 8, s)
         e.append(m2_expr.ExprAff(m2_expr.ExprMem(c, size=s).zeroExtend(s),
                                  CS.zeroExtend(s)))
 
-        c = myesp + m2_expr.ExprInt(-2*s/8, s)
+        c = myesp + m2_expr.ExprInt(-2 * s / 8, s)
         e.append(m2_expr.ExprAff(m2_expr.ExprMem(c, size=s).zeroExtend(s),
                                  meip.zeroExtend(s)))
 
-        c = myesp + m2_expr.ExprInt((-2*s) / 8, s)
+        c = myesp + m2_expr.ExprInt((-2 * s) / 8, s)
         e.append(m2_expr.ExprAff(myesp, c))
         return e, []
 
-
     c = myesp + m2_expr.ExprInt((-s / 8), s)
     e.append(m2_expr.ExprAff(myesp, c))
     if ir.do_stk_segm:
@@ -1152,7 +1178,7 @@ def call(ir, instr, dst):
     e.append(m2_expr.ExprAff(m2_expr.ExprMem(c, size=s), n))
     e.append(m2_expr.ExprAff(meip, dst.zeroExtend(ir.IRDst.size)))
     e.append(m2_expr.ExprAff(ir.IRDst, dst.zeroExtend(ir.IRDst.size)))
-    #if not expr_is_int_or_label(dst):
+    # if not expr_is_int_or_label(dst):
     #    dst = meip
     return e, []
 
@@ -1167,10 +1193,10 @@ def ret(ir, instr, a=None):
 
     if a is None:
         a = m2_expr.ExprInt(0, s)
-        value =  (myesp + (m2_expr.ExprInt((s / 8), s)))
+        value = (myesp + (m2_expr.ExprInt((s / 8), s)))
     else:
         a = a.zeroExtend(s)
-        value =  (myesp + (m2_expr.ExprInt((s / 8), s) + a))
+        value = (myesp + (m2_expr.ExprInt((s / 8), s) + a))
 
     e.append(m2_expr.ExprAff(myesp, value))
     c = myesp
@@ -1206,7 +1232,7 @@ def retf(ir, instr, a=None):
         c = m2_expr.ExprOp('segm', SS, c)
     e.append(m2_expr.ExprAff(CS, m2_expr.ExprMem(c, size=16)))
 
-    value =  myesp + (m2_expr.ExprInt((2*s) / 8, s) + a)
+    value = myesp + (m2_expr.ExprInt((2 * s) / 8, s) + a)
     e.append(m2_expr.ExprAff(myesp, value))
     return e, []
 
@@ -1235,7 +1261,7 @@ def enter(ir, instr, a, b):
     esp_tmp = myesp - m2_expr.ExprInt(s / 8, s)
     e.append(m2_expr.ExprAff(m2_expr.ExprMem(esp_tmp,
                              size=s),
-                     myebp))
+                             myebp))
     e.append(m2_expr.ExprAff(myebp, esp_tmp))
     e.append(m2_expr.ExprAff(myesp,
                              myesp - (a + m2_expr.ExprInt(s / 8, s))))
@@ -1260,8 +1286,11 @@ def jmpf(ir, instr, a):
     if (isinstance(a, m2_expr.ExprOp) and a.op == "segm"):
         segm = a.args[0]
         base = a.args[1]
-        m1 = segm.zeroExtend(CS.size)#m2_expr.ExprMem(m2_expr.ExprOp('segm', segm, base), 16)
-        m2 = base.zeroExtend(meip.size)#m2_expr.ExprMem(m2_expr.ExprOp('segm', segm, base + m2_expr.ExprInt_from(base, 2)), s)
+        m1 = segm.zeroExtend(
+            CS.size)  # m2_expr.ExprMem(m2_expr.ExprOp('segm', segm, base), 16)
+        m2 = base.zeroExtend(meip.size)
+                             # m2_expr.ExprMem(m2_expr.ExprOp('segm', segm,
+                             # base + m2_expr.ExprInt_from(base, 2)), s)
     else:
         m1 = m2_expr.ExprMem(a, 16)
         m2 = m2_expr.ExprMem(a + m2_expr.ExprInt_from(a, 2), meip.size)
@@ -1301,7 +1330,7 @@ def jnp(ir, instr, dst):
 
 
 def ja(ir, instr, dst):
-    return gen_jcc(ir, instr, cf|zf, dst, False)
+    return gen_jcc(ir, instr, cf | zf, dst, False)
 
 
 def jae(ir, instr, dst):
@@ -1313,23 +1342,23 @@ def jb(ir, instr, dst):
 
 
 def jbe(ir, instr, dst):
-    return gen_jcc(ir, instr, cf|zf, dst, True)
+    return gen_jcc(ir, instr, cf | zf, dst, True)
 
 
 def jge(ir, instr, dst):
-    return gen_jcc(ir, instr, nf-of, dst, False)
+    return gen_jcc(ir, instr, nf - of, dst, False)
 
 
 def jg(ir, instr, dst):
-    return gen_jcc(ir, instr, zf|(nf-of), dst, False)
+    return gen_jcc(ir, instr, zf | (nf - of), dst, False)
 
 
 def jl(ir, instr, dst):
-    return gen_jcc(ir, instr, nf-of, dst, True)
+    return gen_jcc(ir, instr, nf - of, dst, True)
 
 
 def jle(ir, instr, dst):
-    return gen_jcc(ir, instr, zf|(nf-of), dst, True)
+    return gen_jcc(ir, instr, zf | (nf - of), dst, True)
 
 
 def js(ir, instr, dst):
@@ -1374,8 +1403,8 @@ def loopne(ir, instr, dst):
     n = m2_expr.ExprId(ir.get_next_label(instr), ir.IRDst.size)
 
     c = m2_expr.ExprCond(myecx - m2_expr.ExprInt(1, size=myecx.size),
-                 m2_expr.ExprInt1(1),
-                 m2_expr.ExprInt1(0))
+                         m2_expr.ExprInt1(1),
+                         m2_expr.ExprInt1(0))
     c &= zf ^ m2_expr.ExprInt1(1)
 
     e.append(m2_expr.ExprAff(myecx, myecx - m2_expr.ExprInt_from(myecx, 1)))
@@ -1395,8 +1424,8 @@ def loope(ir, instr, dst):
 
     n = m2_expr.ExprId(ir.get_next_label(instr), ir.IRDst.size)
     c = m2_expr.ExprCond(myecx - m2_expr.ExprInt(1, size=myecx.size),
-                 m2_expr.ExprInt1(1),
-                 m2_expr.ExprInt1(0))
+                         m2_expr.ExprInt1(1),
+                         m2_expr.ExprInt1(0))
     c &= zf
     e.append(m2_expr.ExprAff(myecx, myecx - m2_expr.ExprInt_from(myecx, 1)))
     dst_o = m2_expr.ExprCond(c,
@@ -1416,7 +1445,7 @@ def div(ir, instr, a):
     elif size in [16, 32, 64]:
         s1, s2 = mRDX[size], mRAX[size]
         b = m2_expr.ExprCompose([(s2, 0, size),
-                         (s1, size, size*2)])
+                                 (s1, size, size * 2)])
     else:
         raise ValueError('div arg not impl', a)
 
@@ -1426,7 +1455,7 @@ def div(ir, instr, a):
     # if 8 bit div, only ax is affected
     if size == 8:
         e.append(m2_expr.ExprAff(b, m2_expr.ExprCompose([(c_d[:8], 0, 8),
-                                         (c_r[:8], 8, 16)])))
+                                                         (c_r[:8], 8, 16)])))
     else:
         e.append(m2_expr.ExprAff(s1, c_r[:size]))
         e.append(m2_expr.ExprAff(s2, c_d[:size]))
@@ -1444,7 +1473,7 @@ def idiv(ir, instr, a):
     elif size in [16, 32]:
         s1, s2 = mRDX[size], mRAX[size]
         b = m2_expr.ExprCompose([(s2, 0, size),
-                         (s1, size, size*2)])
+                                 (s1, size, size * 2)])
     else:
         raise ValueError('div arg not impl', a)
 
@@ -1454,7 +1483,7 @@ def idiv(ir, instr, a):
     # if 8 bit div, only ax is affected
     if size == 8:
         e.append(m2_expr.ExprAff(b, m2_expr.ExprCompose([(c_d[:8], 0, 8),
-                                         (c_r[:8], 8, 16)])))
+                                                         (c_r[:8], 8, 16)])))
     else:
         e.append(m2_expr.ExprAff(s1, c_r[:size]))
         e.append(m2_expr.ExprAff(s2, c_d[:size]))
@@ -1468,25 +1497,25 @@ def mul(ir, instr, a):
     size = a.size
     if a.size in [16, 32, 64]:
         result = m2_expr.ExprOp('*',
-                        mRAX[size].zeroExtend(size * 2),
-                        a.zeroExtend(size * 2))
+                                mRAX[size].zeroExtend(size * 2),
+                                a.zeroExtend(size * 2))
         e.append(m2_expr.ExprAff(mRAX[size], result[:size]))
         e.append(m2_expr.ExprAff(mRDX[size], result[size:size * 2]))
 
     elif a.size == 8:
         result = m2_expr.ExprOp('*',
-                        mRAX[instr.mode][:8].zeroExtend(16),
-                        a.zeroExtend(16))
+                                mRAX[instr.mode][:8].zeroExtend(16),
+                                a.zeroExtend(16))
         e.append(m2_expr.ExprAff(mRAX[instr.mode][:16], result))
     else:
         raise ValueError('unknow size')
 
     e.append(m2_expr.ExprAff(of, m2_expr.ExprCond(result[size:size * 2],
-                                  m2_expr.ExprInt1(1),
-                                  m2_expr.ExprInt1(0))))
+                                                  m2_expr.ExprInt1(1),
+                                                  m2_expr.ExprInt1(0))))
     e.append(m2_expr.ExprAff(cf, m2_expr.ExprCond(result[size:size * 2],
-                                  m2_expr.ExprInt1(1),
-                                  m2_expr.ExprInt1(0))))
+                                                  m2_expr.ExprInt1(1),
+                                                  m2_expr.ExprInt1(0))))
 
     return e, []
 
@@ -1497,15 +1526,15 @@ def imul(ir, instr, a, b=None, c=None):
     if b is None:
         if size in [16, 32, 64]:
             result = m2_expr.ExprOp('*',
-                            mRAX[size].signExtend(size * 2),
-                            a.signExtend(size * 2))
+                                    mRAX[size].signExtend(size * 2),
+                                    a.signExtend(size * 2))
             e.append(m2_expr.ExprAff(mRAX[size], result[:size]))
             e.append(m2_expr.ExprAff(mRDX[size], result[size:size * 2]))
         elif size == 8:
             dst = mRAX[instr.mode][:16]
             result = m2_expr.ExprOp('*',
-                            mRAX[instr.mode][:8].signExtend(16),
-                            a.signExtend(16))
+                                    mRAX[instr.mode][:8].signExtend(16),
+                                    a.signExtend(16))
 
             e.append(m2_expr.ExprAff(dst, result))
         value = m2_expr.ExprCond(result - result[:size].signExtend(size * 2),
@@ -1522,17 +1551,17 @@ def imul(ir, instr, a, b=None, c=None):
             c = b
             b = a
         result = m2_expr.ExprOp('*',
-                        b.signExtend(size * 2),
-                        c.signExtend(size * 2))
+                                b.signExtend(size * 2),
+                                c.signExtend(size * 2))
         e.append(m2_expr.ExprAff(a, result[:size]))
 
         value = m2_expr.ExprCond(result - result[:size].signExtend(size * 2),
                                  m2_expr.ExprInt1(1),
                                  m2_expr.ExprInt1(0))
         e.append(m2_expr.ExprAff(cf, value))
-        value =  m2_expr.ExprCond(result - result[:size].signExtend(size * 2),
-                                  m2_expr.ExprInt1(1),
-                                  m2_expr.ExprInt1(0))
+        value = m2_expr.ExprCond(result - result[:size].signExtend(size * 2),
+                                 m2_expr.ExprInt1(1),
+                                 m2_expr.ExprInt1(0))
         e.append(m2_expr.ExprAff(of, value))
     return e, []
 
@@ -1703,6 +1732,7 @@ def movs(ir, instr, size):
                              m2_expr.ExprCond(df, lbl_df_1, lbl_df_0)))
     return e, [e0, e1]
 
+
 def movsd(ir, instr, a, b):
     e = []
     if isinstance(a, m2_expr.ExprId) and isinstance(b, m2_expr.ExprMem):
@@ -1713,7 +1743,8 @@ def movsd(ir, instr, a, b):
     e.append(m2_expr.ExprAff(a, b))
     return e, []
 
-def movsd_dispatch(ir, instr, a = None, b = None):
+
+def movsd_dispatch(ir, instr, a=None, b=None):
     if a is None and b is None:
         return movs(ir, instr, 32)
     else:
@@ -1739,11 +1770,11 @@ def float_pop(avoid_flt=None, popcount=1):
     """
     avoid_flt = float_prev(avoid_flt, popcount)
     e = []
-    for i in xrange(8-popcount):
+    for i in xrange(8 - popcount):
         if avoid_flt != float_list[i]:
             e.append(m2_expr.ExprAff(float_list[i],
-                                     float_list[i+popcount]))
-    for i in xrange(8-popcount, 8):
+                                     float_list[i + popcount]))
+    for i in xrange(8 - popcount, 8):
         e.append(m2_expr.ExprAff(float_list[i],
                                  m2_expr.ExprInt_from(float_list[i], 0)))
     e.append(
@@ -1800,7 +1831,7 @@ def fxam(ir, instr):
     return e, []
 
 
-def ficom(ir, instr, a, b = None):
+def ficom(ir, instr, a, b=None):
 
     a, b = float_implicit_st0(a, b)
 
@@ -1823,7 +1854,6 @@ def ficom(ir, instr, a, b = None):
     return e, []
 
 
-
 def fcomi(ir, instr, a=None, b=None):
     # TODO unordered float
     if a is None and b is None:
@@ -1857,6 +1887,7 @@ def fucomi(ir, instr, a=None, b=None):
     # TODO unordered float
     return fcomi(ir, instr, a, b)
 
+
 def fucomip(ir, instr, a=None, b=None):
     # TODO unordered float
     return fcomip(ir, instr, a, b)
@@ -1876,7 +1907,7 @@ def fcompp(ir, instr, a=None, b=None):
     return e, extra
 
 
-def ficomp(ir, instr, a, b = None):
+def ficomp(ir, instr, a, b=None):
     e, extra = ficom(ir, instr, a, b)
     e += float_pop()
     e += set_float_cs_eip(instr)
@@ -2003,11 +2034,13 @@ def fist(ir, instr, a):
     e += set_float_cs_eip(instr)
     return e, []
 
+
 def fistp(ir, instr, a):
     e, extra = fist(ir, instr, a)
     e += float_pop(a)
     return e, extra
 
+
 def fisttp(ir, instr, a):
     e = []
     e.append(m2_expr.ExprAff(a,
@@ -2040,7 +2073,7 @@ def fld1(ir, instr):
 
 
 def fldl2t(ir, instr):
-    value_f = math.log(10)/math.log(2)
+    value_f = math.log(10) / math.log(2)
     value = struct.unpack('I', struct.pack('f', value_f))[0]
     return fld(ir, instr, m2_expr.ExprOp('int_32_to_double',
                                          m2_expr.ExprInt32(value)))
@@ -2083,6 +2116,7 @@ def fadd(ir, instr, a, b=None):
     e += set_float_cs_eip(instr)
     return e, []
 
+
 def fiadd(ir, instr, a, b=None):
     a, b = float_implicit_st0(a, b)
     e = []
@@ -2122,7 +2156,8 @@ def fpatan(ir, instr):
 
 def fprem(ir, instr):
     e = []
-    e.append(m2_expr.ExprAff(float_st0, m2_expr.ExprOp('fprem', float_st0, float_st1)))
+    e.append(
+        m2_expr.ExprAff(float_st0, m2_expr.ExprOp('fprem', float_st0, float_st1)))
     # Remaining bits (ex: used in argument reduction in tan)
     remain = m2_expr.ExprOp('fprem_lsb', float_st0, float_st1)
     e += [m2_expr.ExprAff(float_c0, remain[2:3]),
@@ -2130,14 +2165,15 @@ def fprem(ir, instr):
           m2_expr.ExprAff(float_c1, remain[0:1]),
           # Consider the reduction is always completed
           m2_expr.ExprAff(float_c2, m2_expr.ExprInt1(0)),
-    ]
+          ]
     e += set_float_cs_eip(instr)
     return e, []
 
 
 def fprem1(ir, instr):
     e = []
-    e.append(m2_expr.ExprAff(float_st0, m2_expr.ExprOp('fprem1', float_st0, float_st1)))
+    e.append(
+        m2_expr.ExprAff(float_st0, m2_expr.ExprOp('fprem1', float_st0, float_st1)))
     e += set_float_cs_eip(instr)
     return e, []
 
@@ -2161,7 +2197,8 @@ def fninit(ir, instr):
 def fyl2x(ir, instr):
     e = []
     a = float_st1
-    e.append(m2_expr.ExprAff(float_prev(a), m2_expr.ExprOp('fyl2x', float_st0, float_st1)))
+    e.append(
+        m2_expr.ExprAff(float_prev(a), m2_expr.ExprOp('fyl2x', float_st0, float_st1)))
     e += set_float_cs_eip(instr)
     e += float_pop(a)
     return e, []
@@ -2171,13 +2208,13 @@ def fnstenv(ir, instr, a):
     e = []
     # XXX TODO tag word, ...
     status_word = m2_expr.ExprCompose([(m2_expr.ExprInt8(0), 0, 8),
-                               (float_c0,           8, 9),
-                               (float_c1,           9, 10),
-                               (float_c2,           10, 11),
-                               (float_stack_ptr,    11, 14),
-                               (float_c3,           14, 15),
-                               (m2_expr.ExprInt1(0), 15, 16),
-                               ])
+                                       (float_c0,           8, 9),
+                                       (float_c1,           9, 10),
+                                       (float_c2,           10, 11),
+                                       (float_stack_ptr,    11, 14),
+                                       (float_c3,           14, 15),
+                                       (m2_expr.ExprInt1(0), 15, 16),
+                                       ])
 
     s = instr.mode
     # The behaviour in 64bit is identical to 32 bit
@@ -2202,6 +2239,7 @@ def fnstenv(ir, instr, a):
     e.append(m2_expr.ExprAff(ad, float_ds))
     return e, []
 
+
 def fldenv(ir, instr, a):
     e = []
     # Inspired from fnstenv (same TODOs / issues)
@@ -2211,11 +2249,11 @@ def fldenv(ir, instr, a):
     # This will truncate addresses
     s = min(32, s)
 
-    ## Float control
+    # Float control
     ad = m2_expr.ExprMem(a.arg, size=16)
     e.append(m2_expr.ExprAff(float_control, ad))
 
-    ## Status word
+    # Status word
     ad = m2_expr.ExprMem(a.arg + m2_expr.ExprInt(s / 8 * 1, size=a.arg.size),
                          size=16)
     e += [m2_expr.ExprAff(x, y) for x, y in ((float_c0, ad[8:9]),
@@ -2225,7 +2263,7 @@ def fldenv(ir, instr, a):
                                              (float_c3, ad[14:15]))
           ]
 
-    ## EIP, CS, Address, DS
+    # EIP, CS, Address, DS
     for offset, target in ((3, float_eip[:s]),
                            (4, float_cs),
                            (5, float_address[:s]),
@@ -2247,6 +2285,7 @@ def fsub(ir, instr, a, b=None):
     e += set_float_cs_eip(instr)
     return e, []
 
+
 def fsubp(ir, instr, a, b=None):
     a, b = float_implicit_st0(a, b)
     e = []
@@ -2284,6 +2323,7 @@ def fmul(ir, instr, a, b=None):
     e += set_float_cs_eip(instr)
     return e, []
 
+
 def fimul(ir, instr, a, b=None):
     a, b = float_implicit_st0(a, b)
     e = []
@@ -2301,6 +2341,7 @@ def fdiv(ir, instr, a, b=None):
     e += set_float_cs_eip(instr)
     return e, []
 
+
 def fdivr(ir, instr, a, b=None):
     a, b = float_implicit_st0(a, b)
     e = []
@@ -2446,6 +2487,7 @@ def f2xm1(ir, instr):
     e += set_float_cs_eip(instr)
     return e, []
 
+
 def fchs(ir, instr):
     e = []
     e.append(m2_expr.ExprAff(float_st0, m2_expr.ExprOp('fchs', float_st0)))
@@ -2470,7 +2512,7 @@ def fabs(ir, instr):
 def fnstsw(ir, instr, dst):
     args = [
         # Exceptions -> 0
-        (m2_expr.ExprInt8(0),0, 8),
+        (m2_expr.ExprInt8(0), 0, 8),
         (float_c0,           8, 9),
         (float_c1,           9, 10),
         (float_c2,           10, 11),
@@ -2507,7 +2549,7 @@ def fcmove(ir, instr, arg1, arg2):
 
 
 def fcmovbe(ir, instr, arg1, arg2):
-    return gen_fcmov(ir, instr, cf|zf, arg1, arg2, True)
+    return gen_fcmov(ir, instr, cf | zf, arg1, arg2, True)
 
 
 def fcmovu(ir, instr, arg1, arg2):
@@ -2523,7 +2565,7 @@ def fcmovne(ir, instr, arg1, arg2):
 
 
 def fcmovnbe(ir, instr, arg1, arg2):
-    return gen_fcmov(ir, instr, cf|zf, arg1, arg2, False)
+    return gen_fcmov(ir, instr, cf | zf, arg1, arg2, False)
 
 
 def fcmovnu(ir, instr, arg1, arg2):
@@ -2545,7 +2587,8 @@ def rdtsc(ir, instr):
     e = []
     e.append(m2_expr.ExprAff(tsc1, tsc1 + m2_expr.ExprInt32(1)))
     e.append(m2_expr.ExprAff(tsc2, tsc2 + m2_expr.ExprCond(tsc1 - tsc1.mask,
-                                                           m2_expr.ExprInt32(0),
+                                                           m2_expr.ExprInt32(
+                                                               0),
                                                            m2_expr.ExprInt32(1))))
     e.append(m2_expr.ExprAff(mRAX[32], tsc1))
     e.append(m2_expr.ExprAff(mRDX[32], tsc2))
@@ -2559,11 +2602,9 @@ def daa(ir, instr):
     cond1 = expr_cmpu(r_al[:4], m2_expr.ExprInt(0x9, 4)) | af
     e.append(m2_expr.ExprAff(af, cond1))
 
-
     cond2 = expr_cmpu(m2_expr.ExprInt8(6), r_al)
     cond3 = expr_cmpu(r_al, m2_expr.ExprInt8(0x99)) | cf
 
-
     cf_c1 = m2_expr.ExprCond(cond1,
                              cf | (cond2),
                              m2_expr.ExprInt1(0))
@@ -2583,6 +2624,7 @@ def daa(ir, instr):
     e += update_flag_znp(new_al)
     return e, []
 
+
 def das(ir, instr):
     e = []
     r_al = mRAX[instr.mode][:8]
@@ -2590,11 +2632,9 @@ def das(ir, instr):
     cond1 = expr_cmpu(r_al[:4], m2_expr.ExprInt(0x9, 4)) | af
     e.append(m2_expr.ExprAff(af, cond1))
 
-
     cond2 = expr_cmpu(m2_expr.ExprInt8(6), r_al)
     cond3 = expr_cmpu(r_al, m2_expr.ExprInt8(0x99)) | cf
 
-
     cf_c1 = m2_expr.ExprCond(cond1,
                              cf | (cond2),
                              m2_expr.ExprInt1(0))
@@ -2619,10 +2659,10 @@ def aam(ir, instr, a):
     e = []
     tempAL = mRAX[instr.mode][0:8]
     newEAX = m2_expr.ExprCompose([
-                         (tempAL % a,           0,  8),
+        (tempAL % a,           0,  8),
                         (tempAL / a,           8,  16),
                         (mRAX[instr.mode][16:], 16, mRAX[instr.mode].size),
-                         ])
+    ])
     e += [m2_expr.ExprAff(mRAX[instr.mode], newEAX)]
     e += update_flag_arith(newEAX)
     e.append(m2_expr.ExprAff(af, m2_expr.ExprInt1(0)))
@@ -2634,11 +2674,11 @@ def aad(ir, instr, a):
     tempAL = mRAX[instr.mode][0:8]
     tempAH = mRAX[instr.mode][8:16]
     newEAX = m2_expr.ExprCompose([
-            ((tempAL + (tempAH * a)) & m2_expr.ExprInt8(0xFF), 0,  8),
+        ((tempAL + (tempAH * a)) & m2_expr.ExprInt8(0xFF), 0,  8),
             (m2_expr.ExprInt8(0),                              8,  16),
             (mRAX[instr.mode][16:],
              16, mRAX[instr.mode].size),
-            ])
+    ])
     e += [m2_expr.ExprAff(mRAX[instr.mode], newEAX)]
     e += update_flag_arith(newEAX)
     e.append(m2_expr.ExprAff(af, m2_expr.ExprInt1(0)))
@@ -2708,9 +2748,11 @@ def bsr_bsf(ir, instr, a, b, op_name):
     return e, [irbloc(lbl_src_null.name, [e_src_null]),
                irbloc(lbl_src_not_null.name, [e_src_not_null])]
 
+
 def bsf(ir, instr, a, b):
     return bsr_bsf(ir, instr, a, b, "bsf")
 
+
 def bsr(ir, instr, a, b):
     return bsr_bsf(ir, instr, a, b, "bsr")
 
@@ -2737,7 +2779,7 @@ def sidt(ir, instr, a):
                              m2_expr.ExprInt32(0xe40007ff)))
     e.append(
         m2_expr.ExprAff(m2_expr.ExprMem(m2_expr.ExprOp("+", b,
-        m2_expr.ExprInt_from(b, 4)), 16), m2_expr.ExprInt16(0x8245)))
+                                                       m2_expr.ExprInt_from(b, 4)), 16), m2_expr.ExprInt16(0x8245)))
     return e, []
 
 
@@ -2750,6 +2792,7 @@ def sldt(ir, instr, a):
 def cmovz(ir, instr, arg1, arg2):
     return gen_cmov(ir, instr, zf, arg1, arg2, True)
 
+
 def cmovnz(ir, instr, arg1, arg2):
     return gen_cmov(ir, instr, zf, arg1, arg2, False)
 
@@ -2763,23 +2806,23 @@ def cmovnp(ir, instr, arg1, arg2):
 
 
 def cmovge(ir, instr, arg1, arg2):
-    return gen_cmov(ir, instr, nf^of, arg1, arg2, False)
+    return gen_cmov(ir, instr, nf ^ of, arg1, arg2, False)
 
 
 def cmovg(ir, instr, arg1, arg2):
-    return gen_cmov(ir, instr, zf|(nf^of), arg1, arg2, False)
+    return gen_cmov(ir, instr, zf | (nf ^ of), arg1, arg2, False)
 
 
 def cmovl(ir, instr, arg1, arg2):
-    return gen_cmov(ir, instr, nf^of, arg1, arg2, True)
+    return gen_cmov(ir, instr, nf ^ of, arg1, arg2, True)
 
 
 def cmovle(ir, instr, arg1, arg2):
-    return gen_cmov(ir, instr, zf|(nf^of), arg1, arg2, True)
+    return gen_cmov(ir, instr, zf | (nf ^ of), arg1, arg2, True)
 
 
 def cmova(ir, instr, arg1, arg2):
-    return gen_cmov(ir, instr, cf|zf, arg1, arg2, False)
+    return gen_cmov(ir, instr, cf | zf, arg1, arg2, False)
 
 
 def cmovae(ir, instr, arg1, arg2):
@@ -2787,7 +2830,7 @@ def cmovae(ir, instr, arg1, arg2):
 
 
 def cmovbe(ir, instr, arg1, arg2):
-    return gen_cmov(ir, instr, cf|zf, arg1, arg2, True)
+    return gen_cmov(ir, instr, cf | zf, arg1, arg2, True)
 
 
 def cmovb(ir, instr, arg1, arg2):
@@ -2813,7 +2856,7 @@ def cmovns(ir, instr, arg1, arg2):
 def icebp(ir, instr):
     e = []
     e.append(m2_expr.ExprAff(exception_flags,
-                     m2_expr.ExprInt32(EXCEPT_PRIV_INSN)))
+                             m2_expr.ExprInt32(EXCEPT_PRIV_INSN)))
     return e, []
 # XXX
 
@@ -2826,7 +2869,7 @@ def l_int(ir, instr, a):
     else:
         except_int = EXCEPT_INT_XX
     e.append(m2_expr.ExprAff(exception_flags,
-                     m2_expr.ExprInt32(except_int)))
+                             m2_expr.ExprInt32(except_int)))
     e.append(m2_expr.ExprAff(interrupt_num, a))
     return e, []
 
@@ -2834,7 +2877,7 @@ def l_int(ir, instr, a):
 def l_sysenter(ir, instr):
     e = []
     e.append(m2_expr.ExprAff(exception_flags,
-                     m2_expr.ExprInt32(EXCEPT_PRIV_INSN)))
+                             m2_expr.ExprInt32(EXCEPT_PRIV_INSN)))
     return e, []
 
 # XXX
@@ -2843,7 +2886,7 @@ def l_sysenter(ir, instr):
 def l_out(ir, instr, a, b):
     e = []
     e.append(m2_expr.ExprAff(exception_flags,
-                     m2_expr.ExprInt32(EXCEPT_PRIV_INSN)))
+                             m2_expr.ExprInt32(EXCEPT_PRIV_INSN)))
     return e, []
 
 # XXX
@@ -2852,7 +2895,7 @@ def l_out(ir, instr, a, b):
 def l_outs(ir, instr, size):
     e = []
     e.append(m2_expr.ExprAff(exception_flags,
-                     m2_expr.ExprInt32(EXCEPT_PRIV_INSN)))
+                             m2_expr.ExprInt32(EXCEPT_PRIV_INSN)))
     return e, []
 
 # XXX actually, xlat performs al = (ds:[e]bx + ZeroExtend(al))
@@ -2861,7 +2904,7 @@ def l_outs(ir, instr, size):
 def xlat(ir, instr):
     e = []
     a = m2_expr.ExprCompose([(m2_expr.ExprInt(0, 24), 8, 32),
-                     (mRAX[instr.mode][0:8], 0, 8)])
+                             (mRAX[instr.mode][0:8], 0, 8)])
     b = m2_expr.ExprMem(m2_expr.ExprOp('+', mRBX[instr.mode], a), 8)
     e.append(m2_expr.ExprAff(mRAX[instr.mode][0:8], b))
     return e, []
@@ -2871,30 +2914,31 @@ def cpuid(ir, instr):
     e = []
     e.append(
         m2_expr.ExprAff(mRAX[instr.mode],
-        m2_expr.ExprOp('cpuid', mRAX[instr.mode], m2_expr.ExprInt(0, instr.mode))))
+                        m2_expr.ExprOp('cpuid', mRAX[instr.mode], m2_expr.ExprInt(0, instr.mode))))
     e.append(
         m2_expr.ExprAff(mRBX[instr.mode],
-        m2_expr.ExprOp('cpuid', mRAX[instr.mode], m2_expr.ExprInt(1, instr.mode))))
+                        m2_expr.ExprOp('cpuid', mRAX[instr.mode], m2_expr.ExprInt(1, instr.mode))))
     e.append(
         m2_expr.ExprAff(mRCX[instr.mode],
-        m2_expr.ExprOp('cpuid', mRAX[instr.mode], m2_expr.ExprInt(2, instr.mode))))
+                        m2_expr.ExprOp('cpuid', mRAX[instr.mode], m2_expr.ExprInt(2, instr.mode))))
     e.append(
         m2_expr.ExprAff(mRDX[instr.mode],
-        m2_expr.ExprOp('cpuid', mRAX[instr.mode], m2_expr.ExprInt(3, instr.mode))))
+                        m2_expr.ExprOp('cpuid', mRAX[instr.mode], m2_expr.ExprInt(3, instr.mode))))
     return e, []
 
 
 def bittest_get(a, b):
     b = b.zeroExtend(a.size)
     if isinstance(a, m2_expr.ExprMem):
-        b_mask = {16:4, 32:5, 64:6}
-        b_decal = {16:1, 32:3, 64:7}
+        b_mask = {16: 4, 32: 5, 64: 6}
+        b_decal = {16: 1, 32: 3, 64: 7}
         ptr = a.arg
-        off_bit = b.zeroExtend(a.size) & m2_expr.ExprInt((1<<b_mask[a.size])-1,
-                                                         a.size)
+        off_bit = b.zeroExtend(
+            a.size) & m2_expr.ExprInt((1 << b_mask[a.size]) - 1,
+                                      a.size)
         off_byte = ((b.zeroExtend(ptr.size) >> m2_expr.ExprInt_from(ptr, 3)) &
                     m2_expr.ExprInt_from(ptr,
-                                         ((1<<a.size)-1) ^ b_decal[a.size]))
+                                         ((1 << a.size) - 1) ^ b_decal[a.size]))
 
         d = m2_expr.ExprMem(ptr + off_byte, a.size)
     else:
@@ -2950,7 +2994,7 @@ def into(ir, instr):
 def l_in(ir, instr, a, b):
     e = []
     e.append(m2_expr.ExprAff(exception_flags,
-                     m2_expr.ExprInt32(EXCEPT_PRIV_INSN)))
+                             m2_expr.ExprInt32(EXCEPT_PRIV_INSN)))
     return e, []
 
 
@@ -2980,7 +3024,7 @@ def cmpxchg8b(arg1):
 def lds(ir, instr, a, b):
     e = []
     e.append(m2_expr.ExprAff(a, m2_expr.ExprMem(b.arg, size=a.size)))
-    DS_value = m2_expr.ExprMem(b.arg + m2_expr.ExprInt_from(b.arg, a.size/8),
+    DS_value = m2_expr.ExprMem(b.arg + m2_expr.ExprInt_from(b.arg, a.size / 8),
                                size=16)
     e.append(m2_expr.ExprAff(DS, DS_value))
     return e, []
@@ -2989,7 +3033,7 @@ def lds(ir, instr, a, b):
 def les(ir, instr, a, b):
     e = []
     e.append(m2_expr.ExprAff(a, m2_expr.ExprMem(b.arg, size=a.size)))
-    ES_value = m2_expr.ExprMem(b.arg + m2_expr.ExprInt_from(b.arg, a.size/8),
+    ES_value = m2_expr.ExprMem(b.arg + m2_expr.ExprInt_from(b.arg, a.size / 8),
                                size=16)
     e.append(m2_expr.ExprAff(ES, ES_value))
     return e, []
@@ -2998,23 +3042,25 @@ def les(ir, instr, a, b):
 def lss(ir, instr, a, b):
     e = []
     e.append(m2_expr.ExprAff(a, m2_expr.ExprMem(b.arg, size=a.size)))
-    SS_value = m2_expr.ExprMem(b.arg + m2_expr.ExprInt_from(b.arg, a.size/8),
+    SS_value = m2_expr.ExprMem(b.arg + m2_expr.ExprInt_from(b.arg, a.size / 8),
                                size=16)
     e.append(m2_expr.ExprAff(SS, SS_value))
     return e, []
 
+
 def lfs(ir, instr, a, b):
     e = []
     e.append(m2_expr.ExprAff(a, m2_expr.ExprMem(b.arg, size=a.size)))
-    FS_value = m2_expr.ExprMem(b.arg + m2_expr.ExprInt_from(b.arg, a.size/8),
+    FS_value = m2_expr.ExprMem(b.arg + m2_expr.ExprInt_from(b.arg, a.size / 8),
                                size=16)
     e.append(m2_expr.ExprAff(FS, FS_value))
     return e, []
 
+
 def lgs(ir, instr, a, b):
     e = []
     e.append(m2_expr.ExprAff(a, m2_expr.ExprMem(b.arg, size=a.size)))
-    GS_value = m2_expr.ExprMem(b.arg + m2_expr.ExprInt_from(b.arg, a.size/8),
+    GS_value = m2_expr.ExprMem(b.arg + m2_expr.ExprInt_from(b.arg, a.size / 8),
                                size=16)
     e.append(m2_expr.ExprAff(GS, GS_value))
     return e, []
@@ -3027,7 +3073,8 @@ def lahf(ir, instr):
             m2_expr.ExprInt1(0), zf, nf]
     for i in xrange(len(regs)):
         args.append((regs[i], i, i + 1))
-    e.append(m2_expr.ExprAff(mRAX[instr.mode][8:16], m2_expr.ExprCompose(args)))
+    e.append(
+        m2_expr.ExprAff(mRAX[instr.mode][8:16], m2_expr.ExprCompose(args)))
     return e, []
 
 
@@ -3085,6 +3132,7 @@ def movd(ir, instr, a, b):
         e.append(m2_expr.ExprAff(a, b[:32]))
     return e, []
 
+
 def movdqu(ir, instr, a, b):
     # XXX TODO alignement check
     return [m2_expr.ExprAff(a, b)], []
@@ -3101,6 +3149,12 @@ def andps(ir, instr, a, b):
     return e, []
 
 
+def andnps(ir, instr, a, b):
+    e = []
+    e.append(m2_expr.ExprAff(a, m2_expr.ExprOp('&', a ^ a.mask, b)))
+    return e, []
+
+
 def orps(ir, instr, a, b):
     e = []
     e.append(m2_expr.ExprAff(a, m2_expr.ExprOp('|', a, b)))
@@ -3114,22 +3168,28 @@ def xorps(ir, instr, a, b):
 
 
 def rdmsr(ir, instr):
-    msr_addr = m2_expr.ExprId('MSR') + m2_expr.ExprInt32(8) * mRCX[instr.mode][:32]
+    msr_addr = m2_expr.ExprId('MSR') + m2_expr.ExprInt32(
+        8) * mRCX[instr.mode][:32]
     e = []
-    e.append(m2_expr.ExprAff(mRAX[instr.mode][:32], m2_expr.ExprMem(msr_addr, 32)))
-    e.append(m2_expr.ExprAff(mRDX[instr.mode][:32], m2_expr.ExprMem(msr_addr + m2_expr.ExprInt_from(msr_addr, 4), 32)))
+    e.append(
+        m2_expr.ExprAff(mRAX[instr.mode][:32], m2_expr.ExprMem(msr_addr, 32)))
+    e.append(m2_expr.ExprAff(mRDX[instr.mode][:32], m2_expr.ExprMem(
+        msr_addr + m2_expr.ExprInt_from(msr_addr, 4), 32)))
     return e, []
 
+
 def wrmsr(ir, instr):
-    msr_addr = m2_expr.ExprId('MSR') + m2_expr.ExprInt32(8) * mRCX[instr.mode][:32]
+    msr_addr = m2_expr.ExprId('MSR') + m2_expr.ExprInt32(
+        8) * mRCX[instr.mode][:32]
     e = []
     src = m2_expr.ExprCompose([(mRAX[instr.mode][:32], 0, 32),
                                (mRDX[instr.mode][:32], 32, 64)])
     e.append(m2_expr.ExprAff(m2_expr.ExprMem(msr_addr, 64), src))
     return e, []
 
-### MMX/SSE/AVX operations
-###
+# MMX/SSE/AVX operations
+#
+
 
 def vec_op_clip(op, size):
     """
@@ -3145,40 +3205,52 @@ def vec_op_clip(op, size):
     return vec_op_clip_instr
 
 # Generic vertical operation
+
+
 def vec_vertical_sem(op, elt_size, reg_size, a, b):
     assert(reg_size % elt_size == 0)
-    n = reg_size/elt_size
+    n = reg_size / elt_size
     if op == '-':
-        ops = [((a[i*elt_size:(i+1)*elt_size] - b[i*elt_size:(i+1)*elt_size]),
-               i*elt_size, (i+1)*elt_size) for i in xrange(0, n)]
+        ops = [(
+            (a[i * elt_size:(i + 1) * elt_size]
+             - b[i * elt_size:(i + 1) * elt_size]),
+               i * elt_size, (i + 1) * elt_size) for i in xrange(0, n)]
     else:
-        ops = [(m2_expr.ExprOp(op, a[i*elt_size:(i+1)*elt_size],
-                               b[i*elt_size:(i+1)*elt_size]),
-                i*elt_size,
-                (i+1)*elt_size) for i in xrange(0, n)]
+        ops = [(m2_expr.ExprOp(op, a[i * elt_size:(i + 1) * elt_size],
+                               b[i * elt_size:(i + 1) * elt_size]),
+                i * elt_size,
+                (i + 1) * elt_size) for i in xrange(0, n)]
 
     return m2_expr.ExprCompose(ops)
 
+
 def float_vec_vertical_sem(op, elt_size, reg_size, a, b):
     assert(reg_size % elt_size == 0)
-    n = reg_size/elt_size
+    n = reg_size / elt_size
 
     x_to_int, int_to_x = {32: ('float_to_int_%d', 'int_%d_to_float'),
                           64: ('double_to_int_%d', 'int_%d_to_double')}[elt_size]
     if op == '-':
         ops = [(m2_expr.ExprOp(x_to_int % elt_size,
-                               m2_expr.ExprOp(int_to_x % elt_size, a[i*elt_size:(i+1)*elt_size]) -
-                               m2_expr.ExprOp(int_to_x % elt_size, b[i*elt_size:(i+1)*elt_size])),
-                i*elt_size, (i+1)*elt_size) for i in xrange(0, n)]
+                               m2_expr.ExprOp(int_to_x % elt_size, a[i * elt_size:(i + 1) * elt_size]) -
+                               m2_expr.ExprOp(
+                                   int_to_x % elt_size, b[i * elt_size:(
+                                       i + 1) * elt_size])),
+                i * elt_size, (i + 1) * elt_size) for i in xrange(0, n)]
     else:
         ops = [(m2_expr.ExprOp(x_to_int % elt_size,
                                m2_expr.ExprOp(op,
-                                              m2_expr.ExprOp(int_to_x % elt_size, a[i*elt_size:(i+1)*elt_size]),
-                                              m2_expr.ExprOp(int_to_x % elt_size, b[i*elt_size:(i+1)*elt_size]))),
-                i*elt_size, (i+1)*elt_size) for i in xrange(0, n)]
+                                              m2_expr.ExprOp(
+                                                  int_to_x % elt_size, a[i * elt_size:(
+                                                      i + 1) * elt_size]),
+                                              m2_expr.ExprOp(
+                                                  int_to_x % elt_size, b[i * elt_size:(
+                                                      i + 1) * elt_size]))),
+                i * elt_size, (i + 1) * elt_size) for i in xrange(0, n)]
 
     return m2_expr.ExprCompose(ops)
 
+
 def __vec_vertical_instr_gen(op, elt_size, sem):
     def vec_instr(ir, instr, a, b):
         e = []
@@ -3189,17 +3261,19 @@ def __vec_vertical_instr_gen(op, elt_size, sem):
         return e, []
     return vec_instr
 
+
 def vec_vertical_instr(op, elt_size):
     return __vec_vertical_instr_gen(op, elt_size, vec_vertical_sem)
 
+
 def float_vec_vertical_instr(op, elt_size):
     return __vec_vertical_instr_gen(op, elt_size, float_vec_vertical_sem)
 
-### Integer arithmetic
-###
+# Integer arithmetic
+#
 
-## Additions
-##
+# Additions
+#
 
 # SSE
 paddb = vec_vertical_instr('+', 8)
@@ -3207,8 +3281,8 @@ paddw = vec_vertical_instr('+', 16)
 paddd = vec_vertical_instr('+', 32)
 paddq = vec_vertical_instr('+', 64)
 
-## Substractions
-##
+# Substractions
+#
 
 # SSE
 psubb = vec_vertical_instr('-', 8)
@@ -3216,8 +3290,8 @@ psubw = vec_vertical_instr('-', 16)
 psubd = vec_vertical_instr('-', 32)
 psubq = vec_vertical_instr('-', 64)
 
-### Floating-point arithmetic
-###
+# Floating-point arithmetic
+#
 
 # SSE
 addss = vec_op_clip('+', 32)
@@ -3237,10 +3311,12 @@ divsd = vec_op_clip('/', 64)
 divps = float_vec_vertical_instr('/', 32)
 divpd = float_vec_vertical_instr('/', 64)
 
-### Logical (floating-point)
-###
+# Logical (floating-point)
+#
 
 # MMX/SSE/AVX
+
+
 def pand(ir, instr, a, b):
     e = []
     c = a & b
@@ -3249,6 +3325,14 @@ def pand(ir, instr, a, b):
     return e, []
 
 
+def pandn(ir, instr, a, b):
+    e = []
+    c = (a ^ a.mask) & b
+    # No flag affected
+    e.append(m2_expr.ExprAff(a, c))
+    return e, []
+
+
 def por(ir, instr, a, b):
     e = []
     c = a | b
@@ -3261,139 +3345,204 @@ def pminsw(ir, instr, a, b):
     e.append(m2_expr.ExprAff(a, m2_expr.ExprCond((a - b).msb(), a, b)))
     return e, []
 
+
 def cvtdq2pd(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:64], m2_expr.ExprOp('int_32_to_double', b[:32])))
-    e.append(m2_expr.ExprAff(a[64:128], m2_expr.ExprOp('int_32_to_double', b[32:64])))
+    e.append(
+        m2_expr.ExprAff(a[:64], m2_expr.ExprOp('int_32_to_double', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[64:128], m2_expr.ExprOp('int_32_to_double', b[32:64])))
     return e, []
 
+
 def cvtdq2ps(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('int_32_to_float', b[:32])))
-    e.append(m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('int_32_to_float', b[32:64])))
-    e.append(m2_expr.ExprAff(a[64:96], m2_expr.ExprOp('int_32_to_float', b[64:96])))
-    e.append(m2_expr.ExprAff(a[96:128], m2_expr.ExprOp('int_32_to_float', b[96:128])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('int_32_to_float', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('int_32_to_float', b[32:64])))
+    e.append(
+        m2_expr.ExprAff(a[64:96], m2_expr.ExprOp('int_32_to_float', b[64:96])))
+    e.append(
+        m2_expr.ExprAff(a[96:128], m2_expr.ExprOp('int_32_to_float', b[96:128])))
     return e, []
 
+
 def cvtpd2dq(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_to_int_32', b[:64])))
-    e.append(m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('double_to_int_32', b[64:128])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_to_int_32', b[:64])))
+    e.append(
+        m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('double_to_int_32', b[64:128])))
     e.append(m2_expr.ExprAff(a[64:128], m2_expr.ExprInt64(0)))
     return e, []
 
+
 def cvtpd2pi(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_to_int_32', b[:64])))
-    e.append(m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('double_to_int_32', b[64:128])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_to_int_32', b[:64])))
+    e.append(
+        m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('double_to_int_32', b[64:128])))
     return e, []
 
+
 def cvtpd2ps(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_to_float', b[:64])))
-    e.append(m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('double_to_float', b[64:128])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_to_float', b[:64])))
+    e.append(
+        m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('double_to_float', b[64:128])))
     e.append(m2_expr.ExprAff(a[64:128], m2_expr.ExprInt64(0)))
     return e, []
 
+
 def cvtpi2pd(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:64], m2_expr.ExprOp('int_32_to_double', b[:32])))
-    e.append(m2_expr.ExprAff(a[64:128], m2_expr.ExprOp('int_32_to_double', b[32:64])))
+    e.append(
+        m2_expr.ExprAff(a[:64], m2_expr.ExprOp('int_32_to_double', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[64:128], m2_expr.ExprOp('int_32_to_double', b[32:64])))
     return e, []
 
+
 def cvtpi2ps(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('int_32_to_float', b[:32])))
-    e.append(m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('int_32_to_float', b[32:64])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('int_32_to_float', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('int_32_to_float', b[32:64])))
     return e, []
 
+
 def cvtps2dq(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('float_to_int_32', b[:32])))
-    e.append(m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('float_to_int_32', b[32:64])))
-    e.append(m2_expr.ExprAff(a[64:96], m2_expr.ExprOp('float_to_int_32', b[64:96])))
-    e.append(m2_expr.ExprAff(a[96:128], m2_expr.ExprOp('float_to_int_32', b[96:128])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('float_to_int_32', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('float_to_int_32', b[32:64])))
+    e.append(
+        m2_expr.ExprAff(a[64:96], m2_expr.ExprOp('float_to_int_32', b[64:96])))
+    e.append(
+        m2_expr.ExprAff(a[96:128], m2_expr.ExprOp('float_to_int_32', b[96:128])))
     return e, []
 
+
 def cvtps2pd(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:64], m2_expr.ExprOp('float_to_double', b[:32])))
-    e.append(m2_expr.ExprAff(a[64:128], m2_expr.ExprOp('float_to_double', b[32:64])))
+    e.append(
+        m2_expr.ExprAff(a[:64], m2_expr.ExprOp('float_to_double', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[64:128], m2_expr.ExprOp('float_to_double', b[32:64])))
     return e, []
 
+
 def cvtps2pi(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('float_to_int_32', b[:32])))
-    e.append(m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('float_to_int_32', b[32:64])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('float_to_int_32', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('float_to_int_32', b[32:64])))
     return e, []
 
+
 def cvtsd2si(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_to_int_32', b[:64])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_to_int_32', b[:64])))
     return e, []
 
+
 def cvtsd2ss(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_to_float', b[:64])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_to_float', b[:64])))
     return e, []
 
+
 def cvtsi2sd(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:64], m2_expr.ExprOp('int_32_to_double', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[:64], m2_expr.ExprOp('int_32_to_double', b[:32])))
     return e, []
 
+
 def cvtsi2ss(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('int_32_to_float', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('int_32_to_float', b[:32])))
     return e, []
 
+
 def cvtss2sd(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:64], m2_expr.ExprOp('float_to_double', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[:64], m2_expr.ExprOp('float_to_double', b[:32])))
     return e, []
 
+
 def cvtss2si(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('float_to_int_32', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('float_to_int_32', b[:32])))
     return e, []
 
+
 def cvttpd2pi(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_trunc_to_int_32', b[:64])))
-    e.append(m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('double_trunc_to_int_32', b[64:128])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_trunc_to_int_32', b[:64])))
+    e.append(
+        m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('double_trunc_to_int_32', b[64:128])))
     return e, []
 
+
 def cvttpd2dq(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_trunc_to_int_32', b[:64])))
-    e.append(m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('double_trunc_to_int_32', b[64:128])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_trunc_to_int_32', b[:64])))
+    e.append(
+        m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('double_trunc_to_int_32', b[64:128])))
     e.append(m2_expr.ExprAff(a[64:128], m2_expr.ExprInt64(0)))
     return e, []
 
+
 def cvttps2dq(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('float_trunc_to_int_32', b[:32])))
-    e.append(m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('float_trunc_to_int_32', b[32:64])))
-    e.append(m2_expr.ExprAff(a[64:96], m2_expr.ExprOp('float_trunc_to_int_32', b[64:96])))
-    e.append(m2_expr.ExprAff(a[96:128], m2_expr.ExprOp('float_trunc_to_int_32', b[96:128])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('float_trunc_to_int_32', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('float_trunc_to_int_32', b[32:64])))
+    e.append(
+        m2_expr.ExprAff(a[64:96], m2_expr.ExprOp('float_trunc_to_int_32', b[64:96])))
+    e.append(
+        m2_expr.ExprAff(a[96:128], m2_expr.ExprOp('float_trunc_to_int_32', b[96:128])))
     return e, []
 
+
 def cvttps2pi(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('float_trunc_to_int_32', b[:32])))
-    e.append(m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('float_trunc_to_int_32', b[32:64])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('float_trunc_to_int_32', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[32:64], m2_expr.ExprOp('float_trunc_to_int_32', b[32:64])))
     return e, []
 
+
 def cvttsd2si(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_trunc_to_int_32', b[:64])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('double_trunc_to_int_32', b[:64])))
     return e, []
 
+
 def cvttss2si(ir, instr, a, b):
     e = []
-    e.append(m2_expr.ExprAff(a[:32], m2_expr.ExprOp('float_trunc_to_int_32', b[:32])))
+    e.append(
+        m2_expr.ExprAff(a[:32], m2_expr.ExprOp('float_trunc_to_int_32', b[:32])))
     return e, []
 
+
 def movss(ir, instr, a, b):
     e = []
     if not isinstance(a, m2_expr.ExprMem) and not isinstance(b, m2_expr.ExprMem):
@@ -3422,6 +3571,89 @@ def ucomiss(ir, instr, a, b):
     return e, []
 
 
+def pshufb(ir, instr, a, b):
+    e = []
+    if a.size == 64:
+        bit_l = 3
+    elif a.size == 128:
+        bit_l = 4
+    else:
+        raise NotImplementedError("bad size")
+    for i in xrange(0, b.size, 8):
+        index = b[i:i + bit_l].zeroExtend(a.size) << m2_expr.ExprInt(3, a.size)
+        value = (a >> index)[:8]
+        e.append(m2_expr.ExprAff(a[i:i + 8],
+                                 m2_expr.ExprCond(b[i + 7:i + 8],
+                                                  m2_expr.ExprInt8(0),
+                                                  value)))
+    return e, []
+
+
+def pshufd(ir, instr, a, b, c):
+    e = []
+    for i in xrange(4):
+        index = c[2 * i:2 * (i + 1)].zeroExtend(a.size)
+        index <<= m2_expr.ExprInt(5, a.size)
+        value = (a >> index)[:32]
+        e.append(m2_expr.ExprAff(a[32 * i:32 * (i + 1)], value))
+    return e, []
+
+
+def ps_rl_ll(ir, instr, a, b, op, size):
+    lbl_zero = m2_expr.ExprId(ir.gen_label(), ir.IRDst.size)
+    lbl_do = m2_expr.ExprId(ir.gen_label(), ir.IRDst.size)
+    lbl_next = m2_expr.ExprId(ir.get_next_label(instr), ir.IRDst.size)
+
+    if b.size == 8:
+        count = b.zeroExtend(a.size)
+    else:
+        count = b.zeroExtend(a.size)
+
+    mask = {16: 0xF,
+            32: 0x1F,
+            64: 0x3F}[size]
+    test = count & m2_expr.ExprInt(((1 << a.size) - 1) ^ mask, a.size)
+    e = [m2_expr.ExprAff(ir.IRDst, m2_expr.ExprCond(test,
+                                                    lbl_zero,
+                                                    lbl_do))]
+
+    e_zero = [m2_expr.ExprAff(a, m2_expr.ExprInt(0, a.size)),
+              m2_expr.ExprAff(ir.IRDst, lbl_next)]
+
+    e_do = []
+    for i in xrange(0, a.size, size):
+        e.append(m2_expr.ExprAff(a[i:i + size], m2_expr.ExprOp(op,
+                                                               a[i:i + size],
+                                                               count[:size])))
+
+    e_do.append(m2_expr.ExprAff(ir.IRDst, lbl_next))
+    return e, [irbloc(lbl_do.name, [e_do]), irbloc(lbl_zero.name, [e_zero])]
+
+
+def psrlw(ir, instr, a, b):
+    return ps_rl_ll(ir, instr, a, b, ">>", 16)
+
+
+def psrld(ir, instr, a, b):
+    return ps_rl_ll(ir, instr, a, b, ">>", 32)
+
+
+def psrlq(ir, instr, a, b):
+    return ps_rl_ll(ir, instr, a, b, ">>", 64)
+
+
+def psllw(ir, instr, a, b):
+    return ps_rl_ll(ir, instr, a, b, "<<", 16)
+
+
+def pslld(ir, instr, a, b):
+    return ps_rl_ll(ir, instr, a, b, "<<",  32)
+
+
+def psllq(ir, instr, a, b):
+    return ps_rl_ll(ir, instr, a, b, "<<",  64)
+
+
 def iret(ir, instr):
     """IRET implementation
     XXX: only support "no-privilege change"
@@ -3433,6 +3665,299 @@ def iret(ir, instr):
     return exprs, []
 
 
+def pmaxu(ir, instr, a, b, size):
+    e = []
+    for i in xrange(0, a.size, size):
+        op1 = a[i:i + size]
+        op2 = b[i:i + size]
+        res = op1 - op2
+        # Compote CF in @res = @op1 - @op2
+        ret = (((op1 ^ op2) ^ res) ^ ((op1 ^ res) & (op1 ^ op2))).msb()
+
+        e.append(m2_expr.ExprAff(a[i:i + size],
+                                 m2_expr.ExprCond(ret,
+                                                  b[i:i + size],
+                                                  a[i:i + size])))
+    return e, []
+
+
+def pmaxub(ir, instr, a, b):
+    return pmaxu(ir, instr, a, b, 8)
+
+
+def pmaxuw(ir, instr, a, b):
+    return pmaxu(ir, instr, a, b, 16)
+
+
+def pmaxud(ir, instr, a, b):
+    return pmaxu(ir, instr, a, b, 32)
+
+
+def pminu(ir, instr, a, b, size):
+    e = []
+    for i in xrange(0, a.size, size):
+        op1 = a[i:i + size]
+        op2 = b[i:i + size]
+        res = op1 - op2
+        # Compote CF in @res = @op1 - @op2
+        ret = (((op1 ^ op2) ^ res) ^ ((op1 ^ res) & (op1 ^ op2))).msb()
+
+        e.append(m2_expr.ExprAff(a[i:i + size],
+                                 m2_expr.ExprCond(ret,
+                                                  a[i:i + size],
+                                                  b[i:i + size])))
+    return e, []
+
+
+def pminub(ir, instr, a, b):
+    return pminu(ir, instr, a, b, 8)
+
+
+def pminuw(ir, instr, a, b):
+    return pminu(ir, instr, a, b, 16)
+
+
+def pminud(ir, instr, a, b):
+    return pminu(ir, instr, a, b, 32)
+
+
+def pcmpeq(ir, instr, a, b, size):
+    e = []
+    for i in xrange(0, a.size, size):
+        test = a[i:i + size] - b[i:i + size]
+        e.append(m2_expr.ExprAff(a[i:i + size],
+                                 m2_expr.ExprCond(test,
+                                                  m2_expr.ExprInt(0, size),
+                                                  m2_expr.ExprInt(-1, size))))
+    return e, []
+
+
+def pcmpeqb(ir, instr, a, b):
+    return pcmpeq(ir, instr, a, b, 8)
+
+
+def pcmpeqw(ir, instr, a, b):
+    return pcmpeq(ir, instr, a, b, 16)
+
+
+def pcmpeqd(ir, instr, a, b):
+    return pcmpeq(ir, instr, a, b, 32)
+
+
+def punpck(ir, instr, a, b, size, off):
+    e = []
+    for i in xrange(a.size / (2 * size)):
+        src1 = a[size * i + off: size * i + off + size]
+        src2 = b[size * i + off: size * i + off + size]
+        e.append(m2_expr.ExprAff(a[size * 2 * i: size * 2 * i + size], src1))
+        e.append(
+            m2_expr.ExprAff(a[size * (2 * i + 1): size * (2 * i + 1) + size], src2))
+    return e, []
+
+
+def punpckhbw(ir, instr, a, b):
+    return punpck(ir, instr, a, b, 8, a.size / 2)
+
+
+def punpckhwd(ir, instr, a, b):
+    return punpck(ir, instr, a, b, 16, a.size / 2)
+
+
+def punpckhdq(ir, instr, a, b):
+    return punpck(ir, instr, a, b, 32, a.size / 2)
+
+
+def punpckhqdq(ir, instr, a, b):
+    return punpck(ir, instr, a, b, 64, a.size / 2)
+
+
+def punpcklbw(ir, instr, a, b):
+    return punpck(ir, instr, a, b, 8, 0)
+
+
+def punpcklwd(ir, instr, a, b):
+    return punpck(ir, instr, a, b, 16, 0)
+
+
+def punpckldq(ir, instr, a, b):
+    return punpck(ir, instr, a, b, 32, 0)
+
+
+def punpcklqdq(ir, instr, a, b):
+    return punpck(ir, instr, a, b, 64, 0)
+
+
+def pinsr(ir, instr, a, b, c, size):
+    e = []
+
+    mask = {8: 0xF,
+            16: 0x7,
+            32: 0x3,
+            64: 0x1}[size]
+
+    sel = (int(c.arg) & mask) * size
+    e.append(m2_expr.ExprAff(a[sel:sel + size], b[:size]))
+
+    return e, []
+
+
+def pinsrb(ir, instr, a, b, c):
+    return pinsr(ir, instr, a, b, c, 8)
+
+
+def pinsrw(ir, instr, a, b, c):
+    return pinsr(ir, instr, a, b, c, 16)
+
+
+def pinsrd(ir, instr, a, b, c):
+    return pinsr(ir, instr, a, b, c, 32)
+
+
+def pinsrq(ir, instr, a, b, c):
+    return pinsr(ir, instr, a, b, c, 64)
+
+
+def pextr(ir, instr, a, b, c, size):
+    e = []
+
+    mask = {8: 0xF,
+            16: 0x7,
+            32: 0x3,
+            64: 0x1}[size]
+
+    sel = (int(c.arg) & mask) * size
+    e.append(m2_expr.ExprAff(a, b[sel:sel + size].zeroExtend(a.size)))
+
+    return e, []
+
+
+def pextrb(ir, instr, a, b, c):
+    return pextr(ir, instr, a, b, c, 8)
+
+
+def pextrw(ir, instr, a, b, c):
+    return pextr(ir, instr, a, b, c, 16)
+
+
+def pextrd(ir, instr, a, b, c):
+    return pextr(ir, instr, a, b, c, 32)
+
+
+def pextrq(ir, instr, a, b, c):
+    return pextr(ir, instr, a, b, c, 64)
+
+
+def unpckhps(ir, instr, a, b):
+    e = []
+    src = m2_expr.ExprCompose([(a[64:96], 0, 32),
+                               (b[64:96], 32, 64),
+                               (a[96:128], 64, 96),
+                               (b[96:128], 96, 128)])
+    e.append(m2_expr.ExprAff(a, src))
+    return e, []
+
+
+def unpckhpd(ir, instr, a, b):
+    e = []
+    src = m2_expr.ExprCompose([(a[64:128], 0, 64),
+                               (b[64:128], 64, 128)])
+    e.append(m2_expr.ExprAff(a, src))
+    return e, []
+
+
+def unpcklps(ir, instr, a, b):
+    e = []
+    src = m2_expr.ExprCompose([(a[0:32], 0, 32),
+                               (b[0:32], 32, 64),
+                               (a[32:64], 64, 96),
+                               (b[32:64], 96, 128)])
+    e.append(m2_expr.ExprAff(a, src))
+    return e, []
+
+
+def unpcklpd(ir, instr, a, b):
+    e = []
+    src = m2_expr.ExprCompose([(a[0:64], 0, 64),
+                               (b[0:64], 64, 128)])
+    e.append(m2_expr.ExprAff(a, src))
+    return e, []
+
+
+def movlpd(ir, instr, a, b):
+    e = []
+    e.append(m2_expr.ExprAff(a[:64], b[:64]))
+    return e, []
+
+
+def movlps(ir, instr, a, b):
+    e = []
+    e.append(m2_expr.ExprAff(a[:64], b[:64]))
+    return e, []
+
+
+def movhpd(ir, instr, a, b):
+    e = []
+    e.append(m2_expr.ExprAff(a[64:128], b[:64]))
+    return e, []
+
+
+def movhps(ir, instr, a, b):
+    e = []
+    e.append(m2_expr.ExprAff(a[64:128], b[:64]))
+    return e, []
+
+
+def movdq2q(ir, instr, a, b):
+    e = []
+    e.append(m2_expr.ExprAff(a, b[:64]))
+    return e, []
+
+
+def sqrt_gen(ir, instr, a, b, size):
+    e = []
+    out = []
+    for i in b.size / size:
+        out.append((m2_expr.ExprOp('fsqrt' % size,
+                                   b[i * size: (i + 1) * size]),
+                    i * size, (i + 1) * size))
+    src = m2_expr.ExprCompose(out)
+    e.append(m2_expr.ExprAff(a, src))
+    return e, []
+
+
+def sqrtpd(ir, instr, a, b):
+    return sqrt_gen(ir, instr, a, b, 64)
+
+
+def sqrtps(ir, instr, a, b):
+    return sqrt_gen(ir, instr, a, b, 32)
+
+
+def sqrtsd(ir, instr, a, b):
+    e = []
+    e.append(m2_expr.ExprAff(a[:64],
+                             m2_expr.ExprOp('fsqrt',
+                                            b[:64])))
+    return e, []
+
+
+def sqrtss(ir, instr, a, b):
+    e = []
+    e.append(m2_expr.ExprAff(a[:32],
+                             m2_expr.ExprOp('fsqrt',
+                                            b[:32])))
+    return e, []
+
+
+def pmovmskb(ir, instr, a, b):
+    e = []
+    out = []
+    for i in xrange(b.size / 8):
+        out.append((b[8 * i + 7:8 * (i + 1)], i, i + 1))
+    src = m2_expr.ExprCompose(out)
+    e.append(m2_expr.ExprAff(a, src.zeroExtend(a.size)))
+    return e, []
+
 mnemo_func = {'mov': mov,
               'xchg': xchg,
               'movzx': movzx,
@@ -3676,8 +4201,8 @@ mnemo_func = {'mov': mov,
               'cmovz': cmovz,
               'cmove': cmovz,
               'cmovnz': cmovnz,
-              'cmovpe':cmovpe,
-              'cmovnp':cmovnp,
+              'cmovpe': cmovpe,
+              'cmovnp': cmovnp,
               'cmovge': cmovge,
               'cmovnl': cmovge,
               'cmovg': cmovg,
@@ -3736,14 +4261,16 @@ mnemo_func = {'mov': mov,
               "fnclex": fnclex,
               "str": l_str,
               "movd": movd,
-              "movdqu":movdqu,
-              "movdqa":movdqu,
-              "movapd": movapd, # XXX TODO alignement check
-              "movupd": movapd, # XXX TODO alignement check
-              "movaps": movapd, # XXX TODO alignement check
-              "movups": movapd, # XXX TODO alignement check
+              "movdqu": movdqu,
+              "movdqa": movdqu,
+              "movapd": movapd,  # XXX TODO alignement check
+              "movupd": movapd,  # XXX TODO alignement check
+              "movaps": movapd,  # XXX TODO alignement check
+              "movups": movapd,  # XXX TODO alignement check
               "andps": andps,
               "andpd": andps,
+              "andnps": andnps,
+              "andnpd": andnps,
               "orps": orps,
               "orpd": orps,
               "xorps": xorps,
@@ -3787,13 +4314,13 @@ mnemo_func = {'mov': mov,
 
               "ucomiss": ucomiss,
 
-              ####
-              #### MMX/AVX/SSE operations
+              #
+              # MMX/AVX/SSE operations
 
-              ### Arithmetic (integers)
-              ###
+              # Arithmetic (integers)
+              #
 
-              ## Additions
+              # Additions
               # SSE
               "paddb": paddb,
               "paddw": paddw,
@@ -3807,45 +4334,108 @@ mnemo_func = {'mov': mov,
               "psubd": psubd,
               "psubq": psubq,
 
-              ### Arithmetic (floating-point)
-              ###
+              # Arithmetic (floating-point)
+              #
 
-              ## Additions
+              # Additions
               # SSE
               "addss": addss,
               "addsd": addsd,
               "addps": addps,
               "addpd": addpd,
 
-              ## Substractions
+              # Substractions
               # SSE
               "subss": subss,
               "subsd": subsd,
               "subps": subps,
               "subpd": subpd,
 
-              ## Multiplications
+              # Multiplications
               # SSE
               "mulss": mulss,
               "mulsd": mulsd,
               "mulps": mulps,
               "mulpd": mulpd,
 
-              ## Divisions
+              # Divisions
               # SSE
               "divss": divss,
               "divsd": divsd,
               "divps": divps,
               "divpd": divpd,
 
-              ### Logical (floating-point)
-              ###
+              # Logical (floating-point)
+              #
 
               "pand": pand,
+              "pandn": pandn,
               "por": por,
 
               "rdmsr": rdmsr,
               "wrmsr": wrmsr,
+              "pshufb": pshufb,
+              "pshufd": pshufd,
+
+              "psrlw": psrlw,
+              "psrld": psrld,
+              "psrlq": psrlq,
+              "psllw": psllw,
+              "pslld": pslld,
+              "psllq": psllq,
+
+              "pmaxub": pmaxub,
+              "pmaxuw": pmaxuw,
+              "pmaxud": pmaxud,
+
+              "pminub": pminub,
+              "pminuw": pminuw,
+              "pminud": pminud,
+
+              "pcmpeqb": pcmpeqb,
+              "pcmpeqw": pcmpeqw,
+              "pcmpeqd": pcmpeqd,
+
+              "punpckhbw": punpckhbw,
+              "punpckhwd": punpckhwd,
+              "punpckhdq": punpckhdq,
+              "punpckhqdq": punpckhqdq,
+
+
+              "punpcklbw": punpcklbw,
+              "punpcklwd": punpcklwd,
+              "punpckldq": punpckldq,
+              "punpcklqdq": punpcklqdq,
+
+              "pinsrb": pinsrb,
+              "pinsrw": pinsrw,
+              "pinsrd": pinsrd,
+              "pinsrq": pinsrq,
+
+              "pextrb": pextrb,
+              "pextrw": pextrw,
+              "pextrd": pextrd,
+              "pextrq": pextrq,
+
+              "unpckhps": unpckhps,
+              "unpckhpd": unpckhpd,
+              "unpcklps": unpcklps,
+              "unpcklpd": unpcklpd,
+
+              "movlpd": movlpd,
+              "movlps": movlps,
+              "movhpd": movhpd,
+              "movhps": movhps,
+              "movlhps": movhps,
+              "movhlps": movlps,
+              "movdq2q": movdq2q,
+
+              "sqrtpd": sqrtpd,
+              "sqrtps": sqrtps,
+              "sqrtsd": sqrtsd,
+              "sqrtss": sqrtss,
+
+              "pmovmskb": pmovmskb,
 
               }
 
@@ -3881,7 +4471,8 @@ class ir_x86_16(ir):
                                                              a.arg), a.size)
 
         if not instr.name.lower() in mnemo_func:
-            raise NotImplementedError("Mnemonic %s not implemented" % instr.name)
+            raise NotImplementedError(
+                "Mnemonic %s not implemented" % instr.name)
 
         instr_ir, extra_ir = mnemo_func[
             instr.name.lower()](self, instr, *args)
@@ -3960,8 +4551,8 @@ class ir_x86_16(ir):
                 if destination is a 32 bit reg, zero extend the 64 bit reg
                 """
                 if mode == 64:
-                    if (isinstance(e.dst, m2_expr.ExprId) and \
-                            e.dst.size == 32 and \
+                    if (isinstance(e.dst, m2_expr.ExprId) and
+                            e.dst.size == 32 and
                             e.dst in replace_regs[64]):
                         src = self.expr_fix_regs_for_mode(e.src, mode)
                         dst = replace_regs[64][e.dst].arg
diff --git a/miasm2/jitter/arch/JitCore_x86.c b/miasm2/jitter/arch/JitCore_x86.c
index 9d15cd20..12fc1b7e 100644
--- a/miasm2/jitter/arch/JitCore_x86.c
+++ b/miasm2/jitter/arch/JitCore_x86.c
@@ -45,6 +45,15 @@ reg_dict gpreg_dict[] = { {.name = "RAX", .offset = offsetof(vm_cpu_t, RAX)},
 			  {.name = "FS", .offset = offsetof(vm_cpu_t, FS)},
 			  {.name = "GS", .offset = offsetof(vm_cpu_t, GS)},
 
+			  {.name = "MM0", .offset = offsetof(vm_cpu_t, MM0)},
+			  {.name = "MM1", .offset = offsetof(vm_cpu_t, MM1)},
+			  {.name = "MM2", .offset = offsetof(vm_cpu_t, MM2)},
+			  {.name = "MM3", .offset = offsetof(vm_cpu_t, MM3)},
+			  {.name = "MM4", .offset = offsetof(vm_cpu_t, MM4)},
+			  {.name = "MM5", .offset = offsetof(vm_cpu_t, MM5)},
+			  {.name = "MM6", .offset = offsetof(vm_cpu_t, MM6)},
+			  {.name = "MM7", .offset = offsetof(vm_cpu_t, MM7)},
+
 };
 
 
@@ -96,6 +105,16 @@ PyObject* cpu_get_gpreg(JitCpu* self)
     get_reg(FS);
     get_reg(GS);
 
+    get_reg(MM0);
+    get_reg(MM1);
+    get_reg(MM2);
+    get_reg(MM3);
+    get_reg(MM4);
+    get_reg(MM5);
+    get_reg(MM6);
+    get_reg(MM7);
+
+
     return dict;
 }
 
@@ -475,6 +494,15 @@ getset_reg_R_u16(BP);
 
 getset_reg_R_u16(IP);
 
+getset_reg_u64(MM0);
+getset_reg_u64(MM1);
+getset_reg_u64(MM2);
+getset_reg_u64(MM3);
+getset_reg_u64(MM4);
+getset_reg_u64(MM5);
+getset_reg_u64(MM6);
+getset_reg_u64(MM7);
+
 
 PyObject* get_gpreg_offset_all(void)
 {
@@ -750,6 +778,15 @@ static PyGetSetDef JitCpu_getseters[] = {
 
     {"IP", (getter)JitCpu_get_IP, (setter)JitCpu_set_IP, "IP", NULL},
 
+    {"MM0", (getter)JitCpu_get_MM0, (setter)JitCpu_set_MM0, "MM0", NULL},
+    {"MM1", (getter)JitCpu_get_MM1, (setter)JitCpu_set_MM1, "MM1", NULL},
+    {"MM2", (getter)JitCpu_get_MM2, (setter)JitCpu_set_MM2, "MM2", NULL},
+    {"MM3", (getter)JitCpu_get_MM3, (setter)JitCpu_set_MM3, "MM3", NULL},
+    {"MM4", (getter)JitCpu_get_MM4, (setter)JitCpu_set_MM4, "MM4", NULL},
+    {"MM5", (getter)JitCpu_get_MM5, (setter)JitCpu_set_MM5, "MM5", NULL},
+    {"MM6", (getter)JitCpu_get_MM6, (setter)JitCpu_set_MM6, "MM6", NULL},
+    {"MM7", (getter)JitCpu_get_MM7, (setter)JitCpu_set_MM7, "MM7", NULL},
+
 
     {NULL}  /* Sentinel */
 };
diff --git a/test/arch/x86/arch.py b/test/arch/x86/arch.py
index 7d6260a2..dfe4ef91 100644
--- a/test/arch/x86/arch.py
+++ b/test/arch/x86/arch.py
@@ -2238,6 +2238,33 @@ reg_tests = [
     (m32, "00000000    MOVAPD     XMMWORD PTR [EBP+0xFFFFFFB8], XMM0",
      "660f2945b8"),
 
+
+    (m32, "00000000    MOVLPD     XMM0, QWORD PTR [ESP+0x4]",
+     "660F12442404"),
+    (m32, "00000000    MOVLPS     XMM0, QWORD PTR [ESP+0x4]",
+     "0F12442404"),
+    (m32, "00000000    MOVLPD     QWORD PTR [ESP+0x4], XMM0",
+     "660F13442404"),
+    (m32, "00000000    MOVLPS     QWORD PTR [ESP+0x4], XMM0",
+     "0F13442404"),
+
+    (m32, "00000000    MOVHPD     XMM0, QWORD PTR [ESP+0x4]",
+     "660F16442404"),
+    (m32, "00000000    MOVHPS     XMM0, QWORD PTR [ESP+0x4]",
+     "0F16442404"),
+    (m32, "00000000    MOVHPD     QWORD PTR [ESP+0x4], XMM0",
+     "660F17442404"),
+    (m32, "00000000    MOVHPS     QWORD PTR [ESP+0x4], XMM0",
+     "0F17442404"),
+
+    (m32, "00000000    MOVLHPS    XMM2, XMM1",
+     "0F16D1"),
+    (m32, "00000000    MOVHLPS    XMM2, XMM1",
+     "0F12D1"),
+
+    (m32, "00000000    MOVDQ2Q    MM2, XMM1",
+     "F20Fd6D1"),
+
     (m32, "00000000    MOVUPS     XMM2, XMMWORD PTR [ECX]",
      "0f1011"),
     (m32, "00000000    MOVSD      XMM2, QWORD PTR [ECX]",
@@ -2287,6 +2314,11 @@ reg_tests = [
      "0f548327bd2c00"),
     (m32, "00000000    ANDPD      XMM0, XMMWORD PTR [EBX+0x2CBD27]",
      "660f548327bd2c00"),
+    (m32, "00000000    ANDNPS     XMM0, XMMWORD PTR [EBX+0x2CBD27]",
+     "0f558327bd2c00"),
+    (m32, "00000000    ANDNPD     XMM0, XMMWORD PTR [EBX+0x2CBD27]",
+     "660f558327bd2c00"),
+
 
     (m32, "00000000    SUBSD      XMM1, XMM0",
      "f20f5cc8"),
@@ -2434,6 +2466,18 @@ reg_tests = [
     (m32, "00000000    PAND       XMM0, XMM4",
      "660fdbc4"),
 
+    (m32, "00000000    PANDN      MM2, MM6",
+     "0fdfd6"),
+    (m32, "00000000    PANDN      XMM2, XMM6",
+     "660fdfd6"),
+
+
+    (m32, "00000000    PANDN      MM0, MM4",
+     "0fdfc4"),
+    (m32, "00000000    PANDN      XMM0, XMM4",
+     "660fdfc4"),
+
+
     (m32, "00000000    POR        XMM0, XMM1",
      "660febc1"),
     (m32, "00000000    POR        XMM6, XMMWORD PTR [ECX+0x10]",
@@ -2574,6 +2618,245 @@ reg_tests = [
     (m32, "00000000    COMISD     XMM7, XMM6",
     "660F2FFE"),
 
+    (m32, "00000000    PSHUFB     MM6, QWORD PTR [ESI]",
+    "0F380036"),
+    (m32, "00000000    PSHUFB     XMM6, XMMWORD PTR [ESI]",
+    "660F380036"),
+    (m32, "00000000    PSHUFD     XMM6, XMMWORD PTR [ESI], 0xEE",
+    "660F7036EE"),
+
+
+    (m32, "00000000    PSRLQ      MM6, 0x5",
+    "0F73D605"),
+    (m32, "00000000    PSRLQ      XMM6, 0x5",
+    "660F73D605"),
+    (m32, "00000000    PSRLD      MM6, 0x5",
+    "0F72D605"),
+    (m32, "00000000    PSRLD      XMM6, 0x5",
+    "660F72D605"),
+    (m32, "00000000    PSRLW      MM6, 0x5",
+    "0F71D605"),
+    (m32, "00000000    PSRLW      XMM6, 0x5",
+    "660F71D605"),
+
+
+    (m32, "00000000    PSRLQ      MM2, QWORD PTR [EDX]",
+    "0FD312"),
+    (m32, "00000000    PSRLQ      XMM2, XMMWORD PTR [EDX]",
+    "660FD312"),
+
+    (m32, "00000000    PSRLD      MM2, QWORD PTR [EDX]",
+    "0FD212"),
+    (m32, "00000000    PSRLD      XMM2, XMMWORD PTR [EDX]",
+    "660FD212"),
+
+    (m32, "00000000    PSRLW      MM2, QWORD PTR [EDX]",
+    "0FD112"),
+    (m32, "00000000    PSRLW      XMM2, XMMWORD PTR [EDX]",
+    "660FD112"),
+
+
+
+
+    (m32, "00000000    PSLLQ      MM6, 0x5",
+    "0F73F605"),
+    (m32, "00000000    PSLLQ      XMM6, 0x5",
+    "660F73F605"),
+    (m32, "00000000    PSLLD      MM6, 0x5",
+    "0F72F605"),
+    (m32, "00000000    PSLLD      XMM6, 0x5",
+    "660F72F605"),
+    (m32, "00000000    PSLLW      MM6, 0x5",
+    "0F71F605"),
+    (m32, "00000000    PSLLW      XMM6, 0x5",
+    "660F71F605"),
+
+
+    (m32, "00000000    PSLLQ      MM2, QWORD PTR [EDX]",
+    "0FF312"),
+    (m32, "00000000    PSLLQ      XMM2, XMMWORD PTR [EDX]",
+    "660FF312"),
+
+    (m32, "00000000    PSLLD      MM2, QWORD PTR [EDX]",
+    "0FF212"),
+    (m32, "00000000    PSLLD      XMM2, XMMWORD PTR [EDX]",
+    "660FF212"),
+
+    (m32, "00000000    PSLLW      MM2, QWORD PTR [EDX]",
+    "0FF112"),
+    (m32, "00000000    PSLLW      XMM2, XMMWORD PTR [EDX]",
+    "660FF112"),
+
+    (m32, "00000000    PMAXUB     MM2, QWORD PTR [EDX]",
+    "0FDE12"),
+    (m32, "00000000    PMAXUB     XMM2, XMMWORD PTR [EDX]",
+    "660FDE12"),
+
+    (m32, "00000000    PMAXUW     XMM2, XMMWORD PTR [EDX]",
+    "660F383E12"),
+    (m32, "00000000    PMAXUD     XMM2, XMMWORD PTR [EDX]",
+    "660F383F12"),
+
+
+
+    (m32, "00000000    PMINUB     MM2, QWORD PTR [EDX]",
+    "0FDA12"),
+    (m32, "00000000    PMINUB     XMM2, XMMWORD PTR [EDX]",
+    "660FDA12"),
+
+    (m32, "00000000    PMINUW     XMM2, XMMWORD PTR [EDX]",
+    "660F383A12"),
+    (m32, "00000000    PMINUD     XMM2, XMMWORD PTR [EDX]",
+    "660F383B12"),
+
+
+    (m32, "00000000    PCMPEQB    MM2, QWORD PTR [EDX]",
+    "0F7412"),
+    (m32, "00000000    PCMPEQB    XMM2, XMMWORD PTR [EDX]",
+    "660F7412"),
+
+    (m32, "00000000    PCMPEQW    MM2, QWORD PTR [EDX]",
+    "0F7512"),
+    (m32, "00000000    PCMPEQW    XMM2, XMMWORD PTR [EDX]",
+    "660F7512"),
+
+    (m32, "00000000    PCMPEQD    MM2, QWORD PTR [EDX]",
+    "0F7612"),
+    (m32, "00000000    PCMPEQD    XMM2, XMMWORD PTR [EDX]",
+    "660F7612"),
+
+
+    (m32, "00000000    PUNPCKHBW  MM2, QWORD PTR [EDX]",
+    "0F6812"),
+    (m32, "00000000    PUNPCKHBW  XMM2, XMMWORD PTR [EDX]",
+    "660F6812"),
+
+    (m32, "00000000    PUNPCKHWD  MM2, QWORD PTR [EDX]",
+    "0F6912"),
+    (m32, "00000000    PUNPCKHWD  XMM2, XMMWORD PTR [EDX]",
+    "660F6912"),
+
+    (m32, "00000000    PUNPCKHDQ  MM2, QWORD PTR [EDX]",
+    "0F6A12"),
+    (m32, "00000000    PUNPCKHDQ  XMM2, XMMWORD PTR [EDX]",
+    "660F6A12"),
+
+    (m32, "00000000    PUNPCKHQDQ XMM2, XMMWORD PTR [EDX]",
+    "660F6D12"),
+
+
+    (m32, "00000000    PUNPCKLBW  MM2, QWORD PTR [EDX]",
+    "0F6012"),
+    (m32, "00000000    PUNPCKLBW  XMM2, XMMWORD PTR [EDX]",
+    "660F6012"),
+
+    (m32, "00000000    PUNPCKLWD  MM2, QWORD PTR [EDX]",
+    "0F6112"),
+    (m32, "00000000    PUNPCKLWD  XMM2, XMMWORD PTR [EDX]",
+    "660F6112"),
+
+    (m32, "00000000    PUNPCKLDQ  MM2, QWORD PTR [EDX]",
+    "0F6212"),
+    (m32, "00000000    PUNPCKLDQ  XMM2, XMMWORD PTR [EDX]",
+    "660F6212"),
+
+    (m32, "00000000    PUNPCKLQDQ XMM2, XMMWORD PTR [EDX]",
+    "660F6C12"),
+
+
+    (m32, "00000000    PINSRB     XMM2, BYTE PTR [EDX], 0x5",
+    "660F3A201205"),
+
+    (m32, "00000000    PINSRW     MM2, WORD PTR [EDX], 0x5",
+    "0FC41205"),
+    (m32, "00000000    PINSRW     XMM2, WORD PTR [EDX], 0x5",
+    "660FC41205"),
+
+    (m32, "00000000    PINSRD     XMM2, DWORD PTR [EDX], 0x5",
+    "660F3A221205"),
+
+
+    (m64, "00000000    PINSRB     XMM2, BYTE PTR [RDX], 0x5",
+    "660F3A201205"),
+
+    (m64, "00000000    PINSRW     MM2, WORD PTR [RDX], 0x5",
+    "0FC41205"),
+    (m64, "00000000    PINSRW     XMM2, WORD PTR [RDX], 0x5",
+    "660FC41205"),
+
+
+    (m64, "00000000    PINSRB     XMM2, EDX, 0x5",
+    "660F3A20D205"),
+
+    (m64, "00000000    PINSRW     MM2, EDX, 0x5",
+    "0FC4D205"),
+    (m64, "00000000    PINSRW     XMM2, EDX, 0x5",
+    "660FC4D205"),
+
+    (m64, "00000000    PINSRB     XMM2, RDX, 0x5",
+    "66480F3A20D205"),
+
+    (m64, "00000000    PINSRW     MM2, RDX, 0x5",
+    "480FC4D205"),
+    (m64, "00000000    PINSRW     XMM2, RDX, 0x5",
+    "66480FC4D205"),
+
+
+    (m64, "00000000    PINSRD     XMM2, DWORD PTR [RDX], 0x5",
+    "660F3A221205"),
+    (m64, "00000000    PINSRQ     XMM2, QWORD PTR [RDX], 0x5",
+    "66480F3A221205"),
+
+
+
+
+
+    (m32, "00000000    PEXTRB     BYTE PTR [EDX], XMM2, 0x5",
+    "660F3A141205"),
+    (m32, "00000000    PEXTRB     EAX, XMM2, 0x5",
+    "660F3A14D005"),
+
+    (m32, "00000000    PEXTRW     WORD PTR [EDX], XMM2, 0x5",
+    "660F3A151205"),
+
+
+    (m32, "00000000    PEXTRW     WORD PTR [EDX], MM2, 0x5",
+    "0FC51205"),
+    (m32, "00000000    PEXTRW     WORD PTR [EDX], XMM2, 0x5",
+    "660FC51205"),
+
+    (m32, "00000000    PEXTRD     DWORD PTR [EDX], XMM2, 0x5",
+    "660F3A161205"),
+
+    (m64, "00000000    PEXTRD     DWORD PTR [RDX], XMM2, 0x5",
+    "660F3A161205"),
+    (m64, "00000000    PEXTRQ     QWORD PTR [RDX], XMM2, 0x5",
+    "66480F3A161205"),
+
+
+    (m32, "00000000    UNPCKHPS   XMM2, XMMWORD PTR [EDX]",
+     "0f1512"),
+    (m32, "00000000    UNPCKHPD   XMM2, XMMWORD PTR [EDX]",
+     "660f1512"),
+
+    (m32, "00000000    UNPCKLPS   XMM2, XMMWORD PTR [EDX]",
+     "0f1412"),
+    (m32, "00000000    UNPCKLPD   XMM2, XMMWORD PTR [EDX]",
+     "660f1412"),
+
+    (m32, "00000000    SQRTPD     XMM2, XMMWORD PTR [EDX]",
+     "660f5112"),
+    (m32, "00000000    SQRTPS     XMM2, XMMWORD PTR [EDX]",
+     "0f5112"),
+    (m32, "00000000    SQRTSD     XMM2, QWORD PTR [EDX]",
+     "F20f5112"),
+    (m32, "00000000    SQRTSS     XMM2, DWORD PTR [EDX]",
+     "F30f5112"),
+
+    (m32, "00000000    PMOVMSKB   EAX, MM7",
+     "0FD7C7"),
+    (m32, "00000000    PMOVMSKB   EAX, XMM7",
+     "660FD7C7"),
 
 
 ]
diff --git a/test/arch/x86/unit/mn_pcmpeq.py b/test/arch/x86/unit/mn_pcmpeq.py
new file mode 100644
index 00000000..a8774cbc
--- /dev/null
+++ b/test/arch/x86/unit/mn_pcmpeq.py
@@ -0,0 +1,64 @@
+#! /usr/bin/env python
+from asm_test import Asm_Test
+import sys
+
+class Test_PCMPEQB(Asm_Test):
+    TXT = '''
+    main:
+       CALL    next
+       .byte 0x88, 0x78, 0x66, 0x56, 0x44, 0x3F, 0xFF, 0x11
+       .byte 0x89, 0x77, 0x66, 0x55, 0xF9, 0x33, 0x22, 0x11
+    next:
+       POP     EBP
+       MOVQ    MM0, QWORD PTR [EBP]
+       MOVQ    MM1, MM0
+       PCMPEQB MM1, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x11FF3F4456667888
+        assert self.myjit.cpu.MM1 == 0xFF00000000FF0000
+
+
+class Test_PCMPEQW(Asm_Test):
+    TXT = '''
+    main:
+       CALL    next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x3F, 0x22, 0x11
+       .byte 0x89, 0x77, 0x66, 0x55, 0xF9, 0x33, 0x22, 0x11
+    next:
+       POP     EBP
+       MOVQ    MM0, QWORD PTR [EBP]
+       MOVQ    MM1, MM0
+       PCMPEQW MM1, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x11223F4455667788
+        assert self.myjit.cpu.MM1 == 0xFFFF0000FFFF0000
+
+
+
+class Test_PCMPEQD(Asm_Test):
+    TXT = '''
+    main:
+       CALL    next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x3F, 0x22, 0x11
+       .byte 0x88, 0x77, 0x66, 0x55, 0xF9, 0x33, 0x22, 0x11
+    next:
+       POP     EBP
+       MOVQ    MM0, QWORD PTR [EBP]
+       MOVQ    MM1, MM0
+       PCMPEQD MM1, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x11223F4455667788
+        assert self.myjit.cpu.MM1 == 0x00000000FFFFFFFF
+
+
+if __name__ == "__main__":
+    [test()() for test in [Test_PCMPEQB, Test_PCMPEQW, Test_PCMPEQD]]
diff --git a/test/arch/x86/unit/mn_pextr.py b/test/arch/x86/unit/mn_pextr.py
new file mode 100644
index 00000000..eb724cf9
--- /dev/null
+++ b/test/arch/x86/unit/mn_pextr.py
@@ -0,0 +1,25 @@
+#! /usr/bin/env python
+from asm_test import Asm_Test
+import sys
+
+class Test_PEXTRB(Asm_Test):
+    TXT = '''
+    main:
+       CALL      next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11
+       .byte 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01
+    next:
+       POP       EBP
+       MOV       EAX, 0xFFFFFFFF
+       MOVQ      MM0, QWORD PTR [EBP]
+       PEXTRW    EAX, MM0, 2
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1122334455667788
+        assert self.myjit.cpu.EAX == 0x3344
+
+
+if __name__ == "__main__":
+    [test()() for test in [Test_PEXTRB]]
diff --git a/test/arch/x86/unit/mn_pinsr.py b/test/arch/x86/unit/mn_pinsr.py
new file mode 100644
index 00000000..b7a86d2d
--- /dev/null
+++ b/test/arch/x86/unit/mn_pinsr.py
@@ -0,0 +1,25 @@
+#! /usr/bin/env python
+from asm_test import Asm_Test
+import sys
+
+class Test_PINSRB(Asm_Test):
+    TXT = '''
+    main:
+       CALL      next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11
+       .byte 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01
+    next:
+       POP       EBP
+       MOVQ      MM0, QWORD PTR [EBP]
+       MOVQ      MM1, MM0
+       PINSRW    MM1, QWORD PTR [EBP+0x8], 2
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1122334455667788
+        assert self.myjit.cpu.MM1 == 0x1122070855667788
+
+
+if __name__ == "__main__":
+    [test()() for test in [Test_PINSRB]]
diff --git a/test/arch/x86/unit/mn_pmaxu.py b/test/arch/x86/unit/mn_pmaxu.py
new file mode 100644
index 00000000..08e54c03
--- /dev/null
+++ b/test/arch/x86/unit/mn_pmaxu.py
@@ -0,0 +1,25 @@
+#! /usr/bin/env python
+from asm_test import Asm_Test
+import sys
+
+class Test_PMAXU(Asm_Test):
+    TXT = '''
+    main:
+       CALL   next
+       .byte 0x88, 0x76, 0x66, 0x54, 0x44, 0x32, 0x00, 0x10
+       .byte 0x87, 0x77, 0x66, 0x55, 0x40, 0x33, 0x22, 0x11
+    next:
+       POP    EBP
+       MOVQ   MM0, QWORD PTR [EBP]
+       MOVQ   MM1, MM0
+       PMAXUB MM1, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1000324454667688
+        assert self.myjit.cpu.MM1 == 0x1122334455667788
+
+
+if __name__ == "__main__":
+    [test()() for test in [Test_PMAXU]]
diff --git a/test/arch/x86/unit/mn_pminu.py b/test/arch/x86/unit/mn_pminu.py
new file mode 100644
index 00000000..38a29787
--- /dev/null
+++ b/test/arch/x86/unit/mn_pminu.py
@@ -0,0 +1,25 @@
+#! /usr/bin/env python
+from asm_test import Asm_Test
+import sys
+
+class Test_PMINU(Asm_Test):
+    TXT = '''
+    main:
+       CALL   next
+       .byte 0x88, 0x78, 0x66, 0x56, 0x44, 0x3F, 0xFF, 0x1F
+       .byte 0x89, 0x77, 0x66, 0x55, 0xF9, 0x33, 0x22, 0x11
+    next:
+       POP    EBP
+       MOVQ   MM0, QWORD PTR [EBP]
+       MOVQ   MM1, MM0
+       PMINUB MM1, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1FFF3F4456667888
+        assert self.myjit.cpu.MM1 == 0x1122334455667788
+
+
+if __name__ == "__main__":
+    [test()() for test in [Test_PMINU]]
diff --git a/test/arch/x86/unit/mn_pmovmskb.py b/test/arch/x86/unit/mn_pmovmskb.py
new file mode 100644
index 00000000..97435794
--- /dev/null
+++ b/test/arch/x86/unit/mn_pmovmskb.py
@@ -0,0 +1,26 @@
+#! /usr/bin/env python
+from asm_test import Asm_Test
+import sys
+
+class Test_PMOVMSKB(Asm_Test):
+    TXT = '''
+    main:
+       CALL      next
+       .byte 0x88, 0x77, 0xE6, 0x55, 0xC4, 0x33, 0x22, 0x11
+       .byte 0x01, 0x02, 0xFF, 0xEE, 0xDD, 0xCC, 0xBB, 0xAA
+    next:
+       POP       EBP
+       MOV       EAX, 0xFFFFFFFF
+       MOVQ      MM0, QWORD PTR [EBP]
+       MOVQ      MM1, MM0
+       PMOVMSKB  EAX, MM1
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x112233C455E67788
+        assert self.myjit.cpu.MM1 == 0x112233C455E67788
+        assert self.myjit.cpu.EAX == 0x00000015
+
+if __name__ == "__main__":
+    [test()() for test in [Test_PMOVMSKB,]]
diff --git a/test/arch/x86/unit/mn_pshufb.py b/test/arch/x86/unit/mn_pshufb.py
new file mode 100644
index 00000000..187b2f72
--- /dev/null
+++ b/test/arch/x86/unit/mn_pshufb.py
@@ -0,0 +1,25 @@
+#! /usr/bin/env python
+from asm_test import Asm_Test
+import sys
+
+class Test_PSHUFB(Asm_Test):
+    TXT = '''
+    main:
+       CALL   next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11
+       .byte 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0
+    next:
+       POP    EBP
+       MOVQ   MM0, QWORD PTR [EBP]
+       MOVQ   MM1, MM0
+       PSHUFB MM1, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1122334455667788L
+        assert self.myjit.cpu.MM1 == 0x8877665544332211L
+
+
+if __name__ == "__main__":
+    [test()() for test in [Test_PSHUFB]]
diff --git a/test/arch/x86/unit/mn_psrl_psll.py b/test/arch/x86/unit/mn_psrl_psll.py
new file mode 100644
index 00000000..93a356f7
--- /dev/null
+++ b/test/arch/x86/unit/mn_psrl_psll.py
@@ -0,0 +1,55 @@
+#! /usr/bin/env python
+from asm_test import Asm_Test
+import sys
+
+class Test_PSRL(Asm_Test):
+    TXT = '''
+    main:
+       CALL   next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11
+       .byte 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0
+    next:
+       POP    EBP
+       MOVQ   MM0, QWORD PTR [EBP]
+       MOVQ   MM1, MM0
+       MOVQ   MM2, MM0
+       MOVQ   MM3, MM0
+       PSRLW  MM1, QWORD PTR [EBP+0x8]
+       PSRLD  MM2, QWORD PTR [EBP+0x8]
+       PSRLQ  MM3, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1122334455667788L
+        assert self.myjit.cpu.MM1 == 0x0112033405560778L
+        assert self.myjit.cpu.MM2 == 0x0112233405566778L
+        assert self.myjit.cpu.MM3 == 0x0112233445566778L
+
+class Test_PSLL(Asm_Test):
+    TXT = '''
+    main:
+       CALL   next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11
+       .byte 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0
+    next:
+       POP    EBP
+       MOVQ   MM0, QWORD PTR [EBP]
+       MOVQ   MM1, MM0
+       MOVQ   MM2, MM0
+       MOVQ   MM3, MM0
+       PSLLW  MM1, QWORD PTR [EBP+0x8]
+       PSLLD  MM2, QWORD PTR [EBP+0x8]
+       PSLLQ  MM3, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1122334455667788L
+        assert self.myjit.cpu.MM1 == 0x1220344056607880L
+        assert self.myjit.cpu.MM2 == 0x1223344056677880L
+        assert self.myjit.cpu.MM3 == 0x1223344556677880L
+
+
+if __name__ == "__main__":
+    [test()() for test in [Test_PSRL, Test_PSLL]]
diff --git a/test/arch/x86/unit/mn_punpck.py b/test/arch/x86/unit/mn_punpck.py
new file mode 100644
index 00000000..84d86c32
--- /dev/null
+++ b/test/arch/x86/unit/mn_punpck.py
@@ -0,0 +1,124 @@
+#! /usr/bin/env python
+from asm_test import Asm_Test
+import sys
+
+class Test_PUNPCKHBW(Asm_Test):
+    TXT = '''
+    main:
+       CALL      next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11
+       .byte 0x01, 0x02, 0xFF, 0xEE, 0xDD, 0xCC, 0xBB, 0xAA
+    next:
+       POP       EBP
+       MOVQ      MM0, QWORD PTR [EBP]
+       MOVQ      MM1, MM0
+       PUNPCKHBW MM1, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1122334455667788
+        assert self.myjit.cpu.MM1 == 0xAA11BB22CC33DD44
+
+
+class Test_PUNPCKHWD(Asm_Test):
+    TXT = '''
+    main:
+       CALL      next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11
+       .byte 0x01, 0x02, 0xFF, 0xEE, 0xDD, 0xCC, 0xBB, 0xAA
+    next:
+       POP       EBP
+       MOVQ      MM0, QWORD PTR [EBP]
+       MOVQ      MM1, MM0
+       PUNPCKHWD MM1, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1122334455667788
+        assert self.myjit.cpu.MM1 == 0xAABB1122CCDD3344
+
+
+
+class Test_PUNPCKHDQ(Asm_Test):
+    TXT = '''
+    main:
+       CALL      next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11
+       .byte 0x01, 0x02, 0xFF, 0xEE, 0xDD, 0xCC, 0xBB, 0xAA
+    next:
+       POP       EBP
+       MOVQ      MM0, QWORD PTR [EBP]
+       MOVQ      MM1, MM0
+       PUNPCKHDQ MM1, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1122334455667788
+        assert self.myjit.cpu.MM1 == 0xAABBCCDD11223344
+
+
+
+
+class Test_PUNPCKLBW(Asm_Test):
+    TXT = '''
+    main:
+       CALL      next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11
+       .byte 0x01, 0x02, 0xFF, 0xEE, 0xDD, 0xCC, 0xBB, 0xAA
+    next:
+       POP       EBP
+       MOVQ      MM0, QWORD PTR [EBP]
+       MOVQ      MM1, MM0
+       PUNPCKLBW MM1, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1122334455667788
+        assert self.myjit.cpu.MM1 == 0xEE55FF6602770188
+
+
+class Test_PUNPCKLWD(Asm_Test):
+    TXT = '''
+    main:
+       CALL      next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11
+       .byte 0x01, 0x02, 0xFF, 0xEE, 0xDD, 0xCC, 0xBB, 0xAA
+    next:
+       POP       EBP
+       MOVQ      MM0, QWORD PTR [EBP]
+       MOVQ      MM1, MM0
+       PUNPCKLWD MM1, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1122334455667788
+        assert self.myjit.cpu.MM1 == 0xEEFF556602017788
+
+
+
+class Test_PUNPCKLDQ(Asm_Test):
+    TXT = '''
+    main:
+       CALL      next
+       .byte 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11
+       .byte 0x01, 0x02, 0xFF, 0xEE, 0xDD, 0xCC, 0xBB, 0xAA
+    next:
+       POP       EBP
+       MOVQ      MM0, QWORD PTR [EBP]
+       MOVQ      MM1, MM0
+       PUNPCKLDQ MM1, QWORD PTR [EBP+0x8]
+       RET
+    '''
+
+    def check(self):
+        assert self.myjit.cpu.MM0 == 0x1122334455667788
+        assert self.myjit.cpu.MM1 == 0xEEFF020155667788
+
+if __name__ == "__main__":
+    [test()() for test in [Test_PUNPCKHBW, Test_PUNPCKHWD, Test_PUNPCKHDQ,
+                           Test_PUNPCKLBW, Test_PUNPCKLWD, Test_PUNPCKLDQ,]]
diff --git a/test/test_all.py b/test/test_all.py
index 71ea51a5..e52123ea 100644
--- a/test/test_all.py
+++ b/test/test_all.py
@@ -40,6 +40,15 @@ for script in ["x86/sem.py",
                "x86/unit/mn_daa.py",
                "x86/unit/mn_das.py",
                "x86/unit/mn_int.py",
+               "x86/unit/mn_pshufb.py",
+               "x86/unit/mn_psrl_psll.py",
+               "x86/unit/mn_pmaxu.py",
+               "x86/unit/mn_pminu.py",
+               "x86/unit/mn_pcmpeq.py",
+               "x86/unit/mn_punpck.py",
+               "x86/unit/mn_pinsr.py",
+               "x86/unit/mn_pextr.py",
+               "x86/unit/mn_pmovmskb.py",
                "arm/arch.py",
                "arm/sem.py",
                "aarch64/unit/mn_ubfm.py",