about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--miasm2/arch/x86/arch.py86
-rw-r--r--miasm2/arch/x86/sem.py580
-rw-r--r--miasm2/expression/expression.py103
-rw-r--r--miasm2/ir/translators/C.py72
-rw-r--r--miasm2/jitter/arch/JitCore_x86.h16
-rw-r--r--miasm2/jitter/llvmconvert.py161
-rw-r--r--miasm2/jitter/op_semantics.c227
-rw-r--r--miasm2/jitter/op_semantics.h38
-rw-r--r--test/arch/x86/arch.py40
-rwxr-xr-xtest/test_all.py4
10 files changed, 931 insertions, 396 deletions
diff --git a/miasm2/arch/x86/arch.py b/miasm2/arch/x86/arch.py
index 815eaee6..bf872667 100644
--- a/miasm2/arch/x86/arch.py
+++ b/miasm2/arch/x86/arch.py
@@ -3771,7 +3771,8 @@ addop("movq", [bs8(0x0f), bs8(0xd6), pref_66] +
 
 addop("movmskps", [bs8(0x0f), bs8(0x50), no_xmm_pref] +
       rmmod(reg_modrm, rm_arg_xmm_reg))
-
+addop("movmskpd", [bs8(0x0f), bs8(0x50), pref_66] +
+      rmmod(reg_modrm, rm_arg_xmm_reg))
 
 addop("addss", [bs8(0x0f), bs8(0x58), pref_f3] + rmmod(xmm_reg, rm_arg_xmm_m32))
 addop("addsd", [bs8(0x0f), bs8(0x58), pref_f2] + rmmod(xmm_reg, rm_arg_xmm_m64))
@@ -3792,10 +3793,6 @@ addop("pminsw", [bs8(0x0f), bs8(0xea), pref_66] + rmmod(xmm_reg, rm_arg_xmm))
 addop("ucomiss", [bs8(0x0f), bs8(0x2e), no_xmm_pref] + rmmod(xmm_reg, rm_arg_xmm_m32))
 addop("ucomisd", [bs8(0x0f), bs8(0x2e), pref_66] + rmmod(xmm_reg, rm_arg_xmm_m64))
 
-addop("maxsd", [bs8(0x0f), bs8(0x5f), pref_f2] + rmmod(xmm_reg, rm_arg_xmm_m64))
-addop("maxss", [bs8(0x0f), bs8(0x5f), pref_f3] + rmmod(xmm_reg, rm_arg_xmm_m32))
-
-
 
 addop("movzx", [bs8(0x0f), bs("1011011"), w8, sx] + rmmod(rmreg, rm_arg_sx))
 addop("mul", [bs('1111011'), w8] + rmmod(d4, rm_arg_w8))
@@ -4021,9 +4018,9 @@ addop("xgetbv", [bs8(0x0f), bs8(0x01), bs8(0xd0)])
 addop("movapd", [bs8(0x0f), bs("0010100"), swapargs]
       + rmmod(xmm_reg, rm_arg_xmm) + [bs_opmode16], [xmm_reg, rm_arg_xmm])
 addop("movaps", [bs8(0x0f), bs("0010100"), swapargs]
-      + rmmod(xmm_reg, rm_arg_xmm) + [bs_opmode32], [xmm_reg, rm_arg_xmm])
+      + rmmod(xmm_reg, rm_arg_xmm_m128) + [bs_opmode32], [xmm_reg, rm_arg_xmm_m128])
 addop("movaps", [bs8(0x0f), bs("0010100"), swapargs]
-      + rmmod(xmm_reg, rm_arg_xmm) + [bs_opmode64], [xmm_reg, rm_arg_xmm])
+      + rmmod(xmm_reg, rm_arg_xmm_m128) + [bs_opmode64], [xmm_reg, rm_arg_xmm_m128])
 addop("movdqu", [bs8(0x0f), bs("011"), swapargs, bs("1111"), pref_f3]
       + rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
 addop("movdqa", [bs8(0x0f), bs("011"), swapargs, bs("1111"), pref_66]
@@ -4045,7 +4042,8 @@ addop("movlhps", [bs8(0x0f), bs8(0x16), no_xmm_pref] +
 
 addop("movdq2q", [bs8(0x0f), bs8(0xd6), pref_f2] +
       rmmod(mm_reg, rm_arg_xmm_reg), [mm_reg, rm_arg_xmm_reg])
-
+addop("movq2dq", [bs8(0x0f), bs8(0xd6), pref_f3] +
+      rmmod(xmm_reg, rm_arg_mm))
 
 ## Additions
 # SSE
@@ -4144,13 +4142,54 @@ addop("pxor", [bs8(0x0f), bs8(0xef), no_xmm_pref] +
 addop("pxor", [bs8(0x0f), bs8(0xef), pref_66] +
       rmmod(xmm_reg, rm_arg_xmm))
 
+### Comparisons (floating-point)
+###
+addop("minps", [bs8(0x0f), bs8(0x5d), no_xmm_pref] + rmmod(xmm_reg,
+                                                           rm_arg_xmm_m128))
+addop("minss", [bs8(0x0f), bs8(0x5d), pref_f3] + rmmod(xmm_reg,
+                                                       rm_arg_xmm_m32))
+addop("minpd", [bs8(0x0f), bs8(0x5d), pref_66] + rmmod(xmm_reg,
+                                                       rm_arg_xmm_m128))
+addop("minsd", [bs8(0x0f), bs8(0x5d), pref_f2] + rmmod(xmm_reg,
+                                                       rm_arg_xmm_m64))
+addop("maxps", [bs8(0x0f), bs8(0x5f), no_xmm_pref] + rmmod(xmm_reg,
+                                                           rm_arg_xmm_m128))
+addop("maxpd", [bs8(0x0f), bs8(0x5f), pref_66] + rmmod(xmm_reg,
+                                                       rm_arg_xmm_m128))
+addop("maxsd", [bs8(0x0f), bs8(0x5f), pref_f2] + rmmod(xmm_reg, rm_arg_xmm_m64))
+addop("maxss", [bs8(0x0f), bs8(0x5f), pref_f3] + rmmod(xmm_reg, rm_arg_xmm_m32))
+
+for cond_name, value in [
+        ("eq", 0x00),
+        ("lt", 0x01),
+        ("le", 0x02),
+        ("unord", 0x03),
+        ("neq", 0x04),
+        ("nlt", 0x05),
+        ("nle", 0x06),
+        ("ord", 0x07),
+]:
+    addop("cmp%sps" % cond_name, [bs8(0x0f), bs8(0xc2), no_xmm_pref] +
+          rmmod(xmm_reg, rm_arg_xmm_m128) + [bs8(value)])
+    addop("cmp%spd" % cond_name, [bs8(0x0f), bs8(0xc2), pref_66] +
+          rmmod(xmm_reg, rm_arg_xmm_m128) + [bs8(value)])
+    addop("cmp%sss" % cond_name, [bs8(0x0f), bs8(0xc2), pref_f3] +
+          rmmod(xmm_reg, rm_arg_xmm_m32) + [bs8(value)])
+    addop("cmp%ssd" % cond_name, [bs8(0x0f), bs8(0xc2), pref_f2] +
+          rmmod(xmm_reg, rm_arg_xmm_m64) + [bs8(value)])
+
+
+
 addop("pshufb", [bs8(0x0f), bs8(0x38), bs8(0x00), no_xmm_pref] +
-      rmmod(mm_reg, rm_arg_mm))
+      rmmod(mm_reg, rm_arg_mm_m64))
 addop("pshufb", [bs8(0x0f), bs8(0x38), bs8(0x00), pref_66] +
-      rmmod(xmm_reg, rm_arg_xmm))
+      rmmod(xmm_reg, rm_arg_xmm_m128))
 addop("pshufd", [bs8(0x0f), bs8(0x70), pref_66] +
-      rmmod(xmm_reg, rm_arg_xmm) + [u08])
-
+      rmmod(xmm_reg, rm_arg_xmm_m128) + [u08])
+addop("pshuflw", [bs8(0x0f), bs8(0x70), pref_f2] +
+      rmmod(xmm_reg, rm_arg_xmm_m128) + [u08])
+addop("pshufhw", [bs8(0x0f), bs8(0x70), pref_f3] +
+      rmmod(xmm_reg, rm_arg_xmm_m128) + [u08])
 
 
 ### Convert
@@ -4241,10 +4280,29 @@ addop("psrlw", [bs8(0x0f), bs8(0x71), pref_66] +
       rmmod(d2, rm_arg_xmm) + [u08], [rm_arg_xmm, u08])
 
 addop("psrlw", [bs8(0x0f), bs8(0xd1), no_xmm_pref] +
-      rmmod(mm_reg, rm_arg_mm), [mm_reg, rm_arg_mm])
+      rmmod(mm_reg, rm_arg_mm_m64), [mm_reg, rm_arg_mm_m64])
 addop("psrlw", [bs8(0x0f), bs8(0xd1), pref_66] +
-      rmmod(xmm_reg, rm_arg_xmm), [xmm_reg, rm_arg_xmm])
+      rmmod(xmm_reg, rm_arg_xmm_m128), [xmm_reg, rm_arg_xmm_m128])
+
+addop("psraw", [bs8(0x0f), bs8(0xe1), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64), [mm_reg, rm_arg_mm_m64])
+addop("psraw", [bs8(0x0f), bs8(0xe1), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128), [xmm_reg, rm_arg_xmm_m128])
+
+addop("psraw", [bs8(0x0f), bs8(0x71), no_xmm_pref] +
+      rmmod(d4, rm_arg_mm_m64) + [u08], [rm_arg_mm_m64, u08])
+addop("psraw", [bs8(0x0f), bs8(0x71), pref_66] +
+      rmmod(d4, rm_arg_xmm_m128) + [u08], [rm_arg_xmm_m128, u08])
+
+addop("psrad", [bs8(0x0f), bs8(0xe2), no_xmm_pref] +
+      rmmod(mm_reg, rm_arg_mm_m64), [mm_reg, rm_arg_mm_m64])
+addop("psrad", [bs8(0x0f), bs8(0xe2), pref_66] +
+      rmmod(xmm_reg, rm_arg_xmm_m128), [xmm_reg, rm_arg_xmm_m128])
 
+addop("psrad", [bs8(0x0f), bs8(0x72), no_xmm_pref] +
+      rmmod(d4, rm_arg_mm_m64) + [u08], [rm_arg_mm_m64, u08])
+addop("psrad", [bs8(0x0f), bs8(0x72), pref_66] +
+      rmmod(d4, rm_arg_xmm_m128) + [u08], [rm_arg_xmm_m128, u08])
 
 
 addop("psllq", [bs8(0x0f), bs8(0x73), no_xmm_pref] +
diff --git a/miasm2/arch/x86/sem.py b/miasm2/arch/x86/sem.py
index f3ca3a62..ef939144 100644
--- a/miasm2/arch/x86/sem.py
+++ b/miasm2/arch/x86/sem.py
@@ -1883,7 +1883,7 @@ def float_pop(avoid_flt=None, popcount=1):
         if avoid_flt != float_list[i]:
             e.append(m2_expr.ExprAff(float_list[i],
                                      float_list[i + popcount]))
-    fill_value = m2_expr.ExprOp("int_64_to_double",
+    fill_value = m2_expr.ExprOp("sint_to_fp64",
                                 m2_expr.ExprInt(0, float_list[i].size))
     for i in xrange(8 - popcount, 8):
         e.append(m2_expr.ExprAff(float_list[i],
@@ -1919,7 +1919,7 @@ def ftst(_, instr):
     dst = float_st0
 
     e = []
-    src = m2_expr.ExprOp('int_32_to_double', m2_expr.ExprInt(0, 32))
+    src = m2_expr.ExprOp('sint_to_fp64', m2_expr.ExprInt(0, 32))
     e.append(m2_expr.ExprAff(float_c0, m2_expr.ExprOp('fcom_c0', dst, src)))
     e.append(m2_expr.ExprAff(float_c1, m2_expr.ExprOp('fcom_c1', dst, src)))
     e.append(m2_expr.ExprAff(float_c2, m2_expr.ExprOp('fcom_c2', dst, src)))
@@ -2045,8 +2045,8 @@ def comiss(_, instr, dst, src):
 
     e = []
 
-    dst = m2_expr.ExprOp('int_32_to_float', dst[:32])
-    src = m2_expr.ExprOp('int_32_to_float', src[:32])
+    dst = m2_expr.ExprOp('sint_to_fp32', dst[:32])
+    src = m2_expr.ExprOp('sint_to_fp32', src[:32])
 
     e.append(m2_expr.ExprAff(cf, m2_expr.ExprOp('fcom_c0', dst, src)))
     e.append(m2_expr.ExprAff(pf, m2_expr.ExprOp('fcom_c2', dst, src)))
@@ -2065,8 +2065,8 @@ def comisd(_, instr, dst, src):
 
     e = []
 
-    dst = m2_expr.ExprOp('int_64_to_double', dst[:64])
-    src = m2_expr.ExprOp('int_64_to_double', src[:64])
+    dst = m2_expr.ExprOp('sint_to_fp64', dst[:64])
+    src = m2_expr.ExprOp('sint_to_fp64', src[:64])
 
     e.append(m2_expr.ExprAff(cf, m2_expr.ExprOp('fcom_c0', dst, src)))
     e.append(m2_expr.ExprAff(pf, m2_expr.ExprOp('fcom_c2', dst, src)))
@@ -2081,7 +2081,9 @@ def comisd(_, instr, dst, src):
 
 
 def fld(_, instr, src):
-    src = mem2double(instr, src)
+
+    if src.size == 32:
+        src = m2_expr.ExprOp("fpconvert_fp64", src)
 
     e = []
     e.append(m2_expr.ExprAff(float_st7, float_st6))
@@ -2103,13 +2105,12 @@ def fld(_, instr, src):
 def fst(_, instr, dst):
     e = []
 
-    if isinstance(dst, m2_expr.ExprMem):
-        if dst.size > 64:
-            raise NotImplementedError('float to long')
-        src = m2_expr.ExprOp('double_to_mem_%.2d' % dst.size, float_st0)
-    else:
-        src = float_st0
+    if isinstance(dst, m2_expr.ExprMem) and dst.size > 64:
+        raise NotImplementedError('convert to 80bits')
+    src = float_st0
 
+    if dst.size == 32:
+        src = m2_expr.ExprOp("fpconvert_fp32", src)
     e.append(m2_expr.ExprAff(dst, src))
     e += set_float_cs_eip(instr)
     return e, []
@@ -2118,12 +2119,13 @@ def fst(_, instr, dst):
 def fstp(ir, instr, dst):
     e = []
 
-    if isinstance(dst, m2_expr.ExprMem):
-        if dst.size > 64:
-            # TODO: move to 80 bits
-            dst = ir.ExprMem(dst.arg, size=64)
+    if isinstance(dst, m2_expr.ExprMem) and dst.size > 64:
+        raise NotImplementedError('convert to 80bits')
 
-        src = m2_expr.ExprOp('double_to_mem_%.2d' % dst.size, float_st0)
+    if isinstance(dst, m2_expr.ExprMem):
+        src = float_st0
+        if dst.size == 32:
+            src = m2_expr.ExprOp("fpconvert_fp32", src)
         e.append(m2_expr.ExprAff(dst, src))
     else:
         src = float_st0
@@ -2139,7 +2141,7 @@ def fstp(ir, instr, dst):
 
 def fist(_, instr, dst):
     e = []
-    e.append(m2_expr.ExprAff(dst, m2_expr.ExprOp('double_to_int_%d' % dst.size,
+    e.append(m2_expr.ExprAff(dst, m2_expr.ExprOp('fp_to_sint%d' % dst.size,
                                                  float_st0)))
 
     e += set_float_cs_eip(instr)
@@ -2154,9 +2156,11 @@ def fistp(ir, instr, dst):
 
 def fisttp(_, instr, dst):
     e = []
-    e.append(m2_expr.ExprAff(dst,
-                             m2_expr.ExprOp('double_trunc_to_int_%d' % dst.size,
-                                            float_st0)))
+    e.append(m2_expr.ExprAff(
+        dst,
+        m2_expr.ExprOp('fp_to_sint%d' % dst.size,
+                       m2_expr.ExprOp('fpround_towardszero', float_st0)
+        )))
 
     e += set_float_cs_eip(instr)
     e += float_pop(dst)
@@ -2165,7 +2169,7 @@ def fisttp(_, instr, dst):
 
 def fild(ir, instr, src):
     # XXXXX
-    src = m2_expr.ExprOp('int_%.2d_to_double' % src.size, src)
+    src = m2_expr.ExprOp('sint_to_fp64', src)
     e = []
     e += set_float_cs_eip(instr)
     e_fld, extra = fld(ir, instr, src)
@@ -2174,26 +2178,26 @@ def fild(ir, instr, src):
 
 
 def fldz(ir, instr):
-    return fld(ir, instr, m2_expr.ExprOp('int_32_to_double',
+    return fld(ir, instr, m2_expr.ExprOp('sint_to_fp64',
                                          m2_expr.ExprInt(0, 32)))
 
 
 def fld1(ir, instr):
-    return fld(ir, instr, m2_expr.ExprOp('int_32_to_double',
+    return fld(ir, instr, m2_expr.ExprOp('sint_to_fp64',
                                          m2_expr.ExprInt(1, 32)))
 
 
 def fldl2t(ir, instr):
     value_f = math.log(10) / math.log(2)
     value = struct.unpack('I', struct.pack('f', value_f))[0]
-    return fld(ir, instr, m2_expr.ExprOp('int_32_to_double',
+    return fld(ir, instr, m2_expr.ExprOp('sint_to_fp64',
                                          m2_expr.ExprInt(value, 32)))
 
 
 def fldpi(ir, instr):
     value_f = math.pi
     value = struct.unpack('I', struct.pack('f', value_f))[0]
-    return fld(ir, instr, m2_expr.ExprOp('int_32_to_double',
+    return fld(ir, instr, m2_expr.ExprOp('sint_to_fp64',
                                          m2_expr.ExprInt(value, 32)))
 
 
@@ -2534,7 +2538,7 @@ def fptan(_, instr):
     e.append(m2_expr.ExprAff(float_st2, float_st1))
     e.append(m2_expr.ExprAff(float_st1, m2_expr.ExprOp('ftan', float_st0)))
     e.append(m2_expr.ExprAff(float_st0,
-                             m2_expr.ExprOp('int_32_to_double',
+                             m2_expr.ExprOp('sint_to_fp64',
                                             m2_expr.ExprInt(1, 32))))
     e.append(
         m2_expr.ExprAff(float_stack_ptr,
@@ -3371,8 +3375,7 @@ def wrmsr(ir, instr):
 # MMX/SSE/AVX operations
 #
 
-
-def vec_op_clip(op, size):
+def vec_op_clip(op, size, callback=None):
     """
     Generate simd operations
     @op: the operator
@@ -3380,9 +3383,12 @@ def vec_op_clip(op, size):
     """
     def vec_op_clip_instr(ir, instr, dst, src):
         if op == '-':
-            return [m2_expr.ExprAff(dst[:size], dst[:size] - src[:size])], []
+            result = dst[:size] - src[:size]
         else:
-            return [m2_expr.ExprAff(dst[:size], m2_expr.ExprOp(op, dst[:size], src[:size]))], []
+            result = m2_expr.ExprOp(op, dst[:size], src[:size])
+        if callback is not None:
+            result = callback(result)
+        return [m2_expr.ExprAff(dst[:size], result)], []
     return vec_op_clip_instr
 
 # Generic vertical operation
@@ -3407,38 +3413,6 @@ def vec_vertical_sem(op, elt_size, reg_size, dst, src, apply_on_output):
     return m2_expr.ExprCompose(*ops)
 
 
-def float_vec_vertical_sem(op, elt_size, reg_size, dst, src, apply_on_output):
-    assert reg_size % elt_size == 0
-    n = reg_size / elt_size
-
-    x_to_int, int_to_x = {32: ('float_to_int_%d', 'int_%d_to_float'),
-                          64: ('double_to_int_%d', 'int_%d_to_double')}[elt_size]
-    if op == '-':
-        ops = [
-            apply_on_output(m2_expr.ExprOp(
-                x_to_int % elt_size,
-                m2_expr.ExprOp(int_to_x % elt_size, dst[i * elt_size:(i + 1) * elt_size]) -
-                m2_expr.ExprOp(
-                    int_to_x % elt_size, src[i * elt_size:(
-                        i + 1) * elt_size])))
-            for i in xrange(0, n)
-        ]
-    else:
-        ops = [
-            apply_on_output(m2_expr.ExprOp(
-                x_to_int % elt_size,
-                m2_expr.ExprOp(op,
-                               m2_expr.ExprOp(
-                                   int_to_x % elt_size, dst[i * elt_size:(
-                                       i + 1) * elt_size]),
-                               m2_expr.ExprOp(
-                                   int_to_x % elt_size, src[i * elt_size:(
-                                       i + 1) * elt_size]))))
-            for i in xrange(0, n)]
-
-    return m2_expr.ExprCompose(*ops)
-
-
 def __vec_vertical_instr_gen(op, elt_size, sem, apply_on_output):
     def vec_instr(ir, instr, dst, src):
         e = []
@@ -3456,11 +3430,6 @@ def vec_vertical_instr(op, elt_size, apply_on_output=lambda x: x):
                                     apply_on_output)
 
 
-def float_vec_vertical_instr(op, elt_size, apply_on_output=lambda x: x):
-    return __vec_vertical_instr_gen(op, elt_size, float_vec_vertical_sem,
-                                    apply_on_output)
-
-
 def _keep_mul_high(expr, signed=False):
     assert expr.is_op("*") and len(expr.args) == 2
 
@@ -3487,6 +3456,32 @@ def _min_max(expr, signed):
         expr.args[0],
     )
 
+def _float_min_max(expr):
+    assert (expr.is_op("fmin") or expr.is_op("fmax")) and len(expr.args) == 2
+    src1 = expr.args[0]
+    src2 = expr.args[1]
+    if expr.is_op("fmin"):
+        comp = m2_expr.expr_is_float_lower(src1, src2)
+    elif expr.is_op("fmax"):
+        comp = m2_expr.expr_is_float_lower(src2, src1)
+
+    # x86 documentation (for MIN):
+    # IF ((SRC1 = 0.0) and (SRC2 = 0.0)) THEN DEST <-SRC2;
+    # ELSE IF (SRC1 = SNaN) THEN DEST <-SRC2; FI;
+    # ELSE IF (SRC2 = SNaN) THEN DEST <-SRC2; FI;
+    # ELSE IF (SRC1 < SRC2) THEN DEST <-SRC1;
+    # ELSE DEST<-SRC2;
+    #
+    # But this includes the NaN output of "SRC1 < SRC2"
+    # Associated text is more detailed, and this is the version impl here
+    return m2_expr.ExprCond(
+        m2_expr.expr_is_sNaN(src2), src2,
+        m2_expr.ExprCond(
+            m2_expr.expr_is_NaN(src2) | m2_expr.expr_is_NaN(src1), src2,
+            m2_expr.ExprCond(comp, src1, src2)
+        )
+    )
+
 
 # Integer arithmetic
 #
@@ -3616,22 +3611,100 @@ pmaxsw = vec_vertical_instr('max', 16, lambda x: _min_max(x, signed=True))
 #
 
 # SSE
-addss = vec_op_clip('+', 32)
-addsd = vec_op_clip('+', 64)
-addps = float_vec_vertical_instr('+', 32)
-addpd = float_vec_vertical_instr('+', 64)
-subss = vec_op_clip('-', 32)
-subsd = vec_op_clip('-', 64)
-subps = float_vec_vertical_instr('-', 32)
-subpd = float_vec_vertical_instr('-', 64)
-mulss = vec_op_clip('*', 32)
-mulsd = vec_op_clip('*', 64)
-mulps = float_vec_vertical_instr('*', 32)
-mulpd = float_vec_vertical_instr('*', 64)
-divss = vec_op_clip('/', 32)
-divsd = vec_op_clip('/', 64)
-divps = float_vec_vertical_instr('/', 32)
-divpd = float_vec_vertical_instr('/', 64)
+addss = vec_op_clip('fadd', 32)
+addsd = vec_op_clip('fadd', 64)
+addps = vec_vertical_instr('fadd', 32)
+addpd = vec_vertical_instr('fadd', 64)
+subss = vec_op_clip('fsub', 32)
+subsd = vec_op_clip('fsub', 64)
+subps = vec_vertical_instr('fsub', 32)
+subpd = vec_vertical_instr('fsub', 64)
+mulss = vec_op_clip('fmul', 32)
+mulsd = vec_op_clip('fmul', 64)
+mulps = vec_vertical_instr('fmul', 32)
+mulpd = vec_vertical_instr('fmul', 64)
+divss = vec_op_clip('fdiv', 32)
+divsd = vec_op_clip('fdiv', 64)
+divps = vec_vertical_instr('fdiv', 32)
+divpd = vec_vertical_instr('fdiv', 64)
+
+# Comparisons (floating-point)
+
+minps = vec_vertical_instr('fmin', 32, _float_min_max)
+minpd = vec_vertical_instr('fmin', 64, _float_min_max)
+minss = vec_op_clip('fmin', 32, _float_min_max)
+minsd = vec_op_clip('fmin', 64, _float_min_max)
+maxps = vec_vertical_instr('fmax', 32, _float_min_max)
+maxpd = vec_vertical_instr('fmax', 64, _float_min_max)
+maxss = vec_op_clip('fmax', 32, _float_min_max)
+maxsd = vec_op_clip('fmax', 64, _float_min_max)
+
+def _float_compare_to_mask(expr):
+    if expr.op == 'unord':
+        to_ext = m2_expr.expr_is_NaN(expr.args[0]) | m2_expr.expr_is_NaN(expr.args[1])
+    elif expr.op == 'ord':
+        to_ext = ~m2_expr.expr_is_NaN(expr.args[0]) & ~m2_expr.expr_is_NaN(expr.args[1])
+    else:
+        if expr.op == '==fu':
+            to_ext = m2_expr.expr_is_float_equal(expr.args[0], expr.args[1])
+            on_NaN = m2_expr.ExprInt(0, 1)
+        elif expr.op == '<fu':
+            to_ext = m2_expr.expr_is_float_lower(expr.args[0], expr.args[1])
+            on_NaN = m2_expr.ExprInt(0, 1)
+        elif expr.op == '<=fu':
+            to_ext = (m2_expr.expr_is_float_equal(expr.args[0], expr.args[1]) |
+                      m2_expr.expr_is_float_lower(expr.args[0], expr.args[1]))
+            on_NaN = m2_expr.ExprInt(0, 1)
+        elif expr.op == '!=fu':
+            to_ext = ~m2_expr.expr_is_float_equal(expr.args[0], expr.args[1])
+            on_NaN = m2_expr.ExprInt(1, 1)
+        elif expr.op == '!<fu':
+            to_ext = ~m2_expr.expr_is_float_lower(expr.args[0], expr.args[1])
+            on_NaN = m2_expr.ExprInt(1, 1)
+        elif expr.op == '!<=fu':
+            to_ext = ~(m2_expr.expr_is_float_equal(expr.args[0], expr.args[1]) |
+                      m2_expr.expr_is_float_lower(expr.args[0], expr.args[1]))
+            on_NaN = m2_expr.ExprInt(1, 1)
+
+        to_ext = m2_expr.ExprCond(
+            m2_expr.expr_is_NaN(expr.args[0]) | m2_expr.expr_is_NaN(expr.args[1]),
+            on_NaN,
+            to_ext
+        )
+    return to_ext.signExtend(expr.size)
+
+cmpeqps = vec_vertical_instr('==fu', 32, lambda x: _float_compare_to_mask(x))
+cmpeqpd = vec_vertical_instr('==fu', 64, lambda x: _float_compare_to_mask(x))
+cmpeqss = vec_op_clip('==fu', 32, lambda x: _float_compare_to_mask(x))
+cmpeqsd = vec_op_clip('==fu', 64, lambda x: _float_compare_to_mask(x))
+cmpltps = vec_vertical_instr('<fu', 32, lambda x: _float_compare_to_mask(x))
+cmpltpd = vec_vertical_instr('<fu', 64, lambda x: _float_compare_to_mask(x))
+cmpltss = vec_op_clip('<fu', 32, lambda x: _float_compare_to_mask(x))
+cmpltsd = vec_op_clip('<fu', 64, lambda x: _float_compare_to_mask(x))
+cmpleps = vec_vertical_instr('<=fu', 32, lambda x: _float_compare_to_mask(x))
+cmplepd = vec_vertical_instr('<=fu', 64, lambda x: _float_compare_to_mask(x))
+cmpless = vec_op_clip('<=fu', 32, lambda x: _float_compare_to_mask(x))
+cmplesd = vec_op_clip('<=fu', 64, lambda x: _float_compare_to_mask(x))
+cmpunordps = vec_vertical_instr('unord', 32, lambda x: _float_compare_to_mask(x))
+cmpunordpd = vec_vertical_instr('unord', 64, lambda x: _float_compare_to_mask(x))
+cmpunordss = vec_op_clip('unord', 32, lambda x: _float_compare_to_mask(x))
+cmpunordsd = vec_op_clip('unord', 64, lambda x: _float_compare_to_mask(x))
+cmpneqps = vec_vertical_instr('!=fu', 32, lambda x: _float_compare_to_mask(x))
+cmpneqpd = vec_vertical_instr('!=fu', 64, lambda x: _float_compare_to_mask(x))
+cmpneqss = vec_op_clip('!=fu', 32, lambda x: _float_compare_to_mask(x))
+cmpneqsd = vec_op_clip('!=fu', 64, lambda x: _float_compare_to_mask(x))
+cmpnltps = vec_vertical_instr('!<fu', 32, lambda x: _float_compare_to_mask(x))
+cmpnltpd = vec_vertical_instr('!<fu', 64, lambda x: _float_compare_to_mask(x))
+cmpnltss = vec_op_clip('!<fu', 32, lambda x: _float_compare_to_mask(x))
+cmpnltsd = vec_op_clip('!<fu', 64, lambda x: _float_compare_to_mask(x))
+cmpnleps = vec_vertical_instr('!<=fu', 32, lambda x: _float_compare_to_mask(x))
+cmpnlepd = vec_vertical_instr('!<=fu', 64, lambda x: _float_compare_to_mask(x))
+cmpnless = vec_op_clip('!<=fu', 32, lambda x: _float_compare_to_mask(x))
+cmpnlesd = vec_op_clip('!<=fu', 64, lambda x: _float_compare_to_mask(x))
+cmpordps = vec_vertical_instr('ord', 32, lambda x: _float_compare_to_mask(x))
+cmpordpd = vec_vertical_instr('ord', 64, lambda x: _float_compare_to_mask(x))
+cmpordss = vec_op_clip('ord', 32, lambda x: _float_compare_to_mask(x))
+cmpordsd = vec_op_clip('ord', 64, lambda x: _float_compare_to_mask(x))
 
 # Logical (floating-point)
 #
@@ -3665,31 +3738,31 @@ def por(_, instr, dst, src):
 def cvtdq2pd(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:64], m2_expr.ExprOp('int_32_to_double', src[:32])))
+        m2_expr.ExprAff(dst[:64], m2_expr.ExprOp('sint_to_fp64', src[:32])))
     e.append(
-        m2_expr.ExprAff(dst[64:128], m2_expr.ExprOp('int_32_to_double', src[32:64])))
+        m2_expr.ExprAff(dst[64:128], m2_expr.ExprOp('sint_to_fp64', src[32:64])))
     return e, []
 
 
 def cvtdq2ps(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('int_32_to_float', src[:32])))
+        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('sint_to_fp32', src[:32])))
     e.append(
-        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('int_32_to_float', src[32:64])))
+        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('sint_to_fp32', src[32:64])))
     e.append(
-        m2_expr.ExprAff(dst[64:96], m2_expr.ExprOp('int_32_to_float', src[64:96])))
+        m2_expr.ExprAff(dst[64:96], m2_expr.ExprOp('sint_to_fp32', src[64:96])))
     e.append(
-        m2_expr.ExprAff(dst[96:128], m2_expr.ExprOp('int_32_to_float', src[96:128])))
+        m2_expr.ExprAff(dst[96:128], m2_expr.ExprOp('sint_to_fp32', src[96:128])))
     return e, []
 
 
 def cvtpd2dq(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('double_to_int_32', src[:64])))
+        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('fp_to_sint32', src[:64])))
     e.append(
-        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('double_to_int_32', src[64:128])))
+        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('fp_to_sint32', src[64:128])))
     e.append(m2_expr.ExprAff(dst[64:128], m2_expr.ExprInt(0, 64)))
     return e, []
 
@@ -3697,18 +3770,18 @@ def cvtpd2dq(_, instr, dst, src):
 def cvtpd2pi(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('double_to_int_32', src[:64])))
+        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('fp_to_sint32', src[:64])))
     e.append(
-        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('double_to_int_32', src[64:128])))
+        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('fp_to_sint32', src[64:128])))
     return e, []
 
 
 def cvtpd2ps(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('double_to_float', src[:64])))
+        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('fpconvert_fp32', src[:64])))
     e.append(
-        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('double_to_float', src[64:128])))
+        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('fpconvert_fp32', src[64:128])))
     e.append(m2_expr.ExprAff(dst[64:128], m2_expr.ExprInt(0, 64)))
     return e, []
 
@@ -3716,148 +3789,131 @@ def cvtpd2ps(_, instr, dst, src):
 def cvtpi2pd(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:64], m2_expr.ExprOp('int_32_to_double', src[:32])))
+        m2_expr.ExprAff(dst[:64], m2_expr.ExprOp('sint_to_fp64', src[:32])))
     e.append(
-        m2_expr.ExprAff(dst[64:128], m2_expr.ExprOp('int_32_to_double', src[32:64])))
+        m2_expr.ExprAff(dst[64:128], m2_expr.ExprOp('sint_to_fp64', src[32:64])))
     return e, []
 
 
 def cvtpi2ps(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('int_32_to_float', src[:32])))
+        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('sint_to_fp32', src[:32])))
     e.append(
-        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('int_32_to_float', src[32:64])))
+        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('sint_to_fp32', src[32:64])))
     return e, []
 
 
 def cvtps2dq(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('float_to_int_32', src[:32])))
+        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('fp_to_sint32', src[:32])))
     e.append(
-        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('float_to_int_32', src[32:64])))
+        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('fp_to_sint32', src[32:64])))
     e.append(
-        m2_expr.ExprAff(dst[64:96], m2_expr.ExprOp('float_to_int_32', src[64:96])))
+        m2_expr.ExprAff(dst[64:96], m2_expr.ExprOp('fp_to_sint32', src[64:96])))
     e.append(
-        m2_expr.ExprAff(dst[96:128], m2_expr.ExprOp('float_to_int_32', src[96:128])))
+        m2_expr.ExprAff(dst[96:128], m2_expr.ExprOp('fp_to_sint32', src[96:128])))
     return e, []
 
 
 def cvtps2pd(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:64], m2_expr.ExprOp('float_to_double', src[:32])))
+        m2_expr.ExprAff(dst[:64], m2_expr.ExprOp('fpconvert_fp64', src[:32])))
     e.append(
-        m2_expr.ExprAff(dst[64:128], m2_expr.ExprOp('float_to_double', src[32:64])))
+        m2_expr.ExprAff(dst[64:128], m2_expr.ExprOp('fpconvert_fp64', src[32:64])))
     return e, []
 
 
 def cvtps2pi(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('float_to_int_32', src[:32])))
+        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('fp_to_sint32', src[:32])))
     e.append(
-        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('float_to_int_32', src[32:64])))
+        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('fp_to_sint32', src[32:64])))
     return e, []
 
 
 def cvtsd2si(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('double_to_int_32', src[:64])))
+        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('fp_to_sint32', src[:64])))
     return e, []
 
 
 def cvtsd2ss(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('double_to_float', src[:64])))
+        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('fpconvert_fp32', src[:64])))
     return e, []
 
 
 def cvtsi2sd(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:64], m2_expr.ExprOp('int_32_to_double', src[:32])))
+        m2_expr.ExprAff(dst[:64], m2_expr.ExprOp('sint_to_fp64', src[:32])))
     return e, []
 
 
 def cvtsi2ss(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('int_32_to_float', src[:32])))
+        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('sint_to_fp32', src[:32])))
     return e, []
 
 
 def cvtss2sd(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:64], m2_expr.ExprOp('float_to_double', src[:32])))
+        m2_expr.ExprAff(dst[:64], m2_expr.ExprOp('fpconvert_fp64', src[:32])))
     return e, []
 
 
 def cvtss2si(_, instr, dst, src):
     e = []
     e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('float_to_int_32', src[:32])))
+        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('fp_to_sint32', src[:32])))
     return e, []
 
 
-def cvttpd2pi(_, instr, dst, src):
+def _cvtt_tpl(dst, src, numbers, double):
     e = []
-    e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('double_trunc_to_int_32', src[:64])))
-    e.append(
-        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('double_trunc_to_int_32', src[64:128])))
-    return e, []
+    for i in numbers:
+        # For CVTT*D2* (Convert with Truncation ... Double-Precision) to work,
+        # a first conversion fp64 -> fp32 is needed
+        if double:
+            tmp_src = m2_expr.ExprOp('fpconvert_fp32', src[i*64:i*64 + 64])
+        else:
+            tmp_src = src[i*32:i*32 + 32]
+
+        e.append(m2_expr.ExprAff(
+            dst[i*32:i*32 + 32],
+            m2_expr.ExprOp('fp_to_sint32', m2_expr.ExprOp(
+                'fpround_towardszero',
+                tmp_src
+            ))))
+    return e
 
+def cvttpd2pi(_, instr, dst, src):
+    return _cvtt_tpl(dst, src, [0, 1], double=True), []
 
 def cvttpd2dq(_, instr, dst, src):
-    e = []
-    e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('double_trunc_to_int_32', src[:64])))
-    e.append(
-        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('double_trunc_to_int_32', src[64:128])))
+    e = _cvtt_tpl(dst, src, [0, 1], double=True)
     e.append(m2_expr.ExprAff(dst[64:128], m2_expr.ExprInt(0, 64)))
     return e, []
 
+def cvttsd2si(_, instr, dst, src):
+    return _cvtt_tpl(dst, src, [0], double=True), []
 
 def cvttps2dq(_, instr, dst, src):
-    e = []
-    e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('float_trunc_to_int_32', src[:32])))
-    e.append(
-        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('float_trunc_to_int_32', src[32:64])))
-    e.append(
-        m2_expr.ExprAff(dst[64:96], m2_expr.ExprOp('float_trunc_to_int_32', src[64:96])))
-    e.append(
-        m2_expr.ExprAff(dst[96:128], m2_expr.ExprOp('float_trunc_to_int_32', src[96:128])))
-    return e, []
-
+    return _cvtt_tpl(dst, src, [0, 1, 2, 3], double=False), []
 
 def cvttps2pi(_, instr, dst, src):
-    e = []
-    e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('float_trunc_to_int_32', src[:32])))
-    e.append(
-        m2_expr.ExprAff(dst[32:64], m2_expr.ExprOp('float_trunc_to_int_32', src[32:64])))
-    return e, []
-
-
-def cvttsd2si(_, instr, dst, src):
-    e = []
-    e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('double_trunc_to_int_32', src[:64])))
-    return e, []
-
+    return _cvtt_tpl(dst, src, [0, 1], double=False), []
 
 def cvttss2si(_, instr, dst, src):
-    e = []
-    e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('float_trunc_to_int_32', src[:32])))
-    return e, []
-
+    return _cvtt_tpl(dst, src, [0], double=False), []
 
 def movss(_, instr, dst, src):
     e = []
@@ -3925,52 +3981,55 @@ def pshufb(_, instr, dst, src):
 
 
 def pshufd(_, instr, dst, src, imm):
-    e = []
+    control = int(imm)
+    out = []
     for i in xrange(4):
-        index = imm[2 * i:2 * (i + 1)].zeroExtend(dst.size)
-        index <<= m2_expr.ExprInt(5, dst.size)
-        value = (dst >> index)[:32]
-        e.append(m2_expr.ExprAff(dst[32 * i:32 * (i + 1)], value))
-    return e, []
+        shift = ((control >> (i * 2)) & 3) * 32
+        # shift is 2 bits long, expr.size is 128
+        # => shift + 32 <= src.size
+        out.append(src[shift: shift + 32])
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out))], []
 
 
-def ps_rl_ll(ir, instr, dst, src, op, size):
-    loc_zero, loc_zero_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
-    loc_do, loc_do_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
-    loc_next = ir.get_next_loc_key(instr)
-    loc_next_expr = m2_expr.ExprLoc(loc_next, ir.IRDst.size)
+def pshuflw(_, instr, dst, src, imm):
+    control = int(imm)
+    out = []
+    for i in xrange(4):
+        shift = ((control >> (i * 2)) & 3) * 16
+        out.append(src[shift: shift + 16])
+    out.append(src[64:])
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out))], []
 
-    if src.size == 8:
-        count = src.zeroExtend(dst.size)
-    else:
-        count = src.zeroExtend(dst.size)
 
+def pshufhw(_, instr, dst, src, imm):
+    control = int(imm)
+    out = [src[:64]]
+    for i in xrange(4):
+        shift = ((control >> (i * 2)) & 3) * 16
+        out.append(src[shift + 64: shift + 16 + 64])
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out))], []
+
+
+def ps_rl_ll(ir, instr, dst, src, op, size):
     mask = {16: 0xF,
             32: 0x1F,
             64: 0x3F}[size]
-    test = expr_simp(count & m2_expr.ExprInt(
-        ((1 << dst.size) - 1) ^ mask, dst.size))
-    e = [m2_expr.ExprAff(ir.IRDst, m2_expr.ExprCond(test,
-                                                    loc_zero_expr,
-                                                    loc_do_expr))]
-
-    slices = []
-    for i in xrange(0, dst.size, size):
-        slices.append(m2_expr.ExprOp(op, dst[i:i + size], count[:size]))
+    mask = m2_expr.ExprInt(mask, dst.size)
 
-    if isinstance(test, m2_expr.ExprInt):
-        if int(test) == 0:
-            return [m2_expr.ExprAff(dst[0:dst.size], m2_expr.ExprCompose(*slices))], []
-        else:
-            return [m2_expr.ExprAff(dst, m2_expr.ExprInt(0, dst.size))], []
+    # Saturate the counter to 2**size
+    count = src.zeroExtend(dst.size)
+    count = m2_expr.ExprCond(count & expr_simp(~mask),
+                             m2_expr.ExprInt(size, dst.size), # saturation
+                             count, # count < 2**size
+    )
+    count = count[:size]
+    if src.is_int():
+        count = expr_simp(count)
 
-    e_zero = [m2_expr.ExprAff(dst, m2_expr.ExprInt(0, dst.size)),
-              m2_expr.ExprAff(ir.IRDst, loc_next_expr)]
-    e_do = []
-    e.append(m2_expr.ExprAff(dst[0:dst.size], m2_expr.ExprCompose(*slices)))
-    e_do.append(m2_expr.ExprAff(ir.IRDst, loc_next_expr))
-    return e, [IRBlock(loc_do, [AssignBlock(e_do, instr)]),
-               IRBlock(loc_zero, [AssignBlock(e_zero, instr)])]
+    out = []
+    for i in xrange(0, dst.size, size):
+        out.append(m2_expr.ExprOp(op, dst[i:i + size], count))
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out))], []
 
 
 def psrlw(ir, instr, dst, src):
@@ -3997,6 +4056,14 @@ def psllq(ir, instr, dst, src):
     return ps_rl_ll(ir, instr, dst, src, "<<",  64)
 
 
+def psraw(ir, instr, dst, src):
+    return ps_rl_ll(ir, instr, dst, src, "a>>", 16)
+
+
+def psrad(ir, instr, dst, src):
+    return ps_rl_ll(ir, instr, dst, src, "a>>", 32)
+
+
 def pslldq(_, instr, dst, src):
     assert src.is_int()
     e = []
@@ -4250,11 +4317,17 @@ def movdq2q(_, instr, dst, src):
     return e, []
 
 
+def movq2dq(_, instr, dst, src):
+    e = []
+    e.append(m2_expr.ExprAff(dst, src[:64].zeroExtend(dst.size)))
+    return e, []
+
+
 def sqrt_gen(_, instr, dst, src, size):
     e = []
     out = []
-    for i in src.size / size:
-        out.append(m2_expr.ExprOp('fsqrt' % size,
+    for i in xrange(src.size / size):
+        out.append(m2_expr.ExprOp('fsqrt',
                                   src[i * size: (i + 1) * size]))
     src = m2_expr.ExprCompose(*out)
     e.append(m2_expr.ExprAff(dst, src))
@@ -4479,10 +4552,10 @@ def maskmovq(ir, instr, src, mask):
     blks = []
 
     # For each possibility, check if a write is necessary
-    check_labels = [m2_expr.ExprId(ir.gen_label(), ir.IRDst.size)
+    check_labels = [m2_expr.ExprLoc(ir.loc_db.add_location(), ir.IRDst.size)
                     for _ in xrange(0, mask.size, 8)]
     # If the write has to be done, do it (otherwise, nothing happen)
-    write_labels = [m2_expr.ExprId(ir.gen_label(), ir.IRDst.size)
+    write_labels = [m2_expr.ExprLoc(ir.loc_db.add_location(), ir.IRDst.size)
                     for _ in xrange(0, mask.size, 8)]
 
     # Build check blocks
@@ -4495,7 +4568,7 @@ def maskmovq(ir, instr, src, mask):
                                 m2_expr.ExprCond(bit,
                                                  write_label,
                                                  next_check_label))
-        blks.append(IRBlock(cur_label.name.loc_key, [AssignBlock([check], instr)]))
+        blks.append(IRBlock(cur_label.loc_key, [AssignBlock([check], instr)]))
 
     # Build write blocks
     dst_addr = mRDI[instr.mode]
@@ -4509,7 +4582,7 @@ def maskmovq(ir, instr, src, mask):
         write_mem = m2_expr.ExprAff(m2_expr.ExprMem(write_addr, 8),
                                     src[start: start + 8])
         jump = m2_expr.ExprAff(ir.IRDst, next_check_label)
-        blks.append(IRBlock(cur_label.name.loc_key, [AssignBlock([write_mem, jump], instr)]))
+        blks.append(IRBlock(cur_label.loc_key, [AssignBlock([write_mem, jump], instr)]))
 
     # If mask is null, bypass all
     e = [m2_expr.ExprAff(ir.IRDst, m2_expr.ExprCond(mask,
@@ -4522,6 +4595,63 @@ def emms(ir, instr):
     # Implemented as a NOP
     return [], []
 
+# Common value without too many option, 0x1fa0
+STMXCSR_VALUE = 0x1fa0
+def stmxcsr(ir, instr, dst):
+    return [m2_expr.ExprAff(dst, m2_expr.ExprInt(STMXCSR_VALUE, dst.size))], []
+
+def ldmxcsr(ir, instr, dst):
+    # Implemented as a NOP
+    return [], []
+
+
+def _select4(src, control):
+    # Implementation inspired from Intel Intrisics Guide
+    # @control is already resolved (was an immediate)
+
+    if control == 0:
+        return src[:32] # 0
+    elif control == 1:
+        return src[32:64]
+    elif control == 2:
+        return src[64:96]
+    elif control == 3:
+        return src[96:]
+    else:
+        raise ValueError("Control must be on 2 bits")
+
+
+def shufps(ir, instr, dst, src, imm8):
+    out = []
+    control = int(imm8)
+    for i in xrange(4):
+        if i < 2:
+            source = dst
+        else:
+            source = src
+        out.append(_select4(source, (control >> (i * 2)) & 3))
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out))], []
+
+
+def shufpd(ir, instr, dst, src, imm8):
+    out = []
+    control = int(imm8)
+    out.append(dst[64:] if control & 1 else dst[:64])
+    out.append(src[64:] if control & 2 else src[:64])
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out))], []
+
+def movmskps(ir, instr, dst, src):
+    out = []
+    for i in xrange(4):
+        out.append(src[(32 * i) + 31:(32 * i) + 32])
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out).zeroExtend(dst.size))], []
+
+def movmskpd(ir, instr, dst, src):
+    out = []
+    for i in xrange(2):
+        out.append(src[(64 * i) + 63:(64 * i) + 64])
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out).zeroExtend(dst.size))], []
+
 
 mnemo_func = {'mov': mov,
               'xchg': xchg,
@@ -4961,6 +5091,49 @@ mnemo_func = {'mov': mov,
               "divps": divps,
               "divpd": divpd,
 
+              # Comparisons (floating-point)
+              #
+              "minps": minps,
+              "minpd": minpd,
+              "minss": minss,
+              "minsd": minsd,
+              "maxps": maxps,
+              "maxpd": maxpd,
+              "maxss": maxss,
+              "maxsd": maxsd,
+              "cmpeqps": cmpeqps,
+              "cmpeqpd": cmpeqpd,
+              "cmpeqss": cmpeqss,
+              "cmpeqsd": cmpeqsd,
+              "cmpltps": cmpltps,
+              "cmpltpd": cmpltpd,
+              "cmpltss": cmpltss,
+              "cmpltsd": cmpltsd,
+              "cmpleps": cmpleps,
+              "cmplepd": cmplepd,
+              "cmpless": cmpless,
+              "cmplesd": cmplesd,
+              "cmpunordps": cmpunordps,
+              "cmpunordpd": cmpunordpd,
+              "cmpunordss": cmpunordss,
+              "cmpunordsd": cmpunordsd,
+              "cmpneqps": cmpneqps,
+              "cmpneqpd": cmpneqpd,
+              "cmpneqss": cmpneqss,
+              "cmpneqsd": cmpneqsd,
+              "cmpnltps": cmpnltps,
+              "cmpnltpd": cmpnltpd,
+              "cmpnltss": cmpnltss,
+              "cmpnltsd": cmpnltsd,
+              "cmpnleps": cmpnleps,
+              "cmpnlepd": cmpnlepd,
+              "cmpnless": cmpnless,
+              "cmpnlesd": cmpnlesd,
+              "cmpordps": cmpordps,
+              "cmpordpd": cmpordpd,
+              "cmpordss": cmpordss,
+              "cmpordsd": cmpordsd,
+
               # Logical (floating-point)
               #
 
@@ -4972,6 +5145,8 @@ mnemo_func = {'mov': mov,
               "wrmsr": wrmsr,
               "pshufb": pshufb,
               "pshufd": pshufd,
+              "pshuflw": pshuflw,
+              "pshufhw": pshufhw,
 
               "psrlw": psrlw,
               "psrld": psrld,
@@ -4981,6 +5156,8 @@ mnemo_func = {'mov': mov,
               "psllq": psllq,
               "pslldq": pslldq,
               "psrldq": psrldq,
+              "psraw": psraw,
+              "psrad": psrad,
 
               "palignr": palignr,
 
@@ -5036,6 +5213,7 @@ mnemo_func = {'mov': mov,
               "movlhps": movlhps,
               "movhlps": movhlps,
               "movdq2q": movdq2q,
+              "movq2dq": movq2dq,
 
               "sqrtpd": sqrtpd,
               "sqrtps": sqrtps,
@@ -5061,6 +5239,12 @@ mnemo_func = {'mov': mov,
               "maskmovq": maskmovq,
               "maskmovdqu": maskmovq,
               "emms": emms,
+              "shufps": shufps,
+              "shufpd": shufpd,
+              "movmskps": movmskps,
+              "movmskpd": movmskpd,
+              "stmxcsr": stmxcsr,
+              "ldmxcsr": ldmxcsr,
               }
 
 
diff --git a/miasm2/expression/expression.py b/miasm2/expression/expression.py
index 8e63e6a2..3cf37070 100644
--- a/miasm2/expression/expression.py
+++ b/miasm2/expression/expression.py
@@ -1018,26 +1018,12 @@ class ExprOp(Expr):
                            TOK_POS_STRICT,
                           ]:
             size = 1
-        elif self._op in ['mem_16_to_double', 'mem_32_to_double',
-                           'mem_64_to_double', 'mem_80_to_double',
-                           'int_16_to_double', 'int_32_to_double',
-                           'int_64_to_double', 'int_80_to_double']:
-            size = 64
-        elif self._op in ['double_to_mem_16', 'double_to_int_16',
-                           'float_trunc_to_int_16', 'double_trunc_to_int_16']:
-            size = 16
-        elif self._op in ['double_to_mem_32', 'double_to_int_32',
-                           'float_trunc_to_int_32', 'double_trunc_to_int_32',
-                           'double_to_float']:
-            size = 32
-        elif self._op in ['double_to_mem_64', 'double_to_int_64',
-                           'float_trunc_to_int_64', 'double_trunc_to_int_64',
-                           'float_to_double']:
-            size = 64
-        elif self._op in ['double_to_mem_80', 'double_to_int_80',
-                           'float_trunc_to_int_80',
-                           'double_trunc_to_int_80']:
-            size = 80
+        elif self._op.startswith("sint_to_fp"):
+            size = int(self._op[len("sint_to_fp"):])
+        elif self._op.startswith("fp_to_sint"):
+            size = int(self._op[len("fp_to_sint"):])
+        elif self._op.startswith("fpconvert_fp"):
+            size = int(self._op[len("fpconvert_fp"):])
         elif self._op in ['segm']:
             size = self._args[1].size
         else:
@@ -1884,3 +1870,80 @@ def expr_is_signed_lower_or_equal(op1, op2):
     of = _expr_compute_of(op1, op2)
     zf = _expr_compute_zf(op1, op2)
     return zf | (nf ^ of)
+
+# sign bit | exponent | significand
+size_to_IEEE754_info = {
+    16: {
+        "exponent": 5,
+        "significand": 10,
+    },
+    32: {
+        "exponent": 8,
+        "significand": 23,
+    },
+    64: {
+        "exponent": 11,
+        "significand": 52,
+    },
+}
+
+def expr_is_NaN(expr):
+    """Return 1 or 0 on 1 bit if expr represent a NaN value according to IEEE754
+    """
+    info = size_to_IEEE754_info[expr.size]
+    exponent = expr[info["significand"]: info["significand"] + info["exponent"]]
+
+    # exponent is full of 1s and significand is not NULL
+    return ExprCond(exponent - ExprInt(-1, exponent.size),
+                    ExprInt(0, 1),
+                    ExprCond(expr[:info["significand"]], ExprInt(1, 1),
+                             ExprInt(0, 1)))
+
+
+def expr_is_qNaN(expr):
+    """Return 1 or 0 on 1 bit if expr represent a qNaN (quiet) value according to
+    IEEE754
+    """
+    info = size_to_IEEE754_info[expr.size]
+    significand_top = expr[info["significand"]: info["significand"] + 1]
+    return expr_is_NaN(expr) & significand_top
+
+
+def expr_is_sNaN(expr):
+    """Return 1 or 0 on 1 bit if expr represent a sNaN (signalling) value according
+    to IEEE754
+    """
+    info = size_to_IEEE754_info[expr.size]
+    significand_top = expr[info["significand"]: info["significand"] + 1]
+    return expr_is_NaN(expr) & ~significand_top
+
+
+def expr_is_float_lower(op1, op2):
+    """Return 1 on 1 bit if @op1 < @op2, 0 otherwise.
+    /!\ Assume @op1 and @op2 are not NaN
+    Comparision is the floating point one, defined in IEEE754
+    """
+    sign1, sign2 = op1.msb(), op2.msb()
+    magn1, magn2 = op1[:-1], op2[:-1]
+    return ExprCond(sign1 ^ sign2,
+                    # Sign different, only the sign matters
+                    sign1, # sign1 ? op1 < op2 : op1 >= op2
+                    # Sign equals, the result is inversed for negatives
+                    sign1 ^ (expr_is_unsigned_lower(magn1, magn2)))
+
+
+def expr_is_float_equal(op1, op2):
+    """Return 1 on 1 bit if @op1 == @op2, 0 otherwise.
+    /!\ Assume @op1 and @op2 are not NaN
+    Comparision is the floating point one, defined in IEEE754
+    """
+    sign1, sign2 = op1.msb(), op2.msb()
+    magn1, magn2 = op1[:-1], op2[:-1]
+    return ExprCond(magn1 ^ magn2,
+                    ExprInt(0, 1),
+                    ExprCond(magn1,
+                             # magn1 == magn2, are the signal equals?
+                             ~(sign1 ^ sign2),
+                             # Special case: -0.0 == +0.0
+                             ExprInt(1, 1))
+                    )
diff --git a/miasm2/ir/translators/C.py b/miasm2/ir/translators/C.py
index cafec7c8..f8fd4d3b 100644
--- a/miasm2/ir/translators/C.py
+++ b/miasm2/ir/translators/C.py
@@ -94,17 +94,70 @@ class TranslatorC(Translator):
                     self.from_expr(expr.args[0]),
                     self._size2mask(expr.args[0].size),
                 )
-            elif (expr.op.startswith("double_to_") or
-                  expr.op.endswith("_to_double")   or
-                  expr.op.startswith("access_")    or
+            elif expr.op in [
+                    "ftan", "frndint", "f2xm1", "fsin", "fsqrt", "fabs", "fcos",
+                    "fchs",
+            ]:
+                return "fpu_%s%d(%s)" % (
+                    expr.op,
+                    expr.size,
+                    self.from_expr(expr.args[0]),
+                )
+            elif (expr.op.startswith("access_")    or
                   expr.op.startswith("load_")      or
                   expr.op.startswith("fxam_c")     or
-                  expr.op in ["-", "ftan", "frndint", "f2xm1",
-                              "fsin", "fsqrt", "fabs", "fcos", "fchs"]):
+                  expr.op in ["-"]):
                 return "%s(%s)" % (
                     expr.op,
                     self.from_expr(expr.args[0])
                 )
+            elif expr.op.startswith("fpround_"):
+                return "%s_fp%d(%s)" % (
+                    expr.op,
+                    expr.size,
+                    self.from_expr(expr.args[0]),
+                )
+            elif expr.op.startswith("sint_to_fp"):
+                dest_size = expr.size
+                arg_size = expr.args[0].size
+                if (arg_size, dest_size) in [
+                        (32, 32), (64, 64), (32, 64),
+                ]:
+                    func = "sint%d_to_fp%d" % (arg_size, dest_size)
+                else:
+                    raise RuntimeError(
+                        "Unsupported size for sint_to_fp: %r to %r" % (
+                            arg_size,
+                            dest_size
+                        ))
+                return "%s(%s)" % (func, self.from_expr(expr.args[0]))
+            elif expr.op.startswith("fp_to_sint"):
+                dest_size = expr.size
+                arg_size = expr.args[0].size
+                if (arg_size, dest_size) in [
+                        (32, 32), (64, 64), (64, 32),
+                ]:
+                    func = "fp%d_to_sint%d" % (arg_size, dest_size)
+                else:
+                    raise RuntimeError(
+                        "Unsupported size for fp_to_sint: %r to %r" % (
+                            arg_size,
+                            dest_size
+                        ))
+                return "%s(%s)" % (func, self.from_expr(expr.args[0]))
+            elif expr.op.startswith("fpconvert_fp"):
+                dest_size = expr.size
+                arg_size = expr.args[0].size
+                if (arg_size, dest_size) in [
+                        (32, 64), (64, 32)
+                ]:
+                    func = "fp%d_to_fp%d" % (arg_size, dest_size)
+                else:
+                    raise RuntimeError(
+                        "Unsupported size for fpconvert: %r to %r" % (arg_size,
+                                                                      dest_size)
+                    )
+                return "%s(%s)" % (func, self.from_expr(expr.args[0]))
             else:
                 raise NotImplementedError('Unknown op: %r' % expr.op)
 
@@ -155,10 +208,11 @@ class TranslatorC(Translator):
             elif (expr.op.startswith("fcom")  or
                   expr.op in ["fadd", "fsub", "fdiv", 'fmul', "fscale",
                               "fprem", "fprem_lsb", "fyl2x", "fpatan"]):
-                return "fpu_%s(%s, %s)" % (
+                return "fpu_%s%d(%s, %s)" % (
                     expr.op,
+                    expr.size,
                     self.from_expr(expr.args[0]),
-                    self.from_expr(expr.args[1])
+                    self.from_expr(expr.args[1]),
                 )
             elif expr.op == "segm":
                 return "segm2addr(jitcpu, %s, %s)" % (
@@ -209,8 +263,8 @@ class TranslatorC(Translator):
         if expr.size in [8, 16, 32, 64, 128]:
             size = expr.size
         else:
-            # Uncommon expression size
-            size = expr.size
+            # Uncommon expression size, use at least uint8
+            size = max(expr.size, 8)
             next_power = 1
             while next_power <= size:
                 next_power <<= 1
diff --git a/miasm2/jitter/arch/JitCore_x86.h b/miasm2/jitter/arch/JitCore_x86.h
index 221ba5db..a5fc4bd4 100644
--- a/miasm2/jitter/arch/JitCore_x86.h
+++ b/miasm2/jitter/arch/JitCore_x86.h
@@ -49,14 +49,14 @@ typedef struct {
 
 	uint64_t cond;
 
-	double float_st0;
-	double float_st1;
-	double float_st2;
-	double float_st3;
-	double float_st4;
-	double float_st5;
-	double float_st6;
-	double float_st7;
+	uint64_t float_st0;
+	uint64_t float_st1;
+	uint64_t float_st2;
+	uint64_t float_st3;
+	uint64_t float_st4;
+	uint64_t float_st5;
+	uint64_t float_st6;
+	uint64_t float_st7;
 
 	unsigned int float_c0;
 	unsigned int float_c1;
diff --git a/miasm2/jitter/llvmconvert.py b/miasm2/jitter/llvmconvert.py
index d63351cc..c4e6709d 100644
--- a/miasm2/jitter/llvmconvert.py
+++ b/miasm2/jitter/llvmconvert.py
@@ -51,6 +51,17 @@ class LLVMType(llvm_ir.Type):
         else:
             raise ValueError()
 
+    @classmethod
+    def fptype(cls, size):
+        """Return the floating type corresponding to precision @size"""
+        if size == 32:
+            precision = llvm_ir.FloatType()
+        elif size == 64:
+            precision = llvm_ir.DoubleType()
+        else:
+            raise RuntimeError("Unsupported precision: %d" % size)
+        return precision
+
 
 class LLVMContext():
 
@@ -236,8 +247,16 @@ class LLVMContext_JIT(LLVMContext):
         i8 = LLVMType.IntType(8)
         p8 = llvm_ir.PointerType(i8)
         itype = LLVMType.IntType(64)
+        ftype = llvm_ir.FloatType()
+        dtype = llvm_ir.DoubleType()
         fc = {"llvm.ctpop.i8": {"ret": i8,
                                 "args": [i8]},
+              "llvm.nearbyint.f32": {"ret": ftype,
+                                     "args": [ftype]},
+              "llvm.nearbyint.f64": {"ret": dtype,
+                                     "args": [dtype]},
+              "llvm.trunc.f32": {"ret": ftype,
+                                 "args": [ftype]},
               "segm2addr": {"ret": itype,
                             "args": [p8,
                                      itype,
@@ -245,6 +264,22 @@ class LLVMContext_JIT(LLVMContext):
               "x86_cpuid": {"ret": itype,
                         "args": [itype,
                                  itype]},
+              "fcom_c0": {"ret": itype,
+                          "args": [dtype,
+                                   dtype]},
+              "fcom_c1": {"ret": itype,
+                          "args": [dtype,
+                                   dtype]},
+              "fcom_c2": {"ret": itype,
+                          "args": [dtype,
+                                   dtype]},
+              "fcom_c3": {"ret": itype,
+                          "args": [dtype,
+                                   dtype]},
+              "llvm.sqrt.f32": {"ret": ftype,
+                                "args": [ftype]},
+              "llvm.sqrt.f64": {"ret": dtype,
+                                "args": [dtype]},
         }
 
         for k in [8, 16]:
@@ -466,10 +501,7 @@ class LLVMFunction():
                           [llvm_ir.Constant(LLVMType.IntType(),
                                             offset)])
         regs = self.llvm_context.ir_arch.arch.regs
-        if hasattr(regs, "float_list") and expr in regs.float_list:
-            pointee_type = llvm_ir.DoubleType()
-        else:
-            pointee_type = LLVMType.IntType(expr.size)
+        pointee_type = LLVMType.IntType(expr.size)
         ptr_casted = builder.bitcast(ptr,
                                      llvm_ir.PointerType(pointee_type))
         # Store in cache
@@ -764,15 +796,19 @@ class LLVMFunction():
                 itype = LLVMType.IntType(expr.size)
                 cond_ok = self.builder.icmp_unsigned("<", count,
                                                      itype(expr.size))
+                zero = itype(0)
                 if op == ">>":
                     callback = builder.lshr
                 elif op == "<<":
                     callback = builder.shl
                 elif op == "a>>":
                     callback = builder.ashr
+                    # x a>> size is 0 or -1, depending on x sign
+                    cond_neg = self.builder.icmp_signed("<", value, zero)
+                    zero = self.builder.select(cond_neg, itype(-1), zero)
 
                 ret = self.builder.select(cond_ok, callback(value, count),
-                                          itype(0))
+                                          zero)
                 self.update_cache(expr, ret)
                 return ret
 
@@ -800,19 +836,118 @@ class LLVMFunction():
                 self.update_cache(expr, ret)
                 return ret
 
+            if op.startswith("sint_to_fp"):
+                fptype = LLVMType.fptype(expr.size)
+                arg = self.add_ir(expr.args[0])
+                ret = builder.sitofp(arg, fptype)
+                ret = builder.bitcast(ret, llvm_ir.IntType(expr.size))
+                self.update_cache(expr, ret)
+                return ret
 
+            if op == "fp_to_sint32":
+                size_arg = expr.args[0].size
+                fptype_orig = LLVMType.fptype(size_arg)
+                arg = self.add_ir(expr.args[0])
+                arg = builder.bitcast(arg, fptype_orig)
+                # Enforce IEEE-754 behavior. This could be enhanced with
+                # 'llvm.experimental.constrained.nearbyint'
+                if size_arg == 32:
+                    func = self.mod.get_global("llvm.nearbyint.f32")
+                elif size_arg == 64:
+                    func = self.mod.get_global("llvm.nearbyint.f64")
+                else:
+                    raise RuntimeError("Unsupported size")
+                rounded = builder.call(func, [arg])
+                ret = builder.fptosi(rounded, llvm_ir.IntType(expr.size))
+                self.update_cache(expr, ret)
+                return ret
 
-            if op in ["int_16_to_double", "int_32_to_double", "int_64_to_double",
-                      "mem_16_to_double", "mem_32_to_double", "mem_64_to_double"]:
+            if op.startswith("fpconvert_fp"):
+                assert len(expr.args) == 1
+                size_arg = expr.args[0].size
+                fptype = LLVMType.fptype(expr.size)
+                fptype_orig = LLVMType.fptype(size_arg)
                 arg = self.add_ir(expr.args[0])
-                ret = builder.uitofp(arg, llvm_ir.DoubleType())
+                arg = builder.bitcast(arg, fptype_orig)
+                if expr.size > size_arg:
+                    fc = builder.fpext
+                elif expr.size < size_arg:
+                    fc = builder.fptrunc
+                else:
+                    raise RuntimeError("Not supported, same size")
+                ret = fc(arg, fptype)
+                ret = builder.bitcast(ret, llvm_ir.IntType(expr.size))
+                self.update_cache(expr, ret)
+                return ret
+
+            if op.startswith("fpround_"):
+                assert len(expr.args) == 1
+                fptype = LLVMType.fptype(expr.size)
+                arg = self.add_ir(expr.args[0])
+                arg = builder.bitcast(arg, fptype)
+                if op == "fpround_towardszero" and expr.size == 32:
+                    fc = self.mod.get_global("llvm.trunc.f32")
+                else:
+                    raise RuntimeError("Unsupported fpround: %s on %d bits" % (op, expr.size))
+                rounded = builder.call(fc, [arg])
+                ret = builder.bitcast(rounded, llvm_ir.IntType(expr.size))
                 self.update_cache(expr, ret)
                 return ret
 
-            if op in ["double_to_int_16", "double_to_int_32", "double_to_int_64",
-                      "double_to_mem_16", "double_to_mem_32", "double_to_mem_64"]:
+            if op in ["fcom_c0", "fcom_c1", "fcom_c2", "fcom_c3"]:
+                arg1 = self.add_ir(expr.args[0])
+                arg2 = self.add_ir(expr.args[1])
+                fc_name = op
+                fc_ptr = self.mod.get_global(fc_name)
+                casted_args = [
+                    builder.bitcast(arg1, llvm_ir.DoubleType()),
+                    builder.bitcast(arg2, llvm_ir.DoubleType()),
+                ]
+                ret = builder.call(fc_ptr, casted_args)
+
+                # Cast ret if needed
+                ret_size = fc_ptr.return_value.type.width
+                if ret_size > expr.size:
+                    ret = builder.trunc(ret, LLVMType.IntType(expr.size))
+                self.update_cache(expr, ret)
+                return ret
+
+            if op in ["fsqrt"]:
                 arg = self.add_ir(expr.args[0])
-                ret = builder.fptoui(arg, llvm_ir.IntType(expr.size))
+
+                # Apply the correct sqrt func
+                if expr.size == 32:
+                    arg = builder.bitcast(arg, llvm_ir.FloatType())
+                    ret = builder.call(self.mod.get_global("llvm.sqrt.f32"),
+                                       [arg])
+                elif expr.size == 64:
+                    arg = builder.bitcast(arg, llvm_ir.DoubleType())
+                    ret = builder.call(self.mod.get_global("llvm.sqrt.f64"),
+                                       [arg])
+                else:
+                    raise RuntimeError("Unsupported precision: %d" % expr.size)
+
+                ret = builder.bitcast(ret, llvm_ir.IntType(expr.size))
+                self.update_cache(expr, ret)
+                return ret
+
+            if op in ["fadd", "fmul", "fsub", "fdiv"]:
+                # More than 2 args not yet supported
+                assert len(expr.args) == 2
+                arg1 = self.add_ir(expr.args[0])
+                arg2 = self.add_ir(expr.args[1])
+                precision = LLVMType.fptype(expr.size)
+                arg1 = builder.bitcast(arg1, precision)
+                arg2 = builder.bitcast(arg2, precision)
+                if op == "fadd":
+                    ret = builder.fadd(arg1, arg2)
+                elif op == "fmul":
+                    ret = builder.fmul(arg1, arg2)
+                elif op == "fsub":
+                    ret = builder.fsub(arg1, arg2)
+                elif op == "fdiv":
+                    ret = builder.fdiv(arg1, arg2)
+                ret = builder.bitcast(ret, llvm_ir.IntType(expr.size))
                 self.update_cache(expr, ret)
                 return ret
 
@@ -832,10 +967,6 @@ class LLVMFunction():
                     callback = builder.urem
                 elif op == "/":
                     callback = builder.udiv
-                elif op == "fadd":
-                    callback = builder.fadd
-                elif op == "fdiv":
-                    callback = builder.fdiv
                 else:
                     raise NotImplementedError('Unknown op: %s' % op)
 
diff --git a/miasm2/jitter/op_semantics.c b/miasm2/jitter/op_semantics.c
index 0420532a..0bc3fcc5 100644
--- a/miasm2/jitter/op_semantics.c
+++ b/miasm2/jitter/op_semantics.c
@@ -355,147 +355,92 @@ void dump_float(void)
 	*/
 }
 
-double mem_32_to_double(unsigned int m)
+uint32_t fpu_fadd32(uint32_t a, uint32_t b)
 {
-	float f;
-	double d;
-
-	f = *((float*)&m);
-	d = f;
-#ifdef DEBUG_MIASM_DOUBLE
-	dump_float();
-	printf("%d float %e\n", m, d);
-#endif
-	return d;
-}
-
-
-double mem_64_to_double(uint64_t m)
-{
-	double d;
-	d = *((double*)&m);
+	float c;
+	c = *((float*)&a) + *((float*)&b);
 #ifdef DEBUG_MIASM_DOUBLE
 	dump_float();
-	printf("%"PRId64" double %e\n", m, d);
-#endif
-	return d;
-}
-
-double int_16_to_double(unsigned int m)
-{
-	double d;
-
-	d = (double)(m&0xffff);
-#ifdef DEBUG_MIASM_DOUBLE
-	dump_float();
-	printf("%d double %e\n", m, d);
-#endif
-	return d;
-}
-
-double int_32_to_double(unsigned int m)
-{
-	double d;
-
-	d = (double)m;
-#ifdef DEBUG_MIASM_DOUBLE
-	dump_float();
-	printf("%d double %e\n", m, d);
+	printf("%e + %e -> %e\n", *((float*)&a), *((float*)&b), c);
 #endif
-	return d;
+	return *((uint32_t*)&c);
 }
 
-double int_64_to_double(uint64_t m)
+uint64_t fpu_fadd64(uint64_t a, uint64_t b)
 {
-	double d;
-
-	d = (double)m;
+	double c;
+	c = *((double*)&a) + *((double*)&b);
 #ifdef DEBUG_MIASM_DOUBLE
 	dump_float();
-	printf("%"PRId64" double %e\n", m, d);
+	printf("%e + %e -> %e\n", *((double*)&a), *((double*)&b), c);
 #endif
-	return d;
+	return *((uint64_t*)&c);
 }
 
-int16_t double_to_int_16(double d)
+uint32_t fpu_fsub32(uint32_t a, uint32_t b)
 {
-	int16_t i;
-
-	i = (int16_t)d;
+	float c;
+	c = *((float*)&a) - *((float*)&b);
 #ifdef DEBUG_MIASM_DOUBLE
 	dump_float();
-	printf("%e int %d\n", d, i);
+	printf("%e - %e -> %e\n", *((float*)&a), *((float*)&b), c);
 #endif
-	return i;
+	return *((uint32_t*)&c);
 }
 
-int32_t double_to_int_32(double d)
+uint64_t fpu_fsub64(uint64_t a, uint64_t b)
 {
-	int32_t i;
-
-	i = (int32_t)d;
+	double c;
+	c = *((double*)&a) - *((double*)&b);
 #ifdef DEBUG_MIASM_DOUBLE
 	dump_float();
-	printf("%e int %d\n", d, i);
+	printf("%e - %e -> %e\n", *((double*)&a), *((double*)&b), c);
 #endif
-	return i;
+	return *((uint64_t*)&c);
 }
 
-int64_t double_to_int_64(double d)
+uint32_t fpu_fmul32(uint32_t a, uint32_t b)
 {
-	int64_t i;
-
-	i = (int64_t)d;
+	float c;
+	c = *((float*)&a) * *((float*)&b);
 #ifdef DEBUG_MIASM_DOUBLE
 	dump_float();
-	printf("%e int %"PRId64"\n", d, i);
+	printf("%e * %e -> %e\n", *((float*)&a), *((float*)&b), c);
 #endif
-	return i;
+	return *((uint32_t*)&c);
 }
 
-
-double fpu_fadd(double a, double b)
+uint64_t fpu_fmul64(uint64_t a, uint64_t b)
 {
 	double c;
-	c = a + b;
+	c = *((double*)&a) * *((double*)&b);
 #ifdef DEBUG_MIASM_DOUBLE
 	dump_float();
-	printf("%e + %e -> %e\n", a, b, c);
+	printf("%e * %e -> %e\n", *((double*)&a), *((double*)&b), c);
 #endif
-	return c;
+	return *((uint64_t*)&c);
 }
 
-double fpu_fsub(double a, double b)
+uint32_t fpu_fdiv32(uint32_t a, uint32_t b)
 {
-	double c;
-	c = a - b;
-#ifdef DEBUG_MIASM_DOUBLE
-	dump_float();
-	printf("%e - %e -> %e\n", a, b, c);
-#endif
-	return c;
-}
-
-double fpu_fmul(double a, double b)
-{
-	double c;
-	c = a * b;
+	float c;
+	c = *((float*)&a) / *((float*)&b);
 #ifdef DEBUG_MIASM_DOUBLE
 	dump_float();
 	printf("%e * %e -> %e\n", a, b, c);
 #endif
-	return c;
+	return *((uint32_t*)&c);
 }
 
-double fpu_fdiv(double a, double b)
+uint64_t fpu_fdiv64(uint64_t a, uint64_t b)
 {
 	double c;
-	c = a / b;
+	c = *((double*)&a) / *((double*)&b);
 #ifdef DEBUG_MIASM_DOUBLE
 	dump_float();
-	printf("%e / %e -> %e\n", a, b, c);
+	printf("%e / %e -> %e\n", *((double*)&a), *((double*)&b), c);
 #endif
-	return c;
+	return *((uint64_t*)&c);
 }
 
 double fpu_ftan(double a)
@@ -567,15 +512,26 @@ double fpu_f2xm1(double a)
 	return b;
 }
 
-double fpu_fsqrt(double a)
+uint32_t fpu_fsqrt32(uint32_t a)
+{
+	float b;
+	b = sqrtf(*((float*)&a));
+#ifdef DEBUG_MIASM_DOUBLE
+	dump_float();
+	printf("%e sqrt %e\n", *((float*)&a), b);
+#endif
+	return *((uint32_t*)&b);
+}
+
+uint64_t fpu_fsqrt64(uint64_t a)
 {
 	double b;
-	b = sqrt(a);
+	b = sqrt(*((double*)&a));
 #ifdef DEBUG_MIASM_DOUBLE
 	dump_float();
 	printf("%e sqrt %e\n", a, b);
 #endif
-	return b;
+	return *((uint64_t*)&b);
 }
 
 double fpu_fabs(double a)
@@ -751,30 +707,75 @@ unsigned int fpu_fxam_c3(double a)
 	}
 }
 
-unsigned int double_to_mem_32(double d)
+uint64_t sint64_to_fp64(int64_t a)
 {
-	unsigned int m;
-	float f;
-	f = d;
-	m = *((unsigned int*)&f);
-#ifdef DEBUG_MIASM_DOUBLE
-	dump_float();
-	printf("%d %e\n", m, d);
-#endif
-	return m;
+	double result = (double) a;
+	return *((uint64_t*)&result);
 }
 
-uint64_t double_to_mem_64(double d)
+uint32_t sint32_to_fp32(int32_t a)
 {
-	uint64_t m;
-	m = *((uint64_t*)&d);
-#ifdef DEBUG_MIASM_DOUBLE
-	dump_float();
-	printf("%"PRId64" %e\n", m, d);
-#endif
-	return m;
+	float result = (float) a;
+	return *((uint32_t*)&result);
+}
+
+uint64_t sint32_to_fp64(int32_t a)
+{
+	double result = (double) a;
+	return *((uint64_t*)&result);
 }
 
+int32_t fp32_to_sint32(uint32_t a)
+{
+	// Enforce nearbyint (IEEE-754 behavior)
+	float rounded = *((float*)&a);
+	rounded = nearbyintf(rounded);
+	return (int32_t) rounded;
+}
+
+int64_t fp64_to_sint64(uint64_t a)
+{
+	// Enforce nearbyint (IEEE-754 behavior)
+	double rounded = *((double*)&a);
+	rounded = nearbyint(rounded);
+	return (int64_t) rounded;
+}
+
+int32_t fp64_to_sint32(uint64_t a)
+{
+	// Enforce nearbyint (IEEE-754 behavior)
+	double rounded = *((double*)&a);
+	rounded = nearbyint(rounded);
+	return (int32_t) rounded;
+}
+
+uint32_t fp64_to_fp32(uint64_t a)
+{
+	float result = (float) *((double*)&a);
+	return *((uint32_t*)&result);
+}
+
+uint64_t fp32_to_fp64(uint32_t a)
+{
+	double result = (double) *((float*)&a);
+	return *((uint64_t*)&result);
+}
+
+uint32_t fpround_towardszero_fp32(uint32_t a)
+{
+	float rounded = *((float*)&a);
+	rounded = truncf(rounded);
+	return *((uint32_t*)&rounded);
+}
+
+uint64_t fpround_towardszero_fp64(uint64_t a)
+{
+	double rounded = *((double*)&a);
+	rounded = trunc(rounded);
+	return *((uint64_t*)&rounded);
+}
+
+
 UDIV(16)
 UDIV(32)
 UDIV(64)
diff --git a/miasm2/jitter/op_semantics.h b/miasm2/jitter/op_semantics.h
index 3eb81cff..f8042895 100644
--- a/miasm2/jitter/op_semantics.h
+++ b/miasm2/jitter/op_semantics.h
@@ -96,19 +96,23 @@ int16_t idiv16(int16_t a, int16_t b);
 int16_t imod16(int16_t a, int16_t b);
 
 unsigned int x86_cpuid(unsigned int a, unsigned int reg_num);
-double int2double(unsigned int m);
 
-double fpu_fadd(double a, double b);
-double fpu_fsub(double a, double b);
-double fpu_fmul(double a, double b);
-double fpu_fdiv(double a, double b);
+uint32_t fpu_fadd32(uint32_t a, uint32_t b);
+uint64_t fpu_fadd64(uint64_t a, uint64_t b);
+uint32_t fpu_fsub32(uint32_t a, uint32_t b);
+uint64_t fpu_fsub64(uint64_t a, uint64_t b);
+uint32_t fpu_fmul32(uint32_t a, uint32_t b);
+uint64_t fpu_fmul64(uint64_t a, uint64_t b);
+uint32_t fpu_fdiv32(uint32_t a, uint32_t b);
+uint64_t fpu_fdiv64(uint64_t a, uint64_t b);
 double fpu_ftan(double a);
 double fpu_frndint(double a);
 double fpu_fsin(double a);
 double fpu_fcos(double a);
 double fpu_fscale(double a, double b);
 double fpu_f2xm1(double a);
-double fpu_fsqrt(double a);
+uint32_t fpu_fsqrt32(uint32_t a);
+uint64_t fpu_fsqrt64(uint64_t a);
 double fpu_fabs(double a);
 double fpu_fprem(double a, double b);
 double fpu_fchs(double a);
@@ -124,18 +128,16 @@ unsigned int fpu_fxam_c1(double a);
 unsigned int fpu_fxam_c2(double a);
 unsigned int fpu_fxam_c3(double a);
 
-
-double mem_32_to_double(unsigned int m);
-double mem_64_to_double(uint64_t m);
-double int_16_to_double(unsigned int m);
-double int_32_to_double(unsigned int m);
-double int_64_to_double(uint64_t m);
-int16_t double_to_int_16(double d);
-int32_t double_to_int_32(double d);
-int64_t double_to_int_64(double d);
-unsigned int double_to_mem_32(double d);
-uint64_t double_to_mem_64(double d);
-
+uint64_t sint64_to_fp64(int64_t a);
+uint32_t sint32_to_fp32(int32_t a);
+uint64_t sint32_to_fp64(int32_t a);
+int32_t fp32_to_sint32(uint32_t a);
+int64_t fp64_to_sint64(uint64_t a);
+int32_t fp64_to_sint32(uint64_t a);
+uint32_t fp64_to_fp32(uint64_t a);
+uint64_t fp32_to_fp64(uint32_t a);
+uint32_t fpround_towardszero_fp32(uint32_t a);
+uint64_t fpround_towardszero_fp64(uint64_t a);
 
 #define SHIFT_RIGHT_ARITH(size, value, shift)				\
 	((uint ## size ## _t)((((uint64_t) (shift)) > ((size) - 1))?	\
diff --git a/test/arch/x86/arch.py b/test/arch/x86/arch.py
index 43e973e1..ce6012a0 100644
--- a/test/arch/x86/arch.py
+++ b/test/arch/x86/arch.py
@@ -2306,7 +2306,8 @@ reg_tests = [
      "0f50c2"),
     (m64, "00000000    MOVMSKPS   R8D, XMM2",
      "440f50c2"),
-
+    (m64, "00000000    MOVMSKPD   EAX, XMM2",
+     "660F50C2"),
     (m32, "00000000    ADDSS      XMM2, DWORD PTR [ECX]",
      "f30f5811"),
     (m32, "00000000    ADDSS      XMM1, XMM2",
@@ -2351,6 +2352,32 @@ reg_tests = [
     (m32, "00000000    MAXSS      XMM0, DWORD PTR [EBX + 0x2CBD37]",
      "f30f5f8337bd2c00"),
 
+
+    (m32, "00000000    MINPS      XMM0, XMM2",
+     "0F5DC2"),
+    (m32, "00000000    MINSS      XMM0, XMM3",
+     "F30F5DC3"),
+    (m32, "00000000    MINPD      XMM0, XMM4",
+     "660F5DC4"),
+    (m32, "00000000    MINSD      XMM0, XMM5",
+     "F20F5DC5"),
+    (m32, "00000000    MAXPS      XMM0, XMM6",
+     "0F5FC6"),
+    (m32, "00000000    MAXPD      XMM0, XMM1",
+     "660F5FC1"),
+    (m32, "00000000    MAXSD      XMM0, XMM2",
+     "F20F5FC2"),
+    (m32, "00000000    MAXSS      XMM0, XMM7",
+     "F30F5FC7"),
+    (m32, "00000000    CMPEQPS    XMM0, XMM3",
+     "0FC2C300"),
+    (m32, "00000000    CMPEQSS    XMM0, XMM4",
+     "F30FC2C400"),
+    (m32, "00000000    CMPEQPD    XMM0, XMM5",
+     "660FC2C500"),
+    (m32, "00000000    CMPEQSD    XMM0, XMM6",
+     "F20FC2C600"),
+
     (m32, "00000000    CVTDQ2PD   XMM0, XMM3",
      "f30fe6c3"),
     (m32, "00000000    CVTDQ2PS   XMM0, XMM3",
@@ -2485,6 +2512,9 @@ reg_tests = [
     (m64, "00000000    MOVQ       RCX, XMM0",
      "66480F7EC1"),
 
+    (m32, "00000000    MOVQ2DQ    XMM0, MM1",
+     "F30FD6C1"),
+
     (m32, "00000000    PAND       MM2, MM6",
      "0fdbd6"),
     (m32, "00000000    PAND       XMM2, XMM6",
@@ -2671,6 +2701,14 @@ reg_tests = [
     (m32, "00000000    PSRLW      XMM6, 0x5",
     "660F71D605"),
 
+    (m32, "00000000    PSRAW      XMM0, 0x7",
+     "660F71E007"),
+    (m32, "00000000    PSRAW      XMM0, XMM3",
+     "660FE1C3"),
+    (m32, "00000000    PSRAD      XMM0, 0x7",
+     "660F72E007"),
+    (m32, "00000000    PSRAD      XMM0, XMM3",
+     "660FE2C3"),
 
     (m32, "00000000    PSRLQ      MM2, QWORD PTR [EDX]",
     "0FD312"),
diff --git a/test/test_all.py b/test/test_all.py
index a10ab026..665fc3a5 100755
--- a/test/test_all.py
+++ b/test/test_all.py
@@ -162,6 +162,7 @@ QEMU_TESTS = [
     "xchg",
     "string",
     "misc",
+    'sse',
     # Unsupported
     # "floats", "segs", "code16", "exceptions", "single_step"
 ]
@@ -169,6 +170,9 @@ QEMU_TESTS = [
 
 for test_name in QEMU_TESTS:
     for jitter in QEMUTest.jitter_engines:
+        if (test_name, jitter) in [("sse", "python")]:
+            # SKIP unsupported
+            continue
         tags = [TAGS[jitter]] if jitter in TAGS else []
         testset += QEMUTest(test_name, jitter, tags=tags)