about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorAjax <commial@gmail.com>2018-07-10 14:13:03 +0200
committerAjax <commial@gmail.com>2018-07-10 14:14:35 +0200
commitf23860a734e4c10e4ca13586c2e5f1127f325ee5 (patch)
tree0184d8761ca985756d3a9a92234a02364a7dfb04
parentdd6fad30a3f78f3903e63c67a886d7db090aa25a (diff)
downloadmiasm-f23860a734e4c10e4ca13586c2e5f1127f325ee5.tar.gz
miasm-f23860a734e4c10e4ca13586c2e5f1127f325ee5.zip
Add support for new SSE operations (based on QEMU i386 test_sse)
-rw-r--r--miasm2/arch/x86/sem.py403
1 file changed, 299 insertions, 104 deletions
diff --git a/miasm2/arch/x86/sem.py b/miasm2/arch/x86/sem.py
index 6c3008aa..ef939144 100644
--- a/miasm2/arch/x86/sem.py
+++ b/miasm2/arch/x86/sem.py
@@ -3375,8 +3375,7 @@ def wrmsr(ir, instr):
 # MMX/SSE/AVX operations
 #
 
-
-def vec_op_clip(op, size):
+def vec_op_clip(op, size, callback=None):
     """
     Generate simd operations
     @op: the operator
@@ -3384,9 +3383,12 @@ def vec_op_clip(op, size):
     """
     def vec_op_clip_instr(ir, instr, dst, src):
         if op == '-':
-            return [m2_expr.ExprAff(dst[:size], dst[:size] - src[:size])], []
+            result = dst[:size] - src[:size]
         else:
-            return [m2_expr.ExprAff(dst[:size], m2_expr.ExprOp(op, dst[:size], src[:size]))], []
+            result = m2_expr.ExprOp(op, dst[:size], src[:size])
+        if callback is not None:
+            result = callback(result)
+        return [m2_expr.ExprAff(dst[:size], result)], []
     return vec_op_clip_instr
 
 # Generic vertical operation
@@ -3411,38 +3413,6 @@ def vec_vertical_sem(op, elt_size, reg_size, dst, src, apply_on_output):
     return m2_expr.ExprCompose(*ops)
 
 
-def float_vec_vertical_sem(op, elt_size, reg_size, dst, src, apply_on_output):
-    assert reg_size % elt_size == 0
-    n = reg_size / elt_size
-
-    x_to_int, int_to_x = {32: ('float_to_int_%d', 'int_%d_to_float'),
-                          64: ('double_to_int_%d', 'int_%d_to_double')}[elt_size]
-    if op == '-':
-        ops = [
-            apply_on_output(m2_expr.ExprOp(
-                x_to_int % elt_size,
-                m2_expr.ExprOp(int_to_x % elt_size, dst[i * elt_size:(i + 1) * elt_size]) -
-                m2_expr.ExprOp(
-                    int_to_x % elt_size, src[i * elt_size:(
-                        i + 1) * elt_size])))
-            for i in xrange(0, n)
-        ]
-    else:
-        ops = [
-            apply_on_output(m2_expr.ExprOp(
-                x_to_int % elt_size,
-                m2_expr.ExprOp(op,
-                               m2_expr.ExprOp(
-                                   int_to_x % elt_size, dst[i * elt_size:(
-                                       i + 1) * elt_size]),
-                               m2_expr.ExprOp(
-                                   int_to_x % elt_size, src[i * elt_size:(
-                                       i + 1) * elt_size]))))
-            for i in xrange(0, n)]
-
-    return m2_expr.ExprCompose(*ops)
-
-
 def __vec_vertical_instr_gen(op, elt_size, sem, apply_on_output):
     def vec_instr(ir, instr, dst, src):
         e = []
@@ -3460,11 +3430,6 @@ def vec_vertical_instr(op, elt_size, apply_on_output=lambda x: x):
                                     apply_on_output)
 
 
-def float_vec_vertical_instr(op, elt_size, apply_on_output=lambda x: x):
-    return __vec_vertical_instr_gen(op, elt_size, float_vec_vertical_sem,
-                                    apply_on_output)
-
-
 def _keep_mul_high(expr, signed=False):
     assert expr.is_op("*") and len(expr.args) == 2
 
@@ -3491,6 +3456,32 @@ def _min_max(expr, signed):
         expr.args[0],
     )
 
+def _float_min_max(expr):
+    assert (expr.is_op("fmin") or expr.is_op("fmax")) and len(expr.args) == 2
+    src1 = expr.args[0]
+    src2 = expr.args[1]
+    if expr.is_op("fmin"):
+        comp = m2_expr.expr_is_float_lower(src1, src2)
+    elif expr.is_op("fmax"):
+        comp = m2_expr.expr_is_float_lower(src2, src1)
+
+    # x86 documentation (for MIN):
+    # IF ((SRC1 = 0.0) and (SRC2 = 0.0)) THEN DEST <-SRC2;
+    # ELSE IF (SRC1 = SNaN) THEN DEST <-SRC2; FI;
+    # ELSE IF (SRC2 = SNaN) THEN DEST <-SRC2; FI;
+    # ELSE IF (SRC1 < SRC2) THEN DEST <-SRC1;
+    # ELSE DEST<-SRC2;
+    #
+    # But this includes the NaN output of "SRC1 < SRC2"
+    # The associated text is more detailed; that is the version implemented here
+    return m2_expr.ExprCond(
+        m2_expr.expr_is_sNaN(src2), src2,
+        m2_expr.ExprCond(
+            m2_expr.expr_is_NaN(src2) | m2_expr.expr_is_NaN(src1), src2,
+            m2_expr.ExprCond(comp, src1, src2)
+        )
+    )
+
 
 # Integer arithmetic
 #
@@ -3620,22 +3611,100 @@ pmaxsw = vec_vertical_instr('max', 16, lambda x: _min_max(x, signed=True))
 #
 
 # SSE
-addss = vec_op_clip('+', 32)
-addsd = vec_op_clip('+', 64)
-addps = float_vec_vertical_instr('+', 32)
-addpd = float_vec_vertical_instr('+', 64)
-subss = vec_op_clip('-', 32)
-subsd = vec_op_clip('-', 64)
-subps = float_vec_vertical_instr('-', 32)
-subpd = float_vec_vertical_instr('-', 64)
-mulss = vec_op_clip('*', 32)
-mulsd = vec_op_clip('*', 64)
-mulps = float_vec_vertical_instr('*', 32)
-mulpd = float_vec_vertical_instr('*', 64)
-divss = vec_op_clip('/', 32)
-divsd = vec_op_clip('/', 64)
-divps = float_vec_vertical_instr('/', 32)
-divpd = float_vec_vertical_instr('/', 64)
+addss = vec_op_clip('fadd', 32)
+addsd = vec_op_clip('fadd', 64)
+addps = vec_vertical_instr('fadd', 32)
+addpd = vec_vertical_instr('fadd', 64)
+subss = vec_op_clip('fsub', 32)
+subsd = vec_op_clip('fsub', 64)
+subps = vec_vertical_instr('fsub', 32)
+subpd = vec_vertical_instr('fsub', 64)
+mulss = vec_op_clip('fmul', 32)
+mulsd = vec_op_clip('fmul', 64)
+mulps = vec_vertical_instr('fmul', 32)
+mulpd = vec_vertical_instr('fmul', 64)
+divss = vec_op_clip('fdiv', 32)
+divsd = vec_op_clip('fdiv', 64)
+divps = vec_vertical_instr('fdiv', 32)
+divpd = vec_vertical_instr('fdiv', 64)
+
+# Comparisons (floating-point)
+
+minps = vec_vertical_instr('fmin', 32, _float_min_max)
+minpd = vec_vertical_instr('fmin', 64, _float_min_max)
+minss = vec_op_clip('fmin', 32, _float_min_max)
+minsd = vec_op_clip('fmin', 64, _float_min_max)
+maxps = vec_vertical_instr('fmax', 32, _float_min_max)
+maxpd = vec_vertical_instr('fmax', 64, _float_min_max)
+maxss = vec_op_clip('fmax', 32, _float_min_max)
+maxsd = vec_op_clip('fmax', 64, _float_min_max)
+
+def _float_compare_to_mask(expr):
+    if expr.op == 'unord':
+        to_ext = m2_expr.expr_is_NaN(expr.args[0]) | m2_expr.expr_is_NaN(expr.args[1])
+    elif expr.op == 'ord':
+        to_ext = ~m2_expr.expr_is_NaN(expr.args[0]) & ~m2_expr.expr_is_NaN(expr.args[1])
+    else:
+        if expr.op == '==fu':
+            to_ext = m2_expr.expr_is_float_equal(expr.args[0], expr.args[1])
+            on_NaN = m2_expr.ExprInt(0, 1)
+        elif expr.op == '<fu':
+            to_ext = m2_expr.expr_is_float_lower(expr.args[0], expr.args[1])
+            on_NaN = m2_expr.ExprInt(0, 1)
+        elif expr.op == '<=fu':
+            to_ext = (m2_expr.expr_is_float_equal(expr.args[0], expr.args[1]) |
+                      m2_expr.expr_is_float_lower(expr.args[0], expr.args[1]))
+            on_NaN = m2_expr.ExprInt(0, 1)
+        elif expr.op == '!=fu':
+            to_ext = ~m2_expr.expr_is_float_equal(expr.args[0], expr.args[1])
+            on_NaN = m2_expr.ExprInt(1, 1)
+        elif expr.op == '!<fu':
+            to_ext = ~m2_expr.expr_is_float_lower(expr.args[0], expr.args[1])
+            on_NaN = m2_expr.ExprInt(1, 1)
+        elif expr.op == '!<=fu':
+            to_ext = ~(m2_expr.expr_is_float_equal(expr.args[0], expr.args[1]) |
+                      m2_expr.expr_is_float_lower(expr.args[0], expr.args[1]))
+            on_NaN = m2_expr.ExprInt(1, 1)
+
+        to_ext = m2_expr.ExprCond(
+            m2_expr.expr_is_NaN(expr.args[0]) | m2_expr.expr_is_NaN(expr.args[1]),
+            on_NaN,
+            to_ext
+        )
+    return to_ext.signExtend(expr.size)
+
+cmpeqps = vec_vertical_instr('==fu', 32, lambda x: _float_compare_to_mask(x))
+cmpeqpd = vec_vertical_instr('==fu', 64, lambda x: _float_compare_to_mask(x))
+cmpeqss = vec_op_clip('==fu', 32, lambda x: _float_compare_to_mask(x))
+cmpeqsd = vec_op_clip('==fu', 64, lambda x: _float_compare_to_mask(x))
+cmpltps = vec_vertical_instr('<fu', 32, lambda x: _float_compare_to_mask(x))
+cmpltpd = vec_vertical_instr('<fu', 64, lambda x: _float_compare_to_mask(x))
+cmpltss = vec_op_clip('<fu', 32, lambda x: _float_compare_to_mask(x))
+cmpltsd = vec_op_clip('<fu', 64, lambda x: _float_compare_to_mask(x))
+cmpleps = vec_vertical_instr('<=fu', 32, lambda x: _float_compare_to_mask(x))
+cmplepd = vec_vertical_instr('<=fu', 64, lambda x: _float_compare_to_mask(x))
+cmpless = vec_op_clip('<=fu', 32, lambda x: _float_compare_to_mask(x))
+cmplesd = vec_op_clip('<=fu', 64, lambda x: _float_compare_to_mask(x))
+cmpunordps = vec_vertical_instr('unord', 32, lambda x: _float_compare_to_mask(x))
+cmpunordpd = vec_vertical_instr('unord', 64, lambda x: _float_compare_to_mask(x))
+cmpunordss = vec_op_clip('unord', 32, lambda x: _float_compare_to_mask(x))
+cmpunordsd = vec_op_clip('unord', 64, lambda x: _float_compare_to_mask(x))
+cmpneqps = vec_vertical_instr('!=fu', 32, lambda x: _float_compare_to_mask(x))
+cmpneqpd = vec_vertical_instr('!=fu', 64, lambda x: _float_compare_to_mask(x))
+cmpneqss = vec_op_clip('!=fu', 32, lambda x: _float_compare_to_mask(x))
+cmpneqsd = vec_op_clip('!=fu', 64, lambda x: _float_compare_to_mask(x))
+cmpnltps = vec_vertical_instr('!<fu', 32, lambda x: _float_compare_to_mask(x))
+cmpnltpd = vec_vertical_instr('!<fu', 64, lambda x: _float_compare_to_mask(x))
+cmpnltss = vec_op_clip('!<fu', 32, lambda x: _float_compare_to_mask(x))
+cmpnltsd = vec_op_clip('!<fu', 64, lambda x: _float_compare_to_mask(x))
+cmpnleps = vec_vertical_instr('!<=fu', 32, lambda x: _float_compare_to_mask(x))
+cmpnlepd = vec_vertical_instr('!<=fu', 64, lambda x: _float_compare_to_mask(x))
+cmpnless = vec_op_clip('!<=fu', 32, lambda x: _float_compare_to_mask(x))
+cmpnlesd = vec_op_clip('!<=fu', 64, lambda x: _float_compare_to_mask(x))
+cmpordps = vec_vertical_instr('ord', 32, lambda x: _float_compare_to_mask(x))
+cmpordpd = vec_vertical_instr('ord', 64, lambda x: _float_compare_to_mask(x))
+cmpordss = vec_op_clip('ord', 32, lambda x: _float_compare_to_mask(x))
+cmpordsd = vec_op_clip('ord', 64, lambda x: _float_compare_to_mask(x))
 
 # Logical (floating-point)
 #
@@ -3834,6 +3903,8 @@ def cvttpd2dq(_, instr, dst, src):
     e.append(m2_expr.ExprAff(dst[64:128], m2_expr.ExprInt(0, 64)))
     return e, []
 
+def cvttsd2si(_, instr, dst, src):
+    return _cvtt_tpl(dst, src, [0], double=True), []
 
 def cvttps2dq(_, instr, dst, src):
     return _cvtt_tpl(dst, src, [0, 1, 2, 3], double=False), []
@@ -3842,11 +3913,7 @@ def cvttps2pi(_, instr, dst, src):
     return _cvtt_tpl(dst, src, [0, 1], double=False), []
 
 def cvttss2si(_, instr, dst, src):
-    e = []
-    e.append(
-        m2_expr.ExprAff(dst[:32], m2_expr.ExprOp('float_trunc_to_int_32', src[:32])))
-    return e, []
-
+    return _cvtt_tpl(dst, src, [0], double=False), []
 
 def movss(_, instr, dst, src):
     e = []
@@ -3914,52 +3981,55 @@ def pshufb(_, instr, dst, src):
 
 
 def pshufd(_, instr, dst, src, imm):
-    e = []
+    control = int(imm)
+    out = []
     for i in xrange(4):
-        index = imm[2 * i:2 * (i + 1)].zeroExtend(dst.size)
-        index <<= m2_expr.ExprInt(5, dst.size)
-        value = (dst >> index)[:32]
-        e.append(m2_expr.ExprAff(dst[32 * i:32 * (i + 1)], value))
-    return e, []
+        shift = ((control >> (i * 2)) & 3) * 32
+        # the control field is 2 bits, so shift <= 96; src.size is 128
+        # => shift + 32 <= src.size
+        out.append(src[shift: shift + 32])
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out))], []
 
 
-def ps_rl_ll(ir, instr, dst, src, op, size):
-    loc_zero, loc_zero_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
-    loc_do, loc_do_expr = ir.gen_loc_key_and_expr(ir.IRDst.size)
-    loc_next = ir.get_next_loc_key(instr)
-    loc_next_expr = m2_expr.ExprLoc(loc_next, ir.IRDst.size)
+def pshuflw(_, instr, dst, src, imm):
+    control = int(imm)
+    out = []
+    for i in xrange(4):
+        shift = ((control >> (i * 2)) & 3) * 16
+        out.append(src[shift: shift + 16])
+    out.append(src[64:])
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out))], []
+
+
+def pshufhw(_, instr, dst, src, imm):
+    control = int(imm)
+    out = [src[:64]]
+    for i in xrange(4):
+        shift = ((control >> (i * 2)) & 3) * 16
+        out.append(src[shift + 64: shift + 16 + 64])
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out))], []
 
-    if src.size == 8:
-        count = src.zeroExtend(dst.size)
-    else:
-        count = src.zeroExtend(dst.size)
 
+def ps_rl_ll(ir, instr, dst, src, op, size):
     mask = {16: 0xF,
             32: 0x1F,
             64: 0x3F}[size]
-    test = expr_simp(count & m2_expr.ExprInt(
-        ((1 << dst.size) - 1) ^ mask, dst.size))
-    e = [m2_expr.ExprAff(ir.IRDst, m2_expr.ExprCond(test,
-                                                    loc_zero_expr,
-                                                    loc_do_expr))]
-
-    slices = []
-    for i in xrange(0, dst.size, size):
-        slices.append(m2_expr.ExprOp(op, dst[i:i + size], count[:size]))
+    mask = m2_expr.ExprInt(mask, dst.size)
 
-    if isinstance(test, m2_expr.ExprInt):
-        if int(test) == 0:
-            return [m2_expr.ExprAff(dst[0:dst.size], m2_expr.ExprCompose(*slices))], []
-        else:
-            return [m2_expr.ExprAff(dst, m2_expr.ExprInt(0, dst.size))], []
+    # Clamp the shift count at @size (any count >= @size saturates to @size)
+    count = src.zeroExtend(dst.size)
+    count = m2_expr.ExprCond(count & expr_simp(~mask),
+                             m2_expr.ExprInt(size, dst.size), # saturation
+                             count, # count < size
+    )
+    count = count[:size]
+    if src.is_int():
+        count = expr_simp(count)
 
-    e_zero = [m2_expr.ExprAff(dst, m2_expr.ExprInt(0, dst.size)),
-              m2_expr.ExprAff(ir.IRDst, loc_next_expr)]
-    e_do = []
-    e.append(m2_expr.ExprAff(dst[0:dst.size], m2_expr.ExprCompose(*slices)))
-    e_do.append(m2_expr.ExprAff(ir.IRDst, loc_next_expr))
-    return e, [IRBlock(loc_do, [AssignBlock(e_do, instr)]),
-               IRBlock(loc_zero, [AssignBlock(e_zero, instr)])]
+    out = []
+    for i in xrange(0, dst.size, size):
+        out.append(m2_expr.ExprOp(op, dst[i:i + size], count))
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out))], []
 
 
 def psrlw(ir, instr, dst, src):
@@ -3986,6 +4056,14 @@ def psllq(ir, instr, dst, src):
     return ps_rl_ll(ir, instr, dst, src, "<<",  64)
 
 
+def psraw(ir, instr, dst, src):
+    return ps_rl_ll(ir, instr, dst, src, "a>>", 16)
+
+
+def psrad(ir, instr, dst, src):
+    return ps_rl_ll(ir, instr, dst, src, "a>>", 32)
+
+
 def pslldq(_, instr, dst, src):
     assert src.is_int()
     e = []
@@ -4239,11 +4317,17 @@ def movdq2q(_, instr, dst, src):
     return e, []
 
 
+def movq2dq(_, instr, dst, src):
+    e = []
+    e.append(m2_expr.ExprAff(dst, src[:64].zeroExtend(dst.size)))
+    return e, []
+
+
 def sqrt_gen(_, instr, dst, src, size):
     e = []
     out = []
-    for i in src.size / size:
-        out.append(m2_expr.ExprOp('fsqrt' % size,
+    for i in xrange(src.size / size):
+        out.append(m2_expr.ExprOp('fsqrt',
                                   src[i * size: (i + 1) * size]))
     src = m2_expr.ExprCompose(*out)
     e.append(m2_expr.ExprAff(dst, src))
@@ -4468,10 +4552,10 @@ def maskmovq(ir, instr, src, mask):
     blks = []
 
     # For each possibility, check if a write is necessary
-    check_labels = [m2_expr.ExprId(ir.gen_label(), ir.IRDst.size)
+    check_labels = [m2_expr.ExprLoc(ir.loc_db.add_location(), ir.IRDst.size)
                     for _ in xrange(0, mask.size, 8)]
     # If the write has to be done, do it (otherwise, nothing happen)
-    write_labels = [m2_expr.ExprId(ir.gen_label(), ir.IRDst.size)
+    write_labels = [m2_expr.ExprLoc(ir.loc_db.add_location(), ir.IRDst.size)
                     for _ in xrange(0, mask.size, 8)]
 
     # Build check blocks
@@ -4484,7 +4568,7 @@ def maskmovq(ir, instr, src, mask):
                                 m2_expr.ExprCond(bit,
                                                  write_label,
                                                  next_check_label))
-        blks.append(IRBlock(cur_label.name.loc_key, [AssignBlock([check], instr)]))
+        blks.append(IRBlock(cur_label.loc_key, [AssignBlock([check], instr)]))
 
     # Build write blocks
     dst_addr = mRDI[instr.mode]
@@ -4498,7 +4582,7 @@ def maskmovq(ir, instr, src, mask):
         write_mem = m2_expr.ExprAff(m2_expr.ExprMem(write_addr, 8),
                                     src[start: start + 8])
         jump = m2_expr.ExprAff(ir.IRDst, next_check_label)
-        blks.append(IRBlock(cur_label.name.loc_key, [AssignBlock([write_mem, jump], instr)]))
+        blks.append(IRBlock(cur_label.loc_key, [AssignBlock([write_mem, jump], instr)]))
 
     # If mask is null, bypass all
     e = [m2_expr.ExprAff(ir.IRDst, m2_expr.ExprCond(mask,
@@ -4511,6 +4595,63 @@ def emms(ir, instr):
     # Implemented as a NOP
     return [], []
 
+# Common MXCSR value with few option bits set, 0x1fa0
+STMXCSR_VALUE = 0x1fa0
+def stmxcsr(ir, instr, dst):
+    return [m2_expr.ExprAff(dst, m2_expr.ExprInt(STMXCSR_VALUE, dst.size))], []
+
+def ldmxcsr(ir, instr, dst):
+    # Implemented as a NOP
+    return [], []
+
+
+def _select4(src, control):
+    # Implementation inspired by the Intel Intrinsics Guide
+    # @control is already resolved (was an immediate)
+
+    if control == 0:
+        return src[:32] # 0
+    elif control == 1:
+        return src[32:64]
+    elif control == 2:
+        return src[64:96]
+    elif control == 3:
+        return src[96:]
+    else:
+        raise ValueError("Control must be on 2 bits")
+
+
+def shufps(ir, instr, dst, src, imm8):
+    out = []
+    control = int(imm8)
+    for i in xrange(4):
+        if i < 2:
+            source = dst
+        else:
+            source = src
+        out.append(_select4(source, (control >> (i * 2)) & 3))
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out))], []
+
+
+def shufpd(ir, instr, dst, src, imm8):
+    out = []
+    control = int(imm8)
+    out.append(dst[64:] if control & 1 else dst[:64])
+    out.append(src[64:] if control & 2 else src[:64])
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out))], []
+
+def movmskps(ir, instr, dst, src):
+    out = []
+    for i in xrange(4):
+        out.append(src[(32 * i) + 31:(32 * i) + 32])
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out).zeroExtend(dst.size))], []
+
+def movmskpd(ir, instr, dst, src):
+    out = []
+    for i in xrange(2):
+        out.append(src[(64 * i) + 63:(64 * i) + 64])
+    return [m2_expr.ExprAff(dst, m2_expr.ExprCompose(*out).zeroExtend(dst.size))], []
+
 
 mnemo_func = {'mov': mov,
               'xchg': xchg,
@@ -4950,6 +5091,49 @@ mnemo_func = {'mov': mov,
               "divps": divps,
               "divpd": divpd,
 
+              # Comparisons (floating-point)
+              #
+              "minps": minps,
+              "minpd": minpd,
+              "minss": minss,
+              "minsd": minsd,
+              "maxps": maxps,
+              "maxpd": maxpd,
+              "maxss": maxss,
+              "maxsd": maxsd,
+              "cmpeqps": cmpeqps,
+              "cmpeqpd": cmpeqpd,
+              "cmpeqss": cmpeqss,
+              "cmpeqsd": cmpeqsd,
+              "cmpltps": cmpltps,
+              "cmpltpd": cmpltpd,
+              "cmpltss": cmpltss,
+              "cmpltsd": cmpltsd,
+              "cmpleps": cmpleps,
+              "cmplepd": cmplepd,
+              "cmpless": cmpless,
+              "cmplesd": cmplesd,
+              "cmpunordps": cmpunordps,
+              "cmpunordpd": cmpunordpd,
+              "cmpunordss": cmpunordss,
+              "cmpunordsd": cmpunordsd,
+              "cmpneqps": cmpneqps,
+              "cmpneqpd": cmpneqpd,
+              "cmpneqss": cmpneqss,
+              "cmpneqsd": cmpneqsd,
+              "cmpnltps": cmpnltps,
+              "cmpnltpd": cmpnltpd,
+              "cmpnltss": cmpnltss,
+              "cmpnltsd": cmpnltsd,
+              "cmpnleps": cmpnleps,
+              "cmpnlepd": cmpnlepd,
+              "cmpnless": cmpnless,
+              "cmpnlesd": cmpnlesd,
+              "cmpordps": cmpordps,
+              "cmpordpd": cmpordpd,
+              "cmpordss": cmpordss,
+              "cmpordsd": cmpordsd,
+
               # Logical (floating-point)
               #
 
@@ -4961,6 +5145,8 @@ mnemo_func = {'mov': mov,
               "wrmsr": wrmsr,
               "pshufb": pshufb,
               "pshufd": pshufd,
+              "pshuflw": pshuflw,
+              "pshufhw": pshufhw,
 
               "psrlw": psrlw,
               "psrld": psrld,
@@ -4970,6 +5156,8 @@ mnemo_func = {'mov': mov,
               "psllq": psllq,
               "pslldq": pslldq,
               "psrldq": psrldq,
+              "psraw": psraw,
+              "psrad": psrad,
 
               "palignr": palignr,
 
@@ -5025,6 +5213,7 @@ mnemo_func = {'mov': mov,
               "movlhps": movlhps,
               "movhlps": movhlps,
               "movdq2q": movdq2q,
+              "movq2dq": movq2dq,
 
               "sqrtpd": sqrtpd,
               "sqrtps": sqrtps,
@@ -5050,6 +5239,12 @@ mnemo_func = {'mov': mov,
               "maskmovq": maskmovq,
               "maskmovdqu": maskmovq,
               "emms": emms,
+              "shufps": shufps,
+              "shufpd": shufpd,
+              "movmskps": movmskps,
+              "movmskpd": movmskpd,
+              "stmxcsr": stmxcsr,
+              "ldmxcsr": ldmxcsr,
               }