about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/emu/x64run660f.c 40
-rw-r--r-- src/emu/x64runavx660f.c 647
-rw-r--r-- src/emu/x64runavx660f38.c 188
-rw-r--r-- src/emu/x64runavxf20f.c 29
-rw-r--r-- src/emu/x64runavxf30f.c 28
5 files changed, 821 insertions, 111 deletions
diff --git a/src/emu/x64run660f.c b/src/emu/x64run660f.c
index 7c14a969..b32b78f9 100644
--- a/src/emu/x64run660f.c
+++ b/src/emu/x64run660f.c
@@ -1391,27 +1391,19 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
         nextop = F8;

         GETEX(0);

         GETGX;

-        for(int i=7; i>0; --i)  // 0 is untouched

+        for(int i=7; i>=0; --i) {

             GX->ub[2 * i] = GX->ub[i];

-        if(GX==EX)

-            for(int i=0; i<8; ++i)

-                GX->ub[2 * i + 1] = GX->ub[2 * i];

-        else

-            for(int i=0; i<8; ++i)

-                GX->ub[2 * i + 1] = EX->ub[i];

+            GX->ub[2 * i + 1] = EX->ub[i];

+        }

         break;

     case 0x61:  /* PUNPCKLWD Gx,Ex */

         nextop = F8;

         GETEX(0);

         GETGX;

-        for(int i=3; i>0; --i)

+        for(int i=3; i>=0; --i) {

             GX->uw[2 * i] = GX->uw[i];

-        if(GX==EX)

-            for(int i=0; i<4; ++i)

-                GX->uw[2 * i + 1] = GX->uw[2 * i];

-        else

-            for(int i=0; i<4; ++i)

-                GX->uw[2 * i + 1] = EX->uw[i];

+            GX->uw[2 * i + 1] = EX->uw[i];

+        }

         break;

     case 0x62:  /* PUNPCKLDQ Gx,Ex */

         nextop = F8;

@@ -1476,27 +1468,19 @@ uintptr_t Run660F(x64emu_t *emu, rex_t rex, uintptr_t addr)
         nextop = F8;

         GETEX(0);

         GETGX;

-        for(int i=0; i<8; ++i)

+        for(int i=0; i<8; ++i) {

             GX->ub[2 * i] = GX->ub[i + 8];

-        if(GX==EX)

-            for(int i=0; i<8; ++i)

-                GX->ub[2 * i + 1] = GX->ub[2 * i];

-        else

-            for(int i=0; i<8; ++i)

-                GX->ub[2 * i + 1] = EX->ub[i + 8];

+            GX->ub[2 * i + 1] = EX->ub[i + 8];

+        }

         break;

     case 0x69:  /* PUNPCKHWD Gx,Ex */

         nextop = F8;

         GETEX(0);

         GETGX;

-        for(int i=0; i<4; ++i)

+        for(int i=0; i<4; ++i) {

             GX->uw[2 * i] = GX->uw[i + 4];

-        if(GX==EX)

-            for(int i=0; i<4; ++i)

-                GX->uw[2 * i + 1] = GX->uw[2 * i];

-        else

-            for(int i=0; i<4; ++i)

-                GX->uw[2 * i + 1] = EX->uw[i + 4];

+            GX->uw[2 * i + 1] = EX->uw[i + 4];

+        }

         break;

     case 0x6A:  /* PUNPCKHDQ Gx,Ex */

         nextop = F8;

diff --git a/src/emu/x64runavx660f.c b/src/emu/x64runavx660f.c
index 9666efb7..3425a864 100644
--- a/src/emu/x64runavx660f.c
+++ b/src/emu/x64runavx660f.c
@@ -38,7 +38,7 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
 {
     uint8_t opcode;
     uint8_t nextop;
-    uint8_t tmp8u;
+    uint8_t tmp8u, u8;
     int8_t tmp8s;
     int16_t tmp16s;
     int32_t tmp32s, tmp32s2;
@@ -450,7 +450,58 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
-
+        case 0x60:  /* VPUNPCKLBW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX;
+            GETGY;
+            for(int i=7; i>=0; --i) {
+                GX->ub[2 * i + 1] = EX->ub[i];
+                GX->ub[2 * i] = VX->ub[i];
+            }
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=7; i>=0; --i) {
+                    GY->ub[2 * i + 1] = EY->ub[i];
+                    GY->ub[2 * i] = VY->ub[i];
+                }
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x61:  /* VPUNPCKLWD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            for(int i=3; i>=0; --i) {
+                GX->uw[2 * i + 1] = EX->uw[i];
+                GX->uw[2 * i] = VX->uw[i];
+            }
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=3; i>=0; --i) {
+                    GY->uw[2 * i + 1] = EY->uw[i];
+                    GY->uw[2 * i] = VY->uw[i];
+                }
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x62:  /* VPUNPCKLDQ Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            for(int i=1; i>=0; --i) {
+                GX->ud[2 * i + 1] = EX->ud[i];
+                GX->ud[2 * i] = VX->ud[i];
+            }
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=1; i>=0; --i) {
+                    GY->ud[2 * i + 1] = EY->ud[i];
+                    GY->ud[2 * i] = VY->ud[i];
+                }
+            } else
+                GY->u128 = 0;
+            break;
         case 0x63:  /* VPACKSSWB Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -579,7 +630,60 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
-
+        case 0x68:  /* VPUNPCKHBW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; 
+            GETGY;
+            for(int i=0; i<8; ++i) {
+                GX->ub[2 * i] = VX->ub[i + 8];
+                GX->ub[2 * i + 1] = EX->ub[i + 8];
+            }
+            if(vex.l) {
+                GETVY; GETEY;
+                for(int i=0; i<8; ++i) {
+                    GY->ub[2 * i] = VY->ub[i + 8];
+                    GY->ub[2 * i + 1] = EY->ub[i + 8];
+                }
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x69:  /* VPUNPCKHWD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; 
+            GETGY;
+            for(int i=0; i<4; ++i) {
+                GX->uw[2 * i] = VX->uw[i + 4];
+                GX->uw[2 * i + 1] = EX->uw[i + 4];
+            }
+            if(vex.l) {
+                GETVY; GETEY;
+                for(int i=0; i<4; ++i) {
+                    GY->uw[2 * i] = VY->uw[i + 4];
+                    GY->uw[2 * i + 1] = EY->uw[i + 4];
+                }
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x6A:  /* VPUNPCKHDQ Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX;
+            GETGY;
+            for(int i=0; i<2; ++i) {
+                GX->ud[2 * i] = VX->ud[i + 2];
+                GX->ud[2 * i + 1] = EX->ud[i + 2];
+            }
+            if(vex.l) {
+                GETVY; GETEY;
+                for(int i=0; i<2; ++i) {
+                    GY->ud[2 * i] = VY->ud[i + 2];
+                    GY->ud[2 * i + 1] = EY->ud[i + 2];
+                }
+            } else
+                GY->u128 = 0;
+            break;
         case 0x6B:  /* VPACKSSDW Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -690,27 +794,68 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             for (int i=0; i<4; ++i)
                 GX->ud[i] = EX->ud[(tmp8u>>(i*2))&3];
             break;
-
+        case 0x71:  /* GRP */
+            nextop = F8;
+            GETEX(1);
+            GETVX;
+            GETVY;
+            if(!vex.l) VY->u128 = 0;
+            switch((nextop>>3)&7) {
+                case 2:                 /* VPSRLW Vx, Ex, Ib */
+                    tmp8u = F8;
+                    if(tmp8u>15) VX->u128 = 0;
+                    else
+                        for (int i=0; i<8; ++i) VX->uw[i] = EX->uw[i] >> tmp8u;
+                    if(vex.l) {
+                        GETEY;
+                        if(tmp8u>15) VY->u128 = 0;
+                        else
+                            for (int i=0; i<8; ++i) VY->uw[i] = EY->uw[i] >> tmp8u;
+                    }
+                    break;
+                case 4:                 /* VPSRAW Vx, Ex, Ib */
+                    tmp8u = F8;
+                    if(tmp8u>15) tmp8u=15;
+                    for (int i=0; i<8; ++i) VX->sw[i] = EX->sw[i] >> tmp8u;
+                    if(vex.l) {
+                        GETEY;
+                        for (int i=0; i<8; ++i) VY->sw[i] = EY->sw[i] >> tmp8u;
+                    }
+                    break;
+                case 6:                 /* VPSLLW Vx, Ex, Ib */
+                    tmp8u = F8;
+                    if(tmp8u>15) VX->u128 = 0;
+                    else
+                        for (int i=0; i<8; ++i) VX->uw[i] = EX->uw[i] << tmp8u;
+                    if(vex.l) {
+                        GETEY;
+                        if(tmp8u>15) VY->u128 = 0;
+                        else
+                            for (int i=0; i<8; ++i) VY->uw[i] = EY->uw[i] << tmp8u;
+                    }
+                    break;
+                default:
+                    return 0;
+            }
+            break;
         case 0x72:  /* GRP */
             nextop = F8;
             GETEX(1);
             GETVX;
             GETVY;
+            if(!vex.l) VY->u128 = 0;
             switch((nextop>>3)&7) {
                 case 2:                 /* VPSRLD Vx, Ex, Ib */
                     tmp8u = F8;
-                    if(tmp8u>31)
-                        {VX->q[0] = VX->q[1] = 0;}
+                    if(tmp8u>31) VX->u128 = 0;
                     else
                         for (int i=0; i<4; ++i) VX->ud[i] = EX->ud[i] >> tmp8u;
                     if(vex.l) {
                         GETEY;
-                        if(tmp8u>31)
-                            {VY->q[0] = VY->q[1] = 0;}
+                        if(tmp8u>31) VY->u128 = 0;
                         else
                             for (int i=0; i<4; ++i) VY->ud[i] = EY->ud[i] >> tmp8u;
-                    } else
-                        VY->q[0] = VY->q[1] = 0;
+                    }
                     break;
                 case 4:                 /* VPSRAD Vx, Ex, Ib */
                     tmp8u = F8;
@@ -719,23 +864,19 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                     if(vex.l) {
                         GETEY;
                         for (int i=0; i<4; ++i) VY->sd[i] = EY->sd[i] >> tmp8u;
-                    } else
-                        VY->q[0] = VY->q[1] = 0;
+                    }
                     break;
                 case 6:                 /* VPSLLD Vx, Ex, Ib */
                     tmp8u = F8;
-                    if(tmp8u>31)
-                        {VX->q[0] = VX->q[1] = 0;}
+                    if(tmp8u>31) VX->u128 = 0;
                     else
                         for (int i=0; i<4; ++i) VX->ud[i] = EX->ud[i] << tmp8u;
                     if(vex.l) {
                         GETEY;
-                        if(tmp8u>31)
-                            {VY->q[0] = VY->q[1] = 0;}
+                        if(tmp8u>31) VY->u128 = 0;
                         else
                             for (int i=0; i<4; ++i) VY->ud[i] = EY->ud[i] << tmp8u;
-                    } else
-                        VY->q[0] = VY->q[1] = 0;
+                    }
                     break;
                 default:
                     return 0;
@@ -746,98 +887,89 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             GETEX(1);
             GETVX;
             GETVY;
+            if(!vex.l) VY->u128 = 0;
             switch((nextop>>3)&7) {
                 case 2:                 /* VPSRLQ Vx, Ex, Ib */
                     tmp8u = F8;
-                    if(tmp8u>63)
-                        {VX->q[0] = VX->q[1] = 0;}
+                    if(tmp8u>63) VX->u128 = 0;  // count > 63: every bit is shifted out, result is zero
                     else
                         {VX->q[0] = EX->q[0] >> tmp8u; VX->q[1] = EX->q[1] >> tmp8u;}
                     if(vex.l) {
                         GETEY;
-                        if(tmp8u>63)
-                            {VY->q[0] = VY->q[1] = 0;}
+                        if(tmp8u>63) VY->u128 = 0;
                         else
                             {VY->q[0] = EY->q[0] >> tmp8u; VY->q[1] = EY->q[1] >> tmp8u;}
-                    } else
-                        VY->q[0] = VY->q[1] = 0;
+                    }
                     break;
                 case 3:                 /* VPSRLDQ Vx, Ex, Ib */
                     tmp8u = F8;
-                    if(tmp8u>15)
-                        {VX->q[0] = VX->q[1] = 0;}
+                    if(tmp8u>15) VX->u128 = 0;
                     else if (tmp8u!=0) {
-                        tmp8u*=8;
-                        if (tmp8u < 64) {
-                            VX->q[0] = (EX->q[0] >> tmp8u) | (EX->q[1] << (64 - tmp8u));
-                            VX->q[1] = (EX->q[1] >> tmp8u);
+                        u8=tmp8u*8;  // bit count; tmp8u keeps the byte count for the upper-lane tests below
+                        if (u8 < 64) {
+                            VX->q[0] = (EX->q[0] >> u8) | (EX->q[1] << (64 - u8));
+                            VX->q[1] = (EX->q[1] >> u8);
                         } else {
-                            VX->q[0] = EX->q[1] >> (tmp8u - 64);
+                            VX->q[0] = EX->q[1] >> (u8 - 64);
                             VX->q[1] = 0;
                         }
-                    }
+                    } else VX->u128 = EX->u128;  // zero count: destination is a plain copy of the source
                     if(vex.l) {
                         GETEY;
-                        if(tmp8u>15)
-                            {VY->q[0] = VY->q[1] = 0;}
+                        if(tmp8u>15) VY->u128 = 0;
                         else if (tmp8u!=0) {
-                            tmp8u*=8;
-                            if (tmp8u < 64) {
-                                VY->q[0] = (EY->q[0] >> tmp8u) | (EY->q[1] << (64 - tmp8u));
-                                VY->q[1] = (EY->q[1] >> tmp8u);
+                            u8=tmp8u*8;
+                            if (u8 < 64) {
+                                VY->q[0] = (EY->q[0] >> u8) | (EY->q[1] << (64 - u8));
+                                VY->q[1] = (EY->q[1] >> u8);
                             } else {
-                                VY->q[0] = EY->q[1] >> (tmp8u - 64);
+                                VY->q[0] = EY->q[1] >> (u8 - 64);
                                 VY->q[1] = 0;
                             }
-                        }
-                    } else
-                        VY->q[0] = VY->q[1] = 0;
+                        } else VY->u128 = EY->u128;  // zero count: upper lane is copied as well
+                    }
                     break;
                 case 6:                 /* VPSLLQ Vx, Ex, Ib */
                     tmp8u = F8;
-                    if(tmp8u>63)
-                        {VX->q[0] = VX->q[1] = 0;}
+                    if(tmp8u>63) VX->u128 = 0;
                     else
                         {VX->q[0] = EX->q[0] << tmp8u; VX->q[1] = EX->q[1] << tmp8u;}
                     if(vex.l) {
                         GETEY;
-                        if(tmp8u>63)
-                            {VY->q[0] = VY->q[1] = 0;}
+                        if(tmp8u>63) VY->u128 = 0;
                         else
                             {VY->q[0] = EY->q[0] << tmp8u; VY->q[1] = EY->q[1] << tmp8u;}
-                    } else
-                        VY->q[0] = VY->q[1] = 0;
+                    }
                     break;
                 case 7:                 /* VPSLLDQ Vx, Ex, Ib */
                     tmp8u = F8;
-                    if(tmp8u>15)
-                        {VX->q[0] = VX->q[1] = 0;}
+                    if(tmp8u>15) VX->u128 = 0;
                     else if (tmp8u!=0) {
-                        tmp8u*=8;
-                        if (tmp8u < 64) {
-                            VX->q[1] = (EX->q[1] << tmp8u) | (EX->q[0] >> (64 - tmp8u));
-                            VX->q[0] = (EX->q[0] << tmp8u);
+                        u8=tmp8u<<3;
+                        if (u8 < 64) {
+                            VX->q[1] = (EX->q[1] << u8) | (EX->q[0] >> (64 - u8));
+                            VX->q[0] = (EX->q[0] << u8);
                         } else {
-                            VX->q[1] = EX->q[0] << (tmp8u - 64);
+                            VX->q[1] = EX->q[0] << (u8 - 64);
                             VX->q[0] = 0;
                         }
-                    }
+                    } else
+                        VX->u128 = EX->u128;
                     if(vex.l) {
                         GETEY;
-                        if(tmp8u>15)
-                            {VY->q[0] = VY->q[1] = 0;}
+                        if(tmp8u>15) VY->u128 = 0;
                         else if (tmp8u!=0) {
-                            tmp8u*=8;
-                            if (tmp8u < 64) {
-                                VY->q[1] = (EY->q[1] << tmp8u) | (EY->q[0] >> (64 - tmp8u));
-                                VY->q[0] = (EY->q[0] << tmp8u);
+                            u8=tmp8u<<3;
+                            if (u8 < 64) {
+                                VY->q[1] = (EY->q[1] << u8) | (EY->q[0] >> (64 - u8));
+                                VY->q[0] = (EY->q[0] << u8);
                             } else {
-                                VY->q[1] = EY->q[0] << (tmp8u - 64);
+                                VY->q[1] = EY->q[0] << (u8 - 64);
                                 VY->q[0] = 0;
                             }
-                        }
-                    } else
-                        VY->q[0] = VY->q[1] = 0;
+                        } else
+                            VY->u128 = EY->u128;
+                    }
                     break;
                 default:
                     return 0;
@@ -1117,7 +1249,57 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
-
+        case 0xD1:  /* VPSRLW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX; GETGY;
+            tmp64u = EX->q[0];  // shift count: low qword of Ex, read once and reused for the upper lane
+            if(tmp64u>15) GX->u128 = 0;
+            else
+                {tmp8u=tmp64u; for (int i=0; i<8; ++i) GX->uw[i] = VX->uw[i] >> tmp8u;}
+            if(vex.l) {
+                GETEY; GETVY;
+                if(tmp64u>15) GY->u128 = 0;
+                else
+                    {tmp8u=tmp64u; for (int i=0; i<8; ++i) GY->uw[i] = VY->uw[i] >> tmp8u;}
+            } else
+                GY->u128 = 0;
+            break;
+        case 0xD2:  /* VPSRLD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX; GETGY;
+            tmp64u = EX->q[0];
+            if(tmp64u>31) GX->u128 = 0;
+            else
+                {tmp8u=tmp64u; for (int i=0; i<4; ++i) GX->ud[i] = VX->ud[i] >> tmp8u;}
+            if(vex.l) {
+                GETEY; GETVY;
+                if(tmp64u>31) GY->u128 = 0;
+                else
+                    {tmp8u=tmp64u; for (int i=0; i<4; ++i) GY->ud[i] = VY->ud[i] >> tmp8u;}
+            } else
+                GY->u128 = 0;
+            break;
+        case 0xD3:  /* VPSRLQ Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX; GETGY;
+            tmp64u = EX->q[0];
+            if(tmp64u>63) GX->u128 = 0;
+            else
+                {tmp8u=tmp64u; for (int i=0; i<2; ++i) GX->q[i] = VX->q[i] >> tmp8u;}
+            if(vex.l) {
+                GETEY; GETVY;
+                if(tmp64u>63) GY->u128 = 0;
+                else
+                    {tmp8u=tmp64u; for (int i=0; i<2; ++i) GY->q[i] = VY->q[i] >> tmp8u;}
+            } else
+                GY->u128 = 0;
+            break;
         case 0xD4:  /* VPADDQ Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -1134,7 +1316,23 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
-
+        case 0xD5:  /* VPMULLW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            for(int i=0; i<8; ++i) {
+                tmp32s = (int32_t)VX->sw[i] * EX->sw[i];
+                GX->sw[i] = tmp32s&0xffff;
+            }
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=0; i<8; ++i) {
+                    tmp32s = (int32_t)VY->sw[i] * EY->sw[i];
+                    GY->sw[i] = tmp32s&0xffff;
+                }
+            } else
+                GY->u128 = 0;
+            break;
         case 0xD6:  /* VMOVQ Ex, Gx */
             nextop = F8;
             GETEX(0);
@@ -1164,7 +1362,46 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 return 0;
             break;
-
+        case 0xD8:  /* VPSUBUSB Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<16; ++i) {
+                tmp16s = (int16_t)VX->ub[i] - EX->ub[i];
+                GX->ub[i] = (tmp16s>255)?255:((tmp16s<0)?0:tmp16s);
+            }
+            if(vex.l) {
+                GETEY;
+                GETVY;
+                for(int i=0; i<16; ++i) {
+                    tmp16s = (int16_t)VY->ub[i] - EY->ub[i];
+                    GY->ub[i] = (tmp16s>255)?255:((tmp16s<0)?0:tmp16s);
+                }
+            } else
+                GY->u128 = 0;
+            break;
+        case 0xD9:  /* VPSUBUSW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<8; ++i) {
+                tmp32s = (int32_t)VX->uw[i] - EX->uw[i];
+                GX->uw[i] = (tmp32s>65535)?65535:((tmp32s<0)?0:tmp32s);
+            }
+            if(vex.l) {
+                GETEY;
+                GETVY;
+                for(int i=0; i<8; ++i) {
+                    tmp32s = (int32_t)VY->uw[i] - EY->uw[i];
+                    GY->uw[i] = (tmp32s>65535)?65535:((tmp32s<0)?0:tmp32s);
+                }
+            } else
+                GY->u128 = 0;
+            break;
         case 0xDA:  /* VPMINUB Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -1284,7 +1521,36 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else 
                 GY->u128 = 0;
             break;
-
+        case 0xE1:  /* VPSRAW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX; GETGY;
+            tmp8u=(EX->q[0]>15)?15:EX->ub[0];  // count from Ex's low qword, clamped to 15; shared by both lanes
+            for (int i=0; i<8; ++i)
+                GX->sw[i] = VX->sw[i] >> tmp8u;
+            if(vex.l) {
+                GETEY; GETVY;
+                for (int i=0; i<8; ++i)
+                    GY->sw[i] = VY->sw[i] >> tmp8u;
+            } else
+                GY->u128 = 0;
+            break;
+        case 0xE2:  /* VPSRAD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX; GETGY;
+            tmp8u=(EX->q[0]>31)?31:EX->ub[0];
+            for (int i=0; i<4; ++i)
+                GX->sd[i] = VX->sd[i] >> tmp8u;
+            if(vex.l) {
+                GETEY; GETVY;
+                for (int i=0; i<4; ++i)
+                    GY->sd[i] = VY->sd[i] >> tmp8u;
+            } else
+                GY->u128 = 0;
+            break;
         case 0xE3:  /* VPAVGW Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -1301,7 +1567,40 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
-
+        case 0xE4:  /* VPMULHUW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            for(int i=0; i<8; ++i) {
+                tmp32u = (uint32_t)VX->uw[i] * EX->uw[i];
+                GX->uw[i] = (tmp32u>>16)&0xffff;
+            }
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=0; i<8; ++i) {
+                    tmp32u = (uint32_t)VY->uw[i] * EY->uw[i];
+                    GY->uw[i] = (tmp32u>>16)&0xffff;
+                }
+            } else
+                GY->u128 = 0;
+            break;
+        case 0xE5:  /* VPMULHW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            for(int i=0; i<8; ++i) {
+                tmp32s = (int32_t)VX->sw[i] * EX->sw[i];
+                GX->uw[i] = (tmp32s>>16)&0xffff;  // keep the high 16 bits of the signed 32-bit product
+            }
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=0; i<8; ++i) {
+                    tmp32s = (int32_t)VY->sw[i] * EY->sw[i];
+                    GY->uw[i] = (tmp32s>>16)&0xffff;
+                }
+            } else
+                GY->u128 = 0;
+            break;
         case 0xE6:  /* CVTTPD2DQ Gx, Ex */
             nextop = F8;
             GETEX(0);
@@ -1342,7 +1641,46 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 EY->q[1] = GY->q[1];
             }
             break;
-
+        case 0xE8:  /* VPSUBSB Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<16; ++i) {
+                tmp16s = (int16_t)VX->sb[i] - EX->sb[i];
+                GX->sb[i] = (tmp16s>127)?127:((tmp16s<-128)?-128:tmp16s);  // saturate to [-128, 127]
+            }
+            if(vex.l) {
+                GETEY;
+                GETVY;
+                for(int i=0; i<16; ++i) {
+                    tmp16s = (int16_t)VY->sb[i] - EY->sb[i];
+                    GY->sb[i] = (tmp16s>127)?127:((tmp16s<-128)?-128:tmp16s);
+                }
+            } else
+                GY->u128 = 0;
+            break;
+        case 0xE9:  /* VPSUBSW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<8; ++i) {
+                tmp32s = (int32_t)VX->sw[i] - EX->sw[i];
+                GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s);
+            }
+            if(vex.l) {
+                GETEY;
+                GETVY;
+                for(int i=0; i<8; ++i) {
+                    tmp32s = (int32_t)VY->sw[i] - EY->sw[i];
+                    GY->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s);
+                }
+            } else
+                GY->u128 = 0;
+            break;
         case 0xEA:  /* VPMINSW Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -1435,20 +1773,82 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             GETEX(0);
             GETGX;
             GETVX;
-            VX->q[0] = GX->q[0] ^ EX->q[0];
-            VX->q[1] = GX->q[1] ^ EX->q[1];
+            GX->q[0] = VX->q[0] ^ EX->q[0];
+            GX->q[1] = VX->q[1] ^ EX->q[1];
             GETGY;
             if(vex.l) {
                 GETEY;
                 GETVY;
                 GY->q[0] = VY->q[0] ^ EY->q[0];
                 GY->q[1] = VY->q[1] ^ EY->q[1];
-            } else {
-                GY->q[0] = GY->q[1] = 0;
-            }
-
+            } else 
+                GY->u128 = 0;
             break;
 
+        case 0xF1:  /* VPSLLW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            tmp64u = EX->q[0];  // shift count: low qword of Ex, latched before Gx is written in case Gx is Ex
+            if(tmp64u>15)
+                GX->u128 = 0;
+            else
+                {tmp8u=tmp64u; for (int i=0; i<8; ++i) GX->uw[i] = VX->uw[i]<<tmp8u;}
+            if(vex.l) {
+                GETEY; GETVY;
+                if(tmp64u>15) GY->u128 = 0;  // upper lane uses the same count as the lower lane, not EY
+                else
+                    {tmp8u=tmp64u; for (int i=0; i<8; ++i) GY->uw[i] = VY->uw[i]<<tmp8u;}
+            } else
+                GY->u128 = 0;
+            break;
+        case 0xF2:  /* VPSLLD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            tmp64u = EX->q[0];  // shift count: low qword of Ex, latched before Gx is written in case Gx is Ex
+            if(tmp64u>31)
+                GX->u128 = 0;
+            else
+                {tmp8u=tmp64u; for (int i=0; i<4; ++i) GX->ud[i] = VX->ud[i]<<tmp8u;}
+            if(vex.l) {
+                GETEY; GETVY;
+                if(tmp64u>31) GY->u128 = 0;  // upper lane uses the same count as the lower lane, not EY
+                else
+                    {tmp8u=tmp64u; for (int i=0; i<4; ++i) GY->ud[i] = VY->ud[i]<<tmp8u;}
+            } else
+                GY->u128 = 0;
+            break;
+        case 0xF3:  /* VPSLLQ Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            tmp64u = EX->q[0];  // shift count: low qword of Ex, latched before Gx is written in case Gx is Ex
+            if(tmp64u>63)
+                GX->u128 = 0;
+            else
+                {tmp8u=tmp64u; for (int i=0; i<2; ++i) GX->q[i] = VX->q[i]<<tmp8u;}
+            if(vex.l) {
+                GETEY; GETVY;
+                if(tmp64u>63) GY->u128 = 0;  // upper lane uses the same count as the lower lane, not EY
+                else
+                    {tmp8u=tmp64u; for (int i=0; i<2; ++i) GY->q[i] = VY->q[i]<<tmp8u;}
+            } else
+                GY->u128 = 0;
+            break;
+        case 0xF4:  /* VPMULUDQ Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            GX->q[1] = (uint64_t)EX->ud[2]*VX->ud[2];  // dwords 0 and 2 of each source give full 64-bit products
+            GX->q[0] = (uint64_t)EX->ud[0]*VX->ud[0];
+            if(vex.l) {
+                GETEY; GETVY;
+                GY->q[1] = (uint64_t)EY->ud[2]*VY->ud[2];
+                GY->q[0] = (uint64_t)EY->ud[0]*VY->ud[0];
+            } else
+                GY->u128 = 0;
+            break;
         case 0xF5:  /* VPMADDWD Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -1464,7 +1864,31 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
-
+        case 0xF6:  /* VPSADBW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            tmp32u = 0;
+            for (int i=0; i<8; ++i)
+                tmp32u += (VX->ub[i]>EX->ub[i])?(VX->ub[i] - EX->ub[i]):(EX->ub[i] - VX->ub[i]);
+            GX->q[0] = tmp32u;
+            tmp32u = 0;
+            for (int i=8; i<16; ++i)
+                tmp32u += (VX->ub[i]>EX->ub[i])?(VX->ub[i] - EX->ub[i]):(EX->ub[i] - VX->ub[i]);
+            GX->q[1] = tmp32u;
+            if(vex.l) {
+                GETEY; GETVY;
+                tmp32u = 0;
+                for (int i=0; i<8; ++i)
+                    tmp32u += (VY->ub[i]>EY->ub[i])?(VY->ub[i] - EY->ub[i]):(EY->ub[i] - VY->ub[i]);
+                GY->q[0] = tmp32u;
+                tmp32u = 0;
+                for (int i=8; i<16; ++i)
+                    tmp32u += (VY->ub[i]>EY->ub[i])?(VY->ub[i] - EY->ub[i]):(EY->ub[i] - VY->ub[i]);
+                GY->q[1] = tmp32u;
+            } else
+                GY->u128 = 0;
+            break;
         case 0xF7:  /* VMASKMOVDQU Gx, Ex */
             nextop = F8;
             if(vex.l) {
@@ -1479,7 +1903,70 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             }
             // no raz of upper ymm
             break;
-
+        case 0xF8:  /* VPSUBB Gx, Vx, Ex: per-byte Vx - Ex over 16 lanes */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<16; ++i)
+                GX->sb[i] = VX->sb[i] - EX->sb[i];
+            if(vex.l) {     // same subtraction on the Y operands
+                GETEY;
+                GETVY;
+                for(int i=0; i<16; ++i)
+                    GY->sb[i] = VY->sb[i] - EY->sb[i];
+            } else
+                GY->u128 = 0;   // vex.l clear: GY is zeroed
+            break;
+        case 0xF9:  /* VPSUBW Gx, Vx, Ex: per-word Vx - Ex over 8 lanes */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<8; ++i)
+                GX->sw[i] = VX->sw[i] - EX->sw[i];
+            if(vex.l) {     // same subtraction on the Y operands
+                GETEY;
+                GETVY;
+                for(int i=0; i<8; ++i)
+                    GY->sw[i] = VY->sw[i] - EY->sw[i];
+            } else
+                GY->u128 = 0;   // vex.l clear: GY is zeroed
+            break;
+        case 0xFA:  /* VPSUBD Gx, Vx, Ex: per-dword Vx - Ex over 4 lanes, wrapping on overflow */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<4; ++i)
+                GX->ud[i] = VX->ud[i] - EX->ud[i];  // unsigned lanes: wraps instead of signed-overflow UB
+            if(vex.l) {     // same subtraction on the Y operands
+                GETEY;
+                GETVY;
+                for(int i=0; i<4; ++i)
+                    GY->ud[i] = VY->ud[i] - EY->ud[i];
+            } else
+                GY->u128 = 0;   // vex.l clear: GY is zeroed
+            break;
+        case 0xFB:  /* VPSUBQ Gx, Vx, Ex: per-qword Vx - Ex over 2 lanes, wrapping on overflow */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<2; ++i)
+                GX->q[i] = VX->q[i] - EX->q[i];     // unsigned lanes: wraps instead of signed-overflow UB
+            if(vex.l) {     // same subtraction on the Y operands
+                GETEY;
+                GETVY;
+                for(int i=0; i<2; ++i)
+                    GY->q[i] = VY->q[i] - EY->q[i];
+            } else
+                GY->u128 = 0;   // vex.l clear: GY is zeroed
+            break;
         case 0xFC:  /* VPADDB Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
diff --git a/src/emu/x64runavx660f38.c b/src/emu/x64runavx660f38.c
index 70d72514..e4de81c2 100644
--- a/src/emu/x64runavx660f38.c
+++ b/src/emu/x64runavx660f38.c
@@ -376,7 +376,62 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
-
+        case 0x08:  /* VPSIGNB Gx, Vx, Ex: each Vx byte is multiplied by -1, 0 or +1 according to the sign of the Ex byte */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            for (int i=0; i<16; ++i)
+                GX->sb[i] = VX->sb[i] * ((EX->sb[i]<0)?-1:((EX->sb[i]>0)?1:0));
+            if(vex.l) {     // same operation on the Y operands
+                GETEY; GETVY;
+                for (int i=0; i<16; ++i)
+                    GY->sb[i] = VY->sb[i] * ((EY->sb[i]<0)?-1:((EY->sb[i]>0)?1:0));
+            } else
+                GY->u128 = 0;   // vex.l clear: GY is zeroed
+            break;
+        case 0x09:  /* VPSIGNW Gx, Vx, Ex: each Vx word is multiplied by -1, 0 or +1 according to the sign of the Ex word */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            for (int i=0; i<8; ++i)
+                GX->sw[i] = VX->sw[i] * ((EX->sw[i]<0)?-1:((EX->sw[i]>0)?1:0));
+            if(vex.l) {     // same operation on the Y operands
+                GETEY; GETVY;
+                for (int i=0; i<8; ++i)
+                    GY->sw[i] = VY->sw[i] * ((EY->sw[i]<0)?-1:((EY->sw[i]>0)?1:0));
+            } else
+                GY->u128 = 0;   // vex.l clear: GY is zeroed
+            break;
+        case 0x0A:  /* VPSIGND Gx, Vx, Ex: each Vx dword is multiplied by -1, 0 or +1 according to the sign of the Ex dword */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            for (int i=0; i<4; ++i)     // unsigned multiply: negating INT32_MIN wraps instead of being signed-overflow UB
+                GX->ud[i] = VX->ud[i] * (uint32_t)((EX->sd[i]<0)?-1:((EX->sd[i]>0)?1:0));
+            if(vex.l) {     // same operation on the Y operands
+                GETEY; GETVY;
+                for (int i=0; i<4; ++i)
+                    GY->ud[i] = VY->ud[i] * (uint32_t)((EY->sd[i]<0)?-1:((EY->sd[i]>0)?1:0));
+            } else
+                GY->u128 = 0;   // vex.l clear: GY is zeroed
+            break;
+        case 0x0B:  /* VPMULHRSW Gx, Vx, Ex: signed 16x16 multiply, rounded and scaled back to 16 bits */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            for (int i=0; i<8; ++i) {
+                tmp32s = ((((int32_t)(VX->sw[i])*(int32_t)(EX->sw[i]))>>14) + 1)>>1;    // 32-bit product, >>14, +1 to round, >>1
+                GX->uw[i] = tmp32s&0xffff;  // only the low 16 bits are stored
+            }
+            if(vex.l) {     // same operation on the Y operands
+                GETEY; GETVY;
+                for (int i=0; i<8; ++i) {
+                    tmp32s = ((((int32_t)(VY->sw[i])*(int32_t)(EY->sw[i]))>>14) + 1)>>1;
+                    GY->uw[i] = tmp32s&0xffff;
+                }
+            } else
+                GY->u128 = 0;   // vex.l clear: GY is zeroed
+            break;
         case 0x0C:  /* VPERMILPS Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -445,7 +500,24 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->ud[i] = (u8>3)?EY->ud[u8&3]:EX->ud[u8];
             }
             break;
-
+        case 0x17:      // VPTEST GX, EX: ZF = ((G & E) == 0), CF = ((~G & E) == 0); AF, OF, SF, PF are cleared
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            RESET_FLAGS(emu);
+            if(vex.l) {     // 256-bit form: both 128-bit halves (GX/EX and GY/EY) must satisfy the condition
+                GETEY; GETGY;
+                CONDITIONAL_SET_FLAG((!(GX->u128&EX->u128) && !(GY->u128&EY->u128)), F_ZF);
+                CONDITIONAL_SET_FLAG((!((~GX->u128)&EX->u128) && !((~GY->u128)&EY->u128)), F_CF);
+            } else {        // 128-bit form: only GX/EX take part
+                CONDITIONAL_SET_FLAG(!(GX->u128&EX->u128), F_ZF);
+                CONDITIONAL_SET_FLAG(!((~GX->u128)&EX->u128), F_CF);
+            }
+            CLEAR_FLAG(F_AF);
+            CLEAR_FLAG(F_OF);
+            CLEAR_FLAG(F_SF);
+            CLEAR_FLAG(F_PF);
+            break;
         case 0x18:  /* VBROADCASTSS Gx, Ex */
             nextop = F8;
             GETEX(0);
@@ -611,6 +683,21 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GX->sq[i] = EX->sd[i];
             break;
 
+        case 0x28:  /* VPMULDQ Gx, Vx, Ex: signed 32x32->64 multiply of dwords 0 and 2 */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            GX->sq[1] = ((int64_t)VX->sd[2])*(int64_t)EX->sd[2];    // widen both operands so the full 64-bit product is kept
+            GX->sq[0] = ((int64_t)VX->sd[0])*(int64_t)EX->sd[0];
+            if(vex.l) {     // same operation on the Y operands
+                GETEY; GETVY;
+                GY->sq[1] = ((int64_t)VY->sd[2])*(int64_t)EY->sd[2];
+                GY->sq[0] = ((int64_t)VY->sd[0])*(int64_t)EY->sd[0];
+            } else
+                GY->u128 = 0;   // vex.l clear: GY is zeroed
+            break;
         case 0x29:  /* VPCMPEQQ Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -982,7 +1069,19 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
-
+        case 0x40:  /* VPMULLD Gx, Vx, Ex: per-dword multiply, result stored in a dword lane */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            for(int i=0; i<4; ++i)
+                GX->ud[i] = VX->ud[i] * EX->ud[i];
+            if(vex.l) {     // same operation on the Y operands
+                GETEY; GETVY;
+                for(int i=0; i<4; ++i)
+                    GY->ud[i] = VY->ud[i] * EY->ud[i];
+            } else
+                GY->u128 = 0;   // vex.l clear: GY is zeroed
+            break;
         case 0x41:  /* PHMINPOSUW Gx, Ex */
             nextop = F8;
             GETEX(0);
@@ -1003,6 +1102,89 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             GY->u128 = 0;
             break;
 
+        case 0x45:  /* VPSRLVD/Q Gx, Vx, Ex: per-lane logical right shift of Vx by the count in the matching Ex lane */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            if(rex.w) {     // rex.w set: two 64-bit lanes
+                for(int i=0; i<2; ++i) {
+                    tmp64u = EX->q[i];
+                    GX->q[i] = (tmp64u<64)?(VX->q[i]>>tmp64u):0;    // counts of 64 or more give 0
+                }
+                if(vex.l) {
+                    GETEY; GETVY;
+                    for(int i=0; i<2; ++i) {
+                        tmp64u = EY->q[i];
+                        GY->q[i] = (tmp64u<64)?(VY->q[i]>>tmp64u):0;
+                    }
+                }
+            } else {        // rex.w clear: four 32-bit lanes
+                for(int i=0; i<4; ++i) {
+                    tmp32u = EX->ud[i];
+                    GX->ud[i] = (tmp32u<32)?(VX->ud[i]>>tmp32u):0;  // counts of 32 or more give 0
+                }
+                if(vex.l) {
+                    GETEY; GETVY;
+                    for(int i=0; i<4; ++i) {
+                        tmp32u = EY->ud[i];
+                        GY->ud[i] = (tmp32u<32)?(VY->ud[i]>>tmp32u):0;
+                    }
+                }
+            }
+            if(!vex.l)      // vex.l clear: GY is zeroed
+                GY->u128=0;
+            break;
+        case 0x46:  /* VPSRAVD Gx, Vx, Ex: per-dword right shift of signed Vx lanes by the count in the matching Ex lane */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            if(rex.w) return 0;     // NOTE(review): presumably 0 tells the caller this encoding is not handled - confirm
+            for(int i=0; i<4; ++i) {
+                tmp32u = EX->ud[i]; if(tmp32u>31) tmp32u=31;    // counts above 31 are clamped to 31
+                GX->sd[i] = VX->sd[i]>>tmp32u;
+            }
+            if(vex.l) {     // same operation on the Y operands
+                GETEY; GETVY;
+                for(int i=0; i<4; ++i) {
+                    tmp32u = EY->ud[i]; if(tmp32u>31) tmp32u=31;
+                    GY->sd[i] = VY->sd[i]>>tmp32u;
+                }
+            } else
+                GY->u128=0; // vex.l clear: GY is zeroed
+            break;
+        case 0x47:  /* VPSLLVD/Q Gx, Vx, Ex: per-lane left shift of Vx by the count in the matching Ex lane */
+            nextop = F8;
+            GETEX(0);
+            GETGX; GETVX; GETGY;
+            if(rex.w) {     // rex.w set: two 64-bit lanes
+                for(int i=0; i<2; ++i) {
+                    tmp64u = EX->q[i];
+                    GX->q[i] = (tmp64u<64)?(VX->q[i]<<tmp64u):0;    // counts of 64 or more give 0
+                }
+                if(vex.l) {
+                    GETEY; GETVY;
+                    for(int i=0; i<2; ++i) {
+                        tmp64u = EY->q[i];
+                        GY->q[i] = (tmp64u<64)?(VY->q[i]<<tmp64u):0;
+                    }
+                }
+            } else {        // rex.w clear: four 32-bit lanes
+                for(int i=0; i<4; ++i) {
+                    tmp32u = EX->ud[i];
+                    GX->ud[i] = (tmp32u<32)?(VX->ud[i]<<tmp32u):0;  // counts of 32 or more give 0
+                }
+                if(vex.l) {
+                    GETEY; GETVY;
+                    for(int i=0; i<4; ++i) {
+                        tmp32u = EY->ud[i];
+                        GY->ud[i] = (tmp32u<32)?(VY->ud[i]<<tmp32u):0;
+                    }
+                }
+            }
+            if(!vex.l)      // vex.l clear: GY is zeroed
+                GY->u128=0;
+            break;
+
         case 0x58:  /* VPBROADCASTD Gx, Ex */
             nextop = F8;
             GETEX(0);
diff --git a/src/emu/x64runavxf20f.c b/src/emu/x64runavxf20f.c
index 6208f9f9..c0eff9d0 100644
--- a/src/emu/x64runavxf20f.c
+++ b/src/emu/x64runavxf20f.c
@@ -273,6 +273,35 @@ uintptr_t RunAVX_F20F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             GY->u128 = 0;
             break;
 
+        case 0x70:  /* VPSHUFLW Gx, Ex, Ib: words 0..3 are picked from Ex words 0..3 by the 2-bit fields of Ib; the high qword is copied */
+            nextop = F8;
+            GETEX(1);       // NOTE(review): the 1 presumably accounts for the immediate byte read below - confirm
+            GETGX; GETGY;
+            tmp8u = F8;     // shuffle control: bits 2*i+1..2*i select the source word for destination word i
+            if(GX==EX) {    // in place: build the result in eax1 so every read sees the unmodified source
+                for (int i=0; i<4; ++i)
+                    eax1.uw[i] = EX->uw[(tmp8u>>(i*2))&3];
+                GX->q[0] = eax1.q[0];
+            } else {
+                for (int i=0; i<4; ++i)
+                    GX->uw[i] = EX->uw[(tmp8u>>(i*2))&3];
+                GX->q[1] = EX->q[1];    // high qword passes through unchanged
+            }
+            if(vex.l) {     // same shuffle on the Y operands, with eay1 as the temporary
+                GETEY;
+                if(GY==EY) {
+                    for (int i=0; i<4; ++i)
+                        eay1.uw[i] = EY->uw[(tmp8u>>(i*2))&3];
+                    GY->q[0] = eay1.q[0];
+                } else {
+                    for (int i=0; i<4; ++i)
+                        GY->uw[i] = EY->uw[(tmp8u>>(i*2))&3];
+                    GY->q[1] = EY->q[1];
+                }
+            } else
+                GY->u128 = 0;   // vex.l clear: GY is zeroed
+            break;
+
         case 0x7C:  /* VHADDPS Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
diff --git a/src/emu/x64runavxf30f.c b/src/emu/x64runavxf30f.c
index c8fd8b69..541fca9e 100644
--- a/src/emu/x64runavxf30f.c
+++ b/src/emu/x64runavxf30f.c
@@ -353,6 +353,34 @@ uintptr_t RunAVX_F30F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->q[0] = GY->q[1] = 0;
             break;
+        case 0x70:  /* VPSHUFHW Gx, Ex, Ib: words 4..7 are picked from Ex words 4..7 by the 2-bit fields of Ib; the low qword is copied */
+            nextop = F8;
+            GETEX(1);       // NOTE(review): the 1 presumably accounts for the immediate byte read below - confirm
+            GETGX; GETGY;
+            tmp8u = F8;     // shuffle control: bits 2*i+1..2*i select the source word (offset by 4) for destination word 4+i
+            if(GX==EX) {    // in place: build the result in eax1 so every read sees the unmodified source
+                for (int i=0; i<4; ++i)
+                    eax1.uw[4+i] = EX->uw[4+((tmp8u>>(i*2))&3)];
+                GX->q[1] = eax1.q[1];
+            } else {
+                for (int i=0; i<4; ++i)
+                    GX->uw[4+i] = EX->uw[4+((tmp8u>>(i*2))&3)];
+                GX->q[0] = EX->q[0];    // low qword passes through unchanged
+            }
+            if(vex.l) {     // same shuffle on the Y operands, with eay1 as the temporary
+                GETEY;
+                if(GY==EY) {
+                    for (int i=0; i<4; ++i)
+                        eay1.uw[4+i] = EY->uw[4+((tmp8u>>(i*2))&3)];
+                    GY->q[1] = eay1.q[1];
+                } else {
+                    for (int i=0; i<4; ++i)
+                        GY->uw[4+i] = EY->uw[4+((tmp8u>>(i*2))&3)];
+                    GY->q[0] = EY->q[0];
+                }
+            } else
+                GY->u128 = 0;   // vex.l clear: GY is zeroed
+            break;
 
         case 0x7E:  /* MOVQ Gx, Ex */
             nextop = F8;