about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/emu/x64runavx660f.c 109
-rw-r--r-- src/emu/x64runavx660f38.c 476
-rw-r--r-- src/emu/x64runavx660f3a.c 27
3 files changed, 599 insertions, 13 deletions
diff --git a/src/emu/x64runavx660f.c b/src/emu/x64runavx660f.c
index 7b712a7e..9666efb7 100644
--- a/src/emu/x64runavx660f.c
+++ b/src/emu/x64runavx660f.c
@@ -1081,6 +1081,18 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->u128 = 0;
             break;
 
+        case 0xC4:  /* VPINSRW Gx, Vx, Ew, Ib */
+            nextop = F8;
+            GETED(1);
+            GETGX;
+            GETVX;
+            GETGY;
+            tmp8u = F8;
+            if(GX!=VX)
+                GX->u128 = VX->u128;
+            GX->uw[tmp8u&7] = ED->word[0];   // only low 16bits
+            GY->u128 = 0;
+            break;
         case 0xC5:  /* VPEXTRW Gw,Ex,Ib */
             nextop = F8;
             GETEX(1);
@@ -1134,7 +1146,40 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 EY->u128 = 0;
             }
             break;
+        case 0xD7:  /* VPMOVMSKB Gd, Ex */
+            nextop = F8;
+            if(MODREG) {
+                GETEX(0);
+                GETGD;
+                GD->q[0] = 0;
+                for (int i=0; i<16; ++i)
+                    if(EX->ub[i]&0x80)
+                        GD->dword[0] |= (1<<i);
+                if(vex.l) {
+                    GETEY;
+                    for (int i=0; i<16; ++i)
+                        if(EY->ub[i]&0x80)
+                            GD->dword[0] |= (1<<(i+16));
+                }
+            } else
+                return 0;
+            break;
 
+        case 0xDA:  /* VPMINUB Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for (int i=0; i<16; ++i)
+                GX->ub[i] = (EX->ub[i]<VX->ub[i])?EX->ub[i]:VX->ub[i];
+            if(vex.l) {
+                GETEY; GETVY;
+                for (int i=0; i<16; ++i)
+                    GY->ub[i] = (EY->ub[i]<VY->ub[i])?EY->ub[i]:VY->ub[i];
+            } else
+                GY->u128 = 0;
+            break;
         case 0xDB:  /* VPAND Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -1152,7 +1197,6 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->q[0] = GY->q[1] = 0;
             }
             break;
-
         case 0xDC:  /* VPADDUSB Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -1193,7 +1237,21 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
-
+        case 0xDE:  /* VPMAXUB Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for (int i=0; i<16; ++i)
+                GX->ub[i] = (EX->ub[i]>VX->ub[i])?EX->ub[i]:VX->ub[i];
+            if(vex.l) {
+                GETEY; GETVY;
+                for (int i=0; i<16; ++i)
+                    GY->ub[i] = (EY->ub[i]>VY->ub[i])?EY->ub[i]:VY->ub[i];
+            } else
+                GY->u128 = 0;
+            break;
         case 0xDF:  /* VPANDN Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -1285,6 +1343,21 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             }
             break;
 
+        case 0xEA:  /* VPMINSW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<8; ++i)
+                GX->sw[i] = (VX->sw[i]<EX->sw[i])?VX->sw[i]:EX->sw[i];
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=0; i<8; ++i)
+                    GY->sw[i] = (VY->sw[i]<EY->sw[i])?VY->sw[i]:EY->sw[i];
+            } else
+                GY->u128 = 0;
+            break;
         case 0xEB:  /* VPOR Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -1342,7 +1415,21 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
-
+        case 0xEE:  /* VPMAXSW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<8; ++i)
+                GX->sw[i] = (VX->sw[i]>EX->sw[i])?VX->sw[i]:EX->sw[i];
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=0; i<8; ++i)
+                    GY->sw[i] = (VY->sw[i]>EY->sw[i])?VY->sw[i]:EY->sw[i];
+            } else
+                GY->u128 = 0;
+            break;
         case 0xEF:                      /* VPXOR Gx,Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -1362,6 +1449,22 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
 
             break;
 
+        case 0xF5:  /* VPMADDWD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for (int i=0; i<4; ++i)
+                GX->sd[i] = (int32_t)(VX->sw[i*2+0])*EX->sw[i*2+0] + (int32_t)(VX->sw[i*2+1])*EX->sw[i*2+1];
+            if(vex.l) {
+                GETEY; GETVY;
+                for (int i=0; i<4; ++i)
+                    GY->sd[i] = (int32_t)(VY->sw[i*2+0])*EY->sw[i*2+0] + (int32_t)(VY->sw[i*2+1])*EY->sw[i*2+1];
+            } else
+                GY->u128 = 0;
+            break;
+
         case 0xF7:  /* VMASKMOVDQU Gx, Ex */
             nextop = F8;
             if(vex.l) {
diff --git a/src/emu/x64runavx660f38.c b/src/emu/x64runavx660f38.c
index f127be72..70d72514 100644
--- a/src/emu/x64runavx660f38.c
+++ b/src/emu/x64runavx660f38.c
@@ -61,6 +61,8 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
     uint8_t nextop;
     uint8_t tmp8u, u8;
     int8_t tmp8s;
+    int16_t tmp16s;
+    uint16_t tmp16u;
     int32_t tmp32s, tmp32s2;
     uint32_t tmp32u, tmp32u2;
     uint64_t tmp64u, tmp64u2;
@@ -254,6 +256,126 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
+        case 0x04:  /* VPMADDUBSW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for (int i=0; i<8; ++i) {
+                tmp32s = (int32_t)(VX->ub[i*2+0])*EX->sb[i*2+0] + (int32_t)(VX->ub[i*2+1])*EX->sb[i*2+1];
+                GX->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s);
+            }
+            if(vex.l) {
+                GETEY; GETVY;
+                for (int i=0; i<8; ++i) {
+                    tmp32s = (int32_t)(VY->ub[i*2+0])*EY->sb[i*2+0] + (int32_t)(VY->ub[i*2+1])*EY->sb[i*2+1];
+                    GY->sw[i] = (tmp32s>32767)?32767:((tmp32s<-32768)?-32768:tmp32s);
+                }
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x05:  /* VPHSUBW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            GETEY;
+            u8 = (VX==EX);
+            if(GX==EX) {eax1=*EX; EX=&eax1;}
+            for (int i=0; i<4; ++i)
+                GX->sw[i] = VX->sw[i*2+0] - VX->sw[i*2+1];
+            if(u8) {
+                GX->q[1] = GX->q[0];
+            } else {
+                for (int i=0; i<4; ++i)
+                    GX->sw[4+i] = EX->sw[i*2+0] - EX->sw[i*2+1];
+            }
+            if(vex.l) {
+                GETVY;
+                if(EY==GY) {eay1=*EY; EY=&eay1;}
+                for (int i=0; i<4; ++i)
+                    GY->sw[i] = VY->sw[i*2+0] - VY->sw[i*2+1];
+                if(u8) {
+                    GY->q[1] = GY->q[0];
+                } else {
+                    for (int i=0; i<4; ++i)
+                        GY->sw[4+i] = EY->sw[i*2+0] - EY->sw[i*2+1];
+                }
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x06:  /* VPHSUBD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            GETEY;
+            u8 = (VX==EX);
+            if(GX==EX) {eax1=*EX; EX=&eax1;}
+            for (int i=0; i<2; ++i)
+                GX->sd[i] = VX->sd[i*2+0] - VX->sd[i*2+1];
+            if(u8) {
+                GX->q[1] = GX->q[0];
+            } else {
+                for (int i=0; i<2; ++i)
+                    GX->sd[2+i] = EX->sd[i*2+0] - EX->sd[i*2+1];
+            }
+            if(vex.l) {
+                GETVY;
+                if(EY==GY) {eay1=*EY; EY=&eay1;}
+                GY->sd[0] = VY->sd[0] - VY->sd[1];
+                GY->sd[1] = VY->sd[2] - VY->sd[3];
+                if(u8) {
+                    GY->q[1] = GY->q[0];
+                } else {
+                    GY->sd[2] = EY->sd[0] - EY->sd[1];
+                    GY->sd[3] = EY->sd[2] - EY->sd[3];
+                }
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x07:  /* VPHSUBSW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            GETEY;
+            u8 = (VX==EX);
+            if(GX==EX) {eax1=*EX; EX=&eax1;}
+            for (int i=0; i<4; ++i) {
+                tmp32s = VX->sw[i*2+0] - VX->sw[i*2+1];
+                GX->sw[i] = (tmp32s<-32768)?-32768:((tmp32s>32767)?32767:tmp32s);
+            }
+            if(u8) {
+                GX->q[1] = GX->q[0];
+            } else {
+                for (int i=0; i<4; ++i) {
+                    tmp32s = EX->sw[i*2+0] - EX->sw[i*2+1];
+                    GX->sw[4+i] = (tmp32s<-32768)?-32768:((tmp32s>32767)?32767:tmp32s);
+                }
+            }
+            if(vex.l) {
+                GETVY;
+                if(EY==GY) {eay1=*EY; EY=&eay1;}
+                for (int i=0; i<4; ++i) {
+                    tmp32s = VY->sw[i*2+0] - VY->sw[i*2+1];
+                    GY->sw[i] = (tmp32s<-32768)?-32768:((tmp32s>32767)?32767:tmp32s);
+                }
+                if(u8) {
+                    GY->q[1] = GY->q[0];
+                } else {
+                    for (int i=0; i<4; ++i) {
+                        tmp32s = EY->sw[i*2+0] - EY->sw[i*2+1];
+                        GY->sw[4+i] = (tmp32s<-32768)?-32768:((tmp32s>32767)?32767:tmp32s);
+                    }
+                }
+            } else
+                GY->u128 = 0;
+            break;
 
         case 0x0C:  /* VPERMILPS Gx, Vx, Ex */
             nextop = F8;
@@ -410,6 +532,85 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->u128 = 0;
             break;
 
+        case 0x20:  /* VPMOVSXBW Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l)
+                for(int i=7; i>=0; --i)
+                    GY->sw[i] = EX->sb[8+i];
+            else
+                GY->u128 = 0;
+            for(int i=7; i>=0; --i)
+                GX->sw[i] = EX->sb[i];
+            break;
+        case 0x21:  /* VPMOVSXBD Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l)
+                for(int i=3; i>=0; --i)
+                    GY->sd[i] = EX->sb[4+i];
+            else
+                GY->u128 = 0;
+            for(int i=3; i>=0; --i)
+                GX->sd[i] = EX->sb[i];
+            break;
+        case 0x22:  /* VPMOVSXBQ Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l)
+                for(int i=1; i>=0; --i)
+                    GY->sq[i] = EX->sb[2+i];
+            else
+                GY->u128 = 0;
+            for(int i=1; i>=0; --i)
+                GX->sq[i] = EX->sb[i];
+            break;
+        case 0x23:  /* VPMOVSXWD Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l)
+                for(int i=3; i>=0; --i)
+                    GY->sd[i] = EX->sw[4+i];
+            else
+                GY->u128 = 0;
+            for(int i=3; i>=0; --i)
+                GX->sd[i] = EX->sw[i];
+            break;
+        case 0x24:  /* VPMOVSXWQ Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l)
+                for(int i=1; i>=0; --i)
+                    GY->sq[i] = EX->sw[2+i];
+            else
+                GY->u128 = 0;
+            for(int i=1; i>=0; --i)
+                GX->sq[i] = EX->sw[i];
+            break;
+        case 0x25:  /* VPMOVSXDQ Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l)
+                for(int i=1; i>=0; --i)
+                    GY->sq[i] = EX->sd[2+i];
+            else
+                GY->u128 = 0;
+            for(int i=1; i>=0; --i)
+                GX->sq[i] = EX->sd[i];
+            break;
+
         case 0x29:  /* VPCMPEQQ Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -536,7 +737,84 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                     if(VY->q[i]>>63) EY->q[i] = GY->q[i];
             }
             break;
-
+        case 0x30: /* VPMOVZXBW Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l)
+                for(int i=7; i>=0; --i)
+                    GY->uw[i] = EX->ub[7+1+i];
+            else
+                GY->u128 = 0;
+            for(int i=7; i>=0; --i)
+                GX->uw[i] = EX->ub[i];
+            break;
+        case 0x31: /* VPMOVZXBD Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l)
+                for(int i=3; i>=0; --i)
+                    GY->ud[i] = EX->ub[3+1+i];
+            else
+                GY->u128 = 0;
+            for(int i=3; i>=0; --i)
+                GX->ud[i] = EX->ub[i];
+            break;
+        case 0x32: /* VPMOVZXBQ Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l)
+                for(int i=1; i>=0; --i)
+                    GY->q[i] = EX->ub[1+1+i];
+            else
+                GY->u128 = 0;
+            for(int i=1; i>=0; --i)
+                GX->q[i] = EX->ub[i];
+            break;
+        case 0x33: /* VPMOVZXWD Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l)
+                for(int i=3; i>=0; --i)
+                    GY->ud[i] = EX->uw[3+1+i];
+            else
+                GY->u128 = 0;
+            for(int i=3; i>=0; --i)
+                GX->ud[i] = EX->uw[i];
+            break;
+        case 0x34: /* VPMOVZXWQ Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l)
+                for(int i=1; i>=0; --i)
+                    GY->q[i] = EX->uw[1+1+i];
+            else
+                GY->u128 = 0;
+            for(int i=1; i>=0; --i)
+                GX->q[i] = EX->uw[i];
+            break;
+        case 0x35: /* VPMOVZXDQ Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l)
+                for(int i=1; i>=0; --i)
+                    GY->q[i] = EX->ud[1+1+i];
+            else
+                GY->u128 = 0;
+            for(int i=1; i>=0; --i)
+                GX->q[i] = EX->ud[i];
+            break;
         case 0x36:  /* VPERMD Gx, Vx, Ex */
             // same code as 0x16
             nextop = F8;
@@ -584,24 +862,146 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
-        case 0x38:  /* VPERMILD Gx, Vx, Ex */
+        case 0x38:  /* VPMINSB Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
             GETGX;
             GETVX;
             GETGY;
-            GETEY;
-            GETVY;
-            if(GX==EX) {eax1 = *EX; EX = &eax1;}
-            for(int i=0; i<2; ++i)
-                GX->q[i] = EX->q[(VX->q[i]>>1)&1];
+            for(int i=0; i<16; ++i)
+                GX->sb[i] = (VX->sb[i]>EX->sb[i])?EX->sb[i]:VX->sb[i];
             if(vex.l) {
-                if(GY==EY) {eay1 = *EY; EY = &eay1;}
-                for(int i=0; i<2; ++i)
-                    GY->q[i] = EY->q[(VY->q[i]>>1)&1];
+                GETEY; GETVY;
+                for(int i=0; i<16; ++i)
+                    GY->sb[i] = (VY->sb[i]>EY->sb[i])?EY->sb[i]:VY->sb[i];
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x39:  /* VPMINSD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<4; ++i)
+                GX->sd[i] = (VX->sd[i]>EX->sd[i])?EX->sd[i]:VX->sd[i];
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=0; i<4; ++i)
+                    GY->sd[i] = (VY->sd[i]>EY->sd[i])?EY->sd[i]:VY->sd[i];
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x3A:  /* VPMINUW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<8; ++i)
+                GX->uw[i] = (VX->uw[i]>EX->uw[i])?EX->uw[i]:VX->uw[i];
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=0; i<8; ++i)
+                    GY->uw[i] = (VY->uw[i]>EY->uw[i])?EY->uw[i]:VY->uw[i];
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x3B:  /* VPMINUD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<4; ++i)
+                GX->ud[i] = (VX->ud[i]>EX->ud[i])?EX->ud[i]:VX->ud[i];
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=0; i<4; ++i)
+                    GY->ud[i] = (VY->ud[i]>EY->ud[i])?EY->ud[i]:VY->ud[i];
             } else
                 GY->u128 = 0;
             break;
+        case 0x3C:  /* VPMAXSB Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<16; ++i)
+                GX->sb[i] = (VX->sb[i]<EX->sb[i])?EX->sb[i]:VX->sb[i];
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=0; i<16; ++i)
+                    GY->sb[i] = (VY->sb[i]<EY->sb[i])?EY->sb[i]:VY->sb[i];
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x3D:  /* VPMAXSD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<4; ++i)
+                GX->sd[i] = (VX->sd[i]<EX->sd[i])?EX->sd[i]:VX->sd[i];
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=0; i<4; ++i)
+                    GY->sd[i] = (VY->sd[i]<EY->sd[i])?EY->sd[i]:VY->sd[i];
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x3E:  /* VPMAXUW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<8; ++i)
+                GX->uw[i] = (VX->uw[i]<EX->uw[i])?EX->uw[i]:VX->uw[i];
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=0; i<8; ++i)
+                    GY->uw[i] = (VY->uw[i]<EY->uw[i])?EY->uw[i]:VY->uw[i];
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x3F:  /* VPMAXUD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=0; i<4; ++i)
+                GX->ud[i] = (VX->ud[i]<EX->ud[i])?EX->ud[i]:VX->ud[i];
+            if(vex.l) {
+                GETEY; GETVY;
+                for(int i=0; i<4; ++i)
+                    GY->ud[i] = (VY->ud[i]<EY->ud[i])?EY->ud[i]:VY->ud[i];
+            } else
+                GY->u128 = 0;
+            break;
+
+        case 0x41:  /* VPHMINPOSUW Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            tmp16u = EX->uw[0];
+            tmp16s = 0;
+            for(int i=1; i<8; ++i) {
+                if(EX->uw[i]<tmp16u) {
+                    tmp16u = EX->uw[i];
+                    tmp16s = i;
+                }
+            }
+            GX->q[1] = 0;
+            GX->uw[0] = tmp16u;
+            GX->uw[1] = tmp16s;
+            GX->ud[1] = 0;
+            GETGY;
+            GY->u128 = 0;
+            break;
 
         case 0x58:  /* VPBROADCASTD Gx, Ex */
             nextop = F8;
@@ -665,6 +1065,62 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->u128 = 0;
             break;
 
+        case 0x8C:  /* VPMASKMOVD/Q Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            if(rex.w) {
+                for(int i=0; i<2; ++i)
+                    GX->q[i] = (VX->q[i]>>63)?EX->q[i]:0;
+                if(vex.l) {
+                    GETEY;
+                    GETVY;
+                    for(int i=0; i<2; ++i)
+                        GY->q[i] = (VY->q[i]>>63)?EY->q[i]:0;
+                } else
+                    GY->u128 = 0;
+            } else {
+                for(int i=0; i<4; ++i)
+                    GX->ud[i] = (VX->ud[i]>>31)?EX->ud[i]:0;
+                if(vex.l) {
+                    GETEY;
+                    GETVY;
+                    for(int i=0; i<4; ++i)
+                        GY->ud[i] = (VY->ud[i]>>31)?EY->ud[i]:0;
+                } else
+                    GY->u128 = 0;
+            }
+            break;
+        case 0x8E:  /* VPMASKMOVD/Q Ex, Vx, Gx */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            if(rex.w) {
+                for(int i=0; i<2; ++i)
+                    if(VX->q[i]>>63) EX->q[i] = GX->q[i];
+                if(vex.l) {
+                    GETGY;
+                    GETEY;
+                    GETVY;
+                    for(int i=0; i<2; ++i)
+                        if(VY->q[i]>>63) EY->q[i] = GY->q[i];
+                }
+            } else {
+                for(int i=0; i<4; ++i)
+                    if(VX->ud[i]>>31) EX->ud[i] = GX->ud[i];
+                if(vex.l) {
+                    GETGY;
+                    GETEY;
+                    GETVY;
+                    for(int i=0; i<4; ++i)
+                        if(VY->ud[i]>>31) EY->ud[i] = GY->ud[i];
+                }
+            }
+            break;
+
         case 0x90:  /* VPGATHERDD Gx, VSIB, Vx */
         case 0x92:  /* VGATHERDPD/VGATHERDPS Gx, VSIB, Vx */
             nextop = F8;
diff --git a/src/emu/x64runavx660f3a.c b/src/emu/x64runavx660f3a.c
index 36daecad..baf9fb21 100644
--- a/src/emu/x64runavx660f3a.c
+++ b/src/emu/x64runavx660f3a.c
@@ -370,6 +370,18 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             }
             break;
 
+        case 0x20:      // VPINSRB GX, Vx, ED, u8
+            nextop = F8;
+            GETED(1);   // It's ED, and not EB
+            GETGX;
+            GETVX;
+            GETGY;
+            tmp8u = F8;
+            if(GX!=VX)
+                GX->u128 = VX->u128;
+            GX->ub[tmp8u&0xf] = ED->byte[0];
+            GY->u128 = 0;
+            break;
         case 0x21:  /* VINSERTPS Gx, Vx, Ex, imm8 */
             nextop = F8;
             GETGX;
@@ -385,6 +397,21 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GX->ud[i] = (tmp8u&(1<<i))?0:((i==((tmp8u>>4)&3))?tmp32u:VX->ud[i]);
             GY->u128 = 0;
             break;
+        case 0x22:      // VPINSRD/Q Gx, Vx, ED, u8
+            nextop = F8;
+            GETED(1);
+            GETGX;
+            GETVX;
+            GETGY;
+            tmp8u = F8;
+            if(GX!=VX)
+                GX->u128 = VX->u128;
+            if(rex.w)
+                GX->q[tmp8u&0x1] = ED->q[0];
+            else
+                GX->ud[tmp8u&0x3] = ED->dword[0];
+            GY->u128 = 0;
+            break;
 
         case 0x38:  /* VINSERTI128 Gx, Ex, imm8 */
             nextop = F8;