about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author: ptitSeb <sebastien.chev@gmail.com> 2024-05-29 10:53:38 +0200
committer: ptitSeb <sebastien.chev@gmail.com> 2024-05-29 10:53:38 +0200
commit: af05d7439fa6866f900c7e571812ba3f970bd72b (patch)
tree: 1943aa5fd7d3ee4df00f8d4cc6a39e054004f2f3 /src
parent: 6e22f4fd6d56a62025cafe2076b851c730492cef (diff)
download: box64-af05d7439fa6866f900c7e571812ba3f970bd72b.tar.gz
download: box64-af05d7439fa6866f900c7e571812ba3f970bd72b.zip
[INTERPRETER] yet more avx/avx2 opcodes
Diffstat (limited to 'src')
-rw-r--r-- src/emu/x64runavx.c 2
-rw-r--r-- src/emu/x64runavx660f.c 72
-rw-r--r-- src/emu/x64runavx660f38.c 258
-rw-r--r-- src/emu/x64runavx660f3a.c 230
4 files changed, 555 insertions, 7 deletions
diff --git a/src/emu/x64runavx.c b/src/emu/x64runavx.c
index 8b4b9871..b66a275c 100644
--- a/src/emu/x64runavx.c
+++ b/src/emu/x64runavx.c
@@ -76,7 +76,7 @@ uintptr_t RunAVX(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
     else addr = 0;
 
     if(!addr)
-        printf_log(LOG_NONE, "Unimplemented AVX opcode size %d prefix %s map %s opcode %X ", 128<<vex.l, avx_prefix_string(vex.p), avx_map_string(vex.m), opcode);
+        printf_log(LOG_NONE, "Unimplemented AVX opcode size %d prefix %s map %s opcode %02X ", 128<<vex.l, avx_prefix_string(vex.p), avx_map_string(vex.m), opcode);
 
     return addr;
 }
diff --git a/src/emu/x64runavx660f.c b/src/emu/x64runavx660f.c
index a1b089e1..7b712a7e 100644
--- a/src/emu/x64runavx660f.c
+++ b/src/emu/x64runavx660f.c
@@ -230,7 +230,21 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->u128 = 0;
             }
             break;
-
+        case 0x57:  /* VXORPD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GX->u128 = VX->u128 ^ EX->u128;
+            GETGY;
+            if(vex.l) {
+                GETEY;
+                GETVY;
+                GY->u128 = VY->u128 ^ EY->u128;
+            } else {
+                GY->u128 = 0;
+            }
+            break;
         case 0x58:  /* VADDPD Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
@@ -829,6 +843,54 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                     return 0;
             }
             break;
+        case 0x74:  /* VPCMPEQB Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for (int i=0; i<16; ++i)
+                GX->ub[i] = (VX->ub[i]==EX->ub[i])?0xff:0;
+            if(vex.l) {
+                GETEY;
+                GETVY;
+                for (int i=0; i<16; ++i)
+                    GY->ub[i] = (VY->ub[i]==EY->ub[i])?0xff:0;
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x75:  /* VPCMPEQW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for (int i=0; i<8; ++i)
+                GX->uw[i] = (VX->uw[i]==EX->uw[i])?0xffff:0;
+            if(vex.l) {
+                GETEY;
+                GETVY;
+                for (int i=0; i<8; ++i)
+                    GY->uw[i] = (VY->uw[i]==EY->uw[i])?0xffff:0;
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x76:  /* VPCMPEQD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for (int i=0; i<4; ++i)
+                GX->ud[i] = (VX->ud[i]==EX->ud[i])?0xffffffff:0;
+            if(vex.l) {
+                GETEY;
+                GETVY;
+                for (int i=0; i<4; ++i)
+                    GY->ud[i] = (VY->ud[i]==EY->ud[i])?0xffffffff:0;
+            } else
+                GY->u128 = 0;
+            break;
 
         case 0x7C:  /* VHADDPD Gx, Vx, Ex */
             nextop = F8;
@@ -1019,6 +1081,14 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->u128 = 0;
             break;
 
+        case 0xC5:  /* VPEXTRW Gw,Ex,Ib */
+            nextop = F8;
+            GETEX(1);
+            GETGD;
+            tmp8u = F8;
+            GD->q[0] = EX->uw[tmp8u&7];  // 16bits extract, 0 extended
+            break;
+
         case 0xD0:  /* VADDSUBPD Gx, Vx, Ex */
             nextop = F8;
             GETEX(0);
diff --git a/src/emu/x64runavx660f38.c b/src/emu/x64runavx660f38.c
index 53a8e345..f127be72 100644
--- a/src/emu/x64runavx660f38.c
+++ b/src/emu/x64runavx660f38.c
@@ -66,8 +66,8 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
     uint64_t tmp64u, tmp64u2;
     int64_t tmp64s;
     reg64_t *oped, *opgd;
-    sse_regs_t *opex, *opgx, *opvx, eax1;
-    sse_regs_t *opey, *opgy, *opvy, eay1;
+    sse_regs_t *opex, *opgx, *opvx, eax1, eax2;
+    sse_regs_t *opey, *opgy, *opvy, eay1, eay2;
     // AES opcodes constants
                             //   A0 B1 C2 D3 E4 F5 G6 H7 I8 J9 Ka Lb Mc Nd Oe Pf
                             //   A  F  K  P  E  J  O  D  I  N  C  H  M  B  G  L
@@ -153,6 +153,176 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->q[0] = GY->q[1] = 0;
             break;
+        case 0x01:  /* VPHADDW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            GETEY;
+            u8 = (VX==EX);
+            if(GX==EX) {eax1=*EX; EX=&eax1;}
+            for (int i=0; i<4; ++i)
+                GX->sw[i] = VX->sw[i*2+0]+VX->sw[i*2+1];
+            if(u8) {
+                GX->q[1] = GX->q[0];
+            } else {
+                for (int i=0; i<4; ++i)
+                    GX->sw[4+i] = EX->sw[i*2+0] + EX->sw[i*2+1];
+            }
+            if(vex.l) {
+                GETVY;
+                if(EY==GY) {eay1=*EY; EY=&eay1;}
+                for (int i=0; i<4; ++i)
+                    GY->sw[i] = VY->sw[i*2+0]+VY->sw[i*2+1];
+                if(u8) {
+                    GY->q[1] = GY->q[0];
+                } else {
+                    for (int i=0; i<4; ++i)
+                        GY->sw[4+i] = EY->sw[i*2+0] + EY->sw[i*2+1];
+                }
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x02:  /* VPHADDD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            GETEY;
+            u8 = (VX==EX);
+            if(GX==EX) {eax1=*EX; EX=&eax1;}
+            GX->sd[0] = VX->sd[0] + VX->sd[1];
+            GX->sd[1] = VX->sd[2] + VX->sd[3];
+            if(u8) {
+                GX->q[1] = GX->q[0];
+            } else {
+                GX->sd[2] = EX->sd[0] + EX->sd[1];
+                GX->sd[3] = EX->sd[2] + EX->sd[3];
+            }
+            if(vex.l) {
+                GETVY;
+                if(EY==GY) {eay1=*EY; EY=&eay1;}
+                GY->sd[0] = VY->sd[0] + VY->sd[1];
+                GY->sd[1] = VY->sd[2] + VY->sd[3];
+                if(u8) {
+                    GY->q[1] = GY->q[0];
+                } else {
+                    GY->sd[2] = EY->sd[0] + EY->sd[1];
+                    GY->sd[3] = EY->sd[2] + EY->sd[3];
+                }
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x03:  /* VPHADDSW Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            GETEY;
+            u8 = (VX==EX);
+            if(GX==EX) {eax1=*EX; EX=&eax1;}
+            for (int i=0; i<4; ++i) {
+                tmp32s = VX->sw[i*2+0]+VX->sw[i*2+1];
+                GX->sw[i] = (tmp32s<-32768)?-32768:((tmp32s>32767)?32767:tmp32s);
+            }
+            if(u8) {
+                GX->q[1] = GX->q[0];
+            } else {
+                for (int i=0; i<4; ++i) {
+                    tmp32s = EX->sw[i*2+0] + EX->sw[i*2+1];
+                    GX->sw[4+i] = (tmp32s<-32768)?-32768:((tmp32s>32767)?32767:tmp32s);
+                }
+            }
+            if(vex.l) {
+                GETVY;
+                if(EY==GY) {eay1=*EY; EY=&eay1;}
+                for (int i=0; i<4; ++i) {
+                    tmp32s = VY->sw[i*2+0]+VY->sw[i*2+1];
+                    GY->sw[i] = (tmp32s<-32768)?-32768:((tmp32s>32767)?32767:tmp32s);
+                }
+                if(u8) {
+                    GY->q[1] = GY->q[0];
+                } else {
+                    for (int i=0; i<4; ++i) {
+                        tmp32s = EY->sw[i*2+0] + EY->sw[i*2+1];
+                        GY->sw[4+i] = (tmp32s<-32768)?-32768:((tmp32s>32767)?32767:tmp32s);
+                    }
+                }
+            } else
+                GY->u128 = 0;
+            break;
+
+        case 0x0C:  /* VPERMILPS Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            GETEY;
+            GETVY;
+            if(GX==VX) {eax1 = *VX; VX = &eax1;}
+            for(int i=0; i<4; ++i)
+                GX->ud[i] = VX->ud[EX->ud[i]&3];
+            if(vex.l) {
+                if(GY==VY) {eay1 = *VY; VY = &eay1;}
+                for(int i=0; i<4; ++i)
+                    GY->ud[i] = VY->ud[EY->ud[i]&3];
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x0D:  /* VPERMILPD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            GETEY;
+            GETVY;
+            if(GX==VX) {eax1 = *VX; VX = &eax1;}
+            for(int i=0; i<2; ++i)
+                GX->q[i] = VX->q[(EX->q[i]>>1)&1];
+            if(vex.l) {
+                if(GY==VY) {eay1 = *VY; VY = &eay1;}
+                for(int i=0; i<2; ++i)
+                    GY->q[i] = VY->q[(EY->q[i]>>1)&1];
+            } else
+                GY->u128 = 0;
+            break;
+
+        case 0x16:  /* VPERMPS Gx, Vx, Ex */
+            // same code as 0x36
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETEY;
+            GETGY;
+            GETVY;
+            if(!vex.l) emit_signal(emu, SIGILL, (void*)R_RIP, 0);
+            if(GX==EX) {
+                eax1 = *EX;
+                EX = &eax1;
+                eay1 = *EY;
+                EY = &eay1;
+            }
+            if(GX==VX) {
+                eax2 = *VX;
+                VX = &eax2;
+                eay2 = *VY;
+                VY = &eay2;
+            }
+            for(int i=0; i<4; ++i) {
+                u8 = VX->ud[i]&7;
+                GX->ud[i] = (u8>3)?EY->ud[u8&3]:EX->ud[u8];
+            }
+            for(int i=0; i<4; ++i) {
+                u8 = VY->ud[i]&7;
+                GY->ud[i] = (u8>3)?EY->ud[u8&3]:EX->ud[u8];
+            }
+            break;
 
         case 0x18:  /* VBROADCASTSS Gx, Ex */
             nextop = F8;
@@ -240,6 +410,22 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->u128 = 0;
             break;
 
+        case 0x29:  /* VPCMPEQQ Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=1; i>=0; --i)
+                GX->sq[i] = (VX->sq[i]==EX->sq[i])?-1LL:0LL;
+            if(vex.l) {
+                GETEY;
+                GETVY;
+                for(int i=1; i>=0; --i)
+                    GY->sq[i] = (VY->sq[i]==EY->sq[i])?-1LL:0LL;
+            } else
+                GY->u128 = 0;
+            break;
         case 0x2A:  /* VMOVNTDQA Gx, Ex */
             nextop = F8;
             GETEX(0);
@@ -351,6 +537,72 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             }
             break;
 
+        case 0x36:  /* VPERMD Gx, Vx, Ex */
+            // same code as 0x16
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETEY;
+            GETGY;
+            GETVY;
+            if(!vex.l) emit_signal(emu, SIGILL, (void*)R_RIP, 0);
+            if(GX==EX) {
+                eax1 = *EX;
+                EX = &eax1;
+                eay1 = *EY;
+                EY = &eay1;
+            }
+            if(GX==VX) {
+                eax2 = *VX;
+                VX = &eax2;
+                eay2 = *VY;
+                VY = &eay2;
+            }
+            for(int i=0; i<4; ++i) {
+                u8 = VX->ud[i]&7;
+                GX->ud[i] = (u8>3)?EY->ud[u8&3]:EX->ud[u8];
+            }
+            for(int i=0; i<4; ++i) {
+                u8 = VY->ud[i]&7;
+                GY->ud[i] = (u8>3)?EY->ud[u8&3]:EX->ud[u8];
+            }
+            break;
+        case 0x37: /* VPCMPGTQ Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            for(int i=1; i>=0; --i)
+                GX->sq[i] = (VX->sq[i]>EX->sq[i])?-1LL:0LL;
+            if(vex.l) {
+                GETEY;
+                GETVY;
+                for(int i=1; i>=0; --i)
+                    GY->sq[i] = (VY->sq[i]>EY->sq[i])?-1LL:0LL;
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x38:  /* VPERMILD Gx, Vx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETVX;
+            GETGY;
+            GETEY;
+            GETVY;
+            if(GX==EX) {eax1 = *EX; EX = &eax1;}
+            for(int i=0; i<2; ++i)
+                GX->q[i] = EX->q[(VX->q[i]>>1)&1];
+            if(vex.l) {
+                if(GY==EY) {eay1 = *EY; EY = &eay1;}
+                for(int i=0; i<2; ++i)
+                    GY->q[i] = EY->q[(VY->q[i]>>1)&1];
+            } else
+                GY->u128 = 0;
+            break;
+
         case 0x58:  /* VPBROADCASTD Gx, Ex */
             nextop = F8;
             GETEX(0);
@@ -413,6 +665,7 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->u128 = 0;
             break;
 
+        case 0x90:  /* VPGATHERDD Gx, VSIB, Vx */
         case 0x92:  /* VGATHERDPD/VGATHERDPS Gx, VSIB, Vx */
             nextop = F8;
             if(((nextop&7)!=4) || MODREG) {
@@ -479,6 +732,7 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             } else
                 GY->u128 = 0;
             break;
+        case 0x91:  /* VPGATHERQD Gx, VSIB, Vx */
         case 0x93:  /* VGATHERQPD/VGATHERQPS Gx, VSIB, Vx */
             nextop = F8;
             if(((nextop&7)!=4) || MODREG) {
diff --git a/src/emu/x64runavx660f3a.c b/src/emu/x64runavx660f3a.c
index 16d10af0..36daecad 100644
--- a/src/emu/x64runavx660f3a.c
+++ b/src/emu/x64runavx660f3a.c
@@ -23,6 +23,7 @@
 #include "bridge.h"
 #include "signals.h"
 #include "x64shaext.h"
+#include "x64compstrings.h"
 #ifdef DYNAREC
 #include "custommem.h"
 #include "../dynarec/native_lock.h"
@@ -58,8 +59,8 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
     reg64_t *oped, *opgd;
     float tmpf;
     double tmpd;
-    sse_regs_t *opex, *opgx, *opvx, eax1;
-    sse_regs_t *opey, *opgy, *opvy, eay1;
+    sse_regs_t *opex, *opgx, *opvx, eax1,eax2;
+    sse_regs_t *opey, *opgy, *opvy, eay1,eay2;
     // AES opcodes constants
     const uint8_t subbytes[256] = {
         0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
@@ -90,6 +91,26 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
 
     switch(opcode) {
 
+        case 0x00:  /* VPERMQ Gx, Ex, Imm8 */
+        case 0x01:  /* VPERMPD Gx, Ex, Imm8 */
+            nextop = F8;
+            GETEX(1);
+            GETGX;
+            GETGY;
+            GETEY;
+            u8 = F8;
+            if(!vex.l)  emit_signal(emu, SIGILL, (void*)R_RIP, 0);
+            if(GX==EX) {
+                eax1 = *EX;
+                EX = &eax1;
+                eay1 = *EY;
+                EY = &eay1;
+            }
+            for(int i=0; i<2; ++i)
+                GX->q[i] = (((u8>>(i*2))&3)>1)?EY->q[(u8>>(i*2))&1]:EX->q[(u8>>(i*2))&1];
+            for(int i=2; i<4; ++i)
+                GY->q[i-2] = (((u8>>(i*2))&3)>1)?EY->q[(u8>>(i*2))&1]:EX->q[(u8>>(i*2))&1];
+            break;
         case 0x02:      /* VBLENDD Gx, Vx, Ex, u8 */
             nextop = F8;
             GETEX(1);
@@ -108,6 +129,90 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->u128 = 0;
             break;
 
+        case 0x04:  /* VPERMILPS Gx, Ex, Imm8 */
+            nextop = F8;
+            GETEX(1);
+            GETGX;
+            GETGY;
+            GETEY;
+            u8 = F8;
+            if(GX==EX) {
+                eax1 = *EX;
+                EX = &eax1;
+            }
+            for(int i=0; i<4; ++i)
+                GX->ud[i] = EX->ud[(u8>>(i*2))&3];
+            if(vex.l) {
+                if(GY==EY) {
+                    eay1 = *EY;
+                    EY = &eay1;
+                }
+                for(int i=0; i<4; ++i)
+                    GY->ud[i] = EY->ud[(u8>>(i*2))&3];
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x05:  /* VPERMILD Gx, Ex, Imm8 */
+            nextop = F8;
+            GETEX(1);
+            GETGX;
+            GETGY;
+            GETEY;
+            u8 = F8;
+            if(GX==EX) {
+                eax1 = *EX;
+                EX = &eax1;
+            }
+            for(int i=0; i<2; ++i)
+                GX->q[i] = EX->q[(u8>>i)&1];
+            if(vex.l) {
+                if(GY==EY) {
+                    eay1 = *EY;
+                    EY = &eay1;
+                }
+                for(int i=0; i<2; ++i)
+                    GY->q[i] = EY->q[(u8>>(i+2))&1];
+            } else
+                GY->u128 = 0;
+            break;
+        case 0x06:  /* VPERM2F128 Gx, Vx, Ex, Imm8 */
+            nextop = F8;
+            GETEX(1);
+            GETGX;
+            GETVX;
+            GETEY;
+            GETGY;
+            GETVY;
+            u8 = F8;
+            if(!vex.l) emit_signal(emu, SIGILL, (void*)R_RIP, 0);
+            if(GX==EX) {
+                eax1 = *EX;
+                EX = &eax1;
+                eay1 = *EY;
+                EY = &eay1;
+            }
+            if(GX==VX) {
+                eax2 = *VX;
+                VX = &eax2;
+                eay2 = *VY;
+                VY = &eay2;
+            }
+            switch(u8&0x0f) {
+                case 0 : GX->u128 = VX->u128; break;
+                case 1 : GX->u128 = VY->u128; break;
+                case 2 : GX->u128 = EX->u128; break;
+                case 3 : GX->u128 = EY->u128; break;
+                default: GX->u128 = 0; break;
+            }
+            switch((u8>>4)&0x0f) {
+                case 0 : GY->u128 = VX->u128; break;
+                case 1 : GY->u128 = VY->u128; break;
+                case 2 : GY->u128 = EX->u128; break;
+                case 3 : GY->u128 = EY->u128; break;
+                default: GY->u128 = 0; break;
+            }
+            break;
+
         case 0x0C:      /* VBLENDPS Gx, Vx, Ex, u8 */
             nextop = F8;
             GETEX(1);
@@ -191,6 +296,26 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->u128 = 0;
             break;
 
+        case 0x14:      // VPEXTRB ED, GX, u8
+            nextop = F8;
+            GETED(1);
+            GETGX;
+            tmp8u = F8;
+            if(MODREG)
+                ED->q[0] = GX->ub[tmp8u&0x0f];
+            else
+                ED->byte[0] = GX->ub[tmp8u&0x0f];
+            break;
+        case 0x15:      // VPEXTRW Ew,Gx,Ib
+            nextop = F8;
+            GETED(1);
+            GETGX;
+            tmp8u = F8;
+            if(MODREG)
+                ED->q[0] = GX->uw[tmp8u&7];  // 16bits extract, 0 extended
+            else
+                ED->word[0] = GX->uw[tmp8u&7];
+            break;
         case 0x16:      // VPEXTRD/Q ED, GX, u8
             nextop = F8;
             GETED(1);
@@ -414,6 +539,44 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->u128 = 0;
             break;
 
+        case 0x46:  /* VPERM2I128 Gx, Vx, Ex, Imm8 */
+            nextop = F8;
+            GETEX(1);
+            GETGX;
+            GETVX;
+            GETEY;
+            GETGY;
+            GETVY;
+            u8 = F8;
+            if(!vex.l) emit_signal(emu, SIGILL, (void*)R_RIP, 0);
+            if(GX==EX) {
+                eax1 = *EX;
+                EX = &eax1;
+                eay1 = *EY;
+                EY = &eay1;
+            }
+            if(GX==VX) {
+                eax2 = *VX;
+                VX = &eax2;
+                eay2 = *VY;
+                VY = &eay2;
+            }
+            switch(u8&0x0f) {
+                case 0 : GX->u128 = VX->u128; break;
+                case 1 : GX->u128 = VY->u128; break;
+                case 2 : GX->u128 = EX->u128; break;
+                case 3 : GX->u128 = EY->u128; break;
+                default: GX->u128 = 0; break;
+            }
+            switch((u8>>4)&0x0f) {
+                case 0 : GY->u128 = VX->u128; break;
+                case 1 : GY->u128 = VY->u128; break;
+                case 2 : GY->u128 = EX->u128; break;
+                case 3 : GY->u128 = EY->u128; break;
+                default: GY->u128 = 0; break;
+            }
+            break;
+
         case 0x4A:      /* VBLENDVPS Gx, Vx, Ex, XMMImm8 */
             nextop = F8;
             GETEX(1);
@@ -433,7 +596,7 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             break;
         case 0x4B:      /* VBLENDVPD Gx, Vx, Ex, XMMImm8 */
             nextop = F8;
-            GETEX(0);
+            GETEX(1);
             GETGX;
             GETVX;
             GETGY;
@@ -469,6 +632,67 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
                 GY->u128 = 0;
             break;
 
+        case 0x60:  /* VPCMPESTRM */
+            nextop = F8;
+            GETEX(1);
+            GETGX;
+            tmp8u = F8;
+            tmp32u = sse42_compare_string_explicit_len(emu, EX, R_EDX, GX, R_EAX, tmp8u);
+            if(tmp8u&0b1000000) {
+                switch(tmp8u&1) {
+                    case 0: for(int i=0; i<16; ++i) emu->xmm[0].ub[i] = ((tmp32u>>i)&1)?0xff:0x00; break;
+                    case 1: for(int i=0; i<8; ++i) emu->xmm[0].uw[i] = ((tmp32u>>i)&1)?0xffff:0x0000; break;
+                }
+            } else {
+                emu->xmm[0].q[1] = emu->xmm[0].q[0] = 0;
+                emu->xmm[0].uw[0] = tmp32u;
+                emu->ymm[0].u128 = 0;
+            }
+            break;
+        case 0x61:  /* VPCMPESTRI */
+            nextop = F8;
+            GETEX(1);
+            GETGX;
+            tmp8u = F8;
+            tmp32u = sse42_compare_string_explicit_len(emu, EX, R_EDX, GX, R_EAX, tmp8u);
+            if(!tmp32u)
+                R_RCX = (tmp8u&1)?8:16;
+            else if(tmp8u&0b1000000)
+                R_RCX = 31-__builtin_clz(tmp32u);
+            else
+                R_RCX = __builtin_ffs(tmp32u) - 1;
+            break;
+        case 0x62:  /* VPCMPISTRM */
+            nextop = F8;
+            GETEX(1);
+            GETGX;
+            tmp8u = F8;
+            tmp32u = sse42_compare_string_implicit_len(emu, EX, GX, tmp8u);
+            if(tmp8u&0b1000000) {
+                switch(tmp8u&1) {
+                    case 0: for(int i=0; i<16; ++i) emu->xmm[0].ub[i] = ((tmp32u>>i)&1)?0xff:0x00; break;
+                    case 1: for(int i=0; i<8; ++i) emu->xmm[0].uw[i] = ((tmp32u>>i)&1)?0xffff:0x0000; break;
+                }
+            } else {
+                emu->xmm[0].q[1] = emu->xmm[0].q[0] = 0;
+                emu->xmm[0].uw[0] = tmp32u;
+                emu->ymm[0].u128 = 0;
+            }
+            break;
+        case 0x63:  /* VPCMPISTRI */
+            nextop = F8;
+            GETEX(1);
+            GETGX;
+            tmp8u = F8;
+            tmp32u = sse42_compare_string_implicit_len(emu, EX, GX, tmp8u);
+            if(!tmp32u)
+                R_RCX = (tmp8u&1)?8:16;
+            else if(tmp8u&0b1000000)
+                R_RCX = 31-__builtin_clz(tmp32u);
+            else
+                R_RCX = __builtin_ffs(tmp32u) - 1;
+            break;
+
         case 0xDF:      // VAESKEYGENASSIST Gx, Ex, u8
             nextop = F8;
             GETEX(1);