diff options
| author | ptitSeb <sebastien.chev@gmail.com> | 2024-05-28 10:42:41 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2024-05-28 10:42:41 +0200 |
| commit | b9f5929439ab5e0ebf9d64b2dc2659a9a018f19d (patch) | |
| tree | abd27a75f28e1a50caf99716ca76b7f1184c125d /src | |
| parent | 3dc396a64775a0aa8aae55513eda0d326cb50080 (diff) | |
| download | box64-b9f5929439ab5e0ebf9d64b2dc2659a9a018f19d.tar.gz box64-b9f5929439ab5e0ebf9d64b2dc2659a9a018f19d.zip | |
[INTERPRETER] More avx, avx2 and vaes opcodes
Diffstat (limited to 'src')
| -rw-r--r-- | src/emu/x64run0f.c | 2 | ||||
| -rw-r--r-- | src/emu/x64runavx.c | 15 | ||||
| -rw-r--r-- | src/emu/x64runavx0f.c | 207 | ||||
| -rw-r--r-- | src/emu/x64runavx660f.c | 243 | ||||
| -rw-r--r-- | src/emu/x64runavx660f38.c | 289 | ||||
| -rw-r--r-- | src/emu/x64runavx660f3a.c | 173 | ||||
| -rw-r--r-- | src/emu/x64runavxf20f.c | 142 | ||||
| -rw-r--r-- | src/emu/x64runavxf30f.c | 41 | ||||
| -rw-r--r-- | src/tools/my_cpuid.c | 1 | ||||
| -rw-r--r-- | src/wrapped/wrappedlibc.c | 2 |
10 files changed, 1069 insertions, 46 deletions
diff --git a/src/emu/x64run0f.c b/src/emu/x64run0f.c index 0dadfe2d..8fc5e645 100644 --- a/src/emu/x64run0f.c +++ b/src/emu/x64run0f.c @@ -702,7 +702,7 @@ uintptr_t Run0F(x64emu_t *emu, rex_t rex, uintptr_t addr, int *step) if(EX->f[i]==0) GX->f[i] = 1.0f/EX->f[i]; else if (EX->f[i]<0) - GX->f[i] = NAN; + GX->f[i] = -NAN; else if (isnan(EX->f[i])) GX->f[i] = EX->f[i]; else if (isinf(EX->f[i])) diff --git a/src/emu/x64runavx.c b/src/emu/x64runavx.c index 56507b4d..9d15e803 100644 --- a/src/emu/x64runavx.c +++ b/src/emu/x64runavx.c @@ -57,21 +57,10 @@ uintptr_t TestAVX(x64test_t *test, vex_t vex, uintptr_t addr, int *step) uintptr_t RunAVX(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) #endif { - uint8_t opcode; - uint8_t nextop; - uint8_t tmp8u; - int8_t tmp8s; - int32_t tmp32s, tmp32s2; - uint32_t tmp32u, tmp32u2; - uint64_t tmp64u, tmp64u2; - int64_t tmp64s; - reg64_t *oped, *opgd; - sse_regs_t *opex, *opgx, eax1; - mmx87_regs_t *opem, *opgm, eam1; - #ifdef TEST_INTERPRETER x64emu_t *emu = test->emu; #endif + uint8_t opcode = PK(0); if( (vex.m==VEX_M_0F) && (vex.p==VEX_P_NONE)) addr = RunAVX_0F(emu, vex, addr, step); else if( (vex.m==VEX_M_0F) && (vex.p==VEX_P_66)) @@ -87,7 +76,7 @@ uintptr_t RunAVX(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) else addr = 0; if(!addr) - printf_log(LOG_NONE, "Unimplemented AVX opcode prefix %s map %s ", avx_prefix_string(vex.p), avx_map_string(vex.m)); + printf_log(LOG_NONE, "Unimplemented AVX opcode prefix %s map %s opcode %X ", avx_prefix_string(vex.p), avx_map_string(vex.m), opcode); return addr; } diff --git a/src/emu/x64runavx0f.c b/src/emu/x64runavx0f.c index b5a49560..34372ca6 100644 --- a/src/emu/x64runavx0f.c +++ b/src/emu/x64runavx0f.c @@ -69,7 +69,7 @@ uintptr_t RunAVX_0F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) GY->q[0] = EY->q[0]; GY->q[1] = EY->q[1]; } else { - GY->q[0] = GY->q[1] = 0; + GY->u128 = 0; } break; case 0x11: /* VMOVUPS Ex, Gx */ @@ -86,6 +86,27 @@ uintptr_t RunAVX_0F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) } break; + case 0x14: /* VUNPCKLPS Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GETGY; + GX->ud[3] = EX->ud[1]; + GX->ud[2] = VX->ud[1]; + GX->ud[1] = EX->ud[0]; + GX->ud[0] = VX->ud[0]; + if(vex.l) { + GETEY; + GETVY; + GY->ud[3] = EY->ud[1]; + GY->ud[2] = VY->ud[1]; + GY->ud[1] = EY->ud[0]; + GY->ud[0] = VY->ud[0]; + } else + GY->u128 = 0; + break; + case 0x28: /* VMOVAPS Gx, Ex */ nextop = F8; GETEX(0); @@ -98,7 +119,7 @@ uintptr_t RunAVX_0F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) GY->q[0] = EY->q[0]; GY->q[1] = EY->q[1]; } else { - GY->q[0] = GY->q[1] = 0; + GY->u128 = 0; } break; case 0x29: /* VMOVAPS Ex, Gx */ @@ -115,6 +136,23 @@ uintptr_t RunAVX_0F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) } break; + case 0x2F: /* VCOMISS Gx, Ex */ + RESET_FLAGS(emu); + nextop = F8; + GETEX(0); + GETGX; + if(isnan(GX->f[0]) || isnan(EX->f[0])) { + SET_FLAG(F_ZF); SET_FLAG(F_PF); SET_FLAG(F_CF); + } else if(isgreater(GX->f[0], EX->f[0])) { + CLEAR_FLAG(F_ZF); CLEAR_FLAG(F_PF); CLEAR_FLAG(F_CF); + } else if(isless(GX->f[0], EX->f[0])) { + CLEAR_FLAG(F_ZF); CLEAR_FLAG(F_PF); SET_FLAG(F_CF); + } else { + SET_FLAG(F_ZF); CLEAR_FLAG(F_PF); CLEAR_FLAG(F_CF); + } + CLEAR_FLAG(F_OF); CLEAR_FLAG(F_AF); CLEAR_FLAG(F_SF); + break; + case 0x52: /* VRSQRTPS Gx, Ex */ nextop = F8; GETEX(0); @@ -124,7 +162,7 @@ uintptr_t RunAVX_0F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) if(EX->f[i]==0) GX->f[i] = 1.0f/EX->f[i]; else if (EX->f[i]<0) - GX->f[i] = NAN; + GX->f[i] = -NAN; else if (isnan(EX->f[i])) GX->f[i] = EX->f[i]; else if (isinf(EX->f[i])) @@ -138,7 +176,7 @@ uintptr_t RunAVX_0F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) if(EY->f[i]==0) GY->f[i] = 1.0f/EY->f[i]; else if (EY->f[i]<0) - GY->f[i] = NAN; + GY->f[i] = -NAN; else if (isnan(EY->f[i])) GY->f[i] = EY->f[i]; else if (isinf(EY->f[i])) @@ -147,30 +185,73 @@ uintptr_t RunAVX_0F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) GY->f[i] = 1.0f/sqrtf(EY->f[i]); } } else - GY->q[0] = GY->q[1] = 0; + GY->u128 = 0; #ifdef TEST_INTERPRETER test->notest = 1; #endif break; + case 0x54: /* VANDPS Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GX->u128 = VX->u128 & EX->u128; + GETGY; + if(vex.l) { + GETEY; + GETVY; + GY->u128 = VY->u128 & EY->u128; + } else { + GY->u128 = 0; + } + break; + case 0x55: /* VANDNPS Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GX->u128 = (~VX->u128) & EX->u128; + GETGY; + if(vex.l) { + GETEY; + GETVY; + GY->u128 = (~VY->u128) & EY->u128; + } else { + GY->u128 = 0; + } + break; + case 0x57: /* XORPS Gx, Vx, Ex */ nextop = F8; GETEX(0); GETGX; GETVX; GETGY; + GX->u128 = VX->u128 ^ EX->u128; + if(vex.l) { + GETEY; + GETVY; + GY->u128 = VY->u128 ^ EY->u128; + } else + GY->u128 = 0; + break; + case 0x58: /* VADDPS Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GETGY; for(int i=0; i<4; ++i) - GX->ud[i] = VX->ud[i] ^ EX->ud[i]; + GX->f[i] = VX->f[i] + EX->f[i]; if(vex.l) { GETEY; GETVY; for(int i=0; i<4; ++i) - GY->ud[i] = VY->ud[i] ^ EY->ud[i]; - + GY->f[i] = VY->f[i] + EY->f[i]; } else - GY->q[0] = GY->q[1] = 0; + GY->u128 = 0; break; - case 0x59: /* VMULPS Gx, Vx, Ex */ nextop = F8; GETEX(0); @@ -185,9 +266,27 @@ uintptr_t RunAVX_0F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) for(int i=0; i<4; ++i) GY->f[i] = VY->f[i] * EY->f[i]; } else - GY->q[0] = GY->q[1] = 0; + GY->u128 = 0; break; + case 0x5B: /* VCVTDQ2PS Gx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETGY; + GX->f[0] = EX->sd[0]; + GX->f[1] = EX->sd[1]; + GX->f[2] = EX->sd[2]; + GX->f[3] = EX->sd[3]; + if(vex.l) { + GETEY; + GY->f[0] = EY->sd[0]; + GY->f[1] = EY->sd[1]; + GY->f[2] = EY->sd[2]; + GY->f[3] = EY->sd[3]; + } else + GY->u128 = 0; + break; case 0x5C: /* VSUBPS Gx, Vx, Ex */ nextop = F8; GETEX(0); @@ -202,7 +301,7 @@ uintptr_t RunAVX_0F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) for(int i=0; i<4; ++i) GY->f[i] = VY->f[i] - EY->f[i]; } else - GY->q[0] = GY->q[1] = 0; + GY->u128 = 0; break; case 0x77: @@ -216,6 +315,90 @@ uintptr_t RunAVX_0F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) return 0; break; + case 0xC2: /* VCMPPS Gx, Vx, Ex, Ib */ + nextop = F8; + GETEX(1); + GETGX; + GETVX; + GETGY; + tmp8u = F8; + for(int i=0; i<4; ++i) { + tmp8s = 0; + switch(tmp8u&7) { + case 0: tmp8s=(VX->f[i] == EX->f[i]); break; + case 1: tmp8s=isless(VX->f[i], EX->f[i]); break; + case 2: tmp8s=islessequal(VX->f[i], EX->f[i]); break; + case 3: tmp8s=isnan(VX->f[i]) || isnan(EX->f[i]); break; + case 4: tmp8s=(VX->f[i] != EX->f[i]); break; + case 5: tmp8s=isnan(VX->f[i]) || isnan(EX->f[i]) || isgreaterequal(VX->f[i], EX->f[i]); break; + case 6: tmp8s=isnan(VX->f[i]) || isnan(EX->f[i]) || isgreater(VX->f[i], EX->f[i]); break; + case 7: tmp8s=!isnan(VX->f[i]) && !isnan(EX->f[i]); break; + } + GX->ud[i]=(tmp8s)?0xffffffff:0; + } + if(vex.l) { + GETEY; + GETVY; + for(int i=0; i<4; ++i) { + tmp8s = 0; + switch(tmp8u&7) { + case 0: tmp8s=(VY->f[i] == EY->f[i]); break; + case 1: tmp8s=isless(VY->f[i], EY->f[i]); break; + case 2: tmp8s=islessequal(VY->f[i], EY->f[i]); break; + case 3: tmp8s=isnan(VY->f[i]) || isnan(EY->f[i]); break; + case 4: tmp8s=(VY->f[i] != EY->f[i]); break; + case 5: tmp8s=isnan(VY->f[i]) || isnan(EY->f[i]) || isgreaterequal(VY->f[i], EY->f[i]); break; + case 6: tmp8s=isnan(VY->f[i]) || isnan(EY->f[i]) || isgreater(VY->f[i], EY->f[i]); break; + case 7: tmp8s=!isnan(VY->f[i]) && !isnan(EY->f[i]); break; + } + GY->ud[i]=(tmp8s)?0xffffffff:0; + } + } else + GY->u128 = 0; + break; + + case 0xC6: /* VSHUFPS Gx, Vx, Ex, Ib */ + nextop = F8; + GETEX(1); + GETGX; + GETVX; + GETGY; + GETVY; + GETEY; + tmp8u = F8; + if(GX==VX) { + eax1 = *VX; + VX = &eax1; + } + if(GX==EX) { + eay1 = *EX; + EX = &eay1; + } + for(int i=0; i<2; ++i) { + GX->ud[i] = VX->ud[(tmp8u>>(i*2))&3]; + } + for(int i=2; i<4; ++i) { + GX->ud[i] = EX->ud[(tmp8u>>(i*2))&3]; + } + if(vex.l) { + if(GY==VY) { + eax1 = *VY; + VY = &eax1; + } + if(GY==EY) { + eay1 = *EY; + EY = &eay1; + } + for(int i=0; i<2; ++i) { + GY->ud[i] = VY->ud[(tmp8u>>(i*2))&3]; + } + for(int i=2; i<4; ++i) { + GY->ud[i] = EY->ud[(tmp8u>>(i*2))&3]; + } + } else + GY->u128 = 0; + break; + default: return 0; } diff --git a/src/emu/x64runavx660f.c b/src/emu/x64runavx660f.c index aeb976f1..3136433f 100644 --- a/src/emu/x64runavx660f.c +++ b/src/emu/x64runavx660f.c @@ -58,6 +58,154 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) switch(opcode) { + case 0x2F: /* VCOMISD Gx, Ex */ + RESET_FLAGS(emu); + nextop = F8; + GETEX(0); + GETGX; + if(isnan(GX->d[0]) || isnan(EX->d[0])) { + SET_FLAG(F_ZF); SET_FLAG(F_PF); SET_FLAG(F_CF); + } else if(isgreater(GX->d[0], EX->d[0])) { + CLEAR_FLAG(F_ZF); CLEAR_FLAG(F_PF); CLEAR_FLAG(F_CF); + } else if(isless(GX->d[0], EX->d[0])) { + CLEAR_FLAG(F_ZF); CLEAR_FLAG(F_PF); SET_FLAG(F_CF); + } else { + SET_FLAG(F_ZF); CLEAR_FLAG(F_PF); CLEAR_FLAG(F_CF); + } + CLEAR_FLAG(F_OF); CLEAR_FLAG(F_AF); CLEAR_FLAG(F_SF); + break; + + case 0x54: /* VANDPD Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GX->u128 = VX->u128 & EX->u128; + GETGY; + if(vex.l) { + GETEY; + GETVY; + GY->u128 = VY->u128 & EY->u128; + } else { + GY->u128 = 0; + } + break; + case 0x55: /* VANDNPD Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GX->u128 = (~VX->u128) & EX->u128; + GETGY; + if(vex.l) { + GETEY; + GETVY; + GY->u128 = (~VY->u128) & EY->u128; + } else { + GY->u128 = 0; + } + break; + + case 0x58: /* VADDPD Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GX->d[0] = VX->d[0] + EX->d[0]; + GX->d[1] = VX->d[1] + EX->d[1]; + GETGY; + if(vex.l) { + GETEY; + GETVY; + GY->d[0] = VY->d[0] + EY->d[0]; + GY->d[1] = VY->d[1] + EY->d[1]; + } else { + GY->u128 = 0; + } + break; + + case 0x5A: /* VCVTPD2PS Gx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETGY; + GX->f[0] = EX->d[0]; + GX->f[1] = EX->d[1]; + if(vex.l) { + GETEY; + GX->f[2] = EY->d[0]; + GX->f[3] = EY->d[1]; + } else + GX->q[1] = 0; + GY->u128 = 0; + break; + case 0x5B: /* VCVTPS2DQ Gx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETGY; + for(int i=0; i<4; ++i) { + if(isnanf(EX->f[i])) + tmp64s = INT32_MIN; + else + switch(emu->mxcsr.f.MXCSR_RC) { + case ROUND_Nearest: { + int round = fegetround(); + fesetround(FE_TONEAREST); + tmp64s = nearbyintf(EX->f[i]); + fesetround(round); + break; + } + case ROUND_Down: + tmp64s = floorf(EX->f[i]); + break; + case ROUND_Up: + tmp64s = ceilf(EX->f[i]); + break; + case ROUND_Chop: + tmp64s = EX->f[i]; + break; + } + if (tmp64s==(int32_t)tmp64s) { + GX->sd[i] = (int32_t)tmp64s; + } else { + GX->sd[i] = INT32_MIN; + } + } + if(vex.l) { + GETEY; + for(int i=0; i<4; ++i) { + if(isnanf(EY->f[i])) + tmp64s = INT32_MIN; + else + switch(emu->mxcsr.f.MXCSR_RC) { + case ROUND_Nearest: { + int round = fegetround(); + fesetround(FE_TONEAREST); + tmp64s = nearbyintf(EY->f[i]); + fesetround(round); + break; + } + case ROUND_Down: + tmp64s = floorf(EY->f[i]); + break; + case ROUND_Up: + tmp64s = ceilf(EY->f[i]); + break; + case ROUND_Chop: + tmp64s = EY->f[i]; + break; + } + if (tmp64s==(int32_t)tmp64s) { + GY->sd[i] = (int32_t)tmp64s; + } else { + GY->sd[i] = INT32_MIN; + } + } + } else + GY->u128 = 0; + break; + case 0x64: /* VPCMPGTB Gx,Vx, Ex */ nextop = F8; GETEX(0); @@ -107,21 +255,45 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) GY->q[0] = GY->q[1] = 0; break; - case 0x6C: /* VPUNPCKLQDQ Gx,E Vx, x */ + case 0x6B: /* VPACKSSDW Gx,Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GETGY; + for(int i=0; i<4; ++i) + GX->sw[i] = (VX->sd[i]<-32768)?-32768:((VX->sd[i]>32767)?32767:VX->sd[i]); + if(GX==EX) + GX->q[1] = GX->q[0]; + else + for(int i=0; i<4; ++i) + GX->sw[4+i] = (EX->sd[i]<-32768)?-32768:((EX->sd[i]>32767)?32767:EX->sd[i]); + if(vex.l) { + GETEY; + GETVY; + for(int i=0; i<4; ++i) + GY->sw[i] = (VY->sd[i]<-32768)?-32768:((VY->sd[i]>32767)?32767:VY->sd[i]); + if(GY==EY) + GY->q[1] = GY->q[0]; + else + for(int i=0; i<4; ++i) + GY->sw[4+i] = (EY->sd[i]<-32768)?-32768:((EY->sd[i]>32767)?32767:EY->sd[i]); + } else + GY->u128 = 0; + break; + case 0x6C: /* VPUNPCKLQDQ Gx, Vx, Ex */ nextop = F8; GETEX(0); GETGX; GETVX; GETGY; GX->q[1] = EX->q[0]; - if(GX!=VX) - GX->q[0] = VX->q[0]; + GX->q[0] = VX->q[0]; if(vex.l) { GETEY; GETVY; GY->q[1] = EY->q[0]; - if(GY!=VY) - GY->q[0] = VY->q[0]; + GY->q[0] = VY->q[0]; } else GY->q[0] = GY->q[1] = 0; break; @@ -178,7 +350,7 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) GETEY; if(EY==GY) {eay1 = *GY; EY = &eay1;} // copy is needed for (int i=0; i<4; ++i) - GY->ud[4+i] = EY->ud[4+((tmp8u>>(i*2))&3)]; + GY->ud[i] = EY->ud[(tmp8u>>(i*2))&3]; } else memset(GY, 0, 16); if(EX==GX) {eax1 = *GX; EX = &eax1;} // copy is needed @@ -366,6 +538,65 @@ uintptr_t RunAVX_660F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) } // no upper raz? break; + case 0xC2: /* CMPPD Gx, Vx, Ex, Ib */ + nextop = F8; + GETEX(1); + GETGX; + GETVX; + GETGY; + tmp8u = F8; + for(int i=0; i<2; ++i) { + tmp8s = 0; + switch(tmp8u&7) { + case 0: tmp8s=(VX->d[i] == EX->d[i]); break; + case 1: tmp8s=isless(VX->d[i], EX->d[i]); break; + case 2: tmp8s=islessequal(VX->d[i], EX->d[i]); break; + case 3: tmp8s=isnan(VX->d[i]) || isnan(EX->d[i]); break; + case 4: tmp8s=isnan(VX->d[i]) || isnan(EX->d[i]) || (VX->d[i] != EX->d[i]); break; + case 5: tmp8s=isnan(VX->d[i]) || isnan(EX->d[i]) || isgreaterequal(VX->d[i], EX->d[i]); break; + case 6: tmp8s=isnan(VX->d[i]) || isnan(EX->d[i]) || isgreater(VX->d[i], EX->d[i]); break; + case 7: tmp8s=!isnan(VX->d[i]) && !isnan(EX->d[i]); break; + } + GX->q[i]=(tmp8s)?0xffffffffffffffffLL:0LL; + } + if(vex.l) { + GETEY; + GETVY; + for(int i=0; i<2; ++i) { + tmp8s = 0; + switch(tmp8u&7) { + case 0: tmp8s=(VY->d[i] == EY->d[i]); break; + case 1: tmp8s=isless(VY->d[i], EY->d[i]); break; + case 2: tmp8s=islessequal(VY->d[i], EY->d[i]); break; + case 3: tmp8s=isnan(VY->d[i]) || isnan(EY->d[i]); break; + case 4: tmp8s=isnan(VY->d[i]) || isnan(EY->d[i]) || (VY->d[i] != EY->d[i]); break; + case 5: tmp8s=isnan(VY->d[i]) || isnan(EY->d[i]) || isgreaterequal(VY->d[i], EY->d[i]); break; + case 6: tmp8s=isnan(VY->d[i]) || isnan(EY->d[i]) || isgreater(VY->d[i], EY->d[i]); break; + case 7: tmp8s=!isnan(VY->d[i]) && !isnan(EY->d[i]); break; + } + GY->q[i]=(tmp8s)?0xffffffffffffffffLL:0LL; + } + } else + GY->u128 = 0; + break; + + case 0xD0: /* VADDSUBPD Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GETGY; + GX->d[0] = VX->d[0] - EX->d[0]; + GX->d[1] = VX->d[1] + EX->d[1]; + if(vex.l) { + GETEY; + GETVY; + GY->d[0] = VY->d[0] - EY->d[0]; + GY->d[1] = VY->d[1] + EY->d[1]; + } else + GY->u128 = 0; + break; + case 0xDB: /* VPAND Gx, Vx, Ex */ nextop = F8; GETEX(0); diff --git a/src/emu/x64runavx660f38.c b/src/emu/x64runavx660f38.c index ab01aba7..3ec1f0ff 100644 --- a/src/emu/x64runavx660f38.c +++ b/src/emu/x64runavx660f38.c @@ -30,6 +30,27 @@ #include "modrm.h" +static uint8_t ff_mult(uint8_t a, uint8_t b) +{ + int retval = 0; + + for(int i = 0; i < 8; i++) { + if((b & 1) == 1) + retval ^= a; + + if((a & 0x80)) { + a <<= 1; + a ^= 0x1b; + } else { + a <<= 1; + } + + b >>= 1; + } + + return retval; +} + #ifdef TEST_INTERPRETER uintptr_t TestAVX_660F38(x64test_t *test, vex_t vex, uintptr_t addr, int *step) #else @@ -47,6 +68,49 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) reg64_t *oped, *opgd; sse_regs_t *opex, *opgx, *opvx, eax1; sse_regs_t *opey, *opgy, *opvy, eay1; + // AES opcodes constants + // A0 B1 C2 D3 E4 F5 G6 H7 I8 J9 Ka Lb Mc Nd Oe Pf + // A F K P E J O D I N C H M B G L + const uint8_t shiftrows[] = {0, 5,10,15, 4, 9,14, 3, 8,13, 2, 7,12, 1, 6,11}; + const uint8_t subbytes[256] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, + }; + // A0 B1 C2 D3 E4 F5 G6 H7 I8 J9 Ka Lb Mc Nd Oe Pf + // A N K H E B O L I F C P M J G D + const uint8_t invshiftrows[] = {0,13,10, 7, 4, 1,14,11, 8, 5, 2,15,12, 9, 6, 3}; + const uint8_t invsubbytes[256] = { + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d, + }; #ifdef TEST_INTERPRETER @@ -90,6 +154,231 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) GY->q[0] = GY->q[1] = 0; break; + case 0x18: /* VBROADCASTSS Gx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETGY; + tmp32u = EX->ud[0]; + for(int i=0; i<4; ++i) + GX->ud[i] = tmp32u; + if(vex.l) { + for(int i=0; i<4; ++i) + GY->ud[i] = tmp32u; + } else + GY->u128 = 0; + break; + case 0x19: /* VBROADCASTSD Gx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETGY; + tmp64u = EX->q[0]; + for(int i=0; i<2; ++i) + GX->q[i] = tmp64u; + if(vex.l) { + for(int i=0; i<2; ++i) + GY->q[i] = tmp64u; + } else + GY->u128 = 0; + break; + case 0x1A: /* VBROADCASTF128 Gx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETGY; + GX->u128 = EX->u128; + GY->u128 = EX->u128; + break; + + case 0x2C: /*VMASKMOVPS Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GETGY; + for(int i=0; i<4; ++i) + GX->ud[i] = (VX->ud[i]>>31)?EX->ud[i]:0; + if(vex.l) { + GETEY; + GETVY; + for(int i=0; i<4; ++i) + GY->ud[i] = (VY->ud[i]>>31)?EY->ud[i]:0; + } else + GY->u128 = 0; + break; + + case 0x2E: /*VMASKMOVPS Ex, Vx, Gx */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + for(int i=0; i<4; ++i) + EX->ud[i] = (VX->ud[i]>>31)?GX->ud[i]:0; + if(vex.l) { + GETGY; + GETEY; + GETVY; + for(int i=0; i<4; ++i) + EY->ud[i] = (VY->ud[i]>>31)?GY->ud[i]:0; + } + break; + + case 0x5A: /* VBROADCASTI128 Gx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETGY; + GX->u128 = EX->u128; + GY->u128 = EX->u128; + break; + + case 0xDB: /* VAESIMC Gx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETGY; + //STATE ← InvMixColumns( STATE ); + if (EX == GX) { + for(int i=0; i<16; ++i) + eax1.ub[i] = EX->ub[i]; + for(int j=0; j<4; ++j) { + GX->ub[0+j*4] = ff_mult(0x0E, eax1.ub[0+j*4]) ^ ff_mult(0x0B, eax1.ub[1+j*4]) ^ ff_mult(0x0D, eax1.ub[2+j*4]) ^ ff_mult(0x09, eax1.ub[3+j*4]); + GX->ub[1+j*4] = ff_mult(0x09, eax1.ub[0+j*4]) ^ ff_mult(0x0E, eax1.ub[1+j*4]) ^ ff_mult(0x0B, eax1.ub[2+j*4]) ^ ff_mult(0x0D, eax1.ub[3+j*4]); + GX->ub[2+j*4] = ff_mult(0x0D, eax1.ub[0+j*4]) ^ ff_mult(0x09, eax1.ub[1+j*4]) ^ ff_mult(0x0E, eax1.ub[2+j*4]) ^ ff_mult(0x0B, eax1.ub[3+j*4]); + GX->ub[3+j*4] = ff_mult(0x0B, eax1.ub[0+j*4]) ^ ff_mult(0x0D, eax1.ub[1+j*4]) ^ ff_mult(0x09, eax1.ub[2+j*4]) ^ ff_mult(0x0E, eax1.ub[3+j*4]); + } + } else { + for(int j=0; j<4; ++j) { + GX->ub[0+j*4] = ff_mult(0x0E, EX->ub[0+j*4]) ^ ff_mult(0x0B, EX->ub[1+j*4]) ^ ff_mult(0x0D, EX->ub[2+j*4]) ^ ff_mult(0x09, EX->ub[3+j*4]); + GX->ub[1+j*4] = ff_mult(0x09, EX->ub[0+j*4]) ^ ff_mult(0x0E, EX->ub[1+j*4]) ^ ff_mult(0x0B, EX->ub[2+j*4]) ^ ff_mult(0x0D, EX->ub[3+j*4]); + GX->ub[2+j*4] = ff_mult(0x0D, EX->ub[0+j*4]) ^ ff_mult(0x09, EX->ub[1+j*4]) ^ ff_mult(0x0E, EX->ub[2+j*4]) ^ ff_mult(0x0B, EX->ub[3+j*4]); + GX->ub[3+j*4] = ff_mult(0x0B, EX->ub[0+j*4]) ^ ff_mult(0x0D, EX->ub[1+j*4]) ^ ff_mult(0x09, EX->ub[2+j*4]) ^ ff_mult(0x0E, EX->ub[3+j*4]); + } + } + GY->u128 = 0; + break; + case 0xDC: /* VAESENC Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GETGY; + //STATE ← SRC1; + //RoundKey ← SRC2; + //STATE ← ShiftRows( STATE ); + //STATE ← SubBytes( STATE ); + for(int i=0; i<16; ++i) + eax1.ub[i] = subbytes[VX->ub[shiftrows[i]]]; + //STATE ← MixColumns( STATE ); + for(int j=0; j<4; ++j) { + eay1.ub[0+j*4] = ff_mult(0x02, eax1.ub[0+j*4]) ^ ff_mult(0x03, eax1.ub[1+j*4]) ^ eax1.ub[2+j*4] ^ eax1.ub[3+j*4] ; + eay1.ub[1+j*4] = eax1.ub[0+j*4] ^ ff_mult(0x02, eax1.ub[1+j*4]) ^ ff_mult(0x03, eax1.ub[2+j*4]) ^ eax1.ub[3+j*4] ; + eay1.ub[2+j*4] = eax1.ub[0+j*4] ^ eax1.ub[1+j*4] ^ ff_mult(0x02, eax1.ub[2+j*4]) ^ ff_mult(0x03, eax1.ub[3+j*4]); + eay1.ub[3+j*4] = ff_mult(0x03, eax1.ub[0+j*4]) ^ eax1.ub[1+j*4] ^ eax1.ub[2+j*4] ^ ff_mult(0x02, eax1.ub[3+j*4]); + } + //DEST[127:0] ← STATE XOR RoundKey; + GX->u128 = eay1.u128 ^ EX->u128; + if(vex.l) { + GETEY; + GETVY; + for(int i=0; i<16; ++i) + eax1.ub[i] = subbytes[VY->ub[shiftrows[i]]]; + for(int j=0; j<4; ++j) { + eay1.ub[0+j*4] = ff_mult(0x02, eax1.ub[0+j*4]) ^ ff_mult(0x03, eax1.ub[1+j*4]) ^ eax1.ub[2+j*4] ^ eax1.ub[3+j*4] ; + eay1.ub[1+j*4] = eax1.ub[0+j*4] ^ ff_mult(0x02, eax1.ub[1+j*4]) ^ ff_mult(0x03, eax1.ub[2+j*4]) ^ eax1.ub[3+j*4] ; + eay1.ub[2+j*4] = eax1.ub[0+j*4] ^ eax1.ub[1+j*4] ^ ff_mult(0x02, eax1.ub[2+j*4]) ^ ff_mult(0x03, eax1.ub[3+j*4]); + eay1.ub[3+j*4] = ff_mult(0x03, eax1.ub[0+j*4]) ^ eax1.ub[1+j*4] ^ eax1.ub[2+j*4] ^ ff_mult(0x02, eax1.ub[3+j*4]); + } + GY->u128 = eay1.u128 ^ EY->u128; + } else + GY->u128 = 0; + break; + case 0xDD: /* VAESENCLAST Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GETGY; + //STATE ← SRC1; + //RoundKey ← SRC2; + //STATE ← ShiftRows( STATE ); + //STATE ← SubBytes( STATE ); + for(int i=0; i<16; ++i) + eax1.ub[i] = subbytes[VX->ub[shiftrows[i]]]; + //DEST[127:0] ← STATE XOR RoundKey; + GX->u128 = eax1.u128 ^ EX->u128; + if(vex.l) { + GETEY; + GETVY; + for(int i=0; i<16; ++i) + eax1.ub[i] = subbytes[VY->ub[shiftrows[i]]]; + GY->u128 = eax1.u128 ^ EY->u128; + } else + GY->u128 = 0; + break; + case 0xDE: /* VAESDEC Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GETGY; + //STATE ← SRC1; + //RoundKey ← SRC2; + //STATE ← InvShiftRows( STATE ); + for(int i=0; i<16; ++i) + eax1.ub[i] = VX->ub[invshiftrows[i]]; + //STATE ← InvSubBytes( STATE ); + for(int i=0; i<16; ++i) + eax1.ub[i] = invsubbytes[eax1.ub[i]]; + //STATE ← InvMixColumns( STATE ); + for(int j=0; j<4; ++j) { + eay1.ub[0+j*4] = ff_mult(0x0E, eax1.ub[0+j*4]) ^ ff_mult(0x0B, eax1.ub[1+j*4]) ^ ff_mult(0x0D, eax1.ub[2+j*4]) ^ ff_mult(0x09, eax1.ub[3+j*4]); + eay1.ub[1+j*4] = ff_mult(0x09, eax1.ub[0+j*4]) ^ ff_mult(0x0E, eax1.ub[1+j*4]) ^ ff_mult(0x0B, eax1.ub[2+j*4]) ^ ff_mult(0x0D, eax1.ub[3+j*4]); + eay1.ub[2+j*4] = ff_mult(0x0D, eax1.ub[0+j*4]) ^ ff_mult(0x09, eax1.ub[1+j*4]) ^ ff_mult(0x0E, eax1.ub[2+j*4]) ^ ff_mult(0x0B, eax1.ub[3+j*4]); + eay1.ub[3+j*4] = ff_mult(0x0B, eax1.ub[0+j*4]) ^ ff_mult(0x0D, eax1.ub[1+j*4]) ^ ff_mult(0x09, eax1.ub[2+j*4]) ^ ff_mult(0x0E, eax1.ub[3+j*4]); + } + //DEST[127:0] ← STATE XOR RoundKey; + GX->u128 = eay1.u128 ^ EX->u128; + if(vex.l) { + GETEY; + GETVY; + for(int i=0; i<16; ++i) + eax1.ub[i] = invsubbytes[VY->ub[invshiftrows[i]]]; + for(int j=0; j<4; ++j) { + eay1.ub[0+j*4] = ff_mult(0x0E, eax1.ub[0+j*4]) ^ ff_mult(0x0B, eax1.ub[1+j*4]) ^ ff_mult(0x0D, eax1.ub[2+j*4]) ^ ff_mult(0x09, eax1.ub[3+j*4]); + eay1.ub[1+j*4] = ff_mult(0x09, eax1.ub[0+j*4]) ^ ff_mult(0x0E, eax1.ub[1+j*4]) ^ ff_mult(0x0B, eax1.ub[2+j*4]) ^ ff_mult(0x0D, eax1.ub[3+j*4]); + eay1.ub[2+j*4] = ff_mult(0x0D, eax1.ub[0+j*4]) ^ ff_mult(0x09, eax1.ub[1+j*4]) ^ ff_mult(0x0E, eax1.ub[2+j*4]) ^ ff_mult(0x0B, eax1.ub[3+j*4]); + eay1.ub[3+j*4] = ff_mult(0x0B, eax1.ub[0+j*4]) ^ ff_mult(0x0D, eax1.ub[1+j*4]) ^ ff_mult(0x09, eax1.ub[2+j*4]) ^ ff_mult(0x0E, eax1.ub[3+j*4]); + } + GY->u128 = eay1.u128 ^ EY->u128; + } else + GY->u128 = 0; + break; + case 0xDF: /* VAESDECLAST Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GETGY; + //STATE ← SRC1; + //RoundKey ← SRC2; + //STATE ← InvShiftRows( STATE ); + //STATE ← InvSubBytes( STATE ); + for(int i=0; i<16; ++i) + eax1.ub[i] = invsubbytes[VX->ub[invshiftrows[i]]]; + //DEST[127:0] ← STATE XOR RoundKey; + GX->u128 = eax1.u128 ^ EX->u128; + if(vex.l) { + GETEY; + GETVY; + for(int i=0; i<16; ++i) + eax1.ub[i] = invsubbytes[VY->ub[invshiftrows[i]]]; + GY->u128 = eax1.u128 ^ EY->u128; + } else + GY->u128 = 0; + break; + default: return 0; } diff --git a/src/emu/x64runavx660f3a.c b/src/emu/x64runavx660f3a.c index db82a823..cc5e784d 100644 --- a/src/emu/x64runavx660f3a.c +++ b/src/emu/x64runavx660f3a.c @@ -59,6 +59,25 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) float tmpf; sse_regs_t *opex, *opgx, *opvx, eax1; sse_regs_t *opey, *opgy, *opvy, eay1; + // AES opcodes constants + const uint8_t subbytes[256] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, + }; #ifdef TEST_INTERPRETER @@ -70,7 +89,41 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) switch(opcode) { - case 0x0F: // VPALIGNR GX, VX, EX, u8 + case 0x0C: /* VBLENDPS Gx, Vx, Ex, u8 */ + nextop = F8; + GETEX(1); + GETGX; + GETVX; + GETGY; + tmp8u = F8; + for(int i=0; i<4; ++i) + GX->ud[i] = (tmp8u&(1<<i))?EX->ud[i]:VX->ud[i]; + if(vex.l) { + GETEY; + GETVY; + for(int i=0; i<4; ++i) + GY->ud[i] = (tmp8u&(1<<(i+4)))?EY->ud[i]:VY->ud[i]; + } else + GY->u128 = 0; + break; + case 0x0D: /* VBLENDPD Gx, Vx, Ex, u8 */ + nextop = F8; + GETEX(1); + GETGX; + GETVX; + GETGY; + tmp8u = F8; + for(int i=0; i<2; ++i) + GX->q[i] = (tmp8u&(1<<i))?EX->q[i]:VX->q[i]; + if(vex.l) { + GETEY; + GETVY; + for(int i=0; i<2; ++i) + GY->q[i] = (tmp8u&(1<<(i+2)))?EY->q[i]:VY->q[i]; + } else + GY->u128 = 0; + break; + case 0x0F: /* VPALIGNR GX, VX, EX, u8 */ nextop = F8; GETEX(1); GETGX; @@ -90,7 +143,7 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) GETEY; GETVY; if(tmp8u>31) - {GY->q[0] = GY->q[1] = 0;} + {GY->u128 = 0;} else { for (int i=0; i<16; ++i, ++tmp8u) @@ -99,10 +152,56 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) GY->q[1] = eax1.q[1]; } } else - GY->q[0] = GY->q[1] = 0; + GY->u128 = 0; break; - case 0x21: /* VINSRTPS Gx, Vx, Ex, imm8 */ + case 0x16: // VPEXTRD/Q ED, GX, u8 + nextop = F8; + GETED(1); + GETGX; + tmp8u = F8; + if(rex.w) { + ED->q[0] = GX->q[tmp8u&1]; + } else { + if(MODREG) + ED->q[0] = GX->ud[tmp8u&3]; + else + ED->dword[0] = GX->ud[tmp8u&3]; + } + break; + + case 0x18: /* VINSERTF128 Gx, Ex, imm8 */ + nextop = F8; + GETEX(1); + GETGX; + GETVX; + GETGY; + GETVY; + tmp8u = F8; + if(tmp8u&1) { + GY->u128 = EX->u128; + if(GX!=VX); + GX->u128 = VX->u128; + } else { + GX->u128 = EX->u128; + if(GY!=VY) + GY->u128 = VY->u128; + } + break; + case 0x19: /* VEXTRACT128 Ex, Gx, imm8 */ + nextop = F8; + GETEX(1); + GETGX; + GETGY; + tmp8u = F8; + EX->u128 = (tmp8u&1)?GY->u128:GX->u128; + if(MODREG) { + GETEY; + EY->u128 = 0; + } + break; + + case 0x21: /* VINSERTPS Gx, Vx, Ex, imm8 */ nextop = F8; GETGX; GETEX(1); @@ -114,8 +213,8 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) } else tmp32u = EX->ud[0]; for(int i=0; i<4; ++i) - GX->ud[i] = (tmp8u&(1<<i))?((i==((tmp8u>>4)&3))?tmp32u:VX->ud[i]):0; - GY->q[0] = GY->q[1] = 0; + GX->ud[i] = (tmp8u&(1<<i))?0:((i==((tmp8u>>4)&3))?tmp32u:VX->ud[i]); + GY->u128 = 0; break; case 0x40: /* DPPS Gx, Ex, Ib */ @@ -139,7 +238,7 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) for(int i=0; i<4; ++i) GY->f[i] = (tmp8u&(1<<i))?tmpf:0.0f; } else - GY->q[0] = GY->q[1] = 0; + GY->u128 = 0; break; case 0x44: /* VPCLMULQDQ Gx, Vx, Ex, imm8 */ @@ -155,7 +254,65 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) GETEY; GY->u128 = pclmul_helper(VY->q[tmp8u&1], EY->q[(tmp8u>>4)&1]); } else - GY->q[0] = GY->q[1] = 0; + GY->u128 = 0; + break; + + case 0x4A: /* VBLENDVPS Gx, Vx, Ex, XMMImm8 */ + nextop = F8; + GETEX(1); + GETGX; + GETVX; + GETGY; + tmp8u = (F8)>>4; + for(int i=0; i<4; ++i) + GX->ud[i] = (emu->xmm[tmp8u].ud[i]>>31)?EX->ud[i]:VX->ud[i]; + if(vex.l) { + GETEY; + GETVY; + for(int i=0; i<4; ++i) + GY->ud[i] = (emu->ymm[tmp8u].ud[i]>>31)?EY->ud[i]:VY->ud[i]; + } else + GY->u128 = 0; + break; + case 0x4B: /* VBLENDVPD Gx, Vx, Ex, XMMImm8 */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GETGY; + tmp8u = (F8)>>4; + for(int i=0; i<2; ++i) + GX->q[i] = (emu->xmm[tmp8u].q[i]>>63)?EX->q[i]:VX->q[i]; + if(vex.l) { + GETEY; + GETVY; + for(int i=0; i<2; ++i) + GY->q[i] = (emu->ymm[tmp8u].q[i]>>63)?EY->q[i]:VY->q[i]; + } else + GY->u128 = 0; + break; + + case 0xDF: // VAESKEYGENASSIST Gx, Ex, u8 + nextop = F8; + GETEX(1); + GETGX; + tmp32u = F8; + for (int i = 4; i < 8; ++i) + GX->ub[i] = subbytes[EX->ub[i]]; + for (int i = 12; i < 16; ++i) + GX->ub[i] = subbytes[EX->ub[i]]; + GX->ud[0] = GX->ud[1]; + tmp8u = GX->ub[4]; + GX->ud[1] = GX->ud[1] >> 8; + GX->ub[7] = tmp8u; + GX->ud[1] ^= tmp32u; + GX->ud[2] = GX->ud[3]; + tmp8u = GX->ub[12]; + GX->ud[3] = GX->ud[3] >> 8; + GX->ub[15] = tmp8u; + GX->ud[3] ^= tmp32u; + GETGY; + GY->u128 = 0; break; default: diff --git a/src/emu/x64runavxf20f.c b/src/emu/x64runavxf20f.c index 1bb03d72..642946b5 100644 --- a/src/emu/x64runavxf20f.c +++ b/src/emu/x64runavxf20f.c @@ -44,6 +44,7 @@ uintptr_t RunAVX_F20F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) uint32_t tmp32u, tmp32u2; uint64_t tmp64u, tmp64u2; int64_t tmp64s; + int64_t tmp64s0, tmp64s1; reg64_t *oped, *opgd; sse_regs_t *opex, *opgx, *opvx, eax1; sse_regs_t *opey, *opgy, *opvy, eay1; @@ -70,9 +71,9 @@ uintptr_t RunAVX_F20F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) GX->q[1] = 0; } GETGY; - GY->q[0] = GY->q[1] = 0; + GY->u128 = 0; break; - case 0x11: /* MOVSS Ex Gx */ + case 0x11: /* MOVSD Ex Gx */ nextop = F8; GETEX(0); GETGX; @@ -81,10 +82,143 @@ uintptr_t RunAVX_F20F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) GETVX; EX->q[1] = VX->q[1]; GETEY; - EY->q[0] = EY->q[1] = 0; + EY->u128 = 0; } break; - + + case 0x58: /* VADDSD Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GETGY; + GX->d[0] = VX->d[0] + EX->d[0]; + if(GX!=VX) { + GX->q[1] = VX->q[1]; + } + GY->u128 = 0; + break; + + case 0xC2: /* VCMPSD Gx, Vx, Ex, Ib */ + nextop = F8; + GETEX(1); + GETGX; + GETVX; + GETGY; + tmp8u = F8; + tmp8s = 0; + switch(tmp8u&7) { + case 0: tmp8s=(VX->d[0] == EX->d[0]); break; + case 1: tmp8s=isless(VX->d[0], EX->d[0]) && !(isnan(VX->d[0]) || isnan(EX->d[0])); break; + case 2: tmp8s=islessequal(VX->d[0], EX->d[0]) && !(isnan(VX->d[0]) || isnan(EX->d[0])); break; + case 3: tmp8s=isnan(VX->d[0]) || isnan(EX->d[0]); break; + case 4: tmp8s=isnan(VX->d[0]) || isnan(EX->d[0]) || (VX->d[0] != EX->d[0]); break; + case 5: tmp8s=isnan(VX->d[0]) || isnan(EX->d[0]) || isgreaterequal(VX->d[0], EX->d[0]); break; + case 6: tmp8s=isnan(VX->d[0]) || isnan(EX->d[0]) || isgreater(VX->d[0], EX->d[0]); break; + case 7: tmp8s=!isnan(VX->d[0]) && !isnan(EX->d[0]); break; + } + GX->q[0]=(tmp8s)?0xffffffffffffffffLL:0LL; + GX->q[1] = VX->q[1]; + GY->u128 = 0; + break; + + case 0xD0: /* VADDSUBPS Gx, Vx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETVX; + GETGY; + GX->f[0] = VX->f[0] - EX->f[0]; + GX->f[1] = VX->f[1] + EX->f[1]; + GX->f[2] = VX->f[2] - EX->f[2]; + GX->f[3] = VX->f[3] + EX->f[3]; + if(vex.l) { + GETEY; + GETVY; + GY->f[0] = VY->f[0] - EY->f[0]; + GY->f[1] = VY->f[1] + EY->f[1]; + GY->f[2] = VY->f[2] - EY->f[2]; + GY->f[3] = VY->f[3] + EY->f[3]; + } else + GY->u128 = 0; + break; + + case 0xE6: /* CVTPD2DQ Gx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETGY; + switch(emu->mxcsr.f.MXCSR_RC) { + case ROUND_Nearest: { + int round = fegetround(); + fesetround(FE_TONEAREST); + tmp64s0 = nearbyint(EX->d[0]); + tmp64s1 = nearbyint(EX->d[1]); + fesetround(round); + break; + } + case ROUND_Down: + tmp64s0 = floor(EX->d[0]); + tmp64s1 = floor(EX->d[1]); + break; + case ROUND_Up: + tmp64s0 = ceil(EX->d[0]); + tmp64s1 = ceil(EX->d[1]); + break; + case ROUND_Chop: + tmp64s0 = EX->d[0]; + tmp64s1 = EX->d[1]; + break; + } + if (tmp64s0==(int32_t)tmp64s0 && !isnan(EX->d[0])) { + GX->sd[0] = (int32_t)tmp64s0; + } else { + GX->sd[0] = INT32_MIN; + } + if (tmp64s1==(int32_t)tmp64s1 && !isnan(EX->d[1])) { + GX->sd[1] = (int32_t)tmp64s1; + } else { + GX->sd[1] = INT32_MIN; + } + if(vex.l) { + GETEY; + switch(emu->mxcsr.f.MXCSR_RC) { + case ROUND_Nearest: { + int round = fegetround(); + fesetround(FE_TONEAREST); + tmp64s0 = nearbyint(EY->d[0]); + tmp64s1 = nearbyint(EY->d[1]); + fesetround(round); + break; + } + case ROUND_Down: + tmp64s0 = floor(EY->d[0]); + tmp64s1 = floor(EY->d[1]); + break; + case ROUND_Up: + tmp64s0 = ceil(EY->d[0]); + tmp64s1 = ceil(EY->d[1]); + break; + case ROUND_Chop: + tmp64s0 = EY->d[0]; + tmp64s1 = EY->d[1]; + break; + } + if (tmp64s0==(int32_t)tmp64s0 && !isnan(EY->d[0])) { + GX->sd[2] = (int32_t)tmp64s0; + } else { + GX->sd[2] = INT32_MIN; + } + if (tmp64s1==(int32_t)tmp64s1 && !isnan(EY->d[1])) { + GX->sd[3] = (int32_t)tmp64s1; + } else { + GX->sd[3] = INT32_MIN; + } + } else + GX->q[1] = 0; + GY->u128 = 0; + break; + default: return 0; } diff --git a/src/emu/x64runavxf30f.c b/src/emu/x64runavxf30f.c index 73180d0a..98fb8b4d 100644 --- a/src/emu/x64runavxf30f.c +++ b/src/emu/x64runavxf30f.c @@ -68,7 +68,6 @@ uintptr_t RunAVX_F30F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) GX->ud[1] = VX->ud[1]; GX->q[1] = VX->q[1]; } else { - // EX is not a register (reg to reg only move 31:0) GX->ud[1] = GX->ud[2] = GX->ud[3] = 0; } GETGY; @@ -152,6 +151,46 @@ uintptr_t RunAVX_F30F(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step) } // no ymm raz here it seems break; + case 0xC2: /* VCMPSS Gx, Vx, Ex, Ib */ + nextop = F8; + GETEX(1); + GETGX; + GETVX; + GETGY; + tmp8u = F8; + tmp8s = 0; + switch(tmp8u&7) { + case 0: tmp8s=(VX->f[0] == EX->f[0]); break; + case 1: tmp8s=isless(VX->f[0], EX->f[0]) && !(isnan(VX->f[0]) || isnan(EX->f[0])); break; + case 2: tmp8s=islessequal(VX->f[0], EX->f[0]) && !(isnan(VX->f[0]) || isnan(EX->f[0])); break; + case 3: tmp8s=isnan(VX->f[0]) || isnan(EX->f[0]); break; + case 4: tmp8s=isnan(VX->f[0]) || isnan(EX->f[0]) || (VX->f[0] != EX->f[0]); break; + case 5: tmp8s=isnan(VX->f[0]) || isnan(EX->f[0]) || isgreaterequal(VX->f[0], EX->f[0]); break; + case 6: tmp8s=isnan(VX->f[0]) || isnan(EX->f[0]) || isgreater(VX->f[0], EX->f[0]); break; + case 7: tmp8s=!isnan(VX->f[0]) && !isnan(EX->f[0]); break; + } + GX->ud[0]=(tmp8s)?0xffffffff:0; + if(GX!=VX) { + GX->ud[1] = VX->ud[1]; + GX->q[1] = VX->q[1]; + } + GY->u128 = 0; + break; + + case 0xE6: /* VCVTDQ2PD Gx, Ex */ + nextop = F8; + GETEX(0); + GETGX; + GETGY; + if(vex.l) { + GY->d[1] = EX->sd[3]; + GY->d[0] = EX->sd[2]; + } else + GY->u128 = 0; + GX->d[1] = EX->sd[1]; + GX->d[0] = EX->sd[0]; + break; + default: return 0; } diff --git a/src/tools/my_cpuid.c b/src/tools/my_cpuid.c index 2ed0e3a6..7c54a379 100644 --- a/src/tools/my_cpuid.c +++ b/src/tools/my_cpuid.c @@ -325,6 +325,7 @@ void my_cpuid(x64emu_t* emu, uint32_t tmp32u) //1<<3 | // BMI1 box64_avx2<<5 | //AVX2 //1<<8 | //BMI2 + box64_avx2<<9 | //VAES 1<<29| // SHA extension 0; } else {R_EAX = R_ECX = R_EBX = R_EDX = 0;} diff --git a/src/wrapped/wrappedlibc.c b/src/wrapped/wrappedlibc.c index 51d7557f..e547d523 100644 --- a/src/wrapped/wrappedlibc.c +++ b/src/wrapped/wrappedlibc.c @@ -1639,7 +1639,7 @@ void CreateCPUInfoFile(int fd) P; sprintf(buff, "bogomips\t: %g\n", getBogoMips()); P; - sprintf(buff, "flags\t\t: fpu cx8 sep ht cmov clflush mmx sse sse2 syscall tsc lahf_lm ssse3 ht tm lm fxsr cpuid pclmulqdq cx16 aes movbe pni sse4_1%s%s lzcnt popcnt%s\n", box64_sse42?" sse4_2":"", box64_avx?" avx":"", box64_avx2?" avx2":""); + sprintf(buff, "flags\t\t: fpu cx8 sep ht cmov clflush mmx sse sse2 syscall tsc lahf_lm ssse3 ht tm lm fxsr cpuid pclmulqdq cx16 aes movbe pni sse4_1%s%s lzcnt popcnt%s%s\n", box64_sse42?" sse4_2":"", box64_avx?" avx":"", box64_avx2?" avx2":"", box64_avx2?" vaes":""); P; sprintf(buff, "address sizes\t: 48 bits physical, 48 bits virtual\n"); P; |