 src/dynarec/arm64/dynarec_arm64_660f.c        |  2 --
 src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c | 85 ++++++++++++++++++++++
 src/dynarec/arm64/dynarec_arm64_helper.c      | 22 ++++++
 src/dynarec/arm64/dynarec_arm64_helper.h      |  4 ++--
 src/dynarec/dynarec_native_functions.c        | 34 ++++++++++
 src/dynarec/dynarec_native_functions.h        |  2 ++
 6 files changed, 145 insertions(+), 4 deletions(-)
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index 903c427c..79173acf 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -2894,7 +2894,6 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEX(q0, 0, 0);
             VBICQ(v0, q0, v0);
             break;
-
         case 0xE0:
             INST_NAME("PAVGB Gx, Ex");
             nextop = F8;
@@ -2902,7 +2901,6 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEX(v1, 0, 0);
             URHADDQ_8(v0, v0, v1);
             break;
-
        case 0xE1:
             INST_NAME("PSRAW Gx,Ex");
             nextop = F8;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index ee1fa401..e5894ce1 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -107,6 +107,29 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             } else YMM0(gd);
             break;
 
+        case 0x0F:
+            INST_NAME("VPALIGNR Gx, Vx, Ex, Ib");
+            nextop = F8;
+            d0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) {
+                    GETGX_empty_VXEX(v0, v2, v1, 1);
+                    u8 = F8;
+                } else {
+                    GETGY_empty_VYEY(v0, v2, v1);
+                }
+                if(u8>31) {
+                    VEORQ(v0, v0, v0);
+                } else if(u8>15) {
+                    if(!l) VEORQ(d0, d0, d0);
+                    VEXTQ_8(v0, v2, d0, u8-16);
+                } else {
+                    VEXTQ_8(v0, v1, v2, u8);
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
         case 0x15:
             INST_NAME("VPEXTRW Ed, Gx, imm8");
             nextop = F8;
@@ -198,6 +221,68 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             F8; // read u8, but it's been already handled
             break;
 
+        case 0x44:
+            INST_NAME("PCLMULQDQ Gx, Vx, Ex, Ib");
+            nextop = F8;
+            if(arm64_pmull) {
+                d0 = fpu_get_scratch(dyn, ninst);
+                for(int l=0; l<1+vex.l; ++l) {
+                    if(!l) {
+                        GETGX_empty_VXEX(v0, v2, v1, 1);
+                        u8 = F8;
+                    } else {
+                        GETGY_empty_VYEY(v0, v2, v1);
+                    }
+                    switch (u8&0b00010001) {
+                        case 0b00000000:
+                            PMULL_128(v0, v2, v1);
+                            break;
+                        case 0b00010001:
+                            PMULL2_128(v0, v2, v1);
+                            break;
+                        case 0b00000001:
+                            VEXTQ_8(d0, v2, v2, 8); // Swap Up/Lower 64bits parts
+                            PMULL_128(v0, d0, v1);
+                            break;
+                        case 0b00010000:
+                            VEXTQ_8(d0, v2, v2, 8); // Swap Up/Lower 64bits parts
+                            PMULL2_128(v0, d0, v1);
+                            break;
+                    }
+                }
+            } else {
+                for(int l=0; l<1+vex.l; ++l) {
+                    if(!l) {
+                        GETG;
+                        sse_forget_reg(dyn, ninst, gd);
+                        sse_reflect_reg(dyn, ninst, vex.v);
+                    }
+                    MOV32w(x1, gd);    // gx
+                    MOV32w(x2, vex.v); // vx
+                    if(MODREG) {
+                        if(!l) {
+                            ed = (nextop&7)+(rex.b<<3);
+                            sse_forget_reg(dyn, ninst, ed);
+                        }
+                        MOV32w(x3, ed);
+                    } else {
+                        if(!l) {
+                            addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
+                            if(ed!=x3) {
+                                MOVx_REG(x3, ed);
+                            }
+                        } else {
+                            ADDx_U12(x3, ed, 16);
+                        }
+                    }
+                    if(!l) u8 = F8;
+                    MOV32w(x4, u8);
+                    CALL_(l?native_pclmul_y:native_pclmul_x, -1, x3);
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
         default:
             DEFAULT;
    }
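For context on the three branches in the VPALIGNR case: the instruction concatenates Vx (high half) and Ex (low half) into a 256-bit value per 128-bit lane and extracts 16 bytes starting at byte offset imm8, shifting in zeros past the top. A standalone reference model of one lane (illustrative C, palignr128 is not a box64 name):

    #include <stdint.h>
    #include <string.h>

    // One 128-bit lane of (V)PALIGNR dst, src1, src2, imm8:
    // take bytes [imm8 .. imm8+15] of the 32-byte value src1:src2
    // (src1 is the high half), with zeros beyond the top.
    static void palignr128(uint8_t dst[16], const uint8_t src1[16],
                           const uint8_t src2[16], unsigned imm8)
    {
        uint8_t cat[48] = {0};       // src2 low, src1 high, then 16 zero bytes
        memcpy(cat,      src2, 16);
        memcpy(cat + 16, src1, 16);
        if (imm8 > 32) imm8 = 32;    // any window past the top is all zeros
        memcpy(dst, cat + imm8, 16);
    }

With imm8 <= 15 the window stays inside src2:src1, which is the plain VEXTQ_8(v0, v1, v2, u8); for 16..31 only src1 bytes and zeros remain, hence the VEXTQ_8 against the zeroed scratch; above 31 the result is all zeros, hence the VEORQ.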
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 2d08510e..908b757d 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -1630,6 +1630,22 @@ void sse_forget_reg(dynarec_arm_t* dyn, int ninst, int a)
         if(dyn->n.neoncache[dyn->n.ssecache[a].reg].t == NEON_CACHE_XMMW) {
             VSTR128_U12(dyn->n.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
         }
+        // YMM part too
+        if(is_avx_zero_unset(dyn, ninst, a)) {
+            //only ymm[0] can be accessed with STP :(
+            if(!a)
+                STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[a]));
+            else {
+                STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a]));
+                STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a])+8);
+            }
+        } else for(int i=0; i<32; ++i)
+            if((dyn->n.neoncache[i].t == NEON_CACHE_YMMW) || (dyn->n.neoncache[i].t == NEON_CACHE_YMMR)) {
+                if(dyn->n.neoncache[i].t == NEON_CACHE_YMMW)
+                    VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n]));
+                fpu_free_reg(dyn, i);
+            }
         fpu_free_reg(dyn, dyn->n.ssecache[a].reg);
         dyn->n.ssecache[a].v = -1;
         return;
@@ -1725,6 +1741,9 @@ static void sse_reflectcache(dynarec_arm_t* dyn, int ninst, int s1)
         }
         STPx_S7_offset(xZR, xZR, s1, i*16);
     }
+    for(int i=0; i<32; ++i)
+        if(dyn->n.neoncache[i].t == NEON_CACHE_YMMW)
+            VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n]));
 }
 
@@ -1738,6 +1757,9 @@ void sse_reflect_reg(dynarec_arm_t* dyn, int ninst, int a)
             STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a]));
             STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a])+8);
         }
+    } else for(int i=0; i<32; ++i)
+        if((dyn->n.neoncache[i].t == NEON_CACHE_YMMW) && (dyn->n.neoncache[i].n == a)) {
+            VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[a]));
     }
     if(dyn->n.ssecache[a].v==-1)
         return;
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index e328d255..872a86fb 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -1476,11 +1476,11 @@ int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int
 int sse_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite);
 // get neon register for a SSE reg, but don't try to synch it if it needed to be created
 int sse_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a);
-// forget neon register for a SSE reg, create the entry if needed
+// forget neon register for a SSE reg, YMM high part too
 void sse_forget_reg(dynarec_arm_t* dyn, int ninst, int a);
 // purge the XMM0..XMM7 cache (before function call)
 void sse_purge07cache(dynarec_arm_t* dyn, int ninst, int s1);
-// Push current value to the cache
+// Push current value to the cache (ymm too)
 void sse_reflect_reg(dynarec_arm_t* dyn, int ninst, int a);
 // common coproc helpers
 // reset the cache with n
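The native_pclmul_x/native_pclmul_y helpers added below are the fallback when the PMULL extension is absent: a bitwise carry-less multiply over GF(2), with imm8 bit 0 selecting the 64-bit half of the V source and bit 4 the half of the E source. A minimal standalone sketch of the operation itself (clmul64 is an illustrative name, not part of the patch):

    #include <stdint.h>

    // Carry-less multiply: polynomial multiplication over GF(2), i.e. ordinary
    // long multiplication where partial products are XORed instead of added,
    // so no carries propagate between bit positions.
    static __int128 clmul64(uint64_t a, uint64_t b)
    {
        __int128 r = 0;
        for (int i = 0; i < 64; ++i)
            if (a & (1ULL << i))        // for every set bit of a...
                r ^= (__int128)b << i;  // ...XOR in b shifted by that bit
        return r;
    }

This is the same loop the helpers run. The PMULL path gets the identical product in one instruction: PMULL pairs the two low halves and PMULL2 the two high halves, so when imm8 selects a mismatched pair, the VEXT above first swaps the halves of one operand.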
diff --git a/src/dynarec/dynarec_native_functions.c b/src/dynarec/dynarec_native_functions.c
index 10d6f333..e0e0f3ff 100644
--- a/src/dynarec/dynarec_native_functions.c
+++ b/src/dynarec/dynarec_native_functions.c
@@ -416,6 +416,40 @@ void native_pclmul(x64emu_t* emu, int gx, int ex, void* p, uint32_t u8)
     GX->q[0] = result&0xffffffffffffffffLL;
     GX->q[1] = (result>>64)&0xffffffffffffffffLL;
 }
+void native_pclmul_x(x64emu_t* emu, int gx, int vx, void* p, uint32_t u8)
+{
+
+    sse_regs_t *EX = ((uintptr_t)p<16)?&emu->xmm[(uintptr_t)p]:(sse_regs_t*)p;
+    sse_regs_t *GX = &emu->xmm[gx];
+    sse_regs_t *VX = &emu->xmm[vx];
+    int g = (u8&1)?1:0;
+    int e = (u8&0b10000)?1:0;
+    __int128 result = 0;
+    __int128 op2 = EX->q[e];
+    for (int i=0; i<64; ++i)
+        if(VX->q[g]&(1LL<<i))
+            result ^= (op2<<i);
+
+    GX->q[0] = result&0xffffffffffffffffLL;
+    GX->q[1] = (result>>64)&0xffffffffffffffffLL;
+}
+void native_pclmul_y(x64emu_t* emu, int gy, int vy, void* p, uint32_t u8)
+{
+
+    sse_regs_t *EY = ((uintptr_t)p<16)?&emu->ymm[(uintptr_t)p]:(sse_regs_t*)p;
+    sse_regs_t *GY = &emu->ymm[gy];
+    sse_regs_t *VY = &emu->ymm[vy];
+    int g = (u8&1)?1:0;
+    int e = (u8&0b10000)?1:0;
+    __int128 result = 0;
+    __int128 op2 = EY->q[e];
+    for (int i=0; i<64; ++i)
+        if(VY->q[g]&(1LL<<i))
+            result ^= (op2<<i);
+
+    GY->q[0] = result&0xffffffffffffffffLL;
+    GY->q[1] = (result>>64)&0xffffffffffffffffLL;
+}
 void native_clflush(x64emu_t* emu, void* p)
 {
diff --git a/src/dynarec/dynarec_native_functions.h b/src/dynarec/dynarec_native_functions.h
index 3e81081b..5085e9e1 100644
--- a/src/dynarec/dynarec_native_functions.h
+++ b/src/dynarec/dynarec_native_functions.h
@@ -42,6 +42,8 @@ void native_aeselast(x64emu_t* emu, int xmm);
 void native_aesimc(x64emu_t* emu, int xmm);
 void native_aeskeygenassist(x64emu_t* emu, int gx, int ex, void* p, uint32_t u8);
 void native_pclmul(x64emu_t* emu, int gx, int ex, void* p, uint32_t u8);
+void native_pclmul_x(x64emu_t* emu, int gx, int vx, void* p, uint32_t u8);
+void native_pclmul_y(x64emu_t* emu, int gy, int vy, void* p, uint32_t u8);
 void native_clflush(x64emu_t* emu, void* p);
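A quick sanity check of those semantics, useful when testing the helpers: over GF(2), (x+1)*(x+1) = x^2+1, so a carry-less multiply of 0x3 by 0x3 yields 0x5 where an integer multiply would give 9; squaring in general just spreads the operand's bits out with zeros, e.g. 0xFF carry-less-squared is 0x5555 rather than 0xFE01.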