| author | ptitSeb <sebastien.chev@gmail.com> | 2024-06-05 20:40:26 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2024-06-05 20:40:26 +0200 |
| commit | 79c1ad1431277d17cc7ce34f255b0af4c23ccbaa (patch) | |
| tree | 9b59ff894dd7bc771e31ca841c7c6349c4ce69b3 /src | |
| parent | c0ebe095213b5048b54ff41d0d5550750af2cbdb (diff) | |
| download | box64-79c1ad1431277d17cc7ce34f255b0af4c23ccbaa.tar.gz box64-79c1ad1431277d17cc7ce34f255b0af4c23ccbaa.zip | |
[ARM64_DYNAREC] Added a few more AVX opcodes
Diffstat (limited to 'src')
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_0f.c | 11 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f.c | 32 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c | 10 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c | 29 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c | 64 |
5 files changed, 138 insertions, 8 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
index 268e1f86..044c131d 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -320,7 +320,16 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
                 }
             }
             break;
-
+        case 0x51:
+            INST_NAME("VSQRTPS Gx, Ex");
+            nextop = F8;
+            SKIPTEST(x1);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_EX(q0, q1, 0); } else { GETGY_empty_EY(q0, q1); }
+                VFSQRTQS(q0, q1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
         case 0x52:
             INST_NAME("VRSQRTPS Gx, Ex");
             nextop = F8;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
index b7a3f80a..c9243180 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
@@ -1396,6 +1396,38 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             if(!vex.l) YMM0(gd);
             break;
+        case 0xE1:
+            INST_NAME("VPSRAW Gx, Vx, Ex");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            q1 = fpu_get_scratch(dyn, ninst);
+            MOVI_32(q1, 15);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                UQXTN_32(q0, v1);
+                UMIN_32(q0, q0, q1);    // limit to 0 .. +15 values
+                NEG_16(q0, q0);
+                VDUPQ_16(q0, q0, 0);    // only the low 8 bits will be used anyway
+                SSHLQ_16(v0, v2, q0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+        case 0xE2:
+            INST_NAME("VPSRAD Gx, Vx, Ex");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            q1 = fpu_get_scratch(dyn, ninst);
+            MOVI_32(q1, 31);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                UQXTN_32(q0, v1);
+                UMIN_32(q0, q0, q1);    // limit to 0 .. +31 values
+                NEG_32(q0, q0);
+                VDUPQ_32(q0, q0, 0);    // only the low 8 bits will be used anyway
+                SSHLQ_32(v0, v2, q0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
         case 0xE4:
             INST_NAME("VPMULHUW Gx, Vx, Ex");
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index f3ea8f41..783b77c2 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -77,6 +77,16 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             if(!vex.l) YMM0(gd);
             break;
+        case 0x02:
+            INST_NAME("VPHADDD Gx, Vx, Ex");
+            nextop = F8;
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); }
+                VADDPQ_32(v0, v2, v1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
         case 0x08:
             INST_NAME("VPSIGNB Gx, Vx, Ex");
             nextop = F8;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index 371fe25a..cdbe93f6 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -647,6 +647,35 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             }
             if(!vex.l) YMM0(gd);
             break;
+        case 0x4C:
+            INST_NAME("VBLENDPVB Gx, Vx, Ex, XMMImm8");
+            nextop = F8;
+            q0 = fpu_get_scratch(dyn, ninst);
+            u8 = geted_ib(dyn, addr, ninst, nextop)>>4;
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) {
+                    q1 = sse_get_reg(dyn, ninst, x1, u8, 0);
+                    GETGX_empty_VXEX(v0, v2, v1, 1);
+                    F8;
+                } else {
+                    v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, u8, (MODREG)?((nextop&7)+(rex.b<<3)):-1);
+                    if(MODREG)
+                        v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, vex.v, u8);
+                    else
+                        VLDR128_U12(v1, ed, fixedaddress+16);
+                    q1 = ymm_get_reg(dyn, ninst, x1, u8, 0, vex.v, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1);
+                    v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, u8, (MODREG)?((nextop&7)+(rex.b<<3)):-1);
+                }
+                VSSHRQ_8(q0, q1, 7);    // create mask
+                if(v0==v1)
+                    VBIFQ(v0, v2, q0);
+                else {
+                    if(v0!=v2) VMOVQ(v0, v2);
+                    VBITQ(v0, v1, q0);
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
         default:
             DEFAULT;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
index 11aee1d2..8e2ed65c 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
@@ -39,7 +39,7 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
     int q0, q1, q2;
     int d0, d1, d2;
     int s0;
-    uint64_t tmp64u;
+    uint64_t tmp64u, u64;
     int64_t j64;
     int64_t fixedaddress;
     int unscaled;
@@ -375,6 +375,34 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             if(!vex.l) YMM0(gd);
             break;
+        case 0x70:
+            INST_NAME("VPSHUFHW Gx, Ex, Ib");
+            nextop = F8;
+            d0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETEX(v1, 0, 1); GETGX(v0, 1); u8 = F8; } else { GETGY(v0, 1, MODREG?((nextop&7)+(rex.b<<3)):-1, -1, -1); GETEY(v1); }
+                if(u8==0b00000000 || u8==0b01010101 || u8==0b10101010 || u8==0b11111111) {
+                    VDUP_16(d0, v1, 4+(u8&3));
+                } else {
+                    // only the high part needs to be shuffled; VTBL only handles 8-bit values, so the 16-bit shuffle is rebuilt from byte indices
+                    if(!l) {
+                        u64 = 0;
+                        for (int i=0; i<4; ++i) {
+                            u64 |= ((uint64_t)((u8>>(i*2))&3)*2+8)<<(i*16+0);
+                            u64 |= ((uint64_t)((u8>>(i*2))&3)*2+9)<<(i*16+8);
+                        }
+                        MOV64x(x2, u64);
+                    }
+                    VMOVQDfrom(d0, 0, x2);
+                    VTBL1_8(d0, v1, d0);
+                }
+                VMOVeD(v0, 1, d0, 0);
+                if(v0!=v1) {
+                    VMOVeD(v0, 0, v1, 0);
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
         case 0x7E:
             INST_NAME("MOVQ Gx, Ex");
@@ -420,10 +448,13 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             GETEXSS(v1, 0, 1);
             GETGX_empty_VX(v0, v2);
             u8 = F8;
-            if(((u8&15)==12)||((u8&15)==13)||((u8&15)==9)||((u8&15)==10))
-                FCMPS(v1, v2);
-            else
-                FCMPS(v2, v1);
+            if(((u8&15)!=12) && ((u8&15)!=15)) {
+                if(((u8&15)==12)||((u8&15)==13)||((u8&15)==9)||((u8&15)==10))
+                    FCMPS(v1, v2);
+                else
+                    FCMPS(v2, v1);
+            }
+            // TODO: create a test for this one, there might be an issue with cases 9, 10 and 13
             if(v0!=v2) VMOVQ(v0, v2);
             switch(u8&7) {
                 case 0x00: CSETMw(x2, cEQ); break;    // Equal
@@ -435,8 +466,8 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
                 case 0x06: CSETMw(x2, cHI); break;    // Greater or unordered
                 case 0x07: CSETMw(x2, cVC); break;    // not NaN
                 case 0x08: CSETMw(x2, cEQ); CSETMw(x3, cVS); ORRw_REG(x2, x2, x3); break;   // Equal than or ordered
-                case 0x09: CSETMw(x2, cCS); break;    // Less than or ordered
-                case 0x0a: CSETMw(x2, cHI); break;    // Less or equal or ordered
+                case 0x09: CSETMw(x2, cCS); break;    // Less than or unordered
+                case 0x0a: CSETMw(x2, cHI); break;    // Less or equal or unordered
                 case 0x0b: MOV32w(x2, 0); break;    // false
                 case 0x0c: CSETMw(x2, cNE); CSETMw(x3, cVC); ANDw_REG(x2, x2, x3); break;   // Not Equal not unordered
                 case 0x0d: CSETMw(x2, cCC); break;    // Greater or equal not unordered
@@ -447,6 +478,25 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             YMM0(gd);
             break;
+        case 0xE6:
+            INST_NAME("VCVTDQ2PD Gx, Ex");
+            nextop = F8;
+            if(vex.l) {
+                GETEX_Y(v1, 0, 0);
+            } else {
+                GETEXSD(v1, 0, 0);
+            }
+            GETGX_empty(v0);
+            d0 = fpu_get_scratch(dyn, ninst);
+            if(vex.l) {
+                q0 = ymm_get_reg_empty(dyn, ninst, x1, gd, -1, -1, -1);
+                SXTL2_32(q0, v1);
+                SCVTQFD(q0, q0);
+            } else YMM0(gd);
+            SXTL_32(v0, v1);
+            SCVTQFD(v0, v0);
+            break;
+
         default:
             DEFAULT;
     }
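The VPSRAW/VPSRAD hunks emulate x86's variable-count arithmetic right shifts with NEON's SSHL, which shifts left by a signed per-lane amount: the 64-bit count is saturated (UQXTN/UMIN) to the element width minus one, negated, and broadcast. A minimal scalar sketch of the semantics being targeted, using a hypothetical reference helper that is not part of box64:

```c
#include <stdint.h>

/* Hypothetical scalar reference for the VPSRAD lowering above (not box64 code).
   x86 shifts each 32-bit lane right arithmetically by the 64-bit count; any
   count >= 32 leaves every bit equal to the sign bit, which is exactly what a
   shift by 31 does, so the count can be clamped (UQXTN_32 + UMIN_32) before
   being negated and handed to SSHLQ_32 as a left shift by a negative amount. */
static void ref_vpsrad(int32_t dst[4], const int32_t src[4], uint64_t count)
{
    uint32_t c = (count > 31) ? 31 : (uint32_t)count;   /* saturate to 0..31 */
    for (int i = 0; i < 4; ++i)
        dst[i] = src[i] >> c;   /* assumes arithmetic >> on signed int, as on arm64/x86 targets */
}
```

VPSRAW is the same pattern with 16-bit lanes and a clamp to 15. Likewise, the VPSHUFHW hunk cannot express a 16-bit element shuffle directly with TBL, so it expands the immediate into byte indices; a standalone sketch of that index construction (hypothetical helper name, mirroring the loop in the hunk):

```c
#include <stdint.h>

/* Hypothetical helper (not in box64): build the eight byte indices TBL needs
   to shuffle the four high 16-bit words selected by imm8.  Word n of the high
   half lives at bytes 2n+8 and 2n+9 of the 128-bit source, so each 2-bit
   selector expands to that pair of byte indices. */
static uint64_t vpshufhw_tbl_indices(uint8_t imm8)
{
    uint64_t idx = 0;
    for (int i = 0; i < 4; ++i) {
        uint64_t sel = (imm8 >> (i * 2)) & 3;       /* which high word to pick */
        idx |= (sel * 2 + 8) << (i * 16 + 0);       /* low byte of that word   */
        idx |= (sel * 2 + 9) << (i * 16 + 8);       /* high byte of that word  */
    }
    return idx;
}
```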