| author | ptitSeb <sebastien.chev@gmail.com> | 2024-06-02 09:15:37 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2024-06-02 09:15:37 +0200 |
| commit | dc71840a5a1e065e5f98e3a230f6714ec8d946ed (patch) | |
| tree | 7779dd3f00d277b3254ac6b472d83ae346ba1f5b /src | |
| parent | 07b17cd0f63dfea2318cb19d08673265984eedfa (diff) | |
[ARM64_DYNAREC] Added AVX.66.0F3A 21 and fixed a bunch of issues
Diffstat (limited to 'src')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_0f.c | 12 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c | 8 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c | 33 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_helper.c | 12 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_helper.h | 18 |
5 files changed, 62 insertions, 21 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
index 2acb719a..b8d21d9e 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -81,11 +81,11 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
             } else {
                 v0 = sse_get_reg_empty(dyn, ninst, x1, gd);
                 SMREAD();
-                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xffe<<4, 15, rex, NULL, 0, 0);
-                VLD128(v0, ed, fixedaddress);   // no alignment issue with ARMv8 NEON :)
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
+                VLDR128_U12(v0, ed, fixedaddress);   // no alignment issue with ARMv8 NEON :)
                 if(vex.l) {
                     v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, -1, -1, -1);
-                    VLD128(v0, ed, fixedaddress+16);
+                    VLDR128_U12(v0, ed, fixedaddress+16);
                 }
             }
             if(!vex.l) YMM0(gd);
@@ -105,11 +105,11 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
                     VMOVQ(v1, v0);
                 }
             } else {
-                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xffe<<4, 15, rex, NULL, 0, 0);
-                VST128(v0, ed, fixedaddress);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
+                VSTR128_U12(v0, ed, fixedaddress);
                 if(vex.l) {
                     v0 = ymm_get_reg(dyn, ninst, x1, gd, 0, ed, -1, -1);
-                    VST128(v0, ed, fixedaddress+16);
+                    VSTR128_U12(v0, ed, fixedaddress+16);
                 }
                 SMWRITE2();
             }
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index afebb155..be983b14 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -132,14 +132,14 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             VSSHRQ_32(q0, v2, 31);
             VBITQ(v1, v0, q0);
             if(!MODREG) {
-                VST128(v1, ed, fixedaddress);
+                VSTR128_U12(v1, ed, fixedaddress);
             }
             if(vex.l) {
                 GETGYVYEY(v0, v2, v1);
                 VSSHRQ_32(q0, v2, 31);
                 VBITQ(v1, v0, q0);
                 if(!MODREG)
-                    VST128(v1, ed, fixedaddress+16);
+                    VSTR128_U12(v1, ed, fixedaddress+16);
             }
             break;
         case 0x2F:
@@ -151,14 +151,14 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             VSSHRQ_64(q0, v2, 63);
             VBITQ(v1, v0, q0);
             if(!MODREG) {
-                VST128(v1, ed, fixedaddress);
+                VSTR128_U12(v1, ed, fixedaddress);
             }
             if(vex.l) {
                 GETGYVYEY(v0, v2, v1);
                 VSSHRQ_64(q0, v2, 63);
                 VBITQ(v1, v0, q0);
                 if(!MODREG) {
-                    VST128(v1, ed, fixedaddress+16);
+                    VSTR128_U12(v1, ed, fixedaddress+16);
                 }
             }
             break;
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index 14b3c30e..901bcef0 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -223,6 +223,39 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                 F8; // read u8, but it's been already handled
             break;
 
+        case 0x21:
+            INST_NAME("VINSERTPS Gx, Vx, Ex, Ib");
+            nextop = F8;
+            GETGX_empty_VX(v0, v2);
+            if (MODREG) {
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0);
+                u8 = F8;
+                if(v0==v1) {
+                    d0 = fpu_get_scratch(dyn, ninst);
+                    VMOVQ(d0, v1);
+                    if(v0!=v2) VMOVQ(v0, v2);
+                    VMOVeS(v0, (u8>>4)&3, d0, (u8>>6)&3);
+                } else {
+                    if(v0!=v2) VMOVQ(v0, v2);
+                    VMOVeS(v0, (u8>>4)&3, v1, (u8>>6)&3);
+                }
+            } else {
+                if(v0!=v2) VMOVQ(v0, v2);
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &wback, x1, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 1);
+                u8 = F8;
+                LDW(x2, wback, fixedaddress);
+                VMOVQSfrom(v0, (u8>>4)&3, x2);
+            }
+            uint8_t zmask = u8 & 0xf;
+            for (uint8_t i=0; i<4; i++) {
+                if (zmask & (1<<i)) {
+                    VMOVQSfrom(v0, i, wZR);
+                }
+            }
+            YMM0(gd);
+            break;
+
         case 0x44:
             INST_NAME("PCLMULQDQ Gx, Vx, Ex, Ib");
             nextop = F8;
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 908b757d..e054e99c 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -2112,7 +2112,7 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in
 
 static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
 {
-#if 1//STEP > 1
+#if STEP > 0
     int i2 = dyn->insts[ninst].x64.jmp_insts;
     if(i2<0)
         return;
@@ -2494,27 +2494,27 @@ void fpu_propagate_stack(dynarec_arm_t* dyn, int ninst)
 
 void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, int s1)
 {
-    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Purge YMM mask=%04x --------\n", dyn->insts[ninst].purge_ymm);
+    MESSAGE(LOG_NONE, "Purge YMM mask=%04x --------\n", dyn->insts[ninst].purge_ymm);
     int s1_set = 0;
     for(int i=0; i<16; ++i)
         if(dyn->insts[ninst].purge_ymm&(1<<i)) {
-            if(is_avx_zero(dyn, ninst, i)) {
+            if(is_avx_zero_unset(dyn, ninst, i)) {
                 if(!s1_set) {
                     ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0]));
                     s1_set = 1;
                 }
                 STPx_S7_offset(xZR, xZR, s1, i*16);
-                avx_unmark_zero(dyn, ninst, i);
             }
-            int reg = -1;
             for(int j=0; j<32; ++j)
                 if(dyn->n.neoncache[j].t==NEON_CACHE_YMMR && dyn->n.neoncache[j].n==i) {
                     // just forget the reg....
                     dyn->n.neoncache[j].v = 0;
+                    j=32;
                 } else if(dyn->n.neoncache[j].t==NEON_CACHE_YMMW && dyn->n.neoncache[j].n==i) {
                     VSTR128_U12(j, xEmu, offsetof(x64emu_t, ymm[i]));
                     dyn->n.neoncache[j].v = 0;
+                    j=32;
                 }
         }
-    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "---------- Purge YMM\n");
+    MESSAGE(LOG_NONE, "---------- Purge YMM\n");
 }
\ No newline at end of file
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 30f967a7..0252a052 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -517,7 +517,7 @@
     if(MODREG)                                                          \
         ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, vex.v, -1); \
     else                                                                \
-        VLD128(ey, ed, fixedaddress+16);                                \
+        VLDR128_U12(ey, ed, fixedaddress+16);                           \
     gy = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1)
 
 // Get EY and non-writen VY and GY
@@ -526,9 +526,16 @@
     if(MODREG)                                                          \
         ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1, gd, vex.v, -1); \
     else                                                                \
-        VLD128(ey, ed, fixedaddress+16);                                \
+        VLDR128_U12(ey, ed, fixedaddress+16);                           \
     gy = ymm_get_reg(dyn, ninst, x1, gd, 0, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1)
 
+// Get empty EY and non-writen VY and GY
+#define GETGYVYEY_empty(gy, vy, ey)                                     \
+    vy = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1); \
+    gy = ymm_get_reg(dyn, ninst, x1, gd, 0, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1); \
+    if(MODREG)                                                          \
+        ey = ymm_get_reg_empty(dyn, ninst, x1, (nextop&7)+(rex.b<<3), gd, vex.v, -1)
+
 // Get EY and non-writen GY
 #define GETGYEY(gy, ey)                                                 \
     if(MODREG)                                                          \
@@ -561,9 +568,9 @@
 // Get empty VY, and non-writen EY
 #define GETVY_empty_EY(vy, ey)                                          \
     if(MODREG)                                                          \
-        ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, vex.v, -1, -1); \
+        ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, vex.v, -1, -1); \
     else                                                                \
-        VLD128(ey, ed, fixedaddress+16);                                \
+        VLDR128_U12(ey, ed, fixedaddress+16);                           \
     vy = ymm_get_reg_empty(dyn, ninst, x1, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1, -1)
 
 // Get EX as a quad, (x3 is used)
@@ -575,7 +582,7 @@
         addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, D); \
         unscaled = 0;                                                   \
         a = fpu_get_scratch(dyn, ninst);                                \
-        VLD128(a, ed, fixedaddress);                                    \
+        VLDR128_U12(a, ed, fixedaddress);                               \
     }
 // Get EX as a quad, (x3 is used)
 #define GETEX_empty_Y(a, D)                                             \
@@ -583,6 +590,7 @@
         a = sse_get_reg_empty(dyn, ninst, x3, (nextop&7)+(rex.b<<3));   \
     } else {                                                            \
         WILLWRITE2();                                                   \
+        a = fpu_get_scratch(dyn, ninst);                                \
         addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, D); \
         unscaled = 0;                                                   \
     }
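
For context on the new opcode: AVX.66.0F3A 21 is VINSERTPS. Bits [7:6] of the immediate select the source 32-bit lane (register form only; the memory form reads a single 32-bit value, which is why the dynarec uses LDW plus VMOVQSfrom instead of a full 128-bit load), bits [5:4] select the destination lane, and bits [3:0] are a zero mask applied to the result; the VEX form also clears the upper half of the YMM register, hence the YMM0(gd) at the end. The sketch below is a minimal plain-C reference model of that behaviour, for illustration only: it is not box64 code, and the type and function names are invented for the example.

```c
#include <stdint.h>

/* Illustrative only: a plain-C model of VINSERTPS (AVX.66.0F3A 21).
 * xmm_t and vinsertps() are hypothetical names, not box64 APIs. */
typedef struct { uint32_t lane[4]; } xmm_t;

static xmm_t vinsertps(xmm_t src1, xmm_t src2, int src2_is_mem, uint8_t imm8)
{
    xmm_t dst = src1;                                 /* start from the first source (Vx) */
    int count_s = src2_is_mem ? 0 : (imm8 >> 6) & 3;  /* imm8[7:6]: source lane (register form) */
    int count_d = (imm8 >> 4) & 3;                    /* imm8[5:4]: destination lane */
    dst.lane[count_d] = src2.lane[count_s];           /* insert the selected 32-bit element */
    for (int i = 0; i < 4; i++)                       /* imm8[3:0]: zero-mask on the result */
        if (imm8 & (1u << i))
            dst.lane[i] = 0;
    return dst;                                       /* VEX encoding also zeroes ymm[255:128] */
}
```

This also explains the aliasing guard in the register path of the diff: when Gx and Ex map to the same NEON register, copying Vx into the destination first would clobber the source, so the code snapshots Ex into a scratch register before the VMOVeS insert.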