From d44f3d9baee8a0f7ce16bb3027f5a666a262aa07 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Mon, 4 Mar 2024 17:20:56 +0100 Subject: [ARM64_DYNAREC] Small optims for SSE/SSE2 & strongmem>1 --- src/dynarec/arm64/dynarec_arm64_660f.c | 1 + src/dynarec/arm64/dynarec_arm64_f20f.c | 1 + src/dynarec/arm64/dynarec_arm64_f30f.c | 22 +++++++++++++--------- src/dynarec/arm64/dynarec_arm64_helper.h | 8 ++++---- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c index 2e0d6e5a..627e12f2 100644 --- a/src/dynarec/arm64/dynarec_arm64_660f.c +++ b/src/dynarec/arm64/dynarec_arm64_660f.c @@ -2663,6 +2663,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n v1 = sse_get_reg_empty(dyn, ninst, x1, (nextop&7) + (rex.b<<3)); FMOVD(v1, v0); } else { + WILLWRITE2(); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); VST64(v0, ed, fixedaddress); SMWRITE2(); diff --git a/src/dynarec/arm64/dynarec_arm64_f20f.c b/src/dynarec/arm64/dynarec_arm64_f20f.c index a8d55cf3..65607b52 100644 --- a/src/dynarec/arm64/dynarec_arm64_f20f.c +++ b/src/dynarec/arm64/dynarec_arm64_f20f.c @@ -71,6 +71,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n d0 = sse_get_reg(dyn, ninst, x1, ed, 1); VMOVeD(d0, 0, v0, 0); } else { + WILLWRITE2(); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); VST64(v0, ed, fixedaddress); SMWRITE2(); diff --git a/src/dynarec/arm64/dynarec_arm64_f30f.c b/src/dynarec/arm64/dynarec_arm64_f30f.c index d57d3c8b..8f47646c 100644 --- a/src/dynarec/arm64/dynarec_arm64_f30f.c +++ b/src/dynarec/arm64/dynarec_arm64_f30f.c @@ -333,16 +333,20 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n GETEX(v1, 0, 1) ; GETGX(v0, 1); u8 = F8; - // only high part need to be suffled. VTBL only handle 8bits value, so the 16bits suffles need to be changed in 8bits - u64 = 0; - for (int i=0; i<4; ++i) { - u64 |= ((uint64_t)((u8>>(i*2))&3)*2+8)<<(i*16+0); - u64 |= ((uint64_t)((u8>>(i*2))&3)*2+9)<<(i*16+8); - } - MOV64x(x2, u64); d0 = fpu_get_scratch(dyn); - VMOVQDfrom(d0, 0, x2); - VTBL1_8(d0, v1, d0); + if(u8==0b00000000 || u8==0b01010101 || u8==0b10101010 || u8==0b11111111) { + VDUP_16(d0, v1, u8&3); + } else { + // only high part need to be suffled. VTBL only handle 8bits value, so the 16bits suffles need to be changed in 8bits + u64 = 0; + for (int i=0; i<4; ++i) { + u64 |= ((uint64_t)((u8>>(i*2))&3)*2+8)<<(i*16+0); + u64 |= ((uint64_t)((u8>>(i*2))&3)*2+9)<<(i*16+8); + } + MOV64x(x2, u64); + VMOVQDfrom(d0, 0, x2); + VTBL1_8(d0, v1, d0); + } VMOVeD(v0, 1, d0, 0); if(v0!=v1) { VMOVeD(v0, 0, v1, 0); diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h index 0fe4d252..563327e8 100644 --- a/src/dynarec/arm64/dynarec_arm64_helper.h +++ b/src/dynarec/arm64/dynarec_arm64_helper.h @@ -426,7 +426,7 @@ if(MODREG) { \ a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w); \ } else { \ - SMREAD(); \ + if(w) {WILLWRITE2();} else {SMREAD();} \ addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<4, 15, rex, NULL, 0, D); \ a = fpu_get_scratch(dyn); \ VLD128(a, ed, fixedaddress); \ @@ -445,7 +445,7 @@ if(MODREG) { \ a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w); \ } else { \ - SMREAD(); \ + if(w) {WILLWRITE2();} else {SMREAD();} \ a = fpu_get_scratch(dyn); \ addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, D); \ VLD64(a, ed, fixedaddress); \ @@ -459,7 +459,7 @@ if(MODREG) { \ a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w); \ } else { \ - SMREAD(); \ + if(w) {WILLWRITE2();} else {SMREAD();} \ a = fpu_get_scratch(dyn); \ addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, D); \ VLD32(a, ed, fixedaddress); \ @@ -473,7 +473,7 @@ if(MODREG) { \ a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w); \ } else { \ - SMREAD(); \ + if(w) {WILLWRITE2();} else {SMREAD();} \ a = fpu_get_scratch(dyn); \ addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, D); \ VLD16(a, ed, fixedaddress); \ -- cgit 1.4.1