1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
PSAD has unnecessary moves
Can be seen in `OpSize/66_F6.asm`
```asm
adr x0, #-0x4 (addr 0x7fffe2080844)
str x0, [x28, #184]
ldr q4, [x6, #240]
uabdl v5.8h, v24.8b, v4.8b
uabdl2 v4.8h, v24.16b, v4.16b
addv h5, v5.8h
addv h4, v4.8h
mov v0.16b, v5.16b
mov v0.d[1], v4.d[0]
mov v24.16b, v0.16b
ldr x0, pc+8 (addr 0x7fffe2080878)
blr x0
```
The three moves at the end are unnecessary; they can be reduced to a single element insert, as follows.
```asm
adr x0, #-0x4 (addr 0x7fffe2080844)
str x0, [x28, #184]
ldr q4, [x6, #240]
uabdl v5.8h, v24.8b, v4.8b
uabdl2 v4.8h, v24.16b, v4.16b
addv h24, v5.8h
addv h4, v4.8h
mov v24.d[1], v4.d[0]
ldr x0, pc+8 (addr 0x7fffe2080878)
blr x0
```
IR:
```
(%2) CodeBlock %3, %17
(%3 i0) BeginBlock %2(Invalid)
(%4 i0) GuestOpcode #0x0
%5(FPRFixed8) i128 = LoadRegister #0x0, #0x140, FPR, FPRFixed, u8:Tmp:Size
%6(GPRFixed2) i64 = LoadRegister #0x0, #0x18, GPR, GPRFixed, u8:Tmp:Size
(%7 i64) InlineConstant #0xf0
%8(FPR0) i128 = LoadMem FPR, u8:Tmp:Size, %6(GPRFixed2) i64, %7(Invalid), #0x10, SXTX, #0x1
%9(FPR1) i16v8 = VUABDL u8:Tmp:RegisterSize, u8:Tmp:ElementSize, %5(FPRFixed8) i128, %8(FPR0) i128
%10(FPR0) i16v8 = VUABDL2 u8:Tmp:RegisterSize, u8:Tmp:ElementSize, %5(FPRFixed8) i128, %8(FPR0) i128
%11(FPR1) i16v8 = VAddV u8:Tmp:RegisterSize, u8:Tmp:ElementSize, %9(FPR1) i16v8
%12(FPR0) i16v8 = VAddV u8:Tmp:RegisterSize, u8:Tmp:ElementSize, %10(FPR0) i16v8
%13(FPRFixed8) i64v2 = VInsElement u8:Tmp:RegisterSize, u8:Tmp:ElementSize, #0x1, #0x0, %11(FPR1) i16v8, %12(FPR0) i16v8
(%14 i128) StoreRegister %13(FPRFixed8) i64v2, #0x0, #0x140, FPR, FPRFixed, u8:Tmp:Size
(%15 i64) InlineEntrypointOffset #0x9, u8:Tmp:RegisterSize
(%16 i64) ExitFunction %15(Invalid)
(%17 i0) EndBlock %2(Invalid)
```
This is because the first `VAddV` doesn't know that its result can be written directly into the destination vector register, which means the `VInsElement` ends up doing a mov+mov+mov.
As usual, care must be taken that, if the destination is memory, we don't turn the store into multiple memory stores.
|