1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
|
Sonic Mania movie player very slow block.
Caught while profiling, the Sonic Mania movie player consumes 72% of CPU time in a **SINGLE** block.
This block is slow mostly because of the poor code our JIT generates for it.

Block ripped from their deobfuscated executable (32-bit x86). This block jumps to itself.
```asm
movzx edx, byte [esi+ecx]
movzx ecx, byte [esi+edi]
or edx, 0xffff0000
shl edx, 0x8
inc esi
or edx, ecx
mov ecx, dword [ebp+0xc {arg5}]
or dword [eax], edx
add eax, 0x4
cmp esi, ebx
jl 0x5cada0
```
This block is fairly simple: it combines two byte streams and a 32-bit stream into one. It looks like some sort of RGBA combination where one channel is already stored in the destination stream and alpha is forced to 0xFF.
The AArch64 code generated for this block is absolutely abysmal.
```asm
(gdb) disas 0x2a10ecfa0,+0x0000013c
Dump of assembler code from 0x2a10ecfa0 to 0x2a10ed0dc:
0x00000002a10ecfa0: mov w20, w5
0x00000002a10ecfa4: mov w21, w10
0x00000002a10ecfa8: add w20, w20, w21
0x00000002a10ecfac: ldrb w20, [x20]
0x00000002a10ecfb0: bfxil x6, x20, #0, #32
0x00000002a10ecfb4: mov w20, w11
0x00000002a10ecfb8: mov w21, w10
0x00000002a10ecfbc: add w20, w20, w21
0x00000002a10ecfc0: ldrb w20, [x20]
0x00000002a10ecfc4: bfxil x5, x20, #0, #32
0x00000002a10ecfc8: mov w20, w6
0x00000002a10ecfcc: orr w20, w20, #0xffff0000
0x00000002a10ecfd0: bfxil x6, x20, #0, #32
0x00000002a10ecfd4: mov w20, w6
0x00000002a10ecfd8: lsl w20, w20, #8
0x00000002a10ecfdc: bfxil x6, x20, #0, #32
0x00000002a10ecfe0: mov w20, w10
0x00000002a10ecfe4: add w20, w20, #0x1
0x00000002a10ecfe8: bfxil x10, x20, #0, #32
0x00000002a10ecfec: mov w20, w5
0x00000002a10ecff0: mov w21, w6
0x00000002a10ecff4: orr w20, w21, w20
0x00000002a10ecff8: bfxil x6, x20, #0, #32
0x00000002a10ecffc: mov w20, w9
0x00000002a10ed000: add w20, w20, #0xc
0x00000002a10ed004: ldr w20, [x20]
0x00000002a10ed008: bfxil x5, x20, #0, #32
0x00000002a10ed00c: mov w20, w6
0x00000002a10ed010: mov w21, w4
0x00000002a10ed014: ldr w21, [x21]
0x00000002a10ed018: orr w20, w21, w20
=> 0x00000002a10ed01c: mov w21, w4
0x00000002a10ed020: str w20, [x21]
0x00000002a10ed024: mov w20, w4
0x00000002a10ed028: add w20, w20, #0x4
0x00000002a10ed02c: bfxil x4, x20, #0, #32
0x00000002a10ed030: mov w20, w7
0x00000002a10ed034: mov w21, w10
0x00000002a10ed038: sub w22, w21, w20
0x00000002a10ed03c: eor w23, w21, w20
0x00000002a10ed040: eor w23, w23, w22
0x00000002a10ed044: ubfx x23, x23, #4, #1
0x00000002a10ed048: strb w23, [x28, #708]
0x00000002a10ed04c: ubfx x23, x22, #31, #1
0x00000002a10ed050: strb w23, [x28, #711]
0x00000002a10ed054: and x24, x22, #0xff
0x00000002a10ed058: fmov d0, x24
0x00000002a10ed05c: cnt v0.8b, v0.8b
0x00000002a10ed060: addv b0, v0.8b
0x00000002a10ed064: umov w24, v0.b[0]
0x00000002a10ed068: eor x24, x24, #0x1
0x00000002a10ed06c: ubfx x24, x24, #0, #1
0x00000002a10ed070: strb w24, [x28, #706]
0x00000002a10ed074: cmp x22, #0x0
0x00000002a10ed078: cset x24, eq // eq = none
0x00000002a10ed07c: strb w24, [x28, #710]
0x00000002a10ed080: cmp w21, w20
0x00000002a10ed084: cset x24, cc // cc = lo, ul, last
0x00000002a10ed088: strb w24, [x28, #704]
0x00000002a10ed08c: eor w20, w21, w20
0x00000002a10ed090: eor w21, w22, w21
0x00000002a10ed094: and w20, w20, w21
0x00000002a10ed098: ubfx x20, x20, #31, #1
0x00000002a10ed09c: strb w20, [x28, #715]
0x00000002a10ed0a0: cmp w23, w20
0x00000002a10ed0a4: b.ne 0x2a10ecfa0 // b.any
0x00000002a10ed0a8: b 0x2a10ed0d4
0x00000002a10ed0ac: blr x0
0x00000002a10ed0b0: ldapurh w0, [x8, #-200]
0x00000002a10ed0b4: udf #2
0x00000002a10ed0b8: .inst 0x005cadc0 ; undefined
0x00000002a10ed0bc: udf #0
0x00000002a10ed0c0: udf #316
0x00000002a10ed0c4: udf #0
0x00000002a10ed0c8: .inst 0x005cada0 ; undefined
0x00000002a10ed0cc: udf #0
0x00000002a10ed0d0: udf #312
0x00000002a10ed0d4: adr x0, 0x2a10ed0d0
0x00000002a10ed0d8: str x0, [x28, #184]
```
llvm-mca is quite damning.
```bash
llvm-mca -mcpu=cortex-x1c -mattr=+rcpc-immo mania.txt
Iterations: 100
Instructions: 6900
Total Cycles: 5394
Total uOps: 7100
Dispatch Width: 3
uOps Per Cycle: 1.32
IPC: 1.28
Block RThroughput: 23.7
```
Here is an ASM test that can run in our unit tests, so we can see how much the codegen improves.
```asm
%ifdef CONFIG
{
"Mode": "32BIT"
}
%endif
; Reproduction of the hot loop from the Sonic Mania movie player (32-bit x86).
;
; Each iteration loads one byte from each of two byte streams, builds the dword
; 0xff'00'<src1>'<src2>, ORs it into the destination dword and advances both the
; byte index (esi) and the destination pointer (eax).
;
; Register roles inside the loop:
;   esi        - running byte pointer shared by both source streams
;   ecx        - src1 offset on loop entry; overwritten by the src2 byte, then reloaded
;   edi        - src2 offset
;   ebx        - end pointer for esi
;   eax        - destination dword pointer
;   [ebp+0xc]  - stack-style slot holding the src1 offset
;
; With the setup below the loop executes exactly once (ebx = .data + 1) and all
; source bytes are zero, so the first dword of .data_dst ends up as 0xff000000.
; NOTE(review): the CONFIG block has no expected-register/memory section, so this
; result is not checked by anything visible here - confirm whether that is intended.
;
; Original
; movzx edx, byte [esi+ecx]
; movzx ecx, byte [esi+edi]
; or edx, 0xffff0000
; shl edx, 0x8
; inc esi
; or edx, ecx
; mov ecx, dword [ebp+0xc {arg5}]
; or dword [eax], edx
; add eax, 0x4
; cmp esi, ebx
; jl 0x5cada0
; ebp stands in for the game's frame pointer.
; NOTE(review): assumes the test harness maps writable memory at 0xe0000000 - confirm.
mov ebp, 0xe0000000
; [ebp + 0xc] contains the src1 offset (8 bytes past the start of .data)
mov dword [ebp + 0xc], 8
; ebx contains the end pointer: .data + 1, i.e. a single loop iteration
; NOTE(review): NASM normally wants a bracketed memory operand for lea
; (e.g. `lea ebx, [.data]`) - confirm these three lea lines assemble as written.
lea ebx, .data
add ebx, 1
; esi contains the base pointer shared by both byte streams
lea esi, .data
; edi contains the src2 offset (0, so src2 starts at .data itself)
mov edi, 0
; ecx starts off with the src1 offset, matching the state at the top of the original block
mov ecx, dword [ebp + 0xc]
; eax contains the dword destination, which is also a source since it is OR-accumulated into
lea eax, .data_dst
; Break the block here for easier viewing.
jmp .loop_top
; This loop is the hot loop in Sonic Mania.
.loop_top:
; Load the src1 byte (zero-extended) from esi + src1 offset
movzx edx, byte [esi + ecx]
; Load the src2 byte (zero-extended) from esi + src2 offset.
; This overwrites ecx, which is why the src1 offset is reloaded further down.
movzx ecx, byte [esi + edi]
; Set the top two bytes to 0xFFFF
; edx: 0x00'00'00'<src1> -> 0xff'ff'00'<src1>
or edx, 0xffff0000
; Shift left by 8; one of the two 0xff bytes is shifted out of the register
; edx: 0xff'ff'00'<src1> -> 0xff'00'<src1>'00
shl edx, 0x8
; Advance the shared byte pointer (both loads above used the pre-increment value)
inc esi
; OR src2 into the low byte
; edx: 0xff'00'<src1>'<src2>
or edx, ecx
; Reload the src1 offset since ecx was overwritten by the src2 load
mov ecx, dword [ebp + 0xc]
; OR the combined dword into the destination (read-modify-write on [eax])
or dword [eax], edx
; Advance the destination pointer to the next dword
add eax, 0x4
; Compare the byte pointer against the end pointer
cmp esi, ebx
; Loop again while esi < ebx.
; NOTE(review): jl is a signed compare on pointer values; kept as-is because it
; mirrors the original game code whose codegen is being examined.
jl .loop_top
; Exit
hlt
; Source bytes: 64 zero bytes, read by both streams (src1 at +8, src2 at +0)
.data:
dq 0, 0, 0, 0, 0, 0, 0, 0
; Destination dwords: 64 zero bytes; only the first dword is written by this test
.data_dst:
dq 0, 0, 0, 0, 0, 0, 0, 0
```
Fixing the issues in this codegen will significantly improve the code FEX generates across the board, so it will be a significant win everywhere, not just in this game. This is low-hanging fruit in our codegen; in theory, fixing most of it should be easy.
|