results/scraper/fex/2681


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216

Sonic Mania movie player very slow block.
Caught while profiling, the Sonic Mania movie player consumes 72% of CPU time in a **SINGLE** block.
The block itself is simple; it is slow almost entirely because of the poor code FEX generates for it.
![Image_2023-05-18_19-35-45](https://github.com/FEX-Emu/FEX/assets/1018829/c012fa3f-d133-4fcc-940b-91d903dfe883)

Block ripped from their deobfuscated executable (32-bit x86). This block jumps to itself.
```asm
movzx   edx, byte [esi+ecx]
movzx   ecx, byte [esi+edi]
or      edx, 0xffff0000
shl     edx, 0x8
inc     esi
or      edx, ecx
mov     ecx, dword [ebp+0xc {arg5}]
or      dword [eax], edx
add     eax, 0x4
cmp     esi, ebx
jl      0x5cada0
```

This block is fairly simple: it combines two byte streams with a 32-bit stream into one. It looks like some sort of RGBA combination, where one channel is already stored in the destination stream and alpha is forced to 0xFF.
This has some absolutely abysmal codegen.
```asm
(gdb) disas 0x2a10ecfa0,+0x0000013c
Dump of assembler code from 0x2a10ecfa0 to 0x2a10ed0dc:
   0x00000002a10ecfa0:  mov     w20, w5
   0x00000002a10ecfa4:  mov     w21, w10
   0x00000002a10ecfa8:  add     w20, w20, w21
   0x00000002a10ecfac:  ldrb    w20, [x20]
   0x00000002a10ecfb0:  bfxil   x6, x20, #0, #32
   0x00000002a10ecfb4:  mov     w20, w11
   0x00000002a10ecfb8:  mov     w21, w10
   0x00000002a10ecfbc:  add     w20, w20, w21
   0x00000002a10ecfc0:  ldrb    w20, [x20]
   0x00000002a10ecfc4:  bfxil   x5, x20, #0, #32
   0x00000002a10ecfc8:  mov     w20, w6
   0x00000002a10ecfcc:  orr     w20, w20, #0xffff0000
   0x00000002a10ecfd0:  bfxil   x6, x20, #0, #32
   0x00000002a10ecfd4:  mov     w20, w6
   0x00000002a10ecfd8:  lsl     w20, w20, #8
   0x00000002a10ecfdc:  bfxil   x6, x20, #0, #32
   0x00000002a10ecfe0:  mov     w20, w10
   0x00000002a10ecfe4:  add     w20, w20, #0x1
   0x00000002a10ecfe8:  bfxil   x10, x20, #0, #32
   0x00000002a10ecfec:  mov     w20, w5
   0x00000002a10ecff0:  mov     w21, w6
   0x00000002a10ecff4:  orr     w20, w21, w20
   0x00000002a10ecff8:  bfxil   x6, x20, #0, #32
   0x00000002a10ecffc:  mov     w20, w9
   0x00000002a10ed000:  add     w20, w20, #0xc
   0x00000002a10ed004:  ldr     w20, [x20]
   0x00000002a10ed008:  bfxil   x5, x20, #0, #32
   0x00000002a10ed00c:  mov     w20, w6
   0x00000002a10ed010:  mov     w21, w4
   0x00000002a10ed014:  ldr     w21, [x21]
   0x00000002a10ed018:  orr     w20, w21, w20
=> 0x00000002a10ed01c:  mov     w21, w4
   0x00000002a10ed020:  str     w20, [x21]
   0x00000002a10ed024:  mov     w20, w4
   0x00000002a10ed028:  add     w20, w20, #0x4
   0x00000002a10ed02c:  bfxil   x4, x20, #0, #32
   0x00000002a10ed030:  mov     w20, w7
   0x00000002a10ed034:  mov     w21, w10
   0x00000002a10ed038:  sub     w22, w21, w20
   0x00000002a10ed03c:  eor     w23, w21, w20
   0x00000002a10ed040:  eor     w23, w23, w22
   0x00000002a10ed044:  ubfx    x23, x23, #4, #1
   0x00000002a10ed048:  strb    w23, [x28, #708]
   0x00000002a10ed04c:  ubfx    x23, x22, #31, #1
   0x00000002a10ed050:  strb    w23, [x28, #711]
   0x00000002a10ed054:  and     x24, x22, #0xff
   0x00000002a10ed058:  fmov    d0, x24
   0x00000002a10ed05c:  cnt     v0.8b, v0.8b
   0x00000002a10ed060:  addv    b0, v0.8b
   0x00000002a10ed064:  umov    w24, v0.b[0]
   0x00000002a10ed068:  eor     x24, x24, #0x1
   0x00000002a10ed06c:  ubfx    x24, x24, #0, #1
   0x00000002a10ed070:  strb    w24, [x28, #706]
   0x00000002a10ed074:  cmp     x22, #0x0
   0x00000002a10ed078:  cset    x24, eq  // eq = none
   0x00000002a10ed07c:  strb    w24, [x28, #710]
   0x00000002a10ed080:  cmp     w21, w20
   0x00000002a10ed084:  cset    x24, cc  // cc = lo, ul, last
   0x00000002a10ed088:  strb    w24, [x28, #704]
   0x00000002a10ed08c:  eor     w20, w21, w20
   0x00000002a10ed090:  eor     w21, w22, w21
   0x00000002a10ed094:  and     w20, w20, w21
   0x00000002a10ed098:  ubfx    x20, x20, #31, #1
   0x00000002a10ed09c:  strb    w20, [x28, #715]
   0x00000002a10ed0a0:  cmp     w23, w20
   0x00000002a10ed0a4:  b.ne    0x2a10ecfa0  // b.any
   0x00000002a10ed0a8:  b       0x2a10ed0d4
   0x00000002a10ed0ac:  blr     x0
   0x00000002a10ed0b0:  ldapurh w0, [x8, #-200]
   0x00000002a10ed0b4:  udf     #2
   0x00000002a10ed0b8:  .inst   0x005cadc0 ; undefined
   0x00000002a10ed0bc:  udf     #0
   0x00000002a10ed0c0:  udf     #316
   0x00000002a10ed0c4:  udf     #0
   0x00000002a10ed0c8:  .inst   0x005cada0 ; undefined
   0x00000002a10ed0cc:  udf     #0
   0x00000002a10ed0d0:  udf     #312
   0x00000002a10ed0d4:  adr     x0, 0x2a10ed0d0
   0x00000002a10ed0d8:  str     x0, [x28, #184]
   ```

llvm-mca is quite damning.
```bash
llvm-mca -mcpu=cortex-x1c -mattr=+rcpc-immo mania.txt
Iterations:        100
Instructions:      6900
Total Cycles:      5394
Total uOps:        7100

Dispatch Width:    3
uOps Per Cycle:    1.32
IPC:               1.28
Block RThroughput: 23.7
```

And here is an ASM test example that can run in our unit tests to see how much the codegen improves.

```asm
%ifdef CONFIG
{
  "Mode": "32BIT"
}
%endif

; Reproduction of the hot loop from the Sonic Mania movie player.
; Syntax: NASM (Intel operand order), 32-bit mode as selected by CONFIG above.
;
; Original
; movzx   edx, byte [esi+ecx]
; movzx   ecx, byte [esi+edi]
; or      edx, 0xffff0000
; shl     edx, 0x8
; inc     esi
; or      edx, ecx
; mov     ecx, dword [ebp+0xc {arg5}]
; or      dword [eax], edx
; add     eax, 0x4
; cmp     esi, ebx
; jl      0x5cada0
;
; Register roles inside the loop:
;   eax - pointer to the dword destination (read-modify-write, +4 per iteration)
;   ebx - end pointer that esi is compared against
;   ecx - src1 offset on entry; overwritten by the src2 byte, then reloaded
;   edx - the dword being assembled
;   esi - byte base pointer shared by both sources (+1 per iteration)
;   edi - src2 offset
;   ebp - base of the slot holding the src1 offset at [ebp + 0xc]

; NOTE(review): assumes 0xe0000000 is mapped, writable memory in the test
; environment - confirm against the harness.
mov ebp, 0xe0000000

; [ebp + 0xc] contains src1 offset
mov dword [ebp + 0xc], 8

; ebx contains the loop end pointer.
; It is one byte past the start of .data, so the loop body runs exactly once.
; lea needs a memory operand in NASM, hence the brackets.
lea ebx, [.data]
add ebx, 1

; esi contains the pointer to the data bases
lea esi, [.data]

; edi contains src2 offset
mov edi, 0

; ecx starts off with src1 offset
mov ecx, dword [ebp + 0xc]

; eax contains the dword destination and src that it accumulates to
lea eax, [.data_dst]

; Break the block here for easier viewing.
jmp .loop_top

; This loop is the hot loop in Sonic Mania.
.loop_top:
; Load src1
movzx edx, byte [esi + ecx]
; Load src2 (this overwrites the src1 offset held in ecx)
movzx ecx, byte [esi + edi]

; set top two channels to 0xFFFF
; Src1 in low byte - 0xff'ff'00'<src1>
or edx, 0xffff0000

; shl edx by 8, shifting off the top 0xff for some reason
; 0xff'ff'00'<src1> -> 0xff'00'<src1>'00
shl edx, 0x8

; increment byte base
inc esi

; or in src2 in to low byte
; 0xff'00'<src1>'<src2>
or edx, ecx

; Reload src1 offset since it was overwritten
mov ecx, dword [ebp + 0xc]

; or dword in to eax dest
or dword [eax], edx

; Advance eax to the next destination dword
add eax, 0x4


; See if esi has reached the end pointer
cmp esi, ebx

; Rerun loop if the end pointer hasn't been reached yet.
; Signed jl is kept deliberately: it matches the original game code.
jl .loop_top

; Exit
hlt

.data:

dq 0, 0, 0, 0, 0, 0, 0, 0

.data_dst:
dq 0, 0, 0, 0, 0, 0, 0, 0
```

Fixing the issues in this codegen will significantly improve the code FEX generates everywhere, not just for this block, so it should be a broad win. This is low-hanging fruit: in theory, most of these problems should be easy to fix.