1 files changed, 216 insertions, 0 deletions
diff --git a/results/scraper/fex/2681 b/results/scraper/fex/2681
new file mode 100644
index 000000000..5848d031b
--- /dev/null
+++ b/results/scraper/fex/2681
@@ -0,0 +1,216 @@
+Sonic Mania movie player very slow block.
+Caught while profiling, the Sonic Mania movie player consumes 72% of CPU time in a **SINGLE** block.
+This block is mostly just bad because of bad codegen.
+![Image_2023-05-18_19-35-45](https://github.com/FEX-Emu/FEX/assets/1018829/c012fa3f-d133-4fcc-940b-91d903dfe883)
+
+Block ripped from their deobfuscated executable (32-bit x86). This block jumps to itself.
+```asm
+movzx   edx, byte [esi+ecx]
+movzx   ecx, byte [esi+edi]
+or      edx, 0xffff0000
+shl     edx, 0x8
+inc     esi
+or      edx, ecx
+mov     ecx, dword [ebp+0xc {arg5}]
+or      dword [eax], edx
+add     eax, 0x4
+cmp     esi, ebx
+jl      0x5cada0
+```
+
+This block is fairly simple, it's combining two bytestreams and a 32-bit stream in to one. Looks like some sort of RGBA combination where one channel is already stored in the destination stream and alpha is forced to 0xFF.
+This has some absolutely abysmal codegen.
+```asm
+(gdb) disas 0x2a10ecfa0,+0x0000013c
+Dump of assembler code from 0x2a10ecfa0 to 0x2a10ed0dc:
+   0x00000002a10ecfa0:  mov     w20, w5
+   0x00000002a10ecfa4:  mov     w21, w10
+   0x00000002a10ecfa8:  add     w20, w20, w21
+   0x00000002a10ecfac:  ldrb    w20, [x20]
+   0x00000002a10ecfb0:  bfxil   x6, x20, #0, #32
+   0x00000002a10ecfb4:  mov     w20, w11
+   0x00000002a10ecfb8:  mov     w21, w10
+   0x00000002a10ecfbc:  add     w20, w20, w21
+   0x00000002a10ecfc0:  ldrb    w20, [x20]
+   0x00000002a10ecfc4:  bfxil   x5, x20, #0, #32
+   0x00000002a10ecfc8:  mov     w20, w6
+   0x00000002a10ecfcc:  orr     w20, w20, #0xffff0000
+   0x00000002a10ecfd0:  bfxil   x6, x20, #0, #32
+   0x00000002a10ecfd4:  mov     w20, w6
+   0x00000002a10ecfd8:  lsl     w20, w20, #8
+   0x00000002a10ecfdc:  bfxil   x6, x20, #0, #32
+   0x00000002a10ecfe0:  mov     w20, w10
+   0x00000002a10ecfe4:  add     w20, w20, #0x1
+   0x00000002a10ecfe8:  bfxil   x10, x20, #0, #32
+   0x00000002a10ecfec:  mov     w20, w5
+   0x00000002a10ecff0:  mov     w21, w6
+   0x00000002a10ecff4:  orr     w20, w21, w20
+   0x00000002a10ecff8:  bfxil   x6, x20, #0, #32
+   0x00000002a10ecffc:  mov     w20, w9
+   0x00000002a10ed000:  add     w20, w20, #0xc
+   0x00000002a10ed004:  ldr     w20, [x20]
+   0x00000002a10ed008:  bfxil   x5, x20, #0, #32
+   0x00000002a10ed00c:  mov     w20, w6
+   0x00000002a10ed010:  mov     w21, w4
+   0x00000002a10ed014:  ldr     w21, [x21]
+   0x00000002a10ed018:  orr     w20, w21, w20
+=> 0x00000002a10ed01c:  mov     w21, w4
+   0x00000002a10ed020:  str     w20, [x21]
+   0x00000002a10ed024:  mov     w20, w4
+   0x00000002a10ed028:  add     w20, w20, #0x4
+   0x00000002a10ed02c:  bfxil   x4, x20, #0, #32
+   0x00000002a10ed030:  mov     w20, w7
+   0x00000002a10ed034:  mov     w21, w10
+   0x00000002a10ed038:  sub     w22, w21, w20
+   0x00000002a10ed03c:  eor     w23, w21, w20
+   0x00000002a10ed040:  eor     w23, w23, w22
+   0x00000002a10ed044:  ubfx    x23, x23, #4, #1
+   0x00000002a10ed048:  strb    w23, [x28, #708]
+   0x00000002a10ed04c:  ubfx    x23, x22, #31, #1
+   0x00000002a10ed050:  strb    w23, [x28, #711]
+   0x00000002a10ed054:  and     x24, x22, #0xff
+   0x00000002a10ed058:  fmov    d0, x24
+   0x00000002a10ed05c:  cnt     v0.8b, v0.8b
+   0x00000002a10ed060:  addv    b0, v0.8b
+   0x00000002a10ed064:  umov    w24, v0.b[0]
+   0x00000002a10ed068:  eor     x24, x24, #0x1
+   0x00000002a10ed06c:  ubfx    x24, x24, #0, #1
+   0x00000002a10ed070:  strb    w24, [x28, #706]
+   0x00000002a10ed074:  cmp     x22, #0x0
+   0x00000002a10ed078:  cset    x24, eq  // eq = none
+   0x00000002a10ed07c:  strb    w24, [x28, #710]
+   0x00000002a10ed080:  cmp     w21, w20
+   0x00000002a10ed084:  cset    x24, cc  // cc = lo, ul, last
+   0x00000002a10ed088:  strb    w24, [x28, #704]
+   0x00000002a10ed08c:  eor     w20, w21, w20
+   0x00000002a10ed090:  eor     w21, w22, w21
+   0x00000002a10ed094:  and     w20, w20, w21
+   0x00000002a10ed098:  ubfx    x20, x20, #31, #1
+   0x00000002a10ed09c:  strb    w20, [x28, #715]
+   0x00000002a10ed0a0:  cmp     w23, w20
+   0x00000002a10ed0a4:  b.ne    0x2a10ecfa0  // b.any
+   0x00000002a10ed0a8:  b       0x2a10ed0d4
+   0x00000002a10ed0ac:  blr     x0
+   0x00000002a10ed0b0:  ldapurh w0, [x8, #-200]
+   0x00000002a10ed0b4:  udf     #2
+   0x00000002a10ed0b8:  .inst   0x005cadc0 ; undefined
+   0x00000002a10ed0bc:  udf     #0
+   0x00000002a10ed0c0:  udf     #316
+   0x00000002a10ed0c4:  udf     #0
+   0x00000002a10ed0c8:  .inst   0x005cada0 ; undefined
+   0x00000002a10ed0cc:  udf     #0
+   0x00000002a10ed0d0:  udf     #312
+   0x00000002a10ed0d4:  adr     x0, 0x2a10ed0d0
+   0x00000002a10ed0d8:  str     x0, [x28, #184]
+   ```
+
+llvm-mca is quite damning.
+```bash
+llvm-mca -mcpu=cortex-x1c -mattr=+rcpc-immo mania.txt
+Iterations:        100
+Instructions:      6900
+Total Cycles:      5394
+Total uOps:        7100
+
+Dispatch Width:    3
+uOps Per Cycle:    1.32
+IPC:               1.28
+Block RThroughput: 23.7
+```
+
+And here is an ASM test example that can run in our unit tests to see how much the codegen improves.
+
+```asm
+%ifdef CONFIG
+{
+  "Mode": "32BIT"
+}
+%endif
+
+; Original
+; movzx   edx, byte [esi+ecx]
+; movzx   ecx, byte [esi+edi]
+; or      edx, 0xffff0000
+; shl     edx, 0x8
+; inc     esi
+; or      edx, ecx
+; mov     ecx, dword [ebp+0xc {arg5}]
+; or      dword [eax], edx
+; add     eax, 0x4
+; cmp     esi, ebx
+; jl      0x5cada0
+
+mov ebp, 0xe0000000
+
+; [ebp + 0xc] contains src1 offset
+mov dword [ebp + 0xc], 8
+
+; ebx contains loop iteration end offset
+lea ebx, .data
+add ebx, 1
+
+; esi contains the pointer to the data bases
+lea esi, .data
+
+; edi contains src1 offset
+mov edi, 0
+
+; ecx starts off with src1 offset
+mov ecx, dword [ebp + 0xc]
+
+; eax contains the dword destination and src that it accumulates to
+lea eax, .data_dst
+
+; Break the block here for easier viewing.
+jmp .loop_top
+
+; This loop is the hot loop in Sonic Mania.
+.loop_top:
+; Load src1
+movzx edx, byte [esi + ecx]
+; Load src1
+movzx ecx, byte [esi + edi]
+
+; set top two channels to 0xFFFF
+; Src1 in low bytes - 0xff'ff'00'<src1>
+or edx, 0xffff0000
+
+; shl edx by 8, shifting off the top 0xff for some reason
+; Src1 in low bytes - 0xff'ff'00'<src1> -> 0xff'00'<src1>'00
+shl edx, 0x8
+
+; increment byte base
+inc esi
+
+; or in Src1 in to low byte
+; 0xff'00'<src1>'<src2>
+or edx, ecx
+
+; Reload src1 offset since it was overwritten
+mov ecx, dword [ebp + 0xc]
+
+; or dword in to eax dest
+or dword [eax], edx
+
+; Increment eax offset
+add eax, 0x4
+
+
+; See if esi matches the end pointer
+cmp esi, ebx
+
+; Rerun loop if the counter isn't at the end yet.
+jl .loop_top
+
+; Exit
+hlt
+
+.data:
+
+dq 0, 0, 0, 0, 0, 0, 0, 0
+
+.data_dst:
+dq 0, 0, 0, 0, 0, 0, 0, 0
+```
+
+Optimizing the issues in this codegen will significantly improve FEX's codegen all over the codebase. So it will be a significant win everywhere. This is some low hanging fruit in our codegen, theoretically fixing most of this should be easy.
\ No newline at end of file