summary refs log tree commit diff stats
path: root/results/scraper/fex/2681
diff options
context:
space:
mode:
Diffstat (limited to 'results/scraper/fex/2681')
-rw-r--r--results/scraper/fex/2681216
1 files changed, 216 insertions, 0 deletions
diff --git a/results/scraper/fex/2681 b/results/scraper/fex/2681
new file mode 100644
index 000000000..5848d031b
--- /dev/null
+++ b/results/scraper/fex/2681
@@ -0,0 +1,216 @@
+Sonic Mania movie player very slow block.
+Caught while profiling, the Sonic Mania movie player consumes 72% of CPU time in a **SINGLE** block.

+This block is mostly just bad because of bad codegen.

+![Image_2023-05-18_19-35-45](https://github.com/FEX-Emu/FEX/assets/1018829/c012fa3f-d133-4fcc-940b-91d903dfe883)

+

+Block ripped from their deobfuscated executable (32-bit x86). This block jumps to itself.

+```asm

+movzx   edx, byte [esi+ecx]

+movzx   ecx, byte [esi+edi]

+or      edx, 0xffff0000

+shl     edx, 0x8

+inc     esi

+or      edx, ecx

+mov     ecx, dword [ebp+0xc {arg5}]

+or      dword [eax], edx

+add     eax, 0x4

+cmp     esi, ebx

+jl      0x5cada0

+```

+

+This block is fairly simple, it's combining two bytestreams and a 32-bit stream in to one. Looks like some sort of RGBA combination where one channel is already stored in the destination stream and alpha is forced to 0xFF.

+This has some absolutely abysmal codegen.

+```asm

+(gdb) disas 0x2a10ecfa0,+0x0000013c

+Dump of assembler code from 0x2a10ecfa0 to 0x2a10ed0dc:

+   0x00000002a10ecfa0:  mov     w20, w5

+   0x00000002a10ecfa4:  mov     w21, w10

+   0x00000002a10ecfa8:  add     w20, w20, w21

+   0x00000002a10ecfac:  ldrb    w20, [x20]

+   0x00000002a10ecfb0:  bfxil   x6, x20, #0, #32

+   0x00000002a10ecfb4:  mov     w20, w11

+   0x00000002a10ecfb8:  mov     w21, w10

+   0x00000002a10ecfbc:  add     w20, w20, w21

+   0x00000002a10ecfc0:  ldrb    w20, [x20]

+   0x00000002a10ecfc4:  bfxil   x5, x20, #0, #32

+   0x00000002a10ecfc8:  mov     w20, w6

+   0x00000002a10ecfcc:  orr     w20, w20, #0xffff0000

+   0x00000002a10ecfd0:  bfxil   x6, x20, #0, #32

+   0x00000002a10ecfd4:  mov     w20, w6

+   0x00000002a10ecfd8:  lsl     w20, w20, #8

+   0x00000002a10ecfdc:  bfxil   x6, x20, #0, #32

+   0x00000002a10ecfe0:  mov     w20, w10

+   0x00000002a10ecfe4:  add     w20, w20, #0x1

+   0x00000002a10ecfe8:  bfxil   x10, x20, #0, #32

+   0x00000002a10ecfec:  mov     w20, w5

+   0x00000002a10ecff0:  mov     w21, w6

+   0x00000002a10ecff4:  orr     w20, w21, w20

+   0x00000002a10ecff8:  bfxil   x6, x20, #0, #32

+   0x00000002a10ecffc:  mov     w20, w9

+   0x00000002a10ed000:  add     w20, w20, #0xc

+   0x00000002a10ed004:  ldr     w20, [x20]

+   0x00000002a10ed008:  bfxil   x5, x20, #0, #32

+   0x00000002a10ed00c:  mov     w20, w6

+   0x00000002a10ed010:  mov     w21, w4

+   0x00000002a10ed014:  ldr     w21, [x21]

+   0x00000002a10ed018:  orr     w20, w21, w20

+=> 0x00000002a10ed01c:  mov     w21, w4

+   0x00000002a10ed020:  str     w20, [x21]

+   0x00000002a10ed024:  mov     w20, w4

+   0x00000002a10ed028:  add     w20, w20, #0x4

+   0x00000002a10ed02c:  bfxil   x4, x20, #0, #32

+   0x00000002a10ed030:  mov     w20, w7

+   0x00000002a10ed034:  mov     w21, w10

+   0x00000002a10ed038:  sub     w22, w21, w20

+   0x00000002a10ed03c:  eor     w23, w21, w20

+   0x00000002a10ed040:  eor     w23, w23, w22

+   0x00000002a10ed044:  ubfx    x23, x23, #4, #1

+   0x00000002a10ed048:  strb    w23, [x28, #708]

+   0x00000002a10ed04c:  ubfx    x23, x22, #31, #1

+   0x00000002a10ed050:  strb    w23, [x28, #711]

+   0x00000002a10ed054:  and     x24, x22, #0xff

+   0x00000002a10ed058:  fmov    d0, x24

+   0x00000002a10ed05c:  cnt     v0.8b, v0.8b

+   0x00000002a10ed060:  addv    b0, v0.8b

+   0x00000002a10ed064:  umov    w24, v0.b[0]

+   0x00000002a10ed068:  eor     x24, x24, #0x1

+   0x00000002a10ed06c:  ubfx    x24, x24, #0, #1

+   0x00000002a10ed070:  strb    w24, [x28, #706]

+   0x00000002a10ed074:  cmp     x22, #0x0

+   0x00000002a10ed078:  cset    x24, eq  // eq = none

+   0x00000002a10ed07c:  strb    w24, [x28, #710]

+   0x00000002a10ed080:  cmp     w21, w20

+   0x00000002a10ed084:  cset    x24, cc  // cc = lo, ul, last

+   0x00000002a10ed088:  strb    w24, [x28, #704]

+   0x00000002a10ed08c:  eor     w20, w21, w20

+   0x00000002a10ed090:  eor     w21, w22, w21

+   0x00000002a10ed094:  and     w20, w20, w21

+   0x00000002a10ed098:  ubfx    x20, x20, #31, #1

+   0x00000002a10ed09c:  strb    w20, [x28, #715]

+   0x00000002a10ed0a0:  cmp     w23, w20

+   0x00000002a10ed0a4:  b.ne    0x2a10ecfa0  // b.any

+   0x00000002a10ed0a8:  b       0x2a10ed0d4

+   0x00000002a10ed0ac:  blr     x0

+   0x00000002a10ed0b0:  ldapurh w0, [x8, #-200]

+   0x00000002a10ed0b4:  udf     #2

+   0x00000002a10ed0b8:  .inst   0x005cadc0 ; undefined

+   0x00000002a10ed0bc:  udf     #0

+   0x00000002a10ed0c0:  udf     #316

+   0x00000002a10ed0c4:  udf     #0

+   0x00000002a10ed0c8:  .inst   0x005cada0 ; undefined

+   0x00000002a10ed0cc:  udf     #0

+   0x00000002a10ed0d0:  udf     #312

+   0x00000002a10ed0d4:  adr     x0, 0x2a10ed0d0

+   0x00000002a10ed0d8:  str     x0, [x28, #184]

+   ```

+

+llvm-mca is quite damning.

+```bash

+llvm-mca -mcpu=cortex-x1c -mattr=+rcpc-immo mania.txt

+Iterations:        100

+Instructions:      6900

+Total Cycles:      5394

+Total uOps:        7100

+

+Dispatch Width:    3

+uOps Per Cycle:    1.32

+IPC:               1.28

+Block RThroughput: 23.7

+```

+

+And here is an ASM test example that can run in our unit tests to see how much the codegen improves.

+

+```asm

+%ifdef CONFIG

+{

+  "Mode": "32BIT"

+}

+%endif

+

+; Original

+; movzx   edx, byte [esi+ecx]

+; movzx   ecx, byte [esi+edi]

+; or      edx, 0xffff0000

+; shl     edx, 0x8

+; inc     esi

+; or      edx, ecx

+; mov     ecx, dword [ebp+0xc {arg5}]

+; or      dword [eax], edx

+; add     eax, 0x4

+; cmp     esi, ebx

+; jl      0x5cada0

+

+mov ebp, 0xe0000000

+

+; [ebp + 0xc] contains src1 offset

+mov dword [ebp + 0xc], 8

+

+; ebx contains loop iteration end offset

+lea ebx, .data

+add ebx, 1

+

+; esi contains the pointer to the data bases

+lea esi, .data

+

+; edi contains src1 offset

+mov edi, 0

+

+; ecx starts off with src1 offset

+mov ecx, dword [ebp + 0xc]

+

+; eax contains the dword destination and src that it accumulates to

+lea eax, .data_dst

+

+; Break the block here for easier viewing.

+jmp .loop_top

+

+; This loop is the hot loop in Sonic Mania.

+.loop_top:

+; Load src1

+movzx edx, byte [esi + ecx]

+; Load src1

+movzx ecx, byte [esi + edi]

+

+; set top two channels to 0xFFFF

+; Src1 in low bytes - 0xff'ff'00'<src1>

+or edx, 0xffff0000

+

+; shl edx by 8, shifting off the top 0xff for some reason

+; Src1 in low bytes - 0xff'ff'00'<src1> -> 0xff'00'<src1>'00

+shl edx, 0x8

+

+; increment byte base

+inc esi

+

+; or in Src1 in to low byte

+; 0xff'00'<src1>'<src2>

+or edx, ecx

+

+; Reload src1 offset since it was overwritten

+mov ecx, dword [ebp + 0xc]

+

+; or dword in to eax dest

+or dword [eax], edx

+

+; Increment eax offset

+add eax, 0x4

+

+

+; See if esi matches the end pointer

+cmp esi, ebx

+

+; Rerun loop if the counter isn't at the end yet.

+jl .loop_top

+

+; Exit

+hlt

+

+.data:

+

+dq 0, 0, 0, 0, 0, 0, 0, 0

+

+.data_dst:

+dq 0, 0, 0, 0, 0, 0, 0, 0

+```

+

+Optimizing the issues in this codegen will significantly improve FEX's codegen all over the codebase. So it will be a significant win everywhere. This is some low hanging fruit in our codegen, theoretically fixing most of this should be easy.
\ No newline at end of file