diff options
| author | Christian Krinitsin <mail@krinitsin.com> | 2025-07-17 09:10:43 +0200 |
|---|---|---|
| committer | Christian Krinitsin <mail@krinitsin.com> | 2025-07-17 09:10:43 +0200 |
| commit | f2ec263023649e596c5076df32c2d328bc9393d2 (patch) | |
| tree | 5dd86caab46e552bd2e62bf9c4fb1a7504a44db4 /results/scraper/fex/2681 | |
| parent | 63d2e9d409831aa8582787234cae4741847504b7 (diff) | |
| download | qemu-analysis-main.tar.gz qemu-analysis-main.zip | |
Diffstat (limited to 'results/scraper/fex/2681')
| -rw-r--r-- | results/scraper/fex/2681 | 216 |
1 files changed, 216 insertions, 0 deletions
diff --git a/results/scraper/fex/2681 b/results/scraper/fex/2681 new file mode 100644 index 000000000..5848d031b --- /dev/null +++ b/results/scraper/fex/2681 @@ -0,0 +1,216 @@ +Sonic Mania movie player very slow block. +Caught while profiling, the Sonic Mania movie player consumes 72% of CPU time in a **SINGLE** block. +This block is mostly just bad because of bad codegen. + + +Block ripped from their deobfuscated executable (32-bit x86). This block jumps to itself. +```asm +movzx edx, byte [esi+ecx] +movzx ecx, byte [esi+edi] +or edx, 0xffff0000 +shl edx, 0x8 +inc esi +or edx, ecx +mov ecx, dword [ebp+0xc {arg5}] +or dword [eax], edx +add eax, 0x4 +cmp esi, ebx +jl 0x5cada0 +``` + +This block is fairly simple, it's combining two bytestreams and a 32-bit stream in to one. Looks like some sort of RGBA combination where one channel is already stored in the destination stream and alpha is forced to 0xFF. +This has some absolutely abysmal codegen. +```asm +(gdb) disas 0x2a10ecfa0,+0x0000013c +Dump of assembler code from 0x2a10ecfa0 to 0x2a10ed0dc: + 0x00000002a10ecfa0: mov w20, w5 + 0x00000002a10ecfa4: mov w21, w10 + 0x00000002a10ecfa8: add w20, w20, w21 + 0x00000002a10ecfac: ldrb w20, [x20] + 0x00000002a10ecfb0: bfxil x6, x20, #0, #32 + 0x00000002a10ecfb4: mov w20, w11 + 0x00000002a10ecfb8: mov w21, w10 + 0x00000002a10ecfbc: add w20, w20, w21 + 0x00000002a10ecfc0: ldrb w20, [x20] + 0x00000002a10ecfc4: bfxil x5, x20, #0, #32 + 0x00000002a10ecfc8: mov w20, w6 + 0x00000002a10ecfcc: orr w20, w20, #0xffff0000 + 0x00000002a10ecfd0: bfxil x6, x20, #0, #32 + 0x00000002a10ecfd4: mov w20, w6 + 0x00000002a10ecfd8: lsl w20, w20, #8 + 0x00000002a10ecfdc: bfxil x6, x20, #0, #32 + 0x00000002a10ecfe0: mov w20, w10 + 0x00000002a10ecfe4: add w20, w20, #0x1 + 0x00000002a10ecfe8: bfxil x10, x20, #0, #32 + 0x00000002a10ecfec: mov w20, w5 + 0x00000002a10ecff0: mov w21, w6 + 0x00000002a10ecff4: orr w20, w21, w20 + 0x00000002a10ecff8: bfxil x6, x20, #0, #32 + 0x00000002a10ecffc: mov w20, w9 + 0x00000002a10ed000: add w20, w20, #0xc + 0x00000002a10ed004: ldr w20, [x20] + 0x00000002a10ed008: bfxil x5, x20, #0, #32 + 0x00000002a10ed00c: mov w20, w6 + 0x00000002a10ed010: mov w21, w4 + 0x00000002a10ed014: ldr w21, [x21] + 0x00000002a10ed018: orr w20, w21, w20 +=> 0x00000002a10ed01c: mov w21, w4 + 0x00000002a10ed020: str w20, [x21] + 0x00000002a10ed024: mov w20, w4 + 0x00000002a10ed028: add w20, w20, #0x4 + 0x00000002a10ed02c: bfxil x4, x20, #0, #32 + 0x00000002a10ed030: mov w20, w7 + 0x00000002a10ed034: mov w21, w10 + 0x00000002a10ed038: sub w22, w21, w20 + 0x00000002a10ed03c: eor w23, w21, w20 + 0x00000002a10ed040: eor w23, w23, w22 + 0x00000002a10ed044: ubfx x23, x23, #4, #1 + 0x00000002a10ed048: strb w23, [x28, #708] + 0x00000002a10ed04c: ubfx x23, x22, #31, #1 + 0x00000002a10ed050: strb w23, [x28, #711] + 0x00000002a10ed054: and x24, x22, #0xff + 0x00000002a10ed058: fmov d0, x24 + 0x00000002a10ed05c: cnt v0.8b, v0.8b + 0x00000002a10ed060: addv b0, v0.8b + 0x00000002a10ed064: umov w24, v0.b[0] + 0x00000002a10ed068: eor x24, x24, #0x1 + 0x00000002a10ed06c: ubfx x24, x24, #0, #1 + 0x00000002a10ed070: strb w24, [x28, #706] + 0x00000002a10ed074: cmp x22, #0x0 + 0x00000002a10ed078: cset x24, eq // eq = none + 0x00000002a10ed07c: strb w24, [x28, #710] + 0x00000002a10ed080: cmp w21, w20 + 0x00000002a10ed084: cset x24, cc // cc = lo, ul, last + 0x00000002a10ed088: strb w24, [x28, #704] + 0x00000002a10ed08c: eor w20, w21, w20 + 0x00000002a10ed090: eor w21, w22, w21 + 0x00000002a10ed094: and w20, w20, w21 + 0x00000002a10ed098: ubfx x20, x20, #31, #1 + 0x00000002a10ed09c: strb w20, [x28, #715] + 0x00000002a10ed0a0: cmp w23, w20 + 0x00000002a10ed0a4: b.ne 0x2a10ecfa0 // b.any + 0x00000002a10ed0a8: b 0x2a10ed0d4 + 0x00000002a10ed0ac: blr x0 + 0x00000002a10ed0b0: ldapurh w0, [x8, #-200] + 0x00000002a10ed0b4: udf #2 + 0x00000002a10ed0b8: .inst 0x005cadc0 ; undefined + 0x00000002a10ed0bc: udf #0 + 0x00000002a10ed0c0: udf #316 + 0x00000002a10ed0c4: udf #0 + 0x00000002a10ed0c8: .inst 0x005cada0 ; undefined + 0x00000002a10ed0cc: udf #0 + 0x00000002a10ed0d0: udf #312 + 0x00000002a10ed0d4: adr x0, 0x2a10ed0d0 + 0x00000002a10ed0d8: str x0, [x28, #184] + ``` + +llvm-mca is quite damning. +```bash +llvm-mca -mcpu=cortex-x1c -mattr=+rcpc-immo mania.txt +Iterations: 100 +Instructions: 6900 +Total Cycles: 5394 +Total uOps: 7100 + +Dispatch Width: 3 +uOps Per Cycle: 1.32 +IPC: 1.28 +Block RThroughput: 23.7 +``` + +And here is an ASM test example that can run in our unit tests to see how much the codegen improves. + +```asm +%ifdef CONFIG +{ + "Mode": "32BIT" +} +%endif + +; Original +; movzx edx, byte [esi+ecx] +; movzx ecx, byte [esi+edi] +; or edx, 0xffff0000 +; shl edx, 0x8 +; inc esi +; or edx, ecx +; mov ecx, dword [ebp+0xc {arg5}] +; or dword [eax], edx +; add eax, 0x4 +; cmp esi, ebx +; jl 0x5cada0 + +mov ebp, 0xe0000000 + +; [ebp + 0xc] contains src1 offset +mov dword [ebp + 0xc], 8 + +; ebx contains loop iteration end offset +lea ebx, .data +add ebx, 1 + +; esi contains the pointer to the data bases +lea esi, .data + +; edi contains src1 offset +mov edi, 0 + +; ecx starts off with src1 offset +mov ecx, dword [ebp + 0xc] + +; eax contains the dword destination and src that it accumulates to +lea eax, .data_dst + +; Break the block here for easier viewing. +jmp .loop_top + +; This loop is the hot loop in Sonic Mania. +.loop_top: +; Load src1 +movzx edx, byte [esi + ecx] +; Load src1 +movzx ecx, byte [esi + edi] + +; set top two channels to 0xFFFF +; Src1 in low bytes - 0xff'ff'00'<src1> +or edx, 0xffff0000 + +; shl edx by 8, shifting off the top 0xff for some reason +; Src1 in low bytes - 0xff'ff'00'<src1> -> 0xff'00'<src1>'00 +shl edx, 0x8 + +; increment byte base +inc esi + +; or in Src1 in to low byte +; 0xff'00'<src1>'<src2> +or edx, ecx + +; Reload src1 offset since it was overwritten +mov ecx, dword [ebp + 0xc] + +; or dword in to eax dest +or dword [eax], edx + +; Increment eax offset +add eax, 0x4 + + +; See if esi matches the end pointer +cmp esi, ebx + +; Rerun loop if the counter isn't at the end yet. +jl .loop_top + +; Exit +hlt + +.data: + +dq 0, 0, 0, 0, 0, 0, 0, 0 + +.data_dst: +dq 0, 0, 0, 0, 0, 0, 0, 0 +``` + +Optimizing the issues in this codegen will significantly improve FEX's codegen all over the codebase. So it will be a significant win everywhere. This is some low hanging fruit in our codegen, theoretically fixing most of this should be easy. \ No newline at end of file |