summary refs log tree commit diff stats
path: root/results/scraper/fex/2707
diff options
context:
space:
mode:
authorChristian Krinitsin <mail@krinitsin.com>2025-07-17 09:10:43 +0200
committerChristian Krinitsin <mail@krinitsin.com>2025-07-17 09:10:43 +0200
commitf2ec263023649e596c5076df32c2d328bc9393d2 (patch)
tree5dd86caab46e552bd2e62bf9c4fb1a7504a44db4 /results/scraper/fex/2707
parent63d2e9d409831aa8582787234cae4741847504b7 (diff)
downloadqemu-analysis-main.tar.gz
qemu-analysis-main.zip
add downloaded fex bug-reports HEAD main
Diffstat (limited to 'results/scraper/fex/2707')
-rw-r--r--results/scraper/fex/270753
1 files changed, 53 insertions, 0 deletions
diff --git a/results/scraper/fex/2707 b/results/scraper/fex/2707
new file mode 100644
index 000000000..831d3300d
--- /dev/null
+++ b/results/scraper/fex/2707
@@ -0,0 +1,53 @@
+Optimize PF calculation
+The PF calculation is very spicy and isn't generating optimal codegen.

+

+Current codegen:

+```asm

+   0x00000002a10ed054:  and     x24, x22, #0xff

+   0x00000002a10ed058:  fmov    d0, x24

+   0x00000002a10ed05c:  cnt     v0.8b, v0.8b

+   0x00000002a10ed060:  addv    b0, v0.8b

+   0x00000002a10ed064:  umov    w24, v0.b[0]

+   0x00000002a10ed068:  eor     x24, x24, #0x1

+   0x00000002a10ed06c:  ubfx    x24, x24, #0, #1

+   0x00000002a10ed070:  strb    w24, [x28, #706]

+   ```

+What it can be:

+```asm

+   0x00000002a10ed064:  movi    v1.8b, #1

+   0x00000002a10ed058:  fmov    s0, w22

+   0x00000002a10ed05c:  cnt     v0.8b, v0.8b

+   0x00000002a10ed068:  bic     v0.8b, v1.8b, v0.8b

+   0x00000002a10ed070:  str     b0, [x28, #706]

+   ```

+   

+Not a significant reduction in instructions, but removes a FPR->GPR move which is quite an improvement.

+

+For CPUs with CSSC extension this can be even more optimal but even the upcoming X4/A720/A520 doesn't support them.

+

+llvm-mca results:

+Before:

+```

+Iterations:        100

+Instructions:      800

+Total Cycles:      315

+Total uOps:        1000

+

+Dispatch Width:    10

+uOps Per Cycle:    3.17

+IPC:               2.54

+Block RThroughput: 3.0

+```

+

+After:

+```

+Iterations:        100

+Instructions:      500

+Total Cycles:      408

+Total uOps:        600

+

+Dispatch Width:    10

+uOps Per Cycle:    1.47

+IPC:               1.23

+Block RThroughput: 3.0

+```
\ No newline at end of file