summary refs log tree commit diff stats
path: root/results/scraper/fex/3364
diff options
context:
space:
mode:
Diffstat (limited to 'results/scraper/fex/3364')
-rw-r--r--results/scraper/fex/336430
1 files changed, 30 insertions, 0 deletions
diff --git a/results/scraper/fex/3364 b/results/scraper/fex/3364
new file mode 100644
index 000000000..c1678a692
--- /dev/null
+++ b/results/scraper/fex/3364
@@ -0,0 +1,30 @@
+Using STNT1B to implement MOVNTDQ
+As the title says. ASIMD only has STNP which doesn't match semantics. Using STNT1B to match semantics.

+Although execution latency is not great on A715 for this instruction. (It gets a bit better on Neoverse-V2).

+

+Found with a hot-loop in d3dcore.dll in Proton doing a non-temporal memcpy. Consuming about 1.5% CPU time.

+``` 

+1caf3a020  movdqa  xmm4, xmmword [r10+rax]

+1caf3a026  movntdq xmmword [rcx+rax], xmm4

+1caf3a02b  add     rax, 0x10

+1caf3a02f  cmp     rdx, rax

+1caf3a032  ja      0x1caf3a020

+```

+rdx is 0x1500 in the hot loop that I found

+

+```

+  0x00007ffac21cc384:  adr     x0, 0x7ffac21cc380

+   0x00007ffac21cc388:  str     x0, [x28, #184]

+   0x00007ffac21cc38c:  ldr     q20, [x4, x14, sxtx]

+   0x00007ffac21cc390:  str     q20, [x4, x5, sxtx]

+=> 0x00007ffac21cc394:  add     x4, x4, #0x10

+   0x00007ffac21cc398:  sub     x26, x6, x4

+   0x00007ffac21cc39c:  eor     x27, x6, x4

+   0x00007ffac21cc3a0:  cmp     x6, x4

+   0x00007ffac21cc3a4:  cfinv

+   0x00007ffac21cc3a8:  cset    x20, cc  // cc = lo, ul, last

+   0x00007ffac21cc3ac:  csel    x20, x20, xzr, ne  // ne = any

+   0x00007ffac21cc3b0:  cbnz    x20, 0x7ffac21cc38c

+   0x00007ffac21cc3b4:  b       0x7ffac21cc3f0

+   0x00007ffac21cc3b8:  blr     x0

+```
\ No newline at end of file