summary refs log tree commit diff stats
path: root/results/classifier/zero-shot-user-mode/output/runtime/2083
diff options
context:
space:
mode:
Diffstat (limited to 'results/classifier/zero-shot-user-mode/output/runtime/2083')
-rw-r--r--results/classifier/zero-shot-user-mode/output/runtime/2083117
1 files changed, 117 insertions, 0 deletions
diff --git a/results/classifier/zero-shot-user-mode/output/runtime/2083 b/results/classifier/zero-shot-user-mode/output/runtime/2083
new file mode 100644
index 000000000..68088b0ba
--- /dev/null
+++ b/results/classifier/zero-shot-user-mode/output/runtime/2083
@@ -0,0 +1,117 @@
+runtime: 0.389
+instruction: 0.382
+syscall: 0.229
+
+
+
+AArch64 SME SMOPA (4-way) outer product instruction gives incorrect result
+Description of problem:
+The SME SMOPA (4-way) instruction ([spec](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SMOPA--4-way---Signed-integer-sum-of-outer-products-and-accumulate-?lang=en)) is giving incorrect result. Example below for 8-bit variant, which is equivalent to following Python example (128-bit VL) to make it clearer:
+
+```
+import numpy as np
+vl = 128
+esize = 32
+dim = vl // esize
+
+A = range(16)
+B = range(16, 32)
+C = np.zeros((4, 4,), dtype=np.int32)
+
+for row in range(dim):
+    for col in range(dim):
+        for k in range(4):
+            C[row, col] += A[4*row + k] * B[4*col + k]
+
+print(C)
+
+[[ 110  134  158  182]
+ [ 390  478  566  654]
+ [ 670  822  974 1126]
+ [ 950 1166 1382 1598]]
+```
+
+main.c
+```
+#include <stdio.h>
+#include <stdint.h>
+
+void foo(int *dst);
+
+int main() {
+  int32_t dst[16];
+  foo(dst);
+
+  // This should print:
+  // >>> 110  134  158  182
+  // >>> 390  478  566  654
+  // >>> 670  822  974  1126
+  // >>> 950  1166  1382  1598
+  for (int i=0; i<4; ++i) {
+    printf(">>> ");
+    for (int j=0; j<4; ++j) {
+      printf("%d  ", dst[i * 4 + j]);
+    }
+    printf("\n");
+  }
+}
+```
+
+foo.S
+
+```
+.global foo
+foo:
+  stp x29, x30, [sp, -80]!
+  mov x29, sp
+  stp d8, d9, [sp, 16]
+  stp d10, d11, [sp, 32]
+  stp d12, d13, [sp, 48]
+  stp d14, d15, [sp, 64]
+
+  smstart
+
+  ptrue p0.b
+  index z0.b, #0, #1
+  mov   z1.d, z0.d
+  add   z1.b, z1.b, #16
+
+  zero  {za}
+  smopa za0.s, p0/m, p0/m, z0.b, z1.b
+
+  // Read the first 4x4 sub-matrix of elements from tile 0:
+  mov w12, #0
+  mova z0.s, p0/m, za0h.s[w12, #0]
+  mova z1.s, p0/m, za0h.s[w12, #1]
+  mova z2.s, p0/m, za0h.s[w12, #2]
+  mova z3.s, p0/m, za0h.s[w12, #3]
+
+  // And store them to the input pointer (dst in the C code):
+  st1w {z0.s}, p0, [x0]
+  add x0, x0, #16
+  st1w {z1.s}, p0, [x0]
+  add x0, x0, #16
+  st1w {z2.s}, p0, [x0]
+  add x0, x0, #16
+  st1w {z3.s}, p0, [x0]
+
+  smstop
+
+  ldp d8, d9, [sp, 16]
+  ldp d10, d11, [sp, 32]
+  ldp d12, d13, [sp, 48]
+  ldp d14, d15, [sp, 64]
+  ldp x29, x30, [sp], 80
+  ret
+```
+Steps to reproduce:
+```
+$ clang -target aarch64-linux-gnu -march=armv9-a+sme main.c foo.S
+$ ~/qemu/build/qemu-aarch64 -cpu max,sme128=on a.out
+>>> 110  478  158  654
+>>> 0  0  0  0
+>>> 670  1166  974  1598
+>>> 0  0  0  0
+```
+Additional information:
+