add gemma accelerator classification results

author: Christian Krinitsin <mail@krinitsin.com> 2025-07-03 16:27:09 +0000
committer: Christian Krinitsin <mail@krinitsin.com> 2025-07-03 16:27:09 +0000
commit: 4d9e26c0333abd39bdbd039dcdb30ed429c475ba (patch)
tree: 4010d5fb3e8bc48c110a2c1ff2a16b8648cb86bb /results/classifier/accel-gemma3:12b/tcg/2083
parent: 5541099586dbd6018574cb44e1934907c121526f (diff)
download: emulator-bug-study-4d9e26c0333abd39bdbd039dcdb30ed429c475ba.tar.gz
emulator-bug-study-4d9e26c0333abd39bdbd039dcdb30ed429c475ba.zip
1 files changed, 112 insertions, 0 deletions
diff --git a/results/classifier/accel-gemma3:12b/tcg/2083 b/results/classifier/accel-gemma3:12b/tcg/2083
new file mode 100644
index 00000000..091fb976
--- /dev/null
+++ b/results/classifier/accel-gemma3:12b/tcg/2083
@@ -0,0 +1,112 @@
+
+AArch64 SME SMOPA (4-way) outer product instruction gives incorrect result
+Description of problem:
+The SME SMOPA (4-way) instruction ([spec](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SMOPA--4-way---Signed-integer-sum-of-outer-products-and-accumulate-?lang=en)) is giving incorrect result. Example below for 8-bit variant, which is equivalent to following Python example (128-bit VL) to make it clearer:
+
+```
+import numpy as np
+vl = 128
+esize = 32
+dim = vl // esize
+
+A = range(16)
+B = range(16, 32)
+C = np.zeros((4, 4,), dtype=np.int32)
+
+for row in range(dim):
+    for col in range(dim):
+        for k in range(4):
+            C[row, col] += A[4*row + k] * B[4*col + k]
+
+print(C)
+
+[[ 110  134  158  182]
+ [ 390  478  566  654]
+ [ 670  822  974 1126]
+ [ 950 1166 1382 1598]]
+```
+
+main.c
+```
+#include <stdio.h>
+#include <stdint.h>
+
+void foo(int *dst);
+
+int main() {
+  int32_t dst[16];
+  foo(dst);
+
+  // This should print:
+  // >>> 110  134  158  182
+  // >>> 390  478  566  654
+  // >>> 670  822  974  1126
+  // >>> 950  1166  1382  1598
+  for (int i=0; i<4; ++i) {
+    printf(">>> ");
+    for (int j=0; j<4; ++j) {
+      printf("%d  ", dst[i * 4 + j]);
+    }
+    printf("\n");
+  }
+}
+```
+
+foo.S
+
+```
+.global foo
+foo:
+  stp x29, x30, [sp, -80]!
+  mov x29, sp
+  stp d8, d9, [sp, 16]
+  stp d10, d11, [sp, 32]
+  stp d12, d13, [sp, 48]
+  stp d14, d15, [sp, 64]
+
+  smstart
+
+  ptrue p0.b
+  index z0.b, #0, #1
+  mov   z1.d, z0.d
+  add   z1.b, z1.b, #16
+
+  zero  {za}
+  smopa za0.s, p0/m, p0/m, z0.b, z1.b
+
+  // Read the first 4x4 sub-matrix of elements from tile 0:
+  mov w12, #0
+  mova z0.s, p0/m, za0h.s[w12, #0]
+  mova z1.s, p0/m, za0h.s[w12, #1]
+  mova z2.s, p0/m, za0h.s[w12, #2]
+  mova z3.s, p0/m, za0h.s[w12, #3]
+
+  // And store them to the input pointer (dst in the C code):
+  st1w {z0.s}, p0, [x0]
+  add x0, x0, #16
+  st1w {z1.s}, p0, [x0]
+  add x0, x0, #16
+  st1w {z2.s}, p0, [x0]
+  add x0, x0, #16
+  st1w {z3.s}, p0, [x0]
+
+  smstop
+
+  ldp d8, d9, [sp, 16]
+  ldp d10, d11, [sp, 32]
+  ldp d12, d13, [sp, 48]
+  ldp d14, d15, [sp, 64]
+  ldp x29, x30, [sp], 80
+  ret
+```
+Steps to reproduce:
+```
+$ clang -target aarch64-linux-gnu -march=armv9-a+sme main.c foo.S
+$ ~/qemu/build/qemu-aarch64 -cpu max,sme128=on a.out
+>>> 110  478  158  654
+>>> 0  0  0  0
+>>> 670  1166  974  1598
+>>> 0  0  0  0
+```
+Additional information:
+
author	Christian Krinitsin <mail@krinitsin.com>	2025-07-03 16:27:09 +0000
committer	Christian Krinitsin <mail@krinitsin.com>	2025-07-03 16:27:09 +0000
commit	4d9e26c0333abd39bdbd039dcdb30ed429c475ba (patch)
tree	4010d5fb3e8bc48c110a2c1ff2a16b8648cb86bb /results/classifier/accel-gemma3:12b/tcg/2083
parent	5541099586dbd6018574cb44e1934907c121526f (diff)
download	emulator-bug-study-4d9e26c0333abd39bdbd039dcdb30ed429c475ba.tar.gz emulator-bug-study-4d9e26c0333abd39bdbd039dcdb30ed429c475ba.zip