summary refs log tree commit diff stats
path: root/results/classifier/accel-gemma3:12b/tcg/2083
diff options
context:
space:
mode:
author Christian Krinitsin <mail@krinitsin.com> 2025-07-03 16:27:09 +0000
committer Christian Krinitsin <mail@krinitsin.com> 2025-07-03 16:27:09 +0000
commit 4d9e26c0333abd39bdbd039dcdb30ed429c475ba (patch)
tree 4010d5fb3e8bc48c110a2c1ff2a16b8648cb86bb /results/classifier/accel-gemma3:12b/tcg/2083
parent 5541099586dbd6018574cb44e1934907c121526f (diff)
download emulator-bug-study-4d9e26c0333abd39bdbd039dcdb30ed429c475ba.tar.gz
emulator-bug-study-4d9e26c0333abd39bdbd039dcdb30ed429c475ba.zip
add gemma accelerator classification results
Diffstat (limited to 'results/classifier/accel-gemma3:12b/tcg/2083')
-rw-r--r-- results/classifier/accel-gemma3:12b/tcg/2083 112
1 files changed, 112 insertions, 0 deletions
diff --git a/results/classifier/accel-gemma3:12b/tcg/2083 b/results/classifier/accel-gemma3:12b/tcg/2083
new file mode 100644
index 00000000..091fb976
--- /dev/null
+++ b/results/classifier/accel-gemma3:12b/tcg/2083
@@ -0,0 +1,112 @@
+
+AArch64 SME SMOPA (4-way) outer product instruction gives incorrect result
+Description of problem:
+The SME SMOPA (4-way) instruction ([spec](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SMOPA--4-way---Signed-integer-sum-of-outer-products-and-accumulate-?lang=en)) gives an incorrect result. The example below uses the 8-bit variant and is equivalent to the following Python example (128-bit VL), shown to make the expected behavior clearer:
+
+```
+import numpy as np
+vl = 128
+esize = 32
+dim = vl // esize
+
+A = range(16)
+B = range(16, 32)
+C = np.zeros((4, 4,), dtype=np.int32)
+
+for row in range(dim):
+    for col in range(dim):
+        for k in range(4):
+            C[row, col] += A[4*row + k] * B[4*col + k]
+
+print(C)
+
+[[ 110 134 158 182]
+ [ 390 478 566 654]
+ [ 670 822 974 1126]
+ [ 950 1166 1382 1598]]
+```
+
+main.c
+```
+#include <stdio.h>
+#include <stdint.h>
+
+void foo(int *dst);
+
+int main() {
+ int32_t dst[16];
+ foo(dst);
+
+ // This should print:
+ // >>> 110 134 158 182
+ // >>> 390 478 566 654
+ // >>> 670 822 974 1126
+ // >>> 950 1166 1382 1598
+ for (int i=0; i<4; ++i) {
+ printf(">>> ");
+ for (int j=0; j<4; ++j) {
+ printf("%d ", dst[i * 4 + j]);
+ }
+ printf("\n");
+ }
+}
+```
+
+foo.S
+
+```
+.global foo
+foo:
+ stp x29, x30, [sp, -80]!
+ mov x29, sp
+ stp d8, d9, [sp, 16]
+ stp d10, d11, [sp, 32]
+ stp d12, d13, [sp, 48]
+ stp d14, d15, [sp, 64]
+
+ smstart
+
+ ptrue p0.b
+ index z0.b, #0, #1
+ mov z1.d, z0.d
+ add z1.b, z1.b, #16
+
+ zero {za}
+ smopa za0.s, p0/m, p0/m, z0.b, z1.b
+
+ // Read the first 4x4 sub-matrix of elements from tile 0:
+ mov w12, #0
+ mova z0.s, p0/m, za0h.s[w12, #0]
+ mova z1.s, p0/m, za0h.s[w12, #1]
+ mova z2.s, p0/m, za0h.s[w12, #2]
+ mova z3.s, p0/m, za0h.s[w12, #3]
+
+ // And store them to the input pointer (dst in the C code):
+ st1w {z0.s}, p0, [x0]
+ add x0, x0, #16
+ st1w {z1.s}, p0, [x0]
+ add x0, x0, #16
+ st1w {z2.s}, p0, [x0]
+ add x0, x0, #16
+ st1w {z3.s}, p0, [x0]
+
+ smstop
+
+ ldp d8, d9, [sp, 16]
+ ldp d10, d11, [sp, 32]
+ ldp d12, d13, [sp, 48]
+ ldp d14, d15, [sp, 64]
+ ldp x29, x30, [sp], 80
+ ret
+```
+Steps to reproduce:
+```
+$ clang -target aarch64-linux-gnu -march=armv9-a+sme main.c foo.S
+$ ~/qemu/build/qemu-aarch64 -cpu max,sme128=on a.out
+>>> 110 478 158 654
+>>> 0 0 0 0
+>>> 670 1166 974 1598
+>>> 0 0 0 0
+```
+Additional information:
+