diff options
| author | Christian Krinitsin <mail@krinitsin.com> | 2025-07-06 16:43:19 +0000 |
|---|---|---|
| committer | Christian Krinitsin <mail@krinitsin.com> | 2025-07-06 16:43:19 +0000 |
| commit | 238ec2b7cc1557d6f34c33cc482e4d0cd3e266dd (patch) | |
| tree | cd8a1b75ba7b3543eb7fe6857f408e7be4d9fd0b /results/classifier/gemma3:27b/instruction/1620 | |
| parent | 96049c939b1916d80532630d63c14e04d5244f1d (diff) | |
| download | emulator-bug-study-238ec2b7cc1557d6f34c33cc482e4d0cd3e266dd.tar.gz emulator-bug-study-238ec2b7cc1557d6f34c33cc482e4d0cd3e266dd.zip | |
add results
Diffstat (limited to 'results/classifier/gemma3:27b/instruction/1620')
| -rw-r--r-- | results/classifier/gemma3:27b/instruction/1620 | 97 |
1 files changed, 97 insertions, 0 deletions
diff --git a/results/classifier/gemma3:27b/instruction/1620 b/results/classifier/gemma3:27b/instruction/1620 new file mode 100644 index 00000000..e286fd22 --- /dev/null +++ b/results/classifier/gemma3:27b/instruction/1620 @@ -0,0 +1,97 @@ + + + +SME FMOPA outer product instruction gives incorrect result +Description of problem: +The SME outer product instructions operate on tiles of elements. In the +below example we are performing an outer product of a vector of 1.0 +by itself. This naturally should produce a matrix filled with 1.0 +values, however if we read the values of the tile and printf them we +instead observe 0.0 values. + +Without digging into the underlying QEMU code this appears to be a bug +in how elements are set based on the tile number, since the same code +using za0.s rather than za1.s correctly reports all 1.0 values as output +as expected. + +main.c +``` +#include <stdio.h> + +void foo(float *dst); + +int main() { + float dst[16]; + foo(dst); + + // This should print: + // >>> 1.000000 1.000000 1.000000 1.000000 + // >>> 1.000000 1.000000 1.000000 1.000000 + // >>> 1.000000 1.000000 1.000000 1.000000 + // >>> 1.000000 1.000000 1.000000 1.000000 + for (int i=0; i<4; ++i) { + printf(">>> "); + for (int j=0; j<4; ++j) { + printf("%lf ", (double)dst[i * 4 + j]); + } + printf("\n"); + } +} +``` + +foo.S +``` +.global foo +foo: + stp x29, x30, [sp, -80]! + mov x29, sp + stp d8, d9, [sp, 16] + stp d10, d11, [sp, 32] + stp d12, d13, [sp, 48] + stp d14, d15, [sp, 64] + + smstart + + ptrue p0.s, vl4 + fmov z0.s, #1.0 + + // An outer product of a vector of 1.0 by itself should be a matrix of 1.0. + // Note that we are using tile 1 here (za1.s) rather than tile 0. + zero {za} + fmopa za1.s, p0/m, p0/m, z0.s, z0.s + + // Read the first 4x4 sub-matrix of elements from tile 1: + // Note that za1h should be interchangable here. + mov w12, #0 + mova z0.s, p0/m, za1v.s[w12, #0] + mova z1.s, p0/m, za1v.s[w12, #1] + mova z2.s, p0/m, za1v.s[w12, #2] + mova z3.s, p0/m, za1v.s[w12, #3] + + // And store them to the input pointer (dst in the C code): + st1w {z0.s}, p0, [x0] + add x0, x0, #16 + st1w {z1.s}, p0, [x0] + add x0, x0, #16 + st1w {z2.s}, p0, [x0] + add x0, x0, #16 + st1w {z3.s}, p0, [x0] + + smstop + + ldp d8, d9, [sp, 16] + ldp d10, d11, [sp, 32] + ldp d12, d13, [sp, 48] + ldp d14, d15, [sp, 64] + ldp x29, x30, [sp], 80 + ret +``` +Steps to reproduce: +``` +$ clang -target aarch64-linux-gnu -march=armv9-a+sme test.c -O1 -static +$ ~/qemu/build/qemu-aarch64 ./a.out +>>> 0.000000 0.000000 0.000000 0.000000 +>>> 0.000000 0.000000 0.000000 0.000000 +>>> 0.000000 0.000000 0.000000 0.000000 +>>> 0.000000 0.000000 0.000000 0.000000 +``` |