diff options
Diffstat (limited to 'results/classifier/118/none/2083')
| -rw-r--r-- | results/classifier/118/none/2083 | 141 |
1 files changed, 141 insertions, 0 deletions
diff --git a/results/classifier/118/none/2083 b/results/classifier/118/none/2083 new file mode 100644 index 00000000..7643df53 --- /dev/null +++ b/results/classifier/118/none/2083 @@ -0,0 +1,141 @@ +user-level: 0.780 +performance: 0.747 +graphic: 0.709 +permissions: 0.689 +architecture: 0.668 +device: 0.649 +arm: 0.642 +PID: 0.624 +semantic: 0.614 +debug: 0.596 +register: 0.590 +socket: 0.582 +hypervisor: 0.564 +virtual: 0.543 +peripherals: 0.534 +risc-v: 0.533 +assembly: 0.527 +mistranslation: 0.525 +TCG: 0.524 +ppc: 0.520 +i386: 0.519 +VMM: 0.490 +files: 0.470 +vnc: 0.458 +network: 0.415 +kernel: 0.387 +boot: 0.349 +KVM: 0.346 +x86: 0.312 + +AArch64 SME SMOPA (4-way) outer product instruction gives incorrect result +Description of problem: +The SME SMOPA (4-way) instruction ([spec](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SMOPA--4-way---Signed-integer-sum-of-outer-products-and-accumulate-?lang=en)) is giving incorrect result. Example below for 8-bit variant, which is equivalent to following Python example (128-bit VL) to make it clearer: + +``` +import numpy as np +vl = 128 +esize = 32 +dim = vl // esize + +A = range(16) +B = range(16, 32) +C = np.zeros((4, 4,), dtype=np.int32) + +for row in range(dim): + for col in range(dim): + for k in range(4): + C[row, col] += A[4*row + k] * B[4*col + k] + +print(C) + +[[ 110 134 158 182] + [ 390 478 566 654] + [ 670 822 974 1126] + [ 950 1166 1382 1598]] +``` + +main.c +``` +#include <stdio.h> +#include <stdint.h> + +void foo(int *dst); + +int main() { + int32_t dst[16]; + foo(dst); + + // This should print: + // >>> 110 134 158 182 + // >>> 390 478 566 654 + // >>> 670 822 974 1126 + // >>> 950 1166 1382 1598 + for (int i=0; i<4; ++i) { + printf(">>> "); + for (int j=0; j<4; ++j) { + printf("%d ", dst[i * 4 + j]); + } + printf("\n"); + } +} +``` + +foo.S + +``` +.global foo +foo: + stp x29, x30, [sp, -80]! + mov x29, sp + stp d8, d9, [sp, 16] + stp d10, d11, [sp, 32] + stp d12, d13, [sp, 48] + stp d14, d15, [sp, 64] + + smstart + + ptrue p0.b + index z0.b, #0, #1 + mov z1.d, z0.d + add z1.b, z1.b, #16 + + zero {za} + smopa za0.s, p0/m, p0/m, z0.b, z1.b + + // Read the first 4x4 sub-matrix of elements from tile 0: + mov w12, #0 + mova z0.s, p0/m, za0h.s[w12, #0] + mova z1.s, p0/m, za0h.s[w12, #1] + mova z2.s, p0/m, za0h.s[w12, #2] + mova z3.s, p0/m, za0h.s[w12, #3] + + // And store them to the input pointer (dst in the C code): + st1w {z0.s}, p0, [x0] + add x0, x0, #16 + st1w {z1.s}, p0, [x0] + add x0, x0, #16 + st1w {z2.s}, p0, [x0] + add x0, x0, #16 + st1w {z3.s}, p0, [x0] + + smstop + + ldp d8, d9, [sp, 16] + ldp d10, d11, [sp, 32] + ldp d12, d13, [sp, 48] + ldp d14, d15, [sp, 64] + ldp x29, x30, [sp], 80 + ret +``` +Steps to reproduce: +``` +$ clang -target aarch64-linux-gnu -march=armv9-a+sme main.c foo.S +$ ~/qemu/build/qemu-aarch64 -cpu max,sme128=on a.out +>>> 110 478 158 654 +>>> 0 0 0 0 +>>> 670 1166 974 1598 +>>> 0 0 0 0 +``` +Additional information: + |