diff options
Diffstat (limited to 'gitlab/issues_text/target_arm/host_missing/accel_TCG/2083')
| -rw-r--r-- | gitlab/issues_text/target_arm/host_missing/accel_TCG/2083 | 111 |
1 files changed, 0 insertions, 111 deletions
diff --git a/gitlab/issues_text/target_arm/host_missing/accel_TCG/2083 b/gitlab/issues_text/target_arm/host_missing/accel_TCG/2083 deleted file mode 100644 index 0e6cc43a2..000000000 --- a/gitlab/issues_text/target_arm/host_missing/accel_TCG/2083 +++ /dev/null @@ -1,111 +0,0 @@ -AArch64 SME SMOPA (4-way) outer product instruction gives incorrect result -Description of problem: -The SME SMOPA (4-way) instruction ([spec](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SMOPA--4-way---Signed-integer-sum-of-outer-products-and-accumulate-?lang=en)) is giving an incorrect result. The example below is for the 8-bit variant, which is equivalent to the following Python example (128-bit VL), shown to make it clearer: - -``` -import numpy as np -vl = 128 -esize = 32 -dim = vl // esize - -A = range(16) -B = range(16, 32) -C = np.zeros((4, 4,), dtype=np.int32) - -for row in range(dim): - for col in range(dim): - for k in range(4): - C[row, col] += A[4*row + k] * B[4*col + k] - -print(C) - -[[ 110 134 158 182] - [ 390 478 566 654] - [ 670 822 974 1126] - [ 950 1166 1382 1598]] -``` - -main.c -``` -#include <stdio.h> -#include <stdint.h> - -void foo(int *dst); - -int main() { - int32_t dst[16]; - foo(dst); - - // This should print: - // >>> 110 134 158 182 - // >>> 390 478 566 654 - // >>> 670 822 974 1126 - // >>> 950 1166 1382 1598 - for (int i=0; i<4; ++i) { - printf(">>> "); - for (int j=0; j<4; ++j) { - printf("%d ", dst[i * 4 + j]); - } - printf("\n"); - } -} -``` - -foo.S - -``` -.global foo -foo: - stp x29, x30, [sp, -80]! 
- mov x29, sp - stp d8, d9, [sp, 16] - stp d10, d11, [sp, 32] - stp d12, d13, [sp, 48] - stp d14, d15, [sp, 64] - - smstart - - ptrue p0.b - index z0.b, #0, #1 - mov z1.d, z0.d - add z1.b, z1.b, #16 - - zero {za} - smopa za0.s, p0/m, p0/m, z0.b, z1.b - - // Read the first 4x4 sub-matrix of elements from tile 0: - mov w12, #0 - mova z0.s, p0/m, za0h.s[w12, #0] - mova z1.s, p0/m, za0h.s[w12, #1] - mova z2.s, p0/m, za0h.s[w12, #2] - mova z3.s, p0/m, za0h.s[w12, #3] - - // And store them to the input pointer (dst in the C code): - st1w {z0.s}, p0, [x0] - add x0, x0, #16 - st1w {z1.s}, p0, [x0] - add x0, x0, #16 - st1w {z2.s}, p0, [x0] - add x0, x0, #16 - st1w {z3.s}, p0, [x0] - - smstop - - ldp d8, d9, [sp, 16] - ldp d10, d11, [sp, 32] - ldp d12, d13, [sp, 48] - ldp d14, d15, [sp, 64] - ldp x29, x30, [sp], 80 - ret -``` -Steps to reproduce: -``` -$ clang -target aarch64-linux-gnu -march=armv9-a+sme main.c foo.S -$ ~/qemu/build/qemu-aarch64 -cpu max,sme128=on a.out ->>> 110 478 158 654 ->>> 0 0 0 0 ->>> 670 1166 974 1598 ->>> 0 0 0 0 -``` -Additional information: - |