1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
AArch64 SME SMOPA (4-way) outer product instruction gives incorrect result
Description of problem:
The SME SMOPA (4-way) instruction ([spec](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SMOPA--4-way---Signed-integer-sum-of-outer-products-and-accumulate-?lang=en)) gives an incorrect result. The example below uses the 8-bit variant; it is equivalent to the following Python reference (128-bit VL), which is included to make the expected result clearer:
```
import numpy as np

# Reference model of the 8-bit SMOPA (4-way) test in foo.S: every 32-bit
# accumulator element receives the dot product of four consecutive bytes
# from each source vector.
vl = 128                        # vector length in bits
esize = 32                      # accumulator element size in bits
dim = vl // esize               # the tile is dim x dim elements (4 x 4 here)

A = range(vl // 8)              # bytes of the first source vector:  0, 1, 2, ...
B = range(16, 16 + vl // 8)     # bytes of the second source vector: 16, 17, ...
C = np.zeros((dim, dim), dtype=np.int32)

for row in range(dim):
    for col in range(dim):
        # 4-way: four byte products are summed into one 32-bit element.
        for k in range(4):
            C[row, col] += A[4*row + k] * B[4*col + k]

print(C)
[[ 110 134 158 182]
[ 390 478 566 654]
[ 670 822 974 1126]
[ 950 1166 1382 1598]]
```
main.c
```
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Side length of the square sub-matrix that foo() writes to dst. */
#define DIM 4

/*
 * Implemented in foo.S. Writes DIM * DIM 32-bit results to dst, one row of
 * DIM elements after another. The parameter type matches the int32_t buffer
 * that main() passes in.
 */
void foo(int32_t *dst);

int main(void) {
    int32_t dst[DIM * DIM];

    foo(dst);

    // This should print:
    // >>> 110 134 158 182
    // >>> 390 478 566 654
    // >>> 670 822 974 1126
    // >>> 950 1166 1382 1598
    for (int i = 0; i < DIM; ++i) {
        printf(">>> ");
        for (int j = 0; j < DIM; ++j) {
            /* PRId32 is the matching conversion for int32_t. */
            printf("%" PRId32 " ", dst[i * DIM + j]);
        }
        printf("\n");
    }
    return 0;
}
```
foo.S
```
//-----------------------------------------------------------------------
// foo -- run one 8-bit SMOPA (4-way) into ZA tile 0 and copy the top-left
//        4x4 block of 32-bit results to memory.
//
// Syntax: GNU/LLVM AArch64 assembler. ABI: AAPCS64.
// In:     x0 = destination buffer, 16 x 32-bit words (4 rows of 4 words)
// Out:    buffer filled row by row; no return value
// Clobb:  x0, w12, z0-z3, p0, p1, ZA
// Stack:  80 bytes (x29/x30 plus d8-d15)
//-----------------------------------------------------------------------
        .text
        .global foo
        .type   foo, %function
foo:
        stp     x29, x30, [sp, -80]!
        mov     x29, sp
        // d8-d15 are callee-saved, and switching streaming mode on/off
        // resets the vector registers, so spill them before smstart.
        stp     d8, d9, [sp, 16]
        stp     d10, d11, [sp, 32]
        stp     d12, d13, [sp, 48]
        stp     d14, d15, [sp, 64]

        smstart
        ptrue   p0.b                            // all byte lanes: governs the outer product
        ptrue   p1.s, vl4                       // exactly 4 words: governs tile reads and stores
        index   z0.b, #0, #1                    // z0.b = 0, 1, 2, ...
        mov     z1.d, z0.d
        add     z1.b, z1.b, #16                 // z1.b = 16, 17, 18, ...
        zero    {za}
        smopa   za0.s, p0/m, p0/m, z0.b, z1.b

        // Read the first 4x4 sub-matrix of elements from tile 0:
        mov     w12, #0
        mova    z0.s, p1/m, za0h.s[w12, #0]
        mova    z1.s, p1/m, za0h.s[w12, #1]
        mova    z2.s, p1/m, za0h.s[w12, #2]
        mova    z3.s, p1/m, za0h.s[w12, #3]

        // And store them to the input pointer (dst in the C code).
        // Each store writes only the 4 words selected by p1 (16 bytes), so
        // the fixed 16-byte row stride stays correct and the 64-byte buffer
        // is not overrun when the streaming vector length exceeds 128 bits.
        st1w    {z0.s}, p1, [x0]
        add     x0, x0, #16                     // next row: 4 words * 4 bytes
        st1w    {z1.s}, p1, [x0]
        add     x0, x0, #16
        st1w    {z2.s}, p1, [x0]
        add     x0, x0, #16
        st1w    {z3.s}, p1, [x0]
        smstop

        ldp     d8, d9, [sp, 16]
        ldp     d10, d11, [sp, 32]
        ldp     d12, d13, [sp, 48]
        ldp     d14, d15, [sp, 64]
        ldp     x29, x30, [sp], 80
        ret
```
Steps to reproduce:
```
$ clang -target aarch64-linux-gnu -march=armv9-a+sme main.c foo.S
$ ~/qemu/build/qemu-aarch64 -cpu max,sme128=on a.out
>>> 110 478 158 654
>>> 0 0 0 0
>>> 670 1166 974 1598
>>> 0 0 0 0
```
Additional information:
|