1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
user-level: 0.780
performance: 0.747
graphic: 0.709
permissions: 0.689
architecture: 0.668
device: 0.649
arm: 0.642
PID: 0.624
semantic: 0.614
debug: 0.596
register: 0.590
socket: 0.582
hypervisor: 0.564
virtual: 0.543
peripherals: 0.534
risc-v: 0.533
assembly: 0.527
mistranslation: 0.525
TCG: 0.524
ppc: 0.520
i386: 0.519
VMM: 0.490
files: 0.470
vnc: 0.458
network: 0.415
kernel: 0.387
boot: 0.349
KVM: 0.346
x86: 0.312
AArch64 SME SMOPA (4-way) outer product instruction gives incorrect result
Description of problem:
The SME SMOPA (4-way) instruction ([spec](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SMOPA--4-way---Signed-integer-sum-of-outer-products-and-accumulate-?lang=en)) gives an incorrect result. The example below uses the 8-bit variant; to make the expected behaviour clearer, it is equivalent to the following Python example (128-bit VL):
```
# Reference model of the 8-bit SMOPA (4-way) outer product for a 128-bit
# vector length: every 32-bit accumulator element C[row, col] receives the sum
# of four products of consecutive 8-bit elements taken from A and B.
import numpy as np

vl = 128            # vector length in bits
esize = 32          # accumulator element size in bits
dim = vl // esize   # the accumulator tile is dim x dim

A = range(16)       # first source vector:  bytes 0..15
B = range(16, 32)   # second source vector: bytes 16..31

C = np.zeros((dim, dim), dtype=np.int32)
for row in range(dim):
    for col in range(dim):
        # "4-way": four 8-bit products are summed into one 32-bit element.
        for k in range(4):
            C[row, col] += A[4*row + k] * B[4*col + k]
print(C)
[[ 110 134 158 182]
[ 390 478 566 654]
[ 670 822 974 1126]
[ 950 1166 1382 1598]]
```
main.c
```
#include <stdio.h>
#include <stdint.h>
void foo(int *dst);
int main() {
    /* Side length of the square result matrix printed below. */
    enum { DIM = 4 };
    int32_t dst[DIM * DIM];

    /* foo() is expected to fill all DIM * DIM entries of dst. */
    foo(dst);

    // Expected output:
    // >>> 110 134 158 182
    // >>> 390 478 566 654
    // >>> 670 822 974 1126
    // >>> 950 1166 1382 1598
    for (int idx = 0; idx < DIM * DIM; ++idx) {
        /* Start of a row: emit the line prefix. */
        if (idx % DIM == 0) {
            printf(">>> ");
        }
        printf("%d ", dst[idx]);
        /* End of a row: terminate the line. */
        if (idx % DIM == DIM - 1) {
            printf("\n");
        }
    }
}
```
foo.S
```
// ---------------------------------------------------------------------------
// void foo(int *dst)
//
// Reproducer for the 8-bit SMOPA (4-way) instruction.
// Builds two byte vectors (z0.b = 0, 1, 2, ... and z1.b = z0.b + 16),
// accumulates their 4-way outer product into the zeroed 32-bit tile za0,
// then copies horizontal slices 0..3 of za0 to dst.
//
// In:   x0 = dst
// Out:  four vectors of 32-bit words stored at dst, dst+16, dst+32, dst+48
// Note: the fixed 16-byte stride between stores assumes a 128-bit streaming
//       vector length (the reproducer is run with sme128=on).
// ---------------------------------------------------------------------------
.global foo
foo:
// Allocate an 80-byte frame and save the frame pointer and link register.
stp x29, x30, [sp, -80]!
mov x29, sp
// Save d8-d15 (callee-saved under AAPCS64) before entering streaming mode.
// NOTE(review): presumably needed because smstart/smstop do not preserve
// the vector registers - confirm against the Arm architecture manual.
stp d8, d9, [sp, 16]
stp d10, d11, [sp, 32]
stp d12, d13, [sp, 48]
stp d14, d15, [sp, 64]
// Enable streaming mode and the ZA array.
smstart
// p0 = all-true predicate; used for the outer product, the tile reads and
// the stores below.
ptrue p0.b
// z0.b = 0, 1, 2, ... (start 0, step 1).
index z0.b, #0, #1
// z1.b = z0.b + 16, i.e. 16, 17, 18, ...
mov z1.d, z0.d
add z1.b, z1.b, #16
// Clear the accumulator so the tile holds only this one outer product.
zero {za}
// Instruction under test: za0.s += 4-way outer product of z0.b and z1.b.
smopa za0.s, p0/m, p0/m, z0.b, z1.b
// Read the first 4x4 sub-matrix of elements from tile 0:
// w12 is the slice index base; the immediate selects horizontal slice 0..3.
mov w12, #0
mova z0.s, p0/m, za0h.s[w12, #0]
mova z1.s, p0/m, za0h.s[w12, #1]
mova z2.s, p0/m, za0h.s[w12, #2]
mova z3.s, p0/m, za0h.s[w12, #3]
// And store them to the input pointer (dst in the C code):
// x0 advances by 16 bytes (four 32-bit words) after each slice.
st1w {z0.s}, p0, [x0]
add x0, x0, #16
st1w {z1.s}, p0, [x0]
add x0, x0, #16
st1w {z2.s}, p0, [x0]
add x0, x0, #16
st1w {z3.s}, p0, [x0]
// Leave streaming mode and disable ZA before returning to the caller.
smstop
// Restore the saved d8-d15, then the frame pointer and link register, and
// release the 80-byte frame.
ldp d8, d9, [sp, 16]
ldp d10, d11, [sp, 32]
ldp d12, d13, [sp, 48]
ldp d14, d15, [sp, 64]
ldp x29, x30, [sp], 80
ret
```
Steps to reproduce:
```
$ clang -target aarch64-linux-gnu -march=armv9-a+sme main.c foo.S
$ ~/qemu/build/qemu-aarch64 -cpu max,sme128=on a.out
>>> 110 478 158 654
>>> 0 0 0 0
>>> 670 1166 974 1598
>>> 0 0 0 0
```
Additional information:
|