1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
assembly: 0.775
permissions: 0.759
user-level: 0.730
architecture: 0.729
device: 0.720
debug: 0.718
KVM: 0.715
performance: 0.712
mistranslation: 0.702
peripherals: 0.701
hypervisor: 0.699
files: 0.696
register: 0.695
arm: 0.689
semantic: 0.688
risc-v: 0.685
VMM: 0.681
virtual: 0.678
PID: 0.674
graphic: 0.672
ppc: 0.661
TCG: 0.654
kernel: 0.653
vnc: 0.639
socket: 0.616
boot: 0.561
network: 0.554
x86: 0.531
i386: 0.505
SME FMOPA outer product instruction gives incorrect result
Description of problem:
The SME outer product instructions operate on tiles of elements. In the
below example we are performing an outer product of a vector of 1.0
by itself. This naturally should produce a matrix filled with 1.0
values, however if we read the values of the tile and printf them we
instead observe 0.0 values.
Without digging into the underlying QEMU code, this appears to be a bug
in how elements are set based on the tile number: the same code
using za0.s rather than za1.s reports all 1.0 values,
as expected.
main.c
```
#include <stdio.h>
void foo(float *dst);
/*
 * Driver for the reproducer: lets foo() fill a 4x4 block of floats and
 * prints it one row per line, each row prefixed with ">>> ".
 */
int main() {
    enum { ROWS = 4, COLS = 4 };
    float dst[ROWS * COLS];
    const float *cell = dst;

    foo(dst);

    // This should print:
    // >>> 1.000000 1.000000 1.000000 1.000000
    // >>> 1.000000 1.000000 1.000000 1.000000
    // >>> 1.000000 1.000000 1.000000 1.000000
    // >>> 1.000000 1.000000 1.000000 1.000000
    for (int row = 0; row < ROWS; ++row) {
        printf(">>> ");
        // Walk the buffer linearly; this is the same order as dst[row * 4 + col].
        for (int col = 0; col < COLS; ++col, ++cell) {
            printf("%lf ", (double)*cell);
        }
        printf("\n");
    }
}
```
foo.S
```
//-----------------------------------------------------------------------
// void foo(float *dst)
//
// Reproducer: accumulates the outer product of a vector of 1.0 with
// itself into ZA tile 1 (za1.s), then copies the first 4x4 sub-matrix
// of that tile to dst.
//
// In:    x0 = dst, must have room for 16 floats (4 stores x 4 lanes)
// Out:   dst[0..15] written; no return value
// Clobb: x0 (advanced by 48 bytes), w12, p0, z0-z3, ZA contents
// Stack: 80-byte frame holding x29/x30 and d8-d15
//-----------------------------------------------------------------------
.global foo
foo:
// Allocate the 80-byte frame and save the frame pointer / link register.
stp x29, x30, [sp, -80]!
mov x29, sp
// d8-d15 are callee-saved under AAPCS64. NOTE(review): presumably saved
// here because the streaming-mode switches below are not expected to
// preserve vector register contents.
stp d8, d9, [sp, 16]
stp d10, d11, [sp, 32]
stp d12, d13, [sp, 48]
stp d14, d15, [sp, 64]
// Enter streaming mode and enable ZA storage.
smstart
// p0 = first four 32-bit lanes active; it governs everything below.
ptrue p0.s, vl4
// Every 32-bit lane of z0 = 1.0 (the input vector for the outer product).
fmov z0.s, #1.0
// An outer product of a vector of 1.0 by itself should be a matrix of 1.0.
// Note that we are using tile 1 here (za1.s) rather than tile 0.
// ZA is cleared first so the accumulating FMOPA yields the product itself.
zero {za}
fmopa za1.s, p0/m, p0/m, z0.s, z0.s
// Read the first 4x4 sub-matrix of elements from tile 1:
// Note that za1h should be interchangeable here.
// w12 is the slice-index base; the immediates select vertical slices 0..3.
// With p0/m only the four active lanes of each destination are replaced.
mov w12, #0
mova z0.s, p0/m, za1v.s[w12, #0]
mova z1.s, p0/m, za1v.s[w12, #1]
mova z2.s, p0/m, za1v.s[w12, #2]
mova z3.s, p0/m, za1v.s[w12, #3]
// And store them to the input pointer (dst in the C code):
// Each st1w writes only the four active 32-bit lanes (16 bytes), so x0 is
// stepped by a fixed 16 bytes rather than by the vector length.
st1w {z0.s}, p0, [x0]
add x0, x0, #16
st1w {z1.s}, p0, [x0]
add x0, x0, #16
st1w {z2.s}, p0, [x0]
add x0, x0, #16
st1w {z3.s}, p0, [x0]
// Leave streaming mode and disable ZA before returning to the C caller.
smstop
// Restore the callee-saved FP registers, then pop the frame and return.
ldp d8, d9, [sp, 16]
ldp d10, d11, [sp, 32]
ldp d12, d13, [sp, 48]
ldp d14, d15, [sp, 64]
ldp x29, x30, [sp], 80
ret
```
Steps to reproduce:
```
$ clang -target aarch64-linux-gnu -march=armv9-a+sme main.c foo.S -O1 -static
$ ~/qemu/build/qemu-aarch64 ./a.out
>>> 0.000000 0.000000 0.000000 0.000000
>>> 0.000000 0.000000 0.000000 0.000000
>>> 0.000000 0.000000 0.000000 0.000000
>>> 0.000000 0.000000 0.000000 0.000000
```
|