results/classifier/zero-shot/118/none/2083


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141

user-level: 0.780
performance: 0.747
graphic: 0.709
permissions: 0.689
architecture: 0.668
device: 0.649
arm: 0.642
PID: 0.624
semantic: 0.614
debug: 0.596
register: 0.590
socket: 0.582
hypervisor: 0.564
virtual: 0.543
peripherals: 0.534
risc-v: 0.533
assembly: 0.527
mistranslation: 0.525
TCG: 0.524
ppc: 0.520
i386: 0.519
VMM: 0.490
files: 0.470
vnc: 0.458
network: 0.415
kernel: 0.387
boot: 0.349
KVM: 0.346
x86: 0.312

AArch64 SME SMOPA (4-way) outer product instruction gives incorrect result
Description of problem:
The SME SMOPA (4-way) instruction ([spec](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SMOPA--4-way---Signed-integer-sum-of-outer-products-and-accumulate-?lang=en)) gives an incorrect result. The example below uses the 8-bit variant and is equivalent to the following Python example (128-bit VL), which is included to make the expected behaviour clearer:

```
import numpy as np
# Reference model of the 8-bit SMOPA (4-way) at a 128-bit vector length.
vl = 128           # vector length in bits
esize = 32         # size of one accumulator element in bits
dim = vl // esize  # number of rows/columns visited by the loops below

# Source operands: sixteen consecutive values each.
A = range(16)
B = range(16, 32)
C = np.zeros((4, 4), dtype=np.int32)

# Every accumulator element receives the dot product of one group of four
# consecutive A values with one group of four consecutive B values.
for row in range(dim):
    a_group = A[4 * row:4 * row + 4]
    for col in range(dim):
        b_group = B[4 * col:4 * col + 4]
        C[row, col] += sum(a * b for a, b in zip(a_group, b_group))

print(C)

[[ 110  134  158  182]
 [ 390  478  566  654]
 [ 670  822  974 1126]
 [ 950 1166 1382 1598]]
```

main.c
```
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

// Implemented in foo.S. It stores four rows of four 32-bit results through
// dst (16 values in total when run with a 128-bit vector length).
void foo(int32_t *dst);

// Runs the assembly routine and prints the 4x4 result matrix row by row.
int main(void) {
  enum { DIM = 4 };  // rows and columns of the matrix read back from foo()
  int32_t dst[DIM * DIM];
  foo(dst);

  // This should print:
  // >>> 110  134  158  182
  // >>> 390  478  566  654
  // >>> 670  822  974  1126
  // >>> 950  1166  1382  1598
  for (int row = 0; row < DIM; ++row) {
    printf(">>> ");
    for (int col = 0; col < DIM; ++col) {
      // PRId32 matches the int32_t element type exactly.
      printf("%" PRId32 "  ", dst[row * DIM + col]);
    }
    printf("\n");
  }
}
```

foo.S

```
// foo: performs one 8-bit SMOPA (4-way) on two generated byte vectors and
// stores the first four 32-bit rows of ZA tile 0 to the buffer passed in x0
// (dst in the C code).
.global foo
foo:
  // Prologue: 80-byte frame holding x29/x30 and d8-d15.
  // NOTE(review): d8-d15 are presumably saved because they are callee-saved
  // and are not preserved across smstart/smstop -- confirm against the ABI.
  stp x29, x30, [sp, -80]!
  mov x29, sp
  stp d8, d9, [sp, 16]
  stp d10, d11, [sp, 32]
  stp d12, d13, [sp, 48]
  stp d14, d15, [sp, 64]

  // Enter streaming mode and enable the ZA array.
  smstart

  // p0: all byte lanes active.
  // z0: bytes 0, 1, 2, ...; z1: the same bytes plus 16 (16..31 at a 128-bit
  // vector length), matching A and B in the Python example.
  ptrue p0.b
  index z0.b, #0, #1
  mov   z1.d, z0.d
  add   z1.b, z1.b, #16

  // Clear ZA so the accumulation starts from zero, then accumulate the
  // 4-way sum of outer products of z0 and z1 into the 32-bit tile za0.
  zero  {za}
  smopa za0.s, p0/m, p0/m, z0.b, z1.b

  // Read the first 4x4 sub-matrix of elements from tile 0:
  // w12 is the slice index base; horizontal slices 0-3 go to z0-z3.
  mov w12, #0
  mova z0.s, p0/m, za0h.s[w12, #0]
  mova z1.s, p0/m, za0h.s[w12, #1]
  mova z2.s, p0/m, za0h.s[w12, #2]
  mova z3.s, p0/m, za0h.s[w12, #3]

  // And store them to the input pointer (dst in the C code):
  // each 16-byte step is one row of four 32-bit values, which assumes the
  // 128-bit vector length used in the reproduction command.
  st1w {z0.s}, p0, [x0]
  add x0, x0, #16
  st1w {z1.s}, p0, [x0]
  add x0, x0, #16
  st1w {z2.s}, p0, [x0]
  add x0, x0, #16
  st1w {z3.s}, p0, [x0]

  // Leave streaming mode and disable ZA before returning to C code.
  smstop

  // Epilogue: restore d8-d15 and x29/x30, release the frame, and return.
  ldp d8, d9, [sp, 16]
  ldp d10, d11, [sp, 32]
  ldp d12, d13, [sp, 48]
  ldp d14, d15, [sp, 64]
  ldp x29, x30, [sp], 80
  ret
```
Steps to reproduce:
```
$ clang -target aarch64-linux-gnu -march=armv9-a+sme main.c foo.S
$ ~/qemu/build/qemu-aarch64 -cpu max,sme128=on a.out
>>> 110  478  158  654
>>> 0  0  0  0
>>> 670  1166  974  1598
>>> 0  0  0  0
```
Additional information: