summary refs log tree commit diff stats
path: root/results/classifier/zero-shot/118/all/2595
blob: 3928405636e57e068ac131cce750ab1734954535 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
debug: 0.959
graphic: 0.959
semantic: 0.951
performance: 0.951
hypervisor: 0.946
peripherals: 0.938
ppc: 0.937
TCG: 0.935
mistranslation: 0.934
arm: 0.933
x86: 0.919
device: 0.919
assembly: 0.915
virtual: 0.915
PID: 0.908
permissions: 0.906
user-level: 0.904
socket: 0.901
architecture: 0.899
boot: 0.896
risc-v: 0.893
kernel: 0.892
vnc: 0.886
register: 0.883
VMM: 0.883
files: 0.871
network: 0.861
KVM: 0.850
i386: 0.810

Incorrect behavior with 64-bit element SDOT and UDOT instructions on ARM SVE when sve-default-vector-length>=64
Description of problem:
The behavior of the SDOT and UDOT instructions is incorrect when a Zresult.D destination register is used, which corresponds to the 64-bit svdot_lane\_{s,u}64 intrinsics in ACLE.

I have tested the same code using [Arm Instruction Emulator](https://developer.arm.com/Tools%20and%20Software/Arm%20Instruction%20Emulator) (which is deprecated, though) and gem5, both of which produced the correct result, so I believe that the SDOT and UDOT implementation in QEMU is incorrect.
Steps to reproduce:
1. Get the Arm GNU toolchain from [Arm GNU Toolchain Downloads – Arm Developer](https://developer.arm.com/downloads/-/arm-gnu-toolchain-downloads). For x86 Linux hosts, download arm-gnu-toolchain-13.3.rel1-x86_64-aarch64-none-linux-gnu.tar.xz and extract it. Alternatively, use any compiler that is able to cross-compile for armv8.2-a+sve targets.
2. Compile the following program with these compiler arguments:

   ```
   arm-gnu-toolchain-13.3.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc -O3 -march=armv8.2-a+sve dot_lane.c -o dot_lane
   ```

   ```c
   #include <stdio.h>
   #include <arm_sve.h>
   
   /*
    * Buffers are 256 bytes each (32 x int64_t, 128 x int16_t), matching the
    * largest sve-default-vector-length exercised by this reproducer.
    */
   /* Addend input: loaded as the first operand of the dot product; all zero. */
   int64_t a[32] = { 0 };
   /* 16-bit multiplicand inputs; filled in by main() before use. */
   int16_t b[128];
   int16_t c[128];
   /* Result stored back from the vector dot product. */
   int64_t r[32];
   /* Scalar reference result computed in main() for comparison with r. */
   int64_t expected_r[32];
   
   /* Lane index passed as the last argument of svdot_lane_s64(). */
   #define IMM 0
   
   /*
    * Reproducer entry point: runs svdot_lane_s64() on fixed inputs, computes
    * the same result with scalar code, and prints both rows with the indices
    * of mismatching elements marked underneath.
    *
    * Always returns 0; mismatches are reported on stdout only.
    */
   int main(void)
   {
       /* b is all ones; c[i] = i / 4, so every group of four c values is equal. */
       for (size_t i = 0; i < 128; i++) {
           b[i] = 1;
           c[i] = i / 4;
       }
   
       svint64_t av = svld1(svptrue_b64(), a);
       svint16_t bv = svld1(svptrue_b16(), b);
       svint16_t cv = svld1(svptrue_b16(), c);
   
       /* Operation under test. */
       svint64_t result = svdot_lane_s64(av, bv, cv, IMM);
   
       svst1(svptrue_b64(), r, result);
   
       /*
        * Scalar reference: element i is the sum of four products of
        * b[4i .. 4i+3] with the four c values starting at
        * (i - i % 2) * 4 + IMM * 4, plus a[i].
        */
       for (size_t i = 0; i < svcntd(); i++) {
           expected_r[i] =
               (int64_t)b[i * 4 + 0] * (int64_t)c[(i - i % 2) * 4 + IMM * 4 + 0] +
               (int64_t)b[i * 4 + 1] * (int64_t)c[(i - i % 2) * 4 + IMM * 4 + 1] +
               (int64_t)b[i * 4 + 2] * (int64_t)c[(i - i % 2) * 4 + IMM * 4 + 2] +
               (int64_t)b[i * 4 + 3] * (int64_t)c[(i - i % 2) * 4 + IMM * 4 + 3] +
               a[i];
       }
   
       /* Row 1: values produced by the vector instruction. */
       printf("%12s", "r: ");
       for (size_t i = 0; i < svcntd(); i++) {
           printf("%4ld", r[i]);
       }
       printf("\n");
       /* Row 2: values produced by the scalar reference. */
       printf("%12s", "expected_r: ");
       for (size_t i = 0; i < svcntd(); i++) {
           printf("%4ld", expected_r[i]);
       }
       /* Row 3: a '^' under every column where the two rows differ. */
       printf("\n\t\t");
       for (size_t i = 0; i < svcntd(); i++) {
           if (r[i] != expected_r[i]) {
               printf("%4c", '^');
           } else {
               printf("%4c", ' ');
           }
       }
       printf("\n");
       /* Row 4: the element index of every mismatching column. */
       printf("idx:\t\t");
       for (size_t i = 0; i < svcntd(); i++) {
           if (r[i] != expected_r[i]) {
               /* i is a size_t, so it must be printed with %zu, not %d. */
               printf("%4zu", i);
           } else {
               printf("%4c", ' ');
           }
       }
       printf("\n");
   
       return 0;
   }
   ```
3. Execute it with the following commands:

   ```
   qemu-aarch64 -cpu max,sve-default-vector-length=16 -L arm-gnu-toolchain-13.3.rel1-x86_64-aarch64-none-linux-gnu/bin/../aarch64-none-linux-gnu/libc dot_lane
   ```

   Change the value of `sve-default-vector-length` to 32, 64, 128, and 256 and observe the outputs. For `sve-default-vector-length` \>= 64, the result is incorrect.

   `sve-default-vector-length=16`

   ```
            r:    0   0
   expected_r:    0   0
                           
   idx:                
   ```

   `sve-default-vector-length=32`

   ```
            r:    0   0   8   8
   expected_r:    0   0   8   8
                                   
   idx:                        
   ```

   `sve-default-vector-length=64`

   ```
            r:    0   0   8   8   8   8  24  24
   expected_r:    0   0   8   8  16  16  24  24
                                      ^   ^        
   idx:                               4   5         
   ```

   `sve-default-vector-length=128`

   ```
            r:    0   0   8   8   8   8  24  24  24  24  40  40  40  40  56  56
   expected_r:    0   0   8   8  16  16  24  24  32  32  40  40  48  48  56  56
                                      ^   ^           ^   ^           ^   ^        
   idx:                               4   5           8   9          12  13       
   ```

   `sve-default-vector-length=256`

   ```
            r:    0   0   8   8   8   8  24  24  24  24  40  40  40  40  56  56  56  56  72  72  72  72  88  88  88  88 104 104 104 104 120 120
   expected_r:    0   0   8   8  16  16  24  24  32  32  40  40  48  48  56  56  64  64  72  72  80  80  88  88  96  96 104 104 112 112 120 120
                                      ^   ^           ^   ^           ^   ^           ^   ^           ^   ^           ^   ^           ^   ^        
   idx:                               4   5           8   9          12  13          16  17          20  21          24  25          28  29     
   ```
4. By passing `-S` to the compiler, we can see that sdot (or udot if using `svdot_lane_u64()`) is produced in assembly (`sdot z0.d, z1.h, z2.h[0]`), which is correct behavior according to [Intrinsics – Arm Developer](https://developer.arm.com/architectures/instruction-sets/intrinsics/svdot_lane%5B_s64%5D).
Additional information: