Documenting the difference in AVX2 gather loadstore codegen before they're implemented
Some prototyping of AVX2 gather loadstores in C++, just to show how ugly the difference between SVE-256 and SVE-128 gets before FEX-Emu has implemented them.
<details>
<summary>CPP</summary>
```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <type_traits>
#ifdef __aarch64__
// Compile with `-O3 -march=armv8-a+sve -msve-vector-bits=128`
// Or
// Compile with `-O3 -march=armv8-a+sve -msve-vector-bits=256`
// To see the difference
#include <arm_neon.h>
#include <arm_sve.h>
struct ContextObject {
uint8_t Pad[512];
// Four uint64_t elements of scratch space for the gather loadstore fallbacks.
uint8_t TempSpace[32];
};
#if __ARM_FEATURE_SVE_BITS != 128
using Vec256 = svuint64_t;
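// Fallback for scales that have no direct SVE gather addressing mode: spill the
// indices to scratch memory, gather each element with a scalar load, then reload
// the results as one vector.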
template<typename IndexType, size_t Scale>
Vec256 GatherLoad_ArbitraryScale(
ContextObject *Ctx,
svbool_t pg_256bit,
uint64_t *base,
Vec256 indices) {
using IndexTypeScaled = typename std::conditional<std::is_signed_v<IndexType>, int64_t, uint64_t>::type;
IndexType *Indexes = (IndexType*)Ctx->TempSpace;
svst1_u64(pg_256bit, Indexes, indices);
for (size_t i = 0; i < 4; ++i) {
IndexTypeScaled Index = Indexes[i];
uint64_t *Addr = (uint64_t*)((uint64_t)base + (Index << Scale));
Indexes[i] = *Addr;
}
return svld1(pg_256bit, Indexes);
}
Vec256 GatherLoad1_u64(ContextObject *Ctx, svbool_t pg_256bit, uint64_t *base, Vec256 indices) {
return svld1_gather_u64offset_u64(pg_256bit, base, indices);
}
Vec256 GatherLoad2_u64(ContextObject *Ctx, svbool_t pg_256bit, uint64_t *base, Vec256 indices) {
return GatherLoad_ArbitraryScale<uint64_t, 2>(Ctx, pg_256bit, base, indices);
}
Vec256 GatherLoad4_u64(ContextObject *Ctx, svbool_t pg_256bit, uint64_t *base, Vec256 indices) {
return GatherLoad_ArbitraryScale<uint64_t, 4>(Ctx, pg_256bit, base, indices);
}
Vec256 GatherLoad8_u64(ContextObject *Ctx, svbool_t pg_256bit, uint64_t *base, Vec256 indices) {
return svld1_gather_u64index_u64(pg_256bit, base, indices);
}
#elif __ARM_FEATURE_SVE_BITS == 128
struct Vec256 {
uint64x2_t Lower;
uint64x2_t Upper;
};
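// Same scalar fallback as the SVE-256 path above, except the 256-bit value has to
// be carried as a pair of 128-bit registers.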
template<typename IndexType, size_t Scale>
Vec256 GatherLoad_ArbitraryScale(
ContextObject *Ctx,
svbool_t pg_256bit,
uint64_t *base,
Vec256 indices) {
using IndexTypeScaled = typename std::conditional<std::is_signed_v<IndexType>, int64_t, uint64_t>::type;
IndexType *Indexes = (IndexType*)Ctx->TempSpace;
memcpy(Indexes, &indices, sizeof(Vec256));
for (size_t i = 0; i < 4; ++i) {
IndexTypeScaled Index = Indexes[i];
uint64_t *Addr = (uint64_t*)((uint64_t)base + (Index << Scale));
Indexes[i] = *Addr;
}
Vec256 Result{};
memcpy(&Result, Indexes, sizeof(Vec256));
return Result;
}
Vec256 GatherLoad1_u64(
ContextObject *Ctx,
svbool_t pg_128bit,
uint64_t *base,
svuint64_t indices_lower,
svuint64_t indices_upper) {
Vec256 Result{};
uint64_t *Indexes = (uint64_t*)Ctx->TempSpace;
auto Lower = svld1_gather_u64offset_u64(pg_128bit, base, indices_lower);
auto Upper = svld1_gather_u64offset_u64(pg_128bit, base, indices_upper);
svst1_u64(pg_128bit, Indexes, Lower);
svst1_u64(pg_128bit, &Indexes[4], Upper);
memcpy(&Result, Indexes, sizeof(Vec256));
return Result;
}
Vec256 GatherLoad2_u64(ContextObject *Ctx, svbool_t pg_256bit, uint64_t *base, Vec256 indices) {
return GatherLoad_ArbitraryScale<uint64_t, 2>(Ctx, pg_256bit, base, indices);
}
Vec256 GatherLoad4_u64(ContextObject *Ctx, svbool_t pg_256bit, uint64_t *base, Vec256 indices) {
return GatherLoad_ArbitraryScale<uint64_t, 4>(Ctx, pg_256bit, base, indices);
}
Vec256 GatherLoad8_u64(ContextObject *Ctx,
svbool_t pg_128bit,
uint64_t *base,
svuint64_t indices_lower,
svuint64_t indices_upper) {
Vec256 Result{};
uint64_t *Indexes = (uint64_t*)Ctx->TempSpace;
auto Lower = svld1_gather_u64index_u64(pg_128bit, base, indices_lower);
auto Upper = svld1_gather_u64index_u64(pg_128bit, base, indices_upper);
svst1_u64(pg_128bit, Indexes, Lower);
svst1_u64(pg_128bit, &Indexes[4], Upper);
memcpy(&Result, Indexes, sizeof(Vec256));
return Result;
}
#else
#error unsupported
#endif
#else
#include <immintrin.h>
// Not an exact match, since the real instruction also does an insert and mask, which FEX needs to handle.
using Vec256 = __m256i;
Vec256 GatherLoad1(uint64_t *base, Vec256 indices) {
return _mm256_i64gather_epi64((long long const*)base, indices, 1);
}
Vec256 GatherLoad2(uint64_t *base, Vec256 indices) {
return _mm256_i64gather_epi64((long long const*)base, indices, 2);
}
Vec256 GatherLoad4(uint64_t *base, Vec256 indices) {
return _mm256_i64gather_epi64((long long const*)base, indices, 4);
}
Vec256 GatherLoad8(uint64_t *base, Vec256 indices) {
return _mm256_i64gather_epi64((long long const*)base, indices, 8);
}
#endif
```
</details>
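As the comment in the x86 block notes, the unmasked intrinsic glosses over the mask: the real instruction (`vpgatherqq`, exposed as `_mm256_mask_i64gather_epi64`) merges gathered lanes into the destination and zeroes the mask register as it completes, which FEX will have to model as well. A minimal scalar sketch of that behaviour, with illustrative names (`U64x4` and `GatherQQ_Reference` are not FEX types):
```cpp
#include <cstdint>
#include <cstring>

// Scalar reference for the masked gather behaviour; names here are illustrative only.
struct U64x4 { uint64_t e[4]; };

U64x4 GatherQQ_Reference(U64x4 dst, U64x4 &mask, const uint8_t *base, U64x4 indices, int64_t scale) {
  for (int i = 0; i < 4; ++i) {
    // Only the top bit of each mask element selects a lane.
    if (mask.e[i] >> 63) {
      uint64_t value;
      std::memcpy(&value, base + static_cast<int64_t>(indices.e[i]) * scale, sizeof(value));
      dst.e[i] = value; // Selected lanes get the gathered value inserted...
    }                   // ...unselected lanes keep their previous contents.
    mask.e[i] = 0;      // The mask register is zeroed by the time the instruction completes.
  }
  return dst;
}
```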
<details>
<summary>Assembly SVE-128bit</summary>
```asm
GatherLoad1_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t, __SVUint64_t): // @GatherLoad1_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t, __SVUint64_t)
ld1d { z0.d }, p0/z, [x1, z0.d]
mov x8, #64 // =0x40
mov x9, #68 // =0x44
ld1d { z1.d }, p0/z, [x1, z1.d]
st1d { z0.d }, p0, [x0, x8, lsl #3]
st1d { z1.d }, p0, [x0, x9, lsl #3]
ldp q0, q1, [x0, #512]
ret
GatherLoad2_u64(ContextObject*, __SVBool_t, unsigned long*, Vec256): // @GatherLoad2_u64(ContextObject*, __SVBool_t, unsigned long*, Vec256)
fmov x9, d0
mov x8, v0.d[1]
stp q0, q1, [x0, #512]
fmov x10, d1
lsl x9, x9, #2
lsl x8, x8, #2
lsl x10, x10, #2
ldr x9, [x9, x1]
str x9, [x0, #512]
mov x9, v1.d[1]
ldr x8, [x8, x1]
str x8, [x0, #520]
ldr x8, [x10, x1]
lsl x9, x9, #2
str x8, [x0, #528]
ldr x8, [x9, x1]
str x8, [x0, #536]
ldp q0, q1, [x0, #512]
ret
GatherLoad4_u64(ContextObject*, __SVBool_t, unsigned long*, Vec256): // @GatherLoad4_u64(ContextObject*, __SVBool_t, unsigned long*, Vec256)
fmov x9, d0
mov x8, v0.d[1]
stp q0, q1, [x0, #512]
fmov x10, d1
lsl x9, x9, #4
lsl x8, x8, #4
lsl x10, x10, #4
ldr x9, [x9, x1]
str x9, [x0, #512]
mov x9, v1.d[1]
ldr x8, [x8, x1]
str x8, [x0, #520]
ldr x8, [x10, x1]
lsl x9, x9, #4
str x8, [x0, #528]
ldr x8, [x9, x1]
str x8, [x0, #536]
ldp q0, q1, [x0, #512]
ret
GatherLoad8_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t, __SVUint64_t): // @GatherLoad8_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t, __SVUint64_t)
ld1d { z0.d }, p0/z, [x1, z0.d, lsl #3]
mov x8, #64 // =0x40
mov x9, #68 // =0x44
ld1d { z1.d }, p0/z, [x1, z1.d, lsl #3]
st1d { z0.d }, p0, [x0, x8, lsl #3]
st1d { z1.d }, p0, [x0, x9, lsl #3]
ldp q0, q1, [x0, #512]
ret
```
</details>
<details>
<summary>Assembly SVE-256bit</summary>
```asm
GatherLoad1_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t): // @GatherLoad1_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t)
ld1d { z0.d }, p0/z, [x1, z0.d]
ret
GatherLoad2_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t): // @GatherLoad2_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t)
mov x8, #64 // =0x40
st1d { z0.d }, p0, [x0, x8, lsl #3]
ldr x9, [x0, #512]
ldr x10, [x0, #520]
lsl x9, x9, #2
lsl x10, x10, #2
ldr x9, [x9, x1]
str x9, [x0, #512]
ldr x9, [x0, #528]
ldr x10, [x10, x1]
lsl x9, x9, #2
str x10, [x0, #520]
ldr x10, [x0, #536]
ldr x9, [x9, x1]
lsl x10, x10, #2
str x9, [x0, #528]
ldr x9, [x10, x1]
str x9, [x0, #536]
ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
ret
GatherLoad4_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t): // @GatherLoad4_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t)
mov x8, #64 // =0x40
st1d { z0.d }, p0, [x0, x8, lsl #3]
ldr x9, [x0, #512]
ldr x10, [x0, #520]
lsl x9, x9, #4
lsl x10, x10, #4
ldr x9, [x9, x1]
str x9, [x0, #512]
ldr x9, [x0, #528]
ldr x10, [x10, x1]
lsl x9, x9, #4
str x10, [x0, #520]
ldr x10, [x0, #536]
ldr x9, [x9, x1]
lsl x10, x10, #4
str x9, [x0, #528]
ldr x9, [x10, x1]
str x9, [x0, #536]
ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
ret
GatherLoad8_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t): // @GatherLoad8_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t)
ld1d { z0.d }, p0/z, [x1, z0.d, lsl #3]
ret
```
</details>
The index scaling versions that don't match ARM SVE behaviour are particularly nasty, but hopefully they won't come up too frequently in practice.
Without SVE-256, the index scaling versions fall down the worst-case path of being basically ASIMD, which can be cleaned up slightly inside the JIT but is still pretty bad.
The index scaling versions that do match SVE behaviour go from 1 instruction with SVE-256 to 7 with SVE-128, so it's a bit spicy.
If the gather operations are implemented with only an ASIMD implementation, then everything turns into the equivalent of the non-matching SVE scaling behaviour done with 128-bit operations (pretty bad), as sketched below.
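A rough idea of what that ASIMD-only path looks like, as a minimal sketch assuming the 256-bit value is split across a pair of NEON registers like in the SVE-128 prototype above (`Vec256Pair`, `GatherLoad_ASIMD`, and `LoadScaled` are illustrative names, not FEX code):
```cpp
#include <arm_neon.h>
#include <cstdint>
#include <cstring>

// ASIMD-only fallback sketch: every element becomes an extract, a scalar load,
// and an insert, regardless of the scale.
struct Vec256Pair { uint64x2_t Lower, Upper; };

static uint64_t LoadScaled(const uint8_t *base, uint64_t index, uint64_t scale) {
  uint64_t value;
  std::memcpy(&value, base + index * scale, sizeof(value));
  return value;
}

static uint64x2_t GatherHalf(const uint8_t *base, uint64x2_t indices, uint64_t scale) {
  uint64x2_t result = vdupq_n_u64(0);
  result = vsetq_lane_u64(LoadScaled(base, vgetq_lane_u64(indices, 0), scale), result, 0);
  result = vsetq_lane_u64(LoadScaled(base, vgetq_lane_u64(indices, 1), scale), result, 1);
  return result;
}

Vec256Pair GatherLoad_ASIMD(const uint8_t *base, Vec256Pair indices, uint64_t scale) {
  return {GatherHalf(base, indices.Lower, scale), GatherHalf(base, indices.Upper, scale)};
}
```
That's four scalar loads plus the lane moves per 256-bit gather, which is roughly what the SVE-128 non-matching-scale assembly above already shows.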
Once we support AVX gather loadstores, this documentation ticket can be closed.