Documenting difference between AVX2 gather loadstores before they're implemented

Some prototyping of the AVX2 gather loadstores in C++, just to show how ugly the difference between SVE-256 and SVE-128 is before FEX-Emu has implemented them.

<details>
<summary>CPP</summary>

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <type_traits>

#ifdef __aarch64__
// Compile with `-O3 -march=armv8-a+sve -msve-vector-bits=128`
// or with `-O3 -march=armv8-a+sve -msve-vector-bits=256`
// to see the difference.
#include <arm_neon.h>
#include <arm_sve.h>

struct ContextObject {
    uint8_t Pad[512];
    // Maximum element size (uint64_t) times four elements for the gather loadstores.
    uint8_t TempSpace[32];
};

#if __ARM_FEATURE_SVE_BITS != 128
using Vec256 = svuint64_t;

// Scales that SVE gathers can't express directly: spill the indices, do the
// loads as scalars, then reload the result as a vector.
template<typename IndexType, size_t Scale>
Vec256 GatherLoad_ArbitraryScale(
    ContextObject *Ctx,
    svbool_t pg_256bit,
    uint64_t *base,
    Vec256 indices) {
    using IndexTypeScaled = typename std::conditional<std::is_signed_v<IndexType>, int64_t, uint64_t>::type;
    IndexType *Indexes = (IndexType*)Ctx->TempSpace;
    svst1_u64(pg_256bit, Indexes, indices);
    for (size_t i = 0; i < 4; ++i) {
        IndexTypeScaled Index = Indexes[i];
        uint64_t *Addr = (uint64_t*)((uint64_t)base + (Index << Scale));
        Indexes[i] = *Addr;
    }

    return svld1(pg_256bit, Indexes);
}

// Scale 1 maps directly onto SVE's unscaled-offset gather.
Vec256 GatherLoad1_u64(ContextObject *Ctx, svbool_t pg_256bit, uint64_t *base, Vec256 indices) {
    return svld1_gather_u64offset_u64(pg_256bit, base, indices);
}
Vec256 GatherLoad2_u64(ContextObject *Ctx, svbool_t pg_256bit, uint64_t *base, Vec256 indices) {
    return GatherLoad_ArbitraryScale<uint64_t, 2>(Ctx, pg_256bit, base, indices);
}
Vec256 GatherLoad4_u64(ContextObject *Ctx, svbool_t pg_256bit, uint64_t *base, Vec256 indices) {
    return GatherLoad_ArbitraryScale<uint64_t, 4>(Ctx, pg_256bit, base, indices);
}
// Scale 8 maps onto SVE's index form (offset = index * sizeof(uint64_t)).
Vec256 GatherLoad8_u64(ContextObject *Ctx, svbool_t pg_256bit, uint64_t *base, Vec256 indices) {
    return svld1_gather_u64index_u64(pg_256bit, base, indices);
}
#elif __ARM_FEATURE_SVE_BITS == 128
// With 128-bit SVE the 256-bit value has to be carried as a pair of Q registers.
struct Vec256 {
    uint64x2_t Lower;
    uint64x2_t Upper;
};

template<typename IndexType, size_t Scale>
Vec256 GatherLoad_ArbitraryScale(
    ContextObject *Ctx,
    svbool_t pg_256bit,
    uint64_t *base,
    Vec256 indices) {
    using IndexTypeScaled = typename std::conditional<std::is_signed_v<IndexType>, int64_t, uint64_t>::type;
    IndexType *Indexes = (IndexType*)Ctx->TempSpace;
    memcpy(Indexes, &indices, sizeof(Vec256));
    for (size_t i = 0; i < 4; ++i) {
        IndexTypeScaled Index = Indexes[i];
        uint64_t *Addr = (uint64_t*)((uint64_t)base + (Index << Scale));
        Indexes[i] = *Addr;
    }

    Vec256 Result{};
    memcpy(&Result, Indexes, sizeof(Vec256));
    return Result;
}

// Scale 1: two 128-bit gathers, spilled and reloaded to form the 256-bit result.
Vec256 GatherLoad1_u64(
    ContextObject *Ctx,
    svbool_t pg_128bit,
    uint64_t *base,
    svuint64_t indices_lower,
    svuint64_t indices_upper) {
    Vec256 Result{};
    uint64_t *Indexes = (uint64_t*)Ctx->TempSpace;
    auto Lower = svld1_gather_u64offset_u64(pg_128bit, base, indices_lower);
    auto Upper = svld1_gather_u64offset_u64(pg_128bit, base, indices_upper);
    svst1_u64(pg_128bit, Indexes, Lower);
    svst1_u64(pg_128bit, &Indexes[4], Upper);
    memcpy(&Result, Indexes, sizeof(Vec256));
    return Result;
}
Vec256 GatherLoad2_u64(ContextObject *Ctx, svbool_t pg_256bit, uint64_t *base, Vec256 indices) {
    return GatherLoad_ArbitraryScale<uint64_t, 2>(Ctx, pg_256bit, base, indices);
}
Vec256 GatherLoad4_u64(ContextObject *Ctx, svbool_t pg_256bit, uint64_t *base, Vec256 indices) {
    return GatherLoad_ArbitraryScale<uint64_t, 4>(Ctx, pg_256bit, base, indices);
}
Vec256 GatherLoad8_u64(
    ContextObject *Ctx,
    svbool_t pg_128bit,
    uint64_t *base,
    svuint64_t indices_lower,
    svuint64_t indices_upper) {
    Vec256 Result{};
    uint64_t *Indexes = (uint64_t*)Ctx->TempSpace;
    auto Lower = svld1_gather_u64index_u64(pg_128bit, base, indices_lower);
    auto Upper = svld1_gather_u64index_u64(pg_128bit, base, indices_upper);
    svst1_u64(pg_128bit, Indexes, Lower);
    svst1_u64(pg_128bit, &Indexes[4], Upper);
    memcpy(&Result, Indexes, sizeof(Vec256));
    return Result;
}
#else
#error unsupported
#endif

#else
#include <immintrin.h>
// Not quite matching since the real instruction is doing an insert and mask which FEX needs to handle.
// The intrinsics take `long long const*`, hence the casts.
using Vec256 = __m256i;
Vec256 GatherLoad1(uint64_t *base, Vec256 indices) {
    return _mm256_i64gather_epi64((const long long*)base, indices, 1);
}
Vec256 GatherLoad2(uint64_t *base, Vec256 indices) {
    return _mm256_i64gather_epi64((const long long*)base, indices, 2);
}
Vec256 GatherLoad4(uint64_t *base, Vec256 indices) {
    return _mm256_i64gather_epi64((const long long*)base, indices, 4);
}
Vec256 GatherLoad8(uint64_t *base, Vec256 indices) {
    return _mm256_i64gather_epi64((const long long*)base, indices, 8);
}
#endif
```

</details>
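The "insert and mask" that the comment in the x86 path mentions is the part of VPGATHERQQ that the unmasked intrinsic hides: lanes are only gathered where the per-element mask MSB is set, the remaining lanes are merged from the destination, and the instruction clears the mask register as it completes. As a reference for the semantics FEX has to reproduce, here is a minimal sketch using the masked form of the intrinsic (the wrapper name `MaskedGatherLoad8` is just for illustration and not part of the prototype above):

```cpp
#include <cstdint>
#include <immintrin.h>

// Illustrative only: the masked AVX2 gather exposes the merge-under-mask
// behaviour of VPGATHERQQ directly. Lanes whose mask MSB is clear are taken
// from `src` instead of being loaded from memory.
__m256i MaskedGatherLoad8(__m256i src, const uint64_t *base, __m256i indices, __m256i mask) {
    return _mm256_mask_i64gather_epi64(src, (const long long*)base, indices, mask, 8);
}
```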

<details>
  <summary>Assembly SVE-128bit</summary>

```asm
GatherLoad1_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t, __SVUint64_t): // @GatherLoad1_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t, __SVUint64_t)
        ld1d    { z0.d }, p0/z, [x1, z0.d]
        mov     x8, #64                         // =0x40
        mov     x9, #68                         // =0x44
        ld1d    { z1.d }, p0/z, [x1, z1.d]
        st1d    { z0.d }, p0, [x0, x8, lsl #3]
        st1d    { z1.d }, p0, [x0, x9, lsl #3]
        ldp     q0, q1, [x0, #512]
        ret
GatherLoad2_u64(ContextObject*, __SVBool_t, unsigned long*, Vec256): // @GatherLoad2_u64(ContextObject*, __SVBool_t, unsigned long*, Vec256)
        fmov    x9, d0
        mov     x8, v0.d[1]
        stp     q0, q1, [x0, #512]
        fmov    x10, d1
        lsl     x9, x9, #2
        lsl     x8, x8, #2
        lsl     x10, x10, #2
        ldr     x9, [x9, x1]
        str     x9, [x0, #512]
        mov     x9, v1.d[1]
        ldr     x8, [x8, x1]
        str     x8, [x0, #520]
        ldr     x8, [x10, x1]
        lsl     x9, x9, #2
        str     x8, [x0, #528]
        ldr     x8, [x9, x1]
        str     x8, [x0, #536]
        ldp     q0, q1, [x0, #512]
        ret
GatherLoad4_u64(ContextObject*, __SVBool_t, unsigned long*, Vec256): // @GatherLoad4_u64(ContextObject*, __SVBool_t, unsigned long*, Vec256)
        fmov    x9, d0
        mov     x8, v0.d[1]
        stp     q0, q1, [x0, #512]
        fmov    x10, d1
        lsl     x9, x9, #4
        lsl     x8, x8, #4
        lsl     x10, x10, #4
        ldr     x9, [x9, x1]
        str     x9, [x0, #512]
        mov     x9, v1.d[1]
        ldr     x8, [x8, x1]
        str     x8, [x0, #520]
        ldr     x8, [x10, x1]
        lsl     x9, x9, #4
        str     x8, [x0, #528]
        ldr     x8, [x9, x1]
        str     x8, [x0, #536]
        ldp     q0, q1, [x0, #512]
        ret
GatherLoad8_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t, __SVUint64_t): // @GatherLoad8_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t, __SVUint64_t)
        ld1d    { z0.d }, p0/z, [x1, z0.d, lsl #3]
        mov     x8, #64                         // =0x40
        mov     x9, #68                         // =0x44
        ld1d    { z1.d }, p0/z, [x1, z1.d, lsl #3]
        st1d    { z0.d }, p0, [x0, x8, lsl #3]
        st1d    { z1.d }, p0, [x0, x9, lsl #3]
        ldp     q0, q1, [x0, #512]
        ret
```

</details>

<details>
  <summary>Assembly SVE-256bit</summary>

```asm
GatherLoad1_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t): // @GatherLoad1_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t)
        ld1d    { z0.d }, p0/z, [x1, z0.d]
        ret
GatherLoad2_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t): // @GatherLoad2_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t)
        mov     x8, #64                         // =0x40
        st1d    { z0.d }, p0, [x0, x8, lsl #3]
        ldr     x9, [x0, #512]
        ldr     x10, [x0, #520]
        lsl     x9, x9, #2
        lsl     x10, x10, #2
        ldr     x9, [x9, x1]
        str     x9, [x0, #512]
        ldr     x9, [x0, #528]
        ldr     x10, [x10, x1]
        lsl     x9, x9, #2
        str     x10, [x0, #520]
        ldr     x10, [x0, #536]
        ldr     x9, [x9, x1]
        lsl     x10, x10, #2
        str     x9, [x0, #528]
        ldr     x9, [x10, x1]
        str     x9, [x0, #536]
        ld1d    { z0.d }, p0/z, [x0, x8, lsl #3]
        ret
GatherLoad4_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t): // @GatherLoad4_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t)
        mov     x8, #64                         // =0x40
        st1d    { z0.d }, p0, [x0, x8, lsl #3]
        ldr     x9, [x0, #512]
        ldr     x10, [x0, #520]
        lsl     x9, x9, #4
        lsl     x10, x10, #4
        ldr     x9, [x9, x1]
        str     x9, [x0, #512]
        ldr     x9, [x0, #528]
        ldr     x10, [x10, x1]
        lsl     x9, x9, #4
        str     x10, [x0, #520]
        ldr     x10, [x0, #536]
        ldr     x9, [x9, x1]
        lsl     x10, x10, #4
        str     x9, [x0, #528]
        ldr     x9, [x10, x1]
        str     x9, [x0, #536]
        ld1d    { z0.d }, p0/z, [x0, x8, lsl #3]
        ret
GatherLoad8_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t): // @GatherLoad8_u64(ContextObject*, __SVBool_t, unsigned long*, __SVUint64_t)
        ld1d    { z0.d }, p0/z, [x1, z0.d, lsl #3]
        ret
```

</details>

The index scaling versions that don't match ARM SVE behaviour are particularly nasty, but they should hopefully not show up too frequently in practice.

The index scaling versions without SVE-256 fall down the worst-case path of being basically ASIMD, which can be cleaned up slightly inside the JIT but is still pretty bad.

The index scaling versions that do match SVE behaviour go from 1 instruction with SVE-256 to 7 with SVE-128, so it's a bit spicy.
If the gather operations are implemented with only an ASIMD implementation, then everything turns into the equivalent of the 128-bit path that doesn't match SVE scaling behaviour (pretty bad); a rough sketch of that fallback follows.
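As a rough illustration of what that ASIMD-only fallback means, here is a minimal sketch for one 128-bit half of the gather (the helper name and template parameter are hypothetical, not FEX's actual JIT output; a 256-bit gather needs two of these):

```cpp
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical ASIMD-only gather for one 128-bit half: every element becomes
// an index extract, a scalar load, and a lane insert, so a 256-bit gather with
// 64-bit elements ends up as four scalar loads plus the extracts and inserts.
template <size_t Shift>
uint64x2_t GatherLoad_ASIMD_u64(const uint64_t *base, uint64x2_t indices) {
    // Pull each index out of the vector and scale it into a byte offset.
    const uint64_t off0 = vgetq_lane_u64(indices, 0) << Shift;
    const uint64_t off1 = vgetq_lane_u64(indices, 1) << Shift;
    // Scalar loads from the scaled addresses.
    const auto *Bytes = reinterpret_cast<const uint8_t*>(base);
    uint64_t v0, v1;
    memcpy(&v0, Bytes + off0, sizeof(v0));
    memcpy(&v1, Bytes + off1, sizeof(v1));
    // Rebuild the vector one lane at a time.
    uint64x2_t Result = vdupq_n_u64(v0);
    Result = vsetq_lane_u64(v1, Result, 1);
    return Result;
}
```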

Once we support AVX gather loadstores, this documentation ticket can be closed.