architecture: 0.968 performance: 0.958 graphic: 0.919 arm: 0.894 files: 0.862 device: 0.848 assembly: 0.842 ppc: 0.833 peripherals: 0.829 socket: 0.826 permissions: 0.825 PID: 0.799 debug: 0.786 user-level: 0.781 kernel: 0.778 vnc: 0.774 register: 0.701 network: 0.701 hypervisor: 0.695 semantic: 0.687 boot: 0.686 mistranslation: 0.642 risc-v: 0.639 virtual: 0.630 VMM: 0.618 x86: 0.529 TCG: 0.480 i386: 0.421 KVM: 0.346 qemu-aarch64: Incorrect result for ssra instruction when using vector lengths of 1024-bit or higher. Description of problem: ``` #include <arm_sve.h> #include <stdio.h> #define SZ 32 int main(int argc, char* argv[]) { svbool_t pg = svptrue_b64(); uint64_t VL = svcntd(); fprintf(stderr, "One SVE vector can hold %li uint64_ts\n", VL); int64_t sr[SZ], sx[SZ], sy[SZ]; uint64_t ur[SZ], ux[SZ], uy[SZ]; for (uint64_t i = 0; i < SZ; ++i) { sx[i] = ux[i] = 0; sy[i] = uy[i] = 1024; } for (uint64_t i = 0; i < SZ; i+=VL) { fprintf(stderr, "Processing elements %li - %li\n", i, i + VL - 1); svint64_t SX = svld1(pg, sx + i); svint64_t SY = svld1(pg, sy + i); svint64_t SR = svsra(SX, SY, 4); svst1(pg, sr + i, SR); svuint64_t UX = svld1(pg, ux + i); svuint64_t UY = svld1(pg, uy + i); svuint64_t UR = svsra(UX, UY, 4); svst1(pg, ur + i, UR); } for (uint64_t i = 0; i < SZ; ++i) { fprintf(stderr, "sr[%li]=%li, ur[%li]\n", i, sr[i], ur[i]); } return 0; } ``` Steps to reproduce: 1. Build the above C source using "gcc -march=armv9-a -O1 ssra.c", can also use clang. 2. Run with "qemu-aarch64 -cpu max,sve-default-vector-length=64 ./a.out" and you'll see the expected result of 64 (signed and unsigned) 3. Run with "qemu-aarch64 -cpu max,sve-default-vector-length=128 ./a.out" and you'll see the expected result of 64 for unsigned but the signed result is 0. This suggests the emulation of the SVE2 ssra instruction is incorrect for this and bigger vector lengths. Additional information: