summary refs log tree commit diff stats
path: root/gitlab/issues/target_arm/host_missing/accel_TCG/1833.toml
blob: 309f98069b5d3e6530e50ccd5cddbf114da8e4c6 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
id = 1833
title = "ARM64 SME ST1Q incorrectly stores 9 bytes (rather than 16) per 128-bit element"
state = "closed"
created_at = "2023-08-16T20:54:29.517Z"
closed_at = "2023-08-24T15:27:44.757Z"
labels = ["Closed::Fixed", "accel: TCG", "kind::Bug", "target: arm"]
url = "https://gitlab.com/qemu-project/qemu/-/issues/1833"
host-os = "Ubuntu 20.04.5 LTS"
host-arch = "ARM64"
qemu-version = "qemu-aarch64 version 8.0.93 (also confirmed on master branch)"
guest-os = "n/a"
guest-arch = "n/a"
description = """QEMU incorrectly stores 9 bytes instead of 16 per 128-bit element in the ST1Q SME instruction (https://developer.arm.com/documentation/ddi0602/2022-06/SME-Instructions/ST1Q--Contiguous-store-of-quadwords-from-128-bit-element-ZA-tile-slice-). It copies the first byte of the upper 64 bits, then the lower 64 bits.

This seems to be a simple issue; I tracked it down to:
https://gitlab.com/qemu-project/qemu/-/blob/master/target/arm/tcg/sme_helper.c?ref_type=heads#L382

Updating that `+ 1` to a `+ 8` fixes the problem."""
reproduce = """```c
#include <stdio.h>
#include <stdint.h>
#include <string.h>

void st1q_sme_copy_test(uint8_t* src,  uint8_t* dest) {
  asm volatile(
    "smstart sm\\n"
    "smstart za\\n"
    "ptrue p0.b\\n"
    "mov x12, xzr\\n"
    "ld1q {za0h.q[w12, 0]}, p0/z, %0\\n"
    "st1q {za0h.q[w12, 0]}, p0, %1\\n"
    "smstop za\\n"
    "smstop sm\\n" : : "m"(*src), "m"(*dest) : "w12", "p0");
}

void print_first_128(uint8_t* data) {
  putchar('[');
  for (int i = 0; i < 16; i++) {
    printf("%02d", data[i]);
    if (i != 15)
      printf(", ");
  }
  printf("]\\n");
}

int main() {
  _Alignas(16) uint8_t dest[512] = { };
  _Alignas(16) uint8_t src[512] = { };
  for (int i = 0; i < sizeof(src); i++)
    src[i] = i;
  puts("Before");
  printf(" src: ");
  print_first_128(src);
  printf("dest: ");
  print_first_128(dest);
  st1q_sme_copy_test(src, dest);
  puts("\\nAfter ");
  printf(" src: ");
  print_first_128(src);
  printf("dest: ");
  print_first_128(dest);
}
```

Compile with (requires at least clang ~14, tested with clang 16):<br/>
`clang ./qemu_repro.c -march=armv9-a+sme+sve -o ./qemu_repro` 

Run with:<br/>
`qemu-aarch64 -cpu max,sme=on ./qemu_repro`

The program is expected to simply copy `src` to `dest` and output:
```
Before
 src: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15]
dest: [00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00]

After 
 src: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15]
dest: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15]
```

But currently outputs:
```
Before
 src: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15]
dest: [00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00]

After 
 src: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15]
dest: [00, 08, 09, 10, 11, 12, 13, 14, 15, 00, 00, 00, 00, 00, 00, 00]
```"""
additional = """N/A"""