summary refs log tree commit diff stats
path: root/gitlab/issues_text/target_arm/host_missing/accel_TCG/2083
diff options
context:
space:
mode:
Diffstat (limited to 'gitlab/issues_text/target_arm/host_missing/accel_TCG/2083')
-rw-r--r-- gitlab/issues_text/target_arm/host_missing/accel_TCG/2083 | 111
1 files changed, 0 insertions, 111 deletions
diff --git a/gitlab/issues_text/target_arm/host_missing/accel_TCG/2083 b/gitlab/issues_text/target_arm/host_missing/accel_TCG/2083
deleted file mode 100644
index 0e6cc43a2..000000000
--- a/gitlab/issues_text/target_arm/host_missing/accel_TCG/2083
+++ /dev/null
@@ -1,111 +0,0 @@
-AArch64 SME SMOPA (4-way) outer product instruction gives incorrect result
-Description of problem:
-The SME SMOPA (4-way) instruction ([spec](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SMOPA--4-way---Signed-integer-sum-of-outer-products-and-accumulate-?lang=en)) is giving an incorrect result. The example below is for the 8-bit variant and is equivalent to the following Python example (128-bit VL), shown to make it clearer:
-
-```
-import numpy as np
-vl = 128
-esize = 32
-dim = vl // esize
-
-A = range(16)
-B = range(16, 32)
-C = np.zeros((4, 4,), dtype=np.int32)
-
-for row in range(dim):
-    for col in range(dim):
-        for k in range(4):
-            C[row, col] += A[4*row + k] * B[4*col + k]
-
-print(C)
-
-[[ 110  134  158  182]
- [ 390  478  566  654]
- [ 670  822  974 1126]
- [ 950 1166 1382 1598]]
-```
-
-main.c
-```
-#include <stdio.h>
-#include <stdint.h>
-
-void foo(int *dst);
-
-int main() {
-  int32_t dst[16];
-  foo(dst);
-
-  // This should print:
-  // >>> 110  134  158  182
-  // >>> 390  478  566  654
-  // >>> 670  822  974  1126
-  // >>> 950  1166  1382  1598
-  for (int i=0; i<4; ++i) {
-    printf(">>> ");
-    for (int j=0; j<4; ++j) {
-      printf("%d  ", dst[i * 4 + j]);
-    }
-    printf("\n");
-  }
-}
-```
-
-foo.S
-
-```
-.global foo
-foo:
-  stp x29, x30, [sp, -80]!
-  mov x29, sp
-  stp d8, d9, [sp, 16]
-  stp d10, d11, [sp, 32]
-  stp d12, d13, [sp, 48]
-  stp d14, d15, [sp, 64]
-
-  smstart
-
-  ptrue p0.b
-  index z0.b, #0, #1
-  mov   z1.d, z0.d
-  add   z1.b, z1.b, #16
-
-  zero  {za}
-  smopa za0.s, p0/m, p0/m, z0.b, z1.b
-
-  // Read the first 4x4 sub-matrix of elements from tile 0:
-  mov w12, #0
-  mova z0.s, p0/m, za0h.s[w12, #0]
-  mova z1.s, p0/m, za0h.s[w12, #1]
-  mova z2.s, p0/m, za0h.s[w12, #2]
-  mova z3.s, p0/m, za0h.s[w12, #3]
-
-  // And store them to the input pointer (dst in the C code):
-  st1w {z0.s}, p0, [x0]
-  add x0, x0, #16
-  st1w {z1.s}, p0, [x0]
-  add x0, x0, #16
-  st1w {z2.s}, p0, [x0]
-  add x0, x0, #16
-  st1w {z3.s}, p0, [x0]
-
-  smstop
-
-  ldp d8, d9, [sp, 16]
-  ldp d10, d11, [sp, 32]
-  ldp d12, d13, [sp, 48]
-  ldp d14, d15, [sp, 64]
-  ldp x29, x30, [sp], 80
-  ret
-```
-Steps to reproduce:
-```
-$ clang -target aarch64-linux-gnu -march=armv9-a+sme main.c foo.S
-$ ~/qemu/build/qemu-aarch64 -cpu max,sme128=on a.out
->>> 110  478  158  654
->>> 0  0  0  0
->>> 670  1166  974  1598
->>> 0  0  0  0
-```
-Additional information:
-