Diffstat (limited to 'tests/tcg')
-rw-r--r--  tests/tcg/Makefile.target                            |   7
-rw-r--r--  tests/tcg/aarch64/Makefile.target                    |   2
-rw-r--r--  tests/tcg/hexagon/Makefile.target                    |  13
-rw-r--r--  tests/tcg/hexagon/fpstuff.c                          |  31
-rw-r--r--  tests/tcg/hexagon/preg_alias.c                       |  10
-rw-r--r--  tests/tcg/hexagon/scatter_gather.c                   | 513
-rw-r--r--  tests/tcg/multiarch/Makefile.target                  |   5
-rw-r--r--  tests/tcg/multiarch/system/Makefile.softmmu-target   |   6
-rw-r--r--  tests/tcg/s390x/Makefile.softmmu-target              |  20
-rw-r--r--  tests/tcg/s390x/Makefile.target                      |  15
-rw-r--r--  tests/tcg/s390x/br-odd.S                             |  16
-rw-r--r--  tests/tcg/s390x/cgrl-unaligned.S                     |  16
-rw-r--r--  tests/tcg/s390x/chrl.c                               |  80
-rw-r--r--  tests/tcg/s390x/clrl-unaligned.S                     |  16
-rw-r--r--  tests/tcg/s390x/crl-unaligned.S                      |  16
-rw-r--r--  tests/tcg/s390x/ex-odd.S                             |  17
-rw-r--r--  tests/tcg/s390x/ex-relative-long.c                   | 156
-rw-r--r--  tests/tcg/s390x/exrl-ssm-early.S                     |  43
-rw-r--r--  tests/tcg/s390x/lgrl-unaligned.S                     |  16
-rw-r--r--  tests/tcg/s390x/llgfrl-unaligned.S                   |  16
-rw-r--r--  tests/tcg/s390x/lpsw.S                               |  36
-rw-r--r--  tests/tcg/s390x/lpswe-early.S                        |  38
-rw-r--r--  tests/tcg/s390x/lpswe-unaligned.S                    |  18
-rw-r--r--  tests/tcg/s390x/lrl-unaligned.S                      |  16
-rw-r--r--  tests/tcg/s390x/pgm-specification-softmmu.S          |  40
-rw-r--r--  tests/tcg/s390x/pgm-specification-user.c             |  37
-rw-r--r--  tests/tcg/s390x/pgm-specification.mak                |  15
-rw-r--r--  tests/tcg/s390x/rxsbg.c                              |  46
-rw-r--r--  tests/tcg/s390x/softmmu.ld                           |  20
-rw-r--r--  tests/tcg/s390x/ssm-early.S                          |  41
-rw-r--r--  tests/tcg/s390x/stgrl-unaligned.S                    |  16
-rw-r--r--  tests/tcg/s390x/stosm-early.S                        |  41
-rw-r--r--  tests/tcg/s390x/strl-unaligned.S                     |  16
-rw-r--r--  tests/tcg/xtensa/Makefile.softmmu-target             |   5
-rw-r--r--  tests/tcg/xtensaeb/Makefile.softmmu-target           |   5
35 files changed, 1147 insertions, 257 deletions
diff --git a/tests/tcg/Makefile.target b/tests/tcg/Makefile.target
index a3b0aaf8af..8318caf924 100644
--- a/tests/tcg/Makefile.target
+++ b/tests/tcg/Makefile.target
@@ -201,3 +201,10 @@ clean:
 
 distclean:
 	rm -f config-cc.mak config-target.mak ../config-$(TARGET).mak
+
+.PHONY: help
+help:
+	@echo "TCG tests help $(TARGET_NAME)"
+	@echo "Built with $(CC)"
+	@echo "Available tests:"
+	@$(foreach t,$(RUN_TESTS),echo "  $t";)
diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target
index db122ab4ff..9e91a20b0d 100644
--- a/tests/tcg/aarch64/Makefile.target
+++ b/tests/tcg/aarch64/Makefile.target
@@ -81,7 +81,7 @@ sha512-vector: sha512.c
 
 TESTS += sha512-vector
 
-ifneq ($(HAVE_GDB_BIN),)
+ifeq ($(HOST_GDB_SUPPORTS_ARCH),y)
 GDB_SCRIPT=$(SRC_PATH)/tests/guest-debug/run-test.py
 
 run-gdbstub-sysregs: sysregs
diff --git a/tests/tcg/hexagon/Makefile.target b/tests/tcg/hexagon/Makefile.target
index 18e6a5969e..0d82dfa76e 100644
--- a/tests/tcg/hexagon/Makefile.target
+++ b/tests/tcg/hexagon/Makefile.target
@@ -1,5 +1,5 @@
 ##
-##  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -45,6 +45,10 @@ HEX_TESTS += fpstuff
 HEX_TESTS += overflow
 HEX_TESTS += signal_context
 HEX_TESTS += reg_mut
+HEX_TESTS += vector_add_int
+HEX_TESTS += scatter_gather
+HEX_TESTS += hvx_misc
+HEX_TESTS += hvx_histogram
 
 HEX_TESTS += test_abs
 HEX_TESTS += test_bitcnt
@@ -78,3 +82,10 @@ TESTS += $(HEX_TESTS)
 usr: usr.c
 	$(CC) $(CFLAGS) -mv67t -O2 -Wno-inline-asm -Wno-expansion-to-defined $< -o $@ $(LDFLAGS)
 
+scatter_gather: CFLAGS += -mhvx
+vector_add_int: CFLAGS += -mhvx -fvectorize
+hvx_misc: CFLAGS += -mhvx
+hvx_histogram: CFLAGS += -mhvx -Wno-gnu-folding-constant
+
+hvx_histogram: hvx_histogram.c hvx_histogram_row.S
+	$(CC) $(CFLAGS) $(CROSS_CC_GUEST_CFLAGS) $^ -o $@ $(LDFLAGS)
diff --git a/tests/tcg/hexagon/fpstuff.c b/tests/tcg/hexagon/fpstuff.c
index 56bf562a40..90ce9a6ef3 100644
--- a/tests/tcg/hexagon/fpstuff.c
+++ b/tests/tcg/hexagon/fpstuff.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2020-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2020-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -40,6 +40,7 @@ const int SF_HEX_NAN =                    0xffffffff;
 const int SF_small_neg =                  0xab98fba8;
 const int SF_denorm =                     0x00000001;
 const int SF_random =                     0x346001d6;
+const int SF_neg_zero =                   0x80000000;
 
 const long long DF_QNaN =                 0x7ff8000000000000ULL;
 const long long DF_SNaN =                 0x7ff7000000000000ULL;
@@ -536,6 +537,33 @@ static void check_sffixupd(void)
     check32(result, 0x146001d6);
 }
 
+static void check_sffms(void)
+{
+    int result;
+
+    /* Check that sffms properly deals with -0 */
+    result = SF_neg_zero;
+    asm ("%0 -= sfmpy(%1 , %2)\n\t"
+        : "+r"(result)
+        : "r"(SF_ZERO), "r"(SF_ZERO)
+        : "r12", "r8");
+    check32(result, SF_neg_zero);
+
+    result = SF_ZERO;
+    asm ("%0 -= sfmpy(%1 , %2)\n\t"
+        : "+r"(result)
+        : "r"(SF_neg_zero), "r"(SF_ZERO)
+        : "r12", "r8");
+    check32(result, SF_ZERO);
+
+    result = SF_ZERO;
+    asm ("%0 -= sfmpy(%1 , %2)\n\t"
+        : "+r"(result)
+        : "r"(SF_ZERO), "r"(SF_neg_zero)
+        : "r12", "r8");
+    check32(result, SF_ZERO);
+}
+
 static void check_float2int_convs()
 {
     int res32;
@@ -688,6 +716,7 @@ int main()
     check_invsqrta();
     check_sffixupn();
     check_sffixupd();
+    check_sffms();
     check_float2int_convs();
 
     puts(err ? "FAIL" : "PASS");
diff --git a/tests/tcg/hexagon/preg_alias.c b/tests/tcg/hexagon/preg_alias.c
index b44a8112b4..8798fbcaf3 100644
--- a/tests/tcg/hexagon/preg_alias.c
+++ b/tests/tcg/hexagon/preg_alias.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -65,7 +65,7 @@ static inline void creg_alias(int cval, PRegs *pregs)
       : "=r"(pregs->pregs.p0), "=r"(pregs->pregs.p1),
         "=r"(pregs->pregs.p2), "=r"(pregs->pregs.p3)
       : "r"(cval)
-      : "p0", "p1", "p2", "p3");
+      : "c4", "p0", "p1", "p2", "p3");
 }
 
 int err;
@@ -92,7 +92,7 @@ static inline void creg_alias_pair(unsigned int cval, PRegs *pregs)
        : "=r"(pregs->pregs.p0), "=r"(pregs->pregs.p1),
          "=r"(pregs->pregs.p2), "=r"(pregs->pregs.p3), "=r"(c5)
        : "r"(cval_pair)
-       : "p0", "p1", "p2", "p3");
+       : "c4", "c5", "p0", "p1", "p2", "p3");
 
   check(c5, 0xdeadbeef);
 }
@@ -117,7 +117,7 @@ static void test_packet(void)
          "}\n\t"
          : "+r"(result)
          : "r"(0xffffffff), "r"(0xff00ffff), "r"(0x837ed653)
-         : "p0", "p1", "p2", "p3");
+         : "c4", "p0", "p1", "p2", "p3");
     check(result, old_val);
 
     /* Test a predicated store */
@@ -129,7 +129,7 @@ static void test_packet(void)
          "}\n\t"
          :
          : "r"(0), "r"(0xffffffff), "r"(&result)
-         : "p0", "p1", "p2", "p3", "memory");
+         : "c4", "p0", "p1", "p2", "p3", "memory");
     check(result, 0x0);
 }
 
diff --git a/tests/tcg/hexagon/scatter_gather.c b/tests/tcg/hexagon/scatter_gather.c
index b93eb18133..bf8b5e0317 100644
--- a/tests/tcg/hexagon/scatter_gather.c
+++ b/tests/tcg/hexagon/scatter_gather.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -40,47 +40,6 @@ typedef long HVX_VectorPair   __attribute__((__vector_size__(256)))
 typedef long HVX_VectorPred   __attribute__((__vector_size__(128)))
                               __attribute__((aligned(128)));
 
-#define VSCATTER_16(BASE, RGN, OFF, VALS) \
-    __builtin_HEXAGON_V6_vscattermh_128B((int)BASE, RGN, OFF, VALS)
-#define VSCATTER_16_MASKED(MASK, BASE, RGN, OFF, VALS) \
-    __builtin_HEXAGON_V6_vscattermhq_128B(MASK, (int)BASE, RGN, OFF, VALS)
-#define VSCATTER_32(BASE, RGN, OFF, VALS) \
-    __builtin_HEXAGON_V6_vscattermw_128B((int)BASE, RGN, OFF, VALS)
-#define VSCATTER_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
-    __builtin_HEXAGON_V6_vscattermwq_128B(MASK, (int)BASE, RGN, OFF, VALS)
-#define VSCATTER_16_32(BASE, RGN, OFF, VALS) \
-    __builtin_HEXAGON_V6_vscattermhw_128B((int)BASE, RGN, OFF, VALS)
-#define VSCATTER_16_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
-    __builtin_HEXAGON_V6_vscattermhwq_128B(MASK, (int)BASE, RGN, OFF, VALS)
-#define VSCATTER_16_ACC(BASE, RGN, OFF, VALS) \
-    __builtin_HEXAGON_V6_vscattermh_add_128B((int)BASE, RGN, OFF, VALS)
-#define VSCATTER_32_ACC(BASE, RGN, OFF, VALS) \
-    __builtin_HEXAGON_V6_vscattermw_add_128B((int)BASE, RGN, OFF, VALS)
-#define VSCATTER_16_32_ACC(BASE, RGN, OFF, VALS) \
-    __builtin_HEXAGON_V6_vscattermhw_add_128B((int)BASE, RGN, OFF, VALS)
-
-#define VGATHER_16(DSTADDR, BASE, RGN, OFF) \
-    __builtin_HEXAGON_V6_vgathermh_128B(DSTADDR, (int)BASE, RGN, OFF)
-#define VGATHER_16_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
-    __builtin_HEXAGON_V6_vgathermhq_128B(DSTADDR, MASK, (int)BASE, RGN, OFF)
-#define VGATHER_32(DSTADDR, BASE, RGN, OFF) \
-    __builtin_HEXAGON_V6_vgathermw_128B(DSTADDR, (int)BASE, RGN, OFF)
-#define VGATHER_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
-    __builtin_HEXAGON_V6_vgathermwq_128B(DSTADDR, MASK, (int)BASE, RGN, OFF)
-#define VGATHER_16_32(DSTADDR, BASE, RGN, OFF) \
-    __builtin_HEXAGON_V6_vgathermhw_128B(DSTADDR, (int)BASE, RGN, OFF)
-#define VGATHER_16_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
-    __builtin_HEXAGON_V6_vgathermhwq_128B(DSTADDR, MASK, (int)BASE, RGN, OFF)
-
-#define VSHUFF_H(V) \
-    __builtin_HEXAGON_V6_vshuffh_128B(V)
-#define VSPLAT_H(X) \
-    __builtin_HEXAGON_V6_lvsplath_128B(X)
-#define VAND_VAL(PRED, VAL) \
-    __builtin_HEXAGON_V6_vandvrt_128B(PRED, VAL)
-#define VDEAL_H(V) \
-    __builtin_HEXAGON_V6_vdealh_128B(V)
-
 int err;
 
 /* define the number of rows/cols in a square matrix */
@@ -108,22 +67,22 @@ unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
 unsigned short vgather16_32_ref[MATRIX_SIZE];
 
 /* declare the arrays of offsets */
-unsigned short half_offsets[MATRIX_SIZE];
-unsigned int   word_offsets[MATRIX_SIZE];
+unsigned short half_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
+unsigned int   word_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
 
 /* declare the arrays of values */
-unsigned short half_values[MATRIX_SIZE];
-unsigned short half_values_acc[MATRIX_SIZE];
-unsigned short half_values_masked[MATRIX_SIZE];
-unsigned int   word_values[MATRIX_SIZE];
-unsigned int   word_values_acc[MATRIX_SIZE];
-unsigned int   word_values_masked[MATRIX_SIZE];
+unsigned short half_values[MATRIX_SIZE] __attribute__((aligned(128)));
+unsigned short half_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
+unsigned short half_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
+unsigned int   word_values[MATRIX_SIZE] __attribute__((aligned(128)));
+unsigned int   word_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
+unsigned int   word_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
 
 /* declare the arrays of predicates */
-unsigned short half_predicates[MATRIX_SIZE];
-unsigned int   word_predicates[MATRIX_SIZE];
+unsigned short half_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
+unsigned int   word_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
 
-/* make this big enough for all the intrinsics */
+/* make this big enough for all the operations */
 const size_t region_len = sizeof(vtcm);
 
 /* optionally add sync instructions */
@@ -261,164 +220,201 @@ void create_offsets_values_preds_16_32(void)
     }
 }
 
-/* scatter the 16 bit elements using intrinsics */
+/* scatter the 16 bit elements using HVX */
 void vector_scatter_16(void)
 {
-    /* copy the offsets and values to vectors */
-    HVX_Vector offsets = *(HVX_Vector *)half_offsets;
-    HVX_Vector values = *(HVX_Vector *)half_values;
-
-    VSCATTER_16(&vtcm.vscatter16, region_len, offsets, values);
+    asm ("m0 = %1\n\t"
+         "v0 = vmem(%2 + #0)\n\t"
+         "v1 = vmem(%3 + #0)\n\t"
+         "vscatter(%0, m0, v0.h).h = v1\n\t"
+         : : "r"(vtcm.vscatter16), "r"(region_len),
+             "r"(half_offsets), "r"(half_values)
+         : "m0", "v0", "v1", "memory");
 
     sync_scatter(vtcm.vscatter16);
 }
 
-/* scatter-accumulate the 16 bit elements using intrinsics */
+/* scatter-accumulate the 16 bit elements using HVX */
 void vector_scatter_16_acc(void)
 {
-    /* copy the offsets and values to vectors */
-    HVX_Vector offsets = *(HVX_Vector *)half_offsets;
-    HVX_Vector values = *(HVX_Vector *)half_values_acc;
-
-    VSCATTER_16_ACC(&vtcm.vscatter16, region_len, offsets, values);
+    asm ("m0 = %1\n\t"
+         "v0 = vmem(%2 + #0)\n\t"
+         "v1 = vmem(%3 + #0)\n\t"
+         "vscatter(%0, m0, v0.h).h += v1\n\t"
+         : : "r"(vtcm.vscatter16), "r"(region_len),
+             "r"(half_offsets), "r"(half_values_acc)
+         : "m0", "v0", "v1", "memory");
 
     sync_scatter(vtcm.vscatter16);
 }
 
-/* scatter the 16 bit elements using intrinsics */
+/* masked scatter the 16 bit elements using HVX */
 void vector_scatter_16_masked(void)
 {
-    /* copy the offsets and values to vectors */
-    HVX_Vector offsets = *(HVX_Vector *)half_offsets;
-    HVX_Vector values = *(HVX_Vector *)half_values_masked;
-    HVX_Vector pred_reg = *(HVX_Vector *)half_predicates;
-    HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
-
-    VSCATTER_16_MASKED(preds, &vtcm.vscatter16, region_len, offsets, values);
+    asm ("r1 = #-1\n\t"
+         "v0 = vmem(%0 + #0)\n\t"
+         "q0 = vand(v0, r1)\n\t"
+         "m0 = %2\n\t"
+         "v0 = vmem(%3 + #0)\n\t"
+         "v1 = vmem(%4 + #0)\n\t"
+         "if (q0) vscatter(%1, m0, v0.h).h = v1\n\t"
+         : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
+             "r"(half_offsets), "r"(half_values_masked)
+         : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
 
     sync_scatter(vtcm.vscatter16);
 }
 
-/* scatter the 32 bit elements using intrinsics */
+/* scatter the 32 bit elements using HVX */
 void vector_scatter_32(void)
 {
-    /* copy the offsets and values to vectors */
-    HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
-    HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
-    HVX_Vector valueslo = *(HVX_Vector *)word_values;
-    HVX_Vector valueshi = *(HVX_Vector *)&word_values[MATRIX_SIZE / 2];
-
-    VSCATTER_32(&vtcm.vscatter32, region_len, offsetslo, valueslo);
-    VSCATTER_32(&vtcm.vscatter32, region_len, offsetshi, valueshi);
+    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
+    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
+    HVX_Vector *valueslo = (HVX_Vector *)word_values;
+    HVX_Vector *valueshi = (HVX_Vector *)&word_values[MATRIX_SIZE / 2];
+
+    asm ("m0 = %1\n\t"
+         "v0 = vmem(%2 + #0)\n\t"
+         "v1 = vmem(%3 + #0)\n\t"
+         "vscatter(%0, m0, v0.w).w = v1\n\t"
+         : : "r"(vtcm.vscatter32), "r"(region_len),
+             "r"(offsetslo), "r"(valueslo)
+         : "m0", "v0", "v1", "memory");
+    asm ("m0 = %1\n\t"
+         "v0 = vmem(%2 + #0)\n\t"
+         "v1 = vmem(%3 + #0)\n\t"
+         "vscatter(%0, m0, v0.w).w = v1\n\t"
+         : : "r"(vtcm.vscatter32), "r"(region_len),
+             "r"(offsetshi), "r"(valueshi)
+         : "m0", "v0", "v1", "memory");
 
     sync_scatter(vtcm.vscatter32);
 }
 
-/* scatter-acc the 32 bit elements using intrinsics */
+/* scatter-accumulate the 32 bit elements using HVX */
 void vector_scatter_32_acc(void)
 {
-    /* copy the offsets and values to vectors */
-    HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
-    HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
-    HVX_Vector valueslo = *(HVX_Vector *)word_values_acc;
-    HVX_Vector valueshi = *(HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];
-
-    VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetslo, valueslo);
-    VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetshi, valueshi);
+    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
+    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
+    HVX_Vector *valueslo = (HVX_Vector *)word_values_acc;
+    HVX_Vector *valueshi = (HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];
+
+    asm ("m0 = %1\n\t"
+         "v0 = vmem(%2 + #0)\n\t"
+         "v1 = vmem(%3 + #0)\n\t"
+         "vscatter(%0, m0, v0.w).w += v1\n\t"
+         : : "r"(vtcm.vscatter32), "r"(region_len),
+             "r"(offsetslo), "r"(valueslo)
+         : "m0", "v0", "v1", "memory");
+    asm ("m0 = %1\n\t"
+         "v0 = vmem(%2 + #0)\n\t"
+         "v1 = vmem(%3 + #0)\n\t"
+         "vscatter(%0, m0, v0.w).w += v1\n\t"
+         : : "r"(vtcm.vscatter32), "r"(region_len),
+             "r"(offsetshi), "r"(valueshi)
+         : "m0", "v0", "v1", "memory");
 
     sync_scatter(vtcm.vscatter32);
 }
 
-/* scatter the 32 bit elements using intrinsics */
+/* masked scatter the 32 bit elements using HVX */
 void vector_scatter_32_masked(void)
 {
-    /* copy the offsets and values to vectors */
-    HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
-    HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
-    HVX_Vector valueslo = *(HVX_Vector *)word_values_masked;
-    HVX_Vector valueshi = *(HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
-    HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
-    HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
-    HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0);
-    HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0);
-
-    VSCATTER_32_MASKED(predslo, &vtcm.vscatter32, region_len, offsetslo,
-                       valueslo);
-    VSCATTER_32_MASKED(predshi, &vtcm.vscatter32, region_len, offsetshi,
-                       valueshi);
+    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
+    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
+    HVX_Vector *valueslo = (HVX_Vector *)word_values_masked;
+    HVX_Vector *valueshi = (HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
+    HVX_Vector *predslo = (HVX_Vector *)word_predicates;
+    HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
+
+    asm ("r1 = #-1\n\t"
+         "v0 = vmem(%0 + #0)\n\t"
+         "q0 = vand(v0, r1)\n\t"
+         "m0 = %2\n\t"
+         "v0 = vmem(%3 + #0)\n\t"
+         "v1 = vmem(%4 + #0)\n\t"
+         "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
+         : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
+             "r"(offsetslo), "r"(valueslo)
+         : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
+    asm ("r1 = #-1\n\t"
+         "v0 = vmem(%0 + #0)\n\t"
+         "q0 = vand(v0, r1)\n\t"
+         "m0 = %2\n\t"
+         "v0 = vmem(%3 + #0)\n\t"
+         "v1 = vmem(%4 + #0)\n\t"
+         "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
+         : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
+             "r"(offsetshi), "r"(valueshi)
+         : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
 
-    sync_scatter(vtcm.vscatter16);
+    sync_scatter(vtcm.vscatter32);
 }
 
-/* scatter the 16 bit elements with 32 bit offsets using intrinsics */
+/* scatter the 16 bit elements with 32 bit offsets using HVX */
 void vector_scatter_16_32(void)
 {
-    HVX_VectorPair offsets;
-    HVX_Vector values;
-
-    /* get the word offsets in a vector pair */
-    offsets = *(HVX_VectorPair *)word_offsets;
-
-    /* these values need to be shuffled for the scatter */
-    values = *(HVX_Vector *)half_values;
-    values = VSHUFF_H(values);
-
-    VSCATTER_16_32(&vtcm.vscatter16_32, region_len, offsets, values);
+    asm ("m0 = %1\n\t"
+         "v0 = vmem(%2 + #0)\n\t"
+         "v1 = vmem(%2 + #1)\n\t"
+         "v2 = vmem(%3 + #0)\n\t"
+         "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
+         "vscatter(%0, m0, v1:0.w).h = v2\n\t"
+         : : "r"(vtcm.vscatter16_32), "r"(region_len),
+             "r"(word_offsets), "r"(half_values)
+         : "m0", "v0", "v1", "v2", "memory");
 
     sync_scatter(vtcm.vscatter16_32);
 }
 
-/* scatter-acc the 16 bit elements with 32 bit offsets using intrinsics */
+/* scatter-accumulate the 16 bit elements with 32 bit offsets using HVX */
 void vector_scatter_16_32_acc(void)
 {
-    HVX_VectorPair offsets;
-    HVX_Vector values;
-
-    /* get the word offsets in a vector pair */
-    offsets = *(HVX_VectorPair *)word_offsets;
-
-    /* these values need to be shuffled for the scatter */
-    values = *(HVX_Vector *)half_values_acc;
-    values = VSHUFF_H(values);
-
-    VSCATTER_16_32_ACC(&vtcm.vscatter16_32, region_len, offsets, values);
+    asm ("m0 = %1\n\t"
+         "v0 = vmem(%2 + #0)\n\t"
+         "v1 = vmem(%2 + #1)\n\t"
+         "v2 = vmem(%3 + #0)\n\t" \
+         "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
+         "vscatter(%0, m0, v1:0.w).h += v2\n\t"
+         : : "r"(vtcm.vscatter16_32), "r"(region_len),
+             "r"(word_offsets), "r"(half_values_acc)
+         : "m0", "v0", "v1", "v2", "memory");
 
     sync_scatter(vtcm.vscatter16_32);
 }
 
-/* masked scatter the 16 bit elements with 32 bit offsets using intrinsics */
+/* masked scatter the 16 bit elements with 32 bit offsets using HVX */
 void vector_scatter_16_32_masked(void)
 {
-    HVX_VectorPair offsets;
-    HVX_Vector values;
-    HVX_Vector pred_reg;
-
-    /* get the word offsets in a vector pair */
-    offsets = *(HVX_VectorPair *)word_offsets;
-
-    /* these values need to be shuffled for the scatter */
-    values = *(HVX_Vector *)half_values_masked;
-    values = VSHUFF_H(values);
-
-    pred_reg = *(HVX_Vector *)half_predicates;
-    pred_reg = VSHUFF_H(pred_reg);
-    HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
-
-    VSCATTER_16_32_MASKED(preds, &vtcm.vscatter16_32, region_len, offsets,
-                          values);
+    asm ("r1 = #-1\n\t"
+         "v0 = vmem(%0 + #0)\n\t"
+         "v0.h = vshuff(v0.h)\n\t"  /* shuffle the predicates */
+         "q0 = vand(v0, r1)\n\t"
+         "m0 = %2\n\t"
+         "v0 = vmem(%3 + #0)\n\t"
+         "v1 = vmem(%3 + #1)\n\t"
+         "v2 = vmem(%4 + #0)\n\t" \
+         "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
+         "if (q0) vscatter(%1, m0, v1:0.w).h = v2\n\t"
+         : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
+             "r"(word_offsets), "r"(half_values_masked)
+         : "r1", "q0", "m0", "v0", "v1", "v2", "memory");
 
     sync_scatter(vtcm.vscatter16_32);
 }
 
-/* gather the elements from the scatter16 buffer */
+/* gather the elements from the scatter16 buffer using HVX */
 void vector_gather_16(void)
 {
-    HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16;
-    HVX_Vector offsets = *(HVX_Vector *)half_offsets;
-
-    VGATHER_16(vgather, &vtcm.vscatter16, region_len, offsets);
-
-    sync_gather(vgather);
+    asm ("m0 = %1\n\t"
+         "v0 = vmem(%2 + #0)\n\t"
+         "{ vtmp.h = vgather(%0, m0, v0.h).h\n\t"
+         "  vmem(%3 + #0) = vtmp.new }\n\t"
+         : : "r"(vtcm.vscatter16), "r"(region_len),
+             "r"(half_offsets), "r"(vtcm.vgather16)
+         : "m0", "v0", "memory");
+
+    sync_gather(vtcm.vgather16);
 }
 
 static unsigned short gather_16_masked_init(void)
@@ -427,31 +423,51 @@ static unsigned short gather_16_masked_init(void)
     return letter | (letter << 8);
 }
 
+/* masked gather the elements from the scatter16 buffer using HVX */
 void vector_gather_16_masked(void)
 {
-    HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16;
-    HVX_Vector offsets = *(HVX_Vector *)half_offsets;
-    HVX_Vector pred_reg = *(HVX_Vector *)half_predicates;
-    HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
-
-    *vgather = VSPLAT_H(gather_16_masked_init());
-    VGATHER_16_MASKED(vgather, preds, &vtcm.vscatter16, region_len, offsets);
-
-    sync_gather(vgather);
+    unsigned short init = gather_16_masked_init();
+
+    asm ("v0.h = vsplat(%5)\n\t"
+         "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
+         "r1 = #-1\n\t"
+         "v0 = vmem(%0 + #0)\n\t"
+         "q0 = vand(v0, r1)\n\t"
+         "m0 = %2\n\t"
+         "v0 = vmem(%3 + #0)\n\t"
+         "{ if (q0) vtmp.h = vgather(%1, m0, v0.h).h\n\t"
+         "  vmem(%4 + #0) = vtmp.new }\n\t"
+         : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
+             "r"(half_offsets), "r"(vtcm.vgather16), "r"(init)
+         : "r1", "q0", "m0", "v0", "memory");
+
+    sync_gather(vtcm.vgather16);
 }
 
-/* gather the elements from the scatter32 buffer */
+/* gather the elements from the scatter32 buffer using HVX */
 void vector_gather_32(void)
 {
-    HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32;
-    HVX_Vector *vgatherhi =
-        (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2));
-    HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
-    HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
-
-    VGATHER_32(vgatherlo, &vtcm.vscatter32, region_len, offsetslo);
-    VGATHER_32(vgatherhi, &vtcm.vscatter32, region_len, offsetshi);
+    HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
+    HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
+    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
+    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
+
+    asm ("m0 = %1\n\t"
+         "v0 = vmem(%2 + #0)\n\t"
+         "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
+         "  vmem(%3 + #0) = vtmp.new }\n\t"
+         : : "r"(vtcm.vscatter32), "r"(region_len),
+             "r"(offsetslo), "r"(vgatherlo)
+         : "m0", "v0", "memory");
+    asm ("m0 = %1\n\t"
+         "v0 = vmem(%2 + #0)\n\t"
+         "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
+         "  vmem(%3 + #0) = vtmp.new }\n\t"
+         : : "r"(vtcm.vscatter32), "r"(region_len),
+             "r"(offsetshi), "r"(vgatherhi)
+         : "m0", "v0", "memory");
 
+    sync_gather(vgatherlo);
     sync_gather(vgatherhi);
 }
 
@@ -461,79 +477,88 @@ static unsigned int gather_32_masked_init(void)
     return letter | (letter << 8) | (letter << 16) | (letter << 24);
 }
 
+/* masked gather the elements from the scatter32 buffer using HVX */
 void vector_gather_32_masked(void)
 {
-    HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32;
-    HVX_Vector *vgatherhi =
-        (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2));
-    HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
-    HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
-    HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
-    HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0);
-    HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
-    HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0);
-
-    *vgatherlo = VSPLAT_H(gather_32_masked_init());
-    *vgatherhi = VSPLAT_H(gather_32_masked_init());
-    VGATHER_32_MASKED(vgatherlo, predslo, &vtcm.vscatter32, region_len,
-                      offsetslo);
-    VGATHER_32_MASKED(vgatherhi, predshi, &vtcm.vscatter32, region_len,
-                      offsetshi);
+    unsigned int init = gather_32_masked_init();
+    HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
+    HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
+    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
+    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
+    HVX_Vector *predslo = (HVX_Vector *)word_predicates;
+    HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
+
+    asm ("v0.h = vsplat(%5)\n\t"
+         "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
+         "r1 = #-1\n\t"
+         "v0 = vmem(%0 + #0)\n\t"
+         "q0 = vand(v0, r1)\n\t"
+         "m0 = %2\n\t"
+         "v0 = vmem(%3 + #0)\n\t"
+         "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
+         "  vmem(%4 + #0) = vtmp.new }\n\t"
+         : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
+             "r"(offsetslo), "r"(vgatherlo), "r"(init)
+         : "r1", "q0", "m0", "v0", "memory");
+    asm ("v0.h = vsplat(%5)\n\t"
+         "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
+         "r1 = #-1\n\t"
+         "v0 = vmem(%0 + #0)\n\t"
+         "q0 = vand(v0, r1)\n\t"
+         "m0 = %2\n\t"
+         "v0 = vmem(%3 + #0)\n\t"
+         "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
+         "  vmem(%4 + #0) = vtmp.new }\n\t"
+         : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
+             "r"(offsetshi), "r"(vgatherhi), "r"(init)
+         : "r1", "q0", "m0", "v0", "memory");
 
     sync_gather(vgatherlo);
     sync_gather(vgatherhi);
 }
 
-/* gather the elements from the scatter16_32 buffer */
+/* gather the elements from the scatter16_32 buffer using HVX */
 void vector_gather_16_32(void)
 {
-    HVX_Vector *vgather;
-    HVX_VectorPair offsets;
-    HVX_Vector values;
-
-    /* get the vtcm address to gather from */
-    vgather = (HVX_Vector *)&vtcm.vgather16_32;
-
-    /* get the word offsets in a vector pair */
-    offsets = *(HVX_VectorPair *)word_offsets;
-
-    VGATHER_16_32(vgather, &vtcm.vscatter16_32, region_len, offsets);
-
-    /* deal the elements to get the order back */
-    values = *(HVX_Vector *)vgather;
-    values = VDEAL_H(values);
-
-    /* write it back to vtcm address */
-    *(HVX_Vector *)vgather = values;
+    asm ("m0 = %1\n\t"
+         "v0 = vmem(%2 + #0)\n\t"
+         "v1 = vmem(%2 + #1)\n\t"
+         "{ vtmp.h = vgather(%0, m0, v1:0.w).h\n\t"
+         "  vmem(%3 + #0) = vtmp.new }\n\t"
+         "v0 = vmem(%3 + #0)\n\t"
+         "v0.h = vdeal(v0.h)\n\t"  /* deal the elements to get the order back */
+         "vmem(%3 + #0) = v0\n\t"
+         : : "r"(vtcm.vscatter16_32), "r"(region_len),
+             "r"(word_offsets), "r"(vtcm.vgather16_32)
+         : "m0", "v0", "v1", "memory");
+
+    sync_gather(vtcm.vgather16_32);
 }
 
+/* masked gather the elements from the scatter16_32 buffer using HVX */
 void vector_gather_16_32_masked(void)
 {
-    HVX_Vector *vgather;
-    HVX_VectorPair offsets;
-    HVX_Vector pred_reg;
-    HVX_VectorPred preds;
-    HVX_Vector values;
-
-    /* get the vtcm address to gather from */
-    vgather = (HVX_Vector *)&vtcm.vgather16_32;
-
-    /* get the word offsets in a vector pair */
-    offsets = *(HVX_VectorPair *)word_offsets;
-    pred_reg = *(HVX_Vector *)half_predicates;
-    pred_reg = VSHUFF_H(pred_reg);
-    preds = VAND_VAL(pred_reg, ~0);
-
-   *vgather = VSPLAT_H(gather_16_masked_init());
-   VGATHER_16_32_MASKED(vgather, preds, &vtcm.vscatter16_32, region_len,
-                        offsets);
-
-    /* deal the elements to get the order back */
-    values = *(HVX_Vector *)vgather;
-    values = VDEAL_H(values);
-
-    /* write it back to vtcm address */
-    *(HVX_Vector *)vgather = values;
+    unsigned short init = gather_16_masked_init();
+
+    asm ("v0.h = vsplat(%5)\n\t"
+         "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
+         "r1 = #-1\n\t"
+         "v0 = vmem(%0 + #0)\n\t"
+         "v0.h = vshuff(v0.h)\n\t"  /* shuffle the predicates */
+         "q0 = vand(v0, r1)\n\t"
+         "m0 = %2\n\t"
+         "v0 = vmem(%3 + #0)\n\t"
+         "v1 = vmem(%3 + #1)\n\t"
+         "{ if (q0) vtmp.h = vgather(%1, m0, v1:0.w).h\n\t"
+         "  vmem(%4 + #0) = vtmp.new }\n\t"
+         "v0 = vmem(%4 + #0)\n\t"
+         "v0.h = vdeal(v0.h)\n\t"  /* deal the elements to get the order back */
+         "vmem(%4 + #0) = v0\n\t"
+         : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
+             "r"(word_offsets), "r"(vtcm.vgather16_32), "r"(init)
+         : "r1", "q0", "m0", "v0", "v1", "memory");
+
+    sync_gather(vtcm.vgather16_32);
 }
 
 static void check_buffer(const char *name, void *c, void *r, size_t size)
@@ -579,6 +604,7 @@ void scalar_scatter_16_acc(unsigned short *vscatter16)
     }
 }
 
+/* scatter-accumulate the 16 bit elements using C */
 void check_scatter_16_acc()
 {
     memset(vscatter16_ref, FILL_CHAR,
@@ -589,7 +615,7 @@ void check_scatter_16_acc()
                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 }
 
-/* scatter the 16 bit elements using C */
+/* masked scatter the 16 bit elements using C */
 void scalar_scatter_16_masked(unsigned short *vscatter16)
 {
     for (int i = 0; i < MATRIX_SIZE; i++) {
@@ -628,7 +654,7 @@ void check_scatter_32()
                  SCATTER_BUFFER_SIZE * sizeof(unsigned int));
 }
 
-/* scatter the 32 bit elements using C */
+/* scatter-accumulate the 32 bit elements using C */
 void scalar_scatter_32_acc(unsigned int *vscatter32)
 {
     for (int i = 0; i < MATRIX_SIZE; ++i) {
@@ -646,7 +672,7 @@ void check_scatter_32_acc()
                  SCATTER_BUFFER_SIZE * sizeof(unsigned int));
 }
 
-/* scatter the 32 bit elements using C */
+/* masked scatter the 32 bit elements using C */
 void scalar_scatter_32_masked(unsigned int *vscatter32)
 {
     for (int i = 0; i < MATRIX_SIZE; i++) {
@@ -667,7 +693,7 @@ void check_scatter_32_masked()
                   SCATTER_BUFFER_SIZE * sizeof(unsigned int));
 }
 
-/* scatter the 32 bit elements using C */
+/* scatter the 16 bit elements with 32 bit offsets using C */
 void scalar_scatter_16_32(unsigned short *vscatter16_32)
 {
     for (int i = 0; i < MATRIX_SIZE; ++i) {
@@ -684,7 +710,7 @@ void check_scatter_16_32()
                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 }
 
-/* scatter the 32 bit elements using C */
+/* scatter-accumulate the 16 bit elements with 32 bit offsets using C */
 void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
 {
     for (int i = 0; i < MATRIX_SIZE; ++i) {
@@ -702,6 +728,7 @@ void check_scatter_16_32_acc()
                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 }
 
+/* masked scatter the 16 bit elements with 32 bit offsets using C */
 void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
 {
     for (int i = 0; i < MATRIX_SIZE; i++) {
@@ -738,6 +765,7 @@ void check_gather_16()
                    MATRIX_SIZE * sizeof(unsigned short));
 }
 
+/* masked gather the elements from the scatter buffer using C */
 void scalar_gather_16_masked(unsigned short *vgather16)
 {
     for (int i = 0; i < MATRIX_SIZE; ++i) {
@@ -756,7 +784,7 @@ void check_gather_16_masked()
                  MATRIX_SIZE * sizeof(unsigned short));
 }
 
-/* gather the elements from the scatter buffer using C */
+/* gather the elements from the scatter32 buffer using C */
 void scalar_gather_32(unsigned int *vgather32)
 {
     for (int i = 0; i < MATRIX_SIZE; ++i) {
@@ -772,6 +800,7 @@ void check_gather_32(void)
                  MATRIX_SIZE * sizeof(unsigned int));
 }
 
+/* masked gather the elements from the scatter32 buffer using C */
 void scalar_gather_32_masked(unsigned int *vgather32)
 {
     for (int i = 0; i < MATRIX_SIZE; ++i) {
@@ -781,7 +810,6 @@ void scalar_gather_32_masked(unsigned int *vgather32)
     }
 }
 
-
 void check_gather_32_masked(void)
 {
     memset(vgather32_ref, gather_32_masked_init(),
@@ -791,7 +819,7 @@ void check_gather_32_masked(void)
                  vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
 }
 
-/* gather the elements from the scatter buffer using C */
+/* gather the elements from the scatter16_32 buffer using C */
 void scalar_gather_16_32(unsigned short *vgather16_32)
 {
     for (int i = 0; i < MATRIX_SIZE; ++i) {
@@ -807,6 +835,7 @@ void check_gather_16_32(void)
                  MATRIX_SIZE * sizeof(unsigned short));
 }
 
+/* masked gather the elements from the scatter16_32 buffer using C */
 void scalar_gather_16_32_masked(unsigned short *vgather16_32)
 {
     for (int i = 0; i < MATRIX_SIZE; ++i) {
diff --git a/tests/tcg/multiarch/Makefile.target b/tests/tcg/multiarch/Makefile.target
index ae8b3d7268..373db69648 100644
--- a/tests/tcg/multiarch/Makefile.target
+++ b/tests/tcg/multiarch/Makefile.target
@@ -64,6 +64,7 @@ run-test-mmap-%: test-mmap
 	$(call run-test, test-mmap-$*, $(QEMU) -p $* $<, $< ($* byte pages))
 
 ifneq ($(HAVE_GDB_BIN),)
+ifeq ($(HOST_GDB_SUPPORTS_ARCH),y)
 GDB_SCRIPT=$(SRC_PATH)/tests/guest-debug/run-test.py
 
 run-gdbstub-sha1: sha1
@@ -89,6 +90,10 @@ run-gdbstub-thread-breakpoint: testthread
 
 else
 run-gdbstub-%:
+	$(call skip-test, "gdbstub test $*", "no guest arch support")
+endif
+else
+run-gdbstub-%:
 	$(call skip-test, "gdbstub test $*", "need working gdb")
 endif
 EXTRA_RUNS += run-gdbstub-sha1 run-gdbstub-qxfer-auxv-read \
diff --git a/tests/tcg/multiarch/system/Makefile.softmmu-target b/tests/tcg/multiarch/system/Makefile.softmmu-target
index 368b64d531..5f432c95f3 100644
--- a/tests/tcg/multiarch/system/Makefile.softmmu-target
+++ b/tests/tcg/multiarch/system/Makefile.softmmu-target
@@ -15,6 +15,7 @@ MULTIARCH_TEST_SRCS=$(wildcard $(MULTIARCH_SYSTEM_SRC)/*.c)
 MULTIARCH_TESTS = $(patsubst $(MULTIARCH_SYSTEM_SRC)/%.c, %, $(MULTIARCH_TEST_SRCS))
 
 ifneq ($(HAVE_GDB_BIN),)
+ifeq ($(HOST_GDB_SUPPORTS_ARCH),y)
 GDB_SCRIPT=$(SRC_PATH)/tests/guest-debug/run-test.py
 
 run-gdbstub-memory: memory
@@ -26,7 +27,10 @@ run-gdbstub-memory: memory
 		"-monitor none -display none -chardev file$(COMMA)path=$<.out$(COMMA)id=output $(QEMU_OPTS)" \
 		--bin $< --test $(MULTIARCH_SRC)/gdbstub/memory.py, \
 	softmmu gdbstub support)
-
+else
+run-gdbstub-%:
+	$(call skip-test, "gdbstub test $*", "no guest arch support")
+endif
 else
 run-gdbstub-%:
 	$(call skip-test, "gdbstub test $*", "need working gdb")
diff --git a/tests/tcg/s390x/Makefile.softmmu-target b/tests/tcg/s390x/Makefile.softmmu-target
index 725b6c598d..3e7f72abcd 100644
--- a/tests/tcg/s390x/Makefile.softmmu-target
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -1,11 +1,25 @@
 S390X_SRC=$(SRC_PATH)/tests/tcg/s390x
 VPATH+=$(S390X_SRC)
 QEMU_OPTS=-action panic=exit-failure -kernel
+LINK_SCRIPT=$(S390X_SRC)/softmmu.ld
+LDFLAGS=-nostdlib -static -Wl,-T$(LINK_SCRIPT) -Wl,--build-id=none
 
-%: %.S
-	$(CC) -march=z13 -m64 -nostdlib -static -Wl,-Ttext=0 \
-		-Wl,--build-id=none $< -o $@
+%.o: %.S
+	$(CC) -march=z13 -m64 -c $< -o $@
+
+%: %.o $(LINK_SCRIPT)
+	$(CC) $< -o $@ $(LDFLAGS)
 
 TESTS += unaligned-lowcore
 TESTS += bal
 TESTS += sam
+TESTS += lpsw
+TESTS += lpswe-early
+TESTS += ssm-early
+TESTS += stosm-early
+TESTS += exrl-ssm-early
+
+include $(S390X_SRC)/pgm-specification.mak
+$(PGM_SPECIFICATION_TESTS): pgm-specification-softmmu.o
+$(PGM_SPECIFICATION_TESTS): LDFLAGS+=pgm-specification-softmmu.o
+TESTS += $(PGM_SPECIFICATION_TESTS)
diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
index 72ad309b27..0031868b13 100644
--- a/tests/tcg/s390x/Makefile.target
+++ b/tests/tcg/s390x/Makefile.target
@@ -2,6 +2,9 @@ S390X_SRC=$(SRC_PATH)/tests/tcg/s390x
 VPATH+=$(S390X_SRC)
 CFLAGS+=-march=zEC12 -m64
 
+%.o: %.c
+	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) -c $< -o $@
+
 config-cc.mak: Makefile
 	$(quiet-@)( \
 	    $(call cc-option,-march=z14, CROSS_CC_HAS_Z14); \
@@ -28,10 +31,20 @@ TESTS+=div
 TESTS+=clst
 TESTS+=long-double
 TESTS+=cdsg
+TESTS+=chrl
+TESTS+=rxsbg
+TESTS+=ex-relative-long
 
 cdsg: CFLAGS+=-pthread
 cdsg: LDFLAGS+=-pthread
 
+rxsbg: CFLAGS+=-O2
+
+include $(S390X_SRC)/pgm-specification.mak
+$(PGM_SPECIFICATION_TESTS): pgm-specification-user.o
+$(PGM_SPECIFICATION_TESTS): LDFLAGS+=pgm-specification-user.o
+TESTS += $(PGM_SPECIFICATION_TESTS)
+
 Z13_TESTS=vistr
 $(Z13_TESTS): CFLAGS+=-march=z13 -O2
 TESTS+=$(Z13_TESTS)
@@ -51,7 +64,7 @@ $(Z15_TESTS): CFLAGS+=-march=z15 -O2
 TESTS+=$(Z15_TESTS)
 endif
 
-ifneq ($(HAVE_GDB_BIN),)
+ifeq ($(HOST_GDB_SUPPORTS_ARCH),y)
 GDB_SCRIPT=$(SRC_PATH)/tests/guest-debug/run-test.py
 
 run-gdbstub-signals-s390x: signals-s390x
diff --git a/tests/tcg/s390x/br-odd.S b/tests/tcg/s390x/br-odd.S
new file mode 100644
index 0000000000..2fae47a9e3
--- /dev/null
+++ b/tests/tcg/s390x/br-odd.S
@@ -0,0 +1,16 @@
+/*
+ * Test BRanching to a non-mapped odd address.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .globl test
+test:
+    lgrl %r1,odd_addr
+    br %r1
+
+    .align 8
+odd_addr:
+    .quad 0xDDDDDDDDDDDDDDDD
+    .globl expected_old_psw
+expected_old_psw:
+    .quad 0x180000000,0xDDDDDDDDDDDDDDDD
diff --git a/tests/tcg/s390x/cgrl-unaligned.S b/tests/tcg/s390x/cgrl-unaligned.S
new file mode 100644
index 0000000000..164d68f2e6
--- /dev/null
+++ b/tests/tcg/s390x/cgrl-unaligned.S
@@ -0,0 +1,16 @@
+/*
+ * Test CGRL with a non-doubleword aligned address.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .globl test
+test:
+    cgrl %r1,unaligned
+
+    .align 8
+    .globl expected_old_psw
+expected_old_psw:
+    .quad 0x180000000,test
+    .long 0
+unaligned:
+    .quad 0
diff --git a/tests/tcg/s390x/chrl.c b/tests/tcg/s390x/chrl.c
new file mode 100644
index 0000000000..b1c3a1c561
--- /dev/null
+++ b/tests/tcg/s390x/chrl.c
@@ -0,0 +1,80 @@
+#include <stdlib.h>
+#include <assert.h>
+#include <stdint.h>
+
+static void test_chrl(void)
+{
+    uint32_t program_mask, cc;
+
+    asm volatile (
+        ".pushsection .rodata\n"
+        "0:\n\t"
+        ".short 1, 0x8000\n\t"
+        ".popsection\n\t"
+
+        "chrl %[r], 0b\n\t"
+        "ipm %[program_mask]\n"
+        : [program_mask] "=r" (program_mask)
+        : [r] "r" (1)
+    );
+
+    cc = program_mask >> 28;
+    assert(!cc);
+
+    asm volatile (
+        ".pushsection .rodata\n"
+        "0:\n\t"
+        ".short -1, 0x8000\n\t"
+        ".popsection\n\t"
+
+        "chrl %[r], 0b\n\t"
+        "ipm %[program_mask]\n"
+        : [program_mask] "=r" (program_mask)
+        : [r] "r" (-1)
+    );
+
+    cc = program_mask >> 28;
+    assert(!cc);
+}
+
+static void test_cghrl(void)
+{
+    uint32_t program_mask, cc;
+
+    asm volatile (
+        ".pushsection .rodata\n"
+        "0:\n\t"
+        ".short 1, 0x8000, 0, 0\n\t"
+        ".popsection\n\t"
+
+        "cghrl %[r], 0b\n\t"
+        "ipm %[program_mask]\n"
+        : [program_mask] "=r" (program_mask)
+        : [r] "r" (1L)
+    );
+
+    cc = program_mask >> 28;
+    assert(!cc);
+
+    asm volatile (
+        ".pushsection .rodata\n"
+        "0:\n\t"
+        ".short -1, 0x8000, 0, 0\n\t"
+        ".popsection\n\t"
+
+        "cghrl %[r], 0b\n\t"
+        "ipm %[program_mask]\n"
+        : [program_mask] "=r" (program_mask)
+        : [r] "r" (-1L)
+    );
+
+    cc = program_mask >> 28;
+    assert(!cc);
+}
+
+int main(void)
+{
+    test_chrl();
+    test_cghrl();
+    return EXIT_SUCCESS;
+}
diff --git a/tests/tcg/s390x/clrl-unaligned.S b/tests/tcg/s390x/clrl-unaligned.S
new file mode 100644
index 0000000000..182b1b6462
--- /dev/null
+++ b/tests/tcg/s390x/clrl-unaligned.S
@@ -0,0 +1,16 @@
+/*
+ * Test CLRL with a non-word aligned address.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .globl test
+test:
+    clrl %r1,unaligned
+
+    .align 8
+    .globl expected_old_psw
+expected_old_psw:
+    .quad 0x180000000,test
+    .short 0
+unaligned:
+    .long 0
diff --git a/tests/tcg/s390x/crl-unaligned.S b/tests/tcg/s390x/crl-unaligned.S
new file mode 100644
index 0000000000..b86fbe0ef3
--- /dev/null
+++ b/tests/tcg/s390x/crl-unaligned.S
@@ -0,0 +1,16 @@
+/*
+ * Test CRL with a non-word aligned address.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .globl test
+test:
+    crl %r1,unaligned
+
+    .align 8
+    .globl expected_old_psw
+expected_old_psw:
+    .quad 0x180000000,test
+    .short 0
+unaligned:
+    .long 0
diff --git a/tests/tcg/s390x/ex-odd.S b/tests/tcg/s390x/ex-odd.S
new file mode 100644
index 0000000000..4e42a47df3
--- /dev/null
+++ b/tests/tcg/s390x/ex-odd.S
@@ -0,0 +1,17 @@
+/*
+ * Test EXECUTing a non-mapped odd address.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .globl test
+test:
+    lgrl %r1,odd_addr
+fail:
+    ex 0,0(%r1)
+
+    .align 8
+odd_addr:
+    .quad 0xDDDDDDDDDDDDDDDD
+    .globl expected_old_psw
+expected_old_psw:
+    .quad 0x180000000,fail
diff --git a/tests/tcg/s390x/ex-relative-long.c b/tests/tcg/s390x/ex-relative-long.c
new file mode 100644
index 0000000000..21fbef6258
--- /dev/null
+++ b/tests/tcg/s390x/ex-relative-long.c
@@ -0,0 +1,156 @@
+/* Check EXECUTE with relative long instructions as targets. */
+#include <stdlib.h>
+#include <stdio.h>
+
+struct test {
+    const char *name;
+    long (*func)(long reg, long *cc);
+    long exp_reg;
+    long exp_mem;
+    long exp_cc;
+};
+
+/*
+ * Each test sets the MEM_IDXth element of the mem array to MEM and uses a
+ * single relative long instruction on it. The other elements remain zero.
+ * This is in order to prevent stumbling upon MEM in random memory in case
+ * there is an off-by-a-small-value bug.
+ *
+ * Note that while gcc supports the ZL constraint for relative long operands,
+ * clang doesn't, so the assembly code accesses mem[MEM_IDX] using MEM_ASM.
+ */
+static long mem[0x1000];
+#define MEM_IDX 0x800
+#define MEM_ASM "mem+0x800*8"
+
+/* Initial %r2 value. */
+#define REG 0x1234567887654321
+
+/* Initial mem[MEM_IDX] value. */
+#define MEM 0xfedcba9889abcdef
+
+/* Initial cc value. */
+#define CC 0
+
+/* Relative long instructions and their expected effects. */
+#define FOR_EACH_INSN(F)                                                       \
+    F(cgfrl,  REG,                 MEM,                2)                      \
+    F(cghrl,  REG,                 MEM,                2)                      \
+    F(cgrl,   REG,                 MEM,                2)                      \
+    F(chrl,   REG,                 MEM,                1)                      \
+    F(clgfrl, REG,                 MEM,                2)                      \
+    F(clghrl, REG,                 MEM,                2)                      \
+    F(clgrl,  REG,                 MEM,                1)                      \
+    F(clhrl,  REG,                 MEM,                2)                      \
+    F(clrl,   REG,                 MEM,                1)                      \
+    F(crl,    REG,                 MEM,                1)                      \
+    F(larl,   (long)&mem[MEM_IDX], MEM,                CC)                     \
+    F(lgfrl,  0xfffffffffedcba98,  MEM,                CC)                     \
+    F(lghrl,  0xfffffffffffffedc,  MEM,                CC)                     \
+    F(lgrl,   MEM,                 MEM,                CC)                     \
+    F(lhrl,   0x12345678fffffedc,  MEM,                CC)                     \
+    F(llghrl, 0x000000000000fedc,  MEM,                CC)                     \
+    F(llhrl,  0x123456780000fedc,  MEM,                CC)                     \
+    F(lrl,    0x12345678fedcba98,  MEM,                CC)                     \
+    F(stgrl,  REG,                 REG,                CC)                     \
+    F(sthrl,  REG,                 0x4321ba9889abcdef, CC)                     \
+    F(strl,   REG,                 0x8765432189abcdef, CC)
+
+/* Test functions. */
+#define DEFINE_EX_TEST(insn, exp_reg, exp_mem, exp_cc)                         \
+    static long test_ex_ ## insn(long reg, long *cc)                           \
+    {                                                                          \
+        register long r2 asm("r2");                                            \
+        char mask = 0x20;  /* make target use %r2 */                           \
+        long pm, target;                                                       \
+                                                                               \
+        r2 = reg;                                                              \
+        asm("larl %[target],0f\n"                                              \
+            "cr %%r0,%%r0\n"  /* initial cc */                                 \
+            "ex %[mask],0(%[target])\n"                                        \
+            "jg 1f\n"                                                          \
+            "0: " #insn " %%r0," MEM_ASM "\n"                                  \
+            "1: ipm %[pm]\n"                                                   \
+            : [target] "=&a" (target), [r2] "+r" (r2), [pm] "=r" (pm)          \
+            : [mask] "a" (mask)                                                \
+            : "cc", "memory");                                                 \
+        reg = r2;                                                              \
+        *cc = (pm >> 28) & 3;                                                  \
+                                                                               \
+        return reg;                                                            \
+    }
+
+#define DEFINE_EXRL_TEST(insn, exp_reg, exp_mem, exp_cc)                       \
+    static long test_exrl_ ## insn(long reg, long *cc)                         \
+    {                                                                          \
+        register long r2 asm("r2");                                            \
+        char mask = 0x20;  /* make target use %r2 */                           \
+        long pm;                                                               \
+                                                                               \
+        r2 = reg;                                                              \
+        asm("cr %%r0,%%r0\n"  /* initial cc */                                 \
+            "exrl %[mask],0f\n"                                                \
+            "jg 1f\n"                                                          \
+            "0: " #insn " %%r0," MEM_ASM "\n"                                  \
+            "1: ipm %[pm]\n"                                                   \
+            : [r2] "+r" (r2), [pm] "=r" (pm)                                   \
+            : [mask] "a" (mask)                                                \
+            : "cc", "memory");                                                 \
+        reg = r2;                                                              \
+        *cc = (pm >> 28) & 3;                                                  \
+                                                                               \
+        return reg;                                                            \
+    }
+
+FOR_EACH_INSN(DEFINE_EX_TEST)
+FOR_EACH_INSN(DEFINE_EXRL_TEST)
+
+/* Test definitions. */
+#define REGISTER_EX_EXRL_TEST(ex_insn, insn, _exp_reg, _exp_mem, _exp_cc)      \
+    {                                                                          \
+        .name = #ex_insn " " #insn,                                            \
+        .func = test_ ## ex_insn ## _ ## insn,                                 \
+        .exp_reg = (_exp_reg),                                                 \
+        .exp_mem = (_exp_mem),                                                 \
+        .exp_cc = (_exp_cc),                                                   \
+    },
+
+#define REGISTER_EX_TEST(insn, exp_reg, exp_mem, exp_cc)                       \
+    REGISTER_EX_EXRL_TEST(ex, insn, exp_reg, exp_mem, exp_cc)
+
+#define REGISTER_EXRL_TEST(insn, exp_reg, exp_mem, exp_cc)                     \
+    REGISTER_EX_EXRL_TEST(exrl, insn, exp_reg, exp_mem, exp_cc)
+
+static const struct test tests[] = {
+    FOR_EACH_INSN(REGISTER_EX_TEST)
+    FOR_EACH_INSN(REGISTER_EXRL_TEST)
+};
+
+/* Loop over all tests and run them. */
+int main(void)
+{
+    const struct test *test;
+    int ret = EXIT_SUCCESS;
+    long reg, cc;
+    size_t i;
+
+    for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
+        test = &tests[i];
+        mem[MEM_IDX] = MEM;
+        cc = -1;
+        reg = test->func(REG, &cc);
+#define ASSERT_EQ(expected, actual) do {                                       \
+    if (expected != actual) {                                                  \
+        fprintf(stderr, "%s: " #expected " (0x%lx) != " #actual " (0x%lx)\n",  \
+                test->name, expected, actual);                                 \
+        ret = EXIT_FAILURE;                                                    \
+    }                                                                          \
+} while (0)
+        ASSERT_EQ(test->exp_reg, reg);
+        ASSERT_EQ(test->exp_mem, mem[MEM_IDX]);
+        ASSERT_EQ(test->exp_cc, cc);
+#undef ASSERT_EQ
+    }
+
+    return ret;
+}
diff --git a/tests/tcg/s390x/exrl-ssm-early.S b/tests/tcg/s390x/exrl-ssm-early.S
new file mode 100644
index 0000000000..68fbd87b3a
--- /dev/null
+++ b/tests/tcg/s390x/exrl-ssm-early.S
@@ -0,0 +1,43 @@
+/*
+ * Test early exception recognition using EXRL + SSM.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .org 0x8d
+ilc:
+    .org 0x8e
+program_interruption_code:
+    .org 0x150
+program_old_psw:
+    .org 0x1D0                         /* program new PSW */
+    .quad 0,pgm
+    .org 0x200                         /* lowcore padding */
+
+    .globl _start
+_start:
+    exrl %r0,ssm
+expected_pswa:
+    j failure
+ssm:
+    ssm ssm_op
+
+pgm:
+    chhsi program_interruption_code,0x6          /* specification exception? */
+    jne failure
+    cli ilc,6                                    /* ilc for EXRL? */
+    jne failure
+    clc program_old_psw(16),expected_old_psw     /* correct old PSW? */
+    jne failure
+    lpswe success_psw
+failure:
+    lpswe failure_psw
+
+ssm_op:
+    .byte 0x08                                   /* bit 4 set */
+    .align 8
+expected_old_psw:
+    .quad 0x0800000180000000,expected_pswa       /* bit 2 set */
+success_psw:
+    .quad 0x2000000000000,0xfff        /* see is_special_wait_psw() */
+failure_psw:
+    .quad 0x2000000000000,0            /* disabled wait */
diff --git a/tests/tcg/s390x/lgrl-unaligned.S b/tests/tcg/s390x/lgrl-unaligned.S
new file mode 100644
index 0000000000..ef8d51d47c
--- /dev/null
+++ b/tests/tcg/s390x/lgrl-unaligned.S
@@ -0,0 +1,16 @@
+/*
+ * Test LGRL from a non-doubleword aligned address.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .globl test
+test:
+    lgrl %r1,unaligned
+
+    .align 8
+    .globl expected_old_psw
+expected_old_psw:
+    .quad 0x180000000,test
+    .long 0
+unaligned:
+    .quad 0
diff --git a/tests/tcg/s390x/llgfrl-unaligned.S b/tests/tcg/s390x/llgfrl-unaligned.S
new file mode 100644
index 0000000000..c9b4eeaecf
--- /dev/null
+++ b/tests/tcg/s390x/llgfrl-unaligned.S
@@ -0,0 +1,16 @@
+/*
+ * Test LLGFRL from a non-word aligned address.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .globl test
+test:
+    llgfrl %r1,unaligned
+
+    .align 8
+    .globl expected_old_psw
+expected_old_psw:
+    .quad 0x180000000,test
+    .short 0
+unaligned:
+    .long 0
diff --git a/tests/tcg/s390x/lpsw.S b/tests/tcg/s390x/lpsw.S
new file mode 100644
index 0000000000..b37dec59b7
--- /dev/null
+++ b/tests/tcg/s390x/lpsw.S
@@ -0,0 +1,36 @@
+/*
+ * Test the LPSW instruction.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .org 0x140
+svc_old_psw:
+    .org 0x1c0                         /* supervisor call new PSW */
+    .quad 0x80000000,svc               /* 31-bit mode */
+    .org 0x200                         /* lowcore padding */
+
+    .globl _start
+_start:
+    lpsw short_psw
+lpsw_target:
+    svc 0
+expected_pswa:
+    j failure
+
+svc:
+    clc svc_old_psw(16),expected_psw   /* correct full PSW? */
+    jne failure
+    lpswe success_psw
+failure:
+    lpswe failure_psw
+
+    .align 8
+short_psw:
+    .long 0x90001,0x80000000+lpsw_target         /* problem state,
+                                                    64-bit mode */
+expected_psw:
+    .quad 0x1000180000000,expected_pswa          /* corresponds to short_psw */
+success_psw:
+    .quad 0x2000000000000,0xfff        /* see is_special_wait_psw() */
+failure_psw:
+    .quad 0x2000000000000,0            /* disabled wait */
diff --git a/tests/tcg/s390x/lpswe-early.S b/tests/tcg/s390x/lpswe-early.S
new file mode 100644
index 0000000000..90a7f213df
--- /dev/null
+++ b/tests/tcg/s390x/lpswe-early.S
@@ -0,0 +1,38 @@
+/*
+ * Test early exception recognition using LPSWE.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .org 0x8d
+ilc:
+    .org 0x8e
+program_interruption_code:
+    .org 0x150
+program_old_psw:
+    .org 0x1D0                         /* program new PSW */
+    .quad 0,pgm
+    .org 0x200                         /* lowcore padding */
+
+    .globl _start
+_start:
+    lpswe bad_psw
+    j failure
+
+pgm:
+    chhsi program_interruption_code,0x6          /* specification exception? */
+    jne failure
+    cli ilc,0                                    /* ilc zero? */
+    jne failure
+    clc program_old_psw(16),bad_psw              /* correct old PSW? */
+    jne failure
+    lpswe success_psw
+failure:
+    lpswe failure_psw
+
+    .align 8
+bad_psw:
+    .quad 0x8000000000000000,0xfedcba9876543210  /* bit 0 set */
+success_psw:
+    .quad 0x2000000000000,0xfff        /* see is_special_wait_psw() */
+failure_psw:
+    .quad 0x2000000000000,0            /* disabled wait */
diff --git a/tests/tcg/s390x/lpswe-unaligned.S b/tests/tcg/s390x/lpswe-unaligned.S
new file mode 100644
index 0000000000..989f249a6a
--- /dev/null
+++ b/tests/tcg/s390x/lpswe-unaligned.S
@@ -0,0 +1,18 @@
+/*
+ * Test LPSWE from a non-doubleword aligned address.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .globl test
+test:
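+    /* LPSWE has no relative-long form, so load the operand address first. */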
+    larl %r1,unaligned
+fail:
+    lpswe 0(%r1)
+
+    .align 8
+    .globl expected_old_psw
+expected_old_psw:
+    .quad 0x180000000,fail
+    .long 0
+unaligned:
+    .quad 0
diff --git a/tests/tcg/s390x/lrl-unaligned.S b/tests/tcg/s390x/lrl-unaligned.S
new file mode 100644
index 0000000000..11eb07f93a
--- /dev/null
+++ b/tests/tcg/s390x/lrl-unaligned.S
@@ -0,0 +1,16 @@
+/*
+ * Test LRL from a non-word aligned address.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .globl test
+test:
+    lrl %r1,unaligned
+
+    .align 8
+    .globl expected_old_psw
+expected_old_psw:
+    .quad 0x180000000,test
+    .short 0
+unaligned:
+    .long 0
diff --git a/tests/tcg/s390x/pgm-specification-softmmu.S b/tests/tcg/s390x/pgm-specification-softmmu.S
new file mode 100644
index 0000000000..d534f4e505
--- /dev/null
+++ b/tests/tcg/s390x/pgm-specification-softmmu.S
@@ -0,0 +1,40 @@
+/*
+ * Common softmmu code for specification exception testing.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .section .head
+    .org 0x8d
+ilc:
+    .org 0x8e
+program_interruption_code:
+    .org 0x150
+program_old_psw:
+    .org 0x1D0                         /* program new PSW */
+    .quad 0x180000000,pgm              /* 64-bit mode */
+    .org 0x200                         /* lowcore padding */
+
+    .globl _start
+_start:
+    lpswe test_psw
+
+pgm:
+    chhsi program_interruption_code,0x6          /* PGM_SPECIFICATION? */
+    jne failure
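+    /*
+     * The program old PSW points past the faulting instruction,
+     * so adjust the expected address by the stored instruction length.
+     */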
+    lg %r0,expected_old_psw+8                    /* ilc adjustment */
+    llgc %r1,ilc
+    agr %r0,%r1
+    stg %r0,expected_old_psw+8
+    clc expected_old_psw(16),program_old_psw     /* correct location? */
+    jne failure
+    lpswe success_psw
+failure:
+    lpswe failure_psw
+
+    .align 8
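+    /* 'test' and 'expected_old_psw' are provided by the per-test object. */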
+test_psw:
+    .quad 0x180000000,test             /* 64-bit mode */
+success_psw:
+    .quad 0x2000180000000,0xfff        /* see is_special_wait_psw() */
+failure_psw:
+    .quad 0x2000180000000,0            /* disabled wait */
diff --git a/tests/tcg/s390x/pgm-specification-user.c b/tests/tcg/s390x/pgm-specification-user.c
new file mode 100644
index 0000000000..9ee6907b7c
--- /dev/null
+++ b/tests/tcg/s390x/pgm-specification-user.c
@@ -0,0 +1,37 @@
+/*
+ * Common user code for specification exception testing.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include <assert.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+extern void test(void);
+extern long expected_old_psw[2];
+
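+/*
+ * Each test triggers a specification exception, which is delivered to user
+ * code as SIGILL; si_addr should match the faulting instruction address that
+ * the per-test code records in expected_old_psw[1].
+ */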
+static void handle_sigill(int sig, siginfo_t *info, void *ucontext)
+{
+    if ((long)info->si_addr != expected_old_psw[1]) {
+        _exit(EXIT_FAILURE);
+    }
+    _exit(EXIT_SUCCESS);
+}
+
+int main(void)
+{
+    struct sigaction act;
+    int err;
+
+    memset(&act, 0, sizeof(act));
+    act.sa_sigaction = handle_sigill;
+    act.sa_flags = SA_SIGINFO;
+    err = sigaction(SIGILL, &act, NULL);
+    assert(err == 0);
+
+    test();
+
+    return EXIT_FAILURE;
+}
diff --git a/tests/tcg/s390x/pgm-specification.mak b/tests/tcg/s390x/pgm-specification.mak
new file mode 100644
index 0000000000..2999aee26e
--- /dev/null
+++ b/tests/tcg/s390x/pgm-specification.mak
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+# List of specification exception tests.
+# Shared between the softmmu and the user makefiles.
+PGM_SPECIFICATION_TESTS = \
+	br-odd \
+	cgrl-unaligned \
+	clrl-unaligned \
+	crl-unaligned \
+	ex-odd \
+	lgrl-unaligned \
+	llgfrl-unaligned \
+	lpswe-unaligned \
+	lrl-unaligned \
+	stgrl-unaligned \
+	strl-unaligned
diff --git a/tests/tcg/s390x/rxsbg.c b/tests/tcg/s390x/rxsbg.c
new file mode 100644
index 0000000000..4b155db304
--- /dev/null
+++ b/tests/tcg/s390x/rxsbg.c
@@ -0,0 +1,46 @@
+/*
+ * Test the RXSBG instruction.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+static inline __attribute__((__always_inline__)) void
+rxsbg(unsigned long *r1, unsigned long r2, int i3, int i4, int i5, int *cc)
+{
+    asm("rxsbg %[r1],%[r2],%[i3],%[i4],%[i5]\n"
+        "ipm %[cc]"
+        : [r1] "+r" (*r1), [cc] "=r" (*cc)
+        : [r2] "r" (r2) , [i3] "i" (i3) , [i4] "i" (i4) , [i5] "i" (i5)
+        : "cc");
+    *cc = (*cc >> 28) & 3;
+}
+
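+/*
+ * Both cases use the "test" form of RXSBG (bit 0 of i3 set): the second
+ * operand is rotated left by one, bits 61-62 of the result are XORed with
+ * the same bits of r1 to set the condition code (0 if all selected bits are
+ * zero, 1 otherwise), and r1 itself is left unchanged.
+ */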
+void test_cc0(void)
+{
+    unsigned long r1 = 6;
+    int cc;
+
+    rxsbg(&r1, 3, 61 | 0x80, 62, 1, &cc);
+    assert(r1 == 6);
+    assert(cc == 0);
+}
+
+void test_cc1(void)
+{
+    unsigned long r1 = 2;
+    int cc;
+
+    rxsbg(&r1, 3, 61 | 0x80, 62, 1, &cc);
+    assert(r1 == 2);
+    assert(cc == 1);
+}
+
+int main(void)
+{
+    test_cc0();
+    test_cc1();
+
+    return EXIT_SUCCESS;
+}
diff --git a/tests/tcg/s390x/softmmu.ld b/tests/tcg/s390x/softmmu.ld
new file mode 100644
index 0000000000..ea944eaa3c
--- /dev/null
+++ b/tests/tcg/s390x/softmmu.ld
@@ -0,0 +1,20 @@
+/*
+ * Linker script for the softmmu test kernels.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+ENTRY(_start)
+
+SECTIONS {
+    . = 0;
+
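+    /* Keep .head first so the hand-written lowcore lands at address 0. */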
+    .text : {
+        *(.head)
+        *(.text)
+    }
+
+    /DISCARD/ : {
+        *(*)
+    }
+}
diff --git a/tests/tcg/s390x/ssm-early.S b/tests/tcg/s390x/ssm-early.S
new file mode 100644
index 0000000000..6dfe40c597
--- /dev/null
+++ b/tests/tcg/s390x/ssm-early.S
@@ -0,0 +1,41 @@
+/*
+ * Test early exception recognition using SSM.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .org 0x8d
+ilc:
+    .org 0x8e
+program_interruption_code:
+    .org 0x150
+program_old_psw:
+    .org 0x1D0                         /* program new PSW */
+    .quad 0,pgm
+    .org 0x200                         /* lowcore padding */
+
+    .globl _start
+_start:
+    ssm ssm_op
+expected_pswa:
+    j failure
+
+pgm:
+    chhsi program_interruption_code,0x6          /* specification exception? */
+    jne failure
+    cli ilc,4                                    /* ilc for SSM? */
+    jne failure
+    clc program_old_psw(16),expected_old_psw     /* correct old PSW? */
+    jne failure
+    lpswe success_psw
+failure:
+    lpswe failure_psw
+
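+    /*
+     * PSW system-mask bits 0 and 2-4 must be zero in z/Architecture;
+     * loading a mask with bit 2 set must be recognized as an early
+     * specification exception.
+     */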
+ssm_op:
+    .byte 0x20                                   /* bit 2 set */
+    .align 8
+expected_old_psw:
+    .quad 0x2000000180000000,expected_pswa       /* bit 2 set */
+success_psw:
+    .quad 0x2000000000000,0xfff        /* see is_special_wait_psw() */
+failure_psw:
+    .quad 0x2000000000000,0            /* disabled wait */
diff --git a/tests/tcg/s390x/stgrl-unaligned.S b/tests/tcg/s390x/stgrl-unaligned.S
new file mode 100644
index 0000000000..32df37780a
--- /dev/null
+++ b/tests/tcg/s390x/stgrl-unaligned.S
@@ -0,0 +1,16 @@
+/*
+ * Test STGRL to a non-doubleword aligned address.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .globl test
+test:
+    stgrl %r1,unaligned
+
+    .align 8
+    .globl expected_old_psw
+expected_old_psw:
+    .quad 0x180000000,test
+    .long 0
+unaligned:
+    .quad 0
diff --git a/tests/tcg/s390x/stosm-early.S b/tests/tcg/s390x/stosm-early.S
new file mode 100644
index 0000000000..0689924f3a
--- /dev/null
+++ b/tests/tcg/s390x/stosm-early.S
@@ -0,0 +1,41 @@
+/*
+ * Test early exception recognition using STOSM.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .org 0x8d
+ilc:
+    .org 0x8e
+program_interruption_code:
+    .org 0x150
+program_old_psw:
+    .org 0x1D0                         /* program new PSW */
+    .quad 0,pgm
+    .org 0x200                         /* lowcore padding */
+
+    .globl _start
+_start:
+    stosm ssm_op,0x10                            /* bit 3 set */
+expected_pswa:
+    j failure
+
+pgm:
+    chhsi program_interruption_code,0x6          /* specification exception? */
+    jne failure
+    cli ilc,4                                    /* ilc for STOSM? */
+    jne failure
+    clc program_old_psw(16),expected_old_psw     /* correct old PSW? */
+    jne failure
+    lpswe success_psw
+failure:
+    lpswe failure_psw
+
+ssm_op:
+    .byte 0
+    .align 8
+expected_old_psw:
+    .quad 0x1000000180000000,expected_pswa       /* bit 3 set */
+success_psw:
+    .quad 0x2000000000000,0xfff        /* see is_special_wait_psw() */
+failure_psw:
+    .quad 0x2000000000000,0            /* disabled wait */
diff --git a/tests/tcg/s390x/strl-unaligned.S b/tests/tcg/s390x/strl-unaligned.S
new file mode 100644
index 0000000000..1d248819f0
--- /dev/null
+++ b/tests/tcg/s390x/strl-unaligned.S
@@ -0,0 +1,16 @@
+/*
+ * Test STRL to a non-word aligned address.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+    .globl test
+test:
+    strl %r1,unaligned
+
+    .align 8
+    .globl expected_old_psw
+expected_old_psw:
+    .quad 0x180000000,test
+    .short 0
+unaligned:
+    .long 0
diff --git a/tests/tcg/xtensa/Makefile.softmmu-target b/tests/tcg/xtensa/Makefile.softmmu-target
index 973e55298e..ba6cd9fde3 100644
--- a/tests/tcg/xtensa/Makefile.softmmu-target
+++ b/tests/tcg/xtensa/Makefile.softmmu-target
@@ -2,7 +2,8 @@
 # Xtensa softmmu tests
 #
 
-ifneq ($(TARGET_BIG_ENDIAN),y)
+CORE=dc232b
+ifneq ($(shell $(QEMU) -cpu help | grep -w $(CORE)),)
 
 XTENSA_SRC = $(SRC_PATH)/tests/tcg/xtensa
 XTENSA_ALL = $(filter-out $(XTENSA_SRC)/linker.ld.S,$(wildcard $(XTENSA_SRC)/*.S))
@@ -15,7 +16,6 @@ XTENSA_USABLE_TESTS = $(filter-out $(XTENSA_BROKEN_TESTS), $(XTENSA_TESTS))
 TESTS += $(XTENSA_USABLE_TESTS)
 VPATH += $(XTENSA_SRC)
 
-CORE=dc232b
 QEMU_OPTS+=-M sim -cpu $(CORE) -nographic -semihosting -icount 6 $(EXTFLAGS) -kernel
 
 INCLUDE_DIRS = $(SRC_PATH)/target/xtensa/core-$(CORE)
@@ -26,6 +26,7 @@ ASFLAGS = -Wa,--no-absolute-literals
 LDFLAGS = -Tlinker.ld -nostartfiles -nostdlib
 
 CRT        = crt.o vectors.o
+CLEANFILES += linker.ld
 
 linker.ld: linker.ld.S
 	$(CC) $(XTENSA_INC) -E -P $< -o $@
diff --git a/tests/tcg/xtensaeb/Makefile.softmmu-target b/tests/tcg/xtensaeb/Makefile.softmmu-target
new file mode 100644
index 0000000000..4204a96d53
--- /dev/null
+++ b/tests/tcg/xtensaeb/Makefile.softmmu-target
@@ -0,0 +1,5 @@
+#
+# Xtensa softmmu tests
+#
+
+include $(SRC_PATH)/tests/tcg/xtensa/Makefile.softmmu-target