-rw-r--r--  fpu/softfloat-specialize.c.inc            |   4
-rw-r--r--  hw/net/virtio-net.c                       |  85
-rw-r--r--  hw/virtio/vhost-shadow-virtqueue.c        | 210
-rw-r--r--  hw/virtio/vhost-shadow-virtqueue.h        |  52
-rw-r--r--  hw/virtio/vhost-vdpa.c                    |  26
-rw-r--r--  include/hw/virtio/vhost-vdpa.h            |   8
-rw-r--r--  include/hw/virtio/virtio-net.h            |   7
-rw-r--r--  net/colo-compare.c                        |   2
-rw-r--r--  net/colo.c                                |  11
-rw-r--r--  net/filter-rewriter.c                     |   2
-rw-r--r--  net/meson.build                           |   3
-rw-r--r--  net/trace-events                          |   1
-rw-r--r--  net/vhost-vdpa-stub.c                     |  21
-rw-r--r--  net/vhost-vdpa.c                          | 357
-rw-r--r--  qapi/net.json                             |   9
-rw-r--r--  softmmu/runstate.c                        |   1
-rw-r--r--  target/hexagon/gen_tcg.h                  |  10
-rw-r--r--  target/hexagon/genptr.c                   |   7
-rw-r--r--  target/hexagon/helper.h                   |   1
-rw-r--r--  target/hexagon/macros.h                   |  37
-rw-r--r--  target/hexagon/op_helper.c                |  23
-rw-r--r--  target/s390x/tcg/vec_fpu_helper.c         |   4
-rw-r--r--  tests/tcg/hexagon/Makefile.target         |   1
-rw-r--r--  tests/tcg/hexagon/mem_noshuf.c            | 122
-rw-r--r--  tests/tcg/hexagon/mem_noshuf_exception.c  | 146
-rw-r--r--  tests/tcg/s390x/Makefile.target           |   7
-rw-r--r--  tests/tcg/s390x/vfminmax.c                | 411
27 files changed, 1416 insertions, 152 deletions
diff --git a/fpu/softfloat-specialize.c.inc b/fpu/softfloat-specialize.c.inc
index 9096fb302b..1610472cfc 100644
--- a/fpu/softfloat-specialize.c.inc
+++ b/fpu/softfloat-specialize.c.inc
@@ -390,8 +390,8 @@ bool float32_is_signaling_nan(float32 a_, float_status *status)
 static int pickNaN(FloatClass a_cls, FloatClass b_cls,
                    bool aIsLargerSignificand, float_status *status)
 {
-#if defined(TARGET_ARM) || defined(TARGET_MIPS) || defined(TARGET_HPPA) \
-    || defined(TARGET_LOONGARCH64)
+#if defined(TARGET_ARM) || defined(TARGET_MIPS) || defined(TARGET_HPPA) || \
+    defined(TARGET_LOONGARCH64) || defined(TARGET_S390X)
     /* ARM mandated NaN propagation rules (see FPProcessNaNs()), take
      * the first of:
      *  1. A if it is signaling
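
The hunk above just adds TARGET_S390X to the set of targets using the
ARM-style ordering described in the comment. As a rough standalone sketch of
that ordering (not QEMU's actual pickNaN(), which works on FloatClass values;
the names below are stand-ins):

    #include <stdbool.h>

    enum pick { PICK_A, PICK_B };

    /* ARM-mandated NaN propagation order, now shared by s390x. */
    static enum pick pick_nan_arm_style(bool a_snan, bool b_snan,
                                        bool a_qnan, bool b_qnan)
    {
        if (a_snan) {
            return PICK_A;   /* 1. A if it is signaling */
        } else if (b_snan) {
            return PICK_B;   /* 2. B if it is signaling */
        } else if (a_qnan) {
            return PICK_A;   /* 3. A if it is quiet */
        }
        return PICK_B;       /* 4. B otherwise */
    }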
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 7ad948ee7c..dd0d056fde 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -49,7 +49,6 @@
 
 #define VIRTIO_NET_VM_VERSION    11
 
-#define MAC_TABLE_ENTRIES    64
 #define MAX_VLAN    (1 << 12)   /* Per 802.1Q definition */
 
 /* previously fixed value */
@@ -1434,57 +1433,71 @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd,
     return VIRTIO_NET_OK;
 }
 
-static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
+size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
+                                  const struct iovec *in_sg, unsigned in_num,
+                                  const struct iovec *out_sg,
+                                  unsigned out_num)
 {
     VirtIONet *n = VIRTIO_NET(vdev);
     struct virtio_net_ctrl_hdr ctrl;
     virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
-    VirtQueueElement *elem;
     size_t s;
     struct iovec *iov, *iov2;
-    unsigned int iov_cnt;
+
+    if (iov_size(in_sg, in_num) < sizeof(status) ||
+        iov_size(out_sg, out_num) < sizeof(ctrl)) {
+        virtio_error(vdev, "virtio-net ctrl missing headers");
+        return 0;
+    }
+
+    iov2 = iov = g_memdup2(out_sg, sizeof(struct iovec) * out_num);
+    s = iov_to_buf(iov, out_num, 0, &ctrl, sizeof(ctrl));
+    iov_discard_front(&iov, &out_num, sizeof(ctrl));
+    if (s != sizeof(ctrl)) {
+        status = VIRTIO_NET_ERR;
+    } else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
+        status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, out_num);
+    } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
+        status = virtio_net_handle_mac(n, ctrl.cmd, iov, out_num);
+    } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
+        status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, out_num);
+    } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
+        status = virtio_net_handle_announce(n, ctrl.cmd, iov, out_num);
+    } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
+        status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
+    } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
+        status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
+    }
+
+    s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
+    assert(s == sizeof(status));
+
+    g_free(iov2);
+    return sizeof(status);
+}
+
+static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtQueueElement *elem;
 
     for (;;) {
+        size_t written;
         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
         if (!elem) {
             break;
         }
-        if (iov_size(elem->in_sg, elem->in_num) < sizeof(status) ||
-            iov_size(elem->out_sg, elem->out_num) < sizeof(ctrl)) {
-            virtio_error(vdev, "virtio-net ctrl missing headers");
+
+        written = virtio_net_handle_ctrl_iov(vdev, elem->in_sg, elem->in_num,
+                                             elem->out_sg, elem->out_num);
+        if (written > 0) {
+            virtqueue_push(vq, elem, written);
+            virtio_notify(vdev, vq);
+            g_free(elem);
+        } else {
             virtqueue_detach_element(vq, elem, 0);
             g_free(elem);
             break;
         }
-
-        iov_cnt = elem->out_num;
-        iov2 = iov = g_memdup2(elem->out_sg,
-                               sizeof(struct iovec) * elem->out_num);
-        s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl));
-        iov_discard_front(&iov, &iov_cnt, sizeof(ctrl));
-        if (s != sizeof(ctrl)) {
-            status = VIRTIO_NET_ERR;
-        } else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
-            status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, iov_cnt);
-        } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
-            status = virtio_net_handle_mac(n, ctrl.cmd, iov, iov_cnt);
-        } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
-            status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, iov_cnt);
-        } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
-            status = virtio_net_handle_announce(n, ctrl.cmd, iov, iov_cnt);
-        } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
-            status = virtio_net_handle_mq(n, ctrl.cmd, iov, iov_cnt);
-        } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
-            status = virtio_net_handle_offloads(n, ctrl.cmd, iov, iov_cnt);
-        }
-
-        s = iov_from_buf(elem->in_sg, elem->in_num, 0, &status, sizeof(status));
-        assert(s == sizeof(status));
-
-        virtqueue_push(vq, elem, sizeof(status));
-        virtio_notify(vdev, vq);
-        g_free(iov2);
-        g_free(elem);
     }
 }
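
The refactor above turns the control-queue handler into logic callable with
bare iovecs, which is what lets the vhost-vdpa shadow control virtqueue code
later in this series reuse the device model. A hedged usage sketch (the vdev
pointer, MAC value and buffers are illustrative, not taken from the patch):

    struct virtio_net_ctrl_hdr cmd = {
        .class = VIRTIO_NET_CTRL_MAC,
        .cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET,
    };
    uint8_t mac[ETH_ALEN] = { 0x52, 0x54, 0x00, 0x12, 0x34, 0x56 };
    virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
    struct iovec out[] = {
        { .iov_base = &cmd, .iov_len = sizeof(cmd) },
        { .iov_base = mac,  .iov_len = sizeof(mac) },
    };
    struct iovec in = { .iov_base = &status, .iov_len = sizeof(status) };

    /* Returns sizeof(status) once 'status' holds the model's answer;
     * 0 means the headers were malformed and the device was marked broken. */
    size_t written = virtio_net_handle_ctrl_iov(vdev, &in, 1, out, 2);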
 
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index 56c96ebd13..e4956728dd 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -122,17 +122,35 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
     return true;
 }
 
-static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
-                                    const struct iovec *iovec, size_t num,
-                                    bool more_descs, bool write)
+/**
+ * Write descriptors to SVQ vring
+ *
+ * @svq: The shadow virtqueue
+ * @sg: Cache for hwaddr
+ * @iovec: The iovec from the guest
+ * @num: iovec length
+ * @more_descs: True if more descriptors come in the chain
+ * @write: True if they are writeable descriptors
+ *
+ * Returns true on success; false otherwise, after reporting the error.
+ */
+static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
+                                        const struct iovec *iovec, size_t num,
+                                        bool more_descs, bool write)
 {
     uint16_t i = svq->free_head, last = svq->free_head;
     unsigned n;
     uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
     vring_desc_t *descs = svq->vring.desc;
+    bool ok;
 
     if (num == 0) {
-        return;
+        return true;
+    }
+
+    ok = vhost_svq_translate_addr(svq, sg, iovec, num);
+    if (unlikely(!ok)) {
+        return false;
     }
 
     for (n = 0; n < num; n++) {
@@ -150,40 +168,39 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
     }
 
     svq->free_head = le16_to_cpu(svq->desc_next[last]);
+    return true;
 }
 
 static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
-                                VirtQueueElement *elem, unsigned *head)
+                                const struct iovec *out_sg, size_t out_num,
+                                const struct iovec *in_sg, size_t in_num,
+                                unsigned *head)
 {
     unsigned avail_idx;
     vring_avail_t *avail = svq->vring.avail;
     bool ok;
-    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num));
+    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));
 
     *head = svq->free_head;
 
     /* We need some descriptors here */
-    if (unlikely(!elem->out_num && !elem->in_num)) {
+    if (unlikely(!out_num && !in_num)) {
         qemu_log_mask(LOG_GUEST_ERROR,
                       "Guest provided element with no descriptors");
         return false;
     }
 
-    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
+    ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0,
+                                     false);
     if (unlikely(!ok)) {
         return false;
     }
-    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
-                            elem->in_num > 0, false);
 
-
-    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
+    ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true);
     if (unlikely(!ok)) {
         return false;
     }
 
-    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
-
     /*
      * Put the entry in the available array (but don't update avail->idx until
      * they do sync).
@@ -199,38 +216,58 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
     return true;
 }
 
+static void vhost_svq_kick(VhostShadowVirtqueue *svq)
+{
+    /*
+     * We need to expose the available array entries before checking the used
+     * flags
+     */
+    smp_mb();
+    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
+        return;
+    }
+
+    event_notifier_set(&svq->hdev_kick);
+}
+
 /**
  * Add an element to a SVQ.
  *
  * The caller must check that there is enough slots for the new element. It
- * takes ownership of the element: In case of failure, it is free and the SVQ
- * is considered broken.
+ * takes ownership of the element: in case of failure other than ENOSPC, it is freed.
+ *
+ * Returns -EINVAL if the element is invalid, -ENOSPC if the device queue is full
  */
-static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
+                  size_t out_num, const struct iovec *in_sg, size_t in_num,
+                  VirtQueueElement *elem)
 {
     unsigned qemu_head;
-    bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
+    unsigned ndescs = in_num + out_num;
+    bool ok;
+
+    if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
+        return -ENOSPC;
+    }
+
+    ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head);
     if (unlikely(!ok)) {
         g_free(elem);
-        return false;
+        return -EINVAL;
     }
 
-    svq->ring_id_maps[qemu_head] = elem;
-    return true;
+    svq->desc_state[qemu_head].elem = elem;
+    svq->desc_state[qemu_head].ndescs = ndescs;
+    vhost_svq_kick(svq);
+    return 0;
 }
 
-static void vhost_svq_kick(VhostShadowVirtqueue *svq)
+/* Convenience wrapper to add a guest's element to SVQ */
+static int vhost_svq_add_element(VhostShadowVirtqueue *svq,
+                                 VirtQueueElement *elem)
 {
-    /*
-     * We need to expose the available array entries before checking the used
-     * flags
-     */
-    smp_mb();
-    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
-        return;
-    }
-
-    event_notifier_set(&svq->hdev_kick);
+    return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg,
+                         elem->in_num, elem);
 }
 
 /**
@@ -257,7 +294,7 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
 
         while (true) {
             VirtQueueElement *elem;
-            bool ok;
+            int r;
 
             if (svq->next_guest_avail_elem) {
                 elem = g_steal_pointer(&svq->next_guest_avail_elem);
@@ -269,28 +306,30 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
                 break;
             }
 
-            if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) {
-                /*
-                 * This condition is possible since a contiguous buffer in GPA
-                 * does not imply a contiguous buffer in qemu's VA
-                 * scatter-gather segments. If that happens, the buffer exposed
-                 * to the device needs to be a chain of descriptors at this
-                 * moment.
-                 *
-                 * SVQ cannot hold more available buffers if we are here:
-                 * queue the current guest descriptor and ignore further kicks
-                 * until some elements are used.
-                 */
-                svq->next_guest_avail_elem = elem;
-                return;
+            if (svq->ops) {
+                r = svq->ops->avail_handler(svq, elem, svq->ops_opaque);
+            } else {
+                r = vhost_svq_add_element(svq, elem);
             }
-
-            ok = vhost_svq_add(svq, elem);
-            if (unlikely(!ok)) {
-                /* VQ is broken, just return and ignore any other kicks */
+            if (unlikely(r != 0)) {
+                if (r == -ENOSPC) {
+                    /*
+                     * This condition is possible since a contiguous buffer in
+                     * GPA does not imply a contiguous buffer in qemu's VA
+                     * scatter-gather segments. If that happens, the buffer
+                     * exposed to the device needs to be a chain of descriptors
+                     * at this moment.
+                     *
+                     * SVQ cannot hold more available buffers if we are here:
+                     * queue the current guest descriptor and ignore kicks
+                     * until some elements are used.
+                     */
+                    svq->next_guest_avail_elem = elem;
+                }
+
+                /* VQ is full or broken, just return and ignore kicks */
                 return;
             }
-            vhost_svq_kick(svq);
         }
 
         virtio_queue_set_notification(svq->vq, true);
@@ -311,11 +350,12 @@ static void vhost_handle_guest_kick_notifier(EventNotifier *n)
 
 static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
 {
+    uint16_t *used_idx = &svq->vring.used->idx;
     if (svq->last_used_idx != svq->shadow_used_idx) {
         return true;
     }
 
-    svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
+    svq->shadow_used_idx = cpu_to_le16(*(volatile uint16_t *)used_idx);
 
     return svq->last_used_idx != svq->shadow_used_idx;
 }
@@ -376,21 +416,36 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
         return NULL;
     }
 
-    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
+    if (unlikely(!svq->desc_state[used_elem.id].elem)) {
         qemu_log_mask(LOG_GUEST_ERROR,
             "Device %s says index %u is used, but it was not available",
             svq->vdev->name, used_elem.id);
         return NULL;
     }
 
-    num = svq->ring_id_maps[used_elem.id]->in_num +
-          svq->ring_id_maps[used_elem.id]->out_num;
+    num = svq->desc_state[used_elem.id].ndescs;
     last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
     svq->desc_next[last_used_chain] = svq->free_head;
     svq->free_head = used_elem.id;
 
     *len = used_elem.len;
-    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
+    return g_steal_pointer(&svq->desc_state[used_elem.id].elem);
+}
+
+/**
+ * Push an element to SVQ, returning it to the guest.
+ */
+void vhost_svq_push_elem(VhostShadowVirtqueue *svq,
+                         const VirtQueueElement *elem, uint32_t len)
+{
+    virtqueue_push(svq->vq, elem, len);
+    if (svq->next_guest_avail_elem) {
+        /*
+         * Avail ring was full when vhost_svq_flush was called, so it's a
+         * good moment to make more descriptors available if possible.
+         */
+        vhost_handle_guest_kick(svq);
+    }
 }
 
 static void vhost_svq_flush(VhostShadowVirtqueue *svq,
@@ -435,6 +490,33 @@ static void vhost_svq_flush(VhostShadowVirtqueue *svq,
 }
 
 /**
+ * Poll the SVQ for one device used buffer.
+ *
+ * This function races with the main event loop SVQ polling, so extra
+ * synchronization is needed.
+ *
+ * Returns the length written by the device, or 0 if the poll timed out.
+ */
+size_t vhost_svq_poll(VhostShadowVirtqueue *svq)
+{
+    int64_t start_us = g_get_monotonic_time();
+    do {
+        uint32_t len;
+        VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
+        if (elem) {
+            return len;
+        }
+
+        if (unlikely(g_get_monotonic_time() - start_us > 10e6)) {
+            return 0;
+        }
+
+        /* Make sure we read new used_idx */
+        smp_rmb();
+    } while (true);
+}
+
+/**
  * Forward used buffers.
  *
  * @n: hdev call event notifier, the one that device set to notify svq.
@@ -560,7 +642,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
     memset(svq->vring.desc, 0, driver_size);
     svq->vring.used = qemu_memalign(qemu_real_host_page_size(), device_size);
     memset(svq->vring.used, 0, device_size);
-    svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
+    svq->desc_state = g_new0(SVQDescState, svq->vring.num);
     svq->desc_next = g_new0(uint16_t, svq->vring.num);
     for (unsigned i = 0; i < svq->vring.num - 1; i++) {
         svq->desc_next[i] = cpu_to_le16(i + 1);
@@ -585,7 +667,7 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
 
     for (unsigned i = 0; i < svq->vring.num; ++i) {
         g_autofree VirtQueueElement *elem = NULL;
-        elem = g_steal_pointer(&svq->ring_id_maps[i]);
+        elem = g_steal_pointer(&svq->desc_state[i].elem);
         if (elem) {
             virtqueue_detach_element(svq->vq, elem, 0);
         }
@@ -597,7 +679,7 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
     }
     svq->vq = NULL;
     g_free(svq->desc_next);
-    g_free(svq->ring_id_maps);
+    g_free(svq->desc_state);
     qemu_vfree(svq->vring.desc);
     qemu_vfree(svq->vring.used);
 }
@@ -607,12 +689,16 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
  * shadow methods and file descriptors.
  *
  * @iova_tree: Tree to perform descriptors translations
+ * @ops: SVQ owner callbacks
+ * @ops_opaque: ops opaque pointer
  *
  * Returns the new virtqueue or NULL.
  *
  * In case of error, reason is reported through error_report.
  */
-VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
+                                    const VhostShadowVirtqueueOps *ops,
+                                    void *ops_opaque)
 {
     g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
     int r;
@@ -634,6 +720,8 @@ VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
     event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
     event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
     svq->iova_tree = iova_tree;
+    svq->ops = ops;
+    svq->ops_opaque = ops_opaque;
     return g_steal_pointer(&svq);
 
 err_init_hdev_call:
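
With vhost_svq_add() now returning an errno-style code and kicking the device
itself, callers can tell a full ring apart from a broken element. A sketch of
the resulting ownership contract (svq and elem assumed from the surrounding
code, as in vhost_handle_guest_kick() above):

    int r = vhost_svq_add(svq, elem->out_sg, elem->out_num,
                          elem->in_sg, elem->in_num, elem);
    if (r == -ENOSPC) {
        /* Ring full: the caller keeps elem and requeues it later */
    } else if (r < 0) {
        /* -EINVAL: vhost_svq_add() already freed elem */
    }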
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index c132c994e9..d04c34a589 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -15,6 +15,37 @@
 #include "standard-headers/linux/vhost_types.h"
 #include "hw/virtio/vhost-iova-tree.h"
 
+typedef struct SVQDescState {
+    VirtQueueElement *elem;
+
+    /*
+     * Number of descriptors exposed to the device. May or may not match
+     * guest's
+     */
+    unsigned int ndescs;
+} SVQDescState;
+
+typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
+
+/**
+ * Callback to handle an avail buffer.
+ *
+ * @svq:  Shadow virtqueue
+ * @elem:  Element placed in the queue by the guest
+ * @vq_callback_opaque:  Opaque pointer registered as ops_opaque at SVQ creation
+ *
+ * Returns 0 if the vq is running as expected.
+ *
+ * Note that ownership of elem is transferred to the callback.
+ */
+typedef int (*VirtQueueAvailCallback)(VhostShadowVirtqueue *svq,
+                                      VirtQueueElement *elem,
+                                      void *vq_callback_opaque);
+
+typedef struct VhostShadowVirtqueueOps {
+    VirtQueueAvailCallback avail_handler;
+} VhostShadowVirtqueueOps;
+
 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
     /* Shadow vring */
@@ -47,8 +78,8 @@ typedef struct VhostShadowVirtqueue {
     /* IOVA mapping */
     VhostIOVATree *iova_tree;
 
-    /* Map for use the guest's descriptors */
-    VirtQueueElement **ring_id_maps;
+    /* SVQ vring descriptors state */
+    SVQDescState *desc_state;
 
     /* Next VirtQueue element that guest made available */
     VirtQueueElement *next_guest_avail_elem;
@@ -59,6 +90,12 @@ typedef struct VhostShadowVirtqueue {
      */
     uint16_t *desc_next;
 
+    /* Caller callbacks */
+    const VhostShadowVirtqueueOps *ops;
+
+    /* Caller callbacks opaque */
+    void *ops_opaque;
+
     /* Next head to expose to the device */
     uint16_t shadow_avail_idx;
 
@@ -74,6 +111,13 @@ typedef struct VhostShadowVirtqueue {
 
 bool vhost_svq_valid_features(uint64_t features, Error **errp);
 
+void vhost_svq_push_elem(VhostShadowVirtqueue *svq,
+                         const VirtQueueElement *elem, uint32_t len);
+int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
+                  size_t out_num, const struct iovec *in_sg, size_t in_num,
+                  VirtQueueElement *elem);
+size_t vhost_svq_poll(VhostShadowVirtqueue *svq);
+
 void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
 void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
 void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
@@ -85,7 +129,9 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                      VirtQueue *vq);
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
-VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
+                                    const VhostShadowVirtqueueOps *ops,
+                                    void *ops_opaque);
 
 void vhost_svq_free(gpointer vq);
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
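
The new ops structure is the hook net/vhost-vdpa.c uses below to intercept
control virtqueue buffers. A minimal pass-through sketch, where
my_avail_handler and my_svq_ops are hypothetical names:

    static int my_avail_handler(VhostShadowVirtqueue *svq,
                                VirtQueueElement *elem, void *opaque)
    {
        /* Inspect or rewrite elem here; forwarding it unchanged keeps the
         * -ENOSPC requeue logic in vhost_handle_guest_kick() working. */
        return vhost_svq_add(svq, elem->out_sg, elem->out_num,
                             elem->in_sg, elem->in_num, elem);
    }

    static const VhostShadowVirtqueueOps my_svq_ops = {
        .avail_handler = my_avail_handler,
    };

    /* Registered at creation: vhost_svq_new(iova_tree, &my_svq_ops, opaque) */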
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 66f054a12c..291cd19054 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -20,6 +20,7 @@
 #include "hw/virtio/vhost-shadow-virtqueue.h"
 #include "hw/virtio/vhost-vdpa.h"
 #include "exec/address-spaces.h"
+#include "migration/blocker.h"
 #include "qemu/cutils.h"
 #include "qemu/main-loop.h"
 #include "cpu.h"
@@ -71,8 +72,8 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
     return false;
 }
 
-static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
-                              void *vaddr, bool readonly)
+int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
+                       void *vaddr, bool readonly)
 {
     struct vhost_msg_v2 msg = {};
     int fd = v->device_fd;
@@ -97,8 +98,7 @@ static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
     return ret;
 }
 
-static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,
-                                hwaddr size)
+int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
 {
     struct vhost_msg_v2 msg = {};
     int fd = v->device_fd;
@@ -418,8 +418,10 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
 
     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
     for (unsigned n = 0; n < hdev->nvqs; ++n) {
-        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
+        g_autoptr(VhostShadowVirtqueue) svq;
 
+        svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops,
+                            v->shadow_vq_ops_opaque);
         if (unlikely(!svq)) {
             error_setg(errp, "Cannot create svq %u", n);
             return -1;
@@ -1021,6 +1023,13 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
         return true;
     }
 
+    if (v->migration_blocker) {
+        int r = migrate_add_blocker(v->migration_blocker, &err);
+        if (unlikely(r < 0)) {
+            return false;
+        }
+    }
+
     for (i = 0; i < v->shadow_vqs->len; ++i) {
         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
@@ -1063,6 +1072,10 @@ err:
         vhost_svq_stop(svq);
     }
 
+    if (v->migration_blocker) {
+        migrate_del_blocker(v->migration_blocker);
+    }
+
     return false;
 }
 
@@ -1082,6 +1095,9 @@ static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
         }
     }
 
+    if (v->migration_blocker) {
+        migrate_del_blocker(v->migration_blocker);
+    }
     return true;
 }
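
The blocker only engages when the owner filled v->migration_blocker, which
net/vhost-vdpa.c does with error_setg() for the CVQ client. The add/del
pairing introduced above, reduced to a sketch:

    /* at SVQ start */
    if (v->migration_blocker &&
        migrate_add_blocker(v->migration_blocker, &err) < 0) {
        return false;    /* refuse to start shadow virtqueues */
    }

    /* at SVQ stop, and on the start error path */
    if (v->migration_blocker) {
        migrate_del_blocker(v->migration_blocker);
    }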
 
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index a29dbb3f53..d10a89303e 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -15,6 +15,7 @@
 #include <gmodule.h>
 
 #include "hw/virtio/vhost-iova-tree.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"
 
@@ -34,9 +35,16 @@ typedef struct vhost_vdpa {
     bool shadow_vqs_enabled;
     /* IOVA mapping used by the Shadow Virtqueue */
     VhostIOVATree *iova_tree;
+    Error *migration_blocker;
     GPtrArray *shadow_vqs;
+    const VhostShadowVirtqueueOps *shadow_vq_ops;
+    void *shadow_vq_ops_opaque;
     struct vhost_dev *dev;
     VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
 } VhostVDPA;
 
+int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
+                       void *vaddr, bool readonly);
+int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size);
+
 #endif
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index eb87032627..ef234ffe7e 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -35,6 +35,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(VirtIONet, VIRTIO_NET)
  * and latency. */
 #define TX_BURST 256
 
+/* Maximum VIRTIO_NET_CTRL_MAC_TABLE_SET unicast + multicast entries. */
+#define MAC_TABLE_ENTRIES    64
+
 typedef struct virtio_net_conf
 {
     uint32_t txtimer;
@@ -218,6 +221,10 @@ struct VirtIONet {
     struct EBPFRSSContext ebpf_rss;
 };
 
+size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
+                                  const struct iovec *in_sg, unsigned in_num,
+                                  const struct iovec *out_sg,
+                                  unsigned out_num);
 void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
                                    const char *type);
 
diff --git a/net/colo-compare.c b/net/colo-compare.c
index d5d0965805..787c740f14 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -1323,7 +1323,7 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp)
     s->connection_track_table = g_hash_table_new_full(connection_key_hash,
                                                       connection_key_equal,
                                                       g_free,
-                                                      connection_destroy);
+                                                      NULL);
 
     colo_compare_iothread(s);
 
diff --git a/net/colo.c b/net/colo.c
index 1f8162f59f..6b0ff562ad 100644
--- a/net/colo.c
+++ b/net/colo.c
@@ -46,7 +46,14 @@ int parse_packet_early(Packet *pkt)
     static const uint8_t vlan[] = {0x81, 0x00};
     uint8_t *data = pkt->data + pkt->vnet_hdr_len;
     uint16_t l3_proto;
-    ssize_t l2hdr_len = eth_get_l2_hdr_length(data);
+    ssize_t l2hdr_len;
+
+    if (data == NULL) {
+        trace_colo_proxy_main_vnet_info("This packet is not parsed correctly, "
+                                        "pkt->vnet_hdr_len", pkt->vnet_hdr_len);
+        return 1;
+    }
+    l2hdr_len = eth_get_l2_hdr_length(data);
 
     if (pkt->size < ETH_HLEN + pkt->vnet_hdr_len) {
         trace_colo_proxy_main("pkt->size < ETH_HLEN");
@@ -218,7 +225,7 @@ Connection *connection_get(GHashTable *connection_track_table,
             /*
              * clear the conn_list
              */
-            while (!g_queue_is_empty(conn_list)) {
+            while (conn_list && !g_queue_is_empty(conn_list)) {
                 connection_destroy(g_queue_pop_head(conn_list));
             }
         }
diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c
index bf05023dc3..c18c4c2019 100644
--- a/net/filter-rewriter.c
+++ b/net/filter-rewriter.c
@@ -383,7 +383,7 @@ static void colo_rewriter_setup(NetFilterState *nf, Error **errp)
     s->connection_track_table = g_hash_table_new_full(connection_key_hash,
                                                       connection_key_equal,
                                                       g_free,
-                                                      connection_destroy);
+                                                      NULL);
     s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
 }
 
diff --git a/net/meson.build b/net/meson.build
index 754e2d1d40..d1be76daf3 100644
--- a/net/meson.build
+++ b/net/meson.build
@@ -41,7 +41,8 @@ endif
 softmmu_ss.add(when: 'CONFIG_POSIX', if_true: files(tap_posix))
 softmmu_ss.add(when: 'CONFIG_WIN32', if_true: files('tap-win32.c'))
 if have_vhost_net_vdpa
-  softmmu_ss.add(files('vhost-vdpa.c'))
+  softmmu_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-vdpa.c'), if_false: files('vhost-vdpa-stub.c'))
+  softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-vdpa-stub.c'))
 endif
 
 vmnet_files = files(
diff --git a/net/trace-events b/net/trace-events
index d7a17256cc..6af927b4b9 100644
--- a/net/trace-events
+++ b/net/trace-events
@@ -9,6 +9,7 @@ vhost_user_event(const char *chr, int event) "chr: %s got event: %d"
 
 # colo.c
 colo_proxy_main(const char *chr) ": %s"
+colo_proxy_main_vnet_info(const char *sta, int size) ": %s = %d"
 
 # colo-compare.c
 colo_compare_main(const char *chr) ": %s"
diff --git a/net/vhost-vdpa-stub.c b/net/vhost-vdpa-stub.c
new file mode 100644
index 0000000000..1732ed2443
--- /dev/null
+++ b/net/vhost-vdpa-stub.c
@@ -0,0 +1,21 @@
+/*
+ * vhost-vdpa-stub.c
+ *
+ * Copyright (c) 2022 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "clients.h"
+#include "net/vhost-vdpa.h"
+#include "qapi/error.h"
+
+int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
+                        NetClientState *peer, Error **errp)
+{
+    error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
+    return -1;
+}
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index df1e69ee72..6abad276a6 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -11,11 +11,14 @@
 
 #include "qemu/osdep.h"
 #include "clients.h"
+#include "hw/virtio/virtio-net.h"
 #include "net/vhost_net.h"
 #include "net/vhost-vdpa.h"
 #include "hw/virtio/vhost-vdpa.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
+#include "qemu/log.h"
+#include "qemu/memalign.h"
 #include "qemu/option.h"
 #include "qapi/error.h"
 #include <linux/vhost.h>
@@ -30,6 +33,9 @@ typedef struct VhostVDPAState {
     NetClientState nc;
     struct vhost_vdpa vhost_vdpa;
     VHostNetState *vhost_net;
+
+    /* Control commands shadow buffers */
+    void *cvq_cmd_out_buffer, *cvq_cmd_in_buffer;
     bool started;
 } VhostVDPAState;
 
@@ -69,6 +75,28 @@ const int vdpa_feature_bits[] = {
     VHOST_INVALID_FEATURE_BIT
 };
 
+/** Supported device specific feature bits with SVQ */
+static const uint64_t vdpa_svq_device_features =
+    BIT_ULL(VIRTIO_NET_F_CSUM) |
+    BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
+    BIT_ULL(VIRTIO_NET_F_MTU) |
+    BIT_ULL(VIRTIO_NET_F_MAC) |
+    BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
+    BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
+    BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
+    BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
+    BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
+    BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
+    BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
+    BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
+    BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
+    BIT_ULL(VIRTIO_NET_F_STATUS) |
+    BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
+    BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
+    BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
+    BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
+    BIT_ULL(VIRTIO_NET_F_STANDBY);
+
 VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
 {
     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
@@ -127,7 +155,13 @@ err_init:
 static void vhost_vdpa_cleanup(NetClientState *nc)
 {
     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
+    struct vhost_dev *dev = &s->vhost_net->dev;
 
+    qemu_vfree(s->cvq_cmd_out_buffer);
+    qemu_vfree(s->cvq_cmd_in_buffer);
+    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
+        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
+    }
     if (s->vhost_net) {
         vhost_net_cleanup(s->vhost_net);
         g_free(s->vhost_net);
@@ -187,13 +221,251 @@ static NetClientInfo net_vhost_vdpa_info = {
         .check_peer_type = vhost_vdpa_check_peer_type,
 };
 
+static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
+{
+    VhostIOVATree *tree = v->iova_tree;
+    DMAMap needle = {
+        /*
+         * No need to specify size or to look for more translations since
+         * this contiguous chunk was allocated by us.
+         */
+        .translated_addr = (hwaddr)(uintptr_t)addr,
+    };
+    const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
+    int r;
+
+    if (unlikely(!map)) {
+        error_report("Cannot locate expected map");
+        return;
+    }
+
+    r = vhost_vdpa_dma_unmap(v, map->iova, map->size + 1);
+    if (unlikely(r != 0)) {
+        error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
+    }
+
+    vhost_iova_tree_remove(tree, map);
+}
+
+static size_t vhost_vdpa_net_cvq_cmd_len(void)
+{
+    /*
+     * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
+     * The in buffer is always 1 byte, so it fits here
+     */
+    return sizeof(struct virtio_net_ctrl_hdr) +
+           2 * sizeof(struct virtio_net_ctrl_mac) +
+           MAC_TABLE_ENTRIES * ETH_ALEN;
+}
+
+static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
+{
+    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
+}
+
+/** Copy and map a guest buffer. */
+static bool vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v,
+                                   const struct iovec *out_data,
+                                   size_t out_num, size_t data_len, void *buf,
+                                   size_t *written, bool write)
+{
+    DMAMap map = {};
+    int r;
+
+    if (unlikely(!data_len)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid length of %s buffer\n",
+                      __func__, write ? "in" : "out");
+        return false;
+    }
+
+    *written = iov_to_buf(out_data, out_num, 0, buf, data_len);
+    map.translated_addr = (hwaddr)(uintptr_t)buf;
+    map.size = vhost_vdpa_net_cvq_cmd_page_len() - 1;
+    map.perm = write ? IOMMU_RW : IOMMU_RO;
+    r = vhost_iova_tree_map_alloc(v->iova_tree, &map);
+    if (unlikely(r != IOVA_OK)) {
+        error_report("Cannot map injected element");
+        return false;
+    }
+
+    r = vhost_vdpa_dma_map(v, map.iova, vhost_vdpa_net_cvq_cmd_page_len(), buf,
+                           !write);
+    if (unlikely(r < 0)) {
+        goto dma_map_err;
+    }
+
+    return true;
+
+dma_map_err:
+    vhost_iova_tree_remove(v->iova_tree, &map);
+    return false;
+}
+
+/**
+ * Copy the guest element into a dedicated buffer suitable to be sent to the NIC
+ *
+ * @iov: [0] is the out buffer, [1] is the in one
+ */
+static bool vhost_vdpa_net_cvq_map_elem(VhostVDPAState *s,
+                                        VirtQueueElement *elem,
+                                        struct iovec *iov)
+{
+    size_t in_copied;
+    bool ok;
+
+    iov[0].iov_base = s->cvq_cmd_out_buffer;
+    ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, elem->out_sg, elem->out_num,
+                                vhost_vdpa_net_cvq_cmd_len(), iov[0].iov_base,
+                                &iov[0].iov_len, false);
+    if (unlikely(!ok)) {
+        return false;
+    }
+
+    iov[1].iov_base = s->cvq_cmd_in_buffer;
+    ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, NULL, 0,
+                                sizeof(virtio_net_ctrl_ack), iov[1].iov_base,
+                                &in_copied, true);
+    if (unlikely(!ok)) {
+        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
+        return false;
+    }
+
+    iov[1].iov_len = sizeof(virtio_net_ctrl_ack);
+    return true;
+}
+
+/**
+ * Do not forward commands not supported by SVQ. Otherwise, the device could
+ * accept them and qemu would not know how to update the device model.
+ */
+static bool vhost_vdpa_net_cvq_validate_cmd(const struct iovec *out,
+                                            size_t out_num)
+{
+    struct virtio_net_ctrl_hdr ctrl;
+    size_t n;
+
+    n = iov_to_buf(out, out_num, 0, &ctrl, sizeof(ctrl));
+    if (unlikely(n < sizeof(ctrl))) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "%s: invalid length of out buffer %zu\n", __func__, n);
+        return false;
+    }
+
+    switch (ctrl.class) {
+    case VIRTIO_NET_CTRL_MAC:
+        switch (ctrl.cmd) {
+        case VIRTIO_NET_CTRL_MAC_ADDR_SET:
+            return true;
+        default:
+            qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid mac cmd %u\n",
+                          __func__, ctrl.cmd);
+        };
+        break;
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid control class %u\n",
+                      __func__, ctrl.class);
+    };
+
+    return false;
+}
+
+/**
+ * Validate and copy control virtqueue commands.
+ *
+ * Following QEMU guidelines, we offer a copy of the buffers to the device to
+ * prevent TOCTOU bugs.
+ */
+static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
+                                            VirtQueueElement *elem,
+                                            void *opaque)
+{
+    VhostVDPAState *s = opaque;
+    size_t in_len, dev_written;
+    virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
+    /* out and in buffers sent to the device */
+    struct iovec dev_buffers[2] = {
+        { .iov_base = s->cvq_cmd_out_buffer },
+        { .iov_base = s->cvq_cmd_in_buffer },
+    };
+    /* in buffer used for device model */
+    const struct iovec in = {
+        .iov_base = &status,
+        .iov_len = sizeof(status),
+    };
+    int r = -EINVAL;
+    bool ok;
+
+    ok = vhost_vdpa_net_cvq_map_elem(s, elem, dev_buffers);
+    if (unlikely(!ok)) {
+        goto out;
+    }
+
+    ok = vhost_vdpa_net_cvq_validate_cmd(&dev_buffers[0], 1);
+    if (unlikely(!ok)) {
+        goto out;
+    }
+
+    r = vhost_svq_add(svq, &dev_buffers[0], 1, &dev_buffers[1], 1, elem);
+    if (unlikely(r != 0)) {
+        if (unlikely(r == -ENOSPC)) {
+            qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
+                          __func__);
+        }
+        goto out;
+    }
+
+    /*
+     * We can poll here since we've held the BQL from the time we sent the
+     * descriptor. Also, we need to take the answer before SVQ pulls it by
+     * itself, when the BQL is released
+     */
+    dev_written = vhost_svq_poll(svq);
+    if (unlikely(dev_written < sizeof(status))) {
+        error_report("Insufficient written data (%zu)", dev_written);
+        goto out;
+    }
+
+    memcpy(&status, dev_buffers[1].iov_base, sizeof(status));
+    if (status != VIRTIO_NET_OK) {
+        goto out;
+    }
+
+    status = VIRTIO_NET_ERR;
+    virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, dev_buffers, 1);
+    if (status != VIRTIO_NET_OK) {
+        error_report("Bad CVQ processing in model");
+    }
+
+out:
+    in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
+                          sizeof(status));
+    if (unlikely(in_len < sizeof(status))) {
+        error_report("Bad device CVQ written length");
+    }
+    vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
+    g_free(elem);
+    if (dev_buffers[0].iov_base) {
+        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[0].iov_base);
+    }
+    if (dev_buffers[1].iov_base) {
+        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[1].iov_base);
+    }
+    return r;
+}
+
+static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
+    .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
+};
+
 static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                            const char *device,
                                            const char *name,
                                            int vdpa_device_fd,
                                            int queue_pair_index,
                                            int nvqs,
-                                           bool is_datapath)
+                                           bool is_datapath,
+                                           bool svq,
+                                           VhostIOVATree *iova_tree)
 {
     NetClientState *nc = NULL;
     VhostVDPAState *s;
@@ -211,6 +483,21 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
 
     s->vhost_vdpa.device_fd = vdpa_device_fd;
     s->vhost_vdpa.index = queue_pair_index;
+    s->vhost_vdpa.shadow_vqs_enabled = svq;
+    s->vhost_vdpa.iova_tree = iova_tree;
+    if (!is_datapath) {
+        s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
+                                            vhost_vdpa_net_cvq_cmd_page_len());
+        memset(s->cvq_cmd_out_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
+        s->cvq_cmd_in_buffer = qemu_memalign(qemu_real_host_page_size(),
+                                            vhost_vdpa_net_cvq_cmd_page_len());
+        memset(s->cvq_cmd_in_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
+
+        s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
+        s->vhost_vdpa.shadow_vq_ops_opaque = s;
+        error_setg(&s->vhost_vdpa.migration_blocker,
+                   "Migration disabled: vhost-vdpa uses CVQ.");
+    }
     ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
     if (ret) {
         qemu_del_net_client(nc);
@@ -219,20 +506,32 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
     return nc;
 }
 
-static int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp)
+static int vhost_vdpa_get_iova_range(int fd,
+                                     struct vhost_vdpa_iova_range *iova_range)
+{
+    int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);
+
+    return ret < 0 ? -errno : 0;
+}
+
+static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
+{
+    int ret = ioctl(fd, VHOST_GET_FEATURES, features);
+    if (unlikely(ret < 0)) {
+        error_setg_errno(errp, errno,
+                         "Fail to query features from vhost-vDPA device");
+    }
+    return ret;
+}
+
+static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
+                                          int *has_cvq, Error **errp)
 {
     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
     g_autofree struct vhost_vdpa_config *config = NULL;
     __virtio16 *max_queue_pairs;
-    uint64_t features;
     int ret;
 
-    ret = ioctl(fd, VHOST_GET_FEATURES, &features);
-    if (ret) {
-        error_setg(errp, "Fail to query features from vhost-vDPA device");
-        return ret;
-    }
-
     if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
         *has_cvq = 1;
     } else {
@@ -262,10 +561,12 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
                         NetClientState *peer, Error **errp)
 {
     const NetdevVhostVDPAOptions *opts;
+    uint64_t features;
     int vdpa_device_fd;
     g_autofree NetClientState **ncs = NULL;
+    g_autoptr(VhostIOVATree) iova_tree = NULL;
     NetClientState *nc;
-    int queue_pairs, i, has_cvq = 0;
+    int queue_pairs, r, i, has_cvq = 0;
 
     assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
     opts = &netdev->u.vhost_vdpa;
@@ -279,29 +580,57 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
         return -errno;
     }
 
-    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd,
+    r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
+    if (unlikely(r < 0)) {
+        return r;
+    }
+
+    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
                                                  &has_cvq, errp);
     if (queue_pairs < 0) {
         qemu_close(vdpa_device_fd);
         return queue_pairs;
     }
 
+    if (opts->x_svq) {
+        struct vhost_vdpa_iova_range iova_range;
+
+        uint64_t invalid_dev_features =
+            features & ~vdpa_svq_device_features &
+            /* Transport features are all accepted at this point */
+            ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
+                             VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);
+
+        if (invalid_dev_features) {
+            error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
+                       invalid_dev_features);
+            goto err_svq;
+        }
+
+        vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
+        iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
+    }
+
     ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
 
     for (i = 0; i < queue_pairs; i++) {
         ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
-                                     vdpa_device_fd, i, 2, true);
+                                     vdpa_device_fd, i, 2, true, opts->x_svq,
+                                     iova_tree);
         if (!ncs[i])
             goto err;
     }
 
     if (has_cvq) {
         nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
-                                 vdpa_device_fd, i, 1, false);
+                                 vdpa_device_fd, i, 1, false,
+                                 opts->x_svq, iova_tree);
         if (!nc)
             goto err;
     }
 
+    /* iova_tree ownership belongs to the last NetClientState */
+    g_steal_pointer(&iova_tree);
     return 0;
 
 err:
@@ -310,6 +639,8 @@ err:
             qemu_del_net_client(ncs[i]);
         }
     }
+
+err_svq:
     qemu_close(vdpa_device_fd);
 
     return -1;
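
For scale, vhost_vdpa_net_cvq_cmd_len() works out to a small constant.
Assuming the standard virtio-net layouts (2-byte control header, 4-byte
MAC-table entry count, 6-byte MACs), the arithmetic is:

    size_t cmd_len = 2          /* struct virtio_net_ctrl_hdr */
                   + 2 * 4      /* two struct virtio_net_ctrl_mac */
                   + 64 * 6;    /* MAC_TABLE_ENTRIES * ETH_ALEN */
    /* cmd_len == 394; vhost_vdpa_net_cvq_cmd_page_len() rounds it up to
     * one host page, i.e. 4096 bytes on 4 KiB-page hosts. */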
diff --git a/qapi/net.json b/qapi/net.json
index 9af11e9a3b..75ba2cb989 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -445,12 +445,19 @@
 # @queues: number of queues to be created for multiqueue vhost-vdpa
 #          (default: 1)
 #
+# @x-svq: Start device with (experimental) shadow virtqueue. (Since 7.1)
+#         (default: false)
+#
+# Features:
+# @unstable: Member @x-svq is experimental.
+#
 # Since: 5.1
 ##
 { 'struct': 'NetdevVhostVDPAOptions',
   'data': {
     '*vhostdev':     'str',
-    '*queues':       'int' } }
+    '*queues':       'int',
+    '*x-svq':        {'type': 'bool', 'features' : [ 'unstable'] } } }
 
 ##
 # @NetdevVmnetHostOptions:
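
A usage sketch for the new member (the vhostdev path is illustrative):

    -netdev type=vhost-vdpa,id=vdpa0,vhostdev=/dev/vhost-vdpa-0,x-svq=on \
    -device virtio-net-pci,netdev=vdpa0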
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index fac7b63259..168e1b78a0 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -126,6 +126,7 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
 
     { RUN_STATE_COLO, RUN_STATE_RUNNING },
+    { RUN_STATE_COLO, RUN_STATE_PRELAUNCH },
     { RUN_STATE_COLO, RUN_STATE_SHUTDOWN},
 
     { RUN_STATE_RUNNING, RUN_STATE_DEBUG },
diff --git a/target/hexagon/gen_tcg.h b/target/hexagon/gen_tcg.h
index c6f0879b6e..50634ac459 100644
--- a/target/hexagon/gen_tcg.h
+++ b/target/hexagon/gen_tcg.h
@@ -339,12 +339,13 @@
     do { \
         TCGv LSB = tcg_temp_local_new(); \
         TCGLabel *label = gen_new_label(); \
-        GET_EA; \
+        tcg_gen_movi_tl(EA, 0); \
         PRED;  \
+        CHECK_NOSHUF_PRED(GET_EA, SIZE, LSB); \
         PRED_LOAD_CANCEL(LSB, EA); \
         tcg_gen_movi_tl(RdV, 0); \
         tcg_gen_brcondi_tl(TCG_COND_EQ, LSB, 0, label); \
-            fLOAD(1, SIZE, SIGN, EA, RdV); \
+        fLOAD(1, SIZE, SIGN, EA, RdV); \
         gen_set_label(label); \
         tcg_temp_free(LSB); \
     } while (0)
@@ -398,12 +399,13 @@
     do { \
         TCGv LSB = tcg_temp_local_new(); \
         TCGLabel *label = gen_new_label(); \
-        GET_EA; \
+        tcg_gen_movi_tl(EA, 0); \
         PRED;  \
+        CHECK_NOSHUF_PRED(GET_EA, 8, LSB); \
         PRED_LOAD_CANCEL(LSB, EA); \
         tcg_gen_movi_i64(RddV, 0); \
         tcg_gen_brcondi_tl(TCG_COND_EQ, LSB, 0, label); \
-            fLOAD(1, 8, u, EA, RddV); \
+        fLOAD(1, 8, u, EA, RddV); \
         gen_set_label(label); \
         tcg_temp_free(LSB); \
     } while (0)
diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c
index cd6af4bceb..8a334ba07b 100644
--- a/target/hexagon/genptr.c
+++ b/target/hexagon/genptr.c
@@ -638,5 +638,12 @@ static void vec_to_qvec(size_t size, intptr_t dstoff, intptr_t srcoff)
     tcg_temp_free_i64(mask);
 }
 
+static void probe_noshuf_load(TCGv va, int s, int mi)
+{
+    TCGv size = tcg_constant_tl(s);
+    TCGv mem_idx = tcg_constant_tl(mi);
+    gen_helper_probe_noshuf_load(cpu_env, va, size, mem_idx);
+}
+
 #include "tcg_funcs_generated.c.inc"
 #include "tcg_func_table_generated.c.inc"
diff --git a/target/hexagon/helper.h b/target/hexagon/helper.h
index c89aa4ed4d..368f0b5708 100644
--- a/target/hexagon/helper.h
+++ b/target/hexagon/helper.h
@@ -104,6 +104,7 @@ DEF_HELPER_1(vwhist128q, void, env)
 DEF_HELPER_2(vwhist128m, void, env, s32)
 DEF_HELPER_2(vwhist128qm, void, env, s32)
 
+DEF_HELPER_4(probe_noshuf_load, void, env, i32, int, int)
 DEF_HELPER_2(probe_pkt_scalar_store_s0, void, env, int)
 DEF_HELPER_2(probe_hvx_stores, void, env, int)
 DEF_HELPER_3(probe_pkt_scalar_hvx_stores, void, env, int, int)
diff --git a/target/hexagon/macros.h b/target/hexagon/macros.h
index a78e84faa4..92eb8bbf05 100644
--- a/target/hexagon/macros.h
+++ b/target/hexagon/macros.h
@@ -87,49 +87,66 @@
  *
  *
  * For qemu, we look for a load in slot 0 when there is  a store in slot 1
- * in the same packet.  When we see this, we call a helper that merges the
- * bytes from the store buffer with the value loaded from memory.
+ * in the same packet.  When we see this, we call a helper that probes the
+ * load to make sure it doesn't fault.  Then, we process the store ahead of
+ * the actual load.
+
  */
-#define CHECK_NOSHUF \
+#define CHECK_NOSHUF(VA, SIZE) \
     do { \
         if (insn->slot == 0 && pkt->pkt_has_store_s1) { \
+            probe_noshuf_load(VA, SIZE, ctx->mem_idx); \
+            process_store(ctx, pkt, 1); \
+        } \
+    } while (0)
+
+#define CHECK_NOSHUF_PRED(GET_EA, SIZE, PRED) \
+    do { \
+        TCGLabel *label = gen_new_label(); \
+        tcg_gen_brcondi_tl(TCG_COND_EQ, PRED, 0, label); \
+        GET_EA; \
+        if (insn->slot == 0 && pkt->pkt_has_store_s1) { \
+            probe_noshuf_load(EA, SIZE, ctx->mem_idx); \
+        } \
+        gen_set_label(label); \
+        if (insn->slot == 0 && pkt->pkt_has_store_s1) { \
             process_store(ctx, pkt, 1); \
         } \
     } while (0)
 
 #define MEM_LOAD1s(DST, VA) \
     do { \
-        CHECK_NOSHUF; \
+        CHECK_NOSHUF(VA, 1); \
         tcg_gen_qemu_ld8s(DST, VA, ctx->mem_idx); \
     } while (0)
 #define MEM_LOAD1u(DST, VA) \
     do { \
-        CHECK_NOSHUF; \
+        CHECK_NOSHUF(VA, 1); \
         tcg_gen_qemu_ld8u(DST, VA, ctx->mem_idx); \
     } while (0)
 #define MEM_LOAD2s(DST, VA) \
     do { \
-        CHECK_NOSHUF; \
+        CHECK_NOSHUF(VA, 2); \
         tcg_gen_qemu_ld16s(DST, VA, ctx->mem_idx); \
     } while (0)
 #define MEM_LOAD2u(DST, VA) \
     do { \
-        CHECK_NOSHUF; \
+        CHECK_NOSHUF(VA, 2); \
         tcg_gen_qemu_ld16u(DST, VA, ctx->mem_idx); \
     } while (0)
 #define MEM_LOAD4s(DST, VA) \
     do { \
-        CHECK_NOSHUF; \
+        CHECK_NOSHUF(VA, 4); \
         tcg_gen_qemu_ld32s(DST, VA, ctx->mem_idx); \
     } while (0)
 #define MEM_LOAD4u(DST, VA) \
     do { \
-        CHECK_NOSHUF; \
+        CHECK_NOSHUF(VA, 4); \
         tcg_gen_qemu_ld32s(DST, VA, ctx->mem_idx); \
     } while (0)
 #define MEM_LOAD8u(DST, VA) \
     do { \
-        CHECK_NOSHUF; \
+        CHECK_NOSHUF(VA, 8); \
         tcg_gen_qemu_ld64(DST, VA, ctx->mem_idx); \
     } while (0)
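
Written as plain C rather than the TCG ops the macro actually emits, the
runtime behaviour CHECK_NOSHUF_PRED arranges is roughly this (pred, ea and
the helper names are stand-ins):

    if (pred) {
        ea = compute_ea();               /* GET_EA */
        probe_noshuf_load(ea, size);     /* fault raised before the store */
    }
    process_store_slot1();               /* store happens regardless of pred */
    if (pred) {
        result = load(ea);               /* only now touch memory */
    }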
 
diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index a5ed819c04..085afc3274 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -442,6 +442,17 @@ static void probe_store(CPUHexagonState *env, int slot, int mmu_idx)
     }
 }
 
+/*
+ * Called from a mem_noshuf packet to make sure the load doesn't
+ * raise an exception
+ */
+void HELPER(probe_noshuf_load)(CPUHexagonState *env, target_ulong va,
+                               int size, int mmu_idx)
+{
+    uintptr_t retaddr = GETPC();
+    probe_read(env, va, size, mmu_idx, retaddr);
+}
+
 /* Called during packet commit when there are two scalar stores */
 void HELPER(probe_pkt_scalar_store_s0)(CPUHexagonState *env, int mmu_idx)
 {
@@ -514,10 +525,12 @@ void HELPER(probe_pkt_scalar_hvx_stores)(CPUHexagonState *env, int mask,
  * If the load is in slot 0 and there is a store in slot1 (that
  * wasn't cancelled), we have to do the store first.
  */
-static void check_noshuf(CPUHexagonState *env, uint32_t slot)
+static void check_noshuf(CPUHexagonState *env, uint32_t slot,
+                         target_ulong vaddr, int size)
 {
     if (slot == 0 && env->pkt_has_store_s1 &&
         ((env->slot_cancelled & (1 << 1)) == 0)) {
+        HELPER(probe_noshuf_load)(env, vaddr, size, MMU_USER_IDX);
         HELPER(commit_store)(env, 1);
     }
 }
@@ -526,7 +539,7 @@ static uint8_t mem_load1(CPUHexagonState *env, uint32_t slot,
                          target_ulong vaddr)
 {
     uintptr_t ra = GETPC();
-    check_noshuf(env, slot);
+    check_noshuf(env, slot, vaddr, 1);
     return cpu_ldub_data_ra(env, vaddr, ra);
 }
 
@@ -534,7 +547,7 @@ static uint16_t mem_load2(CPUHexagonState *env, uint32_t slot,
                           target_ulong vaddr)
 {
     uintptr_t ra = GETPC();
-    check_noshuf(env, slot);
+    check_noshuf(env, slot, vaddr, 2);
     return cpu_lduw_data_ra(env, vaddr, ra);
 }
 
@@ -542,7 +555,7 @@ static uint32_t mem_load4(CPUHexagonState *env, uint32_t slot,
                           target_ulong vaddr)
 {
     uintptr_t ra = GETPC();
-    check_noshuf(env, slot);
+    check_noshuf(env, slot, vaddr, 4);
     return cpu_ldl_data_ra(env, vaddr, ra);
 }
 
@@ -550,7 +563,7 @@ static uint64_t mem_load8(CPUHexagonState *env, uint32_t slot,
                           target_ulong vaddr)
 {
     uintptr_t ra = GETPC();
-    check_noshuf(env, slot);
+    check_noshuf(env, slot, vaddr, 8);
     return cpu_ldq_data_ra(env, vaddr, ra);
 }
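
The scenario this ordering fixes, with hypothetical registers and mappings:

    /*
     *   { memw(r0) = r2               <- slot 1 store, r0 mapped
     *     r3 = memw(r1) }:mem_noshuf  <- slot 0 load, r1 unmapped
     *
     * Previously HELPER(commit_store)(env, 1) ran first, so the store
     * stayed visible even though the packet faulted on the load. With
     * probe_noshuf_load() the fault is raised before the store commits,
     * keeping the packet all-or-nothing.
     */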
 
diff --git a/target/s390x/tcg/vec_fpu_helper.c b/target/s390x/tcg/vec_fpu_helper.c
index 2a618a1093..75cf605b9f 100644
--- a/target/s390x/tcg/vec_fpu_helper.c
+++ b/target/s390x/tcg/vec_fpu_helper.c
@@ -824,7 +824,7 @@ static S390MinMaxRes vfmin_res(uint16_t dcmask_a, uint16_t dcmask_b,
         default:
             g_assert_not_reached();
         }
-    } else if (unlikely(dcmask_a & dcmask_b & DCMASK_ZERO)) {
+    } else if (unlikely((dcmask_a & DCMASK_ZERO) && (dcmask_b & DCMASK_ZERO))) {
         switch (type) {
         case S390_MINMAX_TYPE_JAVA:
             return neg_a ? S390_MINMAX_RES_A : S390_MINMAX_RES_B;
@@ -874,7 +874,7 @@ static S390MinMaxRes vfmax_res(uint16_t dcmask_a, uint16_t dcmask_b,
         default:
             g_assert_not_reached();
         }
-    } else if (unlikely(dcmask_a & dcmask_b & DCMASK_ZERO)) {
+    } else if (unlikely((dcmask_a & DCMASK_ZERO) && (dcmask_b & DCMASK_ZERO))) {
         const bool neg_a = dcmask_a & DCMASK_NEGATIVE;
 
         switch (type) {
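
The vec_fpu_helper.c change above fixes a subtle predicate: DCMASK_ZERO spans two data-class bits, one for +0 and one for -0, so dcmask_a & dcmask_b & DCMASK_ZERO only fired when both operands were the same signed zero. The corrected form tests each operand on its own. A self-contained demonstration, using illustrative bit values in place of the real DCMASK_* constants:

#include <assert.h>

/* Illustrative bit positions; the real constants are defined in
 * target/s390x/tcg/vec_fpu_helper.c. */
enum {
    DC_POS_ZERO = 1 << 0,
    DC_NEG_ZERO = 1 << 1,
    DC_ZERO     = DC_POS_ZERO | DC_NEG_ZERO,
};

int main(void)
{
    int dcmask_a = DC_POS_ZERO; /* a == +0 */
    int dcmask_b = DC_NEG_ZERO; /* b == -0 */

    /* old test: the masks share no bits, so the mixed-sign case is missed */
    assert((dcmask_a & dcmask_b & DC_ZERO) == 0);
    /* fixed test: both operands are zeroes, regardless of sign */
    assert((dcmask_a & DC_ZERO) && (dcmask_b & DC_ZERO));
    return 0;
}
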
diff --git a/tests/tcg/hexagon/Makefile.target b/tests/tcg/hexagon/Makefile.target
index 23b9870534..96a4d7a614 100644
--- a/tests/tcg/hexagon/Makefile.target
+++ b/tests/tcg/hexagon/Makefile.target
@@ -35,6 +35,7 @@ HEX_TESTS += preg_alias
 HEX_TESTS += dual_stores
 HEX_TESTS += multi_result
 HEX_TESTS += mem_noshuf
+HEX_TESTS += mem_noshuf_exception
 HEX_TESTS += circ
 HEX_TESTS += brev
 HEX_TESTS += load_unpack
diff --git a/tests/tcg/hexagon/mem_noshuf.c b/tests/tcg/hexagon/mem_noshuf.c
index dd714d5e98..0f4064e700 100644
--- a/tests/tcg/hexagon/mem_noshuf.c
+++ b/tests/tcg/hexagon/mem_noshuf.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -84,6 +84,70 @@ MEM_NOSHUF32(mem_noshuf_sd_luh, long long,    unsigned short,   memd, memuh)
 MEM_NOSHUF32(mem_noshuf_sd_lw,  long long,    signed int,       memd, memw)
 MEM_NOSHUF64(mem_noshuf_sd_ld,  long long,    signed long long, memd, memd)
 
+static inline int pred_lw_sw(int pred, int *p, int *q, int x, int y)
+{
+    int ret;
+    asm volatile("p0 = cmp.eq(%5, #0)\n\t"
+                 "%0 = %3\n\t"
+                 "{\n\t"
+                 "    memw(%1) = %4\n\t"
+                 "    if (!p0) %0 = memw(%2)\n\t"
+                 "}:mem_noshuf\n"
+                 : "=&r"(ret)
+                 : "r"(p), "r"(q), "r"(x), "r"(y), "r"(pred)
+                 : "p0", "memory");
+    return ret;
+}
+
+static inline int pred_lw_sw_pi(int pred, int *p, int *q, int x, int y)
+{
+    int ret;
+    asm volatile("p0 = cmp.eq(%5, #0)\n\t"
+                 "%0 = %3\n\t"
+                 "r7 = %2\n\t"
+                 "{\n\t"
+                 "    memw(%1) = %4\n\t"
+                 "    if (!p0) %0 = memw(r7++#4)\n\t"
+                 "}:mem_noshuf\n"
+                 : "=&r"(ret)
+                 : "r"(p), "r"(q), "r"(x), "r"(y), "r"(pred)
+                 : "r7", "p0", "memory");
+    return ret;
+}
+
+static inline long long pred_ld_sd(int pred, long long *p, long long *q,
+                                   long long x, long long y)
+{
+    long long ret;
+    asm volatile("p0 = cmp.eq(%5, #0)\n\t"
+                 "%0 = %3\n\t"
+                 "{\n\t"
+                 "    memd(%1) = %4\n\t"
+                 "    if (!p0) %0 = memd(%2)\n\t"
+                 "}:mem_noshuf\n"
+                 : "=&r"(ret)
+                 : "r"(p), "r"(q), "r"(x), "r"(y), "r"(pred)
+                 : "p0", "memory");
+    return ret;
+}
+
+static inline long long pred_ld_sd_pi(int pred, long long *p, long long *q,
+                                      long long x, long long y)
+{
+    long long ret;
+    asm volatile("p0 = cmp.eq(%5, #0)\n\t"
+                 "%0 = %3\n\t"
+                 "r7 = %2\n\t"
+                 "{\n\t"
+                 "    memd(%1) = %4\n\t"
+                 "    if (!p0) %0 = memd(r7++#8)\n\t"
+                 "}:mem_noshuf\n"
+                 : "=&r"(ret)
+                 : "r"(p), "r"(q), "r"(x), "r"(y), "r"(pred)
+                 : "p0", "memory");
+    return ret;
+}
+
 static inline unsigned int cancel_sw_lb(int pred, int *p, signed char *q, int x)
 {
     unsigned int ret;
@@ -126,18 +190,22 @@ typedef union {
 
 int err;
 
-static void check32(int n, int expect)
+#define check32(n, expect) check32_(n, expect, __LINE__)
+
+static void check32_(int n, int expect, int line)
 {
     if (n != expect) {
-        printf("ERROR: 0x%08x != 0x%08x\n", n, expect);
+        printf("ERROR: 0x%08x != 0x%08x, line %d\n", n, expect, line);
         err++;
     }
 }
 
-static void check64(long long n, long long expect)
+#define check64(n, expect) check64_(n, expect, __LINE__)
+
+static void check64_(long long n, long long expect, int line)
 {
     if (n != expect) {
-        printf("ERROR: 0x%08llx != 0x%08llx\n", n, expect);
+        printf("ERROR: 0x%08llx != 0x%08llx, line %d\n", n, expect, line);
         err++;
     }
 }
@@ -323,6 +391,50 @@ int main()
     res64 = mem_noshuf_sd_ld(&n.d[0], &n.d[1], 0x123456789abcdef0LL);
     check64(res64, 0xffffffffffffffffLL);
 
+    n.w[0] = ~0;
+    res32 = pred_lw_sw(0, &n.w[0], &n.w[0], 0x12345678, 0xc0ffeeda);
+    check32(res32, 0x12345678);
+    check32(n.w[0], 0xc0ffeeda);
+
+    n.w[0] = ~0;
+    res32 = pred_lw_sw(1, &n.w[0], &n.w[0], 0x12345678, 0xc0ffeeda);
+    check32(res32, 0xc0ffeeda);
+    check32(n.w[0], 0xc0ffeeda);
+
+    n.w[0] = ~0;
+    res32 = pred_lw_sw_pi(0, &n.w[0], &n.w[0], 0x12345678, 0xc0ffeeda);
+    check32(res32, 0x12345678);
+    check32(n.w[0], 0xc0ffeeda);
+
+    n.w[0] = ~0;
+    res32 = pred_lw_sw_pi(1, &n.w[0], &n.w[0], 0x12345678, 0xc0ffeeda);
+    check32(res32, 0xc0ffeeda);
+    check32(n.w[0], 0xc0ffeeda);
+
+    n.d[0] = ~0LL;
+    res64 = pred_ld_sd(0, &n.d[0], &n.d[0],
+                       0x1234567812345678LL, 0xc0ffeedac0ffeedaLL);
+    check64(res64, 0x1234567812345678LL);
+    check64(n.d[0], 0xc0ffeedac0ffeedaLL);
+
+    n.d[0] = ~0LL;
+    res64 = pred_ld_sd(1, &n.d[0], &n.d[0],
+                       0x1234567812345678LL, 0xc0ffeedac0ffeedaLL);
+    check64(res64, 0xc0ffeedac0ffeedaLL);
+    check64(n.d[0], 0xc0ffeedac0ffeedaLL);
+
+    n.d[0] = ~0LL;
+    res64 = pred_ld_sd_pi(0, &n.d[0], &n.d[0],
+                          0x1234567812345678LL, 0xc0ffeedac0ffeedaLL);
+    check64(res64, 0x1234567812345678LL);
+    check64(n.d[0], 0xc0ffeedac0ffeedaLL);
+
+    n.d[0] = ~0LL;
+    res64 = pred_ld_sd_pi(1, &n.d[0], &n.d[0],
+                          0x1234567812345678LL, 0xc0ffeedac0ffeedaLL);
+    check64(res64, 0xc0ffeedac0ffeedaLL);
+    check64(n.d[0], 0xc0ffeedac0ffeedaLL);
+
     puts(err ? "FAIL" : "PASS");
     return err;
 }
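
The new pred_* tests pin down how predication interacts with mem_noshuf: the store always lands, and a predicated load that does execute must observe the just-stored data when the addresses alias. In plain C terms, pred_lw_sw(pred, p, q, x, y) is expected to behave like this model (an illustration, not Hexagon code):

/* Model of pred_lw_sw(): p0 = (pred == 0), the load runs "if (!p0)". */
static int pred_lw_sw_model(int pred, int *p, int *q, int x, int y)
{
    int ret = x;            /* %0 is pre-loaded with x */
    *p = y;                 /* the slot-1 store is unconditional */
    if (pred != 0) {        /* !p0: the predicated load executes... */
        ret = *q;           /* ...and sees y when p and q alias */
    }
    return ret;
}
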
diff --git a/tests/tcg/hexagon/mem_noshuf_exception.c b/tests/tcg/hexagon/mem_noshuf_exception.c
new file mode 100644
index 0000000000..08660ea3e1
--- /dev/null
+++ b/tests/tcg/hexagon/mem_noshuf_exception.c
@@ -0,0 +1,146 @@
+/*
+ *  Copyright(c) 2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Test the VLIW semantics of exceptions with mem_noshuf
+ *
+ * When a packet has the :mem_noshuf attribute, the semantics dictate
+ * that the load will get the data from the store if the addresses overlap.
+ * To accomplish this, we perform the store first.  However, we have to
+ * handle the case where the store raises an exception.  In that case, the
+ * store should not alter the machine state.
+ *
+ * We test this with a mem_noshuf packet with a store to a global variable,
+ * "should_not_change" and a load from NULL.  After the SIGSEGV is caught,
+ * we check * that the "should_not_change" value is the same.
+ *
+ * We also check that a predicated load where the predicate is false doesn't
+ * raise an exception and allows the store to happen.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <setjmp.h>
+#include <signal.h>
+
+int err;
+int segv_caught;
+
+#define SHOULD_NOT_CHANGE_VAL        5
+int should_not_change = SHOULD_NOT_CHANGE_VAL;
+
+#define OK_TO_CHANGE_VAL        13
+int ok_to_change = OK_TO_CHANGE_VAL;
+
+static void __check(const char *filename, int line, int x, int expect)
+{
+    if (x != expect) {
+        printf("ERROR %s:%d - %d != %d\n",
+               filename, line, x, expect);
+        err++;
+    }
+}
+
+#define check(x, expect) __check(__FILE__, __LINE__, (x), (expect))
+
+static void __chk_error(const char *filename, int line, int ret)
+{
+    if (ret < 0) {
+        printf("ERROR %s:%d - %d\n", filename, line, ret);
+        err++;
+    }
+}
+
+#define chk_error(ret) __chk_error(__FILE__, __LINE__, (ret))
+
+jmp_buf jmp_env;
+
+static void sig_segv(int sig, siginfo_t *info, void *puc)
+{
+    check(sig, SIGSEGV);
+    segv_caught = 1;
+    longjmp(jmp_env, 1);
+}
+
+int main()
+{
+    struct sigaction act;
+    int dummy32;
+    long long dummy64;
+    void *p;
+
+    /* SIGSEGV test */
+    act.sa_sigaction = sig_segv;
+    sigemptyset(&act.sa_mask);
+    act.sa_flags = SA_SIGINFO;
+    chk_error(sigaction(SIGSEGV, &act, NULL));
+    if (setjmp(jmp_env) == 0) {
+        asm volatile("r18 = ##should_not_change\n\t"
+                     "r19 = #0\n\t"
+                     "{\n\t"
+                     "    memw(r18) = #7\n\t"
+                     "    %0 = memw(r19)\n\t"
+                     "}:mem_noshuf\n\t"
+                      : "=r"(dummy32) : : "r18", "r19", "memory");
+    }
+
+    act.sa_handler = SIG_DFL;
+    sigemptyset(&act.sa_mask);
+    act.sa_flags = 0;
+    chk_error(sigaction(SIGSEGV, &act, NULL));
+
+    check(segv_caught, 1);
+    check(should_not_change, SHOULD_NOT_CHANGE_VAL);
+
+    /*
+     * Check that a predicated load where the predicate is false doesn't
+     * raise an exception and allows the store to happen.
+     */
+    asm volatile("r18 = ##ok_to_change\n\t"
+                 "r19 = #0\n\t"
+                 "p0 = cmp.gt(r0, r0)\n\t"
+                 "{\n\t"
+                 "    memw(r18) = #7\n\t"
+                 "    if (p0) %0 = memw(r19)\n\t"
+                 "}:mem_noshuf\n\t"
+                  : "=r"(dummy32) : : "r18", "r19", "p0", "memory");
+
+    check(ok_to_change, 7);
+
+    /*
+     * Also check that the post-increment doesn't happen when the
+     * predicate is false.
+     */
+    ok_to_change = OK_TO_CHANGE_VAL;
+    p = NULL;
+    asm volatile("r18 = ##ok_to_change\n\t"
+                 "p0 = cmp.gt(r0, r0)\n\t"
+                 "{\n\t"
+                 "    memw(r18) = #9\n\t"
+                 "    if (p0) %1 = memd(%0 ++ #8)\n\t"
+                 "}:mem_noshuf\n\t"
+                  : "+r"(p), "=r"(dummy64) : : "r18", "p0", "memory");
+
+    check(ok_to_change, 9);
+    check((int)p, (int)NULL);
+
+    puts(err ? "FAIL" : "PASS");
+    return err ? EXIT_FAILURE : EXIT_SUCCESS;
+}
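
The harness above uses the classic sigaction-plus-setjmp/longjmp idiom to survive the intentional SIGSEGV. Distilled to a reusable core (a sketch with error checking omitted; sigsetjmp()/siglongjmp() would additionally restore the signal mask):

#include <setjmp.h>
#include <signal.h>

static jmp_buf jmp_env;

static void on_segv(int sig)
{
    longjmp(jmp_env, 1);        /* unwind out of the faulting code */
}

/* Returns 1 if fn() raised SIGSEGV, 0 otherwise. */
static int catches_segv(void (*fn)(void))
{
    signal(SIGSEGV, on_segv);
    if (setjmp(jmp_env) == 0) {
        fn();                   /* run the possibly-faulting code */
        signal(SIGSEGV, SIG_DFL);
        return 0;
    }
    signal(SIGSEGV, SIG_DFL);   /* restore the default disposition */
    return 1;
}
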
diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
index 3124172736..1a7a4a2f59 100644
--- a/tests/tcg/s390x/Makefile.target
+++ b/tests/tcg/s390x/Makefile.target
@@ -17,6 +17,13 @@ TESTS+=trap
 TESTS+=signals-s390x
 TESTS+=branch-relative-long
 
+Z14_TESTS=vfminmax
+vfminmax: LDFLAGS+=-lm
+$(Z14_TESTS): CFLAGS+=-march=z14 -O2
+
+TESTS+=$(if $(shell $(CC) -march=z14 -S -o /dev/null -xc /dev/null \
+			 >/dev/null 2>&1 && echo OK),$(Z14_TESTS))
+
 VECTOR_TESTS=vxeh2_vs
 VECTOR_TESTS+=vxeh2_vcvt
 VECTOR_TESTS+=vxeh2_vlstr
diff --git a/tests/tcg/s390x/vfminmax.c b/tests/tcg/s390x/vfminmax.c
new file mode 100644
index 0000000000..22629df160
--- /dev/null
+++ b/tests/tcg/s390x/vfminmax.c
@@ -0,0 +1,411 @@
+#define _GNU_SOURCE
+#include <fenv.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * vfmin/vfmax instruction execution.
+ */
+#define VFMIN 0xEE
+#define VFMAX 0xEF
+
+extern char insn[6];
+asm(".pushsection .rwx,\"awx\",@progbits\n"
+    ".globl insn\n"
+    /* e7 89 a0 00 2e ef */
+    "insn: vfmaxsb %v24,%v25,%v26,0\n"
+    ".popsection\n");
+
+static void vfminmax(unsigned int op,
+                     unsigned int m4, unsigned int m5, unsigned int m6,
+                     void *v1, const void *v2, const void *v3)
+{
+    insn[3] = (m6 << 4) | m5;
+    insn[4] = (m4 << 4) | 0x0e;
+    insn[5] = op;
+
+    asm("vl %%v25,%[v2]\n"
+        "vl %%v26,%[v3]\n"
+        "ex 0,%[insn]\n"
+        "vst %%v24,%[v1]\n"
+        : [v1] "=m" (*(char (*)[16])v1)
+        : [v2] "m" (*(char (*)[16])v2)
+        , [v3] "m" (*(char (*)[16])v3)
+        , [insn] "m"(insn)
+        : "v24", "v25", "v26");
+}
+
+/*
+ * Floating-point value classes.
+ */
+#define N_FORMATS 3
+#define N_SIGNED_CLASSES 8
+static const size_t float_sizes[N_FORMATS] = {
+    /* M4 == 2: short    */ 4,
+    /* M4 == 3: long     */ 8,
+    /* M4 == 4: extended */ 16,
+};
+static const size_t e_bits[N_FORMATS] = {
+    /* M4 == 2: short    */ 8,
+    /* M4 == 3: long     */ 11,
+    /* M4 == 4: extended */ 15,
+};
+static const unsigned char signed_floats[N_FORMATS][N_SIGNED_CLASSES][2][16] = {
+    /* M4 == 2: short */
+    {
+        /* -inf */ {{0xff, 0x80, 0x00, 0x00},
+                    {0xff, 0x80, 0x00, 0x00}},
+        /* -Fn */  {{0xc2, 0x28, 0x00, 0x00},
+                    {0xc2, 0x29, 0x00, 0x00}},
+        /* -0 */   {{0x80, 0x00, 0x00, 0x00},
+                    {0x80, 0x00, 0x00, 0x00}},
+        /* +0 */   {{0x00, 0x00, 0x00, 0x00},
+                    {0x00, 0x00, 0x00, 0x00}},
+        /* +Fn */  {{0x42, 0x28, 0x00, 0x00},
+                    {0x42, 0x2a, 0x00, 0x00}},
+        /* +inf */ {{0x7f, 0x80, 0x00, 0x00},
+                    {0x7f, 0x80, 0x00, 0x00}},
+        /* QNaN */ {{0x7f, 0xff, 0xff, 0xff},
+                    {0x7f, 0xff, 0xff, 0xfe}},
+        /* SNaN */ {{0x7f, 0xbf, 0xff, 0xff},
+                    {0x7f, 0xbf, 0xff, 0xfd}},
+    },
+
+    /* M4 == 3: long */
+    {
+        /* -inf */ {{0xff, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0xff, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* -Fn */  {{0xc0, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0xc0, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* -0 */   {{0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* +0 */   {{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* +Fn */  {{0x40, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x40, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* +inf */ {{0x7f, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x7f, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* QNaN */ {{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+                    {0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe}},
+        /* SNaN */ {{0x7f, 0xf7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+                    {0x7f, 0xf7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd}},
+    },
+
+    /* M4 == 4: extended */
+    {
+        /* -inf */ {{0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* -Fn */  {{0xc0, 0x04, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0xc0, 0x04, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* -0 */   {{0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* +0 */   {{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* +Fn */  {{0x40, 0x04, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x40, 0x04, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* +inf */ {{0x7f, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x7f, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* QNaN */ {{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+                    {0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe}},
+        /* SNaN */ {{0x7f, 0xff, 0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+                    {0x7f, 0xff, 0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd}},
+    },
+};
+
+/*
+ * PoP tables as close to the original as possible.
+ */
+struct signed_test {
+    int op;
+    int m6;
+    const char *m6_desc;
+    const char *table[N_SIGNED_CLASSES][N_SIGNED_CLASSES];
+} signed_tests[] = {
+    {
+        .op = VFMIN,
+        .m6 = 0,
+        .m6_desc = "IEEE MinNum",
+        .table = {
+             /*         -inf         -Fn          -0           +0           +Fn          +inf         QNaN         SNaN     */
+            {/* -inf */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* -Fn  */ "T(b)",      "T(M(a,b))", "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* -0   */ "T(b)",      "T(b)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* +0   */ "T(b)",      "T(b)",      "T(b)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* +Fn  */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(M(a,b))", "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* +inf */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* QNaN */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* SNaN */ "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)"},
+        },
+    },
+    {
+        .op = VFMIN,
+        .m6 = 1,
+        .m6_desc = "JAVA Math.Min()",
+        .table = {
+             /*         -inf         -Fn          -0           +0           +Fn          +inf         QNaN         SNaN     */
+            {/* -inf */ "T(b)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(b)",      "Xi: T(b*)"},
+            {/* -Fn  */ "T(b)",      "T(M(a,b))", "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(b)",      "Xi: T(b*)"},
+            {/* -0   */ "T(b)",      "T(b)",      "T(b)",      "T(a)",      "T(a)",      "T(a)",      "T(b)",      "Xi: T(b*)"},
+            {/* +0   */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "T(a)",      "T(b)",      "Xi: T(b*)"},
+            {/* +Fn  */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(M(a,b))", "T(a)",      "T(b)",      "Xi: T(b*)"},
+            {/* +inf */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "Xi: T(b*)"},
+            {/* QNaN */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* SNaN */ "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)"},
+        },
+    },
+    {
+        .op = VFMIN,
+        .m6 = 2,
+        .m6_desc = "C-style Min Macro",
+        .table = {
+             /*         -inf        -Fn          -0          +0          +Fn          +inf        QNaN        SNaN    */
+            {/* -inf */ "T(b)",     "T(a)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* -Fn  */ "T(b)",     "T(M(a,b))", "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* -0   */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(a)",      "T(a)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* +0   */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(a)",      "T(a)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* +Fn  */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(M(a,b))", "T(a)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* +inf */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(a)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* QNaN */ "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)"},
+            {/* SNaN */ "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)"},
+        },
+    },
+    {
+        .op = VFMIN,
+        .m6 = 3,
+        .m6_desc = "C++ algorithm.min()",
+        .table = {
+             /*         -inf        -Fn          -0          +0          +Fn          +inf        QNaN        SNaN    */
+            {/* -inf */ "T(b)",     "T(a)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* -Fn  */ "T(b)",     "T(M(a,b))", "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* -0   */ "T(b)",     "T(b)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* +0   */ "T(b)",     "T(b)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* +Fn  */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(M(a,b))", "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* +inf */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* QNaN */ "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)"},
+            {/* SNaN */ "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)"},
+        },
+    },
+    {
+        .op = VFMIN,
+        .m6 = 4,
+        .m6_desc = "fmin()",
+        .table = {
+             /*         -inf        -Fn          -0          +0          +Fn          +inf        QNaN        SNaN    */
+            {/* -inf */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* -Fn  */ "T(b)",     "T(M(a,b))", "T(a)",     "T(a)",     "T(a)",      "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* -0   */ "T(b)",     "T(b)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* +0   */ "T(b)",     "T(b)",      "T(b)",     "T(a)",     "T(a)",      "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* +Fn  */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(M(a,b))", "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* +inf */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* QNaN */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* SNaN */ "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(a)", "Xi: T(a)"},
+        },
+    },
+
+    {
+        .op = VFMAX,
+        .m6 = 0,
+        .m6_desc = "IEEE MaxNum",
+        .table = {
+             /*         -inf         -Fn          -0           +0           +Fn          +inf         QNaN         SNaN     */
+            {/* -inf */ "T(a)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* -Fn  */ "T(a)",      "T(M(a,b))", "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* -0   */ "T(a)",      "T(a)",      "T(a)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* +0   */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(b)",      "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* +Fn  */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(M(a,b))", "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* +inf */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* QNaN */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* SNaN */ "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)"},
+        },
+    },
+    {
+        .op = VFMAX,
+        .m6 = 1,
+        .m6_desc = "JAVA Math.Max()",
+        .table = {
+             /*         -inf         -Fn          -0           +0           +Fn          +inf         QNaN         SNaN     */
+            {/* -inf */ "T(a)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "Xi: T(b*)"},
+            {/* -Fn  */ "T(a)",      "T(M(a,b))", "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "Xi: T(b*)"},
+            {/* -0   */ "T(a)",      "T(a)",      "T(a)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "Xi: T(b*)"},
+            {/* +0   */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(b)",      "T(b)",      "T(b)",      "Xi: T(b*)"},
+            {/* +Fn  */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(M(a,b))", "T(b)",      "T(b)",      "Xi: T(b*)"},
+            {/* +inf */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(b)",      "Xi: T(b*)"},
+            {/* QNaN */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* SNaN */ "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)"},
+        },
+    },
+    {
+        .op = VFMAX,
+        .m6 = 2,
+        .m6_desc = "C-style Max Macro",
+        .table = {
+             /*         -inf        -Fn          -0          +0          +Fn          +inf        QNaN        SNaN    */
+            {/* -inf */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* -Fn  */ "T(a)",     "T(M(a,b))", "T(b)",     "T(b)",     "T(b)",      "T(b)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* -0   */ "T(a)",     "T(a)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* +0   */ "T(a)",     "T(a)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* +Fn  */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(M(a,b))", "T(b)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* +inf */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(a)",      "T(b)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* QNaN */ "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)"},
+            {/* SNaN */ "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)"},
+        },
+    },
+    {
+        .op = VFMAX,
+        .m6 = 3,
+        .m6_desc = "C++ algorithm.max()",
+        .table = {
+             /*         -inf        -Fn          -0          +0          +Fn          +inf        QNaN        SNaN    */
+            {/* -inf */ "T(a)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* -Fn  */ "T(a)",     "T(M(a,b))", "T(b)",     "T(b)",     "T(b)",      "T(b)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* -0   */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(b)",      "T(b)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* +0   */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(b)",      "T(b)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* +Fn  */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(M(a,b))", "T(b)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* +inf */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* QNaN */ "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)"},
+            {/* SNaN */ "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)"},
+        },
+    },
+    {
+        .op = VFMAX,
+        .m6 = 4,
+        .m6_desc = "fmax()",
+        .table = {
+             /*         -inf        -Fn          -0          +0          +Fn          +inf        QNaN        SNaN    */
+            {/* -inf */ "T(a)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* -Fn  */ "T(a)",     "T(M(a,b))", "T(b)",     "T(b)",     "T(b)",      "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* -0   */ "T(a)",     "T(a)",      "T(a)",     "T(b)",     "T(b)",      "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* +0   */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(b)",      "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* +Fn  */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(M(a,b))", "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* +inf */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* QNaN */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* SNaN */ "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(a)", "Xi: T(a)"},
+        },
+    },
+};
+
+static void dump_v(FILE *f, const void *v, size_t n)
+{
+    for (int i = 0; i < n; i++) {
+        fprintf(f, "%02x", ((const unsigned char *)v)[i]);
+    }
+}
+
+static int signed_test(struct signed_test *test, int m4, int m5,
+                       const void *v1_exp, bool xi_exp,
+                       const void *v2, const void *v3)
+{
+    size_t n = (m5 & 8) ? float_sizes[m4 - 2] : 16;
+    char v1[16];
+    bool xi;
+
+    feclearexcept(FE_ALL_EXCEPT);
+    vfminmax(test->op, m4, m5, test->m6, v1, v2, v3);
+    xi = fetestexcept(FE_ALL_EXCEPT) == FE_INVALID;
+
+    if (memcmp(v1, v1_exp, n) != 0 || xi != xi_exp) {
+        fprintf(stderr, "[  FAILED  ] %s ", test->m6_desc);
+        dump_v(stderr, v2, n);
+        fprintf(stderr, ", ");
+        dump_v(stderr, v3, n);
+        fprintf(stderr, ", %d, %d, %d: actual=", m4, m5, test->m6);
+        dump_v(stderr, v1, n);
+        fprintf(stderr, "/%d, expected=", (int)xi);
+        dump_v(stderr, v1_exp, n);
+        fprintf(stderr, "/%d\n", (int)xi_exp);
+        return 1;
+    }
+
+    return 0;
+}
+
+static void snan_to_qnan(char *v, int m4)
+{
+    size_t bit = 1 + e_bits[m4 - 2];
+    v[bit / 8] |= 1 << (7 - (bit % 8));
+}
+
+int main(void)
+{
+    int ret = 0;
+    size_t i;
+
+    for (i = 0; i < sizeof(signed_tests) / sizeof(signed_tests[0]); i++) {
+        struct signed_test *test = &signed_tests[i];
+        int m4;
+
+        for (m4 = 2; m4 <= 4; m4++) {
+            const unsigned char (*floats)[2][16] = signed_floats[m4 - 2];
+            size_t float_size = float_sizes[m4 - 2];
+            int m5;
+
+            for (m5 = 0; m5 <= 8; m5 += 8) {
+                char v1_exp[16], v2[16], v3[16];
+                bool xi_exp = false;
+                int pos = 0;
+                int i2;
+
+                for (i2 = 0; i2 < N_SIGNED_CLASSES * 2; i2++) {
+                    int i3;
+
+                    for (i3 = 0; i3 < N_SIGNED_CLASSES * 2; i3++) {
+                        const char *spec = test->table[i2 / 2][i3 / 2];
+
+                        memcpy(&v2[pos], floats[i2 / 2][i2 % 2], float_size);
+                        memcpy(&v3[pos], floats[i3 / 2][i3 % 2], float_size);
+                        if (strcmp(spec, "T(a)") == 0 ||
+                            strcmp(spec, "Xi: T(a)") == 0) {
+                            memcpy(&v1_exp[pos], &v2[pos], float_size);
+                        } else if (strcmp(spec, "T(b)") == 0 ||
+                                   strcmp(spec, "Xi: T(b)") == 0) {
+                            memcpy(&v1_exp[pos], &v3[pos], float_size);
+                        } else if (strcmp(spec, "Xi: T(a*)") == 0) {
+                            memcpy(&v1_exp[pos], &v2[pos], float_size);
+                            snan_to_qnan(&v1_exp[pos], m4);
+                        } else if (strcmp(spec, "Xi: T(b*)") == 0) {
+                            memcpy(&v1_exp[pos], &v3[pos], float_size);
+                            snan_to_qnan(&v1_exp[pos], m4);
+                        } else if (strcmp(spec, "T(M(a,b))") == 0) {
+                            /*
+                             * Comparing floats is risky, since the compiler
+                             * might generate the same instruction that we are
+                             * testing. Compare ints instead. This works,
+                             * because we get here only for +-Fn, and the
+                             * corresponding test values have identical
+                             * exponents.
+                             */
+                            int v2_int = *(int *)&v2[pos];
+                            int v3_int = *(int *)&v3[pos];
+
+                            if ((v2_int < v3_int) ==
+                                ((test->op == VFMIN) != (v2_int < 0))) {
+                                memcpy(&v1_exp[pos], &v2[pos], float_size);
+                            } else {
+                                memcpy(&v1_exp[pos], &v3[pos], float_size);
+                            }
+                        } else {
+                            fprintf(stderr, "Unexpected spec: %s\n", spec);
+                            return 1;
+                        }
+                        xi_exp |= spec[0] == 'X';
+                        pos += float_size;
+
+                        if ((m5 & 8) || pos == 16) {
+                            ret |= signed_test(test, m4, m5,
+                                               v1_exp, xi_exp, v2, v3);
+                            pos = 0;
+                            xi_exp = false;
+                        }
+                    }
+                }
+
+                if (pos != 0) {
+                    ret |= signed_test(test, m4, m5, v1_exp, xi_exp, v2, v3);
+                }
+            }
+        }
+    }
+
+    return ret;
+}
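
One detail worth calling out in the test above: the T(M(a,b)) branch compares raw bit patterns rather than floats, so the compiler cannot substitute the very min/max operation under test. This works because finite same-sign IEEE-754 values order the same way as their bit patterns (exactly reversed for negatives), which the (test->op == VFMIN) != (v2_int < 0) condition accounts for. A quick stand-alone check of that property:

#include <assert.h>
#include <string.h>

static int bits_of(float f)
{
    int i;

    memcpy(&i, &f, sizeof(i)); /* avoid the strict-aliasing cast */
    return i;
}

int main(void)
{
    /* positive finite values: bit-pattern order matches value order */
    assert((bits_of(42.0f) < bits_of(42.5f)) == (42.0f < 42.5f));
    /* negative finite values: bit-pattern order is the reverse */
    assert((bits_of(-42.0f) < bits_of(-42.5f)) == (-42.0f > -42.5f));
    return 0;
}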