Diffstat (limited to 'net/vhost-vdpa.c')
 net/vhost-vdpa.c | 267
 1 file changed, 199 insertions(+), 68 deletions(-)
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 37cdc84562..e19ab063fa 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -43,6 +43,10 @@ typedef struct VhostVDPAState {
 
     /* The device always has SVQ enabled */
     bool always_svq;
+
+    /* The device can isolate CVQ in its own ASID */
+    bool cvq_isolated;
+
     bool started;
 } VhostVDPAState;
 
@@ -54,6 +58,7 @@ const int vdpa_feature_bits[] = {
     VIRTIO_F_VERSION_1,
     VIRTIO_NET_F_CSUM,
     VIRTIO_NET_F_GUEST_CSUM,
+    VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,
     VIRTIO_NET_F_GSO,
     VIRTIO_NET_F_GUEST_TSO4,
     VIRTIO_NET_F_GUEST_TSO6,
@@ -85,6 +90,7 @@ const int vdpa_feature_bits[] = {
 static const uint64_t vdpa_svq_device_features =
     BIT_ULL(VIRTIO_NET_F_CSUM) |
     BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
+    BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) |
     BIT_ULL(VIRTIO_NET_F_MTU) |
     BIT_ULL(VIRTIO_NET_F_MAC) |
     BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
@@ -116,6 +122,22 @@ VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
     return s->vhost_net;
 }
 
+static size_t vhost_vdpa_net_cvq_cmd_len(void)
+{
+    /*
+     * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
+     * The in buffer is always 1 byte, so it fits here.
+     */
+    return sizeof(struct virtio_net_ctrl_hdr) +
+           2 * sizeof(struct virtio_net_ctrl_mac) +
+           MAC_TABLE_ENTRIES * ETH_ALEN;
+}
+
+static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
+{
+    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
+}
+
 static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
 {
     uint64_t invalid_dev_features =
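For reference, a quick worked example of what these two helpers evaluate to, assuming the definitions used elsewhere in the tree (MAC_TABLE_ENTRIES of 64, ETH_ALEN of 6) and a 4 KiB host page; the numbers below are illustrative, not part of the patch:

    /* Sketch only:
     * sizeof(struct virtio_net_ctrl_hdr)      =   2   (class + cmd)
     * 2 * sizeof(struct virtio_net_ctrl_mac)  =   8   (unicast + multicast counts)
     * MAC_TABLE_ENTRIES * ETH_ALEN            = 384   (worst-case MAC list)
     * vhost_vdpa_net_cvq_cmd_len()            = 394 bytes
     * vhost_vdpa_net_cvq_cmd_page_len()       = 4096  (rounded up to one page)
     */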
@@ -185,8 +207,16 @@ static void vhost_vdpa_cleanup(NetClientState *nc)
 {
     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
 
-    qemu_vfree(s->cvq_cmd_out_buffer);
-    qemu_vfree(s->status);
+    /*
+     * If a peer NIC is attached, do not clean up anything here.
+     * Cleanup will happen as part of qemu_cleanup() -> net_cleanup()
+     * when the guest is shutting down.
+     */
+    if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_NIC) {
+        return;
+    }
+    munmap(s->cvq_cmd_out_buffer, vhost_vdpa_net_cvq_cmd_page_len());
+    munmap(s->status, vhost_vdpa_net_cvq_cmd_page_len());
     if (s->vhost_net) {
         vhost_net_cleanup(s->vhost_net);
         g_free(s->vhost_net);
@@ -362,7 +392,8 @@ static NetClientInfo net_vhost_vdpa_info = {
         .check_peer_type = vhost_vdpa_check_peer_type,
 };
 
-static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index)
+static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index,
+                                          Error **errp)
 {
     struct vhost_vring_state state = {
         .index = vq_index,
@@ -370,8 +401,8 @@ static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index)
     int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state);
 
     if (unlikely(r < 0)) {
-        error_report("Cannot get VQ %u group: %s", vq_index,
-                     g_strerror(errno));
+        r = -errno;
+        error_setg_errno(errp, errno, "Cannot get VQ %u group", vq_index);
         return r;
     }
 
@@ -422,22 +453,6 @@ static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
     vhost_iova_tree_remove(tree, *map);
 }
 
-static size_t vhost_vdpa_net_cvq_cmd_len(void)
-{
-    /*
-     * MAC_TABLE_SET is the ctrl command that produces the longer out buffer.
-     * In buffer is always 1 byte, so it should fit here
-     */
-    return sizeof(struct virtio_net_ctrl_hdr) +
-           2 * sizeof(struct virtio_net_ctrl_mac) +
-           MAC_TABLE_ENTRIES * ETH_ALEN;
-}
-
-static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
-{
-    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
-}
-
 /** Map CVQ buffer. */
 static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size,
                                   bool write)
@@ -471,9 +486,9 @@ static int vhost_vdpa_net_cvq_start(NetClientState *nc)
 {
     VhostVDPAState *s, *s0;
     struct vhost_vdpa *v;
-    uint64_t backend_features;
     int64_t cvq_group;
-    int cvq_index, r;
+    int r;
+    Error *err = NULL;
 
     assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
 
@@ -493,41 +508,22 @@ static int vhost_vdpa_net_cvq_start(NetClientState *nc)
     /*
      * If we return early in these cases, SVQ will not be enabled. Migration
      * will be blocked as long as the vhost-vdpa backend does not offer _F_LOG.
-     *
-     * Calling VHOST_GET_BACKEND_FEATURES as they are not available in v->dev
-     * yet.
      */
-    r = ioctl(v->device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
-    if (unlikely(r < 0)) {
-        error_report("Cannot get vdpa backend_features: %s(%d)",
-            g_strerror(errno), errno);
-        return -1;
+    if (!vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
+        return 0;
     }
-    if (!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID)) ||
-        !vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
+
+    if (!s->cvq_isolated) {
         return 0;
     }
 
-    /*
-     * Check if all the virtqueues of the virtio device are in a different vq
-     * than the last vq. VQ group of last group passed in cvq_group.
-     */
-    cvq_index = v->dev->vq_index_end - 1;
-    cvq_group = vhost_vdpa_get_vring_group(v->device_fd, cvq_index);
+    cvq_group = vhost_vdpa_get_vring_group(v->device_fd,
+                                           v->dev->vq_index_end - 1,
+                                           &err);
     if (unlikely(cvq_group < 0)) {
+        error_report_err(err);
         return cvq_group;
     }
-    for (int i = 0; i < cvq_index; ++i) {
-        int64_t group = vhost_vdpa_get_vring_group(v->device_fd, i);
-
-        if (unlikely(group < 0)) {
-            return group;
-        }
-
-        if (group == cvq_group) {
-            return 0;
-        }
-    }
 
     r = vhost_vdpa_set_address_space_id(v, cvq_group, VHOST_VDPA_NET_CVQ_ASID);
     if (unlikely(r < 0)) {
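For context, vhost_vdpa_set_address_space_id() boils down to a single vhost-vdpa ioctl that reassigns a virtqueue group to another address space. A minimal self-contained sketch based on the Linux vhost uapi (the helper name below is illustrative, not from the patch):

    #include <sys/ioctl.h>
    #include <linux/vhost.h>

    /* Sketch: move virtqueue group `group` into address space `asid`,
     * e.g. the CVQ group into VHOST_VDPA_NET_CVQ_ASID. */
    static int set_group_asid_sketch(int device_fd, unsigned group, unsigned asid)
    {
        struct vhost_vring_state state = {
            .index = group, /* which vq group to re-map */
            .num = asid,    /* target address space id */
        };

        return ioctl(device_fd, VHOST_VDPA_SET_GROUP_ASID, &state);
    }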
@@ -643,8 +639,7 @@ static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, uint8_t class,
 
 static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n)
 {
-    uint64_t features = n->parent_obj.guest_features;
-    if (features & BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR)) {
+    if (virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
         ssize_t dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MAC,
                                                   VIRTIO_NET_CTRL_MAC_ADDR_SET,
                                                   n->mac, sizeof(n->mac));
@@ -662,10 +657,9 @@ static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
                                   const VirtIONet *n)
 {
     struct virtio_net_ctrl_mq mq;
-    uint64_t features = n->parent_obj.guest_features;
     ssize_t dev_written;
 
-    if (!(features & BIT_ULL(VIRTIO_NET_F_MQ))) {
+    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_MQ)) {
         return 0;
     }
 
@@ -680,6 +674,44 @@ static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
     return *s->status != VIRTIO_NET_OK;
 }
 
+static int vhost_vdpa_net_load_offloads(VhostVDPAState *s,
+                                        const VirtIONet *n)
+{
+    uint64_t offloads;
+    ssize_t dev_written;
+
+    if (!virtio_vdev_has_feature(&n->parent_obj,
+                                 VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
+        return 0;
+    }
+
+    if (n->curr_guest_offloads == virtio_net_supported_guest_offloads(n)) {
+        /*
+         * According to the VirtIO standard, "Upon feature negotiation
+         * corresponding offload gets enabled to preserve
+         * backward compatibility".
+         *
+         * Therefore, there is no need to send this CVQ command if the
+         * driver also enables all supported offloads, which matches
+         * the device's defaults.
+         *
+         * Note that the device's defaults can only diverge from the
+         * driver's configuration at live migration.
+         */
+        return 0;
+    }
+
+    offloads = cpu_to_le64(n->curr_guest_offloads);
+    dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
+                                          VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
+                                          &offloads, sizeof(offloads));
+    if (unlikely(dev_written < 0)) {
+        return dev_written;
+    }
+
+    return *s->status != VIRTIO_NET_OK;
+}
+
 static int vhost_vdpa_net_load(NetClientState *nc)
 {
     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
@@ -702,6 +734,10 @@ static int vhost_vdpa_net_load(NetClientState *nc)
     if (unlikely(r)) {
         return r;
     }
+    r = vhost_vdpa_net_load_offloads(s, n);
+    if (unlikely(r)) {
+        return r;
+    }
 
     return 0;
 }
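To make the new offloads command concrete, here is an illustrative sketch (not part of the patch) of the buffer that vhost_vdpa_net_load_cmd() assembles for vhost_vdpa_net_load_offloads(); the chosen offload bits are just an example:

    /* Sketch: a VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET command as it is laid out
     * in s->cvq_cmd_out_buffer before being pushed through the shadow CVQ. */
    struct virtio_net_ctrl_hdr hdr = {
        .class = VIRTIO_NET_CTRL_GUEST_OFFLOADS,
        .cmd = VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
    };
    /* Example payload: guest checksum and TSOv4 offloads enabled.  The bitmap
     * reuses the VIRTIO_NET_F_GUEST_* feature bit positions. */
    uint64_t offloads = cpu_to_le64(BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
                                    BIT_ULL(VIRTIO_NET_F_GUEST_TSO4));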
@@ -766,7 +802,7 @@ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
     }
 
     if (*s->status != VIRTIO_NET_OK) {
-        return VIRTIO_NET_ERR;
+        goto out;
     }
 
     status = VIRTIO_NET_ERR;
@@ -790,6 +826,87 @@ static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
     .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
 };
 
+/**
+ * Probe if CVQ is isolated
+ *
+ * @device_fd         The vdpa device fd
+ * @features          Features offered by the device.
+ * @cvq_index         The index of the control virtqueue
+ *
+ * Returns <0 on failure, 0 if CVQ is not isolated, 1 if it is.
+ */
+static int vhost_vdpa_probe_cvq_isolation(int device_fd, uint64_t features,
+                                          int cvq_index, Error **errp)
+{
+    uint64_t backend_features;
+    int64_t cvq_group;
+    uint8_t status = VIRTIO_CONFIG_S_ACKNOWLEDGE |
+                     VIRTIO_CONFIG_S_DRIVER |
+                     VIRTIO_CONFIG_S_FEATURES_OK;
+    int r;
+
+    ERRP_GUARD();
+
+    r = ioctl(device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
+    if (unlikely(r < 0)) {
+        error_setg_errno(errp, errno, "Cannot get vdpa backend_features");
+        return r;
+    }
+
+    if (!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID))) {
+        return 0;
+    }
+
+    r = ioctl(device_fd, VHOST_SET_FEATURES, &features);
+    if (unlikely(r)) {
+        error_setg_errno(errp, errno, "Cannot set features");
+        goto out;
+    }
+
+    r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
+    if (unlikely(r)) {
+        error_setg_errno(errp, errno, "Cannot set device status");
+        goto out;
+    }
+
+    cvq_group = vhost_vdpa_get_vring_group(device_fd, cvq_index, errp);
+    if (unlikely(cvq_group < 0)) {
+        if (cvq_group != -ENOTSUP) {
+            r = cvq_group;
+            goto out;
+        }
+
+        /*
+         * The kernel reports VHOST_BACKEND_F_IOTLB_ASID if the vdpa frontend
+         * supports ASID even if the parent driver does not.  The CVQ cannot be
+         * isolated in this case.
+         */
+        error_free(*errp);
+        *errp = NULL;
+        r = 0;
+        goto out;
+    }
+
+    for (int i = 0; i < cvq_index; ++i) {
+        int64_t group = vhost_vdpa_get_vring_group(device_fd, i, errp);
+        if (unlikely(group < 0)) {
+            r = group;
+            goto out;
+        }
+
+        if (group == (int64_t)cvq_group) {
+            r = 0;
+            goto out;
+        }
+    }
+
+    r = 1;
+
+out:
+    status = 0;
+    ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
+    return r;
+}
+
 static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                        const char *device,
                                        const char *name,
@@ -799,16 +916,26 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                        bool is_datapath,
                                        bool svq,
                                        struct vhost_vdpa_iova_range iova_range,
-                                       uint64_t features)
+                                       uint64_t features,
+                                       Error **errp)
 {
     NetClientState *nc = NULL;
     VhostVDPAState *s;
     int ret = 0;
     assert(name);
+    int cvq_isolated;
+
     if (is_datapath) {
         nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
                                  name);
     } else {
+        cvq_isolated = vhost_vdpa_probe_cvq_isolation(vdpa_device_fd, features,
+                                                      queue_pair_index * 2,
+                                                      errp);
+        if (unlikely(cvq_isolated < 0)) {
+            return NULL;
+        }
+
         nc = qemu_new_net_control_client(&net_vhost_vdpa_cvq_info, peer,
                                          device, name);
     }
@@ -826,24 +953,28 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
         vhost_vdpa_net_valid_svq_features(features,
                                           &s->vhost_vdpa.migration_blocker);
     } else if (!is_datapath) {
-        s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
-                                            vhost_vdpa_net_cvq_cmd_page_len());
-        memset(s->cvq_cmd_out_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
-        s->status = qemu_memalign(qemu_real_host_page_size(),
-                                  vhost_vdpa_net_cvq_cmd_page_len());
-        memset(s->status, 0, vhost_vdpa_net_cvq_cmd_page_len());
+        s->cvq_cmd_out_buffer = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
+                                     PROT_READ | PROT_WRITE,
+                                     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+        s->status = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
+                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
+                         -1, 0);
 
         s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
         s->vhost_vdpa.shadow_vq_ops_opaque = s;
+        s->cvq_isolated = cvq_isolated;
 
         /*
-         * TODO: We cannot migrate devices with CVQ as there is no way to set
-         * the device state (MAC, MQ, etc) before starting the datapath.
+         * TODO: We cannot migrate devices with CVQ unless x-svq is enabled,
+         * as there is no way to set the device state (MAC, MQ, etc.) before
+         * starting the datapath.
          *
          * Migration blocker ownership now belongs to s->vhost_vdpa.
          */
-        error_setg(&s->vhost_vdpa.migration_blocker,
-                   "net vdpa cannot migrate with CVQ feature");
+        if (!svq) {
+            error_setg(&s->vhost_vdpa.migration_blocker,
+                       "net vdpa cannot migrate with CVQ feature");
+        }
     }
     ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
     if (ret) {
@@ -963,7 +1094,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
     for (i = 0; i < queue_pairs; i++) {
         ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                      vdpa_device_fd, i, 2, true, opts->x_svq,
-                                     iova_range, features);
+                                     iova_range, features, errp);
         if (!ncs[i])
             goto err;
     }
@@ -971,7 +1102,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
     if (has_cvq) {
         nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                  vdpa_device_fd, i, 1, false,
-                                 opts->x_svq, iova_range, features);
+                                 opts->x_svq, iova_range, features, errp);
         if (!nc)
             goto err;
     }
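As a closing usage note, an illustrative invocation of the configuration this change targets (device path, id and MAC are placeholders): a vhost-vdpa netdev with a control virtqueue and shadow virtqueues enabled via x-svq, for which the CVQ-specific migration blocker above is no longer registered:

    qemu-system-x86_64 ... \
        -netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vdpa0,x-svq=on \
        -device virtio-net-pci,netdev=vdpa0,mac=52:54:00:12:34:56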