summary refs log tree commit diff stats
path: root/hw/vfio-user/pci.c
diff options
context:
space:
mode:
authorStefan Hajnoczi <stefanha@redhat.com>2025-06-26 10:25:01 -0400
committerStefan Hajnoczi <stefanha@redhat.com>2025-06-26 10:25:01 -0400
commitaec6836c73403cffa56b9a4c5556451ee16071fe (patch)
tree9c562d1c2c5a01814dc0ad8849658c5d0cba193c /hw/vfio-user/pci.c
parent1721fe75df1cbabf2665a2b76a6e7b5bc0fc036b (diff)
parentda198e8f0f99cd8539f3072ad2071f9dc01680d6 (diff)
downloadfocaccia-qemu-aec6836c73403cffa56b9a4c5556451ee16071fe.tar.gz
focaccia-qemu-aec6836c73403cffa56b9a4c5556451ee16071fe.zip
Merge tag 'pull-vfio-20250626' of https://github.com/legoater/qemu into staging
vfio queue:

* Added several small fixes and cleanups
* Added support for vfio-user client device

# -----BEGIN PGP SIGNATURE-----
#
# iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmhc+mwACgkQUaNDx8/7
# 7KHQShAAwGTjc6yzBaFr9DTKaL6Vszwby5tYdV3vWTsnFj2zBks+5BswohxbnYdk
# Smy5E/F+yCiHI6X4ohI4FRjJgBJplDxZ6bbEgTwZa6ADs7xWAwFWv349KQY9uLpJ
# XV/ACot2b6FZUskv3w1SxQrpzho4ICm3DuLOdAFAvBPRtxyC2aQMLsXTlT+7+5cC
# X8zJB/9mtjWIomYWKRXnYUP/uM1g7QLtyU7d01szvqCfSVUilVlg6Ys7RxnqLG0k
# A1/kxYOrEPHHxMO+YwFuapIfE8Gqihes2K1GfM871JaBT14dMIAZkajmVasbKD16
# Iljz89nEV3UehDP9HADhx3QuXO7fhJ3cxcHvTH0xhUeoks3EgTlUq0VNRRYzu6rQ
# 3P1E3cVaPTmwfoSrhecNIFcln4v/bENdwzYcjh96r9fcFwE+ro4oUTGNKCPYv2t0
# yOoc6PqgiZN7DM89/N2hcesgOun7oOVpMnKhiqHjVe53HoM8bfLojWECKNq9Cz1u
# m0YEHn2gEuEB5l03IguRnAywZq76Jivd6WFmAeXGrHRZ9sfxQCwvImbqMa7QxYpI
# rt+j7RAyP57WVoBPoW8hlaIQmLuIvIgdWwWkwQd2BTIprLpdHJd4SWkL6eqGozpE
# rsaHw+WQZqFoddrl7EUSVY/Z2CfIRr1g/Zo5z4RU9YLtxVxjSPw=
# =sX2P
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 26 Jun 2025 03:44:44 EDT
# gpg:                using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@redhat.com>" [full]
# gpg:                 aka "Cédric Le Goater <clg@kaod.org>" [full]
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B  0B60 51A3 43C7 CFFB ECA1

* tag 'pull-vfio-20250626' of https://github.com/legoater/qemu: (25 commits)
  vfio-user: introduce vfio-user protocol specification
  docs: add vfio-user documentation
  vfio-user: add coalesced posted writes
  vfio-user: support posted writes
  vfio-user: add 'x-msg-timeout' option
  vfio-user: implement VFIO_USER_DMA_READ/WRITE
  vfio-user: implement VFIO_USER_DMA_MAP/UNMAP
  vfio-user: implement VFIO_USER_DEVICE_RESET
  vfio-user: set up container access to the proxy
  vfio-user: forward MSI-X PBA BAR accesses to server
  vfio-user: implement VFIO_USER_DEVICE_GET/SET_IRQ*
  vfio-user: set up PCI in vfio_user_pci_realize()
  vfio-user: implement VFIO_USER_REGION_READ/WRITE
  vfio-user: implement VFIO_USER_DEVICE_GET_REGION_INFO
  vfio-user: implement VFIO_USER_DEVICE_GET_INFO
  vfio-user: implement message send infrastructure
  vfio-user: implement message receive infrastructure
  vfio-user: connect vfio proxy to remote server
  vfio-user: add vfio-user class and container
  vfio/container: fails mdev hotplug if add migration blocker failed
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Diffstat (limited to 'hw/vfio-user/pci.c')
-rw-r--r--hw/vfio-user/pci.c475
1 files changed, 475 insertions, 0 deletions
diff --git a/hw/vfio-user/pci.c b/hw/vfio-user/pci.c
new file mode 100644
index 0000000000..be71c77729
--- /dev/null
+++ b/hw/vfio-user/pci.c
@@ -0,0 +1,475 @@
+/*
+ * vfio PCI device over a UNIX socket.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include <sys/ioctl.h>
+#include "qemu/osdep.h"
+#include "qapi-visit-sockets.h"
+#include "qemu/error-report.h"
+
+#include "hw/qdev-properties.h"
+#include "hw/vfio/pci.h"
+#include "hw/vfio-user/device.h"
+#include "hw/vfio-user/proxy.h"
+
+#define TYPE_VFIO_USER_PCI "vfio-user-pci"
+OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserPCIDevice, VFIO_USER_PCI)
+
+struct VFIOUserPCIDevice {
+    VFIOPCIDevice device;
+    SocketAddress *socket;
+    bool send_queued;   /* all sends are queued */
+    uint32_t wait_time; /* timeout for message replies */
+    bool no_post;       /* all region writes are sync */
+};
+
+/*
+ * The server maintains the device's pending interrupts,
+ * via its MSIX table and PBA, so we treat these accesses
+ * like PCI config space and forward them.
+ */
+static uint64_t vfio_user_pba_read(void *opaque, hwaddr addr,
+                                   unsigned size)
+{
+    VFIOPCIDevice *vdev = opaque;
+    VFIORegion *region = &vdev->bars[vdev->msix->pba_bar].region;
+    uint64_t data;
+
+    /* server copy is what matters */
+    data = vfio_region_read(region, addr + vdev->msix->pba_offset, size);
+    return data;
+}
+
+static void vfio_user_pba_write(void *opaque, hwaddr addr,
+                                  uint64_t data, unsigned size)
+{
+    /* dropped */
+}
+
+static const MemoryRegionOps vfio_user_pba_ops = {
+    .read = vfio_user_pba_read,
+    .write = vfio_user_pba_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void vfio_user_msix_setup(VFIOPCIDevice *vdev)
+{
+    MemoryRegion *vfio_reg, *msix_reg, *pba_reg;
+
+    pba_reg = g_new0(MemoryRegion, 1);
+    vdev->msix->pba_region = pba_reg;
+
+    vfio_reg = vdev->bars[vdev->msix->pba_bar].mr;
+    msix_reg = &vdev->pdev.msix_pba_mmio;
+    memory_region_init_io(pba_reg, OBJECT(vdev), &vfio_user_pba_ops, vdev,
+                          "VFIO MSIX PBA", int128_get64(msix_reg->size));
+    memory_region_add_subregion_overlap(vfio_reg, vdev->msix->pba_offset,
+                                        pba_reg, 1);
+}
+
+static void vfio_user_msix_teardown(VFIOPCIDevice *vdev)
+{
+    MemoryRegion *mr, *sub;
+
+    mr = vdev->bars[vdev->msix->pba_bar].mr;
+    sub = vdev->msix->pba_region;
+    memory_region_del_subregion(mr, sub);
+
+    g_free(vdev->msix->pba_region);
+    vdev->msix->pba_region = NULL;
+}
+
+static void vfio_user_dma_read(VFIOPCIDevice *vdev, VFIOUserDMARW *msg)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    VFIOUserProxy *proxy = vdev->vbasedev.proxy;
+    VFIOUserDMARW *res;
+    MemTxResult r;
+    size_t size;
+
+    if (msg->hdr.size < sizeof(*msg)) {
+        vfio_user_send_error(proxy, &msg->hdr, EINVAL);
+        return;
+    }
+    if (msg->count > proxy->max_xfer_size) {
+        vfio_user_send_error(proxy, &msg->hdr, E2BIG);
+        return;
+    }
+
+    /* switch to our own message buffer */
+    size = msg->count + sizeof(VFIOUserDMARW);
+    res = g_malloc0(size);
+    memcpy(res, msg, sizeof(*res));
+    g_free(msg);
+
+    r = pci_dma_read(pdev, res->offset, &res->data, res->count);
+
+    switch (r) {
+    case MEMTX_OK:
+        if (res->hdr.flags & VFIO_USER_NO_REPLY) {
+            g_free(res);
+            return;
+        }
+        vfio_user_send_reply(proxy, &res->hdr, size);
+        break;
+    case MEMTX_ERROR:
+        vfio_user_send_error(proxy, &res->hdr, EFAULT);
+        break;
+    case MEMTX_DECODE_ERROR:
+        vfio_user_send_error(proxy, &res->hdr, ENODEV);
+        break;
+    case MEMTX_ACCESS_ERROR:
+        vfio_user_send_error(proxy, &res->hdr, EPERM);
+        break;
+    default:
+        error_printf("vfio_user_dma_read unknown error %d\n", r);
+        vfio_user_send_error(vdev->vbasedev.proxy, &res->hdr, EINVAL);
+    }
+}
+
+static void vfio_user_dma_write(VFIOPCIDevice *vdev, VFIOUserDMARW *msg)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    VFIOUserProxy *proxy = vdev->vbasedev.proxy;
+    MemTxResult r;
+
+    if (msg->hdr.size < sizeof(*msg)) {
+        vfio_user_send_error(proxy, &msg->hdr, EINVAL);
+        return;
+    }
+    /* make sure transfer count isn't larger than the message data */
+    if (msg->count > msg->hdr.size - sizeof(*msg)) {
+        vfio_user_send_error(proxy, &msg->hdr, E2BIG);
+        return;
+    }
+
+    r = pci_dma_write(pdev, msg->offset, &msg->data, msg->count);
+
+    switch (r) {
+    case MEMTX_OK:
+        if ((msg->hdr.flags & VFIO_USER_NO_REPLY) == 0) {
+            vfio_user_send_reply(proxy, &msg->hdr, sizeof(msg->hdr));
+        } else {
+            g_free(msg);
+        }
+        break;
+    case MEMTX_ERROR:
+        vfio_user_send_error(proxy, &msg->hdr, EFAULT);
+        break;
+    case MEMTX_DECODE_ERROR:
+        vfio_user_send_error(proxy, &msg->hdr, ENODEV);
+        break;
+    case MEMTX_ACCESS_ERROR:
+        vfio_user_send_error(proxy, &msg->hdr, EPERM);
+        break;
+    default:
+        error_printf("vfio_user_dma_write unknown error %d\n", r);
+        vfio_user_send_error(vdev->vbasedev.proxy, &msg->hdr, EINVAL);
+    }
+}
+
+/*
+ * Incoming request message callback.
+ *
+ * Runs off main loop, so BQL held.
+ */
+static void vfio_user_pci_process_req(void *opaque, VFIOUserMsg *msg)
+{
+    VFIOPCIDevice *vdev = opaque;
+    VFIOUserHdr *hdr = msg->hdr;
+
+    /* no incoming PCI requests pass FDs */
+    if (msg->fds != NULL) {
+        vfio_user_send_error(vdev->vbasedev.proxy, hdr, EINVAL);
+        vfio_user_putfds(msg);
+        return;
+    }
+
+    switch (hdr->command) {
+    case VFIO_USER_DMA_READ:
+        vfio_user_dma_read(vdev, (VFIOUserDMARW *)hdr);
+        break;
+    case VFIO_USER_DMA_WRITE:
+        vfio_user_dma_write(vdev, (VFIOUserDMARW *)hdr);
+        break;
+    default:
+        error_printf("vfio_user_pci_process_req unknown cmd %d\n",
+                     hdr->command);
+        vfio_user_send_error(vdev->vbasedev.proxy, hdr, ENOSYS);
+    }
+}
+
+/*
+ * Emulated devices don't use host hot reset
+ */
+static void vfio_user_compute_needs_reset(VFIODevice *vbasedev)
+{
+    vbasedev->needs_reset = false;
+}
+
+static Object *vfio_user_pci_get_object(VFIODevice *vbasedev)
+{
+    VFIOUserPCIDevice *vdev = container_of(vbasedev, VFIOUserPCIDevice,
+                                           device.vbasedev);
+
+    return OBJECT(vdev);
+}
+
+static VFIODeviceOps vfio_user_pci_ops = {
+    .vfio_compute_needs_reset = vfio_user_compute_needs_reset,
+    .vfio_eoi = vfio_pci_intx_eoi,
+    .vfio_get_object = vfio_user_pci_get_object,
+    /* No live migration support yet. */
+    .vfio_save_config = NULL,
+    .vfio_load_config = NULL,
+};
+
+static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
+{
+    ERRP_GUARD();
+    VFIOUserPCIDevice *udev = VFIO_USER_PCI(pdev);
+    VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    const char *sock_name;
+    AddressSpace *as;
+    SocketAddress addr;
+    VFIOUserProxy *proxy;
+
+    if (!udev->socket) {
+        error_setg(errp, "No socket specified");
+        error_append_hint(errp, "e.g. -device '{"
+            "\"driver\":\"vfio-user-pci\", "
+            "\"socket\": {\"path\": \"/tmp/vfio-user.sock\", "
+            "\"type\": \"unix\"}'"
+            "}'\n");
+        return;
+    }
+
+    sock_name = udev->socket->u.q_unix.path;
+
+    vbasedev->name = g_strdup_printf("vfio-user:%s", sock_name);
+
+    memset(&addr, 0, sizeof(addr));
+    addr.type = SOCKET_ADDRESS_TYPE_UNIX;
+    addr.u.q_unix.path = (char *)sock_name;
+    proxy = vfio_user_connect_dev(&addr, errp);
+    if (!proxy) {
+        return;
+    }
+    vbasedev->proxy = proxy;
+    vfio_user_set_handler(vbasedev, vfio_user_pci_process_req, vdev);
+
+    vbasedev->name = g_strdup_printf("vfio-user:%s", sock_name);
+
+    if (udev->send_queued) {
+        proxy->flags |= VFIO_PROXY_FORCE_QUEUED;
+    }
+
+    if (udev->no_post) {
+        proxy->flags |= VFIO_PROXY_NO_POST;
+    }
+
+    /* user specified or 5 sec default */
+    proxy->wait_time = udev->wait_time;
+
+    if (!vfio_user_validate_version(proxy, errp)) {
+        goto error;
+    }
+
+    /*
+     * Use socket-based device I/O instead of vfio kernel driver.
+     */
+    vbasedev->io_ops = &vfio_user_device_io_ops_sock;
+
+    /*
+     * vfio-user devices are effectively mdevs (don't use a host iommu).
+     */
+    vbasedev->mdev = true;
+
+    /*
+     * Enable per-region fds.
+     */
+    vbasedev->use_region_fds = true;
+
+    as = pci_device_iommu_address_space(pdev);
+    if (!vfio_device_attach_by_iommu_type(TYPE_VFIO_IOMMU_USER,
+                                          vbasedev->name, vbasedev,
+                                          as, errp)) {
+        goto error;
+    }
+
+    if (!vfio_pci_populate_device(vdev, errp)) {
+        goto error;
+    }
+
+    if (!vfio_pci_config_setup(vdev, errp)) {
+        goto error;
+    }
+
+    /*
+     * vfio_pci_config_setup will have registered the device's BARs
+     * and setup any MSIX BARs, so errors after it succeeds must
+     * use out_teardown
+     */
+
+    if (!vfio_pci_add_capabilities(vdev, errp)) {
+        goto out_teardown;
+    }
+
+    if (vdev->msix != NULL) {
+        vfio_user_msix_setup(vdev);
+    }
+
+    if (!vfio_pci_interrupt_setup(vdev, errp)) {
+        goto out_teardown;
+    }
+
+    vfio_pci_register_err_notifier(vdev);
+    vfio_pci_register_req_notifier(vdev);
+
+    return;
+
+out_teardown:
+    vfio_pci_teardown_msi(vdev);
+    vfio_pci_bars_exit(vdev);
+error:
+    error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+    vfio_pci_put_device(vdev);
+}
+
+static void vfio_user_instance_init(Object *obj)
+{
+    PCIDevice *pci_dev = PCI_DEVICE(obj);
+    VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
+    VFIODevice *vbasedev = &vdev->vbasedev;
+
+    device_add_bootindex_property(obj, &vdev->bootindex,
+                                  "bootindex", NULL,
+                                  &pci_dev->qdev);
+    vdev->host.domain = ~0U;
+    vdev->host.bus = ~0U;
+    vdev->host.slot = ~0U;
+    vdev->host.function = ~0U;
+
+    vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_user_pci_ops,
+                     DEVICE(vdev), false);
+
+    vdev->nv_gpudirect_clique = 0xFF;
+
+    /*
+     * QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command
+     * line, therefore, no need to wait to realize like other devices.
+     */
+    pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
+}
+
+static void vfio_user_instance_finalize(Object *obj)
+{
+    VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
+    VFIODevice *vbasedev = &vdev->vbasedev;
+
+    if (vdev->msix != NULL) {
+        vfio_user_msix_teardown(vdev);
+    }
+
+    vfio_pci_put_device(vdev);
+
+    if (vbasedev->proxy != NULL) {
+        vfio_user_disconnect(vbasedev->proxy);
+    }
+}
+
+static void vfio_user_pci_reset(DeviceState *dev)
+{
+    VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev);
+    VFIODevice *vbasedev = &vdev->vbasedev;
+
+    vfio_pci_pre_reset(vdev);
+
+    if (vbasedev->reset_works) {
+        vfio_user_device_reset(vbasedev->proxy);
+    }
+
+    vfio_pci_post_reset(vdev);
+}
+
+static const Property vfio_user_pci_dev_properties[] = {
+    DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice,
+                       vendor_id, PCI_ANY_ID),
+    DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice,
+                       device_id, PCI_ANY_ID),
+    DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
+                       sub_vendor_id, PCI_ANY_ID),
+    DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
+                       sub_device_id, PCI_ANY_ID),
+    DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false),
+    DEFINE_PROP_UINT32("x-msg-timeout", VFIOUserPCIDevice, wait_time, 5000),
+    DEFINE_PROP_BOOL("x-no-posted-writes", VFIOUserPCIDevice, no_post, false),
+};
+
+static void vfio_user_pci_set_socket(Object *obj, Visitor *v, const char *name,
+                                     void *opaque, Error **errp)
+{
+    VFIOUserPCIDevice *udev = VFIO_USER_PCI(obj);
+    bool success;
+
+    if (udev->device.vbasedev.proxy) {
+        error_setg(errp, "Proxy is connected");
+        return;
+    }
+
+    qapi_free_SocketAddress(udev->socket);
+
+    udev->socket = NULL;
+
+    success = visit_type_SocketAddress(v, name, &udev->socket, errp);
+
+    if (!success) {
+        return;
+    }
+
+    if (udev->socket->type != SOCKET_ADDRESS_TYPE_UNIX) {
+        error_setg(errp, "Unsupported socket type %s",
+                   SocketAddressType_str(udev->socket->type));
+        qapi_free_SocketAddress(udev->socket);
+        udev->socket = NULL;
+        return;
+    }
+}
+
+static void vfio_user_pci_dev_class_init(ObjectClass *klass, const void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
+
+    device_class_set_legacy_reset(dc, vfio_user_pci_reset);
+    device_class_set_props(dc, vfio_user_pci_dev_properties);
+
+    object_class_property_add(klass, "socket", "SocketAddress", NULL,
+                              vfio_user_pci_set_socket, NULL, NULL);
+    object_class_property_set_description(klass, "socket",
+                                          "SocketAddress (UNIX sockets only)");
+
+    dc->desc = "VFIO over socket PCI device assignment";
+    pdc->realize = vfio_user_pci_realize;
+}
+
+static const TypeInfo vfio_user_pci_dev_info = {
+    .name = TYPE_VFIO_USER_PCI,
+    .parent = TYPE_VFIO_PCI_BASE,
+    .instance_size = sizeof(VFIOUserPCIDevice),
+    .class_init = vfio_user_pci_dev_class_init,
+    .instance_init = vfio_user_instance_init,
+    .instance_finalize = vfio_user_instance_finalize,
+};
+
+static void register_vfio_user_dev_type(void)
+{
+    type_register_static(&vfio_user_pci_dev_info);
+}
+
+ type_init(register_vfio_user_dev_type)