summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--hw/i386/intel_iommu.c12
-rw-r--r--hw/vfio/common.c5
-rw-r--r--hw/vfio/pci-quirks.c8
-rw-r--r--hw/vfio/pci.c101
-rw-r--r--hw/vfio/pci.h1
-rw-r--r--hw/vfio/trace-events1
-rw-r--r--include/exec/memory.h8
-rw-r--r--memory.c10
8 files changed, 136 insertions, 10 deletions
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 347718f938..5eba704477 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -24,6 +24,7 @@
 #include "exec/address-spaces.h"
 #include "intel_iommu_internal.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
 
 /*#define DEBUG_INTEL_IOMMU*/
 #ifdef DEBUG_INTEL_IOMMU
@@ -1871,6 +1872,16 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr,
     return ret;
 }
 
+static void vtd_iommu_notify_started(MemoryRegion *iommu)
+{
+    VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
+
+    hw_error("Device at bus %s addr %02x.%d requires iommu notifier which "
+             "is currently not supported by intel-iommu emulation",
+             vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn),
+             PCI_FUNC(vtd_as->devfn));
+}
+
 static const VMStateDescription vtd_vmstate = {
     .name = "iommu-intel",
     .unmigratable = 1,
@@ -1938,6 +1949,7 @@ static void vtd_init(IntelIOMMUState *s)
     memset(s->womask, 0, DMAR_REG_SIZE);
 
     s->iommu_ops.translate = vtd_iommu_translate;
+    s->iommu_ops.notify_started = vtd_iommu_notify_started;
     s->root = 0;
     s->root_extended = false;
     s->dmar_enabled = false;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 27cc1596f9..7be638e0e3 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -455,7 +455,8 @@ static void vfio_listener_region_del(MemoryListener *listener,
 
         QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
             if (giommu->iommu == section->mr) {
-                memory_region_unregister_iommu_notifier(&giommu->n);
+                memory_region_unregister_iommu_notifier(giommu->iommu,
+                                                        &giommu->n);
                 QLIST_REMOVE(giommu, giommu_next);
                 g_free(giommu);
                 break;
@@ -991,7 +992,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
         QLIST_REMOVE(container, next);
 
         QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
-            memory_region_unregister_iommu_notifier(&giommu->n);
+            memory_region_unregister_iommu_notifier(giommu->iommu, &giommu->n);
             QLIST_REMOVE(giommu, giommu_next);
             g_free(giommu);
         }
diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
index 35d32b78f4..bec694c8d8 100644
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -318,7 +318,7 @@ static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 
     /* This windows doesn't seem to be used except by legacy VGA code */
     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
-        !vdev->has_vga || nr != 4) {
+        !vdev->vga || nr != 4) {
         return;
     }
 
@@ -366,7 +366,7 @@ static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
 
     /* Only enable on newer devices where BAR2 is 64bit */
     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
-        !vdev->has_vga || nr != 2 || !vdev->bars[2].mem64) {
+        !vdev->vga || nr != 2 || !vdev->bars[2].mem64) {
         return;
     }
 
@@ -660,7 +660,7 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
     VFIOConfigWindowQuirk *window;
 
     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
-        !vdev->has_vga || nr != 5) {
+        !vdev->vga || nr != 5) {
         return;
     }
 
@@ -776,7 +776,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 
     /* The 0x1800 offset mirror only seems to get used by legacy VGA */
-    if (vdev->has_vga) {
+    if (vdev->vga) {
         quirk = g_malloc0(sizeof(*quirk));
         mirror = quirk->data = g_malloc0(sizeof(*mirror));
         mirror->mem = quirk->mem = g_new0(MemoryRegion, 1);
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 53b87b76ea..f2c679e47c 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1502,6 +1502,21 @@ static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
     return next - pos;
 }
 
+
+static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
+{
+    uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
+
+    for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
+        tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
+        if (tmp > pos && tmp < next) {
+            next = tmp;
+        }
+    }
+
+    return next - pos;
+}
+
 static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
 {
     pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
@@ -1749,16 +1764,100 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
     return 0;
 }
 
+static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    uint32_t header;
+    uint16_t cap_id, next, size;
+    uint8_t cap_ver;
+    uint8_t *config;
+
+    /* Only add extended caps if we have them and the guest can see them */
+    if (!pci_is_express(pdev) || !pci_bus_is_express(pdev->bus) ||
+        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
+        return 0;
+    }
+
+    /*
+     * pcie_add_capability always inserts the new capability at the tail
+     * of the chain.  Therefore to end up with a chain that matches the
+     * physical device, we cache the config space to avoid overwriting
+     * the original config space when we parse the extended capabilities.
+     */
+    config = g_memdup(pdev->config, vdev->config_size);
+
+    /*
+     * Extended capabilities are chained with each pointing to the next, so we
+     * can drop anything other than the head of the chain simply by modifying
+     * the previous next pointer.  For the head of the chain, we can modify the
+     * capability ID to something that cannot match a valid capability.  ID
+     * 0 is reserved for this since absence of capabilities is indicated by
+     * 0 for the ID, version, AND next pointer.  However, pcie_add_capability()
+     * uses ID 0 as reserved for list management and will incorrectly match and
+     * assert if we attempt to pre-load the head of the chain with this
+     * ID.  Use ID 0xFFFF temporarily since it also seems to be reserved in
+     * part for identifying absence of capabilities in a root complex register
+     * block.  If the ID still exists after adding capabilities, switch back to
+     * zero.  We'll mark this entire first dword as emulated for this purpose.
+     */
+    pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
+                 PCI_EXT_CAP(0xFFFF, 0, 0));
+    pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
+    pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
+
+    for (next = PCI_CONFIG_SPACE_SIZE; next;
+         next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
+        header = pci_get_long(config + next);
+        cap_id = PCI_EXT_CAP_ID(header);
+        cap_ver = PCI_EXT_CAP_VER(header);
+
+        /*
+         * If it becomes important to configure extended capabilities to their
+         * actual size, use this as the default when it's something we don't
+         * recognize. Since QEMU doesn't actually handle many of the config
+         * accesses, exact size doesn't seem worthwhile.
+         */
+        size = vfio_ext_cap_max_size(config, next);
+
+        /* Use emulated next pointer to allow dropping extended caps */
+        pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
+                                   PCI_EXT_CAP_NEXT_MASK);
+
+        switch (cap_id) {
+        case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
+            trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
+            break;
+        default:
+            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
+        }
+
+    }
+
+    /* Cleanup chain head ID if necessary */
+    if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
+        pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
+    }
+
+    g_free(config);
+    return 0;
+}
+
 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
 {
     PCIDevice *pdev = &vdev->pdev;
+    int ret;
 
     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
         !pdev->config[PCI_CAPABILITY_LIST]) {
         return 0; /* Nothing to add */
     }
 
-    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
+    ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
+    if (ret) {
+        return ret;
+    }
+
+    return vfio_add_ext_cap(vdev);
 }
 
 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index b3eb0d838e..7d482d9d21 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -135,7 +135,6 @@ typedef struct VFIOPCIDevice {
     int32_t bootindex;
     uint32_t igd_gms;
     uint8_t pm_cap;
-    bool has_vga;
     bool pci_aer;
     bool req_enabled;
     bool has_flr;
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 9da0ff928b..a768fb54ec 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -37,6 +37,7 @@ vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %
 vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s config:\n  size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
 vfio_populate_device_get_irq_info_failure(void) "VFIO_DEVICE_GET_IRQ_INFO failure: %m"
 vfio_initfn(const char *name, int group_id) " (%s) group %d"
+vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s %x@%x"
 vfio_pci_reset(const char *name) " (%s)"
 vfio_pci_reset_flr(const char *name) "%s FLR/VFIO_DEVICE_RESET"
 vfio_pci_reset_pm(const char *name) "%s PCI PM Reset"
diff --git a/include/exec/memory.h b/include/exec/memory.h
index e3829f797a..23c7399131 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -153,6 +153,10 @@ struct MemoryRegionIOMMUOps {
     IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool is_write);
     /* Returns minimum supported page size */
     uint64_t (*get_min_page_size)(MemoryRegion *iommu);
+    /* Called when the first notifier is set */
+    void (*notify_started)(MemoryRegion *iommu);
+    /* Called when the last notifier is removed */
+    void (*notify_stopped)(MemoryRegion *iommu);
 };
 
 typedef struct CoalescedMemoryRange CoalescedMemoryRange;
@@ -622,9 +626,11 @@ void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write);
  * memory_region_unregister_iommu_notifier: unregister a notifier for
  * changes to IOMMU translation entries.
  *
+ * @mr: the memory region which was observed and for which notify_stopped()
+ *      needs to be called
  * @n: the notifier to be removed.
  */
-void memory_region_unregister_iommu_notifier(Notifier *n);
+void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n);
 
 /**
  * memory_region_name: get a memory region's name
diff --git a/memory.c b/memory.c
index 8549c791d7..33799e810b 100644
--- a/memory.c
+++ b/memory.c
@@ -1499,6 +1499,10 @@ bool memory_region_is_logging(MemoryRegion *mr, uint8_t client)
 
 void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n)
 {
+    if (mr->iommu_ops->notify_started &&
+        QLIST_EMPTY(&mr->iommu_notify.notifiers)) {
+        mr->iommu_ops->notify_started(mr);
+    }
     notifier_list_add(&mr->iommu_notify, n);
 }
 
@@ -1532,9 +1536,13 @@ void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write)
     }
 }
 
-void memory_region_unregister_iommu_notifier(Notifier *n)
+void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n)
 {
     notifier_remove(n);
+    if (mr->iommu_ops->notify_stopped &&
+        QLIST_EMPTY(&mr->iommu_notify.notifiers)) {
+        mr->iommu_ops->notify_stopped(mr);
+    }
 }
 
 void memory_region_notify_iommu(MemoryRegion *mr,