Diffstat (limited to 'hw/vfio/listener.c')
| -rw-r--r-- | hw/vfio/listener.c | 1172 |
1 file changed, 1172 insertions, 0 deletions
diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c new file mode 100644 index 0000000000..bde1bb3c13 --- /dev/null +++ b/hw/vfio/listener.c @@ -0,0 +1,1172 @@ +/* + * generic functions used by VFIO devices + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + * Alex Williamson <alex.williamson@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on qemu-kvm device-assignment: + * Adapted for KVM by Qumranet. + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ + +#include "qemu/osdep.h" +#include <sys/ioctl.h> +#ifdef CONFIG_KVM +#include <linux/kvm.h> +#endif +#include <linux/vfio.h> + +#include "hw/vfio/vfio-common.h" +#include "hw/vfio/pci.h" +#include "system/address-spaces.h" +#include "system/memory.h" +#include "system/ram_addr.h" +#include "hw/hw.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "qemu/range.h" +#include "system/kvm.h" +#include "system/reset.h" +#include "system/runstate.h" +#include "trace.h" +#include "qapi/error.h" +#include "migration/misc.h" +#include "migration/qemu-file.h" +#include "system/tcg.h" +#include "system/tpm.h" +#include "vfio-migration-internal.h" +#include "vfio-helpers.h" + +/* + * Device state interfaces + */ + + +static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer) +{ + VFIODevice *vbasedev; + + if (!vfio_container_dirty_tracking_is_started(bcontainer)) { + return false; + } + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + VFIOMigration *migration = vbasedev->migration; + + if (!migration) { + return false; + } + + if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF && + (vfio_device_state_is_running(vbasedev) || + vfio_device_state_is_precopy(vbasedev))) { + return false; + } + } + return true; +} + +static bool vfio_listener_skipped_section(MemoryRegionSection *section) +{ + return (!memory_region_is_ram(section->mr) && + !memory_region_is_iommu(section->mr)) || + memory_region_is_protected(section->mr) || + /* + * Sizing an enabled 64-bit BAR can cause spurious mappings to + * addresses in the upper part of the 64-bit address space. These + * are never accessed by the CPU and beyond the address width of + * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width. + */ + section->offset_within_address_space & (1ULL << 63); +} + +/* Called with rcu_read_lock held. */ +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, + ram_addr_t *ram_addr, bool *read_only, + Error **errp) +{ + bool ret, mr_has_discard_manager; + + ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, + &mr_has_discard_manager, errp); + if (ret && mr_has_discard_manager) { + /* + * Malicious VMs might trigger discarding of IOMMU-mapped memory. The + * pages will remain pinned inside vfio until unmapped, resulting in a + * higher memory consumption than expected. If memory would get + * populated again later, there would be an inconsistency between pages + * pinned by vfio and pages seen by QEMU. This is the case until + * unmapped from the IOMMU (e.g., during device reset). + * + * With malicious guests, we really only care about pinning more memory + * than expected. 
RLIMIT_MEMLOCK set for the user/process can never be + * exceeded and can be used to mitigate this problem. + */ + warn_report_once("Using vfio with vIOMMUs and coordinated discarding of" + " RAM (e.g., virtio-mem) works, however, malicious" + " guests can trigger pinning of more memory than" + " intended via an IOMMU. It's possible to mitigate " + " by setting/adjusting RLIMIT_MEMLOCK."); + } + return ret; +} + +static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) +{ + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); + VFIOContainerBase *bcontainer = giommu->bcontainer; + hwaddr iova = iotlb->iova + giommu->iommu_offset; + void *vaddr; + int ret; + Error *local_err = NULL; + + trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP", + iova, iova + iotlb->addr_mask); + + if (iotlb->target_as != &address_space_memory) { + error_setg(&local_err, + "Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? iotlb->target_as->name : "none"); + if (migration_is_running()) { + migration_file_set_error(-EINVAL, local_err); + } else { + error_report_err(local_err); + } + return; + } + + rcu_read_lock(); + + if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { + bool read_only; + + if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) { + error_report_err(local_err); + goto out; + } + /* + * vaddr is only valid until rcu_read_unlock(). But after + * vfio_dma_map has set up the mapping the pages will be + * pinned by the kernel. This makes sure that the RAM backend + * of vaddr will always be there, even if the memory object is + * destroyed and its backing memory munmap-ed. + */ + ret = vfio_container_dma_map(bcontainer, iova, + iotlb->addr_mask + 1, vaddr, + read_only); + if (ret) { + error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx", %p) = %d (%s)", + bcontainer, iova, + iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); + } + } else { + ret = vfio_container_dma_unmap(bcontainer, iova, + iotlb->addr_mask + 1, iotlb); + if (ret) { + error_setg(&local_err, + "vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", + bcontainer, iova, + iotlb->addr_mask + 1, ret, strerror(-ret)); + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } + } + } +out: + rcu_read_unlock(); +} + +static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, + listener); + VFIOContainerBase *bcontainer = vrdl->bcontainer; + const hwaddr size = int128_get64(section->size); + const hwaddr iova = section->offset_within_address_space; + int ret; + + /* Unmap with a single call. */ + ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); + if (ret) { + error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, + strerror(-ret)); + } +} + +static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, + listener); + VFIOContainerBase *bcontainer = vrdl->bcontainer; + const hwaddr end = section->offset_within_region + + int128_get64(section->size); + hwaddr start, next, iova; + void *vaddr; + int ret; + + /* + * Map in (aligned within memory region) minimum granularity, so we can + * unmap in minimum granularity later. 
+ */ + for (start = section->offset_within_region; start < end; start = next) { + next = ROUND_UP(start + 1, vrdl->granularity); + next = MIN(next, end); + + iova = start - section->offset_within_region + + section->offset_within_address_space; + vaddr = memory_region_get_ram_ptr(section->mr) + start; + + ret = vfio_container_dma_map(bcontainer, iova, next - start, + vaddr, section->readonly); + if (ret) { + /* Rollback */ + vfio_ram_discard_notify_discard(rdl, section); + return ret; + } + } + return 0; +} + +static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + int target_page_size = qemu_target_page_size(); + VFIORamDiscardListener *vrdl; + + /* Ignore some corner cases not relevant in practice. */ + g_assert(QEMU_IS_ALIGNED(section->offset_within_region, target_page_size)); + g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space, + target_page_size)); + g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), target_page_size)); + + vrdl = g_new0(VFIORamDiscardListener, 1); + vrdl->bcontainer = bcontainer; + vrdl->mr = section->mr; + vrdl->offset_within_address_space = section->offset_within_address_space; + vrdl->size = int128_get64(section->size); + vrdl->granularity = ram_discard_manager_get_min_granularity(rdm, + section->mr); + + g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); + g_assert(bcontainer->pgsizes && + vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes)); + + ram_discard_listener_init(&vrdl->listener, + vfio_ram_discard_notify_populate, + vfio_ram_discard_notify_discard, true); + ram_discard_manager_register_listener(rdm, &vrdl->listener, section); + QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next); + + /* + * Sanity-check if we have a theoretically problematic setup where we could + * exceed the maximum number of possible DMA mappings over time. We assume + * that each mapped section in the same address space as a RamDiscardManager + * section consumes exactly one DMA mapping, with the exception of + * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections + * in the same address space as RamDiscardManager sections. + * + * We assume that each section in the address space consumes one memslot. + * We take the number of KVM memory slots as a best guess for the maximum + * number of sections in the address space we could have over time, + * also consuming DMA mappings. + */ + if (bcontainer->dma_max_mappings) { + unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; + +#ifdef CONFIG_KVM + if (kvm_enabled()) { + max_memslots = kvm_get_max_memslots(); + } +#endif + + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + hwaddr start, end; + + start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, + vrdl->granularity); + end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size, + vrdl->granularity); + vrdl_mappings += (end - start) / vrdl->granularity; + vrdl_count++; + } + + if (vrdl_mappings + max_memslots - vrdl_count > + bcontainer->dma_max_mappings) { + warn_report("%s: possibly running out of DMA mappings. E.g., try" + " increasing the 'block-size' of virtio-mem devies." 
+ " Maximum possible DMA mappings: %d, Maximum possible" + " memslots: %d", __func__, bcontainer->dma_max_mappings, + max_memslots); + } + } +} + +static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to unregister missing RAM discard listener"); + } + + ram_discard_manager_unregister_listener(rdm, &vrdl->listener); + QLIST_REMOVE(vrdl, next); + g_free(vrdl); +} + +static bool vfio_known_safe_misalignment(MemoryRegionSection *section) +{ + MemoryRegion *mr = section->mr; + + if (!TPM_IS_CRB(mr->owner)) { + return false; + } + + /* this is a known safe misaligned region, just trace for debug purpose */ + trace_vfio_known_safe_misalignment(memory_region_name(mr), + section->offset_within_address_space, + section->offset_within_region, + qemu_real_host_page_size()); + return true; +} + +static bool vfio_listener_valid_section(MemoryRegionSection *section, + const char *name) +{ + if (vfio_listener_skipped_section(section)) { + trace_vfio_listener_region_skip(name, + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(int128_sub(section->size, int128_one()))); + return false; + } + + if (unlikely((section->offset_within_address_space & + ~qemu_real_host_page_mask()) != + (section->offset_within_region & ~qemu_real_host_page_mask()))) { + if (!vfio_known_safe_misalignment(section)) { + error_report("%s received unaligned region %s iova=0x%"PRIx64 + " offset_within_region=0x%"PRIx64 + " qemu_real_host_page_size=0x%"PRIxPTR, + __func__, memory_region_name(section->mr), + section->offset_within_address_space, + section->offset_within_region, + qemu_real_host_page_size()); + } + return false; + } + + return true; +} + +static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + hwaddr *out_iova, hwaddr *out_end, + Int128 *out_llend) +{ + Int128 llend; + hwaddr iova; + + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); + + if (int128_ge(int128_make64(iova), llend)) { + return false; + } + + *out_iova = iova; + *out_end = int128_get64(int128_sub(llend, int128_one())); + if (out_llend) { + *out_llend = llend; + } + return true; +} + +static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp) +{ + /* + * MMIO region mapping failures are not fatal but in this case PCI + * peer-to-peer transactions are broken. 
+ */ + if (vbasedev && vbasedev->type == VFIO_DEVICE_TYPE_PCI) { + error_append_hint(errp, "%s: PCI peer-to-peer transactions " + "on BARs are not supported.\n", vbasedev->name); + } +} + +static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + hwaddr iova, end; + Int128 llend, llsize; + void *vaddr; + int ret; + Error *err = NULL; + + if (!vfio_listener_valid_section(section, "region_add")) { + return; + } + + if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, + &llend)) { + if (memory_region_is_ram_device(section->mr)) { + trace_vfio_listener_region_add_no_dma_map( + memory_region_name(section->mr), + section->offset_within_address_space, + int128_getlo(section->size), + qemu_real_host_page_size()); + } + return; + } + + /* PPC64/pseries machine only */ + if (!vfio_container_add_section_window(bcontainer, section, &err)) { + goto mmio_dma_error; + } + + memory_region_ref(section->mr); + + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); + int iommu_idx; + + trace_vfio_listener_region_add_iommu(section->mr->name, iova, end); + /* + * FIXME: For VFIO iommu types which have KVM acceleration to + * avoid bouncing all map/unmaps through qemu this way, this + * would be the right place to wire that up (tell the KVM + * device emulation the VFIO iommu handles to use). + */ + giommu = g_malloc0(sizeof(*giommu)); + giommu->iommu_mr = iommu_mr; + giommu->iommu_offset = section->offset_within_address_space - + section->offset_within_region; + giommu->bcontainer = bcontainer; + llend = int128_add(int128_make64(section->offset_within_region), + section->size); + llend = int128_sub(llend, int128_one()); + iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, + MEMTXATTRS_UNSPECIFIED); + iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, + IOMMU_NOTIFIER_IOTLB_EVENTS, + section->offset_within_region, + int128_get64(llend), + iommu_idx); + + ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, + &err); + if (ret) { + g_free(giommu); + goto fail; + } + QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next); + memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); + + return; + } + + /* Here we assume that memory_region_is_ram(section->mr)==true */ + + /* + * For RAM memory regions with a RamDiscardManager, we only want to map the + * actually populated parts - and update the mapping whenever we're notified + * about changes. 
+ */ + if (memory_region_has_ram_discard_manager(section->mr)) { + vfio_register_ram_discard_listener(bcontainer, section); + return; + } + + vaddr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (iova - section->offset_within_address_space); + + trace_vfio_listener_region_add_ram(iova, end, vaddr); + + llsize = int128_sub(llend, int128_make64(iova)); + + if (memory_region_is_ram_device(section->mr)) { + hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; + + if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { + trace_vfio_listener_region_add_no_dma_map( + memory_region_name(section->mr), + section->offset_within_address_space, + int128_getlo(section->size), + pgmask + 1); + return; + } + } + + ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), + vaddr, section->readonly); + if (ret) { + error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx", %p) = %d (%s)", + bcontainer, iova, int128_get64(llsize), vaddr, ret, + strerror(-ret)); + mmio_dma_error: + if (memory_region_is_ram_device(section->mr)) { + /* Allow unexpected mappings not to be fatal for RAM devices */ + VFIODevice *vbasedev = + vfio_get_vfio_device(memory_region_owner(section->mr)); + vfio_device_error_append(vbasedev, &err); + warn_report_err_once(err); + return; + } + goto fail; + } + + return; + +fail: + if (!bcontainer->initialized) { + /* + * At machine init time or when the device is attached to the + * VM, store the first error in the container so we can + * gracefully fail the device realize routine. + */ + if (!bcontainer->error) { + error_propagate_prepend(&bcontainer->error, err, + "Region %s: ", + memory_region_name(section->mr)); + } else { + error_free(err); + } + } else { + /* + * At runtime, there's not much we can do other than throw a + * hardware error. + */ + error_report_err(err); + hw_error("vfio: DMA mapping failed, unable to continue"); + } +} + +static void vfio_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + hwaddr iova, end; + Int128 llend, llsize; + int ret; + bool try_unmap = true; + + if (!vfio_listener_valid_section(section, "region_del")) { + return; + } + + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + + trace_vfio_listener_region_del_iommu(section->mr->name); + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { + if (MEMORY_REGION(giommu->iommu_mr) == section->mr && + giommu->n.start == section->offset_within_region) { + memory_region_unregister_iommu_notifier(section->mr, + &giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + break; + } + } + + /* + * FIXME: We assume the one big unmap below is adequate to + * remove any individual page mappings in the IOMMU which + * might have been copied into VFIO. This works for a page table + * based IOMMU where a big unmap flattens a large range of IO-PTEs. + * That may not be true for all IOMMU types. 
+ */ + } + + if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, + &llend)) { + return; + } + + llsize = int128_sub(llend, int128_make64(iova)); + + trace_vfio_listener_region_del(iova, end); + + if (memory_region_is_ram_device(section->mr)) { + hwaddr pgmask; + + pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; + try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); + } else if (memory_region_has_ram_discard_manager(section->mr)) { + vfio_unregister_ram_discard_listener(bcontainer, section); + /* Unregistering will trigger an unmap. */ + try_unmap = false; + } + + if (try_unmap) { + if (int128_eq(llsize, int128_2_64())) { + /* The unmap ioctl doesn't accept a full 64-bit span. */ + llsize = int128_rshift(llsize, 1); + ret = vfio_container_dma_unmap(bcontainer, iova, + int128_get64(llsize), NULL); + if (ret) { + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", + bcontainer, iova, int128_get64(llsize), ret, + strerror(-ret)); + } + iova += int128_get64(llsize); + } + ret = vfio_container_dma_unmap(bcontainer, iova, + int128_get64(llsize), NULL); + if (ret) { + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", + bcontainer, iova, int128_get64(llsize), ret, + strerror(-ret)); + } + } + + memory_region_unref(section->mr); + + /* PPC64/pseries machine only */ + vfio_container_del_section_window(bcontainer, section); +} + +typedef struct VFIODirtyRanges { + hwaddr min32; + hwaddr max32; + hwaddr min64; + hwaddr max64; + hwaddr minpci64; + hwaddr maxpci64; +} VFIODirtyRanges; + +typedef struct VFIODirtyRangesListener { + VFIOContainerBase *bcontainer; + VFIODirtyRanges ranges; + MemoryListener listener; +} VFIODirtyRangesListener; + +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, + VFIOContainerBase *bcontainer) +{ + VFIOPCIDevice *pcidev; + VFIODevice *vbasedev; + Object *owner; + + owner = memory_region_owner(section->mr); + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + if (OBJECT(pcidev) == owner) { + return true; + } + } + + return false; +} + +static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range, + hwaddr iova, hwaddr end, + bool update_pci) +{ + hwaddr *min, *max; + + /* + * The address space passed to the dirty tracker is reduced to three ranges: + * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the + * PCI 64-bit hole. + * + * The underlying reports of dirty will query a sub-interval of each of + * these ranges. + * + * The purpose of the three range handling is to handle known cases of big + * holes in the address space, like the x86 AMD 1T hole, and firmware (like + * OVMF) which may relocate the pci-hole64 to the end of the address space. + * The latter would otherwise generate large ranges for tracking, stressing + * the limits of supported hardware. The pci-hole32 will always be below 4G + * (overlapping or not) so it doesn't need special handling and is part of + * the 32-bit range. + * + * The alternative would be an IOVATree but that has a much bigger runtime + * overhead and unnecessary complexity. + */ + if (update_pci && iova >= UINT32_MAX) { + min = &range->minpci64; + max = &range->maxpci64; + } else { + min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; + max = (end <= UINT32_MAX) ? 
&range->max32 : &range->max64; + } + if (*min > iova) { + *min = iova; + } + if (*max < end) { + *max = end; + } + + trace_vfio_device_dirty_tracking_update(iova, end, *min, *max); +} + +static void vfio_dirty_tracking_update(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIODirtyRangesListener *dirty = + container_of(listener, VFIODirtyRangesListener, listener); + hwaddr iova, end; + + if (!vfio_listener_valid_section(section, "tracking_update") || + !vfio_get_section_iova_range(dirty->bcontainer, section, + &iova, &end, NULL)) { + return; + } + + vfio_dirty_tracking_update_range(&dirty->ranges, iova, end, + vfio_section_is_vfio_pci(section, dirty->bcontainer)); +} + +static const MemoryListener vfio_dirty_tracking_listener = { + .name = "vfio-tracking", + .region_add = vfio_dirty_tracking_update, +}; + +static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, + VFIODirtyRanges *ranges) +{ + VFIODirtyRangesListener dirty; + + memset(&dirty, 0, sizeof(dirty)); + dirty.ranges.min32 = UINT32_MAX; + dirty.ranges.min64 = UINT64_MAX; + dirty.ranges.minpci64 = UINT64_MAX; + dirty.listener = vfio_dirty_tracking_listener; + dirty.bcontainer = bcontainer; + + memory_listener_register(&dirty.listener, + bcontainer->space->as); + + *ranges = dirty.ranges; + + /* + * The memory listener is synchronous, and used to calculate the range + * to dirty tracking. Unregister it after we are done as we are not + * interested in any follow-up updates. + */ + memory_listener_unregister(&dirty.listener); +} + +static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) +{ + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + VFIODevice *vbasedev; + + feature->argsz = sizeof(buf); + feature->flags = VFIO_DEVICE_FEATURE_SET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (!vbasedev->dirty_tracking) { + continue; + } + + if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + warn_report("%s: Failed to stop DMA logging, err %d (%s)", + vbasedev->name, -errno, strerror(errno)); + } + vbasedev->dirty_tracking = false; + } +} + +static struct vfio_device_feature * +vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, + VFIODirtyRanges *tracking) +{ + struct vfio_device_feature *feature; + size_t feature_size; + struct vfio_device_feature_dma_logging_control *control; + struct vfio_device_feature_dma_logging_range *ranges; + + feature_size = sizeof(struct vfio_device_feature) + + sizeof(struct vfio_device_feature_dma_logging_control); + feature = g_try_malloc0(feature_size); + if (!feature) { + errno = ENOMEM; + return NULL; + } + feature->argsz = feature_size; + feature->flags = VFIO_DEVICE_FEATURE_SET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_START; + + control = (struct vfio_device_feature_dma_logging_control *)feature->data; + control->page_size = qemu_real_host_page_size(); + + /* + * DMA logging uAPI guarantees to support at least a number of ranges that + * fits into a single host kernel base page. 
+ */ + control->num_ranges = !!tracking->max32 + !!tracking->max64 + + !!tracking->maxpci64; + ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, + control->num_ranges); + if (!ranges) { + g_free(feature); + errno = ENOMEM; + + return NULL; + } + + control->ranges = (uintptr_t)ranges; + if (tracking->max32) { + ranges->iova = tracking->min32; + ranges->length = (tracking->max32 - tracking->min32) + 1; + ranges++; + } + if (tracking->max64) { + ranges->iova = tracking->min64; + ranges->length = (tracking->max64 - tracking->min64) + 1; + ranges++; + } + if (tracking->maxpci64) { + ranges->iova = tracking->minpci64; + ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1; + } + + trace_vfio_device_dirty_tracking_start(control->num_ranges, + tracking->min32, tracking->max32, + tracking->min64, tracking->max64, + tracking->minpci64, tracking->maxpci64); + + return feature; +} + +static void vfio_device_feature_dma_logging_start_destroy( + struct vfio_device_feature *feature) +{ + struct vfio_device_feature_dma_logging_control *control = + (struct vfio_device_feature_dma_logging_control *)feature->data; + struct vfio_device_feature_dma_logging_range *ranges = + (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges; + + g_free(ranges); + g_free(feature); +} + +static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer, + Error **errp) +{ + struct vfio_device_feature *feature; + VFIODirtyRanges ranges; + VFIODevice *vbasedev; + int ret = 0; + + vfio_dirty_tracking_init(bcontainer, &ranges); + feature = vfio_device_feature_dma_logging_start_create(bcontainer, + &ranges); + if (!feature) { + error_setg_errno(errp, errno, "Failed to prepare DMA logging"); + return false; + } + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->dirty_tracking) { + continue; + } + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + if (ret) { + ret = -errno; + error_setg_errno(errp, errno, "%s: Failed to start DMA logging", + vbasedev->name); + goto out; + } + vbasedev->dirty_tracking = true; + } + +out: + if (ret) { + vfio_devices_dma_logging_stop(bcontainer); + } + + vfio_device_feature_dma_logging_start_destroy(feature); + + return ret == 0; +} + +static bool vfio_listener_log_global_start(MemoryListener *listener, + Error **errp) +{ + ERRP_GUARD(); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + bool ret; + + if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) { + ret = vfio_devices_dma_logging_start(bcontainer, errp); + } else { + ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0; + } + + if (!ret) { + error_prepend(errp, "vfio: Could not start dirty page tracking - "); + } + return ret; +} + +static void vfio_listener_log_global_stop(MemoryListener *listener) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + Error *local_err = NULL; + int ret = 0; + + if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) { + vfio_devices_dma_logging_stop(bcontainer); + } else { + ret = vfio_container_set_dirty_page_tracking(bcontainer, false, + &local_err); + } + + if (ret) { + error_prepend(&local_err, + "vfio: Could not stop dirty page tracking - "); + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } + } +} + +typedef struct { + IOMMUNotifier n; + VFIOGuestIOMMU *giommu; +} vfio_giommu_dirty_notifier; + +static void 
vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) +{ + vfio_giommu_dirty_notifier *gdn = container_of(n, + vfio_giommu_dirty_notifier, n); + VFIOGuestIOMMU *giommu = gdn->giommu; + VFIOContainerBase *bcontainer = giommu->bcontainer; + hwaddr iova = iotlb->iova + giommu->iommu_offset; + ram_addr_t translated_addr; + Error *local_err = NULL; + int ret = -EINVAL; + + trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); + + if (iotlb->target_as != &address_space_memory) { + error_setg(&local_err, + "Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? iotlb->target_as->name : "none"); + goto out; + } + + rcu_read_lock(); + if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) { + goto out_unlock; + } + + ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, + translated_addr, &local_err); + if (ret) { + error_prepend(&local_err, + "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") failed - ", bcontainer, iova, + iotlb->addr_mask + 1); + } + +out_unlock: + rcu_read_unlock(); + +out: + if (ret) { + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } + } +} + +static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, + void *opaque) +{ + const hwaddr size = int128_get64(section->size); + const hwaddr iova = section->offset_within_address_space; + const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; + VFIORamDiscardListener *vrdl = opaque; + Error *local_err = NULL; + int ret; + + /* + * Sync the whole mapped region (spanning multiple individual mappings) + * in one go. + */ + ret = vfio_container_query_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr, + &local_err); + if (ret) { + error_report_err(local_err); + } + return ret; +} + +static int +vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + } + + /* + * We only want/can synchronize the bitmap for actually mapped parts - + * which correspond to populated parts. Replay all populated parts. 
+ */ + return ram_discard_manager_replay_populated(rdm, section, + vfio_ram_discard_get_dirty_bitmap, + &vrdl); +} + +static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + VFIOGuestIOMMU *giommu; + bool found = false; + Int128 llend; + vfio_giommu_dirty_notifier gdn; + int idx; + + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { + if (MEMORY_REGION(giommu->iommu_mr) == section->mr && + giommu->n.start == section->offset_within_region) { + found = true; + break; + } + } + + if (!found) { + return 0; + } + + gdn.giommu = giommu; + idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr, + MEMTXATTRS_UNSPECIFIED); + + llend = int128_add(int128_make64(section->offset_within_region), + section->size); + llend = int128_sub(llend, int128_one()); + + iommu_notifier_init(&gdn.n, vfio_iommu_map_dirty_notify, IOMMU_NOTIFIER_MAP, + section->offset_within_region, int128_get64(llend), + idx); + memory_region_iommu_replay(giommu->iommu_mr, &gdn.n); + + return 0; +} + +static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, Error **errp) +{ + ram_addr_t ram_addr; + + if (memory_region_is_iommu(section->mr)) { + return vfio_sync_iommu_dirty_bitmap(bcontainer, section); + } else if (memory_region_has_ram_discard_manager(section->mr)) { + int ret; + + ret = vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section); + if (ret) { + error_setg(errp, + "Failed to sync dirty bitmap with RAM discard listener"); + } + return ret; + } + + ram_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; + + return vfio_container_query_dirty_bitmap(bcontainer, + REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), + int128_get64(section->size), ram_addr, errp); +} + +static void vfio_listener_log_sync(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + int ret; + Error *local_err = NULL; + + if (vfio_listener_skipped_section(section)) { + return; + } + + if (vfio_log_sync_needed(bcontainer)) { + ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err); + if (ret) { + if (migration_is_running()) { + migration_file_set_error(ret, local_err); + } else { + error_report_err(local_err); + } + } + } +} + +const MemoryListener vfio_memory_listener = { + .name = "vfio", + .region_add = vfio_listener_region_add, + .region_del = vfio_listener_region_del, + .log_global_start = vfio_listener_log_global_start, + .log_global_stop = vfio_listener_log_global_stop, + .log_sync = vfio_listener_log_sync, +}; |
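The chunking performed by vfio_ram_discard_notify_populate() above — map in RamDiscardManager granularity so that each chunk can later be unmapped individually when it is discarded — reduces to a short loop. The sketch below is a simplified, standalone illustration rather than QEMU code: map_populated() and dma_map() are hypothetical stand-ins for the notifier and vfio_container_dma_map(), and a power-of-two granularity is assumed, as the original code asserts.

```c
/* Standalone sketch of the granularity chunking in
 * vfio_ram_discard_notify_populate(); names are illustrative only. */
#include <stdint.h>
#include <stdio.h>

#define ROUND_UP_P2(x, a)  (((x) + (a) - 1) & ~((uint64_t)(a) - 1))
#define MIN(a, b)          ((a) < (b) ? (a) : (b))

/* Placeholder for vfio_container_dma_map(); just logs the request. */
static int dma_map(uint64_t iova, uint64_t size)
{
    printf("map iova=0x%llx size=0x%llx\n",
           (unsigned long long)iova, (unsigned long long)size);
    return 0;
}

/*
 * Map [offset, offset + size) of a region whose IOVA base is
 * region_iova_base, in chunks aligned to the (power-of-two) granularity
 * within the region, so each chunk can be unmapped on its own later.
 */
static int map_populated(uint64_t region_iova_base, uint64_t offset,
                         uint64_t size, uint64_t granularity)
{
    uint64_t start, next, end = offset + size;
    int ret;

    for (start = offset; start < end; start = next) {
        /* Advance to the next granularity boundary, clamped to the end. */
        next = MIN(ROUND_UP_P2(start + 1, granularity), end);

        ret = dma_map(region_iova_base + start, next - start);
        if (ret) {
            return ret; /* the caller would roll back by unmapping the section */
        }
    }
    return 0;
}

int main(void)
{
    /* e.g. a 5 MiB populated piece at offset 3 MiB, 2 MiB granularity */
    return map_populated(0x100000000, 3 << 20, 5 << 20, 2 << 20);
}
```

A failure mid-loop is handled in the original by invoking the discard notifier on the whole section ("Rollback"), which unmaps whatever chunks were already mapped.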
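Likewise, the three-range reduction in vfio_dirty_tracking_update_range() — one bucket for 32-bit DMA, one for 64-bit DMA, one for a relocated PCI 64-bit hole — is easiest to see in isolation. The following sketch is illustrative only: DirtyRanges and dirty_ranges_update() are hypothetical stand-ins for VFIODirtyRanges and the listener callback, keeping just the bucketing arithmetic.

```c
/* Standalone sketch of the range bucketing used by
 * vfio_dirty_tracking_update_range(); not part of the QEMU API. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint64_t min32, max32;       /* DMA below 4G */
    uint64_t min64, max64;       /* DMA above 4G */
    uint64_t minpci64, maxpci64; /* relocated PCI 64-bit hole */
} DirtyRanges;

static void dirty_ranges_init(DirtyRanges *r)
{
    r->min32 = UINT32_MAX;
    r->min64 = UINT64_MAX;
    r->minpci64 = UINT64_MAX;
    r->max32 = r->max64 = r->maxpci64 = 0;
}

/* Grow the appropriate bucket to cover [iova, end]. */
static void dirty_ranges_update(DirtyRanges *r, uint64_t iova, uint64_t end,
                                bool is_pci_bar)
{
    uint64_t *min, *max;

    if (is_pci_bar && iova >= UINT32_MAX) {
        min = &r->minpci64;
        max = &r->maxpci64;
    } else if (end <= UINT32_MAX) {
        min = &r->min32;
        max = &r->max32;
    } else {
        min = &r->min64;
        max = &r->max64;
    }
    if (*min > iova) {
        *min = iova;
    }
    if (*max < end) {
        *max = end;
    }
}

int main(void)
{
    DirtyRanges r;

    dirty_ranges_init(&r);
    dirty_ranges_update(&r, 0x0, 0x7fffffff, false);                /* low RAM */
    dirty_ranges_update(&r, 0x100000000, 0x17fffffff, false);       /* high RAM */
    dirty_ranges_update(&r, 0x380000000000, 0x380000ffffff, true);  /* BAR in pci-hole64 */

    printf("32-bit: [0x%llx, 0x%llx]\n",
           (unsigned long long)r.min32, (unsigned long long)r.max32);
    printf("64-bit: [0x%llx, 0x%llx]\n",
           (unsigned long long)r.min64, (unsigned long long)r.max64);
    printf("pci64 : [0x%llx, 0x%llx]\n",
           (unsigned long long)r.minpci64, (unsigned long long)r.maxpci64);
    return 0;
}
```

Tracking only a min/max pair per bucket avoids an IOVATree, at the cost of over-reporting any holes inside a bucket — the trade-off described in the comment of the original function.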