43 files changed, 1444 insertions, 149 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 89d71b42b2..1b4b50aea3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1964,6 +1964,15 @@ F: include/sysemu/rng*.h
 F: backends/rng*.c
 F: tests/qtest/virtio-rng-test.c
 
+vhost-user-rng
+M: Mathieu Poirier <mathieu.poirier@linaro.org>
+S: Supported
+F: docs/tools/vhost-user-rng.rst
+F: hw/virtio/vhost-user-rng.c
+F: hw/virtio/vhost-user-rng-pci.c
+F: include/hw/virtio/vhost-user-rng.h
+F: tools/vhost-user-rng/*
+
 virtio-crypto
 M: Gonglei <arei.gonglei@huawei.com>
 S: Supported
diff --git a/block/file-posix.c b/block/file-posix.c
index a26eab0ac3..cb9bffe047 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -46,6 +46,7 @@
 #if defined(HAVE_HOST_BLOCK_DEVICE)
 #include <paths.h>
 #include <sys/param.h>
+#include <sys/mount.h>
 #include <IOKit/IOKitLib.h>
 #include <IOKit/IOBSD.h>
 #include <IOKit/storage/IOMediaBSDClient.h>
@@ -1254,6 +1255,15 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
         return;
     }
 
+#if defined(__APPLE__) && (__MACH__)
+    struct statfs buf;
+
+    if (!fstatfs(s->fd, &buf)) {
+        bs->bl.opt_transfer = buf.f_iosize;
+        bs->bl.pdiscard_alignment = buf.f_bsize;
+    }
+#endif
+
     if (bs->sg || S_ISBLK(st.st_mode)) {
         int ret = hdev_get_max_hw_transfer(s->fd, &st);
 
@@ -1591,6 +1601,7 @@ out:
     }
 }
 
+#if defined(CONFIG_FALLOCATE) || defined(BLKZEROOUT) || defined(BLKDISCARD)
 static int translate_err(int err)
 {
     if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
@@ -1599,6 +1610,7 @@ static int translate_err(int err)
     }
     return err;
 }
+#endif
 
 #ifdef CONFIG_FALLOCATE
 static int do_fallocate(int fd, int mode, off_t offset, off_t len)
@@ -1811,16 +1823,27 @@ static int handle_aiocb_discard(void *opaque)
             }
         } while (errno == EINTR);
 
-        ret = -errno;
+        ret = translate_err(-errno);
 #endif
     } else {
 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
         ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                            aiocb->aio_offset, aiocb->aio_nbytes);
+        ret = translate_err(ret);
+#elif defined(__APPLE__) && (__MACH__)
+        fpunchhole_t fpunchhole;
+        fpunchhole.fp_flags = 0;
+        fpunchhole.reserved = 0;
+        fpunchhole.fp_offset = aiocb->aio_offset;
+        fpunchhole.fp_length = aiocb->aio_nbytes;
+        if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) {
+            ret = errno == ENODEV ? -ENOTSUP : -errno;
+        } else {
+            ret = 0;
+        }
 #endif
     }
 
-    ret = translate_err(ret);
     if (ret == -ENOTSUP) {
         s->has_discard = false;
     }
diff --git a/block/io.c b/block/io.c
index cf177a9d2d..e0a689c584 100644
--- a/block/io.c
+++ b/block/io.c
@@ -125,6 +125,8 @@ void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
 
 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
 {
+    dst->pdiscard_alignment = MAX(dst->pdiscard_alignment,
+                                  src->pdiscard_alignment);
     dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
     dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
     dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer,
diff --git a/docs/pcie_pci_bridge.txt b/docs/pcie_pci_bridge.txt
index ab35ebf3ca..1aa08fc5f0 100644
--- a/docs/pcie_pci_bridge.txt
+++ b/docs/pcie_pci_bridge.txt
@@ -70,9 +70,9 @@ A detailed command line would be:
 [qemu-bin + storage options] \
 -m 2G \
--device pcie-root-port,bus=pcie.0,id=rp1 \
--device pcie-root-port,bus=pcie.0,id=rp2 \
--device pcie-root-port,bus=pcie.0,id=rp3,bus-reserve=1 \
+-device pcie-root-port,bus=pcie.0,id=rp1,slot=1 \
+-device pcie-root-port,bus=pcie.0,id=rp2,slot=2 \
+-device pcie-root-port,bus=pcie.0,id=rp3,slot=3,bus-reserve=1 \
 -device pcie-pci-bridge,id=br1,bus=rp1 \
 -device pcie-pci-bridge,id=br2,bus=rp2 \
 -device e1000,bus=br1,addr=8
diff --git a/docs/system/deprecated.rst b/docs/system/deprecated.rst
index 70e08baff6..94fb7dbf4e 100644
--- a/docs/system/deprecated.rst
+++ b/docs/system/deprecated.rst
@@ -221,6 +221,24 @@ This machine is deprecated because we have enough AST2500 based OpenPOWER
 machines. It can be easily replaced by the ``witherspoon-bmc`` or the
 ``romulus-bmc`` machines.
 
+Backend options
+---------------
+
+Using non-persistent backing file with pmem=on (since 6.1)
+''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+This option is used when ``memory-backend-file`` is consumed by an emulated
+NVDIMM device. However, enabling the ``memory-backend-file.pmem`` option when
+the backing file is (a) not DAX capable, or (b) not on a filesystem that
+supports direct mapping of persistent memory, is not safe and may lead to
+data loss or corruption in case of a host crash.
+Options are:
+
+ - modify the VM configuration to set ``pmem=off`` to continue using a fake
+   NVDIMM (without persistence guarantees) with the backing file on non-DAX
+   storage
+ - move the backing file to NVDIMM storage and keep ``pmem=on``
+   (to have NVDIMM with persistence guarantees).
+
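[Editor's note] For illustration, the two options above map onto command lines
roughly as follows (a hedged sketch: the paths are hypothetical, and a complete
setup also needs the usual -machine ...,nvdimm=on plus maxmem/slots plumbing):

    # backing storage is DAX capable: keep pmem=on for real persistence
    -object memory-backend-file,id=mem1,share=on,mem-path=/dev/dax0.0,size=4G,pmem=on
    -device nvdimm,id=nvdimm1,memdev=mem1

    # backing file on non-DAX storage: fake NVDIMM, no persistence guarantee
    -object memory-backend-file,id=mem1,share=on,mem-path=/tmp/nvdimm.img,size=4G,pmem=off
    -device nvdimm,id=nvdimm1,memdev=mem1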
 Device options
 --------------
 
diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index 39c825763a..e28457a7d1 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -207,7 +207,7 @@ static void ged_regs_write(void *opaque, hwaddr addr, uint64_t data,
         return;
     case ACPI_GED_REG_RESET:
         if (data == ACPI_GED_RESET_VALUE) {
-            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
+            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
         }
         return;
     }
diff --git a/hw/block/block.c b/hw/block/block.c
index 1e34573da7..d47ebf005a 100644
--- a/hw/block/block.c
+++ b/hw/block/block.c
@@ -65,24 +65,58 @@ bool blkconf_blocksizes(BlockConf *conf, Error **errp)
 {
     BlockBackend *blk = conf->blk;
     BlockSizes blocksizes;
-    int backend_ret;
+    BlockDriverState *bs;
+    bool use_blocksizes;
+    bool use_bs;
+
+    switch (conf->backend_defaults) {
+    case ON_OFF_AUTO_AUTO:
+        use_blocksizes = !blk_probe_blocksizes(blk, &blocksizes);
+        use_bs = false;
+        break;
+
+    case ON_OFF_AUTO_ON:
+        use_blocksizes = !blk_probe_blocksizes(blk, &blocksizes);
+        bs = blk_bs(blk);
+        use_bs = bs;
+        break;
+
+    case ON_OFF_AUTO_OFF:
+        use_blocksizes = false;
+        use_bs = false;
+        break;
+
+    default:
+        abort();
+    }
 
-    backend_ret = blk_probe_blocksizes(blk, &blocksizes);
     /* fill in detected values if they are not defined via qemu command line */
     if (!conf->physical_block_size) {
-        if (!backend_ret) {
+        if (use_blocksizes) {
             conf->physical_block_size = blocksizes.phys;
         } else {
             conf->physical_block_size = BDRV_SECTOR_SIZE;
         }
     }
     if (!conf->logical_block_size) {
-        if (!backend_ret) {
+        if (use_blocksizes) {
             conf->logical_block_size = blocksizes.log;
         } else {
             conf->logical_block_size = BDRV_SECTOR_SIZE;
         }
     }
+    if (use_bs) {
+        if (!conf->opt_io_size) {
+            conf->opt_io_size = bs->bl.opt_transfer;
+        }
+        if (conf->discard_granularity == -1) {
+            if (bs->bl.pdiscard_alignment) {
+                conf->discard_granularity = bs->bl.pdiscard_alignment;
+            } else if (bs->bl.request_alignment != 1) {
+                conf->discard_granularity = bs->bl.request_alignment;
+            }
+        }
+    }
 
     if (conf->logical_block_size > conf->physical_block_size) {
         error_setg(errp,
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index cd81893d1d..252c3a7a23 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -198,6 +198,10 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
         goto fail_guest_notifiers;
     }
 
+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
     memory_region_transaction_begin();
 
     /* Set up virtqueue notify */
@@ -211,6 +215,10 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
                 virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
             }
 
+            /*
+             * The transaction expects the ioeventfds to be open when it
+             * commits. Do it now, before the cleanup loop.
+             */
             memory_region_transaction_commit();
 
             while (j--) {
@@ -330,12 +338,20 @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
 
     aio_context_release(s->ctx);
 
+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
     memory_region_transaction_begin();
 
     for (i = 0; i < nvqs; i++) {
         virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
     }
 
+    /*
+     * The transaction expects the ioeventfds to be open when it
+     * commits. Do it now, before the cleanup loop.
+     */
    memory_region_transaction_commit();
 
     for (i = 0; i < nvqs; i++) {
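[Editor's note] The rationale in the comments above generalizes beyond
virtio-blk; a minimal sketch of the pattern under stated assumptions
(hypothetical mr, addr, nvqs and ioeventfds[], not code from this series):

    /*
     * Each add/del outside a transaction triggers its own flat-view and
     * ioeventfd update, O(nvqs) each, so O(nvqs^2) in total. Inside
     * begin()/commit(), a single update covers all changes.
     */
    memory_region_transaction_begin();
    for (i = 0; i < nvqs; i++) {
        memory_region_add_eventfd(mr, addr + i * 4, 2, false, 0,
                                  &ioeventfds[i]);
    }
    memory_region_transaction_commit();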
diff --git a/hw/hyperv/vmbus.c b/hw/hyperv/vmbus.c
index 984caf898d..c9887d5a7b 100644
--- a/hw/hyperv/vmbus.c
+++ b/hw/hyperv/vmbus.c
@@ -2372,6 +2372,14 @@ static void vmbus_dev_realize(DeviceState *dev, Error **errp)
 
     assert(!qemu_uuid_is_null(&vdev->instanceid));
 
+    if (!qemu_uuid_is_null(&vdc->instanceid)) {
+        /* Class wants to only have a single instance with a fixed UUID */
+        if (!qemu_uuid_is_equal(&vdev->instanceid, &vdc->instanceid)) {
+            error_setg(&err, "instance id can't be changed");
+            goto error_out;
+        }
+    }
+
     /* Check for instance id collision for this class id */
     QTAILQ_FOREACH(child, &BUS(vmbus)->children, sibling) {
         VMBusDevice *child_dev = VMBUS_DEVICE(child->child);
@@ -2438,18 +2446,22 @@ static void vmbus_dev_unrealize(DeviceState *dev)
     free_channels(vdev);
 }
 
+static Property vmbus_dev_props[] = {
+    DEFINE_PROP_UUID("instanceid", VMBusDevice, instanceid),
+    DEFINE_PROP_END_OF_LIST()
+};
+
+
 static void vmbus_dev_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *kdev = DEVICE_CLASS(klass);
+    device_class_set_props(kdev, vmbus_dev_props);
     kdev->bus_type = TYPE_VMBUS;
     kdev->realize = vmbus_dev_realize;
     kdev->unrealize = vmbus_dev_unrealize;
     kdev->reset = vmbus_dev_reset;
 }
 
-static Property vmbus_dev_instanceid =
-    DEFINE_PROP_UUID("instanceid", VMBusDevice, instanceid);
-
 static void vmbus_dev_instance_init(Object *obj)
 {
     VMBusDevice *vdev = VMBUS_DEVICE(obj);
@@ -2458,8 +2470,6 @@ static void vmbus_dev_instance_init(Object *obj)
     if (!qemu_uuid_is_null(&vdc->instanceid)) {
         /* Class wants to only have a single instance with a fixed UUID */
         vdev->instanceid = vdc->instanceid;
-    } else {
-        qdev_property_add_static(DEVICE(vdev), &vmbus_dev_instanceid);
     }
 }
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 796ffc6f5c..357437ff1d 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -435,11 +435,15 @@ static void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus,
         aml_append(dev, aml_name_decl("_ADR", aml_int(slot << 16)));
 
         if (bsel) {
-            aml_append(dev, aml_name_decl("_SUN", aml_int(slot)));
+            /*
+             * Can't declare _SUN here for every device as it changes 'slot'
+             * enumeration order in the Linux kernel, so use another variable
+             * for it
+             */
+            aml_append(dev, aml_name_decl("ASUN", aml_int(slot)));
             method = aml_method("_DSM", 4, AML_SERIALIZED);
             aml_append(method, aml_return(
                 aml_call6("PDSM", aml_arg(0), aml_arg(1), aml_arg(2),
-                          aml_arg(3), aml_name("BSEL"), aml_name("_SUN"))
+                          aml_arg(3), aml_name("BSEL"), aml_name("ASUN"))
             ));
             aml_append(dev, method);
         }
@@ -466,6 +470,7 @@ static void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus,
             aml_append(method, aml_return(aml_int(s3d)));
             aml_append(dev, method);
         } else if (hotplug_enabled_dev) {
+            aml_append(dev, aml_name_decl("_SUN", aml_int(slot)));
             /* add _EJ0 to make slot hotpluggable */
             method = aml_method("_EJ0", 1, AML_NOTSERIALIZED);
             aml_append(method,
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index bd7958b9f0..16d20cdee5 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -3234,6 +3234,7 @@ static bool failover_replug_primary(VirtIONet *n, DeviceState *dev,
         }
         hotplug_handler_plug(hotplug_ctrl, dev, &err);
     }
+    pdev->partially_hotplugged = false;
 
 out:
     error_propagate(errp, err);
diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index 2eb729dff5..0f37cf056a 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -29,6 +29,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/log.h"
 #include "hw/i386/pc.h"
 #include "hw/pci-host/q35.h"
 #include "hw/qdev-properties.h"
@@ -318,6 +319,8 @@ static void mch_update_pciexbar(MCHPCIState *mch)
         addr_mask |= MCH_HOST_BRIDGE_PCIEXBAR_64ADMSK;
         break;
     case MCH_HOST_BRIDGE_PCIEXBAR_LENGTH_RVD:
+        qemu_log_mask(LOG_GUEST_ERROR, "Q35: Reserved PCIEXBAR LENGTH\n");
+        return;
     default:
         abort();
     }
diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index d68888fccd..6a2df1c1e9 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -31,6 +31,7 @@
 #include "trace.h"
 #include "hw/s390x/css-bridge.h"
 #include "hw/s390x/s390-virtio-ccw.h"
+#include "sysemu/replay.h"
 
 #define NR_CLASSIC_INDICATOR_BITS 64
 
@@ -770,6 +771,11 @@ static void virtio_ccw_device_realize(VirtioCcwDevice *dev, Error **errp)
         dev->flags &= ~VIRTIO_CCW_FLAG_USE_IOEVENTFD;
     }
 
+    /* fd-based ioevents can't be synchronized in record/replay */
+    if (replay_mode != REPLAY_MODE_NONE) {
+        dev->flags &= ~VIRTIO_CCW_FLAG_USE_IOEVENTFD;
+    }
+
     if (k->realize) {
         k->realize(dev, &err);
         if (err) {
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index 28e003250a..18eb824c97 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -152,6 +152,10 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
         goto fail_guest_notifiers;
     }
 
+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
     memory_region_transaction_begin();
 
     rc = virtio_scsi_set_host_notifier(s, vs->ctrl_vq, 0);
@@ -198,6 +202,10 @@ fail_host_notifiers:
         virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
     }
 
+    /*
+     * The transaction expects the ioeventfds to be open when it
+     * commits. Do it now, before the cleanup loop.
+     */
     memory_region_transaction_commit();
 
     for (i = 0; i < vq_init_count; i++) {
@@ -238,12 +246,20 @@ void virtio_scsi_dataplane_stop(VirtIODevice *vdev)
 
     blk_drain_all(); /* ensure there are no in-flight requests */
 
+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
     memory_region_transaction_begin();
 
     for (i = 0; i < vs->conf.num_queues + 2; i++) {
         virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
     }
 
+    /*
+     * The transaction expects the ioeventfds to be open when it
+     * commits. Do it now, before the cleanup loop.
+     */
     memory_region_transaction_commit();
 
     for (i = 0; i < vs->conf.num_queues + 2; i++) {
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index ae5654fcdb..3f0d111360 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -36,6 +36,7 @@
 #include "qemu/range.h"
 #include "sysemu/kvm.h"
 #include "sysemu/reset.h"
+#include "sysemu/runstate.h"
 #include "trace.h"
 #include "qapi/error.h"
 #include "migration/migration.h"
@@ -134,6 +135,29 @@ static const char *index_to_str(VFIODevice *vbasedev, int index)
     }
 }
 
+static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
+{
+    switch (container->iommu_type) {
+    case VFIO_TYPE1v2_IOMMU:
+    case VFIO_TYPE1_IOMMU:
+        /*
+         * We support coordinated discarding of RAM via the RamDiscardManager.
+         */
+        return ram_block_uncoordinated_discard_disable(state);
+    default:
+        /*
+         * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
+         * RamDiscardManager, however, it is completely untested.
+         *
+         * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
+         * completely the opposite of managing mapping/pinning dynamically as
+         * required by RamDiscardManager. We would have to special-case sections
+         * with a RamDiscardManager.
+         */
+        return ram_block_discard_disable(state);
+    }
+}
+
 int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
                            int action, int fd, Error **errp)
 {
@@ -569,6 +593,44 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
         error_report("iommu map to non memory area %"HWADDR_PRIx"",
                      xlat);
         return false;
+    } else if (memory_region_has_ram_discard_manager(mr)) {
+        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(mr);
+        MemoryRegionSection tmp = {
+            .mr = mr,
+            .offset_within_region = xlat,
+            .size = int128_make64(len),
+        };
+
+        /*
+         * Malicious VMs can map memory into the IOMMU, which is expected
+         * to remain discarded. vfio will pin all pages, populating memory.
+         * Disallow that. vmstate priorities make sure any RamDiscardManager
+         * were already restored before IOMMUs are restored.
+         */
+        if (!ram_discard_manager_is_populated(rdm, &tmp)) {
+            error_report("iommu map to discarded memory (e.g., unplugged via"
+                         " virtio-mem): %"HWADDR_PRIx"",
+                         iotlb->translated_addr);
+            return false;
+        }
+
+        /*
+         * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
+         * pages will remain pinned inside vfio until unmapped, resulting in a
+         * higher memory consumption than expected. If memory would get
+         * populated again later, there would be an inconsistency between pages
+         * pinned by vfio and pages seen by QEMU. This is the case until
+         * unmapped from the IOMMU (e.g., during device reset).
+         *
+         * With malicious guests, we really only care about pinning more memory
+         * than expected. RLIMIT_MEMLOCK set for the user/process can never be
+         * exceeded and can be used to mitigate this problem.
+         */
+        warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
+                         " RAM (e.g., virtio-mem) works, however, malicious"
+                         " guests can trigger pinning of more memory than"
+                         " intended via an IOMMU. It's possible to mitigate"
+                         " by setting/adjusting RLIMIT_MEMLOCK.");
     }
 
     /*
@@ -649,6 +711,153 @@ out:
     rcu_read_unlock();
 }
 
+static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
+                                            MemoryRegionSection *section)
+{
+    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
+                                                listener);
+    const hwaddr size = int128_get64(section->size);
+    const hwaddr iova = section->offset_within_address_space;
+    int ret;
+
+    /* Unmap with a single call. */
+    ret = vfio_dma_unmap(vrdl->container, iova, size, NULL);
+    if (ret) {
+        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
+                     strerror(-ret));
+    }
+}
+
+static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
+                                            MemoryRegionSection *section)
+{
+    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
+                                                listener);
+    const hwaddr end = section->offset_within_region +
+                       int128_get64(section->size);
+    hwaddr start, next, iova;
+    void *vaddr;
+    int ret;
+
+    /*
+     * Map in (aligned within memory region) minimum granularity, so we can
+     * unmap in minimum granularity later.
+     */
+    for (start = section->offset_within_region; start < end; start = next) {
+        next = ROUND_UP(start + 1, vrdl->granularity);
+        next = MIN(next, end);
+
+        iova = start - section->offset_within_region +
+               section->offset_within_address_space;
+        vaddr = memory_region_get_ram_ptr(section->mr) + start;
+
+        ret = vfio_dma_map(vrdl->container, iova, next - start,
+                           vaddr, section->readonly);
+        if (ret) {
+            /* Rollback */
+            vfio_ram_discard_notify_discard(rdl, section);
+            return ret;
+        }
+    }
+    return 0;
+}
+
+static void vfio_register_ram_discard_listener(VFIOContainer *container,
+                                               MemoryRegionSection *section)
+{
+    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
+    VFIORamDiscardListener *vrdl;
+
+    /* Ignore some corner cases not relevant in practice. */
+    g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
+    g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
+                             TARGET_PAGE_SIZE));
+    g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
+
+    vrdl = g_new0(VFIORamDiscardListener, 1);
+    vrdl->container = container;
+    vrdl->mr = section->mr;
+    vrdl->offset_within_address_space = section->offset_within_address_space;
+    vrdl->size = int128_get64(section->size);
+    vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
+                                                                section->mr);
+
+    g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
+    g_assert(vrdl->granularity >= 1 << ctz64(container->pgsizes));
+
+    ram_discard_listener_init(&vrdl->listener,
+                              vfio_ram_discard_notify_populate,
+                              vfio_ram_discard_notify_discard, true);
+    ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
+    QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
+
+    /*
+     * Sanity-check if we have a theoretically problematic setup where we could
+     * exceed the maximum number of possible DMA mappings over time. We assume
+     * that each mapped section in the same address space as a RamDiscardManager
+     * section consumes exactly one DMA mapping, with the exception of
+     * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
+     * in the same address space as RamDiscardManager sections.
+     *
+     * We assume that each section in the address space consumes one memslot.
+     * We take the number of KVM memory slots as a best guess for the maximum
+     * number of sections in the address space we could have over time,
+     * also consuming DMA mappings.
+     */
+    if (container->dma_max_mappings) {
+        unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
+
+#ifdef CONFIG_KVM
+        if (kvm_enabled()) {
+            max_memslots = kvm_get_max_memslots();
+        }
+#endif
+
+        QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
+            hwaddr start, end;
+
+            start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
+                                    vrdl->granularity);
+            end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
+                           vrdl->granularity);
+            vrdl_mappings += (end - start) / vrdl->granularity;
+            vrdl_count++;
+        }
+
+        if (vrdl_mappings + max_memslots - vrdl_count >
+            container->dma_max_mappings) {
+            warn_report("%s: possibly running out of DMA mappings. E.g., try"
+                        " increasing the 'block-size' of virtio-mem devices."
+                        " Maximum possible DMA mappings: %d, Maximum possible"
+                        " memslots: %d", __func__, container->dma_max_mappings,
+                        max_memslots);
+        }
+    }
+}
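[Editor's note] To make the sanity check above concrete, a worked example with
hypothetical numbers: one 16 GiB virtio-mem device with a 2 MiB granularity
spans 16 GiB / 2 MiB = 8192 mappings, so with vrdl_count = 1 and
max_memslots = 512 the check compares 8192 + 512 - 1 = 8703 against
dma_max_mappings (65535 by default) and stays quiet. Eight such devices with a
1 MiB block size, however, would yield 8 * 16384 + 512 - 8 = 131576 and trigger
the warning, which is why it suggests increasing 'block-size'.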
+
+static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
+                                                 MemoryRegionSection *section)
+{
+    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
+    VFIORamDiscardListener *vrdl = NULL;
+
+    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
+        if (vrdl->mr == section->mr &&
+            vrdl->offset_within_address_space ==
+            section->offset_within_address_space) {
+            break;
+        }
+    }
+
+    if (!vrdl) {
+        hw_error("vfio: Trying to unregister missing RAM discard listener");
+    }
+
+    ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
+    QLIST_REMOVE(vrdl, next);
+    g_free(vrdl);
+}
+
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
@@ -810,6 +1019,16 @@ static void vfio_listener_region_add(MemoryListener *listener,
 
     /* Here we assume that memory_region_is_ram(section->mr)==true */
 
+    /*
+     * For RAM memory regions with a RamDiscardManager, we only want to map the
+     * actually populated parts - and update the mapping whenever we're notified
+     * about changes.
+     */
+    if (memory_region_has_ram_discard_manager(section->mr)) {
+        vfio_register_ram_discard_listener(container, section);
+        return;
+    }
+
     vaddr = memory_region_get_ram_ptr(section->mr) +
             section->offset_within_region +
             (iova - section->offset_within_address_space);
@@ -947,6 +1166,10 @@ static void vfio_listener_region_del(MemoryListener *listener,
 
         pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
         try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
+    } else if (memory_region_has_ram_discard_manager(section->mr)) {
+        vfio_unregister_ram_discard_listener(container, section);
+        /* Unregistering will trigger an unmap. */
+        try_unmap = false;
     }
 
     if (try_unmap) {
@@ -1108,6 +1331,49 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     rcu_read_unlock();
 }
 
+static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
+                                             void *opaque)
+{
+    const hwaddr size = int128_get64(section->size);
+    const hwaddr iova = section->offset_within_address_space;
+    const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
+                                section->offset_within_region;
+    VFIORamDiscardListener *vrdl = opaque;
+
+    /*
+     * Sync the whole mapped region (spanning multiple individual mappings)
+     * in one go.
+     */
+    return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
+}
+
+static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
+                                                       MemoryRegionSection *section)
+{
+    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
+    VFIORamDiscardListener *vrdl = NULL;
+
+    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
+        if (vrdl->mr == section->mr &&
+            vrdl->offset_within_address_space ==
+            section->offset_within_address_space) {
+            break;
+        }
+    }
+
+    if (!vrdl) {
+        hw_error("vfio: Trying to sync missing RAM discard listener");
+    }
+
+    /*
+     * We only want/can synchronize the bitmap for actually mapped parts -
+     * which correspond to populated parts. Replay all populated parts.
+     */
+    return ram_discard_manager_replay_populated(rdm, section,
+                                                vfio_ram_discard_get_dirty_bitmap,
+                                                vrdl);
+}
+
 static int vfio_sync_dirty_bitmap(VFIOContainer *container,
                                   MemoryRegionSection *section)
 {
@@ -1139,6 +1405,8 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
             }
         }
         return 0;
+    } else if (memory_region_has_ram_discard_manager(section->mr)) {
+        return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
    }
 
     ram_addr = memory_region_get_ram_addr(section->mr) +
@@ -1732,15 +2000,25 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
      * new memory, it will not yet set ram_block_discard_set_required() and
      * therefore, neither stops us here or deals with the sudden memory
      * consumption of inflated memory.
+     *
+     * We do support discarding of memory coordinated via the RamDiscardManager
+     * with some IOMMU types. vfio_ram_block_discard_disable() handles the
+     * details once we know which type of IOMMU we are using.
      */
-    ret = ram_block_discard_disable(true);
-    if (ret) {
-        error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
-        return ret;
-    }
 
     QLIST_FOREACH(container, &space->containers, next) {
         if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
+            ret = vfio_ram_block_discard_disable(container, true);
+            if (ret) {
+                error_setg_errno(errp, -ret,
+                                 "Cannot set discarding of RAM broken");
+                if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
+                          &container->fd)) {
+                    error_report("vfio: error disconnecting group %d from"
+                                 " container", group->groupid);
+                }
+                return ret;
+            }
             group->container = container;
             QLIST_INSERT_HEAD(&container->group_list, group, container_next);
             vfio_kvm_device_add_group(group);
@@ -1768,14 +2046,22 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
     container->fd = fd;
     container->error = NULL;
     container->dirty_pages_supported = false;
+    container->dma_max_mappings = 0;
     QLIST_INIT(&container->giommu_list);
     QLIST_INIT(&container->hostwin_list);
+    QLIST_INIT(&container->vrdl_list);
 
     ret = vfio_init_container(container, group->fd, errp);
     if (ret) {
         goto free_container_exit;
     }
 
+    ret = vfio_ram_block_discard_disable(container, true);
+    if (ret) {
+        error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
+        goto free_container_exit;
+    }
+
     switch (container->iommu_type) {
     case VFIO_TYPE1v2_IOMMU:
     case VFIO_TYPE1_IOMMU:
@@ -1798,7 +2084,10 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
 
         vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes);
         container->pgsizes = info->iova_pgsizes;
+        /* The default in the kernel ("dma_entry_limit") is 65535. */
+        container->dma_max_mappings = 65535;
         if (!ret) {
+            vfio_get_info_dma_avail(info, &container->dma_max_mappings);
             vfio_get_iommu_info_migration(container, info);
         }
         g_free(info);
@@ -1820,7 +2109,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
             if (ret) {
                 error_setg_errno(errp, errno, "failed to enable container");
                 ret = -errno;
-                goto free_container_exit;
+                goto enable_discards_exit;
             }
         } else {
             container->prereg_listener = vfio_prereg_listener;
@@ -1832,7 +2121,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
                 ret = -1;
                 error_propagate_prepend(errp, container->error,
                     "RAM memory listener initialization failed: ");
-                goto free_container_exit;
+                goto enable_discards_exit;
             }
         }
@@ -1845,7 +2134,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
             if (v2) {
                 memory_listener_unregister(&container->prereg_listener);
             }
-            goto free_container_exit;
+            goto enable_discards_exit;
         }
 
         if (v2) {
@@ -1860,7 +2149,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
             if (ret) {
                 error_setg_errno(errp, -ret,
                                  "failed to remove existing window");
-                goto free_container_exit;
+                goto enable_discards_exit;
             }
         } else {
             /* The default table uses 4K pages */
@@ -1901,6 +2190,9 @@ listener_release_exit:
     vfio_kvm_device_del_group(group);
     vfio_listener_release(container);
 
+enable_discards_exit:
+    vfio_ram_block_discard_disable(container, false);
+
 free_container_exit:
     g_free(container);
 
@@ -1908,7 +2200,6 @@ close_fd_exit:
     close(fd);
 
 put_space_exit:
-    ram_block_discard_disable(false);
     vfio_put_address_space(space);
 
     return ret;
@@ -2030,7 +2321,7 @@ void vfio_put_group(VFIOGroup *group)
     }
 
     if (!group->ram_block_discard_allowed) {
-        ram_block_discard_disable(false);
+        vfio_ram_block_discard_disable(group->container, false);
     }
     vfio_kvm_device_del_group(group);
     vfio_disconnect_container(group);
@@ -2084,7 +2375,7 @@ int vfio_get_device(VFIOGroup *group, const char *name,
 
         if (!group->ram_block_discard_allowed) {
             group->ram_block_discard_allowed = true;
-            ram_block_discard_disable(false);
+            vfio_ram_block_discard_disable(group->container, false);
         }
     }
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index 75aa7d6f1b..df91e454b2 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -145,7 +145,173 @@ static bool virtio_mem_is_busy(void)
     return migration_in_incoming_postcopy() || !migration_is_idle();
 }
 
-static bool virtio_mem_test_bitmap(VirtIOMEM *vmem, uint64_t start_gpa,
+typedef int (*virtio_mem_range_cb)(const VirtIOMEM *vmem, void *arg,
+                                   uint64_t offset, uint64_t size);
+
+static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg,
+                                               virtio_mem_range_cb cb)
+{
+    unsigned long first_zero_bit, last_zero_bit;
+    uint64_t offset, size;
+    int ret = 0;
+
+    first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size);
+    while (first_zero_bit < vmem->bitmap_size) {
+        offset = first_zero_bit * vmem->block_size;
+        last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
+                                      first_zero_bit + 1) - 1;
+        size = (last_zero_bit - first_zero_bit + 1) * vmem->block_size;
+
+        ret = cb(vmem, arg, offset, size);
+        if (ret) {
+            break;
+        }
+        first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
+                                            last_zero_bit + 2);
+    }
+    return ret;
+}
+
+/*
+ * Adjust the memory section to cover the intersection with the given range.
+ *
+ * Returns false if the intersection is empty, otherwise returns true.
+ */
+static bool virtio_mem_intersect_memory_section(MemoryRegionSection *s,
+                                                uint64_t offset, uint64_t size)
+{
+    uint64_t start = MAX(s->offset_within_region, offset);
+    uint64_t end = MIN(s->offset_within_region + int128_get64(s->size),
+                       offset + size);
+
+    if (end <= start) {
+        return false;
+    }
+
+    s->offset_within_address_space += start - s->offset_within_region;
+    s->offset_within_region = start;
+    s->size = int128_make64(end - start);
+    return true;
+}
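[Editor's note] To illustrate the adjustment above with hypothetical numbers:
for a section with offset_within_region = 0x0,
offset_within_address_space = 0x100000000 and size = 0x40000000, intersecting
with offset = 0x10000000, size = 0x10000000 yields start = MAX(0x0, 0x10000000)
and end = MIN(0x40000000, 0x20000000), i.e. an adjusted section with
offset_within_region = 0x10000000, offset_within_address_space = 0x110000000
and size = 0x10000000. A range entirely outside the section makes end <= start,
so the helper returns false and the caller skips that notification.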
+
+typedef int (*virtio_mem_section_cb)(MemoryRegionSection *s, void *arg);
+
+static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem,
+                                               MemoryRegionSection *s,
+                                               void *arg,
+                                               virtio_mem_section_cb cb)
+{
+    unsigned long first_bit, last_bit;
+    uint64_t offset, size;
+    int ret = 0;
+
+    first_bit = s->offset_within_region / vmem->block_size;
+    first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
+    while (first_bit < vmem->bitmap_size) {
+        MemoryRegionSection tmp = *s;
+
+        offset = first_bit * vmem->block_size;
+        last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
+                                      first_bit + 1) - 1;
+        size = (last_bit - first_bit + 1) * vmem->block_size;
+
+        if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
+            break;
+        }
+        ret = cb(&tmp, arg);
+        if (ret) {
+            break;
+        }
+        first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
+                                  last_bit + 2);
+    }
+    return ret;
+}
+
+static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg)
+{
+    RamDiscardListener *rdl = arg;
+
+    return rdl->notify_populate(rdl, s);
+}
+
+static int virtio_mem_notify_discard_cb(MemoryRegionSection *s, void *arg)
+{
+    RamDiscardListener *rdl = arg;
+
+    rdl->notify_discard(rdl, s);
+    return 0;
+}
+
+static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset,
+                                     uint64_t size)
+{
+    RamDiscardListener *rdl;
+
+    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
+        MemoryRegionSection tmp = *rdl->section;
+
+        if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
+            continue;
+        }
+        rdl->notify_discard(rdl, &tmp);
+    }
+}
+
+static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset,
+                                  uint64_t size)
+{
+    RamDiscardListener *rdl, *rdl2;
+    int ret = 0;
+
+    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
+        MemoryRegionSection tmp = *rdl->section;
+
+        if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
+            continue;
+        }
+        ret = rdl->notify_populate(rdl, &tmp);
+        if (ret) {
+            break;
+        }
+    }
+
+    if (ret) {
+        /* Notify all already-notified listeners. */
+        QLIST_FOREACH(rdl2, &vmem->rdl_list, next) {
+            MemoryRegionSection tmp = *rdl2->section;
+
+            if (rdl2 == rdl) {
+                break;
+            }
+            if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
+                continue;
+            }
+            rdl2->notify_discard(rdl2, &tmp);
+        }
+    }
+    return ret;
+}
+
+static void virtio_mem_notify_unplug_all(VirtIOMEM *vmem)
+{
+    RamDiscardListener *rdl;
+
+    if (!vmem->size) {
+        return;
+    }
+
+    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
+        if (rdl->double_discard_supported) {
+            rdl->notify_discard(rdl, rdl->section);
+        } else {
+            virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
+                                                virtio_mem_notify_discard_cb);
+        }
+    }
+}
+
+static bool virtio_mem_test_bitmap(const VirtIOMEM *vmem, uint64_t start_gpa,
                                    uint64_t size, bool plugged)
 {
     const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
@@ -198,7 +364,8 @@ static void virtio_mem_send_response_simple(VirtIOMEM *vmem,
     virtio_mem_send_response(vmem, elem, &resp);
 }
 
-static bool virtio_mem_valid_range(VirtIOMEM *vmem, uint64_t gpa, uint64_t size)
+static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa,
+                                   uint64_t size)
 {
     if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) {
         return false;
@@ -219,19 +386,21 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
                                       uint64_t size, bool plug)
 {
     const uint64_t offset = start_gpa - vmem->addr;
-    int ret;
+    RAMBlock *rb = vmem->memdev->mr.ram_block;
 
     if (virtio_mem_is_busy()) {
         return -EBUSY;
     }
 
     if (!plug) {
-        ret = ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size);
-        if (ret) {
-            error_report("Unexpected error discarding RAM: %s",
-                         strerror(-ret));
+        if (ram_block_discard_range(rb, offset, size)) {
             return -EBUSY;
         }
+        virtio_mem_notify_unplug(vmem, offset, size);
+    } else if (virtio_mem_notify_plug(vmem, offset, size)) {
+        /* Could be that a mapping attempt resulted in memory getting populated. */
+        ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size);
+        return -EBUSY;
     }
     virtio_mem_set_bitmap(vmem, start_gpa, size, plug);
     return 0;
@@ -318,17 +487,16 @@ static void virtio_mem_resize_usable_region(VirtIOMEM *vmem,
 static int virtio_mem_unplug_all(VirtIOMEM *vmem)
 {
     RAMBlock *rb = vmem->memdev->mr.ram_block;
-    int ret;
 
     if (virtio_mem_is_busy()) {
         return -EBUSY;
     }
 
-    ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
-    if (ret) {
-        error_report("Unexpected error discarding RAM: %s", strerror(-ret));
+    if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
         return -EBUSY;
     }
+    virtio_mem_notify_unplug_all(vmem);
+
     bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size);
     if (vmem->size) {
         vmem->size = 0;
@@ -551,7 +719,7 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
-    if (ram_block_discard_require(true)) {
+    if (ram_block_coordinated_discard_require(true)) {
         error_setg(errp, "Discarding RAM is disabled");
         return;
     }
@@ -559,7 +727,7 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
     ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
     if (ret) {
         error_setg_errno(errp, -ret, "Unexpected error discarding RAM");
-        ram_block_discard_require(false);
+        ram_block_coordinated_discard_require(false);
         return;
     }
@@ -577,6 +745,13 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
     vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
     qemu_register_reset(virtio_mem_system_reset, vmem);
     precopy_add_notifier(&vmem->precopy_notifier);
+
+    /*
+     * Set ourselves as RamDiscardManager before the plug handler maps the
+     * memory region and exposes it via an address space.
+     */
+    memory_region_set_ram_discard_manager(&vmem->memdev->mr,
+                                          RAM_DISCARD_MANAGER(vmem));
 }
 
 static void virtio_mem_device_unrealize(DeviceState *dev)
@@ -584,6 +759,11 @@ static void virtio_mem_device_unrealize(DeviceState *dev)
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIOMEM *vmem = VIRTIO_MEM(dev);
 
+    /*
+     * The unplug handler unmapped the memory region, it cannot be
+     * found via an address space anymore. Unset ourselves.
+     */
+    memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
     precopy_remove_notifier(&vmem->precopy_notifier);
     qemu_unregister_reset(virtio_mem_system_reset, vmem);
     vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
@@ -591,43 +771,47 @@ static void virtio_mem_device_unrealize(DeviceState *dev)
     virtio_del_queue(vdev, 0);
     virtio_cleanup(vdev);
     g_free(vmem->bitmap);
-    ram_block_discard_require(false);
+    ram_block_coordinated_discard_require(false);
 }
 
-static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)
+static int virtio_mem_discard_range_cb(const VirtIOMEM *vmem, void *arg,
+                                       uint64_t offset, uint64_t size)
 {
     RAMBlock *rb = vmem->memdev->mr.ram_block;
-    unsigned long first_zero_bit, last_zero_bit;
-    uint64_t offset, length;
-    int ret;
 
-    /* Find consecutive unplugged blocks and discard the consecutive range. */
-    first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size);
-    while (first_zero_bit < vmem->bitmap_size) {
-        offset = first_zero_bit * vmem->block_size;
-        last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
-                                      first_zero_bit + 1) - 1;
-        length = (last_zero_bit - first_zero_bit + 1) * vmem->block_size;
+    return ram_block_discard_range(rb, offset, size) ? -EINVAL : 0;
+}
 
-        ret = ram_block_discard_range(rb, offset, length);
-        if (ret) {
-            error_report("Unexpected error discarding RAM: %s",
-                         strerror(-ret));
-            return -EINVAL;
-        }
-        first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
-                                            last_zero_bit + 2);
-    }
-    return 0;
+static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)
+{
+    /* Make sure all memory is really discarded after migration. */
+    return virtio_mem_for_each_unplugged_range(vmem, NULL,
+                                               virtio_mem_discard_range_cb);
 }
 
 static int virtio_mem_post_load(void *opaque, int version_id)
 {
+    VirtIOMEM *vmem = VIRTIO_MEM(opaque);
+    RamDiscardListener *rdl;
+    int ret;
+
+    /*
+     * We started out with all memory discarded and our memory region is mapped
+     * into an address space. Replay, now that we updated the bitmap.
+     */
+    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
+        ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
+                                                  virtio_mem_notify_populate_cb);
+        if (ret) {
+            return ret;
+        }
+    }
+
     if (migration_in_incoming_postcopy()) {
         return 0;
     }
 
-    return virtio_mem_restore_unplugged(VIRTIO_MEM(opaque));
+    return virtio_mem_restore_unplugged(vmem);
 }
 
 typedef struct VirtIOMEMMigSanityChecks {
@@ -702,6 +886,7 @@ static const VMStateDescription vmstate_virtio_mem_device = {
     .name = "virtio-mem-device",
     .minimum_version_id = 1,
     .version_id = 1,
+    .priority = MIG_PRI_VIRTIO_MEM,
     .post_load = virtio_mem_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks,
@@ -872,28 +1057,19 @@ static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
     vmem->block_size = value;
 }
 
-static void virtio_mem_precopy_exclude_unplugged(VirtIOMEM *vmem)
+static int virtio_mem_precopy_exclude_range_cb(const VirtIOMEM *vmem, void *arg,
+                                               uint64_t offset, uint64_t size)
 {
     void * const host = qemu_ram_get_host_addr(vmem->memdev->mr.ram_block);
-    unsigned long first_zero_bit, last_zero_bit;
-    uint64_t offset, length;
 
-    /*
-     * Find consecutive unplugged blocks and exclude them from migration.
-     *
-     * Note: Blocks cannot get (un)plugged during precopy, no locking needed.
-     */
-    first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size);
-    while (first_zero_bit < vmem->bitmap_size) {
-        offset = first_zero_bit * vmem->block_size;
-        last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
-                                      first_zero_bit + 1) - 1;
-        length = (last_zero_bit - first_zero_bit + 1) * vmem->block_size;
+    qemu_guest_free_page_hint(host + offset, size);
+    return 0;
+}
 
-        qemu_guest_free_page_hint(host + offset, length);
-        first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
-                                            last_zero_bit + 2);
-    }
+static void virtio_mem_precopy_exclude_unplugged(VirtIOMEM *vmem)
+{
+    virtio_mem_for_each_unplugged_range(vmem, NULL,
+                                        virtio_mem_precopy_exclude_range_cb);
 }
 
 static int virtio_mem_precopy_notify(NotifierWithReturn *n, void *data)
@@ -918,6 +1094,7 @@ static void virtio_mem_instance_init(Object *obj)
     notifier_list_init(&vmem->size_change_notifiers);
     vmem->precopy_notifier.notify = virtio_mem_precopy_notify;
+    QLIST_INIT(&vmem->rdl_list);
 
     object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size,
                         NULL, NULL, NULL);
@@ -937,11 +1114,107 @@ static Property virtio_mem_properties[] = {
     DEFINE_PROP_END_OF_LIST(),
 };
 
+static uint64_t virtio_mem_rdm_get_min_granularity(const RamDiscardManager *rdm,
+                                                   const MemoryRegion *mr)
+{
+    const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
+
+    g_assert(mr == &vmem->memdev->mr);
+    return vmem->block_size;
+}
+
+static bool virtio_mem_rdm_is_populated(const RamDiscardManager *rdm,
+                                        const MemoryRegionSection *s)
+{
+    const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
+    uint64_t start_gpa = vmem->addr + s->offset_within_region;
+    uint64_t end_gpa = start_gpa + int128_get64(s->size);
+
+    g_assert(s->mr == &vmem->memdev->mr);
+
+    start_gpa = QEMU_ALIGN_DOWN(start_gpa, vmem->block_size);
+    end_gpa = QEMU_ALIGN_UP(end_gpa, vmem->block_size);
+
+    if (!virtio_mem_valid_range(vmem, start_gpa, end_gpa - start_gpa)) {
+        return false;
+    }
+
+    return virtio_mem_test_bitmap(vmem, start_gpa, end_gpa - start_gpa, true);
+}
+
+struct VirtIOMEMReplayData {
+    void *fn;
+    void *opaque;
+};
+
+static int virtio_mem_rdm_replay_populated_cb(MemoryRegionSection *s, void *arg)
+{
+    struct VirtIOMEMReplayData *data = arg;
+
+    return ((ReplayRamPopulate)data->fn)(s, data->opaque);
+}
+
+static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm,
+                                           MemoryRegionSection *s,
+                                           ReplayRamPopulate replay_fn,
+                                           void *opaque)
+{
+    const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
+    struct VirtIOMEMReplayData data = {
+        .fn = replay_fn,
+        .opaque = opaque,
+    };
+
+    g_assert(s->mr == &vmem->memdev->mr);
+    return virtio_mem_for_each_plugged_section(vmem, s, &data,
+                                               virtio_mem_rdm_replay_populated_cb);
+}
+
+static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm,
+                                             RamDiscardListener *rdl,
+                                             MemoryRegionSection *s)
+{
+    VirtIOMEM *vmem = VIRTIO_MEM(rdm);
+    int ret;
+
+    g_assert(s->mr == &vmem->memdev->mr);
+    rdl->section = memory_region_section_new_copy(s);
+
+    QLIST_INSERT_HEAD(&vmem->rdl_list, rdl, next);
+    ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
+                                              virtio_mem_notify_populate_cb);
+    if (ret) {
+        error_report("%s: Replaying plugged ranges failed: %s", __func__,
+                     strerror(-ret));
+    }
+}
+
+static void virtio_mem_rdm_unregister_listener(RamDiscardManager *rdm,
+                                               RamDiscardListener *rdl)
+{
+    VirtIOMEM *vmem = VIRTIO_MEM(rdm);
+
+    g_assert(rdl->section->mr == &vmem->memdev->mr);
+    if (vmem->size) {
+        if (rdl->double_discard_supported) {
+            rdl->notify_discard(rdl, rdl->section);
+        } else {
+            virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
+                                                virtio_mem_notify_discard_cb);
+        }
+    }
+
+    memory_region_section_free_copy(rdl->section);
+    rdl->section = NULL;
+    QLIST_REMOVE(rdl, next);
+}
+
 static void virtio_mem_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
     VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass);
+    RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(klass);
 
     device_class_set_props(dc, virtio_mem_properties);
     dc->vmsd = &vmstate_virtio_mem;
@@ -957,6 +1230,12 @@ static void virtio_mem_class_init(ObjectClass *klass, void *data)
     vmc->get_memory_region = virtio_mem_get_memory_region;
     vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier;
     vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier;
+
+    rdmc->get_min_granularity = virtio_mem_rdm_get_min_granularity;
+    rdmc->is_populated = virtio_mem_rdm_is_populated;
+    rdmc->replay_populated = virtio_mem_rdm_replay_populated;
+    rdmc->register_listener = virtio_mem_rdm_register_listener;
+    rdmc->unregister_listener = virtio_mem_rdm_unregister_listener;
 }
 
 static const TypeInfo virtio_mem_info = {
@@ -966,6 +1245,10 @@ static const TypeInfo virtio_mem_info = {
     .instance_init = virtio_mem_instance_init,
     .class_init = virtio_mem_class_init,
     .class_size = sizeof(VirtIOMEMClass),
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_RAM_DISCARD_MANAGER },
+        { }
+    },
 };
 
 static void virtio_register_types(void)
diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c
index 5952471b38..1af48a1b04 100644
--- a/hw/virtio/virtio-mmio.c
+++ b/hw/virtio/virtio-mmio.c
@@ -29,6 +29,7 @@
 #include "qemu/host-utils.h"
 #include "qemu/module.h"
 #include "sysemu/kvm.h"
+#include "sysemu/replay.h"
 #include "hw/virtio/virtio-mmio.h"
 #include "qemu/error-report.h"
 #include "qemu/log.h"
@@ -740,6 +741,11 @@ static void virtio_mmio_realizefn(DeviceState *d, Error **errp)
         proxy->flags &= ~VIRTIO_IOMMIO_FLAG_USE_IOEVENTFD;
     }
 
+    /* fd-based ioevents can't be synchronized in record/replay */
+    if (replay_mode != REPLAY_MODE_NONE) {
+        proxy->flags &= ~VIRTIO_IOMMIO_FLAG_USE_IOEVENTFD;
+    }
+
     if (proxy->legacy) {
         memory_region_init_io(&proxy->iomem, OBJECT(d),
                               &virtio_legacy_mem_ops, proxy,
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index b321604d9b..433060ac02 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -37,6 +37,7 @@
 #include "qemu/range.h"
 #include "hw/virtio/virtio-bus.h"
 #include "qapi/visitor.h"
+#include "sysemu/replay.h"
 
 #define VIRTIO_PCI_REGION_SIZE(dev)     VIRTIO_PCI_CONFIG_OFF(msix_present(dev))
 
@@ -423,6 +424,11 @@ static uint64_t virtio_pci_config_read(void *opaque, hwaddr addr,
     VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
     uint32_t config = VIRTIO_PCI_CONFIG_SIZE(&proxy->pci_dev);
     uint64_t val = 0;
+
+    if (vdev == NULL) {
+        return UINT64_MAX;
+    }
+
     if (addr < config) {
         return virtio_ioport_read(proxy, addr);
     }
@@ -454,6 +460,11 @@ static void virtio_pci_config_write(void *opaque, hwaddr addr,
     VirtIOPCIProxy *proxy = opaque;
     uint32_t config = VIRTIO_PCI_CONFIG_SIZE(&proxy->pci_dev);
     VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+    if (vdev == NULL) {
+        return;
+    }
+
     if (addr < config) {
         virtio_ioport_write(proxy, addr, val);
         return;
@@ -1146,6 +1157,10 @@ static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr,
     uint32_t val = 0;
     int i;
 
+    if (vdev == NULL) {
+        return UINT64_MAX;
+    }
+
     switch (addr) {
     case VIRTIO_PCI_COMMON_DFSELECT:
         val = proxy->dfselect;
@@ -1229,6 +1244,10 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr,
     VirtIOPCIProxy *proxy = opaque;
     VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
 
+    if (vdev == NULL) {
+        return;
+    }
+
     switch (addr) {
     case VIRTIO_PCI_COMMON_DFSELECT:
         proxy->dfselect = val;
@@ -1330,6 +1349,11 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr,
 static uint64_t virtio_pci_notify_read(void *opaque, hwaddr addr,
                                        unsigned size)
 {
+    VirtIOPCIProxy *proxy = opaque;
+    if (virtio_bus_get_device(&proxy->bus) == NULL) {
+        return UINT64_MAX;
+    }
+
     return 0;
 }
 
@@ -1367,7 +1391,7 @@ static uint64_t virtio_pci_isr_read(void *opaque, hwaddr addr,
     uint64_t val;
 
     if (vdev == NULL) {
-        return 0;
+        return UINT64_MAX;
     }
 
     val = qatomic_xchg(&vdev->isr, 0);
@@ -1388,7 +1412,7 @@ static uint64_t virtio_pci_device_read(void *opaque, hwaddr addr,
     uint64_t val;
 
     if (vdev == NULL) {
-        return 0;
+        return UINT64_MAX;
     }
 
     switch (size) {
@@ -1760,6 +1784,11 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
         proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
     }
 
+    /* fd-based ioevents can't be synchronized in record/replay */
+    if (replay_mode != REPLAY_MODE_NONE) {
+        proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
+    }
+
     /*
      * virtio pci bar layout used by default.
      * subclasses can re-arrange things if needed.
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index ab516ac614..6dcf3baf56 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -3728,6 +3728,10 @@ static int virtio_device_start_ioeventfd_impl(VirtIODevice *vdev)
     VirtioBusState *qbus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev)));
     int i, n, r, err;
 
+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
     memory_region_transaction_begin();
     for (n = 0; n < VIRTIO_QUEUE_MAX; n++) {
         VirtQueue *vq = &vdev->vq[n];
@@ -3766,6 +3770,10 @@ assign_error:
         r = virtio_bus_set_host_notifier(qbus, n, false);
         assert(r >= 0);
     }
+    /*
+     * The transaction expects the ioeventfds to be open when it
+     * commits. Do it now, before the cleanup loop.
+     */
     memory_region_transaction_commit();
 
     while (--i >= 0) {
@@ -3790,6 +3798,10 @@ static void virtio_device_stop_ioeventfd_impl(VirtIODevice *vdev)
     VirtioBusState *qbus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev)));
     int n, r;
 
+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
     memory_region_transaction_begin();
     for (n = 0; n < VIRTIO_QUEUE_MAX; n++) {
         VirtQueue *vq = &vdev->vq[n];
@@ -3801,6 +3813,10 @@ static void virtio_device_stop_ioeventfd_impl(VirtIODevice *vdev)
         r = virtio_bus_set_host_notifier(qbus, n, false);
         assert(r >= 0);
     }
+    /*
+     * The transaction expects the ioeventfds to be open when it
+     * commits. Do it now, before the cleanup loop.
+     */
     memory_region_transaction_commit();
 
     for (n = 0; n < VIRTIO_QUEUE_MAX; n++) {
diff --git a/include/block/aio.h b/include/block/aio.h
index 10fcae1515..807edce9b5 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -292,19 +292,44 @@ void aio_context_acquire(AioContext *ctx);
 void aio_context_release(AioContext *ctx);
 
 /**
+ * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
+ * run only once and as soon as possible.
+ *
+ * @name: A human-readable identifier for debugging purposes.
+ */
+void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
+                                  const char *name);
+
+/**
  * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run
  * only once and as soon as possible.
+ *
+ * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the
+ * name string.
 */
-void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque);
+#define aio_bh_schedule_oneshot(ctx, cb, opaque) \
+    aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb)))
 
 /**
- * aio_bh_new: Allocate a new bottom half structure.
+ * aio_bh_new_full: Allocate a new bottom half structure.
 *
 * Bottom halves are lightweight callbacks whose invocation is guaranteed
 * to be wait-free, thread-safe and signal-safe. The #QEMUBH structure
 * is opaque and must be allocated prior to its use.
+ *
+ * @name: A human-readable identifier for debugging purposes.
+ */
+QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
+                        const char *name);
+
+/**
+ * aio_bh_new: Allocate a new bottom half structure.
+ *
+ * A convenience wrapper for aio_bh_new_full() that uses cb as the name
+ * string.
 */
-QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque);
+#define aio_bh_new(ctx, cb, opaque) \
+    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)))
 
 /**
 * aio_notify: Force processing of pending events.
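[Editor's note] A quick illustration of the naming convenience introduced
above (my_bh_cb and ctx are hypothetical):

    static void my_bh_cb(void *opaque)
    {
        /* runs once, as soon as possible, in the AioContext */
    }

    QEMUBH *bh = aio_bh_new(ctx, my_bh_cb, NULL);
    /* expands to: aio_bh_new_full(ctx, my_bh_cb, NULL, "my_bh_cb") */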
diff --git a/include/exec/memory.h b/include/exec/memory.h
index b116f7c64e..c3d417d317 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -42,6 +42,12 @@ typedef struct IOMMUMemoryRegionClass IOMMUMemoryRegionClass;
 DECLARE_OBJ_CHECKERS(IOMMUMemoryRegion, IOMMUMemoryRegionClass,
                      IOMMU_MEMORY_REGION, TYPE_IOMMU_MEMORY_REGION)
 
+#define TYPE_RAM_DISCARD_MANAGER "qemu:ram-discard-manager"
+typedef struct RamDiscardManagerClass RamDiscardManagerClass;
+typedef struct RamDiscardManager RamDiscardManager;
+DECLARE_OBJ_CHECKERS(RamDiscardManager, RamDiscardManagerClass,
+                     RAM_DISCARD_MANAGER, TYPE_RAM_DISCARD_MANAGER);
+
 #ifdef CONFIG_FUZZ
 void fuzz_dma_read_cb(size_t addr,
                       size_t len,
@@ -65,6 +71,28 @@ struct ReservedRegion {
     unsigned type;
 };
 
+/**
+ * struct MemoryRegionSection: describes a fragment of a #MemoryRegion
+ *
+ * @mr: the region, or %NULL if empty
+ * @fv: the flat view of the address space the region is mapped in
+ * @offset_within_region: the beginning of the section, relative to @mr's start
+ * @size: the size of the section; will not exceed @mr's boundaries
+ * @offset_within_address_space: the address of the first byte of the section
+ *     relative to the region's address space
+ * @readonly: writes to this section are ignored
+ * @nonvolatile: this section is non-volatile
+ */
+struct MemoryRegionSection {
+    Int128 size;
+    MemoryRegion *mr;
+    FlatView *fv;
+    hwaddr offset_within_region;
+    hwaddr offset_within_address_space;
+    bool readonly;
+    bool nonvolatile;
+};
+
 typedef struct IOMMUTLBEntry IOMMUTLBEntry;
 
 /* See address_space_translate: bit 0 is read, bit 1 is write. */
@@ -448,6 +476,206 @@ struct IOMMUMemoryRegionClass {
                                      Error **errp);
 };
 
+typedef struct RamDiscardListener RamDiscardListener;
+typedef int (*NotifyRamPopulate)(RamDiscardListener *rdl,
+                                 MemoryRegionSection *section);
+typedef void (*NotifyRamDiscard)(RamDiscardListener *rdl,
+                                 MemoryRegionSection *section);
+
+struct RamDiscardListener {
+    /*
+     * @notify_populate:
+     *
+     * Notification that previously discarded memory is about to get populated.
+     * Listeners are able to object. If any listener objects, already
+     * successfully notified listeners are notified about a discard again.
+     *
+     * @rdl: the #RamDiscardListener getting notified
+     * @section: the #MemoryRegionSection to get populated. The section
+     *           is aligned within the memory region to the minimum granularity
+     *           unless it would exceed the registered section.
+     *
+     * Returns 0 on success. If the notification is rejected by the listener,
+     * an error is returned.
+     */
+    NotifyRamPopulate notify_populate;
+
+    /*
+     * @notify_discard:
+     *
+     * Notification that previously populated memory was discarded successfully
+     * and listeners should drop all references to such memory and prevent
+     * new population (e.g., unmap).
+     *
+     * @rdl: the #RamDiscardListener getting notified
+     * @section: the #MemoryRegionSection to get populated. The section
+     *           is aligned within the memory region to the minimum granularity
+     *           unless it would exceed the registered section.
+     */
+    NotifyRamDiscard notify_discard;
+
+    /*
+     * @double_discard_supported:
+     *
+     * The listener supports getting @notify_discard notifications that span
+     * already discarded parts.
+     */
+    bool double_discard_supported;
+
+    MemoryRegionSection *section;
+    QLIST_ENTRY(RamDiscardListener) next;
+};
+
+static inline void ram_discard_listener_init(RamDiscardListener *rdl,
+                                             NotifyRamPopulate populate_fn,
+                                             NotifyRamDiscard discard_fn,
+                                             bool double_discard_supported)
+{
+    rdl->notify_populate = populate_fn;
+    rdl->notify_discard = discard_fn;
+    rdl->double_discard_supported = double_discard_supported;
+}
+
+typedef int (*ReplayRamPopulate)(MemoryRegionSection *section, void *opaque);
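[Editor's note] A minimal sketch of how a consumer wires these hooks together
(the my_* names are hypothetical; the real consumer added in this series is
hw/vfio/common.c):

    static int my_populate(RamDiscardListener *rdl, MemoryRegionSection *s)
    {
        /* e.g., DMA-map the newly populated range */
        return 0;
    }

    static void my_discard(RamDiscardListener *rdl, MemoryRegionSection *s)
    {
        /* e.g., DMA-unmap the discarded range */
    }

    void my_register(MemoryRegionSection *section)
    {
        RamDiscardManager *rdm =
            memory_region_get_ram_discard_manager(section->mr);
        static RamDiscardListener rdl;

        ram_discard_listener_init(&rdl, my_populate, my_discard, true);
        /* replays all currently populated parts via my_populate() */
        ram_discard_manager_register_listener(rdm, &rdl, section);
    }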
+
+/*
+ * RamDiscardManagerClass:
+ *
+ * A #RamDiscardManager coordinates which parts of specific RAM #MemoryRegion
+ * regions are currently populated to be used/accessed by the VM, notifying
+ * after parts were discarded (freeing up memory) and before parts will be
+ * populated (consuming memory), to be used/accessed by the VM.
+ *
+ * A #RamDiscardManager can only be set for a RAM #MemoryRegion while the
+ * #MemoryRegion isn't mapped yet; it cannot change while the #MemoryRegion is
+ * mapped.
+ *
+ * The #RamDiscardManager is intended to be used by technologies that are
+ * incompatible with discarding of RAM (e.g., VFIO, which may pin all
+ * memory inside a #MemoryRegion), and require proper coordination to only
+ * map the currently populated parts, to hinder parts that are expected to
+ * remain discarded from silently getting populated and consuming memory.
+ * Technologies that support discarding of RAM don't have to bother and can
+ * simply map the whole #MemoryRegion.
+ *
+ * An example #RamDiscardManager is virtio-mem, which logically (un)plugs
+ * memory within an assigned RAM #MemoryRegion, coordinated with the VM.
+ * Logically unplugging memory consists of discarding RAM. The VM agreed to not
+ * access unplugged (discarded) memory - especially via DMA. virtio-mem will
+ * properly coordinate with listeners before memory is plugged (populated),
+ * and after memory is unplugged (discarded).
+ *
+ * Listeners are called in multiples of the minimum granularity (unless it
+ * would exceed the registered range) and changes are aligned to the minimum
+ * granularity within the #MemoryRegion. Listeners have to prepare for memory
+ * becoming discarded in a different granularity than it was populated and the
+ * other way around.
+ */ +struct RamDiscardManagerClass { + /* private */ + InterfaceClass parent_class; + + /* public */ + + /** + * @get_min_granularity: + * + * Get the minimum granularity at which listeners will get notified + * about changes within the #MemoryRegion via the #RamDiscardManager. + * + * @rdm: the #RamDiscardManager + * @mr: the #MemoryRegion + * + * Returns the minimum granularity. + */ + uint64_t (*get_min_granularity)(const RamDiscardManager *rdm, + const MemoryRegion *mr); + + /** + * @is_populated: + * + * Check whether the given #MemoryRegionSection is completely populated + * (i.e., no parts are currently discarded) via the #RamDiscardManager. + * There are no alignment requirements. + * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * + * Returns whether the given range is completely populated. + */ + bool (*is_populated)(const RamDiscardManager *rdm, + const MemoryRegionSection *section); + + /** + * @replay_populated: + * + * Call the #ReplayRamPopulate callback for all populated parts within the + * #MemoryRegionSection via the #RamDiscardManager. + * + * In case any call fails, no further calls are made. + * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * @replay_fn: the #ReplayRamPopulate callback + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if any notification failed. + */ + int (*replay_populated)(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamPopulate replay_fn, void *opaque); + + /** + * @register_listener: + * + * Register a #RamDiscardListener for the given #MemoryRegionSection and + * immediately notify the #RamDiscardListener about all populated parts + * within the #MemoryRegionSection via the #RamDiscardManager. + * + * In case any notification fails, no further notifications are triggered + * and an error is logged. + * + * @rdm: the #RamDiscardManager + * @rdl: the #RamDiscardListener + * @section: the #MemoryRegionSection + */ + void (*register_listener)(RamDiscardManager *rdm, + RamDiscardListener *rdl, + MemoryRegionSection *section); + + /** + * @unregister_listener: + * + * Unregister a previously registered #RamDiscardListener via the + * #RamDiscardManager after notifying the #RamDiscardListener about all + * populated parts becoming unpopulated within the registered + * #MemoryRegionSection. + * + * @rdm: the #RamDiscardManager + * @rdl: the #RamDiscardListener + */ + void (*unregister_listener)(RamDiscardManager *rdm, + RamDiscardListener *rdl); +};
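As an illustration of the listener half of this contract, here is a minimal sketch of a hypothetical consumer. Only ram_discard_listener_init(), the callback typedefs, and ram_discard_manager_register_listener() (declared just below) come from this patch; DummyState and the dummy_* helpers are invented names and the bookkeeping is deliberately trivial.

typedef struct DummyState {
    RamDiscardListener rdl;
    uint64_t populated_bytes;   /* hypothetical bookkeeping */
} DummyState;

static int dummy_notify_populate(RamDiscardListener *rdl,
                                 MemoryRegionSection *section)
{
    DummyState *s = container_of(rdl, DummyState, rdl);

    /* Returning a negative errno here would reject the population. */
    s->populated_bytes += int128_get64(section->size);
    return 0;
}

static void dummy_notify_discard(RamDiscardListener *rdl,
                                 MemoryRegionSection *section)
{
    DummyState *s = container_of(rdl, DummyState, rdl);

    s->populated_bytes -= int128_get64(section->size);
}

static void dummy_attach(DummyState *s, RamDiscardManager *rdm,
                         MemoryRegionSection *section)
{
    /* false: we cannot handle discards that span already-discarded parts. */
    ram_discard_listener_init(&s->rdl, dummy_notify_populate,
                              dummy_notify_discard, false);
    /* Registering replays notify_populate() for all populated parts. */
    ram_discard_manager_register_listener(rdm, &s->rdl, section);
}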
+ +uint64_t ram_discard_manager_get_min_granularity(const RamDiscardManager *rdm, + const MemoryRegion *mr); + +bool ram_discard_manager_is_populated(const RamDiscardManager *rdm, + const MemoryRegionSection *section); + +int ram_discard_manager_replay_populated(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamPopulate replay_fn, + void *opaque); + +void ram_discard_manager_register_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl, + MemoryRegionSection *section); + +void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl); + typedef struct CoalescedMemoryRange CoalescedMemoryRange; typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd; @@ -494,6 +722,7 @@ struct MemoryRegion { const char *name; unsigned ioeventfd_nb; MemoryRegionIoeventfd *ioeventfds; + RamDiscardManager *rdm; /* Only for RAM */ }; struct IOMMUMemoryRegion { @@ -825,28 +1054,6 @@ typedef bool (*flatview_cb)(Int128 start, */ void flatview_for_each_range(FlatView *fv, flatview_cb cb, void *opaque); -/** - * struct MemoryRegionSection: describes a fragment of a #MemoryRegion - * - * @mr: the region, or %NULL if empty - * @fv: the flat view of the address space the region is mapped in - * @offset_within_region: the beginning of the section, relative to @mr's start - * @size: the size of the section; will not exceed @mr's boundaries - * @offset_within_address_space: the address of the first byte of the section - * relative to the region's address space - * @readonly: writes to this section are ignored - * @nonvolatile: this section is non-volatile - */ -struct MemoryRegionSection { - Int128 size; - MemoryRegion *mr; - FlatView *fv; - hwaddr offset_within_region; - hwaddr offset_within_address_space; - bool readonly; - bool nonvolatile; -}; - static inline bool MemoryRegionSection_eq(MemoryRegionSection *a, MemoryRegionSection *b) { @@ -860,6 +1067,26 @@ static inline bool MemoryRegionSection_eq(MemoryRegionSection *a, } /** + * memory_region_section_new_copy: Copy a memory region section + * + * Allocate memory for a new copy, copy the memory region section, and + * properly take a reference on all relevant members. + * + * @s: the #MemoryRegionSection to copy + */ +MemoryRegionSection *memory_region_section_new_copy(MemoryRegionSection *s); + +/** + * memory_region_section_free_copy: Free a copied memory region section + * + * Free a copy of a memory section created via memory_region_section_new_copy(), + * properly dropping references on all relevant members. + * + * @s: the #MemoryRegionSection to free + */ +void memory_region_section_free_copy(MemoryRegionSection *s); + +/** * memory_region_init: Initialize a memory region * * The region typically acts as a container for other memory regions. Use @@ -2024,6 +2251,41 @@ bool memory_region_present(MemoryRegion *container, hwaddr addr); bool memory_region_is_mapped(MemoryRegion *mr); /** + * memory_region_get_ram_discard_manager: get the #RamDiscardManager for a + * #MemoryRegion + * + * The #RamDiscardManager cannot change while a memory region is mapped. + * + * @mr: the #MemoryRegion + */ +RamDiscardManager *memory_region_get_ram_discard_manager(MemoryRegion *mr);
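To show how these queries compose on the consumer side, a sketch loosely modeled on a DMA-mapping user that must only map populated parts; dummy_map_cb() and dummy_map_populated() are invented names, not part of this patch:

static int dummy_map_cb(MemoryRegionSection *section, void *opaque)
{
    /* e.g., establish a DMA mapping for this populated piece */
    return 0;
}

static int dummy_map_populated(MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);

    if (!rdm) {
        /* No manager: treat the whole section as populated. */
        return dummy_map_cb(section, NULL);
    }
    /* Walk only the currently populated parts; stops at the first error. */
    return ram_discard_manager_replay_populated(rdm, section, dummy_map_cb,
                                                NULL);
}

A consumer that needs to remember @section beyond such a call would keep a long-lived reference via memory_region_section_new_copy() and drop it again with memory_region_section_free_copy().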
+ +/** + * memory_region_has_ram_discard_manager: check whether a #MemoryRegion has a + * #RamDiscardManager assigned + * + * @mr: the #MemoryRegion + */ +static inline bool memory_region_has_ram_discard_manager(MemoryRegion *mr) +{ + return !!memory_region_get_ram_discard_manager(mr); +} + +/** + * memory_region_set_ram_discard_manager: set the #RamDiscardManager for a + * #MemoryRegion + * + * This function must not be called for a mapped #MemoryRegion, a #MemoryRegion + * that does not cover RAM, or a #MemoryRegion that already has a + * #RamDiscardManager assigned. + * + * @mr: the #MemoryRegion + * @rdm: #RamDiscardManager to set + */ +void memory_region_set_ram_discard_manager(MemoryRegion *mr, + RamDiscardManager *rdm); + +/** * memory_region_find: translate an address/size relative to a * MemoryRegion into a #MemoryRegionSection. * @@ -2632,6 +2894,12 @@ static inline MemOp devend_memop(enum device_endian end) int ram_block_discard_disable(bool state); /* + * See ram_block_discard_disable(): only disable uncoordinated discards, + * keeping coordinated discards (via the RamDiscardManager) enabled. + */ +int ram_block_uncoordinated_discard_disable(bool state); + +/* * Inhibit technologies that disable discarding of pages in RAM blocks. * * Returns 0 if successful. Returns -EBUSY if discards are already set to @@ -2640,12 +2908,20 @@ int ram_block_discard_disable(bool state); int ram_block_discard_require(bool state); /* - * Test if discarding of memory in ram blocks is disabled. + * See ram_block_discard_require(): only inhibit technologies that disable + * uncoordinated discarding of pages in RAM blocks, allowing co-existence with + * technologies that only inhibit uncoordinated discards (via the + * RamDiscardManager). + */ +int ram_block_coordinated_discard_require(bool state); + +/* + * Test if any discarding of memory in ram blocks is disabled. */ bool ram_block_discard_is_disabled(void); /* - * Test if discarding of memory in ram blocks is required to work reliably. + * Test if any discarding of memory in ram blocks is required to work reliably. */ bool ram_block_discard_is_required(void);
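The intended pairing of the new counters, sketched under the assumptions of this patch (function names invented, cleanup paths trimmed): a device whose discards are coordinated through a RamDiscardManager takes the coordinated-require side, while a consumer that can only cope with coordinated discards blocks the uncoordinated ones.

/* e.g., a virtio-mem-style device */
static int dummy_coordinated_device_realize(Error **errp)
{
    /* Fails with -EBUSY if any discarding of RAM is currently disabled. */
    if (ram_block_coordinated_discard_require(true)) {
        error_setg(errp, "Discarding RAM is disabled");
        return -EBUSY;
    }
    /* ... assign a RamDiscardManager to the device's memory region ... */
    return 0;
}

/* e.g., a vfio-style consumer that honors the RamDiscardManager */
static int dummy_consumer_attach(Error **errp)
{
    /*
     * Fails with -EBUSY if some other user already requires uncoordinated
     * discards to work, i.e., called ram_block_discard_require(true).
     */
    if (ram_block_uncoordinated_discard_disable(true)) {
        error_setg(errp, "Uncoordinated discarding of RAM cannot be disabled");
        return -EBUSY;
    }
    return 0;
}

Both calls are balanced with a matching false call on teardown.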
diff --git a/include/hw/block/block.h b/include/hw/block/block.h index c172cbe65f..5902c0440a 100644 --- a/include/hw/block/block.h +++ b/include/hw/block/block.h @@ -19,6 +19,7 @@ typedef struct BlockConf { BlockBackend *blk; + OnOffAuto backend_defaults; uint32_t physical_block_size; uint32_t logical_block_size; uint32_t min_io_size; @@ -48,6 +49,8 @@ static inline unsigned int get_physical_block_exp(BlockConf *conf) } #define DEFINE_BLOCK_PROPERTIES_BASE(_state, _conf) \ + DEFINE_PROP_ON_OFF_AUTO("backend_defaults", _state, \ + _conf.backend_defaults, ON_OFF_AUTO_AUTO), \ DEFINE_PROP_BLOCKSIZE("logical_block_size", _state, \ _conf.logical_block_size), \ DEFINE_PROP_BLOCKSIZE("physical_block_size", _state, \ diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 6141162d7a..8af11b0a76 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -88,9 +88,11 @@ typedef struct VFIOContainer { uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; unsigned long pgsizes; + unsigned int dma_max_mappings; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_ENTRY(VFIOContainer) next; } VFIOContainer; @@ -102,6 +104,16 @@ typedef struct VFIOGuestIOMMU { QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; } VFIOGuestIOMMU; +typedef struct VFIORamDiscardListener { + VFIOContainer *container; + MemoryRegion *mr; + hwaddr offset_within_address_space; + hwaddr size; + uint64_t granularity; + RamDiscardListener listener; + QLIST_ENTRY(VFIORamDiscardListener) next; +} VFIORamDiscardListener; + typedef struct VFIOHostDMAWindow { hwaddr min_iova; hwaddr max_iova; diff --git a/include/hw/virtio/virtio-mem.h b/include/hw/virtio/virtio-mem.h index 4eeb82d5dd..9a6e348fa2 100644 --- a/include/hw/virtio/virtio-mem.h +++ b/include/hw/virtio/virtio-mem.h @@ -67,6 +67,9 @@ struct VirtIOMEM { /* don't migrate unplugged memory */ NotifierWithReturn precopy_notifier; + + /* listeners to notify on plug/unplug activity. */ + QLIST_HEAD(, RamDiscardListener) rdl_list; };
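The new rdl_list is what a RamDiscardManager implementation walks when plug/unplug activity happens. As a sketch of the notification contract described in the RamDiscardListener documentation above (an objecting listener causes the already-notified listeners to see a discard again); the helper is hypothetical and omits the clamping of the section to each listener's registered range and granularity that real code needs:

static int dummy_notify_listeners_populate(VirtIOMEM *vmem,
                                           MemoryRegionSection *section)
{
    RamDiscardListener *rdl, *rdl2;
    int ret = 0;

    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
        ret = rdl->notify_populate(rdl, section);
        if (ret) {
            break;
        }
    }

    if (ret) {
        /* Unwind: listeners that already agreed see the range discarded. */
        QLIST_FOREACH(rdl2, &vmem->rdl_list, next) {
            if (rdl2 == rdl) {
                break;
            }
            rdl2->notify_discard(rdl2, section);
        }
    }
    return ret;
}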
struct VirtIOMEMClass { diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 8df7b69f38..017c03675c 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -153,6 +153,7 @@ typedef enum { MIG_PRI_DEFAULT = 0, MIG_PRI_IOMMU, /* Must happen before PCI devices */ MIG_PRI_PCI_BUS, /* Must happen before IOMMU */ + MIG_PRI_VIRTIO_MEM, /* Must happen before IOMMU */ MIG_PRI_GICV3_ITS, /* Must happen before PCI devices */ MIG_PRI_GICV3, /* Must happen before the ITS */ MIG_PRI_MAX, diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h index 98aef5647c..8dbc6fcb89 100644 --- a/include/qemu/main-loop.h +++ b/include/qemu/main-loop.h @@ -294,7 +294,9 @@ void qemu_cond_timedwait_iothread(QemuCond *cond, int ms); void qemu_fd_register(int fd); -QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque); +#define qemu_bh_new(cb, opaque) \ + qemu_bh_new_full((cb), (opaque), (stringify(cb))) +QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name); void qemu_bh_schedule_idle(QEMUBH *bh); enum { diff --git a/softmmu/memory.c b/softmmu/memory.c index f0161515e9..cea2f622c9 100644 --- a/softmmu/memory.c +++ b/softmmu/memory.c @@ -2027,6 +2027,70 @@ int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr) return imrc->num_indexes(iommu_mr); } +RamDiscardManager *memory_region_get_ram_discard_manager(MemoryRegion *mr) +{ + if (!memory_region_is_mapped(mr) || !memory_region_is_ram(mr)) { + return NULL; + } + return mr->rdm; +} + +void memory_region_set_ram_discard_manager(MemoryRegion *mr, + RamDiscardManager *rdm) +{ + g_assert(memory_region_is_ram(mr) && !memory_region_is_mapped(mr)); + g_assert(!rdm || !mr->rdm); + mr->rdm = rdm; +} + +uint64_t ram_discard_manager_get_min_granularity(const RamDiscardManager *rdm, + const MemoryRegion *mr) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->get_min_granularity); + return rdmc->get_min_granularity(rdm, mr); +} + +bool ram_discard_manager_is_populated(const RamDiscardManager *rdm, + const MemoryRegionSection *section) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->is_populated); + return rdmc->is_populated(rdm, section); +} + +int ram_discard_manager_replay_populated(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamPopulate replay_fn, + void *opaque) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->replay_populated); + return rdmc->replay_populated(rdm, section, replay_fn, opaque); +} + +void ram_discard_manager_register_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->register_listener); + rdmc->register_listener(rdm, rdl, section); +} + +void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->unregister_listener); + rdmc->unregister_listener(rdm, rdl); +} + void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) { uint8_t mask = 1 << client; @@ -2637,6 +2701,33 @@ MemoryRegionSection memory_region_find(MemoryRegion *mr, return ret; } +MemoryRegionSection *memory_region_section_new_copy(MemoryRegionSection *s) +{ + MemoryRegionSection *tmp = g_new(MemoryRegionSection, 1); + + *tmp = *s; + if (tmp->mr) { +
memory_region_ref(tmp->mr); + } + if (tmp->fv) { + bool ret = flatview_ref(tmp->fv); + + g_assert(ret); + } + return tmp; +} + +void memory_region_section_free_copy(MemoryRegionSection *s) +{ + if (s->fv) { + flatview_unref(s->fv); + } + if (s->mr) { + memory_region_unref(s->mr); + } + g_free(s); +} + bool memory_region_present(MemoryRegion *container, hwaddr addr) { MemoryRegion *mr; @@ -3320,10 +3411,17 @@ static const TypeInfo iommu_memory_region_info = { .abstract = true, }; +static const TypeInfo ram_discard_manager_info = { + .parent = TYPE_INTERFACE, + .name = TYPE_RAM_DISCARD_MANAGER, + .class_size = sizeof(RamDiscardManagerClass), +}; + static void memory_register_types(void) { type_register_static(&memory_region_info); type_register_static(&iommu_memory_region_info); + type_register_static(&ram_discard_manager_info); } type_init(memory_register_types) diff --git a/softmmu/physmem.c b/softmmu/physmem.c index 9b171c9dbe..3c1912a1a0 100644 --- a/softmmu/physmem.c +++ b/softmmu/physmem.c @@ -3684,56 +3684,106 @@ void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root) } } -/* - * If positive, discarding RAM is disabled. If negative, discarding RAM is - * required to work and cannot be disabled. - */ -static int ram_block_discard_disabled; +/* Require any discards to work. */ +static unsigned int ram_block_discard_required_cnt; +/* Require only coordinated discards to work. */ +static unsigned int ram_block_coordinated_discard_required_cnt; +/* Disable any discards. */ +static unsigned int ram_block_discard_disabled_cnt; +/* Disable only uncoordinated discards. */ +static unsigned int ram_block_uncoordinated_discard_disabled_cnt; +static QemuMutex ram_block_discard_disable_mutex; + +static void ram_block_discard_disable_mutex_lock(void) +{ + static gsize initialized; + + if (g_once_init_enter(&initialized)) { + qemu_mutex_init(&ram_block_discard_disable_mutex); + g_once_init_leave(&initialized, 1); + } + qemu_mutex_lock(&ram_block_discard_disable_mutex); +} + +static void ram_block_discard_disable_mutex_unlock(void) +{ + qemu_mutex_unlock(&ram_block_discard_disable_mutex); +} int ram_block_discard_disable(bool state) { - int old; + int ret = 0; + ram_block_discard_disable_mutex_lock(); if (!state) { - qatomic_dec(&ram_block_discard_disabled); - return 0; + ram_block_discard_disabled_cnt--; + } else if (ram_block_discard_required_cnt || + ram_block_coordinated_discard_required_cnt) { + ret = -EBUSY; + } else { + ram_block_discard_disabled_cnt++; } + ram_block_discard_disable_mutex_unlock(); + return ret; +} - do { - old = qatomic_read(&ram_block_discard_disabled); - if (old < 0) { - return -EBUSY; - } - } while (qatomic_cmpxchg(&ram_block_discard_disabled, - old, old + 1) != old); - return 0; +int ram_block_uncoordinated_discard_disable(bool state) +{ + int ret = 0; + + ram_block_discard_disable_mutex_lock(); + if (!state) { + ram_block_uncoordinated_discard_disabled_cnt--; + } else if (ram_block_discard_required_cnt) { + ret = -EBUSY; + } else { + ram_block_uncoordinated_discard_disabled_cnt++; + } + ram_block_discard_disable_mutex_unlock(); + return ret; } int ram_block_discard_require(bool state) { - int old; + int ret = 0; + ram_block_discard_disable_mutex_lock(); if (!state) { - qatomic_inc(&ram_block_discard_disabled); - return 0; + ram_block_discard_required_cnt--; + } else if (ram_block_discard_disabled_cnt || + ram_block_uncoordinated_discard_disabled_cnt) { + ret = -EBUSY; + } else { + ram_block_discard_required_cnt++; } + 
ram_block_discard_disable_mutex_unlock(); + return ret; +} - do { - old = qatomic_read(&ram_block_discard_disabled); - if (old > 0) { - return -EBUSY; - } - } while (qatomic_cmpxchg(&ram_block_discard_disabled, - old, old - 1) != old); - return 0; +int ram_block_coordinated_discard_require(bool state) +{ + int ret = 0; + + ram_block_discard_disable_mutex_lock(); + if (!state) { + ram_block_coordinated_discard_required_cnt--; + } else if (ram_block_discard_disabled_cnt) { + ret = -EBUSY; + } else { + ram_block_coordinated_discard_required_cnt++; + } + ram_block_discard_disable_mutex_unlock(); + return ret; } bool ram_block_discard_is_disabled(void) { - return qatomic_read(&ram_block_discard_disabled) > 0; + return qatomic_read(&ram_block_discard_disabled_cnt) || + qatomic_read(&ram_block_uncoordinated_discard_disabled_cnt); } bool ram_block_discard_is_required(void) { - return qatomic_read(&ram_block_discard_disabled) < 0; + return qatomic_read(&ram_block_discard_required_cnt) || + qatomic_read(&ram_block_coordinated_discard_required_cnt); } diff --git a/tests/data/acpi/pc/DSDT b/tests/data/acpi/pc/DSDT index b9dd9b38e4..cc1223773e 100644 --- a/tests/data/acpi/pc/DSDT +++ b/tests/data/acpi/pc/DSDT Binary files differ diff --git a/tests/data/acpi/pc/DSDT.acpihmat b/tests/data/acpi/pc/DSDT.acpihmat index cba5a1dcb0..2d0678eb83 100644 --- a/tests/data/acpi/pc/DSDT.acpihmat +++ b/tests/data/acpi/pc/DSDT.acpihmat Binary files differ diff --git a/tests/data/acpi/pc/DSDT.bridge b/tests/data/acpi/pc/DSDT.bridge index a9b4d56594..77778c3a69 100644 --- a/tests/data/acpi/pc/DSDT.bridge +++ b/tests/data/acpi/pc/DSDT.bridge Binary files differ diff --git a/tests/data/acpi/pc/DSDT.cphp b/tests/data/acpi/pc/DSDT.cphp index 8d86155e27..af046b40b0 100644 --- a/tests/data/acpi/pc/DSDT.cphp +++ b/tests/data/acpi/pc/DSDT.cphp Binary files differ diff --git a/tests/data/acpi/pc/DSDT.dimmpxm b/tests/data/acpi/pc/DSDT.dimmpxm index e00a447f92..b56b2e0890 100644 --- a/tests/data/acpi/pc/DSDT.dimmpxm +++ b/tests/data/acpi/pc/DSDT.dimmpxm Binary files differ diff --git a/tests/data/acpi/pc/DSDT.hpbridge b/tests/data/acpi/pc/DSDT.hpbridge index 5d8ba19505..bb0593eeb8 100644 --- a/tests/data/acpi/pc/DSDT.hpbridge +++ b/tests/data/acpi/pc/DSDT.hpbridge Binary files differ diff --git a/tests/data/acpi/pc/DSDT.ipmikcs b/tests/data/acpi/pc/DSDT.ipmikcs index 01e53bd436..2e618e49d3 100644 --- a/tests/data/acpi/pc/DSDT.ipmikcs +++ b/tests/data/acpi/pc/DSDT.ipmikcs Binary files differ diff --git a/tests/data/acpi/pc/DSDT.memhp b/tests/data/acpi/pc/DSDT.memhp index b8103799b4..c32d28575b 100644 --- a/tests/data/acpi/pc/DSDT.memhp +++ b/tests/data/acpi/pc/DSDT.memhp Binary files differ diff --git a/tests/data/acpi/pc/DSDT.nohpet b/tests/data/acpi/pc/DSDT.nohpet index d4f0050533..623f06a900 100644 --- a/tests/data/acpi/pc/DSDT.nohpet +++ b/tests/data/acpi/pc/DSDT.nohpet Binary files differ diff --git a/tests/data/acpi/pc/DSDT.numamem b/tests/data/acpi/pc/DSDT.numamem index 8632dfe8a8..f0a3fa92de 100644 --- a/tests/data/acpi/pc/DSDT.numamem +++ b/tests/data/acpi/pc/DSDT.numamem Binary files differ diff --git a/tests/qemu-iotests/172.out b/tests/qemu-iotests/172.out index d53f61d0de..4cf4d536b4 100644 --- a/tests/qemu-iotests/172.out +++ b/tests/qemu-iotests/172.out @@ -21,6 +21,7 @@ Testing: dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -48,6 +49,7 @@ Testing: -fda TEST_DIR/t.qcow2 dev: floppy, id "" unit = 0 (0x0)
drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -85,6 +87,7 @@ Testing: -fdb TEST_DIR/t.qcow2 dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -96,6 +99,7 @@ Testing: -fdb TEST_DIR/t.qcow2 dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -137,6 +141,7 @@ Testing: -fda TEST_DIR/t.qcow2 -fdb TEST_DIR/t.qcow2.2 dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -148,6 +153,7 @@ Testing: -fda TEST_DIR/t.qcow2 -fdb TEST_DIR/t.qcow2.2 dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -190,6 +196,7 @@ Testing: -fdb dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -201,6 +208,7 @@ Testing: -fdb dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -228,6 +236,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -265,6 +274,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2,index=1 dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -276,6 +286,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2,index=1 dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -317,6 +328,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=floppy,file=TEST_DIR/t dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -328,6 +340,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=floppy,file=TEST_DIR/t dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -373,6 +386,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0 dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -410,6 +424,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,unit=1 dev: floppy, id "" unit = 1 (0x1) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -447,6 +462,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qco dev: floppy, id "" unit = 1 (0x1) drive = "none1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -458,6 +474,7 @@ 
Testing: -drive if=none,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qco dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -509,6 +526,7 @@ Testing: -fda TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 1 (0x1) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -520,6 +538,7 @@ Testing: -fda TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -562,6 +581,7 @@ Testing: -fda TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 1 (0x1) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -573,6 +593,7 @@ Testing: -fda TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -615,6 +636,7 @@ Testing: -fdb TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -626,6 +648,7 @@ Testing: -fdb TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -668,6 +691,7 @@ Testing: -fdb TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -679,6 +703,7 @@ Testing: -fdb TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl dev: floppy, id "" unit = 1 (0x1) drive = "floppy1" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -730,6 +755,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.q dev: floppy, id "" unit = 1 (0x1) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -741,6 +767,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.q dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -783,6 +810,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.q dev: floppy, id "" unit = 1 (0x1) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -794,6 +822,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.q dev: floppy, id "" unit = 0 (0x0) drive = "floppy0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -842,6 +871,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -global 
floppy.drive=none0 -device dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -909,6 +939,7 @@ Testing: -device floppy dev: floppy, id "" unit = 0 (0x0) drive = "" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -933,6 +964,7 @@ Testing: -device floppy,drive-type=120 dev: floppy, id "" unit = 0 (0x0) drive = "" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -957,6 +989,7 @@ Testing: -device floppy,drive-type=144 dev: floppy, id "" unit = 0 (0x0) drive = "" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -981,6 +1014,7 @@ Testing: -device floppy,drive-type=288 dev: floppy, id "" unit = 0 (0x0) drive = "" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -1008,6 +1042,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,drive-t dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -1045,6 +1080,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,drive-t dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -1085,6 +1121,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,logical dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) @@ -1122,6 +1159,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,physica dev: floppy, id "" unit = 0 (0x0) drive = "none0" + backend_defaults = "auto" logical_block_size = 512 (512 B) physical_block_size = 512 (512 B) min_io_size = 0 (0 B) diff --git a/tests/unit/ptimer-test-stubs.c b/tests/unit/ptimer-test-stubs.c index 7f801a4d09..2a3ef58799 100644 --- a/tests/unit/ptimer-test-stubs.c +++ b/tests/unit/ptimer-test-stubs.c @@ -108,7 +108,7 @@ int64_t qemu_clock_deadline_ns_all(QEMUClockType type, int attr_mask) return deadline; } -QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque) +QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name) { QEMUBH *bh = g_new(QEMUBH, 1); diff --git a/util/async.c b/util/async.c index 5d9b7cc1eb..9a41591319 100644 --- a/util/async.c +++ b/util/async.c @@ -57,6 +57,7 @@ enum { struct QEMUBH { AioContext *ctx; + const char *name; QEMUBHFunc *cb; void *opaque; QSLIST_ENTRY(QEMUBH) next; @@ -107,7 +108,8 @@ static QEMUBH *aio_bh_dequeue(BHList *head, unsigned *flags) return bh; } -void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque) +void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, + void *opaque, const char *name) { QEMUBH *bh; bh = g_new(QEMUBH, 1); @@ -115,11 +117,13 @@ void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque) .ctx = ctx, .cb = cb, .opaque = opaque, + .name = name, }; aio_bh_enqueue(bh, BH_SCHEDULED | BH_ONESHOT); } -QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque) +QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque, + const char *name) { QEMUBH *bh; bh = 
g_new(QEMUBH, 1); @@ -127,6 +131,7 @@ QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque) .ctx = ctx, .cb = cb, .opaque = opaque, + .name = name, }; return bh; } @@ -339,8 +344,20 @@ aio_ctx_finalize(GSource *source) assert(QSIMPLEQ_EMPTY(&ctx->bh_slice_list)); while ((bh = aio_bh_dequeue(&ctx->bh_list, &flags))) { - /* qemu_bh_delete() must have been called on BHs in this AioContext */ - assert(flags & BH_DELETED); + /* + * qemu_bh_delete() must have been called on BHs in this AioContext. In + * many cases, memory leaks, hangs, or inconsistent state occur when a + * BH is leaked because something still expects it to run. + * + * If you hit this, fix the lifecycle of the BH so that + * qemu_bh_delete() and any associated cleanup is called before the + * AioContext is finalized. + */ + if (unlikely(!(flags & BH_DELETED))) { + fprintf(stderr, "%s: BH '%s' leaked, aborting...\n", + __func__, bh->name); + abort(); + } g_free(bh); } diff --git a/util/main-loop.c b/util/main-loop.c index 4ae5b23e99..06b18b195c 100644 --- a/util/main-loop.c +++ b/util/main-loop.c @@ -544,9 +544,9 @@ void main_loop_wait(int nonblocking) /* Functions to operate on the main QEMU AioContext. */ -QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque) +QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name) { - return aio_bh_new(qemu_aio_context, cb, opaque); + return aio_bh_new_full(qemu_aio_context, cb, opaque, name); } /* diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c index 838e286ce5..893d864354 100644 --- a/util/mmap-alloc.c +++ b/util/mmap-alloc.c @@ -225,6 +225,8 @@ static void *mmap_activate(void *ptr, size_t size, int fd, "crash.\n", file_name); g_free(proc_link); g_free(file_name); + warn_report("Using non-DAX backing file with 'pmem=on' option" + " is deprecated"); } /* * If mmap failed with MAP_SHARED_VALIDATE | MAP_SYNC, we will try |
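Taken together, the bottom-half renaming above is transparent to callers: qemu_bh_new() still looks the same at call sites, but the stringified callback now travels with the BH and is what the new leak check in aio_ctx_finalize() prints. A sketch (MyState and my_cb are invented names):

typedef struct MyState {
    QEMUBH *bh;
} MyState;

static void my_cb(void *opaque)
{
    MyState *s = opaque;
    (void)s;    /* work deferred to the main loop would go here */
}

static void my_state_init(MyState *s)
{
    /* Expands to qemu_bh_new_full(my_cb, s, "my_cb") via stringify(cb). */
    s->bh = qemu_bh_new(my_cb, s);
    qemu_bh_schedule(s->bh);
}

static void my_state_fini(MyState *s)
{
    /*
     * Skipping this would now abort AioContext finalization with
     * "aio_ctx_finalize: BH 'my_cb' leaked, aborting...".
     */
    qemu_bh_delete(s->bh);
}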