105 files changed, 4839 insertions, 585 deletions
diff --git a/Kconfig.host b/Kconfig.host index d763d89269..2ee71578f3 100644 --- a/Kconfig.host +++ b/Kconfig.host @@ -46,3 +46,6 @@ config FUZZ config VFIO_USER_SERVER_ALLOWED bool imply VFIO_USER_SERVER + +config HV_BALLOON_POSSIBLE + bool diff --git a/MAINTAINERS b/MAINTAINERS index 8e8a7d5be5..d4a480ce5a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2656,6 +2656,14 @@ F: hw/usb/canokey.c F: hw/usb/canokey.h F: docs/system/devices/canokey.rst +Hyper-V Dynamic Memory Protocol +M: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> +S: Supported +F: hw/hyperv/hv-balloon*.c +F: hw/hyperv/hv-balloon*.h +F: include/hw/hyperv/dynmem-proto.h +F: include/hw/hyperv/hv-balloon.h + Subsystems ---------- Overall Audio backends diff --git a/block/nvme.c b/block/nvme.c index 96b3f8f2fa..0a0a0a6b36 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -417,9 +417,10 @@ static bool nvme_process_completion(NVMeQueuePair *q) q->cq_phase = !q->cq_phase; } cid = le16_to_cpu(c->cid); - if (cid == 0 || cid > NVME_QUEUE_SIZE) { - warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", " - "queue size: %u", cid, NVME_QUEUE_SIZE); + if (cid == 0 || cid > NVME_NUM_REQS) { + warn_report("NVMe: Unexpected CID in completion queue: %" PRIu32 + ", should be within: 1..%u inclusively", cid, + NVME_NUM_REQS); continue; } trace_nvme_complete_command(s, q->index, cid); diff --git a/block/parallels-ext.c b/block/parallels-ext.c index 8a109f005a..4d8ecf5047 100644 --- a/block/parallels-ext.c +++ b/block/parallels-ext.c @@ -130,7 +130,7 @@ static BdrvDirtyBitmap *parallels_load_bitmap(BlockDriverState *bs, g_autofree uint64_t *l1_table = NULL; BdrvDirtyBitmap *bitmap; QemuUUID uuid; - char uuidstr[UUID_FMT_LEN + 1]; + char uuidstr[UUID_STR_LEN]; int i; if (data_size < sizeof(bf)) { diff --git a/block/vdi.c b/block/vdi.c index c647d72895..7cfd12b50d 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -239,7 +239,7 @@ static void vdi_header_to_le(VdiHeader *header) static void vdi_header_print(VdiHeader *header) { - char uuidstr[37]; + char uuidstr[UUID_STR_LEN]; QemuUUID uuid; logout("text %s", header->text); logout("signature 0x%08x\n", header->signature); diff --git a/docs/devel/index-api.rst b/docs/devel/index-api.rst index 539ad29c21..fe01b2b488 100644 --- a/docs/devel/index-api.rst +++ b/docs/devel/index-api.rst @@ -11,6 +11,7 @@ generated from in-code annotations to function prototypes. loads-stores memory modules + pci qom-api qdev-api ui diff --git a/docs/devel/pci.rst b/docs/devel/pci.rst new file mode 100644 index 0000000000..68739334f3 --- /dev/null +++ b/docs/devel/pci.rst @@ -0,0 +1,8 @@ +============= +PCI subsystem +============= + +API Reference +------------- + +.. kernel-doc:: include/hw/pci/pci.h diff --git a/docs/system/arm/vexpress.rst b/docs/system/arm/vexpress.rst index 3e3839e923..38f29c73e7 100644 --- a/docs/system/arm/vexpress.rst +++ b/docs/system/arm/vexpress.rst @@ -58,6 +58,9 @@ Other differences between the hardware and the QEMU model: ``vexpress-a15``, and have IRQs from 40 upwards. If a dtb is provided on the command line then QEMU will edit it to include suitable entries describing these transports for the guest. 
+- QEMU does not currently support either dynamic or static remapping + of the area of memory at address 0: it is always mapped to alias + the first flash bank Booting a Linux kernel ---------------------- diff --git a/hw/alpha/typhoon.c b/hw/alpha/typhoon.c index 49a80550c5..e8711ae16a 100644 --- a/hw/alpha/typhoon.c +++ b/hw/alpha/typhoon.c @@ -738,6 +738,10 @@ static AddressSpace *typhoon_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &s->pchip.iommu_as; } +static const PCIIOMMUOps typhoon_iommu_ops = { + .get_address_space = typhoon_pci_dma_iommu, +}; + static void typhoon_set_irq(void *opaque, int irq, int level) { TyphoonState *s = opaque; @@ -897,7 +901,7 @@ PCIBus *typhoon_init(MemoryRegion *ram, qemu_irq *p_isa_irq, "iommu-typhoon", UINT64_MAX); address_space_init(&s->pchip.iommu_as, MEMORY_REGION(&s->pchip.iommu), "pchip0-pci"); - pci_setup_iommu(b, typhoon_pci_dma_iommu, s); + pci_setup_iommu(b, &typhoon_iommu_ops, s); /* Pchip0 PCI special/interrupt acknowledge, 0x801.F800.0000, 64MB. */ memory_region_init_io(&s->pchip.reg_iack, OBJECT(s), &alpha_pci_iack_ops, diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index f35ae9aa22..9a8ac45431 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -605,6 +605,10 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) return &sdev->as; } +static const PCIIOMMUOps smmu_ops = { + .get_address_space = smmu_find_add_as, +}; + IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) { uint8_t bus_n, devfn; @@ -661,7 +665,7 @@ static void smmu_base_realize(DeviceState *dev, Error **errp) s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL); if (s->primary_bus) { - pci_setup_iommu(s->primary_bus, smmu_find_add_as, s); + pci_setup_iommu(s->primary_bus, &smmu_ops, s); } else { error_setg(errp, "SMMU is not attached to any PCI bus!"); } diff --git a/hw/arm/vexpress.c b/hw/arm/vexpress.c index 8ff37f52ca..c08ea34e92 100644 --- a/hw/arm/vexpress.c +++ b/hw/arm/vexpress.c @@ -177,7 +177,6 @@ struct VexpressMachineState { MemoryRegion vram; MemoryRegion sram; MemoryRegion flashalias; - MemoryRegion lowram; MemoryRegion a15sram; bool secure; bool virt; @@ -276,7 +275,6 @@ static void a9_daughterboard_init(VexpressMachineState *vms, { MachineState *machine = MACHINE(vms); MemoryRegion *sysmem = get_system_memory(); - ram_addr_t low_ram_size; if (ram_size > 0x40000000) { /* 1GB is the maximum the address space permits */ @@ -284,17 +282,11 @@ static void a9_daughterboard_init(VexpressMachineState *vms, exit(1); } - low_ram_size = ram_size; - if (low_ram_size > 0x4000000) { - low_ram_size = 0x4000000; - } - /* RAM is from 0x60000000 upwards. The bottom 64MB of the + /* + * RAM is from 0x60000000 upwards. The bottom 64MB of the * address space should in theory be remappable to various - * things including ROM or RAM; we always map the RAM there. + * things including ROM or RAM; we always map the flash there. 
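+ * (Annotation, not part of the original patch: the flash-at-0 mapping is
+ * presumably provided by the flashalias region that remains in
+ * VexpressMachineState above, which is why the lowram alias can go away.)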
*/ - memory_region_init_alias(&vms->lowram, NULL, "vexpress.lowmem", - machine->ram, 0, low_ram_size); - memory_region_add_subregion(sysmem, 0x0, &vms->lowram); memory_region_add_subregion(sysmem, 0x60000000, machine->ram); /* 0x1e000000 A9MPCore (SCU) private memory region */ diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 9ce136cd88..8bc35a483c 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -482,7 +482,7 @@ build_spcr(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, 3, 1); /* ARM PL011 UART */ build_append_int_noprefix(table_data, 0, 3); /* Reserved */ /* Base Address */ - build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 8, 0, 1, + build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 32, 0, 3, vms->memmap[VIRT_UART].base); /* Interrupt Type */ build_append_int_noprefix(table_data, @@ -673,7 +673,7 @@ build_dbg2(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, 34, 2); /* BaseAddressRegister[] */ - build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 8, 0, 1, + build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 32, 0, 3, vms->memmap[VIRT_UART].base); /* AddressSize[] */ diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 92085d2d8f..0a16ab3095 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -631,7 +631,8 @@ static void fdt_add_pmu_nodes(const VirtMachineState *vms) qemu_fdt_setprop(ms->fdt, "/pmu", "compatible", compat, sizeof(compat)); qemu_fdt_setprop_cells(ms->fdt, "/pmu", "interrupts", - GIC_FDT_IRQ_TYPE_PPI, VIRTUAL_PMU_IRQ, irqflags); + GIC_FDT_IRQ_TYPE_PPI, + INTID_TO_PPI(VIRTUAL_PMU_IRQ), irqflags); } } diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c index a07cd7eb5d..bfa53960c3 100644 --- a/hw/block/xen-block.c +++ b/hw/block/xen-block.c @@ -115,9 +115,13 @@ static void xen_block_connect(XenDevice *xendev, Error **errp) return; } - if (xen_device_frontend_scanf(xendev, "protocol", "%ms", - &str) != 1) { - protocol = BLKIF_PROTOCOL_NATIVE; + if (xen_device_frontend_scanf(xendev, "protocol", "%ms", &str) != 1) { + /* x86 defaults to the 32-bit protocol even for 64-bit guests. 
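+ * (Annotation, not part of the original patch: the 32-bit and 64-bit
+ * blkif ring ABIs differ in the padding/alignment of request entries,
+ * so an absent "protocol" node must be resolved to a concrete layout.)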
*/ + if (object_dynamic_cast(OBJECT(qdev_get_machine()), "x86-machine")) { + protocol = BLKIF_PROTOCOL_X86_32; + } else { + protocol = BLKIF_PROTOCOL_NATIVE; + } } else { if (strcmp(str, XEN_IO_PROTO_ABI_X86_32) == 0) { protocol = BLKIF_PROTOCOL_X86_32; diff --git a/hw/core/loader.c b/hw/core/loader.c index 4dd5a71fb7..b7bb44b7f7 100644 --- a/hw/core/loader.c +++ b/hw/core/loader.c @@ -558,7 +558,7 @@ static void zfree(void *x, void *addr) ssize_t gunzip(void *dst, size_t dstlen, uint8_t *src, size_t srclen) { - z_stream s; + z_stream s = {}; ssize_t dstbytes; int r, i, flags; diff --git a/hw/core/machine-hmp-cmds.c b/hw/core/machine-hmp-cmds.c index 9a4b59c6f2..a6ff6a4875 100644 --- a/hw/core/machine-hmp-cmds.c +++ b/hw/core/machine-hmp-cmds.c @@ -253,6 +253,7 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict) MemoryDeviceInfo *value; PCDIMMDeviceInfo *di; SgxEPCDeviceInfo *se; + HvBalloonDeviceInfo *hi; for (info = info_list; info; info = info->next) { value = info->value; @@ -310,6 +311,20 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict) monitor_printf(mon, " node: %" PRId64 "\n", se->node); monitor_printf(mon, " memdev: %s\n", se->memdev); break; + case MEMORY_DEVICE_INFO_KIND_HV_BALLOON: + hi = value->u.hv_balloon.data; + monitor_printf(mon, "Memory device [%s]: \"%s\"\n", + MemoryDeviceInfoKind_str(value->type), + hi->id ? hi->id : ""); + if (hi->has_memaddr) { + monitor_printf(mon, " memaddr: 0x%" PRIx64 "\n", + hi->memaddr); + } + monitor_printf(mon, " max-size: %" PRIu64 "\n", hi->max_size); + if (hi->memdev) { + monitor_printf(mon, " memdev: %s\n", hi->memdev); + } + break; default: g_assert_not_reached(); } diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 2f1dbb3fd7..b46d16cd2c 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -705,7 +705,7 @@ static void get_reserved_region(Object *obj, Visitor *v, const char *name, int rc; rc = snprintf(buffer, sizeof(buffer), "0x%"PRIx64":0x%"PRIx64":%u", - rr->low, rr->high, rr->type); + range_lob(&rr->range), range_upb(&rr->range), rr->type); assert(rc < sizeof(buffer)); visit_type_str(v, name, &p, errp); @@ -717,6 +717,7 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name, Property *prop = opaque; ReservedRegion *rr = object_field_prop_ptr(obj, prop); const char *endptr; + uint64_t lob, upb; char *str; int ret; @@ -724,7 +725,7 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name, return; } - ret = qemu_strtou64(str, &endptr, 16, &rr->low); + ret = qemu_strtou64(str, &endptr, 16, &lob); if (ret) { error_setg(errp, "start address of '%s'" " must be a hexadecimal integer", name); @@ -734,7 +735,7 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name, goto separator_error; } - ret = qemu_strtou64(endptr + 1, &endptr, 16, &rr->high); + ret = qemu_strtou64(endptr + 1, &endptr, 16, &upb); if (ret) { error_setg(errp, "end address of '%s'" " must be a hexadecimal integer", name); @@ -744,6 +745,8 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name, goto separator_error; } + range_set_bounds(&rr->range, lob, upb); + ret = qemu_strtoui(endptr + 1, &endptr, 10, &rr->type); if (ret) { error_setg(errp, "type of '%s'" @@ -1111,7 +1114,7 @@ static void get_uuid(Object *obj, Visitor *v, const char *name, void *opaque, { Property *prop = opaque; QemuUUID *uuid = object_field_prop_ptr(obj, prop); - char buffer[UUID_FMT_LEN + 1]; + char 
buffer[UUID_STR_LEN]; char *p = buffer; qemu_uuid_unparse(uuid, buffer); diff --git a/hw/display/ati.c b/hw/display/ati.c index 6e38e00502..9a87a5504a 100644 --- a/hw/display/ati.c +++ b/hw/display/ati.c @@ -319,11 +319,13 @@ static uint64_t ati_mm_read(void *opaque, hwaddr addr, unsigned int size) case DAC_CNTL: val = s->regs.dac_cntl; break; - case GPIO_VGA_DDC: - val = s->regs.gpio_vga_ddc; + case GPIO_VGA_DDC ... GPIO_VGA_DDC + 3: + val = ati_reg_read_offs(s->regs.gpio_vga_ddc, + addr - GPIO_VGA_DDC, size); break; - case GPIO_DVI_DDC: - val = s->regs.gpio_dvi_ddc; + case GPIO_DVI_DDC ... GPIO_DVI_DDC + 3: + val = ati_reg_read_offs(s->regs.gpio_dvi_ddc, + addr - GPIO_DVI_DDC, size); break; case GPIO_MONID ... GPIO_MONID + 3: val = ati_reg_read_offs(s->regs.gpio_monid, @@ -337,6 +339,9 @@ static uint64_t ati_mm_read(void *opaque, hwaddr addr, unsigned int size) case PALETTE_DATA: val = vga_ioport_read(&s->vga, VGA_PEL_D); break; + case PALETTE_30_DATA: + val = s->regs.palette[vga_ioport_read(&s->vga, VGA_PEL_IR)]; + break; case CNFG_CNTL: val = s->regs.config_cntl; break; @@ -349,14 +354,17 @@ static uint64_t ati_mm_read(void *opaque, hwaddr addr, unsigned int size) PCI_BASE_ADDRESS_0, size) & 0xfffffff0; break; case CONFIG_APER_SIZE: - val = s->vga.vram_size; + val = s->vga.vram_size / 2; break; case CONFIG_REG_1_BASE: val = pci_default_read_config(&s->dev, PCI_BASE_ADDRESS_2, size) & 0xfffffff0; break; case CONFIG_REG_APER_SIZE: - val = memory_region_size(&s->mm); + val = memory_region_size(&s->mm) / 2; + break; + case HOST_PATH_CNTL: + val = BIT(23); /* Radeon HDP_APER_CNTL */ break; case MC_STATUS: val = 5; @@ -612,29 +620,34 @@ static void ati_mm_write(void *opaque, hwaddr addr, s->regs.dac_cntl = data & 0xffffe3ff; s->vga.dac_8bit = !!(data & DAC_8BIT_EN); break; - case GPIO_VGA_DDC: + /* + * GPIO regs for DDC access. Because some drivers access these via + * multiple byte writes we have to be careful when we send bits to + * avoid spurious changes in bitbang_i2c state. Only do it when either + * the enable bits are changed or output bits changed while enabled. + */ + case GPIO_VGA_DDC ... GPIO_VGA_DDC + 3: if (s->dev_id != PCI_DEVICE_ID_ATI_RAGE128_PF) { /* FIXME: Maybe add a property to select VGA or DVI port? */ } break; - case GPIO_DVI_DDC: + case GPIO_DVI_DDC ... GPIO_DVI_DDC + 3: if (s->dev_id != PCI_DEVICE_ID_ATI_RAGE128_PF) { - s->regs.gpio_dvi_ddc = ati_i2c(&s->bbi2c, data, 0); + ati_reg_write_offs(&s->regs.gpio_dvi_ddc, + addr - GPIO_DVI_DDC, data, size); + if ((addr <= GPIO_DVI_DDC + 2 && addr + size > GPIO_DVI_DDC + 2) || + (addr == GPIO_DVI_DDC && (s->regs.gpio_dvi_ddc & 0x30000))) { + s->regs.gpio_dvi_ddc = ati_i2c(&s->bbi2c, + s->regs.gpio_dvi_ddc, 0); + } } break; case GPIO_MONID ... GPIO_MONID + 3: /* FIXME What does Radeon have here? */ if (s->dev_id == PCI_DEVICE_ID_ATI_RAGE128_PF) { + /* Rage128p accesses DDC via MONID(1-2) with additional mask bit */ ati_reg_write_offs(&s->regs.gpio_monid, addr - GPIO_MONID, data, size); - /* - * Rage128p accesses DDC used to get EDID via these bits. - * Because some drivers access this via multiple byte writes - * we have to be careful when we send bits to avoid spurious - * changes in bitbang_i2c state. So only do it when mask is set - * and either the enable bits are changed or output bits changed - * while enabled. 
- */ if ((s->regs.gpio_monid & BIT(25)) && ((addr <= GPIO_MONID + 2 && addr + size > GPIO_MONID + 2) || (addr == GPIO_MONID && (s->regs.gpio_monid & 0x60000)))) { @@ -663,6 +676,12 @@ static void ati_mm_write(void *opaque, hwaddr addr, data >>= 8; vga_ioport_write(&s->vga, VGA_PEL_D, data & 0xff); break; + case PALETTE_30_DATA: + s->regs.palette[vga_ioport_read(&s->vga, VGA_PEL_IW)] = data; + vga_ioport_write(&s->vga, VGA_PEL_D, (data >> 22) & 0xff); + vga_ioport_write(&s->vga, VGA_PEL_D, (data >> 12) & 0xff); + vga_ioport_write(&s->vga, VGA_PEL_D, (data >> 2) & 0xff); + break; case CNFG_CNTL: s->regs.config_cntl = data; break; @@ -1014,6 +1033,7 @@ static Property ati_vga_properties[] = { DEFINE_PROP_UINT16("x-device-id", ATIVGAState, dev_id, PCI_DEVICE_ID_ATI_RAGE128_PF), DEFINE_PROP_BOOL("guest_hwcursor", ATIVGAState, cursor_guest_mode, false), + DEFINE_PROP_UINT8("x-pixman", ATIVGAState, use_pixman, 3), DEFINE_PROP_END_OF_LIST() }; @@ -1035,11 +1055,18 @@ static void ati_vga_class_init(ObjectClass *klass, void *data) k->exit = ati_vga_exit; } +static void ati_vga_init(Object *o) +{ + object_property_set_description(o, "x-pixman", "Use pixman for: " + "1: fill, 2: blit"); +} + static const TypeInfo ati_vga_info = { .name = TYPE_ATI_VGA, .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(ATIVGAState), .class_init = ati_vga_class_init, + .instance_init = ati_vga_init, .interfaces = (InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, { }, diff --git a/hw/display/ati_2d.c b/hw/display/ati_2d.c index 7d786653e8..0e6b8e4367 100644 --- a/hw/display/ati_2d.c +++ b/hw/display/ati_2d.c @@ -92,6 +92,7 @@ void ati_2d_blt(ATIVGAState *s) switch (s->regs.dp_mix & GMC_ROP3_MASK) { case ROP3_SRCCOPY: { + bool fallback = false; unsigned src_x = (s->regs.dp_cntl & DST_X_LEFT_TO_RIGHT ? s->regs.src_x : s->regs.src_x + 1 - s->regs.dst_width); unsigned src_y = (s->regs.dp_cntl & DST_Y_TOP_TO_BOTTOM ? 
@@ -122,27 +123,50 @@ void ati_2d_blt(ATIVGAState *s) src_bits, dst_bits, src_stride, dst_stride, bpp, bpp, src_x, src_y, dst_x, dst_y, s->regs.dst_width, s->regs.dst_height); - if (s->regs.dp_cntl & DST_X_LEFT_TO_RIGHT && + if ((s->use_pixman & BIT(1)) && + s->regs.dp_cntl & DST_X_LEFT_TO_RIGHT && s->regs.dp_cntl & DST_Y_TOP_TO_BOTTOM) { - pixman_blt((uint32_t *)src_bits, (uint32_t *)dst_bits, - src_stride, dst_stride, bpp, bpp, - src_x, src_y, dst_x, dst_y, - s->regs.dst_width, s->regs.dst_height); - } else { + fallback = !pixman_blt((uint32_t *)src_bits, (uint32_t *)dst_bits, + src_stride, dst_stride, bpp, bpp, + src_x, src_y, dst_x, dst_y, + s->regs.dst_width, s->regs.dst_height); + } else if (s->use_pixman & BIT(1)) { /* FIXME: We only really need a temporary if src and dst overlap */ int llb = s->regs.dst_width * (bpp / 8); int tmp_stride = DIV_ROUND_UP(llb, sizeof(uint32_t)); uint32_t *tmp = g_malloc(tmp_stride * sizeof(uint32_t) * s->regs.dst_height); - pixman_blt((uint32_t *)src_bits, tmp, - src_stride, tmp_stride, bpp, bpp, - src_x, src_y, 0, 0, - s->regs.dst_width, s->regs.dst_height); - pixman_blt(tmp, (uint32_t *)dst_bits, - tmp_stride, dst_stride, bpp, bpp, - 0, 0, dst_x, dst_y, - s->regs.dst_width, s->regs.dst_height); + fallback = !pixman_blt((uint32_t *)src_bits, tmp, + src_stride, tmp_stride, bpp, bpp, + src_x, src_y, 0, 0, + s->regs.dst_width, s->regs.dst_height); + if (!fallback) { + fallback = !pixman_blt(tmp, (uint32_t *)dst_bits, + tmp_stride, dst_stride, bpp, bpp, + 0, 0, dst_x, dst_y, + s->regs.dst_width, s->regs.dst_height); + } g_free(tmp); + } else { + fallback = true; + } + if (fallback) { + unsigned int y, i, j, bypp = bpp / 8; + unsigned int src_pitch = src_stride * sizeof(uint32_t); + unsigned int dst_pitch = dst_stride * sizeof(uint32_t); + + for (y = 0; y < s->regs.dst_height; y++) { + i = dst_x * bypp; + j = src_x * bypp; + if (s->regs.dp_cntl & DST_Y_TOP_TO_BOTTOM) { + i += (dst_y + y) * dst_pitch; + j += (src_y + y) * src_pitch; + } else { + i += (dst_y + s->regs.dst_height - 1 - y) * dst_pitch; + j += (src_y + s->regs.dst_height - 1 - y) * src_pitch; + } + memmove(&dst_bits[i], &src_bits[j], s->regs.dst_width * bypp); + } } if (dst_bits >= s->vga.vram_ptr + s->vga.vbe_start_addr && dst_bits < s->vga.vram_ptr + s->vga.vbe_start_addr + @@ -180,14 +204,21 @@ void ati_2d_blt(ATIVGAState *s) dst_stride /= sizeof(uint32_t); DPRINTF("pixman_fill(%p, %d, %d, %d, %d, %d, %d, %x)\n", - dst_bits, dst_stride, bpp, - dst_x, dst_y, - s->regs.dst_width, s->regs.dst_height, - filler); - pixman_fill((uint32_t *)dst_bits, dst_stride, bpp, - dst_x, dst_y, - s->regs.dst_width, s->regs.dst_height, - filler); + dst_bits, dst_stride, bpp, dst_x, dst_y, + s->regs.dst_width, s->regs.dst_height, filler); + if (!(s->use_pixman & BIT(0)) || + !pixman_fill((uint32_t *)dst_bits, dst_stride, bpp, dst_x, dst_y, + s->regs.dst_width, s->regs.dst_height, filler)) { + /* fallback when pixman failed or we don't want to call it */ + unsigned int x, y, i, bypp = bpp / 8; + unsigned int dst_pitch = dst_stride * sizeof(uint32_t); + for (y = 0; y < s->regs.dst_height; y++) { + i = dst_x * bypp + (dst_y + y) * dst_pitch; + for (x = 0; x < s->regs.dst_width; x++, i += bypp) { + stn_he_p(&dst_bits[i], bypp, filler); + } + } + } if (dst_bits >= s->vga.vram_ptr + s->vga.vbe_start_addr && dst_bits < s->vga.vram_ptr + s->vga.vbe_start_addr + s->vga.vbe_regs[VBE_DISPI_INDEX_YRES] * s->vga.vbe_line_offset) { diff --git a/hw/display/ati_dbg.c b/hw/display/ati_dbg.c index bd0ecd48c7..3ffa7f35df 100644 
--- a/hw/display/ati_dbg.c +++ b/hw/display/ati_dbg.c @@ -30,6 +30,7 @@ static struct ati_regdesc ati_reg_names[] = { {"AMCGPIO_EN_MIR", 0x00a8}, {"PALETTE_INDEX", 0x00b0}, {"PALETTE_DATA", 0x00b4}, + {"PALETTE_30_DATA", 0x00b8}, {"CNFG_CNTL", 0x00e0}, {"GEN_RESET_CNTL", 0x00f0}, {"CNFG_MEMSIZE", 0x00f8}, @@ -38,6 +39,7 @@ static struct ati_regdesc ati_reg_names[] = { {"CONFIG_APER_SIZE", 0x0108}, {"CONFIG_REG_1_BASE", 0x010c}, {"CONFIG_REG_APER_SIZE", 0x0110}, + {"HOST_PATH_CNTL", 0x0130}, {"MEM_CNTL", 0x0140}, {"MC_FB_LOCATION", 0x0148}, {"MC_AGP_LOCATION", 0x014C}, diff --git a/hw/display/ati_int.h b/hw/display/ati_int.h index e8d3c7af75..f5a47b82b0 100644 --- a/hw/display/ati_int.h +++ b/hw/display/ati_int.h @@ -44,6 +44,7 @@ typedef struct ATIVGARegs { uint32_t gpio_dvi_ddc; uint32_t gpio_monid; uint32_t config_cntl; + uint32_t palette[256]; uint32_t crtc_h_total_disp; uint32_t crtc_h_sync_strt_wid; uint32_t crtc_v_total_disp; @@ -89,6 +90,7 @@ struct ATIVGAState { char *model; uint16_t dev_id; uint8_t mode; + uint8_t use_pixman; bool cursor_guest_mode; uint16_t cursor_size; uint32_t cursor_offset; diff --git a/hw/display/ati_regs.h b/hw/display/ati_regs.h index d6282b2ef2..d7127748ff 100644 --- a/hw/display/ati_regs.h +++ b/hw/display/ati_regs.h @@ -48,6 +48,7 @@ #define AMCGPIO_EN_MIR 0x00a8 #define PALETTE_INDEX 0x00b0 #define PALETTE_DATA 0x00b4 +#define PALETTE_30_DATA 0x00b8 #define CNFG_CNTL 0x00e0 #define GEN_RESET_CNTL 0x00f0 #define CNFG_MEMSIZE 0x00f8 @@ -56,6 +57,7 @@ #define CONFIG_APER_SIZE 0x0108 #define CONFIG_REG_1_BASE 0x010c #define CONFIG_REG_APER_SIZE 0x0110 +#define HOST_PATH_CNTL 0x0130 #define MEM_CNTL 0x0140 #define MC_FB_LOCATION 0x0148 #define MC_AGP_LOCATION 0x014C diff --git a/hw/display/macfb.c b/hw/display/macfb.c index 2f8e016566..d61541ccb5 100644 --- a/hw/display/macfb.c +++ b/hw/display/macfb.c @@ -36,8 +36,8 @@ #define DAFB_INTR_MASK 0x104 #define DAFB_INTR_STAT 0x108 #define DAFB_INTR_CLEAR 0x10c -#define DAFB_RESET 0x200 -#define DAFB_LUT 0x213 +#define DAFB_LUT_INDEX 0x200 +#define DAFB_LUT 0x210 #define DAFB_INTR_VBL 0x4 @@ -537,6 +537,11 @@ static uint64_t macfb_ctrl_read(void *opaque, case DAFB_MODE_SENSE: val = macfb_sense_read(s); break; + case DAFB_LUT ... DAFB_LUT + 3: + val = s->color_palette[s->palette_current]; + s->palette_current = (s->palette_current + 1) % + ARRAY_SIZE(s->color_palette); + break; default: if (addr < MACFB_CTRL_TOPADDR) { val = s->regs[addr >> 2]; @@ -583,13 +588,11 @@ static void macfb_ctrl_write(void *opaque, s->regs[DAFB_INTR_STAT >> 2] &= ~DAFB_INTR_VBL; macfb_update_irq(s); break; - case DAFB_RESET: - s->palette_current = 0; - s->regs[DAFB_INTR_STAT >> 2] &= ~DAFB_INTR_VBL; - macfb_update_irq(s); + case DAFB_LUT_INDEX: + s->palette_current = (val & 0xff) * 3; break; - case DAFB_LUT: - s->color_palette[s->palette_current] = val; + case DAFB_LUT ... 
DAFB_LUT + 3: + s->color_palette[s->palette_current] = val & 0xff; s->palette_current = (s->palette_current + 1) % ARRAY_SIZE(s->color_palette); if (s->palette_current % 3) { diff --git a/hw/display/virtio-gpu-pci-rutabaga.c b/hw/display/virtio-gpu-pci-rutabaga.c index c96729e198..abbb898c65 100644 --- a/hw/display/virtio-gpu-pci-rutabaga.c +++ b/hw/display/virtio-gpu-pci-rutabaga.c @@ -36,6 +36,7 @@ static const TypeInfo virtio_gpu_rutabaga_pci_info[] = { .instance_init = virtio_gpu_rutabaga_initfn, .interfaces = (InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, } }, }; diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c index 4265316cbb..2707bceea8 100644 --- a/hw/display/virtio-gpu.c +++ b/hw/display/virtio-gpu.c @@ -1213,6 +1213,9 @@ static int virtio_gpu_save(QEMUFile *f, void *opaque, size_t size, assert(QTAILQ_EMPTY(&g->cmdq)); QTAILQ_FOREACH(res, &g->reslist, next) { + if (res->blob_size) { + continue; + } qemu_put_be32(f, res->resource_id); qemu_put_be32(f, res->width); qemu_put_be32(f, res->height); @@ -1230,12 +1233,40 @@ static int virtio_gpu_save(QEMUFile *f, void *opaque, size_t size, return vmstate_save_state(f, &vmstate_virtio_gpu_scanouts, g, NULL); } +static bool virtio_gpu_load_restore_mapping(VirtIOGPU *g, + struct virtio_gpu_simple_resource *res) +{ + int i; + + for (i = 0; i < res->iov_cnt; i++) { + hwaddr len = res->iov[i].iov_len; + res->iov[i].iov_base = + dma_memory_map(VIRTIO_DEVICE(g)->dma_as, res->addrs[i], &len, + DMA_DIRECTION_TO_DEVICE, MEMTXATTRS_UNSPECIFIED); + + if (!res->iov[i].iov_base || len != res->iov[i].iov_len) { + /* Clean up the half-a-mapping we just created... */ + if (res->iov[i].iov_base) { + dma_memory_unmap(VIRTIO_DEVICE(g)->dma_as, res->iov[i].iov_base, + len, DMA_DIRECTION_TO_DEVICE, 0); + } + /* ...and the mappings for previous loop iterations */ + res->iov_cnt = i; + virtio_gpu_cleanup_mapping(g, res); + return false; + } + } + + QTAILQ_INSERT_HEAD(&g->reslist, res, next); + g->hostmem += res->hostmem; + return true; +} + static int virtio_gpu_load(QEMUFile *f, void *opaque, size_t size, const VMStateField *field) { VirtIOGPU *g = opaque; struct virtio_gpu_simple_resource *res; - struct virtio_gpu_scanout *scanout; uint32_t resource_id, pformat; void *bits = NULL; int i; @@ -1294,40 +1325,96 @@ static int virtio_gpu_load(QEMUFile *f, void *opaque, size_t size, qemu_get_buffer(f, (void *)pixman_image_get_data(res->image), pixman_image_get_stride(res->image) * res->height); - /* restore mapping */ - for (i = 0; i < res->iov_cnt; i++) { - hwaddr len = res->iov[i].iov_len; - res->iov[i].iov_base = - dma_memory_map(VIRTIO_DEVICE(g)->dma_as, res->addrs[i], &len, - DMA_DIRECTION_TO_DEVICE, - MEMTXATTRS_UNSPECIFIED); - - if (!res->iov[i].iov_base || len != res->iov[i].iov_len) { - /* Clean up the half-a-mapping we just created... 
*/ - if (res->iov[i].iov_base) { - dma_memory_unmap(VIRTIO_DEVICE(g)->dma_as, - res->iov[i].iov_base, - len, - DMA_DIRECTION_TO_DEVICE, - 0); - } - /* ...and the mappings for previous loop iterations */ - res->iov_cnt = i; - virtio_gpu_cleanup_mapping(g, res); - pixman_image_unref(res->image); - g_free(res); - return -EINVAL; - } + if (!virtio_gpu_load_restore_mapping(g, res)) { + pixman_image_unref(res->image); + g_free(res); + return -EINVAL; } - QTAILQ_INSERT_HEAD(&g->reslist, res, next); - g->hostmem += res->hostmem; - resource_id = qemu_get_be32(f); } /* load & apply scanout state */ vmstate_load_state(f, &vmstate_virtio_gpu_scanouts, g, 1); + + return 0; +} + +static int virtio_gpu_blob_save(QEMUFile *f, void *opaque, size_t size, + const VMStateField *field, JSONWriter *vmdesc) +{ + VirtIOGPU *g = opaque; + struct virtio_gpu_simple_resource *res; + int i; + + /* in 2d mode we should never find unprocessed commands here */ + assert(QTAILQ_EMPTY(&g->cmdq)); + + QTAILQ_FOREACH(res, &g->reslist, next) { + if (!res->blob_size) { + continue; + } + qemu_put_be32(f, res->resource_id); + qemu_put_be32(f, res->blob_size); + qemu_put_be32(f, res->iov_cnt); + for (i = 0; i < res->iov_cnt; i++) { + qemu_put_be64(f, res->addrs[i]); + qemu_put_be32(f, res->iov[i].iov_len); + } + } + qemu_put_be32(f, 0); /* end of list */ + + return 0; +} + +static int virtio_gpu_blob_load(QEMUFile *f, void *opaque, size_t size, + const VMStateField *field) +{ + VirtIOGPU *g = opaque; + struct virtio_gpu_simple_resource *res; + uint32_t resource_id; + int i; + + resource_id = qemu_get_be32(f); + while (resource_id != 0) { + res = virtio_gpu_find_resource(g, resource_id); + if (res) { + return -EINVAL; + } + + res = g_new0(struct virtio_gpu_simple_resource, 1); + res->resource_id = resource_id; + res->blob_size = qemu_get_be32(f); + res->iov_cnt = qemu_get_be32(f); + res->addrs = g_new(uint64_t, res->iov_cnt); + res->iov = g_new(struct iovec, res->iov_cnt); + + /* read data */ + for (i = 0; i < res->iov_cnt; i++) { + res->addrs[i] = qemu_get_be64(f); + res->iov[i].iov_len = qemu_get_be32(f); + } + + if (!virtio_gpu_load_restore_mapping(g, res)) { + g_free(res); + return -EINVAL; + } + + virtio_gpu_init_udmabuf(res); + + resource_id = qemu_get_be32(f); + } + + return 0; +} + +static int virtio_gpu_post_load(void *opaque, int version_id) +{ + VirtIOGPU *g = opaque; + struct virtio_gpu_scanout *scanout; + struct virtio_gpu_simple_resource *res; + int i; + for (i = 0; i < g->parent_obj.conf.max_outputs; i++) { /* FIXME: should take scanout.r.{x,y} into account */ scanout = &g->parent_obj.scanout[i]; @@ -1475,6 +1562,32 @@ virtio_gpu_set_config(VirtIODevice *vdev, const uint8_t *config) } } +static bool virtio_gpu_blob_state_needed(void *opaque) +{ + VirtIOGPU *g = VIRTIO_GPU(opaque); + + return virtio_gpu_blob_enabled(g->parent_obj.conf); +} + +const VMStateDescription vmstate_virtio_gpu_blob_state = { + .name = "virtio-gpu/blob", + .minimum_version_id = VIRTIO_GPU_VM_VERSION, + .version_id = VIRTIO_GPU_VM_VERSION, + .needed = virtio_gpu_blob_state_needed, + .fields = (const VMStateField[]){ + { + .name = "virtio-gpu/blob", + .info = &(const VMStateInfo) { + .name = "blob", + .get = virtio_gpu_blob_load, + .put = virtio_gpu_blob_save, + }, + .flags = VMS_SINGLE, + } /* device */, + VMSTATE_END_OF_LIST() + }, +}; + /* * For historical reasons virtio_gpu does not adhere to virtio migration * scheme as described in doc/virtio-migration.txt, in a sense that no @@ -1500,6 +1613,11 @@ static const VMStateDescription 
vmstate_virtio_gpu = { } /* device */, VMSTATE_END_OF_LIST() }, + .subsections = (const VMStateDescription * []) { + &vmstate_virtio_gpu_blob_state, + NULL + }, + .post_load = virtio_gpu_post_load, }; static Property virtio_gpu_properties[] = { diff --git a/hw/hyperv/Kconfig b/hw/hyperv/Kconfig index fcf65903bd..41dd827c84 100644 --- a/hw/hyperv/Kconfig +++ b/hw/hyperv/Kconfig @@ -16,3 +16,13 @@ config SYNDBG bool default y depends on VMBUS + +config HV_BALLOON_SUPPORTED + bool + +config HV_BALLOON + bool + default y + depends on VMBUS + depends on HV_BALLOON_POSSIBLE + depends on HV_BALLOON_SUPPORTED diff --git a/hw/hyperv/hv-balloon-internal.h b/hw/hyperv/hv-balloon-internal.h new file mode 100644 index 0000000000..164c2e5825 --- /dev/null +++ b/hw/hyperv/hv-balloon-internal.h @@ -0,0 +1,33 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef HW_HYPERV_HV_BALLOON_INTERNAL_H +#define HW_HYPERV_HV_BALLOON_INTERNAL_H + +#include "qemu/osdep.h" + +#define HV_BALLOON_PFN_SHIFT 12 +#define HV_BALLOON_PAGE_SIZE (1 << HV_BALLOON_PFN_SHIFT) + +#define SUM_OVERFLOW_U64(in1, in2) ((in1) > UINT64_MAX - (in2)) +#define SUM_SATURATE_U64(in1, in2) \ + ({ \ + uint64_t _in1 = (in1), _in2 = (in2); \ + uint64_t _result; \ + \ + if (!SUM_OVERFLOW_U64(_in1, _in2)) { \ + _result = _in1 + _in2; \ + } else { \ + _result = UINT64_MAX; \ + } \ + \ + _result; \ + }) + +#endif diff --git a/hw/hyperv/hv-balloon-our_range_memslots.c b/hw/hyperv/hv-balloon-our_range_memslots.c new file mode 100644 index 0000000000..99bae870f3 --- /dev/null +++ b/hw/hyperv/hv-balloon-our_range_memslots.c @@ -0,0 +1,201 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "hv-balloon-internal.h" +#include "hv-balloon-our_range_memslots.h" +#include "trace.h" + +/* OurRange */ +static void our_range_init(OurRange *our_range, uint64_t start, uint64_t count) +{ + assert(count <= UINT64_MAX - start); + our_range->range.start = start; + our_range->range.count = count; + + hvb_page_range_tree_init(&our_range->removed_guest); + hvb_page_range_tree_init(&our_range->removed_both); + + /* mark the whole range as unused but for potential use */ + our_range->added = 0; + our_range->unusable_tail = 0; +} + +static void our_range_destroy(OurRange *our_range) +{ + hvb_page_range_tree_destroy(&our_range->removed_guest); + hvb_page_range_tree_destroy(&our_range->removed_both); +} + +void hvb_our_range_clear_removed_trees(OurRange *our_range) +{ + hvb_page_range_tree_destroy(&our_range->removed_guest); + hvb_page_range_tree_destroy(&our_range->removed_both); + hvb_page_range_tree_init(&our_range->removed_guest); + hvb_page_range_tree_init(&our_range->removed_both); +} + +void hvb_our_range_mark_added(OurRange *our_range, uint64_t additional_size) +{ + assert(additional_size <= UINT64_MAX - our_range->added); + + our_range->added += additional_size; + + assert(our_range->added <= UINT64_MAX - our_range->unusable_tail); + assert(our_range->added + our_range->unusable_tail <= + our_range->range.count); +} + +/* OurRangeMemslots */ +static void our_range_memslots_init_slots(OurRangeMemslots *our_range, + MemoryRegion *backing_mr, + Object *memslot_owner) +{ + OurRangeMemslotsSlots *memslots = &our_range->slots; + unsigned int idx; + uint64_t memslot_offset; + + assert(memslots->count > 0); + memslots->slots = g_new0(MemoryRegion, memslots->count); + + /* Initialize our memslots, but don't map them yet. */ + assert(memslots->size_each > 0); + for (idx = 0, memslot_offset = 0; idx < memslots->count; + idx++, memslot_offset += memslots->size_each) { + uint64_t memslot_size; + g_autofree char *name = NULL; + + /* The size of the last memslot might be smaller. */ + if (idx == memslots->count - 1) { + uint64_t region_size; + + assert(our_range->mr); + region_size = memory_region_size(our_range->mr); + memslot_size = region_size - memslot_offset; + } else { + memslot_size = memslots->size_each; + } + + name = g_strdup_printf("memslot-%u", idx); + memory_region_init_alias(&memslots->slots[idx], memslot_owner, name, + backing_mr, memslot_offset, memslot_size); + /* + * We want to be able to atomically and efficiently activate/deactivate + * individual memslots without affecting adjacent memslots in memory + * notifiers. 
+ */ + memory_region_set_unmergeable(&memslots->slots[idx], true); + } + + memslots->mapped_count = 0; +} + +OurRangeMemslots *hvb_our_range_memslots_new(uint64_t addr, + MemoryRegion *parent_mr, + MemoryRegion *backing_mr, + Object *memslot_owner, + unsigned int memslot_count, + uint64_t memslot_size) +{ + OurRangeMemslots *our_range; + + our_range = g_malloc(sizeof(*our_range)); + our_range_init(&our_range->range, + addr / HV_BALLOON_PAGE_SIZE, + memory_region_size(parent_mr) / HV_BALLOON_PAGE_SIZE); + our_range->slots.size_each = memslot_size; + our_range->slots.count = memslot_count; + our_range->mr = parent_mr; + our_range_memslots_init_slots(our_range, backing_mr, memslot_owner); + + return our_range; +} + +static void our_range_memslots_free_memslots(OurRangeMemslots *our_range) +{ + OurRangeMemslotsSlots *memslots = &our_range->slots; + unsigned int idx; + uint64_t offset; + + memory_region_transaction_begin(); + for (idx = 0, offset = 0; idx < memslots->mapped_count; + idx++, offset += memslots->size_each) { + trace_hv_balloon_unmap_slot(idx, memslots->count, offset); + assert(memory_region_is_mapped(&memslots->slots[idx])); + memory_region_del_subregion(our_range->mr, &memslots->slots[idx]); + } + memory_region_transaction_commit(); + + for (idx = 0; idx < memslots->count; idx++) { + object_unparent(OBJECT(&memslots->slots[idx])); + } + + g_clear_pointer(&our_range->slots.slots, g_free); +} + +void hvb_our_range_memslots_free(OurRangeMemslots *our_range) +{ + OurRangeMemslotsSlots *memslots = &our_range->slots; + MemoryRegion *hostmem_mr; + RAMBlock *rb; + + assert(our_range->slots.count > 0); + assert(our_range->slots.slots); + + hostmem_mr = memslots->slots[0].alias; + rb = hostmem_mr->ram_block; + ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb)); + + our_range_memslots_free_memslots(our_range); + our_range_destroy(&our_range->range); + g_free(our_range); +} + +void hvb_our_range_memslots_ensure_mapped_additional(OurRangeMemslots *our_range, + uint64_t additional_map_size) +{ + OurRangeMemslotsSlots *memslots = &our_range->slots; + uint64_t total_map_size; + unsigned int idx; + uint64_t offset; + + total_map_size = (our_range->range.added + additional_map_size) * + HV_BALLOON_PAGE_SIZE; + idx = memslots->mapped_count; + assert(memslots->size_each > 0); + offset = idx * memslots->size_each; + + /* + * Activate all memslots covered by the newly added region in a single + * transaction. + */ + memory_region_transaction_begin(); + for ( ; idx < memslots->count; + idx++, offset += memslots->size_each) { + /* + * If this memslot starts beyond or at the end of the range to map so + * does every next one. + */ + if (offset >= total_map_size) { + break; + } + + /* + * Instead of enabling/disabling memslot, we add/remove them. This + * should make address space updates faster, because we don't have to + * loop over many disabled subregions. + */ + trace_hv_balloon_map_slot(idx, memslots->count, offset); + assert(!memory_region_is_mapped(&memslots->slots[idx])); + memory_region_add_subregion(our_range->mr, offset, + &memslots->slots[idx]); + + memslots->mapped_count++; + } + memory_region_transaction_commit(); +} diff --git a/hw/hyperv/hv-balloon-our_range_memslots.h b/hw/hyperv/hv-balloon-our_range_memslots.h new file mode 100644 index 0000000000..b6f592d34b --- /dev/null +++ b/hw/hyperv/hv-balloon-our_range_memslots.h @@ -0,0 +1,110 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. 
+ * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef HW_HYPERV_HV_BALLOON_OUR_RANGE_MEMSLOTS_H +#define HW_HYPERV_HV_BALLOON_OUR_RANGE_MEMSLOTS_H + +#include "qemu/osdep.h" + +#include "exec/memory.h" +#include "qom/object.h" +#include "hv-balloon-page_range_tree.h" + +/* OurRange */ +#define OUR_RANGE(ptr) ((OurRange *)(ptr)) + +/* "our range" means the memory range owned by this driver (for hot-adding) */ +typedef struct OurRange { + PageRange range; + + /* How many pages were hot-added to the guest */ + uint64_t added; + + /* Pages at the end not currently usable */ + uint64_t unusable_tail; + + /* Memory removed from the guest */ + PageRangeTree removed_guest, removed_both; +} OurRange; + +static inline uint64_t our_range_get_remaining_start(OurRange *our_range) +{ + return our_range->range.start + our_range->added; +} + +static inline uint64_t our_range_get_remaining_size(OurRange *our_range) +{ + return our_range->range.count - our_range->added - our_range->unusable_tail; +} + +void hvb_our_range_mark_added(OurRange *our_range, uint64_t additional_size); + +static inline void our_range_mark_remaining_unusable(OurRange *our_range) +{ + our_range->unusable_tail = our_range->range.count - our_range->added; +} + +static inline PageRangeTree our_range_get_removed_tree(OurRange *our_range, + bool both) +{ + if (both) { + return our_range->removed_both; + } else { + return our_range->removed_guest; + } +} + +static inline bool our_range_is_removed_tree_empty(OurRange *our_range, + bool both) +{ + if (both) { + return page_range_tree_is_empty(our_range->removed_both); + } else { + return page_range_tree_is_empty(our_range->removed_guest); + } +} + +void hvb_our_range_clear_removed_trees(OurRange *our_range); + +/* OurRangeMemslots */ +typedef struct OurRangeMemslotsSlots { + /* Nominal size of each memslot (the last one might be smaller) */ + uint64_t size_each; + + /* Slots array and its element count */ + MemoryRegion *slots; + unsigned int count; + + /* How many slots are currently mapped */ + unsigned int mapped_count; +} OurRangeMemslotsSlots; + +typedef struct OurRangeMemslots { + OurRange range; + + /* Memslots covering our range */ + OurRangeMemslotsSlots slots; + + MemoryRegion *mr; +} OurRangeMemslots; + +OurRangeMemslots *hvb_our_range_memslots_new(uint64_t addr, + MemoryRegion *parent_mr, + MemoryRegion *backing_mr, + Object *memslot_owner, + unsigned int memslot_count, + uint64_t memslot_size); +void hvb_our_range_memslots_free(OurRangeMemslots *our_range); + +G_DEFINE_AUTOPTR_CLEANUP_FUNC(OurRangeMemslots, hvb_our_range_memslots_free) + +void hvb_our_range_memslots_ensure_mapped_additional(OurRangeMemslots *our_range, + uint64_t additional_map_size); + +#endif diff --git a/hw/hyperv/hv-balloon-page_range_tree.c b/hw/hyperv/hv-balloon-page_range_tree.c new file mode 100644 index 0000000000..e178d8b413 --- /dev/null +++ b/hw/hyperv/hv-balloon-page_range_tree.c @@ -0,0 +1,228 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "hv-balloon-internal.h" +#include "hv-balloon-page_range_tree.h" + +/* + * temporarily avoid warnings about enhanced GTree API usage requiring a + * too recent Glib version until GLIB_VERSION_MAX_ALLOWED finally reaches + * the Glib version with this API + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +/* PageRangeTree */ +static gint page_range_tree_key_compare(gconstpointer leftp, + gconstpointer rightp, + gpointer user_data) +{ + const uint64_t *left = leftp, *right = rightp; + + if (*left < *right) { + return -1; + } else if (*left > *right) { + return 1; + } else { /* *left == *right */ + return 0; + } +} + +static GTreeNode *page_range_tree_insert_new(PageRangeTree tree, + uint64_t start, uint64_t count) +{ + uint64_t *key = g_malloc(sizeof(*key)); + PageRange *range = g_malloc(sizeof(*range)); + + assert(count > 0); + + *key = range->start = start; + range->count = count; + + return g_tree_insert_node(tree.t, key, range); +} + +void hvb_page_range_tree_insert(PageRangeTree tree, + uint64_t start, uint64_t count, + uint64_t *dupcount) +{ + GTreeNode *node; + bool joinable; + uint64_t intersection; + PageRange *range; + + assert(!SUM_OVERFLOW_U64(start, count)); + if (count == 0) { + return; + } + + node = g_tree_upper_bound(tree.t, &start); + if (node) { + node = g_tree_node_previous(node); + } else { + node = g_tree_node_last(tree.t); + } + + if (node) { + range = g_tree_node_value(node); + assert(range); + intersection = page_range_intersection_size(range, start, count); + joinable = page_range_joinable_right(range, start, count); + } + + if (!node || + (!intersection && !joinable)) { + /* + * !node case: the tree is empty or the very first node in the tree + * already has a higher key (the start of its range). + * the other case: there is a gap in the tree between the new range + * and the previous one. + * anyway, let's just insert the new range into the tree. + */ + node = page_range_tree_insert_new(tree, start, count); + assert(node); + range = g_tree_node_value(node); + assert(range); + } else { + /* + * the previous range in the tree either partially covers the new + * range or ends just at its beginning - extend it + */ + if (dupcount) { + *dupcount += intersection; + } + + count += start - range->start; + range->count = MAX(range->count, count); + } + + /* check next nodes for possible merging */ + for (node = g_tree_node_next(node); node; ) { + PageRange *rangecur; + + rangecur = g_tree_node_value(node); + assert(rangecur); + + intersection = page_range_intersection_size(rangecur, + range->start, range->count); + joinable = page_range_joinable_left(rangecur, + range->start, range->count); + if (!intersection && !joinable) { + /* the current node is disjoint */ + break; + } + + if (dupcount) { + *dupcount += intersection; + } + + count = rangecur->count + (rangecur->start - range->start); + range->count = MAX(range->count, count); + + /* the current node was merged in, remove it */ + start = rangecur->start; + node = g_tree_node_next(node); + /* no hinted removal in GTree... 
*/ + g_tree_remove(tree.t, &start); + } +} + +bool hvb_page_range_tree_pop(PageRangeTree tree, PageRange *out, + uint64_t maxcount) +{ + GTreeNode *node; + PageRange *range; + + node = g_tree_node_last(tree.t); + if (!node) { + return false; + } + + range = g_tree_node_value(node); + assert(range); + + out->start = range->start; + + /* can't modify range->start as it is the node key */ + if (range->count > maxcount) { + out->start += range->count - maxcount; + out->count = maxcount; + range->count -= maxcount; + } else { + out->count = range->count; + /* no hinted removal in GTree... */ + g_tree_remove(tree.t, &out->start); + } + + return true; +} + +bool hvb_page_range_tree_intree_any(PageRangeTree tree, + uint64_t start, uint64_t count) +{ + GTreeNode *node; + + if (count == 0) { + return false; + } + + /* find the first node that can possibly intersect our range */ + node = g_tree_upper_bound(tree.t, &start); + if (node) { + /* + * a NULL node below means that the very first node in the tree + * already has a higher key (the start of its range). + */ + node = g_tree_node_previous(node); + } else { + /* a NULL node below means that the tree is empty */ + node = g_tree_node_last(tree.t); + } + /* node range start <= range start */ + + if (!node) { + /* node range start > range start */ + node = g_tree_node_first(tree.t); + } + + for ( ; node; node = g_tree_node_next(node)) { + PageRange *range = g_tree_node_value(node); + + assert(range); + /* + * if this node starts beyond or at the end of our range so does + * every next one + */ + if (range->start >= start + count) { + break; + } + + if (page_range_intersection_size(range, start, count) > 0) { + return true; + } + } + + return false; +} + +void hvb_page_range_tree_init(PageRangeTree *tree) +{ + tree->t = g_tree_new_full(page_range_tree_key_compare, NULL, + g_free, g_free); +} + +void hvb_page_range_tree_destroy(PageRangeTree *tree) +{ + /* g_tree_destroy() is not NULL-safe */ + if (!tree->t) { + return; + } + + g_tree_destroy(tree->t); + tree->t = NULL; +} diff --git a/hw/hyperv/hv-balloon-page_range_tree.h b/hw/hyperv/hv-balloon-page_range_tree.h new file mode 100644 index 0000000000..07a9ae0da6 --- /dev/null +++ b/hw/hyperv/hv-balloon-page_range_tree.h @@ -0,0 +1,118 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#ifndef HW_HYPERV_HV_BALLOON_PAGE_RANGE_TREE_H +#define HW_HYPERV_HV_BALLOON_PAGE_RANGE_TREE_H + +#include "qemu/osdep.h" + +/* PageRange */ +typedef struct PageRange { + uint64_t start; + uint64_t count; +} PageRange; + +/* return just the part of range before (start) */ +static inline void page_range_part_before(const PageRange *range, + uint64_t start, PageRange *out) +{ + uint64_t endr = range->start + range->count; + uint64_t end = MIN(endr, start); + + out->start = range->start; + if (end > out->start) { + out->count = end - out->start; + } else { + out->count = 0; + } +} + +/* return just the part of range after (start, count) */ +static inline void page_range_part_after(const PageRange *range, + uint64_t start, uint64_t count, + PageRange *out) +{ + uint64_t end = range->start + range->count; + uint64_t ends = start + count; + + out->start = MAX(range->start, ends); + if (end > out->start) { + out->count = end - out->start; + } else { + out->count = 0; + } +} + +static inline void page_range_intersect(const PageRange *range, + uint64_t start, uint64_t count, + PageRange *out) +{ + uint64_t end1 = range->start + range->count; + uint64_t end2 = start + count; + uint64_t end = MIN(end1, end2); + + out->start = MAX(range->start, start); + out->count = out->start < end ? end - out->start : 0; +} + +static inline uint64_t page_range_intersection_size(const PageRange *range, + uint64_t start, uint64_t count) +{ + PageRange trange; + + page_range_intersect(range, start, count, &trange); + return trange.count; +} + +static inline bool page_range_joinable_left(const PageRange *range, + uint64_t start, uint64_t count) +{ + return start + count == range->start; +} + +static inline bool page_range_joinable_right(const PageRange *range, + uint64_t start, uint64_t count) +{ + return range->start + range->count == start; +} + +static inline bool page_range_joinable(const PageRange *range, + uint64_t start, uint64_t count) +{ + return page_range_joinable_left(range, start, count) || + page_range_joinable_right(range, start, count); +} + +/* PageRangeTree */ +/* type safety */ +typedef struct PageRangeTree { + GTree *t; +} PageRangeTree; + +static inline bool page_range_tree_is_empty(PageRangeTree tree) +{ + guint nnodes = g_tree_nnodes(tree.t); + + return nnodes == 0; +} + +void hvb_page_range_tree_init(PageRangeTree *tree); +void hvb_page_range_tree_destroy(PageRangeTree *tree); + +bool hvb_page_range_tree_intree_any(PageRangeTree tree, + uint64_t start, uint64_t count); + +bool hvb_page_range_tree_pop(PageRangeTree tree, PageRange *out, + uint64_t maxcount); + +void hvb_page_range_tree_insert(PageRangeTree tree, + uint64_t start, uint64_t count, + uint64_t *dupcount); + +#endif diff --git a/hw/hyperv/hv-balloon-stub.c b/hw/hyperv/hv-balloon-stub.c new file mode 100644 index 0000000000..a47412d4a8 --- /dev/null +++ b/hw/hyperv/hv-balloon-stub.c @@ -0,0 +1,19 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-machine.h" +#include "qapi/qapi-types-machine.h" + +HvBalloonInfo *qmp_query_hv_balloon_status_report(Error **errp) +{ + error_setg(errp, "hv-balloon device not enabled in this build"); + return NULL; +} diff --git a/hw/hyperv/hv-balloon.c b/hw/hyperv/hv-balloon.c new file mode 100644 index 0000000000..66f297c1d7 --- /dev/null +++ b/hw/hyperv/hv-balloon.c @@ -0,0 +1,1769 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "hv-balloon-internal.h" + +#include "exec/address-spaces.h" +#include "exec/cpu-common.h" +#include "exec/ramblock.h" +#include "hw/boards.h" +#include "hw/hyperv/dynmem-proto.h" +#include "hw/hyperv/hv-balloon.h" +#include "hw/hyperv/vmbus.h" +#include "hw/mem/memory-device.h" +#include "hw/mem/pc-dimm.h" +#include "hw/qdev-core.h" +#include "hw/qdev-properties.h" +#include "monitor/qdev.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-machine.h" +#include "qapi/qapi-events-machine.h" +#include "qapi/qapi-types-machine.h" +#include "qapi/qmp/qdict.h" +#include "qapi/visitor.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "qemu/units.h" +#include "qemu/timer.h" +#include "sysemu/balloon.h" +#include "sysemu/hostmem.h" +#include "sysemu/reset.h" +#include "hv-balloon-our_range_memslots.h" +#include "hv-balloon-page_range_tree.h" +#include "trace.h" + +#define HV_BALLOON_ADDR_PROP "addr" +#define HV_BALLOON_MEMDEV_PROP "memdev" +#define HV_BALLOON_GUID "525074DC-8985-46e2-8057-A307DC18A502" + +/* + * Some Windows versions (at least Server 2019) will crash with various + * error codes when receiving DM protocol requests (at least + * DM_MEM_HOT_ADD_REQUEST) immediately after boot. + * + * It looks like Hyper-V from Server 2016 uses a 50-second after-boot + * delay, probably to workaround this issue, so we'll use this value, too. 
+ */ +#define HV_BALLOON_POST_INIT_WAIT (50 * 1000) + +#define HV_BALLOON_HA_CHUNK_SIZE (2 * GiB) +#define HV_BALLOON_HA_CHUNK_PAGES (HV_BALLOON_HA_CHUNK_SIZE / HV_BALLOON_PAGE_SIZE) + +#define HV_BALLOON_HA_MEMSLOT_SIZE_ALIGN (128 * MiB) + +#define HV_BALLOON_HR_CHUNK_PAGES 585728 +/* + * ^ that's the maximum number of pages + * that Windows returns in one hot remove response + * + * If the number requested is too high Windows will no longer honor + * these requests + */ + +struct HvBalloonClass { + VMBusDeviceClass parent_class; +} HvBalloonClass; + +typedef enum State { + /* not a real state */ + S_NO_CHANGE = 0, + + S_WAIT_RESET, + S_POST_RESET_CLOSED, + + /* init flow */ + S_VERSION, + S_CAPS, + S_POST_INIT_WAIT, + + S_IDLE, + + /* balloon op flow */ + S_BALLOON_POSTING, + S_BALLOON_RB_WAIT, + S_BALLOON_REPLY_WAIT, + + /* unballoon + hot add ops flow */ + S_UNBALLOON_POSTING, + S_UNBALLOON_RB_WAIT, + S_UNBALLOON_REPLY_WAIT, + S_HOT_ADD_SETUP, + S_HOT_ADD_RB_WAIT, + S_HOT_ADD_POSTING, + S_HOT_ADD_REPLY_WAIT, +} State; + +typedef struct StateDesc { + State state; + const char *desc; +} StateDesc; + +typedef struct HvBalloon { + VMBusDevice parent; + State state; + + union dm_version version; + union dm_caps caps; + + QEMUTimer post_init_timer; + + unsigned int trans_id; + + struct { + bool enabled; + bool received; + uint64_t committed; + uint64_t available; + } status_report; + + /* Guest target size */ + uint64_t target; + bool target_changed; + + /* Current (un)balloon / hot-add operation parameters */ + union { + uint64_t balloon_diff; + + struct { + uint64_t unballoon_diff; + uint64_t hot_add_diff; + }; + + struct { + PageRange hot_add_range; + uint64_t ha_current_count; + }; + }; + + OurRangeMemslots *our_range; + + /* Count of memslots covering our memory */ + unsigned int memslot_count; + + /* Nominal size of each memslot (the last one might be smaller) */ + uint64_t memslot_size; + + /* Non-ours removed memory */ + PageRangeTree removed_guest, removed_both; + + /* Grand totals of removed memory (both ours and non-ours) */ + uint64_t removed_guest_ctr, removed_both_ctr; + + /* MEMORY_DEVICE props */ + uint64_t addr; + HostMemoryBackend *hostmem; + MemoryRegion *mr; +} HvBalloon; + +OBJECT_DEFINE_TYPE_WITH_INTERFACES(HvBalloon, hv_balloon, HV_BALLOON, VMBUS_DEVICE, \ + { TYPE_MEMORY_DEVICE }, { }) + +#define HV_BALLOON_SET_STATE(hvb, news) \ + do { \ + assert(news != S_NO_CHANGE); \ + hv_balloon_state_set(hvb, news, # news); \ + } while (0) + +#define HV_BALLOON_STATE_DESC_SET(stdesc, news) \ + _hv_balloon_state_desc_set(stdesc, news, # news) + +#define HV_BALLOON_STATE_DESC_INIT \ + { \ + .state = S_NO_CHANGE, \ + } + +typedef struct HvBalloonReq { + VMBusChanReq vmreq; +} HvBalloonReq; + +/* total our memory includes parts currently removed from the guest */ +static uint64_t hv_balloon_total_our_ram(HvBalloon *balloon) +{ + if (!balloon->our_range) { + return 0; + } + + return balloon->our_range->range.added; +} + +/* TODO: unify the code below with virtio-balloon and cache the value */ +static int build_dimm_list(Object *obj, void *opaque) +{ + GSList **list = opaque; + + if (object_dynamic_cast(obj, TYPE_PC_DIMM)) { + DeviceState *dev = DEVICE(obj); + if (dev->realized) { /* only realized DIMMs matter */ + *list = g_slist_prepend(*list, dev); + } + } + + object_child_foreach(obj, build_dimm_list, opaque); + return 0; +} + +static ram_addr_t get_current_ram_size(void) +{ + GSList *list = NULL, *item; + ram_addr_t size = current_machine->ram_size; + + 
build_dimm_list(qdev_get_machine(), &list); + for (item = list; item; item = g_slist_next(item)) { + Object *obj = OBJECT(item->data); + if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) + size += object_property_get_int(obj, PC_DIMM_SIZE_PROP, + &error_abort); + } + g_slist_free(list); + + return size; +} + +/* total RAM includes memory currently removed from the guest */ +static uint64_t hv_balloon_total_ram(HvBalloon *balloon) +{ + ram_addr_t ram_size = get_current_ram_size(); + uint64_t ram_size_pages = ram_size >> HV_BALLOON_PFN_SHIFT; + uint64_t our_ram_size_pages = hv_balloon_total_our_ram(balloon); + + assert(ram_size_pages > 0); + + return SUM_SATURATE_U64(ram_size_pages, our_ram_size_pages); +} + +/* + * calculating the total RAM size is a slow operation, + * avoid it as much as possible + */ +static uint64_t hv_balloon_total_removed_rs(HvBalloon *balloon, + uint64_t ram_size_pages) +{ + uint64_t total_removed; + + total_removed = SUM_SATURATE_U64(balloon->removed_guest_ctr, + balloon->removed_both_ctr); + + /* possible if guest returns pages outside actual RAM */ + if (total_removed > ram_size_pages) { + total_removed = ram_size_pages; + } + + return total_removed; +} + +/* Returns whether the state has actually changed */ +static bool hv_balloon_state_set(HvBalloon *balloon, + State newst, const char *newststr) +{ + if (newst == S_NO_CHANGE || balloon->state == newst) { + return false; + } + + balloon->state = newst; + trace_hv_balloon_state_change(newststr); + return true; +} + +static void _hv_balloon_state_desc_set(StateDesc *stdesc, + State newst, const char *newststr) +{ + /* state setting is only permitted on a freshly init desc */ + assert(stdesc->state == S_NO_CHANGE); + + assert(newst != S_NO_CHANGE); + + stdesc->state = newst; + stdesc->desc = newststr; +} + +static VMBusChannel *hv_balloon_get_channel_maybe(HvBalloon *balloon) +{ + return vmbus_device_channel(&balloon->parent, 0); +} + +static VMBusChannel *hv_balloon_get_channel(HvBalloon *balloon) +{ + VMBusChannel *chan; + + chan = hv_balloon_get_channel_maybe(balloon); + assert(chan != NULL); + return chan; +} + +static ssize_t hv_balloon_send_packet(VMBusChannel *chan, + struct dm_message *msg) +{ + int ret; + + ret = vmbus_channel_reserve(chan, 0, msg->hdr.size); + if (ret < 0) { + return ret; + } + + return vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND, + NULL, 0, msg, msg->hdr.size, false, + msg->hdr.trans_id); +} + +static bool hv_balloon_unballoon_get_source(HvBalloon *balloon, + PageRangeTree *dtree, + uint64_t **dctr, + bool *is_our_range) +{ + OurRange *our_range = OUR_RANGE(balloon->our_range); + + /* Try the boot memory first */ + if (g_tree_nnodes(balloon->removed_guest.t) > 0) { + *dtree = balloon->removed_guest; + *dctr = &balloon->removed_guest_ctr; + *is_our_range = false; + } else if (g_tree_nnodes(balloon->removed_both.t) > 0) { + *dtree = balloon->removed_both; + *dctr = &balloon->removed_both_ctr; + *is_our_range = false; + } else if (!our_range) { + return false; + } else if (!our_range_is_removed_tree_empty(our_range, false)) { + *dtree = our_range_get_removed_tree(our_range, false); + *dctr = &balloon->removed_guest_ctr; + *is_our_range = true; + } else if (!our_range_is_removed_tree_empty(our_range, true)) { + *dtree = our_range_get_removed_tree(our_range, true); + *dctr = &balloon->removed_both_ctr; + *is_our_range = true; + } else { + return false; + } + + return true; +} + +static void hv_balloon_unballoon_rb_wait(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan = 
+static bool hv_balloon_unballoon_get_source(HvBalloon *balloon,
+                                            PageRangeTree *dtree,
+                                            uint64_t **dctr,
+                                            bool *is_our_range)
+{
+    OurRange *our_range = OUR_RANGE(balloon->our_range);
+
+    /* Try the boot memory first */
+    if (g_tree_nnodes(balloon->removed_guest.t) > 0) {
+        *dtree = balloon->removed_guest;
+        *dctr = &balloon->removed_guest_ctr;
+        *is_our_range = false;
+    } else if (g_tree_nnodes(balloon->removed_both.t) > 0) {
+        *dtree = balloon->removed_both;
+        *dctr = &balloon->removed_both_ctr;
+        *is_our_range = false;
+    } else if (!our_range) {
+        return false;
+    } else if (!our_range_is_removed_tree_empty(our_range, false)) {
+        *dtree = our_range_get_removed_tree(our_range, false);
+        *dctr = &balloon->removed_guest_ctr;
+        *is_our_range = true;
+    } else if (!our_range_is_removed_tree_empty(our_range, true)) {
+        *dtree = our_range_get_removed_tree(our_range, true);
+        *dctr = &balloon->removed_both_ctr;
+        *is_our_range = true;
+    } else {
+        return false;
+    }
+
+    return true;
+}
+
+static void hv_balloon_unballoon_rb_wait(HvBalloon *balloon, StateDesc *stdesc)
+{
+    VMBusChannel *chan = hv_balloon_get_channel(balloon);
+    struct dm_unballoon_request *ur;
+    size_t ur_size = sizeof(*ur) + sizeof(ur->range_array[0]);
+
+    assert(balloon->state == S_UNBALLOON_RB_WAIT);
+
+    if (vmbus_channel_reserve(chan, 0, ur_size) < 0) {
+        return;
+    }
+
+    HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_POSTING);
+}
+
+static void hv_balloon_unballoon_posting(HvBalloon *balloon, StateDesc *stdesc)
+{
+    VMBusChannel *chan = hv_balloon_get_channel(balloon);
+    PageRangeTree dtree;
+    uint64_t *dctr;
+    bool our_range;
+    struct dm_unballoon_request *ur;
+    size_t ur_size = sizeof(*ur) + sizeof(ur->range_array[0]);
+    PageRange range;
+    bool bret;
+    ssize_t ret;
+
+    assert(balloon->state == S_UNBALLOON_POSTING);
+    assert(balloon->unballoon_diff > 0);
+
+    if (!hv_balloon_unballoon_get_source(balloon, &dtree, &dctr, &our_range)) {
+        error_report("trying to unballoon but nothing seems to be ballooned");
+        /*
+         * there is little we can do as we might have already
+         * sent the guest a partial request we can't cancel
+         */
+        return;
+    }
+
+    assert(balloon->our_range || !our_range);
+    assert(dtree.t);
+    assert(dctr);
+
+    ur = alloca(ur_size);
+    memset(ur, 0, ur_size);
+    ur->hdr.type = DM_UNBALLOON_REQUEST;
+    ur->hdr.size = ur_size;
+    ur->hdr.trans_id = balloon->trans_id;
+
+    bret = hvb_page_range_tree_pop(dtree, &range, MIN(balloon->unballoon_diff,
+                                                      HV_BALLOON_HA_CHUNK_PAGES));
+    assert(bret);
+    /* TODO: madvise? */
+
+    *dctr -= range.count;
+    balloon->unballoon_diff -= range.count;
+
+    ur->range_count = 1;
+    ur->range_array[0].finfo.start_page = range.start;
+    ur->range_array[0].finfo.page_cnt = range.count;
+    ur->more_pages = balloon->unballoon_diff > 0;
+
+    trace_hv_balloon_outgoing_unballoon(ur->hdr.trans_id,
+                                        range.count, range.start,
+                                        balloon->unballoon_diff);
+
+    if (ur->more_pages) {
+        HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_RB_WAIT);
+    } else {
+        HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_REPLY_WAIT);
+    }
+
+    ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
+                             NULL, 0, ur, ur_size, false,
+                             ur->hdr.trans_id);
+    if (ret <= 0) {
+        error_report("error %zd when posting unballoon msg, expect problems",
+                     ret);
+    }
+}
+
+static bool hv_balloon_our_range_ensure(HvBalloon *balloon)
+{
+    uint64_t align;
+    MemoryRegion *hostmem_mr;
+    g_autoptr(OurRangeMemslots) our_range_memslots = NULL;
+    OurRange *our_range;
+
+    if (balloon->our_range) {
+        return true;
+    }
+
+    if (!balloon->hostmem) {
+        return false;
+    }
+
+    align = (1 << balloon->caps.cap_bits.hot_add_alignment) * MiB;
+    assert(QEMU_IS_ALIGNED(balloon->addr, align));
+
+    hostmem_mr = host_memory_backend_get_memory(balloon->hostmem);
+
+    our_range_memslots = hvb_our_range_memslots_new(balloon->addr,
+                                                    balloon->mr, hostmem_mr,
+                                                    OBJECT(balloon),
+                                                    balloon->memslot_count,
+                                                    balloon->memslot_size);
+    our_range = OUR_RANGE(our_range_memslots);
+
+    if (hvb_page_range_tree_intree_any(balloon->removed_guest,
+                                       our_range->range.start,
+                                       our_range->range.count) ||
+        hvb_page_range_tree_intree_any(balloon->removed_both,
+                                       our_range->range.start,
+                                       our_range->range.count)) {
+        error_report("some parts of the memory backend were already returned by the guest. "
+                     "this should not happen, please reboot the guest and try again");
+        return false;
+    }
+
+    trace_hv_balloon_our_range_add(our_range->range.count,
+                                   our_range->range.start);
+
+    balloon->our_range = g_steal_pointer(&our_range_memslots);
+    return true;
+}
+
+static void hv_balloon_hot_add_setup(HvBalloon *balloon, StateDesc *stdesc)
+{
+    /* need to make copy since it is in union with hot_add_range */
+    uint64_t hot_add_diff = balloon->hot_add_diff;
+    PageRange *hot_add_range = &balloon->hot_add_range;
+    uint64_t align, our_range_remaining;
+    OurRange *our_range;
+
+    assert(balloon->state == S_HOT_ADD_SETUP);
+    assert(hot_add_diff > 0);
+
+    if (!hv_balloon_our_range_ensure(balloon)) {
+        goto ret_idle;
+    }
+
+    our_range = OUR_RANGE(balloon->our_range);
+
+    align = (1 << balloon->caps.cap_bits.hot_add_alignment) *
+        (MiB / HV_BALLOON_PAGE_SIZE);
+
+    /* Absolute GPA in pages */
+    hot_add_range->start = our_range_get_remaining_start(our_range);
+    assert(QEMU_IS_ALIGNED(hot_add_range->start, align));
+
+    our_range_remaining = our_range_get_remaining_size(our_range);
+    hot_add_range->count = MIN(our_range_remaining, hot_add_diff);
+    hot_add_range->count = QEMU_ALIGN_DOWN(hot_add_range->count, align);
+    if (hot_add_range->count == 0) {
+        goto ret_idle;
+    }
+
+    hvb_our_range_memslots_ensure_mapped_additional(balloon->our_range,
+                                                    hot_add_range->count);
+
+    HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_RB_WAIT);
+    return;
+
+ret_idle:
+    HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
+}
+
+static void hv_balloon_hot_add_rb_wait(HvBalloon *balloon, StateDesc *stdesc)
+{
+    VMBusChannel *chan = hv_balloon_get_channel(balloon);
+    struct dm_hot_add *ha;
+    size_t ha_size = sizeof(*ha) + sizeof(ha->range);
+
+    assert(balloon->state == S_HOT_ADD_RB_WAIT);
+
+    if (vmbus_channel_reserve(chan, 0, ha_size) < 0) {
+        return;
+    }
+
+    HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_POSTING);
+}
+
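+/*
+ * Worked example (added for illustration): with the Windows-reported
+ * hot_add_alignment exponent set to 5, align = (1 << 5) * (MiB / 4 KiB) =
+ * 8192 pages (32 MiB). Since that is smaller than HV_BALLOON_HA_CHUNK_PAGES
+ * (2 GiB = 524288 pages), chunk_max_size = QEMU_ALIGN_DOWN(524288, 8192) =
+ * 524288 pages, so each DM_MEM_HOT_ADD_REQUEST below covers at most 2 GiB.
+ */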
+static void hv_balloon_hot_add_posting(HvBalloon *balloon, StateDesc *stdesc)
+{
+    PageRange *hot_add_range = &balloon->hot_add_range;
+    uint64_t *current_count = &balloon->ha_current_count;
+    VMBusChannel *chan = hv_balloon_get_channel(balloon);
+    struct dm_hot_add *ha;
+    size_t ha_size = sizeof(*ha) + sizeof(ha->range);
+    union dm_mem_page_range *ha_region;
+    uint64_t align, chunk_max_size;
+    ssize_t ret;
+
+    assert(balloon->state == S_HOT_ADD_POSTING);
+    assert(hot_add_range->count > 0);
+
+    align = (1 << balloon->caps.cap_bits.hot_add_alignment) *
+        (MiB / HV_BALLOON_PAGE_SIZE);
+    if (align >= HV_BALLOON_HA_CHUNK_PAGES) {
+        /*
+         * If the required alignment is higher than the chunk size we let it
+         * override that size.
+         */
+        chunk_max_size = align;
+    } else {
+        chunk_max_size = QEMU_ALIGN_DOWN(HV_BALLOON_HA_CHUNK_PAGES, align);
+    }
+
+    /*
+     * hot_add_range->count starts aligned in hv_balloon_hot_add_setup(),
+     * then it is either reduced by subtracting aligned current_count or
+     * further hot-adds are prevented by marking the whole remaining our range
+     * as unusable in hv_balloon_handle_hot_add_response().
+     */
+    *current_count = MIN(hot_add_range->count, chunk_max_size);
+
+    ha = alloca(ha_size);
+    ha_region = &(&ha->range)[1];
+    memset(ha, 0, ha_size);
+    ha->hdr.type = DM_MEM_HOT_ADD_REQUEST;
+    ha->hdr.size = ha_size;
+    ha->hdr.trans_id = balloon->trans_id;
+
+    ha->range.finfo.start_page = hot_add_range->start;
+    ha->range.finfo.page_cnt = *current_count;
+    ha_region->finfo.start_page = hot_add_range->start;
+    ha_region->finfo.page_cnt = ha->range.finfo.page_cnt;
+
+    trace_hv_balloon_outgoing_hot_add(ha->hdr.trans_id,
+                                      *current_count, hot_add_range->start);
+
+    ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
+                             NULL, 0, ha, ha_size, false,
+                             ha->hdr.trans_id);
+    if (ret <= 0) {
+        error_report("error %zd when posting hot add msg, expect problems",
+                     ret);
+    }
+
+    HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_REPLY_WAIT);
+}
+
+static void hv_balloon_balloon_rb_wait(HvBalloon *balloon, StateDesc *stdesc)
+{
+    VMBusChannel *chan = hv_balloon_get_channel(balloon);
+    size_t bl_size = sizeof(struct dm_balloon);
+
+    assert(balloon->state == S_BALLOON_RB_WAIT);
+
+    if (vmbus_channel_reserve(chan, 0, bl_size) < 0) {
+        return;
+    }
+
+    HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_POSTING);
+}
+
+static void hv_balloon_balloon_posting(HvBalloon *balloon, StateDesc *stdesc)
+{
+    VMBusChannel *chan = hv_balloon_get_channel(balloon);
+    struct dm_balloon bl;
+    size_t bl_size = sizeof(bl);
+    ssize_t ret;
+
+    assert(balloon->state == S_BALLOON_POSTING);
+    assert(balloon->balloon_diff > 0);
+
+    memset(&bl, 0, sizeof(bl));
+    bl.hdr.type = DM_BALLOON_REQUEST;
+    bl.hdr.size = bl_size;
+    bl.hdr.trans_id = balloon->trans_id;
+    bl.num_pages = MIN(balloon->balloon_diff, HV_BALLOON_HR_CHUNK_PAGES);
+
+    trace_hv_balloon_outgoing_balloon(bl.hdr.trans_id, bl.num_pages,
+                                      balloon->balloon_diff);
+
+    ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
+                             NULL, 0, &bl, bl_size, false,
+                             bl.hdr.trans_id);
+    if (ret <= 0) {
+        error_report("error %zd when posting balloon msg, expect problems",
+                     ret);
+    }
+
+    HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_REPLY_WAIT);
+}
+
+static void hv_balloon_idle_state_process_target(HvBalloon *balloon,
+                                                 StateDesc *stdesc)
+{
+    bool can_balloon = balloon->caps.cap_bits.balloon;
+    uint64_t ram_size_pages, total_removed;
+
+    ram_size_pages = hv_balloon_total_ram(balloon);
+    total_removed = hv_balloon_total_removed_rs(balloon, ram_size_pages);
+
+    /*
+     * we need to cache the values computed from the balloon target value when
+     * starting the adjustment procedure in case someone changes the target
+     * while the procedure is in progress
+     */
+    if (balloon->target > ram_size_pages - total_removed) {
+        bool can_hot_add = balloon->caps.cap_bits.hot_add;
+        uint64_t target_diff = balloon->target -
+            (ram_size_pages - total_removed);
+
+        balloon->unballoon_diff = MIN(target_diff, total_removed);
+
+        if (can_hot_add) {
+            balloon->hot_add_diff = target_diff - balloon->unballoon_diff;
+        } else {
+            balloon->hot_add_diff = 0;
+        }
+
+        if (balloon->unballoon_diff > 0) {
+            assert(can_balloon);
+            HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_RB_WAIT);
+        } else if (balloon->hot_add_diff > 0) {
+            HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_SETUP);
+        }
+    } else if (can_balloon &&
+               balloon->target < ram_size_pages - total_removed) {
+        balloon->balloon_diff = ram_size_pages - total_removed -
+            balloon->target;
+        HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_RB_WAIT);
+    }
+}
+
+static void hv_balloon_idle_state(HvBalloon *balloon,
+                                  StateDesc *stdesc)
+{
+    assert(balloon->state == S_IDLE);
+
+    if (balloon->target_changed) {
+        balloon->target_changed = false;
+        hv_balloon_idle_state_process_target(balloon, stdesc);
+        return;
+    }
+}
+
+static const struct {
+    void (*handler)(HvBalloon *balloon, StateDesc *stdesc);
+} state_handlers[] = {
+    [S_IDLE].handler = hv_balloon_idle_state,
+    [S_BALLOON_POSTING].handler = hv_balloon_balloon_posting,
+    [S_BALLOON_RB_WAIT].handler = hv_balloon_balloon_rb_wait,
+    [S_UNBALLOON_POSTING].handler = hv_balloon_unballoon_posting,
+    [S_UNBALLOON_RB_WAIT].handler = hv_balloon_unballoon_rb_wait,
+    [S_HOT_ADD_SETUP].handler = hv_balloon_hot_add_setup,
+    [S_HOT_ADD_RB_WAIT].handler = hv_balloon_hot_add_rb_wait,
+    [S_HOT_ADD_POSTING].handler = hv_balloon_hot_add_posting,
+};
+
+static void hv_balloon_handle_state(HvBalloon *balloon, StateDesc *stdesc)
+{
+    if (balloon->state >= ARRAY_SIZE(state_handlers) ||
+        !state_handlers[balloon->state].handler) {
+        return;
+    }
+
+    state_handlers[balloon->state].handler(balloon, stdesc);
+}
+
+static void hv_balloon_remove_response_insert_range(PageRangeTree tree,
+                                                    const PageRange *range,
+                                                    uint64_t *ctr1,
+                                                    uint64_t *ctr2,
+                                                    uint64_t *ctr3)
+{
+    uint64_t dupcount, effcount;
+
+    if (range->count == 0) {
+        return;
+    }
+
+    dupcount = 0;
+    hvb_page_range_tree_insert(tree, range->start, range->count, &dupcount);
+
+    assert(dupcount <= range->count);
+    effcount = range->count - dupcount;
+
+    *ctr1 += effcount;
+    *ctr2 += effcount;
+    if (ctr3) {
+        *ctr3 += effcount;
+    }
+}
+
+static void hv_balloon_remove_response_handle_range(HvBalloon *balloon,
+                                                    PageRange *range,
+                                                    bool both,
+                                                    uint64_t *removedctr)
+{
+    OurRange *our_range = OUR_RANGE(balloon->our_range);
+    PageRangeTree globaltree =
+        both ? balloon->removed_both : balloon->removed_guest;
+    uint64_t *globalctr =
+        both ? &balloon->removed_both_ctr : &balloon->removed_guest_ctr;
+    PageRange rangeeff;
+
+    if (range->count == 0) {
+        return;
+    }
+
+    trace_hv_balloon_remove_response(range->count, range->start, both);
+
+    if (our_range) {
+        /* Includes the not-yet-hot-added and unusable parts. */
+        rangeeff = our_range->range;
+    } else {
+        rangeeff.start = rangeeff.count = 0;
+    }
+
+    if (page_range_intersection_size(range, rangeeff.start, rangeeff.count) > 0) {
+        PageRangeTree ourtree = our_range_get_removed_tree(our_range, both);
+        PageRange rangehole, rangecommon;
+        uint64_t ourremoved = 0;
+
+        /* process the hole before our range, if it exists */
+        page_range_part_before(range, rangeeff.start, &rangehole);
+        hv_balloon_remove_response_insert_range(globaltree, &rangehole,
+                                                globalctr, removedctr, NULL);
+        if (rangehole.count > 0) {
+            trace_hv_balloon_remove_response_hole(rangehole.count,
+                                                  rangehole.start,
+                                                  range->count, range->start,
+                                                  rangeeff.start, both);
+        }
+
+        /* process our part */
+        page_range_intersect(range, rangeeff.start, rangeeff.count,
+                             &rangecommon);
+        hv_balloon_remove_response_insert_range(ourtree, &rangecommon,
+                                                globalctr, removedctr,
+                                                &ourremoved);
+        if (rangecommon.count > 0) {
+            trace_hv_balloon_remove_response_common(rangecommon.count,
+                                                    rangecommon.start,
+                                                    range->count, range->start,
+                                                    rangeeff.count,
+                                                    rangeeff.start, ourremoved,
+                                                    both);
+        }
+
+        /* calculate what's left after our range */
+        rangecommon = *range;
+        page_range_part_after(&rangecommon, rangeeff.start, rangeeff.count,
+                              range);
+    }
+
+    /* process the remainder of the range that lies after our range */
+    if (range->count > 0) {
+        hv_balloon_remove_response_insert_range(globaltree, range,
+                                                globalctr, removedctr, NULL);
+        trace_hv_balloon_remove_response_remainder(range->count, range->start,
+                                                   both);
+        range->count = 0;
+    }
+}
+
+static void hv_balloon_remove_response_handle_pages(HvBalloon *balloon,
+                                                    PageRange *range,
+                                                    uint64_t start,
+                                                    uint64_t count,
+                                                    bool both,
+                                                    uint64_t *removedctr)
+{
+    assert(count > 0);
+
+    /*
+     * if there is an existing range that the new range can't be joined to
+     * dump it into tree(s)
+     */
+    if (range->count > 0 && !page_range_joinable(range, start, count)) {
+        hv_balloon_remove_response_handle_range(balloon, range, both,
+                                                removedctr);
+    }
+
+    if (range->count == 0) {
+        range->start = start;
+        range->count = count;
+    } else if (page_range_joinable_left(range, start, count)) {
+        range->start = start;
+        range->count += count;
+    } else { /* page_range_joinable_right() */
+        range->count += count;
+    }
+}
+
+static gboolean hv_balloon_handle_remove_host_addr_node(gpointer key,
+                                                        gpointer value,
+                                                        gpointer data)
+{
+    PageRange *range = value;
+    uint64_t pageoff;
+
+    for (pageoff = 0; pageoff < range->count; ) {
+        uint64_t addr_64 = (range->start + pageoff) * HV_BALLOON_PAGE_SIZE;
+        void *addr;
+        RAMBlock *rb;
+        ram_addr_t rb_offset;
+        size_t rb_page_size;
+        size_t discard_size;
+
+        assert(addr_64 <= UINTPTR_MAX);
+        addr = (void *)((uintptr_t)addr_64);
+        rb = qemu_ram_block_from_host(addr, false, &rb_offset);
+        rb_page_size = qemu_ram_pagesize(rb);
+
+        if (rb_page_size != HV_BALLOON_PAGE_SIZE) {
+            /* TODO: these should end in "removed_guest" */
+            warn_report("guest reported removed page backed by unsupported page size %zu",
+                        rb_page_size);
+            pageoff++;
+            continue;
+        }
+
+        discard_size = MIN(range->count - pageoff,
+                           (rb->max_length - rb_offset) /
+                           HV_BALLOON_PAGE_SIZE);
+        discard_size = MAX(discard_size, 1);
+
+        if (ram_block_discard_range(rb, rb_offset, discard_size *
+                                    HV_BALLOON_PAGE_SIZE) != 0) {
+            warn_report("guest reported removed page failed discard");
+        }
+
+        pageoff += discard_size;
+    }
+
+    return false;
+}
+
+static void hv_balloon_handle_remove_host_addr_tree(PageRangeTree tree)
+{
+    g_tree_foreach(tree.t, hv_balloon_handle_remove_host_addr_node, NULL);
+}
+
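+/*
+ * Note added for illustration: PageRange is {start, count} in 4 KiB page
+ * units, so e.g. { .start = 0x100000, .count = 0x200 } describes 2 MiB of
+ * guest address space starting at GPA 4 GiB.
+ */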
+static int hv_balloon_handle_remove_section(PageRangeTree tree,
+                                            const MemoryRegionSection *section,
+                                            uint64_t count)
+{
+    void *addr = memory_region_get_ram_ptr(section->mr) +
+        section->offset_within_region;
+    uint64_t addr_page;
+
+    assert(count > 0);
+
+    if ((uintptr_t)addr % HV_BALLOON_PAGE_SIZE) {
+        warn_report("guest reported removed pages at an unaligned host addr %p",
+                    addr);
+        return -EINVAL;
+    }
+
+    addr_page = (uintptr_t)addr / HV_BALLOON_PAGE_SIZE;
+    hvb_page_range_tree_insert(tree, addr_page, count, NULL);
+
+    return 0;
+}
+
+static void hv_balloon_handle_remove_ranges(HvBalloon *balloon,
+                                            union dm_mem_page_range ranges[],
+                                            uint32_t count)
+{
+    uint64_t removedcnt;
+    PageRangeTree removed_host_addr;
+    PageRange range_guest, range_both;
+
+    hvb_page_range_tree_init(&removed_host_addr);
+    range_guest.count = range_both.count = removedcnt = 0;
+    for (unsigned int ctr = 0; ctr < count; ctr++) {
+        union dm_mem_page_range *mr = &ranges[ctr];
+        hwaddr pa;
+        MemoryRegionSection section;
+
+        for (unsigned int offset = 0; offset < mr->finfo.page_cnt; ) {
+            int ret;
+            uint64_t pageno = mr->finfo.start_page + offset;
+            uint64_t pagecnt = 1;
+
+            pa = (hwaddr)pageno << HV_BALLOON_PFN_SHIFT;
+            section = memory_region_find(get_system_memory(), pa,
+                                         (mr->finfo.page_cnt - offset) *
+                                         HV_BALLOON_PAGE_SIZE);
+            if (!section.mr) {
+                warn_report("guest reported removed page %"PRIu64" not found in RAM",
+                            pageno);
+                ret = -EINVAL;
+                goto finish_page;
+            }
+
+            pagecnt = int128_get64(section.size) / HV_BALLOON_PAGE_SIZE;
+            if (pagecnt <= 0) {
+                warn_report("guest reported removed page %"PRIu64" in a section smaller than page size",
+                            pageno);
+                pagecnt = 1; /* skip the whole page */
+                ret = -EINVAL;
+                goto finish_page;
+            }
+
+            if (!memory_region_is_ram(section.mr) ||
+                memory_region_is_rom(section.mr) ||
+                memory_region_is_romd(section.mr)) {
+                warn_report("guest reported removed page %"PRIu64" in a section that is not an ordinary RAM",
+                            pageno);
+                ret = -EINVAL;
+                goto finish_page;
+            }
+
+            ret = hv_balloon_handle_remove_section(removed_host_addr, &section,
+                                                   pagecnt);
+
+        finish_page:
+            if (ret == 0) {
+                hv_balloon_remove_response_handle_pages(balloon,
+                                                        &range_both,
+                                                        pageno, pagecnt,
+                                                        true, &removedcnt);
+            } else {
+                hv_balloon_remove_response_handle_pages(balloon,
+                                                        &range_guest,
+                                                        pageno, pagecnt,
+                                                        false, &removedcnt);
+            }
+
+            if (section.mr) {
+                memory_region_unref(section.mr);
+            }
+
+            offset += pagecnt;
+        }
+    }
+
+    hv_balloon_remove_response_handle_range(balloon, &range_both, true,
+                                            &removedcnt);
+    hv_balloon_remove_response_handle_range(balloon, &range_guest, false,
+                                            &removedcnt);
+
+    hv_balloon_handle_remove_host_addr_tree(removed_host_addr);
+    hvb_page_range_tree_destroy(&removed_host_addr);
+
+    if (removedcnt > balloon->balloon_diff) {
+        warn_report("guest reported more pages removed than currently pending (%"PRIu64" vs %"PRIu64")",
+                    removedcnt, balloon->balloon_diff);
+        balloon->balloon_diff = 0;
+    } else {
+        balloon->balloon_diff -= removedcnt;
+    }
+}
+
+static bool hv_balloon_handle_msg_size(HvBalloonReq *req, size_t minsize,
+                                       const char *msgname)
+{
+    VMBusChanReq *vmreq = &req->vmreq;
+    uint32_t msglen = vmreq->msglen;
+
+    if (msglen >= minsize) {
+        return true;
+    }
+
+    warn_report("%s message too short (%u vs %zu), ignoring", msgname,
+                (unsigned int)msglen, minsize);
+    return false;
+}
+
+static void hv_balloon_handle_version_request(HvBalloon *balloon,
+                                              HvBalloonReq *req,
+                                              StateDesc *stdesc)
+{
+    VMBusChanReq *vmreq = &req->vmreq;
+    struct dm_version_request *msgVr = vmreq->msg;
+    struct dm_version_response respVr;
+
+    if (balloon->state != S_VERSION) {
+        warn_report("unexpected DM_VERSION_REQUEST in %d state",
+                    balloon->state);
+        return;
+    }
+
+    if (!hv_balloon_handle_msg_size(req, sizeof(*msgVr),
+                                    "DM_VERSION_REQUEST")) {
+        return;
+    }
+
+    trace_hv_balloon_incoming_version(msgVr->version.major_version,
+                                      msgVr->version.minor_version);
+
+    memset(&respVr, 0, sizeof(respVr));
+    respVr.hdr.type = DM_VERSION_RESPONSE;
+    respVr.hdr.size = sizeof(respVr);
+    respVr.hdr.trans_id = msgVr->hdr.trans_id;
+    respVr.is_accepted = msgVr->version.version >= DYNMEM_PROTOCOL_VERSION_1 &&
+        msgVr->version.version <= DYNMEM_PROTOCOL_VERSION_3;
+
+    hv_balloon_send_packet(vmreq->chan, (struct dm_message *)&respVr);
+
+    if (respVr.is_accepted) {
+        HV_BALLOON_STATE_DESC_SET(stdesc, S_CAPS);
+    }
+}
+
+static void hv_balloon_handle_caps_report(HvBalloon *balloon,
+                                          HvBalloonReq *req,
+                                          StateDesc *stdesc)
+{
+    VMBusChanReq *vmreq = &req->vmreq;
+    struct dm_capabilities *msgCap = vmreq->msg;
+    struct dm_capabilities_resp_msg respCap;
+
+    if (balloon->state != S_CAPS) {
+        warn_report("unexpected DM_CAPABILITIES_REPORT in %d state",
+                    balloon->state);
+        return;
+    }
+
+    if (!hv_balloon_handle_msg_size(req, sizeof(*msgCap),
+                                    "DM_CAPABILITIES_REPORT")) {
+        return;
+    }
+
+    trace_hv_balloon_incoming_caps(msgCap->caps.caps);
+    balloon->caps = msgCap->caps;
+
+    memset(&respCap, 0, sizeof(respCap));
+    respCap.hdr.type = DM_CAPABILITIES_RESPONSE;
+    respCap.hdr.size = sizeof(respCap);
+    respCap.hdr.trans_id = msgCap->hdr.trans_id;
+    respCap.is_accepted = 1;
+    respCap.hot_remove = 1;
+    respCap.suppress_pressure_reports = !balloon->status_report.enabled;
+    hv_balloon_send_packet(vmreq->chan, (struct dm_message *)&respCap);
+
+    timer_mod(&balloon->post_init_timer,
+              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+              HV_BALLOON_POST_INIT_WAIT);
+
+    HV_BALLOON_STATE_DESC_SET(stdesc, S_POST_INIT_WAIT);
+}
+
+static void hv_balloon_handle_status_report(HvBalloon *balloon,
+                                            HvBalloonReq *req)
+{
+    VMBusChanReq *vmreq = &req->vmreq;
+    struct dm_status *msgStatus = vmreq->msg;
+
+    if (!hv_balloon_handle_msg_size(req, sizeof(*msgStatus),
+                                    "DM_STATUS_REPORT")) {
+        return;
+    }
+
+    if (!balloon->status_report.enabled) {
+        return;
+    }
+
+    balloon->status_report.committed = msgStatus->num_committed;
+    balloon->status_report.committed *= HV_BALLOON_PAGE_SIZE;
+    balloon->status_report.available = msgStatus->num_avail;
+    balloon->status_report.available *= HV_BALLOON_PAGE_SIZE;
+    balloon->status_report.received = true;
+
+    qapi_event_send_hv_balloon_status_report(balloon->status_report.committed,
+                                             balloon->status_report.available);
+}
+
+HvBalloonInfo *qmp_query_hv_balloon_status_report(Error **errp)
+{
+    HvBalloon *balloon;
+    HvBalloonInfo *info;
+
+    balloon = HV_BALLOON(object_resolve_path_type("", TYPE_HV_BALLOON, NULL));
+    if (!balloon) {
+        error_setg(errp, "no %s device present", TYPE_HV_BALLOON);
+        return NULL;
+    }
+
+    if (!balloon->status_report.enabled) {
+        error_setg(errp, "guest memory status reporting not enabled");
+        return NULL;
+    }
+
+    if (!balloon->status_report.received) {
+        error_setg(errp, "no guest memory status report received yet");
+        return NULL;
+    }
+
+    info = g_malloc0(sizeof(*info));
+    info->committed = balloon->status_report.committed;
+    info->available = balloon->status_report.available;
+    return info;
+}
+
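+/*
+ * Example usage (illustrative; the numbers are invented): with a Windows
+ * guest running and the device created with status-report=on, the QMP
+ * command backed by the function above returns the guest-reported totals
+ * in bytes:
+ *
+ *   -> { "execute": "query-hv-balloon-status-report" }
+ *   <- { "return": { "committed": 816640000, "available": 3333054464 } }
+ */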
+static void hv_balloon_handle_unballoon_response(HvBalloon *balloon,
+                                                 HvBalloonReq *req,
+                                                 StateDesc *stdesc)
+{
+    VMBusChanReq *vmreq = &req->vmreq;
+    struct dm_unballoon_response *msgUrR = vmreq->msg;
+
+    if (balloon->state != S_UNBALLOON_REPLY_WAIT) {
+        warn_report("unexpected DM_UNBALLOON_RESPONSE in %d state",
+                    balloon->state);
+        return;
+    }
+
+    if (!hv_balloon_handle_msg_size(req, sizeof(*msgUrR),
+                                    "DM_UNBALLOON_RESPONSE")) {
+        return;
+    }
+
+    trace_hv_balloon_incoming_unballoon(msgUrR->hdr.trans_id);
+
+    balloon->trans_id++;
+
+    if (balloon->hot_add_diff > 0) {
+        bool can_hot_add = balloon->caps.cap_bits.hot_add;
+
+        assert(can_hot_add);
+        HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_SETUP);
+    } else {
+        HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
+    }
+}
+
+static void hv_balloon_handle_hot_add_response(HvBalloon *balloon,
+                                               HvBalloonReq *req,
+                                               StateDesc *stdesc)
+{
+    PageRange *hot_add_range = &balloon->hot_add_range;
+    VMBusChanReq *vmreq = &req->vmreq;
+    struct dm_hot_add_response *msgHaR = vmreq->msg;
+    OurRange *our_range;
+
+    if (balloon->state != S_HOT_ADD_REPLY_WAIT) {
+        warn_report("unexpected DM_HOT_ADD_RESPONSE in %d state",
+                    balloon->state);
+        return;
+    }
+
+    assert(balloon->our_range);
+    our_range = OUR_RANGE(balloon->our_range);
+
+    if (!hv_balloon_handle_msg_size(req, sizeof(*msgHaR),
+                                    "DM_HOT_ADD_RESPONSE")) {
+        return;
+    }
+
+    trace_hv_balloon_incoming_hot_add(msgHaR->hdr.trans_id, msgHaR->result,
+                                      msgHaR->page_count);
+
+    balloon->trans_id++;
+
+    if (msgHaR->result) {
+        if (msgHaR->page_count > balloon->ha_current_count) {
+            warn_report("DM_HOT_ADD_RESPONSE page count higher than requested (%"PRIu32" vs %"PRIu64")",
+                        msgHaR->page_count, balloon->ha_current_count);
+            msgHaR->page_count = balloon->ha_current_count;
+        }
+
+        hvb_our_range_mark_added(our_range, msgHaR->page_count);
+        hot_add_range->start += msgHaR->page_count;
+        hot_add_range->count -= msgHaR->page_count;
+    }
+
+    if (!msgHaR->result || msgHaR->page_count < balloon->ha_current_count) {
+        /*
+         * the current planned range was only partially hot-added, take note
+         * how much of it remains and don't attempt any further hot adds
+         */
+        our_range_mark_remaining_unusable(our_range);
+
+        goto ret_idle;
+    }
+
+    /* any pages remaining to hot-add in our range? */
+    if (hot_add_range->count > 0) {
+        HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_RB_WAIT);
+        return;
+    }
+
+ret_idle:
+    HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
+}
+
+static void hv_balloon_handle_balloon_response(HvBalloon *balloon,
+                                               HvBalloonReq *req,
+                                               StateDesc *stdesc)
+{
+    VMBusChanReq *vmreq = &req->vmreq;
+    struct dm_balloon_response *msgBR = vmreq->msg;
+
+    if (balloon->state != S_BALLOON_REPLY_WAIT) {
+        warn_report("unexpected DM_BALLOON_RESPONSE in %d state",
+                    balloon->state);
+        return;
+    }
+
+    if (!hv_balloon_handle_msg_size(req, sizeof(*msgBR),
+                                    "DM_BALLOON_RESPONSE")) {
+        return;
+    }
+
+    trace_hv_balloon_incoming_balloon(msgBR->hdr.trans_id, msgBR->range_count,
+                                      msgBR->more_pages);
+
+    if (vmreq->msglen < sizeof(*msgBR) +
+        (uint64_t)sizeof(msgBR->range_array[0]) * msgBR->range_count) {
+        warn_report("DM_BALLOON_RESPONSE too short for the range count");
+        return;
+    }
+
+    if (msgBR->range_count == 0) {
+        /* The guest is already at its minimum size */
+        balloon->balloon_diff = 0;
+        goto ret_end_trans;
+    } else {
+        hv_balloon_handle_remove_ranges(balloon,
+                                        msgBR->range_array,
+                                        msgBR->range_count);
+    }
+
+    /* More responses expected? */
+    if (msgBR->more_pages) {
+        return;
+    }
+
+ret_end_trans:
+    balloon->trans_id++;
+
+    if (balloon->balloon_diff > 0) {
+        HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_RB_WAIT);
+    } else {
+        HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
+    }
+}
+
+static void hv_balloon_handle_packet(HvBalloon *balloon, HvBalloonReq *req,
+                                     StateDesc *stdesc)
+{
+    VMBusChanReq *vmreq = &req->vmreq;
+    struct dm_message *msg = vmreq->msg;
+
+    if (vmreq->msglen < sizeof(msg->hdr)) {
+        return;
+    }
+
+    switch (msg->hdr.type) {
+    case DM_VERSION_REQUEST:
+        hv_balloon_handle_version_request(balloon, req, stdesc);
+        break;
+
+    case DM_CAPABILITIES_REPORT:
+        hv_balloon_handle_caps_report(balloon, req, stdesc);
+        break;
+
+    case DM_STATUS_REPORT:
+        hv_balloon_handle_status_report(balloon, req);
+        break;
+
+    case DM_MEM_HOT_ADD_RESPONSE:
+        hv_balloon_handle_hot_add_response(balloon, req, stdesc);
+        break;
+
+    case DM_UNBALLOON_RESPONSE:
+        hv_balloon_handle_unballoon_response(balloon, req, stdesc);
+        break;
+
+    case DM_BALLOON_RESPONSE:
+        hv_balloon_handle_balloon_response(balloon, req, stdesc);
+        break;
+
+    default:
+        warn_report("unknown DM message %u", msg->hdr.type);
+        break;
+    }
+}
+
+static bool hv_balloon_recv_channel(HvBalloon *balloon, StateDesc *stdesc)
+{
+    VMBusChannel *chan;
+    HvBalloonReq *req;
+
+    if (balloon->state == S_WAIT_RESET ||
+        balloon->state == S_POST_RESET_CLOSED) {
+        return false;
+    }
+
+    chan = hv_balloon_get_channel(balloon);
+    if (vmbus_channel_recv_start(chan)) {
+        return false;
+    }
+
+    while ((req = vmbus_channel_recv_peek(chan, sizeof(*req)))) {
+        hv_balloon_handle_packet(balloon, req, stdesc);
+        vmbus_free_req(req);
+        vmbus_channel_recv_pop(chan);
+
+        if (stdesc->state != S_NO_CHANGE) {
+            break;
+        }
+    }
+
+    return vmbus_channel_recv_done(chan) > 0;
+}
+
+/* old state handler -> new state transition (potential) */
+static bool hv_balloon_event_loop_state(HvBalloon *balloon)
+{
+    StateDesc state_new = HV_BALLOON_STATE_DESC_INIT;
+
+    hv_balloon_handle_state(balloon, &state_new);
+    return hv_balloon_state_set(balloon, state_new.state, state_new.desc);
+}
+
+/* VMBus message -> new state transition (potential) */
+static bool hv_balloon_event_loop_recv(HvBalloon *balloon)
+{
+    StateDesc state_new = HV_BALLOON_STATE_DESC_INIT;
+    bool any_recv, state_changed;
+
+    any_recv = hv_balloon_recv_channel(balloon, &state_new);
+    state_changed = hv_balloon_state_set(balloon,
+                                         state_new.state, state_new.desc);
+
+    return state_changed || any_recv;
+}
+
+static void hv_balloon_event_loop(HvBalloon *balloon)
+{
+    bool state_repeat, recv_repeat;
+
+    do {
+        state_repeat = hv_balloon_event_loop_state(balloon);
+        recv_repeat = hv_balloon_event_loop_recv(balloon);
+    } while (state_repeat || recv_repeat);
+}
+
+static void hv_balloon_vmdev_chan_notify(VMBusChannel *chan)
+{
+    HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan));
+
+    hv_balloon_event_loop(balloon);
+}
+
+static void hv_balloon_stat(void *opaque, BalloonInfo *info)
+{
+    HvBalloon *balloon = opaque;
+    info->actual = (hv_balloon_total_ram(balloon) - balloon->removed_both_ctr)
+        << HV_BALLOON_PFN_SHIFT;
+}
+
+static void hv_balloon_to_target(void *opaque, ram_addr_t target)
+{
+    HvBalloon *balloon = opaque;
+    uint64_t target_pages = target >> HV_BALLOON_PFN_SHIFT;
+
+    if (!target_pages) {
+        return;
+    }
+
+    /*
+     * always set target_changed, even with unchanged target, as the user
+     * might be asking us to try again reaching it
+     */
+    balloon->target = target_pages;
+    balloon->target_changed = true;
+
+    hv_balloon_event_loop(balloon);
+}
+
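+/*
+ * Illustration only: a hypothetical helper (not used by this device) showing
+ * how the byte-granularity target passed to hv_balloon_to_target() above
+ * maps onto the page-granularity decision that
+ * hv_balloon_idle_state_process_target() makes, assuming 4 KiB protocol
+ * pages (HV_BALLOON_PFN_SHIFT).
+ */
+static inline State hv_balloon_example_next_op(uint64_t target_bytes,
+                                               uint64_t ram_size_pages,
+                                               uint64_t total_removed)
+{
+    uint64_t target_pages = target_bytes >> HV_BALLOON_PFN_SHIFT;
+    uint64_t current_pages = ram_size_pages - total_removed;
+
+    if (target_pages > current_pages) {
+        /* grow: give back removed pages first, then hot-add the rest */
+        return total_removed > 0 ? S_UNBALLOON_RB_WAIT : S_HOT_ADD_SETUP;
+    } else if (target_pages < current_pages) {
+        /* shrink: ask for pages back, at most HV_BALLOON_HR_CHUNK_PAGES each */
+        return S_BALLOON_RB_WAIT;
+    }
+
+    return S_NO_CHANGE;
+}
+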
+static int hv_balloon_vmdev_open_channel(VMBusChannel *chan)
+{
+    HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan));
+
+    if (balloon->state != S_POST_RESET_CLOSED) {
+        warn_report("guest trying to open a DM channel in invalid %d state",
+                    balloon->state);
+        return -EINVAL;
+    }
+
+    HV_BALLOON_SET_STATE(balloon, S_VERSION);
+    hv_balloon_event_loop(balloon);
+
+    return 0;
+}
+
+static void hv_balloon_vmdev_close_channel(VMBusChannel *chan)
+{
+    HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan));
+
+    timer_del(&balloon->post_init_timer);
+
+    /* Don't report stale data */
+    balloon->status_report.received = false;
+
+    HV_BALLOON_SET_STATE(balloon, S_WAIT_RESET);
+    hv_balloon_event_loop(balloon);
+}
+
+static void hv_balloon_post_init_timer(void *opaque)
+{
+    HvBalloon *balloon = opaque;
+
+    if (balloon->state != S_POST_INIT_WAIT) {
+        return;
+    }
+
+    HV_BALLOON_SET_STATE(balloon, S_IDLE);
+    hv_balloon_event_loop(balloon);
+}
+
+static void hv_balloon_system_reset_unrealize_common(HvBalloon *balloon)
+{
+    g_clear_pointer(&balloon->our_range, hvb_our_range_memslots_free);
+}
+
+static void hv_balloon_system_reset(void *opaque)
+{
+    HvBalloon *balloon = HV_BALLOON(opaque);
+
+    hv_balloon_system_reset_unrealize_common(balloon);
+}
+
+static void hv_balloon_ensure_mr(HvBalloon *balloon)
+{
+    MemoryRegion *hostmem_mr;
+
+    assert(balloon->hostmem);
+
+    if (balloon->mr) {
+        return;
+    }
+
+    hostmem_mr = host_memory_backend_get_memory(balloon->hostmem);
+
+    balloon->mr = g_new0(MemoryRegion, 1);
+    memory_region_init(balloon->mr, OBJECT(balloon), TYPE_HV_BALLOON,
+                       memory_region_size(hostmem_mr));
+
+    /*
+     * The VM can indicate an alignment up to 32 GiB. Memory device core can
+     * usually only handle/guarantee 1 GiB alignment. The user will have to
+     * specify a larger maxmem eventually.
+     *
+     * The memory device core will warn the user in case maxmem might have to
+     * be increased and will fail plugging the device if there is not
+     * sufficient space after alignment.
+     *
+     * TODO: we could do the alignment ourselves in a slightly bigger region.
+     * But this feels better, although the warning might be annoying. Maybe
+     * we can optimize that in the future (e.g., with such a device on the
+     * cmdline, place/size the device memory region differently).
+     */
+    balloon->mr->align = MAX(32 * GiB, memory_region_get_alignment(hostmem_mr));
+}
+
+static void hv_balloon_free_mr(HvBalloon *balloon)
+{
+    if (!balloon->mr) {
+        return;
+    }
+
+    object_unparent(OBJECT(balloon->mr));
+    g_clear_pointer(&balloon->mr, g_free);
+}
+
+static void hv_balloon_vmdev_realize(VMBusDevice *vdev, Error **errp)
+{
+    ERRP_GUARD();
+    HvBalloon *balloon = HV_BALLOON(vdev);
+    int ret;
+
+    balloon->state = S_WAIT_RESET;
+
+    ret = qemu_add_balloon_handler(hv_balloon_to_target, hv_balloon_stat,
+                                   balloon);
+    if (ret < 0) {
+        /* This also protects against having multiple hv-balloon instances */
+        error_setg(errp, "Only one balloon device is supported");
+        return;
+    }
+
+    if (balloon->hostmem) {
+        if (host_memory_backend_is_mapped(balloon->hostmem)) {
+            Object *obj = OBJECT(balloon->hostmem);
+
+            error_setg(errp, "'%s' property specifies a busy memdev: %s",
+                       HV_BALLOON_MEMDEV_PROP,
+                       object_get_canonical_path_component(obj));
+            goto out_balloon_handler;
+        }
+
+        hv_balloon_ensure_mr(balloon);
+
+        /* This is rather unlikely to happen, but let's still check for it. */
+        if (!QEMU_IS_ALIGNED(memory_region_size(balloon->mr),
+                             HV_BALLOON_PAGE_SIZE)) {
+            error_setg(errp, "'%s' property memdev size has to be a multiple of 0x%" PRIx64,
+                       HV_BALLOON_MEMDEV_PROP, (uint64_t)HV_BALLOON_PAGE_SIZE);
+            goto out_balloon_handler;
+        }
+
+        host_memory_backend_set_mapped(balloon->hostmem, true);
+        vmstate_register_ram(host_memory_backend_get_memory(balloon->hostmem),
+                             DEVICE(balloon));
+    } else if (balloon->addr) {
+        error_setg(errp, "'%s' property must not be set without a memdev",
+                   HV_BALLOON_MEMDEV_PROP);
+        goto out_balloon_handler;
+    }
+
+    timer_init_ms(&balloon->post_init_timer, QEMU_CLOCK_VIRTUAL,
+                  hv_balloon_post_init_timer, balloon);
+
+    qemu_register_reset(hv_balloon_system_reset, balloon);
+
+    return;
+
+out_balloon_handler:
+    qemu_remove_balloon_handler(balloon);
+}
+
+/*
+ * VMBus device reset has to be implemented in case the guest decides to
+ * disconnect and reconnect to the VMBus without rebooting the whole system.
+ *
+ * However, the hot-added memory can't be removed here as Windows keeps on
+ * using it until the system is restarted, even after disconnecting from the
+ * VMBus.
+ */
+static void hv_balloon_vmdev_reset(VMBusDevice *vdev)
+{
+    HvBalloon *balloon = HV_BALLOON(vdev);
+
+    if (balloon->state == S_POST_RESET_CLOSED) {
+        return;
+    }
+
+    if (balloon->our_range) {
+        hvb_our_range_clear_removed_trees(OUR_RANGE(balloon->our_range));
+    }
+
+    hvb_page_range_tree_destroy(&balloon->removed_guest);
+    hvb_page_range_tree_destroy(&balloon->removed_both);
+    hvb_page_range_tree_init(&balloon->removed_guest);
+    hvb_page_range_tree_init(&balloon->removed_both);
+
+    balloon->trans_id = 0;
+    balloon->removed_guest_ctr = 0;
+    balloon->removed_both_ctr = 0;
+
+    HV_BALLOON_SET_STATE(balloon, S_POST_RESET_CLOSED);
+    hv_balloon_event_loop(balloon);
+}
+
+/*
+ * Clean up things that were (possibly) allocated pre-realization, for example
+ * from memory_device_pre_plug(), so we don't leak them if the device doesn't
+ * actually get realized in the end.
+ */
+static void hv_balloon_unrealize_finalize_common(HvBalloon *balloon)
+{
+    hv_balloon_free_mr(balloon);
+    balloon->addr = 0;
+
+    balloon->memslot_count = 0;
+}
+
+static void hv_balloon_vmdev_unrealize(VMBusDevice *vdev)
+{
+    HvBalloon *balloon = HV_BALLOON(vdev);
+
+    qemu_unregister_reset(hv_balloon_system_reset, balloon);
+
+    hv_balloon_system_reset_unrealize_common(balloon);
+
+    qemu_remove_balloon_handler(balloon);
+
+    if (balloon->hostmem) {
+        vmstate_unregister_ram(host_memory_backend_get_memory(balloon->hostmem),
+                               DEVICE(balloon));
+        host_memory_backend_set_mapped(balloon->hostmem, false);
+    }
+
+    hvb_page_range_tree_destroy(&balloon->removed_guest);
+    hvb_page_range_tree_destroy(&balloon->removed_both);
+
+    hv_balloon_unrealize_finalize_common(balloon);
+}
+
+static uint64_t hv_balloon_md_get_addr(const MemoryDeviceState *md)
+{
+    return object_property_get_uint(OBJECT(md), HV_BALLOON_ADDR_PROP,
+                                    &error_abort);
+}
+
+static void hv_balloon_md_set_addr(MemoryDeviceState *md, uint64_t addr,
+                                   Error **errp)
+{
+    object_property_set_uint(OBJECT(md), HV_BALLOON_ADDR_PROP, addr, errp);
+}
+
+static MemoryRegion *hv_balloon_md_get_memory_region(MemoryDeviceState *md,
+                                                     Error **errp)
+{
+    HvBalloon *balloon = HV_BALLOON(md);
+
+    if (!balloon->hostmem) {
+        return NULL;
+    }
+
+    hv_balloon_ensure_mr(balloon);
+
+    return balloon->mr;
+}
+
+static void hv_balloon_md_fill_device_info(const MemoryDeviceState *md,
+                                           MemoryDeviceInfo *info)
+{
+    HvBalloonDeviceInfo *hi = g_new0(HvBalloonDeviceInfo, 1);
+    const HvBalloon *balloon = HV_BALLOON(md);
+    DeviceState *dev = DEVICE(md);
+
+    if (dev->id) {
+        hi->id = g_strdup(dev->id);
+    }
+
+    if (balloon->hostmem) {
+        hi->memdev = object_get_canonical_path(OBJECT(balloon->hostmem));
+        hi->memaddr = balloon->addr;
+        hi->has_memaddr = true;
+        hi->max_size = memory_region_size(balloon->mr);
+        /* TODO: expose current provided size or something else? */
+    } else {
+        hi->max_size = 0;
+    }
+
+    info->u.hv_balloon.data = hi;
+    info->type = MEMORY_DEVICE_INFO_KIND_HV_BALLOON;
+}
+
+static void hv_balloon_decide_memslots(MemoryDeviceState *md,
+                                       unsigned int limit)
+{
+    HvBalloon *balloon = HV_BALLOON(md);
+    MemoryRegion *hostmem_mr;
+    uint64_t region_size, memslot_size, memslots;
+
+    /* We're called exactly once, before realizing the device. */
+    assert(!balloon->memslot_count);
+
+    /* We should not be called if we don't have a memory backend */
+    assert(balloon->hostmem);
+
+    hostmem_mr = host_memory_backend_get_memory(balloon->hostmem);
+    region_size = memory_region_size(hostmem_mr);
+
+    assert(region_size > 0);
+    memslot_size = QEMU_ALIGN_UP(region_size / limit,
+                                 HV_BALLOON_HA_MEMSLOT_SIZE_ALIGN);
+    memslots = QEMU_ALIGN_UP(region_size, memslot_size) / memslot_size;
+
+    if (memslots > 1) {
+        balloon->memslot_size = memslot_size;
+    } else {
+        balloon->memslot_size = region_size;
+    }
+
+    assert(memslots <= UINT_MAX);
+    balloon->memslot_count = memslots;
+}
+
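+/*
+ * Worked example (added for illustration): for a 16 GiB memdev and a
+ * suggested limit of 8 memslots, memslot_size = QEMU_ALIGN_UP(16 GiB / 8,
+ * 128 MiB) = 2 GiB and memslots = QEMU_ALIGN_UP(16 GiB, 2 GiB) / 2 GiB = 8,
+ * so hot-added memory can be mapped into the guest in 2 GiB steps instead
+ * of all at once.
+ */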
+static unsigned int hv_balloon_get_memslots(MemoryDeviceState *md)
+{
+    const HvBalloon *balloon = HV_BALLOON(md);
+
+    /* We're called after setting the suggested limit. */
+    assert(balloon->memslot_count > 0);
+
+    return balloon->memslot_count;
+}
+
+static void hv_balloon_init(Object *obj)
+{
+}
+
+static void hv_balloon_finalize(Object *obj)
+{
+    HvBalloon *balloon = HV_BALLOON(obj);
+
+    hv_balloon_unrealize_finalize_common(balloon);
+}
+
+static Property hv_balloon_properties[] = {
+    DEFINE_PROP_BOOL("status-report", HvBalloon,
+                     status_report.enabled, false),
+
+    /* MEMORY_DEVICE props */
+    DEFINE_PROP_LINK(HV_BALLOON_MEMDEV_PROP, HvBalloon, hostmem,
+                     TYPE_MEMORY_BACKEND, HostMemoryBackend *),
+    DEFINE_PROP_UINT64(HV_BALLOON_ADDR_PROP, HvBalloon, addr, 0),
+
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void hv_balloon_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VMBusDeviceClass *vdc = VMBUS_DEVICE_CLASS(klass);
+    MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
+
+    device_class_set_props(dc, hv_balloon_properties);
+    qemu_uuid_parse(HV_BALLOON_GUID, &vdc->classid);
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+
+    vdc->vmdev_realize = hv_balloon_vmdev_realize;
+    vdc->vmdev_unrealize = hv_balloon_vmdev_unrealize;
+    vdc->vmdev_reset = hv_balloon_vmdev_reset;
+    vdc->open_channel = hv_balloon_vmdev_open_channel;
+    vdc->close_channel = hv_balloon_vmdev_close_channel;
+    vdc->chan_notify_cb = hv_balloon_vmdev_chan_notify;
+
+    mdc->get_addr = hv_balloon_md_get_addr;
+    mdc->set_addr = hv_balloon_md_set_addr;
+    mdc->get_plugged_size = memory_device_get_region_size;
+    mdc->get_memory_region = hv_balloon_md_get_memory_region;
+    mdc->decide_memslots = hv_balloon_decide_memslots;
+    mdc->get_memslots = hv_balloon_get_memslots;
+    mdc->fill_device_info = hv_balloon_md_fill_device_info;
+}
diff --git a/hw/hyperv/meson.build b/hw/hyperv/meson.build
index b43f119ea5..d3d2668c71 100644
--- a/hw/hyperv/meson.build
+++ b/hw/hyperv/meson.build
@@ -2,3 +2,4 @@ specific_ss.add(when: 'CONFIG_HYPERV', if_true: files('hyperv.c'))
 specific_ss.add(when: 'CONFIG_HYPERV_TESTDEV', if_true: files('hyperv_testdev.c'))
 specific_ss.add(when: 'CONFIG_VMBUS', if_true: files('vmbus.c'))
 specific_ss.add(when: 'CONFIG_SYNDBG', if_true: files('syndbg.c'))
+specific_ss.add(when: 'CONFIG_HV_BALLOON', if_true: files('hv-balloon.c', 'hv-balloon-page_range_tree.c', 'hv-balloon-our_range_memslots.c'), if_false: files('hv-balloon-stub.c'))
diff --git a/hw/hyperv/trace-events b/hw/hyperv/trace-events
index b4c35ca8e3..7963c215b1 100644
--- a/hw/hyperv/trace-events
+++ b/hw/hyperv/trace-events
@@ -16,3 +16,21 @@ vmbus_gpadl_torndown(uint32_t gpadl_id) "gpadl #%d"
 vmbus_open_channel(uint32_t chan_id, uint32_t gpadl_id, uint32_t target_vp) "channel #%d gpadl #%d target vp %d"
 vmbus_channel_open(uint32_t chan_id, uint32_t status) "channel #%d status %d"
 vmbus_close_channel(uint32_t chan_id) "channel #%d"
+
+# hv-balloon
+hv_balloon_state_change(const char *tostr) "-> %s"
+hv_balloon_incoming_version(uint16_t major, uint16_t minor) "incoming proto version %u.%u"
+hv_balloon_incoming_caps(uint32_t caps) "incoming caps 0x%x"
+hv_balloon_outgoing_unballoon(uint32_t trans_id, uint64_t count, uint64_t start, uint64_t rempages) "posting unballoon %"PRIu32" for %"PRIu64" @ 0x%"PRIx64", remaining %"PRIu64
+hv_balloon_incoming_unballoon(uint32_t trans_id) "incoming unballoon response %"PRIu32
+hv_balloon_outgoing_hot_add(uint32_t trans_id, uint64_t count, uint64_t start) "posting hot add %"PRIu32" for %"PRIu64" @ 0x%"PRIx64
+hv_balloon_incoming_hot_add(uint32_t trans_id, uint32_t result, uint32_t count) "incoming hot add response %"PRIu32", result %"PRIu32", count %"PRIu32
%"PRIu32 +hv_balloon_outgoing_balloon(uint32_t trans_id, uint64_t count, uint64_t rempages) "posting balloon %"PRIu32" for %"PRIu64", remaining %"PRIu64 +hv_balloon_incoming_balloon(uint32_t trans_id, uint32_t range_count, uint32_t more_pages) "incoming balloon response %"PRIu32", ranges %"PRIu32", more %"PRIu32 +hv_balloon_our_range_add(uint64_t count, uint64_t start) "adding our range %"PRIu64" @ 0x%"PRIx64 +hv_balloon_remove_response(uint64_t count, uint64_t start, unsigned int both) "processing remove response range %"PRIu64" @ 0x%"PRIx64", both %u" +hv_balloon_remove_response_hole(uint64_t counthole, uint64_t starthole, uint64_t countrange, uint64_t startrange, uint64_t starthpr, unsigned int both) "response range hole %"PRIu64" @ 0x%"PRIx64" from range %"PRIu64" @ 0x%"PRIx64", before our start 0x%"PRIx64", both %u" +hv_balloon_remove_response_common(uint64_t countcommon, uint64_t startcommon, uint64_t countrange, uint64_t startrange, uint64_t counthpr, uint64_t starthpr, uint64_t removed, unsigned int both) "response common range %"PRIu64" @ 0x%"PRIx64" from range %"PRIu64" @ 0x%"PRIx64" with our %"PRIu64" @ 0x%"PRIx64", removed %"PRIu64", both %u" +hv_balloon_remove_response_remainder(uint64_t count, uint64_t start, unsigned int both) "remove response remaining range %"PRIu64" @ 0x%"PRIx64", both %u" +hv_balloon_map_slot(unsigned int idx, unsigned int total_slots, uint64_t offset) "mapping memslot %u / %u @ 0x%"PRIx64 +hv_balloon_unmap_slot(unsigned int idx, unsigned int total_slots, uint64_t offset) "unmapping memslot %u / %u @ 0x%"PRIx64 diff --git a/hw/hyperv/vmbus.c b/hw/hyperv/vmbus.c index 271289f902..c64eaa5a46 100644 --- a/hw/hyperv/vmbus.c +++ b/hw/hyperv/vmbus.c @@ -2271,7 +2271,7 @@ static void vmbus_dev_realize(DeviceState *dev, Error **errp) VMBus *vmbus = VMBUS(qdev_get_parent_bus(dev)); BusChild *child; Error *err = NULL; - char idstr[UUID_FMT_LEN + 1]; + char idstr[UUID_STR_LEN]; assert(!qemu_uuid_is_null(&vdev->instanceid)); @@ -2467,7 +2467,7 @@ static char *vmbus_get_dev_path(DeviceState *dev) static char *vmbus_get_fw_dev_path(DeviceState *dev) { VMBusDevice *vdev = VMBUS_DEVICE(dev); - char uuid[UUID_FMT_LEN + 1]; + char uuid[UUID_STR_LEN]; qemu_uuid_unparse(&vdev->instanceid, uuid); return g_strdup_printf("%s@%s", qdev_fw_name(dev), uuid); diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index 94772c726b..55850791df 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -45,6 +45,7 @@ config PC select ACPI_VMGENID select VIRTIO_PMEM_SUPPORTED select VIRTIO_MEM_SUPPORTED + select HV_BALLOON_SUPPORTED config PC_PCI bool diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 7965415b47..4203144da9 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -1450,6 +1450,10 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &iommu_as[devfn]->as; } +static const PCIIOMMUOps amdvi_iommu_ops = { + .get_address_space = amdvi_host_dma_iommu, +}; + static const MemoryRegionOps mmio_mem_ops = { .read = amdvi_mmio_read, .write = amdvi_mmio_write, @@ -1581,7 +1585,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) AMDVI_MMIO_SIZE); memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR, &s->mmio); - pci_setup_iommu(bus, amdvi_host_dma_iommu, s); + pci_setup_iommu(bus, &amdvi_iommu_ops, s); amdvi_init(s); } diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 1c6c18622f..5085a6fee3 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1045,18 +1045,35 @@ static dma_addr_t 
  * Rsvd field masks for spte:
  *     vtd_spte_rsvd 4k pages
  *     vtd_spte_rsvd_large large pages
+ *
+ * We support only 3-level and 4-level page tables (see vtd_init() which
+ * sets only VTD_CAP_SAGAW_39bit and maybe VTD_CAP_SAGAW_48bit bits in s->cap).
  */
-static uint64_t vtd_spte_rsvd[5];
-static uint64_t vtd_spte_rsvd_large[5];
+#define VTD_SPTE_RSVD_LEN 5
+static uint64_t vtd_spte_rsvd[VTD_SPTE_RSVD_LEN];
+static uint64_t vtd_spte_rsvd_large[VTD_SPTE_RSVD_LEN];
 
 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
 {
-    uint64_t rsvd_mask = vtd_spte_rsvd[level];
+    uint64_t rsvd_mask;
+
+    /*
+     * We should have caught a guest-mis-programmed level earlier,
+     * via vtd_is_level_supported.
+     */
+    assert(level < VTD_SPTE_RSVD_LEN);
+    /*
+     * Zero level doesn't exist. The smallest level is VTD_SL_PT_LEVEL=1 and
+     * checked by vtd_is_last_slpte().
+     */
+    assert(level);
 
     if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) &&
         (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) {
         /* large page */
         rsvd_mask = vtd_spte_rsvd_large[level];
+    } else {
+        rsvd_mask = vtd_spte_rsvd[level];
     }
 
     return slpte & rsvd_mask;
@@ -4088,6 +4105,10 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
     return &vtd_as->as;
 }
 
+static PCIIOMMUOps vtd_iommu_ops = {
+    .get_address_space = vtd_host_dma_iommu,
+};
+
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
 {
     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
@@ -4210,7 +4231,7 @@ static void vtd_realize(DeviceState *dev, Error **errp)
     s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal,
                                                   g_free, g_free);
     vtd_init(s);
-    pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
+    pci_setup_iommu(bus, &vtd_iommu_ops, dev);
     /* Pseudo address space under root PCI bus. */
     x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
     qemu_add_machine_init_done_notifier(&vtd_machine_done_notify);
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index a731738411..b2b4be9983 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -490,6 +490,12 @@ int xen_evtchn_set_callback_param(uint64_t param)
         break;
     }
 
+    /* If the guest has set a per-vCPU callback vector, prefer that. */
+    if (gsi && kvm_xen_has_vcpu_callback_vector()) {
+        in_kernel = kvm_xen_has_cap(EVTCHN_SEND);
+        gsi = 0;
+    }
+
     if (!ret) {
         /* If vector delivery was turned *off* then tell the kernel */
         if ((s->callback_param >> CALLBACK_VIA_TYPE_SHIFT) ==
@@ -1129,6 +1135,7 @@ int xen_evtchn_reset_op(struct evtchn_reset *reset)
         return -ESRCH;
     }
 
+    QEMU_IOTHREAD_LOCK_GUARD();
     return xen_evtchn_soft_reset();
 }
 
diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
index 21c30e3659..839ec920a1 100644
--- a/hw/i386/kvm/xen_gnttab.c
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -541,7 +541,5 @@ int xen_gnttab_reset(void)
     s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
     s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
 
-    memset(s->map_track, 0, s->max_frames * ENTRIES_PER_FRAME_V1);
-
     return 0;
 }
diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index 660d0b72f9..8e716a7009 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -1357,10 +1357,12 @@ static void fire_watch_cb(void *opaque, const char *path, const char *token)
     } else {
         deliver_watch(s, path, token);
         /*
-         * If the message was queued because there was already ring activity,
-         * no need to wake the guest. But if not, we need to send the evtchn.
+         * Attempt to queue the message into the actual ring, and send
+         * the event channel notification if any bytes are copied.
          */
-        xen_be_evtchn_notify(s->eh, s->be_port);
+        if (s->rsp_pending && put_rsp(s) > 0) {
+            xen_be_evtchn_notify(s->eh, s->be_port);
+        }
     }
 }
 
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 6031234a73..1aef21aa2c 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -27,6 +27,7 @@
 #include "hw/i386/pc.h"
 #include "hw/char/serial.h"
 #include "hw/char/parallel.h"
+#include "hw/hyperv/hv-balloon.h"
 #include "hw/i386/fw_cfg.h"
 #include "hw/i386/vmport.h"
 #include "sysemu/cpus.h"
@@ -57,6 +58,7 @@
 #include "hw/i386/kvm/xen_evtchn.h"
 #include "hw/i386/kvm/xen_gnttab.h"
 #include "hw/i386/kvm/xen_xenstore.h"
+#include "hw/mem/memory-device.h"
 #include "e820_memory_layout.h"
 #include "trace.h"
 #include CONFIG_DEVICES
@@ -1422,6 +1424,21 @@ static void pc_memory_unplug(HotplugHandler *hotplug_dev,
     error_propagate(errp, local_err);
 }
 
+static void pc_hv_balloon_pre_plug(HotplugHandler *hotplug_dev,
+                                   DeviceState *dev, Error **errp)
+{
+    /* The vmbus handler has no hotplug handler; we should never end up here. */
+    g_assert(!dev->hotplugged);
+    memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL,
+                           errp);
+}
+
+static void pc_hv_balloon_plug(HotplugHandler *hotplug_dev,
+                               DeviceState *dev, Error **errp)
+{
+    memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
+}
+
 static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
                                           DeviceState *dev, Error **errp)
 {
@@ -1452,6 +1469,8 @@ static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
             return;
         }
         pcms->iommu = dev;
+    } else if (object_dynamic_cast(OBJECT(dev), TYPE_HV_BALLOON)) {
+        pc_hv_balloon_pre_plug(hotplug_dev, dev, errp);
     }
 }
 
@@ -1464,6 +1483,8 @@ static void pc_machine_device_plug_cb(HotplugHandler *hotplug_dev,
         x86_cpu_plug(hotplug_dev, dev, errp);
     } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
         virtio_md_pci_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp);
+    } else if (object_dynamic_cast(OBJECT(dev), TYPE_HV_BALLOON)) {
+        pc_hv_balloon_plug(hotplug_dev, dev, errp);
     }
 }
 
@@ -1505,6 +1526,7 @@ static HotplugHandler *pc_get_hotplug_handler(MachineState *machine,
         object_dynamic_cast(OBJECT(dev), TYPE_CPU) ||
         object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI) ||
         object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) ||
+        object_dynamic_cast(OBJECT(dev), TYPE_HV_BALLOON) ||
        object_dynamic_cast(OBJECT(dev), TYPE_X86_IOMMU_DEVICE)) {
         return HOTPLUG_HANDLER(machine);
     }
diff --git a/hw/mem/memory-device.c b/hw/mem/memory-device.c
index ae38f48f16..e0704b8dc3 100644
--- a/hw/mem/memory-device.c
+++ b/hw/mem/memory-device.c
@@ -20,6 +20,22 @@
 #include "exec/address-spaces.h"
 #include "trace.h"
 
+static bool memory_device_is_empty(const MemoryDeviceState *md)
+{
+    const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
+    Error *local_err = NULL;
+    MemoryRegion *mr;
+
+    /* dropping const here is fine as we don't touch the memory region */
+    mr = mdc->get_memory_region((MemoryDeviceState *)md, &local_err);
+    if (local_err) {
+        /* Not empty, we'll report errors later when obtaining the MR again. */
+        error_free(local_err);
+        return false;
+    }
+    return !mr;
+}
+
 static gint memory_device_addr_sort(gconstpointer a, gconstpointer b)
 {
     const MemoryDeviceState *md_a = MEMORY_DEVICE(a);
@@ -220,12 +236,6 @@ static uint64_t memory_device_get_free_addr(MachineState *ms,
         return 0;
     }
 
-    if (!QEMU_IS_ALIGNED(size, align)) {
-        error_setg(errp, "backend memory size must be multiple of 0x%"
-                   PRIx64, align);
-        return 0;
-    }
-
     if (hint) {
         if (range_init(&new, *hint, size) || !range_contains_range(&as, &new)) {
             error_setg(errp, "can't add memory device [0x%" PRIx64 ":0x%" PRIx64
@@ -249,6 +259,10 @@ static uint64_t memory_device_get_free_addr(MachineState *ms,
         uint64_t next_addr;
         Range tmp;
 
+        if (memory_device_is_empty(md)) {
+            continue;
+        }
+
         range_init_nofail(&tmp, mdc->get_addr(md),
                           memory_device_get_region_size(md, &error_abort));
 
@@ -292,6 +306,7 @@ MemoryDeviceInfoList *qmp_memory_device_list(void)
         const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(item->data);
         MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1);
 
+        /* Let's query information even for empty memory devices. */
         mdc->fill_device_info(md, info);
 
         QAPI_LIST_APPEND(tail, info);
@@ -311,7 +326,7 @@ static int memory_device_plugged_size(Object *obj, void *opaque)
     const MemoryDeviceState *md = MEMORY_DEVICE(obj);
     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(obj);
 
-    if (dev->realized) {
+    if (dev->realized && !memory_device_is_empty(md)) {
         *size += mdc->get_plugged_size(md, &error_abort);
     }
 }
@@ -337,6 +352,11 @@ void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms,
     uint64_t addr, align = 0;
     MemoryRegion *mr;
 
+    /* We support empty memory devices even without device memory. */
+    if (memory_device_is_empty(md)) {
+        return;
+    }
+
     if (!ms->device_memory) {
         error_setg(errp, "the configuration is not prepared for memory devices"
                          " (e.g., for memory hotplug), consider specifying the"
@@ -380,10 +400,17 @@ out:
 void memory_device_plug(MemoryDeviceState *md, MachineState *ms)
 {
     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
-    const unsigned int memslots = memory_device_get_memslots(md);
-    const uint64_t addr = mdc->get_addr(md);
+    unsigned int memslots;
+    uint64_t addr;
     MemoryRegion *mr;
 
+    if (memory_device_is_empty(md)) {
+        return;
+    }
+
+    memslots = memory_device_get_memslots(md);
+    addr = mdc->get_addr(md);
+
     /*
      * We expect that a previous call to memory_device_pre_plug() succeeded, so
      * it can't fail at this point.
@@ -408,6 +435,10 @@ void memory_device_unplug(MemoryDeviceState *md, MachineState *ms)
     const unsigned int memslots = memory_device_get_memslots(md);
     MemoryRegion *mr;
 
+    if (memory_device_is_empty(md)) {
+        return;
+    }
+
     /*
      * We expect that a previous call to memory_device_pre_plug() succeeded, so
      * it can't fail at this point.
diff --git a/hw/pci-host/astro.c b/hw/pci-host/astro.c
index 4b2d7caf2d..84e0ca14ac 100644
--- a/hw/pci-host/astro.c
+++ b/hw/pci-host/astro.c
@@ -345,6 +345,10 @@ static AddressSpace *elroy_pcihost_set_iommu(PCIBus *bus, void *opaque,
     return &s->astro->iommu_as;
 }
 
+static const PCIIOMMUOps elroy_pcihost_iommu_ops = {
+    .get_address_space = elroy_pcihost_set_iommu,
+};
+
 /*
  * Encoding in IOSAPIC:
  * base_addr == 0xfffa0000, we want to get 0xa0ff0000.
@@ -834,7 +838,7 @@ static void astro_realize(DeviceState *obj, Error **errp)
                                 &elroy->pci_io);
 
         /* Host memory as seen from the PCI side, via the IOMMU. */
-        pci_setup_iommu(PCI_HOST_BRIDGE(elroy)->bus, elroy_pcihost_set_iommu,
+        pci_setup_iommu(PCI_HOST_BRIDGE(elroy)->bus, &elroy_pcihost_iommu_ops,
                         elroy);
     }
 }
diff --git a/hw/pci-host/designware.c b/hw/pci-host/designware.c
index 6f5442f108..f477f97847 100644
--- a/hw/pci-host/designware.c
+++ b/hw/pci-host/designware.c
@@ -663,6 +663,10 @@ static AddressSpace *designware_pcie_host_set_iommu(PCIBus *bus, void *opaque,
     return &s->pci.address_space;
 }
 
+static const PCIIOMMUOps designware_iommu_ops = {
+    .get_address_space = designware_pcie_host_set_iommu,
+};
+
 static void designware_pcie_host_realize(DeviceState *dev, Error **errp)
 {
     PCIHostState *pci = PCI_HOST_BRIDGE(dev);
@@ -705,7 +709,7 @@ static void designware_pcie_host_realize(DeviceState *dev, Error **errp)
     address_space_init(&s->pci.address_space,
                        &s->pci.address_space_root,
                        "pcie-bus-address-space");
-    pci_setup_iommu(pci->bus, designware_pcie_host_set_iommu, s);
+    pci_setup_iommu(pci->bus, &designware_iommu_ops, s);
 
     qdev_realize(DEVICE(&s->root), BUS(pci->bus), &error_fatal);
 }
diff --git a/hw/pci-host/dino.c b/hw/pci-host/dino.c
index 82503229fa..5b0947a16c 100644
--- a/hw/pci-host/dino.c
+++ b/hw/pci-host/dino.c
@@ -354,6 +354,10 @@ static AddressSpace *dino_pcihost_set_iommu(PCIBus *bus, void *opaque,
     return &s->bm_as;
 }
 
+static const PCIIOMMUOps dino_iommu_ops = {
+    .get_address_space = dino_pcihost_set_iommu,
+};
+
 /*
  * Dino interrupts are connected as shown on Page 78, Table 23
  * (Little-endian bit numbers)
@@ -481,7 +485,7 @@ static void dino_pcihost_init(Object *obj)
         g_free(name);
     }
 
-    pci_setup_iommu(phb->bus, dino_pcihost_set_iommu, s);
+    pci_setup_iommu(phb->bus, &dino_iommu_ops, s);
 
     sysbus_init_mmio(sbd, &s->this_mem);
diff --git a/hw/pci-host/pnv_phb3.c b/hw/pci-host/pnv_phb3.c
index c5e58f4086..2a74dbe45f 100644
--- a/hw/pci-host/pnv_phb3.c
+++ b/hw/pci-host/pnv_phb3.c
@@ -968,6 +968,10 @@ static AddressSpace *pnv_phb3_dma_iommu(PCIBus *bus, void *opaque, int devfn)
     return &ds->dma_as;
 }
 
+static PCIIOMMUOps pnv_phb3_iommu_ops = {
+    .get_address_space = pnv_phb3_dma_iommu,
+};
+
 static void pnv_phb3_instance_init(Object *obj)
 {
     PnvPHB3 *phb = PNV_PHB3(obj);
@@ -1012,7 +1016,7 @@ void pnv_phb3_bus_init(DeviceState *dev, PnvPHB3 *phb)
     object_property_set_int(OBJECT(pci->bus), "chip-id", phb->chip_id,
                             &error_abort);
 
-    pci_setup_iommu(pci->bus, pnv_phb3_dma_iommu, phb);
+    pci_setup_iommu(pci->bus, &pnv_phb3_iommu_ops, phb);
 }
 
 static void pnv_phb3_realize(DeviceState *dev, Error **errp)
diff --git a/hw/pci-host/pnv_phb4.c b/hw/pci-host/pnv_phb4.c
index 29cb11a5d9..37c7afc18c 100644
--- a/hw/pci-host/pnv_phb4.c
+++ b/hw/pci-host/pnv_phb4.c
@@ -1518,6 +1518,10 @@ static void pnv_phb4_xscom_realize(PnvPHB4 *phb)
                             &phb->phb_regs_mr);
 }
 
+static PCIIOMMUOps pnv_phb4_iommu_ops = {
+    .get_address_space = pnv_phb4_dma_iommu,
+};
+
 static void pnv_phb4_instance_init(Object *obj)
 {
     PnvPHB4 *phb = PNV_PHB4(obj);
@@ -1557,7 +1561,7 @@ void pnv_phb4_bus_init(DeviceState *dev, PnvPHB4 *phb)
     object_property_set_int(OBJECT(pci->bus), "chip-id", phb->chip_id,
                             &error_abort);
 
-    pci_setup_iommu(pci->bus, pnv_phb4_dma_iommu, phb);
+    pci_setup_iommu(pci->bus, &pnv_phb4_iommu_ops, phb);
     pci->bus->flags |= PCI_BUS_EXTENDED_CONFIG_SPACE;
 }
 
diff --git a/hw/pci-host/ppce500.c b/hw/pci-host/ppce500.c
index 38814247f2..453a4e6ed3 100644
--- a/hw/pci-host/ppce500.c
+++ b/hw/pci-host/ppce500.c
@@ -435,6 +435,10 @@ static AddressSpace *e500_pcihost_set_iommu(PCIBus *bus, void *opaque,
     return &s->bm_as;
 }
 
+static const PCIIOMMUOps ppce500_iommu_ops = {
.get_address_space = e500_pcihost_set_iommu, +}; + static void e500_pcihost_realize(DeviceState *dev, Error **errp) { SysBusDevice *sbd = SYS_BUS_DEVICE(dev); @@ -469,7 +473,7 @@ static void e500_pcihost_realize(DeviceState *dev, Error **errp) memory_region_init(&s->bm, OBJECT(s), "bm-e500", UINT64_MAX); memory_region_add_subregion(&s->bm, 0x0, &s->busmem); address_space_init(&s->bm_as, &s->bm, "pci-bm"); - pci_setup_iommu(b, e500_pcihost_set_iommu, s); + pci_setup_iommu(b, &ppce500_iommu_ops, s); pci_create_simple(b, 0, "e500-host-bridge"); diff --git a/hw/pci-host/raven.c b/hw/pci-host/raven.c index 9a11ac4b2b..86c3a49087 100644 --- a/hw/pci-host/raven.c +++ b/hw/pci-host/raven.c @@ -223,6 +223,10 @@ static AddressSpace *raven_pcihost_set_iommu(PCIBus *bus, void *opaque, return &s->bm_as; } +static const PCIIOMMUOps raven_iommu_ops = { + .get_address_space = raven_pcihost_set_iommu, +}; + static void raven_change_gpio(void *opaque, int n, int level) { PREPPCIState *s = opaque; @@ -320,7 +324,7 @@ static void raven_pcihost_initfn(Object *obj) memory_region_add_subregion(&s->bm, 0 , &s->bm_pci_memory_alias); memory_region_add_subregion(&s->bm, 0x80000000, &s->bm_ram_alias); address_space_init(&s->bm_as, &s->bm, "raven-bm"); - pci_setup_iommu(&s->pci_bus, raven_pcihost_set_iommu, s); + pci_setup_iommu(&s->pci_bus, &raven_iommu_ops, s); h->bus = &s->pci_bus; diff --git a/hw/pci-host/sabre.c b/hw/pci-host/sabre.c index dcb2e230b6..d0851b48b0 100644 --- a/hw/pci-host/sabre.c +++ b/hw/pci-host/sabre.c @@ -112,6 +112,10 @@ static AddressSpace *sabre_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &is->iommu_as; } +static const PCIIOMMUOps sabre_iommu_ops = { + .get_address_space = sabre_pci_dma_iommu, +}; + static void sabre_config_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) { @@ -384,7 +388,7 @@ static void sabre_realize(DeviceState *dev, Error **errp) /* IOMMU */ memory_region_add_subregion_overlap(&s->sabre_config, 0x200, sysbus_mmio_get_region(SYS_BUS_DEVICE(s->iommu), 0), 1); - pci_setup_iommu(phb->bus, sabre_pci_dma_iommu, s->iommu); + pci_setup_iommu(phb->bus, &sabre_iommu_ops, s->iommu); /* APB secondary busses */ pci_dev = pci_new_multifunction(PCI_DEVFN(1, 0), TYPE_SIMBA_PCI_BRIDGE); diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 885c04b6f5..c49417abb2 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2678,7 +2678,7 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) PCIBus *iommu_bus = bus; uint8_t devfn = dev->devfn; - while (iommu_bus && !iommu_bus->iommu_fn && iommu_bus->parent_dev) { + while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) { PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev); /* @@ -2717,15 +2717,23 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) iommu_bus = parent_bus; } - if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) { - return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn); + if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) { + return iommu_bus->iommu_ops->get_address_space(bus, + iommu_bus->iommu_opaque, devfn); } return &address_space_memory; } -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque) +void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque) { - bus->iommu_fn = fn; + /* + * If called, pci_setup_iommu() should provide a minimum set of + * useful callbacks for the bus. 
+ */ + assert(ops); + assert(ops->get_address_space); + + bus->iommu_ops = ops; bus->iommu_opaque = opaque; } diff --git a/hw/ppc/ppc440_pcix.c b/hw/ppc/ppc440_pcix.c index 672090de94..df4ee374d0 100644 --- a/hw/ppc/ppc440_pcix.c +++ b/hw/ppc/ppc440_pcix.c @@ -449,6 +449,10 @@ static AddressSpace *ppc440_pcix_set_iommu(PCIBus *b, void *opaque, int devfn) return &s->bm_as; } +static const PCIIOMMUOps ppc440_iommu_ops = { + .get_address_space = ppc440_pcix_set_iommu, +}; + /* * Some guests on sam460ex write all kinds of garbage here such as * missing enable bit and low bits set and still expect this to work @@ -503,7 +507,7 @@ static void ppc440_pcix_realize(DeviceState *dev, Error **errp) memory_region_init(&s->bm, OBJECT(s), "bm-ppc440-pcix", UINT64_MAX); memory_region_add_subregion(&s->bm, 0x0, &s->busmem); address_space_init(&s->bm_as, &s->bm, "pci-bm"); - pci_setup_iommu(h->bus, ppc440_pcix_set_iommu, s); + pci_setup_iommu(h->bus, &ppc440_iommu_ops, s); memory_region_init(&s->container, OBJECT(s), "pci-container", PCI_ALL_SIZE); memory_region_init_io(&h->conf_mem, OBJECT(s), &ppc440_pcix_host_conf_ops, diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 370c5a90f2..a27024e45a 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -780,6 +780,10 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &phb->iommu_as; } +static const PCIIOMMUOps spapr_iommu_ops = { + .get_address_space = spapr_pci_dma_iommu, +}; + static char *spapr_phb_vfio_get_loc_code(SpaprPhbState *sphb, PCIDevice *pdev) { g_autofree char *path = NULL; @@ -1978,7 +1982,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) memory_region_add_subregion(&sphb->iommu_root, SPAPR_PCI_MSI_WINDOW, &sphb->msiwindow); - pci_setup_iommu(bus, spapr_pci_dma_iommu, sphb); + pci_setup_iommu(bus, &spapr_iommu_ops, sphb); pci_bus_set_route_irq_fn(bus, spapr_route_intx_pin_to_irq); diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index 9016720547..f283f7e38d 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -18,14 +18,112 @@ */ #include "qemu/osdep.h" +#include <sys/ioctl.h> #include <linux/vfio.h> #include "hw/ppc/spapr.h" #include "hw/pci-host/spapr.h" #include "hw/pci/msix.h" #include "hw/pci/pci_device.h" -#include "hw/vfio/vfio.h" +#include "hw/vfio/vfio-common.h" #include "qemu/error-report.h" +/* + * Interfaces for IBM EEH (Enhanced Error Handling) + */ +static bool vfio_eeh_container_ok(VFIOContainer *container) +{ + /* + * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO + * implementation is broken if there are multiple groups in a + * container. The hardware works in units of Partitionable + * Endpoints (== IOMMU groups) and the EEH operations naively + * iterate across all groups in the container, without any logic + * to make sure the groups have their state synchronized. For + * certain operations (ENABLE) that might be ok, until an error + * occurs, but for others (GET_STATE) it's clearly broken. 
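The PCIIOMMUOps conversion above is mechanical across all the host bridges in this series. As a minimal sketch of the pattern — the foo names are hypothetical, not part of this series — a bridge that previously registered a bare PCIIOMMUFunc now wraps it in an ops structure:

static AddressSpace *foo_pcihost_set_iommu(PCIBus *bus, void *opaque,
                                           int devfn)
{
    FooState *s = opaque;   /* hypothetical per-bridge state */

    return &s->bm_as;       /* the bridge's bus-master address space */
}

static const PCIIOMMUOps foo_iommu_ops = {
    .get_address_space = foo_pcihost_set_iommu,  /* only mandatory member */
};

/* ...in the bridge's realize/init function: */
pci_setup_iommu(phb->bus, &foo_iommu_ops, s);

Since pci_setup_iommu() now asserts both the ops structure and its get_address_space callback, any bridge that registers ops at all must provide at least this callback.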
+ */ + + /* + * XXX Once fixed kernels exist, test for them here + */ + + if (QLIST_EMPTY(&container->group_list)) { + return false; + } + + if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) { + return false; + } + + return true; +} + +static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) +{ + struct vfio_eeh_pe_op pe_op = { + .argsz = sizeof(pe_op), + .op = op, + }; + int ret; + + if (!vfio_eeh_container_ok(container)) { + error_report("vfio/eeh: EEH_PE_OP 0x%x: " + "kernel requires a container with exactly one group", op); + return -EPERM; + } + + ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op); + if (ret < 0) { + error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op); + return -errno; + } + + return ret; +} + +static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) +{ + VFIOAddressSpace *space = vfio_get_address_space(as); + VFIOContainer *container = NULL; + + if (QLIST_EMPTY(&space->containers)) { + /* No containers to act on */ + goto out; + } + + container = QLIST_FIRST(&space->containers); + + if (QLIST_NEXT(container, next)) { + /* + * We don't yet have logic to synchronize EEH state across + * multiple containers + */ + container = NULL; + goto out; + } + +out: + vfio_put_address_space(space); + return container; +} + +static bool vfio_eeh_as_ok(AddressSpace *as) +{ + VFIOContainer *container = vfio_eeh_as_container(as); + + return (container != NULL) && vfio_eeh_container_ok(container); +} + +static int vfio_eeh_as_op(AddressSpace *as, uint32_t op) +{ + VFIOContainer *container = vfio_eeh_as_container(as); + + if (!container) { + return -ENODEV; + } + return vfio_eeh_container_op(container, op); +} + bool spapr_phb_eeh_available(SpaprPhbState *sphb) { return vfio_eeh_as_ok(&sphb->iommu_as); diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c index 1391dd712c..7c56aad0fc 100644 --- a/hw/remote/iommu.c +++ b/hw/remote/iommu.c @@ -100,6 +100,10 @@ static void remote_iommu_finalize(Object *obj) iommu->elem_by_devfn = NULL; } +static const PCIIOMMUOps remote_iommu_ops = { + .get_address_space = remote_iommu_find_add_as, +}; + void remote_iommu_setup(PCIBus *pci_bus) { RemoteIommu *iommu = NULL; @@ -108,7 +112,7 @@ void remote_iommu_setup(PCIBus *pci_bus) iommu = REMOTE_IOMMU(object_new(TYPE_REMOTE_IOMMU)); - pci_setup_iommu(pci_bus, remote_iommu_find_add_as, iommu); + pci_setup_iommu(pci_bus, &remote_iommu_ops, iommu); object_property_add_child(OBJECT(pci_bus), "remote-iommu", OBJECT(iommu)); diff --git a/hw/rtc/mc146818rtc.c b/hw/rtc/mc146818rtc.c index c27c362db9..2d391a8396 100644 --- a/hw/rtc/mc146818rtc.c +++ b/hw/rtc/mc146818rtc.c @@ -599,7 +599,7 @@ static void rtc_get_time(MC146818RtcState *s, struct tm *tm) static void rtc_set_time(MC146818RtcState *s) { - struct tm tm; + struct tm tm = {}; g_autofree const char *qom_path = object_get_canonical_path(OBJECT(s)); rtc_get_time(s, &tm); diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 2ca36f9f3b..347580ebac 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -652,6 +652,10 @@ static AddressSpace *s390_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &iommu->as; } +static const PCIIOMMUOps s390_iommu_ops = { + .get_address_space = s390_pci_dma_iommu, +}; + static uint8_t set_ind_atomic(uint64_t ind_loc, uint8_t to_be_set) { uint8_t expected, actual; @@ -839,7 +843,7 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) b = pci_register_root_bus(dev, NULL, s390_pci_set_irq, s390_pci_map_irq, NULL, get_system_memory(), 
get_system_io(), 0, 64, TYPE_PCI_BUS); - pci_setup_iommu(b, s390_pci_dma_iommu, s); + pci_setup_iommu(b, &s390_iommu_ops, s); bus = BUS(b); qbus_set_hotplug_handler(bus, OBJECT(dev)); @@ -1058,7 +1062,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, pdev = PCI_DEVICE(dev); pci_bridge_map_irq(pb, dev->id, s390_pci_map_irq); - pci_setup_iommu(&pb->sec_bus, s390_pci_dma_iommu, s); + pci_setup_iommu(&pb->sec_bus, &s390_iommu_ops, s); qbus_set_hotplug_handler(BUS(&pb->sec_bus), OBJECT(s)); diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 5f257bffb9..bbf69ff55a 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -14,7 +14,6 @@ #include <linux/vfio.h> #include <sys/ioctl.h> #include "qapi/error.h" -#include "hw/vfio/vfio.h" #include "hw/vfio/vfio-common.h" #include "hw/s390x/ap-device.h" #include "qemu/error-report.h" diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 6623ae237b..d857bb8d0f 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -20,7 +20,6 @@ #include <sys/ioctl.h> #include "qapi/error.h" -#include "hw/vfio/vfio.h" #include "hw/vfio/vfio-common.h" #include "hw/s390x/s390-ccw.h" #include "hw/s390x/vfio-ccw.h" diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d806057b40..e70fdf5e0c 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -26,7 +26,6 @@ #include <linux/vfio.h> #include "hw/vfio/vfio-common.h" -#include "hw/vfio/vfio.h" #include "hw/vfio/pci.h" #include "exec/address-spaces.h" #include "exec/memory.h" @@ -246,44 +245,6 @@ bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) return true; } -void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, - hwaddr max_iova, uint64_t iova_pgsizes) -{ - VFIOHostDMAWindow *hostwin; - - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (ranges_overlap(hostwin->min_iova, - hostwin->max_iova - hostwin->min_iova + 1, - min_iova, - max_iova - min_iova + 1)) { - hw_error("%s: Overlapped IOMMU are not enabled", __func__); - } - } - - hostwin = g_malloc0(sizeof(*hostwin)); - - hostwin->min_iova = min_iova; - hostwin->max_iova = max_iova; - hostwin->iova_pgsizes = iova_pgsizes; - QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); -} - -int vfio_host_win_del(VFIOContainer *container, - hwaddr min_iova, hwaddr max_iova) -{ - VFIOHostDMAWindow *hostwin; - - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { - QLIST_REMOVE(hostwin, hostwin_next); - g_free(hostwin); - return 0; - } - } - - return -1; -} - static bool vfio_listener_skipped_section(MemoryRegionSection *section) { return (!memory_region_is_ram(section->mr) && @@ -532,22 +493,6 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container, g_free(vrdl); } -static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, - hwaddr iova, hwaddr end) -{ - VFIOHostDMAWindow *hostwin; - bool hostwin_found = false; - - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { - hostwin_found = true; - break; - } - } - - return hostwin_found ? 
hostwin : NULL; -} - static bool vfio_known_safe_misalignment(MemoryRegionSection *section) { MemoryRegion *mr = section->mr; @@ -626,7 +571,6 @@ static void vfio_listener_region_add(MemoryListener *listener, Int128 llend, llsize; void *vaddr; int ret; - VFIOHostDMAWindow *hostwin; Error *err = NULL; if (!vfio_listener_valid_section(section, "region_add")) { @@ -648,13 +592,6 @@ static void vfio_listener_region_add(MemoryListener *listener, goto fail; } - hostwin = vfio_find_hostwin(container, iova, end); - if (!hostwin) { - error_setg(&err, "Container %p can't map guest IOVA region" - " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); - goto fail; - } - memory_region_ref(section->mr); if (memory_region_is_iommu(section->mr)) { @@ -693,6 +630,15 @@ static void vfio_listener_region_add(MemoryListener *listener, goto fail; } + if (container->iova_ranges) { + ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr, + container->iova_ranges, &err); + if (ret) { + g_free(giommu); + goto fail; + } + } + ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, &err); if (ret) { @@ -726,7 +672,7 @@ static void vfio_listener_region_add(MemoryListener *listener, llsize = int128_sub(llend, int128_make64(iova)); if (memory_region_is_ram_device(section->mr)) { - hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; + hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1; if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { trace_vfio_listener_region_add_no_dma_map( @@ -825,12 +771,8 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_ram_device(section->mr)) { hwaddr pgmask; - VFIOHostDMAWindow *hostwin; - - hostwin = vfio_find_hostwin(container, iova, end); - assert(hostwin); /* or region_add() would have failed */ - pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; + pgmask = (1ULL << ctz64(container->pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); } else if (memory_region_has_ram_discard_manager(section->mr)) { vfio_unregister_ram_discard_listener(container, section); diff --git a/hw/vfio/container.c b/hw/vfio/container.c index adc467210f..242010036a 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -20,20 +20,15 @@ #include "qemu/osdep.h" #include <sys/ioctl.h> -#ifdef CONFIG_KVM -#include <linux/kvm.h> -#endif #include <linux/vfio.h> #include "hw/vfio/vfio-common.h" -#include "hw/vfio/vfio.h" #include "exec/address-spaces.h" #include "exec/memory.h" #include "exec/ram_addr.h" #include "hw/hw.h" #include "qemu/error-report.h" #include "qemu/range.h" -#include "sysemu/kvm.h" #include "sysemu/reset.h" #include "trace.h" #include "qapi/error.h" @@ -205,92 +200,6 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova, return -errno; } -int vfio_container_add_section_window(VFIOContainer *container, - MemoryRegionSection *section, - Error **errp) -{ - VFIOHostDMAWindow *hostwin; - hwaddr pgsize = 0; - int ret; - - if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { - return 0; - } - - /* For now intersections are not allowed, we may relax this later */ - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (ranges_overlap(hostwin->min_iova, - hostwin->max_iova - hostwin->min_iova + 1, - section->offset_within_address_space, - int128_get64(section->size))) { - error_setg(errp, - "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing" - "host DMA window [0x%"PRIx64",0x%"PRIx64"]", - section->offset_within_address_space, - section->offset_within_address_space + 
- int128_get64(section->size) - 1, - hostwin->min_iova, hostwin->max_iova); - return -EINVAL; - } - } - - ret = vfio_spapr_create_window(container, section, &pgsize); - if (ret) { - error_setg_errno(errp, -ret, "Failed to create SPAPR window"); - return ret; - } - - vfio_host_win_add(container, section->offset_within_address_space, - section->offset_within_address_space + - int128_get64(section->size) - 1, pgsize); -#ifdef CONFIG_KVM - if (kvm_enabled()) { - VFIOGroup *group; - IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); - struct kvm_vfio_spapr_tce param; - struct kvm_device_attr attr = { - .group = KVM_DEV_VFIO_GROUP, - .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, - .addr = (uint64_t)(unsigned long)&param, - }; - - if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD, - &param.tablefd)) { - QLIST_FOREACH(group, &container->group_list, container_next) { - param.groupfd = group->fd; - if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { - error_setg_errno(errp, errno, - "vfio: failed GROUP_SET_SPAPR_TCE for " - "KVM VFIO device %d and group fd %d", - param.tablefd, param.groupfd); - return -errno; - } - trace_vfio_spapr_group_attach(param.groupfd, param.tablefd); - } - } - } -#endif - return 0; -} - -void vfio_container_del_section_window(VFIOContainer *container, - MemoryRegionSection *section) -{ - if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { - return; - } - - vfio_spapr_remove_window(container, - section->offset_within_address_space); - if (vfio_host_win_del(container, - section->offset_within_address_space, - section->offset_within_address_space + - int128_get64(section->size) - 1) < 0) { - hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx, - __func__, section->offset_within_address_space); - } -} - int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) { int ret; @@ -355,14 +264,6 @@ int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, return ret; } -static void vfio_listener_release(VFIOContainer *container) -{ - memory_listener_unregister(&container->listener); - if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { - memory_listener_unregister(&container->prereg_listener); - } -} - static struct vfio_info_cap_header * vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) { @@ -382,7 +283,7 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, /* If the capability cannot be found, assume no DMA limiting */ hdr = vfio_get_iommu_type1_info_cap(info, VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); - if (hdr == NULL) { + if (!hdr) { return false; } @@ -394,6 +295,32 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, return true; } +static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, + VFIOContainer *container) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_cap_iova_range *cap; + + hdr = vfio_get_iommu_type1_info_cap(info, + VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); + if (!hdr) { + return false; + } + + cap = (void *)hdr; + + for (int i = 0; i < cap->nr_iovas; i++) { + Range *range = g_new(Range, 1); + + range_set_bounds(range, cap->iova_ranges[i].start, + cap->iova_ranges[i].end); + container->iova_ranges = + range_list_insert(container->iova_ranges, range); + } + + return true; +} + static void vfio_kvm_device_add_group(VFIOGroup *group) { Error *err = NULL; @@ -535,6 +462,12 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, } } +static void vfio_free_container(VFIOContainer *container) +{ +
g_list_free_full(container->iova_ranges, g_free); + g_free(container); +} + static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { @@ -616,8 +549,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->error = NULL; container->dirty_pages_supported = false; container->dma_max_mappings = 0; + container->iova_ranges = NULL; QLIST_INIT(&container->giommu_list); - QLIST_INIT(&container->hostwin_list); QLIST_INIT(&container->vrdl_list); ret = vfio_init_container(container, group->fd, errp); @@ -652,84 +585,21 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) { container->dma_max_mappings = 65535; } - vfio_get_iommu_info_migration(container, info); - g_free(info); - /* - * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE - * information to get the actual window extent rather than assume - * a 64-bit IOVA address space. - */ - vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes); + vfio_get_info_iova_range(info, container); + vfio_get_iommu_info_migration(container, info); + g_free(info); break; } case VFIO_SPAPR_TCE_v2_IOMMU: case VFIO_SPAPR_TCE_IOMMU: { - struct vfio_iommu_spapr_tce_info info; - bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; - - /* - * The host kernel code implementing VFIO_IOMMU_DISABLE is called - * when container fd is closed so we do not call it explicitly - * in this file. - */ - if (!v2) { - ret = ioctl(fd, VFIO_IOMMU_ENABLE); - if (ret) { - error_setg_errno(errp, errno, "failed to enable container"); - ret = -errno; - goto enable_discards_exit; - } - } else { - container->prereg_listener = vfio_prereg_listener; - - memory_listener_register(&container->prereg_listener, - &address_space_memory); - if (container->error) { - memory_listener_unregister(&container->prereg_listener); - ret = -1; - error_propagate_prepend(errp, container->error, - "RAM memory listener initialization failed: "); - goto enable_discards_exit; - } - } - - info.argsz = sizeof(info); - ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + ret = vfio_spapr_container_init(container, errp); if (ret) { - error_setg_errno(errp, errno, - "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed"); - ret = -errno; - if (v2) { - memory_listener_unregister(&container->prereg_listener); - } goto enable_discards_exit; } - - if (v2) { - container->pgsizes = info.ddw.pgsizes; - /* - * There is a default window in just created container. - * To make region_add/del simpler, we better remove this - * window now and let those iommu_listener callbacks - * create/remove them when needed. 
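For context, a small sketch of the qemu/range.h helpers that vfio_get_info_iova_range() relies on (the bounds below are made-up values): range_set_bounds() stores an inclusive [lob, upb] pair, and range_list_insert() keeps the GList ordered, which is what later allows the list to be handed to memory_region_iommu_set_iova_ranges() as sorted ranges.

/* assumes "qemu/range.h" and glib; values are illustrative only */
GList *iova_ranges = NULL;
Range *r = g_new(Range, 1);

range_set_bounds(r, 0x0, 0xfedfffffULL);           /* inclusive bounds */
iova_ranges = range_list_insert(iova_ranges, r);   /* ordered insertion */
/* ...one Range per cap->iova_ranges[] entry reported by the kernel... */
g_list_free_full(iova_ranges, g_free);             /* mirrors vfio_free_container() */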
- */ - ret = vfio_spapr_remove_window(container, info.dma32_window_start); - if (ret) { - error_setg_errno(errp, -ret, - "failed to remove existing window"); - goto enable_discards_exit; - } - } else { - /* The default table uses 4K pages */ - container->pgsizes = 0x1000; - vfio_host_win_add(container, info.dma32_window_start, - info.dma32_window_start + - info.dma32_window_size - 1, - 0x1000); - } + break; } } @@ -759,13 +629,17 @@ listener_release_exit: QLIST_REMOVE(group, container_next); QLIST_REMOVE(container, next); vfio_kvm_device_del_group(group); - vfio_listener_release(container); + memory_listener_unregister(&container->listener); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || + container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { + vfio_spapr_container_deinit(container); + } enable_discards_exit: vfio_ram_block_discard_disable(container, false); free_container_exit: - g_free(container); + vfio_free_container(container); close_fd_exit: close(fd); @@ -789,7 +663,11 @@ static void vfio_disconnect_container(VFIOGroup *group) * group. */ if (QLIST_EMPTY(&container->group_list)) { - vfio_listener_release(container); + memory_listener_unregister(&container->listener); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || + container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { + vfio_spapr_container_deinit(container); + } } if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { @@ -800,7 +678,6 @@ static void vfio_disconnect_container(VFIOGroup *group) if (QLIST_EMPTY(&container->group_list)) { VFIOAddressSpace *space = container->space; VFIOGuestIOMMU *giommu, *tmp; - VFIOHostDMAWindow *hostwin, *next; QLIST_REMOVE(container, next); @@ -811,15 +688,9 @@ static void vfio_disconnect_container(VFIOGroup *group) g_free(giommu); } - QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, - next) { - QLIST_REMOVE(hostwin, hostwin_next); - g_free(hostwin); - } - trace_vfio_disconnect_container(container->fd); close(container->fd); - g_free(container); + vfio_free_container(container); vfio_put_address_space(space); } @@ -975,103 +846,6 @@ static void vfio_put_base_device(VFIODevice *vbasedev) close(vbasedev->fd); } -/* - * Interfaces for IBM EEH (Enhanced Error Handling) - */ -static bool vfio_eeh_container_ok(VFIOContainer *container) -{ - /* - * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO - * implementation is broken if there are multiple groups in a - * container. The hardware works in units of Partitionable - * Endpoints (== IOMMU groups) and the EEH operations naively - * iterate across all groups in the container, without any logic - * to make sure the groups have their state synchronized. For - * certain operations (ENABLE) that might be ok, until an error - * occurs, but for others (GET_STATE) it's clearly broken. 
- */ - - /* - * XXX Once fixed kernels exist, test for them here - */ - - if (QLIST_EMPTY(&container->group_list)) { - return false; - } - - if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) { - return false; - } - - return true; -} - -static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) -{ - struct vfio_eeh_pe_op pe_op = { - .argsz = sizeof(pe_op), - .op = op, - }; - int ret; - - if (!vfio_eeh_container_ok(container)) { - error_report("vfio/eeh: EEH_PE_OP 0x%x: " - "kernel requires a container with exactly one group", op); - return -EPERM; - } - - ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op); - if (ret < 0) { - error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op); - return -errno; - } - - return ret; -} - -static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) -{ - VFIOAddressSpace *space = vfio_get_address_space(as); - VFIOContainer *container = NULL; - - if (QLIST_EMPTY(&space->containers)) { - /* No containers to act on */ - goto out; - } - - container = QLIST_FIRST(&space->containers); - - if (QLIST_NEXT(container, next)) { - /* - * We don't yet have logic to synchronize EEH state across - * multiple containers - */ - container = NULL; - goto out; - } - -out: - vfio_put_address_space(space); - return container; -} - -bool vfio_eeh_as_ok(AddressSpace *as) -{ - VFIOContainer *container = vfio_eeh_as_container(as); - - return (container != NULL) && vfio_eeh_container_ok(container); -} - -int vfio_eeh_as_op(AddressSpace *as, uint32_t op) -{ - VFIOContainer *container = vfio_eeh_as_container(as); - - if (!container) { - return -ENODEV; - } - return vfio_eeh_container_op(container, op); -} - static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp) { char *tmp, group_path[PATH_MAX], *group_name; diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 7e5da21b31..168847e7c5 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -23,7 +23,6 @@ #include <sys/ioctl.h> #include "hw/vfio/vfio-common.h" -#include "hw/vfio/vfio.h" #include "hw/hw.h" #include "trace.h" #include "qapi/error.h" diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index b27011cee7..c62c02f7b6 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3081,7 +3081,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) struct stat st; int i, ret; bool is_mdev; - char uuid[UUID_FMT_LEN]; + char uuid[UUID_STR_LEN]; char *name; if (!vbasedev->sysfsdev) { diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 9ec1e95f6d..83da2f7ec2 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -11,6 +11,11 @@ #include "qemu/osdep.h" #include <sys/ioctl.h> #include <linux/vfio.h> +#ifdef CONFIG_KVM +#include <linux/kvm.h> +#endif +#include "sysemu/kvm.h" +#include "exec/address-spaces.h" #include "hw/vfio/vfio-common.h" #include "hw/hw.h" @@ -135,15 +140,90 @@ static void vfio_prereg_listener_region_del(MemoryListener *listener, trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? 
-errno : 0); } -const MemoryListener vfio_prereg_listener = { +static const MemoryListener vfio_prereg_listener = { .name = "vfio-pre-reg", .region_add = vfio_prereg_listener_region_add, .region_del = vfio_prereg_listener_region_del, }; -int vfio_spapr_create_window(VFIOContainer *container, - MemoryRegionSection *section, - hwaddr *pgsize) +static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, + hwaddr max_iova, uint64_t iova_pgsizes) +{ + VFIOHostDMAWindow *hostwin; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + min_iova, + max_iova - min_iova + 1)) { + hw_error("%s: Overlapped IOMMU are not enabled", __func__); + } + } + + hostwin = g_malloc0(sizeof(*hostwin)); + + hostwin->min_iova = min_iova; + hostwin->max_iova = max_iova; + hostwin->iova_pgsizes = iova_pgsizes; + QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); +} + +static int vfio_host_win_del(VFIOContainer *container, + hwaddr min_iova, hwaddr max_iova) +{ + VFIOHostDMAWindow *hostwin; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { + QLIST_REMOVE(hostwin, hostwin_next); + g_free(hostwin); + return 0; + } + } + + return -1; +} + +static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, + hwaddr iova, hwaddr end) +{ + VFIOHostDMAWindow *hostwin; + bool hostwin_found = false; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { + hostwin_found = true; + break; + } + } + + return hostwin_found ? hostwin : NULL; +} + +static int vfio_spapr_remove_window(VFIOContainer *container, + hwaddr offset_within_address_space) +{ + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + .start_addr = offset_within_address_space, + }; + int ret; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + error_report("Failed to remove window at %"PRIx64, + (uint64_t)remove.start_addr); + return -errno; + } + + trace_vfio_spapr_remove_window(offset_within_address_space); + + return 0; +} + +static int vfio_spapr_create_window(VFIOContainer *container, + MemoryRegionSection *section, + hwaddr *pgsize) { int ret = 0; IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); @@ -233,23 +313,195 @@ int vfio_spapr_create_window(VFIOContainer *container, return 0; } -int vfio_spapr_remove_window(VFIOContainer *container, - hwaddr offset_within_address_space) +int vfio_container_add_section_window(VFIOContainer *container, + MemoryRegionSection *section, + Error **errp) { - struct vfio_iommu_spapr_tce_remove remove = { - .argsz = sizeof(remove), - .start_addr = offset_within_address_space, - }; + VFIOHostDMAWindow *hostwin; + hwaddr pgsize = 0; int ret; - ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + /* + * VFIO_SPAPR_TCE_IOMMU supports a single host window between + * [dma32_window_start, dma32_window_size), we need to ensure + * the section falls in this range.
+ */ + if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { + hwaddr iova, end; + + iova = section->offset_within_address_space; + end = iova + int128_get64(section->size) - 1; + + if (!vfio_find_hostwin(container, iova, end)) { + error_setg(errp, "Container %p can't map guest IOVA region" + " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, + iova, end); + return -EINVAL; + } + return 0; + } + + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { + return 0; + } + + /* For now intersections are not allowed, we may relax this later */ + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + section->offset_within_address_space, + int128_get64(section->size))) { + error_setg(errp, + "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing " + "host DMA window [0x%"PRIx64",0x%"PRIx64"]", + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1, + hostwin->min_iova, hostwin->max_iova); + return -EINVAL; + } + } + + ret = vfio_spapr_create_window(container, section, &pgsize); if (ret) { - error_report("Failed to remove window at %"PRIx64, - (uint64_t)remove.start_addr); - return -errno; + error_setg_errno(errp, -ret, "Failed to create SPAPR window"); + return ret; } - trace_vfio_spapr_remove_window(offset_within_address_space); + vfio_host_win_add(container, section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1, pgsize); +#ifdef CONFIG_KVM + if (kvm_enabled()) { + VFIOGroup *group; + IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); + struct kvm_vfio_spapr_tce param; + struct kvm_device_attr attr = { + .group = KVM_DEV_VFIO_GROUP, + .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, + .addr = (uint64_t)(unsigned long)&param, + }; + + if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD, + &param.tablefd)) { + QLIST_FOREACH(group, &container->group_list, container_next) { + param.groupfd = group->fd; + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { + error_setg_errno(errp, errno, + "vfio: failed GROUP_SET_SPAPR_TCE for " + "KVM VFIO device %d and group fd %d", + param.tablefd, param.groupfd); + return -errno; + } + trace_vfio_spapr_group_attach(param.groupfd, param.tablefd); + } + } + } +#endif + return 0; +} + +void vfio_container_del_section_window(VFIOContainer *container, + MemoryRegionSection *section) +{ + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { + return; + } + + vfio_spapr_remove_window(container, + section->offset_within_address_space); + if (vfio_host_win_del(container, + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1) < 0) { + hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx, + __func__, section->offset_within_address_space); + } +} + +int vfio_spapr_container_init(VFIOContainer *container, Error **errp) +{ + struct vfio_iommu_spapr_tce_info info; + bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; + int ret, fd = container->fd; + + QLIST_INIT(&container->hostwin_list); + + /* + * The host kernel code implementing VFIO_IOMMU_DISABLE is called + * when container fd is closed so we do not call it explicitly + * in this file.
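A worked example of the single-window check added above, with invented numbers:

/*
 * Illustration only: if the kernel reports dma32_window_start = 0x0 and
 * dma32_window_size = 0x40000000, vfio_spapr_container_init() records the
 * host window [0x0, 0x3fffffff]. A section programmed at IOVA 0x80000000
 * then makes vfio_find_hostwin(container, 0x80000000, end) return NULL,
 * and vfio_container_add_section_window() fails with -EINVAL.
 */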
+ */ + if (!v2) { + ret = ioctl(fd, VFIO_IOMMU_ENABLE); + if (ret) { + error_setg_errno(errp, errno, "failed to enable container"); + return -errno; + } + } else { + container->prereg_listener = vfio_prereg_listener; + + memory_listener_register(&container->prereg_listener, + &address_space_memory); + if (container->error) { + ret = -1; + error_propagate_prepend(errp, container->error, + "RAM memory listener initialization failed: "); + goto listener_unregister_exit; + } + } + + info.argsz = sizeof(info); + ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + if (ret) { + error_setg_errno(errp, errno, + "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed"); + ret = -errno; + goto listener_unregister_exit; + } + + if (v2) { + container->pgsizes = info.ddw.pgsizes; + /* + * There is a default window in just created container. + * To make region_add/del simpler, we better remove this + * window now and let those iommu_listener callbacks + * create/remove them when needed. + */ + ret = vfio_spapr_remove_window(container, info.dma32_window_start); + if (ret) { + error_setg_errno(errp, -ret, + "failed to remove existing window"); + goto listener_unregister_exit; + } + } else { + /* The default table uses 4K pages */ + container->pgsizes = 0x1000; + vfio_host_win_add(container, info.dma32_window_start, + info.dma32_window_start + + info.dma32_window_size - 1, + 0x1000); + } return 0; + +listener_unregister_exit: + if (v2) { + memory_listener_unregister(&container->prereg_listener); + } + return ret; +} + +void vfio_spapr_container_deinit(VFIOContainer *container) +{ + VFIOHostDMAWindow *hostwin, *next; + + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + memory_listener_unregister(&container->prereg_listener); + } + QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, + next) { + QLIST_REMOVE(hostwin, hostwin_next); + g_free(hostwin); + } } diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 0af7a2886c..637cac4edf 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -135,6 +135,7 @@ virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s" virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s" virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" virtio_iommu_freeze_granule(uint64_t page_size_mask) "granule set to 0x%"PRIx64 +virtio_iommu_host_resv_regions(const char *name, uint32_t index, uint64_t lob, uint64_t upb) "mr=%s host-resv-reg[%d] = [0x%"PRIx64",0x%"PRIx64"]" # virtio-mem.c virtio_mem_send_response(uint16_t type) "type=%" PRIu16 diff --git a/hw/virtio/virtio-iommu-pci.c b/hw/virtio/virtio-iommu-pci.c index 7ef2f9dcdb..9459fbf6ed 100644 --- a/hw/virtio/virtio-iommu-pci.c +++ b/hw/virtio/virtio-iommu-pci.c @@ -37,7 +37,7 @@ struct VirtIOIOMMUPCI { static Property virtio_iommu_pci_properties[] = { DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0), DEFINE_PROP_ARRAY("reserved-regions", VirtIOIOMMUPCI, - vdev.nb_reserved_regions, vdev.reserved_regions, + vdev.nr_prop_resv_regions, vdev.prop_resv_regions, qdev_prop_reserved_region, ReservedRegion), DEFINE_PROP_END_OF_LIST(), }; @@ -54,9 +54,9 @@ static void virtio_iommu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) "for the virtio-iommu-pci device"); return; } - for (int i = 0; i < s->nb_reserved_regions; i++) { - if (s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_RESERVED && - s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_MSI) { + for (int i = 
0; i < s->nr_prop_resv_regions; i++) { + if (s->prop_resv_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_RESERVED && + s->prop_resv_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_MSI) { error_setg(errp, "reserved region %d has an invalid type", i); error_append_hint(errp, "Valid values are 0 and 1\n"); return; diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index be51635895..89fb5767d1 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -20,12 +20,15 @@ #include "qemu/osdep.h" #include "qemu/log.h" #include "qemu/iov.h" +#include "qemu/range.h" +#include "qemu/reserved-region.h" #include "exec/target_page.h" #include "hw/qdev-properties.h" #include "hw/virtio/virtio.h" #include "sysemu/kvm.h" #include "sysemu/reset.h" #include "sysemu/sysemu.h" +#include "qemu/reserved-region.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "trace.h" @@ -378,6 +381,19 @@ static void virtio_iommu_put_domain(gpointer data) g_free(domain); } +static void add_prop_resv_regions(IOMMUDevice *sdev) +{ + VirtIOIOMMU *s = sdev->viommu; + int i; + + for (i = 0; i < s->nr_prop_resv_regions; i++) { + ReservedRegion *reg = g_new0(ReservedRegion, 1); + + *reg = s->prop_resv_regions[i]; + sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg); + } +} + static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, int devfn) { @@ -408,6 +424,7 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX); address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU); + add_prop_resv_regions(sdev); /* * Build the IOMMU disabled container with aliases to the @@ -444,6 +461,10 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, return &sdev->as; } +static const PCIIOMMUOps virtio_iommu_ops = { + .get_address_space = virtio_iommu_find_add_as, +}; + static int virtio_iommu_attach(VirtIOIOMMU *s, struct virtio_iommu_req_attach *req) { @@ -624,29 +645,30 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s, return ret; } -static ssize_t virtio_iommu_fill_resv_mem_prop(VirtIOIOMMU *s, uint32_t ep, +static ssize_t virtio_iommu_fill_resv_mem_prop(IOMMUDevice *sdev, uint32_t ep, uint8_t *buf, size_t free) { struct virtio_iommu_probe_resv_mem prop = {}; size_t size = sizeof(prop), length = size - sizeof(prop.head), total; - int i; - - total = size * s->nb_reserved_regions; + GList *l; + total = size * g_list_length(sdev->resv_regions); if (total > free) { return -ENOSPC; } - for (i = 0; i < s->nb_reserved_regions; i++) { - unsigned subtype = s->reserved_regions[i].type; + for (l = sdev->resv_regions; l; l = l->next) { + ReservedRegion *reg = l->data; + unsigned subtype = reg->type; + Range *range = &reg->range; assert(subtype == VIRTIO_IOMMU_RESV_MEM_T_RESERVED || subtype == VIRTIO_IOMMU_RESV_MEM_T_MSI); prop.head.type = cpu_to_le16(VIRTIO_IOMMU_PROBE_T_RESV_MEM); prop.head.length = cpu_to_le16(length); prop.subtype = subtype; - prop.start = cpu_to_le64(s->reserved_regions[i].low); - prop.end = cpu_to_le64(s->reserved_regions[i].high); + prop.start = cpu_to_le64(range_lob(range)); + prop.end = cpu_to_le64(range_upb(range)); memcpy(buf, &prop, size); @@ -666,19 +688,27 @@ static int virtio_iommu_probe(VirtIOIOMMU *s, uint8_t *buf) { uint32_t ep_id = le32_to_cpu(req->endpoint); + IOMMUMemoryRegion *iommu_mr = virtio_iommu_mr(s, ep_id); size_t free = VIOMMU_PROBE_SIZE; + IOMMUDevice *sdev; ssize_t count; - if (!virtio_iommu_mr(s, ep_id)) { + if (!iommu_mr) { return
VIRTIO_IOMMU_S_NOENT; } - count = virtio_iommu_fill_resv_mem_prop(s, ep_id, buf, free); + sdev = container_of(iommu_mr, IOMMUDevice, iommu_mr); + if (!sdev) { + return -EINVAL; + } + + count = virtio_iommu_fill_resv_mem_prop(sdev, ep_id, buf, free); if (count < 0) { return VIRTIO_IOMMU_S_INVAL; } buf += count; free -= count; + sdev->probe_done = true; return VIRTIO_IOMMU_S_OK; } @@ -856,7 +886,7 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, bool bypass_allowed; int granule; bool found; - int i; + GList *l; interval.low = addr; interval.high = addr + 1; @@ -894,10 +924,10 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, goto unlock; } - for (i = 0; i < s->nb_reserved_regions; i++) { - ReservedRegion *reg = &s->reserved_regions[i]; + for (l = sdev->resv_regions; l; l = l->next) { + ReservedRegion *reg = l->data; - if (addr >= reg->low && addr <= reg->high) { + if (range_contains(&reg->range, addr)) { switch (reg->type) { case VIRTIO_IOMMU_RESV_MEM_T_MSI: entry.perm = flag; @@ -1131,6 +1161,106 @@ static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr, return 0; } +/** + * rebuild_resv_regions: rebuild resv regions with both the + * info of host resv ranges and property set resv ranges + */ +static int rebuild_resv_regions(IOMMUDevice *sdev) +{ + GList *l; + int i = 0; + + /* free the existing list and rebuild it from scratch */ + g_list_free_full(sdev->resv_regions, g_free); + sdev->resv_regions = NULL; + + /* First add host reserved regions if any, all tagged as RESERVED */ + for (l = sdev->host_resv_ranges; l; l = l->next) { + ReservedRegion *reg = g_new0(ReservedRegion, 1); + Range *r = (Range *)l->data; + + reg->type = VIRTIO_IOMMU_RESV_MEM_T_RESERVED; + range_set_bounds(&reg->range, range_lob(r), range_upb(r)); + sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg); + trace_virtio_iommu_host_resv_regions(sdev->iommu_mr.parent_obj.name, i, + range_lob(&reg->range), + range_upb(&reg->range)); + i++; + } + /* + * then add higher priority reserved regions set by the machine + * through properties + */ + add_prop_resv_regions(sdev); + return 0; +} + +/** + * virtio_iommu_set_iova_ranges: Conveys the usable IOVA ranges + * + * The function turns those into reserved ranges. Once some + * reserved ranges have been set, new reserved regions cannot be + * added outside of the original ones.
+ * + * @mr: IOMMU MR + * @iova_ranges: list of usable IOVA ranges + * @errp: error handle + */ +static int virtio_iommu_set_iova_ranges(IOMMUMemoryRegion *mr, + GList *iova_ranges, + Error **errp) +{ + IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr); + GList *current_ranges = sdev->host_resv_ranges; + GList *l, *tmp, *new_ranges = NULL; + int ret = -EINVAL; + + /* check that each new resv region is included in an existing one */ + if (sdev->host_resv_ranges) { + range_inverse_array(iova_ranges, + &new_ranges, + 0, UINT64_MAX); + + for (tmp = new_ranges; tmp; tmp = tmp->next) { + Range *newr = (Range *)tmp->data; + bool included = false; + + for (l = current_ranges; l; l = l->next) { + Range * r = (Range *)l->data; + + if (range_contains_range(r, newr)) { + included = true; + break; + } + } + if (!included) { + goto error; + } + } + /* all new reserved ranges are included in existing ones */ + ret = 0; + goto out; + } + + if (sdev->probe_done) { + warn_report("%s: Notified about new host reserved regions after probe", + mr->parent_obj.name); + } + + range_inverse_array(iova_ranges, + &sdev->host_resv_ranges, + 0, UINT64_MAX); + rebuild_resv_regions(sdev); + + return 0; +error: + error_setg(errp, "IOMMU mr=%s Conflicting host reserved ranges set!", + mr->parent_obj.name); +out: + g_list_free_full(new_ranges, g_free); + return ret; +} + static void virtio_iommu_system_reset(void *opaque) { VirtIOIOMMU *s = opaque; @@ -1206,7 +1336,7 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free); if (s->primary_bus) { - pci_setup_iommu(s->primary_bus, virtio_iommu_find_add_as, s); + pci_setup_iommu(s->primary_bus, &virtio_iommu_ops, s); } else { error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!"); } @@ -1426,6 +1556,7 @@ static void virtio_iommu_memory_region_class_init(ObjectClass *klass, imrc->replay = virtio_iommu_replay; imrc->notify_flag_changed = virtio_iommu_notify_flag_changed; imrc->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask; + imrc->iommu_set_iova_ranges = virtio_iommu_set_iova_ranges; } static const TypeInfo virtio_iommu_info = { diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c index cc24812d2e..c3512c2dae 100644 --- a/hw/virtio/virtio-pmem.c +++ b/hw/virtio/virtio-pmem.c @@ -147,7 +147,10 @@ static void virtio_pmem_fill_device_info(const VirtIOPMEM *pmem, static MemoryRegion *virtio_pmem_get_memory_region(VirtIOPMEM *pmem, Error **errp) { - assert(pmem->memdev); + if (!pmem->memdev) { + error_setg(errp, "'%s' property must be set", VIRTIO_PMEM_MEMDEV_PROP); + return NULL; + } return &pmem->memdev->mr; } diff --git a/include/exec/memory.h b/include/exec/memory.h index 9087d02769..831f7c996d 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -24,6 +24,7 @@ #include "qemu/bswap.h" #include "qemu/queue.h" #include "qemu/int128.h" +#include "qemu/range.h" #include "qemu/notify.h" #include "qom/object.h" #include "qemu/rcu.h" @@ -79,8 +80,7 @@ extern unsigned int global_dirty_tracking; typedef struct MemoryRegionOps MemoryRegionOps; struct ReservedRegion { - hwaddr low; - hwaddr high; + Range range; unsigned type; }; @@ -527,6 +527,26 @@ struct IOMMUMemoryRegionClass { int (*iommu_set_page_size_mask)(IOMMUMemoryRegion *iommu, uint64_t page_size_mask, Error **errp); + /** + * @iommu_set_iova_ranges: + * + * Propagate information about the usable IOVA ranges for a given IOMMU + * memory region. 
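To make the inversion above concrete (the addresses are illustrative of a typical x86 MSI window, not values taken from this series):

/*
 * Example: usable IOVA ranges [0x0, 0xfedfffff] and
 * [0xfef00000, UINT64_MAX] passed through
 * range_inverse_array(..., 0, UINT64_MAX) yield the single reserved
 * range [0xfee00000, 0xfeefffff], which rebuild_resv_regions() then
 * advertises to the guest as a VIRTIO_IOMMU_RESV_MEM_T_RESERVED region.
 */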
Used for example to propagate host physical device + * reserved memory region constraints to the virtual IOMMU. + * + * Optional method: if this method is not provided, then the default IOVA + * aperture is used. + * + * @iommu: the IOMMUMemoryRegion + * + * @iova_ranges: list of ordered IOVA ranges (at least one range) + * + * Returns 0 on success, or a negative error. In case of failure, the error + * object must be created. + */ + int (*iommu_set_iova_ranges)(IOMMUMemoryRegion *iommu, + GList *iova_ranges, + Error **errp); }; typedef struct RamDiscardListener RamDiscardListener; @@ -1857,6 +1877,18 @@ int memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion *iommu_mr, Error **errp); /** + * memory_region_iommu_set_iova_ranges - Set the usable IOVA ranges + * for a given IOMMU MR region + * + * @iommu: IOMMU memory region + * @iova_ranges: list of ordered IOVA ranges (at least one range) + * @errp: pointer to Error*, to store an error if it happens. + */ +int memory_region_iommu_set_iova_ranges(IOMMUMemoryRegion *iommu, + GList *iova_ranges, + Error **errp); + +/** * memory_region_name: get a memory region's name * * Returns the string that was used to initialize the memory region. diff --git a/include/hw/hyperv/dynmem-proto.h b/include/hw/hyperv/dynmem-proto.h new file mode 100644 index 0000000000..d0f9090ac4 --- /dev/null +++ b/include/hw/hyperv/dynmem-proto.h @@ -0,0 +1,423 @@ +#ifndef HW_HYPERV_DYNMEM_PROTO_H +#define HW_HYPERV_DYNMEM_PROTO_H + +/* + * Hyper-V Dynamic Memory Protocol definitions + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * Based on drivers/hv/hv_balloon.c from Linux kernel: + * Copyright (c) 2012, Microsoft Corporation. + * + * Author: K. Y. Srinivasan <kys@microsoft.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +/* + * Protocol versions. The low word is the minor version, the high word the major + * version. + * + * History: + * Initial version 1.0 + * Changed to 0.1 on 2009/03/25 + * Changes to 0.2 on 2009/05/14 + * Changes to 0.3 on 2009/12/03 + * Changed to 1.0 on 2011/04/05 + * Changed to 2.0 on 2019/12/10 + */ + +#define DYNMEM_MAKE_VERSION(Major, Minor) ((uint32_t)(((Major) << 16) | (Minor))) +#define DYNMEM_MAJOR_VERSION(Version) ((uint32_t)(Version) >> 16) +#define DYNMEM_MINOR_VERSION(Version) ((uint32_t)(Version) & 0xff) + +enum { + DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3), + DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0), + DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(2, 0), + + DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1, + DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2, + DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3, + + DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10 +}; + + + +/* + * Message Types + */ + +enum dm_message_type { + /* + * Version 0.3 + */ + DM_ERROR = 0, + DM_VERSION_REQUEST = 1, + DM_VERSION_RESPONSE = 2, + DM_CAPABILITIES_REPORT = 3, + DM_CAPABILITIES_RESPONSE = 4, + DM_STATUS_REPORT = 5, + DM_BALLOON_REQUEST = 6, + DM_BALLOON_RESPONSE = 7, + DM_UNBALLOON_REQUEST = 8, + DM_UNBALLOON_RESPONSE = 9, + DM_MEM_HOT_ADD_REQUEST = 10, + DM_MEM_HOT_ADD_RESPONSE = 11, + DM_VERSION_03_MAX = 11, + /* + * Version 1.0. + */ + DM_INFO_MESSAGE = 12, + DM_VERSION_1_MAX = 12, + + /* + * Version 2.0 + */ + DM_MEM_HOT_REMOVE_REQUEST = 13, + DM_MEM_HOT_REMOVE_RESPONSE = 14 +}; + + +/* + * Structures defining the dynamic memory management + * protocol. 
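As a sanity check on the version encoding defined above:

/*
 * DYNMEM_MAKE_VERSION(2, 0) == 0x00020000, so
 * DYNMEM_MAJOR_VERSION(0x00020000) == 2 and
 * DYNMEM_MINOR_VERSION(0x00020000) == 0; this is
 * DYNMEM_PROTOCOL_VERSION_3, i.e. the Windows 10 protocol.
 * Note that DYNMEM_MINOR_VERSION() masks only the low 8 bits even
 * though the minor field occupies the low 16-bit word.
 */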
+ */ + +union dm_version { + struct { + uint16_t minor_version; + uint16_t major_version; + }; + uint32_t version; +} QEMU_PACKED; + + +union dm_caps { + struct { + uint64_t balloon:1; + uint64_t hot_add:1; + /* + * To support guests that may have alignment + * limitations on hot-add, the guest can specify + * its alignment requirements; a value of n + * represents an alignment of 2^n in megabytes. + */ + uint64_t hot_add_alignment:4; + uint64_t hot_remove:1; + uint64_t reservedz:57; + } cap_bits; + uint64_t caps; +} QEMU_PACKED; + +union dm_mem_page_range { + struct { + /* + * The PFN number of the first page in the range. + * 40 bits is the architectural limit of a PFN + * number for AMD64. + */ + uint64_t start_page:40; + /* + * The number of pages in the range. + */ + uint64_t page_cnt:24; + } finfo; + uint64_t page_range; +} QEMU_PACKED; + + + +/* + * The header for all dynamic memory messages: + * + * type: Type of the message. + * size: Size of the message in bytes, including the header. + * trans_id: The guest is responsible for manufacturing this ID. + */ + +struct dm_header { + uint16_t type; + uint16_t size; + uint32_t trans_id; +} QEMU_PACKED; + +/* + * A generic message format for dynamic memory. + * Specific message formats are defined later in the file. + */ + +struct dm_message { + struct dm_header hdr; + uint8_t data[]; /* enclosed message */ +} QEMU_PACKED; + + +/* + * Specific message types supporting the dynamic memory protocol. + */ + +/* + * Version negotiation message. Sent from the guest to the host. + * The guest is free to try different versions until the host + * accepts the version. + * + * dm_version: The protocol version requested. + * is_last_attempt: If TRUE, this is the last version the guest will request. + * reservedz: Reserved field, set to zero. + */ + +struct dm_version_request { + struct dm_header hdr; + union dm_version version; + uint32_t is_last_attempt:1; + uint32_t reservedz:31; +} QEMU_PACKED; + +/* + * Version response message; sent from the host to the guest, indicating + * whether the host has accepted the version sent by the guest. + * + * is_accepted: If TRUE, the host has accepted the version and the guest + * should proceed to the next stage of the protocol. FALSE indicates that + * the guest should retry with a different version. + * + * reservedz: Reserved field, set to zero. + */ + +struct dm_version_response { + struct dm_header hdr; + uint64_t is_accepted:1; + uint64_t reservedz:63; +} QEMU_PACKED; + +/* + * Message reporting capabilities. This is sent from the guest to the + * host. + */ + +struct dm_capabilities { + struct dm_header hdr; + union dm_caps caps; + uint64_t min_page_cnt; + uint64_t max_page_number; +} QEMU_PACKED; + +/* + * Response to the capabilities message. This is sent from the host to the + * guest. This message indicates whether the host has accepted the guest's + * capabilities. If the host has not accepted, the guest must shut down + * the service. + * + * is_accepted: Indicates if the host has accepted the guest's capabilities. + * reservedz: Must be 0. + */ + +struct dm_capabilities_resp_msg { + struct dm_header hdr; + uint64_t is_accepted:1; + uint64_t hot_remove:1; + uint64_t suppress_pressure_reports:1; + uint64_t reservedz:61; +} QEMU_PACKED; + +/* + * This message is used to report memory pressure from the guest. + * This message is not part of any transaction and there is no + * response to this message. + * + * num_avail: Available memory in pages. + * num_committed: Committed memory in pages.
+ * page_file_size: The accumulated size of all page files + * in the system in pages. + * zero_free: The number of zero and free pages. + * page_file_writes: The writes to the page file in pages. + * io_diff: An indicator of file cache efficiency or page file activity, + * calculated as File Cache Page Fault Count - Page Read Count. + * This value is in pages. + * + * Some of these metrics are Windows-specific and fortunately + * the algorithm on the host side that computes the guest memory + * pressure only uses the num_committed value. + */ + +struct dm_status { + struct dm_header hdr; + uint64_t num_avail; + uint64_t num_committed; + uint64_t page_file_size; + uint64_t zero_free; + uint32_t page_file_writes; + uint32_t io_diff; +} QEMU_PACKED; + + +/* + * Message to ask the guest to allocate memory - balloon up message. + * This message is sent from the host to the guest. The guest may not be + * able to allocate as much memory as requested. + * + * num_pages: number of pages to allocate. + */ + +struct dm_balloon { + struct dm_header hdr; + uint32_t num_pages; + uint32_t reservedz; +} QEMU_PACKED; + + +/* + * Balloon response message; this message is sent from the guest + * to the host in response to the balloon message. + * + * reservedz: Reserved; must be set to zero. + * more_pages: If FALSE, this is the last message of the transaction. + * if TRUE there will be at least one more message from the guest. + * + * range_count: The number of ranges in the range array. + * + * range_array: An array of page ranges returned to the host. + * + */ + +struct dm_balloon_response { + struct dm_header hdr; + uint32_t reservedz; + uint32_t more_pages:1; + uint32_t range_count:31; + union dm_mem_page_range range_array[]; +} QEMU_PACKED; + +/* + * Un-balloon message; this message is sent from the host + * to the guest to give the guest more memory. + * + * more_pages: If FALSE, this is the last message of the transaction. + * if TRUE there will be at least one more message from the guest. + * + * reservedz: Reserved; must be set to zero. + * + * range_count: The number of ranges in the range array. + * + * range_array: An array of page ranges returned to the host. + * + */ + +struct dm_unballoon_request { + struct dm_header hdr; + uint32_t more_pages:1; + uint32_t reservedz:31; + uint32_t range_count; + union dm_mem_page_range range_array[]; +} QEMU_PACKED; + +/* + * Un-balloon response message; this message is sent from the guest + * to the host in response to an unballoon request. + * + */ + +struct dm_unballoon_response { + struct dm_header hdr; +} QEMU_PACKED; + + +/* + * Hot add request message. Message sent from the host to the guest. + * + * mem_range: Memory range to hot add. + * + */ + +struct dm_hot_add { + struct dm_header hdr; + union dm_mem_page_range range; +} QEMU_PACKED; + +/* + * Hot add response message. + * This message is sent by the guest to report the status of a hot add request. + * If page_count is less than the requested page count, then the host should + * assume all further hot add requests will fail, since this indicates that + * the guest has hit an upper physical memory barrier. + * + * Hot adds may also fail due to low resources; in this case, the guest must + * not complete this message until the hot add can succeed, and the host must + * not send a new hot add request until the response is sent. + * If VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS + * times it fails the request. + * + * + * page_count: number of pages that were successfully hot added.
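To illustrate the 40/24-bit dm_mem_page_range packing defined earlier (this assumes the protocol's 4 KiB page granularity; the values are invented):

/*
 * Describing 1 GiB of guest memory starting at GPA 4 GiB:
 *   start_page = 4 GiB / 4 KiB = 0x100000  (fits in 40 bits)
 *   page_cnt   = 1 GiB / 4 KiB = 0x40000   (fits in 24 bits)
 * A single range therefore covers at most about 2^24 pages (64 GiB);
 * larger transactions need multiple range_array[] entries.
 */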
+ +/* + * Hot add response message. + * This message is sent by the guest to report the status of a hot add request. + * If page_count is less than the requested page count, then the host should + * assume all further hot add requests will fail, since this indicates that + * the guest has hit an upper physical memory barrier. + * + * Hot adds may also fail due to low resources; in this case, the guest must + * not complete this message until the hot add can succeed, and the host must + * not send a new hot add request until the response is sent. + * If the VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS + * times, it fails the request. + * + * + * page_count: number of pages that were successfully hot added. + * + * result: result of the operation; 1: success, 0: failure. + * + */ + +struct dm_hot_add_response { + struct dm_header hdr; + uint32_t page_count; + uint32_t result; +} QEMU_PACKED; + +struct dm_hot_remove { + struct dm_header hdr; + uint32_t virtual_node; + uint32_t page_count; + uint32_t qos_flags; + uint32_t reservedZ; +} QEMU_PACKED; + +struct dm_hot_remove_response { + struct dm_header hdr; + uint32_t result; + uint32_t range_count; + uint64_t more_pages:1; + uint64_t reservedz:63; + union dm_mem_page_range range_array[]; +} QEMU_PACKED; + +#define DM_REMOVE_QOS_LARGE (1 << 0) +#define DM_REMOVE_QOS_LOCAL (1 << 1) +#define DM_REMOVE_QOS_MASK (0x3) + +/* + * Types of information sent from the host to the guest. + */ + +enum dm_info_type { + INFO_TYPE_MAX_PAGE_CNT = 0, + MAX_INFO_TYPE +}; + + +/* + * Header for the information message. + */ + +struct dm_info_header { + enum dm_info_type type; + uint32_t data_size; + uint8_t data[]; +} QEMU_PACKED; + +/* + * This message is sent from the host to the guest to pass + * some relevant information (win8 addition). + * + * reserved: not used. + * info_size: size of the information blob. + * info: information blob. + */ + +struct dm_info_msg { + struct dm_header hdr; + uint32_t reserved; + uint32_t info_size; + uint8_t info[]; +}; + +#endif
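The info message is a small type/length/value container. A hedged sketch of how a receiver might consume it, assuming the only currently defined type, INFO_TYPE_MAX_PAGE_CNT, carries a 64-bit page count in the payload (memcpy() avoids alignment assumptions about data[]):

    static void handle_info(const struct dm_info_header *ih)
    {
        uint64_t max_page_count;

        if (ih->type != INFO_TYPE_MAX_PAGE_CNT ||
            ih->data_size != sizeof(max_page_count)) {
            return; /* unknown or malformed info record */
        }
        memcpy(&max_page_count, ih->data, sizeof(max_page_count));
        /* e.g. remember the cap for sizing future hot-add requests */
    }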
diff --git a/include/hw/hyperv/hv-balloon.h b/include/hw/hyperv/hv-balloon.h new file mode 100644 index 0000000000..c1efe70fc2 --- /dev/null +++ b/include/hw/hyperv/hv-balloon.h @@ -0,0 +1,18 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef HW_HV_BALLOON_H +#define HW_HV_BALLOON_H + +#include "qom/object.h" + +#define TYPE_HV_BALLOON "hv-balloon" +OBJECT_DECLARE_SIMPLE_TYPE(HvBalloon, HV_BALLOON) + +#endif diff --git a/include/hw/mem/memory-device.h b/include/hw/mem/memory-device.h index 3354d6c166..a1d62cc551 100644 --- a/include/hw/mem/memory-device.h +++ b/include/hw/mem/memory-device.h @@ -38,6 +38,10 @@ typedef struct MemoryDeviceState MemoryDeviceState; * address in guest physical memory can either be specified explicitly * or get assigned automatically. * + * Some memory devices might not own a memory region in certain device + * configurations. Such devices can logically get (un)plugged; however, + * empty memory devices are mostly ignored by the memory device code. + * * Conceptually, memory devices only span one memory region. If multiple * successive memory regions are used, a covering memory region has to * be provided. Scattered memory regions are not supported for single @@ -91,7 +95,8 @@ struct MemoryDeviceClass { uint64_t (*get_plugged_size)(const MemoryDeviceState *md, Error **errp); /* - * Return the memory region of the memory device. + * Return the memory region of the memory device. If the device is + * completely empty, returns NULL without an error. * * Called when (un)plugging the memory device, to (un)map the * memory region in guest physical memory, but also to detect the diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index ea5aff118b..fa6313aabc 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -363,10 +363,42 @@ void pci_bus_get_w64_range(PCIBus *bus, Range *range); void pci_device_deassert_intx(PCIDevice *dev); -typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int); + +/** + * struct PCIIOMMUOps: callbacks structure for specific IOMMU handlers + * of a PCIBus + * + * Allows modifying the behavior of some IOMMU operations of the PCI + * framework for a set of devices on a PCI bus. + */ +typedef struct PCIIOMMUOps { + /** + * @get_address_space: get the address space for a set of devices + * on a PCI bus. + * + * Mandatory callback which returns a pointer to an #AddressSpace + * + * @bus: the #PCIBus being accessed. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number + */ + AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn); +} PCIIOMMUOps; AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque); + +/** + * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus + * + * Lets PCI host bridges define specific operations. + * + * @bus: the #PCIBus being updated. + * @ops: the #PCIIOMMUOps + * @opaque: passed to callbacks of the @ops structure. + */ +void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque); pcibus_t pci_bar_address(PCIDevice *d, int reg, uint8_t type, pcibus_t size); diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h index 5653175957..2261312546 100644 --- a/include/hw/pci/pci_bus.h +++ b/include/hw/pci/pci_bus.h @@ -33,7 +33,7 @@ enum PCIBusFlags { struct PCIBus { BusState qbus; enum PCIBusFlags flags; - PCIIOMMUFunc iommu_fn; + const PCIIOMMUOps *iommu_ops; void *iommu_opaque; uint8_t devfn_min; uint32_t slot_reserved_mask;
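On the consumer side nothing changes with this conversion: device models still resolve their DMA address space through pci_device_iommu_address_space(), which now dispatches through the bus's iommu_ops and falls back to the system address space when no ops were registered. A hedged sketch of that call path (the function name and its arguments are illustrative only, not from this patch):

    static void my_device_dma_read(PCIDevice *dev, uint64_t addr,
                                   void *buf, uint64_t len)
    {
        /* Resolves through iommu_ops->get_address_space(), if set */
        AddressSpace *as = pci_device_iommu_address_space(dev);

        address_space_read(as, addr, MEMTXATTRS_UNSPECIFIED, buf, len);
    }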
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 7780b9073a..a4a22accb9 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -99,6 +99,7 @@ typedef struct VFIOContainer { QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_ENTRY(VFIOContainer) next; QLIST_HEAD(, VFIODevice) device_list; + GList *iova_ranges; } VFIOContainer; typedef struct VFIOGuestIOMMU { @@ -206,11 +207,6 @@ typedef struct { hwaddr pages; } VFIOBitmap; -void vfio_host_win_add(VFIOContainer *container, - hwaddr min_iova, hwaddr max_iova, - uint64_t iova_pgsizes); -int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova, - hwaddr max_iova); VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); void vfio_put_address_space(VFIOAddressSpace *space); bool vfio_devices_all_running_and_saving(VFIOContainer *container); @@ -224,11 +220,14 @@ int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start); int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); +/* SPAPR specific */ int vfio_container_add_section_window(VFIOContainer *container, MemoryRegionSection *section, Error **errp); void vfio_container_del_section_window(VFIOContainer *container, MemoryRegionSection *section); +int vfio_spapr_container_init(VFIOContainer *container, Error **errp); +void vfio_spapr_container_deinit(VFIOContainer *container); void vfio_disable_irqindex(VFIODevice *vbasedev, int index); void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index); @@ -288,13 +287,6 @@ vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id); struct vfio_info_cap_header * vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id); #endif -extern const MemoryListener vfio_prereg_listener; - -int vfio_spapr_create_window(VFIOContainer *container, - MemoryRegionSection *section, - hwaddr *pgsize); -int vfio_spapr_remove_window(VFIOContainer *container, - hwaddr offset_within_address_space); bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); void vfio_migration_exit(VFIODevice *vbasedev); diff --git a/include/hw/vfio/vfio.h b/include/hw/vfio/vfio.h deleted file mode 100644 index 86248f5436..0000000000 --- a/include/hw/vfio/vfio.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef HW_VFIO_H -#define HW_VFIO_H - -bool vfio_eeh_as_ok(AddressSpace *as); -int vfio_eeh_as_op(AddressSpace *as, uint32_t op); - -#endif diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index a93fc5383e..781ebaea8f 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -39,6 +39,9 @@ typedef struct IOMMUDevice { AddressSpace as; MemoryRegion root; /* The root container of the device */ MemoryRegion bypass_mr; /* The alias of shared memory MR */ + GList *resv_regions; + GList *host_resv_ranges; + bool probe_done; } IOMMUDevice; typedef struct IOMMUPciBus { @@ -55,8 +58,8 @@ struct VirtIOIOMMU { GHashTable *as_by_busptr; IOMMUPciBus *iommu_pcibus_by_bus_num[PCI_BUS_MAX]; PCIBus *primary_bus; - ReservedRegion *reserved_regions; - uint32_t nb_reserved_regions; + ReservedRegion *prop_resv_regions; + uint32_t nr_prop_resv_regions; GTree *domains; QemuRecMutex mutex; GTree *endpoints; diff --git a/include/qemu/range.h b/include/qemu/range.h index 7e2b1cc447..205e1da76d 100644 --- a/include/qemu/range.h +++ b/include/qemu/range.h @@ -217,6 +217,20 @@ static inline int ranges_overlap(uint64_t first1, uint64_t len1, return !(last2 < first1 || last1 < first2); } +/* + * Return -1 if @a < @b, 1 if @a > @b, and 0 if they touch or overlap. + * Both @a and @b must not be empty. + */ +int range_compare(Range *a, Range *b); + GList *range_list_insert(GList *list, Range *data); +/* + * Invert an array of sorted ranges over the [low, high] span, i.e. the + * original ranges become holes in the newly allocated out_ranges + */ +void range_inverse_array(GList *in_ranges, + GList **out_ranges, + uint64_t low, uint64_t high); + #endif
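As a quick illustration of the new helper's semantics (this mirrors the simplest case exercised by the unit test added later in this series): inverting a single range that starts at 0x10000 over the full 64-bit span yields one hole covering everything below it.

    GList *in = NULL, *out = NULL;
    Range *r = g_new0(Range, 1);

    range_set_bounds(r, 0x10000, UINT64_MAX);
    in = range_list_insert(in, r);
    range_inverse_array(in, &out, 0x0, UINT64_MAX);
    /* out now holds exactly one Range: [0x0, 0xFFFF] */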
diff --git a/include/qemu/reserved-region.h b/include/qemu/reserved-region.h new file mode 100644 index 0000000000..8e6f0a97e2 --- /dev/null +++ b/include/qemu/reserved-region.h @@ -0,0 +1,32 @@ +/* + * QEMU ReservedRegion helpers + * + * Copyright (c) 2023 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef QEMU_RESERVED_REGION_H +#define QEMU_RESERVED_REGION_H + +#include "exec/memory.h" + +/* + * Insert a new region into a sorted list of reserved regions. In case + * there is overlap with existing regions, the newly added region has + * higher priority and replaces the overlapped segment. + */ +GList *resv_region_list_insert(GList *list, ReservedRegion *reg); + +#endif diff --git a/include/qemu/uuid.h b/include/qemu/uuid.h index e24a1099e4..869f84af09 100644 --- a/include/qemu/uuid.h +++ b/include/qemu/uuid.h @@ -78,9 +78,10 @@ typedef struct { "%02hhx%02hhx-" \ "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx" -#define UUID_FMT_LEN 36 - #define UUID_NONE "00000000-0000-0000-0000-000000000000" +QEMU_BUILD_BUG_ON(sizeof(UUID_NONE) - 1 != 36); + +#define UUID_STR_LEN sizeof(UUID_NONE) void qemu_uuid_generate(QemuUUID *out); diff --git a/include/sysemu/kvm_xen.h b/include/sysemu/kvm_xen.h index 595abfbe40..961c702c4e 100644 --- a/include/sysemu/kvm_xen.h +++ b/include/sysemu/kvm_xen.h @@ -22,6 +22,7 @@ int kvm_xen_soft_reset(void); uint32_t kvm_xen_get_caps(void); void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id); +bool kvm_xen_has_vcpu_callback_vector(void); void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type); void kvm_xen_set_callback_asserted(void); int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port); diff --git a/io/channel-socket.c b/io/channel-socket.c index 02ffb51e99..3a899b0608 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -782,6 +782,11 @@ static int qio_channel_socket_flush(QIOChannel *ioc, "Error not from zero copy"); return -1; } + if (serr->ee_data < serr->ee_info) { + error_setg_errno(errp, serr->ee_origin, + "Wrong notification bounds"); + return -1; + } /* No errors, count successfully finished sendmsg()*/ sioc->zero_copy_sent += serr->ee_data - serr->ee_info + 1; diff --git a/meson.build b/meson.build index dcef8b1e79..51a51075db 100644 --- a/meson.build +++ b/meson.build @@ -1323,6 +1323,30 @@ if not get_option('glusterfs').auto() or have_block endif endif +hv_balloon = false +if get_option('hv_balloon').allowed() and have_system + if cc.links(''' + #include <string.h> + #include <gmodule.h> + int main(void) { + GTree *tree; + + tree = g_tree_new((GCompareFunc)strcmp); + (void)g_tree_node_first(tree); + g_tree_destroy(tree); + return 0; + } + ''', dependencies: glib) + hv_balloon = true + else + if get_option('hv_balloon').enabled() + error('could not enable hv-balloon, update your glib') + else + warning('could not find glib support for hv-balloon, disabling') + endif + endif +endif + libssh = not_found if not get_option('libssh').auto() or have_block libssh = dependency('libssh', version: '>=0.8.7', @@ -2855,7 +2879,8 @@ host_kconfig = \ (targetos == 'linux' ? ['CONFIG_LINUX=y'] : []) + \ (have_pvrdma ? ['CONFIG_PVRDMA=y'] : []) + \ (multiprocess_allowed ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : []) + \ - (vfio_user_server_allowed ? ['CONFIG_VFIO_USER_SERVER_ALLOWED=y'] : []) + (vfio_user_server_allowed ? ['CONFIG_VFIO_USER_SERVER_ALLOWED=y'] : []) + \ + (hv_balloon ?
['CONFIG_HV_BALLOON_POSSIBLE=y'] : []) ignored = [ 'TARGET_XML_FILES', 'TARGET_ABI_DIR', 'TARGET_ARCH' ] @@ -4321,6 +4346,7 @@ if targetos == 'windows' endif summary_info += {'seccomp support': seccomp} summary_info += {'GlusterFS support': glusterfs} +summary_info += {'hv-balloon support': hv_balloon} summary_info += {'TPM support': have_tpm} summary_info += {'libssh support': libssh} summary_info += {'lzo support': lzo} diff --git a/meson_options.txt b/meson_options.txt index 3c7398f3c6..5c212fcd45 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -150,6 +150,8 @@ option('gio', type : 'feature', value : 'auto', description: 'use libgio for D-Bus support') option('glusterfs', type : 'feature', value : 'auto', description: 'Glusterfs block device driver') +option('hv_balloon', type : 'feature', value : 'auto', + description: 'hv-balloon driver (requires Glib 2.68+ GTree API)') option('libdw', type : 'feature', value : 'auto', description: 'debuginfo support') option('libiscsi', type : 'feature', value : 'auto', diff --git a/migration/savevm.c b/migration/savevm.c index bc98c2ea6f..eec5503a42 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -471,8 +471,8 @@ static bool vmstate_uuid_needed(void *opaque) static int vmstate_uuid_post_load(void *opaque, int version_id) { SaveState *state = opaque; - char uuid_src[UUID_FMT_LEN + 1]; - char uuid_dst[UUID_FMT_LEN + 1]; + char uuid_src[UUID_STR_LEN]; + char uuid_dst[UUID_STR_LEN]; if (!qemu_uuid_set) { /* diff --git a/monitor/monitor.c b/monitor/monitor.c index 941f87815a..01ede1babd 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -315,6 +315,7 @@ static MonitorQAPIEventConf monitor_qapi_event_conf[QAPI_EVENT__MAX] = { [QAPI_EVENT_QUORUM_FAILURE] = { 1000 * SCALE_MS }, [QAPI_EVENT_VSERPORT_CHANGE] = { 1000 * SCALE_MS }, [QAPI_EVENT_MEMORY_DEVICE_SIZE_CHANGE] = { 1000 * SCALE_MS }, + [QAPI_EVENT_HV_BALLOON_STATUS_REPORT] = { 1000 * SCALE_MS }, }; /* diff --git a/qapi/machine.json b/qapi/machine.json index 6c9d2f6dcf..b6d634b30d 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -1138,6 +1138,68 @@ 'data': { 'actual': 'int' } } ## +# @HvBalloonInfo: +# +# hv-balloon guest-provided memory status information. +# +# @committed: the amount of memory in use inside the guest plus the +# amount of the memory unusable inside the guest (ballooned out, +# offline, etc.) +# +# @available: the amount of the memory inside the guest available for +# new allocations ("free") +# +# Since: 8.2 +## +{ 'struct': 'HvBalloonInfo', + 'data': { 'committed': 'size', 'available': 'size' } } + +## +# @query-hv-balloon-status-report: +# +# Returns the hv-balloon driver data contained in the last received "STATUS" +# message from the guest. +# +# Returns: +# - @HvBalloonInfo on success +# - If no hv-balloon device is present, guest memory status reporting +# is not enabled or no guest memory status report received yet, +# GenericError +# +# Since: 8.2 +# +# Example: +# +# -> { "execute": "query-hv-balloon-status-report" } +# <- { "return": { +# "committed": 816640000, +# "available": 3333054464 +# } +# } +## +{ 'command': 'query-hv-balloon-status-report', 'returns': 'HvBalloonInfo' } + +## +# @HV_BALLOON_STATUS_REPORT: +# +# Emitted when the hv-balloon driver receives a "STATUS" message from +# the guest. +# +# Note: this event is rate-limited. 
+# +# Since: 8.2 +# +# Example: +# +# <- { "event": "HV_BALLOON_STATUS_REPORT", +# "data": { "committed": 816640000, "available": 3333054464 }, +# "timestamp": { "seconds": 1600295492, "microseconds": 661044 } } +# +## +{ 'event': 'HV_BALLOON_STATUS_REPORT', + 'data': 'HvBalloonInfo' } + +## # @MemoryInfo: # # Actual memory information in bytes. @@ -1290,6 +1352,29 @@ } ## +# @HvBalloonDeviceInfo: +# +# hv-balloon provided memory state information +# +# @id: device's ID +# +# @memaddr: physical address in memory, where device is mapped +# +# @max-size: the maximum size of memory that the device can provide +# +# @memdev: memory backend linked with device +# +# Since: 8.2 +## +{ 'struct': 'HvBalloonDeviceInfo', + 'data': { '*id': 'str', + '*memaddr': 'size', + 'max-size': 'size', + '*memdev': 'str' + } +} + +## # @MemoryDeviceInfoKind: # # @nvdimm: since 2.12 @@ -1300,10 +1385,13 @@ # # @sgx-epc: since 6.2. # +# @hv-balloon: since 8.2. +# # Since: 2.1 ## { 'enum': 'MemoryDeviceInfoKind', - 'data': [ 'dimm', 'nvdimm', 'virtio-pmem', 'virtio-mem', 'sgx-epc' ] } + 'data': [ 'dimm', 'nvdimm', 'virtio-pmem', 'virtio-mem', 'sgx-epc', + 'hv-balloon' ] } ## # @PCDIMMDeviceInfoWrapper: @@ -1338,6 +1426,14 @@ 'data': { 'data': 'SgxEPCDeviceInfo' } } ## +# @HvBalloonDeviceInfoWrapper: +# +# Since: 8.2 +## +{ 'struct': 'HvBalloonDeviceInfoWrapper', + 'data': { 'data': 'HvBalloonDeviceInfo' } } + +## # @MemoryDeviceInfo: # # Union containing information about a memory device @@ -1351,7 +1447,8 @@ 'nvdimm': 'PCDIMMDeviceInfoWrapper', 'virtio-pmem': 'VirtioPMEMDeviceInfoWrapper', 'virtio-mem': 'VirtioMEMDeviceInfoWrapper', - 'sgx-epc': 'SgxEPCDeviceInfoWrapper' + 'sgx-epc': 'SgxEPCDeviceInfoWrapper', + 'hv-balloon': 'HvBalloonDeviceInfoWrapper' } } diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 7ca4b77eae..e9d6d39279 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -123,6 +123,7 @@ meson_options_help() { printf "%s\n" ' gtk-clipboard clipboard support for the gtk UI (EXPERIMENTAL, MAY HANG)' printf "%s\n" ' guest-agent Build QEMU Guest Agent' printf "%s\n" ' guest-agent-msi Build MSI package for the QEMU Guest Agent' + printf "%s\n" ' hv-balloon hv-balloon driver (requires Glib 2.68+ GTree API)' printf "%s\n" ' hvf HVF acceleration support' printf "%s\n" ' iconv Font glyph conversion support' printf "%s\n" ' jack JACK sound support' @@ -333,6 +334,8 @@ _meson_option_parse() { --disable-guest-agent-msi) printf "%s" -Dguest_agent_msi=disabled ;; --enable-hexagon-idef-parser) printf "%s" -Dhexagon_idef_parser=true ;; --disable-hexagon-idef-parser) printf "%s" -Dhexagon_idef_parser=false ;; + --enable-hv-balloon) printf "%s" -Dhv_balloon=enabled ;; + --disable-hv-balloon) printf "%s" -Dhv_balloon=disabled ;; --enable-hvf) printf "%s" -Dhvf=enabled ;; --disable-hvf) printf "%s" -Dhvf=disabled ;; --iasl=*) quote_sh "-Diasl=$2" ;; diff --git a/system/memory.c b/system/memory.c index 4928f2525d..304fa843ea 100644 --- a/system/memory.c +++ b/system/memory.c @@ -1921,6 +1921,19 @@ int memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion *iommu_mr, return ret; } +int memory_region_iommu_set_iova_ranges(IOMMUMemoryRegion *iommu_mr, + GList *iova_ranges, + Error **errp) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); + int ret = 0; + + if (imrc->iommu_set_iova_ranges) { + ret = imrc->iommu_set_iova_ranges(iommu_mr, iova_ranges, errp); + } + return ret; +} + int memory_region_register_iommu_notifier(MemoryRegion *mr, 
IOMMUNotifier *n, Error **errp) { diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode index 0cf1147074..8a20dce3c8 100644 --- a/target/arm/tcg/a64.decode +++ b/target/arm/tcg/a64.decode @@ -462,7 +462,7 @@ LDAPR sz:2 111 0 00 1 0 1 11111 1100 00 rn:5 rt:5 # Load/store register (pointer authentication) # LDRA immediate is 10 bits signed and scaled, but the bits aren't all contiguous -%ldra_imm 22:s1 12:9 !function=times_2 +%ldra_imm 22:s1 12:9 !function=times_8 LDRA 11 111 0 00 m:1 . 1 ......... w:1 1 rn:5 rt:5 imm=%ldra_imm diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h index 9efe00cf6c..3c3bb3431a 100644 --- a/target/arm/tcg/translate.h +++ b/target/arm/tcg/translate.h @@ -205,6 +205,11 @@ static inline int times_4(DisasContext *s, int x) return x * 4; } +static inline int times_8(DisasContext *s, int x) +{ + return x * 8; +} + static inline int times_2_plus_1(DisasContext *s, int x) { return x * 2 + 1; diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c index 76348f9d5d..75b2c557b9 100644 --- a/target/i386/kvm/xen-emu.c +++ b/target/i386/kvm/xen-emu.c @@ -267,7 +267,6 @@ static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu, fi.submap |= 1 << XENFEAT_writable_page_tables | 1 << XENFEAT_writable_descriptor_tables | 1 << XENFEAT_auto_translated_physmap | - 1 << XENFEAT_supervisor_mode_kernel | 1 << XENFEAT_hvm_callback_vector | 1 << XENFEAT_hvm_safe_pvclock | 1 << XENFEAT_hvm_pirqs; @@ -307,7 +306,7 @@ static int kvm_xen_set_vcpu_callback_vector(CPUState *cs) trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector); - return kvm_vcpu_ioctl(cs, KVM_XEN_HVM_SET_ATTR, &xva); + return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva); } static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data) @@ -425,6 +424,13 @@ void kvm_xen_set_callback_asserted(void) } } +bool kvm_xen_has_vcpu_callback_vector(void) +{ + CPUState *cs = qemu_get_cpu(0); + + return cs && !!X86_CPU(cs)->env.xen_vcpu_callback_vector; +} + void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type) { CPUState *cs = qemu_get_cpu(vcpu_id); @@ -441,7 +447,8 @@ void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type) * deliver it as an MSI. 
*/ MSIMessage msg = { - .address = APIC_DEFAULT_ADDRESS | X86_CPU(cs)->apic_id, + .address = APIC_DEFAULT_ADDRESS | + (X86_CPU(cs)->apic_id << MSI_ADDR_DEST_ID_SHIFT), .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT), }; kvm_irqchip_send_msi(kvm_state, msg); @@ -850,8 +857,7 @@ static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu, int ret = -ENOSYS; switch (cmd) { case HVMOP_set_evtchn_upcall_vector: - ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, - exit->u.hcall.params[0]); + ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, arg); break; case HVMOP_pagetable_dying: diff --git a/tests/data/acpi/virt/DBG2 b/tests/data/acpi/virt/DBG2 index 86e6314f7b..0a05e1a47f 100644 --- a/tests/data/acpi/virt/DBG2 +++ b/tests/data/acpi/virt/DBG2 Binary files differdiff --git a/tests/data/acpi/virt/SPCR b/tests/data/acpi/virt/SPCR index 24e0a579e7..cf0f2b7522 100644 --- a/tests/data/acpi/virt/SPCR +++ b/tests/data/acpi/virt/SPCR Binary files differdiff --git a/tests/qtest/qmp-cmd-test.c b/tests/qtest/qmp-cmd-test.c index 73a670e8fa..2c15f60958 100644 --- a/tests/qtest/qmp-cmd-test.c +++ b/tests/qtest/qmp-cmd-test.c @@ -45,6 +45,7 @@ static int query_error_class(const char *cmd) { "query-acpi-ospm-status", ERROR_CLASS_GENERIC_ERROR }, { "query-balloon", ERROR_CLASS_DEVICE_NOT_ACTIVE }, { "query-hotpluggable-cpus", ERROR_CLASS_GENERIC_ERROR }, + { "query-hv-balloon-status-report", ERROR_CLASS_GENERIC_ERROR }, { "query-vm-generation-id", ERROR_CLASS_GENERIC_ERROR }, /* Only valid with a USB bus added */ { "x-query-usb", ERROR_CLASS_GENERIC_ERROR }, diff --git a/tests/unit/meson.build b/tests/unit/meson.build index f33ae64b8d..e6c51e7a86 100644 --- a/tests/unit/meson.build +++ b/tests/unit/meson.build @@ -21,6 +21,7 @@ tests = { 'test-opts-visitor': [testqapi], 'test-visitor-serialization': [testqapi], 'test-bitmap': [], + 'test-resv-mem': [], # all code tested by test-x86-cpuid is inside topology.h 'test-x86-cpuid': [], 'test-cutils': [], diff --git a/tests/unit/test-resv-mem.c b/tests/unit/test-resv-mem.c new file mode 100644 index 0000000000..5963274e2c --- /dev/null +++ b/tests/unit/test-resv-mem.c @@ -0,0 +1,316 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * + * reserved-region/range.c unit-tests. + * + * Copyright (C) 2023, Red Hat, Inc. 
+ * + * Author: Eric Auger <eric.auger@redhat.com> + */ + +#include "qemu/osdep.h" +#include "qemu/range.h" +#include "exec/memory.h" +#include "qemu/reserved-region.h" + +#define DEBUG 0 + +#if DEBUG +static void print_ranges(const char *prefix, GList *ranges) +{ + GList *l; + int i = 0; + + if (!g_list_length(ranges)) { + printf("%s is void\n", prefix); + return; + } + for (l = ranges; l; l = l->next) { + Range *r = (Range *)l->data; + + printf("%s rev[%i] = [0x%"PRIx64",0x%"PRIx64"]\n", + prefix, i, range_lob(r), range_upb(r)); + i++; + } +} +#endif + +static void compare_ranges(const char *prefix, GList *ranges, + GList *expected) +{ + GList *l, *e; + +#if DEBUG + print_ranges("out", ranges); + print_ranges("expected", expected); +#endif + g_assert_cmpint(g_list_length(ranges), ==, g_list_length(expected)); + for (l = ranges, e = expected; l; l = l->next, e = e->next) { + Range *r = (Range *)l->data; + Range *er = (Range *)e->data; + + g_assert_true(range_lob(r) == range_lob(er) && + range_upb(r) == range_upb(er)); + } +} + +static GList *insert_sorted_range(GList *list, uint64_t lob, uint64_t upb) +{ + Range *new = g_new0(Range, 1); + + range_set_bounds(new, lob, upb); + return range_list_insert(list, new); +} + +static void reset(GList **in, GList **out, GList **expected) +{ + g_list_free_full(*in, g_free); + g_list_free_full(*out, g_free); + g_list_free_full(*expected, g_free); + *in = NULL; + *out = NULL; + *expected = NULL; +} + +static void +run_range_inverse_array(const char *prefix, GList **in, GList **expected, + uint64_t low, uint64_t high) +{ + GList *out = NULL; + range_inverse_array(*in, &out, low, high); + compare_ranges(prefix, out, *expected); + reset(in, &out, expected); +} + +static void check_range_reverse_array(void) +{ + GList *in = NULL, *expected = NULL; + + /* test 1 */ + + in = insert_sorted_range(in, 0x10000, UINT64_MAX); + expected = insert_sorted_range(expected, 0x0, 0xFFFF); + run_range_inverse_array("test1", &in, &expected, 0x0, UINT64_MAX); + + /* test 2 */ + + in = insert_sorted_range(in, 0x10000, 0xFFFFFFFFFFFF); + expected = insert_sorted_range(expected, 0x0, 0xFFFF); + expected = insert_sorted_range(expected, 0x1000000000000, UINT64_MAX); + run_range_inverse_array("test2", &in, &expected, 0x0, UINT64_MAX); + + /* test 3 */ + + in = insert_sorted_range(in, 0x0, 0xFFFF); + in = insert_sorted_range(in, 0x10000, 0x2FFFF); + expected = insert_sorted_range(expected, 0x30000, UINT64_MAX); + run_range_inverse_array("test3", &in, &expected, 0x0, UINT64_MAX); + + /* test 4 */ + + in = insert_sorted_range(in, 0x50000, 0x5FFFF); + in = insert_sorted_range(in, 0x60000, 0xFFFFFFFFFFFF); + expected = insert_sorted_range(expected, 0x0, 0x4FFFF); + expected = insert_sorted_range(expected, 0x1000000000000, UINT64_MAX); + run_range_inverse_array("test4", &in, &expected, 0x0, UINT64_MAX); + + /* test 5 */ + + in = insert_sorted_range(in, 0x0, UINT64_MAX); + run_range_inverse_array("test5", &in, &expected, 0x0, UINT64_MAX); + + /* test 6 */ + in = insert_sorted_range(in, 0x10000, 0x1FFFF); + in = insert_sorted_range(in, 0x30000, 0x6FFFF); + in = insert_sorted_range(in, 0x90000, UINT64_MAX); + expected = insert_sorted_range(expected, 0x0, 0xFFFF); + expected = insert_sorted_range(expected, 0x20000, 0x2FFFF); + expected = insert_sorted_range(expected, 0x70000, 0x8FFFF); + run_range_inverse_array("test6", &in, &expected, 0x0, UINT64_MAX); +} + +static void check_range_reverse_array_low_end(void) +{ + GList *in = NULL, *expected = NULL; + + /* test 1 */ + in = 
insert_sorted_range(in, 0x0, UINT64_MAX); + run_range_inverse_array("test1", &in, &expected, 0x10000, 0xFFFFFF); + + /* test 2 */ + + in = insert_sorted_range(in, 0x0, 0xFFFF); + in = insert_sorted_range(in, 0x20000, 0x2FFFF); + expected = insert_sorted_range(expected, 0x40000, 0xFFFFFFFFFFFF); + run_range_inverse_array("test2", &in, &expected, 0x40000, 0xFFFFFFFFFFFF); + + /* test 3 */ + in = insert_sorted_range(in, 0x0, 0xFFFF); + in = insert_sorted_range(in, 0x20000, 0x2FFFF); + in = insert_sorted_range(in, 0x1000000000000, UINT64_MAX); + expected = insert_sorted_range(expected, 0x40000, 0xFFFFFFFFFFFF); + run_range_inverse_array("test3", &in, &expected, 0x40000, 0xFFFFFFFFFFFF); + + /* test 4 */ + + in = insert_sorted_range(in, 0x0, 0xFFFF); + in = insert_sorted_range(in, 0x20000, 0x2FFFF); + in = insert_sorted_range(in, 0x1000000000000, UINT64_MAX); + expected = insert_sorted_range(expected, 0x30000, 0xFFFFFFFFFFFF); + run_range_inverse_array("test4", &in, &expected, 0x20000, 0xFFFFFFFFFFFF); + + /* test 5 */ + + in = insert_sorted_range(in, 0x2000, 0xFFFF); + in = insert_sorted_range(in, 0x20000, 0x2FFFF); + in = insert_sorted_range(in, 0x100000000, 0x1FFFFFFFF); + expected = insert_sorted_range(expected, 0x1000, 0x1FFF); + expected = insert_sorted_range(expected, 0x10000, 0x1FFFF); + expected = insert_sorted_range(expected, 0x30000, 0xFFFFFFFF); + expected = insert_sorted_range(expected, 0x200000000, 0xFFFFFFFFFFFF); + run_range_inverse_array("test5", &in, &expected, 0x1000, 0xFFFFFFFFFFFF); + + /* test 6 */ + + in = insert_sorted_range(in, 0x10000000, 0x1FFFFFFF); + in = insert_sorted_range(in, 0x100000000, 0x1FFFFFFFF); + expected = insert_sorted_range(expected, 0x0, 0xFFFF); + run_range_inverse_array("test6", &in, &expected, 0x0, 0xFFFF); +} + +static ReservedRegion *alloc_resv_mem(unsigned type, uint64_t lob, uint64_t upb) +{ + ReservedRegion *r; + + r = g_new0(ReservedRegion, 1); + r->type = type; + range_set_bounds(&r->range, lob, upb); + return r; +} + +static void print_resv_region_list(const char *prefix, GList *list, + uint32_t expected_length) +{ + int i = g_list_length(list); + + g_assert_cmpint(i, ==, expected_length); +#if DEBUG + i = 0; + for (GList *l = list; l; l = l->next) { + ReservedRegion *r = (ReservedRegion *)l->data; + Range *range = &r->range; + + printf("%s item[%d]=[0x%x, 0x%"PRIx64", 0x%"PRIx64"]\n", + prefix, i++, r->type, range_lob(range), range_upb(range)); + } +#endif +} + +static void free_resv_region(gpointer data) +{ + ReservedRegion *reg = (ReservedRegion *)data; + + g_free(reg); +} + +static void check_resv_region_list_insert(void) +{ + ReservedRegion *r[10]; + GList *l = NULL; + + r[0] = alloc_resv_mem(0xA, 0, 0xFFFF); + r[1] = alloc_resv_mem(0xA, 0x20000, 0x2FFFF); + l = resv_region_list_insert(l, r[0]); + l = resv_region_list_insert(l, r[1]); + print_resv_region_list("test1", l, 2); + + /* adjacent on left */ + r[2] = alloc_resv_mem(0xB, 0x0, 0xFFF); + l = resv_region_list_insert(l, r[2]); + /* adjacent on right */ + r[3] = alloc_resv_mem(0xC, 0x21000, 0x2FFFF); + l = resv_region_list_insert(l, r[3]); + print_resv_region_list("test2", l, 4); + + /* exact overlap of D into C */ + r[4] = alloc_resv_mem(0xD, 0x21000, 0x2FFFF); + l = resv_region_list_insert(l, r[4]); + print_resv_region_list("test3", l, 4); + + /* in the middle */ + r[5] = alloc_resv_mem(0xE, 0x22000, 0x23FFF); + l = resv_region_list_insert(l, r[5]); + print_resv_region_list("test4", l, 6); + + /* overwrites several existing ones */ + r[6] = alloc_resv_mem(0xF, 0x10000, 0x2FFFF);
+ l = resv_region_list_insert(l, r[6]); + print_resv_region_list("test5", l, 3); + + /* contiguous at the end */ + r[7] = alloc_resv_mem(0x0, 0x30000, 0x40000); + l = resv_region_list_insert(l, r[7]); + print_resv_region_list("test6", l, 4); + + g_list_free_full(l, free_resv_region); + l = NULL; + + r[0] = alloc_resv_mem(0x0, 0x10000, 0x1FFFF); + l = resv_region_list_insert(l, r[0]); + /* insertion before the 1st item */ + r[1] = alloc_resv_mem(0x1, 0x0, 0xFF); + l = resv_region_list_insert(l, r[1]); + print_resv_region_list("test8", l, 2); + + /* collision on the left side */ + r[2] = alloc_resv_mem(0xA, 0x1200, 0x11FFF); + l = resv_region_list_insert(l, r[2]); + print_resv_region_list("test9", l, 3); + + /* collision on the right side */ + r[3] = alloc_resv_mem(0xA, 0x1F000, 0x2FFFF); + l = resv_region_list_insert(l, r[3]); + print_resv_region_list("test10", l, 4); + + /* override everything */ + r[4] = alloc_resv_mem(0xF, 0x0, UINT64_MAX); + l = resv_region_list_insert(l, r[4]); + print_resv_region_list("test11", l, 1); + + g_list_free_full(l, free_resv_region); + l = NULL; + + r[0] = alloc_resv_mem(0xF, 0x1000000000000, UINT64_MAX); + l = resv_region_list_insert(l, r[0]); + print_resv_region_list("test12", l, 1); + + r[1] = alloc_resv_mem(0xA, 0x0, 0xFFFFFFF); + l = resv_region_list_insert(l, r[1]); + print_resv_region_list("test12", l, 2); + + r[2] = alloc_resv_mem(0xB, 0x100000000, 0x1FFFFFFFF); + l = resv_region_list_insert(l, r[2]); + print_resv_region_list("test12", l, 3); + + r[3] = alloc_resv_mem(0x0, 0x010000000, 0x2FFFFFFFF); + l = resv_region_list_insert(l, r[3]); + print_resv_region_list("test12", l, 3); + + g_list_free_full(l, free_resv_region); +} + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + + g_test_add_func("/resv-mem/range_reverse_array", + check_range_reverse_array); + g_test_add_func("/resv-mem/range_reverse_array_low_end", + check_range_reverse_array_low_end); + g_test_add_func("/resv-mem/resv_region_list_insert", + check_resv_region_list_insert); + + g_test_run(); + + return 0; +} diff --git a/tests/unit/test-uuid.c b/tests/unit/test-uuid.c index aedc125ae9..739b91583c 100644 --- a/tests/unit/test-uuid.c +++ b/tests/unit/test-uuid.c @@ -145,7 +145,7 @@ static void test_uuid_unparse(void) int i; for (i = 0; i < ARRAY_SIZE(uuid_test_data); i++) { - char out[37]; + char out[UUID_STR_LEN]; if (!uuid_test_data[i].check_unparse) { continue; diff --git a/util/filemonitor-inotify.c b/util/filemonitor-inotify.c index 2c45f7f176..2121111f38 100644 --- a/util/filemonitor-inotify.c +++ b/util/filemonitor-inotify.c @@ -81,16 +81,25 @@ static void qemu_file_monitor_watch(void *arg) /* Loop over all events in the buffer */ while (used < len) { - struct inotify_event *ev = - (struct inotify_event *)(buf + used); - const char *name = ev->len ? ev->name : ""; - QFileMonitorDir *dir = g_hash_table_lookup(mon->idmap, - GINT_TO_POINTER(ev->wd)); - uint32_t iev = ev->mask & - (IN_CREATE | IN_MODIFY | IN_DELETE | IN_IGNORED | - IN_MOVED_TO | IN_MOVED_FROM | IN_ATTRIB); + const char *name; + QFileMonitorDir *dir; + uint32_t iev; int qev; gsize i; + struct inotify_event *ev = (struct inotify_event *)(buf + used); + + /* + * We trust the kernel to provide a valid buffer with complete + * event records. + */ + assert(len - used >= sizeof(struct inotify_event)); + assert(len - used - sizeof(struct inotify_event) >= ev->len); + + name = ev->len ? 
ev->name : ""; + dir = g_hash_table_lookup(mon->idmap, GINT_TO_POINTER(ev->wd)); + iev = ev->mask & + (IN_CREATE | IN_MODIFY | IN_DELETE | IN_IGNORED | + IN_MOVED_TO | IN_MOVED_FROM | IN_ATTRIB); used += sizeof(struct inotify_event) + ev->len; diff --git a/util/meson.build b/util/meson.build index 769b24f2e0..c3a24af5c5 100644 --- a/util/meson.build +++ b/util/meson.build @@ -52,6 +52,7 @@ util_ss.add(files('qdist.c')) util_ss.add(files('qht.c')) util_ss.add(files('qsp.c')) util_ss.add(files('range.c')) +util_ss.add(files('reserved-region.c')) util_ss.add(files('stats64.c')) util_ss.add(files('systemd.c')) util_ss.add(files('transactions.c')) diff --git a/util/range.c b/util/range.c index 098d9d2dc0..9605ccfcbe 100644 --- a/util/range.c +++ b/util/range.c @@ -20,11 +20,7 @@ #include "qemu/osdep.h" #include "qemu/range.h" -/* - * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap. - * Both @a and @b must not be empty. - */ -static inline int range_compare(Range *a, Range *b) +int range_compare(Range *a, Range *b) { assert(!range_is_empty(a) && !range_is_empty(b)); @@ -70,3 +66,58 @@ GList *range_list_insert(GList *list, Range *data) return list; } + +static inline +GList *append_new_range(GList *list, uint64_t lob, uint64_t upb) +{ + Range *new = g_new0(Range, 1); + + range_set_bounds(new, lob, upb); + return g_list_append(list, new); +} + + +void range_inverse_array(GList *in, GList **rev, + uint64_t low, uint64_t high) +{ + Range *r, *rn; + GList *l = in, *out = *rev; + + for (l = in; l && range_upb(l->data) < low; l = l->next) { + continue; + } + + if (!l) { + out = append_new_range(out, low, high); + goto exit; + } + r = (Range *)l->data; + + /* first range lob is greater than low, insert a first range */ + if (range_lob(r) > low) { + out = append_new_range(out, low, MIN(range_lob(r) - 1, high)); + } + + /* insert a range in between each pair of original ranges until we reach high */ + for (; l->next; l = l->next) { + r = (Range *)l->data; + rn = (Range *)l->next->data; + if (range_lob(r) >= high) { + goto exit; + } + if (range_compare(r, rn)) { + out = append_new_range(out, range_upb(r) + 1, + MIN(range_lob(rn) - 1, high)); + } + } + + /* last range */ + r = (Range *)l->data; + + /* last range upb is less than high, insert a last range */ + if (range_upb(r) < high) { + out = append_new_range(out, range_upb(r) + 1, high); + } +exit: + *rev = out; +} diff --git a/util/reserved-region.c b/util/reserved-region.c new file mode 100644 index 0000000000..18f83eb4c6 --- /dev/null +++ b/util/reserved-region.c @@ -0,0 +1,91 @@ +/* + * QEMU ReservedRegion helpers + * + * Copyright (c) 2023 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/range.h" +#include "qemu/reserved-region.h" + +GList *resv_region_list_insert(GList *list, ReservedRegion *reg) +{ + ReservedRegion *resv_iter, *new_reg; + Range *r = &reg->range; + Range *range_iter; + GList *l; + + for (l = list; l;) { + resv_iter = (ReservedRegion *)l->data; + range_iter = &resv_iter->range; + + /* Skip all list elements strictly less than range to add */ + if (range_compare(range_iter, r) < 0) { + l = l->next; + } else if (range_compare(range_iter, r) > 0) { + return g_list_insert_before(list, l, reg); + } else { /* there is an overlap */ + if (range_contains_range(r, range_iter)) { + /* new range contains current item, simply remove the latter */ + GList *prev = l->prev; + g_free(l->data); + list = g_list_delete_link(list, l); + if (prev) { + l = prev->next; + } else { + l = list; + } + } else if (range_contains_range(range_iter, r)) { + /* new region is included in the current region */ + if (range_lob(range_iter) == range_lob(r)) { + /* adjacent on the left side, derives into 2 regions */ + range_set_bounds(range_iter, range_upb(r) + 1, + range_upb(range_iter)); + return g_list_insert_before(list, l, reg); + } else if (range_upb(range_iter) == range_upb(r)) { + /* adjacent on the right side, derives into 2 regions */ + range_set_bounds(range_iter, range_lob(range_iter), + range_lob(r) - 1); + l = l->next; + } else { + uint64_t lob = range_lob(range_iter); + /* + * the new range is in the middle of an existing one, + * split the latter into 3 regions instead + */ + range_set_bounds(range_iter, range_upb(r) + 1, + range_upb(range_iter)); + new_reg = g_new0(ReservedRegion, 1); + new_reg->type = resv_iter->type; + range_set_bounds(&new_reg->range, + lob, range_lob(r) - 1); + list = g_list_insert_before(list, l, new_reg); + return g_list_insert_before(list, l, reg); + } + } else if (range_lob(r) < range_lob(range_iter)) { + range_set_bounds(range_iter, range_upb(r) + 1, + range_upb(range_iter)); + return g_list_insert_before(list, l, reg); + } else { /* intersection on the upper range */ + range_set_bounds(range_iter, range_lob(range_iter), + range_lob(r) - 1); + l = l->next; + } + } /* overlap */ + } + return g_list_append(list, reg); +} + diff --git a/util/uuid.c b/util/uuid.c index d71aa79e5e..234619dd5e 100644 --- a/util/uuid.c +++ b/util/uuid.c @@ -51,7 +51,7 @@ int qemu_uuid_is_equal(const QemuUUID *lhv, const QemuUUID *rhv) void qemu_uuid_unparse(const QemuUUID *uuid, char *out) { const unsigned char *uu = &uuid->data[0]; - snprintf(out, UUID_FMT_LEN + 1, UUID_FMT, + snprintf(out, UUID_STR_LEN, UUID_FMT, uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7], uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]); }
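With UUID_FMT_LEN gone, every call site sizes its buffer with UUID_STR_LEN, which already includes room for the NUL terminator. In short, the pattern the conversions above follow:

    QemuUUID uuid;
    char buf[UUID_STR_LEN];

    qemu_uuid_generate(&uuid);
    qemu_uuid_unparse(&uuid, buf); /* 36 characters plus the trailing NUL */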