diff options
Diffstat (limited to 'hw')
62 files changed, 3581 insertions, 525 deletions
diff --git a/hw/alpha/typhoon.c b/hw/alpha/typhoon.c index 49a80550c5..e8711ae16a 100644 --- a/hw/alpha/typhoon.c +++ b/hw/alpha/typhoon.c @@ -738,6 +738,10 @@ static AddressSpace *typhoon_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &s->pchip.iommu_as; } +static const PCIIOMMUOps typhoon_iommu_ops = { + .get_address_space = typhoon_pci_dma_iommu, +}; + static void typhoon_set_irq(void *opaque, int irq, int level) { TyphoonState *s = opaque; @@ -897,7 +901,7 @@ PCIBus *typhoon_init(MemoryRegion *ram, qemu_irq *p_isa_irq, "iommu-typhoon", UINT64_MAX); address_space_init(&s->pchip.iommu_as, MEMORY_REGION(&s->pchip.iommu), "pchip0-pci"); - pci_setup_iommu(b, typhoon_pci_dma_iommu, s); + pci_setup_iommu(b, &typhoon_iommu_ops, s); /* Pchip0 PCI special/interrupt acknowledge, 0x801.F800.0000, 64MB. */ memory_region_init_io(&s->pchip.reg_iack, OBJECT(s), &alpha_pci_iack_ops, diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index f35ae9aa22..9a8ac45431 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -605,6 +605,10 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) return &sdev->as; } +static const PCIIOMMUOps smmu_ops = { + .get_address_space = smmu_find_add_as, +}; + IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) { uint8_t bus_n, devfn; @@ -661,7 +665,7 @@ static void smmu_base_realize(DeviceState *dev, Error **errp) s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL); if (s->primary_bus) { - pci_setup_iommu(s->primary_bus, smmu_find_add_as, s); + pci_setup_iommu(s->primary_bus, &smmu_ops, s); } else { error_setg(errp, "SMMU is not attached to any PCI bus!"); } diff --git a/hw/arm/vexpress.c b/hw/arm/vexpress.c index 8ff37f52ca..c08ea34e92 100644 --- a/hw/arm/vexpress.c +++ b/hw/arm/vexpress.c @@ -177,7 +177,6 @@ struct VexpressMachineState { MemoryRegion vram; MemoryRegion sram; MemoryRegion flashalias; - MemoryRegion lowram; MemoryRegion a15sram; bool secure; bool virt; @@ -276,7 +275,6 @@ static void a9_daughterboard_init(VexpressMachineState *vms, { MachineState *machine = MACHINE(vms); MemoryRegion *sysmem = get_system_memory(); - ram_addr_t low_ram_size; if (ram_size > 0x40000000) { /* 1GB is the maximum the address space permits */ @@ -284,17 +282,11 @@ static void a9_daughterboard_init(VexpressMachineState *vms, exit(1); } - low_ram_size = ram_size; - if (low_ram_size > 0x4000000) { - low_ram_size = 0x4000000; - } - /* RAM is from 0x60000000 upwards. The bottom 64MB of the + /* + * RAM is from 0x60000000 upwards. The bottom 64MB of the * address space should in theory be remappable to various - * things including ROM or RAM; we always map the RAM there. + * things including ROM or RAM; we always map the flash there. */ - memory_region_init_alias(&vms->lowram, NULL, "vexpress.lowmem", - machine->ram, 0, low_ram_size); - memory_region_add_subregion(sysmem, 0x0, &vms->lowram); memory_region_add_subregion(sysmem, 0x60000000, machine->ram); /* 0x1e000000 A9MPCore (SCU) private memory region */ diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 9ce136cd88..8bc35a483c 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -482,7 +482,7 @@ build_spcr(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, 3, 1); /* ARM PL011 UART */ build_append_int_noprefix(table_data, 0, 3); /* Reserved */ /* Base Address */ - build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 8, 0, 1, + build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 32, 0, 3, vms->memmap[VIRT_UART].base); /* Interrupt Type */ build_append_int_noprefix(table_data, @@ -673,7 +673,7 @@ build_dbg2(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, 34, 2); /* BaseAddressRegister[] */ - build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 8, 0, 1, + build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 32, 0, 3, vms->memmap[VIRT_UART].base); /* AddressSize[] */ diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 92085d2d8f..0a16ab3095 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -631,7 +631,8 @@ static void fdt_add_pmu_nodes(const VirtMachineState *vms) qemu_fdt_setprop(ms->fdt, "/pmu", "compatible", compat, sizeof(compat)); qemu_fdt_setprop_cells(ms->fdt, "/pmu", "interrupts", - GIC_FDT_IRQ_TYPE_PPI, VIRTUAL_PMU_IRQ, irqflags); + GIC_FDT_IRQ_TYPE_PPI, + INTID_TO_PPI(VIRTUAL_PMU_IRQ), irqflags); } } diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c index a07cd7eb5d..bfa53960c3 100644 --- a/hw/block/xen-block.c +++ b/hw/block/xen-block.c @@ -115,9 +115,13 @@ static void xen_block_connect(XenDevice *xendev, Error **errp) return; } - if (xen_device_frontend_scanf(xendev, "protocol", "%ms", - &str) != 1) { - protocol = BLKIF_PROTOCOL_NATIVE; + if (xen_device_frontend_scanf(xendev, "protocol", "%ms", &str) != 1) { + /* x86 defaults to the 32-bit protocol even for 64-bit guests. */ + if (object_dynamic_cast(OBJECT(qdev_get_machine()), "x86-machine")) { + protocol = BLKIF_PROTOCOL_X86_32; + } else { + protocol = BLKIF_PROTOCOL_NATIVE; + } } else { if (strcmp(str, XEN_IO_PROTO_ABI_X86_32) == 0) { protocol = BLKIF_PROTOCOL_X86_32; diff --git a/hw/core/loader.c b/hw/core/loader.c index 4dd5a71fb7..b7bb44b7f7 100644 --- a/hw/core/loader.c +++ b/hw/core/loader.c @@ -558,7 +558,7 @@ static void zfree(void *x, void *addr) ssize_t gunzip(void *dst, size_t dstlen, uint8_t *src, size_t srclen) { - z_stream s; + z_stream s = {}; ssize_t dstbytes; int r, i, flags; diff --git a/hw/core/machine-hmp-cmds.c b/hw/core/machine-hmp-cmds.c index 9a4b59c6f2..a6ff6a4875 100644 --- a/hw/core/machine-hmp-cmds.c +++ b/hw/core/machine-hmp-cmds.c @@ -253,6 +253,7 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict) MemoryDeviceInfo *value; PCDIMMDeviceInfo *di; SgxEPCDeviceInfo *se; + HvBalloonDeviceInfo *hi; for (info = info_list; info; info = info->next) { value = info->value; @@ -310,6 +311,20 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict) monitor_printf(mon, " node: %" PRId64 "\n", se->node); monitor_printf(mon, " memdev: %s\n", se->memdev); break; + case MEMORY_DEVICE_INFO_KIND_HV_BALLOON: + hi = value->u.hv_balloon.data; + monitor_printf(mon, "Memory device [%s]: \"%s\"\n", + MemoryDeviceInfoKind_str(value->type), + hi->id ? hi->id : ""); + if (hi->has_memaddr) { + monitor_printf(mon, " memaddr: 0x%" PRIx64 "\n", + hi->memaddr); + } + monitor_printf(mon, " max-size: %" PRIu64 "\n", hi->max_size); + if (hi->memdev) { + monitor_printf(mon, " memdev: %s\n", hi->memdev); + } + break; default: g_assert_not_reached(); } diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 2f1dbb3fd7..b46d16cd2c 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -705,7 +705,7 @@ static void get_reserved_region(Object *obj, Visitor *v, const char *name, int rc; rc = snprintf(buffer, sizeof(buffer), "0x%"PRIx64":0x%"PRIx64":%u", - rr->low, rr->high, rr->type); + range_lob(&rr->range), range_upb(&rr->range), rr->type); assert(rc < sizeof(buffer)); visit_type_str(v, name, &p, errp); @@ -717,6 +717,7 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name, Property *prop = opaque; ReservedRegion *rr = object_field_prop_ptr(obj, prop); const char *endptr; + uint64_t lob, upb; char *str; int ret; @@ -724,7 +725,7 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name, return; } - ret = qemu_strtou64(str, &endptr, 16, &rr->low); + ret = qemu_strtou64(str, &endptr, 16, &lob); if (ret) { error_setg(errp, "start address of '%s'" " must be a hexadecimal integer", name); @@ -734,7 +735,7 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name, goto separator_error; } - ret = qemu_strtou64(endptr + 1, &endptr, 16, &rr->high); + ret = qemu_strtou64(endptr + 1, &endptr, 16, &upb); if (ret) { error_setg(errp, "end address of '%s'" " must be a hexadecimal integer", name); @@ -744,6 +745,8 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name, goto separator_error; } + range_set_bounds(&rr->range, lob, upb); + ret = qemu_strtoui(endptr + 1, &endptr, 10, &rr->type); if (ret) { error_setg(errp, "type of '%s'" @@ -1111,7 +1114,7 @@ static void get_uuid(Object *obj, Visitor *v, const char *name, void *opaque, { Property *prop = opaque; QemuUUID *uuid = object_field_prop_ptr(obj, prop); - char buffer[UUID_FMT_LEN + 1]; + char buffer[UUID_STR_LEN]; char *p = buffer; qemu_uuid_unparse(uuid, buffer); diff --git a/hw/display/ati.c b/hw/display/ati.c index 6e38e00502..9a87a5504a 100644 --- a/hw/display/ati.c +++ b/hw/display/ati.c @@ -319,11 +319,13 @@ static uint64_t ati_mm_read(void *opaque, hwaddr addr, unsigned int size) case DAC_CNTL: val = s->regs.dac_cntl; break; - case GPIO_VGA_DDC: - val = s->regs.gpio_vga_ddc; + case GPIO_VGA_DDC ... GPIO_VGA_DDC + 3: + val = ati_reg_read_offs(s->regs.gpio_vga_ddc, + addr - GPIO_VGA_DDC, size); break; - case GPIO_DVI_DDC: - val = s->regs.gpio_dvi_ddc; + case GPIO_DVI_DDC ... GPIO_DVI_DDC + 3: + val = ati_reg_read_offs(s->regs.gpio_dvi_ddc, + addr - GPIO_DVI_DDC, size); break; case GPIO_MONID ... GPIO_MONID + 3: val = ati_reg_read_offs(s->regs.gpio_monid, @@ -337,6 +339,9 @@ static uint64_t ati_mm_read(void *opaque, hwaddr addr, unsigned int size) case PALETTE_DATA: val = vga_ioport_read(&s->vga, VGA_PEL_D); break; + case PALETTE_30_DATA: + val = s->regs.palette[vga_ioport_read(&s->vga, VGA_PEL_IR)]; + break; case CNFG_CNTL: val = s->regs.config_cntl; break; @@ -349,14 +354,17 @@ static uint64_t ati_mm_read(void *opaque, hwaddr addr, unsigned int size) PCI_BASE_ADDRESS_0, size) & 0xfffffff0; break; case CONFIG_APER_SIZE: - val = s->vga.vram_size; + val = s->vga.vram_size / 2; break; case CONFIG_REG_1_BASE: val = pci_default_read_config(&s->dev, PCI_BASE_ADDRESS_2, size) & 0xfffffff0; break; case CONFIG_REG_APER_SIZE: - val = memory_region_size(&s->mm); + val = memory_region_size(&s->mm) / 2; + break; + case HOST_PATH_CNTL: + val = BIT(23); /* Radeon HDP_APER_CNTL */ break; case MC_STATUS: val = 5; @@ -612,29 +620,34 @@ static void ati_mm_write(void *opaque, hwaddr addr, s->regs.dac_cntl = data & 0xffffe3ff; s->vga.dac_8bit = !!(data & DAC_8BIT_EN); break; - case GPIO_VGA_DDC: + /* + * GPIO regs for DDC access. Because some drivers access these via + * multiple byte writes we have to be careful when we send bits to + * avoid spurious changes in bitbang_i2c state. Only do it when either + * the enable bits are changed or output bits changed while enabled. + */ + case GPIO_VGA_DDC ... GPIO_VGA_DDC + 3: if (s->dev_id != PCI_DEVICE_ID_ATI_RAGE128_PF) { /* FIXME: Maybe add a property to select VGA or DVI port? */ } break; - case GPIO_DVI_DDC: + case GPIO_DVI_DDC ... GPIO_DVI_DDC + 3: if (s->dev_id != PCI_DEVICE_ID_ATI_RAGE128_PF) { - s->regs.gpio_dvi_ddc = ati_i2c(&s->bbi2c, data, 0); + ati_reg_write_offs(&s->regs.gpio_dvi_ddc, + addr - GPIO_DVI_DDC, data, size); + if ((addr <= GPIO_DVI_DDC + 2 && addr + size > GPIO_DVI_DDC + 2) || + (addr == GPIO_DVI_DDC && (s->regs.gpio_dvi_ddc & 0x30000))) { + s->regs.gpio_dvi_ddc = ati_i2c(&s->bbi2c, + s->regs.gpio_dvi_ddc, 0); + } } break; case GPIO_MONID ... GPIO_MONID + 3: /* FIXME What does Radeon have here? */ if (s->dev_id == PCI_DEVICE_ID_ATI_RAGE128_PF) { + /* Rage128p accesses DDC via MONID(1-2) with additional mask bit */ ati_reg_write_offs(&s->regs.gpio_monid, addr - GPIO_MONID, data, size); - /* - * Rage128p accesses DDC used to get EDID via these bits. - * Because some drivers access this via multiple byte writes - * we have to be careful when we send bits to avoid spurious - * changes in bitbang_i2c state. So only do it when mask is set - * and either the enable bits are changed or output bits changed - * while enabled. - */ if ((s->regs.gpio_monid & BIT(25)) && ((addr <= GPIO_MONID + 2 && addr + size > GPIO_MONID + 2) || (addr == GPIO_MONID && (s->regs.gpio_monid & 0x60000)))) { @@ -663,6 +676,12 @@ static void ati_mm_write(void *opaque, hwaddr addr, data >>= 8; vga_ioport_write(&s->vga, VGA_PEL_D, data & 0xff); break; + case PALETTE_30_DATA: + s->regs.palette[vga_ioport_read(&s->vga, VGA_PEL_IW)] = data; + vga_ioport_write(&s->vga, VGA_PEL_D, (data >> 22) & 0xff); + vga_ioport_write(&s->vga, VGA_PEL_D, (data >> 12) & 0xff); + vga_ioport_write(&s->vga, VGA_PEL_D, (data >> 2) & 0xff); + break; case CNFG_CNTL: s->regs.config_cntl = data; break; @@ -1014,6 +1033,7 @@ static Property ati_vga_properties[] = { DEFINE_PROP_UINT16("x-device-id", ATIVGAState, dev_id, PCI_DEVICE_ID_ATI_RAGE128_PF), DEFINE_PROP_BOOL("guest_hwcursor", ATIVGAState, cursor_guest_mode, false), + DEFINE_PROP_UINT8("x-pixman", ATIVGAState, use_pixman, 3), DEFINE_PROP_END_OF_LIST() }; @@ -1035,11 +1055,18 @@ static void ati_vga_class_init(ObjectClass *klass, void *data) k->exit = ati_vga_exit; } +static void ati_vga_init(Object *o) +{ + object_property_set_description(o, "x-pixman", "Use pixman for: " + "1: fill, 2: blit"); +} + static const TypeInfo ati_vga_info = { .name = TYPE_ATI_VGA, .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(ATIVGAState), .class_init = ati_vga_class_init, + .instance_init = ati_vga_init, .interfaces = (InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, { }, diff --git a/hw/display/ati_2d.c b/hw/display/ati_2d.c index 7d786653e8..0e6b8e4367 100644 --- a/hw/display/ati_2d.c +++ b/hw/display/ati_2d.c @@ -92,6 +92,7 @@ void ati_2d_blt(ATIVGAState *s) switch (s->regs.dp_mix & GMC_ROP3_MASK) { case ROP3_SRCCOPY: { + bool fallback = false; unsigned src_x = (s->regs.dp_cntl & DST_X_LEFT_TO_RIGHT ? s->regs.src_x : s->regs.src_x + 1 - s->regs.dst_width); unsigned src_y = (s->regs.dp_cntl & DST_Y_TOP_TO_BOTTOM ? @@ -122,27 +123,50 @@ void ati_2d_blt(ATIVGAState *s) src_bits, dst_bits, src_stride, dst_stride, bpp, bpp, src_x, src_y, dst_x, dst_y, s->regs.dst_width, s->regs.dst_height); - if (s->regs.dp_cntl & DST_X_LEFT_TO_RIGHT && + if ((s->use_pixman & BIT(1)) && + s->regs.dp_cntl & DST_X_LEFT_TO_RIGHT && s->regs.dp_cntl & DST_Y_TOP_TO_BOTTOM) { - pixman_blt((uint32_t *)src_bits, (uint32_t *)dst_bits, - src_stride, dst_stride, bpp, bpp, - src_x, src_y, dst_x, dst_y, - s->regs.dst_width, s->regs.dst_height); - } else { + fallback = !pixman_blt((uint32_t *)src_bits, (uint32_t *)dst_bits, + src_stride, dst_stride, bpp, bpp, + src_x, src_y, dst_x, dst_y, + s->regs.dst_width, s->regs.dst_height); + } else if (s->use_pixman & BIT(1)) { /* FIXME: We only really need a temporary if src and dst overlap */ int llb = s->regs.dst_width * (bpp / 8); int tmp_stride = DIV_ROUND_UP(llb, sizeof(uint32_t)); uint32_t *tmp = g_malloc(tmp_stride * sizeof(uint32_t) * s->regs.dst_height); - pixman_blt((uint32_t *)src_bits, tmp, - src_stride, tmp_stride, bpp, bpp, - src_x, src_y, 0, 0, - s->regs.dst_width, s->regs.dst_height); - pixman_blt(tmp, (uint32_t *)dst_bits, - tmp_stride, dst_stride, bpp, bpp, - 0, 0, dst_x, dst_y, - s->regs.dst_width, s->regs.dst_height); + fallback = !pixman_blt((uint32_t *)src_bits, tmp, + src_stride, tmp_stride, bpp, bpp, + src_x, src_y, 0, 0, + s->regs.dst_width, s->regs.dst_height); + if (!fallback) { + fallback = !pixman_blt(tmp, (uint32_t *)dst_bits, + tmp_stride, dst_stride, bpp, bpp, + 0, 0, dst_x, dst_y, + s->regs.dst_width, s->regs.dst_height); + } g_free(tmp); + } else { + fallback = true; + } + if (fallback) { + unsigned int y, i, j, bypp = bpp / 8; + unsigned int src_pitch = src_stride * sizeof(uint32_t); + unsigned int dst_pitch = dst_stride * sizeof(uint32_t); + + for (y = 0; y < s->regs.dst_height; y++) { + i = dst_x * bypp; + j = src_x * bypp; + if (s->regs.dp_cntl & DST_Y_TOP_TO_BOTTOM) { + i += (dst_y + y) * dst_pitch; + j += (src_y + y) * src_pitch; + } else { + i += (dst_y + s->regs.dst_height - 1 - y) * dst_pitch; + j += (src_y + s->regs.dst_height - 1 - y) * src_pitch; + } + memmove(&dst_bits[i], &src_bits[j], s->regs.dst_width * bypp); + } } if (dst_bits >= s->vga.vram_ptr + s->vga.vbe_start_addr && dst_bits < s->vga.vram_ptr + s->vga.vbe_start_addr + @@ -180,14 +204,21 @@ void ati_2d_blt(ATIVGAState *s) dst_stride /= sizeof(uint32_t); DPRINTF("pixman_fill(%p, %d, %d, %d, %d, %d, %d, %x)\n", - dst_bits, dst_stride, bpp, - dst_x, dst_y, - s->regs.dst_width, s->regs.dst_height, - filler); - pixman_fill((uint32_t *)dst_bits, dst_stride, bpp, - dst_x, dst_y, - s->regs.dst_width, s->regs.dst_height, - filler); + dst_bits, dst_stride, bpp, dst_x, dst_y, + s->regs.dst_width, s->regs.dst_height, filler); + if (!(s->use_pixman & BIT(0)) || + !pixman_fill((uint32_t *)dst_bits, dst_stride, bpp, dst_x, dst_y, + s->regs.dst_width, s->regs.dst_height, filler)) { + /* fallback when pixman failed or we don't want to call it */ + unsigned int x, y, i, bypp = bpp / 8; + unsigned int dst_pitch = dst_stride * sizeof(uint32_t); + for (y = 0; y < s->regs.dst_height; y++) { + i = dst_x * bypp + (dst_y + y) * dst_pitch; + for (x = 0; x < s->regs.dst_width; x++, i += bypp) { + stn_he_p(&dst_bits[i], bypp, filler); + } + } + } if (dst_bits >= s->vga.vram_ptr + s->vga.vbe_start_addr && dst_bits < s->vga.vram_ptr + s->vga.vbe_start_addr + s->vga.vbe_regs[VBE_DISPI_INDEX_YRES] * s->vga.vbe_line_offset) { diff --git a/hw/display/ati_dbg.c b/hw/display/ati_dbg.c index bd0ecd48c7..3ffa7f35df 100644 --- a/hw/display/ati_dbg.c +++ b/hw/display/ati_dbg.c @@ -30,6 +30,7 @@ static struct ati_regdesc ati_reg_names[] = { {"AMCGPIO_EN_MIR", 0x00a8}, {"PALETTE_INDEX", 0x00b0}, {"PALETTE_DATA", 0x00b4}, + {"PALETTE_30_DATA", 0x00b8}, {"CNFG_CNTL", 0x00e0}, {"GEN_RESET_CNTL", 0x00f0}, {"CNFG_MEMSIZE", 0x00f8}, @@ -38,6 +39,7 @@ static struct ati_regdesc ati_reg_names[] = { {"CONFIG_APER_SIZE", 0x0108}, {"CONFIG_REG_1_BASE", 0x010c}, {"CONFIG_REG_APER_SIZE", 0x0110}, + {"HOST_PATH_CNTL", 0x0130}, {"MEM_CNTL", 0x0140}, {"MC_FB_LOCATION", 0x0148}, {"MC_AGP_LOCATION", 0x014C}, diff --git a/hw/display/ati_int.h b/hw/display/ati_int.h index e8d3c7af75..f5a47b82b0 100644 --- a/hw/display/ati_int.h +++ b/hw/display/ati_int.h @@ -44,6 +44,7 @@ typedef struct ATIVGARegs { uint32_t gpio_dvi_ddc; uint32_t gpio_monid; uint32_t config_cntl; + uint32_t palette[256]; uint32_t crtc_h_total_disp; uint32_t crtc_h_sync_strt_wid; uint32_t crtc_v_total_disp; @@ -89,6 +90,7 @@ struct ATIVGAState { char *model; uint16_t dev_id; uint8_t mode; + uint8_t use_pixman; bool cursor_guest_mode; uint16_t cursor_size; uint32_t cursor_offset; diff --git a/hw/display/ati_regs.h b/hw/display/ati_regs.h index d6282b2ef2..d7127748ff 100644 --- a/hw/display/ati_regs.h +++ b/hw/display/ati_regs.h @@ -48,6 +48,7 @@ #define AMCGPIO_EN_MIR 0x00a8 #define PALETTE_INDEX 0x00b0 #define PALETTE_DATA 0x00b4 +#define PALETTE_30_DATA 0x00b8 #define CNFG_CNTL 0x00e0 #define GEN_RESET_CNTL 0x00f0 #define CNFG_MEMSIZE 0x00f8 @@ -56,6 +57,7 @@ #define CONFIG_APER_SIZE 0x0108 #define CONFIG_REG_1_BASE 0x010c #define CONFIG_REG_APER_SIZE 0x0110 +#define HOST_PATH_CNTL 0x0130 #define MEM_CNTL 0x0140 #define MC_FB_LOCATION 0x0148 #define MC_AGP_LOCATION 0x014C diff --git a/hw/display/macfb.c b/hw/display/macfb.c index 2f8e016566..d61541ccb5 100644 --- a/hw/display/macfb.c +++ b/hw/display/macfb.c @@ -36,8 +36,8 @@ #define DAFB_INTR_MASK 0x104 #define DAFB_INTR_STAT 0x108 #define DAFB_INTR_CLEAR 0x10c -#define DAFB_RESET 0x200 -#define DAFB_LUT 0x213 +#define DAFB_LUT_INDEX 0x200 +#define DAFB_LUT 0x210 #define DAFB_INTR_VBL 0x4 @@ -537,6 +537,11 @@ static uint64_t macfb_ctrl_read(void *opaque, case DAFB_MODE_SENSE: val = macfb_sense_read(s); break; + case DAFB_LUT ... DAFB_LUT + 3: + val = s->color_palette[s->palette_current]; + s->palette_current = (s->palette_current + 1) % + ARRAY_SIZE(s->color_palette); + break; default: if (addr < MACFB_CTRL_TOPADDR) { val = s->regs[addr >> 2]; @@ -583,13 +588,11 @@ static void macfb_ctrl_write(void *opaque, s->regs[DAFB_INTR_STAT >> 2] &= ~DAFB_INTR_VBL; macfb_update_irq(s); break; - case DAFB_RESET: - s->palette_current = 0; - s->regs[DAFB_INTR_STAT >> 2] &= ~DAFB_INTR_VBL; - macfb_update_irq(s); + case DAFB_LUT_INDEX: + s->palette_current = (val & 0xff) * 3; break; - case DAFB_LUT: - s->color_palette[s->palette_current] = val; + case DAFB_LUT ... DAFB_LUT + 3: + s->color_palette[s->palette_current] = val & 0xff; s->palette_current = (s->palette_current + 1) % ARRAY_SIZE(s->color_palette); if (s->palette_current % 3) { diff --git a/hw/display/virtio-gpu-pci-rutabaga.c b/hw/display/virtio-gpu-pci-rutabaga.c index c96729e198..abbb898c65 100644 --- a/hw/display/virtio-gpu-pci-rutabaga.c +++ b/hw/display/virtio-gpu-pci-rutabaga.c @@ -36,6 +36,7 @@ static const TypeInfo virtio_gpu_rutabaga_pci_info[] = { .instance_init = virtio_gpu_rutabaga_initfn, .interfaces = (InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, } }, }; diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c index 4265316cbb..2707bceea8 100644 --- a/hw/display/virtio-gpu.c +++ b/hw/display/virtio-gpu.c @@ -1213,6 +1213,9 @@ static int virtio_gpu_save(QEMUFile *f, void *opaque, size_t size, assert(QTAILQ_EMPTY(&g->cmdq)); QTAILQ_FOREACH(res, &g->reslist, next) { + if (res->blob_size) { + continue; + } qemu_put_be32(f, res->resource_id); qemu_put_be32(f, res->width); qemu_put_be32(f, res->height); @@ -1230,12 +1233,40 @@ static int virtio_gpu_save(QEMUFile *f, void *opaque, size_t size, return vmstate_save_state(f, &vmstate_virtio_gpu_scanouts, g, NULL); } +static bool virtio_gpu_load_restore_mapping(VirtIOGPU *g, + struct virtio_gpu_simple_resource *res) +{ + int i; + + for (i = 0; i < res->iov_cnt; i++) { + hwaddr len = res->iov[i].iov_len; + res->iov[i].iov_base = + dma_memory_map(VIRTIO_DEVICE(g)->dma_as, res->addrs[i], &len, + DMA_DIRECTION_TO_DEVICE, MEMTXATTRS_UNSPECIFIED); + + if (!res->iov[i].iov_base || len != res->iov[i].iov_len) { + /* Clean up the half-a-mapping we just created... */ + if (res->iov[i].iov_base) { + dma_memory_unmap(VIRTIO_DEVICE(g)->dma_as, res->iov[i].iov_base, + len, DMA_DIRECTION_TO_DEVICE, 0); + } + /* ...and the mappings for previous loop iterations */ + res->iov_cnt = i; + virtio_gpu_cleanup_mapping(g, res); + return false; + } + } + + QTAILQ_INSERT_HEAD(&g->reslist, res, next); + g->hostmem += res->hostmem; + return true; +} + static int virtio_gpu_load(QEMUFile *f, void *opaque, size_t size, const VMStateField *field) { VirtIOGPU *g = opaque; struct virtio_gpu_simple_resource *res; - struct virtio_gpu_scanout *scanout; uint32_t resource_id, pformat; void *bits = NULL; int i; @@ -1294,40 +1325,96 @@ static int virtio_gpu_load(QEMUFile *f, void *opaque, size_t size, qemu_get_buffer(f, (void *)pixman_image_get_data(res->image), pixman_image_get_stride(res->image) * res->height); - /* restore mapping */ - for (i = 0; i < res->iov_cnt; i++) { - hwaddr len = res->iov[i].iov_len; - res->iov[i].iov_base = - dma_memory_map(VIRTIO_DEVICE(g)->dma_as, res->addrs[i], &len, - DMA_DIRECTION_TO_DEVICE, - MEMTXATTRS_UNSPECIFIED); - - if (!res->iov[i].iov_base || len != res->iov[i].iov_len) { - /* Clean up the half-a-mapping we just created... */ - if (res->iov[i].iov_base) { - dma_memory_unmap(VIRTIO_DEVICE(g)->dma_as, - res->iov[i].iov_base, - len, - DMA_DIRECTION_TO_DEVICE, - 0); - } - /* ...and the mappings for previous loop iterations */ - res->iov_cnt = i; - virtio_gpu_cleanup_mapping(g, res); - pixman_image_unref(res->image); - g_free(res); - return -EINVAL; - } + if (!virtio_gpu_load_restore_mapping(g, res)) { + pixman_image_unref(res->image); + g_free(res); + return -EINVAL; } - QTAILQ_INSERT_HEAD(&g->reslist, res, next); - g->hostmem += res->hostmem; - resource_id = qemu_get_be32(f); } /* load & apply scanout state */ vmstate_load_state(f, &vmstate_virtio_gpu_scanouts, g, 1); + + return 0; +} + +static int virtio_gpu_blob_save(QEMUFile *f, void *opaque, size_t size, + const VMStateField *field, JSONWriter *vmdesc) +{ + VirtIOGPU *g = opaque; + struct virtio_gpu_simple_resource *res; + int i; + + /* in 2d mode we should never find unprocessed commands here */ + assert(QTAILQ_EMPTY(&g->cmdq)); + + QTAILQ_FOREACH(res, &g->reslist, next) { + if (!res->blob_size) { + continue; + } + qemu_put_be32(f, res->resource_id); + qemu_put_be32(f, res->blob_size); + qemu_put_be32(f, res->iov_cnt); + for (i = 0; i < res->iov_cnt; i++) { + qemu_put_be64(f, res->addrs[i]); + qemu_put_be32(f, res->iov[i].iov_len); + } + } + qemu_put_be32(f, 0); /* end of list */ + + return 0; +} + +static int virtio_gpu_blob_load(QEMUFile *f, void *opaque, size_t size, + const VMStateField *field) +{ + VirtIOGPU *g = opaque; + struct virtio_gpu_simple_resource *res; + uint32_t resource_id; + int i; + + resource_id = qemu_get_be32(f); + while (resource_id != 0) { + res = virtio_gpu_find_resource(g, resource_id); + if (res) { + return -EINVAL; + } + + res = g_new0(struct virtio_gpu_simple_resource, 1); + res->resource_id = resource_id; + res->blob_size = qemu_get_be32(f); + res->iov_cnt = qemu_get_be32(f); + res->addrs = g_new(uint64_t, res->iov_cnt); + res->iov = g_new(struct iovec, res->iov_cnt); + + /* read data */ + for (i = 0; i < res->iov_cnt; i++) { + res->addrs[i] = qemu_get_be64(f); + res->iov[i].iov_len = qemu_get_be32(f); + } + + if (!virtio_gpu_load_restore_mapping(g, res)) { + g_free(res); + return -EINVAL; + } + + virtio_gpu_init_udmabuf(res); + + resource_id = qemu_get_be32(f); + } + + return 0; +} + +static int virtio_gpu_post_load(void *opaque, int version_id) +{ + VirtIOGPU *g = opaque; + struct virtio_gpu_scanout *scanout; + struct virtio_gpu_simple_resource *res; + int i; + for (i = 0; i < g->parent_obj.conf.max_outputs; i++) { /* FIXME: should take scanout.r.{x,y} into account */ scanout = &g->parent_obj.scanout[i]; @@ -1475,6 +1562,32 @@ virtio_gpu_set_config(VirtIODevice *vdev, const uint8_t *config) } } +static bool virtio_gpu_blob_state_needed(void *opaque) +{ + VirtIOGPU *g = VIRTIO_GPU(opaque); + + return virtio_gpu_blob_enabled(g->parent_obj.conf); +} + +const VMStateDescription vmstate_virtio_gpu_blob_state = { + .name = "virtio-gpu/blob", + .minimum_version_id = VIRTIO_GPU_VM_VERSION, + .version_id = VIRTIO_GPU_VM_VERSION, + .needed = virtio_gpu_blob_state_needed, + .fields = (const VMStateField[]){ + { + .name = "virtio-gpu/blob", + .info = &(const VMStateInfo) { + .name = "blob", + .get = virtio_gpu_blob_load, + .put = virtio_gpu_blob_save, + }, + .flags = VMS_SINGLE, + } /* device */, + VMSTATE_END_OF_LIST() + }, +}; + /* * For historical reasons virtio_gpu does not adhere to virtio migration * scheme as described in doc/virtio-migration.txt, in a sense that no @@ -1500,6 +1613,11 @@ static const VMStateDescription vmstate_virtio_gpu = { } /* device */, VMSTATE_END_OF_LIST() }, + .subsections = (const VMStateDescription * []) { + &vmstate_virtio_gpu_blob_state, + NULL + }, + .post_load = virtio_gpu_post_load, }; static Property virtio_gpu_properties[] = { diff --git a/hw/hyperv/Kconfig b/hw/hyperv/Kconfig index fcf65903bd..41dd827c84 100644 --- a/hw/hyperv/Kconfig +++ b/hw/hyperv/Kconfig @@ -16,3 +16,13 @@ config SYNDBG bool default y depends on VMBUS + +config HV_BALLOON_SUPPORTED + bool + +config HV_BALLOON + bool + default y + depends on VMBUS + depends on HV_BALLOON_POSSIBLE + depends on HV_BALLOON_SUPPORTED diff --git a/hw/hyperv/hv-balloon-internal.h b/hw/hyperv/hv-balloon-internal.h new file mode 100644 index 0000000000..164c2e5825 --- /dev/null +++ b/hw/hyperv/hv-balloon-internal.h @@ -0,0 +1,33 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef HW_HYPERV_HV_BALLOON_INTERNAL_H +#define HW_HYPERV_HV_BALLOON_INTERNAL_H + +#include "qemu/osdep.h" + +#define HV_BALLOON_PFN_SHIFT 12 +#define HV_BALLOON_PAGE_SIZE (1 << HV_BALLOON_PFN_SHIFT) + +#define SUM_OVERFLOW_U64(in1, in2) ((in1) > UINT64_MAX - (in2)) +#define SUM_SATURATE_U64(in1, in2) \ + ({ \ + uint64_t _in1 = (in1), _in2 = (in2); \ + uint64_t _result; \ + \ + if (!SUM_OVERFLOW_U64(_in1, _in2)) { \ + _result = _in1 + _in2; \ + } else { \ + _result = UINT64_MAX; \ + } \ + \ + _result; \ + }) + +#endif diff --git a/hw/hyperv/hv-balloon-our_range_memslots.c b/hw/hyperv/hv-balloon-our_range_memslots.c new file mode 100644 index 0000000000..99bae870f3 --- /dev/null +++ b/hw/hyperv/hv-balloon-our_range_memslots.c @@ -0,0 +1,201 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "hv-balloon-internal.h" +#include "hv-balloon-our_range_memslots.h" +#include "trace.h" + +/* OurRange */ +static void our_range_init(OurRange *our_range, uint64_t start, uint64_t count) +{ + assert(count <= UINT64_MAX - start); + our_range->range.start = start; + our_range->range.count = count; + + hvb_page_range_tree_init(&our_range->removed_guest); + hvb_page_range_tree_init(&our_range->removed_both); + + /* mark the whole range as unused but for potential use */ + our_range->added = 0; + our_range->unusable_tail = 0; +} + +static void our_range_destroy(OurRange *our_range) +{ + hvb_page_range_tree_destroy(&our_range->removed_guest); + hvb_page_range_tree_destroy(&our_range->removed_both); +} + +void hvb_our_range_clear_removed_trees(OurRange *our_range) +{ + hvb_page_range_tree_destroy(&our_range->removed_guest); + hvb_page_range_tree_destroy(&our_range->removed_both); + hvb_page_range_tree_init(&our_range->removed_guest); + hvb_page_range_tree_init(&our_range->removed_both); +} + +void hvb_our_range_mark_added(OurRange *our_range, uint64_t additional_size) +{ + assert(additional_size <= UINT64_MAX - our_range->added); + + our_range->added += additional_size; + + assert(our_range->added <= UINT64_MAX - our_range->unusable_tail); + assert(our_range->added + our_range->unusable_tail <= + our_range->range.count); +} + +/* OurRangeMemslots */ +static void our_range_memslots_init_slots(OurRangeMemslots *our_range, + MemoryRegion *backing_mr, + Object *memslot_owner) +{ + OurRangeMemslotsSlots *memslots = &our_range->slots; + unsigned int idx; + uint64_t memslot_offset; + + assert(memslots->count > 0); + memslots->slots = g_new0(MemoryRegion, memslots->count); + + /* Initialize our memslots, but don't map them yet. */ + assert(memslots->size_each > 0); + for (idx = 0, memslot_offset = 0; idx < memslots->count; + idx++, memslot_offset += memslots->size_each) { + uint64_t memslot_size; + g_autofree char *name = NULL; + + /* The size of the last memslot might be smaller. */ + if (idx == memslots->count - 1) { + uint64_t region_size; + + assert(our_range->mr); + region_size = memory_region_size(our_range->mr); + memslot_size = region_size - memslot_offset; + } else { + memslot_size = memslots->size_each; + } + + name = g_strdup_printf("memslot-%u", idx); + memory_region_init_alias(&memslots->slots[idx], memslot_owner, name, + backing_mr, memslot_offset, memslot_size); + /* + * We want to be able to atomically and efficiently activate/deactivate + * individual memslots without affecting adjacent memslots in memory + * notifiers. + */ + memory_region_set_unmergeable(&memslots->slots[idx], true); + } + + memslots->mapped_count = 0; +} + +OurRangeMemslots *hvb_our_range_memslots_new(uint64_t addr, + MemoryRegion *parent_mr, + MemoryRegion *backing_mr, + Object *memslot_owner, + unsigned int memslot_count, + uint64_t memslot_size) +{ + OurRangeMemslots *our_range; + + our_range = g_malloc(sizeof(*our_range)); + our_range_init(&our_range->range, + addr / HV_BALLOON_PAGE_SIZE, + memory_region_size(parent_mr) / HV_BALLOON_PAGE_SIZE); + our_range->slots.size_each = memslot_size; + our_range->slots.count = memslot_count; + our_range->mr = parent_mr; + our_range_memslots_init_slots(our_range, backing_mr, memslot_owner); + + return our_range; +} + +static void our_range_memslots_free_memslots(OurRangeMemslots *our_range) +{ + OurRangeMemslotsSlots *memslots = &our_range->slots; + unsigned int idx; + uint64_t offset; + + memory_region_transaction_begin(); + for (idx = 0, offset = 0; idx < memslots->mapped_count; + idx++, offset += memslots->size_each) { + trace_hv_balloon_unmap_slot(idx, memslots->count, offset); + assert(memory_region_is_mapped(&memslots->slots[idx])); + memory_region_del_subregion(our_range->mr, &memslots->slots[idx]); + } + memory_region_transaction_commit(); + + for (idx = 0; idx < memslots->count; idx++) { + object_unparent(OBJECT(&memslots->slots[idx])); + } + + g_clear_pointer(&our_range->slots.slots, g_free); +} + +void hvb_our_range_memslots_free(OurRangeMemslots *our_range) +{ + OurRangeMemslotsSlots *memslots = &our_range->slots; + MemoryRegion *hostmem_mr; + RAMBlock *rb; + + assert(our_range->slots.count > 0); + assert(our_range->slots.slots); + + hostmem_mr = memslots->slots[0].alias; + rb = hostmem_mr->ram_block; + ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb)); + + our_range_memslots_free_memslots(our_range); + our_range_destroy(&our_range->range); + g_free(our_range); +} + +void hvb_our_range_memslots_ensure_mapped_additional(OurRangeMemslots *our_range, + uint64_t additional_map_size) +{ + OurRangeMemslotsSlots *memslots = &our_range->slots; + uint64_t total_map_size; + unsigned int idx; + uint64_t offset; + + total_map_size = (our_range->range.added + additional_map_size) * + HV_BALLOON_PAGE_SIZE; + idx = memslots->mapped_count; + assert(memslots->size_each > 0); + offset = idx * memslots->size_each; + + /* + * Activate all memslots covered by the newly added region in a single + * transaction. + */ + memory_region_transaction_begin(); + for ( ; idx < memslots->count; + idx++, offset += memslots->size_each) { + /* + * If this memslot starts beyond or at the end of the range to map so + * does every next one. + */ + if (offset >= total_map_size) { + break; + } + + /* + * Instead of enabling/disabling memslot, we add/remove them. This + * should make address space updates faster, because we don't have to + * loop over many disabled subregions. + */ + trace_hv_balloon_map_slot(idx, memslots->count, offset); + assert(!memory_region_is_mapped(&memslots->slots[idx])); + memory_region_add_subregion(our_range->mr, offset, + &memslots->slots[idx]); + + memslots->mapped_count++; + } + memory_region_transaction_commit(); +} diff --git a/hw/hyperv/hv-balloon-our_range_memslots.h b/hw/hyperv/hv-balloon-our_range_memslots.h new file mode 100644 index 0000000000..b6f592d34b --- /dev/null +++ b/hw/hyperv/hv-balloon-our_range_memslots.h @@ -0,0 +1,110 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef HW_HYPERV_HV_BALLOON_OUR_RANGE_MEMSLOTS_H +#define HW_HYPERV_HV_BALLOON_OUR_RANGE_MEMSLOTS_H + +#include "qemu/osdep.h" + +#include "exec/memory.h" +#include "qom/object.h" +#include "hv-balloon-page_range_tree.h" + +/* OurRange */ +#define OUR_RANGE(ptr) ((OurRange *)(ptr)) + +/* "our range" means the memory range owned by this driver (for hot-adding) */ +typedef struct OurRange { + PageRange range; + + /* How many pages were hot-added to the guest */ + uint64_t added; + + /* Pages at the end not currently usable */ + uint64_t unusable_tail; + + /* Memory removed from the guest */ + PageRangeTree removed_guest, removed_both; +} OurRange; + +static inline uint64_t our_range_get_remaining_start(OurRange *our_range) +{ + return our_range->range.start + our_range->added; +} + +static inline uint64_t our_range_get_remaining_size(OurRange *our_range) +{ + return our_range->range.count - our_range->added - our_range->unusable_tail; +} + +void hvb_our_range_mark_added(OurRange *our_range, uint64_t additional_size); + +static inline void our_range_mark_remaining_unusable(OurRange *our_range) +{ + our_range->unusable_tail = our_range->range.count - our_range->added; +} + +static inline PageRangeTree our_range_get_removed_tree(OurRange *our_range, + bool both) +{ + if (both) { + return our_range->removed_both; + } else { + return our_range->removed_guest; + } +} + +static inline bool our_range_is_removed_tree_empty(OurRange *our_range, + bool both) +{ + if (both) { + return page_range_tree_is_empty(our_range->removed_both); + } else { + return page_range_tree_is_empty(our_range->removed_guest); + } +} + +void hvb_our_range_clear_removed_trees(OurRange *our_range); + +/* OurRangeMemslots */ +typedef struct OurRangeMemslotsSlots { + /* Nominal size of each memslot (the last one might be smaller) */ + uint64_t size_each; + + /* Slots array and its element count */ + MemoryRegion *slots; + unsigned int count; + + /* How many slots are currently mapped */ + unsigned int mapped_count; +} OurRangeMemslotsSlots; + +typedef struct OurRangeMemslots { + OurRange range; + + /* Memslots covering our range */ + OurRangeMemslotsSlots slots; + + MemoryRegion *mr; +} OurRangeMemslots; + +OurRangeMemslots *hvb_our_range_memslots_new(uint64_t addr, + MemoryRegion *parent_mr, + MemoryRegion *backing_mr, + Object *memslot_owner, + unsigned int memslot_count, + uint64_t memslot_size); +void hvb_our_range_memslots_free(OurRangeMemslots *our_range); + +G_DEFINE_AUTOPTR_CLEANUP_FUNC(OurRangeMemslots, hvb_our_range_memslots_free) + +void hvb_our_range_memslots_ensure_mapped_additional(OurRangeMemslots *our_range, + uint64_t additional_map_size); + +#endif diff --git a/hw/hyperv/hv-balloon-page_range_tree.c b/hw/hyperv/hv-balloon-page_range_tree.c new file mode 100644 index 0000000000..e178d8b413 --- /dev/null +++ b/hw/hyperv/hv-balloon-page_range_tree.c @@ -0,0 +1,228 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "hv-balloon-internal.h" +#include "hv-balloon-page_range_tree.h" + +/* + * temporarily avoid warnings about enhanced GTree API usage requiring a + * too recent Glib version until GLIB_VERSION_MAX_ALLOWED finally reaches + * the Glib version with this API + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +/* PageRangeTree */ +static gint page_range_tree_key_compare(gconstpointer leftp, + gconstpointer rightp, + gpointer user_data) +{ + const uint64_t *left = leftp, *right = rightp; + + if (*left < *right) { + return -1; + } else if (*left > *right) { + return 1; + } else { /* *left == *right */ + return 0; + } +} + +static GTreeNode *page_range_tree_insert_new(PageRangeTree tree, + uint64_t start, uint64_t count) +{ + uint64_t *key = g_malloc(sizeof(*key)); + PageRange *range = g_malloc(sizeof(*range)); + + assert(count > 0); + + *key = range->start = start; + range->count = count; + + return g_tree_insert_node(tree.t, key, range); +} + +void hvb_page_range_tree_insert(PageRangeTree tree, + uint64_t start, uint64_t count, + uint64_t *dupcount) +{ + GTreeNode *node; + bool joinable; + uint64_t intersection; + PageRange *range; + + assert(!SUM_OVERFLOW_U64(start, count)); + if (count == 0) { + return; + } + + node = g_tree_upper_bound(tree.t, &start); + if (node) { + node = g_tree_node_previous(node); + } else { + node = g_tree_node_last(tree.t); + } + + if (node) { + range = g_tree_node_value(node); + assert(range); + intersection = page_range_intersection_size(range, start, count); + joinable = page_range_joinable_right(range, start, count); + } + + if (!node || + (!intersection && !joinable)) { + /* + * !node case: the tree is empty or the very first node in the tree + * already has a higher key (the start of its range). + * the other case: there is a gap in the tree between the new range + * and the previous one. + * anyway, let's just insert the new range into the tree. + */ + node = page_range_tree_insert_new(tree, start, count); + assert(node); + range = g_tree_node_value(node); + assert(range); + } else { + /* + * the previous range in the tree either partially covers the new + * range or ends just at its beginning - extend it + */ + if (dupcount) { + *dupcount += intersection; + } + + count += start - range->start; + range->count = MAX(range->count, count); + } + + /* check next nodes for possible merging */ + for (node = g_tree_node_next(node); node; ) { + PageRange *rangecur; + + rangecur = g_tree_node_value(node); + assert(rangecur); + + intersection = page_range_intersection_size(rangecur, + range->start, range->count); + joinable = page_range_joinable_left(rangecur, + range->start, range->count); + if (!intersection && !joinable) { + /* the current node is disjoint */ + break; + } + + if (dupcount) { + *dupcount += intersection; + } + + count = rangecur->count + (rangecur->start - range->start); + range->count = MAX(range->count, count); + + /* the current node was merged in, remove it */ + start = rangecur->start; + node = g_tree_node_next(node); + /* no hinted removal in GTree... */ + g_tree_remove(tree.t, &start); + } +} + +bool hvb_page_range_tree_pop(PageRangeTree tree, PageRange *out, + uint64_t maxcount) +{ + GTreeNode *node; + PageRange *range; + + node = g_tree_node_last(tree.t); + if (!node) { + return false; + } + + range = g_tree_node_value(node); + assert(range); + + out->start = range->start; + + /* can't modify range->start as it is the node key */ + if (range->count > maxcount) { + out->start += range->count - maxcount; + out->count = maxcount; + range->count -= maxcount; + } else { + out->count = range->count; + /* no hinted removal in GTree... */ + g_tree_remove(tree.t, &out->start); + } + + return true; +} + +bool hvb_page_range_tree_intree_any(PageRangeTree tree, + uint64_t start, uint64_t count) +{ + GTreeNode *node; + + if (count == 0) { + return false; + } + + /* find the first node that can possibly intersect our range */ + node = g_tree_upper_bound(tree.t, &start); + if (node) { + /* + * a NULL node below means that the very first node in the tree + * already has a higher key (the start of its range). + */ + node = g_tree_node_previous(node); + } else { + /* a NULL node below means that the tree is empty */ + node = g_tree_node_last(tree.t); + } + /* node range start <= range start */ + + if (!node) { + /* node range start > range start */ + node = g_tree_node_first(tree.t); + } + + for ( ; node; node = g_tree_node_next(node)) { + PageRange *range = g_tree_node_value(node); + + assert(range); + /* + * if this node starts beyond or at the end of our range so does + * every next one + */ + if (range->start >= start + count) { + break; + } + + if (page_range_intersection_size(range, start, count) > 0) { + return true; + } + } + + return false; +} + +void hvb_page_range_tree_init(PageRangeTree *tree) +{ + tree->t = g_tree_new_full(page_range_tree_key_compare, NULL, + g_free, g_free); +} + +void hvb_page_range_tree_destroy(PageRangeTree *tree) +{ + /* g_tree_destroy() is not NULL-safe */ + if (!tree->t) { + return; + } + + g_tree_destroy(tree->t); + tree->t = NULL; +} diff --git a/hw/hyperv/hv-balloon-page_range_tree.h b/hw/hyperv/hv-balloon-page_range_tree.h new file mode 100644 index 0000000000..07a9ae0da6 --- /dev/null +++ b/hw/hyperv/hv-balloon-page_range_tree.h @@ -0,0 +1,118 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef HW_HYPERV_HV_BALLOON_PAGE_RANGE_TREE_H +#define HW_HYPERV_HV_BALLOON_PAGE_RANGE_TREE_H + +#include "qemu/osdep.h" + +/* PageRange */ +typedef struct PageRange { + uint64_t start; + uint64_t count; +} PageRange; + +/* return just the part of range before (start) */ +static inline void page_range_part_before(const PageRange *range, + uint64_t start, PageRange *out) +{ + uint64_t endr = range->start + range->count; + uint64_t end = MIN(endr, start); + + out->start = range->start; + if (end > out->start) { + out->count = end - out->start; + } else { + out->count = 0; + } +} + +/* return just the part of range after (start, count) */ +static inline void page_range_part_after(const PageRange *range, + uint64_t start, uint64_t count, + PageRange *out) +{ + uint64_t end = range->start + range->count; + uint64_t ends = start + count; + + out->start = MAX(range->start, ends); + if (end > out->start) { + out->count = end - out->start; + } else { + out->count = 0; + } +} + +static inline void page_range_intersect(const PageRange *range, + uint64_t start, uint64_t count, + PageRange *out) +{ + uint64_t end1 = range->start + range->count; + uint64_t end2 = start + count; + uint64_t end = MIN(end1, end2); + + out->start = MAX(range->start, start); + out->count = out->start < end ? end - out->start : 0; +} + +static inline uint64_t page_range_intersection_size(const PageRange *range, + uint64_t start, uint64_t count) +{ + PageRange trange; + + page_range_intersect(range, start, count, &trange); + return trange.count; +} + +static inline bool page_range_joinable_left(const PageRange *range, + uint64_t start, uint64_t count) +{ + return start + count == range->start; +} + +static inline bool page_range_joinable_right(const PageRange *range, + uint64_t start, uint64_t count) +{ + return range->start + range->count == start; +} + +static inline bool page_range_joinable(const PageRange *range, + uint64_t start, uint64_t count) +{ + return page_range_joinable_left(range, start, count) || + page_range_joinable_right(range, start, count); +} + +/* PageRangeTree */ +/* type safety */ +typedef struct PageRangeTree { + GTree *t; +} PageRangeTree; + +static inline bool page_range_tree_is_empty(PageRangeTree tree) +{ + guint nnodes = g_tree_nnodes(tree.t); + + return nnodes == 0; +} + +void hvb_page_range_tree_init(PageRangeTree *tree); +void hvb_page_range_tree_destroy(PageRangeTree *tree); + +bool hvb_page_range_tree_intree_any(PageRangeTree tree, + uint64_t start, uint64_t count); + +bool hvb_page_range_tree_pop(PageRangeTree tree, PageRange *out, + uint64_t maxcount); + +void hvb_page_range_tree_insert(PageRangeTree tree, + uint64_t start, uint64_t count, + uint64_t *dupcount); + +#endif diff --git a/hw/hyperv/hv-balloon-stub.c b/hw/hyperv/hv-balloon-stub.c new file mode 100644 index 0000000000..a47412d4a8 --- /dev/null +++ b/hw/hyperv/hv-balloon-stub.c @@ -0,0 +1,19 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-machine.h" +#include "qapi/qapi-types-machine.h" + +HvBalloonInfo *qmp_query_hv_balloon_status_report(Error **errp) +{ + error_setg(errp, "hv-balloon device not enabled in this build"); + return NULL; +} diff --git a/hw/hyperv/hv-balloon.c b/hw/hyperv/hv-balloon.c new file mode 100644 index 0000000000..66f297c1d7 --- /dev/null +++ b/hw/hyperv/hv-balloon.c @@ -0,0 +1,1769 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "hv-balloon-internal.h" + +#include "exec/address-spaces.h" +#include "exec/cpu-common.h" +#include "exec/ramblock.h" +#include "hw/boards.h" +#include "hw/hyperv/dynmem-proto.h" +#include "hw/hyperv/hv-balloon.h" +#include "hw/hyperv/vmbus.h" +#include "hw/mem/memory-device.h" +#include "hw/mem/pc-dimm.h" +#include "hw/qdev-core.h" +#include "hw/qdev-properties.h" +#include "monitor/qdev.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-machine.h" +#include "qapi/qapi-events-machine.h" +#include "qapi/qapi-types-machine.h" +#include "qapi/qmp/qdict.h" +#include "qapi/visitor.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "qemu/units.h" +#include "qemu/timer.h" +#include "sysemu/balloon.h" +#include "sysemu/hostmem.h" +#include "sysemu/reset.h" +#include "hv-balloon-our_range_memslots.h" +#include "hv-balloon-page_range_tree.h" +#include "trace.h" + +#define HV_BALLOON_ADDR_PROP "addr" +#define HV_BALLOON_MEMDEV_PROP "memdev" +#define HV_BALLOON_GUID "525074DC-8985-46e2-8057-A307DC18A502" + +/* + * Some Windows versions (at least Server 2019) will crash with various + * error codes when receiving DM protocol requests (at least + * DM_MEM_HOT_ADD_REQUEST) immediately after boot. + * + * It looks like Hyper-V from Server 2016 uses a 50-second after-boot + * delay, probably to workaround this issue, so we'll use this value, too. + */ +#define HV_BALLOON_POST_INIT_WAIT (50 * 1000) + +#define HV_BALLOON_HA_CHUNK_SIZE (2 * GiB) +#define HV_BALLOON_HA_CHUNK_PAGES (HV_BALLOON_HA_CHUNK_SIZE / HV_BALLOON_PAGE_SIZE) + +#define HV_BALLOON_HA_MEMSLOT_SIZE_ALIGN (128 * MiB) + +#define HV_BALLOON_HR_CHUNK_PAGES 585728 +/* + * ^ that's the maximum number of pages + * that Windows returns in one hot remove response + * + * If the number requested is too high Windows will no longer honor + * these requests + */ + +struct HvBalloonClass { + VMBusDeviceClass parent_class; +} HvBalloonClass; + +typedef enum State { + /* not a real state */ + S_NO_CHANGE = 0, + + S_WAIT_RESET, + S_POST_RESET_CLOSED, + + /* init flow */ + S_VERSION, + S_CAPS, + S_POST_INIT_WAIT, + + S_IDLE, + + /* balloon op flow */ + S_BALLOON_POSTING, + S_BALLOON_RB_WAIT, + S_BALLOON_REPLY_WAIT, + + /* unballoon + hot add ops flow */ + S_UNBALLOON_POSTING, + S_UNBALLOON_RB_WAIT, + S_UNBALLOON_REPLY_WAIT, + S_HOT_ADD_SETUP, + S_HOT_ADD_RB_WAIT, + S_HOT_ADD_POSTING, + S_HOT_ADD_REPLY_WAIT, +} State; + +typedef struct StateDesc { + State state; + const char *desc; +} StateDesc; + +typedef struct HvBalloon { + VMBusDevice parent; + State state; + + union dm_version version; + union dm_caps caps; + + QEMUTimer post_init_timer; + + unsigned int trans_id; + + struct { + bool enabled; + bool received; + uint64_t committed; + uint64_t available; + } status_report; + + /* Guest target size */ + uint64_t target; + bool target_changed; + + /* Current (un)balloon / hot-add operation parameters */ + union { + uint64_t balloon_diff; + + struct { + uint64_t unballoon_diff; + uint64_t hot_add_diff; + }; + + struct { + PageRange hot_add_range; + uint64_t ha_current_count; + }; + }; + + OurRangeMemslots *our_range; + + /* Count of memslots covering our memory */ + unsigned int memslot_count; + + /* Nominal size of each memslot (the last one might be smaller) */ + uint64_t memslot_size; + + /* Non-ours removed memory */ + PageRangeTree removed_guest, removed_both; + + /* Grand totals of removed memory (both ours and non-ours) */ + uint64_t removed_guest_ctr, removed_both_ctr; + + /* MEMORY_DEVICE props */ + uint64_t addr; + HostMemoryBackend *hostmem; + MemoryRegion *mr; +} HvBalloon; + +OBJECT_DEFINE_TYPE_WITH_INTERFACES(HvBalloon, hv_balloon, HV_BALLOON, VMBUS_DEVICE, \ + { TYPE_MEMORY_DEVICE }, { }) + +#define HV_BALLOON_SET_STATE(hvb, news) \ + do { \ + assert(news != S_NO_CHANGE); \ + hv_balloon_state_set(hvb, news, # news); \ + } while (0) + +#define HV_BALLOON_STATE_DESC_SET(stdesc, news) \ + _hv_balloon_state_desc_set(stdesc, news, # news) + +#define HV_BALLOON_STATE_DESC_INIT \ + { \ + .state = S_NO_CHANGE, \ + } + +typedef struct HvBalloonReq { + VMBusChanReq vmreq; +} HvBalloonReq; + +/* total our memory includes parts currently removed from the guest */ +static uint64_t hv_balloon_total_our_ram(HvBalloon *balloon) +{ + if (!balloon->our_range) { + return 0; + } + + return balloon->our_range->range.added; +} + +/* TODO: unify the code below with virtio-balloon and cache the value */ +static int build_dimm_list(Object *obj, void *opaque) +{ + GSList **list = opaque; + + if (object_dynamic_cast(obj, TYPE_PC_DIMM)) { + DeviceState *dev = DEVICE(obj); + if (dev->realized) { /* only realized DIMMs matter */ + *list = g_slist_prepend(*list, dev); + } + } + + object_child_foreach(obj, build_dimm_list, opaque); + return 0; +} + +static ram_addr_t get_current_ram_size(void) +{ + GSList *list = NULL, *item; + ram_addr_t size = current_machine->ram_size; + + build_dimm_list(qdev_get_machine(), &list); + for (item = list; item; item = g_slist_next(item)) { + Object *obj = OBJECT(item->data); + if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) + size += object_property_get_int(obj, PC_DIMM_SIZE_PROP, + &error_abort); + } + g_slist_free(list); + + return size; +} + +/* total RAM includes memory currently removed from the guest */ +static uint64_t hv_balloon_total_ram(HvBalloon *balloon) +{ + ram_addr_t ram_size = get_current_ram_size(); + uint64_t ram_size_pages = ram_size >> HV_BALLOON_PFN_SHIFT; + uint64_t our_ram_size_pages = hv_balloon_total_our_ram(balloon); + + assert(ram_size_pages > 0); + + return SUM_SATURATE_U64(ram_size_pages, our_ram_size_pages); +} + +/* + * calculating the total RAM size is a slow operation, + * avoid it as much as possible + */ +static uint64_t hv_balloon_total_removed_rs(HvBalloon *balloon, + uint64_t ram_size_pages) +{ + uint64_t total_removed; + + total_removed = SUM_SATURATE_U64(balloon->removed_guest_ctr, + balloon->removed_both_ctr); + + /* possible if guest returns pages outside actual RAM */ + if (total_removed > ram_size_pages) { + total_removed = ram_size_pages; + } + + return total_removed; +} + +/* Returns whether the state has actually changed */ +static bool hv_balloon_state_set(HvBalloon *balloon, + State newst, const char *newststr) +{ + if (newst == S_NO_CHANGE || balloon->state == newst) { + return false; + } + + balloon->state = newst; + trace_hv_balloon_state_change(newststr); + return true; +} + +static void _hv_balloon_state_desc_set(StateDesc *stdesc, + State newst, const char *newststr) +{ + /* state setting is only permitted on a freshly init desc */ + assert(stdesc->state == S_NO_CHANGE); + + assert(newst != S_NO_CHANGE); + + stdesc->state = newst; + stdesc->desc = newststr; +} + +static VMBusChannel *hv_balloon_get_channel_maybe(HvBalloon *balloon) +{ + return vmbus_device_channel(&balloon->parent, 0); +} + +static VMBusChannel *hv_balloon_get_channel(HvBalloon *balloon) +{ + VMBusChannel *chan; + + chan = hv_balloon_get_channel_maybe(balloon); + assert(chan != NULL); + return chan; +} + +static ssize_t hv_balloon_send_packet(VMBusChannel *chan, + struct dm_message *msg) +{ + int ret; + + ret = vmbus_channel_reserve(chan, 0, msg->hdr.size); + if (ret < 0) { + return ret; + } + + return vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND, + NULL, 0, msg, msg->hdr.size, false, + msg->hdr.trans_id); +} + +static bool hv_balloon_unballoon_get_source(HvBalloon *balloon, + PageRangeTree *dtree, + uint64_t **dctr, + bool *is_our_range) +{ + OurRange *our_range = OUR_RANGE(balloon->our_range); + + /* Try the boot memory first */ + if (g_tree_nnodes(balloon->removed_guest.t) > 0) { + *dtree = balloon->removed_guest; + *dctr = &balloon->removed_guest_ctr; + *is_our_range = false; + } else if (g_tree_nnodes(balloon->removed_both.t) > 0) { + *dtree = balloon->removed_both; + *dctr = &balloon->removed_both_ctr; + *is_our_range = false; + } else if (!our_range) { + return false; + } else if (!our_range_is_removed_tree_empty(our_range, false)) { + *dtree = our_range_get_removed_tree(our_range, false); + *dctr = &balloon->removed_guest_ctr; + *is_our_range = true; + } else if (!our_range_is_removed_tree_empty(our_range, true)) { + *dtree = our_range_get_removed_tree(our_range, true); + *dctr = &balloon->removed_both_ctr; + *is_our_range = true; + } else { + return false; + } + + return true; +} + +static void hv_balloon_unballoon_rb_wait(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan = hv_balloon_get_channel(balloon); + struct dm_unballoon_request *ur; + size_t ur_size = sizeof(*ur) + sizeof(ur->range_array[0]); + + assert(balloon->state == S_UNBALLOON_RB_WAIT); + + if (vmbus_channel_reserve(chan, 0, ur_size) < 0) { + return; + } + + HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_POSTING); +} + +static void hv_balloon_unballoon_posting(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan = hv_balloon_get_channel(balloon); + PageRangeTree dtree; + uint64_t *dctr; + bool our_range; + struct dm_unballoon_request *ur; + size_t ur_size = sizeof(*ur) + sizeof(ur->range_array[0]); + PageRange range; + bool bret; + ssize_t ret; + + assert(balloon->state == S_UNBALLOON_POSTING); + assert(balloon->unballoon_diff > 0); + + if (!hv_balloon_unballoon_get_source(balloon, &dtree, &dctr, &our_range)) { + error_report("trying to unballoon but nothing seems to be ballooned"); + /* + * there is little we can do as we might have already + * sent the guest a partial request we can't cancel + */ + return; + } + + assert(balloon->our_range || !our_range); + assert(dtree.t); + assert(dctr); + + ur = alloca(ur_size); + memset(ur, 0, ur_size); + ur->hdr.type = DM_UNBALLOON_REQUEST; + ur->hdr.size = ur_size; + ur->hdr.trans_id = balloon->trans_id; + + bret = hvb_page_range_tree_pop(dtree, &range, MIN(balloon->unballoon_diff, + HV_BALLOON_HA_CHUNK_PAGES)); + assert(bret); + /* TODO: madvise? */ + + *dctr -= range.count; + balloon->unballoon_diff -= range.count; + + ur->range_count = 1; + ur->range_array[0].finfo.start_page = range.start; + ur->range_array[0].finfo.page_cnt = range.count; + ur->more_pages = balloon->unballoon_diff > 0; + + trace_hv_balloon_outgoing_unballoon(ur->hdr.trans_id, + range.count, range.start, + balloon->unballoon_diff); + + if (ur->more_pages) { + HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_RB_WAIT); + } else { + HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_REPLY_WAIT); + } + + ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND, + NULL, 0, ur, ur_size, false, + ur->hdr.trans_id); + if (ret <= 0) { + error_report("error %zd when posting unballoon msg, expect problems", + ret); + } +} + +static bool hv_balloon_our_range_ensure(HvBalloon *balloon) +{ + uint64_t align; + MemoryRegion *hostmem_mr; + g_autoptr(OurRangeMemslots) our_range_memslots = NULL; + OurRange *our_range; + + if (balloon->our_range) { + return true; + } + + if (!balloon->hostmem) { + return false; + } + + align = (1 << balloon->caps.cap_bits.hot_add_alignment) * MiB; + assert(QEMU_IS_ALIGNED(balloon->addr, align)); + + hostmem_mr = host_memory_backend_get_memory(balloon->hostmem); + + our_range_memslots = hvb_our_range_memslots_new(balloon->addr, + balloon->mr, hostmem_mr, + OBJECT(balloon), + balloon->memslot_count, + balloon->memslot_size); + our_range = OUR_RANGE(our_range_memslots); + + if (hvb_page_range_tree_intree_any(balloon->removed_guest, + our_range->range.start, + our_range->range.count) || + hvb_page_range_tree_intree_any(balloon->removed_both, + our_range->range.start, + our_range->range.count)) { + error_report("some parts of the memory backend were already returned by the guest. this should not happen, please reboot the guest and try again"); + return false; + } + + trace_hv_balloon_our_range_add(our_range->range.count, + our_range->range.start); + + balloon->our_range = g_steal_pointer(&our_range_memslots); + return true; +} + +static void hv_balloon_hot_add_setup(HvBalloon *balloon, StateDesc *stdesc) +{ + /* need to make copy since it is in union with hot_add_range */ + uint64_t hot_add_diff = balloon->hot_add_diff; + PageRange *hot_add_range = &balloon->hot_add_range; + uint64_t align, our_range_remaining; + OurRange *our_range; + + assert(balloon->state == S_HOT_ADD_SETUP); + assert(hot_add_diff > 0); + + if (!hv_balloon_our_range_ensure(balloon)) { + goto ret_idle; + } + + our_range = OUR_RANGE(balloon->our_range); + + align = (1 << balloon->caps.cap_bits.hot_add_alignment) * + (MiB / HV_BALLOON_PAGE_SIZE); + + /* Absolute GPA in pages */ + hot_add_range->start = our_range_get_remaining_start(our_range); + assert(QEMU_IS_ALIGNED(hot_add_range->start, align)); + + our_range_remaining = our_range_get_remaining_size(our_range); + hot_add_range->count = MIN(our_range_remaining, hot_add_diff); + hot_add_range->count = QEMU_ALIGN_DOWN(hot_add_range->count, align); + if (hot_add_range->count == 0) { + goto ret_idle; + } + + hvb_our_range_memslots_ensure_mapped_additional(balloon->our_range, + hot_add_range->count); + + HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_RB_WAIT); + return; + +ret_idle: + HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE); +} + +static void hv_balloon_hot_add_rb_wait(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan = hv_balloon_get_channel(balloon); + struct dm_hot_add *ha; + size_t ha_size = sizeof(*ha) + sizeof(ha->range); + + assert(balloon->state == S_HOT_ADD_RB_WAIT); + + if (vmbus_channel_reserve(chan, 0, ha_size) < 0) { + return; + } + + HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_POSTING); +} + +static void hv_balloon_hot_add_posting(HvBalloon *balloon, StateDesc *stdesc) +{ + PageRange *hot_add_range = &balloon->hot_add_range; + uint64_t *current_count = &balloon->ha_current_count; + VMBusChannel *chan = hv_balloon_get_channel(balloon); + struct dm_hot_add *ha; + size_t ha_size = sizeof(*ha) + sizeof(ha->range); + union dm_mem_page_range *ha_region; + uint64_t align, chunk_max_size; + ssize_t ret; + + assert(balloon->state == S_HOT_ADD_POSTING); + assert(hot_add_range->count > 0); + + align = (1 << balloon->caps.cap_bits.hot_add_alignment) * + (MiB / HV_BALLOON_PAGE_SIZE); + if (align >= HV_BALLOON_HA_CHUNK_PAGES) { + /* + * If the required alignment is higher than the chunk size we let it + * override that size. + */ + chunk_max_size = align; + } else { + chunk_max_size = QEMU_ALIGN_DOWN(HV_BALLOON_HA_CHUNK_PAGES, align); + } + + /* + * hot_add_range->count starts aligned in hv_balloon_hot_add_setup(), + * then it is either reduced by subtracting aligned current_count or + * further hot-adds are prevented by marking the whole remaining our range + * as unusable in hv_balloon_handle_hot_add_response(). + */ + *current_count = MIN(hot_add_range->count, chunk_max_size); + + ha = alloca(ha_size); + ha_region = &(&ha->range)[1]; + memset(ha, 0, ha_size); + ha->hdr.type = DM_MEM_HOT_ADD_REQUEST; + ha->hdr.size = ha_size; + ha->hdr.trans_id = balloon->trans_id; + + ha->range.finfo.start_page = hot_add_range->start; + ha->range.finfo.page_cnt = *current_count; + ha_region->finfo.start_page = hot_add_range->start; + ha_region->finfo.page_cnt = ha->range.finfo.page_cnt; + + trace_hv_balloon_outgoing_hot_add(ha->hdr.trans_id, + *current_count, hot_add_range->start); + + ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND, + NULL, 0, ha, ha_size, false, + ha->hdr.trans_id); + if (ret <= 0) { + error_report("error %zd when posting hot add msg, expect problems", + ret); + } + + HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_REPLY_WAIT); +} + +static void hv_balloon_balloon_rb_wait(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan = hv_balloon_get_channel(balloon); + size_t bl_size = sizeof(struct dm_balloon); + + assert(balloon->state == S_BALLOON_RB_WAIT); + + if (vmbus_channel_reserve(chan, 0, bl_size) < 0) { + return; + } + + HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_POSTING); +} + +static void hv_balloon_balloon_posting(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan = hv_balloon_get_channel(balloon); + struct dm_balloon bl; + size_t bl_size = sizeof(bl); + ssize_t ret; + + assert(balloon->state == S_BALLOON_POSTING); + assert(balloon->balloon_diff > 0); + + memset(&bl, 0, sizeof(bl)); + bl.hdr.type = DM_BALLOON_REQUEST; + bl.hdr.size = bl_size; + bl.hdr.trans_id = balloon->trans_id; + bl.num_pages = MIN(balloon->balloon_diff, HV_BALLOON_HR_CHUNK_PAGES); + + trace_hv_balloon_outgoing_balloon(bl.hdr.trans_id, bl.num_pages, + balloon->balloon_diff); + + ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND, + NULL, 0, &bl, bl_size, false, + bl.hdr.trans_id); + if (ret <= 0) { + error_report("error %zd when posting balloon msg, expect problems", + ret); + } + + HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_REPLY_WAIT); +} + +static void hv_balloon_idle_state_process_target(HvBalloon *balloon, + StateDesc *stdesc) +{ + bool can_balloon = balloon->caps.cap_bits.balloon; + uint64_t ram_size_pages, total_removed; + + ram_size_pages = hv_balloon_total_ram(balloon); + total_removed = hv_balloon_total_removed_rs(balloon, ram_size_pages); + + /* + * we need to cache the values computed from the balloon target value when + * starting the adjustment procedure in case someone changes the target when + * the procedure is in progress + */ + if (balloon->target > ram_size_pages - total_removed) { + bool can_hot_add = balloon->caps.cap_bits.hot_add; + uint64_t target_diff = balloon->target - + (ram_size_pages - total_removed); + + balloon->unballoon_diff = MIN(target_diff, total_removed); + + if (can_hot_add) { + balloon->hot_add_diff = target_diff - balloon->unballoon_diff; + } else { + balloon->hot_add_diff = 0; + } + + if (balloon->unballoon_diff > 0) { + assert(can_balloon); + HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_RB_WAIT); + } else if (balloon->hot_add_diff > 0) { + HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_SETUP); + } + } else if (can_balloon && + balloon->target < ram_size_pages - total_removed) { + balloon->balloon_diff = ram_size_pages - total_removed - + balloon->target; + HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_RB_WAIT); + } +} + +static void hv_balloon_idle_state(HvBalloon *balloon, + StateDesc *stdesc) +{ + assert(balloon->state == S_IDLE); + + if (balloon->target_changed) { + balloon->target_changed = false; + hv_balloon_idle_state_process_target(balloon, stdesc); + return; + } +} + +static const struct { + void (*handler)(HvBalloon *balloon, StateDesc *stdesc); +} state_handlers[] = { + [S_IDLE].handler = hv_balloon_idle_state, + [S_BALLOON_POSTING].handler = hv_balloon_balloon_posting, + [S_BALLOON_RB_WAIT].handler = hv_balloon_balloon_rb_wait, + [S_UNBALLOON_POSTING].handler = hv_balloon_unballoon_posting, + [S_UNBALLOON_RB_WAIT].handler = hv_balloon_unballoon_rb_wait, + [S_HOT_ADD_SETUP].handler = hv_balloon_hot_add_setup, + [S_HOT_ADD_RB_WAIT].handler = hv_balloon_hot_add_rb_wait, + [S_HOT_ADD_POSTING].handler = hv_balloon_hot_add_posting, +}; + +static void hv_balloon_handle_state(HvBalloon *balloon, StateDesc *stdesc) +{ + if (balloon->state >= ARRAY_SIZE(state_handlers) || + !state_handlers[balloon->state].handler) { + return; + } + + state_handlers[balloon->state].handler(balloon, stdesc); +} + +static void hv_balloon_remove_response_insert_range(PageRangeTree tree, + const PageRange *range, + uint64_t *ctr1, + uint64_t *ctr2, + uint64_t *ctr3) +{ + uint64_t dupcount, effcount; + + if (range->count == 0) { + return; + } + + dupcount = 0; + hvb_page_range_tree_insert(tree, range->start, range->count, &dupcount); + + assert(dupcount <= range->count); + effcount = range->count - dupcount; + + *ctr1 += effcount; + *ctr2 += effcount; + if (ctr3) { + *ctr3 += effcount; + } +} + +static void hv_balloon_remove_response_handle_range(HvBalloon *balloon, + PageRange *range, + bool both, + uint64_t *removedctr) +{ + OurRange *our_range = OUR_RANGE(balloon->our_range); + PageRangeTree globaltree = + both ? balloon->removed_both : balloon->removed_guest; + uint64_t *globalctr = + both ? &balloon->removed_both_ctr : &balloon->removed_guest_ctr; + PageRange rangeeff; + + if (range->count == 0) { + return; + } + + trace_hv_balloon_remove_response(range->count, range->start, both); + + if (our_range) { + /* Includes the not-yet-hot-added and unusable parts. */ + rangeeff = our_range->range; + } else { + rangeeff.start = rangeeff.count = 0; + } + + if (page_range_intersection_size(range, rangeeff.start, rangeeff.count) > 0) { + PageRangeTree ourtree = our_range_get_removed_tree(our_range, both); + PageRange rangehole, rangecommon; + uint64_t ourremoved = 0; + + /* process the hole before our range, if it exists */ + page_range_part_before(range, rangeeff.start, &rangehole); + hv_balloon_remove_response_insert_range(globaltree, &rangehole, + globalctr, removedctr, NULL); + if (rangehole.count > 0) { + trace_hv_balloon_remove_response_hole(rangehole.count, + rangehole.start, + range->count, range->start, + rangeeff.start, both); + } + + /* process our part */ + page_range_intersect(range, rangeeff.start, rangeeff.count, + &rangecommon); + hv_balloon_remove_response_insert_range(ourtree, &rangecommon, + globalctr, removedctr, + &ourremoved); + if (rangecommon.count > 0) { + trace_hv_balloon_remove_response_common(rangecommon.count, + rangecommon.start, + range->count, range->start, + rangeeff.count, + rangeeff.start, ourremoved, + both); + } + + /* calculate what's left after our range */ + rangecommon = *range; + page_range_part_after(&rangecommon, rangeeff.start, rangeeff.count, + range); + } + + /* process the remainder of the range that lies after our range */ + if (range->count > 0) { + hv_balloon_remove_response_insert_range(globaltree, range, + globalctr, removedctr, NULL); + trace_hv_balloon_remove_response_remainder(range->count, range->start, + both); + range->count = 0; + } +} + +static void hv_balloon_remove_response_handle_pages(HvBalloon *balloon, + PageRange *range, + uint64_t start, + uint64_t count, + bool both, + uint64_t *removedctr) +{ + assert(count > 0); + + /* + * if there is an existing range that the new range can't be joined to + * dump it into tree(s) + */ + if (range->count > 0 && !page_range_joinable(range, start, count)) { + hv_balloon_remove_response_handle_range(balloon, range, both, + removedctr); + } + + if (range->count == 0) { + range->start = start; + range->count = count; + } else if (page_range_joinable_left(range, start, count)) { + range->start = start; + range->count += count; + } else { /* page_range_joinable_right() */ + range->count += count; + } +} + +static gboolean hv_balloon_handle_remove_host_addr_node(gpointer key, + gpointer value, + gpointer data) +{ + PageRange *range = value; + uint64_t pageoff; + + for (pageoff = 0; pageoff < range->count; ) { + uint64_t addr_64 = (range->start + pageoff) * HV_BALLOON_PAGE_SIZE; + void *addr; + RAMBlock *rb; + ram_addr_t rb_offset; + size_t rb_page_size; + size_t discard_size; + + assert(addr_64 <= UINTPTR_MAX); + addr = (void *)((uintptr_t)addr_64); + rb = qemu_ram_block_from_host(addr, false, &rb_offset); + rb_page_size = qemu_ram_pagesize(rb); + + if (rb_page_size != HV_BALLOON_PAGE_SIZE) { + /* TODO: these should end in "removed_guest" */ + warn_report("guest reported removed page backed by unsupported page size %zu", + rb_page_size); + pageoff++; + continue; + } + + discard_size = MIN(range->count - pageoff, + (rb->max_length - rb_offset) / + HV_BALLOON_PAGE_SIZE); + discard_size = MAX(discard_size, 1); + + if (ram_block_discard_range(rb, rb_offset, discard_size * + HV_BALLOON_PAGE_SIZE) != 0) { + warn_report("guest reported removed page failed discard"); + } + + pageoff += discard_size; + } + + return false; +} + +static void hv_balloon_handle_remove_host_addr_tree(PageRangeTree tree) +{ + g_tree_foreach(tree.t, hv_balloon_handle_remove_host_addr_node, NULL); +} + +static int hv_balloon_handle_remove_section(PageRangeTree tree, + const MemoryRegionSection *section, + uint64_t count) +{ + void *addr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region; + uint64_t addr_page; + + assert(count > 0); + + if ((uintptr_t)addr % HV_BALLOON_PAGE_SIZE) { + warn_report("guest reported removed pages at an unaligned host addr %p", + addr); + return -EINVAL; + } + + addr_page = (uintptr_t)addr / HV_BALLOON_PAGE_SIZE; + hvb_page_range_tree_insert(tree, addr_page, count, NULL); + + return 0; +} + +static void hv_balloon_handle_remove_ranges(HvBalloon *balloon, + union dm_mem_page_range ranges[], + uint32_t count) +{ + uint64_t removedcnt; + PageRangeTree removed_host_addr; + PageRange range_guest, range_both; + + hvb_page_range_tree_init(&removed_host_addr); + range_guest.count = range_both.count = removedcnt = 0; + for (unsigned int ctr = 0; ctr < count; ctr++) { + union dm_mem_page_range *mr = &ranges[ctr]; + hwaddr pa; + MemoryRegionSection section; + + for (unsigned int offset = 0; offset < mr->finfo.page_cnt; ) { + int ret; + uint64_t pageno = mr->finfo.start_page + offset; + uint64_t pagecnt = 1; + + pa = (hwaddr)pageno << HV_BALLOON_PFN_SHIFT; + section = memory_region_find(get_system_memory(), pa, + (mr->finfo.page_cnt - offset) * + HV_BALLOON_PAGE_SIZE); + if (!section.mr) { + warn_report("guest reported removed page %"PRIu64" not found in RAM", + pageno); + ret = -EINVAL; + goto finish_page; + } + + pagecnt = int128_get64(section.size) / HV_BALLOON_PAGE_SIZE; + if (pagecnt <= 0) { + warn_report("guest reported removed page %"PRIu64" in a section smaller than page size", + pageno); + pagecnt = 1; /* skip the whole page */ + ret = -EINVAL; + goto finish_page; + } + + if (!memory_region_is_ram(section.mr) || + memory_region_is_rom(section.mr) || + memory_region_is_romd(section.mr)) { + warn_report("guest reported removed page %"PRIu64" in a section that is not an ordinary RAM", + pageno); + ret = -EINVAL; + goto finish_page; + } + + ret = hv_balloon_handle_remove_section(removed_host_addr, §ion, + pagecnt); + + finish_page: + if (ret == 0) { + hv_balloon_remove_response_handle_pages(balloon, + &range_both, + pageno, pagecnt, + true, &removedcnt); + } else { + hv_balloon_remove_response_handle_pages(balloon, + &range_guest, + pageno, pagecnt, + false, &removedcnt); + } + + if (section.mr) { + memory_region_unref(section.mr); + } + + offset += pagecnt; + } + } + + hv_balloon_remove_response_handle_range(balloon, &range_both, true, + &removedcnt); + hv_balloon_remove_response_handle_range(balloon, &range_guest, false, + &removedcnt); + + hv_balloon_handle_remove_host_addr_tree(removed_host_addr); + hvb_page_range_tree_destroy(&removed_host_addr); + + if (removedcnt > balloon->balloon_diff) { + warn_report("guest reported more pages removed than currently pending (%"PRIu64" vs %"PRIu64")", + removedcnt, balloon->balloon_diff); + balloon->balloon_diff = 0; + } else { + balloon->balloon_diff -= removedcnt; + } +} + +static bool hv_balloon_handle_msg_size(HvBalloonReq *req, size_t minsize, + const char *msgname) +{ + VMBusChanReq *vmreq = &req->vmreq; + uint32_t msglen = vmreq->msglen; + + if (msglen >= minsize) { + return true; + } + + warn_report("%s message too short (%u vs %zu), ignoring", msgname, + (unsigned int)msglen, minsize); + return false; +} + +static void hv_balloon_handle_version_request(HvBalloon *balloon, + HvBalloonReq *req, + StateDesc *stdesc) +{ + VMBusChanReq *vmreq = &req->vmreq; + struct dm_version_request *msgVr = vmreq->msg; + struct dm_version_response respVr; + + if (balloon->state != S_VERSION) { + warn_report("unexpected DM_VERSION_REQUEST in %d state", + balloon->state); + return; + } + + if (!hv_balloon_handle_msg_size(req, sizeof(*msgVr), + "DM_VERSION_REQUEST")) { + return; + } + + trace_hv_balloon_incoming_version(msgVr->version.major_version, + msgVr->version.minor_version); + + memset(&respVr, 0, sizeof(respVr)); + respVr.hdr.type = DM_VERSION_RESPONSE; + respVr.hdr.size = sizeof(respVr); + respVr.hdr.trans_id = msgVr->hdr.trans_id; + respVr.is_accepted = msgVr->version.version >= DYNMEM_PROTOCOL_VERSION_1 && + msgVr->version.version <= DYNMEM_PROTOCOL_VERSION_3; + + hv_balloon_send_packet(vmreq->chan, (struct dm_message *)&respVr); + + if (respVr.is_accepted) { + HV_BALLOON_STATE_DESC_SET(stdesc, S_CAPS); + } +} + +static void hv_balloon_handle_caps_report(HvBalloon *balloon, + HvBalloonReq *req, + StateDesc *stdesc) +{ + VMBusChanReq *vmreq = &req->vmreq; + struct dm_capabilities *msgCap = vmreq->msg; + struct dm_capabilities_resp_msg respCap; + + if (balloon->state != S_CAPS) { + warn_report("unexpected DM_CAPABILITIES_REPORT in %d state", + balloon->state); + return; + } + + if (!hv_balloon_handle_msg_size(req, sizeof(*msgCap), + "DM_CAPABILITIES_REPORT")) { + return; + } + + trace_hv_balloon_incoming_caps(msgCap->caps.caps); + balloon->caps = msgCap->caps; + + memset(&respCap, 0, sizeof(respCap)); + respCap.hdr.type = DM_CAPABILITIES_RESPONSE; + respCap.hdr.size = sizeof(respCap); + respCap.hdr.trans_id = msgCap->hdr.trans_id; + respCap.is_accepted = 1; + respCap.hot_remove = 1; + respCap.suppress_pressure_reports = !balloon->status_report.enabled; + hv_balloon_send_packet(vmreq->chan, (struct dm_message *)&respCap); + + timer_mod(&balloon->post_init_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + + HV_BALLOON_POST_INIT_WAIT); + + HV_BALLOON_STATE_DESC_SET(stdesc, S_POST_INIT_WAIT); +} + +static void hv_balloon_handle_status_report(HvBalloon *balloon, + HvBalloonReq *req) +{ + VMBusChanReq *vmreq = &req->vmreq; + struct dm_status *msgStatus = vmreq->msg; + + if (!hv_balloon_handle_msg_size(req, sizeof(*msgStatus), + "DM_STATUS_REPORT")) { + return; + } + + if (!balloon->status_report.enabled) { + return; + } + + balloon->status_report.committed = msgStatus->num_committed; + balloon->status_report.committed *= HV_BALLOON_PAGE_SIZE; + balloon->status_report.available = msgStatus->num_avail; + balloon->status_report.available *= HV_BALLOON_PAGE_SIZE; + balloon->status_report.received = true; + + qapi_event_send_hv_balloon_status_report(balloon->status_report.committed, + balloon->status_report.available); +} + +HvBalloonInfo *qmp_query_hv_balloon_status_report(Error **errp) +{ + HvBalloon *balloon; + HvBalloonInfo *info; + + balloon = HV_BALLOON(object_resolve_path_type("", TYPE_HV_BALLOON, NULL)); + if (!balloon) { + error_setg(errp, "no %s device present", TYPE_HV_BALLOON); + return NULL; + } + + if (!balloon->status_report.enabled) { + error_setg(errp, "guest memory status reporting not enabled"); + return NULL; + } + + if (!balloon->status_report.received) { + error_setg(errp, "no guest memory status report received yet"); + return NULL; + } + + info = g_malloc0(sizeof(*info)); + info->committed = balloon->status_report.committed; + info->available = balloon->status_report.available; + return info; +} + +static void hv_balloon_handle_unballoon_response(HvBalloon *balloon, + HvBalloonReq *req, + StateDesc *stdesc) +{ + VMBusChanReq *vmreq = &req->vmreq; + struct dm_unballoon_response *msgUrR = vmreq->msg; + + if (balloon->state != S_UNBALLOON_REPLY_WAIT) { + warn_report("unexpected DM_UNBALLOON_RESPONSE in %d state", + balloon->state); + return; + } + + if (!hv_balloon_handle_msg_size(req, sizeof(*msgUrR), + "DM_UNBALLOON_RESPONSE")) + return; + + trace_hv_balloon_incoming_unballoon(msgUrR->hdr.trans_id); + + balloon->trans_id++; + + if (balloon->hot_add_diff > 0) { + bool can_hot_add = balloon->caps.cap_bits.hot_add; + + assert(can_hot_add); + HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_SETUP); + } else { + HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE); + } +} + +static void hv_balloon_handle_hot_add_response(HvBalloon *balloon, + HvBalloonReq *req, + StateDesc *stdesc) +{ + PageRange *hot_add_range = &balloon->hot_add_range; + VMBusChanReq *vmreq = &req->vmreq; + struct dm_hot_add_response *msgHaR = vmreq->msg; + OurRange *our_range; + + if (balloon->state != S_HOT_ADD_REPLY_WAIT) { + warn_report("unexpected DM_HOT_ADD_RESPONSE in %d state", + balloon->state); + return; + } + + assert(balloon->our_range); + our_range = OUR_RANGE(balloon->our_range); + + if (!hv_balloon_handle_msg_size(req, sizeof(*msgHaR), + "DM_HOT_ADD_RESPONSE")) + return; + + trace_hv_balloon_incoming_hot_add(msgHaR->hdr.trans_id, msgHaR->result, + msgHaR->page_count); + + balloon->trans_id++; + + if (msgHaR->result) { + if (msgHaR->page_count > balloon->ha_current_count) { + warn_report("DM_HOT_ADD_RESPONSE page count higher than requested (%"PRIu32" vs %"PRIu64")", + msgHaR->page_count, balloon->ha_current_count); + msgHaR->page_count = balloon->ha_current_count; + } + + hvb_our_range_mark_added(our_range, msgHaR->page_count); + hot_add_range->start += msgHaR->page_count; + hot_add_range->count -= msgHaR->page_count; + } + + if (!msgHaR->result || msgHaR->page_count < balloon->ha_current_count) { + /* + * the current planned range was only partially hot-added, take note + * how much of it remains and don't attempt any further hot adds + */ + our_range_mark_remaining_unusable(our_range); + + goto ret_idle; + } + + /* any pages remaining to hot-add in our range? */ + if (hot_add_range->count > 0) { + HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_RB_WAIT); + return; + } + +ret_idle: + HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE); +} + +static void hv_balloon_handle_balloon_response(HvBalloon *balloon, + HvBalloonReq *req, + StateDesc *stdesc) +{ + VMBusChanReq *vmreq = &req->vmreq; + struct dm_balloon_response *msgBR = vmreq->msg; + + if (balloon->state != S_BALLOON_REPLY_WAIT) { + warn_report("unexpected DM_BALLOON_RESPONSE in %d state", + balloon->state); + return; + } + + if (!hv_balloon_handle_msg_size(req, sizeof(*msgBR), + "DM_BALLOON_RESPONSE")) + return; + + trace_hv_balloon_incoming_balloon(msgBR->hdr.trans_id, msgBR->range_count, + msgBR->more_pages); + + if (vmreq->msglen < sizeof(*msgBR) + + (uint64_t)sizeof(msgBR->range_array[0]) * msgBR->range_count) { + warn_report("DM_BALLOON_RESPONSE too short for the range count"); + return; + } + + if (msgBR->range_count == 0) { + /* The guest is already at its minimum size */ + balloon->balloon_diff = 0; + goto ret_end_trans; + } else { + hv_balloon_handle_remove_ranges(balloon, + msgBR->range_array, + msgBR->range_count); + } + + /* More responses expected? */ + if (msgBR->more_pages) { + return; + } + +ret_end_trans: + balloon->trans_id++; + + if (balloon->balloon_diff > 0) { + HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_RB_WAIT); + } else { + HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE); + } +} + +static void hv_balloon_handle_packet(HvBalloon *balloon, HvBalloonReq *req, + StateDesc *stdesc) +{ + VMBusChanReq *vmreq = &req->vmreq; + struct dm_message *msg = vmreq->msg; + + if (vmreq->msglen < sizeof(msg->hdr)) { + return; + } + + switch (msg->hdr.type) { + case DM_VERSION_REQUEST: + hv_balloon_handle_version_request(balloon, req, stdesc); + break; + + case DM_CAPABILITIES_REPORT: + hv_balloon_handle_caps_report(balloon, req, stdesc); + break; + + case DM_STATUS_REPORT: + hv_balloon_handle_status_report(balloon, req); + break; + + case DM_MEM_HOT_ADD_RESPONSE: + hv_balloon_handle_hot_add_response(balloon, req, stdesc); + break; + + case DM_UNBALLOON_RESPONSE: + hv_balloon_handle_unballoon_response(balloon, req, stdesc); + break; + + case DM_BALLOON_RESPONSE: + hv_balloon_handle_balloon_response(balloon, req, stdesc); + break; + + default: + warn_report("unknown DM message %u", msg->hdr.type); + break; + } +} + +static bool hv_balloon_recv_channel(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan; + HvBalloonReq *req; + + if (balloon->state == S_WAIT_RESET || + balloon->state == S_POST_RESET_CLOSED) { + return false; + } + + chan = hv_balloon_get_channel(balloon); + if (vmbus_channel_recv_start(chan)) { + return false; + } + + while ((req = vmbus_channel_recv_peek(chan, sizeof(*req)))) { + hv_balloon_handle_packet(balloon, req, stdesc); + vmbus_free_req(req); + vmbus_channel_recv_pop(chan); + + if (stdesc->state != S_NO_CHANGE) { + break; + } + } + + return vmbus_channel_recv_done(chan) > 0; +} + +/* old state handler -> new state transition (potential) */ +static bool hv_balloon_event_loop_state(HvBalloon *balloon) +{ + StateDesc state_new = HV_BALLOON_STATE_DESC_INIT; + + hv_balloon_handle_state(balloon, &state_new); + return hv_balloon_state_set(balloon, state_new.state, state_new.desc); +} + +/* VMBus message -> new state transition (potential) */ +static bool hv_balloon_event_loop_recv(HvBalloon *balloon) +{ + StateDesc state_new = HV_BALLOON_STATE_DESC_INIT; + bool any_recv, state_changed; + + any_recv = hv_balloon_recv_channel(balloon, &state_new); + state_changed = hv_balloon_state_set(balloon, + state_new.state, state_new.desc); + + return state_changed || any_recv; +} + +static void hv_balloon_event_loop(HvBalloon *balloon) +{ + bool state_repeat, recv_repeat; + + do { + state_repeat = hv_balloon_event_loop_state(balloon); + recv_repeat = hv_balloon_event_loop_recv(balloon); + } while (state_repeat || recv_repeat); +} + +static void hv_balloon_vmdev_chan_notify(VMBusChannel *chan) +{ + HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan)); + + hv_balloon_event_loop(balloon); +} + +static void hv_balloon_stat(void *opaque, BalloonInfo *info) +{ + HvBalloon *balloon = opaque; + info->actual = (hv_balloon_total_ram(balloon) - balloon->removed_both_ctr) + << HV_BALLOON_PFN_SHIFT; +} + +static void hv_balloon_to_target(void *opaque, ram_addr_t target) +{ + HvBalloon *balloon = opaque; + uint64_t target_pages = target >> HV_BALLOON_PFN_SHIFT; + + if (!target_pages) { + return; + } + + /* + * always set target_changed, even with unchanged target, as the user + * might be asking us to try again reaching it + */ + balloon->target = target_pages; + balloon->target_changed = true; + + hv_balloon_event_loop(balloon); +} + +static int hv_balloon_vmdev_open_channel(VMBusChannel *chan) +{ + HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan)); + + if (balloon->state != S_POST_RESET_CLOSED) { + warn_report("guest trying to open a DM channel in invalid %d state", + balloon->state); + return -EINVAL; + } + + HV_BALLOON_SET_STATE(balloon, S_VERSION); + hv_balloon_event_loop(balloon); + + return 0; +} + +static void hv_balloon_vmdev_close_channel(VMBusChannel *chan) +{ + HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan)); + + timer_del(&balloon->post_init_timer); + + /* Don't report stale data */ + balloon->status_report.received = false; + + HV_BALLOON_SET_STATE(balloon, S_WAIT_RESET); + hv_balloon_event_loop(balloon); +} + +static void hv_balloon_post_init_timer(void *opaque) +{ + HvBalloon *balloon = opaque; + + if (balloon->state != S_POST_INIT_WAIT) { + return; + } + + HV_BALLOON_SET_STATE(balloon, S_IDLE); + hv_balloon_event_loop(balloon); +} + +static void hv_balloon_system_reset_unrealize_common(HvBalloon *balloon) +{ + g_clear_pointer(&balloon->our_range, hvb_our_range_memslots_free); +} + +static void hv_balloon_system_reset(void *opaque) +{ + HvBalloon *balloon = HV_BALLOON(opaque); + + hv_balloon_system_reset_unrealize_common(balloon); +} + +static void hv_balloon_ensure_mr(HvBalloon *balloon) +{ + MemoryRegion *hostmem_mr; + + assert(balloon->hostmem); + + if (balloon->mr) { + return; + } + + hostmem_mr = host_memory_backend_get_memory(balloon->hostmem); + + balloon->mr = g_new0(MemoryRegion, 1); + memory_region_init(balloon->mr, OBJECT(balloon), TYPE_HV_BALLOON, + memory_region_size(hostmem_mr)); + + /* + * The VM can indicate an alignment up to 32 GiB. Memory device core can + * usually only handle/guarantee 1 GiB alignment. The user will have to + * specify a larger maxmem eventually. + * + * The memory device core will warn the user in case maxmem might have to be + * increased and will fail plugging the device if there is not sufficient + * space after alignment. + * + * TODO: we could do the alignment ourselves in a slightly bigger region. + * But this feels better, although the warning might be annoying. Maybe + * we can optimize that in the future (e.g., with such a device on the + * cmdline place/size the device memory region differently. + */ + balloon->mr->align = MAX(32 * GiB, memory_region_get_alignment(hostmem_mr)); +} + +static void hv_balloon_free_mr(HvBalloon *balloon) +{ + if (!balloon->mr) { + return; + } + + object_unparent(OBJECT(balloon->mr)); + g_clear_pointer(&balloon->mr, g_free); +} + +static void hv_balloon_vmdev_realize(VMBusDevice *vdev, Error **errp) +{ + ERRP_GUARD(); + HvBalloon *balloon = HV_BALLOON(vdev); + int ret; + + balloon->state = S_WAIT_RESET; + + ret = qemu_add_balloon_handler(hv_balloon_to_target, hv_balloon_stat, + balloon); + if (ret < 0) { + /* This also protects against having multiple hv-balloon instances */ + error_setg(errp, "Only one balloon device is supported"); + return; + } + + if (balloon->hostmem) { + if (host_memory_backend_is_mapped(balloon->hostmem)) { + Object *obj = OBJECT(balloon->hostmem); + + error_setg(errp, "'%s' property specifies a busy memdev: %s", + HV_BALLOON_MEMDEV_PROP, + object_get_canonical_path_component(obj)); + goto out_balloon_handler; + } + + hv_balloon_ensure_mr(balloon); + + /* This is rather unlikely to happen, but let's still check for it. */ + if (!QEMU_IS_ALIGNED(memory_region_size(balloon->mr), + HV_BALLOON_PAGE_SIZE)) { + error_setg(errp, "'%s' property memdev size has to be a multiple of 0x%" PRIx64, + HV_BALLOON_MEMDEV_PROP, (uint64_t)HV_BALLOON_PAGE_SIZE); + goto out_balloon_handler; + } + + host_memory_backend_set_mapped(balloon->hostmem, true); + vmstate_register_ram(host_memory_backend_get_memory(balloon->hostmem), + DEVICE(balloon)); + } else if (balloon->addr) { + error_setg(errp, "'%s' property must not be set without a memdev", + HV_BALLOON_MEMDEV_PROP); + goto out_balloon_handler; + } + + timer_init_ms(&balloon->post_init_timer, QEMU_CLOCK_VIRTUAL, + hv_balloon_post_init_timer, balloon); + + qemu_register_reset(hv_balloon_system_reset, balloon); + + return; + +out_balloon_handler: + qemu_remove_balloon_handler(balloon); +} + +/* + * VMBus device reset has to be implemented in case the guest decides to + * disconnect and reconnect to the VMBus without rebooting the whole system. + * + * However, the hot-added memory can't be removed here as Windows keeps on using + * it until the system is restarted, even after disconnecting from the VMBus. + */ +static void hv_balloon_vmdev_reset(VMBusDevice *vdev) +{ + HvBalloon *balloon = HV_BALLOON(vdev); + + if (balloon->state == S_POST_RESET_CLOSED) { + return; + } + + if (balloon->our_range) { + hvb_our_range_clear_removed_trees(OUR_RANGE(balloon->our_range)); + } + + hvb_page_range_tree_destroy(&balloon->removed_guest); + hvb_page_range_tree_destroy(&balloon->removed_both); + hvb_page_range_tree_init(&balloon->removed_guest); + hvb_page_range_tree_init(&balloon->removed_both); + + balloon->trans_id = 0; + balloon->removed_guest_ctr = 0; + balloon->removed_both_ctr = 0; + + HV_BALLOON_SET_STATE(balloon, S_POST_RESET_CLOSED); + hv_balloon_event_loop(balloon); +} + +/* + * Clean up things that were (possibly) allocated pre-realization, for example + * from memory_device_pre_plug(), so we don't leak them if the device don't + * actually get realized in the end. + */ +static void hv_balloon_unrealize_finalize_common(HvBalloon *balloon) +{ + hv_balloon_free_mr(balloon); + balloon->addr = 0; + + balloon->memslot_count = 0; +} + +static void hv_balloon_vmdev_unrealize(VMBusDevice *vdev) +{ + HvBalloon *balloon = HV_BALLOON(vdev); + + qemu_unregister_reset(hv_balloon_system_reset, balloon); + + hv_balloon_system_reset_unrealize_common(balloon); + + qemu_remove_balloon_handler(balloon); + + if (balloon->hostmem) { + vmstate_unregister_ram(host_memory_backend_get_memory(balloon->hostmem), + DEVICE(balloon)); + host_memory_backend_set_mapped(balloon->hostmem, false); + } + + hvb_page_range_tree_destroy(&balloon->removed_guest); + hvb_page_range_tree_destroy(&balloon->removed_both); + + hv_balloon_unrealize_finalize_common(balloon); +} + +static uint64_t hv_balloon_md_get_addr(const MemoryDeviceState *md) +{ + return object_property_get_uint(OBJECT(md), HV_BALLOON_ADDR_PROP, + &error_abort); +} + +static void hv_balloon_md_set_addr(MemoryDeviceState *md, uint64_t addr, + Error **errp) +{ + object_property_set_uint(OBJECT(md), HV_BALLOON_ADDR_PROP, addr, errp); +} + +static MemoryRegion *hv_balloon_md_get_memory_region(MemoryDeviceState *md, + Error **errp) +{ + HvBalloon *balloon = HV_BALLOON(md); + + if (!balloon->hostmem) { + return NULL; + } + + hv_balloon_ensure_mr(balloon); + + return balloon->mr; +} + +static void hv_balloon_md_fill_device_info(const MemoryDeviceState *md, + MemoryDeviceInfo *info) +{ + HvBalloonDeviceInfo *hi = g_new0(HvBalloonDeviceInfo, 1); + const HvBalloon *balloon = HV_BALLOON(md); + DeviceState *dev = DEVICE(md); + + if (dev->id) { + hi->id = g_strdup(dev->id); + } + + if (balloon->hostmem) { + hi->memdev = object_get_canonical_path(OBJECT(balloon->hostmem)); + hi->memaddr = balloon->addr; + hi->has_memaddr = true; + hi->max_size = memory_region_size(balloon->mr); + /* TODO: expose current provided size or something else? */ + } else { + hi->max_size = 0; + } + + info->u.hv_balloon.data = hi; + info->type = MEMORY_DEVICE_INFO_KIND_HV_BALLOON; +} + +static void hv_balloon_decide_memslots(MemoryDeviceState *md, + unsigned int limit) +{ + HvBalloon *balloon = HV_BALLOON(md); + MemoryRegion *hostmem_mr; + uint64_t region_size, memslot_size, memslots; + + /* We're called exactly once, before realizing the device. */ + assert(!balloon->memslot_count); + + /* We should not be called if we don't have a memory backend */ + assert(balloon->hostmem); + + hostmem_mr = host_memory_backend_get_memory(balloon->hostmem); + region_size = memory_region_size(hostmem_mr); + + assert(region_size > 0); + memslot_size = QEMU_ALIGN_UP(region_size / limit, + HV_BALLOON_HA_MEMSLOT_SIZE_ALIGN); + memslots = QEMU_ALIGN_UP(region_size, memslot_size) / memslot_size; + + if (memslots > 1) { + balloon->memslot_size = memslot_size; + } else { + balloon->memslot_size = region_size; + } + + assert(memslots <= UINT_MAX); + balloon->memslot_count = memslots; +} + +static unsigned int hv_balloon_get_memslots(MemoryDeviceState *md) +{ + const HvBalloon *balloon = HV_BALLOON(md); + + /* We're called after setting the suggested limit. */ + assert(balloon->memslot_count > 0); + + return balloon->memslot_count; +} + +static void hv_balloon_init(Object *obj) +{ +} + +static void hv_balloon_finalize(Object *obj) +{ + HvBalloon *balloon = HV_BALLOON(obj); + + hv_balloon_unrealize_finalize_common(balloon); +} + +static Property hv_balloon_properties[] = { + DEFINE_PROP_BOOL("status-report", HvBalloon, + status_report.enabled, false), + + /* MEMORY_DEVICE props */ + DEFINE_PROP_LINK(HV_BALLOON_MEMDEV_PROP, HvBalloon, hostmem, + TYPE_MEMORY_BACKEND, HostMemoryBackend *), + DEFINE_PROP_UINT64(HV_BALLOON_ADDR_PROP, HvBalloon, addr, 0), + + DEFINE_PROP_END_OF_LIST(), +}; + +static void hv_balloon_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VMBusDeviceClass *vdc = VMBUS_DEVICE_CLASS(klass); + MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass); + + device_class_set_props(dc, hv_balloon_properties); + qemu_uuid_parse(HV_BALLOON_GUID, &vdc->classid); + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + + vdc->vmdev_realize = hv_balloon_vmdev_realize; + vdc->vmdev_unrealize = hv_balloon_vmdev_unrealize; + vdc->vmdev_reset = hv_balloon_vmdev_reset; + vdc->open_channel = hv_balloon_vmdev_open_channel; + vdc->close_channel = hv_balloon_vmdev_close_channel; + vdc->chan_notify_cb = hv_balloon_vmdev_chan_notify; + + mdc->get_addr = hv_balloon_md_get_addr; + mdc->set_addr = hv_balloon_md_set_addr; + mdc->get_plugged_size = memory_device_get_region_size; + mdc->get_memory_region = hv_balloon_md_get_memory_region; + mdc->decide_memslots = hv_balloon_decide_memslots; + mdc->get_memslots = hv_balloon_get_memslots; + mdc->fill_device_info = hv_balloon_md_fill_device_info; +} diff --git a/hw/hyperv/meson.build b/hw/hyperv/meson.build index b43f119ea5..d3d2668c71 100644 --- a/hw/hyperv/meson.build +++ b/hw/hyperv/meson.build @@ -2,3 +2,4 @@ specific_ss.add(when: 'CONFIG_HYPERV', if_true: files('hyperv.c')) specific_ss.add(when: 'CONFIG_HYPERV_TESTDEV', if_true: files('hyperv_testdev.c')) specific_ss.add(when: 'CONFIG_VMBUS', if_true: files('vmbus.c')) specific_ss.add(when: 'CONFIG_SYNDBG', if_true: files('syndbg.c')) +specific_ss.add(when: 'CONFIG_HV_BALLOON', if_true: files('hv-balloon.c', 'hv-balloon-page_range_tree.c', 'hv-balloon-our_range_memslots.c'), if_false: files('hv-balloon-stub.c')) diff --git a/hw/hyperv/trace-events b/hw/hyperv/trace-events index b4c35ca8e3..7963c215b1 100644 --- a/hw/hyperv/trace-events +++ b/hw/hyperv/trace-events @@ -16,3 +16,21 @@ vmbus_gpadl_torndown(uint32_t gpadl_id) "gpadl #%d" vmbus_open_channel(uint32_t chan_id, uint32_t gpadl_id, uint32_t target_vp) "channel #%d gpadl #%d target vp %d" vmbus_channel_open(uint32_t chan_id, uint32_t status) "channel #%d status %d" vmbus_close_channel(uint32_t chan_id) "channel #%d" + +# hv-balloon +hv_balloon_state_change(const char *tostr) "-> %s" +hv_balloon_incoming_version(uint16_t major, uint16_t minor) "incoming proto version %u.%u" +hv_balloon_incoming_caps(uint32_t caps) "incoming caps 0x%x" +hv_balloon_outgoing_unballoon(uint32_t trans_id, uint64_t count, uint64_t start, uint64_t rempages) "posting unballoon %"PRIu32" for %"PRIu64" @ 0x%"PRIx64", remaining %"PRIu64 +hv_balloon_incoming_unballoon(uint32_t trans_id) "incoming unballoon response %"PRIu32 +hv_balloon_outgoing_hot_add(uint32_t trans_id, uint64_t count, uint64_t start) "posting hot add %"PRIu32" for %"PRIu64" @ 0x%"PRIx64 +hv_balloon_incoming_hot_add(uint32_t trans_id, uint32_t result, uint32_t count) "incoming hot add response %"PRIu32", result %"PRIu32", count %"PRIu32 +hv_balloon_outgoing_balloon(uint32_t trans_id, uint64_t count, uint64_t rempages) "posting balloon %"PRIu32" for %"PRIu64", remaining %"PRIu64 +hv_balloon_incoming_balloon(uint32_t trans_id, uint32_t range_count, uint32_t more_pages) "incoming balloon response %"PRIu32", ranges %"PRIu32", more %"PRIu32 +hv_balloon_our_range_add(uint64_t count, uint64_t start) "adding our range %"PRIu64" @ 0x%"PRIx64 +hv_balloon_remove_response(uint64_t count, uint64_t start, unsigned int both) "processing remove response range %"PRIu64" @ 0x%"PRIx64", both %u" +hv_balloon_remove_response_hole(uint64_t counthole, uint64_t starthole, uint64_t countrange, uint64_t startrange, uint64_t starthpr, unsigned int both) "response range hole %"PRIu64" @ 0x%"PRIx64" from range %"PRIu64" @ 0x%"PRIx64", before our start 0x%"PRIx64", both %u" +hv_balloon_remove_response_common(uint64_t countcommon, uint64_t startcommon, uint64_t countrange, uint64_t startrange, uint64_t counthpr, uint64_t starthpr, uint64_t removed, unsigned int both) "response common range %"PRIu64" @ 0x%"PRIx64" from range %"PRIu64" @ 0x%"PRIx64" with our %"PRIu64" @ 0x%"PRIx64", removed %"PRIu64", both %u" +hv_balloon_remove_response_remainder(uint64_t count, uint64_t start, unsigned int both) "remove response remaining range %"PRIu64" @ 0x%"PRIx64", both %u" +hv_balloon_map_slot(unsigned int idx, unsigned int total_slots, uint64_t offset) "mapping memslot %u / %u @ 0x%"PRIx64 +hv_balloon_unmap_slot(unsigned int idx, unsigned int total_slots, uint64_t offset) "unmapping memslot %u / %u @ 0x%"PRIx64 diff --git a/hw/hyperv/vmbus.c b/hw/hyperv/vmbus.c index 271289f902..c64eaa5a46 100644 --- a/hw/hyperv/vmbus.c +++ b/hw/hyperv/vmbus.c @@ -2271,7 +2271,7 @@ static void vmbus_dev_realize(DeviceState *dev, Error **errp) VMBus *vmbus = VMBUS(qdev_get_parent_bus(dev)); BusChild *child; Error *err = NULL; - char idstr[UUID_FMT_LEN + 1]; + char idstr[UUID_STR_LEN]; assert(!qemu_uuid_is_null(&vdev->instanceid)); @@ -2467,7 +2467,7 @@ static char *vmbus_get_dev_path(DeviceState *dev) static char *vmbus_get_fw_dev_path(DeviceState *dev) { VMBusDevice *vdev = VMBUS_DEVICE(dev); - char uuid[UUID_FMT_LEN + 1]; + char uuid[UUID_STR_LEN]; qemu_uuid_unparse(&vdev->instanceid, uuid); return g_strdup_printf("%s@%s", qdev_fw_name(dev), uuid); diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index 94772c726b..55850791df 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -45,6 +45,7 @@ config PC select ACPI_VMGENID select VIRTIO_PMEM_SUPPORTED select VIRTIO_MEM_SUPPORTED + select HV_BALLOON_SUPPORTED config PC_PCI bool diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 7965415b47..4203144da9 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -1450,6 +1450,10 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &iommu_as[devfn]->as; } +static const PCIIOMMUOps amdvi_iommu_ops = { + .get_address_space = amdvi_host_dma_iommu, +}; + static const MemoryRegionOps mmio_mem_ops = { .read = amdvi_mmio_read, .write = amdvi_mmio_write, @@ -1581,7 +1585,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) AMDVI_MMIO_SIZE); memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR, &s->mmio); - pci_setup_iommu(bus, amdvi_host_dma_iommu, s); + pci_setup_iommu(bus, &amdvi_iommu_ops, s); amdvi_init(s); } diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 1c6c18622f..5085a6fee3 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1045,18 +1045,35 @@ static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s, * Rsvd field masks for spte: * vtd_spte_rsvd 4k pages * vtd_spte_rsvd_large large pages + * + * We support only 3-level and 4-level page tables (see vtd_init() which + * sets only VTD_CAP_SAGAW_39bit and maybe VTD_CAP_SAGAW_48bit bits in s->cap). */ -static uint64_t vtd_spte_rsvd[5]; -static uint64_t vtd_spte_rsvd_large[5]; +#define VTD_SPTE_RSVD_LEN 5 +static uint64_t vtd_spte_rsvd[VTD_SPTE_RSVD_LEN]; +static uint64_t vtd_spte_rsvd_large[VTD_SPTE_RSVD_LEN]; static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) { - uint64_t rsvd_mask = vtd_spte_rsvd[level]; + uint64_t rsvd_mask; + + /* + * We should have caught a guest-mis-programmed level earlier, + * via vtd_is_level_supported. + */ + assert(level < VTD_SPTE_RSVD_LEN); + /* + * Zero level doesn't exist. The smallest level is VTD_SL_PT_LEVEL=1 and + * checked by vtd_is_last_slpte(). + */ + assert(level); if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) && (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) { /* large page */ rsvd_mask = vtd_spte_rsvd_large[level]; + } else { + rsvd_mask = vtd_spte_rsvd[level]; } return slpte & rsvd_mask; @@ -4088,6 +4105,10 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &vtd_as->as; } +static PCIIOMMUOps vtd_iommu_ops = { + .get_address_space = vtd_host_dma_iommu, +}; + static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) { X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); @@ -4210,7 +4231,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal, g_free, g_free); vtd_init(s); - pci_setup_iommu(bus, vtd_host_dma_iommu, dev); + pci_setup_iommu(bus, &vtd_iommu_ops, dev); /* Pseudo address space under root PCI bus. */ x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c index a731738411..b2b4be9983 100644 --- a/hw/i386/kvm/xen_evtchn.c +++ b/hw/i386/kvm/xen_evtchn.c @@ -490,6 +490,12 @@ int xen_evtchn_set_callback_param(uint64_t param) break; } + /* If the guest has set a per-vCPU callback vector, prefer that. */ + if (gsi && kvm_xen_has_vcpu_callback_vector()) { + in_kernel = kvm_xen_has_cap(EVTCHN_SEND); + gsi = 0; + } + if (!ret) { /* If vector delivery was turned *off* then tell the kernel */ if ((s->callback_param >> CALLBACK_VIA_TYPE_SHIFT) == @@ -1129,6 +1135,7 @@ int xen_evtchn_reset_op(struct evtchn_reset *reset) return -ESRCH; } + QEMU_IOTHREAD_LOCK_GUARD(); return xen_evtchn_soft_reset(); } diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c index 21c30e3659..839ec920a1 100644 --- a/hw/i386/kvm/xen_gnttab.c +++ b/hw/i386/kvm/xen_gnttab.c @@ -541,7 +541,5 @@ int xen_gnttab_reset(void) s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access; s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE); - memset(s->map_track, 0, s->max_frames * ENTRIES_PER_FRAME_V1); - return 0; } diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c index 660d0b72f9..8e716a7009 100644 --- a/hw/i386/kvm/xen_xenstore.c +++ b/hw/i386/kvm/xen_xenstore.c @@ -1357,10 +1357,12 @@ static void fire_watch_cb(void *opaque, const char *path, const char *token) } else { deliver_watch(s, path, token); /* - * If the message was queued because there was already ring activity, - * no need to wake the guest. But if not, we need to send the evtchn. + * Attempt to queue the message into the actual ring, and send + * the event channel notification if any bytes are copied. */ - xen_be_evtchn_notify(s->eh, s->be_port); + if (s->rsp_pending && put_rsp(s) > 0) { + xen_be_evtchn_notify(s->eh, s->be_port); + } } } diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 6031234a73..1aef21aa2c 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -27,6 +27,7 @@ #include "hw/i386/pc.h" #include "hw/char/serial.h" #include "hw/char/parallel.h" +#include "hw/hyperv/hv-balloon.h" #include "hw/i386/fw_cfg.h" #include "hw/i386/vmport.h" #include "sysemu/cpus.h" @@ -57,6 +58,7 @@ #include "hw/i386/kvm/xen_evtchn.h" #include "hw/i386/kvm/xen_gnttab.h" #include "hw/i386/kvm/xen_xenstore.h" +#include "hw/mem/memory-device.h" #include "e820_memory_layout.h" #include "trace.h" #include CONFIG_DEVICES @@ -1422,6 +1424,21 @@ static void pc_memory_unplug(HotplugHandler *hotplug_dev, error_propagate(errp, local_err); } +static void pc_hv_balloon_pre_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + /* The vmbus handler has no hotplug handler; we should never end up here. */ + g_assert(!dev->hotplugged); + memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL, + errp); +} + +static void pc_hv_balloon_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev)); +} + static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -1452,6 +1469,8 @@ static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, return; } pcms->iommu = dev; + } else if (object_dynamic_cast(OBJECT(dev), TYPE_HV_BALLOON)) { + pc_hv_balloon_pre_plug(hotplug_dev, dev, errp); } } @@ -1464,6 +1483,8 @@ static void pc_machine_device_plug_cb(HotplugHandler *hotplug_dev, x86_cpu_plug(hotplug_dev, dev, errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { virtio_md_pci_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_HV_BALLOON)) { + pc_hv_balloon_plug(hotplug_dev, dev, errp); } } @@ -1505,6 +1526,7 @@ static HotplugHandler *pc_get_hotplug_handler(MachineState *machine, object_dynamic_cast(OBJECT(dev), TYPE_CPU) || object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI) || object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_HV_BALLOON) || object_dynamic_cast(OBJECT(dev), TYPE_X86_IOMMU_DEVICE)) { return HOTPLUG_HANDLER(machine); } diff --git a/hw/mem/memory-device.c b/hw/mem/memory-device.c index ae38f48f16..e0704b8dc3 100644 --- a/hw/mem/memory-device.c +++ b/hw/mem/memory-device.c @@ -20,6 +20,22 @@ #include "exec/address-spaces.h" #include "trace.h" +static bool memory_device_is_empty(const MemoryDeviceState *md) +{ + const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); + Error *local_err = NULL; + MemoryRegion *mr; + + /* dropping const here is fine as we don't touch the memory region */ + mr = mdc->get_memory_region((MemoryDeviceState *)md, &local_err); + if (local_err) { + /* Not empty, we'll report errors later when ontaining the MR again. */ + error_free(local_err); + return false; + } + return !mr; +} + static gint memory_device_addr_sort(gconstpointer a, gconstpointer b) { const MemoryDeviceState *md_a = MEMORY_DEVICE(a); @@ -220,12 +236,6 @@ static uint64_t memory_device_get_free_addr(MachineState *ms, return 0; } - if (!QEMU_IS_ALIGNED(size, align)) { - error_setg(errp, "backend memory size must be multiple of 0x%" - PRIx64, align); - return 0; - } - if (hint) { if (range_init(&new, *hint, size) || !range_contains_range(&as, &new)) { error_setg(errp, "can't add memory device [0x%" PRIx64 ":0x%" PRIx64 @@ -249,6 +259,10 @@ static uint64_t memory_device_get_free_addr(MachineState *ms, uint64_t next_addr; Range tmp; + if (memory_device_is_empty(md)) { + continue; + } + range_init_nofail(&tmp, mdc->get_addr(md), memory_device_get_region_size(md, &error_abort)); @@ -292,6 +306,7 @@ MemoryDeviceInfoList *qmp_memory_device_list(void) const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(item->data); MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1); + /* Let's query infotmation even for empty memory devices. */ mdc->fill_device_info(md, info); QAPI_LIST_APPEND(tail, info); @@ -311,7 +326,7 @@ static int memory_device_plugged_size(Object *obj, void *opaque) const MemoryDeviceState *md = MEMORY_DEVICE(obj); const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(obj); - if (dev->realized) { + if (dev->realized && !memory_device_is_empty(md)) { *size += mdc->get_plugged_size(md, &error_abort); } } @@ -337,6 +352,11 @@ void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms, uint64_t addr, align = 0; MemoryRegion *mr; + /* We support empty memory devices even without device memory. */ + if (memory_device_is_empty(md)) { + return; + } + if (!ms->device_memory) { error_setg(errp, "the configuration is not prepared for memory devices" " (e.g., for memory hotplug), consider specifying the" @@ -380,10 +400,17 @@ out: void memory_device_plug(MemoryDeviceState *md, MachineState *ms) { const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); - const unsigned int memslots = memory_device_get_memslots(md); - const uint64_t addr = mdc->get_addr(md); + unsigned int memslots; + uint64_t addr; MemoryRegion *mr; + if (memory_device_is_empty(md)) { + return; + } + + memslots = memory_device_get_memslots(md); + addr = mdc->get_addr(md); + /* * We expect that a previous call to memory_device_pre_plug() succeeded, so * it can't fail at this point. @@ -408,6 +435,10 @@ void memory_device_unplug(MemoryDeviceState *md, MachineState *ms) const unsigned int memslots = memory_device_get_memslots(md); MemoryRegion *mr; + if (memory_device_is_empty(md)) { + return; + } + /* * We expect that a previous call to memory_device_pre_plug() succeeded, so * it can't fail at this point. diff --git a/hw/pci-host/astro.c b/hw/pci-host/astro.c index 4b2d7caf2d..84e0ca14ac 100644 --- a/hw/pci-host/astro.c +++ b/hw/pci-host/astro.c @@ -345,6 +345,10 @@ static AddressSpace *elroy_pcihost_set_iommu(PCIBus *bus, void *opaque, return &s->astro->iommu_as; } +static const PCIIOMMUOps elroy_pcihost_iommu_ops = { + .get_address_space = elroy_pcihost_set_iommu, +}; + /* * Encoding in IOSAPIC: * base_addr == 0xfffa0000, we want to get 0xa0ff0000. @@ -834,7 +838,7 @@ static void astro_realize(DeviceState *obj, Error **errp) &elroy->pci_io); /* Host memory as seen from the PCI side, via the IOMMU. */ - pci_setup_iommu(PCI_HOST_BRIDGE(elroy)->bus, elroy_pcihost_set_iommu, + pci_setup_iommu(PCI_HOST_BRIDGE(elroy)->bus, &elroy_pcihost_iommu_ops, elroy); } } diff --git a/hw/pci-host/designware.c b/hw/pci-host/designware.c index 6f5442f108..f477f97847 100644 --- a/hw/pci-host/designware.c +++ b/hw/pci-host/designware.c @@ -663,6 +663,10 @@ static AddressSpace *designware_pcie_host_set_iommu(PCIBus *bus, void *opaque, return &s->pci.address_space; } +static const PCIIOMMUOps designware_iommu_ops = { + .get_address_space = designware_pcie_host_set_iommu, +}; + static void designware_pcie_host_realize(DeviceState *dev, Error **errp) { PCIHostState *pci = PCI_HOST_BRIDGE(dev); @@ -705,7 +709,7 @@ static void designware_pcie_host_realize(DeviceState *dev, Error **errp) address_space_init(&s->pci.address_space, &s->pci.address_space_root, "pcie-bus-address-space"); - pci_setup_iommu(pci->bus, designware_pcie_host_set_iommu, s); + pci_setup_iommu(pci->bus, &designware_iommu_ops, s); qdev_realize(DEVICE(&s->root), BUS(pci->bus), &error_fatal); } diff --git a/hw/pci-host/dino.c b/hw/pci-host/dino.c index 82503229fa..5b0947a16c 100644 --- a/hw/pci-host/dino.c +++ b/hw/pci-host/dino.c @@ -354,6 +354,10 @@ static AddressSpace *dino_pcihost_set_iommu(PCIBus *bus, void *opaque, return &s->bm_as; } +static const PCIIOMMUOps dino_iommu_ops = { + .get_address_space = dino_pcihost_set_iommu, +}; + /* * Dino interrupts are connected as shown on Page 78, Table 23 * (Little-endian bit numbers) @@ -481,7 +485,7 @@ static void dino_pcihost_init(Object *obj) g_free(name); } - pci_setup_iommu(phb->bus, dino_pcihost_set_iommu, s); + pci_setup_iommu(phb->bus, &dino_iommu_ops, s); sysbus_init_mmio(sbd, &s->this_mem); diff --git a/hw/pci-host/pnv_phb3.c b/hw/pci-host/pnv_phb3.c index c5e58f4086..2a74dbe45f 100644 --- a/hw/pci-host/pnv_phb3.c +++ b/hw/pci-host/pnv_phb3.c @@ -968,6 +968,10 @@ static AddressSpace *pnv_phb3_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &ds->dma_as; } +static PCIIOMMUOps pnv_phb3_iommu_ops = { + .get_address_space = pnv_phb3_dma_iommu, +}; + static void pnv_phb3_instance_init(Object *obj) { PnvPHB3 *phb = PNV_PHB3(obj); @@ -1012,7 +1016,7 @@ void pnv_phb3_bus_init(DeviceState *dev, PnvPHB3 *phb) object_property_set_int(OBJECT(pci->bus), "chip-id", phb->chip_id, &error_abort); - pci_setup_iommu(pci->bus, pnv_phb3_dma_iommu, phb); + pci_setup_iommu(pci->bus, &pnv_phb3_iommu_ops, phb); } static void pnv_phb3_realize(DeviceState *dev, Error **errp) diff --git a/hw/pci-host/pnv_phb4.c b/hw/pci-host/pnv_phb4.c index 29cb11a5d9..37c7afc18c 100644 --- a/hw/pci-host/pnv_phb4.c +++ b/hw/pci-host/pnv_phb4.c @@ -1518,6 +1518,10 @@ static void pnv_phb4_xscom_realize(PnvPHB4 *phb) &phb->phb_regs_mr); } +static PCIIOMMUOps pnv_phb4_iommu_ops = { + .get_address_space = pnv_phb4_dma_iommu, +}; + static void pnv_phb4_instance_init(Object *obj) { PnvPHB4 *phb = PNV_PHB4(obj); @@ -1557,7 +1561,7 @@ void pnv_phb4_bus_init(DeviceState *dev, PnvPHB4 *phb) object_property_set_int(OBJECT(pci->bus), "chip-id", phb->chip_id, &error_abort); - pci_setup_iommu(pci->bus, pnv_phb4_dma_iommu, phb); + pci_setup_iommu(pci->bus, &pnv_phb4_iommu_ops, phb); pci->bus->flags |= PCI_BUS_EXTENDED_CONFIG_SPACE; } diff --git a/hw/pci-host/ppce500.c b/hw/pci-host/ppce500.c index 38814247f2..453a4e6ed3 100644 --- a/hw/pci-host/ppce500.c +++ b/hw/pci-host/ppce500.c @@ -435,6 +435,10 @@ static AddressSpace *e500_pcihost_set_iommu(PCIBus *bus, void *opaque, return &s->bm_as; } +static const PCIIOMMUOps ppce500_iommu_ops = { + .get_address_space = e500_pcihost_set_iommu, +}; + static void e500_pcihost_realize(DeviceState *dev, Error **errp) { SysBusDevice *sbd = SYS_BUS_DEVICE(dev); @@ -469,7 +473,7 @@ static void e500_pcihost_realize(DeviceState *dev, Error **errp) memory_region_init(&s->bm, OBJECT(s), "bm-e500", UINT64_MAX); memory_region_add_subregion(&s->bm, 0x0, &s->busmem); address_space_init(&s->bm_as, &s->bm, "pci-bm"); - pci_setup_iommu(b, e500_pcihost_set_iommu, s); + pci_setup_iommu(b, &ppce500_iommu_ops, s); pci_create_simple(b, 0, "e500-host-bridge"); diff --git a/hw/pci-host/raven.c b/hw/pci-host/raven.c index 9a11ac4b2b..86c3a49087 100644 --- a/hw/pci-host/raven.c +++ b/hw/pci-host/raven.c @@ -223,6 +223,10 @@ static AddressSpace *raven_pcihost_set_iommu(PCIBus *bus, void *opaque, return &s->bm_as; } +static const PCIIOMMUOps raven_iommu_ops = { + .get_address_space = raven_pcihost_set_iommu, +}; + static void raven_change_gpio(void *opaque, int n, int level) { PREPPCIState *s = opaque; @@ -320,7 +324,7 @@ static void raven_pcihost_initfn(Object *obj) memory_region_add_subregion(&s->bm, 0 , &s->bm_pci_memory_alias); memory_region_add_subregion(&s->bm, 0x80000000, &s->bm_ram_alias); address_space_init(&s->bm_as, &s->bm, "raven-bm"); - pci_setup_iommu(&s->pci_bus, raven_pcihost_set_iommu, s); + pci_setup_iommu(&s->pci_bus, &raven_iommu_ops, s); h->bus = &s->pci_bus; diff --git a/hw/pci-host/sabre.c b/hw/pci-host/sabre.c index dcb2e230b6..d0851b48b0 100644 --- a/hw/pci-host/sabre.c +++ b/hw/pci-host/sabre.c @@ -112,6 +112,10 @@ static AddressSpace *sabre_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &is->iommu_as; } +static const PCIIOMMUOps sabre_iommu_ops = { + .get_address_space = sabre_pci_dma_iommu, +}; + static void sabre_config_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) { @@ -384,7 +388,7 @@ static void sabre_realize(DeviceState *dev, Error **errp) /* IOMMU */ memory_region_add_subregion_overlap(&s->sabre_config, 0x200, sysbus_mmio_get_region(SYS_BUS_DEVICE(s->iommu), 0), 1); - pci_setup_iommu(phb->bus, sabre_pci_dma_iommu, s->iommu); + pci_setup_iommu(phb->bus, &sabre_iommu_ops, s->iommu); /* APB secondary busses */ pci_dev = pci_new_multifunction(PCI_DEVFN(1, 0), TYPE_SIMBA_PCI_BRIDGE); diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 885c04b6f5..c49417abb2 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2678,7 +2678,7 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) PCIBus *iommu_bus = bus; uint8_t devfn = dev->devfn; - while (iommu_bus && !iommu_bus->iommu_fn && iommu_bus->parent_dev) { + while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) { PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev); /* @@ -2717,15 +2717,23 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) iommu_bus = parent_bus; } - if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) { - return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn); + if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) { + return iommu_bus->iommu_ops->get_address_space(bus, + iommu_bus->iommu_opaque, devfn); } return &address_space_memory; } -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque) +void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque) { - bus->iommu_fn = fn; + /* + * If called, pci_setup_iommu() should provide a minimum set of + * useful callbacks for the bus. + */ + assert(ops); + assert(ops->get_address_space); + + bus->iommu_ops = ops; bus->iommu_opaque = opaque; } diff --git a/hw/ppc/ppc440_pcix.c b/hw/ppc/ppc440_pcix.c index 672090de94..df4ee374d0 100644 --- a/hw/ppc/ppc440_pcix.c +++ b/hw/ppc/ppc440_pcix.c @@ -449,6 +449,10 @@ static AddressSpace *ppc440_pcix_set_iommu(PCIBus *b, void *opaque, int devfn) return &s->bm_as; } +static const PCIIOMMUOps ppc440_iommu_ops = { + .get_address_space = ppc440_pcix_set_iommu, +}; + /* * Some guests on sam460ex write all kinds of garbage here such as * missing enable bit and low bits set and still expect this to work @@ -503,7 +507,7 @@ static void ppc440_pcix_realize(DeviceState *dev, Error **errp) memory_region_init(&s->bm, OBJECT(s), "bm-ppc440-pcix", UINT64_MAX); memory_region_add_subregion(&s->bm, 0x0, &s->busmem); address_space_init(&s->bm_as, &s->bm, "pci-bm"); - pci_setup_iommu(h->bus, ppc440_pcix_set_iommu, s); + pci_setup_iommu(h->bus, &ppc440_iommu_ops, s); memory_region_init(&s->container, OBJECT(s), "pci-container", PCI_ALL_SIZE); memory_region_init_io(&h->conf_mem, OBJECT(s), &ppc440_pcix_host_conf_ops, diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 370c5a90f2..a27024e45a 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -780,6 +780,10 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &phb->iommu_as; } +static const PCIIOMMUOps spapr_iommu_ops = { + .get_address_space = spapr_pci_dma_iommu, +}; + static char *spapr_phb_vfio_get_loc_code(SpaprPhbState *sphb, PCIDevice *pdev) { g_autofree char *path = NULL; @@ -1978,7 +1982,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) memory_region_add_subregion(&sphb->iommu_root, SPAPR_PCI_MSI_WINDOW, &sphb->msiwindow); - pci_setup_iommu(bus, spapr_pci_dma_iommu, sphb); + pci_setup_iommu(bus, &spapr_iommu_ops, sphb); pci_bus_set_route_irq_fn(bus, spapr_route_intx_pin_to_irq); diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index 9016720547..f283f7e38d 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -18,14 +18,112 @@ */ #include "qemu/osdep.h" +#include <sys/ioctl.h> #include <linux/vfio.h> #include "hw/ppc/spapr.h" #include "hw/pci-host/spapr.h" #include "hw/pci/msix.h" #include "hw/pci/pci_device.h" -#include "hw/vfio/vfio.h" +#include "hw/vfio/vfio-common.h" #include "qemu/error-report.h" +/* + * Interfaces for IBM EEH (Enhanced Error Handling) + */ +static bool vfio_eeh_container_ok(VFIOContainer *container) +{ + /* + * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO + * implementation is broken if there are multiple groups in a + * container. The hardware works in units of Partitionable + * Endpoints (== IOMMU groups) and the EEH operations naively + * iterate across all groups in the container, without any logic + * to make sure the groups have their state synchronized. For + * certain operations (ENABLE) that might be ok, until an error + * occurs, but for others (GET_STATE) it's clearly broken. + */ + + /* + * XXX Once fixed kernels exist, test for them here + */ + + if (QLIST_EMPTY(&container->group_list)) { + return false; + } + + if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) { + return false; + } + + return true; +} + +static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) +{ + struct vfio_eeh_pe_op pe_op = { + .argsz = sizeof(pe_op), + .op = op, + }; + int ret; + + if (!vfio_eeh_container_ok(container)) { + error_report("vfio/eeh: EEH_PE_OP 0x%x: " + "kernel requires a container with exactly one group", op); + return -EPERM; + } + + ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op); + if (ret < 0) { + error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op); + return -errno; + } + + return ret; +} + +static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) +{ + VFIOAddressSpace *space = vfio_get_address_space(as); + VFIOContainer *container = NULL; + + if (QLIST_EMPTY(&space->containers)) { + /* No containers to act on */ + goto out; + } + + container = QLIST_FIRST(&space->containers); + + if (QLIST_NEXT(container, next)) { + /* + * We don't yet have logic to synchronize EEH state across + * multiple containers + */ + container = NULL; + goto out; + } + +out: + vfio_put_address_space(space); + return container; +} + +static bool vfio_eeh_as_ok(AddressSpace *as) +{ + VFIOContainer *container = vfio_eeh_as_container(as); + + return (container != NULL) && vfio_eeh_container_ok(container); +} + +static int vfio_eeh_as_op(AddressSpace *as, uint32_t op) +{ + VFIOContainer *container = vfio_eeh_as_container(as); + + if (!container) { + return -ENODEV; + } + return vfio_eeh_container_op(container, op); +} + bool spapr_phb_eeh_available(SpaprPhbState *sphb) { return vfio_eeh_as_ok(&sphb->iommu_as); diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c index 1391dd712c..7c56aad0fc 100644 --- a/hw/remote/iommu.c +++ b/hw/remote/iommu.c @@ -100,6 +100,10 @@ static void remote_iommu_finalize(Object *obj) iommu->elem_by_devfn = NULL; } +static const PCIIOMMUOps remote_iommu_ops = { + .get_address_space = remote_iommu_find_add_as, +}; + void remote_iommu_setup(PCIBus *pci_bus) { RemoteIommu *iommu = NULL; @@ -108,7 +112,7 @@ void remote_iommu_setup(PCIBus *pci_bus) iommu = REMOTE_IOMMU(object_new(TYPE_REMOTE_IOMMU)); - pci_setup_iommu(pci_bus, remote_iommu_find_add_as, iommu); + pci_setup_iommu(pci_bus, &remote_iommu_ops, iommu); object_property_add_child(OBJECT(pci_bus), "remote-iommu", OBJECT(iommu)); diff --git a/hw/rtc/mc146818rtc.c b/hw/rtc/mc146818rtc.c index c27c362db9..2d391a8396 100644 --- a/hw/rtc/mc146818rtc.c +++ b/hw/rtc/mc146818rtc.c @@ -599,7 +599,7 @@ static void rtc_get_time(MC146818RtcState *s, struct tm *tm) static void rtc_set_time(MC146818RtcState *s) { - struct tm tm; + struct tm tm = {}; g_autofree const char *qom_path = object_get_canonical_path(OBJECT(s)); rtc_get_time(s, &tm); diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 2ca36f9f3b..347580ebac 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -652,6 +652,10 @@ static AddressSpace *s390_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &iommu->as; } +static const PCIIOMMUOps s390_iommu_ops = { + .get_address_space = s390_pci_dma_iommu, +}; + static uint8_t set_ind_atomic(uint64_t ind_loc, uint8_t to_be_set) { uint8_t expected, actual; @@ -839,7 +843,7 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) b = pci_register_root_bus(dev, NULL, s390_pci_set_irq, s390_pci_map_irq, NULL, get_system_memory(), get_system_io(), 0, 64, TYPE_PCI_BUS); - pci_setup_iommu(b, s390_pci_dma_iommu, s); + pci_setup_iommu(b, &s390_iommu_ops, s); bus = BUS(b); qbus_set_hotplug_handler(bus, OBJECT(dev)); @@ -1058,7 +1062,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, pdev = PCI_DEVICE(dev); pci_bridge_map_irq(pb, dev->id, s390_pci_map_irq); - pci_setup_iommu(&pb->sec_bus, s390_pci_dma_iommu, s); + pci_setup_iommu(&pb->sec_bus, &s390_iommu_ops, s); qbus_set_hotplug_handler(BUS(&pb->sec_bus), OBJECT(s)); diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 5f257bffb9..bbf69ff55a 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -14,7 +14,6 @@ #include <linux/vfio.h> #include <sys/ioctl.h> #include "qapi/error.h" -#include "hw/vfio/vfio.h" #include "hw/vfio/vfio-common.h" #include "hw/s390x/ap-device.h" #include "qemu/error-report.h" diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 6623ae237b..d857bb8d0f 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -20,7 +20,6 @@ #include <sys/ioctl.h> #include "qapi/error.h" -#include "hw/vfio/vfio.h" #include "hw/vfio/vfio-common.h" #include "hw/s390x/s390-ccw.h" #include "hw/s390x/vfio-ccw.h" diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d806057b40..e70fdf5e0c 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -26,7 +26,6 @@ #include <linux/vfio.h> #include "hw/vfio/vfio-common.h" -#include "hw/vfio/vfio.h" #include "hw/vfio/pci.h" #include "exec/address-spaces.h" #include "exec/memory.h" @@ -246,44 +245,6 @@ bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) return true; } -void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, - hwaddr max_iova, uint64_t iova_pgsizes) -{ - VFIOHostDMAWindow *hostwin; - - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (ranges_overlap(hostwin->min_iova, - hostwin->max_iova - hostwin->min_iova + 1, - min_iova, - max_iova - min_iova + 1)) { - hw_error("%s: Overlapped IOMMU are not enabled", __func__); - } - } - - hostwin = g_malloc0(sizeof(*hostwin)); - - hostwin->min_iova = min_iova; - hostwin->max_iova = max_iova; - hostwin->iova_pgsizes = iova_pgsizes; - QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); -} - -int vfio_host_win_del(VFIOContainer *container, - hwaddr min_iova, hwaddr max_iova) -{ - VFIOHostDMAWindow *hostwin; - - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { - QLIST_REMOVE(hostwin, hostwin_next); - g_free(hostwin); - return 0; - } - } - - return -1; -} - static bool vfio_listener_skipped_section(MemoryRegionSection *section) { return (!memory_region_is_ram(section->mr) && @@ -532,22 +493,6 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container, g_free(vrdl); } -static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, - hwaddr iova, hwaddr end) -{ - VFIOHostDMAWindow *hostwin; - bool hostwin_found = false; - - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { - hostwin_found = true; - break; - } - } - - return hostwin_found ? hostwin : NULL; -} - static bool vfio_known_safe_misalignment(MemoryRegionSection *section) { MemoryRegion *mr = section->mr; @@ -626,7 +571,6 @@ static void vfio_listener_region_add(MemoryListener *listener, Int128 llend, llsize; void *vaddr; int ret; - VFIOHostDMAWindow *hostwin; Error *err = NULL; if (!vfio_listener_valid_section(section, "region_add")) { @@ -648,13 +592,6 @@ static void vfio_listener_region_add(MemoryListener *listener, goto fail; } - hostwin = vfio_find_hostwin(container, iova, end); - if (!hostwin) { - error_setg(&err, "Container %p can't map guest IOVA region" - " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); - goto fail; - } - memory_region_ref(section->mr); if (memory_region_is_iommu(section->mr)) { @@ -693,6 +630,15 @@ static void vfio_listener_region_add(MemoryListener *listener, goto fail; } + if (container->iova_ranges) { + ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr, + container->iova_ranges, &err); + if (ret) { + g_free(giommu); + goto fail; + } + } + ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, &err); if (ret) { @@ -726,7 +672,7 @@ static void vfio_listener_region_add(MemoryListener *listener, llsize = int128_sub(llend, int128_make64(iova)); if (memory_region_is_ram_device(section->mr)) { - hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; + hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1; if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { trace_vfio_listener_region_add_no_dma_map( @@ -825,12 +771,8 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_ram_device(section->mr)) { hwaddr pgmask; - VFIOHostDMAWindow *hostwin; - - hostwin = vfio_find_hostwin(container, iova, end); - assert(hostwin); /* or region_add() would have failed */ - pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; + pgmask = (1ULL << ctz64(container->pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); } else if (memory_region_has_ram_discard_manager(section->mr)) { vfio_unregister_ram_discard_listener(container, section); diff --git a/hw/vfio/container.c b/hw/vfio/container.c index adc467210f..242010036a 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -20,20 +20,15 @@ #include "qemu/osdep.h" #include <sys/ioctl.h> -#ifdef CONFIG_KVM -#include <linux/kvm.h> -#endif #include <linux/vfio.h> #include "hw/vfio/vfio-common.h" -#include "hw/vfio/vfio.h" #include "exec/address-spaces.h" #include "exec/memory.h" #include "exec/ram_addr.h" #include "hw/hw.h" #include "qemu/error-report.h" #include "qemu/range.h" -#include "sysemu/kvm.h" #include "sysemu/reset.h" #include "trace.h" #include "qapi/error.h" @@ -205,92 +200,6 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova, return -errno; } -int vfio_container_add_section_window(VFIOContainer *container, - MemoryRegionSection *section, - Error **errp) -{ - VFIOHostDMAWindow *hostwin; - hwaddr pgsize = 0; - int ret; - - if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { - return 0; - } - - /* For now intersections are not allowed, we may relax this later */ - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (ranges_overlap(hostwin->min_iova, - hostwin->max_iova - hostwin->min_iova + 1, - section->offset_within_address_space, - int128_get64(section->size))) { - error_setg(errp, - "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing" - "host DMA window [0x%"PRIx64",0x%"PRIx64"]", - section->offset_within_address_space, - section->offset_within_address_space + - int128_get64(section->size) - 1, - hostwin->min_iova, hostwin->max_iova); - return -EINVAL; - } - } - - ret = vfio_spapr_create_window(container, section, &pgsize); - if (ret) { - error_setg_errno(errp, -ret, "Failed to create SPAPR window"); - return ret; - } - - vfio_host_win_add(container, section->offset_within_address_space, - section->offset_within_address_space + - int128_get64(section->size) - 1, pgsize); -#ifdef CONFIG_KVM - if (kvm_enabled()) { - VFIOGroup *group; - IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); - struct kvm_vfio_spapr_tce param; - struct kvm_device_attr attr = { - .group = KVM_DEV_VFIO_GROUP, - .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, - .addr = (uint64_t)(unsigned long)¶m, - }; - - if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD, - ¶m.tablefd)) { - QLIST_FOREACH(group, &container->group_list, container_next) { - param.groupfd = group->fd; - if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { - error_setg_errno(errp, errno, - "vfio: failed GROUP_SET_SPAPR_TCE for " - "KVM VFIO device %d and group fd %d", - param.tablefd, param.groupfd); - return -errno; - } - trace_vfio_spapr_group_attach(param.groupfd, param.tablefd); - } - } - } -#endif - return 0; -} - -void vfio_container_del_section_window(VFIOContainer *container, - MemoryRegionSection *section) -{ - if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { - return; - } - - vfio_spapr_remove_window(container, - section->offset_within_address_space); - if (vfio_host_win_del(container, - section->offset_within_address_space, - section->offset_within_address_space + - int128_get64(section->size) - 1) < 0) { - hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx, - __func__, section->offset_within_address_space); - } -} - int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) { int ret; @@ -355,14 +264,6 @@ int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, return ret; } -static void vfio_listener_release(VFIOContainer *container) -{ - memory_listener_unregister(&container->listener); - if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { - memory_listener_unregister(&container->prereg_listener); - } -} - static struct vfio_info_cap_header * vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) { @@ -382,7 +283,7 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, /* If the capability cannot be found, assume no DMA limiting */ hdr = vfio_get_iommu_type1_info_cap(info, VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); - if (hdr == NULL) { + if (!hdr) { return false; } @@ -394,6 +295,32 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, return true; } +static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, + VFIOContainer *container) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_cap_iova_range *cap; + + hdr = vfio_get_iommu_type1_info_cap(info, + VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); + if (!hdr) { + return false; + } + + cap = (void *)hdr; + + for (int i = 0; i < cap->nr_iovas; i++) { + Range *range = g_new(Range, 1); + + range_set_bounds(range, cap->iova_ranges[i].start, + cap->iova_ranges[i].end); + container->iova_ranges = + range_list_insert(container->iova_ranges, range); + } + + return true; +} + static void vfio_kvm_device_add_group(VFIOGroup *group) { Error *err = NULL; @@ -535,6 +462,12 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, } } +static void vfio_free_container(VFIOContainer *container) +{ + g_list_free_full(container->iova_ranges, g_free); + g_free(container); +} + static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { @@ -616,8 +549,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->error = NULL; container->dirty_pages_supported = false; container->dma_max_mappings = 0; + container->iova_ranges = NULL; QLIST_INIT(&container->giommu_list); - QLIST_INIT(&container->hostwin_list); QLIST_INIT(&container->vrdl_list); ret = vfio_init_container(container, group->fd, errp); @@ -652,84 +585,21 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) { container->dma_max_mappings = 65535; } - vfio_get_iommu_info_migration(container, info); - g_free(info); - /* - * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE - * information to get the actual window extent rather than assume - * a 64-bit IOVA address space. - */ - vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes); + vfio_get_info_iova_range(info, container); + vfio_get_iommu_info_migration(container, info); + g_free(info); break; } case VFIO_SPAPR_TCE_v2_IOMMU: case VFIO_SPAPR_TCE_IOMMU: { - struct vfio_iommu_spapr_tce_info info; - bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; - - /* - * The host kernel code implementing VFIO_IOMMU_DISABLE is called - * when container fd is closed so we do not call it explicitly - * in this file. - */ - if (!v2) { - ret = ioctl(fd, VFIO_IOMMU_ENABLE); - if (ret) { - error_setg_errno(errp, errno, "failed to enable container"); - ret = -errno; - goto enable_discards_exit; - } - } else { - container->prereg_listener = vfio_prereg_listener; - - memory_listener_register(&container->prereg_listener, - &address_space_memory); - if (container->error) { - memory_listener_unregister(&container->prereg_listener); - ret = -1; - error_propagate_prepend(errp, container->error, - "RAM memory listener initialization failed: "); - goto enable_discards_exit; - } - } - - info.argsz = sizeof(info); - ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + ret = vfio_spapr_container_init(container, errp); if (ret) { - error_setg_errno(errp, errno, - "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed"); - ret = -errno; - if (v2) { - memory_listener_unregister(&container->prereg_listener); - } goto enable_discards_exit; } - - if (v2) { - container->pgsizes = info.ddw.pgsizes; - /* - * There is a default window in just created container. - * To make region_add/del simpler, we better remove this - * window now and let those iommu_listener callbacks - * create/remove them when needed. - */ - ret = vfio_spapr_remove_window(container, info.dma32_window_start); - if (ret) { - error_setg_errno(errp, -ret, - "failed to remove existing window"); - goto enable_discards_exit; - } - } else { - /* The default table uses 4K pages */ - container->pgsizes = 0x1000; - vfio_host_win_add(container, info.dma32_window_start, - info.dma32_window_start + - info.dma32_window_size - 1, - 0x1000); - } + break; } } @@ -759,13 +629,17 @@ listener_release_exit: QLIST_REMOVE(group, container_next); QLIST_REMOVE(container, next); vfio_kvm_device_del_group(group); - vfio_listener_release(container); + memory_listener_unregister(&container->listener); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || + container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { + vfio_spapr_container_deinit(container); + } enable_discards_exit: vfio_ram_block_discard_disable(container, false); free_container_exit: - g_free(container); + vfio_free_container(container); close_fd_exit: close(fd); @@ -789,7 +663,11 @@ static void vfio_disconnect_container(VFIOGroup *group) * group. */ if (QLIST_EMPTY(&container->group_list)) { - vfio_listener_release(container); + memory_listener_unregister(&container->listener); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || + container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { + vfio_spapr_container_deinit(container); + } } if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { @@ -800,7 +678,6 @@ static void vfio_disconnect_container(VFIOGroup *group) if (QLIST_EMPTY(&container->group_list)) { VFIOAddressSpace *space = container->space; VFIOGuestIOMMU *giommu, *tmp; - VFIOHostDMAWindow *hostwin, *next; QLIST_REMOVE(container, next); @@ -811,15 +688,9 @@ static void vfio_disconnect_container(VFIOGroup *group) g_free(giommu); } - QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, - next) { - QLIST_REMOVE(hostwin, hostwin_next); - g_free(hostwin); - } - trace_vfio_disconnect_container(container->fd); close(container->fd); - g_free(container); + vfio_free_container(container); vfio_put_address_space(space); } @@ -975,103 +846,6 @@ static void vfio_put_base_device(VFIODevice *vbasedev) close(vbasedev->fd); } -/* - * Interfaces for IBM EEH (Enhanced Error Handling) - */ -static bool vfio_eeh_container_ok(VFIOContainer *container) -{ - /* - * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO - * implementation is broken if there are multiple groups in a - * container. The hardware works in units of Partitionable - * Endpoints (== IOMMU groups) and the EEH operations naively - * iterate across all groups in the container, without any logic - * to make sure the groups have their state synchronized. For - * certain operations (ENABLE) that might be ok, until an error - * occurs, but for others (GET_STATE) it's clearly broken. - */ - - /* - * XXX Once fixed kernels exist, test for them here - */ - - if (QLIST_EMPTY(&container->group_list)) { - return false; - } - - if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) { - return false; - } - - return true; -} - -static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) -{ - struct vfio_eeh_pe_op pe_op = { - .argsz = sizeof(pe_op), - .op = op, - }; - int ret; - - if (!vfio_eeh_container_ok(container)) { - error_report("vfio/eeh: EEH_PE_OP 0x%x: " - "kernel requires a container with exactly one group", op); - return -EPERM; - } - - ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op); - if (ret < 0) { - error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op); - return -errno; - } - - return ret; -} - -static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) -{ - VFIOAddressSpace *space = vfio_get_address_space(as); - VFIOContainer *container = NULL; - - if (QLIST_EMPTY(&space->containers)) { - /* No containers to act on */ - goto out; - } - - container = QLIST_FIRST(&space->containers); - - if (QLIST_NEXT(container, next)) { - /* - * We don't yet have logic to synchronize EEH state across - * multiple containers - */ - container = NULL; - goto out; - } - -out: - vfio_put_address_space(space); - return container; -} - -bool vfio_eeh_as_ok(AddressSpace *as) -{ - VFIOContainer *container = vfio_eeh_as_container(as); - - return (container != NULL) && vfio_eeh_container_ok(container); -} - -int vfio_eeh_as_op(AddressSpace *as, uint32_t op) -{ - VFIOContainer *container = vfio_eeh_as_container(as); - - if (!container) { - return -ENODEV; - } - return vfio_eeh_container_op(container, op); -} - static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp) { char *tmp, group_path[PATH_MAX], *group_name; diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 7e5da21b31..168847e7c5 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -23,7 +23,6 @@ #include <sys/ioctl.h> #include "hw/vfio/vfio-common.h" -#include "hw/vfio/vfio.h" #include "hw/hw.h" #include "trace.h" #include "qapi/error.h" diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index b27011cee7..c62c02f7b6 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3081,7 +3081,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) struct stat st; int i, ret; bool is_mdev; - char uuid[UUID_FMT_LEN]; + char uuid[UUID_STR_LEN]; char *name; if (!vbasedev->sysfsdev) { diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 9ec1e95f6d..83da2f7ec2 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -11,6 +11,11 @@ #include "qemu/osdep.h" #include <sys/ioctl.h> #include <linux/vfio.h> +#ifdef CONFIG_KVM +#include <linux/kvm.h> +#endif +#include "sysemu/kvm.h" +#include "exec/address-spaces.h" #include "hw/vfio/vfio-common.h" #include "hw/hw.h" @@ -135,15 +140,90 @@ static void vfio_prereg_listener_region_del(MemoryListener *listener, trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0); } -const MemoryListener vfio_prereg_listener = { +static const MemoryListener vfio_prereg_listener = { .name = "vfio-pre-reg", .region_add = vfio_prereg_listener_region_add, .region_del = vfio_prereg_listener_region_del, }; -int vfio_spapr_create_window(VFIOContainer *container, - MemoryRegionSection *section, - hwaddr *pgsize) +static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, + hwaddr max_iova, uint64_t iova_pgsizes) +{ + VFIOHostDMAWindow *hostwin; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + min_iova, + max_iova - min_iova + 1)) { + hw_error("%s: Overlapped IOMMU are not enabled", __func__); + } + } + + hostwin = g_malloc0(sizeof(*hostwin)); + + hostwin->min_iova = min_iova; + hostwin->max_iova = max_iova; + hostwin->iova_pgsizes = iova_pgsizes; + QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); +} + +static int vfio_host_win_del(VFIOContainer *container, + hwaddr min_iova, hwaddr max_iova) +{ + VFIOHostDMAWindow *hostwin; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { + QLIST_REMOVE(hostwin, hostwin_next); + g_free(hostwin); + return 0; + } + } + + return -1; +} + +static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, + hwaddr iova, hwaddr end) +{ + VFIOHostDMAWindow *hostwin; + bool hostwin_found = false; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { + hostwin_found = true; + break; + } + } + + return hostwin_found ? hostwin : NULL; +} + +static int vfio_spapr_remove_window(VFIOContainer *container, + hwaddr offset_within_address_space) +{ + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + .start_addr = offset_within_address_space, + }; + int ret; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + error_report("Failed to remove window at %"PRIx64, + (uint64_t)remove.start_addr); + return -errno; + } + + trace_vfio_spapr_remove_window(offset_within_address_space); + + return 0; +} + +static int vfio_spapr_create_window(VFIOContainer *container, + MemoryRegionSection *section, + hwaddr *pgsize) { int ret = 0; IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); @@ -233,23 +313,195 @@ int vfio_spapr_create_window(VFIOContainer *container, return 0; } -int vfio_spapr_remove_window(VFIOContainer *container, - hwaddr offset_within_address_space) +int vfio_container_add_section_window(VFIOContainer *container, + MemoryRegionSection *section, + Error **errp) { - struct vfio_iommu_spapr_tce_remove remove = { - .argsz = sizeof(remove), - .start_addr = offset_within_address_space, - }; + VFIOHostDMAWindow *hostwin; + hwaddr pgsize = 0; int ret; - ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + /* + * VFIO_SPAPR_TCE_IOMMU supports a single host window between + * [dma32_window_start, dma32_window_size), we need to ensure + * the section fall in this range. + */ + if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { + hwaddr iova, end; + + iova = section->offset_within_address_space; + end = iova + int128_get64(section->size) - 1; + + if (!vfio_find_hostwin(container, iova, end)) { + error_setg(errp, "Container %p can't map guest IOVA region" + " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, + iova, end); + return -EINVAL; + } + return 0; + } + + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { + return 0; + } + + /* For now intersections are not allowed, we may relax this later */ + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + section->offset_within_address_space, + int128_get64(section->size))) { + error_setg(errp, + "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing" + "host DMA window [0x%"PRIx64",0x%"PRIx64"]", + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1, + hostwin->min_iova, hostwin->max_iova); + return -EINVAL; + } + } + + ret = vfio_spapr_create_window(container, section, &pgsize); if (ret) { - error_report("Failed to remove window at %"PRIx64, - (uint64_t)remove.start_addr); - return -errno; + error_setg_errno(errp, -ret, "Failed to create SPAPR window"); + return ret; } - trace_vfio_spapr_remove_window(offset_within_address_space); + vfio_host_win_add(container, section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1, pgsize); +#ifdef CONFIG_KVM + if (kvm_enabled()) { + VFIOGroup *group; + IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); + struct kvm_vfio_spapr_tce param; + struct kvm_device_attr attr = { + .group = KVM_DEV_VFIO_GROUP, + .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, + .addr = (uint64_t)(unsigned long)¶m, + }; + + if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD, + ¶m.tablefd)) { + QLIST_FOREACH(group, &container->group_list, container_next) { + param.groupfd = group->fd; + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { + error_setg_errno(errp, errno, + "vfio: failed GROUP_SET_SPAPR_TCE for " + "KVM VFIO device %d and group fd %d", + param.tablefd, param.groupfd); + return -errno; + } + trace_vfio_spapr_group_attach(param.groupfd, param.tablefd); + } + } + } +#endif + return 0; +} + +void vfio_container_del_section_window(VFIOContainer *container, + MemoryRegionSection *section) +{ + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { + return; + } + + vfio_spapr_remove_window(container, + section->offset_within_address_space); + if (vfio_host_win_del(container, + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1) < 0) { + hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx, + __func__, section->offset_within_address_space); + } +} + +int vfio_spapr_container_init(VFIOContainer *container, Error **errp) +{ + struct vfio_iommu_spapr_tce_info info; + bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; + int ret, fd = container->fd; + + QLIST_INIT(&container->hostwin_list); + + /* + * The host kernel code implementing VFIO_IOMMU_DISABLE is called + * when container fd is closed so we do not call it explicitly + * in this file. + */ + if (!v2) { + ret = ioctl(fd, VFIO_IOMMU_ENABLE); + if (ret) { + error_setg_errno(errp, errno, "failed to enable container"); + return -errno; + } + } else { + container->prereg_listener = vfio_prereg_listener; + + memory_listener_register(&container->prereg_listener, + &address_space_memory); + if (container->error) { + ret = -1; + error_propagate_prepend(errp, container->error, + "RAM memory listener initialization failed: "); + goto listener_unregister_exit; + } + } + + info.argsz = sizeof(info); + ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + if (ret) { + error_setg_errno(errp, errno, + "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed"); + ret = -errno; + goto listener_unregister_exit; + } + + if (v2) { + container->pgsizes = info.ddw.pgsizes; + /* + * There is a default window in just created container. + * To make region_add/del simpler, we better remove this + * window now and let those iommu_listener callbacks + * create/remove them when needed. + */ + ret = vfio_spapr_remove_window(container, info.dma32_window_start); + if (ret) { + error_setg_errno(errp, -ret, + "failed to remove existing window"); + goto listener_unregister_exit; + } + } else { + /* The default table uses 4K pages */ + container->pgsizes = 0x1000; + vfio_host_win_add(container, info.dma32_window_start, + info.dma32_window_start + + info.dma32_window_size - 1, + 0x1000); + } return 0; + +listener_unregister_exit: + if (v2) { + memory_listener_unregister(&container->prereg_listener); + } + return ret; +} + +void vfio_spapr_container_deinit(VFIOContainer *container) +{ + VFIOHostDMAWindow *hostwin, *next; + + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + memory_listener_unregister(&container->prereg_listener); + } + QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, + next) { + QLIST_REMOVE(hostwin, hostwin_next); + g_free(hostwin); + } } diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 0af7a2886c..637cac4edf 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -135,6 +135,7 @@ virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s" virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s" virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" virtio_iommu_freeze_granule(uint64_t page_size_mask) "granule set to 0x%"PRIx64 +virtio_iommu_host_resv_regions(const char *name, uint32_t index, uint64_t lob, uint64_t upb) "mr=%s host-resv-reg[%d] = [0x%"PRIx64",0x%"PRIx64"]" # virtio-mem.c virtio_mem_send_response(uint16_t type) "type=%" PRIu16 diff --git a/hw/virtio/virtio-iommu-pci.c b/hw/virtio/virtio-iommu-pci.c index 7ef2f9dcdb..9459fbf6ed 100644 --- a/hw/virtio/virtio-iommu-pci.c +++ b/hw/virtio/virtio-iommu-pci.c @@ -37,7 +37,7 @@ struct VirtIOIOMMUPCI { static Property virtio_iommu_pci_properties[] = { DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0), DEFINE_PROP_ARRAY("reserved-regions", VirtIOIOMMUPCI, - vdev.nb_reserved_regions, vdev.reserved_regions, + vdev.nr_prop_resv_regions, vdev.prop_resv_regions, qdev_prop_reserved_region, ReservedRegion), DEFINE_PROP_END_OF_LIST(), }; @@ -54,9 +54,9 @@ static void virtio_iommu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) "for the virtio-iommu-pci device"); return; } - for (int i = 0; i < s->nb_reserved_regions; i++) { - if (s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_RESERVED && - s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_MSI) { + for (int i = 0; i < s->nr_prop_resv_regions; i++) { + if (s->prop_resv_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_RESERVED && + s->prop_resv_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_MSI) { error_setg(errp, "reserved region %d has an invalid type", i); error_append_hint(errp, "Valid values are 0 and 1\n"); return; diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index be51635895..89fb5767d1 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -20,12 +20,15 @@ #include "qemu/osdep.h" #include "qemu/log.h" #include "qemu/iov.h" +#include "qemu/range.h" +#include "qemu/reserved-region.h" #include "exec/target_page.h" #include "hw/qdev-properties.h" #include "hw/virtio/virtio.h" #include "sysemu/kvm.h" #include "sysemu/reset.h" #include "sysemu/sysemu.h" +#include "qemu/reserved-region.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "trace.h" @@ -378,6 +381,19 @@ static void virtio_iommu_put_domain(gpointer data) g_free(domain); } +static void add_prop_resv_regions(IOMMUDevice *sdev) +{ + VirtIOIOMMU *s = sdev->viommu; + int i; + + for (i = 0; i < s->nr_prop_resv_regions; i++) { + ReservedRegion *reg = g_new0(ReservedRegion, 1); + + *reg = s->prop_resv_regions[i]; + sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg); + } +} + static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, int devfn) { @@ -408,6 +424,7 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX); address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU); + add_prop_resv_regions(sdev); /* * Build the IOMMU disabled container with aliases to the @@ -444,6 +461,10 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, return &sdev->as; } +static const PCIIOMMUOps virtio_iommu_ops = { + .get_address_space = virtio_iommu_find_add_as, +}; + static int virtio_iommu_attach(VirtIOIOMMU *s, struct virtio_iommu_req_attach *req) { @@ -624,29 +645,30 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s, return ret; } -static ssize_t virtio_iommu_fill_resv_mem_prop(VirtIOIOMMU *s, uint32_t ep, +static ssize_t virtio_iommu_fill_resv_mem_prop(IOMMUDevice *sdev, uint32_t ep, uint8_t *buf, size_t free) { struct virtio_iommu_probe_resv_mem prop = {}; size_t size = sizeof(prop), length = size - sizeof(prop.head), total; - int i; - - total = size * s->nb_reserved_regions; + GList *l; + total = size * g_list_length(sdev->resv_regions); if (total > free) { return -ENOSPC; } - for (i = 0; i < s->nb_reserved_regions; i++) { - unsigned subtype = s->reserved_regions[i].type; + for (l = sdev->resv_regions; l; l = l->next) { + ReservedRegion *reg = l->data; + unsigned subtype = reg->type; + Range *range = ®->range; assert(subtype == VIRTIO_IOMMU_RESV_MEM_T_RESERVED || subtype == VIRTIO_IOMMU_RESV_MEM_T_MSI); prop.head.type = cpu_to_le16(VIRTIO_IOMMU_PROBE_T_RESV_MEM); prop.head.length = cpu_to_le16(length); prop.subtype = subtype; - prop.start = cpu_to_le64(s->reserved_regions[i].low); - prop.end = cpu_to_le64(s->reserved_regions[i].high); + prop.start = cpu_to_le64(range_lob(range)); + prop.end = cpu_to_le64(range_upb(range)); memcpy(buf, &prop, size); @@ -666,19 +688,27 @@ static int virtio_iommu_probe(VirtIOIOMMU *s, uint8_t *buf) { uint32_t ep_id = le32_to_cpu(req->endpoint); + IOMMUMemoryRegion *iommu_mr = virtio_iommu_mr(s, ep_id); size_t free = VIOMMU_PROBE_SIZE; + IOMMUDevice *sdev; ssize_t count; - if (!virtio_iommu_mr(s, ep_id)) { + if (!iommu_mr) { return VIRTIO_IOMMU_S_NOENT; } - count = virtio_iommu_fill_resv_mem_prop(s, ep_id, buf, free); + sdev = container_of(iommu_mr, IOMMUDevice, iommu_mr); + if (!sdev) { + return -EINVAL; + } + + count = virtio_iommu_fill_resv_mem_prop(sdev, ep_id, buf, free); if (count < 0) { return VIRTIO_IOMMU_S_INVAL; } buf += count; free -= count; + sdev->probe_done = true; return VIRTIO_IOMMU_S_OK; } @@ -856,7 +886,7 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, bool bypass_allowed; int granule; bool found; - int i; + GList *l; interval.low = addr; interval.high = addr + 1; @@ -894,10 +924,10 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, goto unlock; } - for (i = 0; i < s->nb_reserved_regions; i++) { - ReservedRegion *reg = &s->reserved_regions[i]; + for (l = sdev->resv_regions; l; l = l->next) { + ReservedRegion *reg = l->data; - if (addr >= reg->low && addr <= reg->high) { + if (range_contains(®->range, addr)) { switch (reg->type) { case VIRTIO_IOMMU_RESV_MEM_T_MSI: entry.perm = flag; @@ -1131,6 +1161,106 @@ static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr, return 0; } +/** + * rebuild_resv_regions: rebuild resv regions with both the + * info of host resv ranges and property set resv ranges + */ +static int rebuild_resv_regions(IOMMUDevice *sdev) +{ + GList *l; + int i = 0; + + /* free the existing list and rebuild it from scratch */ + g_list_free_full(sdev->resv_regions, g_free); + sdev->resv_regions = NULL; + + /* First add host reserved regions if any, all tagged as RESERVED */ + for (l = sdev->host_resv_ranges; l; l = l->next) { + ReservedRegion *reg = g_new0(ReservedRegion, 1); + Range *r = (Range *)l->data; + + reg->type = VIRTIO_IOMMU_RESV_MEM_T_RESERVED; + range_set_bounds(®->range, range_lob(r), range_upb(r)); + sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg); + trace_virtio_iommu_host_resv_regions(sdev->iommu_mr.parent_obj.name, i, + range_lob(®->range), + range_upb(®->range)); + i++; + } + /* + * then add higher priority reserved regions set by the machine + * through properties + */ + add_prop_resv_regions(sdev); + return 0; +} + +/** + * virtio_iommu_set_iova_ranges: Conveys the usable IOVA ranges + * + * The function turns those into reserved ranges. Once some + * reserved ranges have been set, new reserved regions cannot be + * added outside of the original ones. + * + * @mr: IOMMU MR + * @iova_ranges: list of usable IOVA ranges + * @errp: error handle + */ +static int virtio_iommu_set_iova_ranges(IOMMUMemoryRegion *mr, + GList *iova_ranges, + Error **errp) +{ + IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr); + GList *current_ranges = sdev->host_resv_ranges; + GList *l, *tmp, *new_ranges = NULL; + int ret = -EINVAL; + + /* check that each new resv region is included in an existing one */ + if (sdev->host_resv_ranges) { + range_inverse_array(iova_ranges, + &new_ranges, + 0, UINT64_MAX); + + for (tmp = new_ranges; tmp; tmp = tmp->next) { + Range *newr = (Range *)tmp->data; + bool included = false; + + for (l = current_ranges; l; l = l->next) { + Range * r = (Range *)l->data; + + if (range_contains_range(r, newr)) { + included = true; + break; + } + } + if (!included) { + goto error; + } + } + /* all new reserved ranges are included in existing ones */ + ret = 0; + goto out; + } + + if (sdev->probe_done) { + warn_report("%s: Notified about new host reserved regions after probe", + mr->parent_obj.name); + } + + range_inverse_array(iova_ranges, + &sdev->host_resv_ranges, + 0, UINT64_MAX); + rebuild_resv_regions(sdev); + + return 0; +error: + error_setg(errp, "IOMMU mr=%s Conflicting host reserved ranges set!", + mr->parent_obj.name); +out: + g_list_free_full(new_ranges, g_free); + return ret; +} + static void virtio_iommu_system_reset(void *opaque) { VirtIOIOMMU *s = opaque; @@ -1206,7 +1336,7 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free); if (s->primary_bus) { - pci_setup_iommu(s->primary_bus, virtio_iommu_find_add_as, s); + pci_setup_iommu(s->primary_bus, &virtio_iommu_ops, s); } else { error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!"); } @@ -1426,6 +1556,7 @@ static void virtio_iommu_memory_region_class_init(ObjectClass *klass, imrc->replay = virtio_iommu_replay; imrc->notify_flag_changed = virtio_iommu_notify_flag_changed; imrc->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask; + imrc->iommu_set_iova_ranges = virtio_iommu_set_iova_ranges; } static const TypeInfo virtio_iommu_info = { diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c index cc24812d2e..c3512c2dae 100644 --- a/hw/virtio/virtio-pmem.c +++ b/hw/virtio/virtio-pmem.c @@ -147,7 +147,10 @@ static void virtio_pmem_fill_device_info(const VirtIOPMEM *pmem, static MemoryRegion *virtio_pmem_get_memory_region(VirtIOPMEM *pmem, Error **errp) { - assert(pmem->memdev); + if (!pmem->memdev) { + error_setg(errp, "'%s' property must be set", VIRTIO_PMEM_MEMDEV_PROP); + return NULL; + } return &pmem->memdev->mr; } |