diff options
Diffstat (limited to 'hw')
27 files changed, 2869 insertions, 93 deletions
diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c index a07cd7eb5d..bfa53960c3 100644 --- a/hw/block/xen-block.c +++ b/hw/block/xen-block.c @@ -115,9 +115,13 @@ static void xen_block_connect(XenDevice *xendev, Error **errp) return; } - if (xen_device_frontend_scanf(xendev, "protocol", "%ms", - &str) != 1) { - protocol = BLKIF_PROTOCOL_NATIVE; + if (xen_device_frontend_scanf(xendev, "protocol", "%ms", &str) != 1) { + /* x86 defaults to the 32-bit protocol even for 64-bit guests. */ + if (object_dynamic_cast(OBJECT(qdev_get_machine()), "x86-machine")) { + protocol = BLKIF_PROTOCOL_X86_32; + } else { + protocol = BLKIF_PROTOCOL_NATIVE; + } } else { if (strcmp(str, XEN_IO_PROTO_ABI_X86_32) == 0) { protocol = BLKIF_PROTOCOL_X86_32; diff --git a/hw/core/machine-hmp-cmds.c b/hw/core/machine-hmp-cmds.c index 9a4b59c6f2..a6ff6a4875 100644 --- a/hw/core/machine-hmp-cmds.c +++ b/hw/core/machine-hmp-cmds.c @@ -253,6 +253,7 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict) MemoryDeviceInfo *value; PCDIMMDeviceInfo *di; SgxEPCDeviceInfo *se; + HvBalloonDeviceInfo *hi; for (info = info_list; info; info = info->next) { value = info->value; @@ -310,6 +311,20 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict) monitor_printf(mon, " node: %" PRId64 "\n", se->node); monitor_printf(mon, " memdev: %s\n", se->memdev); break; + case MEMORY_DEVICE_INFO_KIND_HV_BALLOON: + hi = value->u.hv_balloon.data; + monitor_printf(mon, "Memory device [%s]: \"%s\"\n", + MemoryDeviceInfoKind_str(value->type), + hi->id ? hi->id : ""); + if (hi->has_memaddr) { + monitor_printf(mon, " memaddr: 0x%" PRIx64 "\n", + hi->memaddr); + } + monitor_printf(mon, " max-size: %" PRIu64 "\n", hi->max_size); + if (hi->memdev) { + monitor_printf(mon, " memdev: %s\n", hi->memdev); + } + break; default: g_assert_not_reached(); } diff --git a/hw/display/ati.c b/hw/display/ati.c index 6e38e00502..9a87a5504a 100644 --- a/hw/display/ati.c +++ b/hw/display/ati.c @@ -319,11 +319,13 @@ static uint64_t ati_mm_read(void *opaque, hwaddr addr, unsigned int size) case DAC_CNTL: val = s->regs.dac_cntl; break; - case GPIO_VGA_DDC: - val = s->regs.gpio_vga_ddc; + case GPIO_VGA_DDC ... GPIO_VGA_DDC + 3: + val = ati_reg_read_offs(s->regs.gpio_vga_ddc, + addr - GPIO_VGA_DDC, size); break; - case GPIO_DVI_DDC: - val = s->regs.gpio_dvi_ddc; + case GPIO_DVI_DDC ... GPIO_DVI_DDC + 3: + val = ati_reg_read_offs(s->regs.gpio_dvi_ddc, + addr - GPIO_DVI_DDC, size); break; case GPIO_MONID ... GPIO_MONID + 3: val = ati_reg_read_offs(s->regs.gpio_monid, @@ -337,6 +339,9 @@ static uint64_t ati_mm_read(void *opaque, hwaddr addr, unsigned int size) case PALETTE_DATA: val = vga_ioport_read(&s->vga, VGA_PEL_D); break; + case PALETTE_30_DATA: + val = s->regs.palette[vga_ioport_read(&s->vga, VGA_PEL_IR)]; + break; case CNFG_CNTL: val = s->regs.config_cntl; break; @@ -349,14 +354,17 @@ static uint64_t ati_mm_read(void *opaque, hwaddr addr, unsigned int size) PCI_BASE_ADDRESS_0, size) & 0xfffffff0; break; case CONFIG_APER_SIZE: - val = s->vga.vram_size; + val = s->vga.vram_size / 2; break; case CONFIG_REG_1_BASE: val = pci_default_read_config(&s->dev, PCI_BASE_ADDRESS_2, size) & 0xfffffff0; break; case CONFIG_REG_APER_SIZE: - val = memory_region_size(&s->mm); + val = memory_region_size(&s->mm) / 2; + break; + case HOST_PATH_CNTL: + val = BIT(23); /* Radeon HDP_APER_CNTL */ break; case MC_STATUS: val = 5; @@ -612,29 +620,34 @@ static void ati_mm_write(void *opaque, hwaddr addr, s->regs.dac_cntl = data & 0xffffe3ff; s->vga.dac_8bit = !!(data & DAC_8BIT_EN); break; - case GPIO_VGA_DDC: + /* + * GPIO regs for DDC access. Because some drivers access these via + * multiple byte writes we have to be careful when we send bits to + * avoid spurious changes in bitbang_i2c state. Only do it when either + * the enable bits are changed or output bits changed while enabled. + */ + case GPIO_VGA_DDC ... GPIO_VGA_DDC + 3: if (s->dev_id != PCI_DEVICE_ID_ATI_RAGE128_PF) { /* FIXME: Maybe add a property to select VGA or DVI port? */ } break; - case GPIO_DVI_DDC: + case GPIO_DVI_DDC ... GPIO_DVI_DDC + 3: if (s->dev_id != PCI_DEVICE_ID_ATI_RAGE128_PF) { - s->regs.gpio_dvi_ddc = ati_i2c(&s->bbi2c, data, 0); + ati_reg_write_offs(&s->regs.gpio_dvi_ddc, + addr - GPIO_DVI_DDC, data, size); + if ((addr <= GPIO_DVI_DDC + 2 && addr + size > GPIO_DVI_DDC + 2) || + (addr == GPIO_DVI_DDC && (s->regs.gpio_dvi_ddc & 0x30000))) { + s->regs.gpio_dvi_ddc = ati_i2c(&s->bbi2c, + s->regs.gpio_dvi_ddc, 0); + } } break; case GPIO_MONID ... GPIO_MONID + 3: /* FIXME What does Radeon have here? */ if (s->dev_id == PCI_DEVICE_ID_ATI_RAGE128_PF) { + /* Rage128p accesses DDC via MONID(1-2) with additional mask bit */ ati_reg_write_offs(&s->regs.gpio_monid, addr - GPIO_MONID, data, size); - /* - * Rage128p accesses DDC used to get EDID via these bits. - * Because some drivers access this via multiple byte writes - * we have to be careful when we send bits to avoid spurious - * changes in bitbang_i2c state. So only do it when mask is set - * and either the enable bits are changed or output bits changed - * while enabled. - */ if ((s->regs.gpio_monid & BIT(25)) && ((addr <= GPIO_MONID + 2 && addr + size > GPIO_MONID + 2) || (addr == GPIO_MONID && (s->regs.gpio_monid & 0x60000)))) { @@ -663,6 +676,12 @@ static void ati_mm_write(void *opaque, hwaddr addr, data >>= 8; vga_ioport_write(&s->vga, VGA_PEL_D, data & 0xff); break; + case PALETTE_30_DATA: + s->regs.palette[vga_ioport_read(&s->vga, VGA_PEL_IW)] = data; + vga_ioport_write(&s->vga, VGA_PEL_D, (data >> 22) & 0xff); + vga_ioport_write(&s->vga, VGA_PEL_D, (data >> 12) & 0xff); + vga_ioport_write(&s->vga, VGA_PEL_D, (data >> 2) & 0xff); + break; case CNFG_CNTL: s->regs.config_cntl = data; break; @@ -1014,6 +1033,7 @@ static Property ati_vga_properties[] = { DEFINE_PROP_UINT16("x-device-id", ATIVGAState, dev_id, PCI_DEVICE_ID_ATI_RAGE128_PF), DEFINE_PROP_BOOL("guest_hwcursor", ATIVGAState, cursor_guest_mode, false), + DEFINE_PROP_UINT8("x-pixman", ATIVGAState, use_pixman, 3), DEFINE_PROP_END_OF_LIST() }; @@ -1035,11 +1055,18 @@ static void ati_vga_class_init(ObjectClass *klass, void *data) k->exit = ati_vga_exit; } +static void ati_vga_init(Object *o) +{ + object_property_set_description(o, "x-pixman", "Use pixman for: " + "1: fill, 2: blit"); +} + static const TypeInfo ati_vga_info = { .name = TYPE_ATI_VGA, .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(ATIVGAState), .class_init = ati_vga_class_init, + .instance_init = ati_vga_init, .interfaces = (InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, { }, diff --git a/hw/display/ati_2d.c b/hw/display/ati_2d.c index 7d786653e8..0e6b8e4367 100644 --- a/hw/display/ati_2d.c +++ b/hw/display/ati_2d.c @@ -92,6 +92,7 @@ void ati_2d_blt(ATIVGAState *s) switch (s->regs.dp_mix & GMC_ROP3_MASK) { case ROP3_SRCCOPY: { + bool fallback = false; unsigned src_x = (s->regs.dp_cntl & DST_X_LEFT_TO_RIGHT ? s->regs.src_x : s->regs.src_x + 1 - s->regs.dst_width); unsigned src_y = (s->regs.dp_cntl & DST_Y_TOP_TO_BOTTOM ? @@ -122,27 +123,50 @@ void ati_2d_blt(ATIVGAState *s) src_bits, dst_bits, src_stride, dst_stride, bpp, bpp, src_x, src_y, dst_x, dst_y, s->regs.dst_width, s->regs.dst_height); - if (s->regs.dp_cntl & DST_X_LEFT_TO_RIGHT && + if ((s->use_pixman & BIT(1)) && + s->regs.dp_cntl & DST_X_LEFT_TO_RIGHT && s->regs.dp_cntl & DST_Y_TOP_TO_BOTTOM) { - pixman_blt((uint32_t *)src_bits, (uint32_t *)dst_bits, - src_stride, dst_stride, bpp, bpp, - src_x, src_y, dst_x, dst_y, - s->regs.dst_width, s->regs.dst_height); - } else { + fallback = !pixman_blt((uint32_t *)src_bits, (uint32_t *)dst_bits, + src_stride, dst_stride, bpp, bpp, + src_x, src_y, dst_x, dst_y, + s->regs.dst_width, s->regs.dst_height); + } else if (s->use_pixman & BIT(1)) { /* FIXME: We only really need a temporary if src and dst overlap */ int llb = s->regs.dst_width * (bpp / 8); int tmp_stride = DIV_ROUND_UP(llb, sizeof(uint32_t)); uint32_t *tmp = g_malloc(tmp_stride * sizeof(uint32_t) * s->regs.dst_height); - pixman_blt((uint32_t *)src_bits, tmp, - src_stride, tmp_stride, bpp, bpp, - src_x, src_y, 0, 0, - s->regs.dst_width, s->regs.dst_height); - pixman_blt(tmp, (uint32_t *)dst_bits, - tmp_stride, dst_stride, bpp, bpp, - 0, 0, dst_x, dst_y, - s->regs.dst_width, s->regs.dst_height); + fallback = !pixman_blt((uint32_t *)src_bits, tmp, + src_stride, tmp_stride, bpp, bpp, + src_x, src_y, 0, 0, + s->regs.dst_width, s->regs.dst_height); + if (!fallback) { + fallback = !pixman_blt(tmp, (uint32_t *)dst_bits, + tmp_stride, dst_stride, bpp, bpp, + 0, 0, dst_x, dst_y, + s->regs.dst_width, s->regs.dst_height); + } g_free(tmp); + } else { + fallback = true; + } + if (fallback) { + unsigned int y, i, j, bypp = bpp / 8; + unsigned int src_pitch = src_stride * sizeof(uint32_t); + unsigned int dst_pitch = dst_stride * sizeof(uint32_t); + + for (y = 0; y < s->regs.dst_height; y++) { + i = dst_x * bypp; + j = src_x * bypp; + if (s->regs.dp_cntl & DST_Y_TOP_TO_BOTTOM) { + i += (dst_y + y) * dst_pitch; + j += (src_y + y) * src_pitch; + } else { + i += (dst_y + s->regs.dst_height - 1 - y) * dst_pitch; + j += (src_y + s->regs.dst_height - 1 - y) * src_pitch; + } + memmove(&dst_bits[i], &src_bits[j], s->regs.dst_width * bypp); + } } if (dst_bits >= s->vga.vram_ptr + s->vga.vbe_start_addr && dst_bits < s->vga.vram_ptr + s->vga.vbe_start_addr + @@ -180,14 +204,21 @@ void ati_2d_blt(ATIVGAState *s) dst_stride /= sizeof(uint32_t); DPRINTF("pixman_fill(%p, %d, %d, %d, %d, %d, %d, %x)\n", - dst_bits, dst_stride, bpp, - dst_x, dst_y, - s->regs.dst_width, s->regs.dst_height, - filler); - pixman_fill((uint32_t *)dst_bits, dst_stride, bpp, - dst_x, dst_y, - s->regs.dst_width, s->regs.dst_height, - filler); + dst_bits, dst_stride, bpp, dst_x, dst_y, + s->regs.dst_width, s->regs.dst_height, filler); + if (!(s->use_pixman & BIT(0)) || + !pixman_fill((uint32_t *)dst_bits, dst_stride, bpp, dst_x, dst_y, + s->regs.dst_width, s->regs.dst_height, filler)) { + /* fallback when pixman failed or we don't want to call it */ + unsigned int x, y, i, bypp = bpp / 8; + unsigned int dst_pitch = dst_stride * sizeof(uint32_t); + for (y = 0; y < s->regs.dst_height; y++) { + i = dst_x * bypp + (dst_y + y) * dst_pitch; + for (x = 0; x < s->regs.dst_width; x++, i += bypp) { + stn_he_p(&dst_bits[i], bypp, filler); + } + } + } if (dst_bits >= s->vga.vram_ptr + s->vga.vbe_start_addr && dst_bits < s->vga.vram_ptr + s->vga.vbe_start_addr + s->vga.vbe_regs[VBE_DISPI_INDEX_YRES] * s->vga.vbe_line_offset) { diff --git a/hw/display/ati_dbg.c b/hw/display/ati_dbg.c index bd0ecd48c7..3ffa7f35df 100644 --- a/hw/display/ati_dbg.c +++ b/hw/display/ati_dbg.c @@ -30,6 +30,7 @@ static struct ati_regdesc ati_reg_names[] = { {"AMCGPIO_EN_MIR", 0x00a8}, {"PALETTE_INDEX", 0x00b0}, {"PALETTE_DATA", 0x00b4}, + {"PALETTE_30_DATA", 0x00b8}, {"CNFG_CNTL", 0x00e0}, {"GEN_RESET_CNTL", 0x00f0}, {"CNFG_MEMSIZE", 0x00f8}, @@ -38,6 +39,7 @@ static struct ati_regdesc ati_reg_names[] = { {"CONFIG_APER_SIZE", 0x0108}, {"CONFIG_REG_1_BASE", 0x010c}, {"CONFIG_REG_APER_SIZE", 0x0110}, + {"HOST_PATH_CNTL", 0x0130}, {"MEM_CNTL", 0x0140}, {"MC_FB_LOCATION", 0x0148}, {"MC_AGP_LOCATION", 0x014C}, diff --git a/hw/display/ati_int.h b/hw/display/ati_int.h index e8d3c7af75..f5a47b82b0 100644 --- a/hw/display/ati_int.h +++ b/hw/display/ati_int.h @@ -44,6 +44,7 @@ typedef struct ATIVGARegs { uint32_t gpio_dvi_ddc; uint32_t gpio_monid; uint32_t config_cntl; + uint32_t palette[256]; uint32_t crtc_h_total_disp; uint32_t crtc_h_sync_strt_wid; uint32_t crtc_v_total_disp; @@ -89,6 +90,7 @@ struct ATIVGAState { char *model; uint16_t dev_id; uint8_t mode; + uint8_t use_pixman; bool cursor_guest_mode; uint16_t cursor_size; uint32_t cursor_offset; diff --git a/hw/display/ati_regs.h b/hw/display/ati_regs.h index d6282b2ef2..d7127748ff 100644 --- a/hw/display/ati_regs.h +++ b/hw/display/ati_regs.h @@ -48,6 +48,7 @@ #define AMCGPIO_EN_MIR 0x00a8 #define PALETTE_INDEX 0x00b0 #define PALETTE_DATA 0x00b4 +#define PALETTE_30_DATA 0x00b8 #define CNFG_CNTL 0x00e0 #define GEN_RESET_CNTL 0x00f0 #define CNFG_MEMSIZE 0x00f8 @@ -56,6 +57,7 @@ #define CONFIG_APER_SIZE 0x0108 #define CONFIG_REG_1_BASE 0x010c #define CONFIG_REG_APER_SIZE 0x0110 +#define HOST_PATH_CNTL 0x0130 #define MEM_CNTL 0x0140 #define MC_FB_LOCATION 0x0148 #define MC_AGP_LOCATION 0x014C diff --git a/hw/display/macfb.c b/hw/display/macfb.c index 2f8e016566..d61541ccb5 100644 --- a/hw/display/macfb.c +++ b/hw/display/macfb.c @@ -36,8 +36,8 @@ #define DAFB_INTR_MASK 0x104 #define DAFB_INTR_STAT 0x108 #define DAFB_INTR_CLEAR 0x10c -#define DAFB_RESET 0x200 -#define DAFB_LUT 0x213 +#define DAFB_LUT_INDEX 0x200 +#define DAFB_LUT 0x210 #define DAFB_INTR_VBL 0x4 @@ -537,6 +537,11 @@ static uint64_t macfb_ctrl_read(void *opaque, case DAFB_MODE_SENSE: val = macfb_sense_read(s); break; + case DAFB_LUT ... DAFB_LUT + 3: + val = s->color_palette[s->palette_current]; + s->palette_current = (s->palette_current + 1) % + ARRAY_SIZE(s->color_palette); + break; default: if (addr < MACFB_CTRL_TOPADDR) { val = s->regs[addr >> 2]; @@ -583,13 +588,11 @@ static void macfb_ctrl_write(void *opaque, s->regs[DAFB_INTR_STAT >> 2] &= ~DAFB_INTR_VBL; macfb_update_irq(s); break; - case DAFB_RESET: - s->palette_current = 0; - s->regs[DAFB_INTR_STAT >> 2] &= ~DAFB_INTR_VBL; - macfb_update_irq(s); + case DAFB_LUT_INDEX: + s->palette_current = (val & 0xff) * 3; break; - case DAFB_LUT: - s->color_palette[s->palette_current] = val; + case DAFB_LUT ... DAFB_LUT + 3: + s->color_palette[s->palette_current] = val & 0xff; s->palette_current = (s->palette_current + 1) % ARRAY_SIZE(s->color_palette); if (s->palette_current % 3) { diff --git a/hw/display/virtio-gpu-pci-rutabaga.c b/hw/display/virtio-gpu-pci-rutabaga.c index c96729e198..abbb898c65 100644 --- a/hw/display/virtio-gpu-pci-rutabaga.c +++ b/hw/display/virtio-gpu-pci-rutabaga.c @@ -36,6 +36,7 @@ static const TypeInfo virtio_gpu_rutabaga_pci_info[] = { .instance_init = virtio_gpu_rutabaga_initfn, .interfaces = (InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, } }, }; diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c index 4265316cbb..2707bceea8 100644 --- a/hw/display/virtio-gpu.c +++ b/hw/display/virtio-gpu.c @@ -1213,6 +1213,9 @@ static int virtio_gpu_save(QEMUFile *f, void *opaque, size_t size, assert(QTAILQ_EMPTY(&g->cmdq)); QTAILQ_FOREACH(res, &g->reslist, next) { + if (res->blob_size) { + continue; + } qemu_put_be32(f, res->resource_id); qemu_put_be32(f, res->width); qemu_put_be32(f, res->height); @@ -1230,12 +1233,40 @@ static int virtio_gpu_save(QEMUFile *f, void *opaque, size_t size, return vmstate_save_state(f, &vmstate_virtio_gpu_scanouts, g, NULL); } +static bool virtio_gpu_load_restore_mapping(VirtIOGPU *g, + struct virtio_gpu_simple_resource *res) +{ + int i; + + for (i = 0; i < res->iov_cnt; i++) { + hwaddr len = res->iov[i].iov_len; + res->iov[i].iov_base = + dma_memory_map(VIRTIO_DEVICE(g)->dma_as, res->addrs[i], &len, + DMA_DIRECTION_TO_DEVICE, MEMTXATTRS_UNSPECIFIED); + + if (!res->iov[i].iov_base || len != res->iov[i].iov_len) { + /* Clean up the half-a-mapping we just created... */ + if (res->iov[i].iov_base) { + dma_memory_unmap(VIRTIO_DEVICE(g)->dma_as, res->iov[i].iov_base, + len, DMA_DIRECTION_TO_DEVICE, 0); + } + /* ...and the mappings for previous loop iterations */ + res->iov_cnt = i; + virtio_gpu_cleanup_mapping(g, res); + return false; + } + } + + QTAILQ_INSERT_HEAD(&g->reslist, res, next); + g->hostmem += res->hostmem; + return true; +} + static int virtio_gpu_load(QEMUFile *f, void *opaque, size_t size, const VMStateField *field) { VirtIOGPU *g = opaque; struct virtio_gpu_simple_resource *res; - struct virtio_gpu_scanout *scanout; uint32_t resource_id, pformat; void *bits = NULL; int i; @@ -1294,40 +1325,96 @@ static int virtio_gpu_load(QEMUFile *f, void *opaque, size_t size, qemu_get_buffer(f, (void *)pixman_image_get_data(res->image), pixman_image_get_stride(res->image) * res->height); - /* restore mapping */ - for (i = 0; i < res->iov_cnt; i++) { - hwaddr len = res->iov[i].iov_len; - res->iov[i].iov_base = - dma_memory_map(VIRTIO_DEVICE(g)->dma_as, res->addrs[i], &len, - DMA_DIRECTION_TO_DEVICE, - MEMTXATTRS_UNSPECIFIED); - - if (!res->iov[i].iov_base || len != res->iov[i].iov_len) { - /* Clean up the half-a-mapping we just created... */ - if (res->iov[i].iov_base) { - dma_memory_unmap(VIRTIO_DEVICE(g)->dma_as, - res->iov[i].iov_base, - len, - DMA_DIRECTION_TO_DEVICE, - 0); - } - /* ...and the mappings for previous loop iterations */ - res->iov_cnt = i; - virtio_gpu_cleanup_mapping(g, res); - pixman_image_unref(res->image); - g_free(res); - return -EINVAL; - } + if (!virtio_gpu_load_restore_mapping(g, res)) { + pixman_image_unref(res->image); + g_free(res); + return -EINVAL; } - QTAILQ_INSERT_HEAD(&g->reslist, res, next); - g->hostmem += res->hostmem; - resource_id = qemu_get_be32(f); } /* load & apply scanout state */ vmstate_load_state(f, &vmstate_virtio_gpu_scanouts, g, 1); + + return 0; +} + +static int virtio_gpu_blob_save(QEMUFile *f, void *opaque, size_t size, + const VMStateField *field, JSONWriter *vmdesc) +{ + VirtIOGPU *g = opaque; + struct virtio_gpu_simple_resource *res; + int i; + + /* in 2d mode we should never find unprocessed commands here */ + assert(QTAILQ_EMPTY(&g->cmdq)); + + QTAILQ_FOREACH(res, &g->reslist, next) { + if (!res->blob_size) { + continue; + } + qemu_put_be32(f, res->resource_id); + qemu_put_be32(f, res->blob_size); + qemu_put_be32(f, res->iov_cnt); + for (i = 0; i < res->iov_cnt; i++) { + qemu_put_be64(f, res->addrs[i]); + qemu_put_be32(f, res->iov[i].iov_len); + } + } + qemu_put_be32(f, 0); /* end of list */ + + return 0; +} + +static int virtio_gpu_blob_load(QEMUFile *f, void *opaque, size_t size, + const VMStateField *field) +{ + VirtIOGPU *g = opaque; + struct virtio_gpu_simple_resource *res; + uint32_t resource_id; + int i; + + resource_id = qemu_get_be32(f); + while (resource_id != 0) { + res = virtio_gpu_find_resource(g, resource_id); + if (res) { + return -EINVAL; + } + + res = g_new0(struct virtio_gpu_simple_resource, 1); + res->resource_id = resource_id; + res->blob_size = qemu_get_be32(f); + res->iov_cnt = qemu_get_be32(f); + res->addrs = g_new(uint64_t, res->iov_cnt); + res->iov = g_new(struct iovec, res->iov_cnt); + + /* read data */ + for (i = 0; i < res->iov_cnt; i++) { + res->addrs[i] = qemu_get_be64(f); + res->iov[i].iov_len = qemu_get_be32(f); + } + + if (!virtio_gpu_load_restore_mapping(g, res)) { + g_free(res); + return -EINVAL; + } + + virtio_gpu_init_udmabuf(res); + + resource_id = qemu_get_be32(f); + } + + return 0; +} + +static int virtio_gpu_post_load(void *opaque, int version_id) +{ + VirtIOGPU *g = opaque; + struct virtio_gpu_scanout *scanout; + struct virtio_gpu_simple_resource *res; + int i; + for (i = 0; i < g->parent_obj.conf.max_outputs; i++) { /* FIXME: should take scanout.r.{x,y} into account */ scanout = &g->parent_obj.scanout[i]; @@ -1475,6 +1562,32 @@ virtio_gpu_set_config(VirtIODevice *vdev, const uint8_t *config) } } +static bool virtio_gpu_blob_state_needed(void *opaque) +{ + VirtIOGPU *g = VIRTIO_GPU(opaque); + + return virtio_gpu_blob_enabled(g->parent_obj.conf); +} + +const VMStateDescription vmstate_virtio_gpu_blob_state = { + .name = "virtio-gpu/blob", + .minimum_version_id = VIRTIO_GPU_VM_VERSION, + .version_id = VIRTIO_GPU_VM_VERSION, + .needed = virtio_gpu_blob_state_needed, + .fields = (const VMStateField[]){ + { + .name = "virtio-gpu/blob", + .info = &(const VMStateInfo) { + .name = "blob", + .get = virtio_gpu_blob_load, + .put = virtio_gpu_blob_save, + }, + .flags = VMS_SINGLE, + } /* device */, + VMSTATE_END_OF_LIST() + }, +}; + /* * For historical reasons virtio_gpu does not adhere to virtio migration * scheme as described in doc/virtio-migration.txt, in a sense that no @@ -1500,6 +1613,11 @@ static const VMStateDescription vmstate_virtio_gpu = { } /* device */, VMSTATE_END_OF_LIST() }, + .subsections = (const VMStateDescription * []) { + &vmstate_virtio_gpu_blob_state, + NULL + }, + .post_load = virtio_gpu_post_load, }; static Property virtio_gpu_properties[] = { diff --git a/hw/hyperv/Kconfig b/hw/hyperv/Kconfig index fcf65903bd..41dd827c84 100644 --- a/hw/hyperv/Kconfig +++ b/hw/hyperv/Kconfig @@ -16,3 +16,13 @@ config SYNDBG bool default y depends on VMBUS + +config HV_BALLOON_SUPPORTED + bool + +config HV_BALLOON + bool + default y + depends on VMBUS + depends on HV_BALLOON_POSSIBLE + depends on HV_BALLOON_SUPPORTED diff --git a/hw/hyperv/hv-balloon-internal.h b/hw/hyperv/hv-balloon-internal.h new file mode 100644 index 0000000000..164c2e5825 --- /dev/null +++ b/hw/hyperv/hv-balloon-internal.h @@ -0,0 +1,33 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef HW_HYPERV_HV_BALLOON_INTERNAL_H +#define HW_HYPERV_HV_BALLOON_INTERNAL_H + +#include "qemu/osdep.h" + +#define HV_BALLOON_PFN_SHIFT 12 +#define HV_BALLOON_PAGE_SIZE (1 << HV_BALLOON_PFN_SHIFT) + +#define SUM_OVERFLOW_U64(in1, in2) ((in1) > UINT64_MAX - (in2)) +#define SUM_SATURATE_U64(in1, in2) \ + ({ \ + uint64_t _in1 = (in1), _in2 = (in2); \ + uint64_t _result; \ + \ + if (!SUM_OVERFLOW_U64(_in1, _in2)) { \ + _result = _in1 + _in2; \ + } else { \ + _result = UINT64_MAX; \ + } \ + \ + _result; \ + }) + +#endif diff --git a/hw/hyperv/hv-balloon-our_range_memslots.c b/hw/hyperv/hv-balloon-our_range_memslots.c new file mode 100644 index 0000000000..99bae870f3 --- /dev/null +++ b/hw/hyperv/hv-balloon-our_range_memslots.c @@ -0,0 +1,201 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "hv-balloon-internal.h" +#include "hv-balloon-our_range_memslots.h" +#include "trace.h" + +/* OurRange */ +static void our_range_init(OurRange *our_range, uint64_t start, uint64_t count) +{ + assert(count <= UINT64_MAX - start); + our_range->range.start = start; + our_range->range.count = count; + + hvb_page_range_tree_init(&our_range->removed_guest); + hvb_page_range_tree_init(&our_range->removed_both); + + /* mark the whole range as unused but for potential use */ + our_range->added = 0; + our_range->unusable_tail = 0; +} + +static void our_range_destroy(OurRange *our_range) +{ + hvb_page_range_tree_destroy(&our_range->removed_guest); + hvb_page_range_tree_destroy(&our_range->removed_both); +} + +void hvb_our_range_clear_removed_trees(OurRange *our_range) +{ + hvb_page_range_tree_destroy(&our_range->removed_guest); + hvb_page_range_tree_destroy(&our_range->removed_both); + hvb_page_range_tree_init(&our_range->removed_guest); + hvb_page_range_tree_init(&our_range->removed_both); +} + +void hvb_our_range_mark_added(OurRange *our_range, uint64_t additional_size) +{ + assert(additional_size <= UINT64_MAX - our_range->added); + + our_range->added += additional_size; + + assert(our_range->added <= UINT64_MAX - our_range->unusable_tail); + assert(our_range->added + our_range->unusable_tail <= + our_range->range.count); +} + +/* OurRangeMemslots */ +static void our_range_memslots_init_slots(OurRangeMemslots *our_range, + MemoryRegion *backing_mr, + Object *memslot_owner) +{ + OurRangeMemslotsSlots *memslots = &our_range->slots; + unsigned int idx; + uint64_t memslot_offset; + + assert(memslots->count > 0); + memslots->slots = g_new0(MemoryRegion, memslots->count); + + /* Initialize our memslots, but don't map them yet. */ + assert(memslots->size_each > 0); + for (idx = 0, memslot_offset = 0; idx < memslots->count; + idx++, memslot_offset += memslots->size_each) { + uint64_t memslot_size; + g_autofree char *name = NULL; + + /* The size of the last memslot might be smaller. */ + if (idx == memslots->count - 1) { + uint64_t region_size; + + assert(our_range->mr); + region_size = memory_region_size(our_range->mr); + memslot_size = region_size - memslot_offset; + } else { + memslot_size = memslots->size_each; + } + + name = g_strdup_printf("memslot-%u", idx); + memory_region_init_alias(&memslots->slots[idx], memslot_owner, name, + backing_mr, memslot_offset, memslot_size); + /* + * We want to be able to atomically and efficiently activate/deactivate + * individual memslots without affecting adjacent memslots in memory + * notifiers. + */ + memory_region_set_unmergeable(&memslots->slots[idx], true); + } + + memslots->mapped_count = 0; +} + +OurRangeMemslots *hvb_our_range_memslots_new(uint64_t addr, + MemoryRegion *parent_mr, + MemoryRegion *backing_mr, + Object *memslot_owner, + unsigned int memslot_count, + uint64_t memslot_size) +{ + OurRangeMemslots *our_range; + + our_range = g_malloc(sizeof(*our_range)); + our_range_init(&our_range->range, + addr / HV_BALLOON_PAGE_SIZE, + memory_region_size(parent_mr) / HV_BALLOON_PAGE_SIZE); + our_range->slots.size_each = memslot_size; + our_range->slots.count = memslot_count; + our_range->mr = parent_mr; + our_range_memslots_init_slots(our_range, backing_mr, memslot_owner); + + return our_range; +} + +static void our_range_memslots_free_memslots(OurRangeMemslots *our_range) +{ + OurRangeMemslotsSlots *memslots = &our_range->slots; + unsigned int idx; + uint64_t offset; + + memory_region_transaction_begin(); + for (idx = 0, offset = 0; idx < memslots->mapped_count; + idx++, offset += memslots->size_each) { + trace_hv_balloon_unmap_slot(idx, memslots->count, offset); + assert(memory_region_is_mapped(&memslots->slots[idx])); + memory_region_del_subregion(our_range->mr, &memslots->slots[idx]); + } + memory_region_transaction_commit(); + + for (idx = 0; idx < memslots->count; idx++) { + object_unparent(OBJECT(&memslots->slots[idx])); + } + + g_clear_pointer(&our_range->slots.slots, g_free); +} + +void hvb_our_range_memslots_free(OurRangeMemslots *our_range) +{ + OurRangeMemslotsSlots *memslots = &our_range->slots; + MemoryRegion *hostmem_mr; + RAMBlock *rb; + + assert(our_range->slots.count > 0); + assert(our_range->slots.slots); + + hostmem_mr = memslots->slots[0].alias; + rb = hostmem_mr->ram_block; + ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb)); + + our_range_memslots_free_memslots(our_range); + our_range_destroy(&our_range->range); + g_free(our_range); +} + +void hvb_our_range_memslots_ensure_mapped_additional(OurRangeMemslots *our_range, + uint64_t additional_map_size) +{ + OurRangeMemslotsSlots *memslots = &our_range->slots; + uint64_t total_map_size; + unsigned int idx; + uint64_t offset; + + total_map_size = (our_range->range.added + additional_map_size) * + HV_BALLOON_PAGE_SIZE; + idx = memslots->mapped_count; + assert(memslots->size_each > 0); + offset = idx * memslots->size_each; + + /* + * Activate all memslots covered by the newly added region in a single + * transaction. + */ + memory_region_transaction_begin(); + for ( ; idx < memslots->count; + idx++, offset += memslots->size_each) { + /* + * If this memslot starts beyond or at the end of the range to map so + * does every next one. + */ + if (offset >= total_map_size) { + break; + } + + /* + * Instead of enabling/disabling memslot, we add/remove them. This + * should make address space updates faster, because we don't have to + * loop over many disabled subregions. + */ + trace_hv_balloon_map_slot(idx, memslots->count, offset); + assert(!memory_region_is_mapped(&memslots->slots[idx])); + memory_region_add_subregion(our_range->mr, offset, + &memslots->slots[idx]); + + memslots->mapped_count++; + } + memory_region_transaction_commit(); +} diff --git a/hw/hyperv/hv-balloon-our_range_memslots.h b/hw/hyperv/hv-balloon-our_range_memslots.h new file mode 100644 index 0000000000..b6f592d34b --- /dev/null +++ b/hw/hyperv/hv-balloon-our_range_memslots.h @@ -0,0 +1,110 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef HW_HYPERV_HV_BALLOON_OUR_RANGE_MEMSLOTS_H +#define HW_HYPERV_HV_BALLOON_OUR_RANGE_MEMSLOTS_H + +#include "qemu/osdep.h" + +#include "exec/memory.h" +#include "qom/object.h" +#include "hv-balloon-page_range_tree.h" + +/* OurRange */ +#define OUR_RANGE(ptr) ((OurRange *)(ptr)) + +/* "our range" means the memory range owned by this driver (for hot-adding) */ +typedef struct OurRange { + PageRange range; + + /* How many pages were hot-added to the guest */ + uint64_t added; + + /* Pages at the end not currently usable */ + uint64_t unusable_tail; + + /* Memory removed from the guest */ + PageRangeTree removed_guest, removed_both; +} OurRange; + +static inline uint64_t our_range_get_remaining_start(OurRange *our_range) +{ + return our_range->range.start + our_range->added; +} + +static inline uint64_t our_range_get_remaining_size(OurRange *our_range) +{ + return our_range->range.count - our_range->added - our_range->unusable_tail; +} + +void hvb_our_range_mark_added(OurRange *our_range, uint64_t additional_size); + +static inline void our_range_mark_remaining_unusable(OurRange *our_range) +{ + our_range->unusable_tail = our_range->range.count - our_range->added; +} + +static inline PageRangeTree our_range_get_removed_tree(OurRange *our_range, + bool both) +{ + if (both) { + return our_range->removed_both; + } else { + return our_range->removed_guest; + } +} + +static inline bool our_range_is_removed_tree_empty(OurRange *our_range, + bool both) +{ + if (both) { + return page_range_tree_is_empty(our_range->removed_both); + } else { + return page_range_tree_is_empty(our_range->removed_guest); + } +} + +void hvb_our_range_clear_removed_trees(OurRange *our_range); + +/* OurRangeMemslots */ +typedef struct OurRangeMemslotsSlots { + /* Nominal size of each memslot (the last one might be smaller) */ + uint64_t size_each; + + /* Slots array and its element count */ + MemoryRegion *slots; + unsigned int count; + + /* How many slots are currently mapped */ + unsigned int mapped_count; +} OurRangeMemslotsSlots; + +typedef struct OurRangeMemslots { + OurRange range; + + /* Memslots covering our range */ + OurRangeMemslotsSlots slots; + + MemoryRegion *mr; +} OurRangeMemslots; + +OurRangeMemslots *hvb_our_range_memslots_new(uint64_t addr, + MemoryRegion *parent_mr, + MemoryRegion *backing_mr, + Object *memslot_owner, + unsigned int memslot_count, + uint64_t memslot_size); +void hvb_our_range_memslots_free(OurRangeMemslots *our_range); + +G_DEFINE_AUTOPTR_CLEANUP_FUNC(OurRangeMemslots, hvb_our_range_memslots_free) + +void hvb_our_range_memslots_ensure_mapped_additional(OurRangeMemslots *our_range, + uint64_t additional_map_size); + +#endif diff --git a/hw/hyperv/hv-balloon-page_range_tree.c b/hw/hyperv/hv-balloon-page_range_tree.c new file mode 100644 index 0000000000..e178d8b413 --- /dev/null +++ b/hw/hyperv/hv-balloon-page_range_tree.c @@ -0,0 +1,228 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "hv-balloon-internal.h" +#include "hv-balloon-page_range_tree.h" + +/* + * temporarily avoid warnings about enhanced GTree API usage requiring a + * too recent Glib version until GLIB_VERSION_MAX_ALLOWED finally reaches + * the Glib version with this API + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +/* PageRangeTree */ +static gint page_range_tree_key_compare(gconstpointer leftp, + gconstpointer rightp, + gpointer user_data) +{ + const uint64_t *left = leftp, *right = rightp; + + if (*left < *right) { + return -1; + } else if (*left > *right) { + return 1; + } else { /* *left == *right */ + return 0; + } +} + +static GTreeNode *page_range_tree_insert_new(PageRangeTree tree, + uint64_t start, uint64_t count) +{ + uint64_t *key = g_malloc(sizeof(*key)); + PageRange *range = g_malloc(sizeof(*range)); + + assert(count > 0); + + *key = range->start = start; + range->count = count; + + return g_tree_insert_node(tree.t, key, range); +} + +void hvb_page_range_tree_insert(PageRangeTree tree, + uint64_t start, uint64_t count, + uint64_t *dupcount) +{ + GTreeNode *node; + bool joinable; + uint64_t intersection; + PageRange *range; + + assert(!SUM_OVERFLOW_U64(start, count)); + if (count == 0) { + return; + } + + node = g_tree_upper_bound(tree.t, &start); + if (node) { + node = g_tree_node_previous(node); + } else { + node = g_tree_node_last(tree.t); + } + + if (node) { + range = g_tree_node_value(node); + assert(range); + intersection = page_range_intersection_size(range, start, count); + joinable = page_range_joinable_right(range, start, count); + } + + if (!node || + (!intersection && !joinable)) { + /* + * !node case: the tree is empty or the very first node in the tree + * already has a higher key (the start of its range). + * the other case: there is a gap in the tree between the new range + * and the previous one. + * anyway, let's just insert the new range into the tree. + */ + node = page_range_tree_insert_new(tree, start, count); + assert(node); + range = g_tree_node_value(node); + assert(range); + } else { + /* + * the previous range in the tree either partially covers the new + * range or ends just at its beginning - extend it + */ + if (dupcount) { + *dupcount += intersection; + } + + count += start - range->start; + range->count = MAX(range->count, count); + } + + /* check next nodes for possible merging */ + for (node = g_tree_node_next(node); node; ) { + PageRange *rangecur; + + rangecur = g_tree_node_value(node); + assert(rangecur); + + intersection = page_range_intersection_size(rangecur, + range->start, range->count); + joinable = page_range_joinable_left(rangecur, + range->start, range->count); + if (!intersection && !joinable) { + /* the current node is disjoint */ + break; + } + + if (dupcount) { + *dupcount += intersection; + } + + count = rangecur->count + (rangecur->start - range->start); + range->count = MAX(range->count, count); + + /* the current node was merged in, remove it */ + start = rangecur->start; + node = g_tree_node_next(node); + /* no hinted removal in GTree... */ + g_tree_remove(tree.t, &start); + } +} + +bool hvb_page_range_tree_pop(PageRangeTree tree, PageRange *out, + uint64_t maxcount) +{ + GTreeNode *node; + PageRange *range; + + node = g_tree_node_last(tree.t); + if (!node) { + return false; + } + + range = g_tree_node_value(node); + assert(range); + + out->start = range->start; + + /* can't modify range->start as it is the node key */ + if (range->count > maxcount) { + out->start += range->count - maxcount; + out->count = maxcount; + range->count -= maxcount; + } else { + out->count = range->count; + /* no hinted removal in GTree... */ + g_tree_remove(tree.t, &out->start); + } + + return true; +} + +bool hvb_page_range_tree_intree_any(PageRangeTree tree, + uint64_t start, uint64_t count) +{ + GTreeNode *node; + + if (count == 0) { + return false; + } + + /* find the first node that can possibly intersect our range */ + node = g_tree_upper_bound(tree.t, &start); + if (node) { + /* + * a NULL node below means that the very first node in the tree + * already has a higher key (the start of its range). + */ + node = g_tree_node_previous(node); + } else { + /* a NULL node below means that the tree is empty */ + node = g_tree_node_last(tree.t); + } + /* node range start <= range start */ + + if (!node) { + /* node range start > range start */ + node = g_tree_node_first(tree.t); + } + + for ( ; node; node = g_tree_node_next(node)) { + PageRange *range = g_tree_node_value(node); + + assert(range); + /* + * if this node starts beyond or at the end of our range so does + * every next one + */ + if (range->start >= start + count) { + break; + } + + if (page_range_intersection_size(range, start, count) > 0) { + return true; + } + } + + return false; +} + +void hvb_page_range_tree_init(PageRangeTree *tree) +{ + tree->t = g_tree_new_full(page_range_tree_key_compare, NULL, + g_free, g_free); +} + +void hvb_page_range_tree_destroy(PageRangeTree *tree) +{ + /* g_tree_destroy() is not NULL-safe */ + if (!tree->t) { + return; + } + + g_tree_destroy(tree->t); + tree->t = NULL; +} diff --git a/hw/hyperv/hv-balloon-page_range_tree.h b/hw/hyperv/hv-balloon-page_range_tree.h new file mode 100644 index 0000000000..07a9ae0da6 --- /dev/null +++ b/hw/hyperv/hv-balloon-page_range_tree.h @@ -0,0 +1,118 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef HW_HYPERV_HV_BALLOON_PAGE_RANGE_TREE_H +#define HW_HYPERV_HV_BALLOON_PAGE_RANGE_TREE_H + +#include "qemu/osdep.h" + +/* PageRange */ +typedef struct PageRange { + uint64_t start; + uint64_t count; +} PageRange; + +/* return just the part of range before (start) */ +static inline void page_range_part_before(const PageRange *range, + uint64_t start, PageRange *out) +{ + uint64_t endr = range->start + range->count; + uint64_t end = MIN(endr, start); + + out->start = range->start; + if (end > out->start) { + out->count = end - out->start; + } else { + out->count = 0; + } +} + +/* return just the part of range after (start, count) */ +static inline void page_range_part_after(const PageRange *range, + uint64_t start, uint64_t count, + PageRange *out) +{ + uint64_t end = range->start + range->count; + uint64_t ends = start + count; + + out->start = MAX(range->start, ends); + if (end > out->start) { + out->count = end - out->start; + } else { + out->count = 0; + } +} + +static inline void page_range_intersect(const PageRange *range, + uint64_t start, uint64_t count, + PageRange *out) +{ + uint64_t end1 = range->start + range->count; + uint64_t end2 = start + count; + uint64_t end = MIN(end1, end2); + + out->start = MAX(range->start, start); + out->count = out->start < end ? end - out->start : 0; +} + +static inline uint64_t page_range_intersection_size(const PageRange *range, + uint64_t start, uint64_t count) +{ + PageRange trange; + + page_range_intersect(range, start, count, &trange); + return trange.count; +} + +static inline bool page_range_joinable_left(const PageRange *range, + uint64_t start, uint64_t count) +{ + return start + count == range->start; +} + +static inline bool page_range_joinable_right(const PageRange *range, + uint64_t start, uint64_t count) +{ + return range->start + range->count == start; +} + +static inline bool page_range_joinable(const PageRange *range, + uint64_t start, uint64_t count) +{ + return page_range_joinable_left(range, start, count) || + page_range_joinable_right(range, start, count); +} + +/* PageRangeTree */ +/* type safety */ +typedef struct PageRangeTree { + GTree *t; +} PageRangeTree; + +static inline bool page_range_tree_is_empty(PageRangeTree tree) +{ + guint nnodes = g_tree_nnodes(tree.t); + + return nnodes == 0; +} + +void hvb_page_range_tree_init(PageRangeTree *tree); +void hvb_page_range_tree_destroy(PageRangeTree *tree); + +bool hvb_page_range_tree_intree_any(PageRangeTree tree, + uint64_t start, uint64_t count); + +bool hvb_page_range_tree_pop(PageRangeTree tree, PageRange *out, + uint64_t maxcount); + +void hvb_page_range_tree_insert(PageRangeTree tree, + uint64_t start, uint64_t count, + uint64_t *dupcount); + +#endif diff --git a/hw/hyperv/hv-balloon-stub.c b/hw/hyperv/hv-balloon-stub.c new file mode 100644 index 0000000000..a47412d4a8 --- /dev/null +++ b/hw/hyperv/hv-balloon-stub.c @@ -0,0 +1,19 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-machine.h" +#include "qapi/qapi-types-machine.h" + +HvBalloonInfo *qmp_query_hv_balloon_status_report(Error **errp) +{ + error_setg(errp, "hv-balloon device not enabled in this build"); + return NULL; +} diff --git a/hw/hyperv/hv-balloon.c b/hw/hyperv/hv-balloon.c new file mode 100644 index 0000000000..66f297c1d7 --- /dev/null +++ b/hw/hyperv/hv-balloon.c @@ -0,0 +1,1769 @@ +/* + * QEMU Hyper-V Dynamic Memory Protocol driver + * + * Copyright (C) 2020-2023 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "hv-balloon-internal.h" + +#include "exec/address-spaces.h" +#include "exec/cpu-common.h" +#include "exec/ramblock.h" +#include "hw/boards.h" +#include "hw/hyperv/dynmem-proto.h" +#include "hw/hyperv/hv-balloon.h" +#include "hw/hyperv/vmbus.h" +#include "hw/mem/memory-device.h" +#include "hw/mem/pc-dimm.h" +#include "hw/qdev-core.h" +#include "hw/qdev-properties.h" +#include "monitor/qdev.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-machine.h" +#include "qapi/qapi-events-machine.h" +#include "qapi/qapi-types-machine.h" +#include "qapi/qmp/qdict.h" +#include "qapi/visitor.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "qemu/units.h" +#include "qemu/timer.h" +#include "sysemu/balloon.h" +#include "sysemu/hostmem.h" +#include "sysemu/reset.h" +#include "hv-balloon-our_range_memslots.h" +#include "hv-balloon-page_range_tree.h" +#include "trace.h" + +#define HV_BALLOON_ADDR_PROP "addr" +#define HV_BALLOON_MEMDEV_PROP "memdev" +#define HV_BALLOON_GUID "525074DC-8985-46e2-8057-A307DC18A502" + +/* + * Some Windows versions (at least Server 2019) will crash with various + * error codes when receiving DM protocol requests (at least + * DM_MEM_HOT_ADD_REQUEST) immediately after boot. + * + * It looks like Hyper-V from Server 2016 uses a 50-second after-boot + * delay, probably to workaround this issue, so we'll use this value, too. + */ +#define HV_BALLOON_POST_INIT_WAIT (50 * 1000) + +#define HV_BALLOON_HA_CHUNK_SIZE (2 * GiB) +#define HV_BALLOON_HA_CHUNK_PAGES (HV_BALLOON_HA_CHUNK_SIZE / HV_BALLOON_PAGE_SIZE) + +#define HV_BALLOON_HA_MEMSLOT_SIZE_ALIGN (128 * MiB) + +#define HV_BALLOON_HR_CHUNK_PAGES 585728 +/* + * ^ that's the maximum number of pages + * that Windows returns in one hot remove response + * + * If the number requested is too high Windows will no longer honor + * these requests + */ + +struct HvBalloonClass { + VMBusDeviceClass parent_class; +} HvBalloonClass; + +typedef enum State { + /* not a real state */ + S_NO_CHANGE = 0, + + S_WAIT_RESET, + S_POST_RESET_CLOSED, + + /* init flow */ + S_VERSION, + S_CAPS, + S_POST_INIT_WAIT, + + S_IDLE, + + /* balloon op flow */ + S_BALLOON_POSTING, + S_BALLOON_RB_WAIT, + S_BALLOON_REPLY_WAIT, + + /* unballoon + hot add ops flow */ + S_UNBALLOON_POSTING, + S_UNBALLOON_RB_WAIT, + S_UNBALLOON_REPLY_WAIT, + S_HOT_ADD_SETUP, + S_HOT_ADD_RB_WAIT, + S_HOT_ADD_POSTING, + S_HOT_ADD_REPLY_WAIT, +} State; + +typedef struct StateDesc { + State state; + const char *desc; +} StateDesc; + +typedef struct HvBalloon { + VMBusDevice parent; + State state; + + union dm_version version; + union dm_caps caps; + + QEMUTimer post_init_timer; + + unsigned int trans_id; + + struct { + bool enabled; + bool received; + uint64_t committed; + uint64_t available; + } status_report; + + /* Guest target size */ + uint64_t target; + bool target_changed; + + /* Current (un)balloon / hot-add operation parameters */ + union { + uint64_t balloon_diff; + + struct { + uint64_t unballoon_diff; + uint64_t hot_add_diff; + }; + + struct { + PageRange hot_add_range; + uint64_t ha_current_count; + }; + }; + + OurRangeMemslots *our_range; + + /* Count of memslots covering our memory */ + unsigned int memslot_count; + + /* Nominal size of each memslot (the last one might be smaller) */ + uint64_t memslot_size; + + /* Non-ours removed memory */ + PageRangeTree removed_guest, removed_both; + + /* Grand totals of removed memory (both ours and non-ours) */ + uint64_t removed_guest_ctr, removed_both_ctr; + + /* MEMORY_DEVICE props */ + uint64_t addr; + HostMemoryBackend *hostmem; + MemoryRegion *mr; +} HvBalloon; + +OBJECT_DEFINE_TYPE_WITH_INTERFACES(HvBalloon, hv_balloon, HV_BALLOON, VMBUS_DEVICE, \ + { TYPE_MEMORY_DEVICE }, { }) + +#define HV_BALLOON_SET_STATE(hvb, news) \ + do { \ + assert(news != S_NO_CHANGE); \ + hv_balloon_state_set(hvb, news, # news); \ + } while (0) + +#define HV_BALLOON_STATE_DESC_SET(stdesc, news) \ + _hv_balloon_state_desc_set(stdesc, news, # news) + +#define HV_BALLOON_STATE_DESC_INIT \ + { \ + .state = S_NO_CHANGE, \ + } + +typedef struct HvBalloonReq { + VMBusChanReq vmreq; +} HvBalloonReq; + +/* total our memory includes parts currently removed from the guest */ +static uint64_t hv_balloon_total_our_ram(HvBalloon *balloon) +{ + if (!balloon->our_range) { + return 0; + } + + return balloon->our_range->range.added; +} + +/* TODO: unify the code below with virtio-balloon and cache the value */ +static int build_dimm_list(Object *obj, void *opaque) +{ + GSList **list = opaque; + + if (object_dynamic_cast(obj, TYPE_PC_DIMM)) { + DeviceState *dev = DEVICE(obj); + if (dev->realized) { /* only realized DIMMs matter */ + *list = g_slist_prepend(*list, dev); + } + } + + object_child_foreach(obj, build_dimm_list, opaque); + return 0; +} + +static ram_addr_t get_current_ram_size(void) +{ + GSList *list = NULL, *item; + ram_addr_t size = current_machine->ram_size; + + build_dimm_list(qdev_get_machine(), &list); + for (item = list; item; item = g_slist_next(item)) { + Object *obj = OBJECT(item->data); + if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) + size += object_property_get_int(obj, PC_DIMM_SIZE_PROP, + &error_abort); + } + g_slist_free(list); + + return size; +} + +/* total RAM includes memory currently removed from the guest */ +static uint64_t hv_balloon_total_ram(HvBalloon *balloon) +{ + ram_addr_t ram_size = get_current_ram_size(); + uint64_t ram_size_pages = ram_size >> HV_BALLOON_PFN_SHIFT; + uint64_t our_ram_size_pages = hv_balloon_total_our_ram(balloon); + + assert(ram_size_pages > 0); + + return SUM_SATURATE_U64(ram_size_pages, our_ram_size_pages); +} + +/* + * calculating the total RAM size is a slow operation, + * avoid it as much as possible + */ +static uint64_t hv_balloon_total_removed_rs(HvBalloon *balloon, + uint64_t ram_size_pages) +{ + uint64_t total_removed; + + total_removed = SUM_SATURATE_U64(balloon->removed_guest_ctr, + balloon->removed_both_ctr); + + /* possible if guest returns pages outside actual RAM */ + if (total_removed > ram_size_pages) { + total_removed = ram_size_pages; + } + + return total_removed; +} + +/* Returns whether the state has actually changed */ +static bool hv_balloon_state_set(HvBalloon *balloon, + State newst, const char *newststr) +{ + if (newst == S_NO_CHANGE || balloon->state == newst) { + return false; + } + + balloon->state = newst; + trace_hv_balloon_state_change(newststr); + return true; +} + +static void _hv_balloon_state_desc_set(StateDesc *stdesc, + State newst, const char *newststr) +{ + /* state setting is only permitted on a freshly init desc */ + assert(stdesc->state == S_NO_CHANGE); + + assert(newst != S_NO_CHANGE); + + stdesc->state = newst; + stdesc->desc = newststr; +} + +static VMBusChannel *hv_balloon_get_channel_maybe(HvBalloon *balloon) +{ + return vmbus_device_channel(&balloon->parent, 0); +} + +static VMBusChannel *hv_balloon_get_channel(HvBalloon *balloon) +{ + VMBusChannel *chan; + + chan = hv_balloon_get_channel_maybe(balloon); + assert(chan != NULL); + return chan; +} + +static ssize_t hv_balloon_send_packet(VMBusChannel *chan, + struct dm_message *msg) +{ + int ret; + + ret = vmbus_channel_reserve(chan, 0, msg->hdr.size); + if (ret < 0) { + return ret; + } + + return vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND, + NULL, 0, msg, msg->hdr.size, false, + msg->hdr.trans_id); +} + +static bool hv_balloon_unballoon_get_source(HvBalloon *balloon, + PageRangeTree *dtree, + uint64_t **dctr, + bool *is_our_range) +{ + OurRange *our_range = OUR_RANGE(balloon->our_range); + + /* Try the boot memory first */ + if (g_tree_nnodes(balloon->removed_guest.t) > 0) { + *dtree = balloon->removed_guest; + *dctr = &balloon->removed_guest_ctr; + *is_our_range = false; + } else if (g_tree_nnodes(balloon->removed_both.t) > 0) { + *dtree = balloon->removed_both; + *dctr = &balloon->removed_both_ctr; + *is_our_range = false; + } else if (!our_range) { + return false; + } else if (!our_range_is_removed_tree_empty(our_range, false)) { + *dtree = our_range_get_removed_tree(our_range, false); + *dctr = &balloon->removed_guest_ctr; + *is_our_range = true; + } else if (!our_range_is_removed_tree_empty(our_range, true)) { + *dtree = our_range_get_removed_tree(our_range, true); + *dctr = &balloon->removed_both_ctr; + *is_our_range = true; + } else { + return false; + } + + return true; +} + +static void hv_balloon_unballoon_rb_wait(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan = hv_balloon_get_channel(balloon); + struct dm_unballoon_request *ur; + size_t ur_size = sizeof(*ur) + sizeof(ur->range_array[0]); + + assert(balloon->state == S_UNBALLOON_RB_WAIT); + + if (vmbus_channel_reserve(chan, 0, ur_size) < 0) { + return; + } + + HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_POSTING); +} + +static void hv_balloon_unballoon_posting(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan = hv_balloon_get_channel(balloon); + PageRangeTree dtree; + uint64_t *dctr; + bool our_range; + struct dm_unballoon_request *ur; + size_t ur_size = sizeof(*ur) + sizeof(ur->range_array[0]); + PageRange range; + bool bret; + ssize_t ret; + + assert(balloon->state == S_UNBALLOON_POSTING); + assert(balloon->unballoon_diff > 0); + + if (!hv_balloon_unballoon_get_source(balloon, &dtree, &dctr, &our_range)) { + error_report("trying to unballoon but nothing seems to be ballooned"); + /* + * there is little we can do as we might have already + * sent the guest a partial request we can't cancel + */ + return; + } + + assert(balloon->our_range || !our_range); + assert(dtree.t); + assert(dctr); + + ur = alloca(ur_size); + memset(ur, 0, ur_size); + ur->hdr.type = DM_UNBALLOON_REQUEST; + ur->hdr.size = ur_size; + ur->hdr.trans_id = balloon->trans_id; + + bret = hvb_page_range_tree_pop(dtree, &range, MIN(balloon->unballoon_diff, + HV_BALLOON_HA_CHUNK_PAGES)); + assert(bret); + /* TODO: madvise? */ + + *dctr -= range.count; + balloon->unballoon_diff -= range.count; + + ur->range_count = 1; + ur->range_array[0].finfo.start_page = range.start; + ur->range_array[0].finfo.page_cnt = range.count; + ur->more_pages = balloon->unballoon_diff > 0; + + trace_hv_balloon_outgoing_unballoon(ur->hdr.trans_id, + range.count, range.start, + balloon->unballoon_diff); + + if (ur->more_pages) { + HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_RB_WAIT); + } else { + HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_REPLY_WAIT); + } + + ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND, + NULL, 0, ur, ur_size, false, + ur->hdr.trans_id); + if (ret <= 0) { + error_report("error %zd when posting unballoon msg, expect problems", + ret); + } +} + +static bool hv_balloon_our_range_ensure(HvBalloon *balloon) +{ + uint64_t align; + MemoryRegion *hostmem_mr; + g_autoptr(OurRangeMemslots) our_range_memslots = NULL; + OurRange *our_range; + + if (balloon->our_range) { + return true; + } + + if (!balloon->hostmem) { + return false; + } + + align = (1 << balloon->caps.cap_bits.hot_add_alignment) * MiB; + assert(QEMU_IS_ALIGNED(balloon->addr, align)); + + hostmem_mr = host_memory_backend_get_memory(balloon->hostmem); + + our_range_memslots = hvb_our_range_memslots_new(balloon->addr, + balloon->mr, hostmem_mr, + OBJECT(balloon), + balloon->memslot_count, + balloon->memslot_size); + our_range = OUR_RANGE(our_range_memslots); + + if (hvb_page_range_tree_intree_any(balloon->removed_guest, + our_range->range.start, + our_range->range.count) || + hvb_page_range_tree_intree_any(balloon->removed_both, + our_range->range.start, + our_range->range.count)) { + error_report("some parts of the memory backend were already returned by the guest. this should not happen, please reboot the guest and try again"); + return false; + } + + trace_hv_balloon_our_range_add(our_range->range.count, + our_range->range.start); + + balloon->our_range = g_steal_pointer(&our_range_memslots); + return true; +} + +static void hv_balloon_hot_add_setup(HvBalloon *balloon, StateDesc *stdesc) +{ + /* need to make copy since it is in union with hot_add_range */ + uint64_t hot_add_diff = balloon->hot_add_diff; + PageRange *hot_add_range = &balloon->hot_add_range; + uint64_t align, our_range_remaining; + OurRange *our_range; + + assert(balloon->state == S_HOT_ADD_SETUP); + assert(hot_add_diff > 0); + + if (!hv_balloon_our_range_ensure(balloon)) { + goto ret_idle; + } + + our_range = OUR_RANGE(balloon->our_range); + + align = (1 << balloon->caps.cap_bits.hot_add_alignment) * + (MiB / HV_BALLOON_PAGE_SIZE); + + /* Absolute GPA in pages */ + hot_add_range->start = our_range_get_remaining_start(our_range); + assert(QEMU_IS_ALIGNED(hot_add_range->start, align)); + + our_range_remaining = our_range_get_remaining_size(our_range); + hot_add_range->count = MIN(our_range_remaining, hot_add_diff); + hot_add_range->count = QEMU_ALIGN_DOWN(hot_add_range->count, align); + if (hot_add_range->count == 0) { + goto ret_idle; + } + + hvb_our_range_memslots_ensure_mapped_additional(balloon->our_range, + hot_add_range->count); + + HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_RB_WAIT); + return; + +ret_idle: + HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE); +} + +static void hv_balloon_hot_add_rb_wait(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan = hv_balloon_get_channel(balloon); + struct dm_hot_add *ha; + size_t ha_size = sizeof(*ha) + sizeof(ha->range); + + assert(balloon->state == S_HOT_ADD_RB_WAIT); + + if (vmbus_channel_reserve(chan, 0, ha_size) < 0) { + return; + } + + HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_POSTING); +} + +static void hv_balloon_hot_add_posting(HvBalloon *balloon, StateDesc *stdesc) +{ + PageRange *hot_add_range = &balloon->hot_add_range; + uint64_t *current_count = &balloon->ha_current_count; + VMBusChannel *chan = hv_balloon_get_channel(balloon); + struct dm_hot_add *ha; + size_t ha_size = sizeof(*ha) + sizeof(ha->range); + union dm_mem_page_range *ha_region; + uint64_t align, chunk_max_size; + ssize_t ret; + + assert(balloon->state == S_HOT_ADD_POSTING); + assert(hot_add_range->count > 0); + + align = (1 << balloon->caps.cap_bits.hot_add_alignment) * + (MiB / HV_BALLOON_PAGE_SIZE); + if (align >= HV_BALLOON_HA_CHUNK_PAGES) { + /* + * If the required alignment is higher than the chunk size we let it + * override that size. + */ + chunk_max_size = align; + } else { + chunk_max_size = QEMU_ALIGN_DOWN(HV_BALLOON_HA_CHUNK_PAGES, align); + } + + /* + * hot_add_range->count starts aligned in hv_balloon_hot_add_setup(), + * then it is either reduced by subtracting aligned current_count or + * further hot-adds are prevented by marking the whole remaining our range + * as unusable in hv_balloon_handle_hot_add_response(). + */ + *current_count = MIN(hot_add_range->count, chunk_max_size); + + ha = alloca(ha_size); + ha_region = &(&ha->range)[1]; + memset(ha, 0, ha_size); + ha->hdr.type = DM_MEM_HOT_ADD_REQUEST; + ha->hdr.size = ha_size; + ha->hdr.trans_id = balloon->trans_id; + + ha->range.finfo.start_page = hot_add_range->start; + ha->range.finfo.page_cnt = *current_count; + ha_region->finfo.start_page = hot_add_range->start; + ha_region->finfo.page_cnt = ha->range.finfo.page_cnt; + + trace_hv_balloon_outgoing_hot_add(ha->hdr.trans_id, + *current_count, hot_add_range->start); + + ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND, + NULL, 0, ha, ha_size, false, + ha->hdr.trans_id); + if (ret <= 0) { + error_report("error %zd when posting hot add msg, expect problems", + ret); + } + + HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_REPLY_WAIT); +} + +static void hv_balloon_balloon_rb_wait(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan = hv_balloon_get_channel(balloon); + size_t bl_size = sizeof(struct dm_balloon); + + assert(balloon->state == S_BALLOON_RB_WAIT); + + if (vmbus_channel_reserve(chan, 0, bl_size) < 0) { + return; + } + + HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_POSTING); +} + +static void hv_balloon_balloon_posting(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan = hv_balloon_get_channel(balloon); + struct dm_balloon bl; + size_t bl_size = sizeof(bl); + ssize_t ret; + + assert(balloon->state == S_BALLOON_POSTING); + assert(balloon->balloon_diff > 0); + + memset(&bl, 0, sizeof(bl)); + bl.hdr.type = DM_BALLOON_REQUEST; + bl.hdr.size = bl_size; + bl.hdr.trans_id = balloon->trans_id; + bl.num_pages = MIN(balloon->balloon_diff, HV_BALLOON_HR_CHUNK_PAGES); + + trace_hv_balloon_outgoing_balloon(bl.hdr.trans_id, bl.num_pages, + balloon->balloon_diff); + + ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND, + NULL, 0, &bl, bl_size, false, + bl.hdr.trans_id); + if (ret <= 0) { + error_report("error %zd when posting balloon msg, expect problems", + ret); + } + + HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_REPLY_WAIT); +} + +static void hv_balloon_idle_state_process_target(HvBalloon *balloon, + StateDesc *stdesc) +{ + bool can_balloon = balloon->caps.cap_bits.balloon; + uint64_t ram_size_pages, total_removed; + + ram_size_pages = hv_balloon_total_ram(balloon); + total_removed = hv_balloon_total_removed_rs(balloon, ram_size_pages); + + /* + * we need to cache the values computed from the balloon target value when + * starting the adjustment procedure in case someone changes the target when + * the procedure is in progress + */ + if (balloon->target > ram_size_pages - total_removed) { + bool can_hot_add = balloon->caps.cap_bits.hot_add; + uint64_t target_diff = balloon->target - + (ram_size_pages - total_removed); + + balloon->unballoon_diff = MIN(target_diff, total_removed); + + if (can_hot_add) { + balloon->hot_add_diff = target_diff - balloon->unballoon_diff; + } else { + balloon->hot_add_diff = 0; + } + + if (balloon->unballoon_diff > 0) { + assert(can_balloon); + HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_RB_WAIT); + } else if (balloon->hot_add_diff > 0) { + HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_SETUP); + } + } else if (can_balloon && + balloon->target < ram_size_pages - total_removed) { + balloon->balloon_diff = ram_size_pages - total_removed - + balloon->target; + HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_RB_WAIT); + } +} + +static void hv_balloon_idle_state(HvBalloon *balloon, + StateDesc *stdesc) +{ + assert(balloon->state == S_IDLE); + + if (balloon->target_changed) { + balloon->target_changed = false; + hv_balloon_idle_state_process_target(balloon, stdesc); + return; + } +} + +static const struct { + void (*handler)(HvBalloon *balloon, StateDesc *stdesc); +} state_handlers[] = { + [S_IDLE].handler = hv_balloon_idle_state, + [S_BALLOON_POSTING].handler = hv_balloon_balloon_posting, + [S_BALLOON_RB_WAIT].handler = hv_balloon_balloon_rb_wait, + [S_UNBALLOON_POSTING].handler = hv_balloon_unballoon_posting, + [S_UNBALLOON_RB_WAIT].handler = hv_balloon_unballoon_rb_wait, + [S_HOT_ADD_SETUP].handler = hv_balloon_hot_add_setup, + [S_HOT_ADD_RB_WAIT].handler = hv_balloon_hot_add_rb_wait, + [S_HOT_ADD_POSTING].handler = hv_balloon_hot_add_posting, +}; + +static void hv_balloon_handle_state(HvBalloon *balloon, StateDesc *stdesc) +{ + if (balloon->state >= ARRAY_SIZE(state_handlers) || + !state_handlers[balloon->state].handler) { + return; + } + + state_handlers[balloon->state].handler(balloon, stdesc); +} + +static void hv_balloon_remove_response_insert_range(PageRangeTree tree, + const PageRange *range, + uint64_t *ctr1, + uint64_t *ctr2, + uint64_t *ctr3) +{ + uint64_t dupcount, effcount; + + if (range->count == 0) { + return; + } + + dupcount = 0; + hvb_page_range_tree_insert(tree, range->start, range->count, &dupcount); + + assert(dupcount <= range->count); + effcount = range->count - dupcount; + + *ctr1 += effcount; + *ctr2 += effcount; + if (ctr3) { + *ctr3 += effcount; + } +} + +static void hv_balloon_remove_response_handle_range(HvBalloon *balloon, + PageRange *range, + bool both, + uint64_t *removedctr) +{ + OurRange *our_range = OUR_RANGE(balloon->our_range); + PageRangeTree globaltree = + both ? balloon->removed_both : balloon->removed_guest; + uint64_t *globalctr = + both ? &balloon->removed_both_ctr : &balloon->removed_guest_ctr; + PageRange rangeeff; + + if (range->count == 0) { + return; + } + + trace_hv_balloon_remove_response(range->count, range->start, both); + + if (our_range) { + /* Includes the not-yet-hot-added and unusable parts. */ + rangeeff = our_range->range; + } else { + rangeeff.start = rangeeff.count = 0; + } + + if (page_range_intersection_size(range, rangeeff.start, rangeeff.count) > 0) { + PageRangeTree ourtree = our_range_get_removed_tree(our_range, both); + PageRange rangehole, rangecommon; + uint64_t ourremoved = 0; + + /* process the hole before our range, if it exists */ + page_range_part_before(range, rangeeff.start, &rangehole); + hv_balloon_remove_response_insert_range(globaltree, &rangehole, + globalctr, removedctr, NULL); + if (rangehole.count > 0) { + trace_hv_balloon_remove_response_hole(rangehole.count, + rangehole.start, + range->count, range->start, + rangeeff.start, both); + } + + /* process our part */ + page_range_intersect(range, rangeeff.start, rangeeff.count, + &rangecommon); + hv_balloon_remove_response_insert_range(ourtree, &rangecommon, + globalctr, removedctr, + &ourremoved); + if (rangecommon.count > 0) { + trace_hv_balloon_remove_response_common(rangecommon.count, + rangecommon.start, + range->count, range->start, + rangeeff.count, + rangeeff.start, ourremoved, + both); + } + + /* calculate what's left after our range */ + rangecommon = *range; + page_range_part_after(&rangecommon, rangeeff.start, rangeeff.count, + range); + } + + /* process the remainder of the range that lies after our range */ + if (range->count > 0) { + hv_balloon_remove_response_insert_range(globaltree, range, + globalctr, removedctr, NULL); + trace_hv_balloon_remove_response_remainder(range->count, range->start, + both); + range->count = 0; + } +} + +static void hv_balloon_remove_response_handle_pages(HvBalloon *balloon, + PageRange *range, + uint64_t start, + uint64_t count, + bool both, + uint64_t *removedctr) +{ + assert(count > 0); + + /* + * if there is an existing range that the new range can't be joined to + * dump it into tree(s) + */ + if (range->count > 0 && !page_range_joinable(range, start, count)) { + hv_balloon_remove_response_handle_range(balloon, range, both, + removedctr); + } + + if (range->count == 0) { + range->start = start; + range->count = count; + } else if (page_range_joinable_left(range, start, count)) { + range->start = start; + range->count += count; + } else { /* page_range_joinable_right() */ + range->count += count; + } +} + +static gboolean hv_balloon_handle_remove_host_addr_node(gpointer key, + gpointer value, + gpointer data) +{ + PageRange *range = value; + uint64_t pageoff; + + for (pageoff = 0; pageoff < range->count; ) { + uint64_t addr_64 = (range->start + pageoff) * HV_BALLOON_PAGE_SIZE; + void *addr; + RAMBlock *rb; + ram_addr_t rb_offset; + size_t rb_page_size; + size_t discard_size; + + assert(addr_64 <= UINTPTR_MAX); + addr = (void *)((uintptr_t)addr_64); + rb = qemu_ram_block_from_host(addr, false, &rb_offset); + rb_page_size = qemu_ram_pagesize(rb); + + if (rb_page_size != HV_BALLOON_PAGE_SIZE) { + /* TODO: these should end in "removed_guest" */ + warn_report("guest reported removed page backed by unsupported page size %zu", + rb_page_size); + pageoff++; + continue; + } + + discard_size = MIN(range->count - pageoff, + (rb->max_length - rb_offset) / + HV_BALLOON_PAGE_SIZE); + discard_size = MAX(discard_size, 1); + + if (ram_block_discard_range(rb, rb_offset, discard_size * + HV_BALLOON_PAGE_SIZE) != 0) { + warn_report("guest reported removed page failed discard"); + } + + pageoff += discard_size; + } + + return false; +} + +static void hv_balloon_handle_remove_host_addr_tree(PageRangeTree tree) +{ + g_tree_foreach(tree.t, hv_balloon_handle_remove_host_addr_node, NULL); +} + +static int hv_balloon_handle_remove_section(PageRangeTree tree, + const MemoryRegionSection *section, + uint64_t count) +{ + void *addr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region; + uint64_t addr_page; + + assert(count > 0); + + if ((uintptr_t)addr % HV_BALLOON_PAGE_SIZE) { + warn_report("guest reported removed pages at an unaligned host addr %p", + addr); + return -EINVAL; + } + + addr_page = (uintptr_t)addr / HV_BALLOON_PAGE_SIZE; + hvb_page_range_tree_insert(tree, addr_page, count, NULL); + + return 0; +} + +static void hv_balloon_handle_remove_ranges(HvBalloon *balloon, + union dm_mem_page_range ranges[], + uint32_t count) +{ + uint64_t removedcnt; + PageRangeTree removed_host_addr; + PageRange range_guest, range_both; + + hvb_page_range_tree_init(&removed_host_addr); + range_guest.count = range_both.count = removedcnt = 0; + for (unsigned int ctr = 0; ctr < count; ctr++) { + union dm_mem_page_range *mr = &ranges[ctr]; + hwaddr pa; + MemoryRegionSection section; + + for (unsigned int offset = 0; offset < mr->finfo.page_cnt; ) { + int ret; + uint64_t pageno = mr->finfo.start_page + offset; + uint64_t pagecnt = 1; + + pa = (hwaddr)pageno << HV_BALLOON_PFN_SHIFT; + section = memory_region_find(get_system_memory(), pa, + (mr->finfo.page_cnt - offset) * + HV_BALLOON_PAGE_SIZE); + if (!section.mr) { + warn_report("guest reported removed page %"PRIu64" not found in RAM", + pageno); + ret = -EINVAL; + goto finish_page; + } + + pagecnt = int128_get64(section.size) / HV_BALLOON_PAGE_SIZE; + if (pagecnt <= 0) { + warn_report("guest reported removed page %"PRIu64" in a section smaller than page size", + pageno); + pagecnt = 1; /* skip the whole page */ + ret = -EINVAL; + goto finish_page; + } + + if (!memory_region_is_ram(section.mr) || + memory_region_is_rom(section.mr) || + memory_region_is_romd(section.mr)) { + warn_report("guest reported removed page %"PRIu64" in a section that is not an ordinary RAM", + pageno); + ret = -EINVAL; + goto finish_page; + } + + ret = hv_balloon_handle_remove_section(removed_host_addr, §ion, + pagecnt); + + finish_page: + if (ret == 0) { + hv_balloon_remove_response_handle_pages(balloon, + &range_both, + pageno, pagecnt, + true, &removedcnt); + } else { + hv_balloon_remove_response_handle_pages(balloon, + &range_guest, + pageno, pagecnt, + false, &removedcnt); + } + + if (section.mr) { + memory_region_unref(section.mr); + } + + offset += pagecnt; + } + } + + hv_balloon_remove_response_handle_range(balloon, &range_both, true, + &removedcnt); + hv_balloon_remove_response_handle_range(balloon, &range_guest, false, + &removedcnt); + + hv_balloon_handle_remove_host_addr_tree(removed_host_addr); + hvb_page_range_tree_destroy(&removed_host_addr); + + if (removedcnt > balloon->balloon_diff) { + warn_report("guest reported more pages removed than currently pending (%"PRIu64" vs %"PRIu64")", + removedcnt, balloon->balloon_diff); + balloon->balloon_diff = 0; + } else { + balloon->balloon_diff -= removedcnt; + } +} + +static bool hv_balloon_handle_msg_size(HvBalloonReq *req, size_t minsize, + const char *msgname) +{ + VMBusChanReq *vmreq = &req->vmreq; + uint32_t msglen = vmreq->msglen; + + if (msglen >= minsize) { + return true; + } + + warn_report("%s message too short (%u vs %zu), ignoring", msgname, + (unsigned int)msglen, minsize); + return false; +} + +static void hv_balloon_handle_version_request(HvBalloon *balloon, + HvBalloonReq *req, + StateDesc *stdesc) +{ + VMBusChanReq *vmreq = &req->vmreq; + struct dm_version_request *msgVr = vmreq->msg; + struct dm_version_response respVr; + + if (balloon->state != S_VERSION) { + warn_report("unexpected DM_VERSION_REQUEST in %d state", + balloon->state); + return; + } + + if (!hv_balloon_handle_msg_size(req, sizeof(*msgVr), + "DM_VERSION_REQUEST")) { + return; + } + + trace_hv_balloon_incoming_version(msgVr->version.major_version, + msgVr->version.minor_version); + + memset(&respVr, 0, sizeof(respVr)); + respVr.hdr.type = DM_VERSION_RESPONSE; + respVr.hdr.size = sizeof(respVr); + respVr.hdr.trans_id = msgVr->hdr.trans_id; + respVr.is_accepted = msgVr->version.version >= DYNMEM_PROTOCOL_VERSION_1 && + msgVr->version.version <= DYNMEM_PROTOCOL_VERSION_3; + + hv_balloon_send_packet(vmreq->chan, (struct dm_message *)&respVr); + + if (respVr.is_accepted) { + HV_BALLOON_STATE_DESC_SET(stdesc, S_CAPS); + } +} + +static void hv_balloon_handle_caps_report(HvBalloon *balloon, + HvBalloonReq *req, + StateDesc *stdesc) +{ + VMBusChanReq *vmreq = &req->vmreq; + struct dm_capabilities *msgCap = vmreq->msg; + struct dm_capabilities_resp_msg respCap; + + if (balloon->state != S_CAPS) { + warn_report("unexpected DM_CAPABILITIES_REPORT in %d state", + balloon->state); + return; + } + + if (!hv_balloon_handle_msg_size(req, sizeof(*msgCap), + "DM_CAPABILITIES_REPORT")) { + return; + } + + trace_hv_balloon_incoming_caps(msgCap->caps.caps); + balloon->caps = msgCap->caps; + + memset(&respCap, 0, sizeof(respCap)); + respCap.hdr.type = DM_CAPABILITIES_RESPONSE; + respCap.hdr.size = sizeof(respCap); + respCap.hdr.trans_id = msgCap->hdr.trans_id; + respCap.is_accepted = 1; + respCap.hot_remove = 1; + respCap.suppress_pressure_reports = !balloon->status_report.enabled; + hv_balloon_send_packet(vmreq->chan, (struct dm_message *)&respCap); + + timer_mod(&balloon->post_init_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + + HV_BALLOON_POST_INIT_WAIT); + + HV_BALLOON_STATE_DESC_SET(stdesc, S_POST_INIT_WAIT); +} + +static void hv_balloon_handle_status_report(HvBalloon *balloon, + HvBalloonReq *req) +{ + VMBusChanReq *vmreq = &req->vmreq; + struct dm_status *msgStatus = vmreq->msg; + + if (!hv_balloon_handle_msg_size(req, sizeof(*msgStatus), + "DM_STATUS_REPORT")) { + return; + } + + if (!balloon->status_report.enabled) { + return; + } + + balloon->status_report.committed = msgStatus->num_committed; + balloon->status_report.committed *= HV_BALLOON_PAGE_SIZE; + balloon->status_report.available = msgStatus->num_avail; + balloon->status_report.available *= HV_BALLOON_PAGE_SIZE; + balloon->status_report.received = true; + + qapi_event_send_hv_balloon_status_report(balloon->status_report.committed, + balloon->status_report.available); +} + +HvBalloonInfo *qmp_query_hv_balloon_status_report(Error **errp) +{ + HvBalloon *balloon; + HvBalloonInfo *info; + + balloon = HV_BALLOON(object_resolve_path_type("", TYPE_HV_BALLOON, NULL)); + if (!balloon) { + error_setg(errp, "no %s device present", TYPE_HV_BALLOON); + return NULL; + } + + if (!balloon->status_report.enabled) { + error_setg(errp, "guest memory status reporting not enabled"); + return NULL; + } + + if (!balloon->status_report.received) { + error_setg(errp, "no guest memory status report received yet"); + return NULL; + } + + info = g_malloc0(sizeof(*info)); + info->committed = balloon->status_report.committed; + info->available = balloon->status_report.available; + return info; +} + +static void hv_balloon_handle_unballoon_response(HvBalloon *balloon, + HvBalloonReq *req, + StateDesc *stdesc) +{ + VMBusChanReq *vmreq = &req->vmreq; + struct dm_unballoon_response *msgUrR = vmreq->msg; + + if (balloon->state != S_UNBALLOON_REPLY_WAIT) { + warn_report("unexpected DM_UNBALLOON_RESPONSE in %d state", + balloon->state); + return; + } + + if (!hv_balloon_handle_msg_size(req, sizeof(*msgUrR), + "DM_UNBALLOON_RESPONSE")) + return; + + trace_hv_balloon_incoming_unballoon(msgUrR->hdr.trans_id); + + balloon->trans_id++; + + if (balloon->hot_add_diff > 0) { + bool can_hot_add = balloon->caps.cap_bits.hot_add; + + assert(can_hot_add); + HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_SETUP); + } else { + HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE); + } +} + +static void hv_balloon_handle_hot_add_response(HvBalloon *balloon, + HvBalloonReq *req, + StateDesc *stdesc) +{ + PageRange *hot_add_range = &balloon->hot_add_range; + VMBusChanReq *vmreq = &req->vmreq; + struct dm_hot_add_response *msgHaR = vmreq->msg; + OurRange *our_range; + + if (balloon->state != S_HOT_ADD_REPLY_WAIT) { + warn_report("unexpected DM_HOT_ADD_RESPONSE in %d state", + balloon->state); + return; + } + + assert(balloon->our_range); + our_range = OUR_RANGE(balloon->our_range); + + if (!hv_balloon_handle_msg_size(req, sizeof(*msgHaR), + "DM_HOT_ADD_RESPONSE")) + return; + + trace_hv_balloon_incoming_hot_add(msgHaR->hdr.trans_id, msgHaR->result, + msgHaR->page_count); + + balloon->trans_id++; + + if (msgHaR->result) { + if (msgHaR->page_count > balloon->ha_current_count) { + warn_report("DM_HOT_ADD_RESPONSE page count higher than requested (%"PRIu32" vs %"PRIu64")", + msgHaR->page_count, balloon->ha_current_count); + msgHaR->page_count = balloon->ha_current_count; + } + + hvb_our_range_mark_added(our_range, msgHaR->page_count); + hot_add_range->start += msgHaR->page_count; + hot_add_range->count -= msgHaR->page_count; + } + + if (!msgHaR->result || msgHaR->page_count < balloon->ha_current_count) { + /* + * the current planned range was only partially hot-added, take note + * how much of it remains and don't attempt any further hot adds + */ + our_range_mark_remaining_unusable(our_range); + + goto ret_idle; + } + + /* any pages remaining to hot-add in our range? */ + if (hot_add_range->count > 0) { + HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_RB_WAIT); + return; + } + +ret_idle: + HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE); +} + +static void hv_balloon_handle_balloon_response(HvBalloon *balloon, + HvBalloonReq *req, + StateDesc *stdesc) +{ + VMBusChanReq *vmreq = &req->vmreq; + struct dm_balloon_response *msgBR = vmreq->msg; + + if (balloon->state != S_BALLOON_REPLY_WAIT) { + warn_report("unexpected DM_BALLOON_RESPONSE in %d state", + balloon->state); + return; + } + + if (!hv_balloon_handle_msg_size(req, sizeof(*msgBR), + "DM_BALLOON_RESPONSE")) + return; + + trace_hv_balloon_incoming_balloon(msgBR->hdr.trans_id, msgBR->range_count, + msgBR->more_pages); + + if (vmreq->msglen < sizeof(*msgBR) + + (uint64_t)sizeof(msgBR->range_array[0]) * msgBR->range_count) { + warn_report("DM_BALLOON_RESPONSE too short for the range count"); + return; + } + + if (msgBR->range_count == 0) { + /* The guest is already at its minimum size */ + balloon->balloon_diff = 0; + goto ret_end_trans; + } else { + hv_balloon_handle_remove_ranges(balloon, + msgBR->range_array, + msgBR->range_count); + } + + /* More responses expected? */ + if (msgBR->more_pages) { + return; + } + +ret_end_trans: + balloon->trans_id++; + + if (balloon->balloon_diff > 0) { + HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_RB_WAIT); + } else { + HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE); + } +} + +static void hv_balloon_handle_packet(HvBalloon *balloon, HvBalloonReq *req, + StateDesc *stdesc) +{ + VMBusChanReq *vmreq = &req->vmreq; + struct dm_message *msg = vmreq->msg; + + if (vmreq->msglen < sizeof(msg->hdr)) { + return; + } + + switch (msg->hdr.type) { + case DM_VERSION_REQUEST: + hv_balloon_handle_version_request(balloon, req, stdesc); + break; + + case DM_CAPABILITIES_REPORT: + hv_balloon_handle_caps_report(balloon, req, stdesc); + break; + + case DM_STATUS_REPORT: + hv_balloon_handle_status_report(balloon, req); + break; + + case DM_MEM_HOT_ADD_RESPONSE: + hv_balloon_handle_hot_add_response(balloon, req, stdesc); + break; + + case DM_UNBALLOON_RESPONSE: + hv_balloon_handle_unballoon_response(balloon, req, stdesc); + break; + + case DM_BALLOON_RESPONSE: + hv_balloon_handle_balloon_response(balloon, req, stdesc); + break; + + default: + warn_report("unknown DM message %u", msg->hdr.type); + break; + } +} + +static bool hv_balloon_recv_channel(HvBalloon *balloon, StateDesc *stdesc) +{ + VMBusChannel *chan; + HvBalloonReq *req; + + if (balloon->state == S_WAIT_RESET || + balloon->state == S_POST_RESET_CLOSED) { + return false; + } + + chan = hv_balloon_get_channel(balloon); + if (vmbus_channel_recv_start(chan)) { + return false; + } + + while ((req = vmbus_channel_recv_peek(chan, sizeof(*req)))) { + hv_balloon_handle_packet(balloon, req, stdesc); + vmbus_free_req(req); + vmbus_channel_recv_pop(chan); + + if (stdesc->state != S_NO_CHANGE) { + break; + } + } + + return vmbus_channel_recv_done(chan) > 0; +} + +/* old state handler -> new state transition (potential) */ +static bool hv_balloon_event_loop_state(HvBalloon *balloon) +{ + StateDesc state_new = HV_BALLOON_STATE_DESC_INIT; + + hv_balloon_handle_state(balloon, &state_new); + return hv_balloon_state_set(balloon, state_new.state, state_new.desc); +} + +/* VMBus message -> new state transition (potential) */ +static bool hv_balloon_event_loop_recv(HvBalloon *balloon) +{ + StateDesc state_new = HV_BALLOON_STATE_DESC_INIT; + bool any_recv, state_changed; + + any_recv = hv_balloon_recv_channel(balloon, &state_new); + state_changed = hv_balloon_state_set(balloon, + state_new.state, state_new.desc); + + return state_changed || any_recv; +} + +static void hv_balloon_event_loop(HvBalloon *balloon) +{ + bool state_repeat, recv_repeat; + + do { + state_repeat = hv_balloon_event_loop_state(balloon); + recv_repeat = hv_balloon_event_loop_recv(balloon); + } while (state_repeat || recv_repeat); +} + +static void hv_balloon_vmdev_chan_notify(VMBusChannel *chan) +{ + HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan)); + + hv_balloon_event_loop(balloon); +} + +static void hv_balloon_stat(void *opaque, BalloonInfo *info) +{ + HvBalloon *balloon = opaque; + info->actual = (hv_balloon_total_ram(balloon) - balloon->removed_both_ctr) + << HV_BALLOON_PFN_SHIFT; +} + +static void hv_balloon_to_target(void *opaque, ram_addr_t target) +{ + HvBalloon *balloon = opaque; + uint64_t target_pages = target >> HV_BALLOON_PFN_SHIFT; + + if (!target_pages) { + return; + } + + /* + * always set target_changed, even with unchanged target, as the user + * might be asking us to try again reaching it + */ + balloon->target = target_pages; + balloon->target_changed = true; + + hv_balloon_event_loop(balloon); +} + +static int hv_balloon_vmdev_open_channel(VMBusChannel *chan) +{ + HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan)); + + if (balloon->state != S_POST_RESET_CLOSED) { + warn_report("guest trying to open a DM channel in invalid %d state", + balloon->state); + return -EINVAL; + } + + HV_BALLOON_SET_STATE(balloon, S_VERSION); + hv_balloon_event_loop(balloon); + + return 0; +} + +static void hv_balloon_vmdev_close_channel(VMBusChannel *chan) +{ + HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan)); + + timer_del(&balloon->post_init_timer); + + /* Don't report stale data */ + balloon->status_report.received = false; + + HV_BALLOON_SET_STATE(balloon, S_WAIT_RESET); + hv_balloon_event_loop(balloon); +} + +static void hv_balloon_post_init_timer(void *opaque) +{ + HvBalloon *balloon = opaque; + + if (balloon->state != S_POST_INIT_WAIT) { + return; + } + + HV_BALLOON_SET_STATE(balloon, S_IDLE); + hv_balloon_event_loop(balloon); +} + +static void hv_balloon_system_reset_unrealize_common(HvBalloon *balloon) +{ + g_clear_pointer(&balloon->our_range, hvb_our_range_memslots_free); +} + +static void hv_balloon_system_reset(void *opaque) +{ + HvBalloon *balloon = HV_BALLOON(opaque); + + hv_balloon_system_reset_unrealize_common(balloon); +} + +static void hv_balloon_ensure_mr(HvBalloon *balloon) +{ + MemoryRegion *hostmem_mr; + + assert(balloon->hostmem); + + if (balloon->mr) { + return; + } + + hostmem_mr = host_memory_backend_get_memory(balloon->hostmem); + + balloon->mr = g_new0(MemoryRegion, 1); + memory_region_init(balloon->mr, OBJECT(balloon), TYPE_HV_BALLOON, + memory_region_size(hostmem_mr)); + + /* + * The VM can indicate an alignment up to 32 GiB. Memory device core can + * usually only handle/guarantee 1 GiB alignment. The user will have to + * specify a larger maxmem eventually. + * + * The memory device core will warn the user in case maxmem might have to be + * increased and will fail plugging the device if there is not sufficient + * space after alignment. + * + * TODO: we could do the alignment ourselves in a slightly bigger region. + * But this feels better, although the warning might be annoying. Maybe + * we can optimize that in the future (e.g., with such a device on the + * cmdline place/size the device memory region differently. + */ + balloon->mr->align = MAX(32 * GiB, memory_region_get_alignment(hostmem_mr)); +} + +static void hv_balloon_free_mr(HvBalloon *balloon) +{ + if (!balloon->mr) { + return; + } + + object_unparent(OBJECT(balloon->mr)); + g_clear_pointer(&balloon->mr, g_free); +} + +static void hv_balloon_vmdev_realize(VMBusDevice *vdev, Error **errp) +{ + ERRP_GUARD(); + HvBalloon *balloon = HV_BALLOON(vdev); + int ret; + + balloon->state = S_WAIT_RESET; + + ret = qemu_add_balloon_handler(hv_balloon_to_target, hv_balloon_stat, + balloon); + if (ret < 0) { + /* This also protects against having multiple hv-balloon instances */ + error_setg(errp, "Only one balloon device is supported"); + return; + } + + if (balloon->hostmem) { + if (host_memory_backend_is_mapped(balloon->hostmem)) { + Object *obj = OBJECT(balloon->hostmem); + + error_setg(errp, "'%s' property specifies a busy memdev: %s", + HV_BALLOON_MEMDEV_PROP, + object_get_canonical_path_component(obj)); + goto out_balloon_handler; + } + + hv_balloon_ensure_mr(balloon); + + /* This is rather unlikely to happen, but let's still check for it. */ + if (!QEMU_IS_ALIGNED(memory_region_size(balloon->mr), + HV_BALLOON_PAGE_SIZE)) { + error_setg(errp, "'%s' property memdev size has to be a multiple of 0x%" PRIx64, + HV_BALLOON_MEMDEV_PROP, (uint64_t)HV_BALLOON_PAGE_SIZE); + goto out_balloon_handler; + } + + host_memory_backend_set_mapped(balloon->hostmem, true); + vmstate_register_ram(host_memory_backend_get_memory(balloon->hostmem), + DEVICE(balloon)); + } else if (balloon->addr) { + error_setg(errp, "'%s' property must not be set without a memdev", + HV_BALLOON_MEMDEV_PROP); + goto out_balloon_handler; + } + + timer_init_ms(&balloon->post_init_timer, QEMU_CLOCK_VIRTUAL, + hv_balloon_post_init_timer, balloon); + + qemu_register_reset(hv_balloon_system_reset, balloon); + + return; + +out_balloon_handler: + qemu_remove_balloon_handler(balloon); +} + +/* + * VMBus device reset has to be implemented in case the guest decides to + * disconnect and reconnect to the VMBus without rebooting the whole system. + * + * However, the hot-added memory can't be removed here as Windows keeps on using + * it until the system is restarted, even after disconnecting from the VMBus. + */ +static void hv_balloon_vmdev_reset(VMBusDevice *vdev) +{ + HvBalloon *balloon = HV_BALLOON(vdev); + + if (balloon->state == S_POST_RESET_CLOSED) { + return; + } + + if (balloon->our_range) { + hvb_our_range_clear_removed_trees(OUR_RANGE(balloon->our_range)); + } + + hvb_page_range_tree_destroy(&balloon->removed_guest); + hvb_page_range_tree_destroy(&balloon->removed_both); + hvb_page_range_tree_init(&balloon->removed_guest); + hvb_page_range_tree_init(&balloon->removed_both); + + balloon->trans_id = 0; + balloon->removed_guest_ctr = 0; + balloon->removed_both_ctr = 0; + + HV_BALLOON_SET_STATE(balloon, S_POST_RESET_CLOSED); + hv_balloon_event_loop(balloon); +} + +/* + * Clean up things that were (possibly) allocated pre-realization, for example + * from memory_device_pre_plug(), so we don't leak them if the device don't + * actually get realized in the end. + */ +static void hv_balloon_unrealize_finalize_common(HvBalloon *balloon) +{ + hv_balloon_free_mr(balloon); + balloon->addr = 0; + + balloon->memslot_count = 0; +} + +static void hv_balloon_vmdev_unrealize(VMBusDevice *vdev) +{ + HvBalloon *balloon = HV_BALLOON(vdev); + + qemu_unregister_reset(hv_balloon_system_reset, balloon); + + hv_balloon_system_reset_unrealize_common(balloon); + + qemu_remove_balloon_handler(balloon); + + if (balloon->hostmem) { + vmstate_unregister_ram(host_memory_backend_get_memory(balloon->hostmem), + DEVICE(balloon)); + host_memory_backend_set_mapped(balloon->hostmem, false); + } + + hvb_page_range_tree_destroy(&balloon->removed_guest); + hvb_page_range_tree_destroy(&balloon->removed_both); + + hv_balloon_unrealize_finalize_common(balloon); +} + +static uint64_t hv_balloon_md_get_addr(const MemoryDeviceState *md) +{ + return object_property_get_uint(OBJECT(md), HV_BALLOON_ADDR_PROP, + &error_abort); +} + +static void hv_balloon_md_set_addr(MemoryDeviceState *md, uint64_t addr, + Error **errp) +{ + object_property_set_uint(OBJECT(md), HV_BALLOON_ADDR_PROP, addr, errp); +} + +static MemoryRegion *hv_balloon_md_get_memory_region(MemoryDeviceState *md, + Error **errp) +{ + HvBalloon *balloon = HV_BALLOON(md); + + if (!balloon->hostmem) { + return NULL; + } + + hv_balloon_ensure_mr(balloon); + + return balloon->mr; +} + +static void hv_balloon_md_fill_device_info(const MemoryDeviceState *md, + MemoryDeviceInfo *info) +{ + HvBalloonDeviceInfo *hi = g_new0(HvBalloonDeviceInfo, 1); + const HvBalloon *balloon = HV_BALLOON(md); + DeviceState *dev = DEVICE(md); + + if (dev->id) { + hi->id = g_strdup(dev->id); + } + + if (balloon->hostmem) { + hi->memdev = object_get_canonical_path(OBJECT(balloon->hostmem)); + hi->memaddr = balloon->addr; + hi->has_memaddr = true; + hi->max_size = memory_region_size(balloon->mr); + /* TODO: expose current provided size or something else? */ + } else { + hi->max_size = 0; + } + + info->u.hv_balloon.data = hi; + info->type = MEMORY_DEVICE_INFO_KIND_HV_BALLOON; +} + +static void hv_balloon_decide_memslots(MemoryDeviceState *md, + unsigned int limit) +{ + HvBalloon *balloon = HV_BALLOON(md); + MemoryRegion *hostmem_mr; + uint64_t region_size, memslot_size, memslots; + + /* We're called exactly once, before realizing the device. */ + assert(!balloon->memslot_count); + + /* We should not be called if we don't have a memory backend */ + assert(balloon->hostmem); + + hostmem_mr = host_memory_backend_get_memory(balloon->hostmem); + region_size = memory_region_size(hostmem_mr); + + assert(region_size > 0); + memslot_size = QEMU_ALIGN_UP(region_size / limit, + HV_BALLOON_HA_MEMSLOT_SIZE_ALIGN); + memslots = QEMU_ALIGN_UP(region_size, memslot_size) / memslot_size; + + if (memslots > 1) { + balloon->memslot_size = memslot_size; + } else { + balloon->memslot_size = region_size; + } + + assert(memslots <= UINT_MAX); + balloon->memslot_count = memslots; +} + +static unsigned int hv_balloon_get_memslots(MemoryDeviceState *md) +{ + const HvBalloon *balloon = HV_BALLOON(md); + + /* We're called after setting the suggested limit. */ + assert(balloon->memslot_count > 0); + + return balloon->memslot_count; +} + +static void hv_balloon_init(Object *obj) +{ +} + +static void hv_balloon_finalize(Object *obj) +{ + HvBalloon *balloon = HV_BALLOON(obj); + + hv_balloon_unrealize_finalize_common(balloon); +} + +static Property hv_balloon_properties[] = { + DEFINE_PROP_BOOL("status-report", HvBalloon, + status_report.enabled, false), + + /* MEMORY_DEVICE props */ + DEFINE_PROP_LINK(HV_BALLOON_MEMDEV_PROP, HvBalloon, hostmem, + TYPE_MEMORY_BACKEND, HostMemoryBackend *), + DEFINE_PROP_UINT64(HV_BALLOON_ADDR_PROP, HvBalloon, addr, 0), + + DEFINE_PROP_END_OF_LIST(), +}; + +static void hv_balloon_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VMBusDeviceClass *vdc = VMBUS_DEVICE_CLASS(klass); + MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass); + + device_class_set_props(dc, hv_balloon_properties); + qemu_uuid_parse(HV_BALLOON_GUID, &vdc->classid); + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + + vdc->vmdev_realize = hv_balloon_vmdev_realize; + vdc->vmdev_unrealize = hv_balloon_vmdev_unrealize; + vdc->vmdev_reset = hv_balloon_vmdev_reset; + vdc->open_channel = hv_balloon_vmdev_open_channel; + vdc->close_channel = hv_balloon_vmdev_close_channel; + vdc->chan_notify_cb = hv_balloon_vmdev_chan_notify; + + mdc->get_addr = hv_balloon_md_get_addr; + mdc->set_addr = hv_balloon_md_set_addr; + mdc->get_plugged_size = memory_device_get_region_size; + mdc->get_memory_region = hv_balloon_md_get_memory_region; + mdc->decide_memslots = hv_balloon_decide_memslots; + mdc->get_memslots = hv_balloon_get_memslots; + mdc->fill_device_info = hv_balloon_md_fill_device_info; +} diff --git a/hw/hyperv/meson.build b/hw/hyperv/meson.build index b43f119ea5..d3d2668c71 100644 --- a/hw/hyperv/meson.build +++ b/hw/hyperv/meson.build @@ -2,3 +2,4 @@ specific_ss.add(when: 'CONFIG_HYPERV', if_true: files('hyperv.c')) specific_ss.add(when: 'CONFIG_HYPERV_TESTDEV', if_true: files('hyperv_testdev.c')) specific_ss.add(when: 'CONFIG_VMBUS', if_true: files('vmbus.c')) specific_ss.add(when: 'CONFIG_SYNDBG', if_true: files('syndbg.c')) +specific_ss.add(when: 'CONFIG_HV_BALLOON', if_true: files('hv-balloon.c', 'hv-balloon-page_range_tree.c', 'hv-balloon-our_range_memslots.c'), if_false: files('hv-balloon-stub.c')) diff --git a/hw/hyperv/trace-events b/hw/hyperv/trace-events index b4c35ca8e3..7963c215b1 100644 --- a/hw/hyperv/trace-events +++ b/hw/hyperv/trace-events @@ -16,3 +16,21 @@ vmbus_gpadl_torndown(uint32_t gpadl_id) "gpadl #%d" vmbus_open_channel(uint32_t chan_id, uint32_t gpadl_id, uint32_t target_vp) "channel #%d gpadl #%d target vp %d" vmbus_channel_open(uint32_t chan_id, uint32_t status) "channel #%d status %d" vmbus_close_channel(uint32_t chan_id) "channel #%d" + +# hv-balloon +hv_balloon_state_change(const char *tostr) "-> %s" +hv_balloon_incoming_version(uint16_t major, uint16_t minor) "incoming proto version %u.%u" +hv_balloon_incoming_caps(uint32_t caps) "incoming caps 0x%x" +hv_balloon_outgoing_unballoon(uint32_t trans_id, uint64_t count, uint64_t start, uint64_t rempages) "posting unballoon %"PRIu32" for %"PRIu64" @ 0x%"PRIx64", remaining %"PRIu64 +hv_balloon_incoming_unballoon(uint32_t trans_id) "incoming unballoon response %"PRIu32 +hv_balloon_outgoing_hot_add(uint32_t trans_id, uint64_t count, uint64_t start) "posting hot add %"PRIu32" for %"PRIu64" @ 0x%"PRIx64 +hv_balloon_incoming_hot_add(uint32_t trans_id, uint32_t result, uint32_t count) "incoming hot add response %"PRIu32", result %"PRIu32", count %"PRIu32 +hv_balloon_outgoing_balloon(uint32_t trans_id, uint64_t count, uint64_t rempages) "posting balloon %"PRIu32" for %"PRIu64", remaining %"PRIu64 +hv_balloon_incoming_balloon(uint32_t trans_id, uint32_t range_count, uint32_t more_pages) "incoming balloon response %"PRIu32", ranges %"PRIu32", more %"PRIu32 +hv_balloon_our_range_add(uint64_t count, uint64_t start) "adding our range %"PRIu64" @ 0x%"PRIx64 +hv_balloon_remove_response(uint64_t count, uint64_t start, unsigned int both) "processing remove response range %"PRIu64" @ 0x%"PRIx64", both %u" +hv_balloon_remove_response_hole(uint64_t counthole, uint64_t starthole, uint64_t countrange, uint64_t startrange, uint64_t starthpr, unsigned int both) "response range hole %"PRIu64" @ 0x%"PRIx64" from range %"PRIu64" @ 0x%"PRIx64", before our start 0x%"PRIx64", both %u" +hv_balloon_remove_response_common(uint64_t countcommon, uint64_t startcommon, uint64_t countrange, uint64_t startrange, uint64_t counthpr, uint64_t starthpr, uint64_t removed, unsigned int both) "response common range %"PRIu64" @ 0x%"PRIx64" from range %"PRIu64" @ 0x%"PRIx64" with our %"PRIu64" @ 0x%"PRIx64", removed %"PRIu64", both %u" +hv_balloon_remove_response_remainder(uint64_t count, uint64_t start, unsigned int both) "remove response remaining range %"PRIu64" @ 0x%"PRIx64", both %u" +hv_balloon_map_slot(unsigned int idx, unsigned int total_slots, uint64_t offset) "mapping memslot %u / %u @ 0x%"PRIx64 +hv_balloon_unmap_slot(unsigned int idx, unsigned int total_slots, uint64_t offset) "unmapping memslot %u / %u @ 0x%"PRIx64 diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index 94772c726b..55850791df 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -45,6 +45,7 @@ config PC select ACPI_VMGENID select VIRTIO_PMEM_SUPPORTED select VIRTIO_MEM_SUPPORTED + select HV_BALLOON_SUPPORTED config PC_PCI bool diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c index a731738411..b2b4be9983 100644 --- a/hw/i386/kvm/xen_evtchn.c +++ b/hw/i386/kvm/xen_evtchn.c @@ -490,6 +490,12 @@ int xen_evtchn_set_callback_param(uint64_t param) break; } + /* If the guest has set a per-vCPU callback vector, prefer that. */ + if (gsi && kvm_xen_has_vcpu_callback_vector()) { + in_kernel = kvm_xen_has_cap(EVTCHN_SEND); + gsi = 0; + } + if (!ret) { /* If vector delivery was turned *off* then tell the kernel */ if ((s->callback_param >> CALLBACK_VIA_TYPE_SHIFT) == @@ -1129,6 +1135,7 @@ int xen_evtchn_reset_op(struct evtchn_reset *reset) return -ESRCH; } + QEMU_IOTHREAD_LOCK_GUARD(); return xen_evtchn_soft_reset(); } diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c index 21c30e3659..839ec920a1 100644 --- a/hw/i386/kvm/xen_gnttab.c +++ b/hw/i386/kvm/xen_gnttab.c @@ -541,7 +541,5 @@ int xen_gnttab_reset(void) s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access; s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE); - memset(s->map_track, 0, s->max_frames * ENTRIES_PER_FRAME_V1); - return 0; } diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c index 660d0b72f9..8e716a7009 100644 --- a/hw/i386/kvm/xen_xenstore.c +++ b/hw/i386/kvm/xen_xenstore.c @@ -1357,10 +1357,12 @@ static void fire_watch_cb(void *opaque, const char *path, const char *token) } else { deliver_watch(s, path, token); /* - * If the message was queued because there was already ring activity, - * no need to wake the guest. But if not, we need to send the evtchn. + * Attempt to queue the message into the actual ring, and send + * the event channel notification if any bytes are copied. */ - xen_be_evtchn_notify(s->eh, s->be_port); + if (s->rsp_pending && put_rsp(s) > 0) { + xen_be_evtchn_notify(s->eh, s->be_port); + } } } diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 6031234a73..1aef21aa2c 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -27,6 +27,7 @@ #include "hw/i386/pc.h" #include "hw/char/serial.h" #include "hw/char/parallel.h" +#include "hw/hyperv/hv-balloon.h" #include "hw/i386/fw_cfg.h" #include "hw/i386/vmport.h" #include "sysemu/cpus.h" @@ -57,6 +58,7 @@ #include "hw/i386/kvm/xen_evtchn.h" #include "hw/i386/kvm/xen_gnttab.h" #include "hw/i386/kvm/xen_xenstore.h" +#include "hw/mem/memory-device.h" #include "e820_memory_layout.h" #include "trace.h" #include CONFIG_DEVICES @@ -1422,6 +1424,21 @@ static void pc_memory_unplug(HotplugHandler *hotplug_dev, error_propagate(errp, local_err); } +static void pc_hv_balloon_pre_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + /* The vmbus handler has no hotplug handler; we should never end up here. */ + g_assert(!dev->hotplugged); + memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL, + errp); +} + +static void pc_hv_balloon_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev)); +} + static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -1452,6 +1469,8 @@ static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, return; } pcms->iommu = dev; + } else if (object_dynamic_cast(OBJECT(dev), TYPE_HV_BALLOON)) { + pc_hv_balloon_pre_plug(hotplug_dev, dev, errp); } } @@ -1464,6 +1483,8 @@ static void pc_machine_device_plug_cb(HotplugHandler *hotplug_dev, x86_cpu_plug(hotplug_dev, dev, errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { virtio_md_pci_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_HV_BALLOON)) { + pc_hv_balloon_plug(hotplug_dev, dev, errp); } } @@ -1505,6 +1526,7 @@ static HotplugHandler *pc_get_hotplug_handler(MachineState *machine, object_dynamic_cast(OBJECT(dev), TYPE_CPU) || object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI) || object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_HV_BALLOON) || object_dynamic_cast(OBJECT(dev), TYPE_X86_IOMMU_DEVICE)) { return HOTPLUG_HANDLER(machine); } diff --git a/hw/mem/memory-device.c b/hw/mem/memory-device.c index ae38f48f16..e0704b8dc3 100644 --- a/hw/mem/memory-device.c +++ b/hw/mem/memory-device.c @@ -20,6 +20,22 @@ #include "exec/address-spaces.h" #include "trace.h" +static bool memory_device_is_empty(const MemoryDeviceState *md) +{ + const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); + Error *local_err = NULL; + MemoryRegion *mr; + + /* dropping const here is fine as we don't touch the memory region */ + mr = mdc->get_memory_region((MemoryDeviceState *)md, &local_err); + if (local_err) { + /* Not empty, we'll report errors later when ontaining the MR again. */ + error_free(local_err); + return false; + } + return !mr; +} + static gint memory_device_addr_sort(gconstpointer a, gconstpointer b) { const MemoryDeviceState *md_a = MEMORY_DEVICE(a); @@ -220,12 +236,6 @@ static uint64_t memory_device_get_free_addr(MachineState *ms, return 0; } - if (!QEMU_IS_ALIGNED(size, align)) { - error_setg(errp, "backend memory size must be multiple of 0x%" - PRIx64, align); - return 0; - } - if (hint) { if (range_init(&new, *hint, size) || !range_contains_range(&as, &new)) { error_setg(errp, "can't add memory device [0x%" PRIx64 ":0x%" PRIx64 @@ -249,6 +259,10 @@ static uint64_t memory_device_get_free_addr(MachineState *ms, uint64_t next_addr; Range tmp; + if (memory_device_is_empty(md)) { + continue; + } + range_init_nofail(&tmp, mdc->get_addr(md), memory_device_get_region_size(md, &error_abort)); @@ -292,6 +306,7 @@ MemoryDeviceInfoList *qmp_memory_device_list(void) const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(item->data); MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1); + /* Let's query infotmation even for empty memory devices. */ mdc->fill_device_info(md, info); QAPI_LIST_APPEND(tail, info); @@ -311,7 +326,7 @@ static int memory_device_plugged_size(Object *obj, void *opaque) const MemoryDeviceState *md = MEMORY_DEVICE(obj); const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(obj); - if (dev->realized) { + if (dev->realized && !memory_device_is_empty(md)) { *size += mdc->get_plugged_size(md, &error_abort); } } @@ -337,6 +352,11 @@ void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms, uint64_t addr, align = 0; MemoryRegion *mr; + /* We support empty memory devices even without device memory. */ + if (memory_device_is_empty(md)) { + return; + } + if (!ms->device_memory) { error_setg(errp, "the configuration is not prepared for memory devices" " (e.g., for memory hotplug), consider specifying the" @@ -380,10 +400,17 @@ out: void memory_device_plug(MemoryDeviceState *md, MachineState *ms) { const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); - const unsigned int memslots = memory_device_get_memslots(md); - const uint64_t addr = mdc->get_addr(md); + unsigned int memslots; + uint64_t addr; MemoryRegion *mr; + if (memory_device_is_empty(md)) { + return; + } + + memslots = memory_device_get_memslots(md); + addr = mdc->get_addr(md); + /* * We expect that a previous call to memory_device_pre_plug() succeeded, so * it can't fail at this point. @@ -408,6 +435,10 @@ void memory_device_unplug(MemoryDeviceState *md, MachineState *ms) const unsigned int memslots = memory_device_get_memslots(md); MemoryRegion *mr; + if (memory_device_is_empty(md)) { + return; + } + /* * We expect that a previous call to memory_device_pre_plug() succeeded, so * it can't fail at this point. diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c index cc24812d2e..c3512c2dae 100644 --- a/hw/virtio/virtio-pmem.c +++ b/hw/virtio/virtio-pmem.c @@ -147,7 +147,10 @@ static void virtio_pmem_fill_device_info(const VirtIOPMEM *pmem, static MemoryRegion *virtio_pmem_get_memory_region(VirtIOPMEM *pmem, Error **errp) { - assert(pmem->memdev); + if (!pmem->memdev) { + error_setg(errp, "'%s' property must be set", VIRTIO_PMEM_MEMDEV_PROP); + return NULL; + } return &pmem->memdev->mr; } |