From d884e2727849fc95d337262ec91be60165473c4a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:48 +0100 Subject: libvhost-user: Dynamically allocate memory for memory slots Let's prepare for increasing VHOST_USER_MAX_RAM_SLOTS by dynamically allocating dev->regions. We don't have any ABI guarantees (not dynamically linked), so we can simply change the layout of VuDev. Let's zero out the memory, just as we used to do. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-2-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a3b158c671..360c5366d6 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -2171,6 +2171,8 @@ vu_deinit(VuDev *dev) free(dev->vq); dev->vq = NULL; + free(dev->regions); + dev->regions = NULL; } bool @@ -2205,9 +2207,18 @@ vu_init(VuDev *dev, dev->backend_fd = -1; dev->max_queues = max_queues; + dev->regions = malloc(VHOST_USER_MAX_RAM_SLOTS * sizeof(dev->regions[0])); + if (!dev->regions) { + DPRINT("%s: failed to malloc mem regions\n", __func__); + return false; + } + memset(dev->regions, 0, VHOST_USER_MAX_RAM_SLOTS * sizeof(dev->regions[0])); + dev->vq = malloc(max_queues * sizeof(dev->vq[0])); if (!dev->vq) { DPRINT("%s: failed to malloc virtqueues\n", __func__); + free(dev->regions); + dev->regions = NULL; return false; } -- cgit 1.4.1 From bec5820908919f70f87b57b92e4c92be128f5cfd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:50 +0100 Subject: libvhost-user: Factor out removing all mem regions Let's factor it out. Note that the check for MAP_FAILED was wrong as we never set mmap_addr if mmap() failed. We'll remove the NULL check separately. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-4-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 34 ++++++++++++++++--------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 360c5366d6..e4907dfc26 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -240,6 +240,22 @@ qva_to_va(VuDev *dev, uint64_t qemu_addr) return NULL; } +static void +vu_remove_all_mem_regs(VuDev *dev) +{ + unsigned int i; + + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *r = &dev->regions[i]; + void *ma = (void *)(uintptr_t)r->mmap_addr; + + if (ma) { + munmap(ma, r->size + r->mmap_offset); + } + } + dev->nregions = 0; +} + static void vmsg_close_fds(VhostUserMsg *vmsg) { @@ -1003,14 +1019,7 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) unsigned int i; VhostUserMemory m = vmsg->payload.memory, *memory = &m; - for (i = 0; i < dev->nregions; i++) { - VuDevRegion *r = &dev->regions[i]; - void *ma = (void *) (uintptr_t) r->mmap_addr; - - if (ma) { - munmap(ma, r->size + r->mmap_offset); - } - } + vu_remove_all_mem_regs(dev); dev->nregions = memory->nregions; if (dev->postcopy_listening) { @@ -2112,14 +2121,7 @@ vu_deinit(VuDev *dev) { unsigned int i; - for (i = 0; i < dev->nregions; i++) { - VuDevRegion *r = &dev->regions[i]; - void *m = (void *) (uintptr_t) r->mmap_addr; - if (m != MAP_FAILED) { - munmap(m, r->size + r->mmap_offset); - } - } - dev->nregions = 0; + vu_remove_all_mem_regs(dev); for (i = 0; i < dev->max_queues; i++) { VuVirtq *vq = &dev->vq[i]; -- cgit 1.4.1 From 05a58ce47167a95b554f9afc47f78f267c394c6e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:51 +0100 Subject: libvhost-user: Merge vu_set_mem_table_exec_postcopy() into vu_set_mem_table_exec() Let's reduce some code duplication and prepare for further changes. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-5-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 119 ++++++++++-------------------- 1 file changed, 39 insertions(+), 80 deletions(-) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index e4907dfc26..a7bd7de3cd 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -937,95 +937,23 @@ vu_get_shared_object(VuDev *dev, VhostUserMsg *vmsg) } static bool -vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) +vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) { - unsigned int i; VhostUserMemory m = vmsg->payload.memory, *memory = &m; - dev->nregions = memory->nregions; - - DPRINT("Nregions: %u\n", memory->nregions); - for (i = 0; i < dev->nregions; i++) { - void *mmap_addr; - VhostUserMemoryRegion *msg_region = &memory->regions[i]; - VuDevRegion *dev_region = &dev->regions[i]; - - DPRINT("Region %d\n", i); - DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", - msg_region->guest_phys_addr); - DPRINT(" memory_size: 0x%016"PRIx64"\n", - msg_region->memory_size); - DPRINT(" userspace_addr 0x%016"PRIx64"\n", - msg_region->userspace_addr); - DPRINT(" mmap_offset 0x%016"PRIx64"\n", - msg_region->mmap_offset); - - dev_region->gpa = msg_region->guest_phys_addr; - dev_region->size = msg_region->memory_size; - dev_region->qva = msg_region->userspace_addr; - dev_region->mmap_offset = msg_region->mmap_offset; + int prot = PROT_READ | PROT_WRITE; + unsigned int i; - /* We don't use offset argument of mmap() since the - * mapped address has to be page aligned, and we use huge - * pages. + if (dev->postcopy_listening) { + /* * In postcopy we're using PROT_NONE here to catch anyone * accessing it before we userfault */ - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_NONE, MAP_SHARED | MAP_NORESERVE, - vmsg->fds[i], 0); - - if (mmap_addr == MAP_FAILED) { - vu_panic(dev, "region mmap error: %s", strerror(errno)); - } else { - dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - DPRINT(" mmap_addr: 0x%016"PRIx64"\n", - dev_region->mmap_addr); - } - - /* Return the address to QEMU so that it can translate the ufd - * fault addresses back. - */ - msg_region->userspace_addr = (uintptr_t)(mmap_addr + - dev_region->mmap_offset); - close(vmsg->fds[i]); - } - - /* Send the message back to qemu with the addresses filled in */ - vmsg->fd_num = 0; - if (!vu_send_reply(dev, dev->sock, vmsg)) { - vu_panic(dev, "failed to respond to set-mem-table for postcopy"); - return false; - } - - /* Wait for QEMU to confirm that it's registered the handler for the - * faults. 
- */ - if (!dev->read_msg(dev, dev->sock, vmsg) || - vmsg->size != sizeof(vmsg->payload.u64) || - vmsg->payload.u64 != 0) { - vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table"); - return false; + prot = PROT_NONE; } - /* OK, now we can go and register the memory and generate faults */ - (void)generate_faults(dev); - - return false; -} - -static bool -vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) -{ - unsigned int i; - VhostUserMemory m = vmsg->payload.memory, *memory = &m; - vu_remove_all_mem_regs(dev); dev->nregions = memory->nregions; - if (dev->postcopy_listening) { - return vu_set_mem_table_exec_postcopy(dev, vmsg); - } - DPRINT("Nregions: %u\n", memory->nregions); for (i = 0; i < dev->nregions; i++) { void *mmap_addr; @@ -1051,8 +979,7 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) * mapped address has to be page aligned, and we use huge * pages. */ mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, - vmsg->fds[i], 0); + prot, MAP_SHARED | MAP_NORESERVE, vmsg->fds[i], 0); if (mmap_addr == MAP_FAILED) { vu_panic(dev, "region mmap error: %s", strerror(errno)); @@ -1062,9 +989,41 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) dev_region->mmap_addr); } + if (dev->postcopy_listening) { + /* + * Return the address to QEMU so that it can translate the ufd + * fault addresses back. + */ + msg_region->userspace_addr = (uintptr_t)(mmap_addr + + dev_region->mmap_offset); + } close(vmsg->fds[i]); } + if (dev->postcopy_listening) { + /* Send the message back to qemu with the addresses filled in */ + vmsg->fd_num = 0; + if (!vu_send_reply(dev, dev->sock, vmsg)) { + vu_panic(dev, "failed to respond to set-mem-table for postcopy"); + return false; + } + + /* + * Wait for QEMU to confirm that it's registered the handler for the + * faults. + */ + if (!dev->read_msg(dev, dev->sock, vmsg) || + vmsg->size != sizeof(vmsg->payload.u64) || + vmsg->payload.u64 != 0) { + vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table"); + return false; + } + + /* OK, now we can go and register the memory and generate faults */ + (void)generate_faults(dev); + return false; + } + for (i = 0; i < dev->max_queues; i++) { if (dev->vq[i].vring.desc) { if (map_ring(dev, &dev->vq[i])) { -- cgit 1.4.1 From 93fec23d8cecebf0e6917044a0c1635df20e350d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:52 +0100 Subject: libvhost-user: Factor out adding a memory region Let's factor it out, reducing quite some code duplication and preparing for further changes. If we fail to mmap a region and panic, we now simply don't add that (broken) region. Note that we now increment dev->nregions only as memory regions are successfully added, and don't increment it if anything goes wrong. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-6-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 168 +++++++++++------------------- 1 file changed, 60 insertions(+), 108 deletions(-) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a7bd7de3cd..f43b5096d0 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -256,6 +256,61 @@ vu_remove_all_mem_regs(VuDev *dev) dev->nregions = 0; } +static void +_vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) +{ + int prot = PROT_READ | PROT_WRITE; + VuDevRegion *r; + void *mmap_addr; + + DPRINT("Adding region %d\n", dev->nregions); + DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", + msg_region->guest_phys_addr); + DPRINT(" memory_size: 0x%016"PRIx64"\n", + msg_region->memory_size); + DPRINT(" userspace_addr: 0x%016"PRIx64"\n", + msg_region->userspace_addr); + DPRINT(" mmap_offset: 0x%016"PRIx64"\n", + msg_region->mmap_offset); + + if (dev->postcopy_listening) { + /* + * In postcopy we're using PROT_NONE here to catch anyone + * accessing it before we userfault + */ + prot = PROT_NONE; + } + + /* + * We don't use offset argument of mmap() since the mapped address has + * to be page aligned, and we use huge pages. + */ + mmap_addr = mmap(0, msg_region->memory_size + msg_region->mmap_offset, + prot, MAP_SHARED | MAP_NORESERVE, fd, 0); + if (mmap_addr == MAP_FAILED) { + vu_panic(dev, "region mmap error: %s", strerror(errno)); + return; + } + DPRINT(" mmap_addr: 0x%016"PRIx64"\n", + (uint64_t)(uintptr_t)mmap_addr); + + r = &dev->regions[dev->nregions]; + r->gpa = msg_region->guest_phys_addr; + r->size = msg_region->memory_size; + r->qva = msg_region->userspace_addr; + r->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + r->mmap_offset = msg_region->mmap_offset; + dev->nregions++; + + if (dev->postcopy_listening) { + /* + * Return the address to QEMU so that it can translate the ufd + * fault addresses back. + */ + msg_region->userspace_addr = r->mmap_addr + r->mmap_offset; + } +} + static void vmsg_close_fds(VhostUserMsg *vmsg) { @@ -727,10 +782,7 @@ generate_faults(VuDev *dev) { static bool vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { int i; - bool track_ramblocks = dev->postcopy_listening; VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; - VuDevRegion *dev_region = &dev->regions[dev->nregions]; - void *mmap_addr; if (vmsg->fd_num != 1) { vmsg_close_fds(vmsg); @@ -760,69 +812,20 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { * we know all the postcopy client bases have been received, and we * should start generating faults. */ - if (track_ramblocks && + if (dev->postcopy_listening && vmsg->size == sizeof(vmsg->payload.u64) && vmsg->payload.u64 == 0) { (void)generate_faults(dev); return false; } - DPRINT("Adding region: %u\n", dev->nregions); - DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", - msg_region->guest_phys_addr); - DPRINT(" memory_size: 0x%016"PRIx64"\n", - msg_region->memory_size); - DPRINT(" userspace_addr 0x%016"PRIx64"\n", - msg_region->userspace_addr); - DPRINT(" mmap_offset 0x%016"PRIx64"\n", - msg_region->mmap_offset); - - dev_region->gpa = msg_region->guest_phys_addr; - dev_region->size = msg_region->memory_size; - dev_region->qva = msg_region->userspace_addr; - dev_region->mmap_offset = msg_region->mmap_offset; - - /* - * We don't use offset argument of mmap() since the - * mapped address has to be page aligned, and we use huge - * pages. 
- */ - if (track_ramblocks) { - /* - * In postcopy we're using PROT_NONE here to catch anyone - * accessing it before we userfault. - */ - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_NONE, MAP_SHARED | MAP_NORESERVE, - vmsg->fds[0], 0); - } else { - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, - vmsg->fds[0], 0); - } - - if (mmap_addr == MAP_FAILED) { - vu_panic(dev, "region mmap error: %s", strerror(errno)); - } else { - dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - DPRINT(" mmap_addr: 0x%016"PRIx64"\n", - dev_region->mmap_addr); - } - + _vu_add_mem_reg(dev, msg_region, vmsg->fds[0]); close(vmsg->fds[0]); - if (track_ramblocks) { - /* - * Return the address to QEMU so that it can translate the ufd - * fault addresses back. - */ - msg_region->userspace_addr = (uintptr_t)(mmap_addr + - dev_region->mmap_offset); - + if (dev->postcopy_listening) { /* Send the message back to qemu with the addresses filled in. */ vmsg->fd_num = 0; DPRINT("Successfully added new region in postcopy\n"); - dev->nregions++; return true; } else { for (i = 0; i < dev->max_queues; i++) { @@ -835,7 +838,6 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { } DPRINT("Successfully added new region\n"); - dev->nregions++; return false; } } @@ -940,63 +942,13 @@ static bool vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) { VhostUserMemory m = vmsg->payload.memory, *memory = &m; - int prot = PROT_READ | PROT_WRITE; unsigned int i; - if (dev->postcopy_listening) { - /* - * In postcopy we're using PROT_NONE here to catch anyone - * accessing it before we userfault - */ - prot = PROT_NONE; - } - vu_remove_all_mem_regs(dev); - dev->nregions = memory->nregions; DPRINT("Nregions: %u\n", memory->nregions); - for (i = 0; i < dev->nregions; i++) { - void *mmap_addr; - VhostUserMemoryRegion *msg_region = &memory->regions[i]; - VuDevRegion *dev_region = &dev->regions[i]; - - DPRINT("Region %d\n", i); - DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", - msg_region->guest_phys_addr); - DPRINT(" memory_size: 0x%016"PRIx64"\n", - msg_region->memory_size); - DPRINT(" userspace_addr 0x%016"PRIx64"\n", - msg_region->userspace_addr); - DPRINT(" mmap_offset 0x%016"PRIx64"\n", - msg_region->mmap_offset); - - dev_region->gpa = msg_region->guest_phys_addr; - dev_region->size = msg_region->memory_size; - dev_region->qva = msg_region->userspace_addr; - dev_region->mmap_offset = msg_region->mmap_offset; - - /* We don't use offset argument of mmap() since the - * mapped address has to be page aligned, and we use huge - * pages. */ - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - prot, MAP_SHARED | MAP_NORESERVE, vmsg->fds[i], 0); - - if (mmap_addr == MAP_FAILED) { - vu_panic(dev, "region mmap error: %s", strerror(errno)); - } else { - dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - DPRINT(" mmap_addr: 0x%016"PRIx64"\n", - dev_region->mmap_addr); - } - - if (dev->postcopy_listening) { - /* - * Return the address to QEMU so that it can translate the ufd - * fault addresses back. 
- */ - msg_region->userspace_addr = (uintptr_t)(mmap_addr + - dev_region->mmap_offset); - } + for (i = 0; i < memory->nregions; i++) { + _vu_add_mem_reg(dev, &memory->regions[i], vmsg->fds[i]); close(vmsg->fds[i]); } -- cgit 1.4.1 From 4f865c3b15a71dbeae999a10256742751410394e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:53 +0100 Subject: libvhost-user: No need to check for NULL when unmapping We never add a memory region if mmap() failed. Therefore, no need to check for NULL. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-7-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index f43b5096d0..225283f764 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -247,11 +247,8 @@ vu_remove_all_mem_regs(VuDev *dev) for (i = 0; i < dev->nregions; i++) { VuDevRegion *r = &dev->regions[i]; - void *ma = (void *)(uintptr_t)r->mmap_addr; - if (ma) { - munmap(ma, r->size + r->mmap_offset); - } + munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); } dev->nregions = 0; } @@ -888,11 +885,8 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { for (i = 0; i < dev->nregions; i++) { if (reg_equal(&dev->regions[i], msg_region)) { VuDevRegion *r = &dev->regions[i]; - void *ma = (void *) (uintptr_t) r->mmap_addr; - if (ma) { - munmap(ma, r->size + r->mmap_offset); - } + munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); /* * Shift all affected entries by 1 to close the hole at index i and -- cgit 1.4.1 From c6f90b785291389168b15fe797788bcbb5a7803a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:54 +0100 Subject: libvhost-user: Don't zero out memory for memory regions dev->nregions always covers only valid entries. Stop zeroing out other array elements that are unused. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-8-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 225283f764..2e8b611385 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -888,13 +888,9 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); - /* - * Shift all affected entries by 1 to close the hole at index i and - * zero out the last entry. - */ + /* Shift all affected entries by 1 to close the hole at index. 
*/ memmove(dev->regions + i, dev->regions + i + 1, sizeof(VuDevRegion) * (dev->nregions - i - 1)); - memset(dev->regions + dev->nregions - 1, 0, sizeof(VuDevRegion)); DPRINT("Successfully removed a region\n"); dev->nregions--; i--; @@ -2119,7 +2115,6 @@ vu_init(VuDev *dev, DPRINT("%s: failed to malloc mem regions\n", __func__); return false; } - memset(dev->regions, 0, VHOST_USER_MAX_RAM_SLOTS * sizeof(dev->regions[0])); dev->vq = malloc(max_queues * sizeof(dev->vq[0])); if (!dev->vq) { -- cgit 1.4.1 From 9c254cb413a631f6601e6e34429547759d7d63a6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:55 +0100 Subject: libvhost-user: Don't search for duplicates when removing memory regions We cannot have duplicate memory regions; if we did, something would be deeply flawed elsewhere. Let's just stop the search once we find an entry. We'll add more sanity checks when adding memory regions later. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-9-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 2e8b611385..7f29e01c30 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -896,8 +896,7 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { i--; found = true; - - /* Continue the search for eventual duplicates. */ + break; } } -- cgit 1.4.1 From 60ccdca42d96f730f57478caaf376da90c3d97f3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:56 +0100 Subject: libvhost-user: Factor out search for memory region by GPA and simplify Memory regions cannot overlap, and if we ever hit that case, something would be deeply flawed. For example, when vhost code in QEMU decides to increase the size of memory regions to cover full huge pages, it makes sure to never create overlaps, and if there were overlaps, it would bail out. QEMU commits 48d7c9757749 ("vhost: Merge sections added to temporary list"), c1ece84e7c93 ("vhost: Huge page align and merge") and e7b94a84b6cb ("vhost: Allow adjoining regions") introduced and clarified that handling, making such overlaps impossible. Consequently, each GPA can belong to at most one memory region, and anything else would not make sense. Let's factor out our search to prepare for further changes. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-10-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 79 ++++++++++++++++++------------- 1 file changed, 45 insertions(+), 34 deletions(-) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 7f29e01c30..d72f25396d 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -195,30 +195,47 @@ vu_panic(VuDev *dev, const char *msg, ...) */ } +/* Search for a memory region that covers this guest physical address. 
*/ +static VuDevRegion * +vu_gpa_to_mem_region(VuDev *dev, uint64_t guest_addr) +{ + unsigned int i; + + /* + * Memory regions cannot overlap in guest physical address space. Each + * GPA belongs to exactly one memory region, so there can only be one + * match. + */ + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *cur = &dev->regions[i]; + + if (guest_addr >= cur->gpa && guest_addr < cur->gpa + cur->size) { + return cur; + } + } + return NULL; +} + /* Translate guest physical address to our virtual address. */ void * vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr) { - unsigned int i; + VuDevRegion *r; if (*plen == 0) { return NULL; } - /* Find matching memory region. */ - for (i = 0; i < dev->nregions; i++) { - VuDevRegion *r = &dev->regions[i]; - - if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { - if ((guest_addr + *plen) > (r->gpa + r->size)) { - *plen = r->gpa + r->size - guest_addr; - } - return (void *)(uintptr_t) - guest_addr - r->gpa + r->mmap_addr + r->mmap_offset; - } + r = vu_gpa_to_mem_region(dev, guest_addr); + if (!r) { + return NULL; } - return NULL; + if ((guest_addr + *plen) > (r->gpa + r->size)) { + *plen = r->gpa + r->size - guest_addr; + } + return (void *)(uintptr_t)guest_addr - r->gpa + r->mmap_addr + + r->mmap_offset; } /* Translate qemu virtual address to our virtual address. */ @@ -854,8 +871,8 @@ static inline bool reg_equal(VuDevRegion *vudev_reg, static bool vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; - unsigned int i; - bool found = false; + unsigned int idx; + VuDevRegion *r; if (vmsg->fd_num > 1) { vmsg_close_fds(vmsg); @@ -882,28 +899,22 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { DPRINT(" mmap_offset 0x%016"PRIx64"\n", msg_region->mmap_offset); - for (i = 0; i < dev->nregions; i++) { - if (reg_equal(&dev->regions[i], msg_region)) { - VuDevRegion *r = &dev->regions[i]; - - munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); - - /* Shift all affected entries by 1 to close the hole at index. */ - memmove(dev->regions + i, dev->regions + i + 1, - sizeof(VuDevRegion) * (dev->nregions - i - 1)); - DPRINT("Successfully removed a region\n"); - dev->nregions--; - i--; - - found = true; - break; - } - } - - if (!found) { + r = vu_gpa_to_mem_region(dev, msg_region->guest_phys_addr); + if (!r || !reg_equal(r, msg_region)) { + vmsg_close_fds(vmsg); vu_panic(dev, "Specified region not found\n"); + return false; } + munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); + + idx = r - dev->regions; + assert(idx < dev->nregions); + /* Shift all affected entries by 1 to close the hole. */ + memmove(r, r + 1, sizeof(VuDevRegion) * (dev->nregions - idx - 1)); + DPRINT("Successfully removed a region\n"); + dev->nregions--; + vmsg_close_fds(vmsg); return false; -- cgit 1.4.1 From a3c0118c5a1e64d32a2208388b5ea90e2ec9d214 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:57 +0100 Subject: libvhost-user: Speedup gpa_to_mem_region() and vu_gpa_to_va() Let's speed up GPA to memory region / virtual address lookup. Store the memory regions ordered by guest physical addresses, and use binary search for address translation, as well as when adding/removing memory regions. Most importantly, this will speed up GPA->VA address translation when we have many memslots. 
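For illustration, the ordered-array lookup boils down to a textbook binary search over non-overlapping, GPA-sorted regions. A standalone sketch (simplified types and names, not part of the patch):

    #include <stdint.h>
    #include <stddef.h>

    typedef struct { uint64_t gpa; uint64_t size; } Region;

    static Region *
    find_region(Region *regions, unsigned int nregions, uint64_t guest_addr)
    {
        int low = 0;
        int high = (int)nregions - 1;

        while (low <= high) {
            int mid = low + (high - low) / 2;
            Region *cur = &regions[mid];

            /* Regions don't overlap, so there is at most one match. */
            if (guest_addr >= cur->gpa && guest_addr < cur->gpa + cur->size) {
                return cur;
            }
            if (guest_addr >= cur->gpa + cur->size) {
                low = mid + 1;  /* address lies above this region */
            } else {
                high = mid - 1; /* address lies below this region */
            }
        }
        return NULL;
    }
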
Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-11-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 49 ++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 4 deletions(-) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index d72f25396d..ef6353d847 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -199,19 +199,30 @@ vu_panic(VuDev *dev, const char *msg, ...) static VuDevRegion * vu_gpa_to_mem_region(VuDev *dev, uint64_t guest_addr) { - unsigned int i; + int low = 0; + int high = dev->nregions - 1; /* * Memory regions cannot overlap in guest physical address space. Each * GPA belongs to exactly one memory region, so there can only be one * match. + * + * We store our memory regions ordered by GPA and can simply perform a + * binary search. */ - for (i = 0; i < dev->nregions; i++) { - VuDevRegion *cur = &dev->regions[i]; + while (low <= high) { + unsigned int mid = low + (high - low) / 2; + VuDevRegion *cur = &dev->regions[mid]; if (guest_addr >= cur->gpa && guest_addr < cur->gpa + cur->size) { return cur; } + if (guest_addr >= cur->gpa + cur->size) { + low = mid + 1; + } + if (guest_addr < cur->gpa) { + high = mid - 1; + } } return NULL; } @@ -273,9 +284,14 @@ vu_remove_all_mem_regs(VuDev *dev) static void _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) { + const uint64_t start_gpa = msg_region->guest_phys_addr; + const uint64_t end_gpa = start_gpa + msg_region->memory_size; int prot = PROT_READ | PROT_WRITE; VuDevRegion *r; void *mmap_addr; + int low = 0; + int high = dev->nregions - 1; + unsigned int idx; DPRINT("Adding region %d\n", dev->nregions); DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", @@ -295,6 +311,29 @@ _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) prot = PROT_NONE; } + /* + * We will add memory regions into the array sorted by GPA. Perform a + * binary search to locate the insertion point: it will be at the low + * index. + */ + while (low <= high) { + unsigned int mid = low + (high - low) / 2; + VuDevRegion *cur = &dev->regions[mid]; + + /* Overlap of GPA addresses. */ + if (start_gpa < cur->gpa + cur->size && cur->gpa < end_gpa) { + vu_panic(dev, "regions with overlapping guest physical addresses"); + return; + } + if (start_gpa >= cur->gpa + cur->size) { + low = mid + 1; + } + if (start_gpa < cur->gpa) { + high = mid - 1; + } + } + idx = low; + /* * We don't use offset argument of mmap() since the mapped address has * to be page aligned, and we use huge pages. @@ -308,7 +347,9 @@ _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) DPRINT(" mmap_addr: 0x%016"PRIx64"\n", (uint64_t)(uintptr_t)mmap_addr); - r = &dev->regions[dev->nregions]; + /* Shift all affected entries by 1 to open a hole at idx. 
*/ + r = &dev->regions[idx]; + memmove(r + 1, r, sizeof(VuDevRegion) * (dev->nregions - idx)); r->gpa = msg_region->guest_phys_addr; r->size = msg_region->memory_size; r->qva = msg_region->userspace_addr; -- cgit 1.4.1 From b2b63008b311126ade0a44ddb488d3704dffa51f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:58 +0100 Subject: libvhost-user: Use most of mmap_offset as fd_offset In the past, QEMU would create memory regions that could partially cover hugetlb pages, making mmap() fail if we used the mmap_offset as an fd_offset. For that reason, we never used the mmap_offset as an offset into the fd and instead always mapped the fd from the very start. However, that can easily result in us mmap'ing large, unneeded parts of an fd, possibly repeatedly. QEMU nowadays does not create memory regions that partially cover huge pages -- it never really worked with postcopy. QEMU has handled merging of regions that partially cover huge pages (due to holes in boot memory) since 2018, in c1ece84e7c93 ("vhost: Huge page align and merge"). Let's be a bit careful and not unconditionally convert the mmap_offset into an fd_offset. Instead, let's simply detect the hugetlb size and pass as much as we can as fd_offset, making sure that we call mmap() with a properly aligned offset. With QEMU and a fully plugged virtio-mem device (50GiB using 50 memslots), the qemu-storage-daemon process consumes 1281GiB of VA space before this change and 58GiB after it. ================ Vhost user message ================ Request: VHOST_USER_ADD_MEM_REG (37) Flags: 0x9 Size: 40 Fds: 59 Adding region 4 guest_phys_addr: 0x0000000200000000 memory_size: 0x0000000040000000 userspace_addr: 0x00007fb73bffe000 old mmap_offset: 0x0000000080000000 fd_offset: 0x0000000080000000 new mmap_offset: 0x0000000000000000 mmap_addr: 0x00007f02f1bdc000 Successfully added new region ================ Vhost user message ================ Request: VHOST_USER_ADD_MEM_REG (37) Flags: 0x9 Size: 40 Fds: 59 Adding region 5 guest_phys_addr: 0x0000000240000000 memory_size: 0x0000000040000000 userspace_addr: 0x00007fb77bffe000 old mmap_offset: 0x00000000c0000000 fd_offset: 0x00000000c0000000 new mmap_offset: 0x0000000000000000 mmap_addr: 0x00007f0284000000 Successfully added new region Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-12-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 54 +++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 6 deletions(-) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index ef6353d847..55aef5fcc6 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -43,6 +43,8 @@ #include <fcntl.h> #include <sys/ioctl.h> #include <linux/vhost.h> +#include <sys/vfs.h> +#include <linux/magic.h> #ifdef __NR_userfaultfd #include <linux/userfaultfd.h> @@ -281,12 +283,32 @@ vu_remove_all_mem_regs(VuDev *dev) dev->nregions = 0; } +static size_t +get_fd_hugepagesize(int fd) +{ +#if defined(__linux__) + struct statfs fs; + int ret; + + do { + ret = fstatfs(fd, &fs); + } while (ret != 0 && errno == EINTR); + + if (!ret && (unsigned int)fs.f_type == HUGETLBFS_MAGIC) { + return fs.f_bsize; + } +#endif + return 0; +} + static void _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) { const uint64_t start_gpa = msg_region->guest_phys_addr; const uint64_t end_gpa = start_gpa + msg_region->memory_size; int prot = PROT_READ | PROT_WRITE; + uint64_t mmap_offset, fd_offset; + size_t hugepagesize; VuDevRegion *r; void *mmap_addr; int low = 0; @@ -300,7 +322,7 @@ _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) msg_region->memory_size); DPRINT(" userspace_addr: 0x%016"PRIx64"\n", msg_region->userspace_addr); - DPRINT(" mmap_offset: 0x%016"PRIx64"\n", + DPRINT(" old mmap_offset: 0x%016"PRIx64"\n", msg_region->mmap_offset); if (dev->postcopy_listening) { @@ -335,11 +357,31 @@ _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) idx = low; /* - * We don't use offset argument of mmap() since the mapped address has - * to be page aligned, and we use huge pages. + * Convert most of msg_region->mmap_offset to fd_offset. In almost all + * cases, this will leave us with mmap_offset == 0, mmap()'ing only + * what we really need. Only if a memory region would partially cover + * hugetlb pages, we'd get mmap_offset != 0, which usually doesn't happen + * anymore (i.e., modern QEMU). + * + * Note that mmap() with hugetlb would fail if the offset into the file + * is not aligned to the huge page size. */ - mmap_addr = mmap(0, msg_region->memory_size + msg_region->mmap_offset, - prot, MAP_SHARED | MAP_NORESERVE, fd, 0); + hugepagesize = get_fd_hugepagesize(fd); + if (hugepagesize) { + fd_offset = ALIGN_DOWN(msg_region->mmap_offset, hugepagesize); + mmap_offset = msg_region->mmap_offset - fd_offset; + } else { + fd_offset = msg_region->mmap_offset; + mmap_offset = 0; + } + + DPRINT(" fd_offset: 0x%016"PRIx64"\n", + fd_offset); + DPRINT(" new mmap_offset: 0x%016"PRIx64"\n", + mmap_offset); + + mmap_addr = mmap(0, msg_region->memory_size + mmap_offset, + prot, MAP_SHARED | MAP_NORESERVE, fd, fd_offset); if (mmap_addr == MAP_FAILED) { vu_panic(dev, "region mmap error: %s", strerror(errno)); return; @@ -354,7 +396,7 @@ _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) r->size = msg_region->memory_size; r->qva = msg_region->userspace_addr; r->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - r->mmap_offset = msg_region->mmap_offset; + r->mmap_offset = mmap_offset; dev->nregions++; if (dev->postcopy_listening) { -- cgit 1.4.1 From 2a29022768f1777d71e26b784a264323d1914dd6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:59 +0100 Subject: libvhost-user: Factor out vq usability check Let's factor it out to prepare for further changes. 
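As a rough caller-side sketch (hypothetical backend code, not part of the patch), every queue entry point then reduces to a single helper call instead of two open-coded unlikely() checks:

    /* Hypothetical caller of the helper this patch introduces. */
    if (!vu_is_vq_usable(dev, vq)) {
        return; /* device broken, or ring not mapped */
    }
    /* ... safe to dereference vq->vring.avail from here on ... */
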
Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-13-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 55aef5fcc6..ed0a978d4f 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -283,6 +283,12 @@ vu_remove_all_mem_regs(VuDev *dev) dev->nregions = 0; } +static bool +vu_is_vq_usable(VuDev *dev, VuVirtq *vq) +{ + return likely(!dev->broken) && likely(vq->vring.avail); +} + static size_t get_fd_hugepagesize(int fd) { @@ -2380,8 +2386,7 @@ vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, idx = vq->last_avail_idx; total_bufs = in_total = out_total = 0; - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { goto done; } @@ -2496,8 +2501,7 @@ vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes, bool vu_queue_empty(VuDev *dev, VuVirtq *vq) { - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return true; } @@ -2536,8 +2540,7 @@ vring_notify(VuDev *dev, VuVirtq *vq) static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync) { - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return; } @@ -2862,8 +2865,7 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz) unsigned int head; VuVirtqElement *elem; - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return NULL; } @@ -3020,8 +3022,7 @@ vu_queue_fill(VuDev *dev, VuVirtq *vq, { struct vring_used_elem uelem; - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return; } @@ -3050,8 +3051,7 @@ vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count) { uint16_t old, new; - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return; } -- cgit 1.4.1 From 67f4f663cd6179d57f3e5a558f1526c7dc8c6742 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:17:00 +0100 Subject: libvhost-user: Dynamically remap rings after (temporarily?) removing memory regions Currently, we try to remap all rings whenever we add a single new memory region. That doesn't quite make sense, because we already map rings when setting the ring address, and panic if that goes wrong. Likely, that handling was simply copied from set_mem_table code, where we actually have to remap all rings. Remapping all rings might require us to walk quite a lot of memory regions to perform the address translations. Ideally, we'd simply remove that remapping. However, let's be a bit careful. There might be weird corner cases where we temporarily remove a single memory region (e.g., to resize it), which happened to work so far. Further, a ring might be located on hotplugged memory; as the VM reboots, we might unplug that memory and hotplug memory again before resetting the ring addresses. So let's unmap affected rings as we remove a memory region, and try dynamically mapping the ring again when required. 
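A condensed sketch of the resulting flow (not the patch itself, which follows below; ring_addrs_were_set() is a hypothetical stand-in for the three vq->vra address checks):

    /* On VHOST_USER_REM_MEM_REG: forget rings the removed region backed. */
    unmap_rings(dev, r); /* sets vq->vring.{desc,used,avail} = NULL */

    /* On the next queue access, vu_is_vq_usable() remaps on demand: */
    if (!vq->vring.avail && ring_addrs_were_set(vq) && map_ring(dev, vq)) {
        /* map_ring() returns true on failure */
        vu_panic(dev, "remapping queue on access");
    }
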
Acked-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-14-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 107 ++++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 29 deletions(-) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index ed0a978d4f..61fb3050b3 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -283,10 +283,75 @@ vu_remove_all_mem_regs(VuDev *dev) dev->nregions = 0; } +static bool +map_ring(VuDev *dev, VuVirtq *vq) +{ + vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr); + vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr); + vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr); + + DPRINT("Setting virtq addresses:\n"); + DPRINT(" vring_desc at %p\n", vq->vring.desc); + DPRINT(" vring_used at %p\n", vq->vring.used); + DPRINT(" vring_avail at %p\n", vq->vring.avail); + + return !(vq->vring.desc && vq->vring.used && vq->vring.avail); +} + static bool vu_is_vq_usable(VuDev *dev, VuVirtq *vq) { - return likely(!dev->broken) && likely(vq->vring.avail); + if (unlikely(dev->broken)) { + return false; + } + + if (likely(vq->vring.avail)) { + return true; + } + + /* + * In corner cases, we might temporarily remove a memory region that + * mapped a ring. When removing a memory region we make sure to + * unmap any rings that would be impacted. Let's try to remap if we + * already succeeded mapping this ring once. + */ + if (!vq->vra.desc_user_addr || !vq->vra.used_user_addr || + !vq->vra.avail_user_addr) { + return false; + } + if (map_ring(dev, vq)) { + vu_panic(dev, "remapping queue on access"); + return false; + } + return true; +} + +static void +unmap_rings(VuDev *dev, VuDevRegion *r) +{ + int i; + + for (i = 0; i < dev->max_queues; i++) { + VuVirtq *vq = &dev->vq[i]; + const uintptr_t desc = (uintptr_t)vq->vring.desc; + const uintptr_t used = (uintptr_t)vq->vring.used; + const uintptr_t avail = (uintptr_t)vq->vring.avail; + + if (desc < r->mmap_addr || desc >= r->mmap_addr + r->size) { + continue; + } + if (used < r->mmap_addr || used >= r->mmap_addr + r->size) { + continue; + } + if (avail < r->mmap_addr || avail >= r->mmap_addr + r->size) { + continue; + } + + DPRINT("Unmapping rings of queue %d\n", i); + vq->vring.desc = NULL; + vq->vring.used = NULL; + vq->vring.avail = NULL; + } } static size_t @@ -786,21 +851,6 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg) return false; } -static bool -map_ring(VuDev *dev, VuVirtq *vq) -{ - vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr); - vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr); - vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr); - - DPRINT("Setting virtq addresses:\n"); - DPRINT(" vring_desc at %p\n", vq->vring.desc); - DPRINT(" vring_used at %p\n", vq->vring.used); - DPRINT(" vring_avail at %p\n", vq->vring.avail); - - return !(vq->vring.desc && vq->vring.used && vq->vring.avail); -} - static bool generate_faults(VuDev *dev) { unsigned int i; @@ -884,7 +934,6 @@ generate_faults(VuDev *dev) { static bool vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { - int i; VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; if (vmsg->fd_num != 1) { @@ -930,19 +979,9 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { vmsg->fd_num 
= 0; DPRINT("Successfully added new region in postcopy\n"); return true; - } else { - for (i = 0; i < dev->max_queues; i++) { - if (dev->vq[i].vring.desc) { - if (map_ring(dev, &dev->vq[i])) { - vu_panic(dev, "remapping queue %d for new memory region", - i); - } - } - } - - DPRINT("Successfully added new region\n"); - return false; } + DPRINT("Successfully added new region\n"); + return false; } static inline bool reg_equal(VuDevRegion *vudev_reg, @@ -995,6 +1034,16 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { return false; } + /* + * There might be valid cases where we temporarily remove memory regions + * to readd them again, or remove memory regions and don't use the rings + * anymore before we set the ring addresses and restart the device. + * + * Unmap all affected rings, remapping them on demand later. This should + * be a corner case. + */ + unmap_rings(dev, r); + munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); idx = r - dev->regions; -- cgit 1.4.1 From 52767e1063beaa17d59c739efd0b9c342923929d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:17:01 +0100 Subject: libvhost-user: Mark mmap'ed region memory as MADV_DONTDUMP We already use MADV_NORESERVE to deal with sparse memory regions. Let's also set madvise(MADV_DONTDUMP), otherwise a crash of the process can result in us allocating all memory in the mmap'ed region for dumping purposes. This change implies that the mmap'ed rings won't be included in a coredump. If ever required for debugging purposes, we could mark only the mapped rings MADV_DODUMP. Ignore errors during madvise() for now. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-15-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'subprojects/libvhost-user/libvhost-user.c') diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 61fb3050b3..a879149fef 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -460,6 +460,12 @@ _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) DPRINT(" mmap_addr: 0x%016"PRIx64"\n", (uint64_t)(uintptr_t)mmap_addr); +#if defined(__linux__) + /* Don't include all guest memory in a coredump. */ + madvise(mmap_addr, msg_region->memory_size + mmap_offset, + MADV_DONTDUMP); +#endif + /* Shift all affected entries by 1 to open a hole at idx. */ r = &dev->regions[idx]; memmove(r + 1, r, sizeof(VuDevRegion) * (dev->nregions - idx)); -- cgit 1.4.1