From a890557d5a90b2c99988bc478bfd7f77392cfd8d Mon Sep 17 00:00:00 2001
From: Greg Kurz
Date: Fri, 12 Mar 2021 10:22:06 +0100
Subject: vhost-user: Drop misleading EAGAIN checks in slave_read()

slave_read() checks for EAGAIN when reading from or writing to the
socket fails. This gives the impression that the slave channel is in
non-blocking mode, which is certainly not the case with the current
code base. And the rest of the code isn't actually ready to cope with
non-blocking I/O.

Just drop the checks everywhere in this function for the sake of
clarity.

Signed-off-by: Greg Kurz
Message-Id: <20210312092212.782255-2-groug@kaod.org>
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Stefan Hajnoczi
---
 hw/virtio/vhost-user.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'hw/virtio/vhost-user.c')

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 2fdd5daf74..6af9b43a72 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -1420,7 +1420,7 @@ static void slave_read(void *opaque)
 
     do {
         size = recvmsg(u->slave_fd, &msgh, 0);
-    } while (size < 0 && (errno == EINTR || errno == EAGAIN));
+    } while (size < 0 && errno == EINTR);
 
     if (size != VHOST_USER_HDR_SIZE) {
         error_report("Failed to read from slave.");
@@ -1452,7 +1452,7 @@ static void slave_read(void *opaque)
     /* Read payload */
     do {
         size = read(u->slave_fd, &payload, hdr.size);
-    } while (size < 0 && (errno == EINTR || errno == EAGAIN));
+    } while (size < 0 && errno == EINTR);
 
     if (size != hdr.size) {
         error_report("Failed to read payload from slave.");
@@ -1503,7 +1503,7 @@ static void slave_read(void *opaque)
 
         do {
             size = writev(u->slave_fd, iovec, ARRAY_SIZE(iovec));
-        } while (size < 0 && (errno == EINTR || errno == EAGAIN));
+        } while (size < 0 && errno == EINTR);
 
         if (size != VHOST_USER_HDR_SIZE + hdr.size) {
             error_report("Failed to send msg reply to slave.");
--
cgit 1.4.1

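The retained check is the standard retry idiom for blocking
descriptors: only EINTR warrants a retry, since EAGAIN can only be
returned once a descriptor has been switched to non-blocking mode.
A rough standalone sketch of the idiom (not QEMU code; the helper name
is illustrative):

    #include <errno.h>
    #include <sys/types.h>
    #include <sys/socket.h>

    /*
     * Retry a blocking recvmsg() only when it was interrupted by a
     * signal. EAGAIN is never expected here: it would only be seen on
     * a non-blocking descriptor, which the slave channel is not.
     */
    static ssize_t recvmsg_eintr(int sockfd, struct msghdr *msgh)
    {
        ssize_t n;

        do {
            n = recvmsg(sockfd, msgh, 0);
        } while (n < 0 && errno == EINTR);

        return n; /* < 0 with errno set on a real error */
    }
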
From 9e06080bed293d94ec1f874e62f25f147b20bc6c Mon Sep 17 00:00:00 2001
From: Greg Kurz
Date: Fri, 12 Mar 2021 10:22:07 +0100
Subject: vhost-user: Fix double-close on slave_read() error path

Some message types, e.g. VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG, can
convey file descriptors. These must be closed before returning from
slave_read() to avoid being leaked. This can currently be done in two
different places:

[1] just after the request has been processed
[2] on the error path, under the goto label err:

These paths are supposed to be mutually exclusive, but they actually
are not. If the VHOST_USER_NEED_REPLY_MASK flag was passed and the
sending of the reply fails, both [1] and [2] are performed with the
same descriptor values. This can potentially cause subtle bugs if one
of the descriptors was recycled by some other thread in the meantime.

This code duplication complicates rollback for no real benefit. Do the
closing in a single place, under a new fdcleanup: goto label at the
end of the function.

Signed-off-by: Greg Kurz
Message-Id: <20210312092212.782255-3-groug@kaod.org>
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Stefan Hajnoczi
---
 hw/virtio/vhost-user.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'hw/virtio/vhost-user.c')

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 6af9b43a72..acde1d2936 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -1475,13 +1475,6 @@ static void slave_read(void *opaque)
         ret = -EINVAL;
     }
 
-    /* Close the remaining file descriptors. */
-    for (i = 0; i < fdsize; i++) {
-        if (fd[i] != -1) {
-            close(fd[i]);
-        }
-    }
-
     /*
      * REPLY_ACK feature handling. Other reply types has to be managed
      * directly in their request handlers.
@@ -1511,12 +1504,14 @@ static void slave_read(void *opaque)
         }
     }
 
-    return;
+    goto fdcleanup;
 
 err:
     qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL);
     close(u->slave_fd);
     u->slave_fd = -1;
+
+fdcleanup:
     for (i = 0; i < fdsize; i++) {
         if (fd[i] != -1) {
             close(fd[i]);
--
cgit 1.4.1

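The fix is an instance of the single-exit cleanup idiom: every path,
success or failure, funnels through one label that owns the close()
calls, so each descriptor is closed exactly once. A condensed sketch
of the resulting control flow (the helpers are hypothetical stand-ins,
not the actual QEMU functions):

    #include <unistd.h>

    /* hypothetical stand-ins for the request/reply/teardown steps */
    int process_request(int *fd, int nfds);
    int send_reply(void);
    void teardown_channel(void);

    static void handle_msg(int *fd, int nfds)
    {
        int i;

        if (process_request(fd, nfds) < 0) {
            goto err;
        }
        if (send_reply() < 0) {
            goto err; /* rollback must not close the fds itself */
        }
        goto fdcleanup;

    err:
        teardown_channel();

    fdcleanup:
        /* the single place where conveyed descriptors are closed */
        for (i = 0; i < nfds; i++) {
            if (fd[i] != -1) {
                close(fd[i]);
            }
        }
    }
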
From de62e4946052076186428900a85d6547627e84c6 Mon Sep 17 00:00:00 2001
From: Greg Kurz
Date: Fri, 12 Mar 2021 10:22:08 +0100
Subject: vhost-user: Factor out duplicated slave_fd teardown code

Signed-off-by: Greg Kurz
Message-Id: <20210312092212.782255-4-groug@kaod.org>
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Stefan Hajnoczi
---
 hw/virtio/vhost-user.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'hw/virtio/vhost-user.c')

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index acde1d2936..cb0c98f30a 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -1392,6 +1392,13 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
     return 0;
 }
 
+static void close_slave_channel(struct vhost_user *u)
+{
+    qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL);
+    close(u->slave_fd);
+    u->slave_fd = -1;
+}
+
 static void slave_read(void *opaque)
 {
     struct vhost_dev *dev = opaque;
@@ -1507,9 +1514,7 @@ static void slave_read(void *opaque)
     goto fdcleanup;
 
 err:
-    qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL);
-    close(u->slave_fd);
-    u->slave_fd = -1;
+    close_slave_channel(u);
 
 fdcleanup:
     for (i = 0; i < fdsize; i++) {
@@ -1560,9 +1565,7 @@ static int vhost_setup_slave_channel(struct vhost_dev *dev)
 out:
     close(sv[1]);
     if (ret) {
-        qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL);
-        close(u->slave_fd);
-        u->slave_fd = -1;
+        close_slave_channel(u);
     }
 
     return ret;
@@ -1915,9 +1918,7 @@ static int vhost_user_backend_cleanup(struct vhost_dev *dev)
         u->postcopy_fd.handler = NULL;
     }
     if (u->slave_fd >= 0) {
-        qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL);
-        close(u->slave_fd);
-        u->slave_fd = -1;
+        close_slave_channel(u);
     }
     g_free(u->region_rb);
     u->region_rb = NULL;
--
cgit 1.4.1

" Size %d exceeds the maximum %zu.", hdr.size, @@ -1457,12 +1435,8 @@ static void slave_read(void *opaque) } /* Read payload */ - do { - size = read(u->slave_fd, &payload, hdr.size); - } while (size < 0 && errno == EINTR); - - if (size != hdr.size) { - error_report("Failed to read payload from slave."); + if (qio_channel_read_all(ioc, (char *) &payload, hdr.size, &local_err)) { + error_report_err(local_err); goto err; } @@ -1475,7 +1449,7 @@ static void slave_read(void *opaque) break; case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG: ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area, - fd[0]); + fd ? fd[0] : -1); break; default: error_report("Received unexpected msg type: %d.", hdr.request); @@ -1501,12 +1475,8 @@ static void slave_read(void *opaque) iovec[1].iov_base = &payload; iovec[1].iov_len = hdr.size; - do { - size = writev(u->slave_fd, iovec, ARRAY_SIZE(iovec)); - } while (size < 0 && errno == EINTR); - - if (size != VHOST_USER_HDR_SIZE + hdr.size) { - error_report("Failed to send msg reply to slave."); + if (qio_channel_writev_all(ioc, iovec, ARRAY_SIZE(iovec), &local_err)) { + error_report_err(local_err); goto err; } } @@ -1515,14 +1485,15 @@ static void slave_read(void *opaque) err: close_slave_channel(u); + rc = G_SOURCE_REMOVE; fdcleanup: - for (i = 0; i < fdsize; i++) { - if (fd[i] != -1) { + if (fd) { + for (i = 0; i < fdsize; i++) { close(fd[i]); } } - return; + return rc; } static int vhost_setup_slave_channel(struct vhost_dev *dev) @@ -1535,6 +1506,8 @@ static int vhost_setup_slave_channel(struct vhost_dev *dev) int sv[2], ret = 0; bool reply_supported = virtio_has_feature(dev->protocol_features, VHOST_USER_PROTOCOL_F_REPLY_ACK); + Error *local_err = NULL; + QIOChannel *ioc; if (!virtio_has_feature(dev->protocol_features, VHOST_USER_PROTOCOL_F_SLAVE_REQ)) { @@ -1546,8 +1519,15 @@ static int vhost_setup_slave_channel(struct vhost_dev *dev) return -1; } - u->slave_fd = sv[0]; - qemu_set_fd_handler(u->slave_fd, slave_read, NULL, dev); + ioc = QIO_CHANNEL(qio_channel_socket_new_fd(sv[0], &local_err)); + if (!ioc) { + error_report_err(local_err); + return -1; + } + u->slave_ioc = ioc; + u->slave_src = qio_channel_add_watch_source(u->slave_ioc, + G_IO_IN | G_IO_HUP, + slave_read, dev, NULL, NULL); if (reply_supported) { msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK; @@ -1802,7 +1782,6 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque) u = g_new0(struct vhost_user, 1); u->user = opaque; - u->slave_fd = -1; u->dev = dev; dev->opaque = u; @@ -1917,7 +1896,7 @@ static int vhost_user_backend_cleanup(struct vhost_dev *dev) close(u->postcopy_fd.fd); u->postcopy_fd.handler = NULL; } - if (u->slave_fd >= 0) { + if (u->slave_ioc) { close_slave_channel(u); } g_free(u->region_rb); -- cgit 1.4.1 From a7f523c7d114d445c5d83aecdba3efc038e5a692 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 12 Mar 2021 10:22:10 +0100 Subject: vhost-user: Introduce nested event loop in vhost_user_read() A deadlock condition potentially exists if a vhost-user process needs to request something to QEMU on the slave channel while processing a vhost-user message. This doesn't seem to affect any vhost-user implementation so far, but this is currently biting the upcoming enablement of DAX with virtio-fs. The issue is being observed when the guest does an emergency reboot while a mapping still exits in the DAX window, which is very easy to get with a busy enough workload (e.g. as simulated by blogbench [1]) : - QEMU sends VHOST_USER_GET_VRING_BASE to virtiofsd. 
From a7f523c7d114d445c5d83aecdba3efc038e5a692 Mon Sep 17 00:00:00 2001
From: Greg Kurz
Date: Fri, 12 Mar 2021 10:22:10 +0100
Subject: vhost-user: Introduce nested event loop in vhost_user_read()

A deadlock condition potentially exists if a vhost-user process needs
to request something from QEMU on the slave channel while processing a
vhost-user message.

This doesn't seem to affect any vhost-user implementation so far, but
it is currently biting the upcoming enablement of DAX with virtio-fs.
The issue is observed when the guest does an emergency reboot while a
mapping still exists in the DAX window, which is very easy to get with
a busy enough workload (e.g. as simulated by blogbench [1]):

- QEMU sends VHOST_USER_GET_VRING_BASE to virtiofsd.

- In order to complete the request, virtiofsd then asks QEMU to remove
  the mapping on the slave channel.

All these dialogs are synchronous, hence the deadlock.

As pointed out by Stefan Hajnoczi:

When QEMU's vhost-user master implementation sends a vhost-user
protocol message, vhost_user_read() does a "blocking" read during
which slave_fd is not monitored by QEMU.

The natural solution for this issue is an event loop. The main event
loop cannot be nested though, since we have no guarantees that its fd
handlers are prepared for re-entrancy.

Introduce a new event loop that, for now, only monitors the chardev
I/O in vhost_user_read(), and push the actual reading to a one-shot
handler. A subsequent patch will teach the loop to monitor and process
messages from the slave channel as well.

[1] https://github.com/jedisct1/Blogbench

Suggested-by: Stefan Hajnoczi
Signed-off-by: Greg Kurz
Message-Id: <20210312092212.782255-6-groug@kaod.org>
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Stefan Hajnoczi
---
 hw/virtio/vhost-user.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 60 insertions(+), 5 deletions(-)

(limited to 'hw/virtio/vhost-user.c')

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 3c1e1611b0..00256fa318 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -296,15 +296,27 @@ static int vhost_user_read_header(struct vhost_dev *dev, VhostUserMsg *msg)
     return 0;
 }
 
-static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
+struct vhost_user_read_cb_data {
+    struct vhost_dev *dev;
+    VhostUserMsg *msg;
+    GMainLoop *loop;
+    int ret;
+};
+
+static gboolean vhost_user_read_cb(GIOChannel *source, GIOCondition condition,
+                                   gpointer opaque)
 {
+    struct vhost_user_read_cb_data *data = opaque;
+    struct vhost_dev *dev = data->dev;
+    VhostUserMsg *msg = data->msg;
     struct vhost_user *u = dev->opaque;
     CharBackend *chr = u->user->chr;
     uint8_t *p = (uint8_t *) msg;
     int r, size;
 
     if (vhost_user_read_header(dev, msg) < 0) {
-        return -1;
+        data->ret = -1;
+        goto end;
     }
 
     /* validate message size is sane */
@@ -312,7 +324,8 @@ static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
         error_report("Failed to read msg header."
                      " Size %d exceeds the maximum %zu.", msg->hdr.size,
                      VHOST_USER_PAYLOAD_SIZE);
-        return -1;
+        data->ret = -1;
+        goto end;
     }
 
     if (msg->hdr.size) {
@@ -322,11 +335,53 @@ static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
         if (r != size) {
             error_report("Failed to read msg payload."
                          " Read %d instead of %d.", r, msg->hdr.size);
-            return -1;
+            data->ret = -1;
+            goto end;
         }
     }
 
-    return 0;
+end:
+    g_main_loop_quit(data->loop);
+    return G_SOURCE_REMOVE;
+}
+
+static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
+{
+    struct vhost_user *u = dev->opaque;
+    CharBackend *chr = u->user->chr;
+    GMainContext *prev_ctxt = chr->chr->gcontext;
+    GMainContext *ctxt = g_main_context_new();
+    GMainLoop *loop = g_main_loop_new(ctxt, FALSE);
+    struct vhost_user_read_cb_data data = {
+        .dev = dev,
+        .loop = loop,
+        .msg = msg,
+        .ret = 0
+    };
+
+    /*
+     * We want to be able to monitor the slave channel fd while waiting
+     * for chr I/O. This requires an event loop, but we can't nest the
+     * one to which chr is currently attached : its fd handlers might not
+     * be prepared for re-entrancy. So we create a new one and switch chr
+     * to use it.
+     */
+    qemu_chr_be_update_read_handlers(chr->chr, ctxt);
+    qemu_chr_fe_add_watch(chr, G_IO_IN | G_IO_HUP, vhost_user_read_cb, &data);
+
+    g_main_loop_run(loop);
+
+    /*
+     * Restore the previous event loop context. This also destroys/recreates
+     * event sources : this guarantees that all pending events in the original
+     * context that have been processed by the nested loop are purged.
+     */
+    qemu_chr_be_update_read_handlers(chr->chr, prev_ctxt);
+
+    g_main_loop_unref(loop);
+    g_main_context_unref(ctxt);
+
+    return data.ret;
 }
 
 static int process_message_reply(struct vhost_dev *dev,
--
cgit 1.4.1

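The nesting technique itself is plain GLib and can be sketched without
any QEMU types. Roughly, assuming chan is a GIOChannel carrying the
expected reply (function and field names are illustrative):

    #include <glib.h>

    struct wait_data {
        GMainLoop *loop;
        int ret;
    };

    static gboolean on_ready(GIOChannel *source, GIOCondition condition,
                             gpointer opaque)
    {
        struct wait_data *data = opaque;

        data->ret = 0;                /* the real code reads the reply here */
        g_main_loop_quit(data->loop); /* one-shot: unblock the caller */
        return G_SOURCE_REMOVE;
    }

    static int wait_for_reply(GIOChannel *chan)
    {
        GMainContext *ctxt = g_main_context_new();
        GMainLoop *loop = g_main_loop_new(ctxt, FALSE);
        struct wait_data data = { .loop = loop, .ret = -1 };
        GSource *src = g_io_create_watch(chan, G_IO_IN | G_IO_HUP);

        g_source_set_callback(src, (GSourceFunc)on_ready, &data, NULL);
        g_source_attach(src, ctxt); /* watch lives only in the new context */

        g_main_loop_run(loop);      /* blocks, but keeps servicing ctxt */

        g_source_unref(src);
        g_main_loop_unref(loop);
        g_main_context_unref(ctxt);
        return data.ret;
    }

Only sources attached to the fresh context can fire while the nested
loop runs, which is why the main loop's handlers need no re-entrancy
guarantees.
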
From db8a3772e300c1a656331a92da0785d81667dc81 Mon Sep 17 00:00:00 2001
From: Greg Kurz
Date: Fri, 12 Mar 2021 10:22:11 +0100
Subject: vhost-user: Monitor slave channel in vhost_user_read()

Now that everything is in place, have the nested event loop monitor
the slave channel. The source in the main event loop is destroyed and
recreated to ensure that any pending event for the slave channel that
was previously detected is purged. This guarantees that the main loop
won't invoke slave_read() based on an event that was already handled
by the nested loop.

Signed-off-by: Greg Kurz
Message-Id: <20210312092212.782255-7-groug@kaod.org>
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Stefan Hajnoczi
---
 hw/virtio/vhost-user.c | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

(limited to 'hw/virtio/vhost-user.c')

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 00256fa318..ded0c10453 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -345,6 +345,35 @@ end:
     return G_SOURCE_REMOVE;
 }
 
+static gboolean slave_read(QIOChannel *ioc, GIOCondition condition,
+                           gpointer opaque);
+
+/*
+ * This updates the read handler to use a new event loop context.
+ * Event sources are removed from the previous context : this ensures
+ * that events detected in the previous context are purged. They will
+ * be re-detected and processed in the new context.
+ */
+static void slave_update_read_handler(struct vhost_dev *dev,
+                                      GMainContext *ctxt)
+{
+    struct vhost_user *u = dev->opaque;
+
+    if (!u->slave_ioc) {
+        return;
+    }
+
+    if (u->slave_src) {
+        g_source_destroy(u->slave_src);
+        g_source_unref(u->slave_src);
+    }
+
+    u->slave_src = qio_channel_add_watch_source(u->slave_ioc,
+                                                G_IO_IN | G_IO_HUP,
+                                                slave_read, dev, NULL,
+                                                ctxt);
+}
+
 static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
 {
     struct vhost_user *u = dev->opaque;
@@ -366,6 +395,7 @@ static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
      * be prepared for re-entrancy. So we create a new one and switch chr
      * to use it.
      */
+    slave_update_read_handler(dev, ctxt);
     qemu_chr_be_update_read_handlers(chr->chr, ctxt);
     qemu_chr_fe_add_watch(chr, G_IO_IN | G_IO_HUP, vhost_user_read_cb, &data);
 
     g_main_loop_run(loop);
@@ -377,6 +407,7 @@ static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
      * context that have been processed by the nested loop are purged.
      */
     qemu_chr_be_update_read_handlers(chr->chr, prev_ctxt);
+    slave_update_read_handler(dev, NULL);
 
     g_main_loop_unref(loop);
     g_main_context_unref(ctxt);
@@ -1580,9 +1611,7 @@ static int vhost_setup_slave_channel(struct vhost_dev *dev)
         return -1;
     }
     u->slave_ioc = ioc;
-    u->slave_src = qio_channel_add_watch_source(u->slave_ioc,
-                                                G_IO_IN | G_IO_HUP,
-                                                slave_read, dev, NULL, NULL);
+    slave_update_read_handler(dev, NULL);
 
     if (reply_supported) {
         msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
--
cgit 1.4.1

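The destroy-and-recreate idiom generalizes beyond vhost-user:
destroying the old source discards any event it had already queued,
and the replacement source re-polls the descriptor from its new
context, so nothing can be dispatched twice. A stripped-down GLib
sketch (names illustrative):

    #include <glib.h>

    /*
     * Move a watch to another context. Destroying the old source drops
     * any pending, undelivered event; the new source re-detects it by
     * polling the descriptor again from the new context.
     */
    static GSource *move_watch(GSource *old_src, GIOChannel *chan,
                               GIOFunc cb, gpointer opaque,
                               GMainContext *ctxt)
    {
        GSource *src;

        if (old_src) {
            g_source_destroy(old_src);
            g_source_unref(old_src);
        }

        src = g_io_create_watch(chan, G_IO_IN | G_IO_HUP);
        g_source_set_callback(src, (GSourceFunc)cb, opaque, NULL);
        g_source_attach(src, ctxt); /* NULL means the default main context */
        return src;
    }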