diff options
| -rw-r--r-- | .gitlab-ci.d/buildtest.yml | 9 | ||||
| -rw-r--r-- | accel/kvm/kvm-all.c | 10 | ||||
| -rw-r--r-- | accel/stubs/kvm-stub.c | 5 | ||||
| -rw-r--r-- | block/blkio.c | 3 | ||||
| -rw-r--r-- | block/block-backend.c | 22 | ||||
| -rw-r--r-- | hw/block/virtio-blk.c | 226 | ||||
| -rw-r--r-- | hw/scsi/scsi-bus.c | 63 | ||||
| -rw-r--r-- | hw/scsi/virtio-scsi.c | 7 | ||||
| -rw-r--r-- | hw/usb/dev-storage-classic.c | 5 | ||||
| -rw-r--r-- | hw/virtio/virtio.c | 42 | ||||
| -rw-r--r-- | include/block/aio.h | 7 | ||||
| -rw-r--r-- | include/hw/scsi/scsi.h | 5 | ||||
| -rw-r--r-- | include/hw/virtio/virtio-blk.h | 2 | ||||
| -rw-r--r-- | include/sysemu/kvm.h | 6 | ||||
| -rw-r--r-- | migration/migration.c | 48 | ||||
| -rw-r--r-- | migration/multifd-zlib.c | 11 | ||||
| -rw-r--r-- | migration/multifd-zstd.c | 11 | ||||
| -rw-r--r-- | migration/multifd.c | 778 | ||||
| -rw-r--r-- | migration/multifd.h | 59 | ||||
| -rw-r--r-- | migration/ram.c | 2 | ||||
| -rw-r--r-- | migration/trace-events | 2 | ||||
| -rw-r--r-- | qapi/qmp-dispatch.c | 7 | ||||
| -rwxr-xr-x | tests/qemu-iotests/check | 3 | ||||
| -rw-r--r-- | tests/qemu-iotests/testenv.py | 2 | ||||
| -rw-r--r-- | tests/qtest/migration-test.c | 2 |
25 files changed, 783 insertions, 554 deletions
diff --git a/.gitlab-ci.d/buildtest.yml b/.gitlab-ci.d/buildtest.yml index 79bbc8585b..f56df59c94 100644 --- a/.gitlab-ci.d/buildtest.yml +++ b/.gitlab-ci.d/buildtest.yml @@ -189,6 +189,8 @@ build-previous-qemu: TARGETS: x86_64-softmmu aarch64-softmmu before_script: - export QEMU_PREV_VERSION="$(sed 's/\([0-9.]*\)\.[0-9]*/v\1.0/' VERSION)" + - git remote add upstream https://gitlab.com/qemu-project/qemu + - git fetch upstream $QEMU_PREV_VERSION - git checkout $QEMU_PREV_VERSION after_script: - mv build build-previous @@ -217,9 +219,10 @@ build-previous-qemu: - QTEST_QEMU_BINARY_DST=./qemu-system-${TARGET} QTEST_QEMU_BINARY=../build/qemu-system-${TARGET} ./tests/qtest/migration-test -# This job is disabled until we release 9.0. The existing -# migration-test in 8.2 is broken on aarch64. The fix was already -# commited, but it will only take effect once 9.0 is out. +# This job needs to be disabled until we can have an aarch64 CPU model that +# will both (1) support both KVM and TCG, and (2) provide a stable ABI. +# Currently only "-cpu max" can provide (1), however it doesn't guarantee +# (2). Mark this test skipped until later. migration-compat-aarch64: extends: .migration-compat-common variables: diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 49e755ec4a..a8cecd040e 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -1119,6 +1119,11 @@ int kvm_vm_check_extension(KVMState *s, unsigned int extension) return ret; } +/* + * We track the poisoned pages to be able to: + * - replace them on VM reset + * - block a migration for a VM with a poisoned page + */ typedef struct HWPoisonPage { ram_addr_t ram_addr; QLIST_ENTRY(HWPoisonPage) list; @@ -1152,6 +1157,11 @@ void kvm_hwpoison_page_add(ram_addr_t ram_addr) QLIST_INSERT_HEAD(&hwpoison_page_list, page, list); } +bool kvm_hwpoisoned_mem(void) +{ + return !QLIST_EMPTY(&hwpoison_page_list); +} + static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size) { #if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c index 1b37d9a302..ca38172884 100644 --- a/accel/stubs/kvm-stub.c +++ b/accel/stubs/kvm-stub.c @@ -124,3 +124,8 @@ uint32_t kvm_dirty_ring_size(void) { return 0; } + +bool kvm_hwpoisoned_mem(void) +{ + return false; +} diff --git a/block/blkio.c b/block/blkio.c index bc2f21784c..882e1c297b 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -89,6 +89,9 @@ static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes) /* Pad size to reduce frequency of resize calls */ bytes += 128 * 1024; + /* Align the pool size to avoid blkio_alloc_mem_region() failure */ + bytes = QEMU_ALIGN_UP(bytes, s->mem_region_alignment); + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { int ret; diff --git a/block/block-backend.c b/block/block-backend.c index 209eb07528..9c4de79e6b 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -44,7 +44,7 @@ struct BlockBackend { char *name; int refcnt; BdrvChild *root; - AioContext *ctx; + AioContext *ctx; /* access with atomic operations only */ DriveInfo *legacy_dinfo; /* null unless created by drive_new() */ QTAILQ_ENTRY(BlockBackend) link; /* for block_backends */ QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */ @@ -2414,22 +2414,22 @@ void blk_op_unblock_all(BlockBackend *blk, Error *reason) } } +/** + * Return BB's current AioContext. Note that this context may change + * concurrently at any time, with one exception: If the BB has a root node + * attached, its context will only change through bdrv_try_change_aio_context(), + * which creates a drained section. Therefore, incrementing such a BB's + * in-flight counter will prevent its context from changing. + */ AioContext *blk_get_aio_context(BlockBackend *blk) { - BlockDriverState *bs; IO_CODE(); if (!blk) { return qemu_get_aio_context(); } - bs = blk_bs(blk); - if (bs) { - AioContext *ctx = bdrv_get_aio_context(blk_bs(blk)); - assert(ctx == blk->ctx); - } - - return blk->ctx; + return qatomic_read(&blk->ctx); } int blk_set_aio_context(BlockBackend *blk, AioContext *new_context, @@ -2442,7 +2442,7 @@ int blk_set_aio_context(BlockBackend *blk, AioContext *new_context, GLOBAL_STATE_CODE(); if (!bs) { - blk->ctx = new_context; + qatomic_set(&blk->ctx, new_context); return 0; } @@ -2471,7 +2471,7 @@ static void blk_root_set_aio_ctx_commit(void *opaque) AioContext *new_context = s->new_ctx; ThrottleGroupMember *tgm = &blk->public.throttle_group_member; - blk->ctx = new_context; + qatomic_set(&blk->ctx, new_context); if (tgm->throttle_state) { throttle_group_detach_aio_context(tgm); throttle_group_attach_aio_context(tgm, new_context); diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 227d83569f..738cb2ac36 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -37,6 +37,8 @@ #include "hw/virtio/virtio-blk-common.h" #include "qemu/coroutine.h" +static void virtio_blk_ioeventfd_attach(VirtIOBlock *s); + static void virtio_blk_init_request(VirtIOBlock *s, VirtQueue *vq, VirtIOBlockReq *req) { @@ -64,7 +66,7 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status) iov_discard_undo(&req->inhdr_undo); iov_discard_undo(&req->outhdr_undo); virtqueue_push(req->vq, &req->elem, req->in_len); - if (s->ioeventfd_started && !s->ioeventfd_disabled) { + if (qemu_in_iothread()) { virtio_notify_irqfd(vdev, req->vq); } else { virtio_notify(vdev, req->vq); @@ -661,6 +663,9 @@ static void virtio_blk_zone_report_complete(void *opaque, int ret) int64_t zrp_size, n, j = 0; int64_t nz = data->zone_report_data.nr_zones; int8_t err_status = VIRTIO_BLK_S_OK; + struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) { + .nr_zones = cpu_to_le64(nz), + }; trace_virtio_blk_zone_report_complete(vdev, req, nz, ret); if (ret) { @@ -668,9 +673,6 @@ static void virtio_blk_zone_report_complete(void *opaque, int ret) goto out; } - struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) { - .nr_zones = cpu_to_le64(nz), - }; zrp_size = sizeof(struct virtio_blk_zone_report) + sizeof(struct virtio_blk_zone_descriptor) * nz; n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr)); @@ -898,13 +900,14 @@ static int virtio_blk_handle_zone_append(VirtIOBlockReq *req, int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS; int64_t len = iov_size(out_iov, out_num); + ZoneCmdData *data; trace_virtio_blk_handle_zone_append(vdev, req, offset >> BDRV_SECTOR_BITS); if (!check_zoned_request(s, offset, len, true, &err_status)) { goto out; } - ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData)); + data = g_malloc(sizeof(ZoneCmdData)); data->req = req; data->in_iov = in_iov; data->in_num = in_num; @@ -1191,14 +1194,15 @@ static void virtio_blk_dma_restart_cb(void *opaque, bool running, { VirtIOBlock *s = opaque; uint16_t num_queues = s->conf.num_queues; + g_autofree VirtIOBlockReq **vq_rq = NULL; + VirtIOBlockReq *rq; if (!running) { return; } /* Split the device-wide s->rq request list into per-vq request lists */ - g_autofree VirtIOBlockReq **vq_rq = g_new0(VirtIOBlockReq *, num_queues); - VirtIOBlockReq *rq; + vq_rq = g_new0(VirtIOBlockReq *, num_queues); WITH_QEMU_LOCK_GUARD(&s->rq_lock) { rq = s->rq; @@ -1209,6 +1213,8 @@ static void virtio_blk_dma_restart_cb(void *opaque, bool running, VirtIOBlockReq *next = rq->next; uint16_t idx = virtio_get_queue_index(rq->vq); + /* Only num_queues vqs were created so vq_rq[idx] is within bounds */ + assert(idx < num_queues); rq->next = vq_rq[idx]; vq_rq[idx] = rq; rq = next; @@ -1485,68 +1491,6 @@ static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f, return 0; } -static bool -validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list, - uint16_t num_queues, Error **errp) -{ - g_autofree unsigned long *vqs = bitmap_new(num_queues); - g_autoptr(GHashTable) iothreads = - g_hash_table_new(g_str_hash, g_str_equal); - - for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) { - const char *name = node->value->iothread; - uint16List *vq; - - if (!iothread_by_id(name)) { - error_setg(errp, "IOThread \"%s\" object does not exist", name); - return false; - } - - if (!g_hash_table_add(iothreads, (gpointer)name)) { - error_setg(errp, - "duplicate IOThread name \"%s\" in iothread-vq-mapping", - name); - return false; - } - - if (node != list) { - if (!!node->value->vqs != !!list->value->vqs) { - error_setg(errp, "either all items in iothread-vq-mapping " - "must have vqs or none of them must have it"); - return false; - } - } - - for (vq = node->value->vqs; vq; vq = vq->next) { - if (vq->value >= num_queues) { - error_setg(errp, "vq index %u for IOThread \"%s\" must be " - "less than num_queues %u in iothread-vq-mapping", - vq->value, name, num_queues); - return false; - } - - if (test_and_set_bit(vq->value, vqs)) { - error_setg(errp, "cannot assign vq %u to IOThread \"%s\" " - "because it is already assigned", vq->value, name); - return false; - } - } - } - - if (list->value->vqs) { - for (uint16_t i = 0; i < num_queues; i++) { - if (!test_bit(i, vqs)) { - error_setg(errp, - "missing vq %u IOThread assignment in iothread-vq-mapping", - i); - return false; - } - } - } - - return true; -} - static void virtio_resize_cb(void *opaque) { VirtIODevice *vdev = opaque; @@ -1613,15 +1557,95 @@ static const BlockDevOps virtio_block_ops = { .drained_end = virtio_blk_drained_end, }; -/* Generate vq:AioContext mappings from a validated iothread-vq-mapping list */ -static void -apply_vq_mapping(IOThreadVirtQueueMappingList *iothread_vq_mapping_list, - AioContext **vq_aio_context, uint16_t num_queues) +static bool +validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list, + uint16_t num_queues, Error **errp) +{ + g_autofree unsigned long *vqs = bitmap_new(num_queues); + g_autoptr(GHashTable) iothreads = + g_hash_table_new(g_str_hash, g_str_equal); + + for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) { + const char *name = node->value->iothread; + uint16List *vq; + + if (!iothread_by_id(name)) { + error_setg(errp, "IOThread \"%s\" object does not exist", name); + return false; + } + + if (!g_hash_table_add(iothreads, (gpointer)name)) { + error_setg(errp, + "duplicate IOThread name \"%s\" in iothread-vq-mapping", + name); + return false; + } + + if (node != list) { + if (!!node->value->vqs != !!list->value->vqs) { + error_setg(errp, "either all items in iothread-vq-mapping " + "must have vqs or none of them must have it"); + return false; + } + } + + for (vq = node->value->vqs; vq; vq = vq->next) { + if (vq->value >= num_queues) { + error_setg(errp, "vq index %u for IOThread \"%s\" must be " + "less than num_queues %u in iothread-vq-mapping", + vq->value, name, num_queues); + return false; + } + + if (test_and_set_bit(vq->value, vqs)) { + error_setg(errp, "cannot assign vq %u to IOThread \"%s\" " + "because it is already assigned", vq->value, name); + return false; + } + } + } + + if (list->value->vqs) { + for (uint16_t i = 0; i < num_queues; i++) { + if (!test_bit(i, vqs)) { + error_setg(errp, + "missing vq %u IOThread assignment in iothread-vq-mapping", + i); + return false; + } + } + } + + return true; +} + +/** + * apply_iothread_vq_mapping: + * @iothread_vq_mapping_list: The mapping of virtqueues to IOThreads. + * @vq_aio_context: The array of AioContext pointers to fill in. + * @num_queues: The length of @vq_aio_context. + * @errp: If an error occurs, a pointer to the area to store the error. + * + * Fill in the AioContext for each virtqueue in the @vq_aio_context array given + * the iothread-vq-mapping parameter in @iothread_vq_mapping_list. + * + * Returns: %true on success, %false on failure. + **/ +static bool apply_iothread_vq_mapping( + IOThreadVirtQueueMappingList *iothread_vq_mapping_list, + AioContext **vq_aio_context, + uint16_t num_queues, + Error **errp) { IOThreadVirtQueueMappingList *node; size_t num_iothreads = 0; size_t cur_iothread = 0; + if (!validate_iothread_vq_mapping_list(iothread_vq_mapping_list, + num_queues, errp)) { + return false; + } + for (node = iothread_vq_mapping_list; node; node = node->next) { num_iothreads++; } @@ -1638,6 +1662,7 @@ apply_vq_mapping(IOThreadVirtQueueMappingList *iothread_vq_mapping_list, /* Explicit vq:IOThread assignment */ for (vq = node->value->vqs; vq; vq = vq->next) { + assert(vq->value < num_queues); vq_aio_context[vq->value] = ctx; } } else { @@ -1650,6 +1675,8 @@ apply_vq_mapping(IOThreadVirtQueueMappingList *iothread_vq_mapping_list, cur_iothread++; } + + return true; } /* Context: BQL held */ @@ -1660,6 +1687,13 @@ static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp) BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + if (conf->iothread && conf->iothread_vq_mapping_list) { + error_setg(errp, + "iothread and iothread-vq-mapping properties cannot be set " + "at the same time"); + return false; + } + if (conf->iothread || conf->iothread_vq_mapping_list) { if (!k->set_guest_notifiers || !k->ioeventfd_assign) { error_setg(errp, @@ -1685,8 +1719,14 @@ static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp) s->vq_aio_context = g_new(AioContext *, conf->num_queues); if (conf->iothread_vq_mapping_list) { - apply_vq_mapping(conf->iothread_vq_mapping_list, s->vq_aio_context, - conf->num_queues); + if (!apply_iothread_vq_mapping(conf->iothread_vq_mapping_list, + s->vq_aio_context, + conf->num_queues, + errp)) { + g_free(s->vq_aio_context); + s->vq_aio_context = NULL; + return false; + } } else if (conf->iothread) { AioContext *ctx = iothread_get_aio_context(conf->iothread); for (unsigned i = 0; i < conf->num_queues; i++) { @@ -1790,6 +1830,7 @@ static int virtio_blk_start_ioeventfd(VirtIODevice *vdev) * Try to change the AioContext so that block jobs and other operations can * co-locate their activity in the same AioContext. If it fails, nevermind. */ + assert(nvqs > 0); /* enforced during ->realize() */ r = blk_set_aio_context(s->conf.conf.blk, s->vq_aio_context[0], &local_err); if (r < 0) { @@ -1808,17 +1849,14 @@ static int virtio_blk_start_ioeventfd(VirtIODevice *vdev) s->ioeventfd_started = true; smp_wmb(); /* paired with aio_notify_accept() on the read side */ - /* Get this show started by hooking up our callbacks */ - for (i = 0; i < nvqs; i++) { - VirtQueue *vq = virtio_get_queue(vdev, i); - AioContext *ctx = s->vq_aio_context[i]; - - /* Kick right away to begin processing requests already in vring */ - event_notifier_set(virtio_queue_get_host_notifier(vq)); - - if (!blk_in_drain(s->conf.conf.blk)) { - virtio_queue_aio_attach_host_notifier(vq, ctx); - } + /* + * Get this show started by hooking up our callbacks. If drained now, + * virtio_blk_drained_end() will do this later. + * Attaching the notifier also kicks the virtqueues, processing any requests + * they may already have. + */ + if (!blk_in_drain(s->conf.conf.blk)) { + virtio_blk_ioeventfd_attach(s); } return 0; @@ -1924,6 +1962,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOBlock *s = VIRTIO_BLK(dev); VirtIOBlkConf *conf = &s->conf; + BlockDriverState *bs; Error *err = NULL; unsigned i; @@ -1969,7 +2008,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) return; } - BlockDriverState *bs = blk_bs(conf->conf.blk); + bs = blk_bs(conf->conf.blk); if (bs->bl.zoned != BLK_Z_NONE) { virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED); if (bs->bl.zoned == BLK_Z_HM) { @@ -1996,19 +2035,6 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) return; } - if (conf->iothread_vq_mapping_list) { - if (conf->iothread) { - error_setg(errp, "iothread and iothread-vq-mapping properties " - "cannot be set at the same time"); - return; - } - - if (!validate_iothread_vq_mapping_list(conf->iothread_vq_mapping_list, - conf->num_queues, errp)) { - return; - } - } - s->config_size = virtio_get_config_size(&virtio_blk_cfg_size_params, s->host_features); virtio_init(vdev, VIRTIO_ID_BLOCK, s->config_size); diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c index 0a2eb11c56..9e40b0c920 100644 --- a/hw/scsi/scsi-bus.c +++ b/hw/scsi/scsi-bus.c @@ -120,17 +120,13 @@ static void scsi_device_for_each_req_async_bh(void *opaque) SCSIRequest *next; /* - * If the AioContext changed before this BH was called then reschedule into - * the new AioContext before accessing ->requests. This can happen when - * scsi_device_for_each_req_async() is called and then the AioContext is - * changed before BHs are run. + * The BB cannot have changed contexts between this BH being scheduled and + * now: BBs' AioContexts, when they have a node attached, can only be + * changed via bdrv_try_change_aio_context(), in a drained section. While + * we have the in-flight counter incremented, that drain must block. */ ctx = blk_get_aio_context(s->conf.blk); - if (ctx != qemu_get_current_aio_context()) { - aio_bh_schedule_oneshot(ctx, scsi_device_for_each_req_async_bh, - g_steal_pointer(&data)); - return; - } + assert(ctx == qemu_get_current_aio_context()); QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) { data->fn(req, data->fn_opaque); @@ -138,11 +134,16 @@ static void scsi_device_for_each_req_async_bh(void *opaque) /* Drop the reference taken by scsi_device_for_each_req_async() */ object_unref(OBJECT(s)); + + /* Paired with blk_inc_in_flight() in scsi_device_for_each_req_async() */ + blk_dec_in_flight(s->conf.blk); } /* * Schedule @fn() to be invoked for each enqueued request in device @s. @fn() * runs in the AioContext that is executing the request. + * Keeps the BlockBackend's in-flight counter incremented until everything is + * done, so draining it will settle all scheduled @fn() calls. */ static void scsi_device_for_each_req_async(SCSIDevice *s, void (*fn)(SCSIRequest *, void *), @@ -163,6 +164,8 @@ static void scsi_device_for_each_req_async(SCSIDevice *s, */ object_ref(OBJECT(s)); + /* Paired with blk_dec_in_flight() in scsi_device_for_each_req_async_bh() */ + blk_inc_in_flight(s->conf.blk); aio_bh_schedule_oneshot(blk_get_aio_context(s->conf.blk), scsi_device_for_each_req_async_bh, data); @@ -373,15 +376,13 @@ static void scsi_qdev_unrealize(DeviceState *qdev) /* handle legacy '-drive if=scsi,...' cmd line args */ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk, - int unit, bool removable, int bootindex, - bool share_rw, - BlockdevOnError rerror, - BlockdevOnError werror, + int unit, bool removable, BlockConf *conf, const char *serial, Error **errp) { const char *driver; char *name; DeviceState *dev; + SCSIDevice *s; DriveInfo *dinfo; if (blk_is_sg(blk)) { @@ -399,11 +400,10 @@ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk, object_property_add_child(OBJECT(bus), name, OBJECT(dev)); g_free(name); + s = SCSI_DEVICE(dev); + s->conf = *conf; + qdev_prop_set_uint32(dev, "scsi-id", unit); - if (bootindex >= 0) { - object_property_set_int(OBJECT(dev), "bootindex", bootindex, - &error_abort); - } if (object_property_find(OBJECT(dev), "removable")) { qdev_prop_set_bit(dev, "removable", removable); } @@ -414,19 +414,12 @@ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk, object_unparent(OBJECT(dev)); return NULL; } - if (!object_property_set_bool(OBJECT(dev), "share-rw", share_rw, errp)) { - object_unparent(OBJECT(dev)); - return NULL; - } - - qdev_prop_set_enum(dev, "rerror", rerror); - qdev_prop_set_enum(dev, "werror", werror); if (!qdev_realize_and_unref(dev, &bus->qbus, errp)) { object_unparent(OBJECT(dev)); return NULL; } - return SCSI_DEVICE(dev); + return s; } void scsi_bus_legacy_handle_cmdline(SCSIBus *bus) @@ -434,6 +427,12 @@ void scsi_bus_legacy_handle_cmdline(SCSIBus *bus) Location loc; DriveInfo *dinfo; int unit; + BlockConf conf = { + .bootindex = -1, + .share_rw = false, + .rerror = BLOCKDEV_ON_ERROR_AUTO, + .werror = BLOCKDEV_ON_ERROR_AUTO, + }; loc_push_none(&loc); for (unit = 0; unit <= bus->info->max_target; unit++) { @@ -443,10 +442,7 @@ void scsi_bus_legacy_handle_cmdline(SCSIBus *bus) } qemu_opts_loc_restore(dinfo->opts); scsi_bus_legacy_add_drive(bus, blk_by_legacy_dinfo(dinfo), - unit, false, -1, false, - BLOCKDEV_ON_ERROR_AUTO, - BLOCKDEV_ON_ERROR_AUTO, - NULL, &error_fatal); + unit, false, &conf, NULL, &error_fatal); } loc_pop(&loc); } @@ -1728,11 +1724,20 @@ static void scsi_device_purge_one_req(SCSIRequest *req, void *opaque) scsi_req_cancel_async(req, NULL); } +/** + * Cancel all requests, and block until they are deleted. + */ void scsi_device_purge_requests(SCSIDevice *sdev, SCSISense sense) { scsi_device_for_each_req_async(sdev, scsi_device_purge_one_req, NULL); + /* + * Await all the scsi_device_purge_one_req() calls scheduled by + * scsi_device_for_each_req_async(), and all I/O requests that were + * cancelled this way, but may still take a bit of time to settle. + */ blk_drain(sdev->conf.blk); + scsi_device_set_ua(sdev, sense); } diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 690aceec45..9f02ceea09 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -1149,6 +1149,7 @@ static void virtio_scsi_drained_begin(SCSIBus *bus) static void virtio_scsi_drained_end(SCSIBus *bus) { VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus); + VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s); VirtIODevice *vdev = VIRTIO_DEVICE(s); uint32_t total_queues = VIRTIO_SCSI_VQ_NUM_FIXED + s->parent_obj.conf.num_queues; @@ -1166,7 +1167,11 @@ static void virtio_scsi_drained_end(SCSIBus *bus) for (uint32_t i = 0; i < total_queues; i++) { VirtQueue *vq = virtio_get_queue(vdev, i); - virtio_queue_aio_attach_host_notifier(vq, s->ctx); + if (vq == vs->event_vq) { + virtio_queue_aio_attach_host_notifier_no_poll(vq, s->ctx); + } else { + virtio_queue_aio_attach_host_notifier(vq, s->ctx); + } } } diff --git a/hw/usb/dev-storage-classic.c b/hw/usb/dev-storage-classic.c index 84d19752b5..50a3ad6285 100644 --- a/hw/usb/dev-storage-classic.c +++ b/hw/usb/dev-storage-classic.c @@ -67,10 +67,7 @@ static void usb_msd_storage_realize(USBDevice *dev, Error **errp) scsi_bus_init(&s->bus, sizeof(s->bus), DEVICE(dev), &usb_msd_scsi_info_storage); scsi_dev = scsi_bus_legacy_add_drive(&s->bus, blk, 0, !!s->removable, - s->conf.bootindex, s->conf.share_rw, - s->conf.rerror, s->conf.werror, - dev->serial, - errp); + &s->conf, dev->serial, errp); blk_unref(blk); if (!scsi_dev) { return; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 7549094154..d229755eae 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -3556,6 +3556,17 @@ static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n) void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx) { + /* + * virtio_queue_aio_detach_host_notifier() can leave notifications disabled. + * Re-enable them. (And if detach has not been used before, notifications + * being enabled is still the default state while a notifier is attached; + * see virtio_queue_host_notifier_aio_poll_end(), which will always leave + * notifications enabled once the polling section is left.) + */ + if (!virtio_queue_get_notification(vq)) { + virtio_queue_set_notification(vq, 1); + } + aio_set_event_notifier(ctx, &vq->host_notifier, virtio_queue_host_notifier_read, virtio_queue_host_notifier_aio_poll, @@ -3563,6 +3574,13 @@ void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx) aio_set_event_notifier_poll(ctx, &vq->host_notifier, virtio_queue_host_notifier_aio_poll_begin, virtio_queue_host_notifier_aio_poll_end); + + /* + * We will have ignored notifications about new requests from the guest + * while no notifiers were attached, so "kick" the virt queue to process + * those requests now. + */ + event_notifier_set(&vq->host_notifier); } /* @@ -3573,14 +3591,38 @@ void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx) */ void virtio_queue_aio_attach_host_notifier_no_poll(VirtQueue *vq, AioContext *ctx) { + /* See virtio_queue_aio_attach_host_notifier() */ + if (!virtio_queue_get_notification(vq)) { + virtio_queue_set_notification(vq, 1); + } + aio_set_event_notifier(ctx, &vq->host_notifier, virtio_queue_host_notifier_read, NULL, NULL); + + /* + * See virtio_queue_aio_attach_host_notifier(). + * Note that this may be unnecessary for the type of virtqueues this + * function is used for. Still, it will not hurt to have a quick look into + * whether we can/should process any of the virtqueue elements. + */ + event_notifier_set(&vq->host_notifier); } void virtio_queue_aio_detach_host_notifier(VirtQueue *vq, AioContext *ctx) { aio_set_event_notifier(ctx, &vq->host_notifier, NULL, NULL, NULL); + + /* + * aio_set_event_notifier_poll() does not guarantee whether io_poll_end() + * will run after io_poll_begin(), so by removing the notifier, we do not + * know whether virtio_queue_host_notifier_aio_poll_end() has run after a + * previous virtio_queue_host_notifier_aio_poll_begin(), i.e. whether + * notifications are enabled or disabled. It does not really matter anyway; + * we just removed the notifier, so we do not care about notifications until + * we potentially re-attach it. The attach_host_notifier functions will + * ensure that notifications are enabled again when they are needed. + */ } void virtio_queue_host_notifier_read(EventNotifier *n) diff --git a/include/block/aio.h b/include/block/aio.h index 5d0a114988..8378553eb9 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -480,9 +480,14 @@ void aio_set_event_notifier(AioContext *ctx, AioPollFn *io_poll, EventNotifierHandler *io_poll_ready); -/* Set polling begin/end callbacks for an event notifier that has already been +/* + * Set polling begin/end callbacks for an event notifier that has already been * registered with aio_set_event_notifier. Do nothing if the event notifier is * not registered. + * + * Note that if the io_poll_end() callback (or the entire notifier) is removed + * during polling, it will not be called, so an io_poll_begin() is not + * necessarily always followed by an io_poll_end(). */ void aio_set_event_notifier_poll(AioContext *ctx, EventNotifier *notifier, diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h index 10c4e8288d..c3d5e17e38 100644 --- a/include/hw/scsi/scsi.h +++ b/include/hw/scsi/scsi.h @@ -199,10 +199,7 @@ static inline SCSIBus *scsi_bus_from_device(SCSIDevice *d) } SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk, - int unit, bool removable, int bootindex, - bool share_rw, - BlockdevOnError rerror, - BlockdevOnError werror, + int unit, bool removable, BlockConf *conf, const char *serial, Error **errp); void scsi_bus_set_ua(SCSIBus *bus, SCSISense sense); void scsi_bus_legacy_handle_cmdline(SCSIBus *bus); diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h index 833a9a344f..5c14110c4b 100644 --- a/include/hw/virtio/virtio-blk.h +++ b/include/hw/virtio/virtio-blk.h @@ -55,7 +55,7 @@ struct VirtIOBlock { VirtIODevice parent_obj; BlockBackend *blk; QemuMutex rq_lock; - void *rq; /* protected by rq_lock */ + struct VirtIOBlockReq *rq; /* protected by rq_lock */ VirtIOBlkConf conf; unsigned short sector_mask; bool original_wce; diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index d614878164..fad9a7e8ff 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -538,4 +538,10 @@ bool kvm_arch_cpu_check_are_resettable(void); bool kvm_dirty_ring_enabled(void); uint32_t kvm_dirty_ring_size(void); + +/** + * kvm_hwpoisoned_mem - indicate if there is any hwpoisoned page + * reported for the VM. + */ +bool kvm_hwpoisoned_mem(void); #endif diff --git a/migration/migration.c b/migration/migration.c index d5f705ceef..ab21de2cad 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -67,6 +67,7 @@ #include "options.h" #include "sysemu/dirtylimit.h" #include "qemu/sockets.h" +#include "sysemu/kvm.h" static NotifierList migration_state_notifiers = NOTIFIER_LIST_INITIALIZER(migration_state_notifiers); @@ -128,11 +129,17 @@ static bool migration_needs_multiple_sockets(void) return migrate_multifd() || migrate_postcopy_preempt(); } -static bool transport_supports_multi_channels(SocketAddress *saddr) +static bool transport_supports_multi_channels(MigrationAddress *addr) { - return saddr->type == SOCKET_ADDRESS_TYPE_INET || - saddr->type == SOCKET_ADDRESS_TYPE_UNIX || - saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { + SocketAddress *saddr = &addr->u.socket; + + return saddr->type == SOCKET_ADDRESS_TYPE_INET || + saddr->type == SOCKET_ADDRESS_TYPE_UNIX || + saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + } + + return false; } static bool @@ -140,8 +147,7 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, Error **errp) { if (migration_needs_multiple_sockets() && - (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) && - !transport_supports_multi_channels(&addr->u.socket)) { + !transport_supports_multi_channels(addr)) { error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)"); return false; } @@ -311,7 +317,7 @@ void migration_incoming_state_destroy(void) { struct MigrationIncomingState *mis = migration_incoming_get_current(); - multifd_load_cleanup(); + multifd_recv_cleanup(); compress_threads_load_cleanup(); if (mis->to_src_file) { @@ -662,7 +668,7 @@ static void process_incoming_migration_bh(void *opaque) trace_vmstate_downtime_checkpoint("dst-precopy-bh-announced"); - multifd_load_shutdown(); + multifd_recv_shutdown(); dirty_bitmap_mig_before_vm_start(); @@ -759,7 +765,7 @@ fail: MIGRATION_STATUS_FAILED); qemu_fclose(mis->from_src_file); - multifd_load_cleanup(); + multifd_recv_cleanup(); compress_threads_load_cleanup(); exit(EXIT_FAILURE); @@ -885,7 +891,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) default_channel = !mis->from_src_file; } - if (multifd_load_setup(errp) != 0) { + if (multifd_recv_setup(errp) != 0) { return; } @@ -1331,7 +1337,7 @@ static void migrate_fd_cleanup(MigrationState *s) } bql_lock(); - multifd_save_cleanup(); + multifd_send_shutdown(); qemu_mutex_lock(&s->qemu_file_lock); tmp = s->to_dst_file; s->to_dst_file = NULL; @@ -1906,6 +1912,12 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, return false; } + if (kvm_hwpoisoned_mem()) { + error_setg(errp, "Can't migrate this vm with hardware poisoned memory, " + "please reboot the vm and try again"); + return false; + } + if (migration_is_blocked(errp)) { return false; } @@ -3315,6 +3327,10 @@ static void *migration_thread(void *opaque) object_ref(OBJECT(s)); update_iteration_initial_status(s); + if (!multifd_send_setup()) { + goto out; + } + bql_lock(); qemu_savevm_state_header(s->to_dst_file); bql_unlock(); @@ -3386,6 +3402,7 @@ static void *migration_thread(void *opaque) urgent = migration_rate_limit(); } +out: trace_migration_thread_after_loop(); migration_iteration_finish(s); object_unref(OBJECT(s)); @@ -3623,15 +3640,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - if (multifd_save_setup(&local_err) != 0) { - migrate_set_error(s, local_err); - error_report_err(local_err); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); - migrate_fd_cleanup(s); - return; - } - if (migrate_background_snapshot()) { qemu_thread_create(&s->thread, "bg_snapshot", bg_migration_thread, s, QEMU_THREAD_JOINABLE); diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 37ce48621e..012e3bdea1 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -116,17 +116,20 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) { + MultiFDPages_t *pages = p->pages; struct zlib_data *z = p->data; z_stream *zs = &z->zs; uint32_t out_size = 0; int ret; uint32_t i; - for (i = 0; i < p->normal_num; i++) { + multifd_send_prepare_header(p); + + for (i = 0; i < pages->num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; - if (i == p->normal_num - 1) { + if (i == pages->num - 1) { flush = Z_SYNC_FLUSH; } @@ -135,7 +138,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) * with compression. zlib does not guarantee that this is safe, * therefore copy the page before calling deflate(). */ - memcpy(z->buf, p->pages->block->host + p->normal[i], p->page_size); + memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size); zs->avail_in = p->page_size; zs->next_in = z->buf; @@ -171,6 +174,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) p->next_packet_size = out_size; p->flags |= MULTIFD_FLAG_ZLIB; + multifd_send_fill_packet(p); + return 0; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index b471daadcd..dc8fe43e94 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -113,21 +113,24 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) { + MultiFDPages_t *pages = p->pages; struct zstd_data *z = p->data; int ret; uint32_t i; + multifd_send_prepare_header(p); + z->out.dst = z->zbuff; z->out.size = z->zbuff_len; z->out.pos = 0; - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { ZSTD_EndDirective flush = ZSTD_e_continue; - if (i == p->normal_num - 1) { + if (i == pages->num - 1) { flush = ZSTD_e_flush; } - z->in.src = p->pages->block->host + p->normal[i]; + z->in.src = p->pages->block->host + pages->offset[i]; z->in.size = p->page_size; z->in.pos = 0; @@ -160,6 +163,8 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) p->next_packet_size = z->out.pos; p->flags |= MULTIFD_FLAG_ZSTD; + multifd_send_fill_packet(p); + return 0; } diff --git a/migration/multifd.c b/migration/multifd.c index 25cbc6dc6b..adfe8c9a0a 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -45,20 +45,54 @@ typedef struct { uint64_t unused2[4]; /* Reserved for future use */ } __attribute__((packed)) MultiFDInit_t; +struct { + MultiFDSendParams *params; + /* array of pages to sent */ + MultiFDPages_t *pages; + /* + * Global number of generated multifd packets. + * + * Note that we used 'uintptr_t' because it'll naturally support atomic + * operations on both 32bit / 64 bits hosts. It means on 32bit systems + * multifd will overflow the packet_num easier, but that should be + * fine. + * + * Another option is to use QEMU's Stat64 then it'll be 64 bits on all + * hosts, however so far it does not support atomic fetch_add() yet. + * Make it easy for now. + */ + uintptr_t packet_num; + /* + * Synchronization point past which no more channels will be + * created. + */ + QemuSemaphore channels_created; + /* send channels ready */ + QemuSemaphore channels_ready; + /* + * Have we already run terminate threads. There is a race when it + * happens that we got one error while we are exiting. + * We will use atomic operations. Only valid values are 0 and 1. + */ + int exiting; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_send_state; + /* Multifd without compression */ /** * nocomp_send_setup: setup send side * - * For no compression this function does nothing. - * - * Returns 0 for success or -1 for error - * * @p: Params for the channel that we are using * @errp: pointer to an error */ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) { + if (migrate_zero_copy_send()) { + p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; + } + return 0; } @@ -88,16 +122,38 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { + bool use_zero_copy_send = migrate_zero_copy_send(); MultiFDPages_t *pages = p->pages; + int ret; - for (int i = 0; i < p->normal_num; i++) { - p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i]; + if (!use_zero_copy_send) { + /* + * Only !zerocopy needs the header in IOV; zerocopy will + * send it separately. + */ + multifd_send_prepare_header(p); + } + + for (int i = 0; i < pages->num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; p->iov[p->iovs_num].iov_len = p->page_size; p->iovs_num++; } - p->next_packet_size = p->normal_num * p->page_size; + p->next_packet_size = pages->num * p->page_size; p->flags |= MULTIFD_FLAG_NOCOMP; + + multifd_send_fill_packet(p); + + if (use_zero_copy_send) { + /* Send header first, without zerocopy */ + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, errp); + if (ret != 0) { + return -1; + } + } + return 0; } @@ -172,6 +228,17 @@ void multifd_register_ops(int method, MultiFDMethods *ops) multifd_ops[method] = ops; } +/* Reset a MultiFDPages_t* object for the next use */ +static void multifd_pages_reset(MultiFDPages_t *pages) +{ + /* + * We don't need to touch offset[] array, because it will be + * overwritten later when reused. + */ + pages->num = 0; + pages->block = NULL; +} + static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp) { MultiFDInit_t msg = {}; @@ -248,35 +315,44 @@ static MultiFDPages_t *multifd_pages_init(uint32_t n) static void multifd_pages_clear(MultiFDPages_t *pages) { - pages->num = 0; + multifd_pages_reset(pages); pages->allocated = 0; - pages->block = NULL; g_free(pages->offset); pages->offset = NULL; g_free(pages); } -static void multifd_send_fill_packet(MultiFDSendParams *p) +void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; + MultiFDPages_t *pages = p->pages; + uint64_t packet_num; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); - packet->normal_pages = cpu_to_be32(p->normal_num); + packet->normal_pages = cpu_to_be32(pages->num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); - packet->packet_num = cpu_to_be64(p->packet_num); - if (p->pages->block) { - strncpy(packet->ramblock, p->pages->block->idstr, 256); + packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); + packet->packet_num = cpu_to_be64(packet_num); + + if (pages->block) { + strncpy(packet->ramblock, pages->block->idstr, 256); } - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { /* there are architectures where ram_addr_t is 32 bit */ - uint64_t temp = p->normal[i]; + uint64_t temp = pages->offset[i]; packet->offset[i] = cpu_to_be64(temp); } + + p->packets_sent++; + p->total_normal_pages += pages->num; + + trace_multifd_send(p->id, packet_num, pages->num, p->flags, + p->next_packet_size); } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -324,6 +400,11 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); + p->packets_recved++; + p->total_normal_pages += p->normal_num; + + trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags, + p->next_packet_size); if (p->normal_num == 0) { return 0; @@ -354,23 +435,22 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) return 0; } -struct { - MultiFDSendParams *params; - /* array of pages to sent */ - MultiFDPages_t *pages; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* send channels ready */ - QemuSemaphore channels_ready; - /* - * Have we already run terminate threads. There is a race when it - * happens that we got one error while we are exiting. - * We will use atomic operations. Only valid values are 0 and 1. - */ - int exiting; - /* multifd ops */ - MultiFDMethods *ops; -} *multifd_send_state; +static bool multifd_send_should_exit(void) +{ + return qatomic_read(&multifd_send_state->exiting); +} + +/* + * The migration thread can wait on either of the two semaphores. This + * function can be used to kick the main thread out of waiting on either of + * them. Should mostly only be called when something wrong happened with + * the current multifd send thread. + */ +static void multifd_send_kick_main(MultiFDSendParams *p) +{ + qemu_sem_post(&p->sem_sync); + qemu_sem_post(&multifd_send_state->channels_ready); +} /* * How we use multifd_send_state->pages and channel->pages? @@ -388,20 +468,23 @@ struct { * thread is using the channel mutex when changing it, and the channel * have to had finish with its own, otherwise pending_job can't be * false. + * + * Returns true if succeed, false otherwise. */ - -static int multifd_send_pages(void) +static bool multifd_send_pages(void) { int i; static int next_channel; MultiFDSendParams *p = NULL; /* make happy gcc */ MultiFDPages_t *pages = multifd_send_state->pages; - if (qatomic_read(&multifd_send_state->exiting)) { - return -1; + if (multifd_send_should_exit()) { + return false; } + /* We wait here, until at least one channel is ready */ qemu_sem_wait(&multifd_send_state->channels_ready); + /* * next_channel can remain from a previous migration that was * using more channels, so ensure it doesn't overflow if the @@ -409,69 +492,100 @@ static int multifd_send_pages(void) */ next_channel %= migrate_multifd_channels(); for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { - p = &multifd_send_state->params[i]; - - qemu_mutex_lock(&p->mutex); - if (p->quit) { - error_report("%s: channel %d has already quit!", __func__, i); - qemu_mutex_unlock(&p->mutex); - return -1; + if (multifd_send_should_exit()) { + return false; } - if (!p->pending_job) { - p->pending_job++; + p = &multifd_send_state->params[i]; + /* + * Lockless read to p->pending_job is safe, because only multifd + * sender thread can clear it. + */ + if (qatomic_read(&p->pending_job) == false) { next_channel = (i + 1) % migrate_multifd_channels(); break; } - qemu_mutex_unlock(&p->mutex); } - assert(!p->pages->num); - assert(!p->pages->block); - p->packet_num = multifd_send_state->packet_num++; + /* + * Make sure we read p->pending_job before all the rest. Pairs with + * qatomic_store_release() in multifd_send_thread(). + */ + smp_mb_acquire(); + assert(!p->pages->num); multifd_send_state->pages = p->pages; p->pages = pages; - qemu_mutex_unlock(&p->mutex); + /* + * Making sure p->pages is setup before marking pending_job=true. Pairs + * with the qatomic_load_acquire() in multifd_send_thread(). + */ + qatomic_store_release(&p->pending_job, true); qemu_sem_post(&p->sem); - return 1; + return true; } -int multifd_queue_page(RAMBlock *block, ram_addr_t offset) +static inline bool multifd_queue_empty(MultiFDPages_t *pages) { - MultiFDPages_t *pages = multifd_send_state->pages; - bool changed = false; + return pages->num == 0; +} - if (!pages->block) { - pages->block = block; - } +static inline bool multifd_queue_full(MultiFDPages_t *pages) +{ + return pages->num == pages->allocated; +} - if (pages->block == block) { - pages->offset[pages->num] = offset; - pages->num++; +static inline void multifd_enqueue(MultiFDPages_t *pages, ram_addr_t offset) +{ + pages->offset[pages->num++] = offset; +} - if (pages->num < pages->allocated) { - return 1; - } - } else { - changed = true; - } +/* Returns true if enqueue successful, false otherwise */ +bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) +{ + MultiFDPages_t *pages; - if (multifd_send_pages() < 0) { - return -1; +retry: + pages = multifd_send_state->pages; + + /* If the queue is empty, we can already enqueue now */ + if (multifd_queue_empty(pages)) { + pages->block = block; + multifd_enqueue(pages, offset); + return true; } - if (changed) { - return multifd_queue_page(block, offset); + /* + * Not empty, meanwhile we need a flush. It can because of either: + * + * (1) The page is not on the same ramblock of previous ones, or, + * (2) The queue is full. + * + * After flush, always retry. + */ + if (pages->block != block || multifd_queue_full(pages)) { + if (!multifd_send_pages()) { + return false; + } + goto retry; } - return 1; + /* Not empty, and we still have space, do it! */ + multifd_enqueue(pages, offset); + return true; } -static void multifd_send_terminate_threads(Error *err) +/* Multifd send side hit an error; remember it and prepare to quit */ +static void multifd_send_set_error(Error *err) { - int i; - - trace_multifd_send_terminate_threads(err != NULL); + /* + * We don't want to exit each threads twice. Depending on where + * we get the error, or if there are two independent errors in two + * threads at the same time, we can end calling this function + * twice. + */ + if (qatomic_xchg(&multifd_send_state->exiting, 1)) { + return; + } if (err) { MigrationState *s = migrate_get_current(); @@ -484,27 +598,46 @@ static void multifd_send_terminate_threads(Error *err) MIGRATION_STATUS_FAILED); } } +} + +static void multifd_send_terminate_threads(void) +{ + int i; + + trace_multifd_send_terminate_threads(); /* - * We don't want to exit each threads twice. Depending on where - * we get the error, or if there are two independent errors in two - * threads at the same time, we can end calling this function - * twice. + * Tell everyone we're quitting. No xchg() needed here; we simply + * always set it. */ - if (qatomic_xchg(&multifd_send_state->exiting, 1)) { - return; - } + qatomic_set(&multifd_send_state->exiting, 1); + /* + * Firstly, kick all threads out; no matter whether they are just idle, + * or blocked in an IO system call. + */ for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - qemu_mutex_lock(&p->mutex); - p->quit = true; qemu_sem_post(&p->sem); if (p->c) { qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } - qemu_mutex_unlock(&p->mutex); + } + + /* + * Finally recycle all the threads. + */ + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + + if (p->tls_thread_created) { + qemu_thread_join(&p->tls_thread); + } + + if (p->thread_created) { + qemu_thread_join(&p->thread); + } } } @@ -513,57 +646,62 @@ static int multifd_send_channel_destroy(QIOChannel *send) return socket_send_channel_destroy(send); } -void multifd_save_cleanup(void) +static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) +{ + if (p->registered_yank) { + migration_ioc_unregister_yank(p->c); + } + multifd_send_channel_destroy(p->c); + p->c = NULL; + qemu_sem_destroy(&p->sem); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + multifd_pages_clear(p->pages); + p->pages = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + g_free(p->iov); + p->iov = NULL; + multifd_send_state->ops->send_cleanup(p, errp); + + return *errp == NULL; +} + +static void multifd_send_cleanup_state(void) +{ + qemu_sem_destroy(&multifd_send_state->channels_created); + qemu_sem_destroy(&multifd_send_state->channels_ready); + g_free(multifd_send_state->params); + multifd_send_state->params = NULL; + multifd_pages_clear(multifd_send_state->pages); + multifd_send_state->pages = NULL; + g_free(multifd_send_state); + multifd_send_state = NULL; +} + +void multifd_send_shutdown(void) { int i; if (!migrate_multifd()) { return; } - multifd_send_terminate_threads(NULL); - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDSendParams *p = &multifd_send_state->params[i]; - if (p->running) { - qemu_thread_join(&p->thread); - } - } + multifd_send_terminate_threads(); + for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; Error *local_err = NULL; - if (p->registered_yank) { - migration_ioc_unregister_yank(p->c); - } - multifd_send_channel_destroy(p->c); - p->c = NULL; - qemu_mutex_destroy(&p->mutex); - qemu_sem_destroy(&p->sem); - qemu_sem_destroy(&p->sem_sync); - g_free(p->name); - p->name = NULL; - multifd_pages_clear(p->pages); - p->pages = NULL; - p->packet_len = 0; - g_free(p->packet); - p->packet = NULL; - g_free(p->iov); - p->iov = NULL; - g_free(p->normal); - p->normal = NULL; - multifd_send_state->ops->send_cleanup(p, &local_err); - if (local_err) { + if (!multifd_send_cleanup_channel(p, &local_err)) { migrate_set_error(migrate_get_current(), local_err); error_free(local_err); } } - qemu_sem_destroy(&multifd_send_state->channels_ready); - g_free(multifd_send_state->params); - multifd_send_state->params = NULL; - multifd_pages_clear(multifd_send_state->pages); - multifd_send_state->pages = NULL; - g_free(multifd_send_state); - multifd_send_state = NULL; + + multifd_send_cleanup_state(); } static int multifd_zero_copy_flush(QIOChannel *c) @@ -592,47 +730,38 @@ int multifd_send_sync_main(void) return 0; } if (multifd_send_state->pages->num) { - if (multifd_send_pages() < 0) { + if (!multifd_send_pages()) { error_report("%s: multifd_send_pages fail", __func__); return -1; } } - /* - * When using zero-copy, it's necessary to flush the pages before any of - * the pages can be sent again, so we'll make sure the new version of the - * pages will always arrive _later_ than the old pages. - * - * Currently we achieve this by flushing the zero-page requested writes - * per ram iteration, but in the future we could potentially optimize it - * to be less frequent, e.g. only after we finished one whole scanning of - * all the dirty bitmaps. - */ - flush_zero_copy = migrate_zero_copy_send(); for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - trace_multifd_send_sync_main_signal(p->id); - - qemu_mutex_lock(&p->mutex); - - if (p->quit) { - error_report("%s: channel %d has already quit", __func__, i); - qemu_mutex_unlock(&p->mutex); + if (multifd_send_should_exit()) { return -1; } - p->packet_num = multifd_send_state->packet_num++; - p->flags |= MULTIFD_FLAG_SYNC; - p->pending_job++; - qemu_mutex_unlock(&p->mutex); + trace_multifd_send_sync_main_signal(p->id); + + /* + * We should be the only user so far, so not possible to be set by + * others concurrently. + */ + assert(qatomic_read(&p->pending_sync) == false); + qatomic_set(&p->pending_sync, true); qemu_sem_post(&p->sem); } for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + if (multifd_send_should_exit()) { + return -1; + } + qemu_sem_wait(&multifd_send_state->channels_ready); trace_multifd_send_sync_main_wait(p->id); qemu_sem_wait(&p->sem_sync); @@ -652,7 +781,6 @@ static void *multifd_send_thread(void *opaque) MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; - bool use_zero_copy_send = migrate_zero_copy_send(); thread = migration_threads_add(p->name, qemu_get_thread_id()); @@ -663,64 +791,28 @@ static void *multifd_send_thread(void *opaque) ret = -1; goto out; } - /* initial packet */ - p->num_packets = 1; while (true) { qemu_sem_post(&multifd_send_state->channels_ready); qemu_sem_wait(&p->sem); - if (qatomic_read(&multifd_send_state->exiting)) { + if (multifd_send_should_exit()) { break; } - qemu_mutex_lock(&p->mutex); - - if (p->pending_job) { - uint64_t packet_num = p->packet_num; - uint32_t flags; - p->normal_num = 0; - - if (use_zero_copy_send) { - p->iovs_num = 0; - } else { - p->iovs_num = 1; - } - for (int i = 0; i < p->pages->num; i++) { - p->normal[p->normal_num] = p->pages->offset[i]; - p->normal_num++; - } + /* + * Read pending_job flag before p->pages. Pairs with the + * qatomic_store_release() in multifd_send_pages(). + */ + if (qatomic_load_acquire(&p->pending_job)) { + MultiFDPages_t *pages = p->pages; - if (p->normal_num) { - ret = multifd_send_state->ops->send_prepare(p, &local_err); - if (ret != 0) { - qemu_mutex_unlock(&p->mutex); - break; - } - } - multifd_send_fill_packet(p); - flags = p->flags; - p->flags = 0; - p->num_packets++; - p->total_normal_pages += p->normal_num; - p->pages->num = 0; - p->pages->block = NULL; - qemu_mutex_unlock(&p->mutex); + p->iovs_num = 0; + assert(pages->num); - trace_multifd_send(p->id, packet_num, p->normal_num, flags, - p->next_packet_size); - - if (use_zero_copy_send) { - /* Send header first, without zerocopy */ - ret = qio_channel_write_all(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret != 0) { - break; - } - } else { - /* Send header using the same writev call */ - p->iov[0].iov_len = p->packet_len; - p->iov[0].iov_base = p->packet; + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + break; } ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, @@ -731,17 +823,35 @@ static void *multifd_send_thread(void *opaque) stat64_add(&mig_stats.multifd_bytes, p->next_packet_size + p->packet_len); + + multifd_pages_reset(p->pages); p->next_packet_size = 0; - qemu_mutex_lock(&p->mutex); - p->pending_job--; - qemu_mutex_unlock(&p->mutex); - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&p->sem_sync); - } + /* + * Making sure p->pages is published before saying "we're + * free". Pairs with the smp_mb_acquire() in + * multifd_send_pages(). + */ + qatomic_store_release(&p->pending_job, false); } else { - qemu_mutex_unlock(&p->mutex); - /* sometimes there are spurious wakeups */ + /* + * If not a normal job, must be a sync request. Note that + * pending_sync is a standalone flag (unlike pending_job), so + * it doesn't require explicit memory barriers. + */ + assert(qatomic_read(&p->pending_sync)); + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { + break; + } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + p->flags = 0; + qatomic_set(&p->pending_sync, false); + qemu_sem_post(&p->sem_sync); } } @@ -749,53 +859,19 @@ out: if (ret) { assert(local_err); trace_multifd_send_error(p->id); - multifd_send_terminate_threads(local_err); - qemu_sem_post(&p->sem_sync); - qemu_sem_post(&multifd_send_state->channels_ready); + multifd_send_set_error(local_err); + multifd_send_kick_main(p); error_free(local_err); } - qemu_mutex_lock(&p->mutex); - p->running = false; - qemu_mutex_unlock(&p->mutex); - rcu_unregister_thread(); migration_threads_remove(thread); - trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); return NULL; } -static bool multifd_channel_connect(MultiFDSendParams *p, - QIOChannel *ioc, - Error **errp); - -static void multifd_tls_outgoing_handshake(QIOTask *task, - gpointer opaque) -{ - MultiFDSendParams *p = opaque; - QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); - Error *err = NULL; - - if (!qio_task_propagate_error(task, &err)) { - trace_multifd_tls_outgoing_handshake_complete(ioc); - if (multifd_channel_connect(p, ioc, &err)) { - return; - } - } - - trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); - - migrate_set_error(migrate_get_current(), err); - /* - * Error happen, mark multifd_send_thread status as 'quit' although it - * is not created, and then tell who pay attention to me. - */ - p->quit = true; - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); - error_free(err); -} +static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque); static void *multifd_tls_handshake_thread(void *opaque) { @@ -803,7 +879,7 @@ static void *multifd_tls_handshake_thread(void *opaque) QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c); qio_channel_tls_handshake(tioc, - multifd_tls_outgoing_handshake, + multifd_new_send_channel_async, p, NULL, NULL); @@ -823,11 +899,17 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, return false; } + /* + * Ownership of the socket channel now transfers to the newly + * created TLS channel, which has already taken a reference. + */ object_unref(OBJECT(ioc)); trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); p->c = QIO_CHANNEL(tioc); - qemu_thread_create(&p->thread, "multifd-tls-handshake-worker", + + p->tls_thread_created = true; + qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker", multifd_tls_handshake_thread, p, QEMU_THREAD_JOINABLE); return true; @@ -837,61 +919,72 @@ static bool multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc, Error **errp) { - trace_multifd_set_outgoing_channel( - ioc, object_get_typename(OBJECT(ioc)), - migrate_get_current()->hostname); - - if (migrate_channel_requires_tls_upgrade(ioc)) { - /* - * tls_channel_connect will call back to this - * function after the TLS handshake, - * so we mustn't call multifd_send_thread until then - */ - return multifd_tls_channel_connect(p, ioc, errp); - } + qio_channel_set_delay(ioc, false); migration_ioc_register_yank(ioc); p->registered_yank = true; p->c = ioc; + + p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, QEMU_THREAD_JOINABLE); return true; } -static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, - QIOChannel *ioc, Error *err) -{ - migrate_set_error(migrate_get_current(), err); - /* Error happen, we need to tell who pay attention to me */ - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); - /* - * Although multifd_send_thread is not created, but main migration - * thread need to judge whether it is running, so we need to mark - * its status. - */ - p->quit = true; - object_unref(OBJECT(ioc)); - error_free(err); -} - +/* + * When TLS is enabled this function is called once to establish the + * TLS connection and a second time after the TLS handshake to create + * the multifd channel. Without TLS it goes straight into the channel + * creation. + */ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) { MultiFDSendParams *p = opaque; QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); Error *local_err = NULL; + bool ret; trace_multifd_new_send_channel_async(p->id); - if (!qio_task_propagate_error(task, &local_err)) { - qio_channel_set_delay(ioc, false); - p->running = true; - if (multifd_channel_connect(p, ioc, &local_err)) { + + if (qio_task_propagate_error(task, &local_err)) { + ret = false; + goto out; + } + + trace_multifd_set_outgoing_channel(ioc, object_get_typename(OBJECT(ioc)), + migrate_get_current()->hostname); + + if (migrate_channel_requires_tls_upgrade(ioc)) { + ret = multifd_tls_channel_connect(p, ioc, &local_err); + if (ret) { return; } + } else { + ret = multifd_channel_connect(p, ioc, &local_err); + } + +out: + /* + * Here we're not interested whether creation succeeded, only that + * it happened at all. + */ + qemu_sem_post(&multifd_send_state->channels_created); + + if (ret) { + return; } trace_multifd_new_send_channel_async_error(p->id, local_err); - multifd_new_send_channel_cleanup(p, ioc, local_err); + multifd_send_set_error(local_err); + if (!p->c) { + /* + * If no channel has been created, drop the initial + * reference. Otherwise cleanup happens at + * multifd_send_channel_destroy() + */ + object_unref(OBJECT(ioc)); + } + error_free(local_err); } static void multifd_new_send_channel_create(gpointer opaque) @@ -899,20 +992,23 @@ static void multifd_new_send_channel_create(gpointer opaque) socket_send_channel_create(multifd_new_send_channel_async, opaque); } -int multifd_save_setup(Error **errp) +bool multifd_send_setup(void) { - int thread_count; + MigrationState *s = migrate_get_current(); + Error *local_err = NULL; + int thread_count, ret = 0; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); uint8_t i; if (!migrate_multifd()) { - return 0; + return true; } thread_count = migrate_multifd_channels(); multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); multifd_send_state->pages = multifd_pages_init(page_count); + qemu_sem_init(&multifd_send_state->channels_created, 0); qemu_sem_init(&multifd_send_state->channels_ready, 0); qatomic_set(&multifd_send_state->exiting, 0); multifd_send_state->ops = multifd_ops[migrate_multifd_compression()]; @@ -920,11 +1016,8 @@ int multifd_save_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem, 0); qemu_sem_init(&p->sem_sync, 0); - p->quit = false; - p->pending_job = 0; p->id = i; p->pages = multifd_pages_init(page_count); p->packet_len = sizeof(MultiFDPacket_t) @@ -935,29 +1028,39 @@ int multifd_save_setup(Error **errp) p->name = g_strdup_printf("multifdsend_%d", i); /* We need one extra place for the packet header */ p->iov = g_new0(struct iovec, page_count + 1); - p->normal = g_new0(ram_addr_t, page_count); p->page_size = qemu_target_page_size(); p->page_count = page_count; - - if (migrate_zero_copy_send()) { - p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; - } else { - p->write_flags = 0; - } - + p->write_flags = 0; multifd_new_send_channel_create(p); } + /* + * Wait until channel creation has started for all channels. The + * creation can still fail, but no more channels will be created + * past this point. + */ + for (i = 0; i < thread_count; i++) { + qemu_sem_wait(&multifd_send_state->channels_created); + } + for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - int ret; - ret = multifd_send_state->ops->send_setup(p, errp); + ret = multifd_send_state->ops->send_setup(p, &local_err); if (ret) { - return ret; + break; } } - return 0; + + if (ret) { + migrate_set_error(s, local_err); + error_report_err(local_err); + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_FAILED); + return false; + } + + return true; } struct { @@ -1006,14 +1109,42 @@ static void multifd_recv_terminate_threads(Error *err) } } -void multifd_load_shutdown(void) +void multifd_recv_shutdown(void) { if (migrate_multifd()) { multifd_recv_terminate_threads(NULL); } } -void multifd_load_cleanup(void) +static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) +{ + migration_ioc_unregister_yank(p->c); + object_unref(OBJECT(p->c)); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + g_free(p->iov); + p->iov = NULL; + g_free(p->normal); + p->normal = NULL; + multifd_recv_state->ops->recv_cleanup(p); +} + +static void multifd_recv_cleanup_state(void) +{ + qemu_sem_destroy(&multifd_recv_state->sem_sync); + g_free(multifd_recv_state->params); + multifd_recv_state->params = NULL; + g_free(multifd_recv_state); + multifd_recv_state = NULL; +} + +void multifd_recv_cleanup(void) { int i; @@ -1024,40 +1155,20 @@ void multifd_load_cleanup(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - if (p->running) { - /* - * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, - * however try to wakeup it without harm in cleanup phase. - */ - qemu_sem_post(&p->sem_sync); - } + /* + * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, + * however try to wakeup it without harm in cleanup phase. + */ + qemu_sem_post(&p->sem_sync); - qemu_thread_join(&p->thread); + if (p->thread_created) { + qemu_thread_join(&p->thread); + } } for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - - migration_ioc_unregister_yank(p->c); - object_unref(OBJECT(p->c)); - p->c = NULL; - qemu_mutex_destroy(&p->mutex); - qemu_sem_destroy(&p->sem_sync); - g_free(p->name); - p->name = NULL; - p->packet_len = 0; - g_free(p->packet); - p->packet = NULL; - g_free(p->iov); - p->iov = NULL; - g_free(p->normal); - p->normal = NULL; - multifd_recv_state->ops->recv_cleanup(p); + multifd_recv_cleanup_channel(&multifd_recv_state->params[i]); } - qemu_sem_destroy(&multifd_recv_state->sem_sync); - g_free(multifd_recv_state->params); - multifd_recv_state->params = NULL; - g_free(multifd_recv_state); - multifd_recv_state = NULL; + multifd_recv_cleanup_state(); } void multifd_recv_sync_main(void) @@ -1119,10 +1230,6 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, - p->next_packet_size); - p->num_packets++; - p->total_normal_pages += p->normal_num; qemu_mutex_unlock(&p->mutex); if (p->normal_num) { @@ -1142,17 +1249,14 @@ static void *multifd_recv_thread(void *opaque) multifd_recv_terminate_threads(local_err); error_free(local_err); } - qemu_mutex_lock(&p->mutex); - p->running = false; - qemu_mutex_unlock(&p->mutex); rcu_unregister_thread(); - trace_multifd_recv_thread_end(p->id, p->num_packets, p->total_normal_pages); + trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); return NULL; } -int multifd_load_setup(Error **errp) +int multifd_recv_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); @@ -1249,10 +1353,8 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) } p->c = ioc; object_ref(OBJECT(ioc)); - /* initial packet */ - p->num_packets = 1; - p->running = true; + p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); diff --git a/migration/multifd.h b/migration/multifd.h index 35d11f103c..8a1cad0996 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,16 +13,16 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H -int multifd_save_setup(Error **errp); -void multifd_save_cleanup(void); -int multifd_load_setup(Error **errp); -void multifd_load_cleanup(void); -void multifd_load_shutdown(void); +bool multifd_send_setup(void); +void multifd_send_shutdown(void); +int multifd_recv_setup(Error **errp); +void multifd_recv_cleanup(void); +void multifd_recv_shutdown(void); bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); int multifd_send_sync_main(void); -int multifd_queue_page(RAMBlock *block, ram_addr_t offset); +bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) @@ -73,6 +73,9 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + bool thread_created; + QemuThread tls_thread; + bool tls_thread_created; /* communication channel */ QIOChannel *c; /* is the yank function registered */ @@ -91,18 +94,19 @@ typedef struct { /* syncs main thread and channels */ QemuSemaphore sem_sync; - /* this mutex protects the following parameters */ - QemuMutex mutex; - /* is this channel thread running */ - bool running; - /* should this thread finish */ - bool quit; /* multifd flags for each packet */ uint32_t flags; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* thread has work to do */ - int pending_job; + /* + * The sender thread has work to do if either of below boolean is set. + * + * @pending_job: a job is pending + * @pending_sync: a sync request is pending + * + * For both of these fields, they're only set by the requesters, and + * cleared by the multifd sender threads. + */ + bool pending_job; + bool pending_sync; /* array of pages to sent. * The owner of 'pages' depends of 'pending_job' value: * pending_job == 0 -> migration_thread can use it. @@ -117,17 +121,13 @@ typedef struct { /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets sent through this channel */ - uint64_t num_packets; + uint64_t packets_sent; /* non zero pages sent through this channel */ uint64_t total_normal_pages; /* buffers to send */ struct iovec *iov; /* number of iovs used */ uint32_t iovs_num; - /* Pages that are not zero */ - ram_addr_t *normal; - /* num of non zero pages */ - uint32_t normal_num; /* used for compression methods */ void *data; } MultiFDSendParams; @@ -142,6 +142,7 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + bool thread_created; /* communication channel */ QIOChannel *c; /* packet allocated len */ @@ -156,8 +157,6 @@ typedef struct { /* this mutex protects the following parameters */ QemuMutex mutex; - /* is this channel thread running */ - bool running; /* should this thread finish */ bool quit; /* multifd flags for each packet */ @@ -171,8 +170,8 @@ typedef struct { MultiFDPacket_t *packet; /* size of the next packet that contains pages */ uint32_t next_packet_size; - /* packets sent through this channel */ - uint64_t num_packets; + /* packets received through this channel */ + uint64_t packets_recved; /* ramblock */ RAMBlock *block; /* ramblock host address */ @@ -205,6 +204,14 @@ typedef struct { } MultiFDMethods; void multifd_register_ops(int method, MultiFDMethods *ops); +void multifd_send_fill_packet(MultiFDSendParams *p); + +static inline void multifd_send_prepare_header(MultiFDSendParams *p) +{ + p->iov[0].iov_len = p->packet_len; + p->iov[0].iov_base = p->packet; + p->iovs_num++; +} -#endif +#endif diff --git a/migration/ram.c b/migration/ram.c index d5b7cd5ac2..4649a81204 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1252,7 +1252,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) { - if (multifd_queue_page(block, offset) < 0) { + if (!multifd_queue_page(block, offset)) { return -1; } stat64_add(&mig_stats.normal_pages, 1); diff --git a/migration/trace-events b/migration/trace-events index de4a743c8a..298ad2b0dd 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -141,7 +141,7 @@ multifd_send_error(uint8_t id) "channel %u" multifd_send_sync_main(long packet_num) "packet num %ld" multifd_send_sync_main_signal(uint8_t id) "channel %u" multifd_send_sync_main_wait(uint8_t id) "channel %u" -multifd_send_terminate_threads(bool error) "error %d" +multifd_send_terminate_threads(void) "" multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 multifd_send_thread_start(uint8_t id) "%u" multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c index 176b549473..f3488afeef 100644 --- a/qapi/qmp-dispatch.c +++ b/qapi/qmp-dispatch.c @@ -212,8 +212,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ * executing the command handler so that it can make progress if it * involves an AIO_WAIT_WHILE(). */ - aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self()); - qemu_coroutine_yield(); + aio_co_reschedule_self(qemu_get_aio_context()); } monitor_set_cur(qemu_coroutine_self(), cur_mon); @@ -227,9 +226,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ * Move back to iohandler_ctx so that nested event loops for * qemu_aio_context don't start new monitor commands. */ - aio_co_schedule(iohandler_get_aio_context(), - qemu_coroutine_self()); - qemu_coroutine_yield(); + aio_co_reschedule_self(iohandler_get_aio_context()); } } else { /* diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check index f2e9d27dcf..56d88ca423 100755 --- a/tests/qemu-iotests/check +++ b/tests/qemu-iotests/check @@ -184,7 +184,8 @@ if __name__ == '__main__': sys.exit(str(e)) if args.dry_run: - print('\n'.join([os.path.basename(t) for t in tests])) + with env: + print('\n'.join([os.path.basename(t) for t in tests])) else: with TestRunner(env, tap=args.tap, color=args.color) as tr: diff --git a/tests/qemu-iotests/testenv.py b/tests/qemu-iotests/testenv.py index 3ff38f2661..588f30a4f1 100644 --- a/tests/qemu-iotests/testenv.py +++ b/tests/qemu-iotests/testenv.py @@ -126,7 +126,7 @@ class TestEnv(ContextManager['TestEnv']): self.tmp_sock_dir = False Path(self.sock_dir).mkdir(parents=True, exist_ok=True) except KeyError: - self.sock_dir = tempfile.mkdtemp() + self.sock_dir = tempfile.mkdtemp(prefix="qemu-iotests-") self.tmp_sock_dir = True self.sample_img_dir = os.getenv('SAMPLE_IMG_DIR', diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 7675519cfa..8a5bb1752e 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -819,7 +819,7 @@ static int test_migrate_start(QTestState **from, QTestState **to, } else if (strcmp(arch, "aarch64") == 0) { memory_size = "150M"; machine_alias = "virt"; - machine_opts = "gic-version=max"; + machine_opts = "gic-version=3"; arch_opts = g_strdup_printf("-cpu max -kernel %s", bootpath); start_address = ARM_TEST_MEM_START; end_address = ARM_TEST_MEM_END; |