Diffstat (limited to 'migration')
 migration/colo.c               |  17
 migration/file.c               |   4
 migration/meson.build          |   1
 migration/migration-hmp-cmds.c |   9
 migration/migration.c          |  67
 migration/migration.h          |   7
 migration/multifd-zero-page.c  |  87
 migration/multifd-zlib.c       |  21
 migration/multifd-zstd.c       |  20
 migration/multifd.c            | 120
 migration/multifd.h            |  23
 migration/options.c            |  32
 migration/options.h            |   7
 migration/qemu-file.c          |   5
 migration/ram.c                |  62
 migration/ram.h                |   3
 migration/rdma.c               |   2
 migration/savevm.c             |  23
 migration/trace-events         |   8
 19 files changed, 409 insertions(+), 109 deletions(-)
diff --git a/migration/colo.c b/migration/colo.c index 315e31fe32..84632a603e 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -63,9 +63,9 @@ static bool colo_runstate_is_stopped(void) return runstate_check(RUN_STATE_COLO) || !runstate_is_running(); } -static void colo_checkpoint_notify(void *opaque) +static void colo_checkpoint_notify(void) { - MigrationState *s = opaque; + MigrationState *s = migrate_get_current(); int64_t next_notify_time; qemu_event_set(&s->colo_checkpoint_event); @@ -74,10 +74,15 @@ static void colo_checkpoint_notify(void *opaque) timer_mod(s->colo_delay_timer, next_notify_time); } +static void colo_checkpoint_notify_timer(void *opaque) +{ + colo_checkpoint_notify(); +} + void colo_checkpoint_delay_set(void) { if (migration_in_colo_state()) { - colo_checkpoint_notify(migrate_get_current()); + colo_checkpoint_notify(); } } @@ -162,7 +167,7 @@ static void primary_vm_do_failover(void) * kick COLO thread which might wait at * qemu_sem_wait(&s->colo_checkpoint_sem). */ - colo_checkpoint_notify(s); + colo_checkpoint_notify(); /* * Wake up COLO thread which may blocked in recv() or send(), @@ -518,7 +523,7 @@ out: static void colo_compare_notify_checkpoint(Notifier *notifier, void *data) { - colo_checkpoint_notify(data); + colo_checkpoint_notify(); } static void colo_process_checkpoint(MigrationState *s) @@ -642,7 +647,7 @@ void migrate_start_colo_process(MigrationState *s) bql_unlock(); qemu_event_init(&s->colo_checkpoint_event, false); s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST, - colo_checkpoint_notify, s); + colo_checkpoint_notify_timer, NULL); qemu_sem_init(&s->colo_exit_sem, 0); colo_process_checkpoint(s); diff --git a/migration/file.c b/migration/file.c index 164b079966..b0b963e0ce 100644 --- a/migration/file.c +++ b/migration/file.c @@ -159,7 +159,7 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, int niov, RAMBlock *block, Error **errp) { - ssize_t ret = -1; + ssize_t ret = 0; int i, slice_idx, slice_num; uintptr_t base, next, offset; size_t len; @@ -191,7 +191,7 @@ int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, */ offset = (uintptr_t) iov[slice_idx].iov_base - (uintptr_t) block->host; if (offset >= block->used_length) { - error_setg(errp, "offset " RAM_ADDR_FMT + error_setg(errp, "offset %" PRIxPTR "outside of ramblock %s range", offset, block->idstr); ret = -1; break; diff --git a/migration/meson.build b/migration/meson.build index 92b1cc4297..1eeb915ff6 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -22,6 +22,7 @@ system_ss.add(files( 'migration.c', 'multifd.c', 'multifd-zlib.c', + 'multifd-zero-page.c', 'ram-compress.c', 'options.c', 'postcopy-ram.c', diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 99b49df5dd..7e96ae6ffd 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -344,6 +344,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_COMPRESSION), MultiFDCompression_str(params->multifd_compression)); + assert(params->has_zero_page_detection); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_ZERO_PAGE_DETECTION), + qapi_enum_lookup(&ZeroPageDetection_lookup, + params->zero_page_detection)); monitor_printf(mon, "%s: %" PRIu64 " bytes\n", MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE), 
params->xbzrle_cache_size); @@ -634,6 +639,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_multifd_zstd_level = true; visit_type_uint8(v, param, &p->multifd_zstd_level, &err); break; + case MIGRATION_PARAMETER_ZERO_PAGE_DETECTION: + p->has_zero_page_detection = true; + visit_type_ZeroPageDetection(v, param, &p->zero_page_detection, &err); + break; case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: p->has_xbzrle_cache_size = true; if (!visit_type_size(v, param, &cache_size, &err)) { diff --git a/migration/migration.c b/migration/migration.c index a49fcd53ee..644e073b7d 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1081,9 +1081,11 @@ void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value) * Return true if we're already in the middle of a migration * (i.e. any of the active or setup states) */ -bool migration_is_setup_or_active(int state) +bool migration_is_setup_or_active(void) { - switch (state) { + MigrationState *s = current_migration; + + switch (s->state) { case MIGRATION_STATUS_ACTIVE: case MIGRATION_STATUS_POSTCOPY_ACTIVE: case MIGRATION_STATUS_POSTCOPY_PAUSED: @@ -1101,9 +1103,11 @@ bool migration_is_setup_or_active(int state) } } -bool migration_is_running(int state) +bool migration_is_running(void) { - switch (state) { + MigrationState *s = current_migration; + + switch (s->state) { case MIGRATION_STATUS_ACTIVE: case MIGRATION_STATUS_POSTCOPY_ACTIVE: case MIGRATION_STATUS_POSTCOPY_PAUSED: @@ -1404,7 +1408,7 @@ static void migrate_fd_cleanup(MigrationState *s) qemu_fclose(tmp); } - assert(!migration_is_active(s)); + assert(!migration_is_active()); if (s->state == MIGRATION_STATUS_CANCELLING) { migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING, @@ -1475,7 +1479,7 @@ static void migrate_fd_cancel(MigrationState *s) do { old_state = s->state; - if (!migration_is_running(old_state)) { + if (!migration_is_running()) { break; } /* If the migration is paused, kick it out of the pause */ @@ -1544,16 +1548,6 @@ int migration_call_notifiers(MigrationState *s, MigrationEventType type, return ret; } -bool migration_in_setup(MigrationState *s) -{ - return s->state == MIGRATION_STATUS_SETUP; -} - -bool migration_has_finished(MigrationState *s) -{ - return s->state == MIGRATION_STATUS_COMPLETED; -} - bool migration_has_failed(MigrationState *s) { return (s->state == MIGRATION_STATUS_CANCELLED || @@ -1601,10 +1595,8 @@ bool migration_incoming_postcopy_advised(void) bool migration_in_bg_snapshot(void) { - MigrationState *s = migrate_get_current(); - return migrate_background_snapshot() && - migration_is_setup_or_active(s->state); + migration_is_setup_or_active(); } bool migration_is_idle(void) @@ -1637,12 +1629,28 @@ bool migration_is_idle(void) return false; } -bool migration_is_active(MigrationState *s) +bool migration_is_active(void) { + MigrationState *s = current_migration; + return (s->state == MIGRATION_STATUS_ACTIVE || s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); } +bool migration_is_device(void) +{ + MigrationState *s = current_migration; + + return s->state == MIGRATION_STATUS_DEVICE; +} + +bool migration_thread_is_self(void) +{ + MigrationState *s = current_migration; + + return qemu_thread_is_self(&s->thread); +} + bool migrate_mode_is_cpr(MigrationState *s) { return s->parameters.mode == MIG_MODE_CPR_REBOOT; @@ -1960,7 +1968,7 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, return true; } - if (migration_is_running(s->state)) { + if (migration_is_running()) { error_setg(errp, 
QERR_MIGRATION_ACTIVE); return false; } @@ -2297,7 +2305,7 @@ static void *source_return_path_thread(void *opaque) trace_source_return_path_thread_entry(); rcu_register_thread(); - while (migration_is_setup_or_active(ms->state)) { + while (migration_is_setup_or_active()) { trace_source_return_path_thread_loop_top(); header_type = qemu_get_be16(rp); @@ -3020,6 +3028,17 @@ static MigThrError postcopy_pause(MigrationState *s) } } +void migration_file_set_error(int err) +{ + MigrationState *s = current_migration; + + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) { + if (s->to_dst_file) { + qemu_file_set_error(s->to_dst_file, err); + } + } +} + static MigThrError migration_detect_error(MigrationState *s) { int ret; @@ -3461,7 +3480,7 @@ static void *migration_thread(void *opaque) trace_migration_thread_setup_complete(); - while (migration_is_active(s)) { + while (migration_is_active()) { if (urgent || !migration_rate_exceeded(s->to_dst_file)) { MigIterateState iter_state = migration_iteration_run(s); if (iter_state == MIG_ITERATE_SKIP) { @@ -3607,7 +3626,7 @@ static void *bg_migration_thread(void *opaque) migration_bh_schedule(bg_migration_vm_start_bh, s); bql_unlock(); - while (migration_is_active(s)) { + while (migration_is_active()) { MigIterateState iter_state = bg_migration_iteration_run(s); if (iter_state == MIG_ITERATE_SKIP) { continue; diff --git a/migration/migration.h b/migration/migration.h index 65c0b61cbd..8045e39c26 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -26,6 +26,7 @@ #include "qom/object.h" #include "postcopy-ram.h" #include "sysemu/runstate.h" +#include "migration/misc.h" struct PostcopyBlocktimeContext; @@ -479,8 +480,8 @@ bool migrate_has_error(MigrationState *s); void migrate_fd_connect(MigrationState *s, Error *error_in); -bool migration_is_setup_or_active(int state); -bool migration_is_running(int state); +int migration_call_notifiers(MigrationState *s, MigrationEventType type, + Error **errp); int migrate_init(MigrationState *s, Error **errp); bool migration_is_blocked(Error **errp); @@ -488,6 +489,8 @@ bool migration_is_blocked(Error **errp); bool migration_in_postcopy(void); bool migration_postcopy_is_alive(int state); MigrationState *migrate_get_current(void); +bool migration_has_failed(MigrationState *); +bool migrate_mode_is_cpr(MigrationState *); uint64_t ram_get_total_transferred_pages(void); diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c new file mode 100644 index 0000000000..1ba38be636 --- /dev/null +++ b/migration/multifd-zero-page.c @@ -0,0 +1,87 @@ +/* + * Multifd zero page detection implementation. + * + * Copyright (c) 2024 Bytedance Inc + * + * Authors: + * Hao Xiang <hao.xiang@bytedance.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "exec/ramblock.h" +#include "migration.h" +#include "multifd.h" +#include "options.h" +#include "ram.h" + +static bool multifd_zero_page_enabled(void) +{ + return migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD; +} + +static void swap_page_offset(ram_addr_t *pages_offset, int a, int b) +{ + ram_addr_t temp; + + if (a == b) { + return; + } + + temp = pages_offset[a]; + pages_offset[a] = pages_offset[b]; + pages_offset[b] = temp; +} + +/** + * multifd_send_zero_page_detect: Perform zero page detection on all pages. 
+ * + * Sorts normal pages before zero pages in p->pages->offset and updates + * p->pages->normal_num. + * + * @param p A pointer to the send params. + */ +void multifd_send_zero_page_detect(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + RAMBlock *rb = pages->block; + int i = 0; + int j = pages->num - 1; + + if (!multifd_zero_page_enabled()) { + pages->normal_num = pages->num; + return; + } + + /* + * Sort the page offset array by moving all normal pages to + * the left and all zero pages to the right of the array. + */ + while (i <= j) { + uint64_t offset = pages->offset[i]; + + if (!buffer_is_zero(rb->host + offset, p->page_size)) { + i++; + continue; + } + + swap_page_offset(pages->offset, i, j); + ram_release_page(rb->idstr, offset); + j--; + } + + pages->normal_num = i; +} + +void multifd_recv_zero_page_process(MultiFDRecvParams *p) +{ + for (int i = 0; i < p->zero_num; i++) { + void *page = p->host + p->zero[i]; + if (!buffer_is_zero(page, p->page_size)) { + memset(page, 0, p->page_size); + } + } +} diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 6120faad65..83c0374380 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -123,13 +123,15 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; - multifd_send_prepare_header(p); + if (!multifd_send_prepare_common(p)) { + goto out; + } - for (i = 0; i < pages->num; i++) { + for (i = 0; i < pages->normal_num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; - if (i == pages->num - 1) { + if (i == pages->normal_num - 1) { flush = Z_SYNC_FLUSH; } @@ -172,10 +174,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = out_size; p->iovs_num++; p->next_packet_size = out_size; - p->flags |= MULTIFD_FLAG_ZLIB; +out: + p->flags |= MULTIFD_FLAG_ZLIB; multifd_send_fill_packet(p); - return 0; } @@ -261,6 +263,14 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_ZLIB); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); if (ret != 0) { @@ -310,6 +320,7 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) p->id, out_size, expected_size); return -1; } + return 0; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index cac236833d..02112255ad 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -118,16 +118,18 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; - multifd_send_prepare_header(p); + if (!multifd_send_prepare_common(p)) { + goto out; + } z->out.dst = z->zbuff; z->out.size = z->zbuff_len; z->out.pos = 0; - for (i = 0; i < pages->num; i++) { + for (i = 0; i < pages->normal_num; i++) { ZSTD_EndDirective flush = ZSTD_e_continue; - if (i == pages->num - 1) { + if (i == pages->normal_num - 1) { flush = ZSTD_e_flush; } z->in.src = p->pages->block->host + pages->offset[i]; @@ -161,10 +163,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = z->out.pos; p->iovs_num++; p->next_packet_size = z->out.pos; - p->flags |= MULTIFD_FLAG_ZSTD; +out: + p->flags |= MULTIFD_FLAG_ZSTD; multifd_send_fill_packet(p); - return 0; } @@ -257,6 +259,14 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_ZSTD); return -1; } + + multifd_recv_zero_page_process(p); + + if 
(!p->normal_num) { + assert(in_size == 0); + return 0; + } + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); if (ret != 0) { diff --git a/migration/multifd.c b/migration/multifd.c index d4a44da559..0179422f6d 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -11,6 +11,7 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qemu/rcu.h" #include "exec/target_page.h" #include "sysemu/sysemu.h" @@ -111,11 +112,16 @@ void multifd_send_channel_created(void) static void multifd_set_file_bitmap(MultiFDSendParams *p) { MultiFDPages_t *pages = p->pages; + uint32_t zero_num = p->pages->num - p->pages->normal_num; assert(pages->block); - for (int i = 0; i < p->pages->num; i++) { - ramblock_set_file_bmap_atomic(pages->block, pages->offset[i]); + for (int i = 0; i < p->pages->normal_num; i++) { + ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], true); + } + + for (int i = p->pages->num; i < zero_num; i++) { + ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], false); } } @@ -153,13 +159,13 @@ static void multifd_send_prepare_iovs(MultiFDSendParams *p) { MultiFDPages_t *pages = p->pages; - for (int i = 0; i < pages->num; i++) { + for (int i = 0; i < pages->normal_num; i++) { p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; p->iov[p->iovs_num].iov_len = p->page_size; p->iovs_num++; } - p->next_packet_size = pages->num * p->page_size; + p->next_packet_size = pages->normal_num * p->page_size; } /** @@ -178,6 +184,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) bool use_zero_copy_send = migrate_zero_copy_send(); int ret; + multifd_send_zero_page_detect(p); + if (!multifd_use_packets()) { multifd_send_prepare_iovs(p); multifd_set_file_bitmap(p); @@ -261,6 +269,13 @@ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_NOCOMP); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + return 0; + } + for (int i = 0; i < p->normal_num; i++) { p->iov[i].iov_base = p->host + p->normal[i]; p->iov[i].iov_len = p->page_size; @@ -295,6 +310,7 @@ static void multifd_pages_reset(MultiFDPages_t *pages) * overwritten later when reused. 
*/ pages->num = 0; + pages->normal_num = 0; pages->block = NULL; } @@ -386,11 +402,13 @@ void multifd_send_fill_packet(MultiFDSendParams *p) MultiFDPacket_t *packet = p->packet; MultiFDPages_t *pages = p->pages; uint64_t packet_num; + uint32_t zero_num = pages->num - pages->normal_num; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); - packet->normal_pages = cpu_to_be32(pages->num); + packet->normal_pages = cpu_to_be32(pages->normal_num); + packet->zero_pages = cpu_to_be32(zero_num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); @@ -408,10 +426,11 @@ void multifd_send_fill_packet(MultiFDSendParams *p) } p->packets_sent++; - p->total_normal_pages += pages->num; + p->total_normal_pages += pages->normal_num; + p->total_zero_pages += zero_num; - trace_multifd_send(p->id, packet_num, pages->num, p->flags, - p->next_packet_size); + trace_multifd_send(p->id, packet_num, pages->normal_num, zero_num, + p->flags, p->next_packet_size); } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -452,20 +471,29 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal_num = be32_to_cpu(packet->normal_pages); if (p->normal_num > packet->pages_alloc) { error_setg(errp, "multifd: received packet " - "with %u pages and expected maximum pages are %u", + "with %u normal pages and expected maximum pages are %u", p->normal_num, packet->pages_alloc) ; return -1; } + p->zero_num = be32_to_cpu(packet->zero_pages); + if (p->zero_num > packet->pages_alloc - p->normal_num) { + error_setg(errp, "multifd: received packet " + "with %u zero pages and expected maximum zero pages are %u", + p->zero_num, packet->pages_alloc - p->normal_num) ; + return -1; + } + p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); p->packets_recved++; p->total_normal_pages += p->normal_num; + p->total_zero_pages += p->zero_num; - trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags, - p->next_packet_size); + trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num, + p->flags, p->next_packet_size); - if (p->normal_num == 0) { + if (p->normal_num == 0 && p->zero_num == 0) { return 0; } @@ -491,6 +519,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal[i] = offset; } + for (i = 0; i < p->zero_num; i++) { + uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]); + + if (offset > (p->block->used_length - p->page_size)) { + error_setg(errp, "multifd: offset too long %" PRIu64 + " (max " RAM_ADDR_FMT ")", + offset, p->block->used_length); + return -1; + } + p->zero[i] = offset; + } + return 0; } @@ -710,16 +750,26 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) if (p->c) { migration_ioc_unregister_yank(p->c); /* - * An explicit close() on the channel here is normally not - * required, but can be helpful for "file:" iochannels, where it - * will include fdatasync() to make sure the data is flushed to the - * disk backend. + * The object_unref() cannot guarantee the fd will always be + * released because finalize() of the iochannel is only + * triggered on the last reference and it's not guaranteed + * that we always hold the last refcount when reaching here. 
+ * + * Closing the fd explicitly has the benefit that if there is any + * registered I/O handler callbacks on such fd, that will get a + * POLLNVAL event and will further trigger the cleanup to finally + * release the IOC. * - * The object_unref() cannot guarantee that because: (1) finalize() - * of the iochannel is only triggered on the last reference, and - * it's not guaranteed that we always hold the last refcount when - * reaching here, and, (2) even if finalize() is invoked, it only - * does a close(fd) without data flush. + * FIXME: It should logically be guaranteed that all multifd + * channels have no I/O handler callback registered when reaching + * here, because migration thread will wait for all multifd channel + * establishments to complete during setup. Since + * migrate_fd_cleanup() will be scheduled in main thread too, all + * previous callbacks should guarantee to be completed when + * reaching here. See multifd_send_state.channels_created and its + * usage. In the future, we could replace this with an assert + * making sure we're the last reference, or simply drop it if above + * is more clear to be justified. */ qio_channel_close(p->c, &error_abort); object_unref(OBJECT(p->c)); @@ -908,6 +958,8 @@ static void *multifd_send_thread(void *opaque) stat64_add(&mig_stats.multifd_bytes, p->next_packet_size + p->packet_len); + stat64_add(&mig_stats.normal_pages, pages->normal_num); + stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num); multifd_pages_reset(p->pages); p->next_packet_size = 0; @@ -955,7 +1007,8 @@ out: rcu_unregister_thread(); migration_threads_remove(thread); - trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages, + p->total_zero_pages); return NULL; } @@ -1306,6 +1359,8 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) p->iov = NULL; g_free(p->normal); p->normal = NULL; + g_free(p->zero); + p->zero = NULL; multifd_recv_state->ops->recv_cleanup(p); } @@ -1439,7 +1494,7 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - has_data = !!p->normal_num; + has_data = p->normal_num || p->zero_num; qemu_mutex_unlock(&p->mutex); } else { /* @@ -1497,7 +1552,9 @@ static void *multifd_recv_thread(void *opaque) } rcu_unregister_thread(); - trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); + trace_multifd_recv_thread_end(p->id, p->packets_recved, + p->total_normal_pages, + p->total_zero_pages); return NULL; } @@ -1549,6 +1606,7 @@ int multifd_recv_setup(Error **errp) p->name = g_strdup_printf("multifdrecv_%d", i); p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); + p->zero = g_new0(ram_addr_t, page_count); p->page_count = page_count; p->page_size = qemu_target_page_size(); } @@ -1623,3 +1681,17 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); } + +bool multifd_send_prepare_common(MultiFDSendParams *p) +{ + multifd_send_zero_page_detect(p); + + if (!p->pages->normal_num) { + p->next_packet_size = 0; + return false; + } + + multifd_send_prepare_header(p); + + return true; +} diff --git a/migration/multifd.h b/migration/multifd.h index 7447c2bea3..c9d9b09239 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -55,14 +55,24 @@ typedef struct { /* size of the next packet that contains pages 
*/ uint32_t next_packet_size; uint64_t packet_num; - uint64_t unused[4]; /* Reserved for future use */ + /* zero pages */ + uint32_t zero_pages; + uint32_t unused32[1]; /* Reserved for future use */ + uint64_t unused64[3]; /* Reserved for future use */ char ramblock[256]; + /* + * This array contains the pointers to: + * - normal pages (initial normal_pages entries) + * - zero pages (following zero_pages entries) + */ uint64_t offset[]; } __attribute__((packed)) MultiFDPacket_t; typedef struct { /* number of used pages */ uint32_t num; + /* number of normal pages */ + uint32_t normal_num; /* number of allocated pages */ uint32_t allocated; /* offset of each page */ @@ -136,6 +146,8 @@ typedef struct { uint64_t packets_sent; /* non zero pages sent through this channel */ uint64_t total_normal_pages; + /* zero pages sent through this channel */ + uint64_t total_zero_pages; /* buffers to send */ struct iovec *iov; /* number of iovs used */ @@ -194,12 +206,18 @@ typedef struct { uint8_t *host; /* non zero pages recv through this channel */ uint64_t total_normal_pages; + /* zero pages recv through this channel */ + uint64_t total_zero_pages; /* buffers to recv */ struct iovec *iov; /* Pages that are not zero */ ram_addr_t *normal; /* num of non zero pages */ uint32_t normal_num; + /* Pages that are zero */ + ram_addr_t *zero; + /* num of zero pages */ + uint32_t zero_num; /* used for de-compression methods */ void *compress_data; } MultiFDRecvParams; @@ -221,6 +239,9 @@ typedef struct { void multifd_register_ops(int method, MultiFDMethods *ops); void multifd_send_fill_packet(MultiFDSendParams *p); +bool multifd_send_prepare_common(MultiFDSendParams *p); +void multifd_send_zero_page_detect(MultiFDSendParams *p); +void multifd_recv_zero_page_process(MultiFDRecvParams *p); static inline void multifd_send_prepare_header(MultiFDSendParams *p) { diff --git a/migration/options.c b/migration/options.c index 40eb930940..9ed2fe4bee 100644 --- a/migration/options.c +++ b/migration/options.c @@ -179,6 +179,9 @@ Property migration_properties[] = { DEFINE_PROP_MIG_MODE("mode", MigrationState, parameters.mode, MIG_MODE_NORMAL), + DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState, + parameters.zero_page_detection, + ZERO_PAGE_DETECTION_MULTIFD), /* Migration capabilities */ DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), @@ -681,7 +684,7 @@ bool migrate_cap_set(int cap, bool value, Error **errp) MigrationState *s = migrate_get_current(); bool new_caps[MIGRATION_CAPABILITY__MAX]; - if (migration_is_running(s->state)) { + if (migration_is_running()) { error_setg(errp, QERR_MIGRATION_ACTIVE); return false; } @@ -725,7 +728,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, MigrationCapabilityStatusList *cap; bool new_caps[MIGRATION_CAPABILITY__MAX]; - if (migration_is_running(s->state) || migration_in_colo_state()) { + if (migration_is_running() || migration_in_colo_state()) { error_setg(errp, QERR_MIGRATION_ACTIVE); return; } @@ -924,6 +927,13 @@ const char *migrate_tls_hostname(void) return s->parameters.tls_hostname; } +uint64_t migrate_vcpu_dirty_limit_period(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.x_vcpu_dirty_limit_period; +} + uint64_t migrate_xbzrle_cache_size(void) { MigrationState *s = migrate_get_current(); @@ -931,6 +941,13 @@ uint64_t migrate_xbzrle_cache_size(void) return s->parameters.xbzrle_cache_size; } +ZeroPageDetection migrate_zero_page_detection(void) +{ + MigrationState *s = 
migrate_get_current(); + + return s->parameters.zero_page_detection; +} + /* parameter setters */ void migrate_set_block_incremental(bool value) @@ -1041,6 +1058,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit; params->has_mode = true; params->mode = s->parameters.mode; + params->has_zero_page_detection = true; + params->zero_page_detection = s->parameters.zero_page_detection; return params; } @@ -1077,6 +1096,7 @@ void migrate_params_init(MigrationParameters *params) params->has_x_vcpu_dirty_limit_period = true; params->has_vcpu_dirty_limit = true; params->has_mode = true; + params->has_zero_page_detection = true; } /* @@ -1391,6 +1411,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_mode) { dest->mode = params->mode; } + + if (params->has_zero_page_detection) { + dest->zero_page_detection = params->zero_page_detection; + } } static void migrate_params_apply(MigrateSetParameters *params, Error **errp) @@ -1541,6 +1565,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_mode) { s->parameters.mode = params->mode; } + + if (params->has_zero_page_detection) { + s->parameters.zero_page_detection = params->zero_page_detection; + } } void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp) diff --git a/migration/options.h b/migration/options.h index 6ddd8dad9b..ab8199e207 100644 --- a/migration/options.h +++ b/migration/options.h @@ -16,6 +16,7 @@ #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "migration/client-options.h" /* migration properties */ @@ -24,12 +25,10 @@ extern Property migration_properties[]; /* capabilities */ bool migrate_auto_converge(void); -bool migrate_background_snapshot(void); bool migrate_block(void); bool migrate_colo(void); bool migrate_compress(void); bool migrate_dirty_bitmaps(void); -bool migrate_dirty_limit(void); bool migrate_events(void); bool migrate_mapped_ram(void); bool migrate_ignore_shared(void); @@ -38,11 +37,9 @@ bool migrate_multifd(void); bool migrate_pause_before_switchover(void); bool migrate_postcopy_blocktime(void); bool migrate_postcopy_preempt(void); -bool migrate_postcopy_ram(void); bool migrate_rdma_pin_all(void); bool migrate_release_ram(void); bool migrate_return_path(void); -bool migrate_switchover_ack(void); bool migrate_validate_uuid(void); bool migrate_xbzrle(void); bool migrate_zero_blocks(void); @@ -84,7 +81,6 @@ uint8_t migrate_max_cpu_throttle(void); uint64_t migrate_max_bandwidth(void); uint64_t migrate_avail_switchover_bandwidth(void); uint64_t migrate_max_postcopy_bandwidth(void); -MigMode migrate_mode(void); int migrate_multifd_channels(void); MultiFDCompression migrate_multifd_compression(void); int migrate_multifd_zlib_level(void); @@ -94,6 +90,7 @@ const char *migrate_tls_authz(void); const char *migrate_tls_creds(void); const char *migrate_tls_hostname(void); uint64_t migrate_xbzrle_cache_size(void); +ZeroPageDetection migrate_zero_page_detection(void); /* parameters setters */ diff --git a/migration/qemu-file.c b/migration/qemu-file.c index b10c882629..a10882d47f 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -63,6 +63,8 @@ struct QEMUFile { */ int qemu_file_shutdown(QEMUFile *f) { + Error *err = NULL; + /* * We must set qemufile error before the real shutdown(), otherwise * there can be a race window where we thought IO all went though @@ -91,7 +93,8 @@ int qemu_file_shutdown(QEMUFile *f) 
return -ENOSYS; } - if (qio_channel_shutdown(f->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL) < 0) { + if (qio_channel_shutdown(f->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, &err) < 0) { + error_report_err(err); return -EIO; } diff --git a/migration/ram.c b/migration/ram.c index 003c28e133..8deb84984f 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1140,6 +1140,10 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, QEMUFile *file = pss->pss_channel; int len = 0; + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) { + return 0; + } + if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) { return 0; } @@ -1284,7 +1288,6 @@ static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) if (!multifd_queue_page(block, offset)) { return -1; } - stat64_add(&mig_stats.normal_pages, 1); return 1; } @@ -2076,7 +2079,6 @@ static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, */ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) { - RAMBlock *block = pss->block; ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; int res; @@ -2092,17 +2094,33 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) return 1; } + return ram_save_page(rs, pss); +} + +/** + * ram_save_target_page_multifd: send one target page to multifd workers + * + * Returns 1 if the page was queued, -1 otherwise. + * + * @rs: current RAM state + * @pss: data about the page we want to send + */ +static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss) +{ + RAMBlock *block = pss->block; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + /* - * Do not use multifd in postcopy as one whole host page should be - * placed. Meanwhile postcopy requires atomic update of pages, so even - * if host page size == guest page size the dest guest during run may - * still see partially copied pages which is data corruption. + * While using multifd live migration, we still need to handle zero + * page checking on the migration main thread. 
*/ - if (migrate_multifd() && !migration_in_postcopy()) { - return ram_save_multifd_page(block, offset); + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { + if (save_zero_page(rs, pss, offset)) { + return 1; + } } - return ram_save_page(rs, pss); + return ram_save_multifd_page(block, offset); } /* Should be called before sending a host page */ @@ -2909,10 +2927,9 @@ void qemu_guest_free_page_hint(void *addr, size_t len) RAMBlock *block; ram_addr_t offset; size_t used_len, start, npages; - MigrationState *s = migrate_get_current(); /* This function is currently expected to be used during live migration */ - if (!migration_is_setup_or_active(s->state)) { + if (!migration_is_setup_or_active()) { return; } @@ -3110,7 +3127,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque) } migration_ops = g_malloc0(sizeof(MigrationOps)); - migration_ops->ram_save_target_page = ram_save_target_page_legacy; + + if (migrate_multifd()) { + migration_ops->ram_save_target_page = ram_save_target_page_multifd; + } else { + migration_ops->ram_save_target_page = ram_save_target_page_legacy; + } bql_unlock(); ret = multifd_send_sync_main(); @@ -3150,9 +3172,13 @@ static void ram_save_file_bmap(QEMUFile *f) } } -void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset) +void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, bool set) { - set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); + if (set) { + set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); + } else { + clear_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); + } } /** @@ -3263,7 +3289,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) out: if (ret >= 0 - && migration_is_setup_or_active(migrate_get_current()->state)) { + && migration_is_setup_or_active()) { if (migrate_multifd() && migrate_multifd_flush_after_each_section() && !migrate_mapped_ram()) { ret = multifd_send_sync_main(); @@ -4214,6 +4240,12 @@ static int ram_load_precopy(QEMUFile *f) i++; addr = qemu_get_be64(f); + ret = qemu_file_get_error(f); + if (ret) { + error_report("Getting RAM address failed"); + break; + } + flags = addr & ~TARGET_PAGE_MASK; addr &= TARGET_PAGE_MASK; diff --git a/migration/ram.h b/migration/ram.h index b9ac0da587..08feecaf51 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -75,7 +75,8 @@ bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start); void postcopy_preempt_shutdown_file(MigrationState *s); void *postcopy_preempt_thread(void *opaque); -void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset); +void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, + bool set); /* ram cache */ int colo_init_ram_cache(void); diff --git a/migration/rdma.c b/migration/rdma.c index a355dcea89..855753c671 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -3357,7 +3357,7 @@ static int qemu_rdma_accept(RDMAContext *rdma) goto err_rdma_dest_wait; } - isock->host = rdma->host; + isock->host = g_strdup(rdma->host); isock->port = g_strdup_printf("%d", rdma->port); /* diff --git a/migration/savevm.c b/migration/savevm.c index dc1fb9c0d3..388d7af7cd 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1317,7 +1317,7 @@ void qemu_savevm_state_setup(QEMUFile *f) MigrationState *ms = migrate_get_current(); SaveStateEntry *se; Error *local_err = NULL; - int ret; + int ret = 0; json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size()); 
json_writer_start_array(ms->vmdesc, "devices"); @@ -1351,6 +1351,10 @@ void qemu_savevm_state_setup(QEMUFile *f) } } + if (ret) { + return; + } + if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) { error_report_err(local_err); } @@ -1390,7 +1394,8 @@ int qemu_savevm_state_resume_prepare(MigrationState *s) int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy) { SaveStateEntry *se; - int ret = 1; + bool all_finished = true; + int ret; trace_savevm_state_iterate(); QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { @@ -1431,16 +1436,12 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy) "%d(%s): %d", se->section_id, se->idstr, ret); qemu_file_set_error(f, ret); - } - if (ret <= 0) { - /* Do not proceed to the next vmstate before this one reported - completion of the current stage. This serializes the migration - and reduces the probability that a faster changing state is - synchronized over and over again. */ - break; + return ret; + } else if (!ret) { + all_finished = false; } } - return ret; + return all_finished; } static bool should_send_vmdesc(void) @@ -1705,7 +1706,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) MigrationState *ms = migrate_get_current(); MigrationStatus status; - if (migration_is_running(ms->state)) { + if (migration_is_running()) { error_setg(errp, QERR_MIGRATION_ACTIVE); return -EINVAL; } diff --git a/migration/trace-events b/migration/trace-events index bf1a069632..f0e1cb80c7 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -128,21 +128,21 @@ postcopy_preempt_reset_channel(void) "" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %u" multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p" -multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " pages %u flags 0x%x next packet size %u" +multifd_recv(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t zero, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" -multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 +multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" -multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u flags 0x%x next packet size %u" +multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal_pages, uint32_t zero_pages, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_send_error(uint8_t id) "channel %u" multifd_send_sync_main(long packet_num) "packet num %ld" multifd_send_sync_main_signal(uint8_t id) "channel %u" multifd_send_sync_main_wait(uint8_t id) "channel %u" multifd_send_terminate_threads(void) "" -multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 
+multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64
 multifd_send_thread_start(uint8_t id) "%u"
 multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s"
 multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s"
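
Note: this series introduces a zero-page-detection migration parameter (ZeroPageDetection enum values "none", "legacy" and "multifd"; default "multifd" per the property definition in options.c above). A minimal usage sketch, assuming a QEMU build that includes this series:

    # HMP: choose where zero-page detection runs during migration
    (qemu) migrate_set_parameter zero-page-detection multifd

    # QMP equivalent
    { "execute": "migrate-set-parameters",
      "arguments": { "zero-page-detection": "legacy" } }

With "multifd", zero-page scanning is performed by the multifd send threads; "legacy" keeps the check on the main migration thread as before, and "none" skips zero-page detection entirely (see save_zero_page() and ram_save_target_page_multifd() in ram.c above).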