Diffstat (limited to 'migration')
 migration/colo.c      | 102
 migration/migration.c |  16
 migration/qemu-file.c |  59
 migration/ram.c       |  78
 migration/savevm.c    |   2
 migration/vmstate.c   |  44
6 files changed, 266 insertions(+), 35 deletions(-)
diff --git a/migration/colo.c b/migration/colo.c
index 93c85c538b..712308ed5e 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -20,6 +20,8 @@
 #include "qapi/error.h"
 #include "migration/failover.h"
 
+static bool vmstate_loading;
+
 #define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024)
 
 bool colo_supported(void)
@@ -51,6 +53,19 @@ static void secondary_vm_do_failover(void)
     int old_state;
     MigrationIncomingState *mis = migration_incoming_get_current();
 
+    /* Failover must not happen while the VM's state is being loaded,
+     * or it will break the secondary VM.
+     */
+    if (vmstate_loading) {
+        old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
+                                       FAILOVER_STATUS_RELAUNCH);
+        if (old_state != FAILOVER_STATUS_ACTIVE) {
+            error_report("Unknown error while doing failover for secondary VM, "
+                         "old_state: %s", FailoverStatus_lookup[old_state]);
+        }
+        return;
+    }
+
     migrate_set_state(&mis->state, MIGRATION_STATUS_COLO,
                       MIGRATION_STATUS_COMPLETED);
 
@@ -59,6 +74,18 @@ static void secondary_vm_do_failover(void)
         /* recover runstate to normal migration finish state */
         autostart = true;
     }
+    /*
+     * Make sure the COLO incoming thread does not block in recv() or send().
+     * If mis->from_src_file and mis->to_src_file use the same fd, the
+     * second shutdown() will return -1; we ignore that value since it
+     * is harmless.
+     */
+    if (mis->from_src_file) {
+        qemu_file_shutdown(mis->from_src_file);
+    }
+    if (mis->to_src_file) {
+        qemu_file_shutdown(mis->to_src_file);
+    }
 
     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
                                    FAILOVER_STATUS_COMPLETED);
@@ -67,6 +94,8 @@ static void secondary_vm_do_failover(void)
                      "secondary VM", FailoverStatus_lookup[old_state]);
         return;
     }
+    /* Notify COLO incoming thread that failover work is finished */
+    qemu_sem_post(&mis->colo_incoming_sem);
     /* For Secondary VM, jump to incoming co */
     if (mis->migration_incoming_co) {
         qemu_coroutine_enter(mis->migration_incoming_co);
@@ -81,6 +110,18 @@ static void primary_vm_do_failover(void)
 
     migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
                       MIGRATION_STATUS_COMPLETED);
+    /*
+     * Wake up the COLO thread, which may be blocked in recv() or send().
+     * s->rp_state.from_dst_file and s->to_dst_file may use the same fd,
+     * but shutting the fd down twice is harmless.
+     */
+    if (s->to_dst_file) {
+        qemu_file_shutdown(s->to_dst_file);
+    }
+    if (s->rp_state.from_dst_file) {
+        qemu_file_shutdown(s->rp_state.from_dst_file);
+    }
+
     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
                                    FAILOVER_STATUS_COMPLETED);
     if (old_state != FAILOVER_STATUS_ACTIVE) {
@@ -88,6 +129,8 @@ static void primary_vm_do_failover(void)
                      FailoverStatus_lookup[old_state]);
         return;
     }
+    /* Notify COLO thread that failover work is finished */
+    qemu_sem_post(&s->colo_exit_sem);
 }
 
 void colo_do_failover(MigrationState *s)
@@ -302,7 +345,7 @@ static void colo_process_checkpoint(MigrationState *s)
 {
     QIOChannelBuffer *bioc;
     QEMUFile *fb = NULL;
-    int64_t current_time, checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     Error *local_err = NULL;
     int ret;
 
@@ -332,26 +375,21 @@ static void colo_process_checkpoint(MigrationState *s)
     qemu_mutex_unlock_iothread();
     trace_colo_vm_state_change("stop", "run");
 
+    timer_mod(s->colo_delay_timer,
+              current_time + s->parameters.x_checkpoint_delay);
+
     while (s->state == MIGRATION_STATUS_COLO) {
         if (failover_get_state() != FAILOVER_STATUS_NONE) {
             error_report("failover request");
             goto out;
         }
 
-        current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
-        if (current_time - checkpoint_time <
-            s->parameters.x_checkpoint_delay) {
-            int64_t delay_ms;
+        qemu_sem_wait(&s->colo_checkpoint_sem);
 
-            delay_ms = s->parameters.x_checkpoint_delay -
-                       (current_time - checkpoint_time);
-            g_usleep(delay_ms * 1000);
-        }
         ret = colo_do_checkpoint_transaction(s, bioc, fb);
         if (ret < 0) {
             goto out;
         }
-        checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     }
 
 out:
@@ -364,14 +402,41 @@ out:
         qemu_fclose(fb);
     }
 
+    timer_del(s->colo_delay_timer);
+
+    /* Hopefully we won't have to wait here for too long */
+    qemu_sem_wait(&s->colo_exit_sem);
+    qemu_sem_destroy(&s->colo_exit_sem);
+    /*
+     * Must be called after the failover BH has completed,
+     * or the failover BH may shut down an fd that has been
+     * reused by another thread after we release it here.
+     */
     if (s->rp_state.from_dst_file) {
         qemu_fclose(s->rp_state.from_dst_file);
     }
 }
 
+void colo_checkpoint_notify(void *opaque)
+{
+    MigrationState *s = opaque;
+    int64_t next_notify_time;
+
+    qemu_sem_post(&s->colo_checkpoint_sem);
+    s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    next_notify_time = s->colo_checkpoint_time +
+                       s->parameters.x_checkpoint_delay;
+    timer_mod(s->colo_delay_timer, next_notify_time);
+}
+
 void migrate_start_colo_process(MigrationState *s)
 {
     qemu_mutex_unlock_iothread();
+    qemu_sem_init(&s->colo_checkpoint_sem, 0);
+    s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST,
+                                       colo_checkpoint_notify, s);
+
+    qemu_sem_init(&s->colo_exit_sem, 0);
     migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_COLO);
     colo_process_checkpoint(s);
@@ -410,6 +475,8 @@ void *colo_process_incoming_thread(void *opaque)
     uint64_t value;
     Error *local_err = NULL;
 
+    qemu_sem_init(&mis->colo_incoming_sem, 0);
+
     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_COLO);
 
@@ -496,13 +563,23 @@ void *colo_process_incoming_thread(void *opaque)
 
         qemu_mutex_lock_iothread();
         qemu_system_reset(VMRESET_SILENT);
+        vmstate_loading = true;
         if (qemu_loadvm_state(fb) < 0) {
             error_report("COLO: loadvm failed");
             qemu_mutex_unlock_iothread();
             goto out;
         }
+
+        vmstate_loading = false;
         qemu_mutex_unlock_iothread();
 
+        if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) {
+            failover_set_state(FAILOVER_STATUS_RELAUNCH,
+                               FAILOVER_STATUS_NONE);
+            failover_request_active(NULL);
+            goto out;
+        }
+
         colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_LOADED,
                      &local_err);
         if (local_err) {
@@ -511,6 +588,7 @@
     }
 
 out:
+    vmstate_loading = false;
     /* Throw the unreported error message after exited from loop */
     if (local_err) {
         error_report_err(local_err);
@@ -520,6 +598,10 @@ out:
         qemu_fclose(fb);
     }
 
+    /* Hopefully we won't have to wait here for too long */
+    qemu_sem_wait(&mis->colo_incoming_sem);
+    qemu_sem_destroy(&mis->colo_incoming_sem);
+    /* Must be called after failover BH is completed */
     if (mis->to_src_file) {
         qemu_fclose(mis->to_src_file);
     }
diff --git a/migration/migration.c b/migration/migration.c
index 2b179c69fa..c6ae69d371 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -891,6 +891,9 @@ void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp)
 
     if (params->has_x_checkpoint_delay) {
         s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
+        if (migration_in_colo_state()) {
+            colo_checkpoint_notify(s);
+        }
     }
 }
 
@@ -1297,6 +1300,15 @@ void qmp_migrate_set_downtime(double value, Error **errp)
     qmp_migrate_set_parameters(&p, errp);
 }
 
+bool migrate_release_ram(void)
+{
+    MigrationState *s;
+
+    s = migrate_get_current();
+
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM];
+}
+
 bool migrate_postcopy_ram(void)
 {
     MigrationState *s;
@@ -1713,6 +1725,10 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running)
      */
     qemu_savevm_send_ping(ms->to_dst_file, 4);
 
+    if (migrate_release_ram()) {
+        ram_postcopy_migrated_memory_release(ms);
+    }
+
     ret = qemu_file_get_error(ms->to_dst_file);
     if (ret) {
         error_report("postcopy_start: Migration stream errored");
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index e9fae31158..195fa94fcf 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -49,6 +49,7 @@ struct QEMUFile {
     int buf_size; /* 0 when writing */
     uint8_t buf[IO_BUF_SIZE];
 
+    DECLARE_BITMAP(may_free, MAX_IOV_SIZE);
     struct iovec iov[MAX_IOV_SIZE];
     unsigned int iovcnt;
 
@@ -132,6 +133,41 @@ bool qemu_file_is_writable(QEMUFile *f)
     return f->ops->writev_buffer;
 }
 
+static void qemu_iovec_release_ram(QEMUFile *f)
+{
+    struct iovec iov;
+    unsigned long idx;
+
+    /* Find and release all the contiguous memory ranges marked as may_free. */
+    idx = find_next_bit(f->may_free, f->iovcnt, 0);
+    if (idx >= f->iovcnt) {
+        return;
+    }
+    iov = f->iov[idx];
+
+    /* Inside the loop, madvise() is called for each completed contiguous
+     * range, and iov is then reinitialized to start the next range. After
+     * the loop, madvise() is called once more for the final range.
+     */
+    while ((idx = find_next_bit(f->may_free, f->iovcnt, idx + 1)) < f->iovcnt) {
+        /* check for adjacent buffer and coalesce them */
+        if (iov.iov_base + iov.iov_len == f->iov[idx].iov_base) {
+            iov.iov_len += f->iov[idx].iov_len;
+            continue;
+        }
+        if (qemu_madvise(iov.iov_base, iov.iov_len, QEMU_MADV_DONTNEED) < 0) {
+            error_report("migrate: madvise DONTNEED failed %p %zd: %s",
+                         iov.iov_base, iov.iov_len, strerror(errno));
+        }
+        iov = f->iov[idx];
+    }
+    if (qemu_madvise(iov.iov_base, iov.iov_len, QEMU_MADV_DONTNEED) < 0) {
+        error_report("migrate: madvise DONTNEED failed %p %zd: %s",
+                     iov.iov_base, iov.iov_len, strerror(errno));
+    }
+    memset(f->may_free, 0, sizeof(f->may_free));
+}
+
 /**
  * Flushes QEMUFile buffer
  *
@@ -151,6 +187,8 @@ void qemu_fflush(QEMUFile *f)
     if (f->iovcnt > 0) {
         expect = iov_size(f->iov, f->iovcnt);
         ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos);
+
+        qemu_iovec_release_ram(f);
     }
 
     if (ret >= 0) {
@@ -304,13 +342,19 @@ int qemu_fclose(QEMUFile *f)
     return ret;
 }
 
-static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size)
+static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size,
+                         bool may_free)
 {
     /* check for adjacent buffer and coalesce them */
     if (f->iovcnt > 0 && buf == f->iov[f->iovcnt - 1].iov_base +
-        f->iov[f->iovcnt - 1].iov_len) {
+        f->iov[f->iovcnt - 1].iov_len &&
+        may_free == test_bit(f->iovcnt - 1, f->may_free))
+    {
         f->iov[f->iovcnt - 1].iov_len += size;
     } else {
+        if (may_free) {
+            set_bit(f->iovcnt, f->may_free);
+        }
         f->iov[f->iovcnt].iov_base = (uint8_t *)buf;
         f->iov[f->iovcnt++].iov_len = size;
     }
@@ -320,14 +364,15 @@
     }
 }
 
-void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size)
+void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size,
+                           bool may_free)
 {
     if (f->last_error) {
         return;
     }
 
     f->bytes_xfer += size;
-    add_to_iovec(f, buf, size);
+    add_to_iovec(f, buf, size, may_free);
 }
 
 void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
@@ -345,7 +390,7 @@
         }
         memcpy(f->buf + f->buf_index, buf, l);
         f->bytes_xfer += l;
-        add_to_iovec(f, f->buf + f->buf_index, l);
+        add_to_iovec(f, f->buf + f->buf_index, l, false);
         f->buf_index += l;
         if (f->buf_index == IO_BUF_SIZE) {
             qemu_fflush(f);
@@ -366,7 +411,7 @@
 
     f->buf[f->buf_index] = v;
     f->bytes_xfer++;
-    add_to_iovec(f, f->buf + f->buf_index, 1);
+    add_to_iovec(f, f->buf + f->buf_index, 1, false);
     f->buf_index++;
     if (f->buf_index == IO_BUF_SIZE) {
         qemu_fflush(f);
@@ -647,7 +692,7 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
     }
     qemu_put_be32(f, blen);
     if (f->ops->writev_buffer) {
-        add_to_iovec(f, f->buf + f->buf_index, blen);
+        add_to_iovec(f, f->buf + f->buf_index, blen, false);
     }
     f->buf_index += blen;
     if (f->buf_index == IO_BUF_SIZE) {
diff --git a/migration/ram.c b/migration/ram.c
index ef8fadfe69..f289fcddd5 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -705,6 +705,16 @@ static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
     return pages;
 }
 
+static void ram_release_pages(MigrationState *ms, const char *block_name,
+                              uint64_t offset, int pages)
+{
+    if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
+        return;
+    }
+
+    ram_discard_range(NULL, block_name, offset, pages << TARGET_PAGE_BITS);
+}
+
 /**
  * ram_save_page: Send the given page to the stream
  *
@@ -713,13 +723,14 @@ static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
  *          >=0 - Number of pages written - this might legally be 0
  *                if xbzrle noticed the page was the same.
  *
+ * @ms: The current migration state.
  * @f: QEMUFile where to send the data
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  * @last_stage: if we are at the completion stage
  * @bytes_transferred: increase it with the number of transferred bytes
  */
-static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
+static int ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss,
                          bool last_stage, uint64_t *bytes_transferred)
 {
     int pages = -1;
@@ -764,9 +775,9 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
              * page would be stale
              */
             xbzrle_cache_zero_page(current_addr);
+            ram_release_pages(ms, block->idstr, pss->offset, pages);
         } else if (!ram_bulk_stage &&
-                   !migration_in_postcopy(migrate_get_current()) &&
-                   migrate_use_xbzrle()) {
+                   !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
             pages = save_xbzrle_page(f, &p, current_addr, block,
                                      offset, last_stage, bytes_transferred);
             if (!last_stage) {
@@ -783,7 +794,9 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
         *bytes_transferred += save_page_header(f, block,
                                                offset | RAM_SAVE_FLAG_PAGE);
         if (send_async) {
-            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
+            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
+                                  migrate_release_ram() &&
+                                  migration_in_postcopy(ms));
         } else {
             qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
         }
@@ -813,6 +826,8 @@ static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
         error_report("compressed data failed!");
     } else {
         bytes_sent += blen;
+        ram_release_pages(migrate_get_current(), block->idstr,
+                          offset & TARGET_PAGE_MASK, 1);
     }
 
     return bytes_sent;
@@ -893,14 +908,15 @@ static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
  *
  * Returns: Number of pages written.
  *
+ * @ms: The current migration state.
  * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
  * @last_stage: if we are at the completion stage
  * @bytes_transferred: increase it with the number of transferred bytes
  */
-static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
-                                    bool last_stage,
+static int ram_save_compressed_page(MigrationState *ms, QEMUFile *f,
+                                    PageSearchStatus *pss, bool last_stage,
                                     uint64_t *bytes_transferred)
 {
     int pages = -1;
@@ -951,12 +967,17 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
                 error_report("compressed data failed!");
             }
         }
+        if (pages > 0) {
+            ram_release_pages(ms, block->idstr, pss->offset, pages);
+        }
     } else {
         offset |= RAM_SAVE_FLAG_CONTINUE;
         pages = save_zero_page(f, block, offset, p, bytes_transferred);
         if (pages == -1) {
             pages = compress_page_with_multi_thread(f, block, offset,
                                                     bytes_transferred);
+        } else {
+            ram_release_pages(ms, block->idstr, pss->offset, pages);
         }
     }
 
@@ -1231,11 +1252,11 @@ static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
     if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
         unsigned long *unsentmap;
         if (compression_switch && migrate_use_compression()) {
-            res = ram_save_compressed_page(f, pss,
+            res = ram_save_compressed_page(ms, f, pss,
                                            last_stage,
                                            bytes_transferred);
         } else {
-            res = ram_save_page(f, pss, last_stage,
+            res = ram_save_page(ms, f, pss, last_stage,
                                 bytes_transferred);
         }
 
@@ -1325,6 +1346,11 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
                                  ram_addr_t space */
 
+    /* No dirty page as there is zero RAM */
+    if (!ram_bytes_total()) {
+        return pages;
+    }
+
     pss.block = last_seen_block;
     pss.offset = last_offset;
     pss.complete_round = false;
@@ -1516,6 +1542,25 @@ void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
 
 /* **** functions for postcopy ***** */
 
+void ram_postcopy_migrated_memory_release(MigrationState *ms)
+{
+    struct RAMBlock *block;
+    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        unsigned long first = block->offset >> TARGET_PAGE_BITS;
+        unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
+        unsigned long run_start = find_next_zero_bit(bitmap, range, first);
+
+        while (run_start < range) {
+            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
+            ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
+                              (run_end - run_start) << TARGET_PAGE_BITS);
+            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
+        }
+    }
+}
+
 /*
  * Callback from postcopy_each_ram_send_discard for each RAMBlock
  * Note: At this point the 'unsentmap' is the processed bitmap combined
@@ -1912,14 +1957,17 @@ static int ram_save_init_globals(void)
     bytes_transferred = 0;
     reset_ram_globals();
 
-    ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
     migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
-    migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
-    bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
-
-    if (migrate_postcopy_ram()) {
-        migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
-        bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
+    /* Skip setting bitmap if there is no RAM */
+    if (ram_bytes_total()) {
+        ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+        migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
+        bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
+
+        if (migrate_postcopy_ram()) {
+            migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
+            bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
+        }
     }
 
     /*
diff --git a/migration/savevm.c b/migration/savevm.c
index 01997687c4..5ecd264134 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -356,7 +356,7 @@ static const VMStateDescription vmstate_configuration = {
     .pre_save = configuration_pre_save,
     .fields = (VMStateField[]) {
         VMSTATE_UINT32(len, SaveState),
-        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, 0, len),
+        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
        VMSTATE_END_OF_LIST()
     },
     .subsections = (const VMStateDescription*[]) {
diff --git a/migration/vmstate.c b/migration/vmstate.c
index 2b2b3a58e6..b4d8ae982a 100644
--- a/migration/vmstate.c
+++ b/migration/vmstate.c
@@ -68,10 +68,10 @@ static void *vmstate_base_addr(void *opaque, VMStateField *field, bool alloc)
             }
         }
         if (size) {
-            *((void **)base_addr + field->start) = g_malloc(size);
+            *(void **)base_addr = g_malloc(size);
         }
     }
-    base_addr = *(void **)base_addr + field->start;
+    base_addr = *(void **)base_addr;
 
     return base_addr;
 }
@@ -935,6 +935,46 @@ const VMStateInfo vmstate_info_unused_buffer = {
     .put = put_unused_buffer,
 };
 
+/* vmstate_info_tmp: see VMSTATE_WITH_TMP.  The idea is that we allocate
+ * a temporary buffer, and the pre_load/pre_save methods in the child vmsd
+ * copy stuff from the parent into the child and do calculations to fill
+ * in fields that don't really exist in the parent but need to be in the
+ * stream.
+ */
+static int get_tmp(QEMUFile *f, void *pv, size_t size, VMStateField *field)
+{
+    int ret;
+    const VMStateDescription *vmsd = field->vmsd;
+    int version_id = field->version_id;
+    void *tmp = g_malloc(size);
+
+    /* Write the parent pointer, which sits at the start of tmp */
+    *(void **)tmp = pv;
+    ret = vmstate_load_state(f, vmsd, tmp, version_id);
+    g_free(tmp);
+    return ret;
+}
+
+static int put_tmp(QEMUFile *f, void *pv, size_t size, VMStateField *field,
+                   QJSON *vmdesc)
+{
+    const VMStateDescription *vmsd = field->vmsd;
+    void *tmp = g_malloc(size);
+
+    /* Write the parent pointer, which sits at the start of tmp */
+    *(void **)tmp = pv;
+    vmstate_save_state(f, vmsd, tmp, vmdesc);
+    g_free(tmp);
+
+    return 0;
+}
+
+const VMStateInfo vmstate_info_tmp = {
+    .name = "tmp",
+    .get = get_tmp,
+    .put = put_tmp,
+};
+
 /* bitmaps (as defined by bitmap.h). Note that size here is the size
  * of the bitmap in bits.  The on-the-wire format of a bitmap is 64
  * bit words with the bits in big endian order.  The in-memory format
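
Editor's note: the colo.c hunks above replace a g_usleep() polling loop with a semaphore that a QEMU_CLOCK_HOST timer posts, and colo_checkpoint_notify() lets qmp_migrate_set_parameters() wake the checkpoint loop immediately when x-checkpoint-delay changes. Below is a minimal standalone sketch of the same rendezvous pattern, using POSIX semaphores and a plain timer thread instead of QEMU's qemu_sem_*/timer_* APIs; every name in it is illustrative and none of it comes from the patch.

/* Build with: cc -pthread sketch.c */
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static sem_t checkpoint_sem;
static volatile bool running = true;   /* simplified shutdown flag */

/* Stand-in for colo_checkpoint_notify(): wake the checkpoint loop now. */
static void checkpoint_notify(void)
{
    sem_post(&checkpoint_sem);
}

/* Stand-in for colo_delay_timer: post the semaphore periodically. */
static void *delay_timer(void *arg)
{
    unsigned delay_ms = *(unsigned *)arg;

    while (running) {
        usleep(delay_ms * 1000);
        checkpoint_notify();
    }
    return NULL;
}

int main(void)
{
    unsigned delay_ms = 200;
    pthread_t tid;
    int i;

    sem_init(&checkpoint_sem, 0, 0);
    pthread_create(&tid, NULL, delay_timer, &delay_ms);

    /* The checkpoint loop blocks until it is woken, instead of polling
     * the clock and sleeping for the remainder as the old code did. */
    for (i = 0; i < 5; i++) {
        sem_wait(&checkpoint_sem);
        printf("checkpoint %d\n", i);  /* colo_do_checkpoint_transaction() */
    }

    running = false;
    pthread_join(tid, NULL);
    sem_destroy(&checkpoint_sem);
    return 0;
}

The advantage over the old polling code is that a parameter change can trigger a checkpoint immediately, rather than only after the current sleep expires.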
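Editor's note: the qemu-file.c hunks tag each iovec element with a may_free bit and, once the buffers have been flushed, qemu_iovec_release_ram() coalesces adjacent may_free buffers and returns the pages to the kernel with madvise(DONTNEED). A self-contained sketch of that coalescing walk follows; it uses a plain bool array instead of QEMU's bitmap helpers, and the function names are invented for illustration.

#define _DEFAULT_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <unistd.h>

/* Return one coalesced, page-aligned range to the kernel. */
static void release_range(void *base, size_t len)
{
    if (madvise(base, len, MADV_DONTNEED) < 0) {
        perror("madvise");
    }
}

/* Merge adjacent may_free buffers into one range, as
 * qemu_iovec_release_ram() does with its bitmap, then madvise each
 * completed range. */
static void iovec_release_ram(struct iovec *iov, bool *may_free, int cnt)
{
    struct iovec cur = { NULL, 0 };
    int i;

    for (i = 0; i < cnt; i++) {
        if (!may_free[i]) {
            continue;
        }
        if (cur.iov_base &&
            (char *)cur.iov_base + cur.iov_len == iov[i].iov_base) {
            cur.iov_len += iov[i].iov_len;  /* adjacent: coalesce */
            continue;
        }
        if (cur.iov_base) {
            release_range(cur.iov_base, cur.iov_len);
        }
        cur = iov[i];
    }
    if (cur.iov_base) {
        release_range(cur.iov_base, cur.iov_len);
    }
    memset(may_free, 0, cnt * sizeof(bool));
}

int main(void)
{
    size_t pg = (size_t)sysconf(_SC_PAGESIZE);
    char *buf = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (buf == MAP_FAILED) {
        return 1;
    }

    struct iovec iov[3] = {
        { buf,          pg },  /* pages 0 and 1 are adjacent, so they  */
        { buf + pg,     pg },  /* are coalesced into a single madvise  */
        { buf + 3 * pg, pg },  /* page 3 is released as its own range  */
    };
    bool may_free[3] = { true, true, true };

    iovec_release_ram(iov, may_free, 3);
    munmap(buf, 4 * pg);
    return 0;
}

In the patch itself this release only happens when the new release-ram capability is enabled and the migration is in the postcopy phase, so the source can shed guest RAM it has already transferred.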
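Editor's note: vmstate_info_tmp stores a pointer to the parent state at offset 0 of a freshly allocated scratch buffer and then runs a child vmsd against that buffer, so the child's pre_save/post_load hooks can derive stream-only fields from the parent. A hedged sketch of a consumer follows. It assumes the VMSTATE_WITH_TMP(_state, _tmp_type, _vmsd) helper macro that accompanies this VMStateInfo in the headers; DemoDevice and its fields are invented, and the scratch struct's parent pointer must be its first member because get_tmp()/put_tmp() write pv to the start of the buffer.

/* Sketch only; meant to be compiled inside the QEMU tree, not standalone. */
#include "qemu/osdep.h"
#include "migration/vmstate.h"

typedef struct DemoDevice {
    uint32_t lo;
    uint32_t hi;
} DemoDevice;

/* Scratch state that exists only while (de)serializing. */
typedef struct DemoTmp {
    DemoDevice *parent;  /* must be first: get_tmp()/put_tmp() fill it in */
    uint64_t combined;   /* present in the stream, not in DemoDevice */
} DemoTmp;

static void demo_tmp_pre_save(void *opaque)
{
    DemoTmp *tmp = opaque;

    /* Compute the stream-only field from the parent. */
    tmp->combined = ((uint64_t)tmp->parent->hi << 32) | tmp->parent->lo;
}

static int demo_tmp_post_load(void *opaque, int version_id)
{
    DemoTmp *tmp = opaque;

    /* Scatter the loaded value back into the parent. */
    tmp->parent->lo = tmp->combined & 0xffffffffu;
    tmp->parent->hi = tmp->combined >> 32;
    return 0;
}

static const VMStateDescription vmstate_demo_tmp = {
    .name = "demo/tmp",
    .version_id = 1,
    .pre_save = demo_tmp_pre_save,
    .post_load = demo_tmp_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT64(combined, DemoTmp),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_demo = {
    .name = "demo",
    .version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_WITH_TMP(DemoDevice, DemoTmp, vmstate_demo_tmp),
        VMSTATE_END_OF_LIST()
    }
};

If VMSTATE_WITH_TMP is not available in a given tree, the same field can be spelled by hand with .info = &vmstate_info_tmp, .size = sizeof(DemoTmp) and .vmsd = &vmstate_demo_tmp.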