Diffstat (limited to 'migration/ram.c')
| -rw-r--r-- | migration/ram.c | 729 |
1 file changed, 340 insertions(+), 389 deletions(-)
diff --git a/migration/ram.c b/migration/ram.c index 1338e47665..334309f1c6 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -85,6 +85,26 @@ XBZRLECacheStats xbzrle_counters; +/* used by the search for pages to send */ +struct PageSearchStatus { + /* The migration channel used for a specific host page */ + QEMUFile *pss_channel; + /* Last block from where we have sent data */ + RAMBlock *last_sent_block; + /* Current block being searched */ + RAMBlock *block; + /* Current page to search from */ + unsigned long page; + /* Set once we wrap around */ + bool complete_round; + /* Whether we're sending a host page */ + bool host_page_sending; + /* The start/end of current host page. Invalid if host_page_sending==false */ + unsigned long host_page_start; + unsigned long host_page_end; +}; +typedef struct PageSearchStatus PageSearchStatus; + /* struct contains XBZRLE cache and a static page used by the compression */ static struct { @@ -162,6 +182,11 @@ out: return ret; } +static bool postcopy_preempt_active(void) +{ + return migrate_postcopy_preempt() && migration_in_postcopy(); +} + bool ramblock_is_ignored(RAMBlock *block) { return !qemu_ram_is_migratable(block) || @@ -296,30 +321,17 @@ struct RAMSrcPageRequest { QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; }; -typedef struct { - /* - * Cached ramblock/offset values if preempted. They're only meaningful if - * preempted==true below. - */ - RAMBlock *ram_block; - unsigned long ram_page; - /* - * Whether a postcopy preemption just happened. Will be reset after - * precopy recovered to background migration. - */ - bool preempted; -} PostcopyPreemptState; - /* State of RAM for migration */ struct RAMState { - /* QEMUFile used for this migration */ - QEMUFile *f; + /* + * PageSearchStatus structures for the channels when send pages. + * Protected by the bitmap_mutex. + */ + PageSearchStatus pss[RAM_CHANNEL_MAX]; /* UFFD file descriptor, used in 'write-tracking' migration */ int uffdio_fd; /* Last block that we have visited searching for dirty pages */ RAMBlock *last_seen_block; - /* Last block from where we have sent data */ - RAMBlock *last_sent_block; /* Last dirty target page we have sent */ ram_addr_t last_page; /* last ram version we have seen */ @@ -357,21 +369,18 @@ struct RAMState { uint64_t target_page_count; /* number of dirty bits in the bitmap */ uint64_t migration_dirty_pages; - /* Protects modification of the bitmap and migration dirty pages */ + /* + * Protects: + * - dirty/clear bitmap + * - migration_dirty_pages + * - pss structures + */ QemuMutex bitmap_mutex; /* The RAMBlock used in the last src_page_requests */ RAMBlock *last_req_rb; /* Queue of outstanding page requests from the destination */ QemuMutex src_page_req_mutex; QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; - - /* Postcopy preemption informations */ - PostcopyPreemptState postcopy_preempt_state; - /* - * Current channel we're using on src VM. Only valid if postcopy-preempt - * is enabled. - */ - unsigned int postcopy_channel; }; typedef struct RAMState RAMState; @@ -379,11 +388,6 @@ static RAMState *ram_state; static NotifierWithReturnList precopy_notifier_list; -static void postcopy_preempt_reset(RAMState *rs) -{ - memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState)); -} - /* Whether postcopy has queued requests? */ static bool postcopy_has_request(RAMState *rs) { @@ -420,18 +424,25 @@ uint64_t ram_bytes_remaining(void) 0; } +/* + * NOTE: not all stats in ram_counters are used in reality. See comments + * for struct MigrationAtomicStats. 
The ultimate result of ram migration + * counters will be a merged version with both ram_counters and the atomic + * fields in ram_atomic_counters. + */ MigrationStats ram_counters; +MigrationAtomicStats ram_atomic_counters; -static void ram_transferred_add(uint64_t bytes) +void ram_transferred_add(uint64_t bytes) { if (runstate_is_running()) { ram_counters.precopy_bytes += bytes; } else if (migration_in_postcopy()) { - ram_counters.postcopy_bytes += bytes; + stat64_add(&ram_atomic_counters.postcopy_bytes, bytes); } else { ram_counters.downtime_bytes += bytes; } - ram_counters.transferred += bytes; + stat64_add(&ram_atomic_counters.transferred, bytes); } void dirty_sync_missed_zero_copy(void) @@ -439,39 +450,6 @@ void dirty_sync_missed_zero_copy(void) ram_counters.dirty_sync_missed_zero_copy++; } -/* used by the search for pages to send */ -struct PageSearchStatus { - /* Current block being searched */ - RAMBlock *block; - /* Current page to search from */ - unsigned long page; - /* Set once we wrap around */ - bool complete_round; - /* - * [POSTCOPY-ONLY] Whether current page is explicitly requested by - * postcopy. When set, the request is "urgent" because the dest QEMU - * threads are waiting for us. - */ - bool postcopy_requested; - /* - * [POSTCOPY-ONLY] The target channel to use to send current page. - * - * Note: This may _not_ match with the value in postcopy_requested - * above. Let's imagine the case where the postcopy request is exactly - * the page that we're sending in progress during precopy. In this case - * we'll have postcopy_requested set to true but the target channel - * will be the precopy channel (so that we don't split brain on that - * specific page since the precopy channel already contains partial of - * that page data). - * - * Besides that specific use case, postcopy_target_channel should - * always be equal to postcopy_requested, because by default we send - * postcopy pages via postcopy preempt channel. - */ - bool postcopy_target_channel; -}; -typedef struct PageSearchStatus PageSearchStatus; - CompressionStats compression_counters; struct CompressParam { @@ -517,11 +495,28 @@ static QemuThread *decompress_threads; static QemuMutex decomp_done_lock; static QemuCond decomp_done_cond; +static int ram_save_host_page_urgent(PageSearchStatus *pss); + static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, ram_addr_t offset, uint8_t *source_buf); -static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss, - bool postcopy_requested); +/* NOTE: page is the PFN not real ram_addr_t. */ +static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page) +{ + pss->block = rb; + pss->page = page; + pss->complete_round = false; +} + +/* + * Check whether two PSSs are actively sending the same page. Return true + * if it is, false otherwise. 
+ */ +static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2) +{ + return pss1->host_page_sending && pss2->host_page_sending && + (pss1->host_page_start == pss2->host_page_start); +} static void *do_data_compress(void *opaque) { @@ -647,28 +642,30 @@ exit: * * Returns the number of bytes written * - * @f: QEMUFile where to send the data + * @pss: current PSS channel status * @block: block that contains the page we want to send * @offset: offset inside the block for the page * in the lower bits, it contains flags */ -static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, +static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block, ram_addr_t offset) { size_t size, len; + bool same_block = (block == pss->last_sent_block); + QEMUFile *f = pss->pss_channel; - if (block == rs->last_sent_block) { + if (same_block) { offset |= RAM_SAVE_FLAG_CONTINUE; } qemu_put_be64(f, offset); size = 8; - if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { + if (!same_block) { len = strlen(block->idstr); qemu_put_byte(f, len); qemu_put_buffer(f, (uint8_t *)block->idstr, len); size += 1 + len; - rs->last_sent_block = block; + pss->last_sent_block = block; } return size; } @@ -719,7 +716,7 @@ void mig_throttle_counter_reset(void) rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); rs->num_dirty_pages_period = 0; - rs->bytes_xfer_prev = ram_counters.transferred; + rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred); } /** @@ -736,10 +733,6 @@ void mig_throttle_counter_reset(void) */ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) { - if (!rs->xbzrle_enabled) { - return; - } - /* We don't care if this fails to allocate a new cache page * as long as it updated an old one */ cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, @@ -756,17 +749,19 @@ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) * -1 means that xbzrle would be longer than normal * * @rs: current RAM state + * @pss: current PSS channel * @current_data: pointer to the address of the page contents * @current_addr: addr of the page * @block: block that contains the page we want to send * @offset: offset inside the block for the page */ -static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, - ram_addr_t current_addr, RAMBlock *block, - ram_addr_t offset) +static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, + uint8_t **current_data, ram_addr_t current_addr, + RAMBlock *block, ram_addr_t offset) { int encoded_len = 0, bytes_xbzrle; uint8_t *prev_cached_page; + QEMUFile *file = pss->pss_channel; if (!cache_is_cached(XBZRLE.cache, current_addr, ram_counters.dirty_sync_count)) { @@ -831,11 +826,11 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, } /* Send XBZRLE based compressed page */ - bytes_xbzrle = save_page_header(rs, rs->f, block, + bytes_xbzrle = save_page_header(pss, block, offset | RAM_SAVE_FLAG_XBZRLE); - qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); - qemu_put_be16(rs->f, encoded_len); - qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); + qemu_put_byte(file, ENCODING_FLAG_XBZRLE); + qemu_put_be16(file, encoded_len); + qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len); bytes_xbzrle += encoded_len + 1 + 2; /* * Like compressed_size (please see update_compress_thread_counts), @@ -849,26 +844,38 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, } /** - * migration_bitmap_find_dirty: find the next dirty page from start + * pss_find_next_dirty: find 
the next dirty page of current ramblock * - * Returns the page offset within memory region of the start of a dirty page + * This function updates pss->page to point to the next dirty page index + * within the ramblock to migrate, or the end of ramblock when nothing + * found. Note that when pss->host_page_sending==true it means we're + * during sending a host page, so we won't look for dirty page that is + * outside the host page boundary. * - * @rs: current RAM state - * @rb: RAMBlock where to search for dirty pages - * @start: page where we start the search + * @pss: the current page search status */ -static inline -unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, - unsigned long start) +static void pss_find_next_dirty(PageSearchStatus *pss) { + RAMBlock *rb = pss->block; unsigned long size = rb->used_length >> TARGET_PAGE_BITS; unsigned long *bitmap = rb->bmap; if (ramblock_is_ignored(rb)) { - return size; + /* Points directly to the end, so we know no dirty page */ + pss->page = size; + return; + } + + /* + * If during sending a host page, only look for dirty pages within the + * current host page being send. + */ + if (pss->host_page_sending) { + assert(pss->host_page_end); + size = MIN(size, pss->host_page_end); } - return find_next_bit(bitmap, size, start); + pss->page = find_next_bit(bitmap, size, pss->page); } static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, @@ -1083,8 +1090,9 @@ uint64_t ram_pagesize_summary(void) uint64_t ram_get_total_transferred_pages(void) { - return ram_counters.normal + ram_counters.duplicate + - compression_counters.pages + xbzrle_counters.pages; + return stat64_get(&ram_atomic_counters.normal) + + stat64_get(&ram_atomic_counters.duplicate) + + compression_counters.pages + xbzrle_counters.pages; } static void migration_update_rates(RAMState *rs, int64_t end_time) @@ -1143,8 +1151,8 @@ static void migration_trigger_throttle(RAMState *rs) { MigrationState *s = migrate_get_current(); uint64_t threshold = s->parameters.throttle_trigger_threshold; - - uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev; + uint64_t bytes_xfer_period = + stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev; uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; @@ -1207,7 +1215,7 @@ static void migration_bitmap_sync(RAMState *rs) /* reset period counters */ rs->time_last_bitmap_sync = end_time; rs->num_dirty_pages_period = 0; - rs->bytes_xfer_prev = ram_counters.transferred; + rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred); } if (migrate_use_events()) { qapi_event_send_migration_pass(ram_counters.dirty_sync_count); @@ -1234,7 +1242,7 @@ static void migration_bitmap_sync_precopy(RAMState *rs) } } -static void ram_release_page(const char *rbname, uint64_t offset) +void ram_release_page(const char *rbname, uint64_t offset) { if (!migrate_release_ram() || !migration_in_postcopy()) { return; @@ -1249,19 +1257,19 @@ static void ram_release_page(const char *rbname, uint64_t offset) * Returns the size of data written to the file, 0 means the page is not * a zero page * - * @rs: current RAM state - * @file: the file where the data is saved + * @pss: current PSS channel * @block: block that contains the page we want to send * @offset: offset inside the block for the page */ -static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, +static int save_zero_page_to_file(PageSearchStatus *pss, 
RAMBlock *block, ram_addr_t offset) { uint8_t *p = block->host + offset; + QEMUFile *file = pss->pss_channel; int len = 0; if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { - len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); + len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO); qemu_put_byte(file, 0); len += 1; ram_release_page(block->idstr, offset); @@ -1274,16 +1282,17 @@ static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, * * Returns the number of pages written. * - * @rs: current RAM state + * @pss: current PSS channel * @block: block that contains the page we want to send * @offset: offset inside the block for the page */ -static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) +static int save_zero_page(PageSearchStatus *pss, RAMBlock *block, + ram_addr_t offset) { - int len = save_zero_page_to_file(rs, rs->f, block, offset); + int len = save_zero_page_to_file(pss, block, offset); if (len) { - ram_counters.duplicate++; + stat64_add(&ram_atomic_counters.duplicate, 1); ram_transferred_add(len); return 1; } @@ -1297,15 +1306,15 @@ static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) * * Return true if the pages has been saved, otherwise false is returned. */ -static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, - int *pages) +static bool control_save_page(PageSearchStatus *pss, RAMBlock *block, + ram_addr_t offset, int *pages) { uint64_t bytes_xmit = 0; int ret; *pages = -1; - ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, - &bytes_xmit); + ret = ram_control_save_page(pss->pss_channel, block->offset, offset, + TARGET_PAGE_SIZE, &bytes_xmit); if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { return false; } @@ -1320,9 +1329,9 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, } if (bytes_xmit > 0) { - ram_counters.normal++; + stat64_add(&ram_atomic_counters.normal, 1); } else if (bytes_xmit == 0) { - ram_counters.duplicate++; + stat64_add(&ram_atomic_counters.duplicate, 1); } return true; @@ -1333,26 +1342,28 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, * * Returns the number of pages written. 
* - * @rs: current RAM state + * @pss: current PSS channel * @block: block that contains the page we want to send * @offset: offset inside the block for the page * @buf: the page to be sent * @async: send to page asyncly */ -static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, - uint8_t *buf, bool async) +static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, + ram_addr_t offset, uint8_t *buf, bool async) { - ram_transferred_add(save_page_header(rs, rs->f, block, + QEMUFile *file = pss->pss_channel; + + ram_transferred_add(save_page_header(pss, block, offset | RAM_SAVE_FLAG_PAGE)); if (async) { - qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, + qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, migrate_release_ram() && migration_in_postcopy()); } else { - qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); + qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); } ram_transferred_add(TARGET_PAGE_SIZE); - ram_counters.normal++; + stat64_add(&ram_atomic_counters.normal, 1); return 1; } @@ -1382,8 +1393,8 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) XBZRLE_cache_lock(); if (rs->xbzrle_enabled && !migration_in_postcopy()) { - pages = save_xbzrle_page(rs, &p, current_addr, block, - offset); + pages = save_xbzrle_page(rs, pss, &p, current_addr, + block, offset); if (!rs->last_stage) { /* Can't send this cached data async, since the cache page * might get updated before it gets to the wire @@ -1394,7 +1405,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) /* XBZRLE overflow or normal page */ if (pages == -1) { - pages = save_normal_page(rs, block, offset, p, send_async); + pages = save_normal_page(pss, block, offset, p, send_async); } XBZRLE_cache_unlock(); @@ -1402,13 +1413,13 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) return pages; } -static int ram_save_multifd_page(RAMState *rs, RAMBlock *block, +static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, ram_addr_t offset) { - if (multifd_queue_page(rs->f, block, offset) < 0) { + if (multifd_queue_page(file, block, offset) < 0) { return -1; } - ram_counters.normal++; + stat64_add(&ram_atomic_counters.normal, 1); return 1; } @@ -1417,14 +1428,15 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, ram_addr_t offset, uint8_t *source_buf) { RAMState *rs = ram_state; + PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; uint8_t *p = block->host + offset; int ret; - if (save_zero_page_to_file(rs, f, block, offset)) { + if (save_zero_page_to_file(pss, block, offset)) { return true; } - save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); + save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); /* * copy it to a internal buffer to avoid it being modified by VM @@ -1446,7 +1458,7 @@ update_compress_thread_counts(const CompressParam *param, int bytes_xmit) ram_transferred_add(bytes_xmit); if (param->zero_page) { - ram_counters.duplicate++; + stat64_add(&ram_atomic_counters.duplicate, 1); return; } @@ -1459,6 +1471,7 @@ static bool save_page_use_compression(RAMState *rs); static void flush_compressed_data(RAMState *rs) { + MigrationState *ms = migrate_get_current(); int idx, len, thread_count; if (!save_page_use_compression(rs)) { @@ -1477,7 +1490,7 @@ static void flush_compressed_data(RAMState *rs) for (idx = 0; idx < thread_count; idx++) { qemu_mutex_lock(&comp_param[idx].mutex); if (!comp_param[idx].quit) { - len = qemu_put_qemu_file(rs->f, comp_param[idx].file); + len = 
qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file); /* * it's safe to fetch zero_page without holding comp_done_lock * as there is no further request submitted to the thread, @@ -1496,11 +1509,11 @@ static inline void set_compress_params(CompressParam *param, RAMBlock *block, param->offset = offset; } -static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, - ram_addr_t offset) +static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset) { int idx, thread_count, bytes_xmit = -1, pages = -1; bool wait = migrate_compress_wait_thread(); + MigrationState *ms = migrate_get_current(); thread_count = migrate_compress_threads(); qemu_mutex_lock(&comp_done_lock); @@ -1508,7 +1521,8 @@ retry: for (idx = 0; idx < thread_count; idx++) { if (comp_param[idx].done) { comp_param[idx].done = false; - bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); + bytes_xmit = qemu_put_qemu_file(ms->to_dst_file, + comp_param[idx].file); qemu_mutex_lock(&comp_param[idx].mutex); set_compress_params(&comp_param[idx], block, offset); qemu_cond_signal(&comp_param[idx].cond); @@ -1544,14 +1558,9 @@ retry: */ static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) { - /* - * This is not a postcopy requested page, mark it "not urgent", and use - * precopy channel to send it. - */ - pss->postcopy_requested = false; - pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY; + /* Update pss->page for the next dirty bit in ramblock */ + pss_find_next_dirty(pss); - pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); if (pss->complete_round && pss->block == rs->last_seen_block && pss->page >= rs->last_page) { /* @@ -1696,7 +1705,7 @@ static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; /* Flush async buffers before un-protect. */ - qemu_fflush(rs->f); + qemu_fflush(pss->pss_channel); /* Un-protect memory range. */ res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, false, false); @@ -1999,55 +2008,6 @@ void ram_write_tracking_stop(void) } #endif /* defined(__linux__) */ -/* - * Check whether two addr/offset of the ramblock falls onto the same host huge - * page. Returns true if so, false otherwise. - */ -static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1, - uint64_t addr2) -{ - size_t page_size = qemu_ram_pagesize(rb); - - addr1 = ROUND_DOWN(addr1, page_size); - addr2 = ROUND_DOWN(addr2, page_size); - - return addr1 == addr2; -} - -/* - * Whether a previous preempted precopy huge page contains current requested - * page? Returns true if so, false otherwise. - * - * This should really happen very rarely, because it means when we were sending - * during background migration for postcopy we're sending exactly the page that - * some vcpu got faulted on on dest node. When it happens, we probably don't - * need to do much but drop the request, because we know right after we restore - * the precopy stream it'll be serviced. It'll slightly affect the order of - * postcopy requests to be serviced (e.g. it'll be the same as we move current - * request to the end of the queue) but it shouldn't be a big deal. The most - * imporant thing is we can _never_ try to send a partial-sent huge page on the - * POSTCOPY channel again, otherwise that huge page will got "split brain" on - * two channels (PRECOPY, POSTCOPY). 
- */ -static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block, - ram_addr_t offset) -{ - PostcopyPreemptState *state = &rs->postcopy_preempt_state; - - /* No preemption at all? */ - if (!state->preempted) { - return false; - } - - /* Not even the same ramblock? */ - if (state->ram_block != block) { - return false; - } - - return offset_on_same_huge_page(block, offset, - state->ram_page << TARGET_PAGE_BITS); -} - /** * get_queued_page: unqueue a page from the postcopy requests * @@ -2087,20 +2047,7 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) } while (block && !dirty); - if (block) { - /* See comment above postcopy_preempted_contains() */ - if (postcopy_preempted_contains(rs, block, offset)) { - trace_postcopy_preempt_hit(block->idstr, offset); - /* - * If what we preempted previously was exactly what we're - * requesting right now, restore the preempted precopy - * immediately, boosting its priority as it's requested by - * postcopy. - */ - postcopy_preempt_restore(rs, pss, true); - return true; - } - } else { + if (!block) { /* * Poll write faults too if background snapshot is enabled; that's * when we have vcpus got blocked by the write protected pages. @@ -2122,9 +2069,6 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) * really rare. */ pss->complete_round = false; - /* Mark it an urgent request, meanwhile using POSTCOPY channel */ - pss->postcopy_requested = true; - pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY; } return !!block; @@ -2202,6 +2146,56 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) return -1; } + /* + * When with postcopy preempt, we send back the page directly in the + * rp-return thread. + */ + if (postcopy_preempt_active()) { + ram_addr_t page_start = start >> TARGET_PAGE_BITS; + size_t page_size = qemu_ram_pagesize(ramblock); + PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; + int ret = 0; + + qemu_mutex_lock(&rs->bitmap_mutex); + + pss_init(pss, ramblock, page_start); + /* + * Always use the preempt channel, and make sure it's there. It's + * safe to access without lock, because when rp-thread is running + * we should be the only one who operates on the qemufile + */ + pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; + assert(pss->pss_channel); + + /* + * It must be either one or multiple of host page size. Just + * assert; if something wrong we're mostly split brain anyway. + */ + assert(len % page_size == 0); + while (len) { + if (ram_save_host_page_urgent(pss)) { + error_report("%s: ram_save_host_page_urgent() failed: " + "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, + __func__, ramblock->idstr, start); + ret = -1; + break; + } + /* + * NOTE: after ram_save_host_page_urgent() succeeded, pss->page + * will automatically be moved and point to the next host page + * we're going to send, so no need to update here. + * + * Normally QEMU never sends >1 host page in requests, so + * logically we don't even need that as the loop should only + * run once, but just to be consistent. 
+ */ + len -= page_size; + }; + qemu_mutex_unlock(&rs->bitmap_mutex); + + return ret; + } + struct RAMSrcPageRequest *new_entry = g_new0(struct RAMSrcPageRequest, 1); new_entry->rb = ramblock; @@ -2240,7 +2234,8 @@ static bool save_page_use_compression(RAMState *rs) * has been properly handled by compression, otherwise needs other * paths to handle it */ -static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) +static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, + RAMBlock *block, ram_addr_t offset) { if (!save_page_use_compression(rs)) { return false; @@ -2256,12 +2251,12 @@ static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) * We post the fist page as normal page as compression will take * much CPU resource. */ - if (block != rs->last_sent_block) { + if (block != pss->last_sent_block) { flush_compressed_data(rs); return false; } - if (compress_page_with_multi_thread(rs, block, offset) > 0) { + if (compress_page_with_multi_thread(block, offset) > 0) { return true; } @@ -2283,20 +2278,20 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; int res; - if (control_save_page(rs, block, offset, &res)) { + if (control_save_page(pss, block, offset, &res)) { return res; } - if (save_compress_page(rs, block, offset)) { + if (save_compress_page(rs, pss, block, offset)) { return 1; } - res = save_zero_page(rs, block, offset); + res = save_zero_page(pss, block, offset); if (res > 0) { /* Must let xbzrle know, otherwise a previous (now 0'd) cached * page would be stale */ - if (!save_page_use_compression(rs)) { + if (rs->xbzrle_enabled) { XBZRLE_cache_lock(); xbzrle_cache_zero_page(rs, block->offset + offset); XBZRLE_cache_unlock(); @@ -2311,133 +2306,97 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) * still see partially copied pages which is data corruption. */ if (migrate_use_multifd() && !migration_in_postcopy()) { - return ram_save_multifd_page(rs, block, offset); + return ram_save_multifd_page(pss->pss_channel, block, offset); } return ram_save_page(rs, pss); } -static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss) +/* Should be called before sending a host page */ +static void pss_host_page_prepare(PageSearchStatus *pss) { - MigrationState *ms = migrate_get_current(); - - /* Not enabled eager preempt? Then never do that. */ - if (!migrate_postcopy_preempt()) { - return false; - } + /* How many guest pages are there in one host page? */ + size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; - /* If the user explicitly disabled breaking of huge page, skip */ - if (!ms->postcopy_preempt_break_huge) { - return false; - } + pss->host_page_sending = true; + pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); + pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); +} - /* If the ramblock we're sending is a small page? Never bother. */ - if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) { - return false; - } +/* + * Whether the page pointed by PSS is within the host page being sent. + * Must be called after a previous pss_host_page_prepare(). + */ +static bool pss_within_range(PageSearchStatus *pss) +{ + ram_addr_t ram_addr; - /* Not in postcopy at all? */ - if (!migration_in_postcopy()) { - return false; - } + assert(pss->host_page_sending); - /* - * If we're already handling a postcopy request, don't preempt as this page - * has got the same high priority. 
- */ - if (pss->postcopy_requested) { + /* Over host-page boundary? */ + if (pss->page >= pss->host_page_end) { return false; } - /* If there's postcopy requests, then check it up! */ - return postcopy_has_request(rs); -} - -/* Returns true if we preempted precopy, false otherwise */ -static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss) -{ - PostcopyPreemptState *p_state = &rs->postcopy_preempt_state; - - trace_postcopy_preempt_triggered(pss->block->idstr, pss->page); + ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; - /* - * Time to preempt precopy. Cache current PSS into preempt state, so that - * after handling the postcopy pages we can recover to it. We need to do - * so because the dest VM will have partial of the precopy huge page kept - * over in its tmp huge page caches; better move on with it when we can. - */ - p_state->ram_block = pss->block; - p_state->ram_page = pss->page; - p_state->preempted = true; + return offset_in_ramblock(pss->block, ram_addr); } -/* Whether we're preempted by a postcopy request during sending a huge page */ -static bool postcopy_preempt_triggered(RAMState *rs) +static void pss_host_page_finish(PageSearchStatus *pss) { - return rs->postcopy_preempt_state.preempted; + pss->host_page_sending = false; + /* This is not needed, but just to reset it */ + pss->host_page_start = pss->host_page_end = 0; } -static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss, - bool postcopy_requested) +/* + * Send an urgent host page specified by `pss'. Need to be called with + * bitmap_mutex held. + * + * Returns 0 if save host page succeeded, false otherwise. + */ +static int ram_save_host_page_urgent(PageSearchStatus *pss) { - PostcopyPreemptState *state = &rs->postcopy_preempt_state; - - assert(state->preempted); + bool page_dirty, sent = false; + RAMState *rs = ram_state; + int ret = 0; - pss->block = state->ram_block; - pss->page = state->ram_page; + trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); + pss_host_page_prepare(pss); - /* Whether this is a postcopy request? */ - pss->postcopy_requested = postcopy_requested; /* - * When restoring a preempted page, the old data resides in PRECOPY - * slow channel, even if postcopy_requested is set. So always use - * PRECOPY channel here. + * If precopy is sending the same page, let it be done in precopy, or + * we could send the same page in two channels and none of them will + * receive the whole page. */ - pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY; - - trace_postcopy_preempt_restored(pss->block->idstr, pss->page); - - /* Reset preempt state, most importantly, set preempted==false */ - postcopy_preempt_reset(rs); -} - -static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss) -{ - MigrationState *s = migrate_get_current(); - unsigned int channel = pss->postcopy_target_channel; - QEMUFile *next; - - if (channel != rs->postcopy_channel) { - if (channel == RAM_CHANNEL_PRECOPY) { - next = s->to_dst_file; - } else { - next = s->postcopy_qemufile_src; - } - /* Update and cache the current channel */ - rs->f = next; - rs->postcopy_channel = channel; - - /* - * If channel switched, reset last_sent_block since the old sent block - * may not be on the same channel. 
- */ - rs->last_sent_block = NULL; - - trace_postcopy_preempt_switch_channel(channel); + if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { + trace_postcopy_preempt_hit(pss->block->idstr, + pss->page << TARGET_PAGE_BITS); + return 0; } - trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); -} - -/* We need to make sure rs->f always points to the default channel elsewhere */ -static void postcopy_preempt_reset_channel(RAMState *rs) -{ - if (migrate_postcopy_preempt() && migration_in_postcopy()) { - rs->postcopy_channel = RAM_CHANNEL_PRECOPY; - rs->f = migrate_get_current()->to_dst_file; - trace_postcopy_preempt_reset_channel(); + do { + page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); + + if (page_dirty) { + /* Be strict to return code; it must be 1, or what else? */ + if (ram_save_target_page(rs, pss) != 1) { + error_report_once("%s: ram_save_target_page failed", __func__); + ret = -1; + goto out; + } + sent = true; + } + pss_find_next_dirty(pss); + } while (pss_within_range(pss)); +out: + pss_host_page_finish(pss); + /* For urgent requests, flush immediately if sent */ + if (sent) { + qemu_fflush(pss->pss_channel); } + return ret; } /** @@ -2448,9 +2407,14 @@ static void postcopy_preempt_reset_channel(RAMState *rs) * a host page in which case the remainder of the hostpage is sent. * Only dirty target pages are sent. Note that the host page size may * be a huge page for this block. + * * The saving stops at the boundary of the used_length of the block * if the RAMBlock isn't a multiple of the host page size. * + * The caller must be with ram_state.bitmap_mutex held to call this + * function. Note that this function can temporarily release the lock, but + * when the function is returned it'll make sure the lock is still held. + * * Returns the number of pages written or negative on error * * @rs: current RAM state @@ -2458,11 +2422,10 @@ static void postcopy_preempt_reset_channel(RAMState *rs) */ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) { + bool page_dirty, preempt_active = postcopy_preempt_active(); int tmppages, pages = 0; size_t pagesize_bits = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; - unsigned long hostpage_boundary = - QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); unsigned long start_page = pss->page; int res; @@ -2471,51 +2434,49 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) return 0; } - if (migrate_postcopy_preempt() && migration_in_postcopy()) { - postcopy_preempt_choose_channel(rs, pss); - } + /* Update host page boundary information */ + pss_host_page_prepare(pss); do { - if (postcopy_needs_preempt(rs, pss)) { - postcopy_do_preempt(rs, pss); - break; - } + page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); /* Check the pages is dirty and if it is send it */ - if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { - tmppages = ram_save_target_page(rs, pss); - if (tmppages < 0) { - return tmppages; - } - - pages += tmppages; + if (page_dirty) { /* - * Allow rate limiting to happen in the middle of huge pages if - * something is sent in the current iteration. + * Properly yield the lock only in postcopy preempt mode + * because both migration thread and rp-return thread can + * operate on the bitmaps. 
*/ - if (pagesize_bits > 1 && tmppages > 0) { - migration_rate_limit(); + if (preempt_active) { + qemu_mutex_unlock(&rs->bitmap_mutex); + } + tmppages = ram_save_target_page(rs, pss); + if (tmppages >= 0) { + pages += tmppages; + /* + * Allow rate limiting to happen in the middle of huge pages if + * something is sent in the current iteration. + */ + if (pagesize_bits > 1 && tmppages > 0) { + migration_rate_limit(); + } } + if (preempt_active) { + qemu_mutex_lock(&rs->bitmap_mutex); + } + } else { + tmppages = 0; } - pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); - } while ((pss->page < hostpage_boundary) && - offset_in_ramblock(pss->block, - ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); - /* The offset we leave with is the min boundary of host page and block */ - pss->page = MIN(pss->page, hostpage_boundary); - /* - * When with postcopy preempt mode, flush the data as soon as possible for - * postcopy requests, because we've already sent a whole huge page, so the - * dst node should already have enough resource to atomically filling in - * the current missing page. - * - * More importantly, when using separate postcopy channel, we must do - * explicit flush or it won't flush until the buffer is full. - */ - if (migrate_postcopy_preempt() && pss->postcopy_requested) { - qemu_fflush(rs->f); - } + if (tmppages < 0) { + pss_host_page_finish(pss); + return tmppages; + } + + pss_find_next_dirty(pss); + } while (pss_within_range(pss)); + + pss_host_page_finish(pss); res = ram_save_release_protection(rs, pss, start_page); return (res < 0 ? res : pages); @@ -2536,7 +2497,7 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) */ static int ram_find_and_save_block(RAMState *rs) { - PageSearchStatus pss; + PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; int pages = 0; bool again, found; @@ -2557,35 +2518,24 @@ static int ram_find_and_save_block(RAMState *rs) rs->last_page = 0; } - pss.block = rs->last_seen_block; - pss.page = rs->last_page; - pss.complete_round = false; + pss_init(pss, rs->last_seen_block, rs->last_page); do { again = true; - found = get_queued_page(rs, &pss); + found = get_queued_page(rs, pss); if (!found) { - /* - * Recover previous precopy ramblock/offset if postcopy has - * preempted precopy. Otherwise find the next dirty bit. 
- */ - if (postcopy_preempt_triggered(rs)) { - postcopy_preempt_restore(rs, &pss, false); - found = true; - } else { - /* priority queue empty, so just search for something dirty */ - found = find_dirty_block(rs, &pss, &again); - } + /* priority queue empty, so just search for something dirty */ + found = find_dirty_block(rs, pss, &again); } if (found) { - pages = ram_save_host_page(rs, &pss); + pages = ram_save_host_page(rs, pss); } } while (!pages && again); - rs->last_seen_block = pss.block; - rs->last_page = pss.page; + rs->last_seen_block = pss->block; + rs->last_page = pss->page; return pages; } @@ -2595,9 +2545,9 @@ void acct_update_position(QEMUFile *f, size_t size, bool zero) uint64_t pages = size / TARGET_PAGE_SIZE; if (zero) { - ram_counters.duplicate += pages; + stat64_add(&ram_atomic_counters.duplicate, pages); } else { - ram_counters.normal += pages; + stat64_add(&ram_atomic_counters.normal, pages); ram_transferred_add(size); qemu_file_credit_transfer(f, size); } @@ -2699,13 +2649,16 @@ static void ram_save_cleanup(void *opaque) static void ram_state_reset(RAMState *rs) { + int i; + + for (i = 0; i < RAM_CHANNEL_MAX; i++) { + rs->pss[i].last_sent_block = NULL; + } + rs->last_seen_block = NULL; - rs->last_sent_block = NULL; rs->last_page = 0; rs->last_version = ram_list.version; rs->xbzrle_enabled = false; - postcopy_preempt_reset(rs); - rs->postcopy_channel = RAM_CHANNEL_PRECOPY; } #define MAX_WAIT 50 /* ms, half buffered_file limit */ @@ -2894,8 +2847,8 @@ void ram_postcopy_send_discard_bitmap(MigrationState *ms) migration_bitmap_sync(rs); /* Easiest way to make sure we don't resume in the middle of a host-page */ + rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; rs->last_seen_block = NULL; - rs->last_sent_block = NULL; rs->last_page = 0; postcopy_each_ram_send_discard(ms); @@ -3132,7 +3085,7 @@ static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) ram_state_reset(rs); /* Update RAMState cache of output QEMUFile */ - rs->f = out; + rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; trace_ram_state_resume_prepare(pages); } @@ -3223,7 +3176,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) return -1; } } - (*rsp)->f = f; + (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; WITH_RCU_READ_LOCK_GUARD() { qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); @@ -3349,8 +3302,6 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) } qemu_mutex_unlock(&rs->bitmap_mutex); - postcopy_preempt_reset_channel(rs); - /* * Must occur before EOS (or any QEMUFile operation) * because of RDMA protocol. 
@@ -3360,7 +3311,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) out: if (ret >= 0 && migration_is_setup_or_active(migrate_get_current()->state)) { - ret = multifd_send_sync_main(rs->f); + ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); if (ret < 0) { return ret; } @@ -3406,6 +3357,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) /* try transferring iterative blocks of memory */ /* flush all remaining blocks regardless of rate limiting */ + qemu_mutex_lock(&rs->bitmap_mutex); while (true) { int pages; @@ -3419,6 +3371,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) break; } } + qemu_mutex_unlock(&rs->bitmap_mutex); flush_compressed_data(rs); ram_control_after_iterate(f, RAM_CONTROL_FINISH); @@ -3428,9 +3381,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) return ret; } - postcopy_preempt_reset_channel(rs); - - ret = multifd_send_sync_main(rs->f); + ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); if (ret < 0) { return ret; } |
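A few notes on the structures introduced above, with illustrative sketches. All of the sketches below use stand-in types and names, not the real QEMU definitions.

The central data-structure change in this diff is that each RAM channel (precopy and postcopy-preempt) now carries its own PageSearchStatus, including its own QEMUFile and last_sent_block, instead of sharing rs->f and rs->last_sent_block in RAMState. A minimal sketch of that layout, with QEMUFile and RAMBlock reduced to opaque pointers for illustration:

#include <stdbool.h>

/* Stand-ins for the real QEMU types, for illustration only. */
typedef struct QEMUFile QEMUFile;
typedef struct RAMBlock RAMBlock;

enum { CH_PRECOPY = 0, CH_POSTCOPY = 1, CH_MAX };

struct PageSearchStatusSketch {
    QEMUFile *pss_channel;      /* the stream this searcher writes to */
    RAMBlock *last_sent_block;  /* per channel, so RAM_SAVE_FLAG_CONTINUE stays correct */
    RAMBlock *block;            /* current block being searched */
    unsigned long page;         /* current page index (PFN within the block) */
    bool complete_round;
    bool host_page_sending;     /* true while a host page is in flight */
    unsigned long host_page_start, host_page_end;
};

struct RAMStateSketch {
    /* one searcher per channel; both protected by bitmap_mutex */
    struct PageSearchStatusSketch pss[CH_MAX];
};

static void pss_init_sketch(struct PageSearchStatusSketch *pss,
                            RAMBlock *rb, unsigned long page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}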
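The counters that both the migration thread and the postcopy return-path thread can now bump (transferred, duplicate, normal, postcopy_bytes) move into ram_atomic_counters and are updated with stat64_add()/stat64_get(). A rough equivalent of that pattern using C11 atomics, purely to show why a plain "ram_counters.transferred += bytes" is no longer safe once two senders run concurrently; QEMU itself uses its Stat64 helpers, not <stdatomic.h>:

#include <stdatomic.h>
#include <stdint.h>

struct RamAtomicCountersSketch {
    _Atomic uint64_t transferred;
    _Atomic uint64_t duplicate;
    _Atomic uint64_t normal;
    _Atomic uint64_t postcopy_bytes;
};

static void counter_add(_Atomic uint64_t *c, uint64_t v)
{
    /* relaxed is enough: these are statistics, not synchronization */
    atomic_fetch_add_explicit(c, v, memory_order_relaxed);
}

static uint64_t counter_get(_Atomic uint64_t *c)
{
    return atomic_load_explicit(c, memory_order_relaxed);
}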
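pss_host_page_prepare() and pss_find_next_dirty() cooperate so that, while a host page (possibly a huge page) is being sent, the dirty-bit scan never walks past that host page. The boundary arithmetic is just rounding the current target-page index to the host-page size expressed in target pages. A small worked sketch, assuming a hypothetical 2 MiB host page and 4 KiB target pages:

#include <stdio.h>

#define TARGET_PAGE_BITS 12                 /* 4 KiB target pages (assumption) */
#define ROUND_DOWN_(n, d) ((n) / (d) * (d))
#define ROUND_UP_(n, d)   (ROUND_DOWN_((n) + (d) - 1, (d)))

int main(void)
{
    unsigned long host_page_size = 2 * 1024 * 1024;                 /* hypothetical hugepage block */
    unsigned long guest_pfns = host_page_size >> TARGET_PAGE_BITS;  /* 512 target pages per host page */
    unsigned long page = 1234;                                      /* current dirty target page */

    unsigned long host_page_start = ROUND_DOWN_(page, guest_pfns);    /* 1024 */
    unsigned long host_page_end   = ROUND_UP_(page + 1, guest_pfns);  /* 1536 */

    /* While host_page_sending is set, the bitmap scan is clamped:
     * size = MIN(block_pages, host_page_end), so no dirty page outside
     * [host_page_start, host_page_end) is picked up mid-host-page. */
    printf("scan window: [%lu, %lu)\n", host_page_start, host_page_end);
    return 0;
}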
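With postcopy preempt active, ram_save_queue_pages() no longer queues the faulted page for the migration thread; the return-path thread sends it immediately over the preempt channel, under bitmap_mutex, and pss_overlap() makes it back off if the precopy channel is already in the middle of the same host page, since otherwise one host page could be split across two streams. A simplified mirror of that control flow, using hypothetical helpers and a pthread mutex in place of QemuMutex:

#include <stdbool.h>
#include <pthread.h>

struct pss_sketch {
    bool host_page_sending;
    unsigned long host_page_start;
};

static pthread_mutex_t bitmap_mutex_sketch = PTHREAD_MUTEX_INITIALIZER;
static struct pss_sketch precopy_pss, postcopy_pss;

static bool pss_overlap_sketch(struct pss_sketch *a, struct pss_sketch *b)
{
    return a->host_page_sending && b->host_page_sending &&
           a->host_page_start == b->host_page_start;
}

/* Send one urgently requested host page on the preempt channel. */
static int urgent_request_sketch(unsigned long host_page_start)
{
    int ret = 0;

    pthread_mutex_lock(&bitmap_mutex_sketch);
    postcopy_pss.host_page_sending = true;
    postcopy_pss.host_page_start = host_page_start;

    if (pss_overlap_sketch(&postcopy_pss, &precopy_pss)) {
        /* Precopy already owns this host page; let it finish there. */
        postcopy_pss.host_page_sending = false;
        pthread_mutex_unlock(&bitmap_mutex_sketch);
        return 0;
    }

    /* ... clear dirty bits and push the page on the preempt channel ... */

    postcopy_pss.host_page_sending = false;
    pthread_mutex_unlock(&bitmap_mutex_sketch);
    return ret;
}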
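ram_save_host_page() now runs with bitmap_mutex held (taken by its callers, e.g. around the flush loop added to ram_save_complete()), and drops it only around the actual page send, and only when preemption is active, so the return-path thread can grab the bitmap in between. A sketch of that "clear dirty under lock, optionally yield around the expensive send" pattern, again with hypothetical stand-ins and a pthread mutex for QemuMutex:

#include <stdbool.h>
#include <pthread.h>

static pthread_mutex_t bitmap_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical stand-ins for the real helpers. */
static bool clear_dirty_bit(unsigned long page) { (void)page; return true; }
static int send_target_page(unsigned long page) { (void)page; return 1; }

/* Caller holds bitmap_mutex; the lock is held again on return. */
static int send_one_page_sketch(unsigned long page, bool preempt_active)
{
    int ret;

    if (!clear_dirty_bit(page)) {
        return 0;                              /* page not dirty, nothing to send */
    }
    if (preempt_active) {
        pthread_mutex_unlock(&bitmap_mutex);   /* let the rp-thread in */
    }
    ret = send_target_page(page);              /* the expensive part */
    if (preempt_active) {
        pthread_mutex_lock(&bitmap_mutex);
    }
    return ret;
}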
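Finally, save_page_header() takes the PSS rather than RAMState because the RAM_SAVE_FLAG_CONTINUE optimisation (omit the block id string when the previous page went to the same block) must be tracked per stream: the precopy and preempt channels interleave different blocks, so a shared last_sent_block could emit CONTINUE headers the destination cannot resolve on that stream. The decision itself is small; this sketch is simplified in that it compares id strings where the real code compares RAMBlock pointers, and the flag value is illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define RAM_SAVE_FLAG_CONTINUE_SKETCH 0x20   /* illustrative stand-in value */

/* Sketch: size a page header, reusing the block id only on the same stream. */
static size_t header_size_sketch(const char *block_id, const char **last_block_id,
                                 uint64_t *offset_flags)
{
    size_t size = 8;                         /* the be64 offset+flags word */
    bool same_block = *last_block_id && strcmp(block_id, *last_block_id) == 0;

    if (same_block) {
        *offset_flags |= RAM_SAVE_FLAG_CONTINUE_SKETCH;
    } else {
        size += 1 + strlen(block_id);        /* length byte + id string */
        *last_block_id = block_id;           /* per-stream state, as in pss->last_sent_block */
    }
    return size;
}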