 migration/postcopy-ram.c | 259
 migration/trace-events   |   5
 2 files changed, 216 insertions, 48 deletions
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index fd6c0bdb1e..91c23b446e 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -112,14 +112,69 @@ void postcopy_thread_create(MigrationIncomingState *mis,
 
 /* All the time records are in unit of nanoseconds */
 typedef struct PostcopyBlocktimeContext {
-    /* time when page fault initiated per vCPU */
-    uint64_t *vcpu_blocktime_start;
     /* blocktime per vCPU */
     uint64_t *vcpu_blocktime_total;
     /* count of faults per vCPU */
     uint64_t *vcpu_faults_count;
-    /* page address per vCPU */
-    uintptr_t *vcpu_addr;
+    /*
+     * count of currently blocked faults per vCPU.
+     *
+     * NOTE: Normally there should only be one fault in-progress per vCPU
+     * thread, so logically it _seems_ vcpu_faults_count[] for any vCPU
+     * should be either zero or one.  However, there can be reasons we see
+     * >1 faults on the same vCPU thread.
+     *
+     * CASE (1): since the process to resolve faults (ioctl(UFFDIO_COPY),
+     * for example) is done before taking the mutex that protects the
+     * blocktime context, it can happen that we read more than one faulted
+     * address per vCPU.
+     *
+     * One example where we can see >1 faulted address for one vCPU:
+     *
+     *  vcpu1 thread          fault thread             resolve thread
+     *  ============          ============             ==============
+     *
+     *  faulted on addr1
+     *                        read uffd msg (addr1)
+     *                        MUTEX_LOCK
+     *                        add entry (cpu1, addr1)
+     *                        MUTEX_UNLOCK
+     *                        request remote fault (addr1)
+     *                                                 resolve fault (addr1)
+     *  addr1 resolved, continue..
+     *  faulted on addr2
+     *                        read uffd msg (addr2)
+     *                        MUTEX_LOCK
+     *                        add entry (cpu1, addr2) <--------------- [A]
+     *                        MUTEX_UNLOCK
+     *                                                 MUTEX_LOCK
+     *                                                 remove entry (cpu1, addr1)
+     *                                                 MUTEX_UNLOCK
+     *
+     * In the above case, we may see the (cpu1, addr1) and (cpu1, addr2)
+     * entries appear together at [A], when it gets the lock before the
+     * resolve thread.  Use this counter to track such cases; only when it
+     * reaches zero do we know the vCPU is not blocked anymore.
+     *
+     * CASE (2): theoretically (the author admits to not having verified
+     * this..), one vCPU thread can also generate more than one userfaultfd
+     * message on the same address.  It can happen e.g. for whatever reason
+     * the fault got retried before a resolution arrives.  In that extremely
+     * rare case, we could also see two (cpu1, addr1) entries.
+     *
+     * In all cases, be prepared for such re-entrancies with this array.
+     *
+     * Using uint8_t should be more than enough for now.  For example, when
+     * there is only one resolve thread (the postcopy ram listening thread),
+     * the max (concurrent fault entries) should be two.
+     */
+    uint8_t *vcpu_faults_current;
+    /*
+     * The hash that contains addr1->[(cpu1,ts1),(cpu2,ts2) ...] mappings.
+     * Each entry is a tuple of (CPU index, fault timestamp) showing that a
+     * fault was requested.
+     */
+    GHashTable *vcpu_addr_hash;
     /* total blocktime when all vCPUs are stopped */
     uint64_t total_blocktime;
     /* point in time when last page fault was initiated */
@@ -145,13 +200,38 @@ typedef struct PostcopyBlocktimeContext {
     Notifier exit_notifier;
 } PostcopyBlocktimeContext;
 
+typedef struct {
+    /* The time the fault was triggered */
+    uint64_t fault_time;
+    /* The vCPU index that was blocked */
+    int cpu;
+} BlocktimeVCPUEntry;
+
+/* Alloc an entry to record a vCPU fault */
+static BlocktimeVCPUEntry *
+blocktime_vcpu_entry_alloc(int cpu, uint64_t fault_time)
+{
+    BlocktimeVCPUEntry *entry = g_new(BlocktimeVCPUEntry, 1);
+
+    entry->fault_time = fault_time;
+    entry->cpu = cpu;
+
+    return entry;
+}
+
+/* Free a @GList of @BlocktimeVCPUEntry */
+static void blocktime_vcpu_list_free(gpointer data)
+{
+    g_list_free_full(data, g_free);
+}
+
 static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
 {
     g_hash_table_destroy(ctx->tid_to_vcpu_hash);
-    g_free(ctx->vcpu_blocktime_start);
+    g_hash_table_destroy(ctx->vcpu_addr_hash);
     g_free(ctx->vcpu_blocktime_total);
     g_free(ctx->vcpu_faults_count);
-    g_free(ctx->vcpu_addr);
+    g_free(ctx->vcpu_faults_current);
     g_free(ctx);
 }
 
@@ -198,12 +278,22 @@ static struct PostcopyBlocktimeContext *blocktime_context_new(void)
     unsigned int smp_cpus = ms->smp.cpus;
     PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
 
-    ctx->vcpu_blocktime_start = g_new0(uint64_t, smp_cpus);
     ctx->vcpu_blocktime_total = g_new0(uint64_t, smp_cpus);
     ctx->vcpu_faults_count = g_new0(uint64_t, smp_cpus);
-    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
+    ctx->vcpu_faults_current = g_new0(uint8_t, smp_cpus);
     ctx->tid_to_vcpu_hash = blocktime_init_tid_to_vcpu_hash();
 
+    /*
+     * The key (host virtual addresses) will always be gpointer-sized on
+     * either 32bit or 64bit systems, so it'll fit as a direct key.
+     *
+     * The value will be a list of BlocktimeVCPUEntry entries.
+     */
+    ctx->vcpu_addr_hash = g_hash_table_new_full(g_direct_hash,
+                                                g_direct_equal,
+                                                NULL,
+                                                blocktime_vcpu_list_free);
+
     ctx->exit_notifier.notify = migration_exit_cb;
     qemu_add_exit_notifier(&ctx->exit_notifier);
 
@@ -893,6 +983,39 @@ static uint64_t get_current_ns(void)
     return (uint64_t)qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 }
 
+/* Inject a (cpu, fault_time) entry into the database, using addr as key */
+static void blocktime_fault_inject(PostcopyBlocktimeContext *ctx,
+                                   uintptr_t addr, int cpu, uint64_t time)
+{
+    BlocktimeVCPUEntry *entry = blocktime_vcpu_entry_alloc(cpu, time);
+    GHashTable *table = ctx->vcpu_addr_hash;
+    gpointer key = (gpointer)addr;
+    GList *head, *list;
+    gboolean result;
+
+    head = g_hash_table_lookup(table, key);
+    if (head) {
+        /*
+         * If it existed, steal the @head for list operations rather than
+         * freeing it, making sure the steal succeeded.
+         */
+        result = g_hash_table_steal(table, key);
+        assert(result == TRUE);
+    }
+
+    /*
+     * Now the key is guaranteed to be absent.  Two cases:
+     *
+     * (1) There's no existing entry, list contains the only one. Insert.
+     * (2) There are existing entries; after stealing we own it, prepend the
+     *     result and re-insert.
+     */
+    list = g_list_prepend(head, entry);
+    g_hash_table_insert(table, key, list);
+
+    trace_postcopy_blocktime_begin(addr, time, cpu, !!head);
+}
+
 /*
  * This function is being called when pagefault occurs. It tracks down vCPU
  * blocking time. It's protected by @page_request_mutex.
@@ -912,30 +1035,74 @@ void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
     if (!dc || ptid == 0) {
         return;
     }
+
+    /*
+     * The caller should only inject a blocktime entry when the page is
+     * yet missing.
+     */
+    assert(!ramblock_recv_bitmap_test(rb, (void *)addr));
+
+    current = get_current_ns();
     cpu = blocktime_get_vcpu(dc, ptid);
-    if (cpu < 0) {
+
+    if (cpu >= 0) {
+        /* How many faults on this vCPU in total? */
+        dc->vcpu_faults_count[cpu]++;
+
+        /*
+         * Account how many concurrent faults on this vCPU we trapped.  See
+         * comments above vcpu_faults_current[] on why it can be more than one.
+         */
+        if (dc->vcpu_faults_current[cpu]++ == 0) {
+            dc->smp_cpus_down++;
+            /*
+             * We use last_begin to cover (1) the 1st fault on this specific
+             * vCPU, but meanwhile (2) the last vCPU that got blocked.  It's
+             * only used to calculate system-wide blocktime.
+             */
+            dc->last_begin = current;
+        }
+
+        /* Make sure it won't overflow - it really should never! */
+        assert(dc->vcpu_faults_current[cpu] <= 255);
+    } else {
+        /* We do not support non-vCPU thread tracking yet */
         dc->non_vcpu_faults++;
         return;
     }
 
-    current = get_current_ns();
-    if (dc->vcpu_addr[cpu] == 0) {
-        dc->smp_cpus_down++;
-    }
+    blocktime_fault_inject(dc, addr, cpu, current);
+}
 
-    dc->last_begin = current;
-    dc->vcpu_blocktime_start[cpu] = current;
-    dc->vcpu_addr[cpu] = addr;
-    dc->vcpu_faults_count[cpu]++;
+typedef struct {
+    PostcopyBlocktimeContext *ctx;
+    uint64_t current;
+    int affected_cpus;
+} BlockTimeVCPUIter;
+
+static void blocktime_cpu_list_iter_fn(gpointer data, gpointer user_data)
+{
+    BlockTimeVCPUIter *iter = user_data;
+    PostcopyBlocktimeContext *ctx = iter->ctx;
+    BlocktimeVCPUEntry *entry = data;
+    int cpu = entry->cpu;
 
     /*
-     * The caller should only inject a blocktime entry when the page is
-     * yet missing.
+     * Time should never go backwards, so when the fault is resolved it
+     * must be later than when it was faulted.
      */
-    assert(!ramblock_recv_bitmap_test(rb, (void *)addr));
+    assert(iter->current >= entry->fault_time);
+
+    /*
+     * If we resolved all pending faults on one vCPU due to this page
+     * resolution, take a note.
+     */
+    if (--ctx->vcpu_faults_current[cpu] == 0) {
+        ctx->vcpu_blocktime_total[cpu] += iter->current - entry->fault_time;
+        iter->affected_cpus += 1;
+    }
 
-    trace_mark_postcopy_blocktime_begin(addr, dc->vcpu_blocktime_start[cpu],
-                                        cpu);
+    trace_postcopy_blocktime_end_one(cpu, ctx->vcpu_faults_current[cpu]);
 }
 
 /*
@@ -971,43 +1138,43 @@ static void mark_postcopy_blocktime_end(uintptr_t addr)
     PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
     MachineState *ms = MACHINE(qdev_get_machine());
     unsigned int smp_cpus = ms->smp.cpus;
-    int i, affected_cpu = 0;
-    uint64_t read_vcpu_time, current;
+    BlockTimeVCPUIter iter = {
+        .current = get_current_ns(),
+        .affected_cpus = 0,
+        .ctx = dc,
+    };
+    gpointer key = (gpointer)addr;
+    GHashTable *table;
+    GList *list;
 
     if (!dc) {
         return;
    }
 
-    current = get_current_ns();
-
-    /* lookup cpu, to clear it,
-     * that algorithm looks straightforward, but it's not
-     * optimal, more optimal algorithm is keeping tree or hash
-     * where key is address value is a list of  */
-    for (i = 0; i < smp_cpus; i++) {
-        uint64_t vcpu_blocktime = 0;
-        read_vcpu_time = dc->vcpu_blocktime_start[i];
-        if (dc->vcpu_addr[i] != addr || read_vcpu_time == 0) {
-            continue;
-        }
-        dc->vcpu_addr[i] = 0;
-        vcpu_blocktime = current - read_vcpu_time;
-        affected_cpu += 1;
-        /* continue cycle, due to one page could affect several vCPUs */
-        dc->vcpu_blocktime_total[i] += vcpu_blocktime;
+    table = dc->vcpu_addr_hash;
+    /* the address wasn't tracked at all? */
+    list = g_hash_table_lookup(table, key);
+    if (!list) {
+        return;
     }
 
     /*
+     * Loop over the set of vCPUs that got blocked on this addr, do the
+     * blocktime accounting.  After that, remove the whole list.
+     */
+    g_list_foreach(list, blocktime_cpu_list_iter_fn, &iter);
+    g_hash_table_remove(table, key);
+
+    /*
      * If all vCPUs used to be down, and copying this page would free some
     * vCPUs, then the system-level blocktime ends here.
      */
-    if (dc->smp_cpus_down == smp_cpus && affected_cpu) {
-        dc->total_blocktime += current - dc->last_begin;
+    if (dc->smp_cpus_down == smp_cpus && iter.affected_cpus) {
+        dc->total_blocktime += iter.current - dc->last_begin;
     }
-    dc->smp_cpus_down -= affected_cpu;
+    dc->smp_cpus_down -= iter.affected_cpus;
 
-    trace_mark_postcopy_blocktime_end(addr, dc->total_blocktime,
-                                      affected_cpu);
+    trace_postcopy_blocktime_end(addr, iter.current, iter.affected_cpus);
 }
 
 static void postcopy_pause_fault_thread(MigrationIncomingState *mis)
diff --git a/migration/trace-events b/migration/trace-events
index 9c1f3b7044..a36a78f01a 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -285,8 +285,6 @@ postcopy_nhp_range(const char *ramblock, void *host_addr, size_t offset, size_t
 postcopy_place_page(void *host_addr) "host=%p"
 postcopy_place_page_zero(void *host_addr) "host=%p"
 postcopy_ram_enable_notify(void) ""
-mark_postcopy_blocktime_begin(uint64_t addr, uint64_t time, int cpu) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", cpu: %d"
-mark_postcopy_blocktime_end(uint64_t addr, uint64_t time, int affected_cpu) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", affected_cpus: %d"
 postcopy_pause_fault_thread(void) ""
 postcopy_pause_fault_thread_continued(void) ""
 postcopy_pause_fast_load(void) ""
@@ -311,6 +309,9 @@ postcopy_preempt_new_channel(void) ""
 postcopy_preempt_thread_entry(void) ""
 postcopy_preempt_thread_exit(void) ""
 postcopy_blocktime_tid_cpu_map(int cpu, uint32_t tid) "cpu: %d, tid: %u"
+postcopy_blocktime_begin(uint64_t addr, uint64_t time, int cpu, bool exists) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", cpu: %d, exist: %d"
+postcopy_blocktime_end(uint64_t addr, uint64_t time, int affected_cpu) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", affected_cpus: %d"
+postcopy_blocktime_end_one(int cpu, uint8_t left_faults) "cpu: %d, left_faults: %" PRIu8
 
 # exec.c
 migration_exec_outgoing(const char *cmd) "cmd=%s"
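
Not part of the patch: below is a minimal standalone sketch of the addr -> [(cpu, timestamp), ...] bookkeeping pattern that blocktime_fault_inject() and blocktime_vcpu_list_free() implement above, assuming GLib 2.x. The names Entry and demo_inject() are illustrative only; build with e.g. gcc demo.c $(pkg-config --cflags --libs glib-2.0).

/* Standalone sketch (illustrative only, not part of the patch). */
#include <glib.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint64_t fault_time;   /* when the vCPU got blocked */
    int cpu;               /* which vCPU */
} Entry;

/* Value destroy function: each hash value is a GList of Entry */
static void entry_list_free(gpointer data)
{
    g_list_free_full(data, g_free);
}

/* Same steal/prepend/re-insert pattern as blocktime_fault_inject() */
static void demo_inject(GHashTable *table, uintptr_t addr, int cpu,
                        uint64_t time)
{
    Entry *e = g_new(Entry, 1);
    gpointer key = (gpointer)addr;
    GList *head;

    e->cpu = cpu;
    e->fault_time = time;

    head = g_hash_table_lookup(table, key);
    if (head) {
        /* Steal so the value destroy function won't free the old list */
        gboolean ok = g_hash_table_steal(table, key);
        assert(ok);
    }
    g_hash_table_insert(table, key, g_list_prepend(head, e));
}

int main(void)
{
    GHashTable *t = g_hash_table_new_full(g_direct_hash, g_direct_equal,
                                          NULL, entry_list_free);

    /* Two vCPUs blocked on the same guest page */
    demo_inject(t, 0x1000, 1, 100);
    demo_inject(t, 0x1000, 2, 105);

    GList *list = g_hash_table_lookup(t, (gpointer)(uintptr_t)0x1000);
    printf("entries for 0x1000: %u\n", g_list_length(list));   /* prints 2 */

    /* Resolving the page drops the whole list, as the patch does */
    g_hash_table_remove(t, (gpointer)(uintptr_t)0x1000);
    g_hash_table_destroy(t);
    return 0;
}

The g_hash_table_steal() step matters because the table owns its value lists through entry_list_free(); stealing before the prepend avoids the old list being freed when the key is re-inserted.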