diff options
188 files changed, 3026 insertions, 924 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 692628cd78..5df6020ed5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -175,7 +175,7 @@ F: include/exec/helper-info.c.inc F: include/exec/page-protection.h F: include/system/cpus.h F: include/system/tcg.h -F: include/hw/core/tcg-cpu-ops.h +F: include/accel/tcg/cpu-ops.h F: host/include/*/host/cpuinfo.h F: util/cpuinfo-*.c F: include/tcg/ @@ -499,7 +499,7 @@ R: Paolo Bonzini <pbonzini@redhat.com> S: Maintained F: include/qemu/accel.h F: include/system/accel-*.h -F: include/hw/core/accel-cpu.h +F: include/accel/accel-cpu-target.h F: accel/accel-*.c F: accel/Makefile.objs F: accel/stubs/Makefile.objs @@ -2186,10 +2186,17 @@ M: Cédric Le Goater <clg@redhat.com> S: Supported F: hw/vfio/* F: include/hw/vfio/ -F: docs/igd-assign.txt F: docs/devel/migration/vfio.rst F: qapi/vfio.json +vfio-igd +M: Alex Williamson <alex.williamson@redhat.com> +M: Cédric Le Goater <clg@redhat.com> +M: Tomita Moeko <tomitamoeko@gmail.com> +S: Supported +F: hw/vfio/igd.c +F: docs/igd-assign.txt + vfio-ccw M: Eric Farman <farman@linux.ibm.com> M: Matthew Rosato <mjrosato@linux.ibm.com> @@ -3791,6 +3798,7 @@ Overall usermode emulation M: Riku Voipio <riku.voipio@iki.fi> S: Maintained F: accel/tcg/user-exec*.c +F: hw/core/cpu-user.c F: include/user/ F: common-user/ diff --git a/accel/accel-system.c b/accel/accel-system.c index a7596aef59..5df49fbe83 100644 --- a/accel/accel-system.c +++ b/accel/accel-system.c @@ -26,6 +26,7 @@ #include "qemu/osdep.h" #include "qemu/accel.h" #include "hw/boards.h" +#include "system/accel-ops.h" #include "system/cpus.h" #include "qemu/error-report.h" #include "accel-system.h" diff --git a/accel/accel-target.c b/accel/accel-target.c index 08626c00c2..8358727462 100644 --- a/accel/accel-target.c +++ b/accel/accel-target.c @@ -27,7 +27,7 @@ #include "qemu/accel.h" #include "cpu.h" -#include "hw/core/accel-cpu.h" +#include "accel/accel-cpu-target.h" #ifndef CONFIG_USER_ONLY #include "accel-system.h" @@ -38,6 +38,7 @@ static const TypeInfo accel_type = { .parent = TYPE_OBJECT, .class_size = sizeof(AccelClass), .instance_size = sizeof(AccelState), + .abstract = true, }; /* Lookup AccelClass from opt_name. Returns NULL if not found */ diff --git a/accel/hvf/hvf-accel-ops.c b/accel/hvf/hvf-accel-ops.c index 945ba72051..12fc30c276 100644 --- a/accel/hvf/hvf-accel-ops.c +++ b/accel/hvf/hvf-accel-ops.c @@ -54,6 +54,7 @@ #include "exec/exec-all.h" #include "gdbstub/enums.h" #include "hw/boards.h" +#include "system/accel-ops.h" #include "system/cpus.h" #include "system/hvf.h" #include "system/hvf_int.h" diff --git a/accel/kvm/kvm-accel-ops.c b/accel/kvm/kvm-accel-ops.c index a81e8f3b03..54ea60909e 100644 --- a/accel/kvm/kvm-accel-ops.c +++ b/accel/kvm/kvm-accel-ops.c @@ -16,6 +16,7 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" +#include "system/accel-ops.h" #include "system/kvm.h" #include "system/kvm_int.h" #include "system/runstate.h" diff --git a/accel/kvm/kvm-cpus.h b/accel/kvm/kvm-cpus.h index b5435286e4..688511151c 100644 --- a/accel/kvm/kvm-cpus.h +++ b/accel/kvm/kvm-cpus.h @@ -10,8 +10,6 @@ #ifndef KVM_CPUS_H #define KVM_CPUS_H -#include "system/cpus.h" - int kvm_init_vcpu(CPUState *cpu, Error **errp); int kvm_cpu_exec(CPUState *cpu); void kvm_destroy_vcpu(CPUState *cpu); diff --git a/accel/qtest/qtest.c b/accel/qtest/qtest.c index ad7e3441a5..7fae80f6a1 100644 --- a/accel/qtest/qtest.c +++ b/accel/qtest/qtest.c @@ -18,6 +18,7 @@ #include "qemu/option.h" #include "qemu/config-file.h" #include "qemu/accel.h" +#include "system/accel-ops.h" #include "system/qtest.h" #include "system/cpus.h" #include "qemu/guest-random.h" diff --git a/accel/stubs/tcg-stub.c b/accel/stubs/tcg-stub.c index 7f4208fddf..b2b9881bdf 100644 --- a/accel/stubs/tcg-stub.c +++ b/accel/stubs/tcg-stub.c @@ -14,10 +14,6 @@ #include "exec/tb-flush.h" #include "exec/exec-all.h" -void tb_flush(CPUState *cpu) -{ -} - G_NORETURN void cpu_loop_exit(CPUState *cpu) { g_assert_not_reached(); diff --git a/accel/tcg/cpu-exec-common.c b/accel/tcg/cpu-exec-common.c index 6ecfc4e7c2..c5c513f1e4 100644 --- a/accel/tcg/cpu-exec-common.c +++ b/accel/tcg/cpu-exec-common.c @@ -18,13 +18,45 @@ */ #include "qemu/osdep.h" -#include "system/cpus.h" +#include "exec/log.h" #include "system/tcg.h" #include "qemu/plugin.h" #include "internal-common.h" bool tcg_allowed; +bool tcg_cflags_has(CPUState *cpu, uint32_t flags) +{ + return cpu->tcg_cflags & flags; +} + +void tcg_cflags_set(CPUState *cpu, uint32_t flags) +{ + cpu->tcg_cflags |= flags; +} + +uint32_t curr_cflags(CPUState *cpu) +{ + uint32_t cflags = cpu->tcg_cflags; + + /* + * Record gdb single-step. We should be exiting the TB by raising + * EXCP_DEBUG, but to simplify other tests, disable chaining too. + * + * For singlestep and -d nochain, suppress goto_tb so that + * we can log -d cpu,exec after every TB. + */ + if (unlikely(cpu->singlestep_enabled)) { + cflags |= CF_NO_GOTO_TB | CF_NO_GOTO_PTR | CF_SINGLE_STEP | 1; + } else if (qatomic_read(&one_insn_per_tb)) { + cflags |= CF_NO_GOTO_TB | 1; + } else if (qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) { + cflags |= CF_NO_GOTO_TB; + } + + return cflags; +} + /* exit the current TB, but without causing any exception to be raised */ void cpu_loop_exit_noexc(CPUState *cpu) { diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index 8b773d8847..ef3d967e3a 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -22,7 +22,7 @@ #include "qapi/error.h" #include "qapi/type-helpers.h" #include "hw/core/cpu.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include "trace.h" #include "disas/disas.h" #include "exec/cpu-common.h" @@ -33,7 +33,6 @@ #include "qemu/rcu.h" #include "exec/log.h" #include "qemu/main-loop.h" -#include "system/cpus.h" #include "exec/cpu-all.h" #include "system/cpu-timers.h" #include "exec/replay-core.h" @@ -148,38 +147,6 @@ static void init_delay_params(SyncClocks *sc, const CPUState *cpu) } #endif /* CONFIG USER ONLY */ -bool tcg_cflags_has(CPUState *cpu, uint32_t flags) -{ - return cpu->tcg_cflags & flags; -} - -void tcg_cflags_set(CPUState *cpu, uint32_t flags) -{ - cpu->tcg_cflags |= flags; -} - -uint32_t curr_cflags(CPUState *cpu) -{ - uint32_t cflags = cpu->tcg_cflags; - - /* - * Record gdb single-step. We should be exiting the TB by raising - * EXCP_DEBUG, but to simplify other tests, disable chaining too. - * - * For singlestep and -d nochain, suppress goto_tb so that - * we can log -d cpu,exec after every TB. - */ - if (unlikely(cpu->singlestep_enabled)) { - cflags |= CF_NO_GOTO_TB | CF_NO_GOTO_PTR | CF_SINGLE_STEP | 1; - } else if (qatomic_read(&one_insn_per_tb)) { - cflags |= CF_NO_GOTO_TB | 1; - } else if (qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) { - cflags |= CF_NO_GOTO_TB; - } - - return cflags; -} - struct tb_desc { vaddr pc; uint64_t cs_base; diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index ad158050a1..c8761683a0 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -19,7 +19,7 @@ #include "qemu/osdep.h" #include "qemu/main-loop.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include "exec/exec-all.h" #include "exec/page-protection.h" #include "exec/memory.h" diff --git a/accel/tcg/icount-common.c b/accel/tcg/icount-common.c index b178dccec4..402d3e3f4e 100644 --- a/accel/tcg/icount-common.c +++ b/accel/tcg/icount-common.c @@ -48,6 +48,8 @@ static bool icount_sleep = true; /* Arbitrarily pick 1MIPS as the minimum allowable speed. */ #define MAX_ICOUNT_SHIFT 10 +bool icount_align_option; + /* Do not count executed instructions */ ICountMode use_icount = ICOUNT_DISABLED; diff --git a/accel/tcg/internal-common.h b/accel/tcg/internal-common.h index c8d714256c..7ef620d963 100644 --- a/accel/tcg/internal-common.h +++ b/accel/tcg/internal-common.h @@ -17,6 +17,8 @@ extern int64_t max_advance; extern bool one_insn_per_tb; +extern bool icount_align_option; + /* * Return true if CS is not running in parallel with other cpus, either * because there are no other cpus or we are within an exclusive context. @@ -53,6 +55,17 @@ TranslationBlock *tb_link_page(TranslationBlock *tb); void cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb, uintptr_t host_pc); +/** + * tlb_init - initialize a CPU's TLB + * @cpu: CPU whose TLB should be initialized + */ +void tlb_init(CPUState *cpu); +/** + * tlb_destroy - destroy a CPU's TLB + * @cpu: CPU whose TLB should be destroyed + */ +void tlb_destroy(CPUState *cpu); + bool tcg_exec_realizefn(CPUState *cpu, Error **errp); void tcg_exec_unrealizefn(CPUState *cpu); diff --git a/accel/tcg/monitor.c b/accel/tcg/monitor.c index ae1dbeb79f..eeb38a4d9c 100644 --- a/accel/tcg/monitor.c +++ b/accel/tcg/monitor.c @@ -13,7 +13,6 @@ #include "qapi/type-helpers.h" #include "qapi/qapi-commands-machine.h" #include "monitor/monitor.h" -#include "system/cpus.h" #include "system/cpu-timers.h" #include "system/tcg.h" #include "tcg/tcg.h" diff --git a/accel/tcg/tcg-accel-ops.c b/accel/tcg/tcg-accel-ops.c index 6e3f1fa92b..132c5d1461 100644 --- a/accel/tcg/tcg-accel-ops.c +++ b/accel/tcg/tcg-accel-ops.c @@ -26,6 +26,7 @@ */ #include "qemu/osdep.h" +#include "system/accel-ops.h" #include "system/tcg.h" #include "system/replay.h" #include "system/cpu-timers.h" diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index d4189c7386..0914d6e98b 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -54,11 +54,10 @@ #include "qemu/cacheinfo.h" #include "qemu/timer.h" #include "exec/log.h" -#include "system/cpus.h" #include "system/cpu-timers.h" #include "system/tcg.h" #include "qapi/error.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include "tb-jmp-cache.h" #include "tb-hash.h" #include "tb-context.h" diff --git a/accel/tcg/user-exec-stub.c b/accel/tcg/user-exec-stub.c index 4fbe2dbdc8..1d52f48226 100644 --- a/accel/tcg/user-exec-stub.c +++ b/accel/tcg/user-exec-stub.c @@ -1,6 +1,7 @@ #include "qemu/osdep.h" #include "hw/core/cpu.h" #include "exec/replay-core.h" +#include "internal-common.h" void cpu_resume(CPUState *cpu) { @@ -18,6 +19,16 @@ void cpu_exec_reset_hold(CPUState *cpu) { } +/* User mode emulation does not support softmmu yet. */ + +void tlb_init(CPUState *cpu) +{ +} + +void tlb_destroy(CPUState *cpu) +{ +} + /* User mode emulation does not support record/replay yet. */ bool replay_exception(void) diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c index 0561c4f6dc..2322181b15 100644 --- a/accel/tcg/user-exec.c +++ b/accel/tcg/user-exec.c @@ -17,8 +17,9 @@ * License along with this library; if not, see <http://www.gnu.org/licenses/>. */ #include "qemu/osdep.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include "disas/disas.h" +#include "exec/vaddr.h" #include "exec/exec-all.h" #include "tcg/tcg.h" #include "qemu/bitops.h" @@ -30,6 +31,8 @@ #include "exec/page-protection.h" #include "exec/helper-proto.h" #include "qemu/atomic128.h" +#include "qemu/bswap.h" +#include "qemu/int128.h" #include "trace.h" #include "tcg/tcg-ldst.h" #include "internal-common.h" @@ -969,6 +972,85 @@ static void *cpu_mmu_lookup(CPUState *cpu, vaddr addr, return ret; } +/* physical memory access (slow version, mainly for debug) */ +int cpu_memory_rw_debug(CPUState *cpu, vaddr addr, + void *ptr, size_t len, bool is_write) +{ + int flags; + vaddr l, page; + uint8_t *buf = ptr; + ssize_t written; + int ret = -1; + int fd = -1; + + mmap_lock(); + + while (len > 0) { + page = addr & TARGET_PAGE_MASK; + l = (page + TARGET_PAGE_SIZE) - addr; + if (l > len) { + l = len; + } + flags = page_get_flags(page); + if (!(flags & PAGE_VALID)) { + goto out_close; + } + if (is_write) { + if (flags & PAGE_WRITE) { + memcpy(g2h(cpu, addr), buf, l); + } else { + /* Bypass the host page protection using ptrace. */ + if (fd == -1) { + fd = open("/proc/self/mem", O_WRONLY); + if (fd == -1) { + goto out; + } + } + /* + * If there is a TranslationBlock and we weren't bypassing the + * host page protection, the memcpy() above would SEGV, + * ultimately leading to page_unprotect(). So invalidate the + * translations manually. Both invalidation and pwrite() must + * be under mmap_lock() in order to prevent the creation of + * another TranslationBlock in between. + */ + tb_invalidate_phys_range(addr, addr + l - 1); + written = pwrite(fd, buf, l, + (off_t)(uintptr_t)g2h_untagged(addr)); + if (written != l) { + goto out_close; + } + } + } else if (flags & PAGE_READ) { + memcpy(buf, g2h(cpu, addr), l); + } else { + /* Bypass the host page protection using ptrace. */ + if (fd == -1) { + fd = open("/proc/self/mem", O_RDONLY); + if (fd == -1) { + goto out; + } + } + if (pread(fd, buf, l, + (off_t)(uintptr_t)g2h_untagged(addr)) != l) { + goto out_close; + } + } + len -= l; + buf += l; + addr += l; + } + ret = 0; +out_close: + if (fd != -1) { + close(fd); + } +out: + mmap_unlock(); + + return ret; +} + #include "ldst_atomicity.c.inc" static uint8_t do_ld1_mmu(CPUState *cpu, vaddr addr, MemOpIdx oi, diff --git a/accel/tcg/watchpoint.c b/accel/tcg/watchpoint.c index af57d182d5..40112b2b2e 100644 --- a/accel/tcg/watchpoint.c +++ b/accel/tcg/watchpoint.c @@ -26,7 +26,7 @@ #include "tb-internal.h" #include "system/tcg.h" #include "system/replay.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include "hw/core/cpu.h" #include "internal-common.h" diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c index 852e9fbe5f..7aa28b9ab9 100644 --- a/accel/xen/xen-all.c +++ b/accel/xen/xen-all.c @@ -18,6 +18,7 @@ #include "hw/xen/xen_igd.h" #include "chardev/char.h" #include "qemu/accel.h" +#include "system/accel-ops.h" #include "system/cpus.h" #include "system/xen.h" #include "system/runstate.h" diff --git a/backends/tpm/tpm_util.c b/backends/tpm/tpm_util.c index 0a428eaf75..f07a2656ce 100644 --- a/backends/tpm/tpm_util.c +++ b/backends/tpm/tpm_util.c @@ -76,7 +76,7 @@ static void release_tpm(Object *obj, const char *name, void *opaque) } const PropertyInfo qdev_prop_tpm = { - .name = "str", + .type = "str", .description = "ID of a tpm to use as a backend", .get = get_tpm, .set = set_tpm, diff --git a/block/create.c b/block/create.c index 72abafb4c1..6b23a21675 100644 --- a/block/create.c +++ b/block/create.c @@ -24,7 +24,6 @@ #include "qemu/osdep.h" #include "block/block_int.h" -#include "qemu/clang-tsa.h" #include "qemu/job.h" #include "qemu/main-loop.h" #include "qapi/qapi-commands-block-core.h" diff --git a/block/qed.c b/block/qed.c index 382c9e5335..ac24449ffb 100644 --- a/block/qed.c +++ b/block/qed.c @@ -353,6 +353,7 @@ static void bdrv_qed_detach_aio_context(BlockDriverState *bs) qed_cancel_need_check_timer(s); timer_free(s->need_check_timer); + s->need_check_timer = NULL; } static void bdrv_qed_attach_aio_context(BlockDriverState *bs, diff --git a/blockdev-nbd.c b/blockdev-nbd.c index 3f6f4ef92b..1e3e634b87 100644 --- a/blockdev-nbd.c +++ b/blockdev-nbd.c @@ -219,12 +219,12 @@ void nbd_server_start_options(NbdServerOptions *arg, Error **errp) arg->tls_authz, arg->max_connections, errp); } -void qmp_nbd_server_start(SocketAddressLegacy *addr, - bool has_handshake_max_secs, +void qmp_nbd_server_start(bool has_handshake_max_secs, uint32_t handshake_max_secs, const char *tls_creds, const char *tls_authz, bool has_max_connections, uint32_t max_connections, + SocketAddressLegacy *addr, Error **errp) { SocketAddress *addr_flat = socket_address_flatten(addr); diff --git a/bsd-user/qemu.h b/bsd-user/qemu.h index 3eaa14f3f5..4e97c79631 100644 --- a/bsd-user/qemu.h +++ b/bsd-user/qemu.h @@ -40,7 +40,6 @@ extern char **environ; #include "target.h" #include "exec/gdbstub.h" #include "exec/page-protection.h" -#include "qemu/clang-tsa.h" #include "accel/tcg/vcpu-state.h" #include "qemu-os.h" diff --git a/bsd-user/signal.c b/bsd-user/signal.c index ff2ccbbf60..ab1d9ddd50 100644 --- a/bsd-user/signal.c +++ b/bsd-user/signal.c @@ -29,7 +29,7 @@ #include "gdbstub/user.h" #include "signal-common.h" #include "trace.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include "host-signal.h" /* target_siginfo_t must fit in gdbstub's siginfo save area. */ diff --git a/cpu-common.c b/cpu-common.c index 4248b2d727..f5dcc2d136 100644 --- a/cpu-common.c +++ b/cpu-common.c @@ -21,7 +21,6 @@ #include "qemu/main-loop.h" #include "exec/cpu-common.h" #include "hw/core/cpu.h" -#include "system/cpus.h" #include "qemu/lockable.h" #include "trace/trace-root.h" diff --git a/cpu-target.c b/cpu-target.c index 667688332c..5aa6c4b0c6 100644 --- a/cpu-target.c +++ b/cpu-target.c @@ -19,22 +19,13 @@ #include "qemu/osdep.h" #include "qapi/error.h" - -#include "exec/target_page.h" -#include "exec/page-protection.h" -#include "hw/qdev-core.h" -#include "hw/qdev-properties.h" #include "qemu/error-report.h" #include "qemu/qemu-print.h" #include "migration/vmstate.h" -#ifdef CONFIG_USER_ONLY -#include "qemu.h" -#include "user/page-protection.h" -#else +#ifndef CONFIG_USER_ONLY #include "hw/core/sysemu-cpu-ops.h" -#include "exec/address-spaces.h" -#include "exec/memory.h" #endif +#include "system/accel-ops.h" #include "system/cpus.h" #include "system/tcg.h" #include "exec/tswap.h" @@ -42,28 +33,34 @@ #include "exec/cpu-common.h" #include "exec/exec-all.h" #include "exec/tb-flush.h" -#include "exec/translation-block.h" #include "exec/log.h" -#include "hw/core/accel-cpu.h" +#include "accel/accel-cpu-target.h" #include "trace/trace-root.h" #include "qemu/accel.h" +#include "hw/core/cpu.h" #ifndef CONFIG_USER_ONLY static int cpu_common_post_load(void *opaque, int version_id) { - CPUState *cpu = opaque; - - /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the - version_id is increased. */ - cpu->interrupt_request &= ~0x01; - tlb_flush(cpu); - - /* loadvm has just updated the content of RAM, bypassing the - * usual mechanisms that ensure we flush TBs for writes to - * memory we've translated code from. So we must flush all TBs, - * which will now be stale. - */ - tb_flush(cpu); + if (tcg_enabled()) { + CPUState *cpu = opaque; + + /* + * 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the + * version_id is increased. + */ + cpu->interrupt_request &= ~0x01; + + tlb_flush(cpu); + + /* + * loadvm has just updated the content of RAM, bypassing the + * usual mechanisms that ensure we flush TBs for writes to + * memory we've translated code from. So we must flush all TBs, + * which will now be stale. + */ + tb_flush(cpu); + } return 0; } @@ -134,9 +131,6 @@ const VMStateDescription vmstate_cpu_common = { bool cpu_exec_realizefn(CPUState *cpu, Error **errp) { - /* cache the cpu class for the hotpath */ - cpu->cc = CPU_GET_CLASS(cpu); - if (!accel_cpu_common_realize(cpu, errp)) { return false; } @@ -180,72 +174,6 @@ void cpu_exec_unrealizefn(CPUState *cpu) accel_cpu_common_unrealize(cpu); } -/* - * This can't go in hw/core/cpu.c because that file is compiled only - * once for both user-mode and system builds. - */ -static const Property cpu_common_props[] = { -#ifdef CONFIG_USER_ONLY - /* - * Create a property for the user-only object, so users can - * adjust prctl(PR_SET_UNALIGN) from the command-line. - * Has no effect if the target does not support the feature. - */ - DEFINE_PROP_BOOL("prctl-unalign-sigbus", CPUState, - prctl_unalign_sigbus, false), -#else - /* - * Create a memory property for system CPU object, so users can - * wire up its memory. The default if no link is set up is to use - * the system address space. - */ - DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION, - MemoryRegion *), -#endif -}; - -#ifndef CONFIG_USER_ONLY -static bool cpu_get_start_powered_off(Object *obj, Error **errp) -{ - CPUState *cpu = CPU(obj); - return cpu->start_powered_off; -} - -static void cpu_set_start_powered_off(Object *obj, bool value, Error **errp) -{ - CPUState *cpu = CPU(obj); - cpu->start_powered_off = value; -} -#endif - -void cpu_class_init_props(DeviceClass *dc) -{ -#ifndef CONFIG_USER_ONLY - ObjectClass *oc = OBJECT_CLASS(dc); - - /* - * We can't use DEFINE_PROP_BOOL in the Property array for this - * property, because we want this to be settable after realize. - */ - object_class_property_add_bool(oc, "start-powered-off", - cpu_get_start_powered_off, - cpu_set_start_powered_off); -#endif - - device_class_set_props(dc, cpu_common_props); -} - -void cpu_exec_initfn(CPUState *cpu) -{ - cpu->as = NULL; - cpu->num_ases = 0; - -#ifndef CONFIG_USER_ONLY - cpu->memory = get_system_memory(); - object_ref(OBJECT(cpu->memory)); -#endif -} - char *cpu_model_from_type(const char *typename) { const char *suffix = "-" CPU_RESOLVING_TYPE; @@ -372,97 +300,6 @@ void cpu_abort(CPUState *cpu, const char *fmt, ...) abort(); } -/* physical memory access (slow version, mainly for debug) */ -#if defined(CONFIG_USER_ONLY) -int cpu_memory_rw_debug(CPUState *cpu, vaddr addr, - void *ptr, size_t len, bool is_write) -{ - int flags; - vaddr l, page; - void * p; - uint8_t *buf = ptr; - ssize_t written; - int ret = -1; - int fd = -1; - - while (len > 0) { - page = addr & TARGET_PAGE_MASK; - l = (page + TARGET_PAGE_SIZE) - addr; - if (l > len) - l = len; - flags = page_get_flags(page); - if (!(flags & PAGE_VALID)) { - goto out_close; - } - if (is_write) { - if (flags & PAGE_WRITE) { - /* XXX: this code should not depend on lock_user */ - p = lock_user(VERIFY_WRITE, addr, l, 0); - if (!p) { - goto out_close; - } - memcpy(p, buf, l); - unlock_user(p, addr, l); - } else { - /* Bypass the host page protection using ptrace. */ - if (fd == -1) { - fd = open("/proc/self/mem", O_WRONLY); - if (fd == -1) { - goto out; - } - } - /* - * If there is a TranslationBlock and we weren't bypassing the - * host page protection, the memcpy() above would SEGV, - * ultimately leading to page_unprotect(). So invalidate the - * translations manually. Both invalidation and pwrite() must - * be under mmap_lock() in order to prevent the creation of - * another TranslationBlock in between. - */ - mmap_lock(); - tb_invalidate_phys_range(addr, addr + l - 1); - written = pwrite(fd, buf, l, - (off_t)(uintptr_t)g2h_untagged(addr)); - mmap_unlock(); - if (written != l) { - goto out_close; - } - } - } else if (flags & PAGE_READ) { - /* XXX: this code should not depend on lock_user */ - p = lock_user(VERIFY_READ, addr, l, 1); - if (!p) { - goto out_close; - } - memcpy(buf, p, l); - unlock_user(p, addr, 0); - } else { - /* Bypass the host page protection using ptrace. */ - if (fd == -1) { - fd = open("/proc/self/mem", O_RDONLY); - if (fd == -1) { - goto out; - } - } - if (pread(fd, buf, l, - (off_t)(uintptr_t)g2h_untagged(addr)) != l) { - goto out_close; - } - } - len -= l; - buf += l; - addr += l; - } - ret = 0; -out_close: - if (fd != -1) { - close(fd); - } -out: - return ret; -} -#endif - bool target_words_bigendian(void) { return TARGET_BIG_ENDIAN; diff --git a/disas/disas-common.c b/disas/disas-common.c index de61f6d8a1..ae3f9e46ea 100644 --- a/disas/disas-common.c +++ b/disas/disas-common.c @@ -7,7 +7,6 @@ #include "disas/disas.h" #include "disas/capstone.h" #include "hw/core/cpu.h" -#include "exec/tswap.h" #include "disas-internal.h" @@ -61,15 +60,12 @@ void disas_initialize_debug_target(CPUDebug *s, CPUState *cpu) s->cpu = cpu; s->info.print_address_func = print_address; - if (target_words_bigendian()) { - s->info.endian = BFD_ENDIAN_BIG; - } else { - s->info.endian = BFD_ENDIAN_LITTLE; - } + s->info.endian = BFD_ENDIAN_UNKNOWN; CPUClass *cc = CPU_GET_CLASS(cpu); if (cc->disas_set_info) { cc->disas_set_info(cpu, &s->info); + g_assert(s->info.endian != BFD_ENDIAN_UNKNOWN); } } diff --git a/docs/about/build-platforms.rst b/docs/about/build-platforms.rst index 482b09819c..1552b1a704 100644 --- a/docs/about/build-platforms.rst +++ b/docs/about/build-platforms.rst @@ -101,7 +101,7 @@ Python runtime option of the ``configure`` script to point QEMU to a supported version of the Python runtime. - As of QEMU |version|, the minimum supported version of Python is 3.7. + As of QEMU |version|, the minimum supported version of Python is 3.8. Python build dependencies Some of QEMU's build dependencies are written in Python. Usually these diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst index abadf8de27..589951b136 100644 --- a/docs/about/deprecated.rst +++ b/docs/about/deprecated.rst @@ -434,6 +434,31 @@ Stream ``reconnect`` (since 9.2) The ``reconnect`` option only allows specifiying second granularity timeouts, which is not enough for all types of use cases, use ``reconnect-ms`` instead. +VFIO device options +''''''''''''''''''' + +``-device vfio-calxeda-xgmac`` (since 10.0) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The vfio-calxeda-xgmac device allows to assign a host Calxeda Highbank +10Gb XGMAC Ethernet controller device ("calxeda,hb-xgmac" compatibility +string) to a guest. Calxeda HW has been ewasted now and there is no point +keeping that device. + +``-device vfio-amd-xgbe`` (since 10.0) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The vfio-amd-xgbe device allows to assign a host AMD 10GbE controller +to a guest ("amd,xgbe-seattle-v1a" compatibility string). AMD "Seattle" +is not supported anymore and there is no point keeping that device. + +``-device vfio-platform`` (since 10.0) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The vfio-platform device allows to assign a host platform device +to a guest in a generic manner. Integrating a new device into +the vfio-platform infrastructure requires some adaptation at +both kernel and qemu level. No such attempt has been done for years +and the conclusion is that vfio-platform has not got any traction. +PCIe passthrough shall be the mainline solution. + CPU device properties ''''''''''''''''''''' diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst index c49482eab6..673e354754 100644 --- a/docs/devel/migration/vfio.rst +++ b/docs/devel/migration/vfio.rst @@ -67,15 +67,35 @@ VFIO implements the device hooks for the iterative approach as follows: * A ``switchover_ack_needed`` function that checks if the VFIO device uses "switchover-ack" migration capability when this capability is enabled. -* A ``save_state`` function to save the device config space if it is present. +* A ``switchover_start`` function that in the multifd mode starts a thread that + reassembles the multifd received data and loads it in-order into the device. + In the non-multifd mode this function is a NOP. + +* A ``save_state`` function to save the device config space if it is present + in the non-multifd mode. + In the multifd mode it just emits either a dummy EOS marker. * A ``save_live_complete_precopy`` function that sets the VFIO device in _STOP_COPY state and iteratively copies the data for the VFIO device until the vendor driver indicates that no data remains. + In the multifd mode it just emits a dummy EOS marker. + +* A ``save_live_complete_precopy_thread`` function that in the multifd mode + provides thread handler performing multifd device state transfer. + It sets the VFIO device to _STOP_COPY state, iteratively reads the data + from the VFIO device and queues it for multifd transmission until the vendor + driver indicates that no data remains. + After that, it saves the device config space and queues it for multifd + transfer too. + In the non-multifd mode this thread is a NOP. * A ``load_state`` function that loads the config section and the data sections that are generated by the save functions above. +* A ``load_state_buffer`` function that loads the device state and the device + config that arrived via multifd channels. + It's used only in the multifd mode. + * ``cleanup`` functions for both save and load that perform any migration related cleanup. @@ -176,8 +196,11 @@ Live migration save path Then the VFIO device is put in _STOP_COPY state (FINISH_MIGRATE, _ACTIVE, _STOP_COPY) .save_live_complete_precopy() is called for each active device - For the VFIO device, iterate in .save_live_complete_precopy() until + For the VFIO device: in the non-multifd mode iterate in + .save_live_complete_precopy() until pending data is 0 + In the multifd mode this iteration is done in + .save_live_complete_precopy_thread() instead. | (POSTMIGRATE, _COMPLETED, _STOP_COPY) Migraton thread schedules cleanup bottom half and exits @@ -194,6 +217,9 @@ Live migration resume path (RESTORE_VM, _ACTIVE, _STOP) | For each device, .load_state() is called for that device section data + transmitted via the main migration channel. + For data transmitted via multifd channels .load_state_buffer() is called + instead. (RESTORE_VM, _ACTIVE, _RESUMING) | At the end, .load_cleanup() is called for each device and vCPUs are started @@ -206,3 +232,18 @@ Postcopy ======== Postcopy migration is currently not supported for VFIO devices. + +Multifd +======= + +Starting from QEMU version 10.0 there's a possibility to transfer VFIO device +_STOP_COPY state via multifd channels. This helps reduce downtime - especially +with multiple VFIO devices or with devices having a large migration state. +As an additional benefit, setting the VFIO device to _STOP_COPY state and +saving its config space is also parallelized (run in a separate thread) in +such migration mode. + +The multifd VFIO device state transfer is controlled by +"x-migration-multifd-transfer" VFIO device property. This property defaults to +AUTO, which means that VFIO device state transfer via multifd channels is +attempted in configurations that otherwise support it. diff --git a/docs/devel/qapi-code-gen.rst b/docs/devel/qapi-code-gen.rst index 9fa94251b0..f9cfe8721f 100644 --- a/docs/devel/qapi-code-gen.rst +++ b/docs/devel/qapi-code-gen.rst @@ -229,7 +229,8 @@ These are of the form PREFIX_NAME, where PREFIX is derived from the enumeration type's name, and NAME from the value's name. For the example above, the generator maps 'MyEnum' to MY_ENUM and 'value1' to VALUE1, resulting in the enumeration constant MY_ENUM_VALUE1. The -optional 'prefix' member overrides PREFIX. +optional 'prefix' member overrides PREFIX. This is rarely necessary, +and should be used with restraint. The generated C enumeration constants have values 0, 1, ..., N-1 (in QAPI schema order), where N is the number of values. There is an diff --git a/gdbstub/system.c b/gdbstub/system.c index 8ce79fa88c..416c1dbe1e 100644 --- a/gdbstub/system.c +++ b/gdbstub/system.c @@ -19,9 +19,11 @@ #include "gdbstub/commands.h" #include "exec/hwaddr.h" #include "exec/tb-flush.h" +#include "system/accel-ops.h" #include "system/cpus.h" #include "system/runstate.h" #include "system/replay.h" +#include "system/tcg.h" #include "hw/core/cpu.h" #include "hw/cpu/cluster.h" #include "hw/boards.h" @@ -171,7 +173,9 @@ static void gdb_vm_state_change(void *opaque, bool running, RunState state) } else { trace_gdbstub_hit_break(); } - tb_flush(cpu); + if (tcg_enabled()) { + tb_flush(cpu); + } ret = GDB_SIGNAL_TRAP; break; case RUN_STATE_PAUSED: diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c index 6c26052561..2098286b5f 100644 --- a/hw/block/xen-block.c +++ b/hw/block/xen-block.c @@ -661,8 +661,8 @@ invalid: * https://xenbits.xen.org/docs/unstable/man/xen-vbd-interface.7.html */ static const PropertyInfo xen_block_prop_vdev = { - .name = "str", - .description = "Virtual Disk specifier: d*p*/xvd*/hd*/sd*", + .type = "str", + .description = "Virtual Disk specifier (d*p*/xvd*/hd*/sd*)", .get = xen_block_get_vdev, .set = xen_block_set_vdev, }; diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c index cb79566cc5..d5cd227fe6 100644 --- a/hw/core/cpu-common.c +++ b/hw/core/cpu-common.c @@ -238,13 +238,17 @@ static void cpu_common_initfn(Object *obj) { CPUState *cpu = CPU(obj); + /* cache the cpu class for the hotpath */ + cpu->cc = CPU_GET_CLASS(cpu); + gdb_init_cpu(cpu); cpu->cpu_index = UNASSIGNED_CPU_INDEX; cpu->cluster_index = UNASSIGNED_CLUSTER_INDEX; + cpu->as = NULL; + cpu->num_ases = 0; /* user-mode doesn't have configurable SMP topology */ /* the default value is changed by qemu_init_vcpu() for system-mode */ cpu->nr_threads = 1; - cpu->cflags_next_tb = -1; /* allocate storage for thread info, initialise condition variables */ cpu->thread = g_new0(QemuThread, 1); diff --git a/hw/core/cpu-system.c b/hw/core/cpu-system.c index 6e307c8959..e511507e13 100644 --- a/hw/core/cpu-system.c +++ b/hw/core/cpu-system.c @@ -20,7 +20,11 @@ #include "qemu/osdep.h" #include "qapi/error.h" +#include "exec/address-spaces.h" +#include "exec/memory.h" #include "exec/tswap.h" +#include "hw/qdev-core.h" +#include "hw/qdev-properties.h" #include "hw/core/sysemu-cpu-ops.h" bool cpu_paging_enabled(const CPUState *cpu) @@ -147,3 +151,46 @@ GuestPanicInformation *cpu_get_crash_info(CPUState *cpu) } return res; } + +static const Property cpu_system_props[] = { + /* + * Create a memory property for system CPU object, so users can + * wire up its memory. The default if no link is set up is to use + * the system address space. + */ + DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION, + MemoryRegion *), +}; + +static bool cpu_get_start_powered_off(Object *obj, Error **errp) +{ + CPUState *cpu = CPU(obj); + return cpu->start_powered_off; +} + +static void cpu_set_start_powered_off(Object *obj, bool value, Error **errp) +{ + CPUState *cpu = CPU(obj); + cpu->start_powered_off = value; +} + +void cpu_class_init_props(DeviceClass *dc) +{ + ObjectClass *oc = OBJECT_CLASS(dc); + + /* + * We can't use DEFINE_PROP_BOOL in the Property array for this + * property, because we want this to be settable after realize. + */ + object_class_property_add_bool(oc, "start-powered-off", + cpu_get_start_powered_off, + cpu_set_start_powered_off); + + device_class_set_props(dc, cpu_system_props); +} + +void cpu_exec_initfn(CPUState *cpu) +{ + cpu->memory = get_system_memory(); + object_ref(OBJECT(cpu->memory)); +} diff --git a/hw/core/cpu-user.c b/hw/core/cpu-user.c new file mode 100644 index 0000000000..cdd8de2fef --- /dev/null +++ b/hw/core/cpu-user.c @@ -0,0 +1,32 @@ +/* + * QEMU CPU model (user specific) + * + * Copyright (c) Linaro, Ltd. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/qdev-core.h" +#include "hw/qdev-properties.h" +#include "hw/core/cpu.h" + +static const Property cpu_user_props[] = { + /* + * Create a property for the user-only object, so users can + * adjust prctl(PR_SET_UNALIGN) from the command-line. + * Has no effect if the target does not support the feature. + */ + DEFINE_PROP_BOOL("prctl-unalign-sigbus", CPUState, + prctl_unalign_sigbus, false), +}; + +void cpu_class_init_props(DeviceClass *dc) +{ + device_class_set_props(dc, cpu_user_props); +} + +void cpu_exec_initfn(CPUState *cpu) +{ + /* nothing to do */ +} diff --git a/hw/core/generic-loader.c b/hw/core/generic-loader.c index d9f5c2e832..d3a426a1a2 100644 --- a/hw/core/generic-loader.c +++ b/hw/core/generic-loader.c @@ -47,11 +47,8 @@ static void generic_loader_reset(void *opaque) GenericLoaderState *s = GENERIC_LOADER(opaque); if (s->set_pc) { - CPUClass *cc = CPU_GET_CLASS(s->cpu); cpu_reset(s->cpu); - if (cc) { - cc->set_pc(s->cpu, s->addr); - } + cpu_set_pc(s->cpu, s->addr); } if (s->data_len) { diff --git a/hw/core/machine.c b/hw/core/machine.c index b68b8b94a3..f52a4f2273 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -44,6 +44,8 @@ GlobalProperty hw_compat_9_2[] = { { "virtio-balloon-pci-non-transitional", "vectors", "0" }, { "virtio-mem-pci", "vectors", "0" }, { "migration", "multifd-clean-tls-termination", "false" }, + { "migration", "send-switchover-start", "off"}, + { "vfio-pci", "x-migration-multifd-transfer", "off" }, }; const size_t hw_compat_9_2_len = G_N_ELEMENTS(hw_compat_9_2); diff --git a/hw/core/meson.build b/hw/core/meson.build index 65a1698ed1..b5a545a0ed 100644 --- a/hw/core/meson.build +++ b/hw/core/meson.build @@ -46,4 +46,7 @@ system_ss.add(files( 'vm-change-state-handler.c', 'clock-vmstate.c', )) -user_ss.add(files('qdev-user.c')) +user_ss.add(files( + 'cpu-user.c', + 'qdev-user.c', +)) diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index a91551a5ee..a7dde73c29 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -235,7 +235,7 @@ static void release_drive(Object *obj, const char *name, void *opaque) } const PropertyInfo qdev_prop_drive = { - .name = "str", + .type = "str", .description = "Node name or ID of a block device to use as a backend", .realized_set_allowed = true, .get = get_drive, @@ -244,7 +244,7 @@ const PropertyInfo qdev_prop_drive = { }; const PropertyInfo qdev_prop_drive_iothread = { - .name = "str", + .type = "str", .description = "Node name or ID of a block device to use as a backend", .realized_set_allowed = true, .get = get_drive, @@ -312,7 +312,7 @@ static void release_chr(Object *obj, const char *name, void *opaque) } const PropertyInfo qdev_prop_chr = { - .name = "str", + .type = "str", .description = "ID of a chardev to use as a backend", .get = get_chr, .set = set_chr, @@ -386,7 +386,7 @@ inval: } const PropertyInfo qdev_prop_macaddr = { - .name = "str", + .type = "str", .description = "Ethernet 6-byte MAC Address, example: 52:54:00:12:34:56", .get = get_mac, .set = set_mac, @@ -474,7 +474,7 @@ out: } const PropertyInfo qdev_prop_netdev = { - .name = "str", + .type = "str", .description = "ID of a netdev to use as a backend", .get = get_netdev, .set = set_netdev, @@ -512,7 +512,7 @@ static void set_audiodev(Object *obj, Visitor *v, const char* name, } const PropertyInfo qdev_prop_audiodev = { - .name = "str", + .type = "str", .description = "ID of an audiodev to use as a backend", /* release done on shutdown */ .get = get_audiodev, @@ -602,7 +602,8 @@ static void qdev_propinfo_set_losttickpolicy(Object *obj, Visitor *v, QEMU_BUILD_BUG_ON(sizeof(LostTickPolicy) != sizeof(int)); const PropertyInfo qdev_prop_losttickpolicy = { - .name = "LostTickPolicy", + .type = "LostTickPolicy", + .description = "Policy for handling lost ticks (discard/delay/slew)", .enum_table = &LostTickPolicy_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_losttickpolicy, @@ -628,7 +629,7 @@ static void set_blocksize(Object *obj, Visitor *v, const char *name, } const PropertyInfo qdev_prop_blocksize = { - .name = "size", + .type = "size", .description = "A power of two between " MIN_BLOCK_SIZE_STR " and " MAX_BLOCK_SIZE_STR, .get = qdev_propinfo_get_size32, @@ -641,9 +642,8 @@ const PropertyInfo qdev_prop_blocksize = { QEMU_BUILD_BUG_ON(sizeof(BlockdevOnError) != sizeof(int)); const PropertyInfo qdev_prop_blockdev_on_error = { - .name = "BlockdevOnError", - .description = "Error handling policy, " - "report/ignore/enospc/stop/auto", + .type = "BlockdevOnError", + .description = "Error handling policy (report/ignore/enospc/stop/auto)", .enum_table = &BlockdevOnError_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, @@ -655,9 +655,9 @@ const PropertyInfo qdev_prop_blockdev_on_error = { QEMU_BUILD_BUG_ON(sizeof(BiosAtaTranslation) != sizeof(int)); const PropertyInfo qdev_prop_bios_chs_trans = { - .name = "BiosAtaTranslation", - .description = "Logical CHS translation algorithm, " - "auto/none/lba/large/rechs", + .type = "BiosAtaTranslation", + .description = "Logical CHS translation algorithm " + " (auto/none/lba/large/rechs)", .enum_table = &BiosAtaTranslation_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, @@ -667,9 +667,8 @@ const PropertyInfo qdev_prop_bios_chs_trans = { /* --- FDC default drive types */ const PropertyInfo qdev_prop_fdc_drive_type = { - .name = "FdcDriveType", - .description = "FDC drive type, " - "144/288/120/none/auto", + .type = "FloppyDriveType", + .description = "Floppy drive type (144/288/120/none/auto)", .enum_table = &FloppyDriveType_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, @@ -679,9 +678,9 @@ const PropertyInfo qdev_prop_fdc_drive_type = { /* --- MultiFDCompression --- */ const PropertyInfo qdev_prop_multifd_compression = { - .name = "MultiFDCompression", - .description = "multifd_compression values, " - "none/zlib/zstd/qpl/uadk/qatzip", + .type = "MultiFDCompression", + .description = "multifd_compression values" + " (none/zlib/zstd/qpl/uadk/qatzip)", .enum_table = &MultiFDCompression_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, @@ -693,9 +692,8 @@ const PropertyInfo qdev_prop_multifd_compression = { QEMU_BUILD_BUG_ON(sizeof(MigMode) != sizeof(int)); const PropertyInfo qdev_prop_mig_mode = { - .name = "MigMode", - .description = "mig_mode values, " - "normal,cpr-reboot", + .type = "MigMode", + .description = "Migration mode (normal/cpr-reboot)", .enum_table = &MigMode_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, @@ -707,9 +705,8 @@ const PropertyInfo qdev_prop_mig_mode = { QEMU_BUILD_BUG_ON(sizeof(GranuleMode) != sizeof(int)); const PropertyInfo qdev_prop_granule_mode = { - .name = "GranuleMode", - .description = "granule_mode values, " - "4k, 8k, 16k, 64k, host", + .type = "GranuleMode", + .description = "Granule page size (4k/8k/16k/64k/host)", .enum_table = &GranuleMode_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, @@ -717,9 +714,8 @@ const PropertyInfo qdev_prop_granule_mode = { }; const PropertyInfo qdev_prop_zero_page_detection = { - .name = "ZeroPageDetection", - .description = "zero_page_detection values, " - "none,legacy,multifd", + .type = "ZeroPageDetection", + .description = "Zero page detection (none/legacy/multifd)", .enum_table = &ZeroPageDetection_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, @@ -801,7 +797,7 @@ out: } const PropertyInfo qdev_prop_reserved_region = { - .name = "reserved_region", + .type = "str", .description = "Reserved Region, example: 0xFEE00000:0xFEEFFFFF:0", .get = get_reserved_region, .set = set_reserved_region, @@ -882,7 +878,7 @@ static int print_pci_devfn(Object *obj, const Property *prop, char *dest, } const PropertyInfo qdev_prop_pci_devfn = { - .name = "int32", + .type = "str", .description = "Slot and optional function number, example: 06.0 or 06", .print = print_pci_devfn, .get = qdev_propinfo_get_int32, @@ -988,8 +984,8 @@ inval: } const PropertyInfo qdev_prop_pci_host_devaddr = { - .name = "str", - .description = "Address (bus/device/function) of " + .type = "str", + .description = "Address (bus:device.function) of " "the host device, example: 04:10.0", .get = get_pci_host_devaddr, .set = set_pci_host_devaddr, @@ -998,7 +994,7 @@ const PropertyInfo qdev_prop_pci_host_devaddr = { /* --- OffAutoPCIBAR off/auto/bar0/bar1/bar2/bar3/bar4/bar5 --- */ const PropertyInfo qdev_prop_off_auto_pcibar = { - .name = "OffAutoPCIBAR", + .type = "OffAutoPCIBAR", .description = "off/auto/bar0/bar1/bar2/bar3/bar4/bar5", .enum_table = &OffAutoPCIBAR_lookup, .get = qdev_propinfo_get_enum, @@ -1080,7 +1076,7 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, } const PropertyInfo qdev_prop_pcie_link_speed = { - .name = "PCIELinkSpeed", + .type = "PCIELinkSpeed", .description = "2_5/5/8/16/32/64", .enum_table = &PCIELinkSpeed_lookup, .get = get_prop_pcielinkspeed, @@ -1168,7 +1164,7 @@ static void set_prop_pcielinkwidth(Object *obj, Visitor *v, const char *name, } const PropertyInfo qdev_prop_pcie_link_width = { - .name = "PCIELinkWidth", + .type = "PCIELinkWidth", .description = "1/2/4/8/12/16/32", .enum_table = &PCIELinkWidth_lookup, .get = get_prop_pcielinkwidth, @@ -1218,7 +1214,7 @@ static void set_default_uuid_auto(ObjectProperty *op, const Property *prop) } const PropertyInfo qdev_prop_uuid = { - .name = "str", + .type = "str", .description = "UUID (aka GUID) or \"" UUID_VALUE_AUTO "\" for random value (default)", .get = get_uuid, @@ -1231,8 +1227,8 @@ const PropertyInfo qdev_prop_uuid = { QEMU_BUILD_BUG_ON(sizeof(S390CpuEntitlement) != sizeof(int)); const PropertyInfo qdev_prop_cpus390entitlement = { - .name = "S390CpuEntitlement", - .description = "low/medium (default)/high", + .type = "S390CpuEntitlement", + .description = "auto/low/medium/high (default medium)", .enum_table = &S390CpuEntitlement_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, @@ -1276,7 +1272,7 @@ static void release_iothread_vq_mapping_list(Object *obj, } const PropertyInfo qdev_prop_iothread_vq_mapping_list = { - .name = "IOThreadVirtQueueMappingList", + .type = "IOThreadVirtQueueMappingList", .description = "IOThread virtqueue mapping list [{\"iothread\":\"<id>\", " "\"vqs\":[1,2,3,...]},...]", .get = get_iothread_vq_mapping_list, @@ -1287,7 +1283,7 @@ const PropertyInfo qdev_prop_iothread_vq_mapping_list = { /* --- Endian modes */ const PropertyInfo qdev_prop_endian_mode = { - .name = "EndianMode", + .type = "EndianMode", .description = "Endian mode, big/little/unspecified", .enum_table = &EndianMode_lookup, .get = qdev_propinfo_get_enum, @@ -1296,7 +1292,7 @@ const PropertyInfo qdev_prop_endian_mode = { }; const PropertyInfo qdev_prop_vmapple_virtio_blk_variant = { - .name = "VMAppleVirtioBlkVariant", + .type = "VMAppleVirtioBlkVariant", .description = "unspecified/root/aux", .enum_table = &VMAppleVirtioBlkVariant_lookup, .get = qdev_propinfo_get_enum, diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c index 0b52aad555..c04df3b337 100644 --- a/hw/core/qdev-properties.c +++ b/hw/core/qdev-properties.c @@ -122,13 +122,6 @@ void qdev_propinfo_set_default_value_enum(ObjectProperty *op, qapi_enum_lookup(prop->info->enum_table, prop->defval.i)); } -const PropertyInfo qdev_prop_enum = { - .name = "enum", - .get = qdev_propinfo_get_enum, - .set = qdev_propinfo_set_enum, - .set_default_value = qdev_propinfo_set_default_value_enum, -}; - /* Bit */ static uint32_t qdev_get_prop_mask(const Property *prop) @@ -176,7 +169,7 @@ static void set_default_value_bool(ObjectProperty *op, const Property *prop) } const PropertyInfo qdev_prop_bit = { - .name = "bool", + .type = "bool", .description = "on/off", .get = prop_get_bit, .set = prop_set_bit, @@ -225,7 +218,7 @@ static void prop_set_bit64(Object *obj, Visitor *v, const char *name, } const PropertyInfo qdev_prop_bit64 = { - .name = "bool", + .type = "bool", .description = "on/off", .get = prop_get_bit64, .set = prop_set_bit64, @@ -253,7 +246,8 @@ static void set_bool(Object *obj, Visitor *v, const char *name, void *opaque, } const PropertyInfo qdev_prop_bool = { - .name = "bool", + .type = "bool", + .description = "on/off", .get = get_bool, .set = set_bool, .set_default_value = set_default_value_bool, @@ -292,7 +286,7 @@ void qdev_propinfo_set_default_value_uint(ObjectProperty *op, } const PropertyInfo qdev_prop_uint8 = { - .name = "uint8", + .type = "uint8", .get = get_uint8, .set = set_uint8, .set_default_value = qdev_propinfo_set_default_value_uint, @@ -319,7 +313,7 @@ static void set_uint16(Object *obj, Visitor *v, const char *name, } const PropertyInfo qdev_prop_uint16 = { - .name = "uint16", + .type = "uint16", .get = get_uint16, .set = set_uint16, .set_default_value = qdev_propinfo_set_default_value_uint, @@ -364,14 +358,14 @@ static void set_int32(Object *obj, Visitor *v, const char *name, void *opaque, } const PropertyInfo qdev_prop_uint32 = { - .name = "uint32", + .type = "uint32", .get = get_uint32, .set = set_uint32, .set_default_value = qdev_propinfo_set_default_value_uint, }; const PropertyInfo qdev_prop_int32 = { - .name = "int32", + .type = "int32", .get = qdev_propinfo_get_int32, .set = set_int32, .set_default_value = qdev_propinfo_set_default_value_int, @@ -416,14 +410,14 @@ static void set_int64(Object *obj, Visitor *v, const char *name, } const PropertyInfo qdev_prop_uint64 = { - .name = "uint64", + .type = "uint64", .get = get_uint64, .set = set_uint64, .set_default_value = qdev_propinfo_set_default_value_uint, }; const PropertyInfo qdev_prop_int64 = { - .name = "int64", + .type = "int64", .get = get_int64, .set = set_int64, .set_default_value = qdev_propinfo_set_default_value_int, @@ -443,7 +437,7 @@ static void set_uint64_checkmask(Object *obj, Visitor *v, const char *name, } const PropertyInfo qdev_prop_uint64_checkmask = { - .name = "uint64", + .type = "uint64", .get = get_uint64, .set = set_uint64_checkmask, }; @@ -485,7 +479,7 @@ static void set_string(Object *obj, Visitor *v, const char *name, } const PropertyInfo qdev_prop_string = { - .name = "str", + .type = "str", .release = release_string, .get = get_string, .set = set_string, @@ -494,7 +488,7 @@ const PropertyInfo qdev_prop_string = { /* --- on/off/auto --- */ const PropertyInfo qdev_prop_on_off_auto = { - .name = "OnOffAuto", + .type = "OnOffAuto", .description = "on/off/auto", .enum_table = &OnOffAuto_lookup, .get = qdev_propinfo_get_enum, @@ -537,7 +531,7 @@ static void set_size32(Object *obj, Visitor *v, const char *name, void *opaque, } const PropertyInfo qdev_prop_size32 = { - .name = "size", + .type = "size", .get = qdev_propinfo_get_size32, .set = set_size32, .set_default_value = qdev_propinfo_set_default_value_uint, @@ -740,7 +734,7 @@ static void default_prop_array(ObjectProperty *op, const Property *prop) } const PropertyInfo qdev_prop_array = { - .name = "list", + .type = "list", .get = get_prop_array, .set = set_prop_array, .release = release_prop_array, @@ -944,7 +938,7 @@ static void set_size(Object *obj, Visitor *v, const char *name, void *opaque, } const PropertyInfo qdev_prop_size = { - .name = "size", + .type = "size", .get = get_size, .set = set_size, .set_default_value = qdev_propinfo_set_default_value_uint, @@ -962,7 +956,7 @@ static ObjectProperty *create_link_property(ObjectClass *oc, const char *name, } const PropertyInfo qdev_prop_link = { - .name = "link", + .type = "link", .create = create_link_property, }; @@ -973,7 +967,7 @@ void qdev_property_add_static(DeviceState *dev, const Property *prop) assert(!prop->info->create); - op = object_property_add(obj, prop->name, prop->info->name, + op = object_property_add(obj, prop->name, prop->info->type, field_prop_getter(prop->info), field_prop_setter(prop->info), prop->info->release, @@ -1000,7 +994,7 @@ static void qdev_class_add_property(DeviceClass *klass, const char *name, op = prop->info->create(oc, name, prop); } else { op = object_class_property_add(oc, - name, prop->info->name, + name, prop->info->type, field_prop_getter(prop->info), field_prop_setter(prop->info), prop->info->release, diff --git a/hw/display/apple-gfx.m b/hw/display/apple-gfx.m index 1554f3b801..c4323574e1 100644 --- a/hw/display/apple-gfx.m +++ b/hw/display/apple-gfx.m @@ -871,7 +871,7 @@ separator_error: } const PropertyInfo qdev_prop_apple_gfx_display_mode = { - .name = "display_mode", + .type = "display_mode", .description = "Display mode in pixels and Hertz, as <width>x<height>@<refresh-rate> " "Example: 3840x2160@60", diff --git a/hw/mips/jazz.c b/hw/mips/jazz.c index c89610639a..1700c3765d 100644 --- a/hw/mips/jazz.c +++ b/hw/mips/jazz.c @@ -50,7 +50,7 @@ #include "qemu/error-report.h" #include "qemu/help_option.h" #ifdef CONFIG_TCG -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #endif /* CONFIG_TCG */ #include "cpu.h" diff --git a/hw/misc/xlnx-versal-trng.c b/hw/misc/xlnx-versal-trng.c index dbd9b58a4e..ba93f93cab 100644 --- a/hw/misc/xlnx-versal-trng.c +++ b/hw/misc/xlnx-versal-trng.c @@ -652,7 +652,7 @@ static void trng_prop_fault_event_set(Object *obj, Visitor *v, } static const PropertyInfo trng_prop_fault_events = { - .name = "uint32:bits", + .type = "uint32", .description = "Set to trigger TRNG fault events", .set = trng_prop_fault_event_set, .realized_set_allowed = true, diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c index f637853073..b72cbab7e8 100644 --- a/hw/net/e1000e.c +++ b/hw/net/e1000e.c @@ -372,8 +372,7 @@ static int e1000e_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc) { Error *local_err = NULL; - int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset, - PCI_PM_SIZEOF, &local_err); + int ret = pci_pm_init(pdev, offset, &local_err); if (local_err) { error_report_err(local_err); diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c index 6d853229ae..29a39865a6 100644 --- a/hw/net/eepro100.c +++ b/hw/net/eepro100.c @@ -551,9 +551,7 @@ static void e100_pci_reset(EEPRO100State *s, Error **errp) if (info->power_management) { /* Power Management Capabilities */ int cfg_offset = 0xdc; - int r = pci_add_capability(&s->dev, PCI_CAP_ID_PM, - cfg_offset, PCI_PM_SIZEOF, - errp); + int r = pci_pm_init(&s->dev, cfg_offset, errp); if (r < 0) { return; } diff --git a/hw/net/igb.c b/hw/net/igb.c index c965fc2fb6..e318df40e0 100644 --- a/hw/net/igb.c +++ b/hw/net/igb.c @@ -356,8 +356,7 @@ static int igb_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc) { Error *local_err = NULL; - int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset, - PCI_PM_SIZEOF, &local_err); + int ret = pci_pm_init(pdev, offset, &local_err); if (local_err) { error_report_err(local_err); diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index e62c6a3588..518d02dc66 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8600,8 +8600,7 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset) Error *err = NULL; int ret; - ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset, - PCI_PM_SIZEOF, &err); + ret = pci_pm_init(pci_dev, offset, &err); if (err) { error_report_err(err); return ret; diff --git a/hw/nvme/nguid.c b/hw/nvme/nguid.c index be63cb75e1..4cd6fad6ac 100644 --- a/hw/nvme/nguid.c +++ b/hw/nvme/nguid.c @@ -179,7 +179,7 @@ static void set_nguid(Object *obj, Visitor *v, const char *name, void *opaque, } const PropertyInfo qdev_prop_nguid = { - .name = "str", + .type = "str", .description = "NGUID or \"" NGUID_VALUE_AUTO "\" for random value", .get = get_nguid, diff --git a/hw/nvram/xlnx-bbram.c b/hw/nvram/xlnx-bbram.c index 0e8552ce65..14cc9073c7 100644 --- a/hw/nvram/xlnx-bbram.c +++ b/hw/nvram/xlnx-bbram.c @@ -502,7 +502,7 @@ static void bbram_prop_release_drive(Object *obj, const char *name, } static const PropertyInfo bbram_prop_drive = { - .name = "str", + .type = "str", .description = "Node name or ID of a block device to use as BBRAM backend", .realized_set_allowed = true, .get = bbram_prop_get_drive, diff --git a/hw/nvram/xlnx-efuse.c b/hw/nvram/xlnx-efuse.c index e2e8311a48..29e7dd539e 100644 --- a/hw/nvram/xlnx-efuse.c +++ b/hw/nvram/xlnx-efuse.c @@ -257,7 +257,7 @@ static void efuse_prop_release_drive(Object *obj, const char *name, } static const PropertyInfo efuse_prop_drive = { - .name = "str", + .type = "str", .description = "Node name or ID of a block device to use as eFUSE backend", .realized_set_allowed = true, .get = efuse_prop_get_drive, diff --git a/hw/openrisc/openrisc_sim.c b/hw/openrisc/openrisc_sim.c index d9e0744922..83d7c2a8af 100644 --- a/hw/openrisc/openrisc_sim.c +++ b/hw/openrisc/openrisc_sim.c @@ -306,8 +306,6 @@ static void openrisc_sim_init(MachineState *machine) exit(1); } - cpu_openrisc_clock_init(cpus[n]); - qemu_register_reset(main_cpu_reset, cpus[n]); } diff --git a/hw/openrisc/virt.c b/hw/openrisc/virt.c index 9afe407b00..3055306783 100644 --- a/hw/openrisc/virt.c +++ b/hw/openrisc/virt.c @@ -487,8 +487,6 @@ static void openrisc_virt_init(MachineState *machine) exit(1); } - cpu_openrisc_clock_init(cpus[n]); - qemu_register_reset(main_cpu_reset, cpus[n]); } diff --git a/hw/pci-bridge/pcie_pci_bridge.c b/hw/pci-bridge/pcie_pci_bridge.c index fd4514a595..2429503cfb 100644 --- a/hw/pci-bridge/pcie_pci_bridge.c +++ b/hw/pci-bridge/pcie_pci_bridge.c @@ -52,11 +52,10 @@ static void pcie_pci_bridge_realize(PCIDevice *d, Error **errp) goto cap_error; } - pos = pci_add_capability(d, PCI_CAP_ID_PM, 0, PCI_PM_SIZEOF, errp); + pos = pci_pm_init(d, 0, errp); if (pos < 0) { goto pm_error; } - d->exp.pm_cap = pos; pci_set_word(d->config + pos + PCI_PM_PMC, 0x3); pcie_cap_arifwd_init(d); diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 1d42847ef0..2844ec5556 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -77,7 +77,7 @@ static void prop_pci_busnr_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_pci_busnr = { - .name = "busnr", + .type = "busnr", .get = prop_pci_busnr_get, }; @@ -435,6 +435,84 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg) attrs, NULL); } +/* + * Register and track a PM capability. If wmask is also enabled for the power + * state field of the pmcsr register, guest writes may change the device PM + * state. BAR access is only enabled while the device is in the D0 state. + * Return the capability offset or negative error code. + */ +int pci_pm_init(PCIDevice *d, uint8_t offset, Error **errp) +{ + int cap = pci_add_capability(d, PCI_CAP_ID_PM, offset, PCI_PM_SIZEOF, errp); + + if (cap < 0) { + return cap; + } + + d->pm_cap = cap; + d->cap_present |= QEMU_PCI_CAP_PM; + + return cap; +} + +static uint8_t pci_pm_state(PCIDevice *d) +{ + uint16_t pmcsr; + + if (!(d->cap_present & QEMU_PCI_CAP_PM)) { + return 0; + } + + pmcsr = pci_get_word(d->config + d->pm_cap + PCI_PM_CTRL); + + return pmcsr & PCI_PM_CTRL_STATE_MASK; +} + +/* + * Update the PM capability state based on the new value stored in config + * space respective to the old, pre-write state provided. If the new value + * is rejected (unsupported or invalid transition) restore the old value. + * Return the resulting PM state. + */ +static uint8_t pci_pm_update(PCIDevice *d, uint32_t addr, int l, uint8_t old) +{ + uint16_t pmc; + uint8_t new; + + if (!(d->cap_present & QEMU_PCI_CAP_PM) || + !range_covers_byte(addr, l, d->pm_cap + PCI_PM_CTRL)) { + return old; + } + + new = pci_pm_state(d); + if (new == old) { + return old; + } + + pmc = pci_get_word(d->config + d->pm_cap + PCI_PM_PMC); + + /* + * Transitions to D1 & D2 are only allowed if supported. Devices may + * only transition to higher D-states or to D0. + */ + if ((!(pmc & PCI_PM_CAP_D1) && new == 1) || + (!(pmc & PCI_PM_CAP_D2) && new == 2) || + (old && new && new < old)) { + pci_word_test_and_clear_mask(d->config + d->pm_cap + PCI_PM_CTRL, + PCI_PM_CTRL_STATE_MASK); + pci_word_test_and_set_mask(d->config + d->pm_cap + PCI_PM_CTRL, + old); + trace_pci_pm_bad_transition(d->name, pci_dev_bus_num(d), + PCI_SLOT(d->devfn), PCI_FUNC(d->devfn), + old, new); + return old; + } + + trace_pci_pm_transition(d->name, pci_dev_bus_num(d), PCI_SLOT(d->devfn), + PCI_FUNC(d->devfn), old, new); + return new; +} + static void pci_reset_regions(PCIDevice *dev) { int r; @@ -474,6 +552,11 @@ static void pci_do_device_reset(PCIDevice *dev) pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) | pci_get_word(dev->w1cmask + PCI_INTERRUPT_LINE)); dev->config[PCI_CACHE_LINE_SIZE] = 0x0; + /* Default PM state is D0 */ + if (dev->cap_present & QEMU_PCI_CAP_PM) { + pci_word_test_and_clear_mask(dev->config + dev->pm_cap + PCI_PM_CTRL, + PCI_PM_CTRL_STATE_MASK); + } pci_reset_regions(dev); pci_update_mappings(dev); @@ -1606,7 +1689,7 @@ static void pci_update_mappings(PCIDevice *d) continue; new_addr = pci_bar_address(d, i, r->type, r->size); - if (!d->enabled) { + if (!d->enabled || pci_pm_state(d)) { new_addr = PCI_BAR_UNMAPPED; } @@ -1672,6 +1755,7 @@ uint32_t pci_default_read_config(PCIDevice *d, void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int l) { + uint8_t new_pm_state, old_pm_state = pci_pm_state(d); int i, was_irq_disabled = pci_irq_disabled(d); uint32_t val = val_in; @@ -1684,11 +1768,16 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int d->config[addr + i] = (d->config[addr + i] & ~wmask) | (val & wmask); d->config[addr + i] &= ~(val & w1cmask); /* W1C: Write 1 to Clear */ } + + new_pm_state = pci_pm_update(d, addr, l, old_pm_state); + if (ranges_overlap(addr, l, PCI_BASE_ADDRESS_0, 24) || ranges_overlap(addr, l, PCI_ROM_ADDRESS, 4) || ranges_overlap(addr, l, PCI_ROM_ADDRESS1, 4) || - range_covers_byte(addr, l, PCI_COMMAND)) + range_covers_byte(addr, l, PCI_COMMAND) || + !!new_pm_state != !!old_pm_state) { pci_update_mappings(d); + } if (ranges_overlap(addr, l, PCI_COMMAND, 2)) { pci_update_irq_disabled(d, was_irq_disabled); diff --git a/hw/pci/trace-events b/hw/pci/trace-events index e98f575a9d..6a9968962f 100644 --- a/hw/pci/trace-events +++ b/hw/pci/trace-events @@ -1,6 +1,8 @@ # See docs/devel/tracing.rst for syntax documentation. # pci.c +pci_pm_bad_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x REJECTED PM transition D%d->D%d" +pci_pm_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x PM transition D%d->D%d" pci_update_mappings_del(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64 pci_update_mappings_add(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64 pci_route_irq(int dev_irq, const char *dev_path, int parent_irq, const char *parent_path) "IRQ %d @%s -> IRQ %d @%s" diff --git a/hw/s390x/ccw-device.c b/hw/s390x/ccw-device.c index 494faebb5a..1ea9934f6c 100644 --- a/hw/s390x/ccw-device.c +++ b/hw/s390x/ccw-device.c @@ -74,9 +74,9 @@ static void ccw_device_set_loadparm(Object *obj, Visitor *v, } const PropertyInfo ccw_loadparm = { - .name = "ccw_loadparm", - .description = "Up to 8 chars in set of [A-Za-z0-9. ] to pass" - " to the guest loader/kernel", + .type = "str", + .description = "Up to 8 chars in set of [A-Za-z0-9. ] to select" + " a guest kernel", .get = ccw_device_get_loadparm, .set = ccw_device_set_loadparm, }; diff --git a/hw/s390x/css.c b/hw/s390x/css.c index 4e27b2961b..738800c98d 100644 --- a/hw/s390x/css.c +++ b/hw/s390x/css.c @@ -2523,7 +2523,7 @@ out: } const PropertyInfo css_devid_propinfo = { - .name = "str", + .type = "str", .description = "Identifier of an I/O device in the channel " "subsystem, example: fe.1.23ab", .get = get_css_devid, @@ -2531,7 +2531,7 @@ const PropertyInfo css_devid_propinfo = { }; const PropertyInfo css_devid_ro_propinfo = { - .name = "str", + .type = "str", .description = "Read-only identifier of an I/O device in the channel " "subsystem, example: fe.1.23ab", .get = get_css_devid, diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 913d72cc74..04cdd4a11b 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -1495,7 +1495,8 @@ static void s390_pci_set_fid(Object *obj, Visitor *v, const char *name, } static const PropertyInfo s390_pci_fid_propinfo = { - .name = "zpci_fid", + .type = "uint32", + .description = "zpci_fid", .get = s390_pci_get_fid, .set = s390_pci_set_fid, }; diff --git a/hw/vfio/amd-xgbe.c b/hw/vfio/amd-xgbe.c index aaa96903db..5927503b5c 100644 --- a/hw/vfio/amd-xgbe.c +++ b/hw/vfio/amd-xgbe.c @@ -15,12 +15,14 @@ #include "hw/vfio/vfio-amd-xgbe.h" #include "migration/vmstate.h" #include "qemu/module.h" +#include "qemu/error-report.h" static void amd_xgbe_realize(DeviceState *dev, Error **errp) { VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev); VFIOAmdXgbeDeviceClass *k = VFIO_AMD_XGBE_DEVICE_GET_CLASS(dev); + warn_report("-device vfio-amd-xgbe is deprecated"); vdev->compat = g_strdup("amd,xgbe-seattle-v1a"); vdev->num_compat = 1; diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 30b08ad375..c7ab4ff57a 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -257,6 +257,15 @@ static void vfio_ap_class_init(ObjectClass *klass, void *data) dc->hotpluggable = true; device_class_set_legacy_reset(dc, vfio_ap_reset); dc->bus_type = TYPE_AP_BUS; + + object_class_property_set_description(klass, /* 3.1 */ + "sysfsdev", + "Host sysfs path of assigned device"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif } static const TypeInfo vfio_ap_info = { diff --git a/hw/vfio/calxeda-xgmac.c b/hw/vfio/calxeda-xgmac.c index b016d42b49..a5ef262def 100644 --- a/hw/vfio/calxeda-xgmac.c +++ b/hw/vfio/calxeda-xgmac.c @@ -15,12 +15,14 @@ #include "hw/vfio/vfio-calxeda-xgmac.h" #include "migration/vmstate.h" #include "qemu/module.h" +#include "qemu/error-report.h" static void calxeda_xgmac_realize(DeviceState *dev, Error **errp) { VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev); VFIOCalxedaXgmacDeviceClass *k = VFIO_CALXEDA_XGMAC_DEVICE_GET_CLASS(dev); + warn_report("-device vfio-calxeda-xgmac is deprecated"); vdev->compat = g_strdup("calxeda,hb-xgmac"); vdev->num_compat = 1; diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 67bc137f9b..e5e0d9e3e7 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -51,17 +51,8 @@ struct VFIOCCWDevice { EventNotifier crw_notifier; EventNotifier req_notifier; bool force_orb_pfch; - bool warned_orb_pfch; }; -static inline void warn_once_pfch(VFIOCCWDevice *vcdev, SubchDev *sch, - const char *msg) -{ - warn_report_once_cond(&vcdev->warned_orb_pfch, - "vfio-ccw (devno %x.%x.%04x): %s", - sch->cssid, sch->ssid, sch->devno, msg); -} - static void vfio_ccw_compute_needs_reset(VFIODevice *vdev) { vdev->needs_reset = false; @@ -83,7 +74,8 @@ static IOInstEnding vfio_ccw_handle_request(SubchDev *sch) if (!(sch->orb.ctrl0 & ORB_CTRL0_MASK_PFCH) && vcdev->force_orb_pfch) { sch->orb.ctrl0 |= ORB_CTRL0_MASK_PFCH; - warn_once_pfch(vcdev, sch, "PFCH flag forced"); + warn_report_once("vfio-ccw (devno %x.%x.%04x): PFCH flag forced", + sch->cssid, sch->ssid, sch->devno); } QEMU_BUILD_BUG_ON(sizeof(region->orb_area) != sizeof(ORB)); @@ -717,6 +709,21 @@ static void vfio_ccw_class_init(ObjectClass *klass, void *data) cdc->handle_halt = vfio_ccw_handle_halt; cdc->handle_clear = vfio_ccw_handle_clear; cdc->handle_store = vfio_ccw_handle_store; + + object_class_property_set_description(klass, /* 2.10 */ + "sysfsdev", + "Host sysfs path of assigned device"); + object_class_property_set_description(klass, /* 3.0 */ + "force-orb-pfch", + "Force unlimited prefetch"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif + object_class_property_set_description(klass, /* 9.2 */ + "loadparm", + "Define which devices that can be used for booting"); } static const TypeInfo vfio_ccw_info = { diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index bba776f75c..260d65febd 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -5,6 +5,7 @@ vfio_ss.add(files( 'container-base.c', 'container.c', 'migration.c', + 'migration-multifd.c', 'cpr.c', )) vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c new file mode 100644 index 0000000000..233724710b --- /dev/null +++ b/hw/vfio/migration-multifd.c @@ -0,0 +1,679 @@ +/* + * Multifd VFIO migration + * + * Copyright (C) 2024,2025 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/vfio/vfio-common.h" +#include "migration/misc.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/lockable.h" +#include "qemu/main-loop.h" +#include "qemu/thread.h" +#include "io/channel-buffer.h" +#include "migration/qemu-file.h" +#include "migration-multifd.h" +#include "trace.h" + +#define VFIO_DEVICE_STATE_CONFIG_STATE (1) + +#define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0) + +typedef struct VFIODeviceStatePacket { + uint32_t version; + uint32_t idx; + uint32_t flags; + uint8_t data[0]; +} QEMU_PACKED VFIODeviceStatePacket; + +/* type safety */ +typedef struct VFIOStateBuffers { + GArray *array; +} VFIOStateBuffers; + +typedef struct VFIOStateBuffer { + bool is_present; + char *data; + size_t len; +} VFIOStateBuffer; + +typedef struct VFIOMultifd { + bool load_bufs_thread_running; + bool load_bufs_thread_want_exit; + + VFIOStateBuffers load_bufs; + QemuCond load_bufs_buffer_ready_cond; + QemuCond load_bufs_thread_finished_cond; + QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */ + uint32_t load_buf_idx; + uint32_t load_buf_idx_last; +} VFIOMultifd; + +static void vfio_state_buffer_clear(gpointer data) +{ + VFIOStateBuffer *lb = data; + + if (!lb->is_present) { + return; + } + + g_clear_pointer(&lb->data, g_free); + lb->is_present = false; +} + +static void vfio_state_buffers_init(VFIOStateBuffers *bufs) +{ + bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer)); + g_array_set_clear_func(bufs->array, vfio_state_buffer_clear); +} + +static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs) +{ + g_clear_pointer(&bufs->array, g_array_unref); +} + +static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs) +{ + assert(bufs->array); +} + +static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs) +{ + return bufs->array->len; +} + +static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs, + unsigned int size) +{ + g_array_set_size(bufs->array, size); +} + +static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, + unsigned int idx) +{ + return &g_array_index(bufs->array, VFIOStateBuffer, idx); +} + +/* called with load_bufs_mutex locked */ +static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev, + VFIODeviceStatePacket *packet, + size_t packet_total_size, + Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + VFIOStateBuffer *lb; + + vfio_state_buffers_assert_init(&multifd->load_bufs); + if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) { + vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1); + } + + lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx); + if (lb->is_present) { + error_setg(errp, "%s: state buffer %" PRIu32 " already filled", + vbasedev->name, packet->idx); + return false; + } + + assert(packet->idx >= multifd->load_buf_idx); + + lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet)); + lb->len = packet_total_size - sizeof(*packet); + lb->is_present = true; + + return true; +} + +bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, + Error **errp) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data; + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + error_setg(errp, + "%s: got device state packet but not doing multifd transfer", + vbasedev->name); + return false; + } + + assert(multifd); + + if (data_size < sizeof(*packet)) { + error_setg(errp, "%s: packet too short at %zu (min is %zu)", + vbasedev->name, data_size, sizeof(*packet)); + return false; + } + + if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) { + error_setg(errp, "%s: packet has unknown version %" PRIu32, + vbasedev->name, packet->version); + return false; + } + + if (packet->idx == UINT32_MAX) { + error_setg(errp, "%s: packet index is invalid", vbasedev->name); + return false; + } + + trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx); + + /* + * Holding BQL here would violate the lock order and can cause + * a deadlock once we attempt to lock load_bufs_mutex below. + */ + assert(!bql_locked()); + + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + /* config state packet should be the last one in the stream */ + if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) { + multifd->load_buf_idx_last = packet->idx; + } + + if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size, + errp)) { + return false; + } + + qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); + } + + return true; +} + +static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev, + Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + VFIOStateBuffer *lb; + g_autoptr(QIOChannelBuffer) bioc = NULL; + g_autoptr(QEMUFile) f_out = NULL, f_in = NULL; + uint64_t mig_header; + int ret; + + assert(multifd->load_buf_idx == multifd->load_buf_idx_last); + lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx); + assert(lb->is_present); + + bioc = qio_channel_buffer_new(lb->len); + qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load"); + + f_out = qemu_file_new_output(QIO_CHANNEL(bioc)); + qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len); + + ret = qemu_fflush(f_out); + if (ret) { + error_setg(errp, "%s: load config state flush failed: %d", + vbasedev->name, ret); + return false; + } + + qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL); + f_in = qemu_file_new_input(QIO_CHANNEL(bioc)); + + mig_header = qemu_get_be64(f_in); + if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) { + error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64, + vbasedev->name, mig_header); + return false; + } + + bql_lock(); + ret = vfio_load_device_config_state(f_in, vbasedev); + bql_unlock(); + + if (ret < 0) { + error_setg(errp, "%s: vfio_load_device_config_state() failed: %d", + vbasedev->name, ret); + return false; + } + + return true; +} + +static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd) +{ + VFIOStateBuffer *lb; + unsigned int bufs_len; + + bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs); + if (multifd->load_buf_idx >= bufs_len) { + assert(multifd->load_buf_idx == bufs_len); + return NULL; + } + + lb = vfio_state_buffers_at(&multifd->load_bufs, + multifd->load_buf_idx); + if (!lb->is_present) { + return NULL; + } + + return lb; +} + +static bool vfio_load_state_buffer_write(VFIODevice *vbasedev, + VFIOStateBuffer *lb, + Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + g_autofree char *buf = NULL; + char *buf_cur; + size_t buf_len; + + if (!lb->len) { + return true; + } + + trace_vfio_load_state_device_buffer_load_start(vbasedev->name, + multifd->load_buf_idx); + + /* lb might become re-allocated when we drop the lock */ + buf = g_steal_pointer(&lb->data); + buf_cur = buf; + buf_len = lb->len; + while (buf_len > 0) { + ssize_t wr_ret; + int errno_save; + + /* + * Loading data to the device takes a while, + * drop the lock during this process. + */ + qemu_mutex_unlock(&multifd->load_bufs_mutex); + wr_ret = write(migration->data_fd, buf_cur, buf_len); + errno_save = errno; + qemu_mutex_lock(&multifd->load_bufs_mutex); + + if (wr_ret < 0) { + error_setg(errp, + "%s: writing state buffer %" PRIu32 " failed: %d", + vbasedev->name, multifd->load_buf_idx, errno_save); + return false; + } + + assert(wr_ret <= buf_len); + buf_len -= wr_ret; + buf_cur += wr_ret; + } + + trace_vfio_load_state_device_buffer_load_end(vbasedev->name, + multifd->load_buf_idx); + + return true; +} + +static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd, + bool *should_quit) +{ + return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit); +} + +/* + * This thread is spawned by vfio_multifd_switchover_start() which gets + * called upon encountering the switchover point marker in main migration + * stream. + * + * It exits after either: + * * completing loading the remaining device state and device config, OR: + * * encountering some error while doing the above, OR: + * * being forcefully aborted by the migration core by it setting should_quit + * or by vfio_load_cleanup_load_bufs_thread() setting + * multifd->load_bufs_thread_want_exit. + */ +static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + bool ret = false; + + trace_vfio_load_bufs_thread_start(vbasedev->name); + + assert(multifd); + QEMU_LOCK_GUARD(&multifd->load_bufs_mutex); + + assert(multifd->load_bufs_thread_running); + + while (true) { + VFIOStateBuffer *lb; + + /* + * Always check cancellation first after the buffer_ready wait below in + * case that cond was signalled by vfio_load_cleanup_load_bufs_thread(). + */ + if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) { + error_setg(errp, "operation cancelled"); + goto thread_exit; + } + + assert(multifd->load_buf_idx <= multifd->load_buf_idx_last); + + lb = vfio_load_state_buffer_get(multifd); + if (!lb) { + trace_vfio_load_state_device_buffer_starved(vbasedev->name, + multifd->load_buf_idx); + qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond, + &multifd->load_bufs_mutex); + continue; + } + + if (multifd->load_buf_idx == multifd->load_buf_idx_last) { + break; + } + + if (multifd->load_buf_idx == 0) { + trace_vfio_load_state_device_buffer_start(vbasedev->name); + } + + if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) { + goto thread_exit; + } + + if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) { + trace_vfio_load_state_device_buffer_end(vbasedev->name); + } + + multifd->load_buf_idx++; + } + + if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) { + goto thread_exit; + } + + ret = true; + +thread_exit: + /* + * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that + * this thread is exiting. + */ + multifd->load_bufs_thread_running = false; + qemu_cond_signal(&multifd->load_bufs_thread_finished_cond); + + trace_vfio_load_bufs_thread_end(vbasedev->name); + + return ret; +} + +static VFIOMultifd *vfio_multifd_new(void) +{ + VFIOMultifd *multifd = g_new(VFIOMultifd, 1); + + vfio_state_buffers_init(&multifd->load_bufs); + + qemu_mutex_init(&multifd->load_bufs_mutex); + + multifd->load_buf_idx = 0; + multifd->load_buf_idx_last = UINT32_MAX; + qemu_cond_init(&multifd->load_bufs_buffer_ready_cond); + + multifd->load_bufs_thread_running = false; + multifd->load_bufs_thread_want_exit = false; + qemu_cond_init(&multifd->load_bufs_thread_finished_cond); + + return multifd; +} + +/* + * Terminates vfio_load_bufs_thread by setting + * multifd->load_bufs_thread_want_exit and signalling all the conditions + * the thread could be blocked on. + * + * Waits for the thread to signal that it had finished. + */ +static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd) +{ + /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ + bql_unlock(); + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + while (multifd->load_bufs_thread_running) { + multifd->load_bufs_thread_want_exit = true; + + qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); + qemu_cond_wait(&multifd->load_bufs_thread_finished_cond, + &multifd->load_bufs_mutex); + } + } + bql_lock(); +} + +static void vfio_multifd_free(VFIOMultifd *multifd) +{ + vfio_load_cleanup_load_bufs_thread(multifd); + + qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond); + vfio_state_buffers_destroy(&multifd->load_bufs); + qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond); + qemu_mutex_destroy(&multifd->load_bufs_mutex); + + g_free(multifd); +} + +void vfio_multifd_cleanup(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + g_clear_pointer(&migration->multifd, vfio_multifd_free); +} + +bool vfio_multifd_transfer_supported(void) +{ + return multifd_device_state_supported() && + migrate_send_switchover_start(); +} + +bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + return migration->multifd_transfer; +} + +bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + + /* + * Make a copy of this setting at the start in case it is changed + * mid-migration. + */ + if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) { + migration->multifd_transfer = vfio_multifd_transfer_supported(); + } else { + migration->multifd_transfer = + vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON; + } + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + /* Nothing further to check or do */ + return true; + } + + if (!vfio_multifd_transfer_supported()) { + error_setg(errp, + "%s: Multifd device transfer requested but unsupported in the current config", + vbasedev->name); + return false; + } + + if (alloc_multifd) { + assert(!migration->multifd); + migration->multifd = vfio_multifd_new(); + } + + return true; +} + +void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f) +{ + assert(vfio_multifd_transfer_enabled(vbasedev)); + + /* + * Emit dummy NOP data on the main migration channel since the actual + * device state transfer is done via multifd channels. + */ + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); +} + +static bool +vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev, + char *idstr, + uint32_t instance_id, + uint32_t idx, + Error **errp) +{ + g_autoptr(QIOChannelBuffer) bioc = NULL; + g_autoptr(QEMUFile) f = NULL; + int ret; + g_autofree VFIODeviceStatePacket *packet = NULL; + size_t packet_len; + + bioc = qio_channel_buffer_new(0); + qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save"); + + f = qemu_file_new_output(QIO_CHANNEL(bioc)); + + if (vfio_save_device_config_state(f, vbasedev, errp)) { + return false; + } + + ret = qemu_fflush(f); + if (ret) { + error_setg(errp, "%s: save config state flush failed: %d", + vbasedev->name, ret); + return false; + } + + packet_len = sizeof(*packet) + bioc->usage; + packet = g_malloc0(packet_len); + packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT; + packet->idx = idx; + packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; + memcpy(&packet->data, bioc->data, bioc->usage); + + if (!multifd_queue_device_state(idstr, instance_id, + (char *)packet, packet_len)) { + error_setg(errp, "%s: multifd config data queuing failed", + vbasedev->name); + return false; + } + + vfio_mig_add_bytes_transferred(packet_len); + + return true; +} + +/* + * This thread is spawned by the migration core directly via + * .save_live_complete_precopy_thread SaveVMHandler. + * + * It exits after either: + * * completing saving the remaining device state and device config, OR: + * * encountering some error while doing the above, OR: + * * being forcefully aborted by the migration core by + * multifd_device_state_save_thread_should_exit() returning true. + */ +bool +vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, + Error **errp) +{ + VFIODevice *vbasedev = d->handler_opaque; + VFIOMigration *migration = vbasedev->migration; + bool ret = false; + g_autofree VFIODeviceStatePacket *packet = NULL; + uint32_t idx; + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + /* Nothing to do, vfio_save_complete_precopy() does the transfer. */ + return true; + } + + trace_vfio_save_complete_precopy_thread_start(vbasedev->name, + d->idstr, d->instance_id); + + /* We reach here with device state STOP or STOP_COPY only */ + if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY, + VFIO_DEVICE_STATE_STOP, errp)) { + goto thread_exit; + } + + packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); + packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT; + + for (idx = 0; ; idx++) { + ssize_t data_size; + size_t packet_size; + + if (multifd_device_state_save_thread_should_exit()) { + error_setg(errp, "operation cancelled"); + goto thread_exit; + } + + data_size = read(migration->data_fd, &packet->data, + migration->data_buffer_size); + if (data_size < 0) { + error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d", + vbasedev->name, idx, errno); + goto thread_exit; + } else if (data_size == 0) { + break; + } + + packet->idx = idx; + packet_size = sizeof(*packet) + data_size; + + if (!multifd_queue_device_state(d->idstr, d->instance_id, + (char *)packet, packet_size)) { + error_setg(errp, "%s: multifd data queuing failed", vbasedev->name); + goto thread_exit; + } + + vfio_mig_add_bytes_transferred(packet_size); + } + + if (!vfio_save_complete_precopy_thread_config_state(vbasedev, + d->idstr, + d->instance_id, + idx, errp)) { + goto thread_exit; + } + + ret = true; + +thread_exit: + trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret); + + return ret; +} + +int vfio_multifd_switchover_start(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + + assert(multifd); + + /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ + bql_unlock(); + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + assert(!multifd->load_bufs_thread_running); + multifd->load_bufs_thread_running = true; + } + bql_lock(); + + qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev); + + return 0; +} diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h new file mode 100644 index 0000000000..a664051eb8 --- /dev/null +++ b/hw/vfio/migration-multifd.h @@ -0,0 +1,34 @@ +/* + * Multifd VFIO migration + * + * Copyright (C) 2024,2025 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_MIGRATION_MULTIFD_H +#define HW_VFIO_MIGRATION_MULTIFD_H + +#include "hw/vfio/vfio-common.h" + +bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp); +void vfio_multifd_cleanup(VFIODevice *vbasedev); + +bool vfio_multifd_transfer_supported(void); +bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev); + +bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, + Error **errp); + +void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f); + +bool +vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, + Error **errp); + +int vfio_multifd_switchover_start(VFIODevice *vbasedev); + +#endif diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index adfa752db5..416643ddd6 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -23,6 +23,7 @@ #include "migration/qemu-file.h" #include "migration/register.h" #include "migration/blocker.h" +#include "migration-multifd.h" #include "qapi/error.h" #include "qapi/qapi-events-vfio.h" #include "exec/ramlist.h" @@ -32,30 +33,13 @@ #include "hw/hw.h" /* - * Flags to be used as unique delimiters for VFIO devices in the migration - * stream. These flags are composed as: - * 0xffffffff => MSB 32-bit all 1s - * 0xef10 => Magic ID, represents emulated (virtual) function IO - * 0x0000 => 16-bits reserved for flags - * - * The beginning of state information is marked by _DEV_CONFIG_STATE, - * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a - * certain state information is marked by _END_OF_STATE. - */ -#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) -#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) -#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) -#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) -#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) - -/* * This is an arbitrary size based on migration of mlx5 devices, where typically * total device migration size is on the order of 100s of MB. Testing with * larger values, e.g. 128MB and 1GB, did not show a performance improvement. */ #define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB) -static int64_t bytes_transferred; +static unsigned long bytes_transferred; static const char *mig_state_to_str(enum vfio_device_mig_state state) { @@ -136,10 +120,10 @@ static void vfio_migration_set_device_state(VFIODevice *vbasedev, vfio_migration_send_event(vbasedev); } -static int vfio_migration_set_state(VFIODevice *vbasedev, - enum vfio_device_mig_state new_state, - enum vfio_device_mig_state recover_state, - Error **errp) +int vfio_migration_set_state(VFIODevice *vbasedev, + enum vfio_device_mig_state new_state, + enum vfio_device_mig_state recover_state, + Error **errp) { VFIOMigration *migration = vbasedev->migration; uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + @@ -254,8 +238,7 @@ static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev, return ret; } -static int vfio_save_device_config_state(QEMUFile *f, void *opaque, - Error **errp) +int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp) { VFIODevice *vbasedev = opaque; int ret; @@ -280,11 +263,13 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque, return ret; } -static int vfio_load_device_config_state(QEMUFile *f, void *opaque) +int vfio_load_device_config_state(QEMUFile *f, void *opaque) { VFIODevice *vbasedev = opaque; uint64_t data; + trace_vfio_load_device_config_state_start(vbasedev->name); + if (vbasedev->ops && vbasedev->ops->vfio_load_config) { int ret; @@ -303,7 +288,7 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque) return -EINVAL; } - trace_vfio_load_device_config_state(vbasedev->name); + trace_vfio_load_device_config_state_end(vbasedev->name); return qemu_file_get_error(f); } @@ -389,7 +374,7 @@ static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration) qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); qemu_put_be64(f, data_size); qemu_put_buffer(f, migration->data_buffer, data_size); - bytes_transferred += data_size; + vfio_mig_add_bytes_transferred(data_size); trace_vfio_save_block(migration->vbasedev->name, data_size); @@ -467,6 +452,10 @@ static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp) uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE; int ret; + if (!vfio_multifd_setup(vbasedev, false, errp)) { + return -EINVAL; + } + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); vfio_query_stop_copy_size(vbasedev, &stop_copy_size); @@ -523,6 +512,9 @@ static void vfio_save_cleanup(void *opaque) Error *local_err = NULL; int ret; + /* Currently a NOP, done for symmetry with load_cleanup() */ + vfio_multifd_cleanup(vbasedev); + /* * Changing device state from STOP_COPY to STOP can take time. Do it here, * after migration has completed, so it won't increase downtime. @@ -645,6 +637,11 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) int ret; Error *local_err = NULL; + if (vfio_multifd_transfer_enabled(vbasedev)) { + vfio_multifd_emit_dummy_eos(vbasedev, f); + return 0; + } + trace_vfio_save_complete_precopy_start(vbasedev->name); /* We reach here with device state STOP or STOP_COPY only */ @@ -676,6 +673,11 @@ static void vfio_save_state(QEMUFile *f, void *opaque) Error *local_err = NULL; int ret; + if (vfio_multifd_transfer_enabled(vbasedev)) { + vfio_multifd_emit_dummy_eos(vbasedev, f); + return; + } + ret = vfio_save_device_config_state(f, opaque, &local_err); if (ret) { error_prepend(&local_err, @@ -688,15 +690,28 @@ static void vfio_save_state(QEMUFile *f, void *opaque) static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp) { VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret; + + if (!vfio_multifd_setup(vbasedev, true, errp)) { + return -EINVAL; + } + + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING, + migration->device_state, errp); + if (ret) { + return ret; + } - return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING, - vbasedev->migration->device_state, errp); + return 0; } static int vfio_load_cleanup(void *opaque) { VFIODevice *vbasedev = opaque; + vfio_multifd_cleanup(vbasedev); + vfio_migration_cleanup(vbasedev); trace_vfio_load_cleanup(vbasedev->name); @@ -717,6 +732,13 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) switch (data) { case VFIO_MIG_FLAG_DEV_CONFIG_STATE: { + if (vfio_multifd_transfer_enabled(vbasedev)) { + error_report("%s: got DEV_CONFIG_STATE in main migration " + "channel but doing multifd transfer", + vbasedev->name); + return -EINVAL; + } + return vfio_load_device_config_state(f, opaque); } case VFIO_MIG_FLAG_DEV_SETUP_STATE: @@ -782,6 +804,17 @@ static bool vfio_switchover_ack_needed(void *opaque) return vfio_precopy_supported(vbasedev); } +static int vfio_switchover_start(void *opaque) +{ + VFIODevice *vbasedev = opaque; + + if (vfio_multifd_transfer_enabled(vbasedev)) { + return vfio_multifd_switchover_start(vbasedev); + } + + return 0; +} + static const SaveVMHandlers savevm_vfio_handlers = { .save_prepare = vfio_save_prepare, .save_setup = vfio_save_setup, @@ -796,6 +829,12 @@ static const SaveVMHandlers savevm_vfio_handlers = { .load_cleanup = vfio_load_cleanup, .load_state = vfio_load_state, .switchover_ack_needed = vfio_switchover_ack_needed, + /* + * Multifd support + */ + .load_state_buffer = vfio_multifd_load_state_buffer, + .switchover_start = vfio_switchover_start, + .save_live_complete_precopy_thread = vfio_multifd_save_complete_precopy_thread, }; /* ---------------------------------------------------------------------- */ @@ -1011,12 +1050,17 @@ static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp) int64_t vfio_mig_bytes_transferred(void) { - return bytes_transferred; + return MIN(qatomic_read(&bytes_transferred), INT64_MAX); } void vfio_reset_bytes_transferred(void) { - bytes_transferred = 0; + qatomic_set(&bytes_transferred, 0); +} + +void vfio_mig_add_bytes_transferred(unsigned long val) +{ + qatomic_add(&bytes_transferred, val); } /* diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index fbe43b0a79..c53591fe2b 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -1480,7 +1480,7 @@ static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v, } const PropertyInfo qdev_prop_nv_gpudirect_clique = { - .name = "uint4", + .type = "uint8", .description = "NVIDIA GPUDirect Clique ID (0 - 15)", .get = get_nv_gpudirect_clique_id, .set = set_nv_gpudirect_clique_id, diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 89d900e9cf..fdbc15885d 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2215,8 +2215,12 @@ static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) break; case PCI_CAP_ID_PM: vfio_check_pm_reset(vdev, pos); - vdev->pm_cap = pos; - ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0; + ret = pci_pm_init(pdev, pos, errp) >= 0; + /* + * PCI-core config space emulation needs write access to the power + * state enabled for tracking BAR mapping relative to PM state. + */ + pci_set_word(pdev->wmask + pos + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK); break; case PCI_CAP_ID_AF: vfio_check_af_flr(vdev, pos); @@ -2406,18 +2410,27 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) vfio_disable_interrupts(vdev); + /* + * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master. + * Also put INTx Disable in known state. + */ + cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); + cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | + PCI_COMMAND_INTX_DISABLE); + vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); + /* Make sure the device is in D0 */ - if (vdev->pm_cap) { + if (pdev->pm_cap) { uint16_t pmcsr; uint8_t state; - pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); + pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2); state = pmcsr & PCI_PM_CTRL_STATE_MASK; if (state) { pmcsr &= ~PCI_PM_CTRL_STATE_MASK; - vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); + vfio_pci_write_config(pdev, pdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); /* vfio handles the necessary delay here */ - pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); + pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2); state = pmcsr & PCI_PM_CTRL_STATE_MASK; if (state) { error_report("vfio: Unable to power on device, stuck in D%d", @@ -2425,15 +2438,6 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) } } } - - /* - * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master. - * Also put INTx Disable in known state. - */ - cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); - cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | - PCI_COMMAND_INTX_DISABLE); - vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); } void vfio_pci_post_reset(VFIOPCIDevice *vdev) @@ -3353,6 +3357,8 @@ static void vfio_instance_init(Object *obj) pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; } +static PropertyInfo vfio_pci_migration_multifd_transfer_prop; + static const Property vfio_pci_dev_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token), @@ -3377,6 +3383,10 @@ static const Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice, vbasedev.enable_migration, ON_OFF_AUTO_AUTO), + DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice, + vbasedev.migration_multifd_transfer, + vfio_pci_migration_multifd_transfer_prop, OnOffAuto, + .set_default = true, .defval.i = ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice, vbasedev.migration_events, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), @@ -3433,6 +3443,126 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) pdc->exit = vfio_exitfn; pdc->config_read = vfio_pci_read_config; pdc->config_write = vfio_pci_write_config; + + object_class_property_set_description(klass, /* 1.3 */ + "host", + "Host PCI address [domain:]<bus:slot.function> of assigned device"); + object_class_property_set_description(klass, /* 1.3 */ + "x-intx-mmap-timeout-ms", + "When EOI is not provided by KVM/QEMU, wait time " + "(milliseconds) to re-enable device direct access " + "after INTx (DEBUG)"); + object_class_property_set_description(klass, /* 1.5 */ + "x-vga", + "Expose VGA address spaces for device"); + object_class_property_set_description(klass, /* 2.3 */ + "x-req", + "Disable device request notification support (DEBUG)"); + object_class_property_set_description(klass, /* 2.4 and 2.5 */ + "x-no-mmap", + "Disable MMAP for device. Allows to trace MMIO " + "accesses (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-no-kvm-intx", + "Disable direct VFIO->KVM INTx injection. Allows to " + "trace INTx interrupts (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-no-kvm-msi", + "Disable direct VFIO->KVM MSI injection. Allows to " + "trace MSI interrupts (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-no-kvm-msix", + "Disable direct VFIO->KVM MSIx injection. Allows to " + "trace MSIx interrupts (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-vendor-id", + "Override PCI Vendor ID with provided value (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-device-id", + "Override PCI device ID with provided value (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-sub-vendor-id", + "Override PCI Subsystem Vendor ID with provided value " + "(DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-sub-device-id", + "Override PCI Subsystem Device ID with provided value " + "(DEBUG)"); + object_class_property_set_description(klass, /* 2.6 */ + "sysfsdev", + "Host sysfs path of assigned device"); + object_class_property_set_description(klass, /* 2.7 */ + "x-igd-opregion", + "Expose host IGD OpRegion to guest"); + object_class_property_set_description(klass, /* 2.7 (See c4c45e943e51) */ + "x-igd-gms", + "Override IGD data stolen memory size (32MiB units)"); + object_class_property_set_description(klass, /* 2.11 */ + "x-nv-gpudirect-clique", + "Add NVIDIA GPUDirect capability indicating P2P DMA " + "clique for device [0-15]"); + object_class_property_set_description(klass, /* 2.12 */ + "x-no-geforce-quirks", + "Disable GeForce quirks (for NVIDIA Quadro/GRID/Tesla). " + "Improves performance"); + object_class_property_set_description(klass, /* 2.12 */ + "display", + "Enable display support for device, ex. vGPU"); + object_class_property_set_description(klass, /* 2.12 */ + "x-msix-relocation", + "Specify MSI-X MMIO relocation to the end of specified " + "existing BAR or new BAR to avoid virtualization overhead " + "due to adjacent device registers"); + object_class_property_set_description(klass, /* 3.0 */ + "x-no-kvm-ioeventfd", + "Disable registration of ioeventfds with KVM (DEBUG)"); + object_class_property_set_description(klass, /* 3.0 */ + "x-no-vfio-ioeventfd", + "Disable linking of KVM ioeventfds to VFIO ioeventfds " + "(DEBUG)"); + object_class_property_set_description(klass, /* 3.1 */ + "x-balloon-allowed", + "Override allowing ballooning with device (DEBUG, DANGER)"); + object_class_property_set_description(klass, /* 3.2 */ + "xres", + "Set X display resolution the vGPU should use"); + object_class_property_set_description(klass, /* 3.2 */ + "yres", + "Set Y display resolution the vGPU should use"); + object_class_property_set_description(klass, /* 5.2 */ + "x-pre-copy-dirty-page-tracking", + "Disable dirty pages tracking during iterative phase " + "(DEBUG)"); + object_class_property_set_description(klass, /* 5.2, 8.0 non-experimetal */ + "enable-migration", + "Enale device migration. Also requires a host VFIO PCI " + "variant or mdev driver with migration support enabled"); + object_class_property_set_description(klass, /* 8.1 */ + "vf-token", + "Specify UUID VF token. Required for VF when PF is owned " + "by another VFIO driver"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif + object_class_property_set_description(klass, /* 9.1 */ + "x-device-dirty-page-tracking", + "Disable device dirty page tracking and use " + "container-based dirty page tracking (DEBUG)"); + object_class_property_set_description(klass, /* 9.1 */ + "migration-events", + "Emit VFIO migration QAPI event when a VFIO device " + "changes its migration state. For management applications"); + object_class_property_set_description(klass, /* 9.1 */ + "skip-vsc-check", + "Skip config space check for Vendor Specific Capability. " + "Setting to false will enforce strict checking of VSC content " + "(DEBUG)"); + object_class_property_set_description(klass, /* 10.0 */ + "x-migration-multifd-transfer", + "Transfer this device state via " + "multifd channels when live migrating it"); } static const TypeInfo vfio_pci_dev_info = { @@ -3461,6 +3591,15 @@ static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, vfio_pci_dev_nohotplug_properties); dc->hotpluggable = false; + + object_class_property_set_description(klass, /* 3.1 */ + "ramfb", + "Enable ramfb to provide pre-boot graphics for devices " + "enabling display option"); + object_class_property_set_description(klass, /* 8.2 */ + "x-ramfb-migrate", + "Override default migration support for ramfb support " + "(DEBUG)"); } static const TypeInfo vfio_pci_nohotplug_dev_info = { @@ -3472,6 +3611,17 @@ static const TypeInfo vfio_pci_nohotplug_dev_info = { static void register_vfio_pci_dev_type(void) { + /* + * Ordinary ON_OFF_AUTO property isn't runtime-mutable, but source VM can + * run for a long time before being migrated so it is desirable to have a + * fallback mechanism to the old way of transferring VFIO device state if + * it turns to be necessary. + * The following makes this type of property have the same mutability level + * as ordinary migration parameters. + */ + vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto; + vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true; + type_register_static(&vfio_pci_dev_info); type_register_static(&vfio_pci_nohotplug_dev_info); } diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 43c166680a..d638c781f6 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -160,7 +160,6 @@ struct VFIOPCIDevice { int32_t bootindex; uint32_t igd_gms; OffAutoPCIBAR msix_relo; - uint8_t pm_cap; uint8_t nv_gpudirect_clique; bool pci_aer; bool req_enabled; diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index f491f4dc95..67bc57409c 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -575,6 +575,7 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) VFIODevice *vbasedev = &vdev->vbasedev; int i; + warn_report("-device vfio-platform is deprecated"); qemu_mutex_init(&vdev->intp_mutex); trace_vfio_platform_realize(vbasedev->sysfsdev ? @@ -672,6 +673,30 @@ static void vfio_platform_class_init(ObjectClass *klass, void *data) dc->desc = "VFIO-based platform device assignment"; sbc->connect_irq_notifier = vfio_start_irqfd_injection; set_bit(DEVICE_CATEGORY_MISC, dc->categories); + + object_class_property_set_description(klass, /* 2.4 */ + "host", + "Host device name of assigned device"); + object_class_property_set_description(klass, /* 2.4 and 2.5 */ + "x-no-mmap", + "Disable MMAP for device. Allows to trace MMIO " + "accesses (DEBUG)"); + object_class_property_set_description(klass, /* 2.4 */ + "mmap-timeout-ms", + "When EOI is not provided by KVM/QEMU, wait time " + "(milliseconds) to re-enable device direct access " + "after level interrupt (DEBUG)"); + object_class_property_set_description(klass, /* 2.4 */ + "x-irqfd", + "Allow disabling irqfd support (DEBUG)"); + object_class_property_set_description(klass, /* 2.6 */ + "sysfsdev", + "Host sysfs path of assigned device"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif } static const TypeInfo vfio_platform_dev_info = { diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index c5385e1a4f..9347e3a5f6 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -149,10 +149,19 @@ vfio_display_edid_update(uint32_t prefx, uint32_t prefy) "%ux%u" vfio_display_edid_write_error(void) "" # migration.c +vfio_load_bufs_thread_start(const char *name) " (%s)" +vfio_load_bufs_thread_end(const char *name) " (%s)" vfio_load_cleanup(const char *name) " (%s)" -vfio_load_device_config_state(const char *name) " (%s)" +vfio_load_device_config_state_start(const char *name) " (%s)" +vfio_load_device_config_state_end(const char *name) " (%s)" vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d" +vfio_load_state_device_buffer_incoming(const char *name, uint32_t idx) " (%s) idx %"PRIu32 +vfio_load_state_device_buffer_start(const char *name) " (%s)" +vfio_load_state_device_buffer_starved(const char *name, uint32_t idx) " (%s) idx %"PRIu32 +vfio_load_state_device_buffer_load_start(const char *name, uint32_t idx) " (%s) idx %"PRIu32 +vfio_load_state_device_buffer_load_end(const char *name, uint32_t idx) " (%s) idx %"PRIu32 +vfio_load_state_device_buffer_end(const char *name) " (%s)" vfio_migration_realize(const char *name) " (%s)" vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s" vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s" @@ -162,6 +171,8 @@ vfio_save_block_precopy_empty_hit(const char *name) " (%s)" vfio_save_cleanup(const char *name) " (%s)" vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d" vfio_save_complete_precopy_start(const char *name) " (%s)" +vfio_save_complete_precopy_thread_start(const char *name, const char *idstr, uint32_t instance_id) " (%s) idstr %s instance %"PRIu32 +vfio_save_complete_precopy_thread_end(const char *name, int ret) " (%s) ret %d" vfio_save_device_config_state(const char *name) " (%s)" vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size %"PRIu64" precopy dirty size %"PRIu64 vfio_save_iterate_start(const char *name) " (%s)" diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index c773a9130c..3ca3f849d3 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -2204,14 +2204,11 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) pos = pcie_endpoint_cap_init(pci_dev, 0); assert(pos > 0); - pos = pci_add_capability(pci_dev, PCI_CAP_ID_PM, 0, - PCI_PM_SIZEOF, errp); + pos = pci_pm_init(pci_dev, 0, errp); if (pos < 0) { return; } - pci_dev->exp.pm_cap = pos; - /* * Indicates that this function complies with revision 1.2 of the * PCI Power Management Interface Specification. @@ -2310,11 +2307,11 @@ static bool virtio_pci_no_soft_reset(PCIDevice *dev) { uint16_t pmcsr; - if (!pci_is_express(dev) || !dev->exp.pm_cap) { + if (!pci_is_express(dev) || !(dev->cap_present & QEMU_PCI_CAP_PM)) { return false; } - pmcsr = pci_get_word(dev->config + dev->exp.pm_cap + PCI_PM_CTRL); + pmcsr = pci_get_word(dev->config + dev->pm_cap + PCI_PM_CTRL); /* * When No_Soft_Reset bit is set and the device @@ -2343,7 +2340,7 @@ static void virtio_pci_bus_reset_hold(Object *obj, ResetType type) if (proxy->flags & VIRTIO_PCI_FLAG_INIT_PM) { pci_word_test_and_clear_mask( - dev->config + dev->exp.pm_cap + PCI_PM_CTRL, + dev->config + dev->pm_cap + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK); } } diff --git a/include/hw/core/accel-cpu.h b/include/accel/accel-cpu-target.h index 24dad45ab9..37dde7fae3 100644 --- a/include/hw/core/accel-cpu.h +++ b/include/accel/accel-cpu-target.h @@ -8,8 +8,8 @@ * See the COPYING file in the top-level directory. */ -#ifndef ACCEL_CPU_H -#define ACCEL_CPU_H +#ifndef ACCEL_CPU_TARGET_H +#define ACCEL_CPU_TARGET_H /* * This header is used to define new accelerator-specific target-specific @@ -20,6 +20,9 @@ * subclasses in target/, or the accel implementation itself in accel/ */ +#include "qom/object.h" +#include "cpu.h" + #define TYPE_ACCEL_CPU "accel-" CPU_RESOLVING_TYPE #define ACCEL_CPU_NAME(name) (name "-" TYPE_ACCEL_CPU) typedef struct AccelCPUClass AccelCPUClass; diff --git a/include/hw/core/tcg-cpu-ops.h b/include/accel/tcg/cpu-ops.h index 2e3f1690f1..2e3f1690f1 100644 --- a/include/hw/core/tcg-cpu-ops.h +++ b/include/accel/tcg/cpu-ops.h diff --git a/include/block/aio.h b/include/block/aio.h index 43883a8a33..b2ab3514de 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -54,7 +54,7 @@ typedef void QEMUBHFunc(void *opaque); typedef bool AioPollFn(void *opaque); typedef void IOHandler(void *opaque); -struct ThreadPool; +struct ThreadPoolAio; struct LinuxAioState; typedef struct LuringState LuringState; @@ -207,7 +207,7 @@ struct AioContext { /* Thread pool for performing work and receiving completion callbacks. * Has its own locking. */ - struct ThreadPool *thread_pool; + struct ThreadPoolAio *thread_pool; #ifdef CONFIG_LINUX_AIO struct LinuxAioState *linux_aio; @@ -500,8 +500,8 @@ void aio_set_event_notifier_poll(AioContext *ctx, */ GSource *aio_get_g_source(AioContext *ctx); -/* Return the ThreadPool bound to this AioContext */ -struct ThreadPool *aio_get_thread_pool(AioContext *ctx); +/* Return the ThreadPoolAio bound to this AioContext */ +struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx); /* Setup the LinuxAioState bound to this AioContext */ struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp); diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index bb91a0f62f..ebb4e56a50 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -28,7 +28,6 @@ #include "block/block-common.h" #include "block/block-global-state.h" #include "block/snapshot.h" -#include "qemu/clang-tsa.h" #include "qemu/iov.h" #include "qemu/rcu.h" #include "qemu/stats64.h" diff --git a/include/block/graph-lock.h b/include/block/graph-lock.h index dc8d949184..2c26c72108 100644 --- a/include/block/graph-lock.h +++ b/include/block/graph-lock.h @@ -20,8 +20,6 @@ #ifndef GRAPH_LOCK_H #define GRAPH_LOCK_H -#include "qemu/clang-tsa.h" - /** * Graph Lock API * This API provides a rwlock used to protect block layer diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h index 948ff5f30c..dd48cf07e8 100644 --- a/include/block/thread-pool.h +++ b/include/block/thread-pool.h @@ -24,20 +24,70 @@ typedef int ThreadPoolFunc(void *opaque); -typedef struct ThreadPool ThreadPool; +typedef struct ThreadPoolAio ThreadPoolAio; -ThreadPool *thread_pool_new(struct AioContext *ctx); -void thread_pool_free(ThreadPool *pool); +ThreadPoolAio *thread_pool_new_aio(struct AioContext *ctx); +void thread_pool_free_aio(ThreadPoolAio *pool); /* - * thread_pool_submit* API: submit I/O requests in the thread's + * thread_pool_submit_{aio,co} API: submit I/O requests in the thread's * current AioContext. */ BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, BlockCompletionFunc *cb, void *opaque); int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg); -void thread_pool_submit(ThreadPoolFunc *func, void *arg); +void thread_pool_update_params(ThreadPoolAio *pool, struct AioContext *ctx); + +/* ------------------------------------------- */ +/* Generic thread pool types and methods below */ +typedef struct ThreadPool ThreadPool; + +/* Create a new thread pool. Never returns NULL. */ +ThreadPool *thread_pool_new(void); + +/* + * Free the thread pool. + * Waits for all the previously submitted work to complete before performing + * the actual freeing operation. + */ +void thread_pool_free(ThreadPool *pool); + +/* + * Submit a new work (task) for the pool. + * + * @opaque_destroy is an optional GDestroyNotify for the @opaque argument + * to the work function at @func. + */ +void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, + void *opaque, GDestroyNotify opaque_destroy); + +/* + * Submit a new work (task) for the pool, making sure it starts getting + * processed immediately, launching a new thread for it if necessary. + * + * @opaque_destroy is an optional GDestroyNotify for the @opaque argument + * to the work function at @func. + */ +void thread_pool_submit_immediate(ThreadPool *pool, ThreadPoolFunc *func, + void *opaque, GDestroyNotify opaque_destroy); + +/* + * Wait for all previously submitted work to complete before returning. + * + * Can be used as a barrier between two sets of tasks executed on a thread + * pool without destroying it or in a performance sensitive path where the + * caller just wants to wait for all tasks to complete while deferring the + * pool free operation for later, less performance sensitive time. + */ +void thread_pool_wait(ThreadPool *pool); -void thread_pool_update_params(ThreadPool *pool, struct AioContext *ctx); +/* Set the maximum number of threads in the pool. */ +bool thread_pool_set_max_threads(ThreadPool *pool, int max_threads); + +/* + * Adjust the maximum number of threads in the pool to give each task its + * own thread (exactly one thread per task). + */ +bool thread_pool_adjust_max_threads_to_work(ThreadPool *pool); #endif diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index d9045c9ac4..8eb0df48f9 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -30,16 +30,6 @@ #if !defined(CONFIG_USER_ONLY) && defined(CONFIG_TCG) /* cputlb.c */ /** - * tlb_init - initialize a CPU's TLB - * @cpu: CPU whose TLB should be initialized - */ -void tlb_init(CPUState *cpu); -/** - * tlb_destroy - destroy a CPU's TLB - * @cpu: CPU whose TLB should be destroyed - */ -void tlb_destroy(CPUState *cpu); -/** * tlb_flush_page: * @cpu: CPU whose TLB should be flushed * @addr: virtual address of page to be flushed @@ -223,12 +213,6 @@ void tlb_set_page(CPUState *cpu, vaddr addr, hwaddr paddr, int prot, int mmu_idx, vaddr size); #else -static inline void tlb_init(CPUState *cpu) -{ -} -static inline void tlb_destroy(CPUState *cpu) -{ -} static inline void tlb_flush_page(CPUState *cpu, vaddr addr) { } diff --git a/include/exec/page-protection.h b/include/exec/page-protection.h index bae3355f62..3e0a8a0333 100644 --- a/include/exec/page-protection.h +++ b/include/exec/page-protection.h @@ -40,8 +40,6 @@ #ifdef CONFIG_USER_ONLY -#include "qemu/clang-tsa.h" - void TSA_NO_TSA mmap_lock(void); void TSA_NO_TSA mmap_unlock(void); bool have_mmap_lock(void); diff --git a/include/exec/poison.h b/include/exec/poison.h index f4283f693a..d6d4832854 100644 --- a/include/exec/poison.h +++ b/include/exec/poison.h @@ -48,6 +48,7 @@ #pragma GCC poison TARGET_PAGE_MASK #pragma GCC poison TARGET_PAGE_BITS #pragma GCC poison TARGET_PAGE_ALIGN +#pragma GCC poison TARGET_PHYS_ADDR_SPACE_BITS #pragma GCC poison CPU_INTERRUPT_HARD #pragma GCC poison CPU_INTERRUPT_EXITTB diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index fb397cdfc5..9dd6ac7c76 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -124,7 +124,9 @@ struct SysemuCPUOps; * @get_pc: Callback for getting the Program Counter register. * As above, with the semantics of the target architecture. * @gdb_read_register: Callback for letting GDB read a register. + * No more than @gdb_num_core_regs registers can be read. * @gdb_write_register: Callback for letting GDB write a register. + * No more than @gdb_num_core_regs registers can be written. * @gdb_adjust_breakpoint: Callback for adjusting the address of a * breakpoint. Used by AVR to handle a gdb mis-feature with * its Harvard architecture split code and data. @@ -614,6 +616,8 @@ extern bool mttcg_enabled; */ bool cpu_paging_enabled(const CPUState *cpu); +#if !defined(CONFIG_USER_ONLY) + /** * cpu_get_memory_mapping: * @cpu: The CPU whose memory mappings are to be obtained. @@ -625,8 +629,6 @@ bool cpu_paging_enabled(const CPUState *cpu); bool cpu_get_memory_mapping(CPUState *cpu, MemoryMappingList *list, Error **errp); -#if !defined(CONFIG_USER_ONLY) - /** * cpu_write_elf64_note: * @f: pointer to a function that writes memory to a file diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 4002bbeebd..c220cc8449 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -216,6 +216,8 @@ enum { QEMU_PCIE_ARI_NEXTFN_1 = (1 << QEMU_PCIE_ARI_NEXTFN_1_BITNR), #define QEMU_PCIE_EXT_TAG_BITNR 13 QEMU_PCIE_EXT_TAG = (1 << QEMU_PCIE_EXT_TAG_BITNR), +#define QEMU_PCI_CAP_PM_BITNR 14 + QEMU_PCI_CAP_PM = (1 << QEMU_PCI_CAP_PM_BITNR), }; typedef struct PCIINTxRoute { @@ -676,5 +678,6 @@ static inline void pci_irq_deassert(PCIDevice *pci_dev) MSIMessage pci_get_msi_message(PCIDevice *dev, int vector); void pci_set_enabled(PCIDevice *pci_dev, bool state); void pci_set_power(PCIDevice *pci_dev, bool state); +int pci_pm_init(PCIDevice *pci_dev, uint8_t offset, Error **errp); #endif diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h index add208edfa..345b12eaac 100644 --- a/include/hw/pci/pci_device.h +++ b/include/hw/pci/pci_device.h @@ -105,6 +105,9 @@ struct PCIDevice { /* Capability bits */ uint32_t cap_present; + /* Offset of PM capability in config space */ + uint8_t pm_cap; + /* Offset of MSI-X capability in config space */ uint8_t msix_cap; diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h index b8d59732bc..70a5de09de 100644 --- a/include/hw/pci/pcie.h +++ b/include/hw/pci/pcie.h @@ -58,8 +58,6 @@ typedef enum { struct PCIExpressDevice { /* Offset of express capability in config space */ uint8_t exp_cap; - /* Offset of Power Management capability in config space */ - uint8_t pm_cap; /* SLOT */ bool hpev_notified; /* Logical AND of conditions for hot plug event. diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h index bf27375a3c..15fcec5260 100644 --- a/include/hw/qdev-properties.h +++ b/include/hw/qdev-properties.h @@ -30,7 +30,7 @@ struct Property { }; struct PropertyInfo { - const char *name; + const char *type; const char *description; const QEnumLookup *enum_table; bool realized_set_allowed; /* allow setting property on realized device */ @@ -49,7 +49,6 @@ struct PropertyInfo { extern const PropertyInfo qdev_prop_bit; extern const PropertyInfo qdev_prop_bit64; extern const PropertyInfo qdev_prop_bool; -extern const PropertyInfo qdev_prop_enum; extern const PropertyInfo qdev_prop_uint8; extern const PropertyInfo qdev_prop_uint16; extern const PropertyInfo qdev_prop_uint32; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index ac35136a11..04b123a6c9 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -36,6 +36,23 @@ #define VFIO_MSG_PREFIX "vfio %s: " +/* + * Flags to be used as unique delimiters for VFIO devices in the migration + * stream. These flags are composed as: + * 0xffffffff => MSB 32-bit all 1s + * 0xef10 => Magic ID, represents emulated (virtual) function IO + * 0x0000 => 16-bits reserved for flags + * + * The beginning of state information is marked by _DEV_CONFIG_STATE, + * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a + * certain state information is marked by _END_OF_STATE. + */ +#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) +#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) +#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) +#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) + enum { VFIO_DEVICE_TYPE_PCI = 0, VFIO_DEVICE_TYPE_PLATFORM = 1, @@ -61,6 +78,8 @@ typedef struct VFIORegion { uint8_t nr; /* cache the region number for debug */ } VFIORegion; +typedef struct VFIOMultifd VFIOMultifd; + typedef struct VFIOMigration { struct VFIODevice *vbasedev; VMChangeStateEntry *vm_state; @@ -72,6 +91,8 @@ typedef struct VFIOMigration { uint64_t mig_flags; uint64_t precopy_init_size; uint64_t precopy_dirty_size; + bool multifd_transfer; + VFIOMultifd *multifd; bool initial_data_sent; bool event_save_iterate_started; @@ -133,6 +154,7 @@ typedef struct VFIODevice { bool no_mmap; bool ram_block_discard_allowed; OnOffAuto enable_migration; + OnOffAuto migration_multifd_transfer; bool migration_events; VFIODeviceOps *ops; unsigned int num_irqs; @@ -274,9 +296,13 @@ void vfio_unblock_multiple_devices_migration(void); bool vfio_viommu_preset(VFIODevice *vbasedev); int64_t vfio_mig_bytes_transferred(void); void vfio_reset_bytes_transferred(void); +void vfio_mig_add_bytes_transferred(unsigned long val); bool vfio_device_state_is_running(VFIODevice *vbasedev); bool vfio_device_state_is_precopy(VFIODevice *vbasedev); +int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp); +int vfio_load_device_config_state(QEMUFile *f, void *opaque); + #ifdef CONFIG_LINUX int vfio_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info); @@ -291,6 +317,11 @@ struct vfio_info_cap_header * vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id); struct vfio_info_cap_header * vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id); + +int vfio_migration_set_state(VFIODevice *vbasedev, + enum vfio_device_mig_state new_state, + enum vfio_device_mig_state recover_state, + Error **errp); #endif bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); diff --git a/include/migration/client-options.h b/include/migration/client-options.h index 59f4b55cf4..289c9d7762 100644 --- a/include/migration/client-options.h +++ b/include/migration/client-options.h @@ -10,6 +10,10 @@ #ifndef QEMU_MIGRATION_CLIENT_OPTIONS_H #define QEMU_MIGRATION_CLIENT_OPTIONS_H + +/* properties */ +bool migrate_send_switchover_start(void); + /* capabilities */ bool migrate_background_snapshot(void); diff --git a/include/migration/misc.h b/include/migration/misc.h index c660be8095..8fd36eba1d 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -45,9 +45,12 @@ bool migrate_ram_is_ignored(RAMBlock *block); /* migration/block.c */ AnnounceParameters *migrate_announce_params(void); + /* migration/savevm.c */ void dump_vmstate_json_to_file(FILE *out_fp); +void qemu_loadvm_start_load_thread(MigrationLoadThread function, + void *opaque); /* migration/migration.c */ void migration_object_init(void); @@ -115,4 +118,26 @@ bool migrate_is_uri(const char *uri); bool migrate_uri_parse(const char *uri, MigrationChannel **channel, Error **errp); +/* migration/multifd-device-state.c */ +typedef struct SaveLiveCompletePrecopyThreadData { + SaveLiveCompletePrecopyThreadHandler hdlr; + char *idstr; + uint32_t instance_id; + void *handler_opaque; +} SaveLiveCompletePrecopyThreadData; + +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, + char *data, size_t len); +bool multifd_device_state_supported(void); + +void +multifd_spawn_device_state_save_thread(SaveLiveCompletePrecopyThreadHandler hdlr, + char *idstr, uint32_t instance_id, + void *opaque); + +bool multifd_device_state_save_thread_should_exit(void); + +void multifd_abort_device_state_save_threads(void); +bool multifd_join_device_state_save_threads(void); + #endif diff --git a/include/migration/register.h b/include/migration/register.h index f60e797894..c041ce32f2 100644 --- a/include/migration/register.h +++ b/include/migration/register.h @@ -69,7 +69,9 @@ typedef struct SaveVMHandlers { /** * @save_cleanup * - * Uninitializes the data structures on the source + * Uninitializes the data structures on the source. + * Note that this handler can be called even if save_setup + * wasn't called earlier. * * @opaque: data pointer passed to register_savevm_live() */ @@ -103,6 +105,25 @@ typedef struct SaveVMHandlers { */ int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); + /** + * @save_live_complete_precopy_thread (invoked in a separate thread) + * + * Called at the end of a precopy phase from a separate worker thread + * in configurations where multifd device state transfer is supported + * in order to perform asynchronous transmission of the remaining data in + * parallel with @save_live_complete_precopy handlers. + * When postcopy is enabled, devices that support postcopy will skip this + * step. + * + * @d: a #SaveLiveCompletePrecopyThreadData containing parameters that the + * handler may need, including this device section idstr and instance_id, + * and opaque data pointer passed to register_savevm_live(). + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns true to indicate success and false for errors. + */ + SaveLiveCompletePrecopyThreadHandler save_live_complete_precopy_thread; + /* This runs both outside and inside the BQL. */ /** @@ -228,6 +249,21 @@ typedef struct SaveVMHandlers { int (*load_state)(QEMUFile *f, void *opaque, int version_id); /** + * @load_state_buffer (invoked outside the BQL) + * + * Load device state buffer provided to qemu_loadvm_load_state_buffer(). + * + * @opaque: data pointer passed to register_savevm_live() + * @buf: the data buffer to load + * @len: the data length in buffer + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns true to indicate success and false for errors. + */ + bool (*load_state_buffer)(void *opaque, char *buf, size_t len, + Error **errp); + + /** * @load_setup * * Initializes the data structures on the destination. @@ -244,6 +280,8 @@ typedef struct SaveVMHandlers { * @load_cleanup * * Uninitializes the data structures on the destination. + * Note that this handler can be called even if load_setup + * wasn't called earlier. * * @opaque: data pointer passed to register_savevm_live() * @@ -275,6 +313,18 @@ typedef struct SaveVMHandlers { * otherwise */ bool (*switchover_ack_needed)(void *opaque); + + /** + * @switchover_start + * + * Notifies that the switchover has started. Called only on + * the destination. + * + * @opaque: data pointer passed to register_savevm_live() + * + * Returns zero to indicate success and negative for error + */ + int (*switchover_start)(void *opaque); } SaveVMHandlers; /** diff --git a/include/qapi/error.h b/include/qapi/error.h index f5fe216262..41e3816380 100644 --- a/include/qapi/error.h +++ b/include/qapi/error.h @@ -437,6 +437,8 @@ Error *error_copy(const Error *err); */ void error_free(Error *err); +G_DEFINE_AUTOPTR_CLEANUP_FUNC(Error, error_free) + /* * Convenience function to assert that *@errp is set, then silently free it. */ diff --git a/include/qemu/clang-tsa.h b/include/qemu/clang-tsa.h deleted file mode 100644 index ba06fb8c92..0000000000 --- a/include/qemu/clang-tsa.h +++ /dev/null @@ -1,114 +0,0 @@ -#ifndef CLANG_TSA_H -#define CLANG_TSA_H - -/* - * Copyright 2018 Jarkko Hietaniemi <jhi@iki.fi> - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without - * limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/* http://clang.llvm.org/docs/ThreadSafetyAnalysis.html - * - * TSA is available since clang 3.6-ish. - */ -#ifdef __clang__ -# define TSA(x) __attribute__((x)) -#else -# define TSA(x) /* No TSA, make TSA attributes no-ops. */ -#endif - -/* TSA_CAPABILITY() is used to annotate typedefs: - * - * typedef pthread_mutex_t TSA_CAPABILITY("mutex") tsa_mutex; - */ -#define TSA_CAPABILITY(x) TSA(capability(x)) - -/* TSA_GUARDED_BY() is used to annotate global variables, - * the data is guarded: - * - * Foo foo TSA_GUARDED_BY(mutex); - */ -#define TSA_GUARDED_BY(x) TSA(guarded_by(x)) - -/* TSA_PT_GUARDED_BY() is used to annotate global pointers, the data - * behind the pointer is guarded. - * - * Foo* ptr TSA_PT_GUARDED_BY(mutex); - */ -#define TSA_PT_GUARDED_BY(x) TSA(pt_guarded_by(x)) - -/* The TSA_REQUIRES() is used to annotate functions: the caller of the - * function MUST hold the resource, the function will NOT release it. - * - * More than one mutex may be specified, comma-separated. - * - * void Foo(void) TSA_REQUIRES(mutex); - */ -#define TSA_REQUIRES(...) TSA(requires_capability(__VA_ARGS__)) -#define TSA_REQUIRES_SHARED(...) TSA(requires_shared_capability(__VA_ARGS__)) - -/* TSA_EXCLUDES() is used to annotate functions: the caller of the - * function MUST NOT hold resource, the function first acquires the - * resource, and then releases it. - * - * More than one mutex may be specified, comma-separated. - * - * void Foo(void) TSA_EXCLUDES(mutex); - */ -#define TSA_EXCLUDES(...) TSA(locks_excluded(__VA_ARGS__)) - -/* TSA_ACQUIRE() is used to annotate functions: the caller of the - * function MUST NOT hold the resource, the function will acquire the - * resource, but NOT release it. - * - * More than one mutex may be specified, comma-separated. - * - * void Foo(void) TSA_ACQUIRE(mutex); - */ -#define TSA_ACQUIRE(...) TSA(acquire_capability(__VA_ARGS__)) -#define TSA_ACQUIRE_SHARED(...) TSA(acquire_shared_capability(__VA_ARGS__)) - -/* TSA_RELEASE() is used to annotate functions: the caller of the - * function MUST hold the resource, but the function will then release it. - * - * More than one mutex may be specified, comma-separated. - * - * void Foo(void) TSA_RELEASE(mutex); - */ -#define TSA_RELEASE(...) TSA(release_capability(__VA_ARGS__)) -#define TSA_RELEASE_SHARED(...) TSA(release_shared_capability(__VA_ARGS__)) - -/* TSA_NO_TSA is used to annotate functions. Use only when you need to. - * - * void Foo(void) TSA_NO_TSA; - */ -#define TSA_NO_TSA TSA(no_thread_safety_analysis) - -/* - * TSA_ASSERT() is used to annotate functions: This function will assert that - * the lock is held. When it returns, the caller of the function is assumed to - * already hold the resource. - * - * More than one mutex may be specified, comma-separated. - */ -#define TSA_ASSERT(...) TSA(assert_capability(__VA_ARGS__)) -#define TSA_ASSERT_SHARED(...) TSA(assert_shared_capability(__VA_ARGS__)) - -#endif /* #ifndef CLANG_TSA_H */ diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h index d904408e5e..496dac5ac1 100644 --- a/include/qemu/compiler.h +++ b/include/qemu/compiler.h @@ -208,6 +208,102 @@ #endif /* + * http://clang.llvm.org/docs/ThreadSafetyAnalysis.html + * + * TSA is available since clang 3.6-ish. + */ +#ifdef __clang__ +# define TSA(x) __attribute__((x)) +#else +# define TSA(x) /* No TSA, make TSA attributes no-ops. */ +#endif + +/* + * TSA_CAPABILITY() is used to annotate typedefs: + * + * typedef pthread_mutex_t TSA_CAPABILITY("mutex") tsa_mutex; + */ +#define TSA_CAPABILITY(x) TSA(capability(x)) + +/* + * TSA_GUARDED_BY() is used to annotate global variables, + * the data is guarded: + * + * Foo foo TSA_GUARDED_BY(mutex); + */ +#define TSA_GUARDED_BY(x) TSA(guarded_by(x)) + +/* + * TSA_PT_GUARDED_BY() is used to annotate global pointers, the data + * behind the pointer is guarded. + * + * Foo* ptr TSA_PT_GUARDED_BY(mutex); + */ +#define TSA_PT_GUARDED_BY(x) TSA(pt_guarded_by(x)) + +/* + * The TSA_REQUIRES() is used to annotate functions: the caller of the + * function MUST hold the resource, the function will NOT release it. + * + * More than one mutex may be specified, comma-separated. + * + * void Foo(void) TSA_REQUIRES(mutex); + */ +#define TSA_REQUIRES(...) TSA(requires_capability(__VA_ARGS__)) +#define TSA_REQUIRES_SHARED(...) TSA(requires_shared_capability(__VA_ARGS__)) + +/* + * TSA_EXCLUDES() is used to annotate functions: the caller of the + * function MUST NOT hold resource, the function first acquires the + * resource, and then releases it. + * + * More than one mutex may be specified, comma-separated. + * + * void Foo(void) TSA_EXCLUDES(mutex); + */ +#define TSA_EXCLUDES(...) TSA(locks_excluded(__VA_ARGS__)) + +/* + * TSA_ACQUIRE() is used to annotate functions: the caller of the + * function MUST NOT hold the resource, the function will acquire the + * resource, but NOT release it. + * + * More than one mutex may be specified, comma-separated. + * + * void Foo(void) TSA_ACQUIRE(mutex); + */ +#define TSA_ACQUIRE(...) TSA(acquire_capability(__VA_ARGS__)) +#define TSA_ACQUIRE_SHARED(...) TSA(acquire_shared_capability(__VA_ARGS__)) + +/* + * TSA_RELEASE() is used to annotate functions: the caller of the + * function MUST hold the resource, but the function will then release it. + * + * More than one mutex may be specified, comma-separated. + * + * void Foo(void) TSA_RELEASE(mutex); + */ +#define TSA_RELEASE(...) TSA(release_capability(__VA_ARGS__)) +#define TSA_RELEASE_SHARED(...) TSA(release_shared_capability(__VA_ARGS__)) + +/* + * TSA_NO_TSA is used to annotate functions. Use only when you need to. + * + * void Foo(void) TSA_NO_TSA; + */ +#define TSA_NO_TSA TSA(no_thread_safety_analysis) + +/* + * TSA_ASSERT() is used to annotate functions: This function will assert that + * the lock is held. When it returns, the caller of the function is assumed to + * already hold the resource. + * + * More than one mutex may be specified, comma-separated. + */ +#define TSA_ASSERT(...) TSA(assert_capability(__VA_ARGS__)) +#define TSA_ASSERT_SHARED(...) TSA(assert_shared_capability(__VA_ARGS__)) + +/* * Ugly CPP trick that is like "defined FOO", but also works in C * code. Useful to replace #ifdef with "if" statements; assumes * the symbol was defined with Meson's "config.set()", so it is empty diff --git a/include/qemu/thread.h b/include/qemu/thread.h index 7eba27a704..6f800aad31 100644 --- a/include/qemu/thread.h +++ b/include/qemu/thread.h @@ -3,7 +3,6 @@ #include "qemu/processor.h" #include "qemu/atomic.h" -#include "qemu/clang-tsa.h" typedef struct QemuCond QemuCond; typedef struct QemuSemaphore QemuSemaphore; diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 3d84efcac4..507f0814d5 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -22,6 +22,7 @@ * Please keep this list in case-insensitive alphabetical order. */ typedef struct AccelCPUState AccelCPUState; +typedef struct AccelOpsClass AccelOpsClass; typedef struct AccelState AccelState; typedef struct AddressSpace AddressSpace; typedef struct AioContext AioContext; @@ -108,6 +109,7 @@ typedef struct QString QString; typedef struct RAMBlock RAMBlock; typedef struct Range Range; typedef struct ReservedRegion ReservedRegion; +typedef struct SaveLiveCompletePrecopyThreadData SaveLiveCompletePrecopyThreadData; typedef struct SHPCDevice SHPCDevice; typedef struct SSIBus SSIBus; typedef struct TCGCPUOps TCGCPUOps; @@ -131,5 +133,9 @@ typedef struct IRQState *qemu_irq; * Function types */ typedef void (*qemu_irq_handler)(void *opaque, int n, int level); +typedef bool (*MigrationLoadThread)(void *opaque, bool *should_quit, + Error **errp); +typedef bool (*SaveLiveCompletePrecopyThreadHandler)(SaveLiveCompletePrecopyThreadData *d, + Error **errp); #endif /* QEMU_TYPEDEFS_H */ diff --git a/include/system/accel-ops.h b/include/system/accel-ops.h index 137fb96d44..4c99d25aef 100644 --- a/include/system/accel-ops.h +++ b/include/system/accel-ops.h @@ -17,7 +17,6 @@ #define TYPE_ACCEL_OPS "accel" ACCEL_OPS_SUFFIX #define ACCEL_OPS_NAME(name) (name "-" TYPE_ACCEL_OPS) -typedef struct AccelOpsClass AccelOpsClass; DECLARE_CLASS_CHECKERS(AccelOpsClass, ACCEL_OPS, TYPE_ACCEL_OPS) /** diff --git a/include/system/arch_init.h b/include/system/arch_init.h index 5b1c1026f3..d8b7744048 100644 --- a/include/system/arch_init.h +++ b/include/system/arch_init.h @@ -27,6 +27,4 @@ enum { extern const uint32_t arch_type; -void qemu_init_arch_modules(void); - #endif diff --git a/include/system/cpus.h b/include/system/cpus.h index 3d8fd368f3..3226c765d0 100644 --- a/include/system/cpus.h +++ b/include/system/cpus.h @@ -1,8 +1,6 @@ #ifndef QEMU_CPUS_H #define QEMU_CPUS_H -#include "system/accel-ops.h" - /* register accel-specific operations */ void cpus_register_accel(const AccelOpsClass *i); @@ -38,8 +36,6 @@ void resume_all_vcpus(void); void pause_all_vcpus(void); void cpu_stop_current(void); -extern int icount_align_option; - /* Unblock cpu */ void qemu_cpu_kick_self(void); diff --git a/linux-user/signal.c b/linux-user/signal.c index 81a98c6d02..4799b79ded 100644 --- a/linux-user/signal.c +++ b/linux-user/signal.c @@ -21,7 +21,7 @@ #include "qemu/cutils.h" #include "gdbstub/user.h" #include "exec/page-protection.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include <sys/ucontext.h> #include <sys/resource.h> diff --git a/migration/colo.c b/migration/colo.c index 9a8e5fbe9b..c976b3ff34 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -452,6 +452,9 @@ static int colo_do_checkpoint_transaction(MigrationState *s, bql_unlock(); goto out; } + + qemu_savevm_maybe_send_switchover_start(s->to_dst_file); + /* Note: device state is saved into buffer */ ret = qemu_save_device_state(fb); diff --git a/migration/meson.build b/migration/meson.build index d3bfe84d62..9aa48b290e 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -25,6 +25,7 @@ system_ss.add(files( 'migration-hmp-cmds.c', 'migration.c', 'multifd.c', + 'multifd-device-state.c', 'multifd-nocomp.c', 'multifd-zlib.c', 'multifd-zero-page.c', diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 3347e34c48..49c26daed3 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -46,6 +46,8 @@ static void migration_global_dump(Monitor *mon) ms->send_configuration ? "on" : "off"); monitor_printf(mon, "send-section-footer: %s\n", ms->send_section_footer ? "on" : "off"); + monitor_printf(mon, "send-switchover-start: %s\n", + ms->send_switchover_start ? "on" : "off"); monitor_printf(mon, "clear-bitmap-shift: %u\n", ms->clear_bitmap_shift); } diff --git a/migration/migration.c b/migration/migration.c index c597aa707e..1833cfe358 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -402,11 +402,24 @@ void migration_incoming_state_destroy(void) struct MigrationIncomingState *mis = migration_incoming_get_current(); multifd_recv_cleanup(); + /* * RAM state cleanup needs to happen after multifd cleanup, because * multifd threads can use some of its states (receivedmap). + * The VFIO load_cleanup() implementation is BQL-sensitive. It requires + * BQL must NOT be taken when recycling load threads, so that it won't + * block the load threads from making progress on address space + * modification operations. + * + * To make it work, we could try to not take BQL for all load_cleanup(), + * or conditionally unlock BQL only if bql_locked() in VFIO. + * + * Since most existing call sites take BQL for load_cleanup(), make + * it simple by taking BQL always as the rule, so that VFIO can unlock + * BQL and retake unconditionally. */ - qemu_loadvm_state_cleanup(); + assert(bql_locked()); + qemu_loadvm_state_cleanup(mis); if (mis->to_src_file) { /* Tell source that we are done */ @@ -2891,6 +2904,8 @@ static bool migration_switchover_start(MigrationState *s, Error **errp) precopy_notify_complete(); + qemu_savevm_maybe_send_switchover_start(s->to_dst_file); + return true; } diff --git a/migration/migration.h b/migration/migration.h index 4639e2a7e4..d53f7cad84 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -43,6 +43,7 @@ #define MIGRATION_THREAD_DST_PREEMPT "mig/dst/preempt" struct PostcopyBlocktimeContext; +typedef struct ThreadPool ThreadPool; #define MIGRATION_RESUME_ACK_VALUE (1) @@ -187,6 +188,10 @@ struct MigrationIncomingState { Coroutine *colo_incoming_co; QemuSemaphore colo_incoming_sem; + /* Optional load threads pool and its thread exit request flag */ + ThreadPool *load_threads; + bool load_threads_abort; + /* * PostcopyBlocktimeContext to keep information for postcopy * live migration, to calculate vCPU block time @@ -400,6 +405,8 @@ struct MigrationState { bool send_configuration; /* Whether we send section footer during migration */ bool send_section_footer; + /* Whether we send switchover start notification during migration */ + bool send_switchover_start; /* Needed by postcopy-pause state */ QemuSemaphore postcopy_pause_sem; diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c new file mode 100644 index 0000000000..94222d0eb0 --- /dev/null +++ b/migration/multifd-device-state.c @@ -0,0 +1,212 @@ +/* + * Multifd device state migration + * + * Copyright (C) 2024,2025 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/lockable.h" +#include "block/thread-pool.h" +#include "migration.h" +#include "migration/misc.h" +#include "multifd.h" +#include "options.h" + +static struct { + QemuMutex queue_job_mutex; + + MultiFDSendData *send_data; + + ThreadPool *threads; + bool threads_abort; +} *multifd_send_device_state; + +void multifd_device_state_send_setup(void) +{ + assert(!multifd_send_device_state); + multifd_send_device_state = g_malloc(sizeof(*multifd_send_device_state)); + + qemu_mutex_init(&multifd_send_device_state->queue_job_mutex); + + multifd_send_device_state->send_data = multifd_send_data_alloc(); + + multifd_send_device_state->threads = thread_pool_new(); + multifd_send_device_state->threads_abort = false; +} + +void multifd_device_state_send_cleanup(void) +{ + g_clear_pointer(&multifd_send_device_state->threads, thread_pool_free); + g_clear_pointer(&multifd_send_device_state->send_data, + multifd_send_data_free); + + qemu_mutex_destroy(&multifd_send_device_state->queue_job_mutex); + + g_clear_pointer(&multifd_send_device_state, g_free); +} + +void multifd_send_data_clear_device_state(MultiFDDeviceState_t *device_state) +{ + g_clear_pointer(&device_state->idstr, g_free); + g_clear_pointer(&device_state->buf, g_free); +} + +static void multifd_device_state_fill_packet(MultiFDSendParams *p) +{ + MultiFDDeviceState_t *device_state = &p->data->u.device_state; + MultiFDPacketDeviceState_t *packet = p->packet_device_state; + + packet->hdr.flags = cpu_to_be32(p->flags); + strncpy(packet->idstr, device_state->idstr, sizeof(packet->idstr) - 1); + packet->idstr[sizeof(packet->idstr) - 1] = 0; + packet->instance_id = cpu_to_be32(device_state->instance_id); + packet->next_packet_size = cpu_to_be32(p->next_packet_size); +} + +static void multifd_prepare_header_device_state(MultiFDSendParams *p) +{ + p->iov[0].iov_len = sizeof(*p->packet_device_state); + p->iov[0].iov_base = p->packet_device_state; + p->iovs_num++; +} + +void multifd_device_state_send_prepare(MultiFDSendParams *p) +{ + MultiFDDeviceState_t *device_state = &p->data->u.device_state; + + assert(multifd_payload_device_state(p->data)); + + multifd_prepare_header_device_state(p); + + assert(!(p->flags & MULTIFD_FLAG_SYNC)); + + p->next_packet_size = device_state->buf_len; + if (p->next_packet_size > 0) { + p->iov[p->iovs_num].iov_base = device_state->buf; + p->iov[p->iovs_num].iov_len = p->next_packet_size; + p->iovs_num++; + } + + p->flags |= MULTIFD_FLAG_NOCOMP | MULTIFD_FLAG_DEVICE_STATE; + + multifd_device_state_fill_packet(p); +} + +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, + char *data, size_t len) +{ + /* Device state submissions can come from multiple threads */ + QEMU_LOCK_GUARD(&multifd_send_device_state->queue_job_mutex); + MultiFDDeviceState_t *device_state; + + assert(multifd_payload_empty(multifd_send_device_state->send_data)); + + multifd_set_payload_type(multifd_send_device_state->send_data, + MULTIFD_PAYLOAD_DEVICE_STATE); + device_state = &multifd_send_device_state->send_data->u.device_state; + device_state->idstr = g_strdup(idstr); + device_state->instance_id = instance_id; + device_state->buf = g_memdup2(data, len); + device_state->buf_len = len; + + if (!multifd_send(&multifd_send_device_state->send_data)) { + multifd_send_data_clear(multifd_send_device_state->send_data); + return false; + } + + return true; +} + +bool multifd_device_state_supported(void) +{ + return migrate_multifd() && !migrate_mapped_ram() && + migrate_multifd_compression() == MULTIFD_COMPRESSION_NONE; +} + +static void multifd_device_state_save_thread_data_free(void *opaque) +{ + SaveLiveCompletePrecopyThreadData *data = opaque; + + g_clear_pointer(&data->idstr, g_free); + g_free(data); +} + +static int multifd_device_state_save_thread(void *opaque) +{ + SaveLiveCompletePrecopyThreadData *data = opaque; + g_autoptr(Error) local_err = NULL; + + if (!data->hdlr(data, &local_err)) { + MigrationState *s = migrate_get_current(); + + /* + * Can't call abort_device_state_save_threads() here since new + * save threads could still be in process of being launched + * (if, for example, the very first save thread launched exited + * with an error very quickly). + */ + + assert(local_err); + + /* + * In case of multiple save threads failing which thread error + * return we end setting is purely arbitrary. + */ + migrate_set_error(s, local_err); + } + + return 0; +} + +bool multifd_device_state_save_thread_should_exit(void) +{ + return qatomic_read(&multifd_send_device_state->threads_abort); +} + +void +multifd_spawn_device_state_save_thread(SaveLiveCompletePrecopyThreadHandler hdlr, + char *idstr, uint32_t instance_id, + void *opaque) +{ + SaveLiveCompletePrecopyThreadData *data; + + assert(multifd_device_state_supported()); + assert(multifd_send_device_state); + + assert(!qatomic_read(&multifd_send_device_state->threads_abort)); + + data = g_new(SaveLiveCompletePrecopyThreadData, 1); + data->hdlr = hdlr; + data->idstr = g_strdup(idstr); + data->instance_id = instance_id; + data->handler_opaque = opaque; + + thread_pool_submit_immediate(multifd_send_device_state->threads, + multifd_device_state_save_thread, + data, + multifd_device_state_save_thread_data_free); +} + +void multifd_abort_device_state_save_threads(void) +{ + assert(multifd_device_state_supported()); + + qatomic_set(&multifd_send_device_state->threads_abort, true); +} + +bool multifd_join_device_state_save_threads(void) +{ + MigrationState *s = migrate_get_current(); + + assert(multifd_device_state_supported()); + + thread_pool_wait(multifd_send_device_state->threads); + + return !migrate_has_error(s); +} diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c index 1325dba97c..ffe75256c9 100644 --- a/migration/multifd-nocomp.c +++ b/migration/multifd-nocomp.c @@ -14,6 +14,7 @@ #include "exec/ramblock.h" #include "exec/target_page.h" #include "file.h" +#include "migration-stats.h" #include "multifd.h" #include "options.h" #include "qapi/error.h" @@ -24,15 +25,14 @@ static MultiFDSendData *multifd_ram_send; -size_t multifd_ram_payload_size(void) +void multifd_ram_payload_alloc(MultiFDPages_t *pages) { - uint32_t n = multifd_ram_page_count(); + pages->offset = g_new0(ram_addr_t, multifd_ram_page_count()); +} - /* - * We keep an array of page offsets at the end of MultiFDPages_t, - * add space for it in the allocation. - */ - return sizeof(MultiFDPages_t) + n * sizeof(ram_addr_t); +void multifd_ram_payload_free(MultiFDPages_t *pages) +{ + g_clear_pointer(&pages->offset, g_free); } void multifd_ram_save_setup(void) @@ -42,8 +42,7 @@ void multifd_ram_save_setup(void) void multifd_ram_save_cleanup(void) { - g_free(multifd_ram_send); - multifd_ram_send = NULL; + g_clear_pointer(&multifd_ram_send, multifd_send_data_free); } static void multifd_set_file_bitmap(MultiFDSendParams *p) @@ -86,6 +85,13 @@ static void multifd_nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) return; } +static void multifd_ram_prepare_header(MultiFDSendParams *p) +{ + p->iov[0].iov_len = p->packet_len; + p->iov[0].iov_base = p->packet; + p->iovs_num++; +} + static void multifd_send_prepare_iovs(MultiFDSendParams *p) { MultiFDPages_t *pages = &p->data->u.ram; @@ -119,7 +125,7 @@ static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp) * Only !zerocopy needs the header in IOV; zerocopy will * send it separately. */ - multifd_send_prepare_header(p); + multifd_ram_prepare_header(p); } multifd_send_prepare_iovs(p); @@ -134,6 +140,8 @@ static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp) if (ret != 0) { return -1; } + + stat64_add(&mig_stats.multifd_bytes, p->packet_len); } return 0; @@ -432,7 +440,7 @@ int multifd_ram_flush_and_sync(QEMUFile *f) bool multifd_send_prepare_common(MultiFDSendParams *p) { MultiFDPages_t *pages = &p->data->u.ram; - multifd_send_prepare_header(p); + multifd_ram_prepare_header(p); multifd_send_zero_page_detect(p); if (!pages->normal_num) { diff --git a/migration/multifd.c b/migration/multifd.c index 215ad0414a..dfb5189f0e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -12,6 +12,7 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" +#include "qemu/iov.h" #include "qemu/rcu.h" #include "exec/target_page.h" #include "system/system.h" @@ -19,8 +20,10 @@ #include "qemu/error-report.h" #include "qapi/error.h" #include "file.h" +#include "migration/misc.h" #include "migration.h" #include "migration-stats.h" +#include "savevm.h" #include "socket.h" #include "tls.h" #include "qemu-file.h" @@ -49,6 +52,10 @@ typedef struct { struct { MultiFDSendParams *params; + + /* multifd_send() body is not thread safe, needs serialization */ + QemuMutex multifd_send_mutex; + /* * Global number of generated multifd packets. * @@ -98,24 +105,44 @@ struct { MultiFDSendData *multifd_send_data_alloc(void) { - size_t max_payload_size, size_minus_payload; + MultiFDSendData *new = g_new0(MultiFDSendData, 1); - /* - * MultiFDPages_t has a flexible array at the end, account for it - * when allocating MultiFDSendData. Use max() in case other types - * added to the union in the future are larger than - * (MultiFDPages_t + flex array). - */ - max_payload_size = MAX(multifd_ram_payload_size(), sizeof(MultiFDPayload)); + multifd_ram_payload_alloc(&new->u.ram); + /* Device state allocates its payload on-demand */ - /* - * Account for any holes the compiler might insert. We can't pack - * the structure because that misaligns the members and triggers - * Waddress-of-packed-member. - */ - size_minus_payload = sizeof(MultiFDSendData) - sizeof(MultiFDPayload); + return new; +} - return g_malloc0(size_minus_payload + max_payload_size); +void multifd_send_data_clear(MultiFDSendData *data) +{ + if (multifd_payload_empty(data)) { + return; + } + + switch (data->type) { + case MULTIFD_PAYLOAD_DEVICE_STATE: + multifd_send_data_clear_device_state(&data->u.device_state); + break; + default: + /* Nothing to do */ + break; + } + + data->type = MULTIFD_PAYLOAD_NONE; +} + +void multifd_send_data_free(MultiFDSendData *data) +{ + if (!data) { + return; + } + + /* This also free's device state payload */ + multifd_send_data_clear(data); + + multifd_ram_payload_free(&data->u.ram); + + g_free(data); } static bool multifd_use_packets(void) @@ -201,6 +228,7 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) return msg.id; } +/* Fills a RAM multifd packet */ void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; @@ -209,10 +237,10 @@ void multifd_send_fill_packet(MultiFDSendParams *p) memset(packet, 0, p->packet_len); - packet->magic = cpu_to_be32(MULTIFD_MAGIC); - packet->version = cpu_to_be32(MULTIFD_VERSION); + packet->hdr.magic = cpu_to_be32(MULTIFD_MAGIC); + packet->hdr.version = cpu_to_be32(MULTIFD_VERSION); - packet->flags = cpu_to_be32(p->flags); + packet->hdr.flags = cpu_to_be32(p->flags); packet->next_packet_size = cpu_to_be32(p->next_packet_size); packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); @@ -228,12 +256,12 @@ void multifd_send_fill_packet(MultiFDSendParams *p) p->flags, p->next_packet_size); } -static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +static int multifd_recv_unfill_packet_header(MultiFDRecvParams *p, + const MultiFDPacketHdr_t *hdr, + Error **errp) { - const MultiFDPacket_t *packet = p->packet; - uint32_t magic = be32_to_cpu(packet->magic); - uint32_t version = be32_to_cpu(packet->version); - int ret = 0; + uint32_t magic = be32_to_cpu(hdr->magic); + uint32_t version = be32_to_cpu(hdr->version); if (magic != MULTIFD_MAGIC) { error_setg(errp, "multifd: received packet magic %x, expected %x", @@ -247,10 +275,29 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) return -1; } - p->flags = be32_to_cpu(packet->flags); + p->flags = be32_to_cpu(hdr->flags); + + return 0; +} + +static int multifd_recv_unfill_packet_device_state(MultiFDRecvParams *p, + Error **errp) +{ + MultiFDPacketDeviceState_t *packet = p->packet_dev_state; + + packet->instance_id = be32_to_cpu(packet->instance_id); + p->next_packet_size = be32_to_cpu(packet->next_packet_size); + + return 0; +} + +static int multifd_recv_unfill_packet_ram(MultiFDRecvParams *p, Error **errp) +{ + const MultiFDPacket_t *packet = p->packet; + int ret = 0; + p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); - p->packets_recved++; /* Always unfill, old QEMUs (<9.0) send data along with SYNC */ ret = multifd_ram_unfill_packet(p, errp); @@ -261,6 +308,17 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) return ret; } +static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +{ + p->packets_recved++; + + if (p->flags & MULTIFD_FLAG_DEVICE_STATE) { + return multifd_recv_unfill_packet_device_state(p, errp); + } + + return multifd_recv_unfill_packet_ram(p, errp); +} + static bool multifd_send_should_exit(void) { return qatomic_read(&multifd_send_state->exiting); @@ -308,6 +366,8 @@ bool multifd_send(MultiFDSendData **send_data) return false; } + QEMU_LOCK_GUARD(&multifd_send_state->multifd_send_mutex); + /* We wait here, until at least one channel is ready */ qemu_sem_wait(&multifd_send_state->channels_ready); @@ -459,9 +519,9 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) qemu_sem_destroy(&p->sem_sync); g_free(p->name); p->name = NULL; - g_free(p->data); - p->data = NULL; + g_clear_pointer(&p->data, multifd_send_data_free); p->packet_len = 0; + g_clear_pointer(&p->packet_device_state, g_free); g_free(p->packet); p->packet = NULL; multifd_send_state->ops->send_cleanup(p, errp); @@ -474,8 +534,10 @@ static void multifd_send_cleanup_state(void) { file_cleanup_outgoing_migration(); socket_cleanup_outgoing_migration(); + multifd_device_state_send_cleanup(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); + qemu_mutex_destroy(&multifd_send_state->multifd_send_mutex); g_free(multifd_send_state->params); multifd_send_state->params = NULL; g_free(multifd_send_state); @@ -631,16 +693,32 @@ static void *multifd_send_thread(void *opaque) * qatomic_store_release() in multifd_send(). */ if (qatomic_load_acquire(&p->pending_job)) { + bool is_device_state = multifd_payload_device_state(p->data); + size_t total_size; + p->flags = 0; p->iovs_num = 0; assert(!multifd_payload_empty(p->data)); - ret = multifd_send_state->ops->send_prepare(p, &local_err); - if (ret != 0) { - break; + if (is_device_state) { + multifd_device_state_send_prepare(p); + } else { + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + break; + } } + /* + * The packet header in the zerocopy RAM case is accounted for + * in multifd_nocomp_send_prepare() - where it is actually + * being sent. + */ + total_size = iov_size(p->iov, p->iovs_num); + if (migrate_mapped_ram()) { + assert(!is_device_state); + ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num, &p->data->u.ram, &local_err); } else { @@ -653,11 +731,10 @@ static void *multifd_send_thread(void *opaque) break; } - stat64_add(&mig_stats.multifd_bytes, - (uint64_t)p->next_packet_size + p->packet_len); + stat64_add(&mig_stats.multifd_bytes, total_size); p->next_packet_size = 0; - multifd_set_payload_type(p->data, MULTIFD_PAYLOAD_NONE); + multifd_send_data_clear(p->data); /* * Making sure p->data is published before saying "we're @@ -856,6 +933,7 @@ bool multifd_send_setup(void) thread_count = migrate_multifd_channels(); multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); + qemu_mutex_init(&multifd_send_state->multifd_send_mutex); qemu_sem_init(&multifd_send_state->channels_created, 0); qemu_sem_init(&multifd_send_state->channels_ready, 0); qatomic_set(&multifd_send_state->exiting, 0); @@ -874,6 +952,9 @@ bool multifd_send_setup(void) p->packet_len = sizeof(MultiFDPacket_t) + sizeof(uint64_t) * page_count; p->packet = g_malloc0(p->packet_len); + p->packet_device_state = g_malloc0(sizeof(*p->packet_device_state)); + p->packet_device_state->hdr.magic = cpu_to_be32(MULTIFD_MAGIC); + p->packet_device_state->hdr.version = cpu_to_be32(MULTIFD_VERSION); } p->name = g_strdup_printf(MIGRATION_THREAD_SRC_MULTIFD, i); p->write_flags = 0; @@ -909,6 +990,8 @@ bool multifd_send_setup(void) assert(p->iov); } + multifd_device_state_send_setup(); + return true; err: @@ -1048,6 +1131,7 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) p->packet_len = 0; g_free(p->packet); p->packet = NULL; + g_clear_pointer(&p->packet_dev_state, g_free); g_free(p->normal); p->normal = NULL; g_free(p->zero); @@ -1149,6 +1233,34 @@ void multifd_recv_sync_main(void) trace_multifd_recv_sync_main(multifd_recv_state->packet_num); } +static int multifd_device_state_recv(MultiFDRecvParams *p, Error **errp) +{ + g_autofree char *dev_state_buf = NULL; + int ret; + + dev_state_buf = g_malloc(p->next_packet_size); + + ret = qio_channel_read_all(p->c, dev_state_buf, p->next_packet_size, errp); + if (ret != 0) { + return ret; + } + + if (p->packet_dev_state->idstr[sizeof(p->packet_dev_state->idstr) - 1] + != 0) { + error_setg(errp, "unterminated multifd device state idstr"); + return -1; + } + + if (!qemu_loadvm_load_state_buffer(p->packet_dev_state->idstr, + p->packet_dev_state->instance_id, + dev_state_buf, p->next_packet_size, + errp)) { + ret = -1; + } + + return ret; +} + static void *multifd_recv_thread(void *opaque) { MigrationState *s = migrate_get_current(); @@ -1165,14 +1277,19 @@ static void *multifd_recv_thread(void *opaque) } while (true) { + MultiFDPacketHdr_t hdr; uint32_t flags = 0; + bool is_device_state = false; bool has_data = false; + uint8_t *pkt_buf; + size_t pkt_len; + p->normal_num = 0; if (use_packets) { struct iovec iov = { - .iov_base = (void *)p->packet, - .iov_len = p->packet_len + .iov_base = (void *)&hdr, + .iov_len = sizeof(hdr) }; if (multifd_recv_should_exit()) { @@ -1191,6 +1308,32 @@ static void *multifd_recv_thread(void *opaque) break; } + ret = multifd_recv_unfill_packet_header(p, &hdr, &local_err); + if (ret) { + break; + } + + is_device_state = p->flags & MULTIFD_FLAG_DEVICE_STATE; + if (is_device_state) { + pkt_buf = (uint8_t *)p->packet_dev_state + sizeof(hdr); + pkt_len = sizeof(*p->packet_dev_state) - sizeof(hdr); + } else { + pkt_buf = (uint8_t *)p->packet + sizeof(hdr); + pkt_len = p->packet_len - sizeof(hdr); + } + + ret = qio_channel_read_all_eof(p->c, (char *)pkt_buf, pkt_len, + &local_err); + if (!ret) { + /* EOF */ + error_setg(&local_err, "multifd: unexpected EOF after packet header"); + break; + } + + if (ret == -1) { + break; + } + qemu_mutex_lock(&p->mutex); ret = multifd_recv_unfill_packet(p, &local_err); if (ret) { @@ -1202,12 +1345,17 @@ static void *multifd_recv_thread(void *opaque) /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - /* - * Even if it's a SYNC packet, this needs to be set - * because older QEMUs (<9.0) still send data along with - * the SYNC packet. - */ - has_data = p->normal_num || p->zero_num; + if (is_device_state) { + has_data = p->next_packet_size > 0; + } else { + /* + * Even if it's a SYNC packet, this needs to be set + * because older QEMUs (<9.0) still send data along with + * the SYNC packet. + */ + has_data = p->normal_num || p->zero_num; + } + qemu_mutex_unlock(&p->mutex); } else { /* @@ -1236,14 +1384,29 @@ static void *multifd_recv_thread(void *opaque) } if (has_data) { - ret = multifd_recv_state->ops->recv(p, &local_err); + if (is_device_state) { + assert(use_packets); + ret = multifd_device_state_recv(p, &local_err); + } else { + ret = multifd_recv_state->ops->recv(p, &local_err); + } if (ret != 0) { break; } + } else if (is_device_state) { + error_setg(&local_err, + "multifd: received empty device state packet"); + break; } if (use_packets) { if (flags & MULTIFD_FLAG_SYNC) { + if (is_device_state) { + error_setg(&local_err, + "multifd: received SYNC device state packet"); + break; + } + qemu_sem_post(&multifd_recv_state->sem_sync); qemu_sem_wait(&p->sem_sync); } @@ -1312,6 +1475,7 @@ int multifd_recv_setup(Error **errp) p->packet_len = sizeof(MultiFDPacket_t) + sizeof(uint64_t) * page_count; p->packet = g_malloc0(p->packet_len); + p->packet_dev_state = g_malloc0(sizeof(*p->packet_dev_state)); } p->name = g_strdup_printf(MIGRATION_THREAD_DST_MULTIFD, i); p->normal = g_new0(ram_addr_t, page_count); diff --git a/migration/multifd.h b/migration/multifd.h index cf408ff721..2d337e7b3b 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -62,6 +62,12 @@ MultiFDRecvData *multifd_get_recv_data(void); #define MULTIFD_FLAG_UADK (8 << 1) #define MULTIFD_FLAG_QATZIP (16 << 1) +/* + * If set it means that this packet contains device state + * (MultiFDPacketDeviceState_t), not RAM data (MultiFDPacket_t). + */ +#define MULTIFD_FLAG_DEVICE_STATE (32 << 1) + /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) @@ -69,6 +75,11 @@ typedef struct { uint32_t magic; uint32_t version; uint32_t flags; +} __attribute__((packed)) MultiFDPacketHdr_t; + +typedef struct { + MultiFDPacketHdr_t hdr; + /* maximum number of allocated pages */ uint32_t pages_alloc; /* non zero pages */ @@ -90,13 +101,27 @@ typedef struct { } __attribute__((packed)) MultiFDPacket_t; typedef struct { + MultiFDPacketHdr_t hdr; + + char idstr[256]; + uint32_t instance_id; + + /* size of the next packet that contains the actual data */ + uint32_t next_packet_size; +} __attribute__((packed)) MultiFDPacketDeviceState_t; + +typedef struct { /* number of used pages */ uint32_t num; /* number of normal pages */ uint32_t normal_num; + /* + * Pointer to the ramblock. NOTE: it's caller's responsibility to make + * sure the pointer is always valid! + */ RAMBlock *block; - /* offset of each page */ - ram_addr_t offset[]; + /* offset array of each page, managed by multifd */ + ram_addr_t *offset; } MultiFDPages_t; struct MultiFDRecvData { @@ -106,13 +131,22 @@ struct MultiFDRecvData { off_t file_offset; }; +typedef struct { + char *idstr; + uint32_t instance_id; + char *buf; + size_t buf_len; +} MultiFDDeviceState_t; + typedef enum { MULTIFD_PAYLOAD_NONE, MULTIFD_PAYLOAD_RAM, + MULTIFD_PAYLOAD_DEVICE_STATE, } MultiFDPayloadType; -typedef union MultiFDPayload { +typedef struct MultiFDPayload { MultiFDPages_t ram; + MultiFDDeviceState_t device_state; } MultiFDPayload; struct MultiFDSendData { @@ -125,9 +159,17 @@ static inline bool multifd_payload_empty(MultiFDSendData *data) return data->type == MULTIFD_PAYLOAD_NONE; } +static inline bool multifd_payload_device_state(MultiFDSendData *data) +{ + return data->type == MULTIFD_PAYLOAD_DEVICE_STATE; +} + static inline void multifd_set_payload_type(MultiFDSendData *data, MultiFDPayloadType type) { + assert(multifd_payload_empty(data)); + assert(type != MULTIFD_PAYLOAD_NONE); + data->type = type; } @@ -174,8 +216,9 @@ typedef struct { /* thread local variables. No locking required */ - /* pointer to the packet */ + /* pointers to the possible packet types */ MultiFDPacket_t *packet; + MultiFDPacketDeviceState_t *packet_device_state; /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets sent through this channel */ @@ -222,8 +265,9 @@ typedef struct { /* thread local variables. No locking required */ - /* pointer to the packet */ + /* pointers to the possible packet types */ MultiFDPacket_t *packet; + MultiFDPacketDeviceState_t *packet_dev_state; /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets received through this channel */ @@ -333,16 +377,11 @@ bool multifd_send_prepare_common(MultiFDSendParams *p); void multifd_send_zero_page_detect(MultiFDSendParams *p); void multifd_recv_zero_page_process(MultiFDRecvParams *p); -static inline void multifd_send_prepare_header(MultiFDSendParams *p) -{ - p->iov[0].iov_len = p->packet_len; - p->iov[0].iov_base = p->packet; - p->iovs_num++; -} - void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc); bool multifd_send(MultiFDSendData **send_data); MultiFDSendData *multifd_send_data_alloc(void); +void multifd_send_data_clear(MultiFDSendData *data); +void multifd_send_data_free(MultiFDSendData *data); static inline uint32_t multifd_ram_page_size(void) { @@ -359,7 +398,16 @@ void multifd_ram_save_cleanup(void); int multifd_ram_flush_and_sync(QEMUFile *f); bool multifd_ram_sync_per_round(void); bool multifd_ram_sync_per_section(void); -size_t multifd_ram_payload_size(void); +void multifd_ram_payload_alloc(MultiFDPages_t *pages); +void multifd_ram_payload_free(MultiFDPages_t *pages); void multifd_ram_fill_packet(MultiFDSendParams *p); int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp); + +void multifd_send_data_clear_device_state(MultiFDDeviceState_t *device_state); + +void multifd_device_state_send_setup(void); +void multifd_device_state_send_cleanup(void); + +void multifd_device_state_send_prepare(MultiFDSendParams *p); + #endif diff --git a/migration/options.c b/migration/options.c index bb259d192a..b0ac2ea408 100644 --- a/migration/options.c +++ b/migration/options.c @@ -93,6 +93,8 @@ const Property migration_properties[] = { send_configuration, true), DEFINE_PROP_BOOL("send-section-footer", MigrationState, send_section_footer, true), + DEFINE_PROP_BOOL("send-switchover-start", MigrationState, + send_switchover_start, true), DEFINE_PROP_BOOL("multifd-flush-after-each-section", MigrationState, multifd_flush_after_each_section, false), DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState, @@ -209,6 +211,13 @@ bool migrate_auto_converge(void) return s->capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE]; } +bool migrate_send_switchover_start(void) +{ + MigrationState *s = migrate_get_current(); + + return s->send_switchover_start; +} + bool migrate_background_snapshot(void) { MigrationState *s = migrate_get_current(); diff --git a/migration/qemu-file.h b/migration/qemu-file.h index 3e47a20621..f5b9f430e0 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -33,6 +33,8 @@ QEMUFile *qemu_file_new_input(QIOChannel *ioc); QEMUFile *qemu_file_new_output(QIOChannel *ioc); int qemu_fclose(QEMUFile *f); +G_DEFINE_AUTOPTR_CLEANUP_FUNC(QEMUFile, qemu_fclose) + /* * qemu_file_transferred: * diff --git a/migration/savevm.c b/migration/savevm.c index 4046faf009..5c4fdfd95e 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -37,6 +37,7 @@ #include "migration/register.h" #include "migration/global_state.h" #include "migration/channel-block.h" +#include "multifd.h" #include "ram.h" #include "qemu-file.h" #include "savevm.h" @@ -54,6 +55,7 @@ #include "qemu/job.h" #include "qemu/main-loop.h" #include "block/snapshot.h" +#include "block/thread-pool.h" #include "qemu/cutils.h" #include "io/channel-buffer.h" #include "io/channel-file.h" @@ -90,6 +92,7 @@ enum qemu_vm_cmd { MIG_CMD_ENABLE_COLO, /* Enable COLO */ MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */ MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */ + MIG_CMD_SWITCHOVER_START, /* Switchover start notification */ MIG_CMD_MAX }; @@ -109,6 +112,7 @@ static struct mig_cmd_args { [MIG_CMD_POSTCOPY_RESUME] = { .len = 0, .name = "POSTCOPY_RESUME" }, [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" }, [MIG_CMD_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" }, + [MIG_CMD_SWITCHOVER_START] = { .len = 0, .name = "SWITCHOVER_START" }, [MIG_CMD_MAX] = { .len = -1, .name = "MAX" }, }; @@ -130,6 +134,35 @@ static struct mig_cmd_args { */ /***********************************************************/ +/* Optional load threads pool support */ + +static void qemu_loadvm_thread_pool_create(MigrationIncomingState *mis) +{ + assert(!mis->load_threads); + mis->load_threads = thread_pool_new(); + mis->load_threads_abort = false; +} + +static void qemu_loadvm_thread_pool_destroy(MigrationIncomingState *mis) +{ + qatomic_set(&mis->load_threads_abort, true); + + bql_unlock(); /* Load threads might be waiting for BQL */ + g_clear_pointer(&mis->load_threads, thread_pool_free); + bql_lock(); +} + +static bool qemu_loadvm_thread_pool_wait(MigrationState *s, + MigrationIncomingState *mis) +{ + bql_unlock(); /* Let load threads do work requiring BQL */ + thread_pool_wait(mis->load_threads); + bql_lock(); + + return !migrate_has_error(s); +} + +/***********************************************************/ /* savevm/loadvm support */ static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable) @@ -1201,6 +1234,19 @@ void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name) qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf); } +static void qemu_savevm_send_switchover_start(QEMUFile *f) +{ + trace_savevm_send_switchover_start(); + qemu_savevm_command_send(f, MIG_CMD_SWITCHOVER_START, 0, NULL); +} + +void qemu_savevm_maybe_send_switchover_start(QEMUFile *f) +{ + if (migrate_send_switchover_start()) { + qemu_savevm_send_switchover_start(f); + } +} + bool qemu_savevm_state_blocked(Error **errp) { SaveStateEntry *se; @@ -1482,6 +1528,24 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) int64_t start_ts_each, end_ts_each; SaveStateEntry *se; int ret; + bool multifd_device_state = multifd_device_state_supported(); + + if (multifd_device_state) { + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + SaveLiveCompletePrecopyThreadHandler hdlr; + + if (!se->ops || (in_postcopy && se->ops->has_postcopy && + se->ops->has_postcopy(se->opaque)) || + !se->ops->save_live_complete_precopy_thread) { + continue; + } + + hdlr = se->ops->save_live_complete_precopy_thread; + multifd_spawn_device_state_save_thread(hdlr, + se->idstr, se->instance_id, + se->opaque); + } + } QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (!se->ops || @@ -1507,16 +1571,35 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) save_section_footer(f, se); if (ret < 0) { qemu_file_set_error(f, ret); - return -1; + goto ret_fail_abort_threads; } end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME); trace_vmstate_downtime_save("iterable", se->idstr, se->instance_id, end_ts_each - start_ts_each); } + if (multifd_device_state) { + if (migrate_has_error(migrate_get_current())) { + multifd_abort_device_state_save_threads(); + } + + if (!multifd_join_device_state_save_threads()) { + qemu_file_set_error(f, -EINVAL); + return -1; + } + } + trace_vmstate_downtime_checkpoint("src-iterable-saved"); return 0; + +ret_fail_abort_threads: + if (multifd_device_state) { + multifd_abort_device_state_save_threads(); + multifd_join_device_state_save_threads(); + } + + return -1; } int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, @@ -1687,6 +1770,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) ret = qemu_file_get_error(f); if (ret == 0) { + qemu_savevm_maybe_send_switchover_start(f); qemu_savevm_state_complete_precopy(f, false); ret = qemu_file_get_error(f); } @@ -1970,6 +2054,8 @@ static void *postcopy_ram_listen_thread(void *opaque) * in qemu_file, and thus we must be blocking now. */ qemu_file_set_blocking(f, true); + + /* TODO: sanity check that only postcopiable data will be loaded here */ load_res = qemu_loadvm_state_main(f, mis); /* @@ -2030,7 +2116,9 @@ static void *postcopy_ram_listen_thread(void *opaque) * (If something broke then qemu will have to exit anyway since it's * got a bad migration state). */ + bql_lock(); migration_incoming_state_destroy(); + bql_unlock(); rcu_unregister_thread(); mis->have_listen_thread = false; @@ -2383,6 +2471,26 @@ static int loadvm_process_enable_colo(MigrationIncomingState *mis) return ret; } +static int loadvm_postcopy_handle_switchover_start(void) +{ + SaveStateEntry *se; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + int ret; + + if (!se->ops || !se->ops->switchover_start) { + continue; + } + + ret = se->ops->switchover_start(se->opaque); + if (ret < 0) { + return ret; + } + } + + return 0; +} + /* * Process an incoming 'QEMU_VM_COMMAND' * 0 just a normal return @@ -2481,6 +2589,9 @@ static int loadvm_process_command(QEMUFile *f) case MIG_CMD_ENABLE_COLO: return loadvm_process_enable_colo(mis); + + case MIG_CMD_SWITCHOVER_START: + return loadvm_postcopy_handle_switchover_start(); } return 0; @@ -2740,16 +2851,68 @@ static int qemu_loadvm_state_setup(QEMUFile *f, Error **errp) return 0; } -void qemu_loadvm_state_cleanup(void) +struct LoadThreadData { + MigrationLoadThread function; + void *opaque; +}; + +static int qemu_loadvm_load_thread(void *thread_opaque) +{ + struct LoadThreadData *data = thread_opaque; + MigrationIncomingState *mis = migration_incoming_get_current(); + g_autoptr(Error) local_err = NULL; + + if (!data->function(data->opaque, &mis->load_threads_abort, &local_err)) { + MigrationState *s = migrate_get_current(); + + /* + * Can't set load_threads_abort here since processing of main migration + * channel data could still be happening, resulting in launching of new + * load threads. + */ + + assert(local_err); + + /* + * In case of multiple load threads failing which thread error + * return we end setting is purely arbitrary. + */ + migrate_set_error(s, local_err); + } + + return 0; +} + +void qemu_loadvm_start_load_thread(MigrationLoadThread function, + void *opaque) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + struct LoadThreadData *data; + + /* We only set it from this thread so it's okay to read it directly */ + assert(!mis->load_threads_abort); + + data = g_new(struct LoadThreadData, 1); + data->function = function; + data->opaque = opaque; + + thread_pool_submit_immediate(mis->load_threads, qemu_loadvm_load_thread, + data, g_free); +} + +void qemu_loadvm_state_cleanup(MigrationIncomingState *mis) { SaveStateEntry *se; trace_loadvm_state_cleanup(); + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (se->ops && se->ops->load_cleanup) { se->ops->load_cleanup(se->opaque); } } + + qemu_loadvm_thread_pool_destroy(mis); } /* Return true if we should continue the migration, or false. */ @@ -2900,6 +3063,7 @@ out: int qemu_loadvm_state(QEMUFile *f) { + MigrationState *s = migrate_get_current(); MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; int ret; @@ -2909,6 +3073,8 @@ int qemu_loadvm_state(QEMUFile *f) return -EINVAL; } + qemu_loadvm_thread_pool_create(mis); + ret = qemu_loadvm_state_header(f); if (ret) { return ret; @@ -2940,12 +3106,18 @@ int qemu_loadvm_state(QEMUFile *f) /* When reaching here, it must be precopy */ if (ret == 0) { - if (migrate_has_error(migrate_get_current())) { + if (migrate_has_error(migrate_get_current()) || + !qemu_loadvm_thread_pool_wait(s, mis)) { ret = -EINVAL; } else { ret = qemu_file_get_error(f); } } + /* + * Set this flag unconditionally so we'll catch further attempts to + * start additional threads via an appropriate assert() + */ + qatomic_set(&mis->load_threads_abort, true); /* * Try to read in the VMDESC section as well, so that dumping tools that @@ -3021,6 +3193,29 @@ int qemu_loadvm_approve_switchover(void) return migrate_send_rp_switchover_ack(mis); } +bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id, + char *buf, size_t len, Error **errp) +{ + SaveStateEntry *se; + + se = find_se(idstr, instance_id); + if (!se) { + error_setg(errp, + "Unknown idstr %s or instance id %u for load state buffer", + idstr, instance_id); + return false; + } + + if (!se->ops || !se->ops->load_state_buffer) { + error_setg(errp, + "idstr %s / instance %u has no load state buffer operation", + idstr, instance_id); + return false; + } + + return se->ops->load_state_buffer(se->opaque, buf, len, errp); +} + bool save_snapshot(const char *name, bool overwrite, const char *vmstate, bool has_devices, strList *devices, Error **errp) { diff --git a/migration/savevm.h b/migration/savevm.h index 7957460062..138c39a7f9 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -53,6 +53,7 @@ void qemu_savevm_send_postcopy_listen(QEMUFile *f); void qemu_savevm_send_postcopy_run(QEMUFile *f); void qemu_savevm_send_postcopy_resume(QEMUFile *f); void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name); +void qemu_savevm_maybe_send_switchover_start(QEMUFile *f); void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name, uint16_t len, @@ -63,11 +64,14 @@ void qemu_savevm_live_state(QEMUFile *f); int qemu_save_device_state(QEMUFile *f); int qemu_loadvm_state(QEMUFile *f); -void qemu_loadvm_state_cleanup(void); +void qemu_loadvm_state_cleanup(MigrationIncomingState *mis); int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); int qemu_load_device_state(QEMUFile *f); int qemu_loadvm_approve_switchover(void); int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, bool in_postcopy); +bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id, + char *buf, size_t len, Error **errp); + #endif diff --git a/migration/trace-events b/migration/trace-events index 58c0f07f5b..c506e11a2e 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -39,6 +39,7 @@ savevm_send_postcopy_run(void) "" savevm_send_postcopy_resume(void) "" savevm_send_colo_enable(void) "" savevm_send_recv_bitmap(char *name) "%s" +savevm_send_switchover_start(void) "" savevm_state_setup(void) "" savevm_state_resume_prepare(void) "" savevm_state_header(void) "" diff --git a/qapi/block-export.json b/qapi/block-export.json index 68dcec7edc..c783e01a53 100644 --- a/qapi/block-export.json +++ b/qapi/block-export.json @@ -9,17 +9,11 @@ { 'include': 'block-core.json' } ## -# @NbdServerOptions: -# -# Keep this type consistent with the nbd-server-start arguments. The -# only intended difference is using SocketAddress instead of -# SocketAddressLegacy. -# -# @addr: Address on which to listen. +# @NbdServerOptionsBase: # # @handshake-max-seconds: Time limit, in seconds, at which a client # that has not completed the negotiation handshake will be -# disconnected, 0 for no limit (since 10.0; default: 10). +# disconnected, or 0 for no limit (since 10.0; default: 10). # # @tls-creds: ID of the TLS credentials object (since 2.6). # @@ -32,47 +26,47 @@ # @max-connections: The maximum number of connections to allow at the # same time, 0 for unlimited. Setting this to 1 also stops the # server from advertising multiple client support (since 5.2; -# default: 100) -# -# Since: 4.2 +# default: 100). ## -{ 'struct': 'NbdServerOptions', - 'data': { 'addr': 'SocketAddress', - '*handshake-max-seconds': 'uint32', +{ 'struct': 'NbdServerOptionsBase', + 'data': { '*handshake-max-seconds': 'uint32', '*tls-creds': 'str', '*tls-authz': 'str', '*max-connections': 'uint32' } } ## -# @nbd-server-start: +# @NbdServerOptions: # -# Start an NBD server listening on the given host and port. Block -# devices can then be exported using @nbd-server-add. The NBD server -# will present them as named exports; for example, another QEMU -# instance could refer to them as "nbd:HOST:PORT:exportname=NAME". +# Keep this type consistent with the NbdServerOptionsLegacy type. The +# only intended difference is using SocketAddress instead of +# SocketAddressLegacy. +# +# @addr: Address on which to listen (since 4.2). +## +{ 'struct': 'NbdServerOptions', + 'base': 'NbdServerOptionsBase', + 'data': { 'addr': 'SocketAddress' } } + +## +# @NbdServerOptionsLegacy: # # Keep this type consistent with the NbdServerOptions type. The only # intended difference is using SocketAddressLegacy instead of # SocketAddress. # -# @addr: Address on which to listen. -# -# @handshake-max-seconds: Time limit, in seconds, at which a client -# that has not completed the negotiation handshake will be -# disconnected, or 0 for no limit (since 10.0; default: 10). -# -# @tls-creds: ID of the TLS credentials object (since 2.6). -# -# @tls-authz: ID of the QAuthZ authorization object used to validate -# the client's x509 distinguished name. This object is is only -# resolved at time of use, so can be deleted and recreated on the -# fly while the NBD server is active. If missing, it will default -# to denying access (since 4.0). +# @addr: Address on which to listen (since 1.3). +## +{ 'struct': 'NbdServerOptionsLegacy', + 'base': 'NbdServerOptionsBase', + 'data': { 'addr': 'SocketAddressLegacy' } } + +## +# @nbd-server-start: # -# @max-connections: The maximum number of connections to allow at the -# same time, 0 for unlimited. Setting this to 1 also stops the -# server from advertising multiple client support (since 5.2; -# default: 100). +# Start an NBD server listening on the given host and port. Block +# devices can then be exported using @nbd-server-add. The NBD server +# will present them as named exports; for example, another QEMU +# instance could refer to them as "nbd:HOST:PORT:exportname=NAME". # # Errors: # - if the server is already running @@ -80,11 +74,7 @@ # Since: 1.3 ## { 'command': 'nbd-server-start', - 'data': { 'addr': 'SocketAddressLegacy', - '*handshake-max-seconds': 'uint32', - '*tls-creds': 'str', - '*tls-authz': 'str', - '*max-connections': 'uint32' }, + 'data': 'NbdServerOptionsLegacy', 'allow-preconfig': true } ## diff --git a/qemu-nbd.c b/qemu-nbd.c index 05b61da51e..ed5895861b 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -852,10 +852,6 @@ int main(int argc, char **argv) export_name = ""; } - if (!trace_init_backends()) { - exit(1); - } - trace_init_file(); qemu_set_log(LOG_TRACE, &error_fatal); socket_activation = check_socket_activation(); @@ -1045,6 +1041,18 @@ int main(int argc, char **argv) #endif /* WIN32 */ } + /* + * trace_init must be done after daemonization. Why? Because at + * least the simple backend spins up a helper thread as well as an + * atexit() handler that waits on that thread, but the helper + * thread won't survive a fork, leading to deadlock in the child + * if we initialized pre-fork. + */ + if (!trace_init_backends()) { + exit(1); + } + trace_init_file(); + if (opts.device != NULL && sockpath == NULL) { sockpath = g_malloc(128); snprintf(sockpath, 128, SOCKET_PATH, basename(opts.device)); diff --git a/scripts/analyze-migration.py b/scripts/analyze-migration.py index 8e1fbf4c9d..67631ac43e 100755 --- a/scripts/analyze-migration.py +++ b/scripts/analyze-migration.py @@ -620,7 +620,9 @@ class MigrationDump(object): QEMU_VM_SUBSECTION = 0x05 QEMU_VM_VMDESCRIPTION = 0x06 QEMU_VM_CONFIGURATION = 0x07 + QEMU_VM_COMMAND = 0x08 QEMU_VM_SECTION_FOOTER= 0x7e + QEMU_MIG_CMD_SWITCHOVER_START = 0x0b def __init__(self, filename): self.section_classes = { @@ -685,6 +687,15 @@ class MigrationDump(object): elif section_type == self.QEMU_VM_SECTION_PART or section_type == self.QEMU_VM_SECTION_END: section_id = file.read32() self.sections[section_id].read() + elif section_type == self.QEMU_VM_COMMAND: + command_type = file.read16() + command_data_len = file.read16() + if command_type != self.QEMU_MIG_CMD_SWITCHOVER_START: + raise Exception("Unknown QEMU_VM_COMMAND: %x" % + (command_type)) + if command_data_len != 0: + raise Exception("Invalid SWITCHOVER_START length: %x" % + (command_data_len)) elif section_type == self.QEMU_VM_SECTION_FOOTER: read_section_id = file.read32() if read_section_id != section_id: diff --git a/scripts/qapi/introspect.py b/scripts/qapi/introspect.py index 42e5185c7c..89ee5d5f17 100644 --- a/scripts/qapi/introspect.py +++ b/scripts/qapi/introspect.py @@ -11,6 +11,7 @@ This work is licensed under the terms of the GNU GPL, version 2. See the COPYING file in the top-level directory. """ +from dataclasses import dataclass from typing import ( Any, Dict, @@ -79,19 +80,16 @@ SchemaInfoCommand = Dict[str, object] _ValueT = TypeVar('_ValueT', bound=_Value) +@dataclass class Annotated(Generic[_ValueT]): """ Annotated generally contains a SchemaInfo-like type (as a dict), But it also used to wrap comments/ifconds around scalar leaf values, for the benefit of features and enums. """ - # TODO: Remove after Python 3.7 adds @dataclass: - # pylint: disable=too-few-public-methods - def __init__(self, value: _ValueT, ifcond: QAPISchemaIfCond, - comment: Optional[str] = None): - self.value = value - self.comment: Optional[str] = comment - self.ifcond = ifcond + value: _ValueT + ifcond: QAPISchemaIfCond + comment: Optional[str] = None def _tree_to_qlit(obj: JSONValue, diff --git a/scripts/qapi/parser.py b/scripts/qapi/parser.py index adc85b5b39..64f0bb824a 100644 --- a/scripts/qapi/parser.py +++ b/scripts/qapi/parser.py @@ -14,7 +14,6 @@ # This work is licensed under the terms of the GNU GPL, version 2. # See the COPYING file in the top-level directory. -from collections import OrderedDict import os import re from typing import ( @@ -154,7 +153,7 @@ class QAPISchemaParser: "value of 'include' must be a string") incl_fname = os.path.join(os.path.dirname(self._fname), include) - self._add_expr(OrderedDict({'include': incl_fname}), info) + self._add_expr({'include': incl_fname}, info) exprs_include = self._include(include, info, incl_fname, self._included) if exprs_include: @@ -355,7 +354,7 @@ class QAPISchemaParser: raise QAPIParseError(self, "stray '%s'" % match.group(0)) def get_members(self) -> Dict[str, object]: - expr: Dict[str, object] = OrderedDict() + expr: Dict[str, object] = {} if self.tok == '}': self.accept() return expr diff --git a/scripts/qapi/schema.py b/scripts/qapi/schema.py index 7f70969c09..cbe3b5aa91 100644 --- a/scripts/qapi/schema.py +++ b/scripts/qapi/schema.py @@ -19,7 +19,6 @@ from __future__ import annotations from abc import ABC, abstractmethod -from collections import OrderedDict import os import re from typing import ( @@ -557,7 +556,7 @@ class QAPISchemaObjectType(QAPISchemaType): super().check(schema) assert self._checked and not self._check_complete - seen = OrderedDict() + seen = {} if self._base_name: self.base = schema.resolve_type(self._base_name, self.info, "'base'") @@ -1141,10 +1140,10 @@ class QAPISchema: self.docs = parser.docs self._entity_list: List[QAPISchemaEntity] = [] self._entity_dict: Dict[str, QAPISchemaDefinition] = {} - self._module_dict: Dict[str, QAPISchemaModule] = OrderedDict() + self._module_dict: Dict[str, QAPISchemaModule] = {} # NB, values in the dict will identify the first encountered # usage of a named feature only - self._feature_dict: Dict[str, QAPISchemaFeature] = OrderedDict() + self._feature_dict: Dict[str, QAPISchemaFeature] = {} # All schemas get the names defined in the QapiSpecialFeature enum. # Rely on dict iteration order matching insertion order so that @@ -1454,7 +1453,7 @@ class QAPISchema: ifcond = QAPISchemaIfCond(expr.get('if')) info = expr.info features = self._make_features(expr.get('features'), info) - if isinstance(data, OrderedDict): + if isinstance(data, dict): data = self._make_implicit_object_type( name, info, ifcond, 'arg', self._make_members(data, info)) @@ -1473,7 +1472,7 @@ class QAPISchema: ifcond = QAPISchemaIfCond(expr.get('if')) info = expr.info features = self._make_features(expr.get('features'), info) - if isinstance(data, OrderedDict): + if isinstance(data, dict): data = self._make_implicit_object_type( name, info, ifcond, 'arg', self._make_members(data, info)) diff --git a/system/arch_init.c b/system/arch_init.c index d2c32f8488..b1baed18a3 100644 --- a/system/arch_init.c +++ b/system/arch_init.c @@ -22,7 +22,6 @@ * THE SOFTWARE. */ #include "qemu/osdep.h" -#include "qemu/module.h" #include "system/arch_init.h" #ifdef TARGET_SPARC @@ -40,11 +39,3 @@ int graphic_depth = 32; #endif const uint32_t arch_type = QEMU_ARCH; - -void qemu_init_arch_modules(void) -{ -#ifdef CONFIG_MODULES - module_init_info(qemu_modinfo); - module_allow_arch(TARGET_NAME); -#endif -} diff --git a/system/cpus.c b/system/cpus.c index 37e5892c24..2cc5f887ab 100644 --- a/system/cpus.c +++ b/system/cpus.c @@ -31,6 +31,7 @@ #include "qapi/qapi-events-run-state.h" #include "qapi/qmp/qerror.h" #include "exec/gdbstub.h" +#include "system/accel-ops.h" #include "system/hw_accel.h" #include "exec/cpu-common.h" #include "qemu/thread.h" diff --git a/system/globals.c b/system/globals.c index 316623bd20..9640c9511e 100644 --- a/system/globals.c +++ b/system/globals.c @@ -58,7 +58,6 @@ unsigned int nb_prom_envs; const char *prom_envs[MAX_PROM_ENVS]; uint8_t *boot_splash_filedata; int only_migratable; /* turn it off unless user states otherwise */ -int icount_align_option; /* The bytes in qemu_uuid are in the order specified by RFC4122, _not_ in the * little-endian "wire format" described in the SMBIOS 2.6 specification. diff --git a/system/physmem.c b/system/physmem.c index eff8b55c2d..8c1736f84e 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -28,7 +28,7 @@ #include "qemu/lockable.h" #ifdef CONFIG_TCG -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #endif /* CONFIG_TCG */ #include "exec/exec-all.h" diff --git a/system/vl.c b/system/vl.c index 8f776684ec..04f78466c4 100644 --- a/system/vl.c +++ b/system/vl.c @@ -26,6 +26,7 @@ #include "qemu/help-texts.h" #include "qemu/datadir.h" #include "qemu/units.h" +#include "qemu/module.h" #include "exec/cpu-common.h" #include "exec/page-vary.h" #include "hw/qdev-properties.h" @@ -78,6 +79,7 @@ #include "hw/block/block.h" #include "hw/i386/x86.h" #include "hw/i386/pc.h" +#include "hw/core/cpu.h" #include "migration/cpr.h" #include "migration/misc.h" #include "migration/snapshot.h" @@ -2885,7 +2887,10 @@ void qemu_init(int argc, char **argv) os_setup_limits(); - qemu_init_arch_modules(); +#ifdef CONFIG_MODULES + module_init_info(qemu_modinfo); + module_allow_arch(target_name()); +#endif qemu_init_subsystems(); diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c index f5dd744987..2eabd7724d 100644 --- a/target/alpha/cpu.c +++ b/target/alpha/cpu.c @@ -85,6 +85,7 @@ static int alpha_cpu_mmu_index(CPUState *cs, bool ifetch) static void alpha_cpu_disas_set_info(CPUState *cpu, disassemble_info *info) { + info->endian = BFD_ENDIAN_LITTLE; info->mach = bfd_mach_alpha_ev6; info->print_insn = print_insn_alpha; } @@ -227,7 +228,7 @@ static const struct SysemuCPUOps alpha_sysemu_ops = { }; #endif -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps alpha_tcg_ops = { .initialize = alpha_translate_init, diff --git a/target/alpha/fpu_helper.c b/target/alpha/fpu_helper.c index f810a9b6a4..6aefb9b851 100644 --- a/target/alpha/fpu_helper.c +++ b/target/alpha/fpu_helper.c @@ -455,29 +455,28 @@ static uint64_t do_cvttq(CPUAlphaState *env, uint64_t a, int roundmode) { float64 fa; int64_t ret; - uint32_t exc; + uint32_t exc = 0; + int flags; fa = t_to_float64(a); ret = float64_to_int64_modulo(fa, roundmode, &FP_STATUS); - exc = get_float_exception_flags(&FP_STATUS); - if (unlikely(exc)) { + flags = get_float_exception_flags(&FP_STATUS); + if (unlikely(flags)) { set_float_exception_flags(0, &FP_STATUS); /* We need to massage the resulting exceptions. */ - if (exc & float_flag_invalid_cvti) { + if (flags & float_flag_invalid_cvti) { /* Overflow, either normal or infinity. */ if (float64_is_infinity(fa)) { exc = FPCR_INV; } else { exc = FPCR_IOV | FPCR_INE; } - } else if (exc & float_flag_invalid) { + } else if (flags & float_flag_invalid) { exc = FPCR_INV; - } else if (exc & float_flag_inexact) { + } else if (flags & float_flag_inexact) { exc = FPCR_INE; - } else { - exc = 0; } } env->error_code = exc; diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 656070afb5..948defa3f5 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -29,7 +29,7 @@ #include "cpu.h" #ifdef CONFIG_TCG #include "exec/translation-block.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #endif /* CONFIG_TCG */ #include "internals.h" #include "cpu-features.h" @@ -1171,7 +1171,7 @@ static void arm_disas_set_info(CPUState *cpu, disassemble_info *info) { ARMCPU *ac = ARM_CPU(cpu); CPUARMState *env = &ac->env; - bool sctlr_b; + bool sctlr_b = arm_sctlr_b(env); if (is_a64(env)) { info->cap_arch = CS_ARCH_ARM64; @@ -1198,13 +1198,9 @@ static void arm_disas_set_info(CPUState *cpu, disassemble_info *info) info->cap_mode = cap_mode; } - sctlr_b = arm_sctlr_b(env); + info->endian = BFD_ENDIAN_LITTLE; if (bswap_code(sctlr_b)) { -#if TARGET_BIG_ENDIAN - info->endian = BFD_ENDIAN_LITTLE; -#else - info->endian = BFD_ENDIAN_BIG; -#endif + info->endian = TARGET_BIG_ENDIAN ? BFD_ENDIAN_LITTLE : BFD_ENDIAN_BIG; } info->flags &= ~INSN_ARM_BE32; #ifndef CONFIG_USER_ONLY diff --git a/target/arm/tcg/cpu-v7m.c b/target/arm/tcg/cpu-v7m.c index 03acdf83e0..29a41fde69 100644 --- a/target/arm/tcg/cpu-v7m.c +++ b/target/arm/tcg/cpu-v7m.c @@ -10,7 +10,7 @@ #include "qemu/osdep.h" #include "cpu.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include "internals.h" #if !defined(CONFIG_USER_ONLY) diff --git a/target/arm/tcg/cpu32.c b/target/arm/tcg/cpu32.c index 0f1c5bc87e..2c45b7eddd 100644 --- a/target/arm/tcg/cpu32.c +++ b/target/arm/tcg/cpu32.c @@ -10,7 +10,7 @@ #include "qemu/osdep.h" #include "cpu.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include "internals.h" #include "target/arm/idau.h" #if !defined(CONFIG_USER_ONLY) diff --git a/target/arm/tcg/mte_helper.c b/target/arm/tcg/mte_helper.c index f72ce2ae0d..5d6d8a17ae 100644 --- a/target/arm/tcg/mte_helper.c +++ b/target/arm/tcg/mte_helper.c @@ -31,7 +31,7 @@ #endif #include "exec/cpu_ldst.h" #include "exec/helper-proto.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include "qapi/error.h" #include "qemu/guest-random.h" #include "mte_helper.h" diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c index c206ca65ce..d786b4b111 100644 --- a/target/arm/tcg/sve_helper.c +++ b/target/arm/tcg/sve_helper.c @@ -28,7 +28,7 @@ #include "tcg/tcg.h" #include "vec_internal.h" #include "sve_ldst_internal.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #ifdef CONFIG_USER_ONLY #include "user/page-protection.h" #endif diff --git a/target/avr/cpu.c b/target/avr/cpu.c index 8a126ff322..2871d30540 100644 --- a/target/avr/cpu.c +++ b/target/avr/cpu.c @@ -102,6 +102,7 @@ static void avr_cpu_reset_hold(Object *obj, ResetType type) static void avr_cpu_disas_set_info(CPUState *cpu, disassemble_info *info) { + info->endian = BFD_ENDIAN_LITTLE; info->mach = bfd_arch_avr; info->print_insn = avr_print_insn; } @@ -203,7 +204,7 @@ static const struct SysemuCPUOps avr_sysemu_ops = { .get_phys_page_debug = avr_cpu_get_phys_page_debug, }; -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps avr_tcg_ops = { .initialize = avr_cpu_tcg_init, diff --git a/target/avr/helper.c b/target/avr/helper.c index 345708a1b3..9ea6870e44 100644 --- a/target/avr/helper.c +++ b/target/avr/helper.c @@ -22,7 +22,7 @@ #include "qemu/log.h" #include "qemu/error-report.h" #include "cpu.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include "exec/exec-all.h" #include "exec/page-protection.h" #include "exec/cpu_ldst.h" diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c index 0b7fc98f6c..a9beb9a175 100644 --- a/target/hexagon/cpu.c +++ b/target/hexagon/cpu.c @@ -293,6 +293,7 @@ static void hexagon_cpu_reset_hold(Object *obj, ResetType type) static void hexagon_cpu_disas_set_info(CPUState *s, disassemble_info *info) { info->print_insn = print_insn_hexagon; + info->endian = BFD_ENDIAN_LITTLE; } static void hexagon_cpu_realize(DeviceState *dev, Error **errp) @@ -321,7 +322,7 @@ static void hexagon_cpu_init(Object *obj) { } -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps hexagon_tcg_ops = { .initialize = hexagon_translate_init, diff --git a/target/hexagon/cpu.h b/target/hexagon/cpu.h index 79e60d4bfa..f78c8f9c2a 100644 --- a/target/hexagon/cpu.h +++ b/target/hexagon/cpu.h @@ -26,6 +26,10 @@ #include "mmvec/mmvec.h" #include "hw/registerfields.h" +#ifndef CONFIG_USER_ONLY +#error "Hexagon does not support system emulation" +#endif + #define NUM_PREGS 4 #define TOTAL_PER_THREAD_REGS 64 diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c index 5655677431..d15f8c9c21 100644 --- a/target/hppa/cpu.c +++ b/target/hppa/cpu.c @@ -150,6 +150,7 @@ static int hppa_cpu_mmu_index(CPUState *cs, bool ifetch) static void hppa_cpu_disas_set_info(CPUState *cs, disassemble_info *info) { info->mach = bfd_mach_hppa20; + info->endian = BFD_ENDIAN_BIG; info->print_insn = print_insn_hppa; } @@ -245,7 +246,7 @@ static const struct SysemuCPUOps hppa_sysemu_ops = { }; #endif -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps hppa_tcg_ops = { .initialize = hppa_translate_init, diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 0cd9b70938..b3e1c2bca4 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -6563,7 +6563,7 @@ void x86_cpu_apply_props(X86CPU *cpu, PropValue *props) * Only for builtin_x86_defs models initialized with x86_register_cpudef_types. */ -static void x86_cpu_apply_version_props(X86CPU *cpu, X86CPUModel *model) +static void x86_cpu_apply_version_props(X86CPU *cpu, const X86CPUModel *model) { const X86CPUVersionDefinition *vdef; X86CPUVersion version = x86_cpu_model_resolve_version(model); @@ -6592,7 +6592,7 @@ static void x86_cpu_apply_version_props(X86CPU *cpu, X86CPUModel *model) } static const CPUCaches *x86_cpu_get_versioned_cache_info(X86CPU *cpu, - X86CPUModel *model) + const X86CPUModel *model) { const X86CPUVersionDefinition *vdef; X86CPUVersion version = x86_cpu_model_resolve_version(model); @@ -6620,7 +6620,7 @@ static const CPUCaches *x86_cpu_get_versioned_cache_info(X86CPU *cpu, * Load data from X86CPUDefinition into a X86CPU object. * Only for builtin_x86_defs models initialized with x86_register_cpudef_types. */ -static void x86_cpu_load_model(X86CPU *cpu, X86CPUModel *model) +static void x86_cpu_load_model(X86CPU *cpu, const X86CPUModel *model) { const X86CPUDefinition *def = model->cpudef; CPUX86State *env = &cpu->env; @@ -6690,7 +6690,7 @@ static const gchar *x86_gdb_arch_name(CPUState *cs) static void x86_cpu_cpudef_class_init(ObjectClass *oc, void *data) { - X86CPUModel *model = data; + const X86CPUModel *model = data; X86CPUClass *xcc = X86_CPU_CLASS(oc); CPUClass *cc = CPU_CLASS(oc); @@ -8691,6 +8691,7 @@ static void x86_disas_set_info(CPUState *cs, disassemble_info *info) X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + info->endian = BFD_ENDIAN_LITTLE; info->mach = (env->hflags & HF_CS64_MASK ? bfd_mach_x86_64 : env->hflags & HF_CS32_MASK ? bfd_mach_i386_i386 : bfd_mach_i386_i8086); diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 10ce019e3f..7882b63b9b 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -2327,7 +2327,7 @@ struct X86CPUClass { * CPU definition, automatically loaded by instance_init if not NULL. * Should be eventually replaced by subclass-specific property defaults. */ - X86CPUModel *model; + const X86CPUModel *model; bool host_cpuid_required; int ordering; diff --git a/target/i386/hvf/hvf-cpu.c b/target/i386/hvf/hvf-cpu.c index 560b5a0594..b5f4c80028 100644 --- a/target/i386/hvf/hvf-cpu.c +++ b/target/i386/hvf/hvf-cpu.c @@ -14,7 +14,7 @@ #include "system/system.h" #include "hw/boards.h" #include "system/hvf.h" -#include "hw/core/accel-cpu.h" +#include "accel/accel-cpu-target.h" #include "hvf-i386.h" static void hvf_cpu_max_instance_init(X86CPU *cpu) diff --git a/target/i386/hvf/x86_decode.c b/target/i386/hvf/x86_decode.c index d6d5894e54..5fea2dd3cc 100644 --- a/target/i386/hvf/x86_decode.c +++ b/target/i386/hvf/x86_decode.c @@ -61,8 +61,8 @@ uint64_t sign(uint64_t val, int size) static inline uint64_t decode_bytes(CPUX86State *env, struct x86_decode *decode, int size) { - target_ulong val = 0; - + uint64_t val = 0; + switch (size) { case 1: case 2: diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c index 1bda403f88..6269fa8045 100644 --- a/target/i386/kvm/kvm-cpu.c +++ b/target/i386/kvm/kvm-cpu.c @@ -15,7 +15,7 @@ #include "hw/boards.h" #include "kvm_i386.h" -#include "hw/core/accel-cpu.h" +#include "accel/accel-cpu-target.h" static void kvm_set_guest_phys_bits(CPUState *cs) { diff --git a/target/i386/nvmm/nvmm-accel-ops.c b/target/i386/nvmm/nvmm-accel-ops.c index e7b56662fe..4e4e63de78 100644 --- a/target/i386/nvmm/nvmm-accel-ops.c +++ b/target/i386/nvmm/nvmm-accel-ops.c @@ -10,6 +10,7 @@ #include "qemu/osdep.h" #include "system/kvm_int.h" #include "qemu/main-loop.h" +#include "system/accel-ops.h" #include "system/cpus.h" #include "qemu/guest-random.h" diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c index 14ee038079..b8aff825ee 100644 --- a/target/i386/tcg/tcg-cpu.c +++ b/target/i386/tcg/tcg-cpu.c @@ -21,7 +21,7 @@ #include "cpu.h" #include "helper-tcg.h" #include "qemu/accel.h" -#include "hw/core/accel-cpu.h" +#include "accel/accel-cpu-target.h" #include "exec/translation-block.h" #include "tcg-cpu.h" @@ -105,7 +105,7 @@ static bool x86_debug_check_breakpoint(CPUState *cs) } #endif -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps x86_tcg_ops = { .initialize = tcg_x86_init, diff --git a/target/i386/whpx/whpx-accel-ops.c b/target/i386/whpx/whpx-accel-ops.c index ab2e014c9e..81fdd06e48 100644 --- a/target/i386/whpx/whpx-accel-ops.c +++ b/target/i386/whpx/whpx-accel-ops.c @@ -11,6 +11,7 @@ #include "qemu/osdep.h" #include "system/kvm_int.h" #include "qemu/main-loop.h" +#include "system/accel-ops.h" #include "system/cpus.h" #include "qemu/guest-random.h" diff --git a/target/i386/whpx/whpx-apic.c b/target/i386/whpx/whpx-apic.c index 4245ab68a2..630a9616d7 100644 --- a/target/i386/whpx/whpx-apic.c +++ b/target/i386/whpx/whpx-apic.c @@ -231,7 +231,7 @@ static void whpx_apic_mem_write(void *opaque, hwaddr addr, static const MemoryRegionOps whpx_apic_io_ops = { .read = whpx_apic_mem_read, .write = whpx_apic_mem_write, - .endianness = DEVICE_NATIVE_ENDIAN, + .endianness = DEVICE_LITTLE_ENDIAN, }; static void whpx_apic_reset(APICCommonState *s) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 0486853048..49f603149d 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -629,6 +629,7 @@ static void loongarch_cpu_reset_hold(Object *obj, ResetType type) static void loongarch_cpu_disas_set_info(CPUState *s, disassemble_info *info) { + info->endian = BFD_ENDIAN_LITTLE; info->print_insn = print_insn_loongarch; } @@ -862,7 +863,7 @@ static void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags) } #ifdef CONFIG_TCG -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps loongarch_tcg_ops = { .initialize = loongarch_translate_init, diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c index 2617d8f6ed..df8b9c53fc 100644 --- a/target/m68k/cpu.c +++ b/target/m68k/cpu.c @@ -157,6 +157,7 @@ static void m68k_cpu_reset_hold(Object *obj, ResetType type) static void m68k_cpu_disas_set_info(CPUState *s, disassemble_info *info) { info->print_insn = print_insn_m68k; + info->endian = BFD_ENDIAN_BIG; info->mach = 0; } @@ -582,7 +583,7 @@ static const struct SysemuCPUOps m68k_sysemu_ops = { }; #endif /* !CONFIG_USER_ONLY */ -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps m68k_tcg_ops = { .initialize = m68k_tcg_init, diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c index f114789abd..d5ee1244ca 100644 --- a/target/microblaze/cpu.c +++ b/target/microblaze/cpu.c @@ -224,6 +224,8 @@ static void mb_disas_set_info(CPUState *cpu, disassemble_info *info) { info->mach = bfd_arch_microblaze; info->print_insn = print_insn_microblaze; + info->endian = TARGET_BIG_ENDIAN ? BFD_ENDIAN_BIG + : BFD_ENDIAN_LITTLE; } static void mb_cpu_realizefn(DeviceState *dev, Error **errp) @@ -419,7 +421,7 @@ static const struct SysemuCPUOps mb_sysemu_ops = { }; #endif -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps mb_tcg_ops = { .initialize = mb_tcg_init, diff --git a/target/microblaze/cpu.h b/target/microblaze/cpu.h index f6879eee35..e44ddd5307 100644 --- a/target/microblaze/cpu.h +++ b/target/microblaze/cpu.h @@ -414,6 +414,13 @@ void mb_translate_code(CPUState *cs, TranslationBlock *tb, /* Ensure there is no overlap between the two masks. */ QEMU_BUILD_BUG_ON(MSR_TB_MASK & IFLAGS_TB_MASK); +static inline bool mb_cpu_is_big_endian(CPUState *cs) +{ + MicroBlazeCPU *cpu = MICROBLAZE_CPU(cs); + + return !cpu->cfg.endi; +} + static inline void cpu_get_tb_cpu_state(CPUMBState *env, vaddr *pc, uint64_t *cs_base, uint32_t *flags) { diff --git a/target/microblaze/gdbstub.c b/target/microblaze/gdbstub.c index 09d74e164d..d493681d38 100644 --- a/target/microblaze/gdbstub.c +++ b/target/microblaze/gdbstub.c @@ -110,14 +110,9 @@ int mb_cpu_gdb_read_stack_protect(CPUState *cs, GByteArray *mem_buf, int n) int mb_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n) { - CPUClass *cc = CPU_GET_CLASS(cs); CPUMBState *env = cpu_env(cs); uint32_t tmp; - if (n > cc->gdb_num_core_regs) { - return 0; - } - tmp = ldl_p(mem_buf); switch (n) { diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c index 24005f05b2..b54e5ac4b2 100644 --- a/target/microblaze/translate.c +++ b/target/microblaze/translate.c @@ -708,11 +708,18 @@ static void record_unaligned_ess(DisasContext *dc, int rd, } #endif +static inline MemOp mo_endian(DisasContext *dc) +{ + return dc->cfg->endi ? MO_LE : MO_BE; +} + static bool do_load(DisasContext *dc, int rd, TCGv addr, MemOp mop, int mem_index, bool rev) { MemOp size = mop & MO_SIZE; + mop |= mo_endian(dc); + /* * When doing reverse accesses we need to do two things. * @@ -780,13 +787,13 @@ static bool trans_lbui(DisasContext *dc, arg_typeb *arg) static bool trans_lhu(DisasContext *dc, arg_typea *arg) { TCGv addr = compute_ldst_addr_typea(dc, arg->ra, arg->rb); - return do_load(dc, arg->rd, addr, MO_TEUW, dc->mem_index, false); + return do_load(dc, arg->rd, addr, MO_UW, dc->mem_index, false); } static bool trans_lhur(DisasContext *dc, arg_typea *arg) { TCGv addr = compute_ldst_addr_typea(dc, arg->ra, arg->rb); - return do_load(dc, arg->rd, addr, MO_TEUW, dc->mem_index, true); + return do_load(dc, arg->rd, addr, MO_UW, dc->mem_index, true); } static bool trans_lhuea(DisasContext *dc, arg_typea *arg) @@ -798,26 +805,26 @@ static bool trans_lhuea(DisasContext *dc, arg_typea *arg) return true; #else TCGv addr = compute_ldst_addr_ea(dc, arg->ra, arg->rb); - return do_load(dc, arg->rd, addr, MO_TEUW, MMU_NOMMU_IDX, false); + return do_load(dc, arg->rd, addr, MO_UW, MMU_NOMMU_IDX, false); #endif } static bool trans_lhui(DisasContext *dc, arg_typeb *arg) { TCGv addr = compute_ldst_addr_typeb(dc, arg->ra, arg->imm); - return do_load(dc, arg->rd, addr, MO_TEUW, dc->mem_index, false); + return do_load(dc, arg->rd, addr, MO_UW, dc->mem_index, false); } static bool trans_lw(DisasContext *dc, arg_typea *arg) { TCGv addr = compute_ldst_addr_typea(dc, arg->ra, arg->rb); - return do_load(dc, arg->rd, addr, MO_TEUL, dc->mem_index, false); + return do_load(dc, arg->rd, addr, MO_UL, dc->mem_index, false); } static bool trans_lwr(DisasContext *dc, arg_typea *arg) { TCGv addr = compute_ldst_addr_typea(dc, arg->ra, arg->rb); - return do_load(dc, arg->rd, addr, MO_TEUL, dc->mem_index, true); + return do_load(dc, arg->rd, addr, MO_UL, dc->mem_index, true); } static bool trans_lwea(DisasContext *dc, arg_typea *arg) @@ -829,14 +836,14 @@ static bool trans_lwea(DisasContext *dc, arg_typea *arg) return true; #else TCGv addr = compute_ldst_addr_ea(dc, arg->ra, arg->rb); - return do_load(dc, arg->rd, addr, MO_TEUL, MMU_NOMMU_IDX, false); + return do_load(dc, arg->rd, addr, MO_UL, MMU_NOMMU_IDX, false); #endif } static bool trans_lwi(DisasContext *dc, arg_typeb *arg) { TCGv addr = compute_ldst_addr_typeb(dc, arg->ra, arg->imm); - return do_load(dc, arg->rd, addr, MO_TEUL, dc->mem_index, false); + return do_load(dc, arg->rd, addr, MO_UL, dc->mem_index, false); } static bool trans_lwx(DisasContext *dc, arg_typea *arg) @@ -846,7 +853,8 @@ static bool trans_lwx(DisasContext *dc, arg_typea *arg) /* lwx does not throw unaligned access errors, so force alignment */ tcg_gen_andi_tl(addr, addr, ~3); - tcg_gen_qemu_ld_i32(cpu_res_val, addr, dc->mem_index, MO_TEUL); + tcg_gen_qemu_ld_i32(cpu_res_val, addr, dc->mem_index, + mo_endian(dc) | MO_UL); tcg_gen_mov_tl(cpu_res_addr, addr); if (arg->rd) { @@ -863,6 +871,8 @@ static bool do_store(DisasContext *dc, int rd, TCGv addr, MemOp mop, { MemOp size = mop & MO_SIZE; + mop |= mo_endian(dc); + /* * When doing reverse accesses we need to do two things. * @@ -930,13 +940,13 @@ static bool trans_sbi(DisasContext *dc, arg_typeb *arg) static bool trans_sh(DisasContext *dc, arg_typea *arg) { TCGv addr = compute_ldst_addr_typea(dc, arg->ra, arg->rb); - return do_store(dc, arg->rd, addr, MO_TEUW, dc->mem_index, false); + return do_store(dc, arg->rd, addr, MO_UW, dc->mem_index, false); } static bool trans_shr(DisasContext *dc, arg_typea *arg) { TCGv addr = compute_ldst_addr_typea(dc, arg->ra, arg->rb); - return do_store(dc, arg->rd, addr, MO_TEUW, dc->mem_index, true); + return do_store(dc, arg->rd, addr, MO_UW, dc->mem_index, true); } static bool trans_shea(DisasContext *dc, arg_typea *arg) @@ -948,26 +958,26 @@ static bool trans_shea(DisasContext *dc, arg_typea *arg) return true; #else TCGv addr = compute_ldst_addr_ea(dc, arg->ra, arg->rb); - return do_store(dc, arg->rd, addr, MO_TEUW, MMU_NOMMU_IDX, false); + return do_store(dc, arg->rd, addr, MO_UW, MMU_NOMMU_IDX, false); #endif } static bool trans_shi(DisasContext *dc, arg_typeb *arg) { TCGv addr = compute_ldst_addr_typeb(dc, arg->ra, arg->imm); - return do_store(dc, arg->rd, addr, MO_TEUW, dc->mem_index, false); + return do_store(dc, arg->rd, addr, MO_UW, dc->mem_index, false); } static bool trans_sw(DisasContext *dc, arg_typea *arg) { TCGv addr = compute_ldst_addr_typea(dc, arg->ra, arg->rb); - return do_store(dc, arg->rd, addr, MO_TEUL, dc->mem_index, false); + return do_store(dc, arg->rd, addr, MO_UL, dc->mem_index, false); } static bool trans_swr(DisasContext *dc, arg_typea *arg) { TCGv addr = compute_ldst_addr_typea(dc, arg->ra, arg->rb); - return do_store(dc, arg->rd, addr, MO_TEUL, dc->mem_index, true); + return do_store(dc, arg->rd, addr, MO_UL, dc->mem_index, true); } static bool trans_swea(DisasContext *dc, arg_typea *arg) @@ -979,14 +989,14 @@ static bool trans_swea(DisasContext *dc, arg_typea *arg) return true; #else TCGv addr = compute_ldst_addr_ea(dc, arg->ra, arg->rb); - return do_store(dc, arg->rd, addr, MO_TEUL, MMU_NOMMU_IDX, false); + return do_store(dc, arg->rd, addr, MO_UL, MMU_NOMMU_IDX, false); #endif } static bool trans_swi(DisasContext *dc, arg_typeb *arg) { TCGv addr = compute_ldst_addr_typeb(dc, arg->ra, arg->imm); - return do_store(dc, arg->rd, addr, MO_TEUL, dc->mem_index, false); + return do_store(dc, arg->rd, addr, MO_UL, dc->mem_index, false); } static bool trans_swx(DisasContext *dc, arg_typea *arg) @@ -1015,7 +1025,7 @@ static bool trans_swx(DisasContext *dc, arg_typea *arg) tcg_gen_atomic_cmpxchg_i32(tval, cpu_res_addr, cpu_res_val, reg_for_write(dc, arg->rd), - dc->mem_index, MO_TEUL); + dc->mem_index, mo_endian(dc) | MO_UL); tcg_gen_brcond_i32(TCG_COND_NE, cpu_res_val, tval, swx_fail); @@ -1637,7 +1647,8 @@ static void mb_tr_translate_insn(DisasContextBase *dcb, CPUState *cs) dc->tb_flags_to_set = 0; - ir = translator_ldl(cpu_env(cs), &dc->base, dc->base.pc_next); + ir = translator_ldl_swap(cpu_env(cs), &dc->base, dc->base.pc_next, + mb_cpu_is_big_endian(cs) != TARGET_BIG_ENDIAN); if (!decode(dc, ir)) { trap_illegal(dc, true); } diff --git a/target/mips/cpu.c b/target/mips/cpu.c index 47cd7cfdce..e76298699a 100644 --- a/target/mips/cpu.c +++ b/target/mips/cpu.c @@ -428,13 +428,13 @@ static void mips_cpu_reset_hold(Object *obj, ResetType type) static void mips_cpu_disas_set_info(CPUState *s, disassemble_info *info) { if (!(cpu_env(s)->insn_flags & ISA_NANOMIPS32)) { -#if TARGET_BIG_ENDIAN - info->print_insn = print_insn_big_mips; -#else - info->print_insn = print_insn_little_mips; -#endif + info->endian = TARGET_BIG_ENDIAN ? BFD_ENDIAN_BIG + : BFD_ENDIAN_LITTLE; + info->print_insn = TARGET_BIG_ENDIAN ? print_insn_big_mips + : print_insn_little_mips; } else { info->print_insn = print_insn_nanomips; + info->endian = BFD_ENDIAN_LITTLE; } } @@ -544,7 +544,7 @@ static const Property mips_cpu_properties[] = { }; #ifdef CONFIG_TCG -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps mips_tcg_ops = { .initialize = mips_tcg_init, .translate_code = mips_translate_code, diff --git a/target/mips/tcg/msa_helper.c b/target/mips/tcg/msa_helper.c index ec38d9fde5..74fb80cc25 100644 --- a/target/mips/tcg/msa_helper.c +++ b/target/mips/tcg/msa_helper.c @@ -5577,7 +5577,7 @@ static inline int64_t msa_mulr_q_df(uint32_t df, int64_t arg1, int64_t arg2) { int64_t q_min = DF_MIN_INT(df); int64_t q_max = DF_MAX_INT(df); - int64_t r_bit = 1 << (DF_BITS(df) - 2); + int64_t r_bit = 1LL << (DF_BITS(df) - 2); if (arg1 == q_min && arg2 == q_min) { return q_max; @@ -5685,7 +5685,7 @@ static inline int64_t msa_maddr_q_df(uint32_t df, int64_t dest, int64_t arg1, int64_t q_max = DF_MAX_INT(df); int64_t q_min = DF_MIN_INT(df); - int64_t r_bit = 1 << (DF_BITS(df) - 2); + int64_t r_bit = 1LL << (DF_BITS(df) - 2); q_prod = arg1 * arg2; q_ret = ((dest << (DF_BITS(df) - 1)) + q_prod + r_bit) >> (DF_BITS(df) - 1); @@ -5700,7 +5700,7 @@ static inline int64_t msa_msubr_q_df(uint32_t df, int64_t dest, int64_t arg1, int64_t q_max = DF_MAX_INT(df); int64_t q_min = DF_MIN_INT(df); - int64_t r_bit = 1 << (DF_BITS(df) - 2); + int64_t r_bit = 1LL << (DF_BITS(df) - 2); q_prod = arg1 * arg2; q_ret = ((dest << (DF_BITS(df) - 1)) - q_prod + r_bit) >> (DF_BITS(df) - 1); diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c index b7bab0d7ab..e8c357ae83 100644 --- a/target/openrisc/cpu.c +++ b/target/openrisc/cpu.c @@ -83,6 +83,7 @@ static int openrisc_cpu_mmu_index(CPUState *cs, bool ifetch) static void openrisc_disas_set_info(CPUState *cpu, disassemble_info *info) { + info->endian = BFD_ENDIAN_BIG; info->print_insn = print_insn_or1k; } @@ -165,6 +166,10 @@ static void openrisc_cpu_realizefn(DeviceState *dev, Error **errp) qemu_init_vcpu(cs); cpu_reset(cs); +#ifndef CONFIG_USER_ONLY + cpu_openrisc_clock_init(OPENRISC_CPU(dev)); +#endif + occ->parent_realize(dev, errp); } @@ -232,7 +237,7 @@ static const struct SysemuCPUOps openrisc_sysemu_ops = { }; #endif -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps openrisc_tcg_ops = { .initialize = openrisc_translate_init, diff --git a/target/openrisc/gdbstub.c b/target/openrisc/gdbstub.c index c2a77d5d4d..45bba80d87 100644 --- a/target/openrisc/gdbstub.c +++ b/target/openrisc/gdbstub.c @@ -47,14 +47,9 @@ int openrisc_cpu_gdb_read_register(CPUState *cs, GByteArray *mem_buf, int n) int openrisc_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n) { - CPUClass *cc = CPU_GET_CLASS(cs); CPUOpenRISCState *env = cpu_env(cs); uint32_t tmp; - if (n > cc->gdb_num_core_regs) { - return 0; - } - tmp = ldl_p(mem_buf); if (n < 32) { diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c index 062a6e85fb..b9772c53ec 100644 --- a/target/ppc/cpu_init.c +++ b/target/ppc/cpu_init.c @@ -7401,6 +7401,8 @@ static void ppc_disas_set_info(CPUState *cs, disassemble_info *info) if ((env->hflags >> MSR_LE) & 1) { info->endian = BFD_ENDIAN_LITTLE; + } else { + info->endian = BFD_ENDIAN_BIG; } info->mach = env->bfd_mach; if (!env->bfd_mach) { @@ -7430,7 +7432,7 @@ static const struct SysemuCPUOps ppc_sysemu_ops = { #endif #ifdef CONFIG_TCG -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps ppc_tcg_ops = { .initialize = ppc_translate_init, diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index 966c2c6572..216638dee4 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -49,7 +49,7 @@ #include "elf.h" #include "system/kvm_int.h" #include "system/kvm.h" -#include "hw/core/accel-cpu.h" +#include "accel/accel-cpu-target.h" #include CONFIG_DEVICES diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c index 47424fd5e2..1c000c30f8 100644 --- a/target/riscv/cpu.c +++ b/target/riscv/cpu.c @@ -1152,6 +1152,15 @@ static void riscv_cpu_disas_set_info(CPUState *s, disassemble_info *info) CPURISCVState *env = &cpu->env; info->target_info = &cpu->cfg; + /* + * A couple of bits in MSTATUS set the endianness: + * - MSTATUS_UBE (User-mode), + * - MSTATUS_SBE (Supervisor-mode), + * - MSTATUS_MBE (Machine-mode) + * but we don't implement that yet. + */ + info->endian = BFD_ENDIAN_LITTLE; + switch (env->xl) { case MXL_RV32: info->print_insn = print_insn_riscv32; @@ -1816,7 +1825,8 @@ static void prop_pmu_num_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_pmu_num = { - .name = "pmu-num", + .type = "int8", + .description = "pmu-num", .get = prop_pmu_num_get, .set = prop_pmu_num_set, }; @@ -1857,7 +1867,8 @@ static void prop_pmu_mask_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_pmu_mask = { - .name = "pmu-mask", + .type = "int8", + .description = "pmu-mask", .get = prop_pmu_mask_get, .set = prop_pmu_mask_set, }; @@ -1888,7 +1899,8 @@ static void prop_mmu_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_mmu = { - .name = "mmu", + .type = "bool", + .description = "mmu", .get = prop_mmu_get, .set = prop_mmu_set, }; @@ -1919,7 +1931,8 @@ static void prop_pmp_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_pmp = { - .name = "pmp", + .type = "bool", + .description = "pmp", .get = prop_pmp_get, .set = prop_pmp_set, }; @@ -1993,7 +2006,9 @@ static void prop_priv_spec_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_priv_spec = { - .name = "priv_spec", + .type = "str", + .description = "priv_spec", + /* FIXME enum? */ .get = prop_priv_spec_get, .set = prop_priv_spec_set, }; @@ -2024,7 +2039,9 @@ static void prop_vext_spec_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_vext_spec = { - .name = "vext_spec", + .type = "str", + .description = "vext_spec", + /* FIXME enum? */ .get = prop_vext_spec_get, .set = prop_vext_spec_set, }; @@ -2065,7 +2082,8 @@ static void prop_vlen_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_vlen = { - .name = "vlen", + .type = "uint16", + .description = "vlen", .get = prop_vlen_get, .set = prop_vlen_set, }; @@ -2105,7 +2123,8 @@ static void prop_elen_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_elen = { - .name = "elen", + .type = "uint16", + .description = "elen", .get = prop_elen_get, .set = prop_elen_set, }; @@ -2140,7 +2159,8 @@ static void prop_cbom_blksize_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_cbom_blksize = { - .name = "cbom_blocksize", + .type = "uint16", + .description = "cbom_blocksize", .get = prop_cbom_blksize_get, .set = prop_cbom_blksize_set, }; @@ -2175,7 +2195,8 @@ static void prop_cbop_blksize_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_cbop_blksize = { - .name = "cbop_blocksize", + .type = "uint16", + .description = "cbop_blocksize", .get = prop_cbop_blksize_get, .set = prop_cbop_blksize_set, }; @@ -2210,7 +2231,8 @@ static void prop_cboz_blksize_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_cboz_blksize = { - .name = "cboz_blocksize", + .type = "uint16", + .description = "cboz_blocksize", .get = prop_cboz_blksize_get, .set = prop_cboz_blksize_set, }; @@ -2245,7 +2267,8 @@ static void prop_mvendorid_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_mvendorid = { - .name = "mvendorid", + .type = "uint32", + .description = "mvendorid", .get = prop_mvendorid_get, .set = prop_mvendorid_set, }; @@ -2280,7 +2303,8 @@ static void prop_mimpid_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_mimpid = { - .name = "mimpid", + .type = "uint64", + .description = "mimpid", .get = prop_mimpid_get, .set = prop_mimpid_set, }; @@ -2336,7 +2360,8 @@ static void prop_marchid_get(Object *obj, Visitor *v, const char *name, } static const PropertyInfo prop_marchid = { - .name = "marchid", + .type = "uint64", + .description = "marchid", .get = prop_marchid_get, .set = prop_marchid_set, }; @@ -3047,7 +3072,7 @@ static void riscv_cpu_class_init(ObjectClass *c, void *data) { RISCVCPUClass *mcc = RISCV_CPU_CLASS(c); - mcc->misa_mxl_max = (uint32_t)(uintptr_t)data; + mcc->misa_mxl_max = (RISCVMXL)GPOINTER_TO_UINT(data); riscv_cpu_validate_misa_mxl(mcc); } @@ -3149,7 +3174,7 @@ void riscv_isa_write_fdt(RISCVCPU *cpu, void *fdt, char *nodename) .parent = TYPE_RISCV_DYNAMIC_CPU, \ .instance_init = (initfn), \ .class_init = riscv_cpu_class_init, \ - .class_data = (void *)(misa_mxl_max) \ + .class_data = GUINT_TO_POINTER(misa_mxl_max) \ } #define DEFINE_VENDOR_CPU(type_name, misa_mxl_max, initfn) \ @@ -3158,7 +3183,7 @@ void riscv_isa_write_fdt(RISCVCPU *cpu, void *fdt, char *nodename) .parent = TYPE_RISCV_VENDOR_CPU, \ .instance_init = (initfn), \ .class_init = riscv_cpu_class_init, \ - .class_data = (void *)(misa_mxl_max) \ + .class_data = GUINT_TO_POINTER(misa_mxl_max) \ } #define DEFINE_BARE_CPU(type_name, misa_mxl_max, initfn) \ @@ -3167,7 +3192,7 @@ void riscv_isa_write_fdt(RISCVCPU *cpu, void *fdt, char *nodename) .parent = TYPE_RISCV_BARE_CPU, \ .instance_init = (initfn), \ .class_init = riscv_cpu_class_init, \ - .class_data = (void *)(misa_mxl_max) \ + .class_data = GUINT_TO_POINTER(misa_mxl_max) \ } #define DEFINE_PROFILE_CPU(type_name, misa_mxl_max, initfn) \ @@ -3176,7 +3201,7 @@ void riscv_isa_write_fdt(RISCVCPU *cpu, void *fdt, char *nodename) .parent = TYPE_RISCV_BARE_CPU, \ .instance_init = (initfn), \ .class_init = riscv_cpu_class_init, \ - .class_data = (void *)(misa_mxl_max) \ + .class_data = GUINT_TO_POINTER(misa_mxl_max) \ } static const TypeInfo riscv_cpu_type_infos[] = { diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h index 616c3bdc1c..7de19b4183 100644 --- a/target/riscv/cpu.h +++ b/target/riscv/cpu.h @@ -539,7 +539,7 @@ struct RISCVCPUClass { DeviceRealize parent_realize; ResettablePhases parent_phases; - uint32_t misa_mxl_max; /* max mxl for this cpu */ + RISCVMXL misa_mxl_max; /* max mxl for this cpu */ }; static inline int riscv_has_ext(CPURISCVState *env, target_ulong ext) diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c index 3f5fd861a8..34092f372d 100644 --- a/target/riscv/cpu_helper.c +++ b/target/riscv/cpu_helper.c @@ -27,7 +27,7 @@ #include "exec/page-protection.h" #include "instmap.h" #include "tcg/tcg-op.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include "trace.h" #include "semihosting/common-semi.h" #include "system/cpu-timers.h" diff --git a/target/riscv/kvm/kvm-cpu.c b/target/riscv/kvm/kvm-cpu.c index 7f3b59cb72..4ffeeaa1c9 100644 --- a/target/riscv/kvm/kvm-cpu.c +++ b/target/riscv/kvm/kvm-cpu.c @@ -32,7 +32,7 @@ #include "system/kvm_int.h" #include "cpu.h" #include "trace.h" -#include "hw/core/accel-cpu.h" +#include "accel/accel-cpu-target.h" #include "hw/pci/pci.h" #include "exec/memattrs.h" #include "exec/address-spaces.h" diff --git a/target/riscv/tcg/tcg-cpu.c b/target/riscv/tcg/tcg-cpu.c index f1d971eec1..5aef9eef36 100644 --- a/target/riscv/tcg/tcg-cpu.c +++ b/target/riscv/tcg/tcg-cpu.c @@ -30,8 +30,8 @@ #include "qemu/accel.h" #include "qemu/error-report.h" #include "qemu/log.h" -#include "hw/core/accel-cpu.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/accel-cpu-target.h" +#include "accel/tcg/cpu-ops.h" #include "tcg/tcg.h" #ifndef CONFIG_USER_ONLY #include "hw/boards.h" diff --git a/target/rx/cpu.c b/target/rx/cpu.c index 37a6fdd569..1c40c8977e 100644 --- a/target/rx/cpu.c +++ b/target/rx/cpu.c @@ -168,6 +168,7 @@ static void rx_cpu_set_irq(void *opaque, int no, int request) static void rx_cpu_disas_set_info(CPUState *cpu, disassemble_info *info) { + info->endian = BFD_ENDIAN_LITTLE; info->mach = bfd_mach_rx; info->print_insn = print_insn_rx; } @@ -192,15 +193,13 @@ static void rx_cpu_init(Object *obj) qdev_init_gpio_in(DEVICE(cpu), rx_cpu_set_irq, 2); } -#ifndef CONFIG_USER_ONLY #include "hw/core/sysemu-cpu-ops.h" static const struct SysemuCPUOps rx_sysemu_ops = { .get_phys_page_debug = rx_cpu_get_phys_page_debug, }; -#endif -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps rx_tcg_ops = { .initialize = rx_translate_init, @@ -209,11 +208,9 @@ static const TCGCPUOps rx_tcg_ops = { .restore_state_to_opc = rx_restore_state_to_opc, .tlb_fill = rx_cpu_tlb_fill, -#ifndef CONFIG_USER_ONLY .cpu_exec_interrupt = rx_cpu_exec_interrupt, .cpu_exec_halt = rx_cpu_has_work, .do_interrupt = rx_cpu_do_interrupt, -#endif /* !CONFIG_USER_ONLY */ }; static void rx_cpu_class_init(ObjectClass *klass, void *data) @@ -235,9 +232,7 @@ static void rx_cpu_class_init(ObjectClass *klass, void *data) cc->set_pc = rx_cpu_set_pc; cc->get_pc = rx_cpu_get_pc; -#ifndef CONFIG_USER_ONLY cc->sysemu_ops = &rx_sysemu_ops; -#endif cc->gdb_read_register = rx_cpu_gdb_read_register; cc->gdb_write_register = rx_cpu_gdb_write_register; cc->disas_set_info = rx_cpu_disas_set_info; diff --git a/target/rx/cpu.h b/target/rx/cpu.h index 5ba1874bd7..349d61c4e4 100644 --- a/target/rx/cpu.h +++ b/target/rx/cpu.h @@ -26,6 +26,10 @@ #include "exec/cpu-defs.h" #include "qemu/cpu-float.h" +#ifdef CONFIG_USER_ONLY +#error "RX does not support user mode emulation" +#endif + /* PSW define */ REG32(PSW, 0) FIELD(PSW, C, 0, 1) @@ -129,11 +133,9 @@ struct RXCPUClass { #define CPU_RESOLVING_TYPE TYPE_RX_CPU const char *rx_crname(uint8_t cr); -#ifndef CONFIG_USER_ONLY void rx_cpu_do_interrupt(CPUState *cpu); bool rx_cpu_exec_interrupt(CPUState *cpu, int int_req); hwaddr rx_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr); -#endif /* !CONFIG_USER_ONLY */ void rx_cpu_dump_state(CPUState *cpu, FILE *f, int flags); int rx_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg); int rx_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg); diff --git a/target/rx/helper.c b/target/rx/helper.c index 80912e8dcb..7f28e72989 100644 --- a/target/rx/helper.c +++ b/target/rx/helper.c @@ -40,8 +40,6 @@ void rx_cpu_unpack_psw(CPURXState *env, uint32_t psw, int rte) env->psw_c = FIELD_EX32(psw, PSW, C); } -#ifndef CONFIG_USER_ONLY - #define INT_FLAGS (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIR) void rx_cpu_do_interrupt(CPUState *cs) { @@ -146,5 +144,3 @@ hwaddr rx_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) { return addr; } - -#endif /* !CONFIG_USER_ONLY */ diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c index 97d41c23de..972d265478 100644 --- a/target/s390x/cpu.c +++ b/target/s390x/cpu.c @@ -243,6 +243,7 @@ static void s390_cpu_disas_set_info(CPUState *cpu, disassemble_info *info) { info->mach = bfd_mach_s390_64; info->cap_arch = CS_ARCH_SYSZ; + info->endian = BFD_ENDIAN_BIG; info->cap_insn_unit = 2; info->cap_insn_split = 6; } @@ -322,7 +323,7 @@ static const Property s390x_cpu_properties[] = { #endif #ifdef CONFIG_TCG -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" void cpu_get_tb_cpu_state(CPUS390XState *env, vaddr *pc, uint64_t *cs_base, uint32_t *pflags) diff --git a/target/s390x/tcg/mem_helper.c b/target/s390x/tcg/mem_helper.c index c6ab2901e5..ea9fa64d6b 100644 --- a/target/s390x/tcg/mem_helper.c +++ b/target/s390x/tcg/mem_helper.c @@ -28,7 +28,7 @@ #include "exec/exec-all.h" #include "exec/page-protection.h" #include "exec/cpu_ldst.h" -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" #include "qemu/int128.h" #include "qemu/atomic128.h" diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c index ccfe222bdf..c2aaa40a03 100644 --- a/target/sh4/cpu.c +++ b/target/sh4/cpu.c @@ -143,6 +143,8 @@ static void superh_cpu_reset_hold(Object *obj, ResetType type) static void superh_cpu_disas_set_info(CPUState *cpu, disassemble_info *info) { + info->endian = TARGET_BIG_ENDIAN ? BFD_ENDIAN_BIG + : BFD_ENDIAN_LITTLE; info->mach = bfd_mach_sh4; info->print_insn = print_insn_sh; } @@ -256,7 +258,7 @@ static const struct SysemuCPUOps sh4_sysemu_ops = { }; #endif -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps superh_tcg_ops = { .initialize = sh4_translate_init, diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c index fbd38ec334..e27b1fa294 100644 --- a/target/sparc/cpu.c +++ b/target/sparc/cpu.c @@ -106,6 +106,7 @@ static bool sparc_cpu_exec_interrupt(CPUState *cs, int interrupt_request) static void cpu_sparc_disas_set_info(CPUState *cpu, disassemble_info *info) { info->print_insn = print_insn_sparc; + info->endian = BFD_ENDIAN_BIG; #ifdef TARGET_SPARC64 info->mach = bfd_mach_sparc_v9b; #endif @@ -938,7 +939,8 @@ static void sparc_set_nwindows(Object *obj, Visitor *v, const char *name, } static const PropertyInfo qdev_prop_nwindows = { - .name = "int", + .type = "int", + .description = "Number of register windows", .get = sparc_get_nwindows, .set = sparc_set_nwindows, }; @@ -992,7 +994,7 @@ static const struct SysemuCPUOps sparc_sysemu_ops = { #endif #ifdef CONFIG_TCG -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps sparc_tcg_ops = { .initialize = sparc_tcg_init, diff --git a/target/sparc/cpu.h b/target/sparc/cpu.h index dda811503b..462bcb6c0e 100644 --- a/target/sparc/cpu.h +++ b/target/sparc/cpu.h @@ -574,7 +574,7 @@ struct SPARCCPUClass { DeviceRealize parent_realize; ResettablePhases parent_phases; - sparc_def_t *cpu_def; + const sparc_def_t *cpu_def; }; #ifndef CONFIG_USER_ONLY diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c index 95202fadbf..eb794674c8 100644 --- a/target/tricore/cpu.c +++ b/target/tricore/cpu.c @@ -168,7 +168,7 @@ static const struct SysemuCPUOps tricore_sysemu_ops = { .get_phys_page_debug = tricore_cpu_get_phys_page_debug, }; -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps tricore_tcg_ops = { .initialize = tricore_tcg_init, diff --git a/target/tricore/cpu.h b/target/tricore/cpu.h index 8e431d7922..cf9dbc6df8 100644 --- a/target/tricore/cpu.h +++ b/target/tricore/cpu.h @@ -26,6 +26,10 @@ #include "qemu/cpu-float.h" #include "tricore-defs.h" +#ifdef CONFIG_USER_ONLY +#error "TriCore does not support user mode emulation" +#endif + typedef struct CPUArchState { /* GPR Register */ uint32_t gpr_a[16]; diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c index 4eb699d1f4..f9e298ace4 100644 --- a/target/xtensa/cpu.c +++ b/target/xtensa/cpu.c @@ -159,6 +159,8 @@ static void xtensa_cpu_disas_set_info(CPUState *cs, disassemble_info *info) info->private_data = cpu->env.config->isa; info->print_insn = print_insn_xtensa; + info->endian = TARGET_BIG_ENDIAN ? BFD_ENDIAN_BIG + : BFD_ENDIAN_LITTLE; } static void xtensa_cpu_realizefn(DeviceState *dev, Error **errp) @@ -228,7 +230,7 @@ static const struct SysemuCPUOps xtensa_sysemu_ops = { }; #endif -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" static const TCGCPUOps xtensa_tcg_ops = { .initialize = xtensa_translate_init, diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h index 0e6302c5bd..8d70bfc0cd 100644 --- a/target/xtensa/cpu.h +++ b/target/xtensa/cpu.h @@ -490,7 +490,7 @@ typedef struct XtensaConfig { } XtensaConfig; typedef struct XtensaConfigList { - const XtensaConfig *config; + XtensaConfig *config; struct XtensaConfigList *next; } XtensaConfigList; diff --git a/target/xtensa/helper.c b/target/xtensa/helper.c index 2978c471c1..f64699b116 100644 --- a/target/xtensa/helper.c +++ b/target/xtensa/helper.c @@ -173,9 +173,8 @@ static void xtensa_core_class_init(ObjectClass *oc, void *data) { CPUClass *cc = CPU_CLASS(oc); XtensaCPUClass *xcc = XTENSA_CPU_CLASS(oc); - XtensaConfig *config = data; + const XtensaConfig *config = data; - xtensa_finalize_config(config); xcc->config = config; /* @@ -195,6 +194,8 @@ void xtensa_register_core(XtensaConfigList *node) .class_data = (void *)node->config, }; + xtensa_finalize_config(node->config); + node->next = xtensa_cores; xtensa_cores = node; type.name = g_strdup_printf(XTENSA_CPU_TYPE_NAME("%s"), node->config->name); diff --git a/tests/qapi-schema/test-qapi.py b/tests/qapi-schema/test-qapi.py index 7e3f9f4aa1..8fe951c880 100755 --- a/tests/qapi-schema/test-qapi.py +++ b/tests/qapi-schema/test-qapi.py @@ -96,17 +96,8 @@ class QAPISchemaTestVisitor(QAPISchemaVisitor): @staticmethod def _print_if(ifcond, indent=4): - # TODO Drop this hack after replacing OrderedDict by plain - # dict (requires Python 3.7) - def _massage(subcond): - if isinstance(subcond, str): - return subcond - if isinstance(subcond, list): - return [_massage(val) for val in subcond] - return {key: _massage(val) for key, val in subcond.items()} - if ifcond.is_present(): - print('%sif %s' % (' ' * indent, _massage(ifcond.ifcond))) + print('%sif %s' % (' ' * indent, ifcond.ifcond)) @classmethod def _print_features(cls, features, indent=4): diff --git a/tests/qemu-iotests/162 b/tests/qemu-iotests/162 index 94dae60d30..956c2c5f33 100755 --- a/tests/qemu-iotests/162 +++ b/tests/qemu-iotests/162 @@ -65,6 +65,7 @@ done $QEMU_IMG info "json:{'driver': 'nbd', 'host': 'localhost', 'port': $port}" \ | grep '^image' | sed -e "s/$port/PORT/" +_stop_nbd_server # This is a test for NBD's bdrv_refresh_filename() implementation: It expects # either host or path to be set, but it must not assume that they are set to diff --git a/tests/qemu-iotests/302 b/tests/qemu-iotests/302 index a6d79e727b..e980ec513f 100755 --- a/tests/qemu-iotests/302 +++ b/tests/qemu-iotests/302 @@ -115,13 +115,22 @@ with tarfile.open(tar_file, "w") as tar: disk = tarfile.TarInfo("disk") disk.size = actual_size - tar.addfile(disk) - # 6. Shrink the tar to the actual size, aligned to 512 bytes. + # Since python 3.13 we cannot use addfile() to create the member header. + # Add the tarinfo directly using public but undocumented attributes. - tar_size = offset + (disk.size + 511) & ~511 - tar.fileobj.seek(tar_size) - tar.fileobj.truncate(tar_size) + buf = disk.tobuf(tar.format, tar.encoding, tar.errors) + tar.fileobj.write(buf) + tar.members.append(disk) + + # Update the offset and position to the location of the next member. + + tar.offset = offset + (disk.size + 511) & ~511 + tar.fileobj.seek(tar.offset) + + # 6. Shrink the tar to the actual size. + + tar.fileobj.truncate(tar.offset) with tarfile.open(tar_file) as tar: members = [{"name": m.name, "size": m.size, "offset": m.offset_data} diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c index 98ad89b390..7410e6f352 100644 --- a/tests/unit/test-bdrv-drain.c +++ b/tests/unit/test-bdrv-drain.c @@ -28,7 +28,6 @@ #include "system/block-backend.h" #include "qapi/error.h" #include "qemu/main-loop.h" -#include "qemu/clang-tsa.h" #include "iothread.h" static QemuEvent done_event; diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c index 7324ea4a68..2b358eaaa8 100644 --- a/tests/unit/test-block-iothread.c +++ b/tests/unit/test-block-iothread.c @@ -29,7 +29,6 @@ #include "system/block-backend.h" #include "qapi/error.h" #include "qobject/qdict.h" -#include "qemu/clang-tsa.h" #include "qemu/main-loop.h" #include "iothread.h" diff --git a/tests/unit/test-thread-pool.c b/tests/unit/test-thread-pool.c index 1483e53473..33407b595d 100644 --- a/tests/unit/test-thread-pool.c +++ b/tests/unit/test-thread-pool.c @@ -43,10 +43,10 @@ static void done_cb(void *opaque, int ret) active--; } -static void test_submit(void) +static void test_submit_no_complete(void) { WorkerTestData data = { .n = 0 }; - thread_pool_submit(worker_cb, &data); + thread_pool_submit_aio(worker_cb, &data, NULL, NULL); while (data.n == 0) { aio_poll(ctx, true); } @@ -236,7 +236,7 @@ int main(int argc, char **argv) ctx = qemu_get_current_aio_context(); g_test_init(&argc, &argv, NULL); - g_test_add_func("/thread-pool/submit", test_submit); + g_test_add_func("/thread-pool/submit-no-complete", test_submit_no_complete); g_test_add_func("/thread-pool/submit-aio", test_submit_aio); g_test_add_func("/thread-pool/submit-co", test_submit_co); g_test_add_func("/thread-pool/submit-many", test_submit_many); diff --git a/util/async.c b/util/async.c index 0fe2943609..47e3d35a26 100644 --- a/util/async.c +++ b/util/async.c @@ -369,7 +369,7 @@ aio_ctx_finalize(GSource *source) QEMUBH *bh; unsigned flags; - thread_pool_free(ctx->thread_pool); + thread_pool_free_aio(ctx->thread_pool); #ifdef CONFIG_LINUX_AIO if (ctx->linux_aio) { @@ -435,10 +435,10 @@ GSource *aio_get_g_source(AioContext *ctx) return &ctx->source; } -ThreadPool *aio_get_thread_pool(AioContext *ctx) +ThreadPoolAio *aio_get_thread_pool(AioContext *ctx) { if (!ctx->thread_pool) { - ctx->thread_pool = thread_pool_new(ctx); + ctx->thread_pool = thread_pool_new_aio(ctx); } return ctx->thread_pool; } diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c index 6fff4162ac..b2e26e2120 100644 --- a/util/qemu-thread-posix.c +++ b/util/qemu-thread-posix.c @@ -17,7 +17,6 @@ #include "qemu-thread-common.h" #include "qemu/tsan.h" #include "qemu/bitmap.h" -#include "qemu/clang-tsa.h" #ifdef CONFIG_PTHREAD_SET_NAME_NP #include <pthread_np.h> diff --git a/util/thread-pool.c b/util/thread-pool.c index 27eb777e85..d2ead6b728 100644 --- a/util/thread-pool.c +++ b/util/thread-pool.c @@ -23,9 +23,9 @@ #include "block/thread-pool.h" #include "qemu/main-loop.h" -static void do_spawn_thread(ThreadPool *pool); +static void do_spawn_thread(ThreadPoolAio *pool); -typedef struct ThreadPoolElement ThreadPoolElement; +typedef struct ThreadPoolElementAio ThreadPoolElementAio; enum ThreadState { THREAD_QUEUED, @@ -33,9 +33,9 @@ enum ThreadState { THREAD_DONE, }; -struct ThreadPoolElement { +struct ThreadPoolElementAio { BlockAIOCB common; - ThreadPool *pool; + ThreadPoolAio *pool; ThreadPoolFunc *func; void *arg; @@ -47,13 +47,13 @@ struct ThreadPoolElement { int ret; /* Access to this list is protected by lock. */ - QTAILQ_ENTRY(ThreadPoolElement) reqs; + QTAILQ_ENTRY(ThreadPoolElementAio) reqs; /* This list is only written by the thread pool's mother thread. */ - QLIST_ENTRY(ThreadPoolElement) all; + QLIST_ENTRY(ThreadPoolElementAio) all; }; -struct ThreadPool { +struct ThreadPoolAio { AioContext *ctx; QEMUBH *completion_bh; QemuMutex lock; @@ -62,10 +62,10 @@ struct ThreadPool { QEMUBH *new_thread_bh; /* The following variables are only accessed from one AioContext. */ - QLIST_HEAD(, ThreadPoolElement) head; + QLIST_HEAD(, ThreadPoolElementAio) head; /* The following variables are protected by lock. */ - QTAILQ_HEAD(, ThreadPoolElement) request_list; + QTAILQ_HEAD(, ThreadPoolElementAio) request_list; int cur_threads; int idle_threads; int new_threads; /* backlog of threads we need to create */ @@ -76,14 +76,14 @@ struct ThreadPool { static void *worker_thread(void *opaque) { - ThreadPool *pool = opaque; + ThreadPoolAio *pool = opaque; qemu_mutex_lock(&pool->lock); pool->pending_threads--; do_spawn_thread(pool); while (pool->cur_threads <= pool->max_threads) { - ThreadPoolElement *req; + ThreadPoolElementAio *req; int ret; if (QTAILQ_EMPTY(&pool->request_list)) { @@ -131,7 +131,7 @@ static void *worker_thread(void *opaque) return NULL; } -static void do_spawn_thread(ThreadPool *pool) +static void do_spawn_thread(ThreadPoolAio *pool) { QemuThread t; @@ -148,14 +148,14 @@ static void do_spawn_thread(ThreadPool *pool) static void spawn_thread_bh_fn(void *opaque) { - ThreadPool *pool = opaque; + ThreadPoolAio *pool = opaque; qemu_mutex_lock(&pool->lock); do_spawn_thread(pool); qemu_mutex_unlock(&pool->lock); } -static void spawn_thread(ThreadPool *pool) +static void spawn_thread(ThreadPoolAio *pool) { pool->cur_threads++; pool->new_threads++; @@ -173,8 +173,8 @@ static void spawn_thread(ThreadPool *pool) static void thread_pool_completion_bh(void *opaque) { - ThreadPool *pool = opaque; - ThreadPoolElement *elem, *next; + ThreadPoolAio *pool = opaque; + ThreadPoolElementAio *elem, *next; defer_call_begin(); /* cb() may use defer_call() to coalesce work */ @@ -184,8 +184,8 @@ restart: continue; } - trace_thread_pool_complete(pool, elem, elem->common.opaque, - elem->ret); + trace_thread_pool_complete_aio(pool, elem, elem->common.opaque, + elem->ret); QLIST_REMOVE(elem, all); if (elem->common.cb) { @@ -217,10 +217,10 @@ restart: static void thread_pool_cancel(BlockAIOCB *acb) { - ThreadPoolElement *elem = (ThreadPoolElement *)acb; - ThreadPool *pool = elem->pool; + ThreadPoolElementAio *elem = (ThreadPoolElementAio *)acb; + ThreadPoolAio *pool = elem->pool; - trace_thread_pool_cancel(elem, elem->common.opaque); + trace_thread_pool_cancel_aio(elem, elem->common.opaque); QEMU_LOCK_GUARD(&pool->lock); if (elem->state == THREAD_QUEUED) { @@ -234,16 +234,16 @@ static void thread_pool_cancel(BlockAIOCB *acb) } static const AIOCBInfo thread_pool_aiocb_info = { - .aiocb_size = sizeof(ThreadPoolElement), + .aiocb_size = sizeof(ThreadPoolElementAio), .cancel_async = thread_pool_cancel, }; BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, BlockCompletionFunc *cb, void *opaque) { - ThreadPoolElement *req; + ThreadPoolElementAio *req; AioContext *ctx = qemu_get_current_aio_context(); - ThreadPool *pool = aio_get_thread_pool(ctx); + ThreadPoolAio *pool = aio_get_thread_pool(ctx); /* Assert that the thread submitting work is the same running the pool */ assert(pool->ctx == qemu_get_current_aio_context()); @@ -256,7 +256,7 @@ BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, QLIST_INSERT_HEAD(&pool->head, req, all); - trace_thread_pool_submit(pool, req, arg); + trace_thread_pool_submit_aio(pool, req, arg); qemu_mutex_lock(&pool->lock); if (pool->idle_threads == 0 && pool->cur_threads < pool->max_threads) { @@ -290,12 +290,7 @@ int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg) return tpc.ret; } -void thread_pool_submit(ThreadPoolFunc *func, void *arg) -{ - thread_pool_submit_aio(func, arg, NULL, NULL); -} - -void thread_pool_update_params(ThreadPool *pool, AioContext *ctx) +void thread_pool_update_params(ThreadPoolAio *pool, AioContext *ctx) { qemu_mutex_lock(&pool->lock); @@ -322,7 +317,7 @@ void thread_pool_update_params(ThreadPool *pool, AioContext *ctx) qemu_mutex_unlock(&pool->lock); } -static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) +static void thread_pool_init_one(ThreadPoolAio *pool, AioContext *ctx) { if (!ctx) { ctx = qemu_get_aio_context(); @@ -342,14 +337,14 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) thread_pool_update_params(pool, ctx); } -ThreadPool *thread_pool_new(AioContext *ctx) +ThreadPoolAio *thread_pool_new_aio(AioContext *ctx) { - ThreadPool *pool = g_new(ThreadPool, 1); + ThreadPoolAio *pool = g_new(ThreadPoolAio, 1); thread_pool_init_one(pool, ctx); return pool; } -void thread_pool_free(ThreadPool *pool) +void thread_pool_free_aio(ThreadPoolAio *pool) { if (!pool) { return; @@ -379,3 +374,122 @@ void thread_pool_free(ThreadPool *pool) qemu_mutex_destroy(&pool->lock); g_free(pool); } + +struct ThreadPool { + GThreadPool *t; + size_t cur_work; + QemuMutex cur_work_lock; + QemuCond all_finished_cond; +}; + +typedef struct { + ThreadPoolFunc *func; + void *opaque; + GDestroyNotify opaque_destroy; +} ThreadPoolElement; + +static void thread_pool_func(gpointer data, gpointer user_data) +{ + ThreadPool *pool = user_data; + g_autofree ThreadPoolElement *el = data; + + el->func(el->opaque); + + if (el->opaque_destroy) { + el->opaque_destroy(el->opaque); + } + + QEMU_LOCK_GUARD(&pool->cur_work_lock); + + assert(pool->cur_work > 0); + pool->cur_work--; + + if (pool->cur_work == 0) { + qemu_cond_signal(&pool->all_finished_cond); + } +} + +ThreadPool *thread_pool_new(void) +{ + ThreadPool *pool = g_new(ThreadPool, 1); + + pool->cur_work = 0; + qemu_mutex_init(&pool->cur_work_lock); + qemu_cond_init(&pool->all_finished_cond); + + pool->t = g_thread_pool_new(thread_pool_func, pool, 0, TRUE, NULL); + /* + * g_thread_pool_new() can only return errors if initial thread(s) + * creation fails but we ask for 0 initial threads above. + */ + assert(pool->t); + + return pool; +} + +void thread_pool_free(ThreadPool *pool) +{ + /* + * With _wait = TRUE this effectively waits for all + * previously submitted work to complete first. + */ + g_thread_pool_free(pool->t, FALSE, TRUE); + + qemu_cond_destroy(&pool->all_finished_cond); + qemu_mutex_destroy(&pool->cur_work_lock); + + g_free(pool); +} + +void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, + void *opaque, GDestroyNotify opaque_destroy) +{ + ThreadPoolElement *el = g_new(ThreadPoolElement, 1); + + el->func = func; + el->opaque = opaque; + el->opaque_destroy = opaque_destroy; + + WITH_QEMU_LOCK_GUARD(&pool->cur_work_lock) { + pool->cur_work++; + } + + /* + * Ignore the return value since this function can only return errors + * if creation of an additional thread fails but even in this case the + * provided work is still getting queued (just for the existing threads). + */ + g_thread_pool_push(pool->t, el, NULL); +} + +void thread_pool_submit_immediate(ThreadPool *pool, ThreadPoolFunc *func, + void *opaque, GDestroyNotify opaque_destroy) +{ + thread_pool_submit(pool, func, opaque, opaque_destroy); + thread_pool_adjust_max_threads_to_work(pool); +} + +void thread_pool_wait(ThreadPool *pool) +{ + QEMU_LOCK_GUARD(&pool->cur_work_lock); + + while (pool->cur_work > 0) { + qemu_cond_wait(&pool->all_finished_cond, + &pool->cur_work_lock); + } +} + +bool thread_pool_set_max_threads(ThreadPool *pool, + int max_threads) +{ + assert(max_threads > 0); + + return g_thread_pool_set_max_threads(pool->t, max_threads, NULL); +} + +bool thread_pool_adjust_max_threads_to_work(ThreadPool *pool) +{ + QEMU_LOCK_GUARD(&pool->cur_work_lock); + + return thread_pool_set_max_threads(pool, pool->cur_work); +} diff --git a/util/trace-events b/util/trace-events index 49a4962e18..bd8f25fb59 100644 --- a/util/trace-events +++ b/util/trace-events @@ -14,9 +14,9 @@ aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p" reentrant_aio(void *ctx, const char *name) "ctx %p name %s" # thread-pool.c -thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p" -thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d" -thread_pool_cancel(void *req, void *opaque) "req %p opaque %p" +thread_pool_submit_aio(void *pool, void *req, void *opaque) "pool %p req %p opaque %p" +thread_pool_complete_aio(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d" +thread_pool_cancel_aio(void *req, void *opaque) "req %p opaque %p" # buffer.c buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd" |