75 files changed, 2831 insertions, 1182 deletions
diff --git a/.gitlab-ci.d/cirrus.yml b/.gitlab-ci.d/cirrus.yml index 64f2e25afa..b45f9de62f 100644 --- a/.gitlab-ci.d/cirrus.yml +++ b/.gitlab-ci.d/cirrus.yml @@ -52,7 +52,7 @@ x64-freebsd-13-build: NAME: freebsd-13 CIRRUS_VM_INSTANCE_TYPE: freebsd_instance CIRRUS_VM_IMAGE_SELECTOR: image_family - CIRRUS_VM_IMAGE_NAME: freebsd-13-2 + CIRRUS_VM_IMAGE_NAME: freebsd-13-3 CIRRUS_VM_CPUS: 8 CIRRUS_VM_RAM: 8G UPDATE_COMMAND: pkg update; pkg upgrade -y diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index 977576ca14..52239a441f 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -396,6 +396,14 @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env) uint64_t cs_base; uint32_t flags, cflags; + /* + * By definition we've just finished a TB, so I/O is OK. + * Avoid the possibility of calling cpu_io_recompile() if + * a page table walk triggered by tb_lookup() calling + * probe_access_internal() happens to touch an MMIO device. + * The next TB, if we chain to it, will clear the flag again. + */ + cpu->neg.can_do_io = true; cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); cflags = curr_cflags(cpu); diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index 047cd2cc0a..6243bcb179 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -2022,7 +2022,6 @@ static uint64_t do_ld_mmio_beN(CPUState *cpu, CPUTLBEntryFull *full, MemoryRegion *mr; hwaddr mr_offset; MemTxAttrs attrs; - uint64_t ret; tcg_debug_assert(size > 0 && size <= 8); @@ -2030,12 +2029,9 @@ static uint64_t do_ld_mmio_beN(CPUState *cpu, CPUTLBEntryFull *full, section = io_prepare(&mr_offset, cpu, full->xlat_section, attrs, addr, ra); mr = section->mr; - bql_lock(); - ret = int_ld_mmio_beN(cpu, full, ret_be, addr, size, mmu_idx, - type, ra, mr, mr_offset); - bql_unlock(); - - return ret; + BQL_LOCK_GUARD(); + return int_ld_mmio_beN(cpu, full, ret_be, addr, size, mmu_idx, + type, ra, mr, mr_offset); } static Int128 do_ld16_mmio_beN(CPUState *cpu, CPUTLBEntryFull *full, @@ -2054,13 +2050,11 @@ static Int128 do_ld16_mmio_beN(CPUState *cpu, CPUTLBEntryFull *full, section = io_prepare(&mr_offset, cpu, full->xlat_section, attrs, addr, ra); mr = section->mr; - bql_lock(); + BQL_LOCK_GUARD(); a = int_ld_mmio_beN(cpu, full, ret_be, addr, size - 8, mmu_idx, MMU_DATA_LOAD, ra, mr, mr_offset); b = int_ld_mmio_beN(cpu, full, ret_be, addr + size - 8, 8, mmu_idx, MMU_DATA_LOAD, ra, mr, mr_offset + size - 8); - bql_unlock(); - return int128_make128(b, a); } @@ -2569,7 +2563,6 @@ static uint64_t do_st_mmio_leN(CPUState *cpu, CPUTLBEntryFull *full, hwaddr mr_offset; MemoryRegion *mr; MemTxAttrs attrs; - uint64_t ret; tcg_debug_assert(size > 0 && size <= 8); @@ -2577,12 +2570,9 @@ static uint64_t do_st_mmio_leN(CPUState *cpu, CPUTLBEntryFull *full, section = io_prepare(&mr_offset, cpu, full->xlat_section, attrs, addr, ra); mr = section->mr; - bql_lock(); - ret = int_st_mmio_leN(cpu, full, val_le, addr, size, mmu_idx, - ra, mr, mr_offset); - bql_unlock(); - - return ret; + BQL_LOCK_GUARD(); + return int_st_mmio_leN(cpu, full, val_le, addr, size, mmu_idx, + ra, mr, mr_offset); } static uint64_t do_st16_mmio_leN(CPUState *cpu, CPUTLBEntryFull *full, @@ -2593,7 +2583,6 @@ static uint64_t do_st16_mmio_leN(CPUState *cpu, CPUTLBEntryFull *full, MemoryRegion *mr; hwaddr mr_offset; MemTxAttrs attrs; - uint64_t ret; tcg_debug_assert(size > 8 && size <= 16); @@ -2601,14 +2590,11 @@ static uint64_t do_st16_mmio_leN(CPUState *cpu, CPUTLBEntryFull *full, section = io_prepare(&mr_offset, cpu, full->xlat_section, attrs, addr, ra); mr = section->mr; - 
bql_lock(); + BQL_LOCK_GUARD(); int_st_mmio_leN(cpu, full, int128_getlo(val_le), addr, 8, mmu_idx, ra, mr, mr_offset); - ret = int_st_mmio_leN(cpu, full, int128_gethi(val_le), addr + 8, - size - 8, mmu_idx, ra, mr, mr_offset + 8); - bql_unlock(); - - return ret; + return int_st_mmio_leN(cpu, full, int128_gethi(val_le), addr + 8, + size - 8, mmu_idx, ra, mr, mr_offset + 8); } /* diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index 1c695efe02..c1f57e894a 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -256,7 +256,6 @@ bool cpu_unwind_state_data(CPUState *cpu, uintptr_t host_pc, uint64_t *data) void page_init(void) { - page_size_init(); page_table_config_init(); } diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c index 68b252cb8e..3cac3a78c4 100644 --- a/accel/tcg/user-exec.c +++ b/accel/tcg/user-exec.c @@ -651,16 +651,17 @@ void page_protect(tb_page_addr_t address) { PageFlagsNode *p; target_ulong start, last; + int host_page_size = qemu_real_host_page_size(); int prot; assert_memory_lock(); - if (qemu_host_page_size <= TARGET_PAGE_SIZE) { + if (host_page_size <= TARGET_PAGE_SIZE) { start = address & TARGET_PAGE_MASK; last = start + TARGET_PAGE_SIZE - 1; } else { - start = address & qemu_host_page_mask; - last = start + qemu_host_page_size - 1; + start = address & -host_page_size; + last = start + host_page_size - 1; } p = pageflags_find(start, last); @@ -671,7 +672,7 @@ void page_protect(tb_page_addr_t address) if (unlikely(p->itree.last < last)) { /* More than one protection region covers the one host page. */ - assert(TARGET_PAGE_SIZE < qemu_host_page_size); + assert(TARGET_PAGE_SIZE < host_page_size); while ((p = pageflags_next(p, start, last)) != NULL) { prot |= p->flags; } @@ -679,7 +680,7 @@ void page_protect(tb_page_addr_t address) if (prot & PAGE_WRITE) { pageflags_set_clear(start, last, 0, PAGE_WRITE); - mprotect(g2h_untagged(start), qemu_host_page_size, + mprotect(g2h_untagged(start), last - start + 1, prot & (PAGE_READ | PAGE_EXEC) ? 
PROT_READ : PROT_NONE); } } @@ -725,18 +726,19 @@ int page_unprotect(target_ulong address, uintptr_t pc) } #endif } else { + int host_page_size = qemu_real_host_page_size(); target_ulong start, len, i; int prot; - if (qemu_host_page_size <= TARGET_PAGE_SIZE) { + if (host_page_size <= TARGET_PAGE_SIZE) { start = address & TARGET_PAGE_MASK; len = TARGET_PAGE_SIZE; prot = p->flags | PAGE_WRITE; pageflags_set_clear(start, start + len - 1, PAGE_WRITE, 0); current_tb_invalidated = tb_invalidate_phys_page_unwind(start, pc); } else { - start = address & qemu_host_page_mask; - len = qemu_host_page_size; + start = address & -host_page_size; + len = host_page_size; prot = 0; for (i = 0; i < len; i += TARGET_PAGE_SIZE) { @@ -862,7 +864,7 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, vaddr addr, typedef struct TargetPageDataNode { struct rcu_head rcu; IntervalTreeNode itree; - char data[TPD_PAGES][TARGET_PAGE_DATA_SIZE] __attribute__((aligned)); + char data[] __attribute__((aligned)); } TargetPageDataNode; static IntervalTreeRoot targetdata_root; @@ -900,7 +902,8 @@ void page_reset_target_data(target_ulong start, target_ulong last) n_last = MIN(last, n->last); p_len = (n_last + 1 - n_start) >> TARGET_PAGE_BITS; - memset(t->data[p_ofs], 0, p_len * TARGET_PAGE_DATA_SIZE); + memset(t->data + p_ofs * TARGET_PAGE_DATA_SIZE, 0, + p_len * TARGET_PAGE_DATA_SIZE); } } @@ -908,7 +911,7 @@ void *page_get_target_data(target_ulong address) { IntervalTreeNode *n; TargetPageDataNode *t; - target_ulong page, region; + target_ulong page, region, p_ofs; page = address & TARGET_PAGE_MASK; region = address & TBD_MASK; @@ -924,7 +927,8 @@ void *page_get_target_data(target_ulong address) mmap_lock(); n = interval_tree_iter_first(&targetdata_root, page, page); if (!n) { - t = g_new0(TargetPageDataNode, 1); + t = g_malloc0(sizeof(TargetPageDataNode) + + TPD_PAGES * TARGET_PAGE_DATA_SIZE); n = &t->itree; n->start = region; n->last = region | ~TBD_MASK; @@ -934,7 +938,8 @@ void *page_get_target_data(target_ulong address) } t = container_of(n, TargetPageDataNode, itree); - return t->data[(page - region) >> TARGET_PAGE_BITS]; + p_ofs = (page - region) >> TARGET_PAGE_BITS; + return t->data + p_ofs * TARGET_PAGE_DATA_SIZE; } #else void page_reset_target_data(target_ulong start, target_ulong last) { } diff --git a/bsd-user/main.c b/bsd-user/main.c index e5efb7b845..512d4ab69f 100644 --- a/bsd-user/main.c +++ b/bsd-user/main.c @@ -49,6 +49,13 @@ #include "host-os.h" #include "target_arch_cpu.h" + +/* + * TODO: Remove these and rely only on qemu_real_host_page_size(). 
+ */ +uintptr_t qemu_host_page_size; +intptr_t qemu_host_page_mask; + static bool opt_one_insn_per_tb; uintptr_t guest_base; bool have_guest_base; @@ -307,6 +314,9 @@ int main(int argc, char **argv) (void) envlist_setenv(envlist, *wrk); } + qemu_host_page_size = getpagesize(); + qemu_host_page_size = MAX(qemu_host_page_size, TARGET_PAGE_SIZE); + cpu_model = NULL; qemu_add_opts(&qemu_trace_opts); @@ -364,11 +374,12 @@ int main(int argc, char **argv) } else if (!strcmp(r, "L")) { interp_prefix = argv[optind++]; } else if (!strcmp(r, "p")) { - qemu_host_page_size = atoi(argv[optind++]); - if (qemu_host_page_size == 0 || - (qemu_host_page_size & (qemu_host_page_size - 1)) != 0) { - fprintf(stderr, "page size must be a power of two\n"); - exit(1); + unsigned size, want = qemu_real_host_page_size(); + + r = argv[optind++]; + if (qemu_strtoui(r, NULL, 10, &size) || size != want) { + warn_report("Deprecated page size option cannot " + "change host page size (%u)", want); } } else if (!strcmp(r, "g")) { gdbstub = g_strdup(argv[optind++]); @@ -403,6 +414,8 @@ int main(int argc, char **argv) } } + qemu_host_page_mask = -qemu_host_page_size; + /* init debug */ { int mask = 0; diff --git a/bsd-user/qemu.h b/bsd-user/qemu.h index dc842fffa7..c05c512767 100644 --- a/bsd-user/qemu.h +++ b/bsd-user/qemu.h @@ -40,6 +40,13 @@ extern char **environ; #include "qemu-os.h" /* + * TODO: Remove these and rely only on qemu_real_host_page_size(). + */ +extern uintptr_t qemu_host_page_size; +extern intptr_t qemu_host_page_mask; +#define HOST_PAGE_ALIGN(addr) ROUND_UP((addr), qemu_host_page_size) + +/* * This struct is used to hold certain information about the image. Basically, * it replicates in user space what would be certain task_struct fields in the * kernel diff --git a/cpu-target.c b/cpu-target.c index 86444cc2c6..4c0621bf33 100644 --- a/cpu-target.c +++ b/cpu-target.c @@ -45,9 +45,6 @@ #include "trace/trace-root.h" #include "qemu/accel.h" -uintptr_t qemu_host_page_size; -intptr_t qemu_host_page_mask; - #ifndef CONFIG_USER_ONLY static int cpu_common_post_load(void *opaque, int version_id) { @@ -474,16 +471,3 @@ const char *target_name(void) { return TARGET_NAME; } - -void page_size_init(void) -{ - /* NOTE: we can always suppose that qemu_host_page_size >= - TARGET_PAGE_SIZE */ - if (qemu_host_page_size == 0) { - qemu_host_page_size = qemu_real_host_page_size(); - } - if (qemu_host_page_size < TARGET_PAGE_SIZE) { - qemu_host_page_size = TARGET_PAGE_SIZE; - } - qemu_host_page_mask = -(intptr_t)qemu_host_page_size; -} diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst index 36bd3e15ef..8565644da6 100644 --- a/docs/about/deprecated.rst +++ b/docs/about/deprecated.rst @@ -63,6 +63,16 @@ as short-form boolean values, and passed to plugins as ``arg_name=on``. However, short-form booleans are deprecated and full explicit ``arg_name=on`` form is preferred. +User-mode emulator command line arguments +----------------------------------------- + +``-p`` (since 9.0) +'''''''''''''''''' + +The ``-p`` option pretends to control the host page size. However, +it is not possible to change the host page size, and using the +option only causes failures. 
+ QEMU Machine Protocol (QMP) commands ------------------------------------ diff --git a/docs/conf.py b/docs/conf.py index e84a95e71c..1b2afa241c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,7 +29,6 @@ import os import sys import sphinx -from distutils.version import LooseVersion from sphinx.errors import ConfigError # The per-manual conf.py will set qemu_docdir for a single-manual build; @@ -165,11 +164,10 @@ html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -if LooseVersion(sphinx_rtd_theme.__version__) >= LooseVersion("0.4.3"): - html_theme_options = { - "style_nav_header_background": "#802400", - "navigation_with_keys": True, - } +html_theme_options = { + "style_nav_header_background": "#802400", + "navigation_with_keys": True, +} html_logo = os.path.join(qemu_docdir, "../ui/icons/qemu_128x128.png") diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index a9acaf618e..9d1abd2587 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -10,3 +10,4 @@ Migration has plenty of features to support different use cases. dirty-limit vfio virtio + mapped-ram diff --git a/docs/devel/migration/mapped-ram.rst b/docs/devel/migration/mapped-ram.rst new file mode 100644 index 0000000000..fa4cefd9fc --- /dev/null +++ b/docs/devel/migration/mapped-ram.rst @@ -0,0 +1,138 @@ +Mapped-ram +========== + +Mapped-ram is a new stream format for the RAM section designed to +supplement the existing ``file:`` migration and make it compatible +with ``multifd``. This enables parallel migration of a guest's RAM to +a file. + +The core of the feature is to ensure that RAM pages are mapped +directly to offsets in the resulting migration file. This enables the +``multifd`` threads to write exclusively to those offsets even if the +guest is constantly dirtying pages (i.e. live migration). Another +benefit is that the resulting file will have a bounded size, since +pages which are dirtied multiple times will always go to a fixed +location in the file, rather than constantly being added to a +sequential stream. Having the pages at fixed offsets also allows the +usage of O_DIRECT for save/restore of the migration stream as the +pages are ensured to be written respecting O_DIRECT alignment +restrictions (direct-io support not yet implemented). + +Usage +----- + +On both source and destination, enable the ``multifd`` and +``mapped-ram`` capabilities: + + ``migrate_set_capability multifd on`` + + ``migrate_set_capability mapped-ram on`` + +Use a ``file:`` URL for migration: + + ``migrate file:/path/to/migration/file`` + +Mapped-ram migration is best done non-live, i.e. by stopping the VM on +the source side before migrating. + +Use-cases +--------- + +The mapped-ram feature was designed for use cases where the migration +stream will be directed to a file in the filesystem and not +immediately restored on the destination VM [#]_. These could be +thought of as snapshots. We can further categorize them into live and +non-live. + +- Non-live snapshot + +If the use case requires a VM to be stopped before taking a snapshot, +that's the ideal scenario for mapped-ram migration. Not having to +track dirty pages, the migration will write the RAM pages to the disk +as fast as it can. 
+ +Note: if a snapshot is taken of a running VM, but the VM will be +stopped after the snapshot by the admin, then consider stopping it +right before the snapshot to take benefit of the performance gains +mentioned above. + +- Live snapshot + +If the use case requires that the VM keeps running during and after +the snapshot operation, then mapped-ram migration can still be used, +but will be less performant. Other strategies such as +background-snapshot should be evaluated as well. One benefit of +mapped-ram in this scenario is portability since background-snapshot +depends on async dirty tracking (KVM_GET_DIRTY_LOG) which is not +supported outside of Linux. + +.. [#] While this same effect could be obtained with the usage of + snapshots or the ``file:`` migration alone, mapped-ram provides + a performance increase for VMs with larger RAM sizes (10s to + 100s of GiBs), specially if the VM has been stopped beforehand. + +RAM section format +------------------ + +Instead of having a sequential stream of pages that follow the +RAMBlock headers, the dirty pages for a RAMBlock follow its header +instead. This ensures that each RAM page has a fixed offset in the +resulting migration file. + +A bitmap is introduced to track which pages have been written in the +migration file. Pages are written at a fixed location for every +ramblock. Zero pages are ignored as they'd be zero in the destination +migration as well. + +:: + + Without mapped-ram: With mapped-ram: + + --------------------- -------------------------------- + | ramblock 1 header | | ramblock 1 header | + --------------------- -------------------------------- + | ramblock 2 header | | ramblock 1 mapped-ram header | + --------------------- -------------------------------- + | ... | | padding to next 1MB boundary | + --------------------- | ... | + | ramblock n header | -------------------------------- + --------------------- | ramblock 1 pages | + | RAM_SAVE_FLAG_EOS | | ... | + --------------------- -------------------------------- + | stream of pages | | ramblock 2 header | + | (iter 1) | -------------------------------- + | ... | | ramblock 2 mapped-ram header | + --------------------- -------------------------------- + | RAM_SAVE_FLAG_EOS | | padding to next 1MB boundary | + --------------------- | ... | + | stream of pages | -------------------------------- + | (iter 2) | | ramblock 2 pages | + | ... | | ... | + --------------------- -------------------------------- + | ... | | ... | + --------------------- -------------------------------- + | RAM_SAVE_FLAG_EOS | + -------------------------------- + | ... | + -------------------------------- + +where: + - ramblock header: the generic information for a ramblock, such as + idstr, used_len, etc. + + - ramblock mapped-ram header: the information added by this feature: + bitmap of pages written, bitmap size and offset of pages in the + migration file. + +Restrictions +------------ + +Since pages are written to their relative offsets and out of order +(due to the memory dirtying patterns), streaming channels such as +sockets are not supported. A seekable channel such as a file is +required. This can be verified in the QIOChannel by the presence of +the QIO_CHANNEL_FEATURE_SEEKABLE. + +The improvements brought by this feature apply only to guest physical +RAM. Other types of memory such as VRAM are migrated as part of device +states. 
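The mapped-ram description above hinges on each RAM page having a fixed offset in the migration file: a per-ramblock pages_offset plus the page index, with file_bmap recording which pages already exist in the file. What follows is a minimal conceptual sketch of that write path, not the code added by this series; RAMBlockSketch and write_page_mapped_ram() are hypothetical names chosen for illustration, while qio_channel_pwrite() is the positional-write helper introduced in io/channel.c above and set_bit() comes from qemu/bitops.h.

/*
 * Illustrative sketch only, assuming a QEMU build tree; this is not the
 * migration/ram.c implementation.  The fields mirror the RAMBlock
 * additions in this series (file_bmap, pages_offset).
 */
#include "qemu/osdep.h"
#include "qemu/bitops.h"
#include "io/channel.h"

typedef struct {
    uint64_t pages_offset;    /* file offset where this block's pages begin */
    unsigned long *file_bmap; /* bit set once a page has been written */
    size_t page_size;
} RAMBlockSketch;

/*
 * Write one guest page at its fixed location in the migration file.
 * A page that is dirtied and saved again overwrites the same offset,
 * which is what keeps the resulting file size bounded.
 */
static bool write_page_mapped_ram(QIOChannel *ioc, RAMBlockSketch *rb,
                                  uint64_t page_index, void *host_page,
                                  Error **errp)
{
    off_t offset = rb->pages_offset + page_index * rb->page_size;

    if (qio_channel_pwrite(ioc, host_page, rb->page_size, offset, errp) < 0) {
        return false;
    }
    set_bit(page_index, rb->file_bmap);
    return true;
}

Restore would use the same offset arithmetic with qio_channel_preadv() after consulting the bitmap, which is why the feature requires a channel that advertises QIO_CHANNEL_FEATURE_SEEKABLE.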
diff --git a/docs/user/main.rst b/docs/user/main.rst index 7e7ad07409..d5fbb78d3c 100644 --- a/docs/user/main.rst +++ b/docs/user/main.rst @@ -87,9 +87,6 @@ Debug options: Activate logging of the specified items (use '-d help' for a list of log items) -``-p pagesize`` - Act as if the host page size was 'pagesize' bytes - ``-g port`` Wait gdb connection to port diff --git a/hw/tpm/tpm_ppi.c b/hw/tpm/tpm_ppi.c index 7f74e26ec6..f27ed6c35e 100644 --- a/hw/tpm/tpm_ppi.c +++ b/hw/tpm/tpm_ppi.c @@ -47,8 +47,10 @@ void tpm_ppi_reset(TPMPPI *tpmppi) void tpm_ppi_init(TPMPPI *tpmppi, MemoryRegion *m, hwaddr addr, Object *obj) { - tpmppi->buf = qemu_memalign(qemu_real_host_page_size(), - HOST_PAGE_ALIGN(TPM_PPI_ADDR_SIZE)); + size_t host_page_size = qemu_real_host_page_size(); + + tpmppi->buf = qemu_memalign(host_page_size, + ROUND_UP(TPM_PPI_ADDR_SIZE, host_page_size)); memory_region_init_ram_device_ptr(&tpmppi->ram, obj, "tpm-ppi", TPM_PPI_ADDR_SIZE, tpmppi->buf); vmstate_register_ram(&tpmppi->ram, DEVICE(obj)); diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 9ead1be100..6346df17ce 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -20,13 +20,6 @@ void cpu_exec_init_all(void); void cpu_exec_step_atomic(CPUState *cpu); -/* Using intptr_t ensures that qemu_*_page_mask is sign-extended even - * when intptr_t is 32-bit and we are aligning a long long. - */ -extern uintptr_t qemu_host_page_size; -extern intptr_t qemu_host_page_mask; - -#define HOST_PAGE_ALIGN(addr) ROUND_UP((addr), qemu_host_page_size) #define REAL_HOST_PAGE_ALIGN(addr) ROUND_UP((addr), qemu_real_host_page_size()) /* The CPU list lock nests outside page_(un)lock or mmap_(un)lock */ diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h index 3eb79723c6..848915ea5b 100644 --- a/include/exec/ramblock.h +++ b/include/exec/ramblock.h @@ -44,6 +44,19 @@ struct RAMBlock { size_t page_size; /* dirty bitmap used during migration */ unsigned long *bmap; + + /* + * Below fields are only used by mapped-ram migration + */ + /* bitmap of pages present in the migration file */ + unsigned long *file_bmap; + /* + * offset in the file pages belonging to this ramblock are saved, + * used only during migration to a file. 
+ */ + off_t bitmap_offset; + uint64_t pages_offset; + /* bitmap of already received pages in postcopy */ unsigned long *receivedmap; diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index af1a29526d..d0e345419f 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -1179,8 +1179,6 @@ bool target_words_bigendian(void); const char *target_name(void); -void page_size_init(void); - #ifdef NEED_CPU_H #ifndef CONFIG_USER_ONLY diff --git a/include/io/channel.h b/include/io/channel.h index 5f9dbaab65..7986c49c71 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -44,6 +44,7 @@ enum QIOChannelFeature { QIO_CHANNEL_FEATURE_LISTEN, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY, QIO_CHANNEL_FEATURE_READ_MSG_PEEK, + QIO_CHANNEL_FEATURE_SEEKABLE, }; @@ -130,6 +131,16 @@ struct QIOChannelClass { Error **errp); /* Optional callbacks */ + ssize_t (*io_pwritev)(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + off_t offset, + Error **errp); + ssize_t (*io_preadv)(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + off_t offset, + Error **errp); int (*io_shutdown)(QIOChannel *ioc, QIOChannelShutdown how, Error **errp); @@ -529,6 +540,78 @@ int qio_channel_close(QIOChannel *ioc, Error **errp); /** + * qio_channel_pwritev + * @ioc: the channel object + * @iov: the array of memory regions to write data from + * @niov: the length of the @iov array + * @offset: offset in the channel where writes should begin + * @errp: pointer to a NULL-initialized error object + * + * Not all implementations will support this facility, so may report + * an error. To avoid errors, the caller may check for the feature + * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method. + * + * Behaves as qio_channel_writev_full, apart from not supporting + * sending of file handles as well as beginning the write at the + * passed @offset + * + */ +ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov, + size_t niov, off_t offset, Error **errp); + +/** + * qio_channel_pwrite + * @ioc: the channel object + * @buf: the memory region to write data into + * @buflen: the number of bytes to @buf + * @offset: offset in the channel where writes should begin + * @errp: pointer to a NULL-initialized error object + * + * Not all implementations will support this facility, so may report + * an error. To avoid errors, the caller may check for the feature + * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method. + * + */ +ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen, + off_t offset, Error **errp); + +/** + * qio_channel_preadv + * @ioc: the channel object + * @iov: the array of memory regions to read data into + * @niov: the length of the @iov array + * @offset: offset in the channel where writes should begin + * @errp: pointer to a NULL-initialized error object + * + * Not all implementations will support this facility, so may report + * an error. To avoid errors, the caller may check for the feature + * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method. 
+ * + * Behaves as qio_channel_readv_full, apart from not supporting + * receiving of file handles as well as beginning the read at the + * passed @offset + * + */ +ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov, + size_t niov, off_t offset, Error **errp); + +/** + * qio_channel_pread + * @ioc: the channel object + * @buf: the memory region to write data into + * @buflen: the number of bytes to @buf + * @offset: offset in the channel where writes should begin + * @errp: pointer to a NULL-initialized error object + * + * Not all implementations will support this facility, so may report + * an error. To avoid errors, the caller may check for the feature + * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method. + * + */ +ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen, + off_t offset, Error **errp); + +/** * qio_channel_shutdown: * @ioc: the channel object * @how: the direction to shutdown diff --git a/include/migration/qemu-file-types.h b/include/migration/qemu-file-types.h index 9ba163f333..adec5abc07 100644 --- a/include/migration/qemu-file-types.h +++ b/include/migration/qemu-file-types.h @@ -50,6 +50,8 @@ unsigned int qemu_get_be16(QEMUFile *f); unsigned int qemu_get_be32(QEMUFile *f); uint64_t qemu_get_be64(QEMUFile *f); +bool qemu_file_is_seekable(QEMUFile *f); + static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv) { qemu_put_be64(f, *pv); diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h index cb3526d1f4..2c0a2fe751 100644 --- a/include/qemu/bitops.h +++ b/include/qemu/bitops.h @@ -68,6 +68,19 @@ static inline void clear_bit(long nr, unsigned long *addr) } /** + * clear_bit_atomic - Clears a bit in memory atomically + * @nr: Bit to clear + * @addr: Address to start counting from + */ +static inline void clear_bit_atomic(long nr, unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = addr + BIT_WORD(nr); + + return qatomic_and(p, ~mask); +} + +/** * change_bit - Toggle a bit in memory * @nr: Bit to change * @addr: Address to start counting from diff --git a/io/channel-file.c b/io/channel-file.c index 4a12c61886..d4706fa592 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -36,6 +36,10 @@ qio_channel_file_new_fd(int fd) ioc->fd = fd; + if (lseek(fd, 0, SEEK_CUR) != (off_t)-1) { + qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE); + } + trace_qio_channel_file_new_fd(ioc, fd); return ioc; @@ -60,6 +64,10 @@ qio_channel_file_new_path(const char *path, return NULL; } + if (lseek(ioc->fd, 0, SEEK_CUR) != (off_t)-1) { + qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE); + } + trace_qio_channel_file_new_path(ioc, path, flags, mode, ioc->fd); return ioc; @@ -138,6 +146,58 @@ static ssize_t qio_channel_file_writev(QIOChannel *ioc, return ret; } +#ifdef CONFIG_PREADV +static ssize_t qio_channel_file_preadv(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + off_t offset, + Error **errp) +{ + QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); + ssize_t ret; + + retry: + ret = preadv(fioc->fd, iov, niov, offset); + if (ret < 0) { + if (errno == EAGAIN) { + return QIO_CHANNEL_ERR_BLOCK; + } + if (errno == EINTR) { + goto retry; + } + + error_setg_errno(errp, errno, "Unable to read from file"); + return -1; + } + + return ret; +} + +static ssize_t qio_channel_file_pwritev(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + off_t offset, + Error **errp) +{ + QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); + ssize_t ret; + + retry: + ret = 
pwritev(fioc->fd, iov, niov, offset); + if (ret <= 0) { + if (errno == EAGAIN) { + return QIO_CHANNEL_ERR_BLOCK; + } + if (errno == EINTR) { + goto retry; + } + error_setg_errno(errp, errno, "Unable to write to file"); + return -1; + } + return ret; +} +#endif /* CONFIG_PREADV */ + static int qio_channel_file_set_blocking(QIOChannel *ioc, bool enabled, Error **errp) @@ -182,6 +242,11 @@ static int qio_channel_file_close(QIOChannel *ioc, { QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); + if (qemu_fdatasync(fioc->fd) < 0) { + error_setg_errno(errp, errno, + "Unable to synchronize file data with storage device"); + return -1; + } if (qemu_close(fioc->fd) < 0) { error_setg_errno(errp, errno, "Unable to close file"); @@ -223,6 +288,10 @@ static void qio_channel_file_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_file_writev; ioc_klass->io_readv = qio_channel_file_readv; ioc_klass->io_set_blocking = qio_channel_file_set_blocking; +#ifdef CONFIG_PREADV + ioc_klass->io_pwritev = qio_channel_file_pwritev; + ioc_klass->io_preadv = qio_channel_file_preadv; +#endif ioc_klass->io_seek = qio_channel_file_seek; ioc_klass->io_close = qio_channel_file_close; ioc_klass->io_create_watch = qio_channel_file_create_watch; diff --git a/io/channel.c b/io/channel.c index 86c5834510..a1f12f8e90 100644 --- a/io/channel.c +++ b/io/channel.c @@ -454,6 +454,64 @@ GSource *qio_channel_add_watch_source(QIOChannel *ioc, } +ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov, + size_t niov, off_t offset, Error **errp) +{ + QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); + + if (!klass->io_pwritev) { + error_setg(errp, "Channel does not support pwritev"); + return -1; + } + + if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) { + error_setg_errno(errp, EINVAL, "Requested channel is not seekable"); + return -1; + } + + return klass->io_pwritev(ioc, iov, niov, offset, errp); +} + +ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen, + off_t offset, Error **errp) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = buflen + }; + + return qio_channel_pwritev(ioc, &iov, 1, offset, errp); +} + +ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov, + size_t niov, off_t offset, Error **errp) +{ + QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); + + if (!klass->io_preadv) { + error_setg(errp, "Channel does not support preadv"); + return -1; + } + + if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) { + error_setg_errno(errp, EINVAL, "Requested channel is not seekable"); + return -1; + } + + return klass->io_preadv(ioc, iov, niov, offset, errp); +} + +ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen, + off_t offset, Error **errp) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = buflen + }; + + return qio_channel_preadv(ioc, &iov, 1, offset, errp); +} + int qio_channel_shutdown(QIOChannel *ioc, QIOChannelShutdown how, Error **errp) diff --git a/linux-user/elfload.c b/linux-user/elfload.c index b8eef893d0..0c299a7c15 100644 --- a/linux-user/elfload.c +++ b/linux-user/elfload.c @@ -460,6 +460,7 @@ enum { static bool init_guest_commpage(void) { ARMCPU *cpu = ARM_CPU(thread_cpu); + int host_page_size = qemu_real_host_page_size(); abi_ptr commpage; void *want; void *addr; @@ -472,10 +473,12 @@ static bool init_guest_commpage(void) return true; } - commpage = HI_COMMPAGE & -qemu_host_page_size; + commpage = HI_COMMPAGE & -host_page_size; want = g2h_untagged(commpage); - addr = mmap(want, 
qemu_host_page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + addr = mmap(want, host_page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | + (commpage < reserved_va ? MAP_FIXED : MAP_FIXED_NOREPLACE), + -1, 0); if (addr == MAP_FAILED) { perror("Allocating guest commpage"); @@ -488,12 +491,12 @@ static bool init_guest_commpage(void) /* Set kernel helper versions; rest of page is 0. */ __put_user(5, (uint32_t *)g2h_untagged(0xffff0ffcu)); - if (mprotect(addr, qemu_host_page_size, PROT_READ)) { + if (mprotect(addr, host_page_size, PROT_READ)) { perror("Protecting guest commpage"); exit(EXIT_FAILURE); } - page_set_flags(commpage, commpage | ~qemu_host_page_mask, + page_set_flags(commpage, commpage | (host_page_size - 1), PAGE_READ | PAGE_EXEC | PAGE_VALID); return true; } @@ -1532,10 +1535,14 @@ static bool init_guest_commpage(void) 0x3a, 0x68, 0x3b, 0x00, /* trap 0 */ }; - void *want = g2h_untagged(LO_COMMPAGE & -qemu_host_page_size); - void *addr = mmap(want, qemu_host_page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + int host_page_size = qemu_real_host_page_size(); + void *want, *addr; + want = g2h_untagged(LO_COMMPAGE & -host_page_size); + addr = mmap(want, host_page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | + (reserved_va ? MAP_FIXED : MAP_FIXED_NOREPLACE), + -1, 0); if (addr == MAP_FAILED) { perror("Allocating guest commpage"); exit(EXIT_FAILURE); @@ -1544,9 +1551,9 @@ static bool init_guest_commpage(void) return false; } - memcpy(addr, kuser_page, sizeof(kuser_page)); + memcpy(g2h_untagged(LO_COMMPAGE), kuser_page, sizeof(kuser_page)); - if (mprotect(addr, qemu_host_page_size, PROT_READ)) { + if (mprotect(addr, host_page_size, PROT_READ)) { perror("Protecting guest commpage"); exit(EXIT_FAILURE); } @@ -1970,16 +1977,20 @@ static inline void init_thread(struct target_pt_regs *regs, static bool init_guest_commpage(void) { - void *want = g2h_untagged(LO_COMMPAGE); - void *addr = mmap(want, qemu_host_page_size, PROT_NONE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + /* If reserved_va, then we have already mapped 0 page on the host. */ + if (!reserved_va) { + void *want, *addr; - if (addr == MAP_FAILED) { - perror("Allocating guest commpage"); - exit(EXIT_FAILURE); - } - if (addr != want) { - return false; + want = g2h_untagged(LO_COMMPAGE); + addr = mmap(want, TARGET_PAGE_SIZE, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED_NOREPLACE, -1, 0); + if (addr == MAP_FAILED) { + perror("Allocating guest commpage"); + exit(EXIT_FAILURE); + } + if (addr != want) { + return false; + } } /* @@ -2679,13 +2690,7 @@ static abi_ulong create_elf_tables(abi_ulong p, int argc, int envc, NEW_AUX_ENT(AT_PHDR, (abi_ulong)(info->load_addr + exec->e_phoff)); NEW_AUX_ENT(AT_PHENT, (abi_ulong)(sizeof (struct elf_phdr))); NEW_AUX_ENT(AT_PHNUM, (abi_ulong)(exec->e_phnum)); - if ((info->alignment & ~qemu_host_page_mask) != 0) { - /* Target doesn't support host page size alignment */ - NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(TARGET_PAGE_SIZE)); - } else { - NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(MAX(TARGET_PAGE_SIZE, - qemu_host_page_size))); - } + NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(TARGET_PAGE_SIZE)); NEW_AUX_ENT(AT_BASE, (abi_ulong)(interp_info ? interp_info->load_addr : 0)); NEW_AUX_ENT(AT_FLAGS, (abi_ulong)0); NEW_AUX_ENT(AT_ENTRY, info->entry); @@ -2893,7 +2898,7 @@ static bool pgb_addr_set(PGBAddrs *ga, abi_ulong guest_loaddr, /* Add any HI_COMMPAGE not covered by reserved_va. 
*/ if (reserved_va < HI_COMMPAGE) { - ga->bounds[n][0] = HI_COMMPAGE & qemu_host_page_mask; + ga->bounds[n][0] = HI_COMMPAGE & qemu_real_host_page_mask(); ga->bounds[n][1] = HI_COMMPAGE + TARGET_PAGE_SIZE - 1; n++; } @@ -3017,8 +3022,6 @@ static void pgb_dynamic(const char *image_name, uintptr_t guest_loaddr, uintptr_t brk, ret; PGBAddrs ga; - assert(QEMU_IS_ALIGNED(guest_loaddr, align)); - /* Try the identity map first. */ if (pgb_addr_set(&ga, guest_loaddr, guest_hiaddr, true)) { brk = (uintptr_t)sbrk(0); @@ -3075,7 +3078,7 @@ void probe_guest_base(const char *image_name, abi_ulong guest_loaddr, abi_ulong guest_hiaddr) { /* In order to use host shmat, we must be able to honor SHMLBA. */ - uintptr_t align = MAX(SHMLBA, qemu_host_page_size); + uintptr_t align = MAX(SHMLBA, TARGET_PAGE_SIZE); /* Sanity check the guest binary. */ if (reserved_va) { @@ -3912,8 +3915,9 @@ int load_elf_binary(struct linux_binprm *bprm, struct image_info *info) and some applications "depend" upon this behavior. Since we do not have the power to recompile these, we emulate the SVr4 behavior. Sigh. */ - target_mmap(0, qemu_host_page_size, PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + target_mmap(0, TARGET_PAGE_SIZE, PROT_READ | PROT_EXEC, + MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS, + -1, 0); } #ifdef TARGET_MIPS info->interp_fp_abi = interp_info.fp_abi; @@ -3963,6 +3967,8 @@ int load_elf_binary(struct linux_binprm *bprm, struct image_info *info) } #ifdef USE_ELF_CORE_DUMP +#include "exec/translate-all.h" + /* * Definitions to generate Intel SVR4-like core files. * These mostly have the same names as the SVR4 types with "target_elf_" @@ -4002,18 +4008,6 @@ int load_elf_binary(struct linux_binprm *bprm, struct image_info *info) * Example for ARM target is provided in this file. */ -/* An ELF note in memory */ -struct memelfnote { - const char *name; - size_t namesz; - size_t namesz_rounded; - int type; - size_t datasz; - size_t datasz_rounded; - void *data; - size_t notesz; -}; - struct target_elf_siginfo { abi_int si_signo; /* signal number */ abi_int si_code; /* extra code */ @@ -4053,77 +4047,6 @@ struct target_elf_prpsinfo { char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ }; -/* Here is the structure in which status of each thread is captured. */ -struct elf_thread_status { - QTAILQ_ENTRY(elf_thread_status) ets_link; - struct target_elf_prstatus prstatus; /* NT_PRSTATUS */ -#if 0 - elf_fpregset_t fpu; /* NT_PRFPREG */ - struct task_struct *thread; - elf_fpxregset_t xfpu; /* ELF_CORE_XFPREG_TYPE */ -#endif - struct memelfnote notes[1]; - int num_notes; -}; - -struct elf_note_info { - struct memelfnote *notes; - struct target_elf_prstatus *prstatus; /* NT_PRSTATUS */ - struct target_elf_prpsinfo *psinfo; /* NT_PRPSINFO */ - - QTAILQ_HEAD(, elf_thread_status) thread_list; -#if 0 - /* - * Current version of ELF coredump doesn't support - * dumping fp regs etc. - */ - elf_fpregset_t *fpu; - elf_fpxregset_t *xfpu; - int thread_status_size; -#endif - int notes_size; - int numnote; -}; - -struct vm_area_struct { - target_ulong vma_start; /* start vaddr of memory region */ - target_ulong vma_end; /* end vaddr of memory region */ - abi_ulong vma_flags; /* protection etc. 
flags for the region */ - QTAILQ_ENTRY(vm_area_struct) vma_link; -}; - -struct mm_struct { - QTAILQ_HEAD(, vm_area_struct) mm_mmap; - int mm_count; /* number of mappings */ -}; - -static struct mm_struct *vma_init(void); -static void vma_delete(struct mm_struct *); -static int vma_add_mapping(struct mm_struct *, target_ulong, - target_ulong, abi_ulong); -static int vma_get_mapping_count(const struct mm_struct *); -static struct vm_area_struct *vma_first(const struct mm_struct *); -static struct vm_area_struct *vma_next(struct vm_area_struct *); -static abi_ulong vma_dump_size(const struct vm_area_struct *); -static int vma_walker(void *priv, target_ulong start, target_ulong end, - unsigned long flags); - -static void fill_elf_header(struct elfhdr *, int, uint16_t, uint32_t); -static void fill_note(struct memelfnote *, const char *, int, - unsigned int, void *); -static void fill_prstatus(struct target_elf_prstatus *, const TaskState *, int); -static int fill_psinfo(struct target_elf_prpsinfo *, const TaskState *); -static void fill_auxv_note(struct memelfnote *, const TaskState *); -static void fill_elf_note_phdr(struct elf_phdr *, int, off_t); -static size_t note_size(const struct memelfnote *); -static void free_note_info(struct elf_note_info *); -static int fill_note_info(struct elf_note_info *, long, const CPUArchState *); -static void fill_thread_info(struct elf_note_info *, const CPUArchState *); - -static int dump_write(int, const void *, size_t); -static int write_note(struct memelfnote *, int); -static int write_note_info(struct elf_note_info *, int); - #ifdef BSWAP_NEEDED static void bswap_prstatus(struct target_elf_prstatus *prstatus) { @@ -4166,145 +4089,66 @@ static inline void bswap_note(struct elf_note *en) { } #endif /* BSWAP_NEEDED */ /* - * Minimal support for linux memory regions. These are needed - * when we are finding out what memory exactly belongs to - * emulated process. No locks needed here, as long as - * thread that received the signal is stopped. - */ - -static struct mm_struct *vma_init(void) -{ - struct mm_struct *mm; - - if ((mm = g_malloc(sizeof (*mm))) == NULL) - return (NULL); - - mm->mm_count = 0; - QTAILQ_INIT(&mm->mm_mmap); - - return (mm); -} - -static void vma_delete(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - - while ((vma = vma_first(mm)) != NULL) { - QTAILQ_REMOVE(&mm->mm_mmap, vma, vma_link); - g_free(vma); - } - g_free(mm); -} - -static int vma_add_mapping(struct mm_struct *mm, target_ulong start, - target_ulong end, abi_ulong flags) -{ - struct vm_area_struct *vma; - - if ((vma = g_malloc0(sizeof (*vma))) == NULL) - return (-1); - - vma->vma_start = start; - vma->vma_end = end; - vma->vma_flags = flags; - - QTAILQ_INSERT_TAIL(&mm->mm_mmap, vma, vma_link); - mm->mm_count++; - - return (0); -} - -static struct vm_area_struct *vma_first(const struct mm_struct *mm) -{ - return (QTAILQ_FIRST(&mm->mm_mmap)); -} - -static struct vm_area_struct *vma_next(struct vm_area_struct *vma) -{ - return (QTAILQ_NEXT(vma, vma_link)); -} - -static int vma_get_mapping_count(const struct mm_struct *mm) -{ - return (mm->mm_count); -} - -/* * Calculate file (dump) size of given memory region. */ -static abi_ulong vma_dump_size(const struct vm_area_struct *vma) +static size_t vma_dump_size(target_ulong start, target_ulong end, + unsigned long flags) { - /* if we cannot even read the first page, skip it */ - if (!access_ok_untagged(VERIFY_READ, vma->vma_start, TARGET_PAGE_SIZE)) - return (0); + /* The area must be readable. 
*/ + if (!(flags & PAGE_READ)) { + return 0; + } /* * Usually we don't dump executable pages as they contain * non-writable code that debugger can read directly from - * target library etc. However, thread stacks are marked - * also executable so we read in first page of given region - * and check whether it contains elf header. If there is - * no elf header, we dump it. + * target library etc. If there is no elf header, we dump it. */ - if (vma->vma_flags & PROT_EXEC) { - char page[TARGET_PAGE_SIZE]; - - if (copy_from_user(page, vma->vma_start, sizeof (page))) { - return 0; - } - if ((page[EI_MAG0] == ELFMAG0) && - (page[EI_MAG1] == ELFMAG1) && - (page[EI_MAG2] == ELFMAG2) && - (page[EI_MAG3] == ELFMAG3)) { - /* - * Mappings are possibly from ELF binary. Don't dump - * them. - */ - return (0); - } + if (!(flags & PAGE_WRITE_ORG) && + (flags & PAGE_EXEC) && + memcmp(g2h_untagged(start), ELFMAG, SELFMAG) == 0) { + return 0; } - return (vma->vma_end - vma->vma_start); + return end - start; } -static int vma_walker(void *priv, target_ulong start, target_ulong end, - unsigned long flags) +static size_t size_note(const char *name, size_t datasz) { - struct mm_struct *mm = (struct mm_struct *)priv; + size_t namesz = strlen(name) + 1; - vma_add_mapping(mm, start, end, flags); - return (0); + namesz = ROUND_UP(namesz, 4); + datasz = ROUND_UP(datasz, 4); + + return sizeof(struct elf_note) + namesz + datasz; } -static void fill_note(struct memelfnote *note, const char *name, int type, - unsigned int sz, void *data) +static void *fill_note(void **pptr, int type, const char *name, size_t datasz) { - unsigned int namesz; + void *ptr = *pptr; + struct elf_note *n = ptr; + size_t namesz = strlen(name) + 1; - namesz = strlen(name) + 1; - note->name = name; - note->namesz = namesz; - note->namesz_rounded = roundup(namesz, sizeof (int32_t)); - note->type = type; - note->datasz = sz; - note->datasz_rounded = roundup(sz, sizeof (int32_t)); + n->n_namesz = namesz; + n->n_descsz = datasz; + n->n_type = type; + bswap_note(n); - note->data = data; + ptr += sizeof(*n); + memcpy(ptr, name, namesz); - /* - * We calculate rounded up note size here as specified by - * ELF document. 
- */ - note->notesz = sizeof (struct elf_note) + - note->namesz_rounded + note->datasz_rounded; + namesz = ROUND_UP(namesz, 4); + datasz = ROUND_UP(datasz, 4); + + *pptr = ptr + namesz + datasz; + return ptr + namesz; } static void fill_elf_header(struct elfhdr *elf, int segs, uint16_t machine, uint32_t flags) { - (void) memset(elf, 0, sizeof(*elf)); + memcpy(elf->e_ident, ELFMAG, SELFMAG); - (void) memcpy(elf->e_ident, ELFMAG, SELFMAG); elf->e_ident[EI_CLASS] = ELF_CLASS; elf->e_ident[EI_DATA] = ELF_DATA; elf->e_ident[EI_VERSION] = EV_CURRENT; @@ -4322,95 +4166,79 @@ static void fill_elf_header(struct elfhdr *elf, int segs, uint16_t machine, bswap_ehdr(elf); } -static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, off_t offset) +static void fill_elf_note_phdr(struct elf_phdr *phdr, size_t sz, off_t offset) { phdr->p_type = PT_NOTE; phdr->p_offset = offset; - phdr->p_vaddr = 0; - phdr->p_paddr = 0; phdr->p_filesz = sz; - phdr->p_memsz = 0; - phdr->p_flags = 0; - phdr->p_align = 0; bswap_phdr(phdr, 1); } -static size_t note_size(const struct memelfnote *note) +static void fill_prstatus_note(void *data, const TaskState *ts, + CPUState *cpu, int signr) { - return (note->notesz); -} + /* + * Because note memory is only aligned to 4, and target_elf_prstatus + * may well have higher alignment requirements, fill locally and + * memcpy to the destination afterward. + */ + struct target_elf_prstatus prstatus = { + .pr_info.si_signo = signr, + .pr_cursig = signr, + .pr_pid = ts->ts_tid, + .pr_ppid = getppid(), + .pr_pgrp = getpgrp(), + .pr_sid = getsid(0), + }; -static void fill_prstatus(struct target_elf_prstatus *prstatus, - const TaskState *ts, int signr) -{ - (void) memset(prstatus, 0, sizeof (*prstatus)); - prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; - prstatus->pr_pid = ts->ts_tid; - prstatus->pr_ppid = getppid(); - prstatus->pr_pgrp = getpgrp(); - prstatus->pr_sid = getsid(0); - - bswap_prstatus(prstatus); + elf_core_copy_regs(&prstatus.pr_reg, cpu_env(cpu)); + bswap_prstatus(&prstatus); + memcpy(data, &prstatus, sizeof(prstatus)); } -static int fill_psinfo(struct target_elf_prpsinfo *psinfo, const TaskState *ts) +static void fill_prpsinfo_note(void *data, const TaskState *ts) { + /* + * Because note memory is only aligned to 4, and target_elf_prpsinfo + * may well have higher alignment requirements, fill locally and + * memcpy to the destination afterward. 
+ */ + struct target_elf_prpsinfo psinfo; char *base_filename; - unsigned int i, len; - - (void) memset(psinfo, 0, sizeof (*psinfo)); + size_t len; len = ts->info->env_strings - ts->info->arg_strings; - if (len >= ELF_PRARGSZ) - len = ELF_PRARGSZ - 1; - if (copy_from_user(&psinfo->pr_psargs, ts->info->arg_strings, len)) { - return -EFAULT; - } - for (i = 0; i < len; i++) - if (psinfo->pr_psargs[i] == 0) - psinfo->pr_psargs[i] = ' '; - psinfo->pr_psargs[len] = 0; - - psinfo->pr_pid = getpid(); - psinfo->pr_ppid = getppid(); - psinfo->pr_pgrp = getpgrp(); - psinfo->pr_sid = getsid(0); - psinfo->pr_uid = getuid(); - psinfo->pr_gid = getgid(); + len = MIN(len, ELF_PRARGSZ); + memcpy(&psinfo.pr_psargs, g2h_untagged(ts->info->arg_strings), len); + for (size_t i = 0; i < len; i++) { + if (psinfo.pr_psargs[i] == 0) { + psinfo.pr_psargs[i] = ' '; + } + } + + psinfo.pr_pid = getpid(); + psinfo.pr_ppid = getppid(); + psinfo.pr_pgrp = getpgrp(); + psinfo.pr_sid = getsid(0); + psinfo.pr_uid = getuid(); + psinfo.pr_gid = getgid(); base_filename = g_path_get_basename(ts->bprm->filename); /* * Using strncpy here is fine: at max-length, * this field is not NUL-terminated. */ - (void) strncpy(psinfo->pr_fname, base_filename, - sizeof(psinfo->pr_fname)); - + strncpy(psinfo.pr_fname, base_filename, sizeof(psinfo.pr_fname)); g_free(base_filename); - bswap_psinfo(psinfo); - return (0); + + bswap_psinfo(&psinfo); + memcpy(data, &psinfo, sizeof(psinfo)); } -static void fill_auxv_note(struct memelfnote *note, const TaskState *ts) +static void fill_auxv_note(void *data, const TaskState *ts) { - elf_addr_t auxv = (elf_addr_t)ts->info->saved_auxv; - elf_addr_t orig_auxv = auxv; - void *ptr; - int len = ts->info->auxv_len; - - /* - * Auxiliary vector is stored in target process stack. It contains - * {type, value} pairs that we need to dump into note. This is not - * strictly necessary but we do it here for sake of completeness. - */ - - /* read in whole auxv vector and copy it to memelfnote */ - ptr = lock_user(VERIFY_READ, orig_auxv, len, 0); - if (ptr != NULL) { - fill_note(note, "CORE", NT_AUXV, len, ptr); - unlock_user(ptr, auxv, len); - } + memcpy(data, g2h_untagged(ts->info->saved_auxv), ts->info->auxv_len); } /* @@ -4434,27 +4262,9 @@ static int dump_write(int fd, const void *ptr, size_t size) { const char *bufp = (const char *)ptr; ssize_t bytes_written, bytes_left; - struct rlimit dumpsize; - off_t pos; bytes_written = 0; - getrlimit(RLIMIT_CORE, &dumpsize); - if ((pos = lseek(fd, 0, SEEK_CUR))==-1) { - if (errno == ESPIPE) { /* not a seekable stream */ - bytes_left = size; - } else { - return pos; - } - } else { - if (dumpsize.rlim_cur <= pos) { - return -1; - } else if (dumpsize.rlim_cur == RLIM_INFINITY) { - bytes_left = size; - } else { - size_t limit_left=dumpsize.rlim_cur - pos; - bytes_left = limit_left >= size ? 
size : limit_left ; - } - } + bytes_left = size; /* * In normal conditions, single write(2) should do but @@ -4476,135 +4286,76 @@ static int dump_write(int fd, const void *ptr, size_t size) return (0); } -static int write_note(struct memelfnote *men, int fd) +static int wmr_page_unprotect_regions(void *opaque, target_ulong start, + target_ulong end, unsigned long flags) { - struct elf_note en; - - en.n_namesz = men->namesz; - en.n_type = men->type; - en.n_descsz = men->datasz; + if ((flags & (PAGE_WRITE | PAGE_WRITE_ORG)) == PAGE_WRITE_ORG) { + size_t step = MAX(TARGET_PAGE_SIZE, qemu_real_host_page_size()); - bswap_note(&en); - - if (dump_write(fd, &en, sizeof(en)) != 0) - return (-1); - if (dump_write(fd, men->name, men->namesz_rounded) != 0) - return (-1); - if (dump_write(fd, men->data, men->datasz_rounded) != 0) - return (-1); - - return (0); -} - -static void fill_thread_info(struct elf_note_info *info, const CPUArchState *env) -{ - CPUState *cpu = env_cpu((CPUArchState *)env); - TaskState *ts = (TaskState *)cpu->opaque; - struct elf_thread_status *ets; - - ets = g_malloc0(sizeof (*ets)); - ets->num_notes = 1; /* only prstatus is dumped */ - fill_prstatus(&ets->prstatus, ts, 0); - elf_core_copy_regs(&ets->prstatus.pr_reg, env); - fill_note(&ets->notes[0], "CORE", NT_PRSTATUS, sizeof (ets->prstatus), - &ets->prstatus); - - QTAILQ_INSERT_TAIL(&info->thread_list, ets, ets_link); - - info->notes_size += note_size(&ets->notes[0]); + while (1) { + page_unprotect(start, 0); + if (end - start <= step) { + break; + } + start += step; + } + } + return 0; } -static void init_note_info(struct elf_note_info *info) -{ - /* Initialize the elf_note_info structure so that it is at - * least safe to call free_note_info() on it. Must be - * called before calling fill_note_info(). - */ - memset(info, 0, sizeof (*info)); - QTAILQ_INIT(&info->thread_list); -} +typedef struct { + unsigned count; + size_t size; +} CountAndSizeRegions; -static int fill_note_info(struct elf_note_info *info, - long signr, const CPUArchState *env) +static int wmr_count_and_size_regions(void *opaque, target_ulong start, + target_ulong end, unsigned long flags) { -#define NUMNOTES 3 - CPUState *cpu = env_cpu((CPUArchState *)env); - TaskState *ts = (TaskState *)cpu->opaque; - int i; - - info->notes = g_new0(struct memelfnote, NUMNOTES); - if (info->notes == NULL) - return (-ENOMEM); - info->prstatus = g_malloc0(sizeof (*info->prstatus)); - if (info->prstatus == NULL) - return (-ENOMEM); - info->psinfo = g_malloc0(sizeof (*info->psinfo)); - if (info->prstatus == NULL) - return (-ENOMEM); - - /* - * First fill in status (and registers) of current thread - * including process info & aux vector. 
- */ - fill_prstatus(info->prstatus, ts, signr); - elf_core_copy_regs(&info->prstatus->pr_reg, env); - fill_note(&info->notes[0], "CORE", NT_PRSTATUS, - sizeof (*info->prstatus), info->prstatus); - fill_psinfo(info->psinfo, ts); - fill_note(&info->notes[1], "CORE", NT_PRPSINFO, - sizeof (*info->psinfo), info->psinfo); - fill_auxv_note(&info->notes[2], ts); - info->numnote = 3; - - info->notes_size = 0; - for (i = 0; i < info->numnote; i++) - info->notes_size += note_size(&info->notes[i]); - - /* read and fill status of all threads */ - WITH_QEMU_LOCK_GUARD(&qemu_cpu_list_lock) { - CPU_FOREACH(cpu) { - if (cpu == thread_cpu) { - continue; - } - fill_thread_info(info, cpu_env(cpu)); - } - } + CountAndSizeRegions *css = opaque; - return (0); + css->count++; + css->size += vma_dump_size(start, end, flags); + return 0; } -static void free_note_info(struct elf_note_info *info) +typedef struct { + struct elf_phdr *phdr; + off_t offset; +} FillRegionPhdr; + +static int wmr_fill_region_phdr(void *opaque, target_ulong start, + target_ulong end, unsigned long flags) { - struct elf_thread_status *ets; + FillRegionPhdr *d = opaque; + struct elf_phdr *phdr = d->phdr; - while (!QTAILQ_EMPTY(&info->thread_list)) { - ets = QTAILQ_FIRST(&info->thread_list); - QTAILQ_REMOVE(&info->thread_list, ets, ets_link); - g_free(ets); - } + phdr->p_type = PT_LOAD; + phdr->p_vaddr = start; + phdr->p_paddr = 0; + phdr->p_filesz = vma_dump_size(start, end, flags); + phdr->p_offset = d->offset; + d->offset += phdr->p_filesz; + phdr->p_memsz = end - start; + phdr->p_flags = (flags & PAGE_READ ? PF_R : 0) + | (flags & PAGE_WRITE_ORG ? PF_W : 0) + | (flags & PAGE_EXEC ? PF_X : 0); + phdr->p_align = ELF_EXEC_PAGESIZE; - g_free(info->prstatus); - g_free(info->psinfo); - g_free(info->notes); + bswap_phdr(phdr, 1); + d->phdr = phdr + 1; + return 0; } -static int write_note_info(struct elf_note_info *info, int fd) +static int wmr_write_region(void *opaque, target_ulong start, + target_ulong end, unsigned long flags) { - struct elf_thread_status *ets; - int i, error = 0; + int fd = *(int *)opaque; + size_t size = vma_dump_size(start, end, flags); - /* write prstatus, psinfo and auxv for current thread */ - for (i = 0; i < info->numnote; i++) - if ((error = write_note(&info->notes[i], fd)) != 0) - return (error); - - /* write prstatus for each thread */ - QTAILQ_FOREACH(ets, &info->thread_list, ets_link) { - if ((error = write_note(&ets->notes[0], fd)) != 0) - return (error); + if (!size) { + return 0; } - - return (0); + return dump_write(fd, g2h_untagged(start), size); } /* @@ -4654,151 +4405,125 @@ static int elf_core_dump(int signr, const CPUArchState *env) { const CPUState *cpu = env_cpu((CPUArchState *)env); const TaskState *ts = (const TaskState *)cpu->opaque; - struct vm_area_struct *vma = NULL; - g_autofree char *corefile = NULL; - struct elf_note_info info; - struct elfhdr elf; - struct elf_phdr phdr; struct rlimit dumpsize; - struct mm_struct *mm = NULL; - off_t offset = 0, data_offset = 0; - int segs = 0; + CountAndSizeRegions css; + off_t offset, note_offset, data_offset; + size_t note_size; + int cpus, ret; int fd = -1; - - init_note_info(&info); - - errno = 0; + CPUState *cpu_iter; if (prctl(PR_GET_DUMPABLE) == 0) { return 0; } - if (getrlimit(RLIMIT_CORE, &dumpsize) == 0 && dumpsize.rlim_cur == 0) { + if (getrlimit(RLIMIT_CORE, &dumpsize) < 0 || dumpsize.rlim_cur == 0) { return 0; } - corefile = core_dump_filename(ts); + cpu_list_lock(); + mmap_lock(); - if ((fd = open(corefile, O_WRONLY | O_CREAT, - 
S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH)) < 0) - return (-errno); + /* By unprotecting, we merge vmas that might be split. */ + walk_memory_regions(NULL, wmr_page_unprotect_regions); /* * Walk through target process memory mappings and - * set up structure containing this information. After - * this point vma_xxx functions can be used. + * set up structure containing this information. */ - if ((mm = vma_init()) == NULL) - goto out; + memset(&css, 0, sizeof(css)); + walk_memory_regions(&css, wmr_count_and_size_regions); - walk_memory_regions(mm, vma_walker); - segs = vma_get_mapping_count(mm); - - /* - * Construct valid coredump ELF header. We also - * add one more segment for notes. - */ - fill_elf_header(&elf, segs + 1, ELF_MACHINE, 0); - if (dump_write(fd, &elf, sizeof (elf)) != 0) - goto out; + cpus = 0; + CPU_FOREACH(cpu_iter) { + cpus++; + } - /* fill in the in-memory version of notes */ - if (fill_note_info(&info, signr, env) < 0) - goto out; + offset = sizeof(struct elfhdr); + offset += (css.count + 1) * sizeof(struct elf_phdr); + note_offset = offset; - offset += sizeof (elf); /* elf header */ - offset += (segs + 1) * sizeof (struct elf_phdr); /* program headers */ + offset += size_note("CORE", ts->info->auxv_len); + offset += size_note("CORE", sizeof(struct target_elf_prpsinfo)); + offset += size_note("CORE", sizeof(struct target_elf_prstatus)) * cpus; + note_size = offset - note_offset; + data_offset = ROUND_UP(offset, ELF_EXEC_PAGESIZE); - /* write out notes program header */ - fill_elf_note_phdr(&phdr, info.notes_size, offset); + /* Do not dump if the corefile size exceeds the limit. */ + if (dumpsize.rlim_cur != RLIM_INFINITY + && dumpsize.rlim_cur < data_offset + css.size) { + errno = 0; + goto out; + } - offset += info.notes_size; - if (dump_write(fd, &phdr, sizeof (phdr)) != 0) + { + g_autofree char *corefile = core_dump_filename(ts); + fd = open(corefile, O_WRONLY | O_CREAT | O_TRUNC, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + } + if (fd < 0) { goto out; + } /* - * ELF specification wants data to start at page boundary so - * we align it here. + * There is a fair amount of alignment padding within the notes + * as well as preceeding the process memory. Allocate a zeroed + * block to hold it all. Write all of the headers directly into + * this buffer and then write it out as a block. */ - data_offset = offset = roundup(offset, ELF_EXEC_PAGESIZE); + { + g_autofree void *header = g_malloc0(data_offset); + FillRegionPhdr frp; + void *hptr, *dptr; + + /* Create elf file header. */ + hptr = header; + fill_elf_header(hptr, css.count + 1, ELF_MACHINE, 0); + hptr += sizeof(struct elfhdr); + + /* Create elf program headers. */ + fill_elf_note_phdr(hptr, note_size, note_offset); + hptr += sizeof(struct elf_phdr); + + frp.phdr = hptr; + frp.offset = data_offset; + walk_memory_regions(&frp, wmr_fill_region_phdr); + hptr = frp.phdr; + + /* Create the notes. */ + dptr = fill_note(&hptr, NT_AUXV, "CORE", ts->info->auxv_len); + fill_auxv_note(dptr, ts); + + dptr = fill_note(&hptr, NT_PRPSINFO, "CORE", + sizeof(struct target_elf_prpsinfo)); + fill_prpsinfo_note(dptr, ts); + + CPU_FOREACH(cpu_iter) { + dptr = fill_note(&hptr, NT_PRSTATUS, "CORE", + sizeof(struct target_elf_prstatus)); + fill_prstatus_note(dptr, ts, cpu_iter, + cpu_iter == cpu ? signr : 0); + } - /* - * Write program headers for memory regions mapped in - * the target process. 
- */ - for (vma = vma_first(mm); vma != NULL; vma = vma_next(vma)) { - (void) memset(&phdr, 0, sizeof (phdr)); - - phdr.p_type = PT_LOAD; - phdr.p_offset = offset; - phdr.p_vaddr = vma->vma_start; - phdr.p_paddr = 0; - phdr.p_filesz = vma_dump_size(vma); - offset += phdr.p_filesz; - phdr.p_memsz = vma->vma_end - vma->vma_start; - phdr.p_flags = vma->vma_flags & PROT_READ ? PF_R : 0; - if (vma->vma_flags & PROT_WRITE) - phdr.p_flags |= PF_W; - if (vma->vma_flags & PROT_EXEC) - phdr.p_flags |= PF_X; - phdr.p_align = ELF_EXEC_PAGESIZE; - - bswap_phdr(&phdr, 1); - if (dump_write(fd, &phdr, sizeof(phdr)) != 0) { + if (dump_write(fd, header, data_offset) < 0) { goto out; } } /* - * Next we write notes just after program headers. No - * alignment needed here. + * Finally write process memory into the corefile as well. */ - if (write_note_info(&info, fd) < 0) + if (walk_memory_regions(&fd, wmr_write_region) < 0) { goto out; - - /* align data to page boundary */ - if (lseek(fd, data_offset, SEEK_SET) != data_offset) - goto out; - - /* - * Finally we can dump process memory into corefile as well. - */ - for (vma = vma_first(mm); vma != NULL; vma = vma_next(vma)) { - abi_ulong addr; - abi_ulong end; - - end = vma->vma_start + vma_dump_size(vma); - - for (addr = vma->vma_start; addr < end; - addr += TARGET_PAGE_SIZE) { - char page[TARGET_PAGE_SIZE]; - int error; - - /* - * Read in page from target process memory and - * write it to coredump file. - */ - error = copy_from_user(page, addr, sizeof (page)); - if (error != 0) { - (void) fprintf(stderr, "unable to dump " TARGET_ABI_FMT_lx "\n", - addr); - errno = -error; - goto out; - } - if (dump_write(fd, page, TARGET_PAGE_SIZE) < 0) - goto out; - } } + errno = 0; out: - free_note_info(&info); - if (mm != NULL) - vma_delete(mm); - (void) close(fd); - - if (errno != 0) - return (-errno); - return (0); + ret = -errno; + mmap_unlock(); + cpu_list_unlock(); + close(fd); + return ret; } #endif /* USE_ELF_CORE_DUMP */ diff --git a/linux-user/loongarch64/target_syscall.h b/linux-user/loongarch64/target_syscall.h index 8b5de52124..39f229bb9c 100644 --- a/linux-user/loongarch64/target_syscall.h +++ b/linux-user/loongarch64/target_syscall.h @@ -38,11 +38,4 @@ struct target_pt_regs { #define TARGET_MCL_FUTURE 2 #define TARGET_MCL_ONFAULT 4 -#define TARGET_FORCE_SHMLBA - -static inline abi_ulong target_shmlba(CPULoongArchState *env) -{ - return 64 * KiB; -} - #endif diff --git a/linux-user/main.c b/linux-user/main.c index 74b2fbb393..551acf1661 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -55,6 +55,7 @@ #include "loader.h" #include "user-mmap.h" #include "tcg/perf.h" +#include "exec/page-vary.h" #ifdef CONFIG_SEMIHOSTING #include "semihosting/semihost.h" @@ -332,11 +333,11 @@ static void handle_arg_ld_prefix(const char *arg) static void handle_arg_pagesize(const char *arg) { - qemu_host_page_size = atoi(arg); - if (qemu_host_page_size == 0 || - (qemu_host_page_size & (qemu_host_page_size - 1)) != 0) { - fprintf(stderr, "page size must be a power of two\n"); - exit(EXIT_FAILURE); + unsigned size, want = qemu_real_host_page_size(); + + if (qemu_strtoui(arg, NULL, 10, &size) || size != want) { + warn_report("Deprecated page size option cannot " + "change host page size (%u)", want); } } @@ -496,7 +497,7 @@ static const struct qemu_argument arg_table[] = { {"D", "QEMU_LOG_FILENAME", true, handle_arg_log_filename, "logfile", "write logs to 'logfile' (default stderr)"}, {"p", "QEMU_PAGESIZE", true, handle_arg_pagesize, - "pagesize", "set the host page size to 
'pagesize'"}, + "pagesize", "deprecated change to host page size"}, {"one-insn-per-tb", "QEMU_ONE_INSN_PER_TB", false, handle_arg_one_insn_per_tb, "", "run with one guest instruction per emulated TB"}, @@ -680,6 +681,7 @@ int main(int argc, char **argv, char **envp) int i; int ret; int execfd; + int host_page_size; unsigned long max_reserved_va; bool preserve_argv0; @@ -781,7 +783,7 @@ int main(int argc, char **argv, char **envp) } cpu_type = parse_cpu_option(cpu_model); - /* init tcg before creating CPUs and to get qemu_host_page_size */ + /* init tcg before creating CPUs */ { AccelState *accel = current_accel(); AccelClass *ac = ACCEL_GET_CLASS(accel); @@ -791,6 +793,16 @@ int main(int argc, char **argv, char **envp) opt_one_insn_per_tb, &error_abort); ac->init_machine(NULL); } + + /* + * Finalize page size before creating CPUs. + * This will do nothing if !TARGET_PAGE_BITS_VARY. + * The most efficient setting is to match the host. + */ + host_page_size = qemu_real_host_page_size(); + set_preferred_target_page_bits(ctz32(host_page_size)); + finalize_target_page_bits(); + cpu = cpu_create(cpu_type); env = cpu_env(cpu); cpu_reset(cpu); @@ -804,8 +816,8 @@ int main(int argc, char **argv, char **envp) */ max_reserved_va = MAX_RESERVED_VA(cpu); if (reserved_va != 0) { - if ((reserved_va + 1) % qemu_host_page_size) { - char *s = size_to_str(qemu_host_page_size); + if ((reserved_va + 1) % host_page_size) { + char *s = size_to_str(host_page_size); fprintf(stderr, "Reserved virtual address not aligned mod %s\n", s); g_free(s); exit(EXIT_FAILURE); @@ -889,7 +901,7 @@ int main(int argc, char **argv, char **envp) if ((fp = fopen("/proc/sys/vm/mmap_min_addr", "r")) != NULL) { unsigned long tmp; if (fscanf(fp, "%lu", &tmp) == 1 && tmp != 0) { - mmap_min_addr = tmp; + mmap_min_addr = MAX(tmp, host_page_size); qemu_log_mask(CPU_LOG_PAGE, "host mmap_min_addr=0x%lx\n", mmap_min_addr); } @@ -902,7 +914,7 @@ int main(int argc, char **argv, char **envp) * If we're in a chroot with no /proc, fall back to 1 page. */ if (mmap_min_addr == 0) { - mmap_min_addr = qemu_host_page_size; + mmap_min_addr = host_page_size; qemu_log_mask(CPU_LOG_PAGE, "host mmap_min_addr=0x%lx (fallback)\n", mmap_min_addr); diff --git a/linux-user/mmap.c b/linux-user/mmap.c index 96c9433e27..4505fd7376 100644 --- a/linux-user/mmap.c +++ b/linux-user/mmap.c @@ -165,6 +165,7 @@ static int target_to_host_prot(int prot) /* NOTE: all the constants are the HOST ones, but addresses are target. */ int target_mprotect(abi_ulong start, abi_ulong len, int target_prot) { + int host_page_size = qemu_real_host_page_size(); abi_ulong starts[3]; abi_ulong lens[3]; int prots[3]; @@ -189,13 +190,13 @@ int target_mprotect(abi_ulong start, abi_ulong len, int target_prot) } last = start + len - 1; - host_start = start & qemu_host_page_mask; - host_last = HOST_PAGE_ALIGN(last) - 1; + host_start = start & -host_page_size; + host_last = ROUND_UP(last, host_page_size) - 1; nranges = 0; mmap_lock(); - if (host_last - host_start < qemu_host_page_size) { + if (host_last - host_start < host_page_size) { /* Single host page contains all guest pages: sum the prot. 
*/ prot1 = target_prot; for (abi_ulong a = host_start; a < start; a += TARGET_PAGE_SIZE) { @@ -205,7 +206,7 @@ int target_mprotect(abi_ulong start, abi_ulong len, int target_prot) prot1 |= page_get_flags(a + 1); } starts[nranges] = host_start; - lens[nranges] = qemu_host_page_size; + lens[nranges] = host_page_size; prots[nranges] = prot1; nranges++; } else { @@ -218,10 +219,10 @@ int target_mprotect(abi_ulong start, abi_ulong len, int target_prot) /* If the resulting sum differs, create a new range. */ if (prot1 != target_prot) { starts[nranges] = host_start; - lens[nranges] = qemu_host_page_size; + lens[nranges] = host_page_size; prots[nranges] = prot1; nranges++; - host_start += qemu_host_page_size; + host_start += host_page_size; } } @@ -233,9 +234,9 @@ int target_mprotect(abi_ulong start, abi_ulong len, int target_prot) } /* If the resulting sum differs, create a new range. */ if (prot1 != target_prot) { - host_last -= qemu_host_page_size; + host_last -= host_page_size; starts[nranges] = host_last + 1; - lens[nranges] = qemu_host_page_size; + lens[nranges] = host_page_size; prots[nranges] = prot1; nranges++; } @@ -266,10 +267,35 @@ int target_mprotect(abi_ulong start, abi_ulong len, int target_prot) return ret; } -/* map an incomplete host page */ +/* + * Perform munmap on behalf of the target, with host parameters. + * If reserved_va, we must replace the memory reservation. + */ +static int do_munmap(void *addr, size_t len) +{ + if (reserved_va) { + void *ptr = mmap(addr, len, PROT_NONE, + MAP_FIXED | MAP_ANONYMOUS + | MAP_PRIVATE | MAP_NORESERVE, -1, 0); + return ptr == addr ? 0 : -1; + } + return munmap(addr, len); +} + +/* + * Map an incomplete host page. + * + * Here be dragons. This case will not work if there is an existing + * overlapping host page, which is file mapped, and for which the mapping + * is beyond the end of the file. In that case, we will see SIGBUS when + * trying to write a portion of this page. + * + * FIXME: Work around this with a temporary signal handler and longjmp. + */ static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last, int prot, int flags, int fd, off_t offset) { + int host_page_size = qemu_real_host_page_size(); abi_ulong real_last; void *host_start; int prot_old, prot_new; @@ -286,7 +312,7 @@ static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last, return false; } - real_last = real_start + qemu_host_page_size - 1; + real_last = real_start + host_page_size - 1; host_start = g2h_untagged(real_start); /* Get the protection of the target pages outside the mapping. */ @@ -304,12 +330,12 @@ static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last, * outside of the fragment we need to map. Allocate a new host * page to cover, discarding whatever else may have been present. */ - void *p = mmap(host_start, qemu_host_page_size, + void *p = mmap(host_start, host_page_size, target_to_host_prot(prot), flags | MAP_ANONYMOUS, -1, 0); if (p != host_start) { if (p != MAP_FAILED) { - munmap(p, qemu_host_page_size); + do_munmap(p, host_page_size); errno = EEXIST; } return false; @@ -324,7 +350,7 @@ static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last, /* Adjust protection to be able to write. */ if (!(host_prot_old & PROT_WRITE)) { host_prot_old |= PROT_WRITE; - mprotect(host_start, qemu_host_page_size, host_prot_old); + mprotect(host_start, host_page_size, host_prot_old); } /* Read or zero the new guest pages. 
*/ @@ -338,7 +364,7 @@ static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last, /* Put final protection */ if (host_prot_new != host_prot_old) { - mprotect(host_start, qemu_host_page_size, host_prot_new); + mprotect(host_start, host_page_size, host_prot_new); } return true; } @@ -373,21 +399,21 @@ static abi_ulong mmap_find_vma_reserved(abi_ulong start, abi_ulong size, */ abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size, abi_ulong align) { + int host_page_size = qemu_real_host_page_size(); void *ptr, *prev; abi_ulong addr; int wrapped, repeat; - align = MAX(align, qemu_host_page_size); + align = MAX(align, host_page_size); /* If 'start' == 0, then a default start address is used. */ if (start == 0) { start = mmap_next_start; } else { - start &= qemu_host_page_mask; + start &= -host_page_size; } start = ROUND_UP(start, align); - - size = HOST_PAGE_ALIGN(size); + size = ROUND_UP(size, host_page_size); if (reserved_va) { return mmap_find_vma_reserved(start, size, align); @@ -488,302 +514,463 @@ abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size, abi_ulong align) } } -/* NOTE: all the constants are the HOST ones */ -abi_long target_mmap(abi_ulong start, abi_ulong len, int target_prot, - int flags, int fd, off_t offset) +/* + * Record a successful mmap within the user-exec interval tree. + */ +static abi_long mmap_end(abi_ulong start, abi_ulong last, + abi_ulong passthrough_start, + abi_ulong passthrough_last, + int flags, int page_flags) { - abi_ulong ret, last, real_start, real_last, retaddr, host_len; - abi_ulong passthrough_start = -1, passthrough_last = 0; - int page_flags; - off_t host_offset; - - mmap_lock(); - trace_target_mmap(start, len, target_prot, flags, fd, offset); - - if (!len) { - errno = EINVAL; - goto fail; + if (flags & MAP_ANONYMOUS) { + page_flags |= PAGE_ANON; } - - page_flags = validate_prot_to_pageflags(target_prot); - if (!page_flags) { - errno = EINVAL; - goto fail; + page_flags |= PAGE_RESET; + if (passthrough_start > passthrough_last) { + page_set_flags(start, last, page_flags); + } else { + if (start < passthrough_start) { + page_set_flags(start, passthrough_start - 1, page_flags); + } + page_set_flags(passthrough_start, passthrough_last, + page_flags | PAGE_PASSTHROUGH); + if (passthrough_last < last) { + page_set_flags(passthrough_last + 1, last, page_flags); + } } - - /* Also check for overflows... */ - len = TARGET_PAGE_ALIGN(len); - if (!len) { - errno = ENOMEM; - goto fail; + shm_region_rm_complete(start, last); + trace_target_mmap_complete(start); + if (qemu_loglevel_mask(CPU_LOG_PAGE)) { + FILE *f = qemu_log_trylock(); + if (f) { + fprintf(f, "page layout changed following mmap\n"); + page_dump(f); + qemu_log_unlock(f); + } } + return start; +} - if (offset & ~TARGET_PAGE_MASK) { - errno = EINVAL; - goto fail; - } +/* + * Special case host page size == target page size, + * where there are no edge conditions. + */ +static abi_long mmap_h_eq_g(abi_ulong start, abi_ulong len, + int host_prot, int flags, int page_flags, + int fd, off_t offset) +{ + void *p, *want_p = g2h_untagged(start); + abi_ulong last; - /* - * If we're mapping shared memory, ensure we generate code for parallel - * execution and flush old translations. This will work up to the level - * supported by the host -- anything that requires EXCP_ATOMIC will not - * be atomic with respect to an external process. 
-     */
-    if (flags & MAP_SHARED) {
-        CPUState *cpu = thread_cpu;
-        if (!(cpu->tcg_cflags & CF_PARALLEL)) {
-            cpu->tcg_cflags |= CF_PARALLEL;
-            tb_flush(cpu);
-        }
+    p = mmap(want_p, len, host_prot, flags, fd, offset);
+    if (p == MAP_FAILED) {
+        return -1;
+    }
+    /* If the host kernel does not support MAP_FIXED_NOREPLACE, emulate. */
+    if ((flags & MAP_FIXED_NOREPLACE) && p != want_p) {
+        do_munmap(p, len);
+        errno = EEXIST;
+        return -1;
     }
 
-    real_start = start & qemu_host_page_mask;
-    host_offset = offset & qemu_host_page_mask;
+    start = h2g(p);
+    last = start + len - 1;
+    return mmap_end(start, last, start, last, flags, page_flags);
+}
 
-    /*
-     * If the user is asking for the kernel to find a location, do that
-     * before we truncate the length for mapping files below.
-     */
-    if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
-        host_len = len + offset - host_offset;
-        host_len = HOST_PAGE_ALIGN(host_len);
-        start = mmap_find_vma(real_start, host_len, TARGET_PAGE_SIZE);
-        if (start == (abi_ulong)-1) {
-            errno = ENOMEM;
-            goto fail;
-        }
-    }
+/*
+ * Special case host page size < target page size.
+ *
+ * The two special cases are increased guest alignment, and mapping
+ * past the end of a file.
+ *
+ * When mapping files into a memory area larger than the file,
+ * accesses to pages beyond the file size will cause a SIGBUS.
+ *
+ * For example, if mmaping a file of 100 bytes on a host with 4K
+ * pages emulating a target with 8K pages, the target expects to
+ * be able to access the first 8K. But the host will trap us on
+ * any access beyond 4K.
+ *
+ * When emulating a target with a larger page-size than the host's,
+ * we may need to truncate file maps at EOF and add extra anonymous
+ * pages up to the target's page boundary.
+ *
+ * This workaround only works for files that do not change.
+ * If the file is later extended (e.g. ftruncate), the SIGBUS
+ * vanishes and the proper behaviour is that changes within the
+ * anon page should be reflected in the file.
+ *
+ * However, this case is rather common with executable images,
+ * so the workaround is important for even trivial tests, whereas
+ * the mmap of a file being extended is less common.
+ */
+static abi_long mmap_h_lt_g(abi_ulong start, abi_ulong len, int host_prot,
+                            int mmap_flags, int page_flags, int fd,
+                            off_t offset, int host_page_size)
+{
+    void *p, *want_p = g2h_untagged(start);
+    off_t fileend_adj = 0;
+    int flags = mmap_flags;
+    abi_ulong last, pass_last;
 
-    /*
-     * When mapping files into a memory area larger than the file, accesses
-     * to pages beyond the file size will cause a SIGBUS.
-     *
-     * For example, if mmaping a file of 100 bytes on a host with 4K pages
-     * emulating a target with 8K pages, the target expects to be able to
-     * access the first 8K. But the host will trap us on any access beyond
-     * 4K.
-     *
-     * When emulating a target with a larger page-size than the hosts, we
-     * may need to truncate file maps at EOF and add extra anonymous pages
-     * up to the targets page boundary.
-     */
-    if ((qemu_real_host_page_size() < qemu_host_page_size) &&
-        !(flags & MAP_ANONYMOUS)) {
+    if (!(flags & MAP_ANONYMOUS)) {
         struct stat sb;
 
         if (fstat(fd, &sb) == -1) {
-            goto fail;
+            return -1;
         }
-
-        /* Are we trying to create a map beyond EOF?. */
-        if (offset + len > sb.st_size) {
+        if (offset >= sb.st_size) {
             /*
-             * If so, truncate the file map at eof aligned with
-             * the hosts real pagesize. Additional anonymous maps
-             * will be created beyond EOF.
+             * The entire map is beyond the end of the file.
+ * Transform it to an anonymous mapping. */ - len = REAL_HOST_PAGE_ALIGN(sb.st_size - offset); + flags |= MAP_ANONYMOUS; + fd = -1; + offset = 0; + } else if (offset + len > sb.st_size) { + /* + * A portion of the map is beyond the end of the file. + * Truncate the file portion of the allocation. + */ + fileend_adj = offset + len - sb.st_size; } } - if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) { - uintptr_t host_start; - int host_prot; - void *p; + if (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) { + if (fileend_adj) { + p = mmap(want_p, len, host_prot, flags | MAP_ANONYMOUS, -1, 0); + } else { + p = mmap(want_p, len, host_prot, flags, fd, offset); + } + if (p != want_p) { + if (p != MAP_FAILED) { + /* Host does not support MAP_FIXED_NOREPLACE: emulate. */ + do_munmap(p, len); + errno = EEXIST; + } + return -1; + } - host_len = len + offset - host_offset; - host_len = HOST_PAGE_ALIGN(host_len); - host_prot = target_to_host_prot(target_prot); + if (fileend_adj) { + void *t = mmap(p, len - fileend_adj, host_prot, + (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED, + fd, offset); + + if (t == MAP_FAILED) { + int save_errno = errno; + + /* + * We failed a map over the top of the successful anonymous + * mapping above. The only failure mode is running out of VMAs, + * and there's nothing that we can do to detect that earlier. + * If we have replaced an existing mapping with MAP_FIXED, + * then we cannot properly recover. It's a coin toss whether + * it would be better to exit or continue here. + */ + if (!(flags & MAP_FIXED_NOREPLACE) && + !page_check_range_empty(start, start + len - 1)) { + qemu_log("QEMU target_mmap late failure: %s", + strerror(save_errno)); + } + + do_munmap(want_p, len); + errno = save_errno; + return -1; + } + } + } else { + size_t host_len, part_len; /* - * Note: we prefer to control the mapping address. It is - * especially important if qemu_host_page_size > - * qemu_real_host_page_size. + * Take care to align the host memory. Perform a larger anonymous + * allocation and extract the aligned portion. Remap the file on + * top of that. 
*/ - p = mmap(g2h_untagged(start), host_len, host_prot, - flags | MAP_FIXED | MAP_ANONYMOUS, -1, 0); + host_len = len + TARGET_PAGE_SIZE - host_page_size; + p = mmap(want_p, host_len, host_prot, flags | MAP_ANONYMOUS, -1, 0); if (p == MAP_FAILED) { - goto fail; + return -1; + } + + part_len = (uintptr_t)p & (TARGET_PAGE_SIZE - 1); + if (part_len) { + part_len = TARGET_PAGE_SIZE - part_len; + do_munmap(p, part_len); + p += part_len; + host_len -= part_len; } - /* update start so that it points to the file position at 'offset' */ - host_start = (uintptr_t)p; + if (len < host_len) { + do_munmap(p + len, host_len - len); + } + if (!(flags & MAP_ANONYMOUS)) { - p = mmap(g2h_untagged(start), len, host_prot, - flags | MAP_FIXED, fd, host_offset); - if (p == MAP_FAILED) { - munmap(g2h_untagged(start), host_len); - goto fail; + void *t = mmap(p, len - fileend_adj, host_prot, + flags | MAP_FIXED, fd, offset); + + if (t == MAP_FAILED) { + int save_errno = errno; + do_munmap(p, len); + errno = save_errno; + return -1; } - host_start += offset - host_offset; } - start = h2g(host_start); - last = start + len - 1; - passthrough_start = start; - passthrough_last = last; + + start = h2g(p); + } + + last = start + len - 1; + if (fileend_adj) { + pass_last = ROUND_UP(last - fileend_adj, host_page_size) - 1; } else { - if (start & ~TARGET_PAGE_MASK) { - errno = EINVAL; - goto fail; + pass_last = last; + } + return mmap_end(start, last, start, pass_last, mmap_flags, page_flags); +} + +/* + * Special case host page size > target page size. + * + * The two special cases are address and file offsets that are valid + * for the guest that cannot be directly represented by the host. + */ +static abi_long mmap_h_gt_g(abi_ulong start, abi_ulong len, + int target_prot, int host_prot, + int flags, int page_flags, int fd, + off_t offset, int host_page_size) +{ + void *p, *want_p = g2h_untagged(start); + off_t host_offset = offset & -host_page_size; + abi_ulong last, real_start, real_last; + bool misaligned_offset = false; + size_t host_len; + + if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) { + /* + * Adjust the offset to something representable on the host. + */ + host_len = len + offset - host_offset; + p = mmap(want_p, host_len, host_prot, flags, fd, host_offset); + if (p == MAP_FAILED) { + return -1; } + + /* Update start to the file position at offset. */ + p += offset - host_offset; + + start = h2g(p); last = start + len - 1; - real_last = HOST_PAGE_ALIGN(last) - 1; + return mmap_end(start, last, start, last, flags, page_flags); + } + + if (!(flags & MAP_ANONYMOUS)) { + misaligned_offset = (start ^ offset) & (host_page_size - 1); /* - * Test if requested memory area fits target address space - * It can fail only on 64-bit host with 32-bit target. - * On any other target/host host mmap() handles this error correctly. + * The fallback for misalignment is a private mapping + read. + * This carries none of semantics required of MAP_SHARED. */ - if (last < start || !guest_range_valid_untagged(start, len)) { - errno = ENOMEM; - goto fail; + if (misaligned_offset && (flags & MAP_TYPE) != MAP_PRIVATE) { + errno = EINVAL; + return -1; } + } - if (flags & MAP_FIXED_NOREPLACE) { - /* Validate that the chosen range is empty. 
*/ - if (!page_check_range_empty(start, last)) { - errno = EEXIST; - goto fail; - } + last = start + len - 1; + real_start = start & -host_page_size; + real_last = ROUND_UP(last, host_page_size) - 1; - /* - * With reserved_va, the entire address space is mmaped in the - * host to ensure it isn't accidentally used for something else. - * We have just checked that the guest address is not mapped - * within the guest, but need to replace the host reservation. - * - * Without reserved_va, despite the guest address check above, - * keep MAP_FIXED_NOREPLACE so that the guest does not overwrite - * any host address mappings. - */ - if (reserved_va) { - flags = (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED; + /* + * Handle the start and end of the mapping. + */ + if (real_start < start) { + abi_ulong real_page_last = real_start + host_page_size - 1; + if (last <= real_page_last) { + /* Entire allocation a subset of one host page. */ + if (!mmap_frag(real_start, start, last, target_prot, + flags, fd, offset)) { + return -1; } + return mmap_end(start, last, -1, 0, flags, page_flags); } - /* - * worst case: we cannot map the file because the offset is not - * aligned, so we read it - */ - if (!(flags & MAP_ANONYMOUS) && - (offset & ~qemu_host_page_mask) != (start & ~qemu_host_page_mask)) { - /* - * msync() won't work here, so we return an error if write is - * possible while it is a shared mapping - */ - if ((flags & MAP_TYPE) == MAP_SHARED - && (target_prot & PROT_WRITE)) { - errno = EINVAL; - goto fail; - } - retaddr = target_mmap(start, len, target_prot | PROT_WRITE, - (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) - | MAP_PRIVATE | MAP_ANONYMOUS, - -1, 0); - if (retaddr == -1) { - goto fail; - } - if (pread(fd, g2h_untagged(start), len, offset) == -1) { - goto fail; - } - if (!(target_prot & PROT_WRITE)) { - ret = target_mprotect(start, len, target_prot); - assert(ret == 0); - } - goto the_end; + if (!mmap_frag(real_start, start, real_page_last, target_prot, + flags, fd, offset)) { + return -1; } + real_start = real_page_last + 1; + } - /* handle the start of the mapping */ - if (start > real_start) { - if (real_last == real_start + qemu_host_page_size - 1) { - /* one single host page */ - if (!mmap_frag(real_start, start, last, - target_prot, flags, fd, offset)) { - goto fail; - } - goto the_end1; - } - if (!mmap_frag(real_start, start, - real_start + qemu_host_page_size - 1, - target_prot, flags, fd, offset)) { - goto fail; - } - real_start += qemu_host_page_size; + if (last < real_last) { + abi_ulong real_page_start = real_last - host_page_size + 1; + if (!mmap_frag(real_page_start, real_page_start, last, + target_prot, flags, fd, + offset + real_page_start - start)) { + return -1; } - /* handle the end of the mapping */ - if (last < real_last) { - abi_ulong real_page = real_last - qemu_host_page_size + 1; - if (!mmap_frag(real_page, real_page, last, - target_prot, flags, fd, - offset + real_page - start)) { - goto fail; - } - real_last -= qemu_host_page_size; + real_last = real_page_start - 1; + } + + if (real_start > real_last) { + return mmap_end(start, last, -1, 0, flags, page_flags); + } + + /* + * Handle the middle of the mapping. 
+ */ + + host_len = real_last - real_start + 1; + want_p += real_start - start; + + if (flags & MAP_ANONYMOUS) { + p = mmap(want_p, host_len, host_prot, flags, -1, 0); + } else if (!misaligned_offset) { + p = mmap(want_p, host_len, host_prot, flags, fd, + offset + real_start - start); + } else { + p = mmap(want_p, host_len, host_prot | PROT_WRITE, + flags | MAP_ANONYMOUS, -1, 0); + } + if (p != want_p) { + if (p != MAP_FAILED) { + do_munmap(p, host_len); + errno = EEXIST; } + return -1; + } - /* map the middle (easier) */ - if (real_start < real_last) { - void *p, *want_p; - off_t offset1; - size_t len1; + if (misaligned_offset) { + /* TODO: The read could be short. */ + if (pread(fd, p, host_len, offset + real_start - start) != host_len) { + do_munmap(p, host_len); + return -1; + } + if (!(host_prot & PROT_WRITE)) { + mprotect(p, host_len, host_prot); + } + } - if (flags & MAP_ANONYMOUS) { - offset1 = 0; - } else { - offset1 = offset + real_start - start; + return mmap_end(start, last, -1, 0, flags, page_flags); +} + +static abi_long target_mmap__locked(abi_ulong start, abi_ulong len, + int target_prot, int flags, int page_flags, + int fd, off_t offset) +{ + int host_page_size = qemu_real_host_page_size(); + int host_prot; + + /* + * For reserved_va, we are in full control of the allocation. + * Find a suitable hole and convert to MAP_FIXED. + */ + if (reserved_va) { + if (flags & MAP_FIXED_NOREPLACE) { + /* Validate that the chosen range is empty. */ + if (!page_check_range_empty(start, start + len - 1)) { + errno = EEXIST; + return -1; } - len1 = real_last - real_start + 1; - want_p = g2h_untagged(real_start); - - p = mmap(want_p, len1, target_to_host_prot(target_prot), - flags, fd, offset1); - if (p != want_p) { - if (p != MAP_FAILED) { - munmap(p, len1); - errno = EEXIST; - } - goto fail; + flags = (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED; + } else if (!(flags & MAP_FIXED)) { + abi_ulong real_start = start & -host_page_size; + off_t host_offset = offset & -host_page_size; + size_t real_len = len + offset - host_offset; + abi_ulong align = MAX(host_page_size, TARGET_PAGE_SIZE); + + start = mmap_find_vma(real_start, real_len, align); + if (start == (abi_ulong)-1) { + errno = ENOMEM; + return -1; } - passthrough_start = real_start; - passthrough_last = real_last; + start += offset - host_offset; + flags |= MAP_FIXED; } } - the_end1: - if (flags & MAP_ANONYMOUS) { - page_flags |= PAGE_ANON; - } - page_flags |= PAGE_RESET; - if (passthrough_start > passthrough_last) { - page_set_flags(start, last, page_flags); + + host_prot = target_to_host_prot(target_prot); + + if (host_page_size == TARGET_PAGE_SIZE) { + return mmap_h_eq_g(start, len, host_prot, flags, + page_flags, fd, offset); + } else if (host_page_size < TARGET_PAGE_SIZE) { + return mmap_h_lt_g(start, len, host_prot, flags, + page_flags, fd, offset, host_page_size); } else { - if (start < passthrough_start) { - page_set_flags(start, passthrough_start - 1, page_flags); + return mmap_h_gt_g(start, len, target_prot, host_prot, flags, + page_flags, fd, offset, host_page_size); + } +} + +/* NOTE: all the constants are the HOST ones */ +abi_long target_mmap(abi_ulong start, abi_ulong len, int target_prot, + int flags, int fd, off_t offset) +{ + abi_long ret; + int page_flags; + + trace_target_mmap(start, len, target_prot, flags, fd, offset); + + if (!len) { + errno = EINVAL; + return -1; + } + + page_flags = validate_prot_to_pageflags(target_prot); + if (!page_flags) { + errno = EINVAL; + return -1; + } + + /* Also check for overflows... 
*/ + len = TARGET_PAGE_ALIGN(len); + if (!len || len != (size_t)len) { + errno = ENOMEM; + return -1; + } + + if (offset & ~TARGET_PAGE_MASK) { + errno = EINVAL; + return -1; + } + if (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) { + if (start & ~TARGET_PAGE_MASK) { + errno = EINVAL; + return -1; } - page_set_flags(passthrough_start, passthrough_last, - page_flags | PAGE_PASSTHROUGH); - if (passthrough_last < last) { - page_set_flags(passthrough_last + 1, last, page_flags); + if (!guest_range_valid_untagged(start, len)) { + errno = ENOMEM; + return -1; } } - shm_region_rm_complete(start, last); - the_end: - trace_target_mmap_complete(start); - if (qemu_loglevel_mask(CPU_LOG_PAGE)) { - FILE *f = qemu_log_trylock(); - if (f) { - fprintf(f, "page layout changed following mmap\n"); - page_dump(f); - qemu_log_unlock(f); + + mmap_lock(); + + ret = target_mmap__locked(start, len, target_prot, flags, + page_flags, fd, offset); + + mmap_unlock(); + + /* + * If we're mapping shared memory, ensure we generate code for parallel + * execution and flush old translations. This will work up to the level + * supported by the host -- anything that requires EXCP_ATOMIC will not + * be atomic with respect to an external process. + */ + if (ret != -1 && (flags & MAP_TYPE) != MAP_PRIVATE) { + CPUState *cpu = thread_cpu; + if (!(cpu->tcg_cflags & CF_PARALLEL)) { + cpu->tcg_cflags |= CF_PARALLEL; + tb_flush(cpu); } } - mmap_unlock(); - return start; -fail: - mmap_unlock(); - return -1; + + return ret; } static int mmap_reserve_or_unmap(abi_ulong start, abi_ulong len) { + int host_page_size = qemu_real_host_page_size(); abi_ulong real_start; abi_ulong real_last; abi_ulong real_len; @@ -793,8 +980,8 @@ static int mmap_reserve_or_unmap(abi_ulong start, abi_ulong len) int prot; last = start + len - 1; - real_start = start & qemu_host_page_mask; - real_last = HOST_PAGE_ALIGN(last) - 1; + real_start = start & -host_page_size; + real_last = ROUND_UP(last, host_page_size) - 1; /* * If guest pages remain on the first or last host pages, @@ -802,7 +989,7 @@ static int mmap_reserve_or_unmap(abi_ulong start, abi_ulong len) * The single page special case is required for the last page, * lest real_start overflow to zero. */ - if (real_last - real_start < qemu_host_page_size) { + if (real_last - real_start < host_page_size) { prot = 0; for (a = real_start; a < start; a += TARGET_PAGE_SIZE) { prot |= page_get_flags(a); @@ -818,14 +1005,14 @@ static int mmap_reserve_or_unmap(abi_ulong start, abi_ulong len) prot |= page_get_flags(a); } if (prot != 0) { - real_start += qemu_host_page_size; + real_start += host_page_size; } for (prot = 0, a = last; a < real_last; a += TARGET_PAGE_SIZE) { prot |= page_get_flags(a + 1); } if (prot != 0) { - real_last -= qemu_host_page_size; + real_last -= host_page_size; } if (real_last < real_start) { @@ -836,13 +1023,7 @@ static int mmap_reserve_or_unmap(abi_ulong start, abi_ulong len) real_len = real_last - real_start + 1; host_start = g2h_untagged(real_start); - if (reserved_va) { - void *ptr = mmap(host_start, real_len, PROT_NONE, - MAP_FIXED | MAP_ANONYMOUS - | MAP_PRIVATE | MAP_NORESERVE, -1, 0); - return ptr == host_start ? 
0 : -1; - } - return munmap(host_start, real_len); + return do_munmap(host_start, real_len); } int target_munmap(abi_ulong start, abi_ulong len) @@ -1055,69 +1236,161 @@ static inline abi_ulong target_shmlba(CPUArchState *cpu_env) } #endif +#if defined(__arm__) || defined(__mips__) || defined(__sparc__) +#define HOST_FORCE_SHMLBA 1 +#else +#define HOST_FORCE_SHMLBA 0 +#endif + abi_ulong target_shmat(CPUArchState *cpu_env, int shmid, abi_ulong shmaddr, int shmflg) { CPUState *cpu = env_cpu(cpu_env); - abi_ulong raddr; struct shmid_ds shm_info; int ret; - abi_ulong shmlba; + int h_pagesize; + int t_shmlba, h_shmlba, m_shmlba; + size_t t_len, h_len, m_len; /* shmat pointers are always untagged */ - /* find out the length of the shared memory segment */ + /* + * Because we can't use host shmat() unless the address is sufficiently + * aligned for the host, we'll need to check both. + * TODO: Could be fixed with softmmu. + */ + t_shmlba = target_shmlba(cpu_env); + h_pagesize = qemu_real_host_page_size(); + h_shmlba = (HOST_FORCE_SHMLBA ? SHMLBA : h_pagesize); + m_shmlba = MAX(t_shmlba, h_shmlba); + + if (shmaddr) { + if (shmaddr & (m_shmlba - 1)) { + if (shmflg & SHM_RND) { + /* + * The guest is allowing the kernel to round the address. + * Assume that the guest is ok with us rounding to the + * host required alignment too. Anyway if we don't, we'll + * get an error from the kernel. + */ + shmaddr &= ~(m_shmlba - 1); + if (shmaddr == 0 && (shmflg & SHM_REMAP)) { + return -TARGET_EINVAL; + } + } else { + int require = TARGET_PAGE_SIZE; +#ifdef TARGET_FORCE_SHMLBA + require = t_shmlba; +#endif + /* + * Include host required alignment, as otherwise we cannot + * use host shmat at all. + */ + require = MAX(require, h_shmlba); + if (shmaddr & (require - 1)) { + return -TARGET_EINVAL; + } + } + } + } else { + if (shmflg & SHM_REMAP) { + return -TARGET_EINVAL; + } + } + /* All rounding now manually concluded. */ + shmflg &= ~SHM_RND; + + /* Find out the length of the shared memory segment. */ ret = get_errno(shmctl(shmid, IPC_STAT, &shm_info)); if (is_error(ret)) { /* can't get length, bail out */ return ret; } + t_len = TARGET_PAGE_ALIGN(shm_info.shm_segsz); + h_len = ROUND_UP(shm_info.shm_segsz, h_pagesize); + m_len = MAX(t_len, h_len); - shmlba = target_shmlba(cpu_env); - - if (shmaddr & (shmlba - 1)) { - if (shmflg & SHM_RND) { - shmaddr &= ~(shmlba - 1); - } else { - return -TARGET_EINVAL; - } - } - if (!guest_range_valid_untagged(shmaddr, shm_info.shm_segsz)) { + if (!guest_range_valid_untagged(shmaddr, m_len)) { return -TARGET_EINVAL; } WITH_MMAP_LOCK_GUARD() { - void *host_raddr; + bool mapped = false; + void *want, *test; abi_ulong last; - if (shmaddr) { - host_raddr = shmat(shmid, (void *)g2h_untagged(shmaddr), shmflg); + if (!shmaddr) { + shmaddr = mmap_find_vma(0, m_len, m_shmlba); + if (shmaddr == -1) { + return -TARGET_ENOMEM; + } + mapped = !reserved_va; + } else if (shmflg & SHM_REMAP) { + /* + * If host page size > target page size, the host shmat may map + * more memory than the guest expects. Reject a mapping that + * would replace memory in the unexpected gap. + * TODO: Could be fixed with softmmu. + */ + if (t_len < h_len && + !page_check_range_empty(shmaddr + t_len, + shmaddr + h_len - 1)) { + return -TARGET_EINVAL; + } } else { - abi_ulong mmap_start; + if (!page_check_range_empty(shmaddr, shmaddr + m_len - 1)) { + return -TARGET_EINVAL; + } + } - /* In order to use the host shmat, we need to honor host SHMLBA. 
*/ - mmap_start = mmap_find_vma(0, shm_info.shm_segsz, - MAX(SHMLBA, shmlba)); + /* All placement is now complete. */ + want = (void *)g2h_untagged(shmaddr); - if (mmap_start == -1) { - return -TARGET_ENOMEM; + /* + * Map anonymous pages across the entire range, then remap with + * the shared memory. This is required for a number of corner + * cases for which host and guest page sizes differ. + */ + if (h_len != t_len) { + int mmap_p = PROT_READ | (shmflg & SHM_RDONLY ? 0 : PROT_WRITE); + int mmap_f = MAP_PRIVATE | MAP_ANONYMOUS + | (reserved_va || (shmflg & SHM_REMAP) + ? MAP_FIXED : MAP_FIXED_NOREPLACE); + + test = mmap(want, m_len, mmap_p, mmap_f, -1, 0); + if (unlikely(test != want)) { + /* shmat returns EINVAL not EEXIST like mmap. */ + ret = (test == MAP_FAILED && errno != EEXIST + ? get_errno(-1) : -TARGET_EINVAL); + if (mapped) { + do_munmap(want, m_len); + } + return ret; } - host_raddr = shmat(shmid, g2h_untagged(mmap_start), - shmflg | SHM_REMAP); + mapped = true; } - if (host_raddr == (void *)-1) { - return get_errno(-1); + if (reserved_va || mapped) { + shmflg |= SHM_REMAP; + } + test = shmat(shmid, want, shmflg); + if (test == MAP_FAILED) { + ret = get_errno(-1); + if (mapped) { + do_munmap(want, m_len); + } + return ret; } - raddr = h2g(host_raddr); - last = raddr + shm_info.shm_segsz - 1; + assert(test == want); - page_set_flags(raddr, last, + last = shmaddr + m_len - 1; + page_set_flags(shmaddr, last, PAGE_VALID | PAGE_RESET | PAGE_READ | - (shmflg & SHM_RDONLY ? 0 : PAGE_WRITE)); + (shmflg & SHM_RDONLY ? 0 : PAGE_WRITE) | + (shmflg & SHM_EXEC ? PAGE_EXEC : 0)); - shm_region_rm_complete(raddr, last); - shm_region_add(raddr, last); + shm_region_rm_complete(shmaddr, last); + shm_region_add(shmaddr, last); } /* @@ -1131,7 +1404,15 @@ abi_ulong target_shmat(CPUArchState *cpu_env, int shmid, tb_flush(cpu); } - return raddr; + if (qemu_loglevel_mask(CPU_LOG_PAGE)) { + FILE *f = qemu_log_trylock(); + if (f) { + fprintf(f, "page layout changed following shmat\n"); + page_dump(f); + qemu_log_unlock(f); + } + } + return shmaddr; } abi_long target_shmdt(abi_ulong shmaddr) diff --git a/linux-user/strace.c b/linux-user/strace.c index cf26e55264..8d13e55a5b 100644 --- a/linux-user/strace.c +++ b/linux-user/strace.c @@ -670,6 +670,26 @@ print_semctl(CPUArchState *cpu_env, const struct syscallname *name, } #endif +static void +print_shmat(CPUArchState *cpu_env, const struct syscallname *name, + abi_long arg0, abi_long arg1, abi_long arg2, + abi_long arg3, abi_long arg4, abi_long arg5) +{ + static const struct flags shmat_flags[] = { + FLAG_GENERIC(SHM_RND), + FLAG_GENERIC(SHM_REMAP), + FLAG_GENERIC(SHM_RDONLY), + FLAG_GENERIC(SHM_EXEC), + FLAG_END + }; + + print_syscall_prologue(name); + print_raw_param(TARGET_ABI_FMT_ld, arg0, 0); + print_pointer(arg1, 0); + print_flags(shmat_flags, arg2, 1); + print_syscall_epilogue(name); +} + #ifdef TARGET_NR_ipc static void print_ipc(CPUArchState *cpu_env, const struct syscallname *name, @@ -683,6 +703,10 @@ print_ipc(CPUArchState *cpu_env, const struct syscallname *name, print_ipc_cmd(arg3); qemu_log(",0x" TARGET_ABI_FMT_lx ")", arg4); break; + case IPCOP_shmat: + print_shmat(cpu_env, &(const struct syscallname){ .name = "shmat" }, + arg1, arg4, arg2, 0, 0, 0); + break; default: qemu_log(("%s(" TARGET_ABI_FMT_ld "," diff --git a/linux-user/strace.list b/linux-user/strace.list index 6655d4f26d..dfd4237d14 100644 --- a/linux-user/strace.list +++ b/linux-user/strace.list @@ -1398,7 +1398,7 @@ { TARGET_NR_sgetmask, "sgetmask" , NULL, NULL, NULL }, 
#endif #ifdef TARGET_NR_shmat -{ TARGET_NR_shmat, "shmat" , NULL, NULL, print_syscall_ret_addr }, +{ TARGET_NR_shmat, "shmat" , NULL, print_shmat, print_syscall_ret_addr }, #endif #ifdef TARGET_NR_shmctl { TARGET_NR_shmctl, "shmctl" , NULL, NULL, NULL }, diff --git a/linux-user/syscall.c b/linux-user/syscall.c index e384e14248..bc8c06522f 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -7994,6 +7994,10 @@ static void open_self_maps_4(const struct open_self_maps_data *d, path = "[heap]"; } else if (start == info->vdso) { path = "[vdso]"; +#ifdef TARGET_X86_64 + } else if (start == TARGET_VSYSCALL_PAGE) { + path = "[vsyscall]"; +#endif } /* Except null device (MAP_ANON), adjust offset for this fragment. */ @@ -8082,6 +8086,18 @@ static int open_self_maps_2(void *opaque, target_ulong guest_start, uintptr_t host_start = (uintptr_t)g2h_untagged(guest_start); uintptr_t host_last = (uintptr_t)g2h_untagged(guest_end - 1); +#ifdef TARGET_X86_64 + /* + * Because of the extremely high position of the page within the guest + * virtual address space, this is not backed by host memory at all. + * Therefore the loop below would fail. This is the only instance + * of not having host backing memory. + */ + if (guest_start == TARGET_VSYSCALL_PAGE) { + return open_self_maps_3(opaque, guest_start, guest_end, flags); + } +#endif + while (1) { IntervalTreeNode *n = interval_tree_iter_first(d->host_maps, host_start, host_start); diff --git a/meson.build b/meson.build index 0ef1654e86..c59ca496f2 100644 --- a/meson.build +++ b/meson.build @@ -555,17 +555,24 @@ endif # Check further flags that make QEMU more robust against malicious parties hardening_flags = [ - # Zero out registers used during a function call - # upon its return. This makes it harder to assemble - # ROP gadgets into something usable - '-fzero-call-used-regs=used-gpr', - # Initialize all stack variables to zero. This makes # it harder to take advantage of uninitialized stack # data to drive exploits '-ftrivial-auto-var-init=zero', ] +# Zero out registers used during a function call +# upon its return. 
This makes it harder to assemble +# ROP gadgets into something usable +# +# NB: Clang 17 is broken and SEGVs +# https://github.com/llvm/llvm-project/issues/75168 +if cc.compiles('extern struct { void (*cb)(void); } s; void f(void) { s.cb(); }', + name: '-fzero-call-used-regs=used-gpr', + args: ['-O2', '-fzero-call-used-regs=used-gpr']) + hardening_flags += '-fzero-call-used-regs=used-gpr' +endif + qemu_common_flags += cc.get_supported_arguments(hardening_flags) add_global_arguments(qemu_common_flags, native: false, language: all_languages) diff --git a/migration/fd.c b/migration/fd.c index 0eb677dcae..d4ae72d132 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -15,18 +15,41 @@ */ #include "qemu/osdep.h" +#include "qapi/error.h" #include "channel.h" #include "fd.h" #include "migration.h" #include "monitor/monitor.h" +#include "io/channel-file.h" #include "io/channel-util.h" +#include "options.h" #include "trace.h" +static struct FdOutgoingArgs { + int fd; +} outgoing_args; + +int fd_args_get_fd(void) +{ + return outgoing_args.fd; +} + +void fd_cleanup_outgoing_migration(void) +{ + if (outgoing_args.fd > 0) { + close(outgoing_args.fd); + outgoing_args.fd = -1; + } +} + void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp) { QIOChannel *ioc; int fd = monitor_get_fd(monitor_cur(), fdname, errp); + + outgoing_args.fd = -1; + if (fd == -1) { return; } @@ -38,6 +61,8 @@ void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error ** return; } + outgoing_args.fd = fd; + qio_channel_set_name(ioc, "migration-fd-outgoing"); migration_channel_connect(s, ioc, NULL, NULL); object_unref(OBJECT(ioc)); @@ -73,4 +98,23 @@ void fd_start_incoming_migration(const char *fdname, Error **errp) fd_accept_incoming_migration, NULL, NULL, g_main_context_get_thread_default()); + + if (migrate_multifd()) { + int channels = migrate_multifd_channels(); + + while (channels--) { + ioc = QIO_CHANNEL(qio_channel_file_new_fd(dup(fd))); + + if (QIO_CHANNEL_FILE(ioc)->fd == -1) { + error_setg(errp, "Failed to duplicate fd %d", fd); + return; + } + + qio_channel_set_name(ioc, "migration-fd-incoming"); + qio_channel_add_watch_full(ioc, G_IO_IN, + fd_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); + } + } } diff --git a/migration/fd.h b/migration/fd.h index b901bc014e..0c0a18d9e7 100644 --- a/migration/fd.h +++ b/migration/fd.h @@ -20,4 +20,6 @@ void fd_start_incoming_migration(const char *fdname, Error **errp); void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp); +void fd_cleanup_outgoing_migration(void); +int fd_args_get_fd(void); #endif diff --git a/migration/file.c b/migration/file.c index 5d4975f43e..164b079966 100644 --- a/migration/file.c +++ b/migration/file.c @@ -6,17 +6,25 @@ */ #include "qemu/osdep.h" +#include "exec/ramblock.h" #include "qemu/cutils.h" +#include "qemu/error-report.h" #include "qapi/error.h" #include "channel.h" +#include "fd.h" #include "file.h" #include "migration.h" #include "io/channel-file.h" #include "io/channel-util.h" +#include "options.h" #include "trace.h" #define OFFSET_OPTION ",offset=" +static struct FileOutgoingArgs { + char *fname; +} outgoing_args; + /* Remove the offset option from @filespec and return it in @offsetp. 
*/ int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp) @@ -36,6 +44,41 @@ int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp) return 0; } +void file_cleanup_outgoing_migration(void) +{ + g_free(outgoing_args.fname); + outgoing_args.fname = NULL; +} + +bool file_send_channel_create(gpointer opaque, Error **errp) +{ + QIOChannelFile *ioc; + int flags = O_WRONLY; + bool ret = false; + int fd = fd_args_get_fd(); + + if (fd && fd != -1) { + ioc = qio_channel_file_new_fd(dup(fd)); + } else { + ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp); + if (!ioc) { + goto out; + } + } + + multifd_channel_connect(opaque, QIO_CHANNEL(ioc)); + ret = true; + +out: + /* + * File channel creation is synchronous. However posting this + * semaphore here is simpler than adding a special case. + */ + multifd_send_channel_created(); + + return ret; +} + void file_start_outgoing_migration(MigrationState *s, FileMigrationArgs *file_args, Error **errp) { @@ -52,6 +95,8 @@ void file_start_outgoing_migration(MigrationState *s, return; } + outgoing_args.fname = g_strdup(filename); + ioc = QIO_CHANNEL(fioc); if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) { return; @@ -74,7 +119,8 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) g_autofree char *filename = g_strdup(file_args->filename); QIOChannelFile *fioc = NULL; uint64_t offset = file_args->offset; - QIOChannel *ioc; + int channels = 1; + int i = 0; trace_migration_file_incoming(filename); @@ -83,13 +129,100 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) return; } - ioc = QIO_CHANNEL(fioc); - if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) { + if (offset && + qio_channel_io_seek(QIO_CHANNEL(fioc), offset, SEEK_SET, errp) < 0) { return; } - qio_channel_set_name(QIO_CHANNEL(ioc), "migration-file-incoming"); - qio_channel_add_watch_full(ioc, G_IO_IN, - file_accept_incoming_migration, - NULL, NULL, - g_main_context_get_thread_default()); + + if (migrate_multifd()) { + channels += migrate_multifd_channels(); + } + + do { + QIOChannel *ioc = QIO_CHANNEL(fioc); + + qio_channel_set_name(ioc, "migration-file-incoming"); + qio_channel_add_watch_full(ioc, G_IO_IN, + file_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); + + fioc = qio_channel_file_new_fd(dup(fioc->fd)); + + if (!fioc || fioc->fd == -1) { + error_setg(errp, "Error creating migration incoming channel"); + break; + } + } while (++i < channels); +} + +int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, + int niov, RAMBlock *block, Error **errp) +{ + ssize_t ret = -1; + int i, slice_idx, slice_num; + uintptr_t base, next, offset; + size_t len; + + slice_idx = 0; + slice_num = 1; + + /* + * If the iov array doesn't have contiguous elements, we need to + * split it in slices because we only have one file offset for the + * whole iov. Do this here so callers don't need to break the iov + * array themselves. + */ + for (i = 0; i < niov; i++, slice_num++) { + base = (uintptr_t) iov[i].iov_base; + + if (i != niov - 1) { + len = iov[i].iov_len; + next = (uintptr_t) iov[i + 1].iov_base; + + if (base + len == next) { + continue; + } + } + + /* + * Use the offset of the first element of the segment that + * we're sending. 
+ */ + offset = (uintptr_t) iov[slice_idx].iov_base - (uintptr_t) block->host; + if (offset >= block->used_length) { + error_setg(errp, "offset " RAM_ADDR_FMT + "outside of ramblock %s range", offset, block->idstr); + ret = -1; + break; + } + + ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num, + block->pages_offset + offset, errp); + if (ret < 0) { + break; + } + + slice_idx += slice_num; + slice_num = 0; + } + + return (ret < 0) ? ret : 0; +} + +int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp) +{ + MultiFDRecvData *data = p->data; + size_t ret; + + ret = qio_channel_pread(p->c, (char *) data->opaque, + data->size, data->file_offset, errp); + if (ret != data->size) { + error_prepend(errp, + "multifd recv (%u): read 0x%zx, expected 0x%zx", + p->id, ret, data->size); + return -1; + } + + return 0; } diff --git a/migration/file.h b/migration/file.h index 37d6a08bfc..9f71e87f74 100644 --- a/migration/file.h +++ b/migration/file.h @@ -9,10 +9,18 @@ #define QEMU_MIGRATION_FILE_H #include "qapi/qapi-types-migration.h" +#include "io/task.h" +#include "channel.h" +#include "multifd.h" void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp); void file_start_outgoing_migration(MigrationState *s, FileMigrationArgs *file_args, Error **errp); int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp); +void file_cleanup_outgoing_migration(void); +bool file_send_channel_create(gpointer opaque, Error **errp); +int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, + int niov, RAMBlock *block, Error **errp); +int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp); #endif diff --git a/migration/migration.c b/migration/migration.c index bab68bcbef..a49fcd53ee 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -140,9 +140,38 @@ static bool transport_supports_multi_channels(MigrationAddress *addr) if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { SocketAddress *saddr = &addr->u.socket; - return saddr->type == SOCKET_ADDRESS_TYPE_INET || - saddr->type == SOCKET_ADDRESS_TYPE_UNIX || - saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + if (saddr->type == SOCKET_ADDRESS_TYPE_FD) { + return migrate_mapped_ram(); + } + + return (saddr->type == SOCKET_ADDRESS_TYPE_INET || + saddr->type == SOCKET_ADDRESS_TYPE_UNIX || + saddr->type == SOCKET_ADDRESS_TYPE_VSOCK); + } else if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { + return migrate_mapped_ram(); + } else { + return false; + } +} + +static bool migration_needs_seekable_channel(void) +{ + return migrate_mapped_ram(); +} + +static bool transport_supports_seeking(MigrationAddress *addr) +{ + if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { + return true; + } + + /* + * At this point, the user might not yet have passed the file + * descriptor to QEMU, so we cannot know for sure whether it + * refers to a plain file or a socket. Let it through anyway. + */ + if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { + return addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD; } return false; @@ -152,6 +181,12 @@ static bool migration_channels_and_transport_compatible(MigrationAddress *addr, Error **errp) { + if (migration_needs_seekable_channel() && + !transport_supports_seeking(addr)) { + error_setg(errp, "Migration requires seekable transport (e.g. file)"); + return false; + } + if (migration_needs_multiple_sockets() && !transport_supports_multi_channels(addr)) { error_setg(errp, "Migration requires multi-channel URIs (e.g. 
tcp)"); @@ -881,7 +916,8 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) uint32_t channel_magic = 0; int ret = 0; - if (migrate_multifd() && !migrate_postcopy_ram() && + if (migrate_multifd() && !migrate_mapped_ram() && + !migrate_postcopy_ram() && qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { /* * With multiple channels, it is possible that we receive channels @@ -1950,6 +1986,18 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, return false; } + if (migrate_mapped_ram()) { + if (migrate_tls()) { + error_setg(errp, "Cannot use TLS with mapped-ram"); + return false; + } + + if (migrate_multifd_compression()) { + error_setg(errp, "Cannot use compression with mapped-ram"); + return false; + } + } + if (migrate_mode_is_cpr(s)) { const char *conflict = NULL; diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 012e3bdea1..6120faad65 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -69,7 +69,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) err_msg = "out of memory for buf"; goto err_free_zbuff; } - p->data = z; + p->compress_data = z; return 0; err_free_zbuff: @@ -92,15 +92,15 @@ err_free_z: */ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; deflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; g_free(z->buf); z->buf = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -117,7 +117,7 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t out_size = 0; int ret; @@ -194,7 +194,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) struct zlib_data *z = g_new0(struct zlib_data, 1); z_stream *zs = &z->zs; - p->data = z; + p->compress_data = z; zs->zalloc = Z_NULL; zs->zfree = Z_NULL; zs->opaque = Z_NULL; @@ -224,17 +224,17 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zlib_recv_cleanup(MultiFDRecvParams *p) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; inflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** - * zlib_recv_pages: read the data from the channel into actual pages + * zlib_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. 
@@ -244,9 +244,9 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) +static int zlib_recv(MultiFDRecvParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t in_size = p->next_packet_size; /* we measure the change of total_out */ @@ -319,7 +319,7 @@ static MultiFDMethods multifd_zlib_ops = { .send_prepare = zlib_send_prepare, .recv_setup = zlib_recv_setup, .recv_cleanup = zlib_recv_cleanup, - .recv_pages = zlib_recv_pages + .recv = zlib_recv }; static void multifd_zlib_register(void) diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index dc8fe43e94..cac236833d 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -52,7 +52,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int res; - p->data = z; + p->compress_data = z; z->zcs = ZSTD_createCStream(); if (!z->zcs) { g_free(z); @@ -90,14 +90,14 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) */ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeCStream(z->zcs); z->zcs = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -114,7 +114,7 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; int ret; uint32_t i; @@ -183,7 +183,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int ret; - p->data = z; + p->compress_data = z; z->zds = ZSTD_createDStream(); if (!z->zds) { g_free(z); @@ -221,18 +221,18 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zstd_recv_cleanup(MultiFDRecvParams *p) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeDStream(z->zds); z->zds = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** - * zstd_recv_pages: read the data from the channel into actual pages + * zstd_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. 
@@ -242,13 +242,13 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) +static int zstd_recv(MultiFDRecvParams *p, Error **errp) { uint32_t in_size = p->next_packet_size; uint32_t out_size = 0; uint32_t expected_size = p->normal_num * p->page_size; uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; int ret; int i; @@ -310,7 +310,7 @@ static MultiFDMethods multifd_zstd_ops = { .send_prepare = zstd_send_prepare, .recv_setup = zstd_recv_setup, .recv_cleanup = zstd_recv_cleanup, - .recv_pages = zstd_recv_pages + .recv = zstd_recv }; static void multifd_zstd_register(void) diff --git a/migration/multifd.c b/migration/multifd.c index 6c07f19af1..d4a44da559 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -17,7 +17,8 @@ #include "exec/ramblock.h" #include "qemu/error-report.h" #include "qapi/error.h" -#include "ram.h" +#include "fd.h" +#include "file.h" #include "migration.h" #include "migration-stats.h" #include "socket.h" @@ -28,6 +29,7 @@ #include "threadinfo.h" #include "options.h" #include "qemu/yank.h" +#include "io/channel-file.h" #include "io/channel-socket.h" #include "yank_functions.h" @@ -81,9 +83,13 @@ struct { struct { MultiFDRecvParams *params; + MultiFDRecvData *data; /* number of created threads */ int count; - /* syncs main thread and channels */ + /* + * This is always posted by the recv threads, the migration thread + * uses it to wait for recv threads to finish assigned tasks. + */ QemuSemaphore sem_sync; /* global number of generated multifd packets */ uint64_t packet_num; @@ -92,6 +98,27 @@ struct { MultiFDMethods *ops; } *multifd_recv_state; +static bool multifd_use_packets(void) +{ + return !migrate_mapped_ram(); +} + +void multifd_send_channel_created(void) +{ + qemu_sem_post(&multifd_send_state->channels_created); +} + +static void multifd_set_file_bitmap(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + + assert(pages->block); + + for (int i = 0; i < p->pages->num; i++) { + ramblock_set_file_bmap_atomic(pages->block, pages->offset[i]); + } +} + /* Multifd without compression */ /** @@ -122,6 +149,19 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) return; } +static void multifd_send_prepare_iovs(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + + for (int i = 0; i < pages->num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; + p->iov[p->iovs_num].iov_len = p->page_size; + p->iovs_num++; + } + + p->next_packet_size = pages->num * p->page_size; +} + /** * nocomp_send_prepare: prepare date to be able to send * @@ -136,9 +176,15 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { bool use_zero_copy_send = migrate_zero_copy_send(); - MultiFDPages_t *pages = p->pages; int ret; + if (!multifd_use_packets()) { + multifd_send_prepare_iovs(p); + multifd_set_file_bitmap(p); + + return 0; + } + if (!use_zero_copy_send) { /* * Only !zerocopy needs the header in IOV; zerocopy will @@ -147,13 +193,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) multifd_send_prepare_header(p); } - for (int i = 0; i < pages->num; i++) { - p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; - p->iov[p->iovs_num].iov_len = p->page_size; - p->iovs_num++; - } - - 
p->next_packet_size = pages->num * p->page_size; + multifd_send_prepare_iovs(p); p->flags |= MULTIFD_FLAG_NOCOMP; multifd_send_fill_packet(p); @@ -197,7 +237,7 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) } /** - * nocomp_recv_pages: read the data from the channel into actual pages + * nocomp_recv: read the data from the channel * * For no compression we just need to read things into the correct place. * @@ -206,9 +246,15 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) +static int nocomp_recv(MultiFDRecvParams *p, Error **errp) { - uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t flags; + + if (!multifd_use_packets()) { + return multifd_file_recv_data(p, errp); + } + + flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; if (flags != MULTIFD_FLAG_NOCOMP) { error_setg(errp, "multifd %u: flags received %x flags expected %x", @@ -228,7 +274,7 @@ static MultiFDMethods multifd_nocomp_ops = { .send_prepare = nocomp_send_prepare, .recv_setup = nocomp_recv_setup, .recv_cleanup = nocomp_recv_cleanup, - .recv_pages = nocomp_recv_pages + .recv = nocomp_recv }; static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = { @@ -663,6 +709,19 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) { if (p->c) { migration_ioc_unregister_yank(p->c); + /* + * An explicit close() on the channel here is normally not + * required, but can be helpful for "file:" iochannels, where it + * will include fdatasync() to make sure the data is flushed to the + * disk backend. + * + * The object_unref() cannot guarantee that because: (1) finalize() + * of the iochannel is only triggered on the last reference, and + * it's not guaranteed that we always hold the last refcount when + * reaching here, and, (2) even if finalize() is invoked, it only + * does a close(fd) without data flush. + */ + qio_channel_close(p->c, &error_abort); object_unref(OBJECT(p->c)); p->c = NULL; } @@ -684,6 +743,8 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) static void multifd_send_cleanup_state(void) { + file_cleanup_outgoing_migration(); + fd_cleanup_outgoing_migration(); socket_cleanup_outgoing_migration(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); @@ -795,15 +856,18 @@ static void *multifd_send_thread(void *opaque) MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; + bool use_packets = multifd_use_packets(); thread = migration_threads_add(p->name, qemu_get_thread_id()); trace_multifd_send_thread_start(p->id); rcu_register_thread(); - if (multifd_send_initial_packet(p, &local_err) < 0) { - ret = -1; - goto out; + if (use_packets) { + if (multifd_send_initial_packet(p, &local_err) < 0) { + ret = -1; + goto out; + } } while (true) { @@ -829,8 +893,15 @@ static void *multifd_send_thread(void *opaque) break; } - ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, - 0, p->write_flags, &local_err); + if (migrate_mapped_ram()) { + ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num, + p->pages->block, &local_err); + } else { + ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, + NULL, 0, p->write_flags, + &local_err); + } + if (ret != 0) { break; } @@ -854,16 +925,20 @@ static void *multifd_send_thread(void *opaque) * it doesn't require explicit memory barriers. 
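The write branch a few lines above is where the send path diverges: with mapped-ram each channel hands its iovs to file_write_ramblock_iov() so the pages land at fixed offsets in the file, instead of being streamed with qio_channel_writev_full_all(). The standalone sketch below shows the underlying idea with plain POSIX positioned writes; it is not the actual file_write_ramblock_iov() implementation, and the host_base/pages_offset mapping is an assumption made only for illustration.

#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

/*
 * Illustrative only: write each buffer of an iovec array at a fixed
 * position in the output file, so multiple channels can write
 * concurrently without sharing a stream position.  Assumes every
 * iov_base points into one contiguous host mapping that starts at
 * host_base and corresponds to file offset pages_offset.
 */
static int write_iovs_at_fixed_offsets(int fd, const struct iovec *iov,
                                       int iovcnt, const uint8_t *host_base,
                                       off_t pages_offset)
{
    for (int i = 0; i < iovcnt; i++) {
        const uint8_t *p = iov[i].iov_base;
        off_t off = pages_offset + (p - host_base);
        size_t len = iov[i].iov_len;

        while (len > 0) {
            ssize_t n = pwrite(fd, p, len, off);

            if (n < 0) {
                if (errno == EINTR) {
                    continue;
                }
                return -errno;
            }
            p += n;
            off += n;
            len -= n;
        }
    }
    return 0;
}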
*/ assert(qatomic_read(&p->pending_sync)); - p->flags = MULTIFD_FLAG_SYNC; - multifd_send_fill_packet(p); - ret = qio_channel_write_all(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret != 0) { - break; + + if (use_packets) { + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { + break; + } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + p->flags = 0; } - /* p->next_packet_size will always be zero for a SYNC packet */ - stat64_add(&mig_stats.multifd_bytes, p->packet_len); - p->flags = 0; + qatomic_set(&p->pending_sync, false); qemu_sem_post(&p->sem_sync); } @@ -939,7 +1014,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, return true; } -static void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc) +void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc) { qio_channel_set_delay(ioc, false); @@ -990,7 +1065,7 @@ out: * Here we're not interested whether creation succeeded, only that * it happened at all. */ - qemu_sem_post(&multifd_send_state->channels_created); + multifd_send_channel_created(); if (ret) { return; @@ -1007,9 +1082,14 @@ out: error_free(local_err); } -static void multifd_new_send_channel_create(gpointer opaque) +static bool multifd_new_send_channel_create(gpointer opaque, Error **errp) { + if (!multifd_use_packets()) { + return file_send_channel_create(opaque, errp); + } + socket_send_channel_create(multifd_new_send_channel_async, opaque); + return true; } bool multifd_send_setup(void) @@ -1018,6 +1098,7 @@ bool multifd_send_setup(void) Error *local_err = NULL; int thread_count, ret = 0; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + bool use_packets = multifd_use_packets(); uint8_t i; if (!migrate_multifd()) { @@ -1040,18 +1121,27 @@ bool multifd_send_setup(void) qemu_sem_init(&p->sem_sync, 0); p->id = i; p->pages = multifd_pages_init(page_count); - p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(uint64_t) * page_count; - p->packet = g_malloc0(p->packet_len); - p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); - p->packet->version = cpu_to_be32(MULTIFD_VERSION); + + if (use_packets) { + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); + p->packet->version = cpu_to_be32(MULTIFD_VERSION); + + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, page_count + 1); + } else { + p->iov = g_new0(struct iovec, page_count); + } p->name = g_strdup_printf("multifdsend_%d", i); - /* We need one extra place for the packet header */ - p->iov = g_new0(struct iovec, page_count + 1); p->page_size = qemu_target_page_size(); p->page_count = page_count; p->write_flags = 0; - multifd_new_send_channel_create(p); + + if (!multifd_new_send_channel_create(p, &local_err)) { + return false; + } } /* @@ -1083,6 +1173,57 @@ bool multifd_send_setup(void) return true; } +bool multifd_recv(void) +{ + int i; + static int next_recv_channel; + MultiFDRecvParams *p = NULL; + MultiFDRecvData *data = multifd_recv_state->data; + + /* + * next_channel can remain from a previous migration that was + * using more channels, so ensure it doesn't overflow if the + * limit is lower now. 
+ */ + next_recv_channel %= migrate_multifd_channels(); + for (i = next_recv_channel;; i = (i + 1) % migrate_multifd_channels()) { + if (multifd_recv_should_exit()) { + return false; + } + + p = &multifd_recv_state->params[i]; + + if (qatomic_read(&p->pending_job) == false) { + next_recv_channel = (i + 1) % migrate_multifd_channels(); + break; + } + } + + /* + * Order pending_job read before manipulating p->data below. Pairs + * with qatomic_store_release() at multifd_recv_thread(). + */ + smp_mb_acquire(); + + assert(!p->data->size); + multifd_recv_state->data = p->data; + p->data = data; + + /* + * Order p->data update before setting pending_job. Pairs with + * qatomic_load_acquire() at multifd_recv_thread(). + */ + qatomic_store_release(&p->pending_job, true); + qemu_sem_post(&p->sem); + + return true; +} + +MultiFDRecvData *multifd_get_recv_data(void) +{ + return multifd_recv_state->data; +} + static void multifd_recv_terminate_threads(Error *err) { int i; @@ -1107,10 +1248,27 @@ static void multifd_recv_terminate_threads(Error *err) MultiFDRecvParams *p = &multifd_recv_state->params[i]; /* - * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, - * however try to wakeup it without harm in cleanup phase. + * The migration thread and channels interact differently + * depending on the presence of packets. */ - qemu_sem_post(&p->sem_sync); + if (multifd_use_packets()) { + /* + * The channel receives as long as there are packets. When + * packets end (i.e. MULTIFD_FLAG_SYNC is reached), the + * channel waits for the migration thread to sync. If the + * sync never happens, do it here. + */ + qemu_sem_post(&p->sem_sync); + } else { + /* + * The channel waits for the migration thread to give it + * work. When the migration thread runs out of work, it + * releases the channel and waits for any pending work to + * finish. If we reach here (e.g. due to error) before the + * work runs out, release the channel. + */ + qemu_sem_post(&p->sem); + } /* * We could arrive here for two reasons: @@ -1138,6 +1296,7 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) p->c = NULL; qemu_mutex_destroy(&p->mutex); qemu_sem_destroy(&p->sem_sync); + qemu_sem_destroy(&p->sem); g_free(p->name); p->name = NULL; p->packet_len = 0; @@ -1155,6 +1314,8 @@ static void multifd_recv_cleanup_state(void) qemu_sem_destroy(&multifd_recv_state->sem_sync); g_free(multifd_recv_state->params); multifd_recv_state->params = NULL; + g_free(multifd_recv_state->data); + multifd_recv_state->data = NULL; g_free(multifd_recv_state); multifd_recv_state = NULL; } @@ -1182,18 +1343,53 @@ void multifd_recv_cleanup(void) void multifd_recv_sync_main(void) { + int thread_count = migrate_multifd_channels(); + bool file_based = !multifd_use_packets(); int i; if (!migrate_multifd()) { return; } - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - trace_multifd_recv_sync_main_wait(p->id); + /* + * File-based channels don't use packets and therefore need to + * wait for more work. Release them to start the sync. + */ + if (file_based) { + for (i = 0; i < thread_count; i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + + trace_multifd_recv_sync_main_signal(p->id); + qemu_sem_post(&p->sem); + } + } + + /* + * Initiate the synchronization by waiting for all channels. + * + * For socket-based migration this means each channel has received + * the SYNC packet on the stream. 
+ * + * For file-based migration this means each channel is done with + * the work (pending_job=false). + */ + for (i = 0; i < thread_count; i++) { + trace_multifd_recv_sync_main_wait(i); qemu_sem_wait(&multifd_recv_state->sem_sync); } - for (i = 0; i < migrate_multifd_channels(); i++) { + + if (file_based) { + /* + * For file-based loading is done in one iteration. We're + * done. + */ + return; + } + + /* + * Sync done. Release the channels for the next iteration. + */ + for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; WITH_QEMU_LOCK_GUARD(&p->mutex) { @@ -1211,46 +1407,87 @@ static void *multifd_recv_thread(void *opaque) { MultiFDRecvParams *p = opaque; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int ret; trace_multifd_recv_thread_start(p->id); rcu_register_thread(); while (true) { - uint32_t flags; + uint32_t flags = 0; + bool has_data = false; + p->normal_num = 0; - if (multifd_recv_should_exit()) { - break; - } + if (use_packets) { + if (multifd_recv_should_exit()) { + break; + } - ret = qio_channel_read_all_eof(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ - break; - } + ret = qio_channel_read_all_eof(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ + break; + } - qemu_mutex_lock(&p->mutex); - ret = multifd_recv_unfill_packet(p, &local_err); - if (ret) { + qemu_mutex_lock(&p->mutex); + ret = multifd_recv_unfill_packet(p, &local_err); + if (ret) { + qemu_mutex_unlock(&p->mutex); + break; + } + + flags = p->flags; + /* recv methods don't know how to handle the SYNC flag */ + p->flags &= ~MULTIFD_FLAG_SYNC; + has_data = !!p->normal_num; qemu_mutex_unlock(&p->mutex); - break; - } + } else { + /* + * No packets, so we need to wait for the vmstate code to + * give us work. + */ + qemu_sem_wait(&p->sem); + + if (multifd_recv_should_exit()) { + break; + } + + /* pairs with qatomic_store_release() at multifd_recv() */ + if (!qatomic_load_acquire(&p->pending_job)) { + /* + * Migration thread did not send work, this is + * equivalent to pending_sync on the sending + * side. Post sem_sync to notify we reached this + * point. + */ + qemu_sem_post(&multifd_recv_state->sem_sync); + continue; + } - flags = p->flags; - /* recv methods don't know how to handle the SYNC flag */ - p->flags &= ~MULTIFD_FLAG_SYNC; - qemu_mutex_unlock(&p->mutex); + has_data = !!p->data->size; + } - if (p->normal_num) { - ret = multifd_recv_state->ops->recv_pages(p, &local_err); + if (has_data) { + ret = multifd_recv_state->ops->recv(p, &local_err); if (ret != 0) { break; } } - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&multifd_recv_state->sem_sync); - qemu_sem_wait(&p->sem_sync); + if (use_packets) { + if (flags & MULTIFD_FLAG_SYNC) { + qemu_sem_post(&multifd_recv_state->sem_sync); + qemu_sem_wait(&p->sem_sync); + } + } else { + p->total_normal_pages += p->data->size / qemu_target_page_size(); + p->data->size = 0; + /* + * Order data->size update before clearing + * pending_job. Pairs with smp_mb_acquire() at + * multifd_recv(). 
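The no-packet branch above is a small producer/consumer handshake: multifd_recv() publishes one MultiFDRecvData, sets pending_job with release semantics and posts p->sem; the channel thread loads pending_job with acquire semantics, does the read, clears data->size, releases pending_job again, and posts sem_sync only when it is woken without work. The following is a minimal standalone model of that handshake using C11 atomics and POSIX semaphores in place of QEMU's qatomic and QemuSemaphore wrappers; the struct and function names are simplifications, not the QEMU code (in particular the real code swaps data pointers rather than passing one in).

#include <semaphore.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct {
    void *opaque;        /* destination host pointer */
    size_t size;         /* bytes to read; 0 means "no work queued" */
    long file_offset;    /* position of the data in the migration file */
} RecvData;

typedef struct {
    sem_t sem;           /* posted by the producer when work is queued */
    sem_t sem_sync;      /* posted by the consumer when it finds no work */
    atomic_bool pending_job;
    RecvData *data;
} RecvChannel;

/* Producer (migration thread): hand one unit of work to a channel. */
static void queue_work(RecvChannel *c, RecvData *d)
{
    c->data = d;
    /* make the data visible before pending_job can be observed as true */
    atomic_store_explicit(&c->pending_job, true, memory_order_release);
    sem_post(&c->sem);
}

/* Consumer (channel thread): one iteration of the work loop. */
static void channel_iteration(RecvChannel *c, void (*read_pages)(RecvData *))
{
    sem_wait(&c->sem);

    if (!atomic_load_explicit(&c->pending_job, memory_order_acquire)) {
        /* woken without work: this is the synchronization point */
        sem_post(&c->sem_sync);
        return;
    }

    read_pages(c->data);
    c->data->size = 0;
    /* order the size update before releasing the slot to the producer */
    atomic_store_explicit(&c->pending_job, false, memory_order_release);
}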
+ */ + qatomic_store_release(&p->pending_job, false); } } @@ -1269,6 +1506,7 @@ int multifd_recv_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + bool use_packets = multifd_use_packets(); uint8_t i; /* @@ -1282,6 +1520,10 @@ int multifd_recv_setup(Error **errp) thread_count = migrate_multifd_channels(); multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state)); multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count); + + multifd_recv_state->data = g_new0(MultiFDRecvData, 1); + multifd_recv_state->data->size = 0; + qatomic_set(&multifd_recv_state->count, 0); qatomic_set(&multifd_recv_state->exiting, 0); qemu_sem_init(&multifd_recv_state->sem_sync, 0); @@ -1292,10 +1534,18 @@ int multifd_recv_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem_sync, 0); + qemu_sem_init(&p->sem, 0); + p->pending_job = false; p->id = i; - p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(uint64_t) * page_count; - p->packet = g_malloc0(p->packet_len); + + p->data = g_new0(MultiFDRecvData, 1); + p->data->size = 0; + + if (use_packets) { + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + } p->name = g_strdup_printf("multifdrecv_%d", i); p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); @@ -1339,18 +1589,23 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) { MultiFDRecvParams *p; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int id; - id = multifd_recv_initial_packet(ioc, &local_err); - if (id < 0) { - multifd_recv_terminate_threads(local_err); - error_propagate_prepend(errp, local_err, - "failed to receive packet" - " via multifd channel %d: ", - qatomic_read(&multifd_recv_state->count)); - return; + if (use_packets) { + id = multifd_recv_initial_packet(ioc, &local_err); + if (id < 0) { + multifd_recv_terminate_threads(local_err); + error_propagate_prepend(errp, local_err, + "failed to receive packet" + " via multifd channel %d: ", + qatomic_read(&multifd_recv_state->count)); + return; + } + trace_multifd_recv_new_channel(id); + } else { + id = qatomic_read(&multifd_recv_state->count); } - trace_multifd_recv_new_channel(id); p = &multifd_recv_state->params[id]; if (p->c != NULL) { diff --git a/migration/multifd.h b/migration/multifd.h index b3fe27ae93..7447c2bea3 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,8 +13,13 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H +#include "ram.h" + +typedef struct MultiFDRecvData MultiFDRecvData; + bool multifd_send_setup(void); void multifd_send_shutdown(void); +void multifd_send_channel_created(void); int multifd_recv_setup(Error **errp); void multifd_recv_cleanup(void); void multifd_recv_shutdown(void); @@ -23,6 +28,8 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); int multifd_send_sync_main(void); bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); +bool multifd_recv(void); +MultiFDRecvData *multifd_get_recv_data(void); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) @@ -63,6 +70,13 @@ typedef struct { RAMBlock *block; } MultiFDPages_t; +struct MultiFDRecvData { + void *opaque; + size_t size; + /* for preadv */ + off_t file_offset; +}; + typedef struct { /* Fields are only written at creating/deletion time */ /* No lock required for them, they are read only */ @@ -127,7 +141,7 @@ typedef struct { /* number of iovs used */ 
uint32_t iovs_num; /* used for compression methods */ - void *data; + void *compress_data; } MultiFDSendParams; typedef struct { @@ -152,6 +166,8 @@ typedef struct { /* syncs main thread and channels */ QemuSemaphore sem_sync; + /* sem where to wait for more work */ + QemuSemaphore sem; /* this mutex protects the following parameters */ QemuMutex mutex; @@ -161,6 +177,8 @@ typedef struct { uint32_t flags; /* global number of generated multifd packets */ uint64_t packet_num; + int pending_job; + MultiFDRecvData *data; /* thread local variables. No locking required */ @@ -183,7 +201,7 @@ typedef struct { /* num of non zero pages */ uint32_t normal_num; /* used for de-compression methods */ - void *data; + void *compress_data; } MultiFDRecvParams; typedef struct { @@ -197,8 +215,8 @@ typedef struct { int (*recv_setup)(MultiFDRecvParams *p, Error **errp); /* Cleanup for receiving side */ void (*recv_cleanup)(MultiFDRecvParams *p); - /* Read all pages */ - int (*recv_pages)(MultiFDRecvParams *p, Error **errp); + /* Read all data */ + int (*recv)(MultiFDRecvParams *p, Error **errp); } MultiFDMethods; void multifd_register_ops(int method, MultiFDMethods *ops); @@ -211,5 +229,6 @@ static inline void multifd_send_prepare_header(MultiFDSendParams *p) p->iovs_num++; } +void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc); #endif diff --git a/migration/options.c b/migration/options.c index 3e3e0b93b4..40eb930940 100644 --- a/migration/options.c +++ b/migration/options.c @@ -204,6 +204,7 @@ Property migration_properties[] = { DEFINE_PROP_MIG_CAP("x-switchover-ack", MIGRATION_CAPABILITY_SWITCHOVER_ACK), DEFINE_PROP_MIG_CAP("x-dirty-limit", MIGRATION_CAPABILITY_DIRTY_LIMIT), + DEFINE_PROP_MIG_CAP("mapped-ram", MIGRATION_CAPABILITY_MAPPED_RAM), DEFINE_PROP_END_OF_LIST(), }; @@ -263,6 +264,13 @@ bool migrate_events(void) return s->capabilities[MIGRATION_CAPABILITY_EVENTS]; } +bool migrate_mapped_ram(void) +{ + MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_MAPPED_RAM]; +} + bool migrate_ignore_shared(void) { MigrationState *s = migrate_get_current(); @@ -645,6 +653,26 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) } } + if (new_caps[MIGRATION_CAPABILITY_MAPPED_RAM]) { + if (new_caps[MIGRATION_CAPABILITY_XBZRLE]) { + error_setg(errp, + "Mapped-ram migration is incompatible with xbzrle"); + return false; + } + + if (new_caps[MIGRATION_CAPABILITY_COMPRESS]) { + error_setg(errp, + "Mapped-ram migration is incompatible with compression"); + return false; + } + + if (new_caps[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { + error_setg(errp, + "Mapped-ram migration is incompatible with postcopy"); + return false; + } + } + return true; } @@ -1218,6 +1246,13 @@ bool migrate_params_check(MigrationParameters *params, Error **errp) } #endif + if (migrate_mapped_ram() && + (migrate_multifd_compression() || migrate_tls())) { + error_setg(errp, + "Mapped-ram only available for non-compressed non-TLS multifd migration"); + return false; + } + if (params->has_x_vcpu_dirty_limit_period && (params->x_vcpu_dirty_limit_period < 1 || params->x_vcpu_dirty_limit_period > 1000)) { @@ -1312,6 +1347,12 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_multifd_compression) { dest->multifd_compression = params->multifd_compression; } + if (params->has_multifd_zlib_level) { + dest->multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + dest->multifd_zstd_level = 
params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { dest->xbzrle_cache_size = params->xbzrle_cache_size; } @@ -1447,6 +1488,12 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_multifd_compression) { s->parameters.multifd_compression = params->multifd_compression; } + if (params->has_multifd_zlib_level) { + s->parameters.multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + s->parameters.multifd_zstd_level = params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { s->parameters.xbzrle_cache_size = params->xbzrle_cache_size; xbzrle_cache_resize(params->xbzrle_cache_size, errp); diff --git a/migration/options.h b/migration/options.h index 246c160aee..6ddd8dad9b 100644 --- a/migration/options.h +++ b/migration/options.h @@ -31,6 +31,7 @@ bool migrate_compress(void); bool migrate_dirty_bitmaps(void); bool migrate_dirty_limit(void); bool migrate_events(void); +bool migrate_mapped_ram(void); bool migrate_ignore_shared(void); bool migrate_late_block_activate(void); bool migrate_multifd(void); diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 94231ff295..b10c882629 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -33,6 +33,7 @@ #include "options.h" #include "qapi/error.h" #include "rdma.h" +#include "io/channel-file.h" #define IO_BUF_SIZE 32768 #define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 64) @@ -255,6 +256,10 @@ static void qemu_iovec_release_ram(QEMUFile *f) memset(f->may_free, 0, sizeof(f->may_free)); } +bool qemu_file_is_seekable(QEMUFile *f) +{ + return qio_channel_has_feature(f->ioc, QIO_CHANNEL_FEATURE_SEEKABLE); +} /** * Flushes QEMUFile buffer @@ -447,6 +452,107 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size) } } +void qemu_put_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, + off_t pos) +{ + Error *err = NULL; + size_t ret; + + if (f->last_error) { + return; + } + + qemu_fflush(f); + ret = qio_channel_pwrite(f->ioc, (char *)buf, buflen, pos, &err); + + if (err) { + qemu_file_set_error_obj(f, -EIO, err); + return; + } + + if ((ssize_t)ret == QIO_CHANNEL_ERR_BLOCK) { + qemu_file_set_error_obj(f, -EAGAIN, NULL); + return; + } + + if (ret != buflen) { + error_setg(&err, "Partial write of size %zu, expected %zu", ret, + buflen); + qemu_file_set_error_obj(f, -EIO, err); + return; + } + + stat64_add(&mig_stats.qemu_file_transferred, buflen); + + return; +} + + +size_t qemu_get_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, + off_t pos) +{ + Error *err = NULL; + size_t ret; + + if (f->last_error) { + return 0; + } + + ret = qio_channel_pread(f->ioc, (char *)buf, buflen, pos, &err); + + if ((ssize_t)ret == -1 || err) { + qemu_file_set_error_obj(f, -EIO, err); + return 0; + } + + if ((ssize_t)ret == QIO_CHANNEL_ERR_BLOCK) { + qemu_file_set_error_obj(f, -EAGAIN, NULL); + return 0; + } + + if (ret != buflen) { + error_setg(&err, "Partial read of size %zu, expected %zu", ret, buflen); + qemu_file_set_error_obj(f, -EIO, err); + return 0; + } + + return ret; +} + +void qemu_set_offset(QEMUFile *f, off_t off, int whence) +{ + Error *err = NULL; + off_t ret; + + if (qemu_file_is_writable(f)) { + qemu_fflush(f); + } else { + /* Drop all cached buffers if existed; will trigger a re-fill later */ + f->buf_index = 0; + f->buf_size = 0; + } + + ret = qio_channel_io_seek(f->ioc, off, whence, &err); + if (ret == (off_t)-1) { + qemu_file_set_error_obj(f, -EIO, err); + } +} + +off_t qemu_get_offset(QEMUFile *f) +{ + Error *err = 
NULL; + off_t ret; + + qemu_fflush(f); + + ret = qio_channel_io_seek(f->ioc, 0, SEEK_CUR, &err); + if (ret == (off_t)-1) { + qemu_file_set_error_obj(f, -EIO, err); + } + return ret; +} + + void qemu_put_byte(QEMUFile *f, int v) { if (f->last_error) { diff --git a/migration/qemu-file.h b/migration/qemu-file.h index 8aec9fabf7..32fd4a34fd 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -75,6 +75,12 @@ QEMUFile *qemu_file_get_return_path(QEMUFile *f); int qemu_fflush(QEMUFile *f); void qemu_file_set_blocking(QEMUFile *f, bool block); int qemu_file_get_to_fd(QEMUFile *f, int fd, size_t size); +void qemu_set_offset(QEMUFile *f, off_t off, int whence); +off_t qemu_get_offset(QEMUFile *f); +void qemu_put_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, + off_t pos); +size_t qemu_get_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, + off_t pos); QIOChannel *qemu_file_get_ioc(QEMUFile *file); diff --git a/migration/ram.c b/migration/ram.c index 45a00b45ed..003c28e133 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -94,6 +94,24 @@ #define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 /* We can't use any flag that is bigger than 0x200 */ +/* + * mapped-ram migration supports O_DIRECT, so we need to make sure the + * userspace buffer, the IO operation size and the file offset are + * aligned according to the underlying device's block size. The first + * two are already aligned to page size, but we need to add padding to + * the file to align the offset. We cannot read the block size + * dynamically because the migration file can be moved between + * different systems, so use 1M to cover most block sizes and to keep + * the file offset aligned at page size as well. + */ +#define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000 + +/* + * When doing mapped-ram migration, this is the amount we read from + * the pages region in the migration file at a time. 
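The comment above explains why a fixed 1 MiB alignment is used: with O_DIRECT the userspace buffer, the transfer size and the file offset all have to respect the device's block size, and 1 MiB comfortably covers common block sizes. A standalone illustration of those constraints follows; it is unrelated to QEMU's own helpers and only demonstrates the alignment rules.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

enum { CHUNK_ALIGN = 0x100000 };   /* 1 MiB, mirroring the constant above */

/* Write one aligned chunk at an aligned offset through O_DIRECT. */
static int write_direct_chunk(const char *path, off_t offset,
                              const void *src, size_t len)
{
    void *buf = NULL;
    int fd, ret = 0;

    /* offset and length aligned to 1 MiB, which covers common block sizes */
    if (offset % CHUNK_ALIGN || len % CHUNK_ALIGN) {
        return -1;
    }
    /* the userspace buffer must be aligned as well */
    if (posix_memalign(&buf, CHUNK_ALIGN, len)) {
        return -1;
    }
    memcpy(buf, src, len);

    fd = open(path, O_WRONLY | O_CREAT | O_DIRECT, 0600);
    if (fd < 0 || pwrite(fd, buf, len, offset) != (ssize_t)len) {
        ret = -1;
    }
    if (fd >= 0) {
        close(fd);
    }
    free(buf);
    return ret;
}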
+ */ +#define MAPPED_RAM_LOAD_BUF_SIZE 0x100000 + XBZRLECacheStats xbzrle_counters; /* used by the search for pages to send */ @@ -1126,12 +1144,18 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, return 0; } + stat64_add(&mig_stats.zero_pages, 1); + + if (migrate_mapped_ram()) { + /* zero pages are not transferred with mapped-ram */ + clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap); + return 1; + } + len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO); qemu_put_byte(file, 0); len += 1; ram_release_page(pss->block->idstr, offset); - - stat64_add(&mig_stats.zero_pages, 1); ram_transferred_add(len); /* @@ -1189,14 +1213,20 @@ static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, { QEMUFile *file = pss->pss_channel; - ram_transferred_add(save_page_header(pss, pss->pss_channel, block, - offset | RAM_SAVE_FLAG_PAGE)); - if (async) { - qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, - migrate_release_ram() && - migration_in_postcopy()); + if (migrate_mapped_ram()) { + qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE, + block->pages_offset + offset); + set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap); } else { - qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); + ram_transferred_add(save_page_header(pss, pss->pss_channel, block, + offset | RAM_SAVE_FLAG_PAGE)); + if (async) { + qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, + migrate_release_ram() && + migration_in_postcopy()); + } else { + qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); + } } ram_transferred_add(TARGET_PAGE_SIZE); stat64_add(&mig_stats.normal_pages, 1); @@ -1332,14 +1362,18 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) pss->block = QLIST_NEXT_RCU(pss->block, next); if (!pss->block) { if (migrate_multifd() && - !migrate_multifd_flush_after_each_section()) { + (!migrate_multifd_flush_after_each_section() || + migrate_mapped_ram())) { QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; int ret = multifd_send_sync_main(); if (ret < 0) { return ret; } - qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); - qemu_fflush(f); + + if (!migrate_mapped_ram()) { + qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); + qemu_fflush(f); + } } /* * If memory migration starts over, we will meet a dirtied page @@ -2778,6 +2812,9 @@ static void ram_list_init_bitmaps(void) */ block->bmap = bitmap_new(pages); bitmap_set(block->bmap, 0, pages); + if (migrate_mapped_ram()) { + block->file_bmap = bitmap_new(pages); + } block->clear_bmap_shift = shift; block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); } @@ -2915,6 +2952,89 @@ void qemu_guest_free_page_hint(void *addr, size_t len) } } +#define MAPPED_RAM_HDR_VERSION 1 +struct MappedRamHeader { + uint32_t version; + /* + * The target's page size, so we know how many pages are in the + * bitmap. + */ + uint64_t page_size; + /* + * The offset in the migration file where the pages bitmap is + * stored. + */ + uint64_t bitmap_offset; + /* + * The offset in the migration file where the actual pages (data) + * are stored. 
+ */ + uint64_t pages_offset; +} QEMU_PACKED; +typedef struct MappedRamHeader MappedRamHeader; + +static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block) +{ + g_autofree MappedRamHeader *header = NULL; + size_t header_size, bitmap_size; + long num_pages; + + header = g_new0(MappedRamHeader, 1); + header_size = sizeof(MappedRamHeader); + + num_pages = block->used_length >> TARGET_PAGE_BITS; + bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); + + /* + * Save the file offsets of where the bitmap and the pages should + * go as they are written at the end of migration and during the + * iterative phase, respectively. + */ + block->bitmap_offset = qemu_get_offset(file) + header_size; + block->pages_offset = ROUND_UP(block->bitmap_offset + + bitmap_size, + MAPPED_RAM_FILE_OFFSET_ALIGNMENT); + + header->version = cpu_to_be32(MAPPED_RAM_HDR_VERSION); + header->page_size = cpu_to_be64(TARGET_PAGE_SIZE); + header->bitmap_offset = cpu_to_be64(block->bitmap_offset); + header->pages_offset = cpu_to_be64(block->pages_offset); + + qemu_put_buffer(file, (uint8_t *) header, header_size); + + /* prepare offset for next ramblock */ + qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET); +} + +static bool mapped_ram_read_header(QEMUFile *file, MappedRamHeader *header, + Error **errp) +{ + size_t ret, header_size = sizeof(MappedRamHeader); + + ret = qemu_get_buffer(file, (uint8_t *)header, header_size); + if (ret != header_size) { + error_setg(errp, "Could not read whole mapped-ram migration header " + "(expected %zd, got %zd bytes)", header_size, ret); + return false; + } + + /* migration stream is big-endian */ + header->version = be32_to_cpu(header->version); + + if (header->version > MAPPED_RAM_HDR_VERSION) { + error_setg(errp, "Migration mapped-ram capability version not " + "supported (expected <= %d, got %d)", MAPPED_RAM_HDR_VERSION, + header->version); + return false; + } + + header->page_size = be64_to_cpu(header->page_size); + header->bitmap_offset = be64_to_cpu(header->bitmap_offset); + header->pages_offset = be64_to_cpu(header->pages_offset); + + return true; +} + /* * Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. When rcu-reclaims in the code @@ -2934,7 +3054,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) { RAMState **rsp = opaque; RAMBlock *block; - int ret; + int ret, max_hg_page_size; if (compress_threads_save_setup()) { return -1; @@ -2949,6 +3069,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque) } (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; + /* + * ??? Mirrors the previous value of qemu_host_page_size, + * but is this really what was intended for the migration? 
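mapped_ram_setup_ramblock() above lays each ramblock out as header, then bitmap, then a 1 MiB-aligned pages region, and records bitmap_offset/pages_offset so they can be filled in later. The arithmetic is compact enough to show in isolation; the sketch below mirrors it with hypothetical inputs (the 4k page size and the 28-byte packed header size are assumptions taken from the struct above) and is not the QEMU code itself.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ROUND_UP(n, a)    (((n) + (a) - 1) / (a) * (a))
#define BITS_PER_LONG     (8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n)  (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

enum {
    PAGE_BITS = 12,                     /* assumption: 4k target pages */
    HEADER_SIZE = 4 + 8 + 8 + 8,        /* packed MappedRamHeader, 28 bytes */
    OFFSET_ALIGNMENT = 0x100000,        /* 1 MiB padding before the pages */
};

int main(void)
{
    uint64_t file_pos = 0x3000;             /* current write position */
    uint64_t used_length = 512ULL << 20;    /* example: 512 MiB ramblock */

    uint64_t num_pages = used_length >> PAGE_BITS;
    uint64_t bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);

    uint64_t bitmap_offset = file_pos + HEADER_SIZE;
    uint64_t pages_offset = ROUND_UP(bitmap_offset + bitmap_size,
                                     OFFSET_ALIGNMENT);
    /* the next ramblock's header starts right after this pages region */
    uint64_t next_block = pages_offset + used_length;

    printf("bitmap @ 0x%" PRIx64 ", pages @ 0x%" PRIx64 ", next @ 0x%" PRIx64 "\n",
           bitmap_offset, pages_offset, next_block);
    return 0;
}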
+ */ + max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE); + WITH_RCU_READ_LOCK_GUARD() { qemu_put_be64(f, ram_bytes_total_with_ignored() | RAM_SAVE_FLAG_MEM_SIZE); @@ -2957,13 +3083,17 @@ static int ram_save_setup(QEMUFile *f, void *opaque) qemu_put_byte(f, strlen(block->idstr)); qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); qemu_put_be64(f, block->used_length); - if (migrate_postcopy_ram() && block->page_size != - qemu_host_page_size) { + if (migrate_postcopy_ram() && + block->page_size != max_hg_page_size) { qemu_put_be64(f, block->page_size); } if (migrate_ignore_shared()) { qemu_put_be64(f, block->mr->addr); } + + if (migrate_mapped_ram()) { + mapped_ram_setup_ramblock(f, block); + } } } @@ -2989,7 +3119,8 @@ static int ram_save_setup(QEMUFile *f, void *opaque) return ret; } - if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && !migrate_multifd_flush_after_each_section() + && !migrate_mapped_ram()) { qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); } @@ -2997,6 +3128,33 @@ static int ram_save_setup(QEMUFile *f, void *opaque) return qemu_fflush(f); } +static void ram_save_file_bmap(QEMUFile *f) +{ + RAMBlock *block; + + RAMBLOCK_FOREACH_MIGRATABLE(block) { + long num_pages = block->used_length >> TARGET_PAGE_BITS; + long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); + + qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size, + block->bitmap_offset); + ram_transferred_add(bitmap_size); + + /* + * Free the bitmap here to catch any synchronization issues + * with multifd channels. No channels should be sending pages + * after we've written the bitmap to file. + */ + g_free(block->file_bmap); + block->file_bmap = NULL; + } +} + +void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset) +{ + set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); +} + /** * ram_save_iterate: iterative stage for migration * @@ -3106,7 +3264,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) out: if (ret >= 0 && migration_is_setup_or_active(migrate_get_current()->state)) { - if (migrate_multifd() && migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && migrate_multifd_flush_after_each_section() && + !migrate_mapped_ram()) { ret = multifd_send_sync_main(); if (ret < 0) { return ret; @@ -3186,7 +3345,20 @@ static int ram_save_complete(QEMUFile *f, void *opaque) return ret; } - if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { + if (migrate_mapped_ram()) { + ram_save_file_bmap(f); + + if (qemu_file_get_error(f)) { + Error *local_err = NULL; + int err = qemu_file_get_error_obj(f, &local_err); + + error_reportf_err(local_err, "Failed to write bitmap to file: "); + return -err; + } + } + + if (migrate_multifd() && !migrate_multifd_flush_after_each_section() && + !migrate_mapped_ram()) { qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); } qemu_put_be64(f, RAM_SAVE_FLAG_EOS); @@ -3786,31 +3958,165 @@ void colo_flush_ram_cache(void) trace_colo_flush_ram_cache_end(); } +static size_t ram_load_multifd_pages(void *host_addr, size_t size, + uint64_t offset) +{ + MultiFDRecvData *data = multifd_get_recv_data(); + + data->opaque = host_addr; + data->file_offset = offset; + data->size = size; + + if (!multifd_recv()) { + return 0; + } + + return size; +} + +static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, + long num_pages, unsigned long *bitmap, + Error **errp) +{ + ERRP_GUARD(); + unsigned long set_bit_idx, clear_bit_idx; + 
ram_addr_t offset; + void *host; + size_t read, unread, size; + + for (set_bit_idx = find_first_bit(bitmap, num_pages); + set_bit_idx < num_pages; + set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) { + + clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1); + + unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx); + offset = set_bit_idx << TARGET_PAGE_BITS; + + while (unread > 0) { + host = host_from_ram_block_offset(block, offset); + if (!host) { + error_setg(errp, "page outside of ramblock %s range", + block->idstr); + return false; + } + + size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE); + + if (migrate_multifd()) { + read = ram_load_multifd_pages(host, size, + block->pages_offset + offset); + } else { + read = qemu_get_buffer_at(f, host, size, + block->pages_offset + offset); + } + + if (!read) { + goto err; + } + offset += read; + unread -= read; + } + } + + return true; + +err: + qemu_file_get_error_obj(f, errp); + error_prepend(errp, "(%s) failed to read page " RAM_ADDR_FMT + "from file offset %" PRIx64 ": ", block->idstr, offset, + block->pages_offset + offset); + return false; +} + +static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, + ram_addr_t length, Error **errp) +{ + g_autofree unsigned long *bitmap = NULL; + MappedRamHeader header; + size_t bitmap_size; + long num_pages; + + if (!mapped_ram_read_header(f, &header, errp)) { + return; + } + + block->pages_offset = header.pages_offset; + + /* + * Check the alignment of the file region that contains pages. We + * don't enforce MAPPED_RAM_FILE_OFFSET_ALIGNMENT to allow that + * value to change in the future. Do only a sanity check with page + * size alignment. + */ + if (!QEMU_IS_ALIGNED(block->pages_offset, TARGET_PAGE_SIZE)) { + error_setg(errp, + "Error reading ramblock %s pages, region has bad alignment", + block->idstr); + return; + } + + num_pages = length / header.page_size; + bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); + + bitmap = g_malloc0(bitmap_size); + if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size, + header.bitmap_offset) != bitmap_size) { + error_setg(errp, "Error reading dirty bitmap"); + return; + } + + if (!read_ramblock_mapped_ram(f, block, num_pages, bitmap, errp)) { + return; + } + + /* Skip pages array */ + qemu_set_offset(f, block->pages_offset + length, SEEK_SET); + + return; +} + static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length) { int ret = 0; /* ADVISE is earlier, it shows the source has the postcopy capability on */ bool postcopy_advised = migration_incoming_postcopy_advised(); + int max_hg_page_size; + Error *local_err = NULL; assert(block); + if (migrate_mapped_ram()) { + parse_ramblock_mapped_ram(f, block, length, &local_err); + if (local_err) { + error_report_err(local_err); + return -EINVAL; + } + return 0; + } + if (!qemu_ram_is_migratable(block)) { error_report("block %s should not be migrated !", block->idstr); return -EINVAL; } if (length != block->used_length) { - Error *local_err = NULL; - ret = qemu_ram_resize(block, length, &local_err); if (local_err) { error_report_err(local_err); return ret; } } + + /* + * ??? Mirrors the previous value of qemu_host_page_size, + * but is this really what was intended for the migration? 
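read_ramblock_mapped_ram() above walks the file bitmap as runs of set bits and reads each run back in MAPPED_RAM_LOAD_BUF_SIZE chunks; clear bits, which include the zero pages the sender never wrote, are skipped entirely. Below is a self-contained sketch of that run traversal using naive scalar helpers in place of QEMU's find_first_bit()/find_next_zero_bit(); it only illustrates the iteration, not the actual I/O.

#include <stdbool.h>
#include <stddef.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

static bool test_bit(const unsigned long *map, size_t bit)
{
    return map[bit / BITS_PER_LONG] >> (bit % BITS_PER_LONG) & 1;
}

/* Return the first index >= start whose bit equals value (or >= nbits). */
static size_t next_with_value(const unsigned long *map, size_t nbits,
                              size_t start, bool value)
{
    size_t i;

    for (i = start; i < nbits && test_bit(map, i) != value; i++) {
        /* scan */
    }
    return i;
}

/* Visit every [first, limit) run of set bits, e.g. to read those pages. */
static void for_each_set_run(const unsigned long *map, size_t nbits,
                             void (*visit)(size_t first, size_t limit))
{
    size_t set = next_with_value(map, nbits, 0, true);

    while (set < nbits) {
        size_t clear = next_with_value(map, nbits, set + 1, false);

        visit(set, clear);
        set = next_with_value(map, nbits, clear + 1, true);
    }
}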
+ */ + max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE); + /* For postcopy we need to check hugepage sizes match */ if (postcopy_advised && migrate_postcopy_ram() && - block->page_size != qemu_host_page_size) { + block->page_size != max_hg_page_size) { uint64_t remote_page_size = qemu_get_be64(f); if (remote_page_size != block->page_size) { error_report("Mismatched RAM page size %s " @@ -3885,6 +4191,12 @@ static int ram_load_precopy(QEMUFile *f) invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; } + if (migrate_mapped_ram()) { + invalid_flags |= (RAM_SAVE_FLAG_HOOK | RAM_SAVE_FLAG_MULTIFD_FLUSH | + RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_XBZRLE | + RAM_SAVE_FLAG_ZERO); + } + while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { ram_addr_t addr; void *host = NULL, *host_bak = NULL; @@ -3906,6 +4218,8 @@ static int ram_load_precopy(QEMUFile *f) addr &= TARGET_PAGE_MASK; if (flags & invalid_flags) { + error_report("Unexpected RAM flags: %d", flags & invalid_flags); + if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { error_report("Received an unexpected compressed page"); } @@ -3958,6 +4272,16 @@ static int ram_load_precopy(QEMUFile *f) switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { case RAM_SAVE_FLAG_MEM_SIZE: ret = parse_ramblocks(f, addr); + /* + * For mapped-ram migration (to a file) using multifd, we sync + * once and for all here to make sure all tasks we queued to + * multifd threads are completed, so that all the ramblocks + * (including all the guest memory pages within) are fully + * loaded after this sync returns. + */ + if (migrate_mapped_ram()) { + multifd_recv_sync_main(); + } break; case RAM_SAVE_FLAG_ZERO: @@ -3998,7 +4322,12 @@ static int ram_load_precopy(QEMUFile *f) case RAM_SAVE_FLAG_EOS: /* normal exit */ if (migrate_multifd() && - migrate_multifd_flush_after_each_section()) { + migrate_multifd_flush_after_each_section() && + /* + * Mapped-ram migration flushes once and for all after + * parsing ramblocks. Always ignore EOS for it. + */ + !migrate_mapped_ram()) { multifd_recv_sync_main(); } break; diff --git a/migration/ram.h b/migration/ram.h index 9b937a446b..b9ac0da587 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -75,6 +75,7 @@ bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start); void postcopy_preempt_shutdown_file(MigrationState *s); void *postcopy_preempt_thread(void *opaque); +void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset); /* ram cache */ int colo_init_ram_cache(void); diff --git a/migration/savevm.c b/migration/savevm.c index d612c8a902..dc1fb9c0d3 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -245,6 +245,7 @@ static bool should_validate_capability(int capability) /* Validate only new capabilities to keep compatibility. 
*/ switch (capability) { case MIGRATION_CAPABILITY_X_IGNORE_SHARED: + case MIGRATION_CAPABILITY_MAPPED_RAM: return true; default: return false; diff --git a/migration/trace-events b/migration/trace-events index 298ad2b0dd..bf1a069632 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -132,7 +132,7 @@ multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uin multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" -multifd_recv_sync_main_wait(uint8_t id) "channel %u" +multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" diff --git a/pc-bios/README b/pc-bios/README index b8a0210d24..7ffb2f43a4 100644 --- a/pc-bios/README +++ b/pc-bios/README @@ -75,3 +75,9 @@ initialize and run boot images stored in SPI flash, but may grow more features over time as needed. The source code is available at: https://github.com/google/vbootrom + +- hppa-firmware.img (32-bit) and hppa-firmware64.img (64-bit) are firmware + files for the HP-PARISC (hppa) architecture. + They are built form the SeaBIOS-hppa sources, which is a fork of SeaBIOS + adapted for hppa. + SeaBIOS-hppa is available at https://github.com/hdeller/seabios-hppa diff --git a/pc-bios/meson.build b/pc-bios/meson.build index e67fa433a1..0760612bea 100644 --- a/pc-bios/meson.build +++ b/pc-bios/meson.build @@ -73,6 +73,7 @@ blobs = [ 'qemu_vga.ndrv', 'edk2-licenses.txt', 'hppa-firmware.img', + 'hppa-firmware64.img', 'opensbi-riscv32-generic-fw_dynamic.bin', 'opensbi-riscv64-generic-fw_dynamic.bin', 'npcm7xx_bootrom.bin', diff --git a/qapi/migration.json b/qapi/migration.json index 27a67e5012..51d188b902 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -531,6 +531,10 @@ # and can result in more stable read performance. Requires KVM # with accelerator property "dirty-ring-size" set. (Since 8.1) # +# @mapped-ram: Migrate using fixed offsets in the migration file for +# each RAM page. Requires a migration URI that supports seeking, +# such as a file. (since 9.0) +# # Features: # # @deprecated: Member @block is deprecated. Use blockdev-mirror with @@ -555,7 +559,7 @@ { 'name': 'x-ignore-shared', 'features': [ 'unstable' ] }, 'validate-uuid', 'background-snapshot', 'zero-copy-send', 'postcopy-preempt', 'switchover-ack', - 'dirty-limit'] } + 'dirty-limit', 'mapped-ram'] } ## # @MigrationCapabilityStatus: @@ -636,28 +640,30 @@ # # @normal: the original form of migration. (since 8.2) # -# @cpr-reboot: The migrate command stops the VM and saves state to the URI. -# After quitting qemu, the user resumes by running qemu -incoming. +# @cpr-reboot: The migrate command stops the VM and saves state to +# the URI. After quitting QEMU, the user resumes by running +# QEMU -incoming. # -# This mode allows the user to quit qemu, and restart an updated version -# of qemu. The user may even update and reboot the OS before restarting, -# as long as the URI persists across a reboot. +# This mode allows the user to quit QEMU, optionally update and +# reboot the OS, and restart QEMU. If the user reboots, the URI +# must persist across the reboot, such as by using a file. 
# -# Unlike normal mode, the use of certain local storage options does not -# block the migration, but the user must not modify guest block devices -# between the quit and restart. +# Unlike normal mode, the use of certain local storage options +# does not block the migration, but the user must not modify the +# contents of guest block devices between the quit and restart. # -# This mode supports vfio devices provided the user first puts the guest -# in the suspended runstate, such as by issuing guest-suspend-ram to the -# qemu guest agent. +# This mode supports VFIO devices provided the user first puts +# the guest in the suspended runstate, such as by issuing +# guest-suspend-ram to the QEMU guest agent. # -# Best performance is achieved when the memory backend is shared and the -# @x-ignore-shared migration capability is set, but this is not required. -# Further, if the user reboots before restarting such a configuration, the -# shared backend must be be non-volatile across reboot, such as by backing -# it with a dax device. +# Best performance is achieved when the memory backend is shared +# and the @x-ignore-shared migration capability is set, but this +# is not required. Further, if the user reboots before restarting +# such a configuration, the shared memory must persist across the +# reboot, such as by backing it with a dax device. # -# cpr-reboot may not be used with postcopy, colo, or background-snapshot. +# @cpr-reboot may not be used with postcopy, background-snapshot, +# or COLO. # # (since 8.2) ## diff --git a/roms/Makefile b/roms/Makefile index 67f709ba2d..8e5d8d26a9 100644 --- a/roms/Makefile +++ b/roms/Makefile @@ -68,6 +68,7 @@ default help: @echo " opensbi32-generic -- update OpenSBI for 32-bit generic machine" @echo " opensbi64-generic -- update OpenSBI for 64-bit generic machine" @echo " qboot -- update qboot" + @echo " hppa-firmware -- update 32- and 64-bit hppa firmware" @echo " clean -- delete the files generated by the previous" \ "build targets" @@ -177,6 +178,11 @@ npcm7xx_bootrom: $(MAKE) -C vbootrom CROSS_COMPILE=$(arm_cross_prefix) cp vbootrom/npcm7xx_bootrom.bin ../pc-bios/npcm7xx_bootrom.bin +hppa-firmware: + $(MAKE) -C seabios-hppa parisc + cp seabios-hppa/out/hppa-firmware.img ../pc-bios/ + cp seabios-hppa/out-64/hppa-firmware64.img ../pc-bios/ + clean: rm -rf seabios/.config seabios/out seabios/builds $(MAKE) -C ipxe/src veryclean @@ -189,3 +195,4 @@ clean: $(MAKE) -C opensbi clean $(MAKE) -C qboot clean $(MAKE) -C vbootrom clean + $(MAKE) -C seabios-hppa clean diff --git a/system/physmem.c b/system/physmem.c index e3ebc19eef..3adda08ebf 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -1680,7 +1680,8 @@ int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp) assert(block); - newsize = HOST_PAGE_ALIGN(newsize); + newsize = TARGET_PAGE_ALIGN(newsize); + newsize = REAL_HOST_PAGE_ALIGN(newsize); if (block->used_length == newsize) { /* @@ -1916,7 +1917,9 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, return NULL; } - size = HOST_PAGE_ALIGN(size); + size = TARGET_PAGE_ALIGN(size); + size = REAL_HOST_PAGE_ALIGN(size); + file_size = get_file_size(fd); if (file_size > offset && file_size < (offset + size)) { error_setg(errp, "backing store size 0x%" PRIx64 @@ -2014,13 +2017,17 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size, { RAMBlock *new_block; Error *local_err = NULL; + int align; assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC | RAM_NORESERVE)) == 0); assert(!host ^ 
(ram_flags & RAM_PREALLOC)); - size = HOST_PAGE_ALIGN(size); - max_size = HOST_PAGE_ALIGN(max_size); + align = qemu_real_host_page_size(); + align = MAX(align, TARGET_PAGE_SIZE); + size = ROUND_UP(size, align); + max_size = ROUND_UP(max_size, align); + new_block = g_malloc0(sizeof(*new_block)); new_block->mr = mr; new_block->resized = resized; @@ -3511,7 +3518,7 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) * fallocate works on hugepages and shmem * shared anonymous memory requires madvise REMOVE */ - need_madvise = (rb->page_size == qemu_host_page_size); + need_madvise = (rb->page_size == qemu_real_host_page_size()); need_fallocate = rb->fd != -1; if (need_fallocate) { /* For a file, this causes the area of the file to be zero'd diff --git a/system/vl.c b/system/vl.c index e480afd7a0..48aae6e053 100644 --- a/system/vl.c +++ b/system/vl.c @@ -2118,7 +2118,6 @@ static void qemu_create_machine(QDict *qdict) } cpu_exec_init_all(); - page_size_init(); if (machine_class->hw_version) { qemu_set_hw_version(machine_class->hw_version); diff --git a/target/alpha/cpu-param.h b/target/alpha/cpu-param.h index 68c46f7998..c969cb016b 100644 --- a/target/alpha/cpu-param.h +++ b/target/alpha/cpu-param.h @@ -9,10 +9,22 @@ #define ALPHA_CPU_PARAM_H #define TARGET_LONG_BITS 64 -#define TARGET_PAGE_BITS 13 /* ??? EV4 has 34 phys addr bits, EV5 has 40, EV6 has 44. */ #define TARGET_PHYS_ADDR_SPACE_BITS 44 -#define TARGET_VIRT_ADDR_SPACE_BITS (30 + TARGET_PAGE_BITS) + +#ifdef CONFIG_USER_ONLY +/* + * Allow user-only to vary page size. Real hardware allows only 8k and 64k, + * but since any variance means guests cannot assume a fixed value, allow + * a 4k minimum to match x86 host, which can minimize emulation issues. + */ +# define TARGET_PAGE_BITS_VARY +# define TARGET_PAGE_BITS_MIN 12 +# define TARGET_VIRT_ADDR_SPACE_BITS 63 +#else +# define TARGET_PAGE_BITS 13 +# define TARGET_VIRT_ADDR_SPACE_BITS (30 + TARGET_PAGE_BITS) +#endif #endif diff --git a/target/arm/cpu-param.h b/target/arm/cpu-param.h index f9b462a98f..da3243ab21 100644 --- a/target/arm/cpu-param.h +++ b/target/arm/cpu-param.h @@ -19,9 +19,13 @@ #endif #ifdef CONFIG_USER_ONLY -#define TARGET_PAGE_BITS 12 # ifdef TARGET_AARCH64 # define TARGET_TAGGED_ADDRESSES +/* Allow user-only to vary page size from 4k */ +# define TARGET_PAGE_BITS_VARY +# define TARGET_PAGE_BITS_MIN 12 +# else +# define TARGET_PAGE_BITS 12 # endif #else /* diff --git a/target/arm/cpu.c b/target/arm/cpu.c index b2ea5d6513..f3ed79cef2 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -1809,7 +1809,6 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) ARMCPU *cpu = ARM_CPU(dev); ARMCPUClass *acc = ARM_CPU_GET_CLASS(dev); CPUARMState *env = &cpu->env; - int pagebits; Error *local_err = NULL; #if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY) @@ -2100,28 +2099,36 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) !cpu_isar_feature(aa32_vfp_simd, cpu) || !arm_feature(env, ARM_FEATURE_XSCALE)); - if (arm_feature(env, ARM_FEATURE_V7) && - !arm_feature(env, ARM_FEATURE_M) && - !arm_feature(env, ARM_FEATURE_PMSA)) { - /* v7VMSA drops support for the old ARMv5 tiny pages, so we - * can use 4K pages. - */ - pagebits = 12; - } else { - /* For CPUs which might have tiny 1K pages, or which have an - * MPU and might have small region sizes, stick with 1K pages. 
- */ - pagebits = 10; - } - if (!set_preferred_target_page_bits(pagebits)) { - /* This can only ever happen for hotplugging a CPU, or if - * the board code incorrectly creates a CPU which it has - * promised via minimum_page_size that it will not. - */ - error_setg(errp, "This CPU requires a smaller page size than the " - "system is using"); - return; +#ifndef CONFIG_USER_ONLY + { + int pagebits; + if (arm_feature(env, ARM_FEATURE_V7) && + !arm_feature(env, ARM_FEATURE_M) && + !arm_feature(env, ARM_FEATURE_PMSA)) { + /* + * v7VMSA drops support for the old ARMv5 tiny pages, + * so we can use 4K pages. + */ + pagebits = 12; + } else { + /* + * For CPUs which might have tiny 1K pages, or which have an + * MPU and might have small region sizes, stick with 1K pages. + */ + pagebits = 10; + } + if (!set_preferred_target_page_bits(pagebits)) { + /* + * This can only ever happen for hotplugging a CPU, or if + * the board code incorrectly creates a CPU which it has + * promised via minimum_page_size that it will not. + */ + error_setg(errp, "This CPU requires a smaller page size " + "than the system is using"); + return; + } } +#endif /* This cpu-id-to-MPIDR affinity is used only for TCG; KVM will override it. * We don't support setting cluster ID ([16..23]) (known as Aff2 diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c index afe73d4474..3831cb6db2 100644 --- a/target/hppa/cpu.c +++ b/target/hppa/cpu.c @@ -121,9 +121,10 @@ void hppa_cpu_do_unaligned_access(CPUState *cs, vaddr addr, CPUHPPAState *env = &cpu->env; cs->exception_index = EXCP_UNALIGN; + cpu_restore_state(cs, retaddr); hppa_set_ior_and_isr(env, addr, MMU_IDX_MMU_DISABLED(mmu_idx)); - cpu_loop_exit_restore(cs, retaddr); + cpu_loop_exit(cs); } #endif /* CONFIG_USER_ONLY */ diff --git a/target/hppa/helper.c b/target/hppa/helper.c index 859644c47a..9d217d051c 100644 --- a/target/hppa/helper.c +++ b/target/hppa/helper.c @@ -76,7 +76,8 @@ void cpu_hppa_put_psw(CPUHPPAState *env, target_ulong psw) } psw &= ~reserved; - env->psw = psw & ~(PSW_N | PSW_V | PSW_CB); + env->psw = psw & (uint32_t)~(PSW_N | PSW_V | PSW_CB); + env->psw_n = (psw / PSW_N) & 1; env->psw_v = -((psw / PSW_V) & 1); diff --git a/target/hppa/mem_helper.c b/target/hppa/mem_helper.c index 66b8fa7d72..3fc895c1c2 100644 --- a/target/hppa/mem_helper.c +++ b/target/hppa/mem_helper.c @@ -348,9 +348,10 @@ raise_exception_with_ior(CPUHPPAState *env, int excp, uintptr_t retaddr, CPUState *cs = env_cpu(env); cs->exception_index = excp; + cpu_restore_state(cs, retaddr); hppa_set_ior_and_isr(env, addr, mmu_disabled); - cpu_loop_exit_restore(cs, retaddr); + cpu_loop_exit(cs); } void hppa_cpu_do_transaction_failed(CPUState *cs, hwaddr physaddr, diff --git a/target/hppa/op_helper.c b/target/hppa/op_helper.c index b1f24a5aad..480fe80844 100644 --- a/target/hppa/op_helper.c +++ b/target/hppa/op_helper.c @@ -351,11 +351,12 @@ target_ulong HELPER(probe)(CPUHPPAState *env, target_ulong addr, excp = hppa_get_physical_address(env, addr, mmu_idx, 0, &phys, &prot, NULL); if (excp >= 0) { + cpu_restore_state(env_cpu(env), GETPC()); hppa_set_ior_and_isr(env, addr, MMU_IDX_MMU_DISABLED(mmu_idx)); if (excp == EXCP_DTLB_MISS) { excp = EXCP_NA_DTLB_MISS; } - hppa_dynamic_excp(env, excp, GETPC()); + helper_excp(env, excp); } return (want & prot) != 0; #endif diff --git a/target/ppc/cpu-param.h b/target/ppc/cpu-param.h index 0a0416e0a8..b7ad52de03 100644 --- a/target/ppc/cpu-param.h +++ b/target/ppc/cpu-param.h @@ -31,6 +31,13 @@ # define TARGET_PHYS_ADDR_SPACE_BITS 36 # define 
TARGET_VIRT_ADDR_SPACE_BITS 32 #endif -#define TARGET_PAGE_BITS 12 + +#ifdef CONFIG_USER_ONLY +/* Allow user-only to vary page size from 4k */ +# define TARGET_PAGE_BITS_VARY +# define TARGET_PAGE_BITS_MIN 12 +#else +# define TARGET_PAGE_BITS 12 +#endif #endif diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h index ef5ebe91bd..85d5746e47 100644 --- a/tcg/aarch64/tcg-target.h +++ b/tcg/aarch64/tcg-target.h @@ -55,7 +55,11 @@ typedef enum { #define TCG_TARGET_CALL_STACK_OFFSET 0 #define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL #define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL -#define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_EVEN +#ifdef CONFIG_DARWIN +# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL +#else +# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_EVEN +#endif #define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_NORMAL #define have_lse (cpuinfo & CPUINFO_LSE) diff --git a/tcg/optimize.c b/tcg/optimize.c index 79e701652b..752cc5c56b 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -2102,7 +2102,8 @@ static bool fold_remainder(OptContext *ctx, TCGOp *op) static void fold_setcond_tst_pow2(OptContext *ctx, TCGOp *op, bool neg) { - TCGOpcode and_opc, sub_opc, xor_opc, neg_opc, shr_opc, uext_opc, sext_opc; + TCGOpcode and_opc, sub_opc, xor_opc, neg_opc, shr_opc; + TCGOpcode uext_opc = 0, sext_opc = 0; TCGCond cond = op->args[3]; TCGArg ret, src1, src2; TCGOp *op2; diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 83512bce85..4023d808f9 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2200,6 +2200,14 @@ static void *test_mode_reboot_start(QTestState *from, QTestState *to) return NULL; } +static void *migrate_mapped_ram_start(QTestState *from, QTestState *to) +{ + migrate_set_capability(from, "mapped-ram", true); + migrate_set_capability(to, "mapped-ram", true); + + return NULL; +} + static void test_mode_reboot(void) { g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, @@ -2214,6 +2222,72 @@ static void test_mode_reboot(void) test_file_common(&args, true); } +static void test_precopy_file_mapped_ram_live(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_mapped_ram_start, + }; + + test_file_common(&args, false); +} + +static void test_precopy_file_mapped_ram(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_mapped_ram_start, + }; + + test_file_common(&args, true); +} + +static void *migrate_multifd_mapped_ram_start(QTestState *from, QTestState *to) +{ + migrate_mapped_ram_start(from, to); + + migrate_set_parameter_int(from, "multifd-channels", 4); + migrate_set_parameter_int(to, "multifd-channels", 4); + + migrate_set_capability(from, "multifd", true); + migrate_set_capability(to, "multifd", true); + + return NULL; +} + +static void test_multifd_file_mapped_ram_live(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_multifd_mapped_ram_start, + }; + + test_file_common(&args, false); +} + +static void test_multifd_file_mapped_ram(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + 
.start_hook = migrate_multifd_mapped_ram_start, + }; + + test_file_common(&args, true); +} + + static void test_precopy_tcp_plain(void) { MigrateCommon args = { @@ -2462,6 +2536,13 @@ static void *migrate_precopy_fd_file_start(QTestState *from, QTestState *to) return NULL; } +static void *migrate_fd_file_mapped_ram_start(QTestState *from, QTestState *to) +{ + migrate_mapped_ram_start(from, to); + + return migrate_precopy_fd_file_start(from, to); +} + static void test_migrate_precopy_fd_file(void) { MigrateCommon args = { @@ -2472,6 +2553,36 @@ static void test_migrate_precopy_fd_file(void) }; test_file_common(&args, true); } + +static void test_migrate_precopy_fd_file_mapped_ram(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .connect_uri = "fd:fd-mig", + .start_hook = migrate_fd_file_mapped_ram_start, + .finish_hook = test_migrate_fd_finish_hook + }; + test_file_common(&args, true); +} + +static void *migrate_multifd_fd_mapped_ram_start(QTestState *from, + QTestState *to) +{ + migrate_multifd_mapped_ram_start(from, to); + return migrate_precopy_fd_file_start(from, to); +} + +static void test_multifd_fd_mapped_ram(void) +{ + MigrateCommon args = { + .connect_uri = "fd:fd-mig", + .listen_uri = "defer", + .start_hook = migrate_multifd_fd_mapped_ram_start, + .finish_hook = test_migrate_fd_finish_hook + }; + + test_file_common(&args, true); +} #endif /* _WIN32 */ static void do_test_validate_uuid(MigrateStart *args, bool should_fail) @@ -2664,6 +2775,13 @@ static void * test_migrate_precopy_tcp_multifd_zlib_start(QTestState *from, QTestState *to) { + /* + * Overloading this test to also check that set_parameter does not error. + * This is also done in the tests for the other compression methods. + */ + migrate_set_parameter_int(from, "multifd-zlib-level", 2); + migrate_set_parameter_int(to, "multifd-zlib-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zlib"); } @@ -2672,6 +2790,9 @@ static void * test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, QTestState *to) { + migrate_set_parameter_int(from, "multifd-zstd-level", 2); + migrate_set_parameter_int(to, "multifd-zstd-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zstd"); } #endif /* CONFIG_ZSTD */ @@ -3509,6 +3630,20 @@ int main(int argc, char **argv) migration_test_add("/migration/mode/reboot", test_mode_reboot); } + migration_test_add("/migration/precopy/file/mapped-ram", + test_precopy_file_mapped_ram); + migration_test_add("/migration/precopy/file/mapped-ram/live", + test_precopy_file_mapped_ram_live); + + migration_test_add("/migration/multifd/file/mapped-ram", + test_multifd_file_mapped_ram); + migration_test_add("/migration/multifd/file/mapped-ram/live", + test_multifd_file_mapped_ram_live); +#ifndef _WIN32 + migration_test_add("/migration/multifd/fd/mapped-ram", + test_multifd_fd_mapped_ram); +#endif + #ifdef CONFIG_GNUTLS migration_test_add("/migration/precopy/unix/tls/psk", test_precopy_unix_tls_psk); @@ -3570,6 +3705,8 @@ int main(int argc, char **argv) test_migrate_precopy_fd_socket); migration_test_add("/migration/precopy/fd/file", test_migrate_precopy_fd_file); + migration_test_add("/migration/precopy/fd/file/mapped-ram", + test_migrate_precopy_fd_file_mapped_ram); #endif migration_test_add("/migration/validate_uuid", test_validate_uuid); migration_test_add("/migration/validate_uuid_error", diff --git a/tests/tcg/alpha/Makefile.target b/tests/tcg/alpha/Makefile.target index b94500a7d9..fdd7ddf64e 100644 --- a/tests/tcg/alpha/Makefile.target +++ 
b/tests/tcg/alpha/Makefile.target @@ -13,6 +13,3 @@ test-cmov: test-cond.c $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS) run-test-cmov: test-cmov - -# On Alpha Linux only supports 8k pages -EXTRA_RUNS+=run-test-mmap-8192 diff --git a/tests/tcg/arm/Makefile.target b/tests/tcg/arm/Makefile.target index 3473f4619e..0a1965fce7 100644 --- a/tests/tcg/arm/Makefile.target +++ b/tests/tcg/arm/Makefile.target @@ -79,6 +79,3 @@ sha512-vector: sha512.c ARM_TESTS += sha512-vector TESTS += $(ARM_TESTS) - -# On ARM Linux only supports 4k pages -EXTRA_RUNS+=run-test-mmap-4096 diff --git a/tests/tcg/hppa/Makefile.target b/tests/tcg/hppa/Makefile.target index cdd0d572a7..ea5ae2186d 100644 --- a/tests/tcg/hppa/Makefile.target +++ b/tests/tcg/hppa/Makefile.target @@ -2,9 +2,6 @@ # # HPPA specific tweaks - specifically masking out broken tests -# On parisc Linux supports 4K/16K/64K (but currently only 4k works) -EXTRA_RUNS+=run-test-mmap-4096 # run-test-mmap-16384 run-test-mmap-65536 - # This triggers failures for hppa-linux about 1% of the time # HPPA is the odd target that can't use the sigtramp page; # it requires the full vdso with dwarf2 unwind info. diff --git a/tests/tcg/i386/Makefile.target b/tests/tcg/i386/Makefile.target index 9906f9e116..bbe2c44b2a 100644 --- a/tests/tcg/i386/Makefile.target +++ b/tests/tcg/i386/Makefile.target @@ -71,9 +71,6 @@ endif I386_TESTS:=$(filter-out $(SKIP_I386_TESTS), $(ALL_X86_TESTS)) TESTS=$(MULTIARCH_TESTS) $(I386_TESTS) -# On i386 and x86_64 Linux only supports 4k pages (large pages are a different hack) -EXTRA_RUNS+=run-test-mmap-4096 - sha512-sse: CFLAGS=-msse4.1 -O3 sha512-sse: sha512.c $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS) diff --git a/tests/tcg/m68k/Makefile.target b/tests/tcg/m68k/Makefile.target index 6ff214e60a..33f7b1b127 100644 --- a/tests/tcg/m68k/Makefile.target +++ b/tests/tcg/m68k/Makefile.target @@ -5,6 +5,3 @@ VPATH += $(SRC_PATH)/tests/tcg/m68k TESTS += trap denormal - -# On m68k Linux supports 4k and 8k pages (but 8k is currently broken) -EXTRA_RUNS+=run-test-mmap-4096 # run-test-mmap-8192 diff --git a/tests/tcg/multiarch/Makefile.target b/tests/tcg/multiarch/Makefile.target index e10951a801..f11f3b084d 100644 --- a/tests/tcg/multiarch/Makefile.target +++ b/tests/tcg/multiarch/Makefile.target @@ -51,18 +51,9 @@ run-plugin-vma-pthread-with-%: vma-pthread $(call skip-test, $<, "flaky on CI?") endif -# We define the runner for test-mmap after the individual -# architectures have defined their supported pages sizes. If no -# additional page sizes are defined we only run the default test. 
- -# default case (host page size) run-test-mmap: test-mmap $(call run-test, test-mmap, $(QEMU) $<, $< (default)) -# additional page sizes (defined by each architecture adding to EXTRA_RUNS) -run-test-mmap-%: test-mmap - $(call run-test, test-mmap-$*, $(QEMU) -p $* $<, $< ($* byte pages)) - ifneq ($(GDB),) GDB_SCRIPT=$(SRC_PATH)/tests/guest-debug/run-test.py diff --git a/tests/tcg/multiarch/linux/linux-madvise.c b/tests/tcg/multiarch/linux/linux-madvise.c index 29d0997e68..539fb3b772 100644 --- a/tests/tcg/multiarch/linux/linux-madvise.c +++ b/tests/tcg/multiarch/linux/linux-madvise.c @@ -42,6 +42,8 @@ static void test_file(void) assert(ret == 0); written = write(fd, &c, sizeof(c)); assert(written == sizeof(c)); + ret = ftruncate(fd, pagesize); + assert(ret == 0); page = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE, fd, 0); assert(page != MAP_FAILED); diff --git a/tests/tcg/multiarch/linux/linux-shmat-maps.c b/tests/tcg/multiarch/linux/linux-shmat-maps.c new file mode 100644 index 0000000000..0ccf7a973a --- /dev/null +++ b/tests/tcg/multiarch/linux/linux-shmat-maps.c @@ -0,0 +1,55 @@ +/* + * Test that shmat() does not break /proc/self/maps. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#include <assert.h> +#include <fcntl.h> +#include <stdlib.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <unistd.h> + +int main(void) +{ + char buf[128]; + int err, fd; + int shmid; + ssize_t n; + void *p; + + shmid = shmget(IPC_PRIVATE, 1, IPC_CREAT | 0600); + assert(shmid != -1); + + /* + * The original bug required a non-NULL address, which skipped the + * mmap_find_vma step, which could result in a host mapping smaller + * than the target mapping. Choose an address at random. + */ + p = shmat(shmid, (void *)0x800000, SHM_RND); + if (p == (void *)-1) { + /* + * Because we are now running the testcase for all guests for which + * we have a cross-compiler, the above random address might conflict + * with the guest executable in some way. Rather than stopping, + * continue with a system supplied address, which should never fail. + */ + p = shmat(shmid, NULL, 0); + assert(p != (void *)-1); + } + + fd = open("/proc/self/maps", O_RDONLY); + assert(fd != -1); + do { + n = read(fd, buf, sizeof(buf)); + assert(n >= 0); + } while (n != 0); + close(fd); + + err = shmdt(p); + assert(err == 0); + err = shmctl(shmid, IPC_RMID, NULL); + assert(err == 0); + + return EXIT_SUCCESS; +} diff --git a/tests/tcg/ppc/Makefile.target b/tests/tcg/ppc/Makefile.target deleted file mode 100644 index f5e08c7376..0000000000 --- a/tests/tcg/ppc/Makefile.target +++ /dev/null @@ -1,12 +0,0 @@ -# -*- Mode: makefile -*- -# -# PPC - included from tests/tcg/Makefile -# - -ifneq (,$(findstring 64,$(TARGET_NAME))) -# On PPC64 Linux can be configured with 4k (default) or 64k pages (currently broken) -EXTRA_RUNS+=run-test-mmap-4096 #run-test-mmap-65536 -else -# On PPC32 Linux supports 4K/16K/64K/256K (but currently only 4k works) -EXTRA_RUNS+=run-test-mmap-4096 #run-test-mmap-16384 run-test-mmap-65536 run-test-mmap-262144 -endif diff --git a/tests/tcg/sh4/Makefile.target b/tests/tcg/sh4/Makefile.target index 47c39a44b6..16eaa850a8 100644 --- a/tests/tcg/sh4/Makefile.target +++ b/tests/tcg/sh4/Makefile.target @@ -3,9 +3,6 @@ # SuperH specific tweaks # -# On sh Linux supports 4k, 8k, 16k and 64k pages (but only 4k currently works) -EXTRA_RUNS+=run-test-mmap-4096 # run-test-mmap-8192 run-test-mmap-16384 run-test-mmap-65536 - # This triggers failures for sh4-linux about 10% of the time. 
# Random SIGSEGV at unpredictable guest address, cause unknown. run-signals: signals diff --git a/tests/tcg/sparc64/Makefile.target b/tests/tcg/sparc64/Makefile.target deleted file mode 100644 index 408dace783..0000000000 --- a/tests/tcg/sparc64/Makefile.target +++ /dev/null @@ -1,6 +0,0 @@ -# -*- Mode: makefile -*- -# -# sparc specific tweaks - -# On Sparc64 Linux support 8k pages -EXTRA_RUNS+=run-test-mmap-8192
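The mapped-ram migration cases registered above can be exercised individually once the tree is built. A minimal invocation sketch, assuming a meson build directory named build, an x86_64 system emulator, and the usual qtest conventions (the QTEST_QEMU_BINARY variable and the /x86_64 path prefix are assumptions of this sketch, not something defined by this series):

  # assumed build paths and arch prefix; adjust to the configured build tree
  QTEST_QEMU_BINARY=./build/qemu-system-x86_64 \
      ./build/tests/qtest/migration-test -p /x86_64/migration/multifd/file/mapped-ram

The same -p filter works for the other paths added here, e.g. /x86_64/migration/precopy/file/mapped-ram or the fd-based variant on non-Windows hosts.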