diff options
| -rw-r--r-- | hw/i386/x86.c | 4 | ||||
| -rw-r--r-- | hw/nvme/ctrl.c | 29 | ||||
| -rw-r--r-- | hw/nvme/nvme.h | 4 | ||||
| -rw-r--r-- | include/qemu/async-teardown.h | 22 | ||||
| -rw-r--r-- | meson.build | 1 | ||||
| -rw-r--r-- | os-posix.c | 6 | ||||
| -rw-r--r-- | qemu-options.hx | 19 | ||||
| -rw-r--r-- | target/i386/kvm/kvm-stub.c | 5 | ||||
| -rw-r--r-- | target/i386/kvm/kvm.c | 5 | ||||
| -rw-r--r-- | target/i386/kvm/kvm_i386.h | 2 | ||||
| -rw-r--r-- | target/i386/tcg/sysemu/excp_helper.c | 10 | ||||
| -rw-r--r-- | target/i386/tcg/translate.c | 2 | ||||
| -rw-r--r-- | util/async-teardown.c | 150 | ||||
| -rw-r--r-- | util/log.c | 9 | ||||
| -rw-r--r-- | util/meson.build | 1 |
15 files changed, 245 insertions, 24 deletions
diff --git a/hw/i386/x86.c b/hw/i386/x86.c index bd50a064a3..78cc131926 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -140,6 +140,10 @@ void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version) exit(EXIT_FAILURE); } + if (kvm_enabled()) { + kvm_set_max_apic_id(x86ms->apic_id_limit); + } + possible_cpus = mc->possible_cpu_arch_ids(ms); for (i = 0; i < ms->smp.cpus; i++) { x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal); diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 87aeba0564..9a9857ccf8 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1401,13 +1401,7 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); - if (req->sq->ioeventfd_enabled) { - /* Post CQE directly since we are in main loop thread */ - nvme_post_cqes(cq); - } else { - /* Schedule the timer to post CQE later since we are in vcpu thread */ - timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); - } + qemu_bh_schedule(cq->bh); } static void nvme_process_aers(void *opaque) @@ -3040,7 +3034,8 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req) goto invalid; } - if (ns->pif && format != 0x1) { + if ((ns->pif == 0x0 && format != 0x0) || + (ns->pif != 0x0 && format != 0x1)) { status = NVME_INVALID_FORMAT | NVME_DNR; goto invalid; } @@ -4252,7 +4247,7 @@ static void nvme_cq_notifier(EventNotifier *e) nvme_irq_deassert(n, cq); } - nvme_post_cqes(cq); + qemu_bh_schedule(cq->bh); } static int nvme_init_cq_ioeventfd(NvmeCQueue *cq) @@ -4307,7 +4302,7 @@ static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) uint16_t offset = sq->sqid << 3; n->sq[sq->sqid] = NULL; - timer_free(sq->timer); + qemu_bh_delete(sq->bh); if (sq->ioeventfd_enabled) { memory_region_del_eventfd(&n->iomem, 0x1000 + offset, 4, false, 0, &sq->notifier); @@ -4381,7 +4376,8 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr, sq->io_req[i].sq = sq; QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry); } - sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq); + + sq->bh = qemu_bh_new(nvme_process_sq, sq); if (n->dbbuf_enabled) { sq->db_addr = n->dbbuf_dbs + (sqid << 3); @@ -4698,7 +4694,7 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) uint16_t offset = (cq->cqid << 3) + (1 << 2); n->cq[cq->cqid] = NULL; - timer_free(cq->timer); + qemu_bh_delete(cq->bh); if (cq->ioeventfd_enabled) { memory_region_del_eventfd(&n->iomem, 0x1000 + offset, 4, false, 0, &cq->notifier); @@ -4771,7 +4767,7 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, } } n->cq[cqid] = cq; - cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq); + cq->bh = qemu_bh_new(nvme_post_cqes, cq); } static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) @@ -6913,9 +6909,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) if (start_sqs) { NvmeSQueue *sq; QTAILQ_FOREACH(sq, &cq->sq_list, entry) { - timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); + qemu_bh_schedule(sq->bh); } - timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); + qemu_bh_schedule(cq->bh); } if (cq->tail == cq->head) { @@ -6984,7 +6980,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail, sizeof(sq->tail)); } - timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); + + qemu_bh_schedule(sq->bh); } } diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index 79f5c281c2..7adf042ec3 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -375,7 +375,7 @@ typedef struct NvmeSQueue { uint64_t dma_addr; uint64_t db_addr; uint64_t ei_addr; - QEMUTimer *timer; + QEMUBH *bh; EventNotifier notifier; bool ioeventfd_enabled; NvmeRequest *io_req; @@ -396,7 +396,7 @@ typedef struct NvmeCQueue { uint64_t dma_addr; uint64_t db_addr; uint64_t ei_addr; - QEMUTimer *timer; + QEMUBH *bh; EventNotifier notifier; bool ioeventfd_enabled; QTAILQ_HEAD(, NvmeSQueue) sq_list; diff --git a/include/qemu/async-teardown.h b/include/qemu/async-teardown.h new file mode 100644 index 0000000000..092e7a37e7 --- /dev/null +++ b/include/qemu/async-teardown.h @@ -0,0 +1,22 @@ +/* + * Asynchronous teardown + * + * Copyright IBM, Corp. 2022 + * + * Authors: + * Claudio Imbrenda <imbrenda@linux.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at your + * option) any later version. See the COPYING file in the top-level directory. + * + */ +#ifndef QEMU_ASYNC_TEARDOWN_H +#define QEMU_ASYNC_TEARDOWN_H + +#include "config-host.h" + +#ifdef CONFIG_LINUX +void init_async_teardown(void); +#endif + +#endif diff --git a/meson.build b/meson.build index d809d51791..1d448272ab 100644 --- a/meson.build +++ b/meson.build @@ -1955,6 +1955,7 @@ if targetos == 'windows' endif # has_function +config_host_data.set('CONFIG_CLOSE_RANGE', cc.has_function('close_range')) config_host_data.set('CONFIG_ACCEPT4', cc.has_function('accept4')) config_host_data.set('CONFIG_CLOCK_ADJTIME', cc.has_function('clock_adjtime')) config_host_data.set('CONFIG_DUP3', cc.has_function('dup3')) diff --git a/os-posix.c b/os-posix.c index 321fc4bd13..4858650c3e 100644 --- a/os-posix.c +++ b/os-posix.c @@ -39,6 +39,7 @@ #ifdef CONFIG_LINUX #include <sys/prctl.h> +#include "qemu/async-teardown.h" #endif /* @@ -150,6 +151,11 @@ int os_parse_cmd_args(int index, const char *optarg) case QEMU_OPTION_daemonize: daemonize = 1; break; +#if defined(CONFIG_LINUX) + case QEMU_OPTION_asyncteardown: + init_async_teardown(); + break; +#endif default: return -1; } diff --git a/qemu-options.hx b/qemu-options.hx index ceee0ddc25..911d82afa5 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -4790,6 +4790,25 @@ HXCOMM Internal use DEF("qtest", HAS_ARG, QEMU_OPTION_qtest, "", QEMU_ARCH_ALL) DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", QEMU_ARCH_ALL) +#ifdef __linux__ +DEF("async-teardown", 0, QEMU_OPTION_asyncteardown, + "-async-teardown enable asynchronous teardown\n", + QEMU_ARCH_ALL) +#endif +SRST +``-async-teardown`` + Enable asynchronous teardown. A new process called "cleanup/<QEMU_PID>" + will be created at startup sharing the address space with the main qemu + process, using clone. It will wait for the main qemu process to + terminate completely, and then exit. + This allows qemu to terminate very quickly even if the guest was + huge, leaving the teardown of the address space to the cleanup + process. Since the cleanup process shares the same cgroups as the + main qemu process, accounting is performed correctly. This only + works if the cleanup process is not forcefully killed with SIGKILL + before the main qemu process has terminated completely. +ERST + DEF("msg", HAS_ARG, QEMU_OPTION_msg, "-msg [timestamp[=on|off]][,guest-name=[on|off]]\n" " control error message format\n" diff --git a/target/i386/kvm/kvm-stub.c b/target/i386/kvm/kvm-stub.c index f6e7e4466e..e052f1c7b0 100644 --- a/target/i386/kvm/kvm-stub.c +++ b/target/i386/kvm/kvm-stub.c @@ -44,3 +44,8 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp) { abort(); } + +void kvm_set_max_apic_id(uint32_t max_apic_id) +{ + return; +} diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 4df0428089..a213209379 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -5723,3 +5723,8 @@ void kvm_arch_accel_class_init(ObjectClass *oc) "Clock cycles without an event window " "after which a notification VM exit occurs"); } + +void kvm_set_max_apic_id(uint32_t max_apic_id) +{ + kvm_vm_enable_cap(kvm_state, KVM_CAP_MAX_VCPU_ID, 0, max_apic_id); +} diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h index b7c38ba2c4..6a5c24e3dc 100644 --- a/target/i386/kvm/kvm_i386.h +++ b/target/i386/kvm/kvm_i386.h @@ -66,4 +66,6 @@ typedef struct kvm_msr_handlers { bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr, QEMUWRMSRHandler *wrmsr); +void kvm_set_max_apic_id(uint32_t max_apic_id); + #endif diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c index d51b5d7431..405a5d414a 100644 --- a/target/i386/tcg/sysemu/excp_helper.c +++ b/target/i386/tcg/sysemu/excp_helper.c @@ -553,12 +553,12 @@ static bool get_physical_address(CPUX86State *env, vaddr addr, break; default: - in.cr3 = env->cr[3]; - in.mmu_idx = mmu_idx; - in.ptw_idx = use_stage2 ? MMU_NESTED_IDX : MMU_PHYS_IDX; - in.pg_mode = get_pg_mode(env); + if (likely(env->cr[0] & CR0_PG_MASK)) { + in.cr3 = env->cr[3]; + in.mmu_idx = mmu_idx; + in.ptw_idx = use_stage2 ? MMU_NESTED_IDX : MMU_PHYS_IDX; + in.pg_mode = get_pg_mode(env); - if (likely(in.pg_mode)) { if (in.pg_mode & PG_MODE_LMA) { /* test virtual address sign extension */ int shift = in.pg_mode & PG_MODE_LA57 ? 56 : 47; diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 0ee548ce56..28a4e6dc1d 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3320,7 +3320,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) tcg_temp_free(t2); tcg_temp_free(a0); - tcg_gen_mov_tl(s->T0, t0); + tcg_gen_neg_tl(s->T0, t0); tcg_temp_free(t0); } else { tcg_gen_neg_tl(s->T0, s->T0); diff --git a/util/async-teardown.c b/util/async-teardown.c new file mode 100644 index 0000000000..62bfce1b3c --- /dev/null +++ b/util/async-teardown.c @@ -0,0 +1,150 @@ +/* + * Asynchronous teardown + * + * Copyright IBM, Corp. 2022 + * + * Authors: + * Claudio Imbrenda <imbrenda@linux.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at your + * option) any later version. See the COPYING file in the top-level directory. + * + */ +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> +#include <dirent.h> +#include <sys/prctl.h> +#include <signal.h> +#include <sched.h> +#include <unistd.h> + +#include "qemu/osdep.h" +#include "qemu/async-teardown.h" + +#ifdef _SC_THREAD_STACK_MIN +#define CLONE_STACK_SIZE sysconf(_SC_THREAD_STACK_MIN) +#else +#define CLONE_STACK_SIZE 16384 +#endif + +static pid_t the_ppid; + +/* + * Close all open file descriptors. + */ +static void close_all_open_fd(void) +{ + struct dirent *de; + int fd, dfd; + DIR *dir; + +#ifdef CONFIG_CLOSE_RANGE + int r = close_range(0, ~0U, 0); + if (!r) { + /* Success, no need to try other ways. */ + return; + } +#endif + + dir = opendir("/proc/self/fd"); + if (!dir) { + /* If /proc is not mounted, there is nothing that can be done. */ + return; + } + /* Avoid closing the directory. */ + dfd = dirfd(dir); + + for (de = readdir(dir); de; de = readdir(dir)) { + fd = atoi(de->d_name); + if (fd != dfd) { + close(fd); + } + } + closedir(dir); +} + +static void hup_handler(int signal) +{ + /* Check every second if this process has been reparented. */ + while (the_ppid == getppid()) { + /* sleep() is safe to use in a signal handler. */ + sleep(1); + } + + /* At this point the parent process has terminated completely. */ + _exit(0); +} + +static int async_teardown_fn(void *arg) +{ + struct sigaction sa = { .sa_handler = hup_handler }; + sigset_t hup_signal; + char name[16]; + + /* Set a meaningful name for this process. */ + snprintf(name, 16, "cleanup/%d", the_ppid); + prctl(PR_SET_NAME, (unsigned long)name); + + /* + * Close all file descriptors that might have been inherited from the + * main qemu process when doing clone, needed to make libvirt happy. + * Not using close_range for increased compatibility with older kernels. + */ + close_all_open_fd(); + + /* Set up a handler for SIGHUP and unblock SIGHUP. */ + sigaction(SIGHUP, &sa, NULL); + sigemptyset(&hup_signal); + sigaddset(&hup_signal, SIGHUP); + sigprocmask(SIG_UNBLOCK, &hup_signal, NULL); + + /* Ask to receive SIGHUP when the parent dies. */ + prctl(PR_SET_PDEATHSIG, SIGHUP); + + /* + * Sleep forever, unless the parent process has already terminated. The + * only interruption can come from the SIGHUP signal, which in normal + * operation is received when the parent process dies. + */ + if (the_ppid == getppid()) { + pause(); + } + + /* At this point the parent process has terminated completely. */ + _exit(0); +} + +/* + * Allocate a new stack of a reasonable size, and return a pointer to its top. + */ +static void *new_stack_for_clone(void) +{ + size_t stack_size = CLONE_STACK_SIZE; + char *stack_ptr; + + /* Allocate a new stack and get a pointer to its top. */ + stack_ptr = qemu_alloc_stack(&stack_size); +#if !defined(HOST_HPPA) + /* The top is at the end of the area, except on HPPA. */ + stack_ptr += stack_size; +#endif + + return stack_ptr; +} + +/* + * Block all signals, start (clone) a new process sharing the address space + * with qemu (CLONE_VM), then restore signals. + */ +void init_async_teardown(void) +{ + sigset_t all_signals, old_signals; + + the_ppid = getpid(); + + sigfillset(&all_signals); + sigprocmask(SIG_BLOCK, &all_signals, &old_signals); + clone(async_teardown_fn, new_stack_for_clone(), CLONE_VM, NULL); + sigprocmask(SIG_SETMASK, &old_signals, NULL); +} diff --git a/util/log.c b/util/log.c index d6eb0378c3..39866bdaf2 100644 --- a/util/log.c +++ b/util/log.c @@ -42,6 +42,7 @@ static QemuMutex global_mutex; static char *global_filename; static FILE *global_file; static __thread FILE *thread_file; +static __thread Notifier qemu_log_thread_cleanup_notifier; int qemu_loglevel; static bool log_append; @@ -77,6 +78,12 @@ static int log_thread_id(void) #endif } +static void qemu_log_thread_cleanup(Notifier *n, void *unused) +{ + fclose(thread_file); + thread_file = NULL; +} + /* Lock/unlock output. */ FILE *qemu_log_trylock(void) @@ -93,6 +100,8 @@ FILE *qemu_log_trylock(void) return NULL; } thread_file = logfile; + qemu_log_thread_cleanup_notifier.notify = qemu_log_thread_cleanup; + qemu_thread_atexit_add(&qemu_log_thread_cleanup_notifier); } else { rcu_read_lock(); /* diff --git a/util/meson.build b/util/meson.build index c0a7bc54d4..59c1f467bb 100644 --- a/util/meson.build +++ b/util/meson.build @@ -3,6 +3,7 @@ util_ss.add(files('thread-context.c'), numa) if not config_host_data.get('CONFIG_ATOMIC64') util_ss.add(files('atomic64.c')) endif +util_ss.add(when: 'CONFIG_LINUX', if_true: files('async-teardown.c')) util_ss.add(when: 'CONFIG_POSIX', if_true: files('aio-posix.c')) util_ss.add(when: 'CONFIG_POSIX', if_true: files('fdmon-poll.c')) if config_host_data.get('CONFIG_EPOLL_CREATE1') |