summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--hw/i386/x86.c4
-rw-r--r--hw/nvme/ctrl.c29
-rw-r--r--hw/nvme/nvme.h4
-rw-r--r--include/qemu/async-teardown.h22
-rw-r--r--meson.build1
-rw-r--r--os-posix.c6
-rw-r--r--qemu-options.hx19
-rw-r--r--target/i386/kvm/kvm-stub.c5
-rw-r--r--target/i386/kvm/kvm.c5
-rw-r--r--target/i386/kvm/kvm_i386.h2
-rw-r--r--target/i386/tcg/sysemu/excp_helper.c10
-rw-r--r--target/i386/tcg/translate.c2
-rw-r--r--util/async-teardown.c150
-rw-r--r--util/log.c9
-rw-r--r--util/meson.build1
15 files changed, 245 insertions, 24 deletions
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index bd50a064a3..78cc131926 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -140,6 +140,10 @@ void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
         exit(EXIT_FAILURE);
     }
 
+    if (kvm_enabled()) {
+        kvm_set_max_apic_id(x86ms->apic_id_limit);
+    }
+
     possible_cpus = mc->possible_cpu_arch_ids(ms);
     for (i = 0; i < ms->smp.cpus; i++) {
         x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 87aeba0564..9a9857ccf8 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1401,13 +1401,7 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
 
-    if (req->sq->ioeventfd_enabled) {
-        /* Post CQE directly since we are in main loop thread */
-        nvme_post_cqes(cq);
-    } else {
-        /* Schedule the timer to post CQE later since we are in vcpu thread */
-        timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
-    }
+    qemu_bh_schedule(cq->bh);
 }
 
 static void nvme_process_aers(void *opaque)
@@ -3040,7 +3034,8 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
         goto invalid;
     }
 
-    if (ns->pif && format != 0x1) {
+    if ((ns->pif == 0x0 && format != 0x0) ||
+        (ns->pif != 0x0 && format != 0x1)) {
         status = NVME_INVALID_FORMAT | NVME_DNR;
         goto invalid;
     }
@@ -4252,7 +4247,7 @@ static void nvme_cq_notifier(EventNotifier *e)
         nvme_irq_deassert(n, cq);
     }
 
-    nvme_post_cqes(cq);
+    qemu_bh_schedule(cq->bh);
 }
 
 static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
@@ -4307,7 +4302,7 @@ static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
     uint16_t offset = sq->sqid << 3;
 
     n->sq[sq->sqid] = NULL;
-    timer_free(sq->timer);
+    qemu_bh_delete(sq->bh);
     if (sq->ioeventfd_enabled) {
         memory_region_del_eventfd(&n->iomem,
                                   0x1000 + offset, 4, false, 0, &sq->notifier);
@@ -4381,7 +4376,8 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
         sq->io_req[i].sq = sq;
         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
     }
-    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
+
+    sq->bh = qemu_bh_new(nvme_process_sq, sq);
 
     if (n->dbbuf_enabled) {
         sq->db_addr = n->dbbuf_dbs + (sqid << 3);
@@ -4698,7 +4694,7 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
     uint16_t offset = (cq->cqid << 3) + (1 << 2);
 
     n->cq[cq->cqid] = NULL;
-    timer_free(cq->timer);
+    qemu_bh_delete(cq->bh);
     if (cq->ioeventfd_enabled) {
         memory_region_del_eventfd(&n->iomem,
                                   0x1000 + offset, 4, false, 0, &cq->notifier);
@@ -4771,7 +4767,7 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
         }
     }
     n->cq[cqid] = cq;
-    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
+    cq->bh = qemu_bh_new(nvme_post_cqes, cq);
 }
 
 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
@@ -6913,9 +6909,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
         if (start_sqs) {
             NvmeSQueue *sq;
             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
-                timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+                qemu_bh_schedule(sq->bh);
             }
-            timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+            qemu_bh_schedule(cq->bh);
         }
 
         if (cq->tail == cq->head) {
@@ -6984,7 +6980,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
             pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
                           sizeof(sq->tail));
         }
-        timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+
+        qemu_bh_schedule(sq->bh);
     }
 }
 
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 79f5c281c2..7adf042ec3 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -375,7 +375,7 @@ typedef struct NvmeSQueue {
     uint64_t    dma_addr;
     uint64_t    db_addr;
     uint64_t    ei_addr;
-    QEMUTimer   *timer;
+    QEMUBH      *bh;
     EventNotifier notifier;
     bool        ioeventfd_enabled;
     NvmeRequest *io_req;
@@ -396,7 +396,7 @@ typedef struct NvmeCQueue {
     uint64_t    dma_addr;
     uint64_t    db_addr;
     uint64_t    ei_addr;
-    QEMUTimer   *timer;
+    QEMUBH      *bh;
     EventNotifier notifier;
     bool        ioeventfd_enabled;
     QTAILQ_HEAD(, NvmeSQueue) sq_list;
diff --git a/include/qemu/async-teardown.h b/include/qemu/async-teardown.h
new file mode 100644
index 0000000000..092e7a37e7
--- /dev/null
+++ b/include/qemu/async-teardown.h
@@ -0,0 +1,22 @@
+/*
+ * Asynchronous teardown
+ *
+ * Copyright IBM, Corp. 2022
+ *
+ * Authors:
+ *  Claudio Imbrenda <imbrenda@linux.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at your
+ * option) any later version.  See the COPYING file in the top-level directory.
+ *
+ */
+#ifndef QEMU_ASYNC_TEARDOWN_H
+#define QEMU_ASYNC_TEARDOWN_H
+
+#include "config-host.h"
+
+#ifdef CONFIG_LINUX
+void init_async_teardown(void);
+#endif
+
+#endif
diff --git a/meson.build b/meson.build
index d809d51791..1d448272ab 100644
--- a/meson.build
+++ b/meson.build
@@ -1955,6 +1955,7 @@ if targetos == 'windows'
 endif
 
 # has_function
+config_host_data.set('CONFIG_CLOSE_RANGE', cc.has_function('close_range'))
 config_host_data.set('CONFIG_ACCEPT4', cc.has_function('accept4'))
 config_host_data.set('CONFIG_CLOCK_ADJTIME', cc.has_function('clock_adjtime'))
 config_host_data.set('CONFIG_DUP3', cc.has_function('dup3'))
diff --git a/os-posix.c b/os-posix.c
index 321fc4bd13..4858650c3e 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -39,6 +39,7 @@
 
 #ifdef CONFIG_LINUX
 #include <sys/prctl.h>
+#include "qemu/async-teardown.h"
 #endif
 
 /*
@@ -150,6 +151,11 @@ int os_parse_cmd_args(int index, const char *optarg)
     case QEMU_OPTION_daemonize:
         daemonize = 1;
         break;
+#if defined(CONFIG_LINUX)
+    case QEMU_OPTION_asyncteardown:
+        init_async_teardown();
+        break;
+#endif
     default:
         return -1;
     }
diff --git a/qemu-options.hx b/qemu-options.hx
index ceee0ddc25..911d82afa5 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4790,6 +4790,25 @@ HXCOMM Internal use
 DEF("qtest", HAS_ARG, QEMU_OPTION_qtest, "", QEMU_ARCH_ALL)
 DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", QEMU_ARCH_ALL)
 
+#ifdef __linux__
+DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
+    "-async-teardown enable asynchronous teardown\n",
+    QEMU_ARCH_ALL)
+#endif
+SRST
+``-async-teardown``
+    Enable asynchronous teardown. A new process called "cleanup/<QEMU_PID>"
+    will be created at startup sharing the address space with the main qemu
+    process, using clone. It will wait for the main qemu process to
+    terminate completely, and then exit.
+    This allows qemu to terminate very quickly even if the guest was
+    huge, leaving the teardown of the address space to the cleanup
+    process. Since the cleanup process shares the same cgroups as the
+    main qemu process, accounting is performed correctly. This only
+    works if the cleanup process is not forcefully killed with SIGKILL
+    before the main qemu process has terminated completely.
+ERST
+
 DEF("msg", HAS_ARG, QEMU_OPTION_msg,
     "-msg [timestamp[=on|off]][,guest-name=[on|off]]\n"
     "                control error message format\n"
diff --git a/target/i386/kvm/kvm-stub.c b/target/i386/kvm/kvm-stub.c
index f6e7e4466e..e052f1c7b0 100644
--- a/target/i386/kvm/kvm-stub.c
+++ b/target/i386/kvm/kvm-stub.c
@@ -44,3 +44,8 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
 {
     abort();
 }
+
+void kvm_set_max_apic_id(uint32_t max_apic_id)
+{
+    return;
+}
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 4df0428089..a213209379 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -5723,3 +5723,8 @@ void kvm_arch_accel_class_init(ObjectClass *oc)
                                           "Clock cycles without an event window "
                                           "after which a notification VM exit occurs");
 }
+
+void kvm_set_max_apic_id(uint32_t max_apic_id)
+{
+    kvm_vm_enable_cap(kvm_state, KVM_CAP_MAX_VCPU_ID, 0, max_apic_id);
+}
diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h
index b7c38ba2c4..6a5c24e3dc 100644
--- a/target/i386/kvm/kvm_i386.h
+++ b/target/i386/kvm/kvm_i386.h
@@ -66,4 +66,6 @@ typedef struct kvm_msr_handlers {
 bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
                     QEMUWRMSRHandler *wrmsr);
 
+void kvm_set_max_apic_id(uint32_t max_apic_id);
+
 #endif
diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c
index d51b5d7431..405a5d414a 100644
--- a/target/i386/tcg/sysemu/excp_helper.c
+++ b/target/i386/tcg/sysemu/excp_helper.c
@@ -553,12 +553,12 @@ static bool get_physical_address(CPUX86State *env, vaddr addr,
         break;
 
     default:
-        in.cr3 = env->cr[3];
-        in.mmu_idx = mmu_idx;
-        in.ptw_idx = use_stage2 ? MMU_NESTED_IDX : MMU_PHYS_IDX;
-        in.pg_mode = get_pg_mode(env);
+        if (likely(env->cr[0] & CR0_PG_MASK)) {
+            in.cr3 = env->cr[3];
+            in.mmu_idx = mmu_idx;
+            in.ptw_idx = use_stage2 ? MMU_NESTED_IDX : MMU_PHYS_IDX;
+            in.pg_mode = get_pg_mode(env);
 
-        if (likely(in.pg_mode)) {
             if (in.pg_mode & PG_MODE_LMA) {
                 /* test virtual address sign extension */
                 int shift = in.pg_mode & PG_MODE_LA57 ? 56 : 47;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 0ee548ce56..28a4e6dc1d 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3320,7 +3320,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
 
                 tcg_temp_free(t2);
                 tcg_temp_free(a0);
-                tcg_gen_mov_tl(s->T0, t0);
+                tcg_gen_neg_tl(s->T0, t0);
                 tcg_temp_free(t0);
             } else {
                 tcg_gen_neg_tl(s->T0, s->T0);
diff --git a/util/async-teardown.c b/util/async-teardown.c
new file mode 100644
index 0000000000..62bfce1b3c
--- /dev/null
+++ b/util/async-teardown.c
@@ -0,0 +1,150 @@
+/*
+ * Asynchronous teardown
+ *
+ * Copyright IBM, Corp. 2022
+ *
+ * Authors:
+ *  Claudio Imbrenda <imbrenda@linux.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at your
+ * option) any later version.  See the COPYING file in the top-level directory.
+ *
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <sys/prctl.h>
+#include <signal.h>
+#include <sched.h>
+#include <unistd.h>
+
+#include "qemu/osdep.h"
+#include "qemu/async-teardown.h"
+
+#ifdef _SC_THREAD_STACK_MIN
+#define CLONE_STACK_SIZE sysconf(_SC_THREAD_STACK_MIN)
+#else
+#define CLONE_STACK_SIZE 16384
+#endif
+
+static pid_t the_ppid;
+
+/*
+ * Close all open file descriptors.
+ */
+static void close_all_open_fd(void)
+{
+    struct dirent *de;
+    int fd, dfd;
+    DIR *dir;
+
+#ifdef CONFIG_CLOSE_RANGE
+    int r = close_range(0, ~0U, 0);
+    if (!r) {
+        /* Success, no need to try other ways. */
+        return;
+    }
+#endif
+
+    dir = opendir("/proc/self/fd");
+    if (!dir) {
+        /* If /proc is not mounted, there is nothing that can be done. */
+        return;
+    }
+    /* Avoid closing the directory. */
+    dfd = dirfd(dir);
+
+    for (de = readdir(dir); de; de = readdir(dir)) {
+        fd = atoi(de->d_name);
+        if (fd != dfd) {
+            close(fd);
+        }
+    }
+    closedir(dir);
+}
+
+static void hup_handler(int signal)
+{
+    /* Check every second if this process has been reparented. */
+    while (the_ppid == getppid()) {
+        /* sleep() is safe to use in a signal handler. */
+        sleep(1);
+    }
+
+    /* At this point the parent process has terminated completely. */
+    _exit(0);
+}
+
+static int async_teardown_fn(void *arg)
+{
+    struct sigaction sa = { .sa_handler = hup_handler };
+    sigset_t hup_signal;
+    char name[16];
+
+    /* Set a meaningful name for this process. */
+    snprintf(name, 16, "cleanup/%d", the_ppid);
+    prctl(PR_SET_NAME, (unsigned long)name);
+
+    /*
+     * Close all file descriptors that might have been inherited from the
+     * main qemu process when doing clone, needed to make libvirt happy.
+     * Not using close_range for increased compatibility with older kernels.
+     */
+    close_all_open_fd();
+
+    /* Set up a handler for SIGHUP and unblock SIGHUP. */
+    sigaction(SIGHUP, &sa, NULL);
+    sigemptyset(&hup_signal);
+    sigaddset(&hup_signal, SIGHUP);
+    sigprocmask(SIG_UNBLOCK, &hup_signal, NULL);
+
+    /* Ask to receive SIGHUP when the parent dies. */
+    prctl(PR_SET_PDEATHSIG, SIGHUP);
+
+    /*
+     * Sleep forever, unless the parent process has already terminated. The
+     * only interruption can come from the SIGHUP signal, which in normal
+     * operation is received when the parent process dies.
+     */
+    if (the_ppid == getppid()) {
+        pause();
+    }
+
+    /* At this point the parent process has terminated completely. */
+    _exit(0);
+}
+
+/*
+ * Allocate a new stack of a reasonable size, and return a pointer to its top.
+ */
+static void *new_stack_for_clone(void)
+{
+    size_t stack_size = CLONE_STACK_SIZE;
+    char *stack_ptr;
+
+    /* Allocate a new stack and get a pointer to its top. */
+    stack_ptr = qemu_alloc_stack(&stack_size);
+#if !defined(HOST_HPPA)
+    /* The top is at the end of the area, except on HPPA. */
+    stack_ptr += stack_size;
+#endif
+
+    return stack_ptr;
+}
+
+/*
+ * Block all signals, start (clone) a new process sharing the address space
+ * with qemu (CLONE_VM), then restore signals.
+ */
+void init_async_teardown(void)
+{
+    sigset_t all_signals, old_signals;
+
+    the_ppid = getpid();
+
+    sigfillset(&all_signals);
+    sigprocmask(SIG_BLOCK, &all_signals, &old_signals);
+    clone(async_teardown_fn, new_stack_for_clone(), CLONE_VM, NULL);
+    sigprocmask(SIG_SETMASK, &old_signals, NULL);
+}
diff --git a/util/log.c b/util/log.c
index d6eb0378c3..39866bdaf2 100644
--- a/util/log.c
+++ b/util/log.c
@@ -42,6 +42,7 @@ static QemuMutex global_mutex;
 static char *global_filename;
 static FILE *global_file;
 static __thread FILE *thread_file;
+static __thread Notifier qemu_log_thread_cleanup_notifier;
 
 int qemu_loglevel;
 static bool log_append;
@@ -77,6 +78,12 @@ static int log_thread_id(void)
 #endif
 }
 
+static void qemu_log_thread_cleanup(Notifier *n, void *unused)
+{
+    fclose(thread_file);
+    thread_file = NULL;
+}
+
 /* Lock/unlock output. */
 
 FILE *qemu_log_trylock(void)
@@ -93,6 +100,8 @@ FILE *qemu_log_trylock(void)
                 return NULL;
             }
             thread_file = logfile;
+            qemu_log_thread_cleanup_notifier.notify = qemu_log_thread_cleanup;
+            qemu_thread_atexit_add(&qemu_log_thread_cleanup_notifier);
         } else {
             rcu_read_lock();
             /*
diff --git a/util/meson.build b/util/meson.build
index c0a7bc54d4..59c1f467bb 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -3,6 +3,7 @@ util_ss.add(files('thread-context.c'), numa)
 if not config_host_data.get('CONFIG_ATOMIC64')
   util_ss.add(files('atomic64.c'))
 endif
+util_ss.add(when: 'CONFIG_LINUX', if_true: files('async-teardown.c'))
 util_ss.add(when: 'CONFIG_POSIX', if_true: files('aio-posix.c'))
 util_ss.add(when: 'CONFIG_POSIX', if_true: files('fdmon-poll.c'))
 if config_host_data.get('CONFIG_EPOLL_CREATE1')