summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--accel/stubs/tcg-stub.c5
-rw-r--r--accel/tcg/cputlb.c46
-rw-r--r--accel/tcg/plugin-gen.c4
-rw-r--r--accel/tcg/tb-maint.c59
-rw-r--r--accel/tcg/tcg-all.c21
-rw-r--r--accel/tcg/translate-all.c6
-rw-r--r--gdbstub/system.c5
-rw-r--r--gdbstub/user.c3
-rw-r--r--hw/core/cpu-system.c9
-rw-r--r--hw/pci-host/astro.c27
-rw-r--r--hw/pci-host/dino.c74
-rw-r--r--hw/ppc/spapr_hcall.c4
-rw-r--r--include/exec/cpu-common.h2
-rw-r--r--include/exec/cputlb.h32
-rw-r--r--include/exec/tb-flush.h30
-rw-r--r--include/hw/core/cpu.h22
-rwxr-xr-xlinux-user/aarch64/vdso-be.sobin3224 -> 3320 bytes
-rwxr-xr-xlinux-user/aarch64/vdso-le.sobin3224 -> 3320 bytes
-rw-r--r--linux-user/aarch64/vdso.S2
-rw-r--r--linux-user/alpha/cpu_loop.c5
-rwxr-xr-xlinux-user/arm/vdso-be32.sobin2648 -> 2724 bytes
-rwxr-xr-xlinux-user/arm/vdso-be8.sobin2648 -> 2724 bytes
-rwxr-xr-xlinux-user/arm/vdso-le.sobin2648 -> 2724 bytes
-rw-r--r--linux-user/arm/vdso.S2
-rw-r--r--linux-user/elfload.c7
-rw-r--r--linux-user/gen-vdso-elfn.c.inc7
-rw-r--r--linux-user/gen-vdso.c6
-rw-r--r--linux-user/hppa/vdso.S2
-rwxr-xr-xlinux-user/hppa/vdso.sobin2104 -> 2224 bytes
-rw-r--r--linux-user/i386/vdso.S2
-rwxr-xr-xlinux-user/i386/vdso.sobin2672 -> 2792 bytes
-rw-r--r--linux-user/loader.h2
-rw-r--r--linux-user/loongarch64/vdso.S2
-rwxr-xr-xlinux-user/loongarch64/vdso.sobin3560 -> 3712 bytes
-rw-r--r--linux-user/main.c2
-rw-r--r--linux-user/mmap.c13
-rwxr-xr-xlinux-user/ppc/vdso-32.sobin3020 -> 3140 bytes
-rwxr-xr-xlinux-user/ppc/vdso-64.sobin3896 -> 4048 bytes
-rwxr-xr-xlinux-user/ppc/vdso-64le.sobin3896 -> 4048 bytes
-rw-r--r--linux-user/ppc/vdso.S2
-rw-r--r--linux-user/qemu.h5
-rwxr-xr-xlinux-user/riscv/vdso-32.sobin2980 -> 3124 bytes
-rwxr-xr-xlinux-user/riscv/vdso-64.sobin3944 -> 4104 bytes
-rw-r--r--linux-user/riscv/vdso.S2
-rw-r--r--linux-user/s390x/vdso.S2
-rwxr-xr-xlinux-user/s390x/vdso.sobin3464 -> 3616 bytes
-rw-r--r--linux-user/signal-common.h7
-rw-r--r--linux-user/signal.c2
-rw-r--r--linux-user/syscall.c83
-rw-r--r--linux-user/syscall_defs.h6
-rw-r--r--linux-user/user-internals.h16
-rw-r--r--plugins/core.c6
-rw-r--r--plugins/loader.c3
-rw-r--r--target/alpha/helper.h1
-rw-r--r--target/alpha/sys_helper.c6
-rw-r--r--target/alpha/translate.c21
-rw-r--r--target/hppa/cpu.h30
-rw-r--r--target/riscv/csr.c3
-rw-r--r--target/riscv/tcg/tcg-cpu.c3
-rw-r--r--target/sparc/insns.decode83
-rw-r--r--target/sparc/translate.c77
-rw-r--r--tcg/aarch64/tcg-target.c.inc2
-rw-r--r--tcg/arm/tcg-target.c.inc2
-rw-r--r--tcg/optimize.c3
-rw-r--r--tcg/tcg.c3
-rw-r--r--tests/tcg/multiarch/Makefile.target2
-rw-r--r--tests/tcg/multiarch/tb-link.c67
67 files changed, 542 insertions, 296 deletions
diff --git a/accel/stubs/tcg-stub.c b/accel/stubs/tcg-stub.c
index 3b76b8b17c..77055e3964 100644
--- a/accel/stubs/tcg-stub.c
+++ b/accel/stubs/tcg-stub.c
@@ -17,8 +17,3 @@ G_NORETURN void cpu_loop_exit(CPUState *cpu)
 {
     g_assert_not_reached();
 }
-
-G_NORETURN void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
-{
-    g_assert_not_reached();
-}
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 87e14bde4f..2a6aa01c57 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -129,7 +129,7 @@ static inline uint64_t tlb_addr_write(const CPUTLBEntry *entry)
 static inline uintptr_t tlb_index(CPUState *cpu, uintptr_t mmu_idx,
                                   vaddr addr)
 {
-    uintptr_t size_mask = cpu->neg.tlb.f[mmu_idx].mask >> CPU_TLB_ENTRY_BITS;
+    uintptr_t size_mask = cpu_tlb_fast(cpu, mmu_idx)->mask >> CPU_TLB_ENTRY_BITS;
 
     return (addr >> TARGET_PAGE_BITS) & size_mask;
 }
@@ -138,7 +138,7 @@ static inline uintptr_t tlb_index(CPUState *cpu, uintptr_t mmu_idx,
 static inline CPUTLBEntry *tlb_entry(CPUState *cpu, uintptr_t mmu_idx,
                                      vaddr addr)
 {
-    return &cpu->neg.tlb.f[mmu_idx].table[tlb_index(cpu, mmu_idx, addr)];
+    return &cpu_tlb_fast(cpu, mmu_idx)->table[tlb_index(cpu, mmu_idx, addr)];
 }
 
 static void tlb_window_reset(CPUTLBDesc *desc, int64_t ns,
@@ -292,7 +292,7 @@ static void tlb_flush_one_mmuidx_locked(CPUState *cpu, int mmu_idx,
                                         int64_t now)
 {
     CPUTLBDesc *desc = &cpu->neg.tlb.d[mmu_idx];
-    CPUTLBDescFast *fast = &cpu->neg.tlb.f[mmu_idx];
+    CPUTLBDescFast *fast = cpu_tlb_fast(cpu, mmu_idx);
 
     tlb_mmu_resize_locked(desc, fast, now);
     tlb_mmu_flush_locked(desc, fast);
@@ -331,7 +331,7 @@ void tlb_init(CPUState *cpu)
     cpu->neg.tlb.c.dirty = 0;
 
     for (i = 0; i < NB_MMU_MODES; i++) {
-        tlb_mmu_init(&cpu->neg.tlb.d[i], &cpu->neg.tlb.f[i], now);
+        tlb_mmu_init(&cpu->neg.tlb.d[i], cpu_tlb_fast(cpu, i), now);
     }
 }
 
@@ -342,7 +342,7 @@ void tlb_destroy(CPUState *cpu)
     qemu_spin_destroy(&cpu->neg.tlb.c.lock);
     for (i = 0; i < NB_MMU_MODES; i++) {
         CPUTLBDesc *desc = &cpu->neg.tlb.d[i];
-        CPUTLBDescFast *fast = &cpu->neg.tlb.f[i];
+        CPUTLBDescFast *fast = cpu_tlb_fast(cpu, i);
 
         g_free(fast->table);
         g_free(desc->fulltlb);
@@ -370,8 +370,8 @@ static void flush_all_helper(CPUState *src, run_on_cpu_func fn,
 
 static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 {
-    uint16_t asked = data.host_int;
-    uint16_t all_dirty, work, to_clean;
+    MMUIdxMap asked = data.host_int;
+    MMUIdxMap all_dirty, work, to_clean;
     int64_t now = get_clock_realtime();
 
     assert_cpu_is_self(cpu);
@@ -408,7 +408,7 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
     }
 }
 
-void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
+void tlb_flush_by_mmuidx(CPUState *cpu, MMUIdxMap idxmap)
 {
     tlb_debug("mmu_idx: 0x%" PRIx16 "\n", idxmap);
 
@@ -422,7 +422,7 @@ void tlb_flush(CPUState *cpu)
     tlb_flush_by_mmuidx(cpu, ALL_MMUIDX_BITS);
 }
 
-void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *src_cpu, uint16_t idxmap)
+void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *src_cpu, MMUIdxMap idxmap)
 {
     const run_on_cpu_func fn = tlb_flush_by_mmuidx_async_work;
 
@@ -531,7 +531,7 @@ static void tlb_flush_page_locked(CPUState *cpu, int midx, vaddr page)
  */
 static void tlb_flush_page_by_mmuidx_async_0(CPUState *cpu,
                                              vaddr addr,
-                                             uint16_t idxmap)
+                                             MMUIdxMap idxmap)
 {
     int mmu_idx;
 
@@ -570,14 +570,14 @@ static void tlb_flush_page_by_mmuidx_async_1(CPUState *cpu,
 {
     vaddr addr_and_idxmap = data.target_ptr;
     vaddr addr = addr_and_idxmap & TARGET_PAGE_MASK;
-    uint16_t idxmap = addr_and_idxmap & ~TARGET_PAGE_MASK;
+    MMUIdxMap idxmap = addr_and_idxmap & ~TARGET_PAGE_MASK;
 
     tlb_flush_page_by_mmuidx_async_0(cpu, addr, idxmap);
 }
 
 typedef struct {
     vaddr addr;
-    uint16_t idxmap;
+    MMUIdxMap idxmap;
 } TLBFlushPageByMMUIdxData;
 
 /**
@@ -599,7 +599,7 @@ static void tlb_flush_page_by_mmuidx_async_2(CPUState *cpu,
     g_free(d);
 }
 
-void tlb_flush_page_by_mmuidx(CPUState *cpu, vaddr addr, uint16_t idxmap)
+void tlb_flush_page_by_mmuidx(CPUState *cpu, vaddr addr, MMUIdxMap idxmap)
 {
     tlb_debug("addr: %016" VADDR_PRIx " mmu_idx:%" PRIx16 "\n", addr, idxmap);
 
@@ -618,7 +618,7 @@ void tlb_flush_page(CPUState *cpu, vaddr addr)
 
 void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
                                               vaddr addr,
-                                              uint16_t idxmap)
+                                              MMUIdxMap idxmap)
 {
     tlb_debug("addr: %016" VADDR_PRIx " mmu_idx:%"PRIx16"\n", addr, idxmap);
 
@@ -667,7 +667,7 @@ static void tlb_flush_range_locked(CPUState *cpu, int midx,
                                    unsigned bits)
 {
     CPUTLBDesc *d = &cpu->neg.tlb.d[midx];
-    CPUTLBDescFast *f = &cpu->neg.tlb.f[midx];
+    CPUTLBDescFast *f = cpu_tlb_fast(cpu, midx);
     vaddr mask = MAKE_64BIT_MASK(0, bits);
 
     /*
@@ -715,8 +715,8 @@ static void tlb_flush_range_locked(CPUState *cpu, int midx,
 typedef struct {
     vaddr addr;
     vaddr len;
-    uint16_t idxmap;
-    uint16_t bits;
+    MMUIdxMap idxmap;
+    unsigned bits;
 } TLBFlushRangeData;
 
 static void tlb_flush_range_by_mmuidx_async_0(CPUState *cpu,
@@ -766,7 +766,7 @@ static void tlb_flush_range_by_mmuidx_async_1(CPUState *cpu,
 }
 
 void tlb_flush_range_by_mmuidx(CPUState *cpu, vaddr addr,
-                               vaddr len, uint16_t idxmap,
+                               vaddr len, MMUIdxMap idxmap,
                                unsigned bits)
 {
     TLBFlushRangeData d;
@@ -797,7 +797,7 @@ void tlb_flush_range_by_mmuidx(CPUState *cpu, vaddr addr,
 }
 
 void tlb_flush_page_bits_by_mmuidx(CPUState *cpu, vaddr addr,
-                                   uint16_t idxmap, unsigned bits)
+                                   MMUIdxMap idxmap, unsigned bits)
 {
     tlb_flush_range_by_mmuidx(cpu, addr, TARGET_PAGE_SIZE, idxmap, bits);
 }
@@ -805,7 +805,7 @@ void tlb_flush_page_bits_by_mmuidx(CPUState *cpu, vaddr addr,
 void tlb_flush_range_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
                                                vaddr addr,
                                                vaddr len,
-                                               uint16_t idxmap,
+                                               MMUIdxMap idxmap,
                                                unsigned bits)
 {
     TLBFlushRangeData d, *p;
@@ -847,7 +847,7 @@ void tlb_flush_range_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
 
 void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
                                                    vaddr addr,
-                                                   uint16_t idxmap,
+                                                   MMUIdxMap idxmap,
                                                    unsigned bits)
 {
     tlb_flush_range_by_mmuidx_all_cpus_synced(src_cpu, addr, TARGET_PAGE_SIZE,
@@ -923,7 +923,7 @@ void tlb_reset_dirty(CPUState *cpu, uintptr_t start, uintptr_t length)
     qemu_spin_lock(&cpu->neg.tlb.c.lock);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         CPUTLBDesc *desc = &cpu->neg.tlb.d[mmu_idx];
-        CPUTLBDescFast *fast = &cpu->neg.tlb.f[mmu_idx];
+        CPUTLBDescFast *fast = cpu_tlb_fast(cpu, mmu_idx);
         unsigned int n = tlb_n_entries(fast);
         unsigned int i;
 
@@ -1316,7 +1316,7 @@ static bool victim_tlb_hit(CPUState *cpu, size_t mmu_idx, size_t index,
 
         if (cmp == page) {
             /* Found entry in victim tlb, swap tlb and iotlb.  */
-            CPUTLBEntry tmptlb, *tlb = &cpu->neg.tlb.f[mmu_idx].table[index];
+            CPUTLBEntry tmptlb, *tlb = &cpu_tlb_fast(cpu, mmu_idx)->table[index];
 
             qemu_spin_lock(&cpu->neg.tlb.c.lock);
             copy_tlb_helper_locked(&tmptlb, tlb);
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index 9920381a84..1ffcb4b2d2 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -102,8 +102,8 @@ static TCGv_i32 gen_cpu_index(void)
     /*
      * Optimize when we run with a single vcpu. All values using cpu_index,
      * including scoreboard index, will be optimized out.
-     * User-mode calls tb_flush when setting this flag. In system-mode, all
-     * vcpus are created before generating code.
+     * User-mode flushes all TBs when setting this flag.
+     * In system-mode, all vcpus are created before generating code.
      */
     if (!tcg_cflags_has(current_cpu, CF_PARALLEL)) {
         return tcg_constant_i32(current_cpu->cpu_index);
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
index 0048316f99..5a8d0784e7 100644
--- a/accel/tcg/tb-maint.c
+++ b/accel/tcg/tb-maint.c
@@ -36,6 +36,9 @@
 #include "internal-common.h"
 #ifdef CONFIG_USER_ONLY
 #include "user/page-protection.h"
+#define runstate_is_running()  true
+#else
+#include "system/runstate.h"
 #endif
 
 
@@ -88,7 +91,10 @@ static IntervalTreeRoot tb_root;
 
 static void tb_remove_all(void)
 {
-    assert_memory_lock();
+    /*
+     * Only called from tb_flush__exclusive_or_serial, where we have already
+     * asserted that we're in an exclusive state.
+     */
     memset(&tb_root, 0, sizeof(tb_root));
 }
 
@@ -756,17 +762,19 @@ static void tb_remove(TranslationBlock *tb)
 }
 #endif /* CONFIG_USER_ONLY */
 
-/* flush all the translation blocks */
-static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
+/*
+ * Flush all the translation blocks.
+ * Must be called from a context in which no cpus are running,
+ * e.g. start_exclusive() or vm_stop().
+ */
+void tb_flush__exclusive_or_serial(void)
 {
-    bool did_flush = false;
+    CPUState *cpu;
 
-    mmap_lock();
-    /* If it is already been done on request of another CPU, just retry. */
-    if (tb_ctx.tb_flush_count != tb_flush_count.host_int) {
-        goto done;
-    }
-    did_flush = true;
+    assert(tcg_enabled());
+    /* Note that cpu_in_serial_context checks cpu_in_exclusive_context. */
+    assert(!runstate_is_running() ||
+           (current_cpu && cpu_in_serial_context(current_cpu)));
 
     CPU_FOREACH(cpu) {
         tcg_flush_jmp_cache(cpu);
@@ -778,25 +786,23 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
     tcg_region_reset_all();
     /* XXX: flush processor icache at this point if cache flush is expensive */
     qatomic_inc(&tb_ctx.tb_flush_count);
+    qemu_plugin_flush_cb();
+}
 
-done:
-    mmap_unlock();
-    if (did_flush) {
-        qemu_plugin_flush_cb();
+static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
+{
+    /* If it is already been done on request of another CPU, just retry. */
+    if (tb_ctx.tb_flush_count == tb_flush_count.host_int) {
+        tb_flush__exclusive_or_serial();
     }
 }
 
-void tb_flush(CPUState *cpu)
+void queue_tb_flush(CPUState *cs)
 {
     if (tcg_enabled()) {
         unsigned tb_flush_count = qatomic_read(&tb_ctx.tb_flush_count);
-
-        if (cpu_in_serial_context(cpu)) {
-            do_tb_flush(cpu, RUN_ON_CPU_HOST_INT(tb_flush_count));
-        } else {
-            async_safe_run_on_cpu(cpu, do_tb_flush,
-                                  RUN_ON_CPU_HOST_INT(tb_flush_count));
-        }
+        async_safe_run_on_cpu(cs, do_tb_flush,
+                              RUN_ON_CPU_HOST_INT(tb_flush_count));
     }
 }
 
@@ -836,6 +842,14 @@ static inline void tb_remove_from_jmp_list(TranslationBlock *orig, int n_orig)
      * We first acquired the lock, and since the destination pointer matches,
      * we know for sure that @orig is in the jmp list.
      */
+    if (dest == orig) {
+        /*
+         * In the case of a TB that links to itself, removing the entry
+         * from the list means that it won't be present later during
+         * tb_jmp_unlink -- unlink now.
+         */
+        tb_reset_jump(orig, n_orig);
+    }
     pprev = &dest->jmp_list_head;
     TB_FOR_EACH_JMP(dest, tb, n) {
         if (tb == orig && n == n_orig) {
@@ -1154,7 +1168,6 @@ tb_invalidate_phys_page_range__locked(CPUState *cpu,
         page_collection_unlock(pages);
         /* Force execution of one insn next time.  */
         cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu);
-        mmap_unlock();
         cpu_loop_exit_noexc(cpu);
     }
 }
diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
index 5125e1a4e2..18ea0c58b0 100644
--- a/accel/tcg/tcg-all.c
+++ b/accel/tcg/tcg-all.c
@@ -38,6 +38,8 @@
 #include "qemu/target-info.h"
 #ifndef CONFIG_USER_ONLY
 #include "hw/boards.h"
+#include "exec/tb-flush.h"
+#include "system/runstate.h"
 #endif
 #include "accel/accel-ops.h"
 #include "accel/accel-cpu-ops.h"
@@ -82,6 +84,23 @@ static void tcg_accel_instance_init(Object *obj)
 
 bool one_insn_per_tb;
 
+#ifndef CONFIG_USER_ONLY
+static void tcg_vm_change_state(void *opaque, bool running, RunState state)
+{
+    if (state == RUN_STATE_RESTORE_VM) {
+        /*
+         * loadvm will update the content of RAM, bypassing the usual
+         * mechanisms that ensure we flush TBs for writes to memory
+         * we've translated code from, so we must flush all TBs.
+         *
+         * vm_stop() has just stopped all cpus, so we are exclusive.
+         */
+        assert(!running);
+        tb_flush__exclusive_or_serial();
+    }
+}
+#endif
+
 static int tcg_init_machine(AccelState *as, MachineState *ms)
 {
     TCGState *s = TCG_STATE(as);
@@ -124,6 +143,8 @@ static int tcg_init_machine(AccelState *as, MachineState *ms)
     default:
         g_assert_not_reached();
     }
+
+    qemu_add_vm_change_state_handler(tcg_vm_change_state, NULL);
 #endif
 
     tcg_allowed = true;
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index d468667b0d..da9d7f1675 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -289,7 +289,11 @@ TranslationBlock *tb_gen_code(CPUState *cpu, TCGTBCPUState s)
     tb = tcg_tb_alloc(tcg_ctx);
     if (unlikely(!tb)) {
         /* flush must be done */
-        tb_flush(cpu);
+        if (cpu_in_serial_context(cpu)) {
+            tb_flush__exclusive_or_serial();
+            goto buffer_overflow;
+        }
+        queue_tb_flush(cpu);
         mmap_unlock();
         /* Make the execution loop process the flush as soon as possible.  */
         cpu->exception_index = EXCP_INTERRUPT;
diff --git a/gdbstub/system.c b/gdbstub/system.c
index 5be0d3c58c..5221c579d9 100644
--- a/gdbstub/system.c
+++ b/gdbstub/system.c
@@ -18,13 +18,11 @@
 #include "gdbstub/syscalls.h"
 #include "gdbstub/commands.h"
 #include "exec/hwaddr.h"
-#include "exec/tb-flush.h"
 #include "accel/accel-ops.h"
 #include "accel/accel-cpu-ops.h"
 #include "system/cpus.h"
 #include "system/runstate.h"
 #include "system/replay.h"
-#include "system/tcg.h"
 #include "hw/core/cpu.h"
 #include "hw/cpu/cluster.h"
 #include "hw/boards.h"
@@ -174,9 +172,6 @@ static void gdb_vm_state_change(void *opaque, bool running, RunState state)
         } else {
             trace_gdbstub_hit_break();
         }
-        if (tcg_enabled()) {
-            tb_flush(cpu);
-        }
         ret = GDB_SIGNAL_TRAP;
         break;
     case RUN_STATE_PAUSED:
diff --git a/gdbstub/user.c b/gdbstub/user.c
index 67403e5a25..2e14ded3f0 100644
--- a/gdbstub/user.c
+++ b/gdbstub/user.c
@@ -15,7 +15,6 @@
 #include "qemu/sockets.h"
 #include "qapi/error.h"
 #include "exec/hwaddr.h"
-#include "exec/tb-flush.h"
 #include "exec/gdbstub.h"
 #include "gdbstub/commands.h"
 #include "gdbstub/syscalls.h"
@@ -220,7 +219,6 @@ int gdb_handlesig(CPUState *cpu, int sig, const char *reason, void *siginfo,
 
     /* disable single step if it was enabled */
     cpu_single_step(cpu, 0);
-    tb_flush(cpu);
 
     if (sig != 0) {
         gdb_set_stop_cpu(cpu);
@@ -539,7 +537,6 @@ static void disable_gdbstub(CPUState *thread_cpu)
         /* no cpu_watchpoint_remove_all for user-mode */
         cpu_single_step(cpu, 0);
     }
-    tb_flush(thread_cpu);
 }
 
 void gdbserver_fork_end(CPUState *cpu, pid_t pid)
diff --git a/hw/core/cpu-system.c b/hw/core/cpu-system.c
index 09c928c1f9..f601a083d1 100644
--- a/hw/core/cpu-system.c
+++ b/hw/core/cpu-system.c
@@ -23,7 +23,6 @@
 #include "system/address-spaces.h"
 #include "exec/cputlb.h"
 #include "system/memory.h"
-#include "exec/tb-flush.h"
 #include "qemu/target-info.h"
 #include "hw/qdev-core.h"
 #include "hw/qdev-properties.h"
@@ -207,14 +206,6 @@ static int cpu_common_post_load(void *opaque, int version_id)
         cpu_reset_interrupt(cpu, 0x01);
 
         tlb_flush(cpu);
-
-        /*
-         * loadvm has just updated the content of RAM, bypassing the
-         * usual mechanisms that ensure we flush TBs for writes to
-         * memory we've translated code from. So we must flush all TBs,
-         * which will now be stale.
-         */
-        tb_flush(cpu);
     }
 
     return 0;
diff --git a/hw/pci-host/astro.c b/hw/pci-host/astro.c
index 859e308c57..1024ede7b6 100644
--- a/hw/pci-host/astro.c
+++ b/hw/pci-host/astro.c
@@ -424,22 +424,23 @@ static void elroy_reset(DeviceState *dev)
     }
 }
 
-static void elroy_pcihost_init(Object *obj)
+static void elroy_pcihost_realize(DeviceState *dev, Error **errp)
 {
-    ElroyState *s = ELROY_PCI_HOST_BRIDGE(obj);
-    PCIHostState *phb = PCI_HOST_BRIDGE(obj);
-    SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
+    ElroyState *s = ELROY_PCI_HOST_BRIDGE(dev);
+    PCIHostState *phb = PCI_HOST_BRIDGE(dev);
+    SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
+    Object *obj = OBJECT(s);
 
     /* Elroy config access from CPU.  */
-    memory_region_init_io(&s->this_mem, OBJECT(s), &elroy_chip_ops,
+    memory_region_init_io(&s->this_mem, obj, &elroy_chip_ops,
                           s, "elroy", 0x2000);
 
     /* Elroy PCI config. */
-    memory_region_init_io(&phb->conf_mem, OBJECT(phb),
-                          &elroy_config_addr_ops, DEVICE(s),
+    memory_region_init_io(&phb->conf_mem, obj,
+                          &elroy_config_addr_ops, dev,
                           "pci-conf-idx", 8);
-    memory_region_init_io(&phb->data_mem, OBJECT(phb),
-                          &elroy_config_data_ops, DEVICE(s),
+    memory_region_init_io(&phb->data_mem, obj,
+                          &elroy_config_data_ops, dev,
                           "pci-conf-data", 8);
     memory_region_add_subregion(&s->this_mem, 0x40,
                                 &phb->conf_mem);
@@ -447,8 +448,8 @@ static void elroy_pcihost_init(Object *obj)
                                 &phb->data_mem);
 
     /* Elroy PCI bus memory.  */
-    memory_region_init(&s->pci_mmio, OBJECT(s), "pci-mmio", UINT64_MAX);
-    memory_region_init_io(&s->pci_io, OBJECT(s), &unassigned_io_ops, obj,
+    memory_region_init(&s->pci_mmio, obj, "pci-mmio", UINT64_MAX);
+    memory_region_init_io(&s->pci_io, obj, &unassigned_io_ops, obj,
                             "pci-isa-mmio",
                             ((uint32_t) IOS_DIST_BASE_SIZE) / ROPES_PER_IOC);
 
@@ -459,7 +460,7 @@ static void elroy_pcihost_init(Object *obj)
 
     sysbus_init_mmio(sbd, &s->this_mem);
 
-    qdev_init_gpio_in(DEVICE(obj), elroy_set_irq, ELROY_IRQS);
+    qdev_init_gpio_in(dev, elroy_set_irq, ELROY_IRQS);
 }
 
 static const VMStateDescription vmstate_elroy = {
@@ -487,6 +488,7 @@ static void elroy_pcihost_class_init(ObjectClass *klass, const void *data)
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     device_class_set_legacy_reset(dc, elroy_reset);
+    dc->realize = elroy_pcihost_realize;
     dc->vmsd = &vmstate_elroy;
     dc->user_creatable = false;
 }
@@ -494,7 +496,6 @@ static void elroy_pcihost_class_init(ObjectClass *klass, const void *data)
 static const TypeInfo elroy_pcihost_info = {
     .name          = TYPE_ELROY_PCI_HOST_BRIDGE,
     .parent        = TYPE_PCI_HOST_BRIDGE,
-    .instance_init = elroy_pcihost_init,
     .instance_size = sizeof(ElroyState),
     .class_init    = elroy_pcihost_class_init,
 };
diff --git a/hw/pci-host/dino.c b/hw/pci-host/dino.c
index 11b353be2e..924053499c 100644
--- a/hw/pci-host/dino.c
+++ b/hw/pci-host/dino.c
@@ -413,43 +413,7 @@ static void dino_pcihost_reset(DeviceState *dev)
 static void dino_pcihost_realize(DeviceState *dev, Error **errp)
 {
     DinoState *s = DINO_PCI_HOST_BRIDGE(dev);
-
-    /* Set up PCI view of memory: Bus master address space.  */
-    memory_region_init(&s->bm, OBJECT(s), "bm-dino", 4 * GiB);
-    memory_region_init_alias(&s->bm_ram_alias, OBJECT(s),
-                             "bm-system", s->memory_as, 0,
-                             0xf0000000 + DINO_MEM_CHUNK_SIZE);
-    memory_region_init_alias(&s->bm_pci_alias, OBJECT(s),
-                             "bm-pci", &s->pci_mem,
-                             0xf0000000 + DINO_MEM_CHUNK_SIZE,
-                             30 * DINO_MEM_CHUNK_SIZE);
-    memory_region_init_alias(&s->bm_cpu_alias, OBJECT(s),
-                             "bm-cpu", s->memory_as, 0xfff00000,
-                             0xfffff);
-    memory_region_add_subregion(&s->bm, 0,
-                                &s->bm_ram_alias);
-    memory_region_add_subregion(&s->bm,
-                                0xf0000000 + DINO_MEM_CHUNK_SIZE,
-                                &s->bm_pci_alias);
-    memory_region_add_subregion(&s->bm, 0xfff00000,
-                                &s->bm_cpu_alias);
-
-    address_space_init(&s->bm_as, &s->bm, "pci-bm");
-}
-
-static void dino_pcihost_unrealize(DeviceState *dev)
-{
-    DinoState *s = DINO_PCI_HOST_BRIDGE(dev);
-
-    address_space_destroy(&s->bm_as);
-}
-
-static void dino_pcihost_init(Object *obj)
-{
-    DinoState *s = DINO_PCI_HOST_BRIDGE(obj);
-    PCIHostState *phb = PCI_HOST_BRIDGE(obj);
-    SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
-    int i;
+    PCIHostState *phb = PCI_HOST_BRIDGE(dev);
 
     /* Dino PCI access from main memory.  */
     memory_region_init_io(&s->this_mem, OBJECT(s), &dino_chip_ops,
@@ -476,7 +440,7 @@ static void dino_pcihost_init(Object *obj)
                                      PCI_DEVFN(0, 0), 32, TYPE_PCI_BUS);
 
     /* Set up windows into PCI bus memory.  */
-    for (i = 1; i < 31; i++) {
+    for (int i = 1; i < 31; i++) {
         uint32_t addr = 0xf0000000 + i * DINO_MEM_CHUNK_SIZE;
         char *name = g_strdup_printf("PCI Outbound Window %d", i);
         memory_region_init_alias(&s->pci_mem_alias[i], OBJECT(s),
@@ -487,9 +451,38 @@ static void dino_pcihost_init(Object *obj)
 
     pci_setup_iommu(phb->bus, &dino_iommu_ops, s);
 
-    sysbus_init_mmio(sbd, &s->this_mem);
+    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->this_mem);
 
-    qdev_init_gpio_in(DEVICE(obj), dino_set_irq, DINO_IRQS);
+    qdev_init_gpio_in(dev, dino_set_irq, DINO_IRQS);
+
+    /* Set up PCI view of memory: Bus master address space.  */
+    memory_region_init(&s->bm, OBJECT(s), "bm-dino", 4 * GiB);
+    memory_region_init_alias(&s->bm_ram_alias, OBJECT(s),
+                             "bm-system", s->memory_as, 0,
+                             0xf0000000 + DINO_MEM_CHUNK_SIZE);
+    memory_region_init_alias(&s->bm_pci_alias, OBJECT(s),
+                             "bm-pci", &s->pci_mem,
+                             0xf0000000 + DINO_MEM_CHUNK_SIZE,
+                             30 * DINO_MEM_CHUNK_SIZE);
+    memory_region_init_alias(&s->bm_cpu_alias, OBJECT(s),
+                             "bm-cpu", s->memory_as, 0xfff00000,
+                             0xfffff);
+    memory_region_add_subregion(&s->bm, 0,
+                                &s->bm_ram_alias);
+    memory_region_add_subregion(&s->bm,
+                                0xf0000000 + DINO_MEM_CHUNK_SIZE,
+                                &s->bm_pci_alias);
+    memory_region_add_subregion(&s->bm, 0xfff00000,
+                                &s->bm_cpu_alias);
+
+    address_space_init(&s->bm_as, &s->bm, "pci-bm");
+}
+
+static void dino_pcihost_unrealize(DeviceState *dev)
+{
+    DinoState *s = DINO_PCI_HOST_BRIDGE(dev);
+
+    address_space_destroy(&s->bm_as);
 }
 
 static const Property dino_pcihost_properties[] = {
@@ -511,7 +504,6 @@ static void dino_pcihost_class_init(ObjectClass *klass, const void *data)
 static const TypeInfo dino_pcihost_info = {
     .name          = TYPE_DINO_PCI_HOST_BRIDGE,
     .parent        = TYPE_PCI_HOST_BRIDGE,
-    .instance_init = dino_pcihost_init,
     .instance_size = sizeof(DinoState),
     .class_init    = dino_pcihost_class_init,
 };
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index c594d4b916..8c1e0a4817 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -8,7 +8,7 @@
 #include "qemu/main-loop.h"
 #include "qemu/module.h"
 #include "qemu/error-report.h"
-#include "exec/tb-flush.h"
+#include "exec/translation-block.h"
 #include "exec/target_page.h"
 #include "helper_regs.h"
 #include "hw/ppc/ppc.h"
@@ -301,7 +301,7 @@ static target_ulong h_page_init(PowerPCCPU *cpu, SpaprMachineState *spapr,
         if (kvm_enabled()) {
             kvmppc_icbi_range(cpu, pdst, len);
         } else if (tcg_enabled()) {
-            tb_flush(CPU(cpu));
+            tb_invalidate_phys_range(CPU(cpu), dst, dst + len - 1);
         } else {
             g_assert_not_reached();
         }
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 9b658a3f48..f373781ae0 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -221,9 +221,9 @@ static inline bool cpu_loop_exit_requested(CPUState *cpu)
 
 G_NORETURN void cpu_loop_exit_noexc(CPUState *cpu);
 G_NORETURN void cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc);
+G_NORETURN void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc);
 #endif /* CONFIG_TCG */
 G_NORETURN void cpu_loop_exit(CPUState *cpu);
-G_NORETURN void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc);
 
 /* accel/tcg/cpu-exec.c */
 int cpu_exec(CPUState *cpu);
diff --git a/include/exec/cputlb.h b/include/exec/cputlb.h
index 03ed7e2165..9bec0e7890 100644
--- a/include/exec/cputlb.h
+++ b/include/exec/cputlb.h
@@ -150,7 +150,7 @@ void tlb_flush_all_cpus_synced(CPUState *src_cpu);
  * MMU indexes.
  */
 void tlb_flush_page_by_mmuidx(CPUState *cpu, vaddr addr,
-                              uint16_t idxmap);
+                              MMUIdxMap idxmap);
 
 /**
  * tlb_flush_page_by_mmuidx_all_cpus_synced:
@@ -165,7 +165,7 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, vaddr addr,
  * translations using the flushed TLBs.
  */
 void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *cpu, vaddr addr,
-                                              uint16_t idxmap);
+                                              MMUIdxMap idxmap);
 
 /**
  * tlb_flush_by_mmuidx:
@@ -176,7 +176,7 @@ void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *cpu, vaddr addr,
  * Flush all entries from the TLB of the specified CPU, for the specified
  * MMU indexes.
  */
-void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap);
+void tlb_flush_by_mmuidx(CPUState *cpu, MMUIdxMap idxmap);
 
 /**
  * tlb_flush_by_mmuidx_all_cpus_synced:
@@ -189,7 +189,7 @@ void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap);
  * When this function returns, no CPUs will subsequently perform
  * translations using the flushed TLBs.
  */
-void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *cpu, uint16_t idxmap);
+void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *cpu, MMUIdxMap idxmap);
 
 /**
  * tlb_flush_page_bits_by_mmuidx
@@ -201,11 +201,11 @@ void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *cpu, uint16_t idxmap);
  * Similar to tlb_flush_page_mask, but with a bitmap of indexes.
  */
 void tlb_flush_page_bits_by_mmuidx(CPUState *cpu, vaddr addr,
-                                   uint16_t idxmap, unsigned bits);
+                                   MMUIdxMap idxmap, unsigned bits);
 
 /* Similarly, with broadcast and syncing. */
 void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *cpu, vaddr addr,
-                                                   uint16_t idxmap,
+                                                   MMUIdxMap idxmap,
                                                    unsigned bits);
 
 /**
@@ -220,14 +220,14 @@ void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *cpu, vaddr addr,
  * comparing only the low @bits worth of each virtual page.
  */
 void tlb_flush_range_by_mmuidx(CPUState *cpu, vaddr addr,
-                               vaddr len, uint16_t idxmap,
+                               vaddr len, MMUIdxMap idxmap,
                                unsigned bits);
 
 /* Similarly, with broadcast and syncing. */
 void tlb_flush_range_by_mmuidx_all_cpus_synced(CPUState *cpu,
                                                vaddr addr,
                                                vaddr len,
-                                               uint16_t idxmap,
+                                               MMUIdxMap idxmap,
                                                unsigned bits);
 #else
 static inline void tlb_flush_page(CPUState *cpu, vaddr addr)
@@ -243,42 +243,42 @@ static inline void tlb_flush_all_cpus_synced(CPUState *src_cpu)
 {
 }
 static inline void tlb_flush_page_by_mmuidx(CPUState *cpu,
-                                            vaddr addr, uint16_t idxmap)
+                                            vaddr addr, MMUIdxMap idxmap)
 {
 }
 
-static inline void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
+static inline void tlb_flush_by_mmuidx(CPUState *cpu, MMUIdxMap idxmap)
 {
 }
 static inline void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *cpu,
                                                             vaddr addr,
-                                                            uint16_t idxmap)
+                                                            MMUIdxMap idxmap)
 {
 }
 static inline void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *cpu,
-                                                       uint16_t idxmap)
+                                                       MMUIdxMap idxmap)
 {
 }
 static inline void tlb_flush_page_bits_by_mmuidx(CPUState *cpu,
                                                  vaddr addr,
-                                                 uint16_t idxmap,
+                                                 MMUIdxMap idxmap,
                                                  unsigned bits)
 {
 }
 static inline void
 tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *cpu, vaddr addr,
-                                              uint16_t idxmap, unsigned bits)
+                                              MMUIdxMap idxmap, unsigned bits)
 {
 }
 static inline void tlb_flush_range_by_mmuidx(CPUState *cpu, vaddr addr,
-                                             vaddr len, uint16_t idxmap,
+                                             vaddr len, MMUIdxMap idxmap,
                                              unsigned bits)
 {
 }
 static inline void tlb_flush_range_by_mmuidx_all_cpus_synced(CPUState *cpu,
                                                              vaddr addr,
                                                              vaddr len,
-                                                             uint16_t idxmap,
+                                                             MMUIdxMap idxmap,
                                                              unsigned bits)
 {
 }
diff --git a/include/exec/tb-flush.h b/include/exec/tb-flush.h
index 142c240d94..e971d4ba6d 100644
--- a/include/exec/tb-flush.h
+++ b/include/exec/tb-flush.h
@@ -9,19 +9,29 @@
 #define _TB_FLUSH_H_
 
 /**
- * tb_flush() - flush all translation blocks
- * @cs: CPUState (must be valid, but treated as anonymous pointer)
+ * tb_flush__exclusive_or_serial()
  *
- * Used to flush all the translation blocks in the system. Sometimes
- * it is simpler to flush everything than work out which individual
- * translations are now invalid and ensure they are not called
- * anymore.
+ * Used to flush all the translation blocks in the system.  Mostly this is
+ * used to empty the code generation buffer after it is full.  Sometimes it
+ * is used when it is simpler to flush everything than work out which
+ * individual translations are now invalid.
  *
- * tb_flush() takes care of running the flush in an exclusive context
- * if it is not already running in one. This means no guest code will
- * run until this complete.
+ * Must be called from an exclusive or serial context, e.g. start_exclusive,
+ * vm_stop, or when there is only one vcpu.  Note that start_exclusive cannot
+ * be called from within the cpu run loop, so this cannot be called from
+ * within target code.
  */
-void tb_flush(CPUState *cs);
+void tb_flush__exclusive_or_serial(void);
+
+/**
+ * queue_tb_flush() - add flush to the cpu work queue
+ * @cs: CPUState
+ *
+ * Flush all translation blocks the next time @cs processes the work queue.
+ * This should generally be followed by cpu_loop_exit(), so that the work
+ * queue is processed promptly.
+ */
+void queue_tb_flush(CPUState *cs);
 
 void tcg_flush_jmp_cache(CPUState *cs);
 
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index fb788ca110..c9f40c2539 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -198,10 +198,10 @@ struct CPUClass {
 };
 
 /*
- * Fix the number of mmu modes to 16, which is also the maximum
- * supported by the softmmu tlb api.
+ * Fix the number of mmu modes to 16.
  */
 #define NB_MMU_MODES 16
+typedef uint16_t MMUIdxMap;
 
 /* Use a fully associative victim tlb of 8 entries. */
 #define CPU_VTLB_SIZE 8
@@ -306,7 +306,7 @@ typedef struct CPUTLBCommon {
      * mmu_idx N since the last time that mmu_idx was flushed.
      * Protected by tlb_c.lock.
      */
-    uint16_t dirty;
+    MMUIdxMap dirty;
     /*
      * Statistics.  These are not lock protected, but are read and
      * written atomically.  This allows the monitor to print a snapshot
@@ -602,6 +602,22 @@ static inline CPUArchState *cpu_env(CPUState *cpu)
     return (CPUArchState *)(cpu + 1);
 }
 
+#ifdef CONFIG_TCG
+/*
+ * Invert the index order of the CPUTLBDescFast array so that lower
+ * mmu_idx have offsets from env with smaller magnitude.
+ */
+static inline int mmuidx_to_fast_index(int mmu_idx)
+{
+    return NB_MMU_MODES - 1 - mmu_idx;
+}
+
+static inline CPUTLBDescFast *cpu_tlb_fast(CPUState *cpu, int mmu_idx)
+{
+    return &cpu->neg.tlb.f[mmuidx_to_fast_index(mmu_idx)];
+}
+#endif
+
 typedef QTAILQ_HEAD(CPUTailQ, CPUState) CPUTailQ;
 extern CPUTailQ cpus_queue;
 
diff --git a/linux-user/aarch64/vdso-be.so b/linux-user/aarch64/vdso-be.so
index d43c3b19cd..4089838b30 100755
--- a/linux-user/aarch64/vdso-be.so
+++ b/linux-user/aarch64/vdso-be.so
Binary files differdiff --git a/linux-user/aarch64/vdso-le.so b/linux-user/aarch64/vdso-le.so
index aaedc9d85e..240802821c 100755
--- a/linux-user/aarch64/vdso-le.so
+++ b/linux-user/aarch64/vdso-le.so
Binary files differdiff --git a/linux-user/aarch64/vdso.S b/linux-user/aarch64/vdso.S
index a0ac1487b0..59dd94dc8f 100644
--- a/linux-user/aarch64/vdso.S
+++ b/linux-user/aarch64/vdso.S
@@ -71,5 +71,7 @@ vdso_syscall __kernel_clock_getres, __NR_clock_getres
 __kernel_rt_sigreturn:
 	/* No BTI C insn here -- we arrive via RET. */
 	mov	x8, #__NR_rt_sigreturn
+sigreturn_region_start:
 	svc	#0
+sigreturn_region_end:
 endf	__kernel_rt_sigreturn
diff --git a/linux-user/alpha/cpu_loop.c b/linux-user/alpha/cpu_loop.c
index bb8346b509..f93597c400 100644
--- a/linux-user/alpha/cpu_loop.c
+++ b/linux-user/alpha/cpu_loop.c
@@ -94,11 +94,6 @@ void cpu_loop(CPUAlphaState *env)
                 break;
             case 0x86:
                 /* IMB */
-                /* ??? We can probably elide the code using page_unprotect
-                   that is checking for self-modifying code.  Instead we
-                   could simply call tb_flush here.  Until we work out the
-                   changes required to turn off the extra write protection,
-                   this can be a no-op.  */
                 break;
             case 0x9E:
                 /* RDUNIQUE */
diff --git a/linux-user/arm/vdso-be32.so b/linux-user/arm/vdso-be32.so
index b896d3d545..6d71cd9c36 100755
--- a/linux-user/arm/vdso-be32.so
+++ b/linux-user/arm/vdso-be32.so
Binary files differdiff --git a/linux-user/arm/vdso-be8.so b/linux-user/arm/vdso-be8.so
index 784b7bdb2a..6446a96fcf 100755
--- a/linux-user/arm/vdso-be8.so
+++ b/linux-user/arm/vdso-be8.so
Binary files differdiff --git a/linux-user/arm/vdso-le.so b/linux-user/arm/vdso-le.so
index 38d3d51047..d34e577b69 100755
--- a/linux-user/arm/vdso-le.so
+++ b/linux-user/arm/vdso-le.so
Binary files differdiff --git a/linux-user/arm/vdso.S b/linux-user/arm/vdso.S
index b3bb6491dc..d84d964730 100644
--- a/linux-user/arm/vdso.S
+++ b/linux-user/arm/vdso.S
@@ -140,6 +140,7 @@ SYSCALL __vdso_gettimeofday, __NR_gettimeofday
 
 	.balign	16
 sigreturn_codes:
+sigreturn_region_start:
 	/* [EO]ABI sigreturn */
 	slot	0
 	raw_syscall __NR_sigreturn
@@ -172,3 +173,4 @@ sigreturn_codes:
 
 	.balign	16
 endf sigreturn_codes
+sigreturn_region_end:
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 26c090c95d..1370ec59be 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1659,6 +1659,11 @@ static void load_elf_vdso(struct image_info *info, const VdsoImageInfo *vdso)
     if (vdso->rt_sigreturn_ofs) {
         default_rt_sigreturn = load_addr + vdso->rt_sigreturn_ofs;
     }
+    if (vdso->sigreturn_region_start_ofs) {
+        vdso_sigreturn_region_start =
+            load_addr + vdso->sigreturn_region_start_ofs;
+        vdso_sigreturn_region_end = load_addr + vdso->sigreturn_region_end_ofs;
+    }
 
     /* Remove write from VDSO segment. */
     target_mprotect(info->start_data, info->end_data - info->start_data,
@@ -1969,6 +1974,8 @@ int load_elf_binary(struct linux_binprm *bprm, struct image_info *info)
 
         setup_sigtramp(tramp_page);
         target_mprotect(tramp_page, TARGET_PAGE_SIZE, PROT_READ | PROT_EXEC);
+        vdso_sigreturn_region_start = tramp_page;
+        vdso_sigreturn_region_end = tramp_page + TARGET_PAGE_SIZE;
     }
 
     bprm->p = create_elf_tables(bprm->p, bprm->argc, bprm->envc, &ehdr, info,
diff --git a/linux-user/gen-vdso-elfn.c.inc b/linux-user/gen-vdso-elfn.c.inc
index b47019e136..c2677a146c 100644
--- a/linux-user/gen-vdso-elfn.c.inc
+++ b/linux-user/gen-vdso-elfn.c.inc
@@ -84,9 +84,12 @@ static void elfN(search_symtab)(ElfN(Shdr) *shdr, unsigned sym_idx,
 
         if (sigreturn_sym && strcmp(sigreturn_sym, name) == 0) {
             sigreturn_addr = sym.st_value;
-        }
-        if (rt_sigreturn_sym && strcmp(rt_sigreturn_sym, name) == 0) {
+        } else if (rt_sigreturn_sym && strcmp(rt_sigreturn_sym, name) == 0) {
             rt_sigreturn_addr = sym.st_value;
+        } else if (strcmp("sigreturn_region_start", name) == 0) {
+            sigreturn_region_start_addr = sym.st_value;
+        } else if (strcmp("sigreturn_region_end", name) == 0) {
+            sigreturn_region_end_addr = sym.st_value;
         }
     }
 }
diff --git a/linux-user/gen-vdso.c b/linux-user/gen-vdso.c
index aeaa927db8..d6a2cdaa83 100644
--- a/linux-user/gen-vdso.c
+++ b/linux-user/gen-vdso.c
@@ -36,6 +36,8 @@ static const char *rt_sigreturn_sym;
 
 static unsigned sigreturn_addr;
 static unsigned rt_sigreturn_addr;
+static unsigned sigreturn_region_start_addr;
+static unsigned sigreturn_region_end_addr;
 
 #define N 32
 #define elfN(x)  elf32_##x
@@ -215,6 +217,10 @@ int main(int argc, char **argv)
     fprintf(outf, "    .reloc_count = ARRAY_SIZE(%s_relocs),\n", prefix);
     fprintf(outf, "    .sigreturn_ofs = 0x%x,\n", sigreturn_addr);
     fprintf(outf, "    .rt_sigreturn_ofs = 0x%x,\n", rt_sigreturn_addr);
+    fprintf(outf, "    .sigreturn_region_start_ofs = 0x%x,\n",
+            sigreturn_region_start_addr);
+    fprintf(outf, "    .sigreturn_region_end_ofs = 0x%x,\n",
+            sigreturn_region_end_addr);
     fprintf(outf, "};\n");
 
     ret = EXIT_SUCCESS;
diff --git a/linux-user/hppa/vdso.S b/linux-user/hppa/vdso.S
index 5be14d2f70..a6f8da2981 100644
--- a/linux-user/hppa/vdso.S
+++ b/linux-user/hppa/vdso.S
@@ -156,8 +156,10 @@
 __kernel_sigtramp_rt:
 	ldi	0, %r25
 	ldi	__NR_rt_sigreturn, %r20
+sigreturn_region_start:
 	be,l	0x100(%sr2, %r0), %sr0, %r31
 	nop
+sigreturn_region_end:
 
 	.cfi_endproc
 	.size	__kernel_sigtramp_rt, . - __kernel_sigtramp_rt
diff --git a/linux-user/hppa/vdso.so b/linux-user/hppa/vdso.so
index e1ddd70c37..68baf80fd3 100755
--- a/linux-user/hppa/vdso.so
+++ b/linux-user/hppa/vdso.so
Binary files differdiff --git a/linux-user/i386/vdso.S b/linux-user/i386/vdso.S
index e7a1f333a1..8df77b5a94 100644
--- a/linux-user/i386/vdso.S
+++ b/linux-user/i386/vdso.S
@@ -114,6 +114,7 @@ vdso_syscall3 __vdso_getcpu, __NR_gettimeofday
  */
 	nop
 
+sigreturn_region_start:
 __kernel_sigreturn:
 	popl	%eax	/* pop sig */
 	.cfi_adjust_cfa_offset -4
@@ -128,6 +129,7 @@ __kernel_rt_sigreturn:
 	movl	$__NR_rt_sigreturn, %eax
 	int	$0x80
 endf	__kernel_rt_sigreturn
+sigreturn_region_end:
 
 	.cfi_endproc
 
diff --git a/linux-user/i386/vdso.so b/linux-user/i386/vdso.so
index bdece5dfcf..e01c3818d0 100755
--- a/linux-user/i386/vdso.so
+++ b/linux-user/i386/vdso.so
Binary files differdiff --git a/linux-user/loader.h b/linux-user/loader.h
index e42b8fa1e3..da9ad28db5 100644
--- a/linux-user/loader.h
+++ b/linux-user/loader.h
@@ -117,6 +117,8 @@ typedef struct {
     unsigned reloc_count;
     unsigned sigreturn_ofs;
     unsigned rt_sigreturn_ofs;
+    unsigned sigreturn_region_start_ofs;
+    unsigned sigreturn_region_end_ofs;
 } VdsoImageInfo;
 
 /* Note that both Elf32_Word and Elf64_Word are uint32_t. */
diff --git a/linux-user/loongarch64/vdso.S b/linux-user/loongarch64/vdso.S
index 780a5fda12..2409d95476 100644
--- a/linux-user/loongarch64/vdso.S
+++ b/linux-user/loongarch64/vdso.S
@@ -125,6 +125,8 @@ vdso_syscall __vdso_getcpu, __NR_getcpu
 
 __vdso_rt_sigreturn:
 	li.w	$a7, __NR_rt_sigreturn
+sigreturn_region_start:
 	syscall	0
+sigreturn_region_end:
 	.cfi_endproc
 endf __vdso_rt_sigreturn
diff --git a/linux-user/loongarch64/vdso.so b/linux-user/loongarch64/vdso.so
index 7c2de6c50e..3704834f0d 100755
--- a/linux-user/loongarch64/vdso.so
+++ b/linux-user/loongarch64/vdso.so
Binary files differdiff --git a/linux-user/main.c b/linux-user/main.c
index 4ddfc9a619..db751c0757 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -228,6 +228,8 @@ void init_task_state(TaskState *ts)
         ts->start_boottime += bt.tv_nsec * (uint64_t) ticks_per_sec /
                               NANOSECONDS_PER_SECOND;
     }
+
+    ts->sys_dispatch_len = -1;
 }
 
 CPUArchState *cpu_copy(CPUArchState *env)
diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 002e1e668e..847092a28a 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -22,8 +22,6 @@
 #include "exec/log.h"
 #include "exec/page-protection.h"
 #include "exec/mmap-lock.h"
-#include "exec/tb-flush.h"
-#include "exec/translation-block.h"
 #include "qemu.h"
 #include "user/page-protection.h"
 #include "user-internals.h"
@@ -1007,11 +1005,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int target_prot,
      * be atomic with respect to an external process.
      */
     if (ret != -1 && (flags & MAP_TYPE) != MAP_PRIVATE) {
-        CPUState *cpu = thread_cpu;
-        if (!tcg_cflags_has(cpu, CF_PARALLEL)) {
-            tcg_cflags_set(cpu, CF_PARALLEL);
-            tb_flush(cpu);
-        }
+        begin_parallel_context(thread_cpu);
     }
 
     return ret;
@@ -1448,10 +1442,7 @@ abi_ulong target_shmat(CPUArchState *cpu_env, int shmid,
      * supported by the host -- anything that requires EXCP_ATOMIC will not
      * be atomic with respect to an external process.
      */
-    if (!tcg_cflags_has(cpu, CF_PARALLEL)) {
-        tcg_cflags_set(cpu, CF_PARALLEL);
-        tb_flush(cpu);
-    }
+    begin_parallel_context(cpu);
 
     if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
         FILE *f = qemu_log_trylock();
diff --git a/linux-user/ppc/vdso-32.so b/linux-user/ppc/vdso-32.so
index 0dc55e0ddd..03476052fb 100755
--- a/linux-user/ppc/vdso-32.so
+++ b/linux-user/ppc/vdso-32.so
Binary files differdiff --git a/linux-user/ppc/vdso-64.so b/linux-user/ppc/vdso-64.so
index ac1ab2582e..b89f2a0e01 100755
--- a/linux-user/ppc/vdso-64.so
+++ b/linux-user/ppc/vdso-64.so
Binary files differdiff --git a/linux-user/ppc/vdso-64le.so b/linux-user/ppc/vdso-64le.so
index 424abb4290..22499d2701 100755
--- a/linux-user/ppc/vdso-64le.so
+++ b/linux-user/ppc/vdso-64le.so
Binary files differdiff --git a/linux-user/ppc/vdso.S b/linux-user/ppc/vdso.S
index 2e79ea9808..e9256a2dea 100644
--- a/linux-user/ppc/vdso.S
+++ b/linux-user/ppc/vdso.S
@@ -220,6 +220,7 @@ endf	__kernel_sync_dicache
 
 	nop
 
+sigreturn_region_start:
 __kernel_sigtramp_rt:
 	raw_syscall __NR_rt_sigreturn
 endf	__kernel_sigtramp_rt
@@ -235,5 +236,6 @@ __kernel_sigtramp32:
 	raw_syscall __NR_sigreturn
 endf	__kernel_sigtramp32
 #endif
+sigreturn_region_end:
 
 	.cfi_endproc
diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index e4dca0c20f..cabb7bd6a8 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -155,6 +155,11 @@ struct TaskState {
     /* This thread's sigaltstack, if it has one */
     struct target_sigaltstack sigaltstack_used;
 
+    /* This thread's SYSCALL_USER_DISPATCH state, len=~0 means disabled */
+    vaddr sys_dispatch;
+    vaddr sys_dispatch_selector;
+    abi_ulong sys_dispatch_len;
+
     /* Start time of task after system boot in clock ticks */
     uint64_t start_boottime;
 };
diff --git a/linux-user/riscv/vdso-32.so b/linux-user/riscv/vdso-32.so
index c2ce2a4757..4818a994f0 100755
--- a/linux-user/riscv/vdso-32.so
+++ b/linux-user/riscv/vdso-32.so
Binary files differdiff --git a/linux-user/riscv/vdso-64.so b/linux-user/riscv/vdso-64.so
index ae49f5b043..cc6f7e974b 100755
--- a/linux-user/riscv/vdso-64.so
+++ b/linux-user/riscv/vdso-64.so
Binary files differdiff --git a/linux-user/riscv/vdso.S b/linux-user/riscv/vdso.S
index c37275233a..1d780db771 100644
--- a/linux-user/riscv/vdso.S
+++ b/linux-user/riscv/vdso.S
@@ -181,7 +181,9 @@ endf __vdso_flush_icache
 	nop
 
 __vdso_rt_sigreturn:
+sigreturn_region_start:
 	raw_syscall __NR_rt_sigreturn
+sigreturn_region_end:
 endf __vdso_rt_sigreturn
 
 	.cfi_endproc
diff --git a/linux-user/s390x/vdso.S b/linux-user/s390x/vdso.S
index 3332492477..c60e9ed086 100644
--- a/linux-user/s390x/vdso.S
+++ b/linux-user/s390x/vdso.S
@@ -52,6 +52,7 @@ vdso_syscall __kernel_getcpu, __NR_getcpu
  * by all users.  Without it we get the fallback signal frame handling.
  */
 
+sigreturn_region_start:
 __kernel_sigreturn:
 	raw_syscall __NR_sigreturn
 endf	__kernel_sigreturn
@@ -59,3 +60,4 @@ endf	__kernel_sigreturn
 __kernel_rt_sigreturn:
 	raw_syscall __NR_rt_sigreturn
 endf	__kernel_rt_sigreturn
+sigreturn_region_end:
diff --git a/linux-user/s390x/vdso.so b/linux-user/s390x/vdso.so
index 64130f6f33..a669a6b7dd 100755
--- a/linux-user/s390x/vdso.so
+++ b/linux-user/s390x/vdso.so
Binary files differdiff --git a/linux-user/signal-common.h b/linux-user/signal-common.h
index 196d2406f8..8a44714251 100644
--- a/linux-user/signal-common.h
+++ b/linux-user/signal-common.h
@@ -25,6 +25,13 @@
 /* Fallback addresses into sigtramp page. */
 extern abi_ulong default_sigreturn;
 extern abi_ulong default_rt_sigreturn;
+extern abi_ulong vdso_sigreturn_region_start;
+extern abi_ulong vdso_sigreturn_region_end;
+
+static inline bool is_vdso_sigreturn(abi_ulong pc)
+{
+    return pc >= vdso_sigreturn_region_start && pc < vdso_sigreturn_region_end;
+}
 
 void setup_sigtramp(abi_ulong tramp_page);
 
diff --git a/linux-user/signal.c b/linux-user/signal.c
index cd0e7398aa..804096bd44 100644
--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@@ -50,6 +50,8 @@ static void host_signal_handler(int host_signum, siginfo_t *info,
 /* Fallback addresses into sigtramp page. */
 abi_ulong default_sigreturn;
 abi_ulong default_rt_sigreturn;
+abi_ulong vdso_sigreturn_region_start;
+abi_ulong vdso_sigreturn_region_end;
 
 /*
  * System includes define _NSIG as SIGRTMAX + 1, but qemu (like the kernel)
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 91360a072c..1a5f2a03f9 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -27,8 +27,6 @@
 #include "target_mman.h"
 #include "exec/page-protection.h"
 #include "exec/mmap-lock.h"
-#include "exec/tb-flush.h"
-#include "exec/translation-block.h"
 #include <elf.h>
 #include <endian.h>
 #include <grp.h>
@@ -6344,6 +6342,10 @@ abi_long do_arch_prctl(CPUX86State *env, int code, abi_ulong addr)
 #endif
 #ifndef PR_SET_SYSCALL_USER_DISPATCH
 # define PR_SET_SYSCALL_USER_DISPATCH 59
+# define PR_SYS_DISPATCH_OFF 0
+# define PR_SYS_DISPATCH_ON 1
+# define SYSCALL_DISPATCH_FILTER_ALLOW 0
+# define SYSCALL_DISPATCH_FILTER_BLOCK 1
 #endif
 #ifndef PR_SME_SET_VL
 # define PR_SME_SET_VL  63
@@ -6398,6 +6400,36 @@ static abi_long do_prctl_inval1(CPUArchState *env, abi_long arg2)
 #define do_prctl_sme_set_vl do_prctl_inval1
 #endif
 
+static abi_long do_prctl_syscall_user_dispatch(CPUArchState *env,
+                                               abi_ulong arg2, abi_ulong arg3,
+                                               abi_ulong arg4, abi_ulong arg5)
+{
+    CPUState *cpu = env_cpu(env);
+    TaskState *ts = get_task_state(cpu);
+
+    switch (arg2) {
+    case PR_SYS_DISPATCH_OFF:
+        if (arg3 || arg4 || arg5) {
+            return -TARGET_EINVAL;
+        }
+        ts->sys_dispatch_len = -1;
+        return 0;
+    case PR_SYS_DISPATCH_ON:
+        if (arg3 && arg3 + arg4 <= arg3) {
+            return -TARGET_EINVAL;
+        }
+        if (arg5 && !access_ok(cpu, VERIFY_READ, arg5, 1)) {
+            return -TARGET_EFAULT;
+        }
+        ts->sys_dispatch = arg3;
+        ts->sys_dispatch_len = arg4;
+        ts->sys_dispatch_selector = arg5;
+        return 0;
+    default:
+        return -TARGET_EINVAL;
+    }
+}
+
 static abi_long do_prctl(CPUArchState *env, abi_long option, abi_long arg2,
                          abi_long arg3, abi_long arg4, abi_long arg5)
 {
@@ -6473,6 +6505,9 @@ static abi_long do_prctl(CPUArchState *env, abi_long option, abi_long arg2,
     case PR_SET_UNALIGN:
         return do_prctl_set_unalign(env, arg2);
 
+    case PR_SET_SYSCALL_USER_DISPATCH:
+        return do_prctl_syscall_user_dispatch(env, arg2, arg3, arg4, arg5);
+
     case PR_CAP_AMBIENT:
     case PR_CAPBSET_READ:
     case PR_CAPBSET_DROP:
@@ -6527,7 +6562,6 @@ static abi_long do_prctl(CPUArchState *env, abi_long option, abi_long arg2,
     case PR_SET_MM:
     case PR_GET_SECCOMP:
     case PR_SET_SECCOMP:
-    case PR_SET_SYSCALL_USER_DISPATCH:
     case PR_GET_THP_DISABLE:
     case PR_SET_THP_DISABLE:
     case PR_GET_TSC:
@@ -6631,10 +6665,7 @@ static int do_fork(CPUArchState *env, unsigned int flags, abi_ulong newsp,
          * generate code for parallel execution and flush old translations.
          * Do this now so that the copy gets CF_PARALLEL too.
          */
-        if (!tcg_cflags_has(cpu, CF_PARALLEL)) {
-            tcg_cflags_set(cpu, CF_PARALLEL);
-            tb_flush(cpu);
-        }
+        begin_parallel_context(cpu);
 
         /* we create a new CPU instance. */
         new_env = cpu_copy(env);
@@ -13897,12 +13928,46 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1,
     return ret;
 }
 
+static bool sys_dispatch(CPUState *cpu, TaskState *ts)
+{
+    abi_ptr pc;
+
+    if (likely(ts->sys_dispatch_len == -1)) {
+        return false;
+    }
+
+    pc = cpu->cc->get_pc(cpu);
+    if (likely(pc - ts->sys_dispatch < ts->sys_dispatch_len)) {
+        return false;
+    }
+    if (unlikely(is_vdso_sigreturn(pc))) {
+        return false;
+    }
+    if (likely(ts->sys_dispatch_selector)) {
+        uint8_t sb;
+        if (get_user_u8(sb, ts->sys_dispatch_selector)) {
+            force_sig(TARGET_SIGSEGV);
+            return true;
+        }
+        if (likely(sb == SYSCALL_DISPATCH_FILTER_ALLOW)) {
+            return false;
+        }
+        if (unlikely(sb != SYSCALL_DISPATCH_FILTER_BLOCK)) {
+            force_sig(TARGET_SIGSYS);
+            return true;
+        }
+    }
+    force_sig_fault(TARGET_SIGSYS, TARGET_SYS_USER_DISPATCH, pc);
+    return true;
+}
+
 abi_long do_syscall(CPUArchState *cpu_env, int num, abi_long arg1,
                     abi_long arg2, abi_long arg3, abi_long arg4,
                     abi_long arg5, abi_long arg6, abi_long arg7,
                     abi_long arg8)
 {
     CPUState *cpu = env_cpu(cpu_env);
+    TaskState *ts = get_task_state(cpu);
     abi_long ret;
 
 #ifdef DEBUG_ERESTARTSYS
@@ -13919,6 +13984,10 @@ abi_long do_syscall(CPUArchState *cpu_env, int num, abi_long arg1,
     }
 #endif
 
+    if (sys_dispatch(cpu, ts)) {
+        return -QEMU_ESIGRETURN;
+    }
+
     record_syscall_start(cpu, num, arg1,
                          arg2, arg3, arg4, arg5, arg6, arg7, arg8);
 
diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h
index df26a2d28f..cd9ff709b8 100644
--- a/linux-user/syscall_defs.h
+++ b/linux-user/syscall_defs.h
@@ -690,6 +690,12 @@ typedef struct target_siginfo {
 #define TARGET_TRAP_UNK         (5)     /* undiagnosed trap */
 
 /*
+ * SIGSYS si_codes
+ */
+#define TARGET_SYS_SECCOMP       (1)  /* seccomp triggered */
+#define TARGET_SYS_USER_DISPATCH (2)  /* syscall user dispatch triggered */
+
+/*
  * SIGEMT si_codes
  */
 #define TARGET_EMT_TAGOVF      1       /* tag overflow */
diff --git a/linux-user/user-internals.h b/linux-user/user-internals.h
index 691b9a1775..7099349ec8 100644
--- a/linux-user/user-internals.h
+++ b/linux-user/user-internals.h
@@ -20,6 +20,8 @@
 
 #include "user/thunk.h"
 #include "qemu/log.h"
+#include "exec/tb-flush.h"
+#include "exec/translation-block.h"
 
 extern char *exec_path;
 void init_task_state(TaskState *ts);
@@ -172,6 +174,20 @@ static inline int regpairs_aligned(CPUArchState *cpu_env, int num) { return 0; }
  */
 void preexit_cleanup(CPUArchState *env, int code);
 
+/**
+ * begin_parallel_context
+ * @cs: the CPU context
+ *
+ * Called when starting the second vcpu, or joining shared memory.
+ */
+static inline void begin_parallel_context(CPUState *cs)
+{
+    if (!tcg_cflags_has(cs, CF_PARALLEL)) {
+        tb_flush__exclusive_or_serial();
+        tcg_cflags_set(cs, CF_PARALLEL);
+    }
+}
+
 /*
  * Include target-specific struct and function definitions;
  * they may need access to the target-independent structures
diff --git a/plugins/core.c b/plugins/core.c
index c6e9ef1478..ead09fd2f1 100644
--- a/plugins/core.c
+++ b/plugins/core.c
@@ -248,7 +248,7 @@ static void plugin_grow_scoreboards__locked(CPUState *cpu)
         }
         plugin.scoreboard_alloc_size = scoreboard_size;
         /* force all tb to be flushed, as scoreboard pointers were changed. */
-        tb_flush(cpu);
+        tb_flush__exclusive_or_serial();
     }
     end_exclusive();
 }
@@ -684,8 +684,6 @@ void qemu_plugin_user_exit(void)
      * with the one in fork_start(). That is:
      * - start_exclusive(), which acquires qemu_cpu_list_lock,
      *   must be called before acquiring plugin.lock.
-     * - tb_flush(), which acquires mmap_lock(), must be called
-     *   while plugin.lock is not held.
      */
     start_exclusive();
 
@@ -705,7 +703,7 @@ void qemu_plugin_user_exit(void)
     }
     qemu_rec_mutex_unlock(&plugin.lock);
 
-    tb_flush(current_cpu);
+    tb_flush__exclusive_or_serial();
     end_exclusive();
 
     /* now it's safe to handle the exit case */
diff --git a/plugins/loader.c b/plugins/loader.c
index 8f0d75c904..ba10ebac99 100644
--- a/plugins/loader.c
+++ b/plugins/loader.c
@@ -377,8 +377,7 @@ static void plugin_flush_destroy(CPUState *cpu, run_on_cpu_data arg)
 {
     struct qemu_plugin_reset_data *data = arg.host_ptr;
 
-    g_assert(cpu_in_exclusive_context(cpu));
-    tb_flush(cpu);
+    tb_flush__exclusive_or_serial();
     plugin_reset_destroy(data);
 }
 
diff --git a/target/alpha/helper.h b/target/alpha/helper.h
index d60f208703..788d2fbf28 100644
--- a/target/alpha/helper.h
+++ b/target/alpha/helper.h
@@ -90,7 +90,6 @@ DEF_HELPER_FLAGS_2(ieee_input_s, TCG_CALL_NO_WG, void, env, i64)
 #if !defined (CONFIG_USER_ONLY)
 DEF_HELPER_FLAGS_1(tbia, TCG_CALL_NO_RWG, void, env)
 DEF_HELPER_FLAGS_2(tbis, TCG_CALL_NO_RWG, void, env, i64)
-DEF_HELPER_FLAGS_1(tb_flush, TCG_CALL_NO_RWG, void, env)
 
 DEF_HELPER_1(halt, void, i64)
 
diff --git a/target/alpha/sys_helper.c b/target/alpha/sys_helper.c
index 51e3254428..87e37605c1 100644
--- a/target/alpha/sys_helper.c
+++ b/target/alpha/sys_helper.c
@@ -20,7 +20,6 @@
 #include "qemu/osdep.h"
 #include "cpu.h"
 #include "exec/cputlb.h"
-#include "exec/tb-flush.h"
 #include "exec/helper-proto.h"
 #include "system/runstate.h"
 #include "system/system.h"
@@ -38,11 +37,6 @@ void helper_tbis(CPUAlphaState *env, uint64_t p)
     tlb_flush_page(env_cpu(env), p);
 }
 
-void helper_tb_flush(CPUAlphaState *env)
-{
-    tb_flush(env_cpu(env));
-}
-
 void helper_halt(uint64_t restart)
 {
     if (restart) {
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index cebab0318c..f11b382438 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -48,8 +48,6 @@ struct DisasContext {
 
 #ifdef CONFIG_USER_ONLY
     MemOp unalign;
-#else
-    uint64_t palbr;
 #endif
     uint32_t tbflags;
     int mem_idx;
@@ -1155,7 +1153,6 @@ static DisasJumpType gen_call_pal(DisasContext *ctx, int palcode)
 #else
     {
         TCGv tmp = tcg_temp_new();
-        uint64_t entry;
 
         gen_pc_disp(ctx, tmp, 0);
         if (ctx->tbflags & ENV_FLAG_PAL_MODE) {
@@ -1165,12 +1162,11 @@ static DisasJumpType gen_call_pal(DisasContext *ctx, int palcode)
         }
         tcg_gen_st_i64(tmp, tcg_env, offsetof(CPUAlphaState, exc_addr));
 
-        entry = ctx->palbr;
-        entry += (palcode & 0x80
-                  ? 0x2000 + (palcode - 0x80) * 64
-                  : 0x1000 + palcode * 64);
-
-        tcg_gen_movi_i64(cpu_pc, entry);
+        tcg_gen_ld_i64(cpu_pc, tcg_env, offsetof(CPUAlphaState, palbr));
+        tcg_gen_addi_i64(cpu_pc, cpu_pc,
+                         palcode & 0x80
+                         ? 0x2000 + (palcode - 0x80) * 64
+                         : 0x1000 + palcode * 64);
         return DISAS_PC_UPDATED;
     }
 #endif
@@ -1292,11 +1288,7 @@ static DisasJumpType gen_mtpr(DisasContext *ctx, TCGv vb, int regno)
     case 7:
         /* PALBR */
         tcg_gen_st_i64(vb, tcg_env, offsetof(CPUAlphaState, palbr));
-        /* Changing the PAL base register implies un-chaining all of the TBs
-           that ended with a CALL_PAL.  Since the base register usually only
-           changes during boot, flushing everything works well.  */
-        gen_helper_tb_flush(tcg_env);
-        return DISAS_PC_STALE;
+        break;
 
     case 32 ... 39:
         /* Accessing the "non-shadow" general registers.  */
@@ -2874,7 +2866,6 @@ static void alpha_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
     ctx->ir = cpu_std_ir;
     ctx->unalign = (ctx->tbflags & TB_FLAG_UNALIGN ? MO_UNALN : MO_ALIGN);
 #else
-    ctx->palbr = env->palbr;
     ctx->ir = (ctx->tbflags & ENV_FLAG_PAL_MODE ? cpu_pal_ir : cpu_std_ir);
 #endif
 
diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index 11d59d11ca..672ab3750c 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -29,21 +29,21 @@
 #include "qemu/interval-tree.h"
 #include "hw/registerfields.h"
 
-#define MMU_ABS_W_IDX     6
-#define MMU_ABS_IDX       7
-#define MMU_KERNEL_IDX    8
-#define MMU_KERNEL_P_IDX  9
-#define MMU_PL1_IDX       10
-#define MMU_PL1_P_IDX     11
-#define MMU_PL2_IDX       12
-#define MMU_PL2_P_IDX     13
-#define MMU_USER_IDX      14
-#define MMU_USER_P_IDX    15
-
-#define MMU_IDX_MMU_DISABLED(MIDX)  ((MIDX) < MMU_KERNEL_IDX)
-#define MMU_IDX_TO_PRIV(MIDX)       (((MIDX) - MMU_KERNEL_IDX) / 2)
-#define MMU_IDX_TO_P(MIDX)          (((MIDX) - MMU_KERNEL_IDX) & 1)
-#define PRIV_P_TO_MMU_IDX(PRIV, P)  ((PRIV) * 2 + !!(P) + MMU_KERNEL_IDX)
+#define MMU_KERNEL_IDX    0
+#define MMU_KERNEL_P_IDX  1
+#define MMU_PL1_IDX       2
+#define MMU_PL1_P_IDX     3
+#define MMU_PL2_IDX       4
+#define MMU_PL2_P_IDX     5
+#define MMU_USER_IDX      6
+#define MMU_USER_P_IDX    7
+#define MMU_ABS_IDX       8
+#define MMU_ABS_W_IDX     9
+
+#define MMU_IDX_MMU_DISABLED(MIDX)  ((MIDX) >= MMU_ABS_IDX)
+#define MMU_IDX_TO_PRIV(MIDX)       ((MIDX) / 2)
+#define MMU_IDX_TO_P(MIDX)          ((MIDX) & 1)
+#define PRIV_P_TO_MMU_IDX(PRIV, P)  ((PRIV) * 2 + !!(P))
 
 #define PRIV_KERNEL       0
 #define PRIV_USER         3
diff --git a/target/riscv/csr.c b/target/riscv/csr.c
index 8842e07a73..3c8989f522 100644
--- a/target/riscv/csr.c
+++ b/target/riscv/csr.c
@@ -25,7 +25,6 @@
 #include "pmu.h"
 #include "time_helper.h"
 #include "exec/cputlb.h"
-#include "exec/tb-flush.h"
 #include "exec/icount.h"
 #include "accel/tcg/getpc.h"
 #include "qemu/guest-random.h"
@@ -2173,8 +2172,6 @@ static RISCVException write_misa(CPURISCVState *env, int csrno,
         env->mstatus &= ~MSTATUS_FS;
     }
 
-    /* flush translation cache */
-    tb_flush(env_cpu(env));
     env->xl = riscv_cpu_mxl(env);
     return RISCV_EXCP_NONE;
 }
diff --git a/target/riscv/tcg/tcg-cpu.c b/target/riscv/tcg/tcg-cpu.c
index 78fb279184..143ab079d4 100644
--- a/target/riscv/tcg/tcg-cpu.c
+++ b/target/riscv/tcg/tcg-cpu.c
@@ -191,7 +191,8 @@ static TCGTBCPUState riscv_get_tb_cpu_state(CPUState *cs)
 
     return (TCGTBCPUState){
         .pc = env->xl == MXL_RV32 ? env->pc & UINT32_MAX : env->pc,
-        .flags = flags
+        .flags = flags,
+        .cs_base = env->misa_ext,
     };
 }
 
diff --git a/target/sparc/insns.decode b/target/sparc/insns.decode
index 9e39d23273..242ec42016 100644
--- a/target/sparc/insns.decode
+++ b/target/sparc/insns.decode
@@ -88,9 +88,10 @@ CALL    01 i:s30
 
 {
   [
-    STBAR           10 00000 101000 01111 0 0000000000000
+    STBAR_v9        10 00000 101000 01111 0 0000000000000
     MEMBAR          10 00000 101000 01111 1 000000 cmask:3 mmask:4
 
+    RDY_v9          10 rd:5  101000 00000 0 0000000000000
     RDCCR           10 rd:5  101000 00010 0 0000000000000
     RDASI           10 rd:5  101000 00011 0 0000000000000
     RDTICK          10 rd:5  101000 00100 0 0000000000000
@@ -107,8 +108,26 @@ CALL    01 i:s30
     RDSTICK_CMPR    10 rd:5  101000 11001 0 0000000000000
     RDSTRAND_STATUS 10 rd:5  101000 11010 0 0000000000000
   ]
-  # Before v8, all rs1 accepted; otherwise rs1==0.
-  RDY               10 rd:5  101000 rs1:5 0 0000000000000
+
+  # The v8 manual, section B.30 STBAR instruction, says
+  # bits [12:0] are ignored, but bit 13 must be 0.
+  # However, section B.28 Read State Register Instruction has a
+  # comment that RDASR with rs1 = 15, rd = 0 is STBAR.  Here,
+  # bit 13 is also ignored and rd != 0 is merely reserved.
+  #
+  # Solaris 8 executes v9 MEMBAR instruction 0x8143e008 during boot.
+  # This confirms that bit 13 is ignored, as 0x8143c000 is STBAR.
+  STBAR_v8          10 ----- 101000 01111 - -------------
+
+  # For v7, bits [18:0] are ignored.
+  # For v8, bits [18:14], aka rs1, are repurposed and rs1 = 0 is RDY,
+  # and other values are RDASR.  However, the v8 manual explicitly
+  # says that rs1 in 1..14 yield undefined results and do not cause
+  # an illegal instruction trap, and rs1 in 16..31 are available for
+  # implementation specific usage.
+  # Implement not causing an illegal instruction trap for v8 by
+  # continuing to interpret unused values per v7, i.e. as RDY.
+  RDY_v7            10 rd:5  101000 ----- - -------------
 }
 
 {
@@ -139,14 +158,16 @@ CALL    01 i:s30
 }
 
 {
-  RDPSR             10 rd:5  101001 00000 0 0000000000000
-  RDHPR_hpstate     10 rd:5  101001 00000 0 0000000000000
+  [
+    RDHPR_hpstate       10 rd:5  101001 00000 0 0000000000000
+    RDHPR_htstate       10 rd:5  101001 00001 0 0000000000000
+    RDHPR_hintp         10 rd:5  101001 00011 0 0000000000000
+    RDHPR_htba          10 rd:5  101001 00101 0 0000000000000
+    RDHPR_hver          10 rd:5  101001 00110 0 0000000000000
+    RDHPR_hstick_cmpr   10 rd:5  101001 11111 0 0000000000000
+  ]
+  RDPSR                 10 rd:5  101001 ----- - -------------
 }
-RDHPR_htstate       10 rd:5  101001 00001 0 0000000000000
-RDHPR_hintp         10 rd:5  101001 00011 0 0000000000000
-RDHPR_htba          10 rd:5  101001 00101 0 0000000000000
-RDHPR_hver          10 rd:5  101001 00110 0 0000000000000
-RDHPR_hstick_cmpr   10 rd:5  101001 11111 0 0000000000000
 
 {
   WRPSR             10 00000 110001 ..... . .............  @n_r_ri
@@ -159,26 +180,28 @@ RESTORED            10 00001 110001 00000 0 0000000000000
 # UA2005 INVALW
 
 {
-  RDWIM             10 rd:5  101010 00000 0 0000000000000
-  RDPR_tpc          10 rd:5  101010 00000 0 0000000000000
+  [
+    RDPR_tpc            10 rd:5  101010 00000 0 0000000000000
+    RDPR_tnpc           10 rd:5  101010 00001 0 0000000000000
+    RDPR_tstate         10 rd:5  101010 00010 0 0000000000000
+    RDPR_tt             10 rd:5  101010 00011 0 0000000000000
+    RDPR_tick           10 rd:5  101010 00100 0 0000000000000
+    RDPR_tba            10 rd:5  101010 00101 0 0000000000000
+    RDPR_pstate         10 rd:5  101010 00110 0 0000000000000
+    RDPR_tl             10 rd:5  101010 00111 0 0000000000000
+    RDPR_pil            10 rd:5  101010 01000 0 0000000000000
+    RDPR_cwp            10 rd:5  101010 01001 0 0000000000000
+    RDPR_cansave        10 rd:5  101010 01010 0 0000000000000
+    RDPR_canrestore     10 rd:5  101010 01011 0 0000000000000
+    RDPR_cleanwin       10 rd:5  101010 01100 0 0000000000000
+    RDPR_otherwin       10 rd:5  101010 01101 0 0000000000000
+    RDPR_wstate         10 rd:5  101010 01110 0 0000000000000
+    RDPR_gl             10 rd:5  101010 10000 0 0000000000000
+    RDPR_strand_status  10 rd:5  101010 11010 0 0000000000000
+    RDPR_ver            10 rd:5  101010 11111 0 0000000000000
+  ]
+  RDWIM                 10 rd:5  101010 ----- - -------------
 }
-RDPR_tnpc           10 rd:5  101010 00001 0 0000000000000
-RDPR_tstate         10 rd:5  101010 00010 0 0000000000000
-RDPR_tt             10 rd:5  101010 00011 0 0000000000000
-RDPR_tick           10 rd:5  101010 00100 0 0000000000000
-RDPR_tba            10 rd:5  101010 00101 0 0000000000000
-RDPR_pstate         10 rd:5  101010 00110 0 0000000000000
-RDPR_tl             10 rd:5  101010 00111 0 0000000000000
-RDPR_pil            10 rd:5  101010 01000 0 0000000000000
-RDPR_cwp            10 rd:5  101010 01001 0 0000000000000
-RDPR_cansave        10 rd:5  101010 01010 0 0000000000000
-RDPR_canrestore     10 rd:5  101010 01011 0 0000000000000
-RDPR_cleanwin       10 rd:5  101010 01100 0 0000000000000
-RDPR_otherwin       10 rd:5  101010 01101 0 0000000000000
-RDPR_wstate         10 rd:5  101010 01110 0 0000000000000
-RDPR_gl             10 rd:5  101010 10000 0 0000000000000
-RDPR_strand_status  10 rd:5  101010 11010 0 0000000000000
-RDPR_ver            10 rd:5  101010 11111 0 0000000000000
 
 {
   WRWIM             10 00000 110010 ..... . .............  @n_r_ri
@@ -203,7 +226,7 @@ WRPR_strand_status  10 11010 110010 ..... . .............  @n_r_ri
 
 {
   FLUSHW    10 00000 101011 00000 0 0000000000000
-  RDTBR     10 rd:5  101011 00000 0 0000000000000
+  RDTBR     10 rd:5  101011 ----- - -------------
 }
 
 {
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
index b922e53bf1..810e2491a6 100644
--- a/target/sparc/translate.c
+++ b/target/sparc/translate.c
@@ -2487,7 +2487,7 @@ static int extract_qfpreg(DisasContext *dc, int x)
 
 #define TRANS(NAME, AVAIL, FUNC, ...) \
     static bool trans_##NAME(DisasContext *dc, arg_##NAME *a) \
-    { return avail_##AVAIL(dc) && FUNC(dc, __VA_ARGS__); }
+    { return avail_##AVAIL(dc) && FUNC(dc, ## __VA_ARGS__); }
 
 #define avail_ALL(C)      true
 #ifdef TARGET_SPARC64
@@ -2526,6 +2526,32 @@ static int extract_qfpreg(DisasContext *dc, int x)
 # define avail_VIS4(C)    false
 #endif
 
+/*
+ * We decoded bit 13 as imm, and bits [12:0] as rs2_or_imm.
+ * For v9, if !imm, then the unused bits [12:5] must be zero.
+ * For v7 and v8, the unused bits are ignored; clear them here.
+ */
+static bool check_rs2(DisasContext *dc, int *rs2)
+{
+    if (unlikely(*rs2 & ~0x1f)) {
+        if (avail_64(dc)) {
+            return false;
+        }
+        *rs2 &= 0x1f;
+    }
+    return true;
+}
+
+static bool check_r_r_ri(DisasContext *dc, arg_r_r_ri *a)
+{
+    return a->imm || check_rs2(dc, &a->rs2_or_imm);
+}
+
+static bool check_r_r_ri_cc(DisasContext *dc, arg_r_r_ri_cc *a)
+{
+    return a->imm || check_rs2(dc, &a->rs2_or_imm);
+}
+
 /* Default case for non jump instructions. */
 static bool advance_pc(DisasContext *dc)
 {
@@ -2823,12 +2849,15 @@ static bool trans_Tcc_i_v9(DisasContext *dc, arg_Tcc_i_v9 *a)
     return do_tcc(dc, a->cond, a->cc, a->rs1, true, a->i);
 }
 
-static bool trans_STBAR(DisasContext *dc, arg_STBAR *a)
+static bool do_stbar(DisasContext *dc)
 {
     tcg_gen_mb(TCG_MO_ST_ST | TCG_BAR_SC);
     return advance_pc(dc);
 }
 
+TRANS(STBAR_v8, 32, do_stbar)
+TRANS(STBAR_v9, 64, do_stbar)
+
 static bool trans_MEMBAR(DisasContext *dc, arg_MEMBAR *a)
 {
     if (avail_32(dc)) {
@@ -2860,18 +2889,8 @@ static TCGv do_rdy(DisasContext *dc, TCGv dst)
     return cpu_y;
 }
 
-static bool trans_RDY(DisasContext *dc, arg_RDY *a)
-{
-    /*
-     * TODO: Need a feature bit for sparcv8.  In the meantime, treat all
-     * 32-bit cpus like sparcv7, which ignores the rs1 field.
-     * This matches after all other ASR, so Leon3 Asr17 is handled first.
-     */
-    if (avail_64(dc) && a->rs1 != 0) {
-        return false;
-    }
-    return do_rd_special(dc, true, a->rd, do_rdy);
-}
+TRANS(RDY_v7, 32, do_rd_special, true, a->rd, do_rdy)
+TRANS(RDY_v9, 64, do_rd_special, true, a->rd, do_rdy)
 
 static TCGv do_rd_leon3_config(DisasContext *dc, TCGv dst)
 {
@@ -3256,8 +3275,7 @@ static bool do_wr_special(DisasContext *dc, arg_r_r_ri *a, bool priv,
 {
     TCGv src;
 
-    /* For simplicity, we under-decoded the rs2 form. */
-    if (!a->imm && (a->rs2_or_imm & ~0x1f)) {
+    if (!check_r_r_ri(dc, a)) {
         return false;
     }
     if (!priv) {
@@ -3700,8 +3718,7 @@ static bool do_arith_int(DisasContext *dc, arg_r_r_ri_cc *a,
 {
     TCGv dst, src1;
 
-    /* For simplicity, we under-decoded the rs2 form. */
-    if (!a->imm && a->rs2_or_imm & ~0x1f) {
+    if (!check_r_r_ri_cc(dc, a)) {
         return false;
     }
 
@@ -3785,11 +3802,11 @@ static bool trans_OR(DisasContext *dc, arg_r_r_ri_cc *a)
 {
     /* OR with %g0 is the canonical alias for MOV. */
     if (!a->cc && a->rs1 == 0) {
+        if (!check_r_r_ri_cc(dc, a)) {
+            return false;
+        }
         if (a->imm || a->rs2_or_imm == 0) {
             gen_store_gpr(dc, a->rd, tcg_constant_tl(a->rs2_or_imm));
-        } else if (a->rs2_or_imm & ~0x1f) {
-            /* For simplicity, we under-decoded the rs2 form. */
-            return false;
         } else {
             gen_store_gpr(dc, a->rd, cpu_regs[a->rs2_or_imm]);
         }
@@ -3806,8 +3823,7 @@ static bool trans_UDIV(DisasContext *dc, arg_r_r_ri *a)
     if (!avail_DIV(dc)) {
         return false;
     }
-    /* For simplicity, we under-decoded the rs2 form. */
-    if (!a->imm && a->rs2_or_imm & ~0x1f) {
+    if (!check_r_r_ri(dc, a)) {
         return false;
     }
 
@@ -3858,8 +3874,7 @@ static bool trans_UDIVX(DisasContext *dc, arg_r_r_ri *a)
     if (!avail_64(dc)) {
         return false;
     }
-    /* For simplicity, we under-decoded the rs2 form. */
-    if (!a->imm && a->rs2_or_imm & ~0x1f) {
+    if (!check_r_r_ri(dc, a)) {
         return false;
     }
 
@@ -3896,8 +3911,7 @@ static bool trans_SDIVX(DisasContext *dc, arg_r_r_ri *a)
     if (!avail_64(dc)) {
         return false;
     }
-    /* For simplicity, we under-decoded the rs2 form. */
-    if (!a->imm && a->rs2_or_imm & ~0x1f) {
+    if (!check_r_r_ri(dc, a)) {
         return false;
     }
 
@@ -4193,8 +4207,7 @@ TRANS(SRA_i, ALL, do_shift_i, a, false, false)
 
 static TCGv gen_rs2_or_imm(DisasContext *dc, bool imm, int rs2_or_imm)
 {
-    /* For simplicity, we under-decoded the rs2 form. */
-    if (!imm && rs2_or_imm & ~0x1f) {
+    if (!imm && !check_rs2(dc, &rs2_or_imm)) {
         return NULL;
     }
     if (imm || rs2_or_imm == 0) {
@@ -4257,8 +4270,7 @@ static bool do_add_special(DisasContext *dc, arg_r_r_ri *a,
 {
     TCGv src1, sum;
 
-    /* For simplicity, we under-decoded the rs2 form. */
-    if (!a->imm && a->rs2_or_imm & ~0x1f) {
+    if (!check_r_r_ri(dc, a)) {
         return false;
     }
 
@@ -4376,8 +4388,7 @@ static TCGv gen_ldst_addr(DisasContext *dc, int rs1, bool imm, int rs2_or_imm)
 {
     TCGv addr, tmp = NULL;
 
-    /* For simplicity, we under-decoded the rs2 form. */
-    if (!imm && rs2_or_imm & ~0x1f) {
+    if (!imm && !check_rs2(dc, &rs2_or_imm)) {
         return NULL;
     }
 
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index 3b088b7bd9..caf79c742d 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -1668,7 +1668,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
         ldst->oi = oi;
         ldst->addr_reg = addr_reg;
 
-        /* Load cpu->neg.tlb.f[mmu_idx].{mask,table} into {tmp0,tmp1}. */
+        /* Load CPUTLBDescFast.{mask,table} into {tmp0,tmp1}. */
         QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
         QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 8);
         tcg_out_insn(s, 3314, LDP, TCG_REG_TMP0, TCG_REG_TMP1, TCG_AREG0,
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index 338c57b061..87ca66bb02 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -1421,7 +1421,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
         ldst->oi = oi;
         ldst->addr_reg = addr;
 
-        /* Load cpu->neg.tlb.f[mmu_idx].{mask,table} into {r0,r1}.  */
+        /* Load CPUTLBDescFast.{mask,table} into {r0,r1}.  */
         QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
         QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4);
         tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 3638ab9fea..f69702b26e 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -1568,9 +1568,10 @@ static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op)
             return fold_and(ctx, op);
         }
         if (fv == -1 && TCG_TARGET_HAS_orc_vec) {
+            TCGArg ta = op->args[2];
             op->opc = INDEX_op_orc_vec;
             op->args[2] = op->args[1];
-            op->args[1] = op->args[3];
+            op->args[1] = ta;
             return fold_orc(ctx, op);
         }
     }
diff --git a/tcg/tcg.c b/tcg/tcg.c
index afac55a203..294762c283 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -425,7 +425,8 @@ static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
 static int __attribute__((unused))
 tlb_mask_table_ofs(TCGContext *s, int which)
 {
-    return (offsetof(CPUNegativeOffsetState, tlb.f[which]) -
+    int fi = mmuidx_to_fast_index(which);
+    return (offsetof(CPUNegativeOffsetState, tlb.f[fi]) -
             sizeof(CPUNegativeOffsetState));
 }
 
diff --git a/tests/tcg/multiarch/Makefile.target b/tests/tcg/multiarch/Makefile.target
index 8dc65d7a06..f5b4d2b813 100644
--- a/tests/tcg/multiarch/Makefile.target
+++ b/tests/tcg/multiarch/Makefile.target
@@ -46,6 +46,8 @@ vma-pthread: LDFLAGS+=-pthread
 sigreturn-sigmask: CFLAGS+=-pthread
 sigreturn-sigmask: LDFLAGS+=-pthread
 
+tb-link: LDFLAGS+=-lpthread
+
 # GCC versions 12/13/14/15 at least incorrectly complain about
 # "'SHA1Transform' reading 64 bytes from a region of size 0"; see the gcc bug
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106709
diff --git a/tests/tcg/multiarch/tb-link.c b/tests/tcg/multiarch/tb-link.c
new file mode 100644
index 0000000000..4e40306fa1
--- /dev/null
+++ b/tests/tcg/multiarch/tb-link.c
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Verify that a single TB spin-loop is properly invalidated,
+ * releasing the thread from the spin-loop.
+ */
+
+#include <assert.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sched.h>
+
+
+#ifdef __x86_64__
+#define READY   0x000047c6      /* movb $0,0(%rdi) */
+#define LOOP    0xfceb9090      /* 1: nop*2; jmp 1b */
+#define RETURN  0x909090c3      /* ret; nop*3 */
+#define NOP     0x90909090      /* nop*4 */
+#elif defined(__aarch64__)
+#define READY   0x3900001f      /* strb wzr,[x0] */
+#define LOOP    0x14000000      /* b . */
+#define RETURN  0xd65f03c0      /* ret */
+#define NOP     0xd503201f      /* nop */
+#elif defined(__riscv)
+#define READY   0x00050023      /* sb zero, (a0) */
+#define LOOP    0x0000006f      /* jal zero, #0 */
+#define RETURN  0x00008067      /* jalr zero, ra, 0 */
+#define NOP     0x00000013      /* nop */
+#endif
+
+
+int main()
+{
+#ifdef READY
+    int tmp;
+    pthread_t thread_id;
+    bool hold = true;
+    uint32_t *buf;
+
+    buf = mmap(NULL, 3 * sizeof(uint32_t),
+               PROT_READ | PROT_WRITE | PROT_EXEC,
+               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    assert(buf != MAP_FAILED);
+
+    buf[0] = READY;
+    buf[1] = LOOP;
+    buf[2] = RETURN;
+
+    alarm(2);
+
+    tmp = pthread_create(&thread_id, NULL, (void *(*)(void *))buf, &hold);
+    assert(tmp == 0);
+
+    while (hold) {
+        sched_yield();
+    }
+
+    buf[1] = NOP;
+    __builtin___clear_cache(&buf[1], &buf[2]);
+
+    tmp = pthread_join(thread_id, NULL);
+    assert(tmp == 0);
+#endif
+    return 0;
+}