summary refs log tree commit diff stats
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--accel/tcg/cputlb.c7
-rw-r--r--accel/tcg/internal.h81
-rw-r--r--accel/tcg/tb-maint.c994
-rw-r--r--accel/tcg/trace-events4
-rw-r--r--accel/tcg/translate-all.c811
-rw-r--r--accel/tcg/user-exec.c658
-rw-r--r--include/exec/exec-all.h43
-rw-r--r--include/exec/translate-all.h6
-rw-r--r--include/qemu/interval-tree.h99
-rw-r--r--tests/tcg/multiarch/test-vma.c22
-rw-r--r--tests/unit/meson.build1
-rw-r--r--tests/unit/test-interval-tree.c209
-rw-r--r--trace-events4
-rw-r--r--util/interval-tree.c882
-rw-r--r--util/meson.build1
15 files changed, 2661 insertions, 1161 deletions
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 6f1c00682b..03674d598f 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -33,7 +33,7 @@
 #include "qemu/atomic.h"
 #include "qemu/atomic128.h"
 #include "exec/translate-all.h"
-#include "trace/trace-root.h"
+#include "trace.h"
 #include "tb-hash.h"
 #include "internal.h"
 #ifdef CONFIG_PLUGIN
@@ -1508,10 +1508,7 @@ static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
     trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
 
     if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
-        struct page_collection *pages
-            = page_collection_lock(ram_addr, ram_addr + size);
-        tb_invalidate_phys_page_fast(pages, ram_addr, size, retaddr);
-        page_collection_unlock(pages);
+        tb_invalidate_phys_range_fast(ram_addr, size, retaddr);
     }
 
     /*
diff --git a/accel/tcg/internal.h b/accel/tcg/internal.h
index cb13bade4f..6edff16fb0 100644
--- a/accel/tcg/internal.h
+++ b/accel/tcg/internal.h
@@ -23,83 +23,28 @@
 #define assert_memory_lock() tcg_debug_assert(have_mmap_lock())
 #endif
 
-typedef struct PageDesc {
-    /* list of TBs intersecting this ram page */
-    uintptr_t first_tb;
-#ifdef CONFIG_USER_ONLY
-    unsigned long flags;
-    void *target_data;
-#endif
-#ifdef CONFIG_SOFTMMU
-    QemuSpin lock;
+#if defined(CONFIG_SOFTMMU) && defined(CONFIG_DEBUG_TCG)
+void assert_no_pages_locked(void);
+#else
+static inline void assert_no_pages_locked(void) { }
 #endif
-} PageDesc;
-
-/* Size of the L2 (and L3, etc) page tables.  */
-#define V_L2_BITS 10
-#define V_L2_SIZE (1 << V_L2_BITS)
-
-/*
- * L1 Mapping properties
- */
-extern int v_l1_size;
-extern int v_l1_shift;
-extern int v_l2_levels;
-
-/*
- * The bottom level has pointers to PageDesc, and is indexed by
- * anything from 4 to (V_L2_BITS + 3) bits, depending on target page size.
- */
-#define V_L1_MIN_BITS 4
-#define V_L1_MAX_BITS (V_L2_BITS + 3)
-#define V_L1_MAX_SIZE (1 << V_L1_MAX_BITS)
-
-extern void *l1_map[V_L1_MAX_SIZE];
 
-PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc);
-
-static inline PageDesc *page_find(tb_page_addr_t index)
-{
-    return page_find_alloc(index, false);
-}
-
-/* list iterators for lists of tagged pointers in TranslationBlock */
-#define TB_FOR_EACH_TAGGED(head, tb, n, field)                          \
-    for (n = (head) & 1, tb = (TranslationBlock *)((head) & ~1);        \
-         tb; tb = (TranslationBlock *)tb->field[n], n = (uintptr_t)tb & 1, \
-             tb = (TranslationBlock *)((uintptr_t)tb & ~1))
-
-#define PAGE_FOR_EACH_TB(pagedesc, tb, n)                       \
-    TB_FOR_EACH_TAGGED((pagedesc)->first_tb, tb, n, page_next)
-
-#define TB_FOR_EACH_JMP(head_tb, tb, n)                                 \
-    TB_FOR_EACH_TAGGED((head_tb)->jmp_list_head, tb, n, jmp_list_next)
-
-/* In user-mode page locks aren't used; mmap_lock is enough */
 #ifdef CONFIG_USER_ONLY
-#define assert_page_locked(pd) tcg_debug_assert(have_mmap_lock())
-static inline void page_lock(PageDesc *pd) { }
-static inline void page_unlock(PageDesc *pd) { }
-#else
-#ifdef CONFIG_DEBUG_TCG
-void do_assert_page_locked(const PageDesc *pd, const char *file, int line);
-#define assert_page_locked(pd) do_assert_page_locked(pd, __FILE__, __LINE__)
+static inline void page_table_config_init(void) { }
 #else
-#define assert_page_locked(pd)
-#endif
-void page_lock(PageDesc *pd);
-void page_unlock(PageDesc *pd);
-#endif
-#if !defined(CONFIG_USER_ONLY) && defined(CONFIG_DEBUG_TCG)
-void assert_no_pages_locked(void);
-#else
-static inline void assert_no_pages_locked(void) { }
+void page_table_config_init(void);
 #endif
 
+#ifdef CONFIG_SOFTMMU
+void tb_invalidate_phys_range_fast(ram_addr_t ram_addr,
+                                   unsigned size,
+                                   uintptr_t retaddr);
+G_NORETURN void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr);
+#endif /* CONFIG_SOFTMMU */
+
 TranslationBlock *tb_gen_code(CPUState *cpu, target_ulong pc,
                               target_ulong cs_base, uint32_t flags,
                               int cflags);
-G_NORETURN void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr);
 void page_init(void);
 void tb_htable_init(void);
 void tb_reset_jump(TranslationBlock *tb, int n);
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
index 0cdb35548c..1b8e860647 100644
--- a/accel/tcg/tb-maint.c
+++ b/accel/tcg/tb-maint.c
@@ -18,6 +18,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/interval-tree.h"
 #include "exec/cputlb.h"
 #include "exec/log.h"
 #include "exec/exec-all.h"
@@ -29,6 +30,15 @@
 #include "internal.h"
 
 
+/* List iterators for lists of tagged pointers in TranslationBlock. */
+#define TB_FOR_EACH_TAGGED(head, tb, n, field)                          \
+    for (n = (head) & 1, tb = (TranslationBlock *)((head) & ~1);        \
+         tb; tb = (TranslationBlock *)tb->field[n], n = (uintptr_t)tb & 1, \
+             tb = (TranslationBlock *)((uintptr_t)tb & ~1))
+
+#define TB_FOR_EACH_JMP(head_tb, tb, n)                                 \
+    TB_FOR_EACH_TAGGED((head_tb)->jmp_list_head, tb, n, jmp_list_next)
+
 static bool tb_cmp(const void *ap, const void *bp)
 {
     const TranslationBlock *a = ap;
@@ -50,8 +60,513 @@ void tb_htable_init(void)
     qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE, mode);
 }
 
+typedef struct PageDesc PageDesc;
+
+#ifdef CONFIG_USER_ONLY
+
+/*
+ * In user-mode page locks aren't used; mmap_lock is enough.
+ */
+#define assert_page_locked(pd) tcg_debug_assert(have_mmap_lock())
+
+static inline void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
+                                  PageDesc **ret_p2, tb_page_addr_t phys2,
+                                  bool alloc)
+{
+    *ret_p1 = NULL;
+    *ret_p2 = NULL;
+}
+
+static inline void page_unlock(PageDesc *pd) { }
+static inline void page_lock_tb(const TranslationBlock *tb) { }
+static inline void page_unlock_tb(const TranslationBlock *tb) { }
+
+/*
+ * For user-only, since we are protecting all of memory with a single lock,
+ * and because the two pages of a TranslationBlock are always contiguous,
+ * use a single data structure to record all TranslationBlocks.
+ */
+static IntervalTreeRoot tb_root;
+
+static void tb_remove_all(void)
+{
+    assert_memory_lock();
+    memset(&tb_root, 0, sizeof(tb_root));
+}
+
+/* Call with mmap_lock held. */
+static void tb_record(TranslationBlock *tb, PageDesc *p1, PageDesc *p2)
+{
+    target_ulong addr;
+    int flags;
+
+    assert_memory_lock();
+    tb->itree.last = tb->itree.start + tb->size - 1;
+
+    /* translator_loop() must have made all TB pages non-writable */
+    addr = tb_page_addr0(tb);
+    flags = page_get_flags(addr);
+    assert(!(flags & PAGE_WRITE));
+
+    addr = tb_page_addr1(tb);
+    if (addr != -1) {
+        flags = page_get_flags(addr);
+        assert(!(flags & PAGE_WRITE));
+    }
+
+    interval_tree_insert(&tb->itree, &tb_root);
+}
+
+/* Call with mmap_lock held. */
+static void tb_remove(TranslationBlock *tb)
+{
+    assert_memory_lock();
+    interval_tree_remove(&tb->itree, &tb_root);
+}
+
+/* TODO: For now, still shared with translate-all.c for system mode. */
+#define PAGE_FOR_EACH_TB(start, end, pagedesc, T, N)    \
+    for (T = foreach_tb_first(start, end),              \
+         N = foreach_tb_next(T, start, end);            \
+         T != NULL;                                     \
+         T = N, N = foreach_tb_next(N, start, end))
+
+typedef TranslationBlock *PageForEachNext;
+
+static PageForEachNext foreach_tb_first(tb_page_addr_t start,
+                                        tb_page_addr_t end)
+{
+    IntervalTreeNode *n = interval_tree_iter_first(&tb_root, start, end - 1);
+    return n ? container_of(n, TranslationBlock, itree) : NULL;
+}
+
+static PageForEachNext foreach_tb_next(PageForEachNext tb,
+                                       tb_page_addr_t start,
+                                       tb_page_addr_t end)
+{
+    IntervalTreeNode *n;
+
+    if (tb) {
+        n = interval_tree_iter_next(&tb->itree, start, end - 1);
+        if (n) {
+            return container_of(n, TranslationBlock, itree);
+        }
+    }
+    return NULL;
+}
+
+#else
+/*
+ * In system mode we want L1_MAP to be based on ram offsets.
+ */
+#if HOST_LONG_BITS < TARGET_PHYS_ADDR_SPACE_BITS
+# define L1_MAP_ADDR_SPACE_BITS  HOST_LONG_BITS
+#else
+# define L1_MAP_ADDR_SPACE_BITS  TARGET_PHYS_ADDR_SPACE_BITS
+#endif
+
+/* Size of the L2 (and L3, etc) page tables.  */
+#define V_L2_BITS 10
+#define V_L2_SIZE (1 << V_L2_BITS)
+
+/*
+ * L1 Mapping properties
+ */
+static int v_l1_size;
+static int v_l1_shift;
+static int v_l2_levels;
+
+/*
+ * The bottom level has pointers to PageDesc, and is indexed by
+ * anything from 4 to (V_L2_BITS + 3) bits, depending on target page size.
+ */
+#define V_L1_MIN_BITS 4
+#define V_L1_MAX_BITS (V_L2_BITS + 3)
+#define V_L1_MAX_SIZE (1 << V_L1_MAX_BITS)
+
+static void *l1_map[V_L1_MAX_SIZE];
+
+struct PageDesc {
+    QemuSpin lock;
+    /* list of TBs intersecting this ram page */
+    uintptr_t first_tb;
+};
+
+void page_table_config_init(void)
+{
+    uint32_t v_l1_bits;
+
+    assert(TARGET_PAGE_BITS);
+    /* The bits remaining after N lower levels of page tables.  */
+    v_l1_bits = (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % V_L2_BITS;
+    if (v_l1_bits < V_L1_MIN_BITS) {
+        v_l1_bits += V_L2_BITS;
+    }
+
+    v_l1_size = 1 << v_l1_bits;
+    v_l1_shift = L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - v_l1_bits;
+    v_l2_levels = v_l1_shift / V_L2_BITS - 1;
+
+    assert(v_l1_bits <= V_L1_MAX_BITS);
+    assert(v_l1_shift % V_L2_BITS == 0);
+    assert(v_l2_levels >= 0);
+}
+
+static PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc)
+{
+    PageDesc *pd;
+    void **lp;
+    int i;
+
+    /* Level 1.  Always allocated.  */
+    lp = l1_map + ((index >> v_l1_shift) & (v_l1_size - 1));
+
+    /* Level 2..N-1.  */
+    for (i = v_l2_levels; i > 0; i--) {
+        void **p = qatomic_rcu_read(lp);
+
+        if (p == NULL) {
+            void *existing;
+
+            if (!alloc) {
+                return NULL;
+            }
+            p = g_new0(void *, V_L2_SIZE);
+            existing = qatomic_cmpxchg(lp, NULL, p);
+            if (unlikely(existing)) {
+                g_free(p);
+                p = existing;
+            }
+        }
+
+        lp = p + ((index >> (i * V_L2_BITS)) & (V_L2_SIZE - 1));
+    }
+
+    pd = qatomic_rcu_read(lp);
+    if (pd == NULL) {
+        void *existing;
+
+        if (!alloc) {
+            return NULL;
+        }
+
+        pd = g_new0(PageDesc, V_L2_SIZE);
+        for (int i = 0; i < V_L2_SIZE; i++) {
+            qemu_spin_init(&pd[i].lock);
+        }
+
+        existing = qatomic_cmpxchg(lp, NULL, pd);
+        if (unlikely(existing)) {
+            for (int i = 0; i < V_L2_SIZE; i++) {
+                qemu_spin_destroy(&pd[i].lock);
+            }
+            g_free(pd);
+            pd = existing;
+        }
+    }
+
+    return pd + (index & (V_L2_SIZE - 1));
+}
+
+static inline PageDesc *page_find(tb_page_addr_t index)
+{
+    return page_find_alloc(index, false);
+}
+
+/**
+ * struct page_entry - page descriptor entry
+ * @pd:     pointer to the &struct PageDesc of the page this entry represents
+ * @index:  page index of the page
+ * @locked: whether the page is locked
+ *
+ * This struct helps us keep track of the locked state of a page, without
+ * bloating &struct PageDesc.
+ *
+ * A page lock protects accesses to all fields of &struct PageDesc.
+ *
+ * See also: &struct page_collection.
+ */
+struct page_entry {
+    PageDesc *pd;
+    tb_page_addr_t index;
+    bool locked;
+};
+
+/**
+ * struct page_collection - tracks a set of pages (i.e. &struct page_entry's)
+ * @tree:   Binary search tree (BST) of the pages, with key == page index
+ * @max:    Pointer to the page in @tree with the highest page index
+ *
+ * To avoid deadlock we lock pages in ascending order of page index.
+ * When operating on a set of pages, we need to keep track of them so that
+ * we can lock them in order and also unlock them later. For this we collect
+ * pages (i.e. &struct page_entry's) in a binary search @tree. Given that the
+ * @tree implementation we use does not provide an O(1) operation to obtain the
+ * highest-ranked element, we use @max to keep track of the inserted page
+ * with the highest index. This is valuable because if a page is not in
+ * the tree and its index is higher than @max's, then we can lock it
+ * without breaking the locking order rule.
+ *
+ * Note on naming: 'struct page_set' would be shorter, but we already have a few
+ * page_set_*() helpers, so page_collection is used instead to avoid confusion.
+ *
+ * See also: page_collection_lock().
+ */
+struct page_collection {
+    GTree *tree;
+    struct page_entry *max;
+};
+
+typedef int PageForEachNext;
+#define PAGE_FOR_EACH_TB(start, end, pagedesc, tb, n) \
+    TB_FOR_EACH_TAGGED((pagedesc)->first_tb, tb, n, page_next)
+
+#ifdef CONFIG_DEBUG_TCG
+
+static __thread GHashTable *ht_pages_locked_debug;
+
+static void ht_pages_locked_debug_init(void)
+{
+    if (ht_pages_locked_debug) {
+        return;
+    }
+    ht_pages_locked_debug = g_hash_table_new(NULL, NULL);
+}
+
+static bool page_is_locked(const PageDesc *pd)
+{
+    PageDesc *found;
+
+    ht_pages_locked_debug_init();
+    found = g_hash_table_lookup(ht_pages_locked_debug, pd);
+    return !!found;
+}
+
+static void page_lock__debug(PageDesc *pd)
+{
+    ht_pages_locked_debug_init();
+    g_assert(!page_is_locked(pd));
+    g_hash_table_insert(ht_pages_locked_debug, pd, pd);
+}
+
+static void page_unlock__debug(const PageDesc *pd)
+{
+    bool removed;
+
+    ht_pages_locked_debug_init();
+    g_assert(page_is_locked(pd));
+    removed = g_hash_table_remove(ht_pages_locked_debug, pd);
+    g_assert(removed);
+}
+
+static void do_assert_page_locked(const PageDesc *pd,
+                                  const char *file, int line)
+{
+    if (unlikely(!page_is_locked(pd))) {
+        error_report("assert_page_lock: PageDesc %p not locked @ %s:%d",
+                     pd, file, line);
+        abort();
+    }
+}
+#define assert_page_locked(pd) do_assert_page_locked(pd, __FILE__, __LINE__)
+
+void assert_no_pages_locked(void)
+{
+    ht_pages_locked_debug_init();
+    g_assert(g_hash_table_size(ht_pages_locked_debug) == 0);
+}
+
+#else /* !CONFIG_DEBUG_TCG */
+
+static inline void page_lock__debug(const PageDesc *pd) { }
+static inline void page_unlock__debug(const PageDesc *pd) { }
+static inline void assert_page_locked(const PageDesc *pd) { }
+
+#endif /* CONFIG_DEBUG_TCG */
+
+static void page_lock(PageDesc *pd)
+{
+    page_lock__debug(pd);
+    qemu_spin_lock(&pd->lock);
+}
+
+static void page_unlock(PageDesc *pd)
+{
+    qemu_spin_unlock(&pd->lock);
+    page_unlock__debug(pd);
+}
+
+static inline struct page_entry *
+page_entry_new(PageDesc *pd, tb_page_addr_t index)
+{
+    struct page_entry *pe = g_malloc(sizeof(*pe));
+
+    pe->index = index;
+    pe->pd = pd;
+    pe->locked = false;
+    return pe;
+}
+
+static void page_entry_destroy(gpointer p)
+{
+    struct page_entry *pe = p;
+
+    g_assert(pe->locked);
+    page_unlock(pe->pd);
+    g_free(pe);
+}
+
+/* returns false on success */
+static bool page_entry_trylock(struct page_entry *pe)
+{
+    bool busy;
+
+    busy = qemu_spin_trylock(&pe->pd->lock);
+    if (!busy) {
+        g_assert(!pe->locked);
+        pe->locked = true;
+        page_lock__debug(pe->pd);
+    }
+    return busy;
+}
+
+static void do_page_entry_lock(struct page_entry *pe)
+{
+    page_lock(pe->pd);
+    g_assert(!pe->locked);
+    pe->locked = true;
+}
+
+static gboolean page_entry_lock(gpointer key, gpointer value, gpointer data)
+{
+    struct page_entry *pe = value;
+
+    do_page_entry_lock(pe);
+    return FALSE;
+}
+
+static gboolean page_entry_unlock(gpointer key, gpointer value, gpointer data)
+{
+    struct page_entry *pe = value;
+
+    if (pe->locked) {
+        pe->locked = false;
+        page_unlock(pe->pd);
+    }
+    return FALSE;
+}
+
+/*
+ * Trylock a page, and if successful, add the page to a collection.
+ * Returns true ("busy") if the page could not be locked; false otherwise.
+ */
+static bool page_trylock_add(struct page_collection *set, tb_page_addr_t addr)
+{
+    tb_page_addr_t index = addr >> TARGET_PAGE_BITS;
+    struct page_entry *pe;
+    PageDesc *pd;
+
+    pe = g_tree_lookup(set->tree, &index);
+    if (pe) {
+        return false;
+    }
+
+    pd = page_find(index);
+    if (pd == NULL) {
+        return false;
+    }
+
+    pe = page_entry_new(pd, index);
+    g_tree_insert(set->tree, &pe->index, pe);
+
+    /*
+     * If this is either (1) the first insertion or (2) a page whose index
+     * is higher than any other so far, just lock the page and move on.
+     */
+    if (set->max == NULL || pe->index > set->max->index) {
+        set->max = pe;
+        do_page_entry_lock(pe);
+        return false;
+    }
+    /*
+     * Try to acquire out-of-order lock; if busy, return busy so that we acquire
+     * locks in order.
+     */
+    return page_entry_trylock(pe);
+}
+
+static gint tb_page_addr_cmp(gconstpointer ap, gconstpointer bp, gpointer udata)
+{
+    tb_page_addr_t a = *(const tb_page_addr_t *)ap;
+    tb_page_addr_t b = *(const tb_page_addr_t *)bp;
+
+    if (a == b) {
+        return 0;
+    } else if (a < b) {
+        return -1;
+    }
+    return 1;
+}
+
+/*
+ * Lock a range of pages ([@start,@end[) as well as the pages of all
+ * intersecting TBs.
+ * Locking order: acquire locks in ascending order of page index.
+ */
+static struct page_collection *page_collection_lock(tb_page_addr_t start,
+                                                    tb_page_addr_t end)
+{
+    struct page_collection *set = g_malloc(sizeof(*set));
+    tb_page_addr_t index;
+    PageDesc *pd;
+
+    start >>= TARGET_PAGE_BITS;
+    end   >>= TARGET_PAGE_BITS;
+    g_assert(start <= end);
+
+    set->tree = g_tree_new_full(tb_page_addr_cmp, NULL, NULL,
+                                page_entry_destroy);
+    set->max = NULL;
+    assert_no_pages_locked();
+
+ retry:
+    g_tree_foreach(set->tree, page_entry_lock, NULL);
+
+    for (index = start; index <= end; index++) {
+        TranslationBlock *tb;
+        PageForEachNext n;
+
+        pd = page_find(index);
+        if (pd == NULL) {
+            continue;
+        }
+        if (page_trylock_add(set, index << TARGET_PAGE_BITS)) {
+            g_tree_foreach(set->tree, page_entry_unlock, NULL);
+            goto retry;
+        }
+        assert_page_locked(pd);
+        PAGE_FOR_EACH_TB(unused, unused, pd, tb, n) {
+            if (page_trylock_add(set, tb_page_addr0(tb)) ||
+                (tb_page_addr1(tb) != -1 &&
+                 page_trylock_add(set, tb_page_addr1(tb)))) {
+                /* drop all locks, and reacquire in order */
+                g_tree_foreach(set->tree, page_entry_unlock, NULL);
+                goto retry;
+            }
+        }
+    }
+    return set;
+}
+
+static void page_collection_unlock(struct page_collection *set)
+{
+    /* entries are unlocked and freed via page_entry_destroy */
+    g_tree_destroy(set->tree);
+    g_free(set);
+}
+
 /* Set to NULL all the 'first_tb' fields in all PageDescs. */
-static void page_flush_tb_1(int level, void **lp)
+static void tb_remove_all_1(int level, void **lp)
 {
     int i;
 
@@ -70,20 +585,144 @@ static void page_flush_tb_1(int level, void **lp)
         void **pp = *lp;
 
         for (i = 0; i < V_L2_SIZE; ++i) {
-            page_flush_tb_1(level - 1, pp + i);
+            tb_remove_all_1(level - 1, pp + i);
         }
     }
 }
 
-static void page_flush_tb(void)
+static void tb_remove_all(void)
 {
     int i, l1_sz = v_l1_size;
 
     for (i = 0; i < l1_sz; i++) {
-        page_flush_tb_1(v_l2_levels, l1_map + i);
+        tb_remove_all_1(v_l2_levels, l1_map + i);
+    }
+}
+
+/*
+ * Add the tb in the target page and protect it if necessary.
+ * Called with @p->lock held.
+ */
+static inline void tb_page_add(PageDesc *p, TranslationBlock *tb,
+                               unsigned int n)
+{
+    bool page_already_protected;
+
+    assert_page_locked(p);
+
+    tb->page_next[n] = p->first_tb;
+    page_already_protected = p->first_tb != 0;
+    p->first_tb = (uintptr_t)tb | n;
+
+    /*
+     * If some code is already present, then the pages are already
+     * protected. So we handle the case where only the first TB is
+     * allocated in a physical page.
+     */
+    if (!page_already_protected) {
+        tlb_protect_code(tb->page_addr[n] & TARGET_PAGE_MASK);
+    }
+}
+
+static void tb_record(TranslationBlock *tb, PageDesc *p1, PageDesc *p2)
+{
+    tb_page_add(p1, tb, 0);
+    if (unlikely(p2)) {
+        tb_page_add(p2, tb, 1);
+    }
+}
+
+static inline void tb_page_remove(PageDesc *pd, TranslationBlock *tb)
+{
+    TranslationBlock *tb1;
+    uintptr_t *pprev;
+    PageForEachNext n1;
+
+    assert_page_locked(pd);
+    pprev = &pd->first_tb;
+    PAGE_FOR_EACH_TB(unused, unused, pd, tb1, n1) {
+        if (tb1 == tb) {
+            *pprev = tb1->page_next[n1];
+            return;
+        }
+        pprev = &tb1->page_next[n1];
     }
+    g_assert_not_reached();
 }
 
+static void tb_remove(TranslationBlock *tb)
+{
+    PageDesc *pd;
+
+    pd = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
+    tb_page_remove(pd, tb);
+    if (unlikely(tb->page_addr[1] != -1)) {
+        pd = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
+        tb_page_remove(pd, tb);
+    }
+}
+
+static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
+                           PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc)
+{
+    PageDesc *p1, *p2;
+    tb_page_addr_t page1;
+    tb_page_addr_t page2;
+
+    assert_memory_lock();
+    g_assert(phys1 != -1);
+
+    page1 = phys1 >> TARGET_PAGE_BITS;
+    page2 = phys2 >> TARGET_PAGE_BITS;
+
+    p1 = page_find_alloc(page1, alloc);
+    if (ret_p1) {
+        *ret_p1 = p1;
+    }
+    if (likely(phys2 == -1)) {
+        page_lock(p1);
+        return;
+    } else if (page1 == page2) {
+        page_lock(p1);
+        if (ret_p2) {
+            *ret_p2 = p1;
+        }
+        return;
+    }
+    p2 = page_find_alloc(page2, alloc);
+    if (ret_p2) {
+        *ret_p2 = p2;
+    }
+    if (page1 < page2) {
+        page_lock(p1);
+        page_lock(p2);
+    } else {
+        page_lock(p2);
+        page_lock(p1);
+    }
+}
+
+/* lock the page(s) of a TB in the correct acquisition order */
+static void page_lock_tb(const TranslationBlock *tb)
+{
+    page_lock_pair(NULL, tb_page_addr0(tb), NULL, tb_page_addr1(tb), false);
+}
+
+static void page_unlock_tb(const TranslationBlock *tb)
+{
+    PageDesc *p1 = page_find(tb_page_addr0(tb) >> TARGET_PAGE_BITS);
+
+    page_unlock(p1);
+    if (unlikely(tb_page_addr1(tb) != -1)) {
+        PageDesc *p2 = page_find(tb_page_addr1(tb) >> TARGET_PAGE_BITS);
+
+        if (p2 != p1) {
+            page_unlock(p2);
+        }
+    }
+}
+#endif /* CONFIG_USER_ONLY */
+
 /* flush all the translation blocks */
 static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
 {
@@ -101,7 +740,7 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
     }
 
     qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
-    page_flush_tb();
+    tb_remove_all();
 
     tcg_region_reset_all();
     /* XXX: flush processor icache at this point if cache flush is expensive */
@@ -128,28 +767,6 @@ void tb_flush(CPUState *cpu)
     }
 }
 
-/*
- * user-mode: call with mmap_lock held
- * !user-mode: call with @pd->lock held
- */
-static inline void tb_page_remove(PageDesc *pd, TranslationBlock *tb)
-{
-    TranslationBlock *tb1;
-    uintptr_t *pprev;
-    unsigned int n1;
-
-    assert_page_locked(pd);
-    pprev = &pd->first_tb;
-    PAGE_FOR_EACH_TB(pd, tb1, n1) {
-        if (tb1 == tb) {
-            *pprev = tb1->page_next[n1];
-            return;
-        }
-        pprev = &tb1->page_next[n1];
-    }
-    g_assert_not_reached();
-}
-
 /* remove @orig from its @n_orig-th jump list */
 static inline void tb_remove_from_jmp_list(TranslationBlock *orig, int n_orig)
 {
@@ -255,7 +872,6 @@ static void tb_jmp_cache_inval_tb(TranslationBlock *tb)
  */
 static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
 {
-    PageDesc *p;
     uint32_t h;
     tb_page_addr_t phys_pc;
     uint32_t orig_cflags = tb_cflags(tb);
@@ -277,13 +893,7 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
 
     /* remove the TB from the page list */
     if (rm_from_page_list) {
-        p = page_find(phys_pc >> TARGET_PAGE_BITS);
-        tb_page_remove(p, tb);
-        phys_pc = tb_page_addr1(tb);
-        if (phys_pc != -1) {
-            p = page_find(phys_pc >> TARGET_PAGE_BITS);
-            tb_page_remove(p, tb);
-        }
+        tb_remove(tb);
     }
 
     /* remove the TB from the hash list */
@@ -307,71 +917,6 @@ static void tb_phys_invalidate__locked(TranslationBlock *tb)
     qemu_thread_jit_execute();
 }
 
-static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
-                           PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc)
-{
-    PageDesc *p1, *p2;
-    tb_page_addr_t page1;
-    tb_page_addr_t page2;
-
-    assert_memory_lock();
-    g_assert(phys1 != -1);
-
-    page1 = phys1 >> TARGET_PAGE_BITS;
-    page2 = phys2 >> TARGET_PAGE_BITS;
-
-    p1 = page_find_alloc(page1, alloc);
-    if (ret_p1) {
-        *ret_p1 = p1;
-    }
-    if (likely(phys2 == -1)) {
-        page_lock(p1);
-        return;
-    } else if (page1 == page2) {
-        page_lock(p1);
-        if (ret_p2) {
-            *ret_p2 = p1;
-        }
-        return;
-    }
-    p2 = page_find_alloc(page2, alloc);
-    if (ret_p2) {
-        *ret_p2 = p2;
-    }
-    if (page1 < page2) {
-        page_lock(p1);
-        page_lock(p2);
-    } else {
-        page_lock(p2);
-        page_lock(p1);
-    }
-}
-
-#ifdef CONFIG_USER_ONLY
-static inline void page_lock_tb(const TranslationBlock *tb) { }
-static inline void page_unlock_tb(const TranslationBlock *tb) { }
-#else
-/* lock the page(s) of a TB in the correct acquisition order */
-static void page_lock_tb(const TranslationBlock *tb)
-{
-    page_lock_pair(NULL, tb_page_addr0(tb), NULL, tb_page_addr1(tb), false);
-}
-
-static void page_unlock_tb(const TranslationBlock *tb)
-{
-    PageDesc *p1 = page_find(tb_page_addr0(tb) >> TARGET_PAGE_BITS);
-
-    page_unlock(p1);
-    if (unlikely(tb_page_addr1(tb) != -1)) {
-        PageDesc *p2 = page_find(tb_page_addr1(tb) >> TARGET_PAGE_BITS);
-
-        if (p2 != p1) {
-            page_unlock(p2);
-        }
-    }
-}
-#endif
-
 /*
  * Invalidate one TB.
  * Called with mmap_lock held in user-mode.
@@ -388,41 +933,6 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
 }
 
 /*
- * Add the tb in the target page and protect it if necessary.
- * Called with mmap_lock held for user-mode emulation.
- * Called with @p->lock held in !user-mode.
- */
-static inline void tb_page_add(PageDesc *p, TranslationBlock *tb,
-                               unsigned int n, tb_page_addr_t page_addr)
-{
-#ifndef CONFIG_USER_ONLY
-    bool page_already_protected;
-#endif
-
-    assert_page_locked(p);
-
-    tb->page_next[n] = p->first_tb;
-#ifndef CONFIG_USER_ONLY
-    page_already_protected = p->first_tb != (uintptr_t)NULL;
-#endif
-    p->first_tb = (uintptr_t)tb | n;
-
-#if defined(CONFIG_USER_ONLY)
-    /* translator_loop() must have made all TB pages non-writable */
-    assert(!(p->flags & PAGE_WRITE));
-#else
-    /*
-     * If some code is already present, then the pages are already
-     * protected. So we handle the case where only the first TB is
-     * allocated in a physical page.
-     */
-    if (!page_already_protected) {
-        tlb_protect_code(page_addr);
-    }
-#endif
-}
-
-/*
  * Add a new TB and link it to the physical page tables. phys_page2 is
  * (-1) to indicate that only one page contains the TB.
  *
@@ -453,10 +963,7 @@ TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
      * we can only insert TBs that are fully initialized.
      */
     page_lock_pair(&p, phys_pc, &p2, phys_page2, true);
-    tb_page_add(p, tb, 0, phys_pc);
-    if (p2) {
-        tb_page_add(p2, tb, 1, phys_page2);
-    }
+    tb_record(tb, p, p2);
 
     /* add in the hash table */
     h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)),
@@ -465,10 +972,7 @@ TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
 
     /* remove TB from the page(s) if we couldn't insert it */
     if (unlikely(existing_tb)) {
-        tb_page_remove(p, tb);
-        if (p2) {
-            tb_page_remove(p2, tb);
-        }
+        tb_remove(tb);
         tb = existing_tb;
     }
 
@@ -479,10 +983,90 @@ TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
     return tb;
 }
 
+#ifdef CONFIG_USER_ONLY
+/*
+ * Invalidate all TBs which intersect with the target address range.
+ * Called with mmap_lock held for user-mode emulation.
+ * NOTE: this function must not be called while a TB is running.
+ */
+void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
+{
+    TranslationBlock *tb;
+    PageForEachNext n;
+
+    assert_memory_lock();
+
+    PAGE_FOR_EACH_TB(start, end, unused, tb, n) {
+        tb_phys_invalidate__locked(tb);
+    }
+}
+
+/*
+ * Invalidate all TBs which intersect with the target address page @addr.
+ * Called with mmap_lock held for user-mode emulation
+ * NOTE: this function must not be called while a TB is running.
+ */
+void tb_invalidate_phys_page(tb_page_addr_t addr)
+{
+    tb_page_addr_t start, end;
+
+    start = addr & TARGET_PAGE_MASK;
+    end = start + TARGET_PAGE_SIZE;
+    tb_invalidate_phys_range(start, end);
+}
+
+/*
+ * Called with mmap_lock held. If pc is not 0 then it indicates the
+ * host PC of the faulting store instruction that caused this invalidate.
+ * Returns true if the caller needs to abort execution of the current
+ * TB (because it was modified by this store and the guest CPU has
+ * precise-SMC semantics).
+ */
+bool tb_invalidate_phys_page_unwind(tb_page_addr_t addr, uintptr_t pc)
+{
+    assert(pc != 0);
+#ifdef TARGET_HAS_PRECISE_SMC
+    assert_memory_lock();
+    {
+        TranslationBlock *current_tb = tcg_tb_lookup(pc);
+        bool current_tb_modified = false;
+        TranslationBlock *tb;
+        PageForEachNext n;
+
+        addr &= TARGET_PAGE_MASK;
+
+        PAGE_FOR_EACH_TB(addr, addr + TARGET_PAGE_SIZE, unused, tb, n) {
+            if (current_tb == tb &&
+                (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
+                /*
+                 * If we are modifying the current TB, we must stop its
+                 * execution. We could be more precise by checking that
+                 * the modification is after the current PC, but it would
+                 * require a specialized function to partially restore
+                 * the CPU state.
+                 */
+                current_tb_modified = true;
+                cpu_restore_state_from_tb(current_cpu, current_tb, pc);
+            }
+            tb_phys_invalidate__locked(tb);
+        }
+
+        if (current_tb_modified) {
+            /* Force execution of one insn next time.  */
+            CPUState *cpu = current_cpu;
+            cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(current_cpu);
+            return true;
+        }
+    }
+#else
+    tb_invalidate_phys_page(addr);
+#endif /* TARGET_HAS_PRECISE_SMC */
+    return false;
+}
+#else
 /*
  * @p must be non-NULL.
- * user-mode: call with mmap_lock held.
- * !user-mode: call with all @pages locked.
+ * Call with all @pages locked.
  */
 static void
 tb_invalidate_phys_page_range__locked(struct page_collection *pages,
@@ -492,22 +1076,17 @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
 {
     TranslationBlock *tb;
     tb_page_addr_t tb_start, tb_end;
-    int n;
+    PageForEachNext n;
 #ifdef TARGET_HAS_PRECISE_SMC
-    CPUState *cpu = current_cpu;
-    bool current_tb_not_found = retaddr != 0;
     bool current_tb_modified = false;
-    TranslationBlock *current_tb = NULL;
+    TranslationBlock *current_tb = retaddr ? tcg_tb_lookup(retaddr) : NULL;
 #endif /* TARGET_HAS_PRECISE_SMC */
 
-    assert_page_locked(p);
-
     /*
      * We remove all the TBs in the range [start, end[.
      * XXX: see if in some cases it could be faster to invalidate all the code
      */
-    PAGE_FOR_EACH_TB(p, tb, n) {
-        assert_page_locked(p);
+    PAGE_FOR_EACH_TB(start, end, p, tb, n) {
         /* NOTE: this is subtle as a TB may span two physical pages */
         if (n == 0) {
             /* NOTE: tb_end may be after the end of the page, but
@@ -521,11 +1100,6 @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
         }
         if (!(tb_end <= start || tb_start >= end)) {
 #ifdef TARGET_HAS_PRECISE_SMC
-            if (current_tb_not_found) {
-                current_tb_not_found = false;
-                /* now we have a real cpu fault */
-                current_tb = tcg_tb_lookup(retaddr);
-            }
             if (current_tb == tb &&
                 (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
                 /*
@@ -536,25 +1110,25 @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
                  * restore the CPU state.
                  */
                 current_tb_modified = true;
-                cpu_restore_state_from_tb(cpu, current_tb, retaddr);
+                cpu_restore_state_from_tb(current_cpu, current_tb, retaddr);
             }
 #endif /* TARGET_HAS_PRECISE_SMC */
             tb_phys_invalidate__locked(tb);
         }
     }
-#if !defined(CONFIG_USER_ONLY)
+
     /* if no code remaining, no need to continue to use slow writes */
     if (!p->first_tb) {
         tlb_unprotect_code(start);
     }
-#endif
+
 #ifdef TARGET_HAS_PRECISE_SMC
     if (current_tb_modified) {
         page_collection_unlock(pages);
         /* Force execution of one insn next time.  */
-        cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu);
+        current_cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(current_cpu);
         mmap_unlock();
-        cpu_loop_exit_noexc(cpu);
+        cpu_loop_exit_noexc(current_cpu);
     }
 #endif
 }
@@ -562,8 +1136,6 @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
 /*
  * Invalidate all TBs which intersect with the target physical
  * address page @addr.
- *
- * Called with mmap_lock held for user-mode emulation
  */
 void tb_invalidate_phys_page(tb_page_addr_t addr)
 {
@@ -571,8 +1143,6 @@ void tb_invalidate_phys_page(tb_page_addr_t addr)
     tb_page_addr_t start, end;
     PageDesc *p;
 
-    assert_memory_lock();
-
     p = page_find(addr >> TARGET_PAGE_BITS);
     if (p == NULL) {
         return;
@@ -591,16 +1161,12 @@ void tb_invalidate_phys_page(tb_page_addr_t addr)
  * 'is_cpu_write_access' should be true if called from a real cpu write
  * access: the virtual CPU will exit the current TB if code is modified inside
  * this TB.
- *
- * Called with mmap_lock held for user-mode emulation.
  */
 void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
 {
     struct page_collection *pages;
     tb_page_addr_t next;
 
-    assert_memory_lock();
-
     pages = page_collection_lock(start, end);
     for (next = (start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
          start < end;
@@ -611,94 +1177,44 @@ void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
         if (pd == NULL) {
             continue;
         }
+        assert_page_locked(pd);
         tb_invalidate_phys_page_range__locked(pages, pd, start, bound, 0);
     }
     page_collection_unlock(pages);
 }
 
-#ifdef CONFIG_SOFTMMU
 /*
- * len must be <= 8 and start must be a multiple of len.
- * Called via softmmu_template.h when code areas are written to with
- * iothread mutex not held.
- *
  * Call with all @pages in the range [@start, @start + len[ locked.
  */
-void tb_invalidate_phys_page_fast(struct page_collection *pages,
-                                  tb_page_addr_t start, int len,
-                                  uintptr_t retaddr)
+static void tb_invalidate_phys_page_fast__locked(struct page_collection *pages,
+                                                 tb_page_addr_t start,
+                                                 unsigned len, uintptr_t ra)
 {
     PageDesc *p;
 
-    assert_memory_lock();
-
     p = page_find(start >> TARGET_PAGE_BITS);
     if (!p) {
         return;
     }
 
     assert_page_locked(p);
-    tb_invalidate_phys_page_range__locked(pages, p, start, start + len,
-                                          retaddr);
+    tb_invalidate_phys_page_range__locked(pages, p, start, start + len, ra);
 }
-#else
+
 /*
- * Called with mmap_lock held. If pc is not 0 then it indicates the
- * host PC of the faulting store instruction that caused this invalidate.
- * Returns true if the caller needs to abort execution of the current
- * TB (because it was modified by this store and the guest CPU has
- * precise-SMC semantics).
+ * len must be <= 8 and start must be a multiple of len.
+ * Called via softmmu_template.h when code areas are written to with
+ * iothread mutex not held.
  */
-bool tb_invalidate_phys_page_unwind(tb_page_addr_t addr, uintptr_t pc)
+void tb_invalidate_phys_range_fast(ram_addr_t ram_addr,
+                                   unsigned size,
+                                   uintptr_t retaddr)
 {
-    TranslationBlock *tb;
-    PageDesc *p;
-    int n;
-#ifdef TARGET_HAS_PRECISE_SMC
-    TranslationBlock *current_tb = NULL;
-    CPUState *cpu = current_cpu;
-    bool current_tb_modified = false;
-#endif
-
-    assert_memory_lock();
-
-    addr &= TARGET_PAGE_MASK;
-    p = page_find(addr >> TARGET_PAGE_BITS);
-    if (!p) {
-        return false;
-    }
-
-#ifdef TARGET_HAS_PRECISE_SMC
-    if (p->first_tb && pc != 0) {
-        current_tb = tcg_tb_lookup(pc);
-    }
-#endif
-    assert_page_locked(p);
-    PAGE_FOR_EACH_TB(p, tb, n) {
-#ifdef TARGET_HAS_PRECISE_SMC
-        if (current_tb == tb &&
-            (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
-            /*
-             * If we are modifying the current TB, we must stop its execution.
-             * We could be more precise by checking that the modification is
-             * after the current PC, but it would require a specialized
-             * function to partially restore the CPU state.
-             */
-            current_tb_modified = true;
-            cpu_restore_state_from_tb(cpu, current_tb, pc);
-        }
-#endif /* TARGET_HAS_PRECISE_SMC */
-        tb_phys_invalidate(tb, addr);
-    }
-    p->first_tb = (uintptr_t)NULL;
-#ifdef TARGET_HAS_PRECISE_SMC
-    if (current_tb_modified) {
-        /* Force execution of one insn next time.  */
-        cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu);
-        return true;
-    }
-#endif
+    struct page_collection *pages;
 
-    return false;
+    pages = page_collection_lock(ram_addr, ram_addr + size);
+    tb_invalidate_phys_page_fast__locked(pages, ram_addr, size, retaddr);
+    page_collection_unlock(pages);
 }
-#endif
+
+#endif /* CONFIG_USER_ONLY */
diff --git a/accel/tcg/trace-events b/accel/tcg/trace-events
index 59eab96f26..4e9b450520 100644
--- a/accel/tcg/trace-events
+++ b/accel/tcg/trace-events
@@ -6,5 +6,9 @@ exec_tb(void *tb, uintptr_t pc) "tb:%p pc=0x%"PRIxPTR
 exec_tb_nocache(void *tb, uintptr_t pc) "tb:%p pc=0x%"PRIxPTR
 exec_tb_exit(void *last_tb, unsigned int flags) "tb:%p flags=0x%x"
 
+# cputlb.c
+memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u"
+memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
+
 # translate-all.c
 translate_block(void *tb, uintptr_t pc, const void *tb_code) "tb:%p, pc:0x%"PRIxPTR", tb_code:%p"
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index ac3ee3740c..51ac1f6c84 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -63,105 +63,13 @@
 #include "tb-context.h"
 #include "internal.h"
 
-/* make various TB consistency checks */
-
-/**
- * struct page_entry - page descriptor entry
- * @pd:     pointer to the &struct PageDesc of the page this entry represents
- * @index:  page index of the page
- * @locked: whether the page is locked
- *
- * This struct helps us keep track of the locked state of a page, without
- * bloating &struct PageDesc.
- *
- * A page lock protects accesses to all fields of &struct PageDesc.
- *
- * See also: &struct page_collection.
- */
-struct page_entry {
-    PageDesc *pd;
-    tb_page_addr_t index;
-    bool locked;
-};
-
-/**
- * struct page_collection - tracks a set of pages (i.e. &struct page_entry's)
- * @tree:   Binary search tree (BST) of the pages, with key == page index
- * @max:    Pointer to the page in @tree with the highest page index
- *
- * To avoid deadlock we lock pages in ascending order of page index.
- * When operating on a set of pages, we need to keep track of them so that
- * we can lock them in order and also unlock them later. For this we collect
- * pages (i.e. &struct page_entry's) in a binary search @tree. Given that the
- * @tree implementation we use does not provide an O(1) operation to obtain the
- * highest-ranked element, we use @max to keep track of the inserted page
- * with the highest index. This is valuable because if a page is not in
- * the tree and its index is higher than @max's, then we can lock it
- * without breaking the locking order rule.
- *
- * Note on naming: 'struct page_set' would be shorter, but we already have a few
- * page_set_*() helpers, so page_collection is used instead to avoid confusion.
- *
- * See also: page_collection_lock().
- */
-struct page_collection {
-    GTree *tree;
-    struct page_entry *max;
-};
-
-/*
- * In system mode we want L1_MAP to be based on ram offsets,
- * while in user mode we want it to be based on virtual addresses.
- *
- * TODO: For user mode, see the caveat re host vs guest virtual
- * address spaces near GUEST_ADDR_MAX.
- */
-#if !defined(CONFIG_USER_ONLY)
-#if HOST_LONG_BITS < TARGET_PHYS_ADDR_SPACE_BITS
-# define L1_MAP_ADDR_SPACE_BITS  HOST_LONG_BITS
-#else
-# define L1_MAP_ADDR_SPACE_BITS  TARGET_PHYS_ADDR_SPACE_BITS
-#endif
-#else
-# define L1_MAP_ADDR_SPACE_BITS  MIN(HOST_LONG_BITS, TARGET_ABI_BITS)
-#endif
-
 /* Make sure all possible CPU event bits fit in tb->trace_vcpu_dstate */
 QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS >
                   sizeof_field(TranslationBlock, trace_vcpu_dstate)
                   * BITS_PER_BYTE);
 
-/*
- * L1 Mapping properties
- */
-int v_l1_size;
-int v_l1_shift;
-int v_l2_levels;
-
-void *l1_map[V_L1_MAX_SIZE];
-
 TBContext tb_ctx;
 
-static void page_table_config_init(void)
-{
-    uint32_t v_l1_bits;
-
-    assert(TARGET_PAGE_BITS);
-    /* The bits remaining after N lower levels of page tables.  */
-    v_l1_bits = (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % V_L2_BITS;
-    if (v_l1_bits < V_L1_MIN_BITS) {
-        v_l1_bits += V_L2_BITS;
-    }
-
-    v_l1_size = 1 << v_l1_bits;
-    v_l1_shift = L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - v_l1_bits;
-    v_l2_levels = v_l1_shift / V_L2_BITS - 1;
-
-    assert(v_l1_bits <= V_L1_MAX_BITS);
-    assert(v_l1_shift % V_L2_BITS == 0);
-    assert(v_l2_levels >= 0);
-}
-
 /* Encode VAL as a signed leb128 sequence at P.
    Return P incremented past the encoded value.  */
 static uint8_t *encode_sleb128(uint8_t *p, target_long val)
@@ -354,395 +262,9 @@ void page_init(void)
 {
     page_size_init();
     page_table_config_init();
-
-#if defined(CONFIG_BSD) && defined(CONFIG_USER_ONLY)
-    {
-#ifdef HAVE_KINFO_GETVMMAP
-        struct kinfo_vmentry *freep;
-        int i, cnt;
-
-        freep = kinfo_getvmmap(getpid(), &cnt);
-        if (freep) {
-            mmap_lock();
-            for (i = 0; i < cnt; i++) {
-                unsigned long startaddr, endaddr;
-
-                startaddr = freep[i].kve_start;
-                endaddr = freep[i].kve_end;
-                if (h2g_valid(startaddr)) {
-                    startaddr = h2g(startaddr) & TARGET_PAGE_MASK;
-
-                    if (h2g_valid(endaddr)) {
-                        endaddr = h2g(endaddr);
-                        page_set_flags(startaddr, endaddr, PAGE_RESERVED);
-                    } else {
-#if TARGET_ABI_BITS <= L1_MAP_ADDR_SPACE_BITS
-                        endaddr = ~0ul;
-                        page_set_flags(startaddr, endaddr, PAGE_RESERVED);
-#endif
-                    }
-                }
-            }
-            free(freep);
-            mmap_unlock();
-        }
-#else
-        FILE *f;
-
-        last_brk = (unsigned long)sbrk(0);
-
-        f = fopen("/compat/linux/proc/self/maps", "r");
-        if (f) {
-            mmap_lock();
-
-            do {
-                unsigned long startaddr, endaddr;
-                int n;
-
-                n = fscanf(f, "%lx-%lx %*[^\n]\n", &startaddr, &endaddr);
-
-                if (n == 2 && h2g_valid(startaddr)) {
-                    startaddr = h2g(startaddr) & TARGET_PAGE_MASK;
-
-                    if (h2g_valid(endaddr)) {
-                        endaddr = h2g(endaddr);
-                    } else {
-                        endaddr = ~0ul;
-                    }
-                    page_set_flags(startaddr, endaddr, PAGE_RESERVED);
-                }
-            } while (!feof(f));
-
-            fclose(f);
-            mmap_unlock();
-        }
-#endif
-    }
-#endif
-}
-
-PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc)
-{
-    PageDesc *pd;
-    void **lp;
-    int i;
-
-    /* Level 1.  Always allocated.  */
-    lp = l1_map + ((index >> v_l1_shift) & (v_l1_size - 1));
-
-    /* Level 2..N-1.  */
-    for (i = v_l2_levels; i > 0; i--) {
-        void **p = qatomic_rcu_read(lp);
-
-        if (p == NULL) {
-            void *existing;
-
-            if (!alloc) {
-                return NULL;
-            }
-            p = g_new0(void *, V_L2_SIZE);
-            existing = qatomic_cmpxchg(lp, NULL, p);
-            if (unlikely(existing)) {
-                g_free(p);
-                p = existing;
-            }
-        }
-
-        lp = p + ((index >> (i * V_L2_BITS)) & (V_L2_SIZE - 1));
-    }
-
-    pd = qatomic_rcu_read(lp);
-    if (pd == NULL) {
-        void *existing;
-
-        if (!alloc) {
-            return NULL;
-        }
-        pd = g_new0(PageDesc, V_L2_SIZE);
-#ifndef CONFIG_USER_ONLY
-        {
-            int i;
-
-            for (i = 0; i < V_L2_SIZE; i++) {
-                qemu_spin_init(&pd[i].lock);
-            }
-        }
-#endif
-        existing = qatomic_cmpxchg(lp, NULL, pd);
-        if (unlikely(existing)) {
-#ifndef CONFIG_USER_ONLY
-            {
-                int i;
-
-                for (i = 0; i < V_L2_SIZE; i++) {
-                    qemu_spin_destroy(&pd[i].lock);
-                }
-            }
-#endif
-            g_free(pd);
-            pd = existing;
-        }
-    }
-
-    return pd + (index & (V_L2_SIZE - 1));
-}
-
-/* In user-mode page locks aren't used; mmap_lock is enough */
-#ifdef CONFIG_USER_ONLY
-struct page_collection *
-page_collection_lock(tb_page_addr_t start, tb_page_addr_t end)
-{
-    return NULL;
-}
-
-void page_collection_unlock(struct page_collection *set)
-{ }
-#else /* !CONFIG_USER_ONLY */
-
-#ifdef CONFIG_DEBUG_TCG
-
-static __thread GHashTable *ht_pages_locked_debug;
-
-static void ht_pages_locked_debug_init(void)
-{
-    if (ht_pages_locked_debug) {
-        return;
-    }
-    ht_pages_locked_debug = g_hash_table_new(NULL, NULL);
-}
-
-static bool page_is_locked(const PageDesc *pd)
-{
-    PageDesc *found;
-
-    ht_pages_locked_debug_init();
-    found = g_hash_table_lookup(ht_pages_locked_debug, pd);
-    return !!found;
-}
-
-static void page_lock__debug(PageDesc *pd)
-{
-    ht_pages_locked_debug_init();
-    g_assert(!page_is_locked(pd));
-    g_hash_table_insert(ht_pages_locked_debug, pd, pd);
-}
-
-static void page_unlock__debug(const PageDesc *pd)
-{
-    bool removed;
-
-    ht_pages_locked_debug_init();
-    g_assert(page_is_locked(pd));
-    removed = g_hash_table_remove(ht_pages_locked_debug, pd);
-    g_assert(removed);
-}
-
-void do_assert_page_locked(const PageDesc *pd, const char *file, int line)
-{
-    if (unlikely(!page_is_locked(pd))) {
-        error_report("assert_page_lock: PageDesc %p not locked @ %s:%d",
-                     pd, file, line);
-        abort();
-    }
-}
-
-void assert_no_pages_locked(void)
-{
-    ht_pages_locked_debug_init();
-    g_assert(g_hash_table_size(ht_pages_locked_debug) == 0);
-}
-
-#else /* !CONFIG_DEBUG_TCG */
-
-static inline void page_lock__debug(const PageDesc *pd) { }
-static inline void page_unlock__debug(const PageDesc *pd) { }
-
-#endif /* CONFIG_DEBUG_TCG */
-
-void page_lock(PageDesc *pd)
-{
-    page_lock__debug(pd);
-    qemu_spin_lock(&pd->lock);
-}
-
-void page_unlock(PageDesc *pd)
-{
-    qemu_spin_unlock(&pd->lock);
-    page_unlock__debug(pd);
-}
-
-static inline struct page_entry *
-page_entry_new(PageDesc *pd, tb_page_addr_t index)
-{
-    struct page_entry *pe = g_malloc(sizeof(*pe));
-
-    pe->index = index;
-    pe->pd = pd;
-    pe->locked = false;
-    return pe;
-}
-
-static void page_entry_destroy(gpointer p)
-{
-    struct page_entry *pe = p;
-
-    g_assert(pe->locked);
-    page_unlock(pe->pd);
-    g_free(pe);
-}
-
-/* returns false on success */
-static bool page_entry_trylock(struct page_entry *pe)
-{
-    bool busy;
-
-    busy = qemu_spin_trylock(&pe->pd->lock);
-    if (!busy) {
-        g_assert(!pe->locked);
-        pe->locked = true;
-        page_lock__debug(pe->pd);
-    }
-    return busy;
-}
-
-static void do_page_entry_lock(struct page_entry *pe)
-{
-    page_lock(pe->pd);
-    g_assert(!pe->locked);
-    pe->locked = true;
-}
-
-static gboolean page_entry_lock(gpointer key, gpointer value, gpointer data)
-{
-    struct page_entry *pe = value;
-
-    do_page_entry_lock(pe);
-    return FALSE;
-}
-
-static gboolean page_entry_unlock(gpointer key, gpointer value, gpointer data)
-{
-    struct page_entry *pe = value;
-
-    if (pe->locked) {
-        pe->locked = false;
-        page_unlock(pe->pd);
-    }
-    return FALSE;
 }
 
 /*
- * Trylock a page, and if successful, add the page to a collection.
- * Returns true ("busy") if the page could not be locked; false otherwise.
- */
-static bool page_trylock_add(struct page_collection *set, tb_page_addr_t addr)
-{
-    tb_page_addr_t index = addr >> TARGET_PAGE_BITS;
-    struct page_entry *pe;
-    PageDesc *pd;
-
-    pe = g_tree_lookup(set->tree, &index);
-    if (pe) {
-        return false;
-    }
-
-    pd = page_find(index);
-    if (pd == NULL) {
-        return false;
-    }
-
-    pe = page_entry_new(pd, index);
-    g_tree_insert(set->tree, &pe->index, pe);
-
-    /*
-     * If this is either (1) the first insertion or (2) a page whose index
-     * is higher than any other so far, just lock the page and move on.
-     */
-    if (set->max == NULL || pe->index > set->max->index) {
-        set->max = pe;
-        do_page_entry_lock(pe);
-        return false;
-    }
-    /*
-     * Try to acquire out-of-order lock; if busy, return busy so that we acquire
-     * locks in order.
-     */
-    return page_entry_trylock(pe);
-}
-
-static gint tb_page_addr_cmp(gconstpointer ap, gconstpointer bp, gpointer udata)
-{
-    tb_page_addr_t a = *(const tb_page_addr_t *)ap;
-    tb_page_addr_t b = *(const tb_page_addr_t *)bp;
-
-    if (a == b) {
-        return 0;
-    } else if (a < b) {
-        return -1;
-    }
-    return 1;
-}
-
-/*
- * Lock a range of pages ([@start,@end[) as well as the pages of all
- * intersecting TBs.
- * Locking order: acquire locks in ascending order of page index.
- */
-struct page_collection *
-page_collection_lock(tb_page_addr_t start, tb_page_addr_t end)
-{
-    struct page_collection *set = g_malloc(sizeof(*set));
-    tb_page_addr_t index;
-    PageDesc *pd;
-
-    start >>= TARGET_PAGE_BITS;
-    end   >>= TARGET_PAGE_BITS;
-    g_assert(start <= end);
-
-    set->tree = g_tree_new_full(tb_page_addr_cmp, NULL, NULL,
-                                page_entry_destroy);
-    set->max = NULL;
-    assert_no_pages_locked();
-
- retry:
-    g_tree_foreach(set->tree, page_entry_lock, NULL);
-
-    for (index = start; index <= end; index++) {
-        TranslationBlock *tb;
-        int n;
-
-        pd = page_find(index);
-        if (pd == NULL) {
-            continue;
-        }
-        if (page_trylock_add(set, index << TARGET_PAGE_BITS)) {
-            g_tree_foreach(set->tree, page_entry_unlock, NULL);
-            goto retry;
-        }
-        assert_page_locked(pd);
-        PAGE_FOR_EACH_TB(pd, tb, n) {
-            if (page_trylock_add(set, tb_page_addr0(tb)) ||
-                (tb_page_addr1(tb) != -1 &&
-                 page_trylock_add(set, tb_page_addr1(tb)))) {
-                /* drop all locks, and reacquire in order */
-                g_tree_foreach(set->tree, page_entry_unlock, NULL);
-                goto retry;
-            }
-        }
-    }
-    return set;
-}
-
-void page_collection_unlock(struct page_collection *set)
-{
-    /* entries are unlocked and freed via page_entry_destroy */
-    g_tree_destroy(set->tree);
-    g_free(set);
-}
-
-#endif /* !CONFIG_USER_ONLY */
-
-/*
  * Isolate the portion of code gen which can setjmp/longjmp.
  * Return the size of the generated code, or negative on error.
  */
@@ -1235,339 +757,6 @@ void cpu_interrupt(CPUState *cpu, int mask)
     qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1);
 }
 
-/*
- * Walks guest process memory "regions" one by one
- * and calls callback function 'fn' for each region.
- */
-struct walk_memory_regions_data {
-    walk_memory_regions_fn fn;
-    void *priv;
-    target_ulong start;
-    int prot;
-};
-
-static int walk_memory_regions_end(struct walk_memory_regions_data *data,
-                                   target_ulong end, int new_prot)
-{
-    if (data->start != -1u) {
-        int rc = data->fn(data->priv, data->start, end, data->prot);
-        if (rc != 0) {
-            return rc;
-        }
-    }
-
-    data->start = (new_prot ? end : -1u);
-    data->prot = new_prot;
-
-    return 0;
-}
-
-static int walk_memory_regions_1(struct walk_memory_regions_data *data,
-                                 target_ulong base, int level, void **lp)
-{
-    target_ulong pa;
-    int i, rc;
-
-    if (*lp == NULL) {
-        return walk_memory_regions_end(data, base, 0);
-    }
-
-    if (level == 0) {
-        PageDesc *pd = *lp;
-
-        for (i = 0; i < V_L2_SIZE; ++i) {
-            int prot = pd[i].flags;
-
-            pa = base | (i << TARGET_PAGE_BITS);
-            if (prot != data->prot) {
-                rc = walk_memory_regions_end(data, pa, prot);
-                if (rc != 0) {
-                    return rc;
-                }
-            }
-        }
-    } else {
-        void **pp = *lp;
-
-        for (i = 0; i < V_L2_SIZE; ++i) {
-            pa = base | ((target_ulong)i <<
-                (TARGET_PAGE_BITS + V_L2_BITS * level));
-            rc = walk_memory_regions_1(data, pa, level - 1, pp + i);
-            if (rc != 0) {
-                return rc;
-            }
-        }
-    }
-
-    return 0;
-}
-
-int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
-{
-    struct walk_memory_regions_data data;
-    uintptr_t i, l1_sz = v_l1_size;
-
-    data.fn = fn;
-    data.priv = priv;
-    data.start = -1u;
-    data.prot = 0;
-
-    for (i = 0; i < l1_sz; i++) {
-        target_ulong base = i << (v_l1_shift + TARGET_PAGE_BITS);
-        int rc = walk_memory_regions_1(&data, base, v_l2_levels, l1_map + i);
-        if (rc != 0) {
-            return rc;
-        }
-    }
-
-    return walk_memory_regions_end(&data, 0, 0);
-}
-
-static int dump_region(void *priv, target_ulong start,
-    target_ulong end, unsigned long prot)
-{
-    FILE *f = (FILE *)priv;
-
-    (void) fprintf(f, TARGET_FMT_lx"-"TARGET_FMT_lx
-        " "TARGET_FMT_lx" %c%c%c\n",
-        start, end, end - start,
-        ((prot & PAGE_READ) ? 'r' : '-'),
-        ((prot & PAGE_WRITE) ? 'w' : '-'),
-        ((prot & PAGE_EXEC) ? 'x' : '-'));
-
-    return 0;
-}
-
-/* dump memory mappings */
-void page_dump(FILE *f)
-{
-    const int length = sizeof(target_ulong) * 2;
-    (void) fprintf(f, "%-*s %-*s %-*s %s\n",
-            length, "start", length, "end", length, "size", "prot");
-    walk_memory_regions(f, dump_region);
-}
-
-int page_get_flags(target_ulong address)
-{
-    PageDesc *p;
-
-    p = page_find(address >> TARGET_PAGE_BITS);
-    if (!p) {
-        return 0;
-    }
-    return p->flags;
-}
-
-/*
- * Allow the target to decide if PAGE_TARGET_[12] may be reset.
- * By default, they are not kept.
- */
-#ifndef PAGE_TARGET_STICKY
-#define PAGE_TARGET_STICKY  0
-#endif
-#define PAGE_STICKY  (PAGE_ANON | PAGE_PASSTHROUGH | PAGE_TARGET_STICKY)
-
-/* Modify the flags of a page and invalidate the code if necessary.
-   The flag PAGE_WRITE_ORG is positioned automatically depending
-   on PAGE_WRITE.  The mmap_lock should already be held.  */
-void page_set_flags(target_ulong start, target_ulong end, int flags)
-{
-    target_ulong addr, len;
-    bool reset, inval_tb = false;
-
-    /* This function should never be called with addresses outside the
-       guest address space.  If this assert fires, it probably indicates
-       a missing call to h2g_valid.  */
-    assert(end - 1 <= GUEST_ADDR_MAX);
-    assert(start < end);
-    /* Only set PAGE_ANON with new mappings. */
-    assert(!(flags & PAGE_ANON) || (flags & PAGE_RESET));
-    assert_memory_lock();
-
-    start = start & TARGET_PAGE_MASK;
-    end = TARGET_PAGE_ALIGN(end);
-
-    if (flags & PAGE_WRITE) {
-        flags |= PAGE_WRITE_ORG;
-    }
-    reset = !(flags & PAGE_VALID) || (flags & PAGE_RESET);
-    if (reset) {
-        page_reset_target_data(start, end);
-    }
-    flags &= ~PAGE_RESET;
-
-    for (addr = start, len = end - start;
-         len != 0;
-         len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
-        PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, true);
-
-        /*
-         * If the page was executable, but is reset, or is no longer
-         * executable, or has become writable, then invalidate any code.
-         */
-        if ((p->flags & PAGE_EXEC)
-            && (reset ||
-                !(flags & PAGE_EXEC) ||
-                (flags & ~p->flags & PAGE_WRITE))) {
-            inval_tb = true;
-        }
-        /* Using mprotect on a page does not change sticky bits. */
-        p->flags = (reset ? 0 : p->flags & PAGE_STICKY) | flags;
-    }
-
-    if (inval_tb) {
-        tb_invalidate_phys_range(start, end);
-    }
-}
-
-int page_check_range(target_ulong start, target_ulong len, int flags)
-{
-    PageDesc *p;
-    target_ulong end;
-    target_ulong addr;
-
-    /* This function should never be called with addresses outside the
-       guest address space.  If this assert fires, it probably indicates
-       a missing call to h2g_valid.  */
-    if (TARGET_ABI_BITS > L1_MAP_ADDR_SPACE_BITS) {
-        assert(start < ((target_ulong)1 << L1_MAP_ADDR_SPACE_BITS));
-    }
-
-    if (len == 0) {
-        return 0;
-    }
-    if (start + len - 1 < start) {
-        /* We've wrapped around.  */
-        return -1;
-    }
-
-    /* must do before we loose bits in the next step */
-    end = TARGET_PAGE_ALIGN(start + len);
-    start = start & TARGET_PAGE_MASK;
-
-    for (addr = start, len = end - start;
-         len != 0;
-         len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
-        p = page_find(addr >> TARGET_PAGE_BITS);
-        if (!p) {
-            return -1;
-        }
-        if (!(p->flags & PAGE_VALID)) {
-            return -1;
-        }
-
-        if ((flags & PAGE_READ) && !(p->flags & PAGE_READ)) {
-            return -1;
-        }
-        if (flags & PAGE_WRITE) {
-            if (!(p->flags & PAGE_WRITE_ORG)) {
-                return -1;
-            }
-            /* unprotect the page if it was put read-only because it
-               contains translated code */
-            if (!(p->flags & PAGE_WRITE)) {
-                if (!page_unprotect(addr, 0)) {
-                    return -1;
-                }
-            }
-        }
-    }
-    return 0;
-}
-
-void page_protect(tb_page_addr_t page_addr)
-{
-    target_ulong addr;
-    PageDesc *p;
-    int prot;
-
-    p = page_find(page_addr >> TARGET_PAGE_BITS);
-    if (p && (p->flags & PAGE_WRITE)) {
-        /*
-         * Force the host page as non writable (writes will have a page fault +
-         * mprotect overhead).
-         */
-        page_addr &= qemu_host_page_mask;
-        prot = 0;
-        for (addr = page_addr; addr < page_addr + qemu_host_page_size;
-             addr += TARGET_PAGE_SIZE) {
-
-            p = page_find(addr >> TARGET_PAGE_BITS);
-            if (!p) {
-                continue;
-            }
-            prot |= p->flags;
-            p->flags &= ~PAGE_WRITE;
-        }
-        mprotect(g2h_untagged(page_addr), qemu_host_page_size,
-                 (prot & PAGE_BITS) & ~PAGE_WRITE);
-    }
-}
-
-/* called from signal handler: invalidate the code and unprotect the
- * page. Return 0 if the fault was not handled, 1 if it was handled,
- * and 2 if it was handled but the caller must cause the TB to be
- * immediately exited. (We can only return 2 if the 'pc' argument is
- * non-zero.)
- */
-int page_unprotect(target_ulong address, uintptr_t pc)
-{
-    unsigned int prot;
-    bool current_tb_invalidated;
-    PageDesc *p;
-    target_ulong host_start, host_end, addr;
-
-    /* Technically this isn't safe inside a signal handler.  However we
-       know this only ever happens in a synchronous SEGV handler, so in
-       practice it seems to be ok.  */
-    mmap_lock();
-
-    p = page_find(address >> TARGET_PAGE_BITS);
-    if (!p) {
-        mmap_unlock();
-        return 0;
-    }
-
-    /* if the page was really writable, then we change its
-       protection back to writable */
-    if (p->flags & PAGE_WRITE_ORG) {
-        current_tb_invalidated = false;
-        if (p->flags & PAGE_WRITE) {
-            /* If the page is actually marked WRITE then assume this is because
-             * this thread raced with another one which got here first and
-             * set the page to PAGE_WRITE and did the TB invalidate for us.
-             */
-#ifdef TARGET_HAS_PRECISE_SMC
-            TranslationBlock *current_tb = tcg_tb_lookup(pc);
-            if (current_tb) {
-                current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
-            }
-#endif
-        } else {
-            host_start = address & qemu_host_page_mask;
-            host_end = host_start + qemu_host_page_size;
-
-            prot = 0;
-            for (addr = host_start; addr < host_end; addr += TARGET_PAGE_SIZE) {
-                p = page_find(addr >> TARGET_PAGE_BITS);
-                p->flags |= PAGE_WRITE;
-                prot |= p->flags;
-
-                /* and since the content will be modified, we must invalidate
-                   the corresponding translated code. */
-                current_tb_invalidated |=
-                    tb_invalidate_phys_page_unwind(addr, pc);
-            }
-            mprotect((void *)g2h_untagged(host_start), qemu_host_page_size,
-                     prot & PAGE_BITS);
-        }
-        mmap_unlock();
-        /* If current TB was invalidated return to main loop */
-        return current_tb_invalidated ? 2 : 1;
-    }
-    mmap_unlock();
-    return 0;
-}
 #endif /* CONFIG_USER_ONLY */
 
 /*
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index fb7d6ee9e9..a3cecda405 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -135,6 +135,565 @@ bool handle_sigsegv_accerr_write(CPUState *cpu, sigset_t *old_set,
     }
 }
 
+typedef struct PageFlagsNode {
+    IntervalTreeNode itree;
+    int flags;
+} PageFlagsNode;
+
+static IntervalTreeRoot pageflags_root;
+
+static PageFlagsNode *pageflags_find(target_ulong start, target_long last)
+{
+    IntervalTreeNode *n;
+
+    n = interval_tree_iter_first(&pageflags_root, start, last);
+    return n ? container_of(n, PageFlagsNode, itree) : NULL;
+}
+
+static PageFlagsNode *pageflags_next(PageFlagsNode *p, target_ulong start,
+                                     target_long last)
+{
+    IntervalTreeNode *n;
+
+    n = interval_tree_iter_next(&p->itree, start, last);
+    return n ? container_of(n, PageFlagsNode, itree) : NULL;
+}
+
+int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
+{
+    IntervalTreeNode *n;
+    int rc = 0;
+
+    mmap_lock();
+    for (n = interval_tree_iter_first(&pageflags_root, 0, -1);
+         n != NULL;
+         n = interval_tree_iter_next(n, 0, -1)) {
+        PageFlagsNode *p = container_of(n, PageFlagsNode, itree);
+
+        rc = fn(priv, n->start, n->last + 1, p->flags);
+        if (rc != 0) {
+            break;
+        }
+    }
+    mmap_unlock();
+
+    return rc;
+}
+
+static int dump_region(void *priv, target_ulong start,
+                       target_ulong end, unsigned long prot)
+{
+    FILE *f = (FILE *)priv;
+
+    fprintf(f, TARGET_FMT_lx"-"TARGET_FMT_lx" "TARGET_FMT_lx" %c%c%c\n",
+            start, end, end - start,
+            ((prot & PAGE_READ) ? 'r' : '-'),
+            ((prot & PAGE_WRITE) ? 'w' : '-'),
+            ((prot & PAGE_EXEC) ? 'x' : '-'));
+    return 0;
+}
+
+/* dump memory mappings */
+void page_dump(FILE *f)
+{
+    const int length = sizeof(target_ulong) * 2;
+
+    fprintf(f, "%-*s %-*s %-*s %s\n",
+            length, "start", length, "end", length, "size", "prot");
+    walk_memory_regions(f, dump_region);
+}
+
+int page_get_flags(target_ulong address)
+{
+    PageFlagsNode *p = pageflags_find(address, address);
+
+    /*
+     * See util/interval-tree.c re lockless lookups: no false positives but
+     * there are false negatives.  If we find nothing, retry with the mmap
+     * lock acquired.
+     */
+    if (p) {
+        return p->flags;
+    }
+    if (have_mmap_lock()) {
+        return 0;
+    }
+
+    mmap_lock();
+    p = pageflags_find(address, address);
+    mmap_unlock();
+    return p ? p->flags : 0;
+}
+
+/* A subroutine of page_set_flags: insert a new node for [start,last]. */
+static void pageflags_create(target_ulong start, target_ulong last, int flags)
+{
+    PageFlagsNode *p = g_new(PageFlagsNode, 1);
+
+    p->itree.start = start;
+    p->itree.last = last;
+    p->flags = flags;
+    interval_tree_insert(&p->itree, &pageflags_root);
+}
+
+/* A subroutine of page_set_flags: remove everything in [start,last]. */
+static bool pageflags_unset(target_ulong start, target_ulong last)
+{
+    bool inval_tb = false;
+
+    while (true) {
+        PageFlagsNode *p = pageflags_find(start, last);
+        target_ulong p_last;
+
+        if (!p) {
+            break;
+        }
+
+        if (p->flags & PAGE_EXEC) {
+            inval_tb = true;
+        }
+
+        interval_tree_remove(&p->itree, &pageflags_root);
+        p_last = p->itree.last;
+
+        if (p->itree.start < start) {
+            /* Truncate the node from the end, or split out the middle. */
+            p->itree.last = start - 1;
+            interval_tree_insert(&p->itree, &pageflags_root);
+            if (last < p_last) {
+                pageflags_create(last + 1, p_last, p->flags);
+                break;
+            }
+        } else if (p_last <= last) {
+            /* Range completely covers node -- remove it. */
+            g_free(p);
+        } else {
+            /* Truncate the node from the start. */
+            p->itree.start = last + 1;
+            interval_tree_insert(&p->itree, &pageflags_root);
+            break;
+        }
+    }
+
+    return inval_tb;
+}
+
+/*
+ * A subroutine of page_set_flags: nothing overlaps [start,last],
+ * but check adjacent mappings and maybe merge into a single range.
+ */
+static void pageflags_create_merge(target_ulong start, target_ulong last,
+                                   int flags)
+{
+    PageFlagsNode *next = NULL, *prev = NULL;
+
+    if (start > 0) {
+        prev = pageflags_find(start - 1, start - 1);
+        if (prev) {
+            if (prev->flags == flags) {
+                interval_tree_remove(&prev->itree, &pageflags_root);
+            } else {
+                prev = NULL;
+            }
+        }
+    }
+    if (last + 1 != 0) {
+        next = pageflags_find(last + 1, last + 1);
+        if (next) {
+            if (next->flags == flags) {
+                interval_tree_remove(&next->itree, &pageflags_root);
+            } else {
+                next = NULL;
+            }
+        }
+    }
+
+    if (prev) {
+        if (next) {
+            prev->itree.last = next->itree.last;
+            g_free(next);
+        } else {
+            prev->itree.last = last;
+        }
+        interval_tree_insert(&prev->itree, &pageflags_root);
+    } else if (next) {
+        next->itree.start = start;
+        interval_tree_insert(&next->itree, &pageflags_root);
+    } else {
+        pageflags_create(start, last, flags);
+    }
+}
+
+/*
+ * Allow the target to decide if PAGE_TARGET_[12] may be reset.
+ * By default, they are not kept.
+ */
+#ifndef PAGE_TARGET_STICKY
+#define PAGE_TARGET_STICKY  0
+#endif
+#define PAGE_STICKY  (PAGE_ANON | PAGE_PASSTHROUGH | PAGE_TARGET_STICKY)
+
+/* A subroutine of page_set_flags: add flags to [start,last]. */
+static bool pageflags_set_clear(target_ulong start, target_ulong last,
+                                int set_flags, int clear_flags)
+{
+    PageFlagsNode *p;
+    target_ulong p_start, p_last;
+    int p_flags, merge_flags;
+    bool inval_tb = false;
+
+ restart:
+    p = pageflags_find(start, last);
+    if (!p) {
+        if (set_flags) {
+            pageflags_create_merge(start, last, set_flags);
+        }
+        goto done;
+    }
+
+    p_start = p->itree.start;
+    p_last = p->itree.last;
+    p_flags = p->flags;
+    /* Using mprotect on a page does not change sticky bits. */
+    merge_flags = (p_flags & ~clear_flags) | set_flags;
+
+    /*
+     * Need to flush if an overlapping executable region
+     * removes exec, or adds write.
+     */
+    if ((p_flags & PAGE_EXEC)
+        && (!(merge_flags & PAGE_EXEC)
+            || (merge_flags & ~p_flags & PAGE_WRITE))) {
+        inval_tb = true;
+    }
+
+    /*
+     * If there is an exact range match, update and return without
+     * attempting to merge with adjacent regions.
+     */
+    if (start == p_start && last == p_last) {
+        if (merge_flags) {
+            p->flags = merge_flags;
+        } else {
+            interval_tree_remove(&p->itree, &pageflags_root);
+            g_free(p);
+        }
+        goto done;
+    }
+
+    /*
+     * If sticky bits affect the original mapping, then we must be more
+     * careful about the existing intervals and the separate flags.
+     */
+    if (set_flags != merge_flags) {
+        if (p_start < start) {
+            interval_tree_remove(&p->itree, &pageflags_root);
+            p->itree.last = start - 1;
+            interval_tree_insert(&p->itree, &pageflags_root);
+
+            if (last < p_last) {
+                if (merge_flags) {
+                    pageflags_create(start, last, merge_flags);
+                }
+                pageflags_create(last + 1, p_last, p_flags);
+            } else {
+                if (merge_flags) {
+                    pageflags_create(start, p_last, merge_flags);
+                }
+                if (p_last < last) {
+                    start = p_last + 1;
+                    goto restart;
+                }
+            }
+        } else {
+            if (start < p_start && set_flags) {
+                pageflags_create(start, p_start - 1, set_flags);
+            }
+            if (last < p_last) {
+                interval_tree_remove(&p->itree, &pageflags_root);
+                p->itree.start = last + 1;
+                interval_tree_insert(&p->itree, &pageflags_root);
+                if (merge_flags) {
+                    pageflags_create(start, last, merge_flags);
+                }
+            } else {
+                if (merge_flags) {
+                    p->flags = merge_flags;
+                } else {
+                    interval_tree_remove(&p->itree, &pageflags_root);
+                    g_free(p);
+                }
+                if (p_last < last) {
+                    start = p_last + 1;
+                    goto restart;
+                }
+            }
+        }
+        goto done;
+    }
+
+    /* If flags are not changing for this range, incorporate it. */
+    if (set_flags == p_flags) {
+        if (start < p_start) {
+            interval_tree_remove(&p->itree, &pageflags_root);
+            p->itree.start = start;
+            interval_tree_insert(&p->itree, &pageflags_root);
+        }
+        if (p_last < last) {
+            start = p_last + 1;
+            goto restart;
+        }
+        goto done;
+    }
+
+    /* Maybe split out head and/or tail ranges with the original flags. */
+    interval_tree_remove(&p->itree, &pageflags_root);
+    if (p_start < start) {
+        p->itree.last = start - 1;
+        interval_tree_insert(&p->itree, &pageflags_root);
+
+        if (p_last < last) {
+            goto restart;
+        }
+        if (last < p_last) {
+            pageflags_create(last + 1, p_last, p_flags);
+        }
+    } else if (last < p_last) {
+        p->itree.start = last + 1;
+        interval_tree_insert(&p->itree, &pageflags_root);
+    } else {
+        g_free(p);
+        goto restart;
+    }
+    if (set_flags) {
+        pageflags_create(start, last, set_flags);
+    }
+
+ done:
+    return inval_tb;
+}
+
+/*
+ * Modify the flags of a page and invalidate the code if necessary.
+ * The flag PAGE_WRITE_ORG is positioned automatically depending
+ * on PAGE_WRITE.  The mmap_lock should already be held.
+ */
+void page_set_flags(target_ulong start, target_ulong end, int flags)
+{
+    target_ulong last;
+    bool reset = false;
+    bool inval_tb = false;
+
+    /* This function should never be called with addresses outside the
+       guest address space.  If this assert fires, it probably indicates
+       a missing call to h2g_valid.  */
+    assert(start < end);
+    assert(end - 1 <= GUEST_ADDR_MAX);
+    /* Only set PAGE_ANON with new mappings. */
+    assert(!(flags & PAGE_ANON) || (flags & PAGE_RESET));
+    assert_memory_lock();
+
+    start = start & TARGET_PAGE_MASK;
+    end = TARGET_PAGE_ALIGN(end);
+    last = end - 1;
+
+    if (!(flags & PAGE_VALID)) {
+        flags = 0;
+    } else {
+        reset = flags & PAGE_RESET;
+        flags &= ~PAGE_RESET;
+        if (flags & PAGE_WRITE) {
+            flags |= PAGE_WRITE_ORG;
+        }
+    }
+
+    if (!flags || reset) {
+        page_reset_target_data(start, end);
+        inval_tb |= pageflags_unset(start, last);
+    }
+    if (flags) {
+        inval_tb |= pageflags_set_clear(start, last, flags,
+                                        ~(reset ? 0 : PAGE_STICKY));
+    }
+    if (inval_tb) {
+        tb_invalidate_phys_range(start, end);
+    }
+}
+
+int page_check_range(target_ulong start, target_ulong len, int flags)
+{
+    target_ulong last;
+
+    if (len == 0) {
+        return 0;  /* trivial length */
+    }
+
+    last = start + len - 1;
+    if (last < start) {
+        return -1; /* wrap around */
+    }
+
+    while (true) {
+        PageFlagsNode *p = pageflags_find(start, last);
+        int missing;
+
+        if (!p) {
+            return -1; /* entire region invalid */
+        }
+        if (start < p->itree.start) {
+            return -1; /* initial bytes invalid */
+        }
+
+        missing = flags & ~p->flags;
+        if (missing & PAGE_READ) {
+            return -1; /* page not readable */
+        }
+        if (missing & PAGE_WRITE) {
+            if (!(p->flags & PAGE_WRITE_ORG)) {
+                return -1; /* page not writable */
+            }
+            /* Asking about writable, but has been protected: undo. */
+            if (!page_unprotect(start, 0)) {
+                return -1;
+            }
+            /* TODO: page_unprotect should take a range, not a single page. */
+            if (last - start < TARGET_PAGE_SIZE) {
+                return 0; /* ok */
+            }
+            start += TARGET_PAGE_SIZE;
+            continue;
+        }
+
+        if (last <= p->itree.last) {
+            return 0; /* ok */
+        }
+        start = p->itree.last + 1;
+    }
+}
+
+void page_protect(tb_page_addr_t address)
+{
+    PageFlagsNode *p;
+    target_ulong start, last;
+    int prot;
+
+    assert_memory_lock();
+
+    if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
+        start = address & TARGET_PAGE_MASK;
+        last = start + TARGET_PAGE_SIZE - 1;
+    } else {
+        start = address & qemu_host_page_mask;
+        last = start + qemu_host_page_size - 1;
+    }
+
+    p = pageflags_find(start, last);
+    if (!p) {
+        return;
+    }
+    prot = p->flags;
+
+    if (unlikely(p->itree.last < last)) {
+        /* More than one protection region covers the one host page. */
+        assert(TARGET_PAGE_SIZE < qemu_host_page_size);
+        while ((p = pageflags_next(p, start, last)) != NULL) {
+            prot |= p->flags;
+        }
+    }
+
+    if (prot & PAGE_WRITE) {
+        pageflags_set_clear(start, last, 0, PAGE_WRITE);
+        mprotect(g2h_untagged(start), qemu_host_page_size,
+                 prot & (PAGE_READ | PAGE_EXEC) ? PROT_READ : PROT_NONE);
+    }
+}
+
+/*
+ * Called from signal handler: invalidate the code and unprotect the
+ * page. Return 0 if the fault was not handled, 1 if it was handled,
+ * and 2 if it was handled but the caller must cause the TB to be
+ * immediately exited. (We can only return 2 if the 'pc' argument is
+ * non-zero.)
+ */
+int page_unprotect(target_ulong address, uintptr_t pc)
+{
+    PageFlagsNode *p;
+    bool current_tb_invalidated;
+
+    /*
+     * Technically this isn't safe inside a signal handler.  However we
+     * know this only ever happens in a synchronous SEGV handler, so in
+     * practice it seems to be ok.
+     */
+    mmap_lock();
+
+    p = pageflags_find(address, address);
+
+    /* If this address was not really writable, nothing to do. */
+    if (!p || !(p->flags & PAGE_WRITE_ORG)) {
+        mmap_unlock();
+        return 0;
+    }
+
+    current_tb_invalidated = false;
+    if (p->flags & PAGE_WRITE) {
+        /*
+         * If the page is actually marked WRITE then assume this is because
+         * this thread raced with another one which got here first and
+         * set the page to PAGE_WRITE and did the TB invalidate for us.
+         */
+#ifdef TARGET_HAS_PRECISE_SMC
+        TranslationBlock *current_tb = tcg_tb_lookup(pc);
+        if (current_tb) {
+            current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
+        }
+#endif
+    } else {
+        target_ulong start, len, i;
+        int prot;
+
+        if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
+            start = address & TARGET_PAGE_MASK;
+            len = TARGET_PAGE_SIZE;
+            prot = p->flags | PAGE_WRITE;
+            pageflags_set_clear(start, start + len - 1, PAGE_WRITE, 0);
+            current_tb_invalidated = tb_invalidate_phys_page_unwind(start, pc);
+        } else {
+            start = address & qemu_host_page_mask;
+            len = qemu_host_page_size;
+            prot = 0;
+
+            for (i = 0; i < len; i += TARGET_PAGE_SIZE) {
+                target_ulong addr = start + i;
+
+                p = pageflags_find(addr, addr);
+                if (p) {
+                    prot |= p->flags;
+                    if (p->flags & PAGE_WRITE_ORG) {
+                        prot |= PAGE_WRITE;
+                        pageflags_set_clear(addr, addr + TARGET_PAGE_SIZE - 1,
+                                            PAGE_WRITE, 0);
+                    }
+                }
+                /*
+                 * Since the content will be modified, we must invalidate
+                 * the corresponding translated code.
+                 */
+                current_tb_invalidated |=
+                    tb_invalidate_phys_page_unwind(addr, pc);
+            }
+        }
+        if (prot & PAGE_EXEC) {
+            prot = (prot & ~PAGE_EXEC) | PAGE_READ;
+        }
+        mprotect((void *)g2h_untagged(start), len, prot & PAGE_BITS);
+    }
+    mmap_unlock();
+
+    /* If current TB was invalidated return to main loop */
+    return current_tb_invalidated ? 2 : 1;
+}
+
 static int probe_access_internal(CPUArchState *env, target_ulong addr,
                                  int fault_size, MMUAccessType access_type,
                                  bool nonfault, uintptr_t ra)
@@ -210,47 +769,96 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
     return addr;
 }
 
+#ifdef TARGET_PAGE_DATA_SIZE
+/*
+ * Allocate chunks of target data together.  For the only current user,
+ * if we allocate one hunk per page, we have overhead of 40/128 or 40%.
+ * Therefore, allocate memory for 64 pages at a time for overhead < 1%.
+ */
+#define TPD_PAGES  64
+#define TBD_MASK   (TARGET_PAGE_MASK * TPD_PAGES)
+
+typedef struct TargetPageDataNode {
+    IntervalTreeNode itree;
+    char data[TPD_PAGES][TARGET_PAGE_DATA_SIZE] __attribute__((aligned));
+} TargetPageDataNode;
+
+static IntervalTreeRoot targetdata_root;
+
 void page_reset_target_data(target_ulong start, target_ulong end)
 {
-#ifdef TARGET_PAGE_DATA_SIZE
-    target_ulong addr, len;
+    IntervalTreeNode *n, *next;
+    target_ulong last;
 
-    /*
-     * This function should never be called with addresses outside the
-     * guest address space.  If this assert fires, it probably indicates
-     * a missing call to h2g_valid.
-     */
-    assert(end - 1 <= GUEST_ADDR_MAX);
-    assert(start < end);
     assert_memory_lock();
 
     start = start & TARGET_PAGE_MASK;
-    end = TARGET_PAGE_ALIGN(end);
+    last = TARGET_PAGE_ALIGN(end) - 1;
+
+    for (n = interval_tree_iter_first(&targetdata_root, start, last),
+         next = n ? interval_tree_iter_next(n, start, last) : NULL;
+         n != NULL;
+         n = next,
+         next = next ? interval_tree_iter_next(n, start, last) : NULL) {
+        target_ulong n_start, n_last, p_ofs, p_len;
+        TargetPageDataNode *t;
+
+        if (n->start >= start && n->last <= last) {
+            interval_tree_remove(n, &targetdata_root);
+            g_free(n);
+            continue;
+        }
 
-    for (addr = start, len = end - start;
-         len != 0;
-         len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
-        PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, 1);
+        if (n->start < start) {
+            n_start = start;
+            p_ofs = (start - n->start) >> TARGET_PAGE_BITS;
+        } else {
+            n_start = n->start;
+            p_ofs = 0;
+        }
+        n_last = MIN(last, n->last);
+        p_len = (n_last + 1 - n_start) >> TARGET_PAGE_BITS;
 
-        g_free(p->target_data);
-        p->target_data = NULL;
+        t = container_of(n, TargetPageDataNode, itree);
+        memset(t->data[p_ofs], 0, p_len * TARGET_PAGE_DATA_SIZE);
     }
-#endif
 }
 
-#ifdef TARGET_PAGE_DATA_SIZE
 void *page_get_target_data(target_ulong address)
 {
-    PageDesc *p = page_find(address >> TARGET_PAGE_BITS);
-    void *ret = p->target_data;
+    IntervalTreeNode *n;
+    TargetPageDataNode *t;
+    target_ulong page, region;
+
+    page = address & TARGET_PAGE_MASK;
+    region = address & TBD_MASK;
 
-    if (!ret) {
-        ret = g_malloc0(TARGET_PAGE_DATA_SIZE);
-        p->target_data = ret;
+    n = interval_tree_iter_first(&targetdata_root, page, page);
+    if (!n) {
+        /*
+         * See util/interval-tree.c re lockless lookups: no false positives
+         * but there are false negatives.  If we find nothing, retry with
+         * the mmap lock acquired.  We also need the lock for the
+         * allocation + insert.
+         */
+        mmap_lock();
+        n = interval_tree_iter_first(&targetdata_root, page, page);
+        if (!n) {
+            t = g_new0(TargetPageDataNode, 1);
+            n = &t->itree;
+            n->start = region;
+            n->last = region | ~TBD_MASK;
+            interval_tree_insert(n, &targetdata_root);
+        }
+        mmap_unlock();
     }
-    return ret;
+
+    t = container_of(n, TargetPageDataNode, itree);
+    return t->data[(page - region) >> TARGET_PAGE_BITS];
 }
-#endif
+#else
+void page_reset_target_data(target_ulong start, target_ulong end) { }
+#endif /* TARGET_PAGE_DATA_SIZE */
 
 /* The softmmu versions of these helpers are in cputlb.c.  */
 
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 9b7bfbf09a..25e11b0a8d 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -24,6 +24,7 @@
 #ifdef CONFIG_TCG
 #include "exec/cpu_ldst.h"
 #endif
+#include "qemu/interval-tree.h"
 
 /* allow to see translation results - the slowdown should be negligible, so we leave it */
 #define DEBUG_DISAS
@@ -559,11 +560,20 @@ struct TranslationBlock {
 
     struct tb_tc tc;
 
-    /* first and second physical page containing code. The lower bit
-       of the pointer tells the index in page_next[].
-       The list is protected by the TB's page('s) lock(s) */
+    /*
+     * Track tb_page_addr_t intervals that intersect this TB.
+     * For user-only, the virtual addresses are always contiguous,
+     * and we use a unified interval tree.  For system, we use a
+     * linked list headed in each PageDesc.  Within the list, the lsb
+     * of the previous pointer tells the index of page_next[], and the
+     * list is protected by the PageDesc lock(s).
+     */
+#ifdef CONFIG_USER_ONLY
+    IntervalTreeNode itree;
+#else
     uintptr_t page_next[2];
     tb_page_addr_t page_addr[2];
+#endif
 
     /* jmp_lock placed here to fill a 4-byte hole. Its documentation is below */
     QemuSpin jmp_lock;
@@ -619,24 +629,51 @@ static inline uint32_t tb_cflags(const TranslationBlock *tb)
 
 static inline tb_page_addr_t tb_page_addr0(const TranslationBlock *tb)
 {
+#ifdef CONFIG_USER_ONLY
+    return tb->itree.start;
+#else
     return tb->page_addr[0];
+#endif
 }
 
 static inline tb_page_addr_t tb_page_addr1(const TranslationBlock *tb)
 {
+#ifdef CONFIG_USER_ONLY
+    tb_page_addr_t next = tb->itree.last & TARGET_PAGE_MASK;
+    return next == (tb->itree.start & TARGET_PAGE_MASK) ? -1 : next;
+#else
     return tb->page_addr[1];
+#endif
 }
 
 static inline void tb_set_page_addr0(TranslationBlock *tb,
                                      tb_page_addr_t addr)
 {
+#ifdef CONFIG_USER_ONLY
+    tb->itree.start = addr;
+    /*
+     * To begin, we record an interval of one byte.  When the translation
+     * loop encounters a second page, the interval will be extended to
+     * include the first byte of the second page, which is sufficient to
+     * allow tb_page_addr1() above to work properly.  The final corrected
+     * interval will be set by tb_page_add() from tb->size before the
+     * node is added to the interval tree.
+     */
+    tb->itree.last = addr;
+#else
     tb->page_addr[0] = addr;
+#endif
 }
 
 static inline void tb_set_page_addr1(TranslationBlock *tb,
                                      tb_page_addr_t addr)
 {
+#ifdef CONFIG_USER_ONLY
+    /* Extend the interval to the first byte of the second page.  See above. */
+    tb->itree.last = addr;
+#else
     tb->page_addr[1] = addr;
+#endif
 }
 
 /* current cflags for hashing/comparison */
diff --git a/include/exec/translate-all.h b/include/exec/translate-all.h
index 3e9cb91565..88602ae8d8 100644
--- a/include/exec/translate-all.h
+++ b/include/exec/translate-all.h
@@ -23,12 +23,6 @@
 
 
 /* translate-all.c */
-struct page_collection *page_collection_lock(tb_page_addr_t start,
-                                             tb_page_addr_t end);
-void page_collection_unlock(struct page_collection *set);
-void tb_invalidate_phys_page_fast(struct page_collection *pages,
-                                  tb_page_addr_t start, int len,
-                                  uintptr_t retaddr);
 void tb_invalidate_phys_page(tb_page_addr_t addr);
 void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr);
 
diff --git a/include/qemu/interval-tree.h b/include/qemu/interval-tree.h
new file mode 100644
index 0000000000..25006debe8
--- /dev/null
+++ b/include/qemu/interval-tree.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Interval trees.
+ *
+ * Derived from include/linux/interval_tree.h and its dependencies.
+ */
+
+#ifndef QEMU_INTERVAL_TREE_H
+#define QEMU_INTERVAL_TREE_H
+
+/*
+ * For now, don't expose Linux Red-Black Trees separately, but retain the
+ * separate type definitions to keep the implementation sane, and allow
+ * the possibility of disentangling them later.
+ */
+typedef struct RBNode
+{
+    /* Encodes parent with color in the lsb. */
+    uintptr_t rb_parent_color;
+    struct RBNode *rb_right;
+    struct RBNode *rb_left;
+} RBNode;
+
+typedef struct RBRoot
+{
+    RBNode *rb_node;
+} RBRoot;
+
+typedef struct RBRootLeftCached {
+    RBRoot rb_root;
+    RBNode *rb_leftmost;
+} RBRootLeftCached;
+
+typedef struct IntervalTreeNode
+{
+    RBNode rb;
+
+    uint64_t start;    /* Start of interval */
+    uint64_t last;     /* Last location _in_ interval */
+    uint64_t subtree_last;
+} IntervalTreeNode;
+
+typedef RBRootLeftCached IntervalTreeRoot;
+
+/**
+ * interval_tree_is_empty
+ * @root: root of the tree.
+ *
+ * Returns true if the tree contains no nodes.
+ */
+static inline bool interval_tree_is_empty(const IntervalTreeRoot *root)
+{
+    return root->rb_root.rb_node == NULL;
+}
+
+/**
+ * interval_tree_insert
+ * @node: node to insert,
+ * @root: root of the tree.
+ *
+ * Insert @node into @root, and rebalance.
+ */
+void interval_tree_insert(IntervalTreeNode *node, IntervalTreeRoot *root);
+
+/**
+ * interval_tree_remove
+ * @node: node to remove,
+ * @root: root of the tree.
+ *
+ * Remove @node from @root, and rebalance.
+ */
+void interval_tree_remove(IntervalTreeNode *node, IntervalTreeRoot *root);
+
+/**
+ * interval_tree_iter_first:
+ * @root: root of the tree,
+ * @start, @last: the inclusive interval [start, last].
+ *
+ * Locate the "first" of a set of nodes within the tree at @root
+ * that overlap the interval, where "first" is sorted by start.
+ * Returns NULL if no overlap found.
+ */
+IntervalTreeNode *interval_tree_iter_first(IntervalTreeRoot *root,
+                                           uint64_t start, uint64_t last);
+
+/**
+ * interval_tree_iter_next:
+ * @node: previous search result
+ * @start, @last: the inclusive interval [start, last].
+ *
+ * Locate the "next" of a set of nodes within the tree that overlap the
+ * interval; @next is the result of a previous call to
+ * interval_tree_iter_{first,next}.  Returns NULL if @next was the last
+ * node in the set.
+ */
+IntervalTreeNode *interval_tree_iter_next(IntervalTreeNode *node,
+                                          uint64_t start, uint64_t last);
+
+#endif /* QEMU_INTERVAL_TREE_H */
diff --git a/tests/tcg/multiarch/test-vma.c b/tests/tcg/multiarch/test-vma.c
new file mode 100644
index 0000000000..2893d60334
--- /dev/null
+++ b/tests/tcg/multiarch/test-vma.c
@@ -0,0 +1,22 @@
+/*
+ * Test very large vma allocations.
+ * The qemu out-of-memory condition was within the mmap syscall itself.
+ * If the syscall actually returns with MAP_FAILED, the test succeeded.
+ */
+#include <sys/mman.h>
+
+int main()
+{
+    int n = sizeof(size_t) == 4 ? 32 : 45;
+
+    for (int i = 28; i < n; i++) {
+        size_t l = (size_t)1 << i;
+        void *p = mmap(0, l, PROT_NONE,
+                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
+        if (p == MAP_FAILED) {
+            break;
+        }
+        munmap(p, l);
+    }
+    return 0;
+}
diff --git a/tests/unit/meson.build b/tests/unit/meson.build
index b497a41378..ffa444f432 100644
--- a/tests/unit/meson.build
+++ b/tests/unit/meson.build
@@ -47,6 +47,7 @@ tests = {
   'ptimer-test': ['ptimer-test-stubs.c', meson.project_source_root() / 'hw/core/ptimer.c'],
   'test-qapi-util': [],
   'test-smp-parse': [qom, meson.project_source_root() / 'hw/core/machine-smp.c'],
+  'test-interval-tree': [],
 }
 
 if have_system or have_tools
diff --git a/tests/unit/test-interval-tree.c b/tests/unit/test-interval-tree.c
new file mode 100644
index 0000000000..119817a019
--- /dev/null
+++ b/tests/unit/test-interval-tree.c
@@ -0,0 +1,209 @@
+/*
+ * Test interval trees
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/interval-tree.h"
+
+static IntervalTreeNode nodes[20];
+static IntervalTreeRoot root;
+
+static void rand_interval(IntervalTreeNode *n, uint64_t start, uint64_t last)
+{
+    gint32 s_ofs, l_ofs, l_max;
+
+    if (last - start > INT32_MAX) {
+        l_max = INT32_MAX;
+    } else {
+        l_max = last - start;
+    }
+    s_ofs = g_test_rand_int_range(0, l_max);
+    l_ofs = g_test_rand_int_range(s_ofs, l_max);
+
+    n->start = start + s_ofs;
+    n->last = start + l_ofs;
+}
+
+static void test_empty(void)
+{
+    g_assert(root.rb_root.rb_node == NULL);
+    g_assert(root.rb_leftmost == NULL);
+    g_assert(interval_tree_iter_first(&root, 0, UINT64_MAX) == NULL);
+}
+
+static void test_find_one_point(void)
+{
+    /* Create a tree of a single node, which is the point [1,1]. */
+    nodes[0].start = 1;
+    nodes[0].last = 1;
+
+    interval_tree_insert(&nodes[0], &root);
+
+    g_assert(interval_tree_iter_first(&root, 0, 9) == &nodes[0]);
+    g_assert(interval_tree_iter_next(&nodes[0], 0, 9) == NULL);
+    g_assert(interval_tree_iter_first(&root, 0, 0) == NULL);
+    g_assert(interval_tree_iter_next(&nodes[0], 0, 0) == NULL);
+    g_assert(interval_tree_iter_first(&root, 0, 1) == &nodes[0]);
+    g_assert(interval_tree_iter_first(&root, 1, 1) == &nodes[0]);
+    g_assert(interval_tree_iter_first(&root, 1, 2) == &nodes[0]);
+    g_assert(interval_tree_iter_first(&root, 2, 2) == NULL);
+
+    interval_tree_remove(&nodes[0], &root);
+    g_assert(root.rb_root.rb_node == NULL);
+    g_assert(root.rb_leftmost == NULL);
+}
+
+static void test_find_two_point(void)
+{
+    IntervalTreeNode *find0, *find1;
+
+    /* Create a tree of a two nodes, which are both the point [1,1]. */
+    nodes[0].start = 1;
+    nodes[0].last = 1;
+    nodes[1] = nodes[0];
+
+    interval_tree_insert(&nodes[0], &root);
+    interval_tree_insert(&nodes[1], &root);
+
+    find0 = interval_tree_iter_first(&root, 0, 9);
+    g_assert(find0 == &nodes[0] || find0 == &nodes[1]);
+
+    find1 = interval_tree_iter_next(find0, 0, 9);
+    g_assert(find1 == &nodes[0] || find1 == &nodes[1]);
+    g_assert(find0 != find1);
+
+    interval_tree_remove(&nodes[1], &root);
+
+    g_assert(interval_tree_iter_first(&root, 0, 9) == &nodes[0]);
+    g_assert(interval_tree_iter_next(&nodes[0], 0, 9) == NULL);
+
+    interval_tree_remove(&nodes[0], &root);
+}
+
+static void test_find_one_range(void)
+{
+    /* Create a tree of a single node, which is the range [1,8]. */
+    nodes[0].start = 1;
+    nodes[0].last = 8;
+
+    interval_tree_insert(&nodes[0], &root);
+
+    g_assert(interval_tree_iter_first(&root, 0, 9) == &nodes[0]);
+    g_assert(interval_tree_iter_next(&nodes[0], 0, 9) == NULL);
+    g_assert(interval_tree_iter_first(&root, 0, 0) == NULL);
+    g_assert(interval_tree_iter_first(&root, 0, 1) == &nodes[0]);
+    g_assert(interval_tree_iter_first(&root, 1, 1) == &nodes[0]);
+    g_assert(interval_tree_iter_first(&root, 4, 6) == &nodes[0]);
+    g_assert(interval_tree_iter_first(&root, 8, 8) == &nodes[0]);
+    g_assert(interval_tree_iter_first(&root, 9, 9) == NULL);
+
+    interval_tree_remove(&nodes[0], &root);
+}
+
+static void test_find_one_range_many(void)
+{
+    int i;
+
+    /*
+     * Create a tree of many nodes in [0,99] and [200,299],
+     * but only one node with exactly [110,190].
+     */
+    nodes[0].start = 110;
+    nodes[0].last = 190;
+
+    for (i = 1; i < ARRAY_SIZE(nodes) / 2; ++i) {
+        rand_interval(&nodes[i], 0, 99);
+    }
+    for (; i < ARRAY_SIZE(nodes); ++i) {
+        rand_interval(&nodes[i], 200, 299);
+    }
+
+    for (i = 0; i < ARRAY_SIZE(nodes); ++i) {
+        interval_tree_insert(&nodes[i], &root);
+    }
+
+    /* Test that we find exactly the one node. */
+    g_assert(interval_tree_iter_first(&root, 100, 199) == &nodes[0]);
+    g_assert(interval_tree_iter_next(&nodes[0], 100, 199) == NULL);
+    g_assert(interval_tree_iter_first(&root, 100, 109) == NULL);
+    g_assert(interval_tree_iter_first(&root, 100, 110) == &nodes[0]);
+    g_assert(interval_tree_iter_first(&root, 111, 120) == &nodes[0]);
+    g_assert(interval_tree_iter_first(&root, 111, 199) == &nodes[0]);
+    g_assert(interval_tree_iter_first(&root, 190, 199) == &nodes[0]);
+    g_assert(interval_tree_iter_first(&root, 192, 199) == NULL);
+
+    /*
+     * Test that if there are multiple matches, we return the one
+     * with the minimal start.
+     */
+    g_assert(interval_tree_iter_first(&root, 100, 300) == &nodes[0]);
+
+    /* Test that we don't find it after it is removed. */
+    interval_tree_remove(&nodes[0], &root);
+    g_assert(interval_tree_iter_first(&root, 100, 199) == NULL);
+
+    for (i = 1; i < ARRAY_SIZE(nodes); ++i) {
+        interval_tree_remove(&nodes[i], &root);
+    }
+}
+
+static void test_find_many_range(void)
+{
+    IntervalTreeNode *find;
+    int i, n;
+
+    n = g_test_rand_int_range(ARRAY_SIZE(nodes) / 3, ARRAY_SIZE(nodes) / 2);
+
+    /*
+     * Create a fair few nodes in [2000,2999], with the others
+     * distributed around.
+     */
+    for (i = 0; i < n; ++i) {
+        rand_interval(&nodes[i], 2000, 2999);
+    }
+    for (; i < ARRAY_SIZE(nodes) * 2 / 3; ++i) {
+        rand_interval(&nodes[i], 1000, 1899);
+    }
+    for (; i < ARRAY_SIZE(nodes); ++i) {
+        rand_interval(&nodes[i], 3100, 3999);
+    }
+
+    for (i = 0; i < ARRAY_SIZE(nodes); ++i) {
+        interval_tree_insert(&nodes[i], &root);
+    }
+
+    /* Test that we find all of the nodes. */
+    find = interval_tree_iter_first(&root, 2000, 2999);
+    for (i = 0; find != NULL; i++) {
+        find = interval_tree_iter_next(find, 2000, 2999);
+    }
+    g_assert_cmpint(i, ==, n);
+
+    g_assert(interval_tree_iter_first(&root,    0,  999) == NULL);
+    g_assert(interval_tree_iter_first(&root, 1900, 1999) == NULL);
+    g_assert(interval_tree_iter_first(&root, 3000, 3099) == NULL);
+    g_assert(interval_tree_iter_first(&root, 4000, UINT64_MAX) == NULL);
+
+    for (i = 0; i < ARRAY_SIZE(nodes); ++i) {
+        interval_tree_remove(&nodes[i], &root);
+    }
+}
+
+int main(int argc, char **argv)
+{
+    g_test_init(&argc, &argv, NULL);
+
+    g_test_add_func("/interval-tree/empty", test_empty);
+    g_test_add_func("/interval-tree/find-one-point", test_find_one_point);
+    g_test_add_func("/interval-tree/find-two-point", test_find_two_point);
+    g_test_add_func("/interval-tree/find-one-range", test_find_one_range);
+    g_test_add_func("/interval-tree/find-one-range-many",
+                    test_find_one_range_many);
+    g_test_add_func("/interval-tree/find-many-range", test_find_many_range);
+
+    return g_test_run();
+}
diff --git a/trace-events b/trace-events
index 035f3d570d..b6b84b175e 100644
--- a/trace-events
+++ b/trace-events
@@ -42,10 +42,6 @@ find_ram_offset(uint64_t size, uint64_t offset) "size: 0x%" PRIx64 " @ 0x%" PRIx
 find_ram_offset_loop(uint64_t size, uint64_t candidate, uint64_t offset, uint64_t next, uint64_t mingap) "trying size: 0x%" PRIx64 " @ 0x%" PRIx64 ", offset: 0x%" PRIx64" next: 0x%" PRIx64 " mingap: 0x%" PRIx64
 ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_madvise, bool need_fallocate, int ret) "%s@%p + 0x%zx: madvise: %d fallocate: %d ret: %d"
 
-# accel/tcg/cputlb.c
-memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u"
-memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
-
 # job.c
 job_state_transition(void *job,  int ret, const char *legal, const char *s0, const char *s1) "job %p (ret: %d) attempting %s transition (%s-->%s)"
 job_apply_verb(void *job, const char *state, const char *verb, const char *legal) "job %p in state %s; applying verb %s (%s)"
diff --git a/util/interval-tree.c b/util/interval-tree.c
new file mode 100644
index 0000000000..4c0baf108f
--- /dev/null
+++ b/util/interval-tree.c
@@ -0,0 +1,882 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include "qemu/osdep.h"
+#include "qemu/interval-tree.h"
+#include "qemu/atomic.h"
+
+/*
+ * Red Black Trees.
+ *
+ * For now, don't expose Linux Red-Black Trees separately, but retain the
+ * separate type definitions to keep the implementation sane, and allow
+ * the possibility of separating them later.
+ *
+ * Derived from include/linux/rbtree_augmented.h and its dependencies.
+ */
+
+/*
+ * red-black trees properties:  https://en.wikipedia.org/wiki/Rbtree
+ *
+ *  1) A node is either red or black
+ *  2) The root is black
+ *  3) All leaves (NULL) are black
+ *  4) Both children of every red node are black
+ *  5) Every simple path from root to leaves contains the same number
+ *     of black nodes.
+ *
+ *  4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
+ *  consecutive red nodes in a path and every red node is therefore followed by
+ *  a black. So if B is the number of black nodes on every simple path (as per
+ *  5), then the longest possible path due to 4 is 2B.
+ *
+ *  We shall indicate color with case, where black nodes are uppercase and red
+ *  nodes will be lowercase. Unknown color nodes shall be drawn as red within
+ *  parentheses and have some accompanying text comment.
+ *
+ * Notes on lockless lookups:
+ *
+ * All stores to the tree structure (rb_left and rb_right) must be done using
+ * WRITE_ONCE [qatomic_set for QEMU]. And we must not inadvertently cause
+ * (temporary) loops in the tree structure as seen in program order.
+ *
+ * These two requirements will allow lockless iteration of the tree -- not
+ * correct iteration mind you, tree rotations are not atomic so a lookup might
+ * miss entire subtrees.
+ *
+ * But they do guarantee that any such traversal will only see valid elements
+ * and that it will indeed complete -- does not get stuck in a loop.
+ *
+ * It also guarantees that if the lookup returns an element it is the 'correct'
+ * one. But not returning an element does _NOT_ mean it's not present.
+ *
+ * NOTE:
+ *
+ * Stores to __rb_parent_color are not important for simple lookups so those
+ * are left undone as of now. Nor did I check for loops involving parent
+ * pointers.
+ */
+
+typedef enum RBColor
+{
+    RB_RED,
+    RB_BLACK,
+} RBColor;
+
+typedef struct RBAugmentCallbacks {
+    void (*propagate)(RBNode *node, RBNode *stop);
+    void (*copy)(RBNode *old, RBNode *new);
+    void (*rotate)(RBNode *old, RBNode *new);
+} RBAugmentCallbacks;
+
+static inline RBNode *rb_parent(const RBNode *n)
+{
+    return (RBNode *)(n->rb_parent_color & ~1);
+}
+
+static inline RBNode *rb_red_parent(const RBNode *n)
+{
+    return (RBNode *)n->rb_parent_color;
+}
+
+static inline RBColor pc_color(uintptr_t pc)
+{
+    return (RBColor)(pc & 1);
+}
+
+static inline bool pc_is_red(uintptr_t pc)
+{
+    return pc_color(pc) == RB_RED;
+}
+
+static inline bool pc_is_black(uintptr_t pc)
+{
+    return !pc_is_red(pc);
+}
+
+static inline RBColor rb_color(const RBNode *n)
+{
+    return pc_color(n->rb_parent_color);
+}
+
+static inline bool rb_is_red(const RBNode *n)
+{
+    return pc_is_red(n->rb_parent_color);
+}
+
+static inline bool rb_is_black(const RBNode *n)
+{
+    return pc_is_black(n->rb_parent_color);
+}
+
+static inline void rb_set_black(RBNode *n)
+{
+    n->rb_parent_color |= RB_BLACK;
+}
+
+static inline void rb_set_parent_color(RBNode *n, RBNode *p, RBColor color)
+{
+    n->rb_parent_color = (uintptr_t)p | color;
+}
+
+static inline void rb_set_parent(RBNode *n, RBNode *p)
+{
+    rb_set_parent_color(n, p, rb_color(n));
+}
+
+static inline void rb_link_node(RBNode *node, RBNode *parent, RBNode **rb_link)
+{
+    node->rb_parent_color = (uintptr_t)parent;
+    node->rb_left = node->rb_right = NULL;
+
+    qatomic_set(rb_link, node);
+}
+
+static RBNode *rb_next(RBNode *node)
+{
+    RBNode *parent;
+
+    /* OMIT: if empty node, return null. */
+
+    /*
+     * If we have a right-hand child, go down and then left as far as we can.
+     */
+    if (node->rb_right) {
+        node = node->rb_right;
+        while (node->rb_left) {
+            node = node->rb_left;
+        }
+        return node;
+    }
+
+    /*
+     * No right-hand children. Everything down and left is smaller than us,
+     * so any 'next' node must be in the general direction of our parent.
+     * Go up the tree; any time the ancestor is a right-hand child of its
+     * parent, keep going up. First time it's a left-hand child of its
+     * parent, said parent is our 'next' node.
+     */
+    while ((parent = rb_parent(node)) && node == parent->rb_right) {
+        node = parent;
+    }
+
+    return parent;
+}
+
+static inline void rb_change_child(RBNode *old, RBNode *new,
+                                   RBNode *parent, RBRoot *root)
+{
+    if (!parent) {
+        qatomic_set(&root->rb_node, new);
+    } else if (parent->rb_left == old) {
+        qatomic_set(&parent->rb_left, new);
+    } else {
+        qatomic_set(&parent->rb_right, new);
+    }
+}
+
+static inline void rb_rotate_set_parents(RBNode *old, RBNode *new,
+                                         RBRoot *root, RBColor color)
+{
+    RBNode *parent = rb_parent(old);
+
+    new->rb_parent_color = old->rb_parent_color;
+    rb_set_parent_color(old, new, color);
+    rb_change_child(old, new, parent, root);
+}
+
+static void rb_insert_augmented(RBNode *node, RBRoot *root,
+                                const RBAugmentCallbacks *augment)
+{
+    RBNode *parent = rb_red_parent(node), *gparent, *tmp;
+
+    while (true) {
+        /*
+         * Loop invariant: node is red.
+         */
+        if (unlikely(!parent)) {
+            /*
+             * The inserted node is root. Either this is the first node, or
+             * we recursed at Case 1 below and are no longer violating 4).
+             */
+            rb_set_parent_color(node, NULL, RB_BLACK);
+            break;
+        }
+
+        /*
+         * If there is a black parent, we are done.  Otherwise, take some
+         * corrective action as, per 4), we don't want a red root or two
+         * consecutive red nodes.
+         */
+        if (rb_is_black(parent)) {
+            break;
+        }
+
+        gparent = rb_red_parent(parent);
+
+        tmp = gparent->rb_right;
+        if (parent != tmp) {    /* parent == gparent->rb_left */
+            if (tmp && rb_is_red(tmp)) {
+                /*
+                 * Case 1 - node's uncle is red (color flips).
+                 *
+                 *       G            g
+                 *      / \          / \
+                 *     p   u  -->   P   U
+                 *    /            /
+                 *   n            n
+                 *
+                 * However, since g's parent might be red, and 4) does not
+                 * allow this, we need to recurse at g.
+                 */
+                rb_set_parent_color(tmp, gparent, RB_BLACK);
+                rb_set_parent_color(parent, gparent, RB_BLACK);
+                node = gparent;
+                parent = rb_parent(node);
+                rb_set_parent_color(node, parent, RB_RED);
+                continue;
+            }
+
+            tmp = parent->rb_right;
+            if (node == tmp) {
+                /*
+                 * Case 2 - node's uncle is black and node is
+                 * the parent's right child (left rotate at parent).
+                 *
+                 *      G             G
+                 *     / \           / \
+                 *    p   U  -->    n   U
+                 *     \           /
+                 *      n         p
+                 *
+                 * This still leaves us in violation of 4), the
+                 * continuation into Case 3 will fix that.
+                 */
+                tmp = node->rb_left;
+                qatomic_set(&parent->rb_right, tmp);
+                qatomic_set(&node->rb_left, parent);
+                if (tmp) {
+                    rb_set_parent_color(tmp, parent, RB_BLACK);
+                }
+                rb_set_parent_color(parent, node, RB_RED);
+                augment->rotate(parent, node);
+                parent = node;
+                tmp = node->rb_right;
+            }
+
+            /*
+             * Case 3 - node's uncle is black and node is
+             * the parent's left child (right rotate at gparent).
+             *
+             *        G           P
+             *       / \         / \
+             *      p   U  -->  n   g
+             *     /                 \
+             *    n                   U
+             */
+            qatomic_set(&gparent->rb_left, tmp); /* == parent->rb_right */
+            qatomic_set(&parent->rb_right, gparent);
+            if (tmp) {
+                rb_set_parent_color(tmp, gparent, RB_BLACK);
+            }
+            rb_rotate_set_parents(gparent, parent, root, RB_RED);
+            augment->rotate(gparent, parent);
+            break;
+        } else {
+            tmp = gparent->rb_left;
+            if (tmp && rb_is_red(tmp)) {
+                /* Case 1 - color flips */
+                rb_set_parent_color(tmp, gparent, RB_BLACK);
+                rb_set_parent_color(parent, gparent, RB_BLACK);
+                node = gparent;
+                parent = rb_parent(node);
+                rb_set_parent_color(node, parent, RB_RED);
+                continue;
+            }
+
+            tmp = parent->rb_left;
+            if (node == tmp) {
+                /* Case 2 - right rotate at parent */
+                tmp = node->rb_right;
+                qatomic_set(&parent->rb_left, tmp);
+                qatomic_set(&node->rb_right, parent);
+                if (tmp) {
+                    rb_set_parent_color(tmp, parent, RB_BLACK);
+                }
+                rb_set_parent_color(parent, node, RB_RED);
+                augment->rotate(parent, node);
+                parent = node;
+                tmp = node->rb_left;
+            }
+
+            /* Case 3 - left rotate at gparent */
+            qatomic_set(&gparent->rb_right, tmp); /* == parent->rb_left */
+            qatomic_set(&parent->rb_left, gparent);
+            if (tmp) {
+                rb_set_parent_color(tmp, gparent, RB_BLACK);
+            }
+            rb_rotate_set_parents(gparent, parent, root, RB_RED);
+            augment->rotate(gparent, parent);
+            break;
+        }
+    }
+}
+
+static void rb_insert_augmented_cached(RBNode *node,
+                                       RBRootLeftCached *root, bool newleft,
+                                       const RBAugmentCallbacks *augment)
+{
+    if (newleft) {
+        root->rb_leftmost = node;
+    }
+    rb_insert_augmented(node, &root->rb_root, augment);
+}
+
+static void rb_erase_color(RBNode *parent, RBRoot *root,
+                           const RBAugmentCallbacks *augment)
+{
+    RBNode *node = NULL, *sibling, *tmp1, *tmp2;
+
+    while (true) {
+        /*
+         * Loop invariants:
+         * - node is black (or NULL on first iteration)
+         * - node is not the root (parent is not NULL)
+         * - All leaf paths going through parent and node have a
+         *   black node count that is 1 lower than other leaf paths.
+         */
+        sibling = parent->rb_right;
+        if (node != sibling) {  /* node == parent->rb_left */
+            if (rb_is_red(sibling)) {
+                /*
+                 * Case 1 - left rotate at parent
+                 *
+                 *     P               S
+                 *    / \             / \ 
+                 *   N   s    -->    p   Sr
+                 *      / \         / \ 
+                 *     Sl  Sr      N   Sl
+                 */
+                tmp1 = sibling->rb_left;
+                qatomic_set(&parent->rb_right, tmp1);
+                qatomic_set(&sibling->rb_left, parent);
+                rb_set_parent_color(tmp1, parent, RB_BLACK);
+                rb_rotate_set_parents(parent, sibling, root, RB_RED);
+                augment->rotate(parent, sibling);
+                sibling = tmp1;
+            }
+            tmp1 = sibling->rb_right;
+            if (!tmp1 || rb_is_black(tmp1)) {
+                tmp2 = sibling->rb_left;
+                if (!tmp2 || rb_is_black(tmp2)) {
+                    /*
+                     * Case 2 - sibling color flip
+                     * (p could be either color here)
+                     *
+                     *    (p)           (p)
+                     *    / \           / \ 
+                     *   N   S    -->  N   s
+                     *      / \           / \ 
+                     *     Sl  Sr        Sl  Sr
+                     *
+                     * This leaves us violating 5) which
+                     * can be fixed by flipping p to black
+                     * if it was red, or by recursing at p.
+                     * p is red when coming from Case 1.
+                     */
+                    rb_set_parent_color(sibling, parent, RB_RED);
+                    if (rb_is_red(parent)) {
+                        rb_set_black(parent);
+                    } else {
+                        node = parent;
+                        parent = rb_parent(node);
+                        if (parent) {
+                            continue;
+                        }
+                    }
+                    break;
+                }
+                /*
+                 * Case 3 - right rotate at sibling
+                 * (p could be either color here)
+                 *
+                 *   (p)           (p)
+                 *   / \           / \
+                 *  N   S    -->  N   sl
+                 *     / \             \
+                 *    sl  Sr            S
+                 *                       \
+                 *                        Sr
+                 *
+                 * Note: p might be red, and then bot
+                 * p and sl are red after rotation (which
+                 * breaks property 4). This is fixed in
+                 * Case 4 (in rb_rotate_set_parents()
+                 *         which set sl the color of p
+                 *         and set p RB_BLACK)
+                 *
+                 *   (p)            (sl)
+                 *   / \            /  \
+                 *  N   sl   -->   P    S
+                 *       \        /      \
+                 *        S      N        Sr
+                 *         \
+                 *          Sr
+                 */
+                tmp1 = tmp2->rb_right;
+                qatomic_set(&sibling->rb_left, tmp1);
+                qatomic_set(&tmp2->rb_right, sibling);
+                qatomic_set(&parent->rb_right, tmp2);
+                if (tmp1) {
+                    rb_set_parent_color(tmp1, sibling, RB_BLACK);
+                }
+                augment->rotate(sibling, tmp2);
+                tmp1 = sibling;
+                sibling = tmp2;
+            }
+            /*
+             * Case 4 - left rotate at parent + color flips
+             * (p and sl could be either color here.
+             *  After rotation, p becomes black, s acquires
+             *  p's color, and sl keeps its color)
+             *
+             *      (p)             (s)
+             *      / \             / \
+             *     N   S     -->   P   Sr
+             *        / \         / \
+             *      (sl) sr      N  (sl)
+             */
+            tmp2 = sibling->rb_left;
+            qatomic_set(&parent->rb_right, tmp2);
+            qatomic_set(&sibling->rb_left, parent);
+            rb_set_parent_color(tmp1, sibling, RB_BLACK);
+            if (tmp2) {
+                rb_set_parent(tmp2, parent);
+            }
+            rb_rotate_set_parents(parent, sibling, root, RB_BLACK);
+            augment->rotate(parent, sibling);
+            break;
+        } else {
+            sibling = parent->rb_left;
+            if (rb_is_red(sibling)) {
+                /* Case 1 - right rotate at parent */
+                tmp1 = sibling->rb_right;
+                qatomic_set(&parent->rb_left, tmp1);
+                qatomic_set(&sibling->rb_right, parent);
+                rb_set_parent_color(tmp1, parent, RB_BLACK);
+                rb_rotate_set_parents(parent, sibling, root, RB_RED);
+                augment->rotate(parent, sibling);
+                sibling = tmp1;
+            }
+            tmp1 = sibling->rb_left;
+            if (!tmp1 || rb_is_black(tmp1)) {
+                tmp2 = sibling->rb_right;
+                if (!tmp2 || rb_is_black(tmp2)) {
+                    /* Case 2 - sibling color flip */
+                    rb_set_parent_color(sibling, parent, RB_RED);
+                    if (rb_is_red(parent)) {
+                        rb_set_black(parent);
+                    } else {
+                        node = parent;
+                        parent = rb_parent(node);
+                        if (parent) {
+                            continue;
+                        }
+                    }
+                    break;
+                }
+                /* Case 3 - left rotate at sibling */
+                tmp1 = tmp2->rb_left;
+                qatomic_set(&sibling->rb_right, tmp1);
+                qatomic_set(&tmp2->rb_left, sibling);
+                qatomic_set(&parent->rb_left, tmp2);
+                if (tmp1) {
+                    rb_set_parent_color(tmp1, sibling, RB_BLACK);
+                }
+                augment->rotate(sibling, tmp2);
+                tmp1 = sibling;
+                sibling = tmp2;
+            }
+            /* Case 4 - right rotate at parent + color flips */
+            tmp2 = sibling->rb_right;
+            qatomic_set(&parent->rb_left, tmp2);
+            qatomic_set(&sibling->rb_right, parent);
+            rb_set_parent_color(tmp1, sibling, RB_BLACK);
+            if (tmp2) {
+                rb_set_parent(tmp2, parent);
+            }
+            rb_rotate_set_parents(parent, sibling, root, RB_BLACK);
+            augment->rotate(parent, sibling);
+            break;
+        }
+    }
+}
+
+static void rb_erase_augmented(RBNode *node, RBRoot *root,
+                               const RBAugmentCallbacks *augment)
+{
+    RBNode *child = node->rb_right;
+    RBNode *tmp = node->rb_left;
+    RBNode *parent, *rebalance;
+    uintptr_t pc;
+
+    if (!tmp) {
+        /*
+         * Case 1: node to erase has no more than 1 child (easy!)
+         *
+         * Note that if there is one child it must be red due to 5)
+         * and node must be black due to 4). We adjust colors locally
+         * so as to bypass rb_erase_color() later on.
+         */
+        pc = node->rb_parent_color;
+        parent = rb_parent(node);
+        rb_change_child(node, child, parent, root);
+        if (child) {
+            child->rb_parent_color = pc;
+            rebalance = NULL;
+        } else {
+            rebalance = pc_is_black(pc) ? parent : NULL;
+        }
+        tmp = parent;
+    } else if (!child) {
+        /* Still case 1, but this time the child is node->rb_left */
+        pc = node->rb_parent_color;
+        parent = rb_parent(node);
+        tmp->rb_parent_color = pc;
+        rb_change_child(node, tmp, parent, root);
+        rebalance = NULL;
+        tmp = parent;
+    } else {
+        RBNode *successor = child, *child2;
+        tmp = child->rb_left;
+        if (!tmp) {
+            /*
+             * Case 2: node's successor is its right child
+             *
+             *    (n)          (s)
+             *    / \          / \
+             *  (x) (s)  ->  (x) (c)
+             *        \
+             *        (c)
+             */
+            parent = successor;
+            child2 = successor->rb_right;
+
+            augment->copy(node, successor);
+        } else {
+            /*
+             * Case 3: node's successor is leftmost under
+             * node's right child subtree
+             *
+             *    (n)          (s)
+             *    / \          / \
+             *  (x) (y)  ->  (x) (y)
+             *      /            /
+             *    (p)          (p)
+             *    /            /
+             *  (s)          (c)
+             *    \
+             *    (c)
+             */
+            do {
+                parent = successor;
+                successor = tmp;
+                tmp = tmp->rb_left;
+            } while (tmp);
+            child2 = successor->rb_right;
+            qatomic_set(&parent->rb_left, child2);
+            qatomic_set(&successor->rb_right, child);
+            rb_set_parent(child, successor);
+
+            augment->copy(node, successor);
+            augment->propagate(parent, successor);
+        }
+
+        tmp = node->rb_left;
+        qatomic_set(&successor->rb_left, tmp);
+        rb_set_parent(tmp, successor);
+
+        pc = node->rb_parent_color;
+        tmp = rb_parent(node);
+        rb_change_child(node, successor, tmp, root);
+
+        if (child2) {
+            rb_set_parent_color(child2, parent, RB_BLACK);
+            rebalance = NULL;
+        } else {
+            rebalance = rb_is_black(successor) ? parent : NULL;
+        }
+        successor->rb_parent_color = pc;
+        tmp = successor;
+    }
+
+    augment->propagate(tmp, NULL);
+
+    if (rebalance) {
+        rb_erase_color(rebalance, root, augment);
+    }
+}
+
+static void rb_erase_augmented_cached(RBNode *node, RBRootLeftCached *root,
+                                      const RBAugmentCallbacks *augment)
+{
+    if (root->rb_leftmost == node) {
+        root->rb_leftmost = rb_next(node);
+    }
+    rb_erase_augmented(node, &root->rb_root, augment);
+}
+
+
+/*
+ * Interval trees.
+ *
+ * Derived from lib/interval_tree.c and its dependencies,
+ * especially include/linux/interval_tree_generic.h.
+ */
+
+#define rb_to_itree(N)  container_of(N, IntervalTreeNode, rb)
+
+static bool interval_tree_compute_max(IntervalTreeNode *node, bool exit)
+{
+    IntervalTreeNode *child;
+    uint64_t max = node->last;
+
+    if (node->rb.rb_left) {
+        child = rb_to_itree(node->rb.rb_left);
+        if (child->subtree_last > max) {
+            max = child->subtree_last;
+        }
+    }
+    if (node->rb.rb_right) {
+        child = rb_to_itree(node->rb.rb_right);
+        if (child->subtree_last > max) {
+            max = child->subtree_last;
+        }
+    }
+    if (exit && node->subtree_last == max) {
+        return true;
+    }
+    node->subtree_last = max;
+    return false;
+}
+
+static void interval_tree_propagate(RBNode *rb, RBNode *stop)
+{
+    while (rb != stop) {
+        IntervalTreeNode *node = rb_to_itree(rb);
+        if (interval_tree_compute_max(node, true)) {
+            break;
+        }
+        rb = rb_parent(&node->rb);
+    }
+}
+
+static void interval_tree_copy(RBNode *rb_old, RBNode *rb_new)
+{
+    IntervalTreeNode *old = rb_to_itree(rb_old);
+    IntervalTreeNode *new = rb_to_itree(rb_new);
+
+    new->subtree_last = old->subtree_last;
+}
+
+static void interval_tree_rotate(RBNode *rb_old, RBNode *rb_new)
+{
+    IntervalTreeNode *old = rb_to_itree(rb_old);
+    IntervalTreeNode *new = rb_to_itree(rb_new);
+
+    new->subtree_last = old->subtree_last;
+    interval_tree_compute_max(old, false);
+}
+
+static const RBAugmentCallbacks interval_tree_augment = {
+    .propagate = interval_tree_propagate,
+    .copy = interval_tree_copy,
+    .rotate = interval_tree_rotate,
+};
+
+/* Insert / remove interval nodes from the tree */
+void interval_tree_insert(IntervalTreeNode *node, IntervalTreeRoot *root)
+{
+    RBNode **link = &root->rb_root.rb_node, *rb_parent = NULL;
+    uint64_t start = node->start, last = node->last;
+    IntervalTreeNode *parent;
+    bool leftmost = true;
+
+    while (*link) {
+        rb_parent = *link;
+        parent = rb_to_itree(rb_parent);
+
+        if (parent->subtree_last < last) {
+            parent->subtree_last = last;
+        }
+        if (start < parent->start) {
+            link = &parent->rb.rb_left;
+        } else {
+            link = &parent->rb.rb_right;
+            leftmost = false;
+        }
+    }
+
+    node->subtree_last = last;
+    rb_link_node(&node->rb, rb_parent, link);
+    rb_insert_augmented_cached(&node->rb, root, leftmost,
+                               &interval_tree_augment);
+}
+
+void interval_tree_remove(IntervalTreeNode *node, IntervalTreeRoot *root)
+{
+    rb_erase_augmented_cached(&node->rb, root, &interval_tree_augment);
+}
+
+/*
+ * Iterate over intervals intersecting [start;last]
+ *
+ * Note that a node's interval intersects [start;last] iff:
+ *   Cond1: node->start <= last
+ * and
+ *   Cond2: start <= node->last
+ */
+
+static IntervalTreeNode *interval_tree_subtree_search(IntervalTreeNode *node,
+                                                      uint64_t start,
+                                                      uint64_t last)
+{
+    while (true) {
+        /*
+         * Loop invariant: start <= node->subtree_last
+         * (Cond2 is satisfied by one of the subtree nodes)
+         */
+        if (node->rb.rb_left) {
+            IntervalTreeNode *left = rb_to_itree(node->rb.rb_left);
+
+            if (start <= left->subtree_last) {
+                /*
+                 * Some nodes in left subtree satisfy Cond2.
+                 * Iterate to find the leftmost such node N.
+                 * If it also satisfies Cond1, that's the
+                 * match we are looking for. Otherwise, there
+                 * is no matching interval as nodes to the
+                 * right of N can't satisfy Cond1 either.
+                 */
+                node = left;
+                continue;
+            }
+        }
+        if (node->start <= last) {         /* Cond1 */
+            if (start <= node->last) {     /* Cond2 */
+                return node; /* node is leftmost match */
+            }
+            if (node->rb.rb_right) {
+                node = rb_to_itree(node->rb.rb_right);
+                if (start <= node->subtree_last) {
+                    continue;
+                }
+            }
+        }
+        return NULL; /* no match */
+    }
+}
+
+IntervalTreeNode *interval_tree_iter_first(IntervalTreeRoot *root,
+                                           uint64_t start, uint64_t last)
+{
+    IntervalTreeNode *node, *leftmost;
+
+    if (!root->rb_root.rb_node) {
+        return NULL;
+    }
+
+    /*
+     * Fastpath range intersection/overlap between A: [a0, a1] and
+     * B: [b0, b1] is given by:
+     *
+     *         a0 <= b1 && b0 <= a1
+     *
+     *  ... where A holds the lock range and B holds the smallest
+     * 'start' and largest 'last' in the tree. For the later, we
+     * rely on the root node, which by augmented interval tree
+     * property, holds the largest value in its last-in-subtree.
+     * This allows mitigating some of the tree walk overhead for
+     * for non-intersecting ranges, maintained and consulted in O(1).
+     */
+    node = rb_to_itree(root->rb_root.rb_node);
+    if (node->subtree_last < start) {
+        return NULL;
+    }
+
+    leftmost = rb_to_itree(root->rb_leftmost);
+    if (leftmost->start > last) {
+        return NULL;
+    }
+
+    return interval_tree_subtree_search(node, start, last);
+}
+
+IntervalTreeNode *interval_tree_iter_next(IntervalTreeNode *node,
+                                          uint64_t start, uint64_t last)
+{
+    RBNode *rb = node->rb.rb_right, *prev;
+
+    while (true) {
+        /*
+         * Loop invariants:
+         *   Cond1: node->start <= last
+         *   rb == node->rb.rb_right
+         *
+         * First, search right subtree if suitable
+         */
+        if (rb) {
+            IntervalTreeNode *right = rb_to_itree(rb);
+
+            if (start <= right->subtree_last) {
+                return interval_tree_subtree_search(right, start, last);
+            }
+        }
+
+        /* Move up the tree until we come from a node's left child */
+        do {
+            rb = rb_parent(&node->rb);
+            if (!rb) {
+                return NULL;
+            }
+            prev = &node->rb;
+            node = rb_to_itree(rb);
+            rb = node->rb.rb_right;
+        } while (prev == rb);
+
+        /* Check if the node intersects [start;last] */
+        if (last < node->start) {  /* !Cond1 */
+            return NULL;
+        }
+        if (start <= node->last) { /* Cond2 */
+            return node;
+        }
+    }
+}
+
+/* Occasionally useful for calling from within the debugger. */
+#if 0
+static void debug_interval_tree_int(IntervalTreeNode *node,
+                                    const char *dir, int level)
+{
+    printf("%4d %*s %s [%" PRIu64 ",%" PRIu64 "] subtree_last:%" PRIu64 "\n",
+           level, level + 1, dir, rb_is_red(&node->rb) ? "r" : "b",
+           node->start, node->last, node->subtree_last);
+
+    if (node->rb.rb_left) {
+        debug_interval_tree_int(rb_to_itree(node->rb.rb_left), "<", level + 1);
+    }
+    if (node->rb.rb_right) {
+        debug_interval_tree_int(rb_to_itree(node->rb.rb_right), ">", level + 1);
+    }
+}
+
+void debug_interval_tree(IntervalTreeNode *node);
+void debug_interval_tree(IntervalTreeNode *node)
+{
+    if (node) {
+        debug_interval_tree_int(node, "*", 0);
+    } else {
+        printf("null\n");
+    }
+}
+#endif
diff --git a/util/meson.build b/util/meson.build
index 25b9b61f98..d8d109ff84 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -57,6 +57,7 @@ util_ss.add(files('guest-random.c'))
 util_ss.add(files('yank.c'))
 util_ss.add(files('int128.c'))
 util_ss.add(files('memalign.c'))
+util_ss.add(files('interval-tree.c'))
 
 if have_user
   util_ss.add(files('selfmap.c'))