summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--block.c404
-rw-r--r--block/block-backend.c25
-rw-r--r--block/block-copy.c21
-rw-r--r--block/block-gen.h11
-rw-r--r--block/commit.c4
-rw-r--r--block/coroutines.h21
-rw-r--r--block/crypto.c2
-rw-r--r--block/dirty-bitmap.c88
-rw-r--r--block/graph-lock.c275
-rw-r--r--block/io.c367
-rw-r--r--block/meson.build2
-rw-r--r--block/parallels.c2
-rw-r--r--block/qcow.c2
-rw-r--r--block/qcow2.c4
-rw-r--r--block/qed.c28
-rw-r--r--block/raw-format.c2
-rw-r--r--block/replication.c6
-rw-r--r--block/stream.c26
-rw-r--r--block/throttle.c8
-rw-r--r--block/vdi.c2
-rw-r--r--block/vhdx.c2
-rw-r--r--block/vmdk.c38
-rw-r--r--block/vpc.c2
-rw-r--r--blockdev.c17
-rw-r--r--blockjob.c2
-rw-r--r--docs/devel/block-coroutine-wrapper.rst6
-rw-r--r--include/block/aio.h9
-rw-r--r--include/block/block-common.h27
-rw-r--r--include/block/block-copy.h5
-rw-r--r--include/block/block-global-state.h15
-rw-r--r--include/block/block-io.h136
-rw-r--r--include/block/block_int-common.h49
-rw-r--r--include/block/block_int-global-state.h17
-rw-r--r--include/block/block_int-io.h12
-rw-r--r--include/block/block_int.h1
-rw-r--r--include/block/dirty-bitmap.h10
-rw-r--r--include/block/graph-lock.h280
-rw-r--r--include/qemu/clang-tsa.h114
-rw-r--r--include/sysemu/block-backend-io.h77
-rw-r--r--nbd/server.c47
-rw-r--r--scripts/block-coroutine-wrapper.py133
-rw-r--r--stubs/graph-lock.c10
-rw-r--r--stubs/meson.build1
-rw-r--r--tests/unit/test-bdrv-drain.c387
-rw-r--r--util/async.c4
45 files changed, 1574 insertions, 1127 deletions
diff --git a/block.c b/block.c
index a18f052374..9c2ac757e4 100644
--- a/block.c
+++ b/block.c
@@ -93,8 +93,6 @@ static bool bdrv_recurse_has_child(BlockDriverState *bs,
 static void bdrv_replace_child_noperm(BdrvChild *child,
                                       BlockDriverState *new_bs);
 static void bdrv_remove_child(BdrvChild *child, Transaction *tran);
-static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
-                                            Transaction *tran);
 
 static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                                BlockReopenQueue *queue,
@@ -528,65 +526,24 @@ typedef struct CreateCo {
     Error *err;
 } CreateCo;
 
-static void coroutine_fn bdrv_create_co_entry(void *opaque)
-{
-    Error *local_err = NULL;
-    int ret;
-
-    CreateCo *cco = opaque;
-    assert(cco->drv);
-    GLOBAL_STATE_CODE();
-
-    ret = cco->drv->bdrv_co_create_opts(cco->drv,
-                                        cco->filename, cco->opts, &local_err);
-    error_propagate(&cco->err, local_err);
-    cco->ret = ret;
-}
-
-int bdrv_create(BlockDriver *drv, const char* filename,
-                QemuOpts *opts, Error **errp)
+int coroutine_fn bdrv_co_create(BlockDriver *drv, const char *filename,
+                                QemuOpts *opts, Error **errp)
 {
     int ret;
-
     GLOBAL_STATE_CODE();
-
-    Coroutine *co;
-    CreateCo cco = {
-        .drv = drv,
-        .filename = g_strdup(filename),
-        .opts = opts,
-        .ret = NOT_DONE,
-        .err = NULL,
-    };
+    ERRP_GUARD();
 
     if (!drv->bdrv_co_create_opts) {
-        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
-        ret = -ENOTSUP;
-        goto out;
-    }
-
-    if (qemu_in_coroutine()) {
-        /* Fast-path if already in coroutine context */
-        bdrv_create_co_entry(&cco);
-    } else {
-        co = qemu_coroutine_create(bdrv_create_co_entry, &cco);
-        qemu_coroutine_enter(co);
-        while (cco.ret == NOT_DONE) {
-            aio_poll(qemu_get_aio_context(), true);
-        }
+        error_setg(errp, "Driver '%s' does not support image creation",
+                   drv->format_name);
+        return -ENOTSUP;
     }
 
-    ret = cco.ret;
-    if (ret < 0) {
-        if (cco.err) {
-            error_propagate(errp, cco.err);
-        } else {
-            error_setg_errno(errp, -ret, "Could not create image");
-        }
+    ret = drv->bdrv_co_create_opts(drv, filename, opts, errp);
+    if (ret < 0 && !*errp) {
+        error_setg_errno(errp, -ret, "Could not create image");
     }
 
-out:
-    g_free(cco.filename);
     return ret;
 }
 
@@ -725,7 +682,8 @@ out:
     return ret;
 }
 
-int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
+int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts,
+                                     Error **errp)
 {
     QemuOpts *protocol_opts;
     BlockDriver *drv;
@@ -766,7 +724,7 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
         goto out;
     }
 
-    ret = bdrv_create(drv, filename, protocol_opts, errp);
+    ret = bdrv_co_create(drv, filename, protocol_opts, errp);
 out:
     qemu_opts_del(protocol_opts);
     qobject_unref(qdict);
@@ -1228,20 +1186,19 @@ static char *bdrv_child_get_parent_desc(BdrvChild *c)
 static void bdrv_child_cb_drained_begin(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
-    bdrv_do_drained_begin_quiesce(bs, NULL, false);
+    bdrv_do_drained_begin_quiesce(bs, NULL);
 }
 
 static bool bdrv_child_cb_drained_poll(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
-    return bdrv_drain_poll(bs, false, NULL, false);
+    return bdrv_drain_poll(bs, NULL, false);
 }
 
-static void bdrv_child_cb_drained_end(BdrvChild *child,
-                                      int *drained_end_counter)
+static void bdrv_child_cb_drained_end(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
-    bdrv_drained_end_no_poll(bs, drained_end_counter);
+    bdrv_drained_end(bs);
 }
 
 static int bdrv_child_cb_inactivate(BdrvChild *child)
@@ -1445,11 +1402,11 @@ static void bdrv_inherited_options(BdrvChildRole role, bool parent_is_format,
     *child_flags = flags;
 }
 
-static void bdrv_child_cb_attach(BdrvChild *child)
+static void GRAPH_WRLOCK bdrv_child_cb_attach(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
 
-    assert_bdrv_graph_writable(bs);
+    assert_bdrv_graph_writable();
     QLIST_INSERT_HEAD(&bs->children, child, next);
     if (bs->drv->is_filter || (child->role & BDRV_CHILD_FILTERED)) {
         /*
@@ -1485,11 +1442,9 @@ static void bdrv_child_cb_attach(BdrvChild *child)
         assert(!bs->file);
         bs->file = child;
     }
-
-    bdrv_apply_subtree_drain(child, bs);
 }
 
-static void bdrv_child_cb_detach(BdrvChild *child)
+static void GRAPH_WRLOCK bdrv_child_cb_detach(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
 
@@ -1497,9 +1452,7 @@ static void bdrv_child_cb_detach(BdrvChild *child)
         bdrv_backing_detach(child);
     }
 
-    bdrv_unapply_subtree_drain(child, bs);
-
-    assert_bdrv_graph_writable(bs);
+    assert_bdrv_graph_writable();
     QLIST_REMOVE(child, next);
     if (child == bs->backing) {
         assert(child != bs->file);
@@ -1715,8 +1668,8 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
     assert(is_power_of_2(bs->bl.request_alignment));
 
     for (i = 0; i < bs->quiesce_counter; i++) {
-        if (drv->bdrv_co_drain_begin) {
-            drv->bdrv_co_drain_begin(bs);
+        if (drv->bdrv_drain_begin) {
+            drv->bdrv_drain_begin(bs);
         }
     }
 
@@ -2414,6 +2367,20 @@ static void bdrv_replace_child_abort(void *opaque)
 
     GLOBAL_STATE_CODE();
     /* old_bs reference is transparently moved from @s to @s->child */
+    if (!s->child->bs) {
+        /*
+         * The parents were undrained when removing old_bs from the child. New
+         * requests can't have been made, though, because the child was empty.
+         *
+         * TODO Make bdrv_replace_child_noperm() transactionable to avoid
+         * undraining the parent in the first place. Once this is done, having
+         * new_bs drained when calling bdrv_replace_child_tran() is not a
+         * requirement any more.
+         */
+        bdrv_parent_drained_begin_single(s->child);
+        assert(!bdrv_parent_drained_poll_single(s->child));
+    }
+    assert(s->child->quiesced_parent);
     bdrv_replace_child_noperm(s->child, s->old_bs);
     bdrv_unref(new_bs);
 }
@@ -2429,12 +2396,19 @@ static TransactionActionDrv bdrv_replace_child_drv = {
  *
  * Note: real unref of old_bs is done only on commit.
  *
+ * Both @child->bs and @new_bs (if non-NULL) must be drained. @new_bs must be
+ * kept drained until the transaction is completed.
+ *
  * The function doesn't update permissions, caller is responsible for this.
  */
 static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
                                     Transaction *tran)
 {
     BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
+
+    assert(child->quiesced_parent);
+    assert(!new_bs || new_bs->quiesce_counter);
+
     *s = (BdrvReplaceChildState) {
         .child = child,
         .old_bs = child->bs,
@@ -2523,8 +2497,12 @@ static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
     return 0;
 }
 
-static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
-                                   Transaction *tran, Error **errp)
+/*
+ * @list is a product of bdrv_topological_dfs() (may be called several times) -
+ * a topologically sorted subgraph.
+ */
+static int bdrv_do_refresh_perms(GSList *list, BlockReopenQueue *q,
+                                 Transaction *tran, Error **errp)
 {
     int ret;
     BlockDriverState *bs;
@@ -2546,6 +2524,24 @@ static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
     return 0;
 }
 
+/*
+ * @list is any list of nodes. List is completed by all subtrees and
+ * topologically sorted. It's not a problem if some node occurs in the @list
+ * several times.
+ */
+static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
+                                   Transaction *tran, Error **errp)
+{
+    g_autoptr(GHashTable) found = g_hash_table_new(NULL, NULL);
+    g_autoptr(GSList) refresh_list = NULL;
+
+    for ( ; list; list = list->next) {
+        refresh_list = bdrv_topological_dfs(refresh_list, found, list->data);
+    }
+
+    return bdrv_do_refresh_perms(refresh_list, q, tran, errp);
+}
+
 void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
                               uint64_t *shared_perm)
 {
@@ -2593,15 +2589,24 @@ char *bdrv_perm_names(uint64_t perm)
 }
 
 
-static int bdrv_refresh_perms(BlockDriverState *bs, Error **errp)
+/* @tran is allowed to be NULL. In this case no rollback is possible */
+static int bdrv_refresh_perms(BlockDriverState *bs, Transaction *tran,
+                              Error **errp)
 {
     int ret;
-    Transaction *tran = tran_new();
+    Transaction *local_tran = NULL;
     g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
     GLOBAL_STATE_CODE();
 
-    ret = bdrv_list_refresh_perms(list, NULL, tran, errp);
-    tran_finalize(tran, ret);
+    if (!tran) {
+        tran = local_tran = tran_new();
+    }
+
+    ret = bdrv_do_refresh_perms(list, NULL, tran, errp);
+
+    if (local_tran) {
+        tran_finalize(local_tran, ret);
+    }
 
     return ret;
 }
@@ -2617,7 +2622,7 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
 
     bdrv_child_set_perm(c, perm, shared, tran);
 
-    ret = bdrv_refresh_perms(c->bs, &local_err);
+    ret = bdrv_refresh_perms(c->bs, tran, &local_err);
 
     tran_finalize(tran, ret);
 
@@ -2826,14 +2831,41 @@ uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
     return permissions[qapi_perm];
 }
 
+/*
+ * Replaces the node that a BdrvChild points to without updating permissions.
+ *
+ * If @new_bs is non-NULL, the parent of @child must already be drained through
+ * @child.
+ */
 static void bdrv_replace_child_noperm(BdrvChild *child,
                                       BlockDriverState *new_bs)
 {
     BlockDriverState *old_bs = child->bs;
     int new_bs_quiesce_counter;
-    int drain_saldo;
 
     assert(!child->frozen);
+
+    /*
+     * If we want to change the BdrvChild to point to a drained node as its new
+     * child->bs, we need to make sure that its new parent is drained, too. In
+     * other words, either child->quiesce_parent must already be true or we must
+     * be able to set it and keep the parent's quiesce_counter consistent with
+     * that, but without polling or starting new requests (this function
+     * guarantees that it doesn't poll, and starting new requests would be
+     * against the invariants of drain sections).
+     *
+     * To keep things simple, we pick the first option (child->quiesce_parent
+     * must already be true). We also generalise the rule a bit to make it
+     * easier to verify in callers and more likely to be covered in test cases:
+     * The parent must be quiesced through this child even if new_bs isn't
+     * currently drained.
+     *
+     * The only exception is for callers that always pass new_bs == NULL. In
+     * this case, we obviously never need to consider the case of a drained
+     * new_bs, so we can keep the callers simpler by allowing them not to drain
+     * the parent.
+     */
+    assert(!new_bs || child->quiesced_parent);
     assert(old_bs != new_bs);
     GLOBAL_STATE_CODE();
 
@@ -2841,59 +2873,33 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
         assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
     }
 
-    new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
-    drain_saldo = new_bs_quiesce_counter - child->parent_quiesce_counter;
-
-    /*
-     * If the new child node is drained but the old one was not, flush
-     * all outstanding requests to the old child node.
-     */
-    while (drain_saldo > 0 && child->klass->drained_begin) {
-        bdrv_parent_drained_begin_single(child, true);
-        drain_saldo--;
-    }
-
+    /* TODO Pull this up into the callers to avoid polling here */
+    bdrv_graph_wrlock();
     if (old_bs) {
-        /* Detach first so that the recursive drain sections coming from @child
-         * are already gone and we only end the drain sections that came from
-         * elsewhere. */
         if (child->klass->detach) {
             child->klass->detach(child);
         }
-        assert_bdrv_graph_writable(old_bs);
         QLIST_REMOVE(child, next_parent);
     }
 
     child->bs = new_bs;
 
     if (new_bs) {
-        assert_bdrv_graph_writable(new_bs);
         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
-
-        /*
-         * Detaching the old node may have led to the new node's
-         * quiesce_counter having been decreased.  Not a problem, we
-         * just need to recognize this here and then invoke
-         * drained_end appropriately more often.
-         */
-        assert(new_bs->quiesce_counter <= new_bs_quiesce_counter);
-        drain_saldo += new_bs->quiesce_counter - new_bs_quiesce_counter;
-
-        /* Attach only after starting new drained sections, so that recursive
-         * drain sections coming from @child don't get an extra .drained_begin
-         * callback. */
         if (child->klass->attach) {
             child->klass->attach(child);
         }
     }
+    bdrv_graph_wrunlock();
 
     /*
-     * If the old child node was drained but the new one is not, allow
-     * requests to come in only after the new node has been attached.
+     * If the parent was drained through this BdrvChild previously, but new_bs
+     * is not drained, allow requests to come in only after the new node has
+     * been attached.
      */
-    while (drain_saldo < 0 && child->klass->drained_end) {
+    new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
+    if (!new_bs_quiesce_counter && child->quiesced_parent) {
         bdrv_parent_drained_end_single(child);
-        drain_saldo++;
     }
 }
 
@@ -3026,6 +3032,24 @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
     }
 
     bdrv_ref(child_bs);
+    /*
+     * Let every new BdrvChild start with a drained parent. Inserting the child
+     * in the graph with bdrv_replace_child_noperm() will undrain it if
+     * @child_bs is not drained.
+     *
+     * The child was only just created and is not yet visible in global state
+     * until bdrv_replace_child_noperm() inserts it into the graph, so nobody
+     * could have sent requests and polling is not necessary.
+     *
+     * Note that this means that the parent isn't fully drained yet, we only
+     * stop new requests from coming in. This is fine, we don't care about the
+     * old requests here, they are not for this child. If another place enters a
+     * drain section for the same parent, but wants it to be fully quiesced, it
+     * will not run most of the the code in .drained_begin() again (which is not
+     * a problem, we already did this), but it will still poll until the parent
+     * is fully quiesced, so it will not be negatively affected either.
+     */
+    bdrv_parent_drained_begin_single(new_child);
     bdrv_replace_child_noperm(new_child, child_bs);
 
     BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
@@ -3070,30 +3094,6 @@ static BdrvChild *bdrv_attach_child_noperm(BlockDriverState *parent_bs,
                                     tran, errp);
 }
 
-static void bdrv_detach_child(BdrvChild *child)
-{
-    BlockDriverState *old_bs = child->bs;
-
-    GLOBAL_STATE_CODE();
-    bdrv_replace_child_noperm(child, NULL);
-    bdrv_child_free(child);
-
-    if (old_bs) {
-        /*
-         * Update permissions for old node. We're just taking a parent away, so
-         * we're loosening restrictions. Errors of permission update are not
-         * fatal in this case, ignore them.
-         */
-        bdrv_refresh_perms(old_bs, NULL);
-
-        /*
-         * When the parent requiring a non-default AioContext is removed, the
-         * node moves back to the main AioContext
-         */
-        bdrv_try_change_aio_context(old_bs, qemu_get_aio_context(), NULL, NULL);
-    }
-}
-
 /*
  * This function steals the reference to child_bs from the caller.
  * That reference is later dropped by bdrv_root_unref_child().
@@ -3125,7 +3125,7 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
         goto out;
     }
 
-    ret = bdrv_refresh_perms(child_bs, errp);
+    ret = bdrv_refresh_perms(child_bs, tran, errp);
 
 out:
     tran_finalize(tran, ret);
@@ -3166,7 +3166,7 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
         goto out;
     }
 
-    ret = bdrv_refresh_perms(parent_bs, errp);
+    ret = bdrv_refresh_perms(parent_bs, tran, errp);
     if (ret < 0) {
         goto out;
     }
@@ -3182,12 +3182,28 @@ out:
 /* Callers must ensure that child->frozen is false. */
 void bdrv_root_unref_child(BdrvChild *child)
 {
-    BlockDriverState *child_bs;
+    BlockDriverState *child_bs = child->bs;
 
     GLOBAL_STATE_CODE();
+    bdrv_replace_child_noperm(child, NULL);
+    bdrv_child_free(child);
+
+    if (child_bs) {
+        /*
+         * Update permissions for old node. We're just taking a parent away, so
+         * we're loosening restrictions. Errors of permission update are not
+         * fatal in this case, ignore them.
+         */
+        bdrv_refresh_perms(child_bs, NULL, NULL);
+
+        /*
+         * When the parent requiring a non-default AioContext is removed, the
+         * node moves back to the main AioContext
+         */
+        bdrv_try_change_aio_context(child_bs, qemu_get_aio_context(), NULL,
+                                    NULL);
+    }
 
-    child_bs = child->bs;
-    bdrv_detach_child(child);
     bdrv_unref(child_bs);
 }
 
@@ -3406,24 +3422,35 @@ static int bdrv_set_backing_noperm(BlockDriverState *bs,
     return bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp);
 }
 
-int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
-                        Error **errp)
+int bdrv_set_backing_hd_drained(BlockDriverState *bs,
+                                BlockDriverState *backing_hd,
+                                Error **errp)
 {
     int ret;
     Transaction *tran = tran_new();
 
     GLOBAL_STATE_CODE();
-    bdrv_drained_begin(bs);
+    assert(bs->quiesce_counter > 0);
 
     ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
     if (ret < 0) {
         goto out;
     }
 
-    ret = bdrv_refresh_perms(bs, errp);
+    ret = bdrv_refresh_perms(bs, tran, errp);
 out:
     tran_finalize(tran, ret);
+    return ret;
+}
+
+int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
+                        Error **errp)
+{
+    int ret;
+    GLOBAL_STATE_CODE();
 
+    bdrv_drained_begin(bs);
+    ret = bdrv_set_backing_hd_drained(bs, backing_hd, errp);
     bdrv_drained_end(bs);
 
     return ret;
@@ -4153,7 +4180,9 @@ static bool bdrv_recurse_has_child(BlockDriverState *bs,
  * returns a pointer to bs_queue, which is either the newly allocated
  * bs_queue, or the existing bs_queue being used.
  *
- * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
+ * bs is drained here and undrained by bdrv_reopen_queue_free().
+ *
+ * To be called with bs->aio_context locked.
  */
 static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
                                                  BlockDriverState *bs,
@@ -4173,12 +4202,10 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
     int flags;
     QemuOpts *opts;
 
-    /* Make sure that the caller remembered to use a drained section. This is
-     * important to avoid graph changes between the recursive queuing here and
-     * bdrv_reopen_multiple(). */
-    assert(bs->quiesce_counter > 0);
     GLOBAL_STATE_CODE();
 
+    bdrv_drained_begin(bs);
+
     if (bs_queue == NULL) {
         bs_queue = g_new0(BlockReopenQueue, 1);
         QTAILQ_INIT(bs_queue);
@@ -4312,6 +4339,7 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
     return bs_queue;
 }
 
+/* To be called with bs->aio_context locked */
 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                     BlockDriverState *bs,
                                     QDict *options, bool keep_old_opts)
@@ -4328,6 +4356,12 @@ void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue)
     if (bs_queue) {
         BlockReopenQueueEntry *bs_entry, *next;
         QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
+            AioContext *ctx = bdrv_get_aio_context(bs_entry->state.bs);
+
+            aio_context_acquire(ctx);
+            bdrv_drained_end(bs_entry->state.bs);
+            aio_context_release(ctx);
+
             qobject_unref(bs_entry->state.explicit_options);
             qobject_unref(bs_entry->state.options);
             g_free(bs_entry);
@@ -4361,7 +4395,6 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
     BlockReopenQueueEntry *bs_entry, *next;
     AioContext *ctx;
     Transaction *tran = tran_new();
-    g_autoptr(GHashTable) found = NULL;
     g_autoptr(GSList) refresh_list = NULL;
 
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
@@ -4391,18 +4424,15 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
         bs_entry->prepared = true;
     }
 
-    found = g_hash_table_new(NULL, NULL);
     QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
         BDRVReopenState *state = &bs_entry->state;
 
-        refresh_list = bdrv_topological_dfs(refresh_list, found, state->bs);
+        refresh_list = g_slist_prepend(refresh_list, state->bs);
         if (state->old_backing_bs) {
-            refresh_list = bdrv_topological_dfs(refresh_list, found,
-                                                state->old_backing_bs);
+            refresh_list = g_slist_prepend(refresh_list, state->old_backing_bs);
         }
         if (state->old_file_bs) {
-            refresh_list = bdrv_topological_dfs(refresh_list, found,
-                                                state->old_file_bs);
+            refresh_list = g_slist_prepend(refresh_list, state->old_file_bs);
         }
     }
 
@@ -4475,18 +4505,16 @@ int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts,
 
     GLOBAL_STATE_CODE();
 
-    bdrv_subtree_drained_begin(bs);
+    queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts);
+
     if (ctx != qemu_get_aio_context()) {
         aio_context_release(ctx);
     }
-
-    queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts);
     ret = bdrv_reopen_multiple(queue, errp);
 
     if (ctx != qemu_get_aio_context()) {
         aio_context_acquire(ctx);
     }
-    bdrv_subtree_drained_end(bs);
 
     return ret;
 }
@@ -5067,23 +5095,24 @@ static void bdrv_remove_child(BdrvChild *child, Transaction *tran)
     }
 
     if (child->bs) {
+        BlockDriverState *bs = child->bs;
+        bdrv_drained_begin(bs);
         bdrv_replace_child_tran(child, NULL, tran);
+        bdrv_drained_end(bs);
     }
 
     tran_add(tran, &bdrv_remove_child_drv, child);
 }
 
-/*
- * A function to remove backing-chain child of @bs if exists: cow child for
- * format nodes (always .backing) and filter child for filters (may be .file or
- * .backing)
- */
-static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
-                                            Transaction *tran)
+static void undrain_on_clean_cb(void *opaque)
 {
-    bdrv_remove_child(bdrv_filter_or_cow_child(bs), tran);
+    bdrv_drained_end(opaque);
 }
 
+static TransactionActionDrv undrain_on_clean = {
+    .clean = undrain_on_clean_cb,
+};
+
 static int bdrv_replace_node_noperm(BlockDriverState *from,
                                     BlockDriverState *to,
                                     bool auto_skip, Transaction *tran,
@@ -5093,6 +5122,11 @@ static int bdrv_replace_node_noperm(BlockDriverState *from,
 
     GLOBAL_STATE_CODE();
 
+    bdrv_drained_begin(from);
+    bdrv_drained_begin(to);
+    tran_add(tran, &undrain_on_clean, from);
+    tran_add(tran, &undrain_on_clean, to);
+
     QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
         assert(c->bs == from);
         if (!should_update_child(c, to)) {
@@ -5130,7 +5164,6 @@ static int bdrv_replace_node_common(BlockDriverState *from,
                                     Error **errp)
 {
     Transaction *tran = tran_new();
-    g_autoptr(GHashTable) found = NULL;
     g_autoptr(GSList) refresh_list = NULL;
     BlockDriverState *to_cow_parent = NULL;
     int ret;
@@ -5168,13 +5201,11 @@ static int bdrv_replace_node_common(BlockDriverState *from,
     }
 
     if (detach_subchain) {
-        bdrv_remove_filter_or_cow_child(to_cow_parent, tran);
+        bdrv_remove_child(bdrv_filter_or_cow_child(to_cow_parent), tran);
     }
 
-    found = g_hash_table_new(NULL, NULL);
-
-    refresh_list = bdrv_topological_dfs(refresh_list, found, to);
-    refresh_list = bdrv_topological_dfs(refresh_list, found, from);
+    refresh_list = g_slist_prepend(refresh_list, to);
+    refresh_list = g_slist_prepend(refresh_list, from);
 
     ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
     if (ret < 0) {
@@ -5244,7 +5275,7 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
         goto out;
     }
 
-    ret = bdrv_refresh_perms(bs_new, errp);
+    ret = bdrv_refresh_perms(bs_new, tran, errp);
 out:
     tran_finalize(tran, ret);
 
@@ -5259,7 +5290,6 @@ int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
 {
     int ret;
     Transaction *tran = tran_new();
-    g_autoptr(GHashTable) found = NULL;
     g_autoptr(GSList) refresh_list = NULL;
     BlockDriverState *old_bs = child->bs;
 
@@ -5271,9 +5301,8 @@ int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
 
     bdrv_replace_child_tran(child, new_bs, tran);
 
-    found = g_hash_table_new(NULL, NULL);
-    refresh_list = bdrv_topological_dfs(refresh_list, found, old_bs);
-    refresh_list = bdrv_topological_dfs(refresh_list, found, new_bs);
+    refresh_list = g_slist_prepend(refresh_list, old_bs);
+    refresh_list = g_slist_prepend(refresh_list, new_bs);
 
     ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
 
@@ -5373,6 +5402,7 @@ int coroutine_fn bdrv_co_check(BlockDriverState *bs,
                                BdrvCheckResult *res, BdrvCheckMode fix)
 {
     IO_CODE();
+    assert_bdrv_graph_readable();
     if (bs->drv == NULL) {
         return -ENOMEDIUM;
     }
@@ -5595,7 +5625,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
     GLOBAL_STATE_CODE();
 
     bdrv_ref(top);
-    bdrv_subtree_drained_begin(top);
+    bdrv_drained_begin(base);
 
     if (!top->drv || !base->drv) {
         goto exit;
@@ -5668,7 +5698,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
 
     ret = 0;
 exit:
-    bdrv_subtree_drained_end(top);
+    bdrv_drained_end(base);
     bdrv_unref(top);
     return ret;
 }
@@ -6544,7 +6574,7 @@ int bdrv_activate(BlockDriverState *bs, Error **errp)
      */
     if (bs->open_flags & BDRV_O_INACTIVE) {
         bs->open_flags &= ~BDRV_O_INACTIVE;
-        ret = bdrv_refresh_perms(bs, errp);
+        ret = bdrv_refresh_perms(bs, NULL, errp);
         if (ret < 0) {
             bs->open_flags |= BDRV_O_INACTIVE;
             return ret;
@@ -6588,6 +6618,7 @@ int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
     IO_CODE();
 
     assert(!(bs->open_flags & BDRV_O_INACTIVE));
+    assert_bdrv_graph_readable();
 
     if (bs->drv->bdrv_co_invalidate_cache) {
         bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
@@ -6689,7 +6720,7 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs)
      * We only tried to loosen restrictions, so errors are not fatal, ignore
      * them.
      */
-    bdrv_refresh_perms(bs, NULL);
+    bdrv_refresh_perms(bs, NULL, NULL);
 
     /* Recursively inactivate children */
     QLIST_FOREACH(child, &bs->children, next) {
@@ -6894,6 +6925,10 @@ bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
     return true;
 }
 
+/*
+ * Must not be called while holding the lock of an AioContext other than the
+ * current one.
+ */
 void bdrv_img_create(const char *filename, const char *fmt,
                      const char *base_filename, const char *base_fmt,
                      char *options, uint64_t img_size, int flags, bool quiet,
@@ -7181,7 +7216,6 @@ static void bdrv_detach_aio_context(BlockDriverState *bs)
     if (bs->quiesce_counter) {
         aio_enable_external(bs->aio_context);
     }
-    assert_bdrv_graph_writable(bs);
     bs->aio_context = NULL;
 }
 
@@ -7195,7 +7229,6 @@ static void bdrv_attach_aio_context(BlockDriverState *bs,
         aio_disable_external(new_context);
     }
 
-    assert_bdrv_graph_writable(bs);
     bs->aio_context = new_context;
 
     if (bs->drv && bs->drv->bdrv_attach_aio_context) {
@@ -7276,7 +7309,6 @@ static void bdrv_set_aio_context_commit(void *opaque)
     BlockDriverState *bs = (BlockDriverState *) state->bs;
     AioContext *new_context = state->new_ctx;
     AioContext *old_context = bdrv_get_aio_context(bs);
-    assert_bdrv_graph_writable(bs);
 
     /*
      * Take the old AioContex when detaching it from bs.
diff --git a/block/block-backend.c b/block/block-backend.c
index bf0ea3cfed..ba7bf1d6bc 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -129,7 +129,7 @@ static void blk_root_inherit_options(BdrvChildRole role, bool parent_is_format,
 }
 static void blk_root_drained_begin(BdrvChild *child);
 static bool blk_root_drained_poll(BdrvChild *child);
-static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter);
+static void blk_root_drained_end(BdrvChild *child);
 
 static void blk_root_change_media(BdrvChild *child, bool load);
 static void blk_root_resize(BdrvChild *child);
@@ -1424,6 +1424,27 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
     return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
 }
 
+int coroutine_fn blk_co_block_status_above(BlockBackend *blk,
+                                           BlockDriverState *base,
+                                           int64_t offset, int64_t bytes,
+                                           int64_t *pnum, int64_t *map,
+                                           BlockDriverState **file)
+{
+    IO_CODE();
+    return bdrv_co_block_status_above(blk_bs(blk), base, offset, bytes, pnum,
+                                      map, file);
+}
+
+int coroutine_fn blk_co_is_allocated_above(BlockBackend *blk,
+                                           BlockDriverState *base,
+                                           bool include_base, int64_t offset,
+                                           int64_t bytes, int64_t *pnum)
+{
+    IO_CODE();
+    return bdrv_co_is_allocated_above(blk_bs(blk), base, include_base, offset,
+                                      bytes, pnum);
+}
+
 typedef struct BlkRwCo {
     BlockBackend *blk;
     int64_t offset;
@@ -2556,7 +2577,7 @@ static bool blk_root_drained_poll(BdrvChild *child)
     return busy || !!blk->in_flight;
 }
 
-static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
+static void blk_root_drained_end(BdrvChild *child)
 {
     BlockBackend *blk = child->opaque;
     assert(blk->quiesce_counter);
diff --git a/block/block-copy.c b/block/block-copy.c
index bb947afdda..5e59d6262f 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -577,8 +577,9 @@ static coroutine_fn int block_copy_task_entry(AioTask *task)
     return ret;
 }
 
-static int block_copy_block_status(BlockCopyState *s, int64_t offset,
-                                   int64_t bytes, int64_t *pnum)
+static coroutine_fn int block_copy_block_status(BlockCopyState *s,
+                                                int64_t offset,
+                                                int64_t bytes, int64_t *pnum)
 {
     int64_t num;
     BlockDriverState *base;
@@ -590,8 +591,8 @@ static int block_copy_block_status(BlockCopyState *s, int64_t offset,
         base = NULL;
     }
 
-    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
-                                  NULL, NULL);
+    ret = bdrv_co_block_status_above(s->source->bs, base, offset, bytes, &num,
+                                     NULL, NULL);
     if (ret < 0 || num < s->cluster_size) {
         /*
          * On error or if failed to obtain large enough chunk just fallback to
@@ -613,8 +614,9 @@ static int block_copy_block_status(BlockCopyState *s, int64_t offset,
  * Check if the cluster starting at offset is allocated or not.
  * return via pnum the number of contiguous clusters sharing this allocation.
  */
-static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
-                                           int64_t *pnum)
+static int coroutine_fn block_copy_is_cluster_allocated(BlockCopyState *s,
+                                                        int64_t offset,
+                                                        int64_t *pnum)
 {
     BlockDriverState *bs = s->source->bs;
     int64_t count, total_count = 0;
@@ -624,7 +626,7 @@ static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
     assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
 
     while (true) {
-        ret = bdrv_is_allocated(bs, offset, bytes, &count);
+        ret = bdrv_co_is_allocated(bs, offset, bytes, &count);
         if (ret < 0) {
             return ret;
         }
@@ -669,8 +671,9 @@ void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes)
  * @return 0 when the cluster at @offset was unallocated,
  *         1 otherwise, and -ret on error.
  */
-int64_t block_copy_reset_unallocated(BlockCopyState *s,
-                                     int64_t offset, int64_t *count)
+int64_t coroutine_fn block_copy_reset_unallocated(BlockCopyState *s,
+                                                  int64_t offset,
+                                                  int64_t *count)
 {
     int ret;
     int64_t clusters, bytes;
diff --git a/block/block-gen.h b/block/block-gen.h
index f80cf4897d..89b7daaa1f 100644
--- a/block/block-gen.h
+++ b/block/block-gen.h
@@ -30,20 +30,17 @@
 
 /* Base structure for argument packing structures */
 typedef struct BdrvPollCo {
-    BlockDriverState *bs;
+    AioContext *ctx;
     bool in_progress;
-    int ret;
     Coroutine *co; /* Keep pointer here for debugging */
 } BdrvPollCo;
 
-static inline int bdrv_poll_co(BdrvPollCo *s)
+static inline void bdrv_poll_co(BdrvPollCo *s)
 {
     assert(!qemu_in_coroutine());
 
-    bdrv_coroutine_enter(s->bs, s->co);
-    BDRV_POLL_WHILE(s->bs, s->in_progress);
-
-    return s->ret;
+    aio_co_enter(s->ctx, s->co);
+    AIO_WAIT_WHILE(s->ctx, s->in_progress);
 }
 
 #endif /* BLOCK_BLOCK_GEN_H */
diff --git a/block/commit.c b/block/commit.c
index 0029b31944..b346341767 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -155,8 +155,8 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
             break;
         }
         /* Copy if allocated above the base */
-        ret = bdrv_is_allocated_above(blk_bs(s->top), s->base_overlay, true,
-                                      offset, COMMIT_BUFFER_SIZE, &n);
+        ret = blk_co_is_allocated_above(s->top, s->base_overlay, true,
+                                        offset, COMMIT_BUFFER_SIZE, &n);
         copy = (ret > 0);
         trace_commit_one_iteration(s, offset, n, ret);
         if (copy) {
diff --git a/block/coroutines.h b/block/coroutines.h
index 3a2bad564f..2a1e0b3c9d 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -37,9 +37,11 @@
  * the I/O API.
  */
 
-int coroutine_fn bdrv_co_check(BlockDriverState *bs,
-                               BdrvCheckResult *res, BdrvCheckMode fix);
-int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix);
+
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp);
 
 int coroutine_fn
 bdrv_co_common_block_status_above(BlockDriverState *bs,
@@ -53,10 +55,11 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
                                   BlockDriverState **file,
                                   int *depth);
 
-int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs,
-                                       QEMUIOVector *qiov, int64_t pos);
-int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
-                                        QEMUIOVector *qiov, int64_t pos);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
+
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 
 int coroutine_fn
 nbd_co_do_establish_connection(BlockDriverState *bs, bool blocking,
@@ -71,7 +74,7 @@ nbd_co_do_establish_connection(BlockDriverState *bs, bool blocking,
  * the "I/O or GS" API.
  */
 
-int generated_co_wrapper
+int co_wrapper_mixed_bdrv_rdlock
 bdrv_common_block_status_above(BlockDriverState *bs,
                                BlockDriverState *base,
                                bool include_base,
@@ -82,7 +85,7 @@ bdrv_common_block_status_above(BlockDriverState *bs,
                                int64_t *map,
                                BlockDriverState **file,
                                int *depth);
-int generated_co_wrapper
+int co_wrapper_mixed
 nbd_do_establish_connection(BlockDriverState *bs, bool blocking, Error **errp);
 
 #endif /* BLOCK_COROUTINES_H */
diff --git a/block/crypto.c b/block/crypto.c
index 2fb8add458..bbeb9f437c 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -703,7 +703,7 @@ static int coroutine_fn block_crypto_co_create_opts_luks(BlockDriver *drv,
     }
 
     /* Create protocol layer */
-    ret = bdrv_create_file(filename, opts, errp);
+    ret = bdrv_co_create_file(filename, opts, errp);
     if (ret < 0) {
         goto fail;
     }
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 9c39550698..956feeb2ae 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -388,7 +388,7 @@ void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs)
  * not fail.
  * This function doesn't release corresponding BdrvDirtyBitmap.
  */
-static int coroutine_fn
+int coroutine_fn
 bdrv_co_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name,
                                        Error **errp)
 {
@@ -399,45 +399,6 @@ bdrv_co_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name,
     return 0;
 }
 
-typedef struct BdrvRemovePersistentDirtyBitmapCo {
-    BlockDriverState *bs;
-    const char *name;
-    Error **errp;
-    int ret;
-} BdrvRemovePersistentDirtyBitmapCo;
-
-static void coroutine_fn
-bdrv_co_remove_persistent_dirty_bitmap_entry(void *opaque)
-{
-    BdrvRemovePersistentDirtyBitmapCo *s = opaque;
-
-    s->ret = bdrv_co_remove_persistent_dirty_bitmap(s->bs, s->name, s->errp);
-    aio_wait_kick();
-}
-
-int bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name,
-                                        Error **errp)
-{
-    if (qemu_in_coroutine()) {
-        return bdrv_co_remove_persistent_dirty_bitmap(bs, name, errp);
-    } else {
-        Coroutine *co;
-        BdrvRemovePersistentDirtyBitmapCo s = {
-            .bs = bs,
-            .name = name,
-            .errp = errp,
-            .ret = -EINPROGRESS,
-        };
-
-        co = qemu_coroutine_create(bdrv_co_remove_persistent_dirty_bitmap_entry,
-                                   &s);
-        bdrv_coroutine_enter(bs, co);
-        BDRV_POLL_WHILE(bs, s.ret == -EINPROGRESS);
-
-        return s.ret;
-    }
-}
-
 bool
 bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs)
 {
@@ -447,7 +408,7 @@ bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs)
     return false;
 }
 
-static bool coroutine_fn
+bool coroutine_fn
 bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
                                    uint32_t granularity, Error **errp)
 {
@@ -470,51 +431,6 @@ bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
     return drv->bdrv_co_can_store_new_dirty_bitmap(bs, name, granularity, errp);
 }
 
-typedef struct BdrvCanStoreNewDirtyBitmapCo {
-    BlockDriverState *bs;
-    const char *name;
-    uint32_t granularity;
-    Error **errp;
-    bool ret;
-
-    bool in_progress;
-} BdrvCanStoreNewDirtyBitmapCo;
-
-static void coroutine_fn bdrv_co_can_store_new_dirty_bitmap_entry(void *opaque)
-{
-    BdrvCanStoreNewDirtyBitmapCo *s = opaque;
-
-    s->ret = bdrv_co_can_store_new_dirty_bitmap(s->bs, s->name, s->granularity,
-                                                s->errp);
-    s->in_progress = false;
-    aio_wait_kick();
-}
-
-bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
-                                     uint32_t granularity, Error **errp)
-{
-    IO_CODE();
-    if (qemu_in_coroutine()) {
-        return bdrv_co_can_store_new_dirty_bitmap(bs, name, granularity, errp);
-    } else {
-        Coroutine *co;
-        BdrvCanStoreNewDirtyBitmapCo s = {
-            .bs = bs,
-            .name = name,
-            .granularity = granularity,
-            .errp = errp,
-            .in_progress = true,
-        };
-
-        co = qemu_coroutine_create(bdrv_co_can_store_new_dirty_bitmap_entry,
-                                   &s);
-        bdrv_coroutine_enter(bs, co);
-        BDRV_POLL_WHILE(bs, s.in_progress);
-
-        return s.ret;
-    }
-}
-
 void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
 {
     bdrv_dirty_bitmaps_lock(bitmap->bs);
diff --git a/block/graph-lock.c b/block/graph-lock.c
new file mode 100644
index 0000000000..454c31e691
--- /dev/null
+++ b/block/graph-lock.c
@@ -0,0 +1,275 @@
+/*
+ * Graph lock: rwlock to protect block layer graph manipulations (add/remove
+ * edges and nodes)
+ *
+ *  Copyright (c) 2022 Red Hat
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "block/graph-lock.h"
+#include "block/block.h"
+#include "block/block_int.h"
+
+/* Dummy lock object to use for Thread Safety Analysis (TSA) */
+BdrvGraphLock graph_lock;
+
+/* Protects the list of aiocontext and orphaned_reader_count */
+static QemuMutex aio_context_list_lock;
+
+/* Written and read with atomic operations. */
+static int has_writer;
+
+/*
+ * A reader coroutine could move from an AioContext to another.
+ * If this happens, there is no problem from the point of view of
+ * counters. The problem is that the total count becomes
+ * unbalanced if one of the two AioContexts gets deleted.
+ * The count of readers must remain correct, so the AioContext's
+ * balance is transferred to this glboal variable.
+ * Protected by aio_context_list_lock.
+ */
+static uint32_t orphaned_reader_count;
+
+/* Queue of readers waiting for the writer to finish */
+static CoQueue reader_queue;
+
+struct BdrvGraphRWlock {
+    /* How many readers are currently reading the graph. */
+    uint32_t reader_count;
+
+    /*
+     * List of BdrvGraphRWlock kept in graph-lock.c
+     * Protected by aio_context_list_lock
+     */
+    QTAILQ_ENTRY(BdrvGraphRWlock) next_aio;
+};
+
+/*
+ * List of BdrvGraphRWlock. This list ensures that each BdrvGraphRWlock
+ * can safely modify only its own counter, avoid reading/writing
+ * others and thus improving performances by avoiding cacheline bounces.
+ */
+static QTAILQ_HEAD(, BdrvGraphRWlock) aio_context_list =
+    QTAILQ_HEAD_INITIALIZER(aio_context_list);
+
+static void __attribute__((__constructor__)) bdrv_init_graph_lock(void)
+{
+    qemu_mutex_init(&aio_context_list_lock);
+    qemu_co_queue_init(&reader_queue);
+}
+
+void register_aiocontext(AioContext *ctx)
+{
+    ctx->bdrv_graph = g_new0(BdrvGraphRWlock, 1);
+    QEMU_LOCK_GUARD(&aio_context_list_lock);
+    assert(ctx->bdrv_graph->reader_count == 0);
+    QTAILQ_INSERT_TAIL(&aio_context_list, ctx->bdrv_graph, next_aio);
+}
+
+void unregister_aiocontext(AioContext *ctx)
+{
+    QEMU_LOCK_GUARD(&aio_context_list_lock);
+    orphaned_reader_count += ctx->bdrv_graph->reader_count;
+    QTAILQ_REMOVE(&aio_context_list, ctx->bdrv_graph, next_aio);
+    g_free(ctx->bdrv_graph);
+}
+
+static uint32_t reader_count(void)
+{
+    BdrvGraphRWlock *brdv_graph;
+    uint32_t rd;
+
+    QEMU_LOCK_GUARD(&aio_context_list_lock);
+
+    /* rd can temporarly be negative, but the total will *always* be >= 0 */
+    rd = orphaned_reader_count;
+    QTAILQ_FOREACH(brdv_graph, &aio_context_list, next_aio) {
+        rd += qatomic_read(&brdv_graph->reader_count);
+    }
+
+    /* shouldn't overflow unless there are 2^31 readers */
+    assert((int32_t)rd >= 0);
+    return rd;
+}
+
+void bdrv_graph_wrlock(void)
+{
+    GLOBAL_STATE_CODE();
+    assert(!qatomic_read(&has_writer));
+
+    /* Make sure that constantly arriving new I/O doesn't cause starvation */
+    bdrv_drain_all_begin_nopoll();
+
+    /*
+     * reader_count == 0: this means writer will read has_reader as 1
+     * reader_count >= 1: we don't know if writer read has_writer == 0 or 1,
+     *                    but we need to wait.
+     * Wait by allowing other coroutine (and possible readers) to continue.
+     */
+    do {
+        /*
+         * has_writer must be 0 while polling, otherwise we get a deadlock if
+         * any callback involved during AIO_WAIT_WHILE() tries to acquire the
+         * reader lock.
+         */
+        qatomic_set(&has_writer, 0);
+        AIO_WAIT_WHILE(qemu_get_aio_context(), reader_count() >= 1);
+        qatomic_set(&has_writer, 1);
+
+        /*
+         * We want to only check reader_count() after has_writer = 1 is visible
+         * to other threads. That way no more readers can sneak in after we've
+         * determined reader_count() == 0.
+         */
+        smp_mb();
+    } while (reader_count() >= 1);
+
+    bdrv_drain_all_end();
+}
+
+void bdrv_graph_wrunlock(void)
+{
+    GLOBAL_STATE_CODE();
+    QEMU_LOCK_GUARD(&aio_context_list_lock);
+    assert(qatomic_read(&has_writer));
+
+    /*
+     * No need for memory barriers, this works in pair with
+     * the slow path of rdlock() and both take the lock.
+     */
+    qatomic_store_release(&has_writer, 0);
+
+    /* Wake up all coroutine that are waiting to read the graph */
+    qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
+}
+
+void coroutine_fn bdrv_graph_co_rdlock(void)
+{
+    BdrvGraphRWlock *bdrv_graph;
+    bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;
+
+    /* Do not lock if in main thread */
+    if (qemu_in_main_thread()) {
+        return;
+    }
+
+    for (;;) {
+        qatomic_set(&bdrv_graph->reader_count,
+                    bdrv_graph->reader_count + 1);
+        /* make sure writer sees reader_count before we check has_writer */
+        smp_mb();
+
+        /*
+         * has_writer == 0: this means writer will read reader_count as >= 1
+         * has_writer == 1: we don't know if writer read reader_count == 0
+         *                  or > 0, but we need to wait anyways because
+         *                  it will write.
+         */
+        if (!qatomic_read(&has_writer)) {
+            break;
+        }
+
+        /*
+         * Synchronize access with reader_count() in bdrv_graph_wrlock().
+         * Case 1:
+         * If this critical section gets executed first, reader_count will
+         * decrease and the reader will go to sleep.
+         * Then the writer will read reader_count that does not take into
+         * account this reader, and if there's no other reader it will
+         * enter the write section.
+         * Case 2:
+         * If reader_count() critical section gets executed first,
+         * then writer will read reader_count >= 1.
+         * It will wait in AIO_WAIT_WHILE(), but once it releases the lock
+         * we will enter this critical section and call aio_wait_kick().
+         */
+        WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
+            /*
+             * Additional check when we use the above lock to synchronize
+             * with bdrv_graph_wrunlock().
+             * Case 1:
+             * If this gets executed first, has_writer is still 1, so we reduce
+             * reader_count and go to sleep.
+             * Then the writer will set has_writer to 0 and wake up all readers,
+             * us included.
+             * Case 2:
+             * If bdrv_graph_wrunlock() critical section gets executed first,
+             * then it will set has_writer to 0 and wake up all other readers.
+             * Then we execute this critical section, and therefore must check
+             * again for has_writer, otherwise we sleep without any writer
+             * actually running.
+             */
+            if (!qatomic_read(&has_writer)) {
+                return;
+            }
+
+            /* slow path where reader sleeps */
+            bdrv_graph->reader_count--;
+            aio_wait_kick();
+            qemu_co_queue_wait(&reader_queue, &aio_context_list_lock);
+        }
+    }
+}
+
+void coroutine_fn bdrv_graph_co_rdunlock(void)
+{
+    BdrvGraphRWlock *bdrv_graph;
+    bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;
+
+    /* Do not lock if in main thread */
+    if (qemu_in_main_thread()) {
+        return;
+    }
+
+    qatomic_store_release(&bdrv_graph->reader_count,
+                          bdrv_graph->reader_count - 1);
+    /* make sure writer sees reader_count before we check has_writer */
+    smp_mb();
+
+    /*
+     * has_writer == 0: this means reader will read reader_count decreased
+     * has_writer == 1: we don't know if writer read reader_count old or
+     *                  new. Therefore, kick again so on next iteration
+     *                  writer will for sure read the updated value.
+     */
+    if (qatomic_read(&has_writer)) {
+        aio_wait_kick();
+    }
+}
+
+void bdrv_graph_rdlock_main_loop(void)
+{
+    GLOBAL_STATE_CODE();
+    assert(!qemu_in_coroutine());
+}
+
+void bdrv_graph_rdunlock_main_loop(void)
+{
+    GLOBAL_STATE_CODE();
+    assert(!qemu_in_coroutine());
+}
+
+void assert_bdrv_graph_readable(void)
+{
+    assert(qemu_in_main_thread() || reader_count());
+}
+
+void assert_bdrv_graph_writable(void)
+{
+    assert(qemu_in_main_thread());
+    assert(qatomic_read(&has_writer));
+}
diff --git a/block/io.c b/block/io.c
index b9424024f9..d87788dfbb 100644
--- a/block/io.c
+++ b/block/io.c
@@ -45,53 +45,43 @@ static void bdrv_parent_cb_resize(BlockDriverState *bs);
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     int64_t offset, int64_t bytes, BdrvRequestFlags flags);
 
-static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
-                                      bool ignore_bds_parents)
+static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
 {
     BdrvChild *c, *next;
 
     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
-        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
+        if (c == ignore) {
             continue;
         }
-        bdrv_parent_drained_begin_single(c, false);
-    }
-}
-
-static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
-                                                   int *drained_end_counter)
-{
-    assert(c->parent_quiesce_counter > 0);
-    c->parent_quiesce_counter--;
-    if (c->klass->drained_end) {
-        c->klass->drained_end(c, drained_end_counter);
+        bdrv_parent_drained_begin_single(c);
     }
 }
 
 void bdrv_parent_drained_end_single(BdrvChild *c)
 {
-    int drained_end_counter = 0;
-    AioContext *ctx = bdrv_child_get_parent_aio_context(c);
     IO_OR_GS_CODE();
-    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
-    AIO_WAIT_WHILE(ctx, qatomic_read(&drained_end_counter) > 0);
+
+    assert(c->quiesced_parent);
+    c->quiesced_parent = false;
+
+    if (c->klass->drained_end) {
+        c->klass->drained_end(c);
+    }
 }
 
-static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
-                                    bool ignore_bds_parents,
-                                    int *drained_end_counter)
+static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
 {
     BdrvChild *c;
 
     QLIST_FOREACH(c, &bs->parents, next_parent) {
-        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
+        if (c == ignore) {
             continue;
         }
-        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
+        bdrv_parent_drained_end_single(c);
     }
 }
 
-static bool bdrv_parent_drained_poll_single(BdrvChild *c)
+bool bdrv_parent_drained_poll_single(BdrvChild *c)
 {
     if (c->klass->drained_poll) {
         return c->klass->drained_poll(c);
@@ -115,17 +105,16 @@ static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
     return busy;
 }
 
-void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
+void bdrv_parent_drained_begin_single(BdrvChild *c)
 {
-    AioContext *ctx = bdrv_child_get_parent_aio_context(c);
     IO_OR_GS_CODE();
-    c->parent_quiesce_counter++;
+
+    assert(!c->quiesced_parent);
+    c->quiesced_parent = true;
+
     if (c->klass->drained_begin) {
         c->klass->drained_begin(c);
     }
-    if (poll) {
-        AIO_WAIT_WHILE(ctx, bdrv_parent_drained_poll_single(c));
-    }
 }
 
 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
@@ -245,69 +234,14 @@ typedef struct {
     BlockDriverState *bs;
     bool done;
     bool begin;
-    bool recursive;
     bool poll;
     BdrvChild *parent;
-    bool ignore_bds_parents;
-    int *drained_end_counter;
 } BdrvCoDrainData;
 
-static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
-{
-    BdrvCoDrainData *data = opaque;
-    BlockDriverState *bs = data->bs;
-
-    if (data->begin) {
-        bs->drv->bdrv_co_drain_begin(bs);
-    } else {
-        bs->drv->bdrv_co_drain_end(bs);
-    }
-
-    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
-    qatomic_mb_set(&data->done, true);
-    if (!data->begin) {
-        qatomic_dec(data->drained_end_counter);
-    }
-    bdrv_dec_in_flight(bs);
-
-    g_free(data);
-}
-
-/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
-static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
-                              int *drained_end_counter)
-{
-    BdrvCoDrainData *data;
-
-    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
-            (!begin && !bs->drv->bdrv_co_drain_end)) {
-        return;
-    }
-
-    data = g_new(BdrvCoDrainData, 1);
-    *data = (BdrvCoDrainData) {
-        .bs = bs,
-        .done = false,
-        .begin = begin,
-        .drained_end_counter = drained_end_counter,
-    };
-
-    if (!begin) {
-        qatomic_inc(drained_end_counter);
-    }
-
-    /* Make sure the driver callback completes during the polling phase for
-     * drain_begin. */
-    bdrv_inc_in_flight(bs);
-    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
-    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
-}
-
 /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
-bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
-                     BdrvChild *ignore_parent, bool ignore_bds_parents)
+bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent,
+                     bool ignore_bds_parents)
 {
-    BdrvChild *child, *next;
     IO_OR_GS_CODE();
 
     if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
@@ -318,30 +252,18 @@ bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
         return true;
     }
 
-    if (recursive) {
-        assert(!ignore_bds_parents);
-        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
-            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
-                return true;
-            }
-        }
-    }
-
     return false;
 }
 
-static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
+static bool bdrv_drain_poll_top_level(BlockDriverState *bs,
                                       BdrvChild *ignore_parent)
 {
-    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
+    return bdrv_drain_poll(bs, ignore_parent, false);
 }
 
-static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
-                                  BdrvChild *parent, bool ignore_bds_parents,
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
                                   bool poll);
-static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
-                                BdrvChild *parent, bool ignore_bds_parents,
-                                int *drained_end_counter);
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
 
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
@@ -354,14 +276,10 @@ static void bdrv_co_drain_bh_cb(void *opaque)
         aio_context_acquire(ctx);
         bdrv_dec_in_flight(bs);
         if (data->begin) {
-            assert(!data->drained_end_counter);
-            bdrv_do_drained_begin(bs, data->recursive, data->parent,
-                                  data->ignore_bds_parents, data->poll);
+            bdrv_do_drained_begin(bs, data->parent, data->poll);
         } else {
             assert(!data->poll);
-            bdrv_do_drained_end(bs, data->recursive, data->parent,
-                                data->ignore_bds_parents,
-                                data->drained_end_counter);
+            bdrv_do_drained_end(bs, data->parent);
         }
         aio_context_release(ctx);
     } else {
@@ -374,11 +292,9 @@ static void bdrv_co_drain_bh_cb(void *opaque)
 }
 
 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
-                                                bool begin, bool recursive,
+                                                bool begin,
                                                 BdrvChild *parent,
-                                                bool ignore_bds_parents,
-                                                bool poll,
-                                                int *drained_end_counter)
+                                                bool poll)
 {
     BdrvCoDrainData data;
     Coroutine *self = qemu_coroutine_self();
@@ -394,11 +310,8 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
         .bs = bs,
         .done = false,
         .begin = begin,
-        .recursive = recursive,
         .parent = parent,
-        .ignore_bds_parents = ignore_bds_parents,
         .poll = poll,
-        .drained_end_counter = drained_end_counter,
     };
 
     if (bs) {
@@ -429,41 +342,22 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
     }
 }
 
-void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
-                                   BdrvChild *parent, bool ignore_bds_parents)
-{
-    IO_OR_GS_CODE();
-    assert(!qemu_in_coroutine());
-
-    /* Stop things in parent-to-child order */
-    if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
-        aio_disable_external(bdrv_get_aio_context(bs));
-    }
-
-    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
-    bdrv_drain_invoke(bs, true, NULL);
-}
-
-static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
-                                  BdrvChild *parent, bool ignore_bds_parents,
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
                                   bool poll)
 {
-    BdrvChild *child, *next;
+    IO_OR_GS_CODE();
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
-                               poll, NULL);
+        bdrv_co_yield_to_drain(bs, true, parent, poll);
         return;
     }
 
-    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
-
-    if (recursive) {
-        assert(!ignore_bds_parents);
-        bs->recursive_quiesce_counter++;
-        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
-            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
-                                  false);
+    /* Stop things in parent-to-child order */
+    if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
+        aio_disable_external(bdrv_get_aio_context(bs));
+        bdrv_parent_drained_begin(bs, parent);
+        if (bs->drv && bs->drv->bdrv_drain_begin) {
+            bs->drv->bdrv_drain_begin(bs);
         }
     }
 
@@ -477,117 +371,50 @@ static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
      * nodes.
      */
     if (poll) {
-        assert(!ignore_bds_parents);
-        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
+        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, parent));
     }
 }
 
-void bdrv_drained_begin(BlockDriverState *bs)
+void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, BdrvChild *parent)
 {
-    IO_OR_GS_CODE();
-    bdrv_do_drained_begin(bs, false, NULL, false, true);
+    bdrv_do_drained_begin(bs, parent, false);
 }
 
-void bdrv_subtree_drained_begin(BlockDriverState *bs)
+void bdrv_drained_begin(BlockDriverState *bs)
 {
     IO_OR_GS_CODE();
-    bdrv_do_drained_begin(bs, true, NULL, false, true);
+    bdrv_do_drained_begin(bs, NULL, true);
 }
 
 /**
  * This function does not poll, nor must any of its recursively called
- * functions.  The *drained_end_counter pointee will be incremented
- * once for every background operation scheduled, and decremented once
- * the operation settles.  Therefore, the pointer must remain valid
- * until the pointee reaches 0.  That implies that whoever sets up the
- * pointee has to poll until it is 0.
- *
- * We use atomic operations to access *drained_end_counter, because
- * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
- *     @bs may contain nodes in different AioContexts,
- * (2) bdrv_drain_all_end() uses the same counter for all nodes,
- *     regardless of which AioContext they are in.
+ * functions.
  */
-static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
-                                BdrvChild *parent, bool ignore_bds_parents,
-                                int *drained_end_counter)
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
 {
-    BdrvChild *child;
     int old_quiesce_counter;
 
-    assert(drained_end_counter != NULL);
-
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
-                               false, drained_end_counter);
+        bdrv_co_yield_to_drain(bs, false, parent, false);
         return;
     }
     assert(bs->quiesce_counter > 0);
 
     /* Re-enable things in child-to-parent order */
-    bdrv_drain_invoke(bs, false, drained_end_counter);
-    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
-                            drained_end_counter);
-
     old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
     if (old_quiesce_counter == 1) {
-        aio_enable_external(bdrv_get_aio_context(bs));
-    }
-
-    if (recursive) {
-        assert(!ignore_bds_parents);
-        bs->recursive_quiesce_counter--;
-        QLIST_FOREACH(child, &bs->children, next) {
-            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
-                                drained_end_counter);
+        if (bs->drv && bs->drv->bdrv_drain_end) {
+            bs->drv->bdrv_drain_end(bs);
         }
+        bdrv_parent_drained_end(bs, parent);
+        aio_enable_external(bdrv_get_aio_context(bs));
     }
 }
 
 void bdrv_drained_end(BlockDriverState *bs)
 {
-    int drained_end_counter = 0;
-    IO_OR_GS_CODE();
-    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
-    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
-}
-
-void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
-{
-    IO_CODE();
-    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
-}
-
-void bdrv_subtree_drained_end(BlockDriverState *bs)
-{
-    int drained_end_counter = 0;
-    IO_OR_GS_CODE();
-    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
-    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
-}
-
-void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
-{
-    int i;
-    IO_OR_GS_CODE();
-
-    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
-        bdrv_do_drained_begin(child->bs, true, child, false, true);
-    }
-}
-
-void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
-{
-    int drained_end_counter = 0;
-    int i;
     IO_OR_GS_CODE();
-
-    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
-        bdrv_do_drained_end(child->bs, true, child, false,
-                            &drained_end_counter);
-    }
-
-    BDRV_POLL_WHILE(child->bs, qatomic_read(&drained_end_counter) > 0);
+    bdrv_do_drained_end(bs, NULL);
 }
 
 void bdrv_drain(BlockDriverState *bs)
@@ -620,7 +447,7 @@ static bool bdrv_drain_all_poll(void)
     while ((bs = bdrv_next_all_states(bs))) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
         aio_context_acquire(aio_context);
-        result |= bdrv_drain_poll(bs, false, NULL, true);
+        result |= bdrv_drain_poll(bs, NULL, true);
         aio_context_release(aio_context);
     }
 
@@ -639,16 +466,11 @@ static bool bdrv_drain_all_poll(void)
  * NOTE: no new block jobs or BlockDriverStates can be created between
  * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
  */
-void bdrv_drain_all_begin(void)
+void bdrv_drain_all_begin_nopoll(void)
 {
     BlockDriverState *bs = NULL;
     GLOBAL_STATE_CODE();
 
-    if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
-        return;
-    }
-
     /*
      * bdrv queue is managed by record/replay,
      * waiting for finishing the I/O requests may
@@ -670,9 +492,21 @@ void bdrv_drain_all_begin(void)
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
-        bdrv_do_drained_begin(bs, false, NULL, true, false);
+        bdrv_do_drained_begin(bs, NULL, false);
         aio_context_release(aio_context);
     }
+}
+
+void bdrv_drain_all_begin(void)
+{
+    BlockDriverState *bs = NULL;
+
+    if (qemu_in_coroutine()) {
+        bdrv_co_yield_to_drain(NULL, true, NULL, true);
+        return;
+    }
+
+    bdrv_drain_all_begin_nopoll();
 
     /* Now poll the in-flight requests */
     AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());
@@ -684,22 +518,19 @@ void bdrv_drain_all_begin(void)
 
 void bdrv_drain_all_end_quiesce(BlockDriverState *bs)
 {
-    int drained_end_counter = 0;
     GLOBAL_STATE_CODE();
 
     g_assert(bs->quiesce_counter > 0);
     g_assert(!bs->refcnt);
 
     while (bs->quiesce_counter) {
-        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
+        bdrv_do_drained_end(bs, NULL);
     }
-    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
 }
 
 void bdrv_drain_all_end(void)
 {
     BlockDriverState *bs = NULL;
-    int drained_end_counter = 0;
     GLOBAL_STATE_CODE();
 
     /*
@@ -715,13 +546,11 @@ void bdrv_drain_all_end(void)
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
-        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
+        bdrv_do_drained_end(bs, NULL);
         aio_context_release(aio_context);
     }
 
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
-    AIO_WAIT_WHILE(NULL, qatomic_read(&drained_end_counter) > 0);
-
     assert(bdrv_drain_all_count > 0);
     bdrv_drain_all_count--;
 }
@@ -2711,6 +2540,17 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
     return ret;
 }
 
+int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
+                                            BlockDriverState *base,
+                                            int64_t offset, int64_t bytes,
+                                            int64_t *pnum, int64_t *map,
+                                            BlockDriverState **file)
+{
+    IO_CODE();
+    return bdrv_co_common_block_status_above(bs, base, false, true, offset,
+                                             bytes, pnum, map, file, NULL);
+}
+
 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                             int64_t offset, int64_t bytes, int64_t *pnum,
                             int64_t *map, BlockDriverState **file)
@@ -2756,6 +2596,22 @@ int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
     return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO);
 }
 
+int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset,
+                                      int64_t bytes, int64_t *pnum)
+{
+    int ret;
+    int64_t dummy;
+    IO_CODE();
+
+    ret = bdrv_co_common_block_status_above(bs, bs, true, false, offset,
+                                            bytes, pnum ? pnum : &dummy, NULL,
+                                            NULL, NULL);
+    if (ret < 0) {
+        return ret;
+    }
+    return !!(ret & BDRV_BLOCK_ALLOCATED);
+}
+
 int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
                       int64_t *pnum)
 {
@@ -2772,6 +2628,29 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
     return !!(ret & BDRV_BLOCK_ALLOCATED);
 }
 
+/* See bdrv_is_allocated_above for documentation */
+int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
+                                            BlockDriverState *base,
+                                            bool include_base, int64_t offset,
+                                            int64_t bytes, int64_t *pnum)
+{
+    int depth;
+    int ret;
+    IO_CODE();
+
+    ret = bdrv_co_common_block_status_above(top, base, include_base, false,
+                                            offset, bytes, pnum, NULL, NULL,
+                                            &depth);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (ret & BDRV_BLOCK_ALLOCATED) {
+        return depth;
+    }
+    return 0;
+}
+
 /*
  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
  *
@@ -2795,10 +2674,12 @@ int bdrv_is_allocated_above(BlockDriverState *top,
                             int64_t bytes, int64_t *pnum)
 {
     int depth;
-    int ret = bdrv_common_block_status_above(top, base, include_base, false,
-                                             offset, bytes, pnum, NULL, NULL,
-                                             &depth);
+    int ret;
     IO_CODE();
+
+    ret = bdrv_common_block_status_above(top, base, include_base, false,
+                                         offset, bytes, pnum, NULL, NULL,
+                                         &depth);
     if (ret < 0) {
         return ret;
     }
@@ -2816,6 +2697,7 @@ bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
     BlockDriverState *child_bs = bdrv_primary_bs(bs);
     int ret;
     IO_CODE();
+    assert_bdrv_graph_readable();
 
     ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
     if (ret < 0) {
@@ -2848,6 +2730,7 @@ bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
     BlockDriverState *child_bs = bdrv_primary_bs(bs);
     int ret;
     IO_CODE();
+    assert_bdrv_graph_readable();
 
     ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
     if (ret < 0) {
diff --git a/block/meson.build b/block/meson.build
index b7c68b83a3..90011a2805 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -10,6 +10,7 @@ block_ss.add(files(
   'blkverify.c',
   'block-backend.c',
   'block-copy.c',
+  'graph-lock.c',
   'commit.c',
   'copy-on-read.c',
   'preallocate.c',
@@ -137,6 +138,7 @@ block_gen_c = custom_target('block-gen.c',
                             output: 'block-gen.c',
                             input: files(
                                       '../include/block/block-io.h',
+                                      '../include/block/dirty-bitmap.h',
                                       '../include/block/block-global-state.h',
                                       '../include/sysemu/block-backend-io.h',
                                       'coroutines.h'
diff --git a/block/parallels.c b/block/parallels.c
index fa08c1104b..bbea2f2221 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -646,7 +646,7 @@ static int coroutine_fn parallels_co_create_opts(BlockDriver *drv,
     }
 
     /* Create and open the file (protocol layer) */
-    ret = bdrv_create_file(filename, opts, errp);
+    ret = bdrv_co_create_file(filename, opts, errp);
     if (ret < 0) {
         goto done;
     }
diff --git a/block/qcow.c b/block/qcow.c
index 8bffc41531..5d99f00411 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -973,7 +973,7 @@ static int coroutine_fn qcow_co_create_opts(BlockDriver *drv,
     }
 
     /* Create and open the file (protocol layer) */
-    ret = bdrv_create_file(filename, opts, errp);
+    ret = bdrv_co_create_file(filename, opts, errp);
     if (ret < 0) {
         goto fail;
     }
diff --git a/block/qcow2.c b/block/qcow2.c
index 941782a011..bafbd077b9 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3871,7 +3871,7 @@ static int coroutine_fn qcow2_co_create_opts(BlockDriver *drv,
     }
 
     /* Create and open the file (protocol layer) */
-    ret = bdrv_create_file(filename, opts, errp);
+    ret = bdrv_co_create_file(filename, opts, errp);
     if (ret < 0) {
         goto finish;
     }
@@ -3886,7 +3886,7 @@ static int coroutine_fn qcow2_co_create_opts(BlockDriver *drv,
     /* Create and open an external data file (protocol layer) */
     val = qdict_get_try_str(qdict, BLOCK_OPT_DATA_FILE);
     if (val) {
-        ret = bdrv_create_file(val, opts, errp);
+        ret = bdrv_co_create_file(val, opts, errp);
         if (ret < 0) {
             goto finish;
         }
diff --git a/block/qed.c b/block/qed.c
index 0ab159b468..faa606618e 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -262,7 +262,7 @@ static bool coroutine_fn qed_plug_allocating_write_reqs(BDRVQEDState *s)
     assert(!s->allocating_write_reqs_plugged);
     if (s->allocating_acb != NULL) {
         /* Another allocating write came concurrently.  This cannot happen
-         * from bdrv_qed_co_drain_begin, but it can happen when the timer runs.
+         * from bdrv_qed_drain_begin, but it can happen when the timer runs.
          */
         qemu_co_mutex_unlock(&s->table_lock);
         return false;
@@ -282,9 +282,8 @@ static void coroutine_fn qed_unplug_allocating_write_reqs(BDRVQEDState *s)
     qemu_co_mutex_unlock(&s->table_lock);
 }
 
-static void coroutine_fn qed_need_check_timer_entry(void *opaque)
+static void coroutine_fn qed_need_check_timer(BDRVQEDState *s)
 {
-    BDRVQEDState *s = opaque;
     int ret;
 
     trace_qed_need_check_timer_cb(s);
@@ -310,9 +309,20 @@ static void coroutine_fn qed_need_check_timer_entry(void *opaque)
     (void) ret;
 }
 
+static void coroutine_fn qed_need_check_timer_entry(void *opaque)
+{
+    BDRVQEDState *s = opaque;
+
+    qed_need_check_timer(opaque);
+    bdrv_dec_in_flight(s->bs);
+}
+
 static void qed_need_check_timer_cb(void *opaque)
 {
+    BDRVQEDState *s = opaque;
     Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
+
+    bdrv_inc_in_flight(s->bs);
     qemu_coroutine_enter(co);
 }
 
@@ -355,7 +365,7 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
     }
 }
 
-static void coroutine_fn bdrv_qed_co_drain_begin(BlockDriverState *bs)
+static void bdrv_qed_drain_begin(BlockDriverState *bs)
 {
     BDRVQEDState *s = bs->opaque;
 
@@ -363,8 +373,12 @@ static void coroutine_fn bdrv_qed_co_drain_begin(BlockDriverState *bs)
      * header is flushed.
      */
     if (s->need_check_timer && timer_pending(s->need_check_timer)) {
+        Coroutine *co;
+
         qed_cancel_need_check_timer(s);
-        qed_need_check_timer_entry(s);
+        co = qemu_coroutine_create(qed_need_check_timer_entry, s);
+        bdrv_inc_in_flight(bs);
+        aio_co_enter(bdrv_get_aio_context(bs), co);
     }
 }
 
@@ -764,7 +778,7 @@ static int coroutine_fn bdrv_qed_co_create_opts(BlockDriver *drv,
     }
 
     /* Create and open the file (protocol layer) */
-    ret = bdrv_create_file(filename, opts, errp);
+    ret = bdrv_co_create_file(filename, opts, errp);
     if (ret < 0) {
         goto fail;
     }
@@ -1647,7 +1661,7 @@ static BlockDriver bdrv_qed = {
     .bdrv_co_check            = bdrv_qed_co_check,
     .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
     .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
-    .bdrv_co_drain_begin      = bdrv_qed_co_drain_begin,
+    .bdrv_drain_begin         = bdrv_qed_drain_begin,
 };
 
 static void bdrv_qed_init(void)
diff --git a/block/raw-format.c b/block/raw-format.c
index a68014ef0b..28905b09ee 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -433,7 +433,7 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
                                            QemuOpts *opts,
                                            Error **errp)
 {
-    return bdrv_create_file(filename, opts, errp);
+    return bdrv_co_create_file(filename, opts, errp);
 }
 
 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
diff --git a/block/replication.c b/block/replication.c
index f1eed25e43..c62f48a874 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -374,9 +374,6 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
         s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs);
     }
 
-    bdrv_subtree_drained_begin(hidden_disk->bs);
-    bdrv_subtree_drained_begin(secondary_disk->bs);
-
     if (s->orig_hidden_read_only) {
         QDict *opts = qdict_new();
         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
@@ -401,9 +398,6 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
             aio_context_acquire(ctx);
         }
     }
-
-    bdrv_subtree_drained_end(hidden_disk->bs);
-    bdrv_subtree_drained_end(secondary_disk->bs);
 }
 
 static void backup_job_cleanup(BlockDriverState *bs)
diff --git a/block/stream.c b/block/stream.c
index 694709bd25..8744ad103f 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -64,13 +64,16 @@ static int stream_prepare(Job *job)
     bdrv_cor_filter_drop(s->cor_filter_bs);
     s->cor_filter_bs = NULL;
 
-    bdrv_subtree_drained_begin(s->above_base);
+    /*
+     * bdrv_set_backing_hd() requires that unfiltered_bs is drained. Drain
+     * already here and use bdrv_set_backing_hd_drained() instead because
+     * the polling during drained_begin() might change the graph, and if we do
+     * this only later, we may end up working with the wrong base node (or it
+     * might even have gone away by the time we want to use it).
+     */
+    bdrv_drained_begin(unfiltered_bs);
 
     base = bdrv_filter_or_cow_bs(s->above_base);
-    if (base) {
-        bdrv_ref(base);
-    }
-
     unfiltered_base = bdrv_skip_filters(base);
 
     if (bdrv_cow_child(unfiltered_bs)) {
@@ -82,7 +85,13 @@ static int stream_prepare(Job *job)
             }
         }
 
-        bdrv_set_backing_hd(unfiltered_bs, base, &local_err);
+        bdrv_set_backing_hd_drained(unfiltered_bs, base, &local_err);
+
+        /*
+         * This call will do I/O, so the graph can change again from here on.
+         * We have already completed the graph change, so we are not in danger
+         * of operating on the wrong node any more if this happens.
+         */
         ret = bdrv_change_backing_file(unfiltered_bs, base_id, base_fmt, false);
         if (local_err) {
             error_report_err(local_err);
@@ -92,10 +101,7 @@ static int stream_prepare(Job *job)
     }
 
 out:
-    if (base) {
-        bdrv_unref(base);
-    }
-    bdrv_subtree_drained_end(s->above_base);
+    bdrv_drained_end(unfiltered_bs);
     return ret;
 }
 
diff --git a/block/throttle.c b/block/throttle.c
index 131eba3ab4..88851c84f4 100644
--- a/block/throttle.c
+++ b/block/throttle.c
@@ -214,7 +214,7 @@ static void throttle_reopen_abort(BDRVReopenState *reopen_state)
     reopen_state->opaque = NULL;
 }
 
-static void coroutine_fn throttle_co_drain_begin(BlockDriverState *bs)
+static void throttle_drain_begin(BlockDriverState *bs)
 {
     ThrottleGroupMember *tgm = bs->opaque;
     if (qatomic_fetch_inc(&tgm->io_limits_disabled) == 0) {
@@ -222,7 +222,7 @@ static void coroutine_fn throttle_co_drain_begin(BlockDriverState *bs)
     }
 }
 
-static void coroutine_fn throttle_co_drain_end(BlockDriverState *bs)
+static void throttle_drain_end(BlockDriverState *bs)
 {
     ThrottleGroupMember *tgm = bs->opaque;
     assert(tgm->io_limits_disabled);
@@ -261,8 +261,8 @@ static BlockDriver bdrv_throttle = {
     .bdrv_reopen_commit                 =   throttle_reopen_commit,
     .bdrv_reopen_abort                  =   throttle_reopen_abort,
 
-    .bdrv_co_drain_begin                =   throttle_co_drain_begin,
-    .bdrv_co_drain_end                  =   throttle_co_drain_end,
+    .bdrv_drain_begin                   =   throttle_drain_begin,
+    .bdrv_drain_end                     =   throttle_drain_end,
 
     .is_filter                          =   true,
     .strong_runtime_opts                =   throttle_strong_runtime_opts,
diff --git a/block/vdi.c b/block/vdi.c
index c0c111c4b9..479bcfe820 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -934,7 +934,7 @@ static int coroutine_fn vdi_co_create_opts(BlockDriver *drv,
     qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vdi_create_opts, true);
 
     /* Create and open the file (protocol layer) */
-    ret = bdrv_create_file(filename, opts, errp);
+    ret = bdrv_co_create_file(filename, opts, errp);
     if (ret < 0) {
         goto done;
     }
diff --git a/block/vhdx.c b/block/vhdx.c
index bad9ca691b..4c929800fe 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -2084,7 +2084,7 @@ static int coroutine_fn vhdx_co_create_opts(BlockDriver *drv,
     }
 
     /* Create and open the file (protocol layer) */
-    ret = bdrv_create_file(filename, opts, errp);
+    ret = bdrv_co_create_file(filename, opts, errp);
     if (ret < 0) {
         goto fail;
     }
diff --git a/block/vmdk.c b/block/vmdk.c
index bac3d8db50..8894dac2d4 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2285,15 +2285,16 @@ exit:
     return ret;
 }
 
-static int vmdk_create_extent(const char *filename, int64_t filesize,
-                              bool flat, bool compress, bool zeroed_grain,
-                              BlockBackend **pbb,
-                              QemuOpts *opts, Error **errp)
+static int coroutine_fn vmdk_create_extent(const char *filename,
+                                           int64_t filesize, bool flat,
+                                           bool compress, bool zeroed_grain,
+                                           BlockBackend **pbb,
+                                           QemuOpts *opts, Error **errp)
 {
     int ret;
     BlockBackend *blk = NULL;
 
-    ret = bdrv_create_file(filename, opts, errp);
+    ret = bdrv_co_create_file(filename, opts, errp);
     if (ret < 0) {
         goto exit;
     }
@@ -2366,14 +2367,14 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
  *           non-split format.
  * idx >= 1: get the n-th extent if in a split subformat
  */
-typedef BlockBackend *(*vmdk_create_extent_fn)(int64_t size,
-                                               int idx,
-                                               bool flat,
-                                               bool split,
-                                               bool compress,
-                                               bool zeroed_grain,
-                                               void *opaque,
-                                               Error **errp);
+typedef BlockBackend * coroutine_fn (*vmdk_create_extent_fn)(int64_t size,
+                                                             int idx,
+                                                             bool flat,
+                                                             bool split,
+                                                             bool compress,
+                                                             bool zeroed_grain,
+                                                             void *opaque,
+                                                             Error **errp);
 
 static void vmdk_desc_add_extent(GString *desc,
                                  const char *extent_line_fmt,
@@ -2616,7 +2617,7 @@ typedef struct {
     QemuOpts *opts;
 } VMDKCreateOptsData;
 
-static BlockBackend *vmdk_co_create_opts_cb(int64_t size, int idx,
+static BlockBackend * coroutine_fn vmdk_co_create_opts_cb(int64_t size, int idx,
                                             bool flat, bool split, bool compress,
                                             bool zeroed_grain, void *opaque,
                                             Error **errp)
@@ -2768,10 +2769,11 @@ exit:
     return ret;
 }
 
-static BlockBackend *vmdk_co_create_cb(int64_t size, int idx,
-                                       bool flat, bool split, bool compress,
-                                       bool zeroed_grain, void *opaque,
-                                       Error **errp)
+static BlockBackend * coroutine_fn vmdk_co_create_cb(int64_t size, int idx,
+                                                     bool flat, bool split,
+                                                     bool compress,
+                                                     bool zeroed_grain,
+                                                     void *opaque, Error **errp)
 {
     int ret;
     BlockDriverState *bs;
diff --git a/block/vpc.c b/block/vpc.c
index 95841f259a..6ee95dcb96 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -1111,7 +1111,7 @@ static int coroutine_fn vpc_co_create_opts(BlockDriver *drv,
     }
 
     /* Create and open the file (protocol layer) */
-    ret = bdrv_create_file(filename, opts, errp);
+    ret = bdrv_co_create_file(filename, opts, errp);
     if (ret < 0) {
         goto fail;
     }
diff --git a/blockdev.c b/blockdev.c
index 75eef8535e..ebf952cd21 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1507,10 +1507,14 @@ static void external_snapshot_prepare(BlkActionState *common,
                 goto out;
             }
             bdrv_refresh_filename(state->old_bs);
+
+            aio_context_release(aio_context);
             bdrv_img_create(new_image_file, format,
                             state->old_bs->filename,
                             state->old_bs->drv->format_name,
                             NULL, size, flags, false, &local_err);
+            aio_context_acquire(aio_context);
+
             if (local_err) {
                 error_propagate(errp, local_err);
                 goto out;
@@ -3515,8 +3519,6 @@ fail:
 void qmp_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp)
 {
     BlockReopenQueue *queue = NULL;
-    GSList *drained = NULL;
-    GSList *p;
 
     /* Add each one of the BDS that we want to reopen to the queue */
     for (; reopen_list != NULL; reopen_list = reopen_list->next) {
@@ -3553,9 +3555,7 @@ void qmp_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp)
         ctx = bdrv_get_aio_context(bs);
         aio_context_acquire(ctx);
 
-        bdrv_subtree_drained_begin(bs);
         queue = bdrv_reopen_queue(queue, bs, qdict, false);
-        drained = g_slist_prepend(drained, bs);
 
         aio_context_release(ctx);
     }
@@ -3566,15 +3566,6 @@ void qmp_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp)
 
 fail:
     bdrv_reopen_queue_free(queue);
-    for (p = drained; p; p = p->next) {
-        BlockDriverState *bs = p->data;
-        AioContext *ctx = bdrv_get_aio_context(bs);
-
-        aio_context_acquire(ctx);
-        bdrv_subtree_drained_end(bs);
-        aio_context_release(ctx);
-    }
-    g_slist_free(drained);
 }
 
 void qmp_blockdev_del(const char *node_name, Error **errp)
diff --git a/blockjob.c b/blockjob.c
index 3c8f3543a2..b7daf2a9f6 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -120,7 +120,7 @@ static bool child_job_drained_poll(BdrvChild *c)
     }
 }
 
-static void child_job_drained_end(BdrvChild *c, int *drained_end_counter)
+static void child_job_drained_end(BdrvChild *c)
 {
     BlockJob *job = c->opaque;
     job_resume(&job->job);
diff --git a/docs/devel/block-coroutine-wrapper.rst b/docs/devel/block-coroutine-wrapper.rst
index 412851986b..6dd2cdcab3 100644
--- a/docs/devel/block-coroutine-wrapper.rst
+++ b/docs/devel/block-coroutine-wrapper.rst
@@ -26,12 +26,12 @@ called ``bdrv_foo(<same args>)``. In this case the script can help. To
 trigger the generation:
 
 1. You need ``bdrv_foo`` declaration somewhere (for example, in
-   ``block/coroutines.h``) with the ``generated_co_wrapper`` mark,
+   ``block/coroutines.h``) with the ``co_wrapper`` mark,
    like this:
 
 .. code-block:: c
 
-    int generated_co_wrapper bdrv_foo(<some args>);
+    int co_wrapper bdrv_foo(<some args>);
 
 2. You need to feed this declaration to block-coroutine-wrapper script.
    For this, add the .h (or .c) file with the declaration to the
@@ -46,7 +46,7 @@ Links
 
 1. The script location is ``scripts/block-coroutine-wrapper.py``.
 
-2. Generic place for private ``generated_co_wrapper`` declarations is
+2. Generic place for private ``co_wrapper`` declarations is
    ``block/coroutines.h``, for public declarations:
    ``include/block/block.h``
 
diff --git a/include/block/aio.h b/include/block/aio.h
index d128558f1d..0f65a3cc9e 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -22,6 +22,7 @@
 #include "qemu/event_notifier.h"
 #include "qemu/thread.h"
 #include "qemu/timer.h"
+#include "block/graph-lock.h"
 
 typedef struct BlockAIOCB BlockAIOCB;
 typedef void BlockCompletionFunc(void *opaque, int ret);
@@ -127,6 +128,14 @@ struct AioContext {
     /* Used by AioContext users to protect from multi-threaded access.  */
     QemuRecMutex lock;
 
+    /*
+     * Keep track of readers and writers of the block layer graph.
+     * This is essential to avoid performing additions and removal
+     * of nodes and edges from block graph while some
+     * other thread is traversing it.
+     */
+    BdrvGraphRWlock *bdrv_graph;
+
     /* The list of registered AIO handlers.  Protected by ctx->list_lock. */
     AioHandlerList aio_handlers;
 
diff --git a/include/block/block-common.h b/include/block/block-common.h
index 297704c1e9..4749c46a5e 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -29,20 +29,35 @@
 #include "qemu/iov.h"
 #include "qemu/coroutine.h"
 #include "block/accounting.h"
-#include "block/dirty-bitmap.h"
-#include "block/blockjob.h"
 #include "qemu/hbitmap.h"
 #include "qemu/transactions.h"
 
 /*
- * generated_co_wrapper
+ * co_wrapper{*}: Function specifiers used by block-coroutine-wrapper.py
  *
- * Function specifier, which does nothing but mark functions to be
+ * Function specifiers, which do nothing but mark functions to be
  * generated by scripts/block-coroutine-wrapper.py
  *
- * Read more in docs/devel/block-coroutine-wrapper.rst
+ * Usage: read docs/devel/block-coroutine-wrapper.rst
+ *
+ * There are 4 kind of specifiers:
+ * - co_wrapper functions can be called by only non-coroutine context, because
+ *   they always generate a new coroutine.
+ * - co_wrapper_mixed functions can be called by both coroutine and
+ *   non-coroutine context.
+ * - co_wrapper_bdrv_rdlock are co_wrapper functions but automatically take and
+ *   release the graph rdlock when creating a new coroutine
+ * - co_wrapper_mixed_bdrv_rdlock are co_wrapper_mixed functions but
+ *   automatically take and release the graph rdlock when creating a new
+ *   coroutine.
  */
-#define generated_co_wrapper
+#define co_wrapper
+#define co_wrapper_mixed
+#define co_wrapper_bdrv_rdlock
+#define co_wrapper_mixed_bdrv_rdlock
+
+#include "block/dirty-bitmap.h"
+#include "block/blockjob.h"
 
 /* block.c */
 typedef struct BlockDriver BlockDriver;
diff --git a/include/block/block-copy.h b/include/block/block-copy.h
index ba0b425d78..8cea4f9b90 100644
--- a/include/block/block-copy.h
+++ b/include/block/block-copy.h
@@ -36,8 +36,9 @@ void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm);
 void block_copy_state_free(BlockCopyState *s);
 
 void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes);
-int64_t block_copy_reset_unallocated(BlockCopyState *s,
-                                     int64_t offset, int64_t *count);
+int64_t coroutine_fn block_copy_reset_unallocated(BlockCopyState *s,
+                                                  int64_t offset,
+                                                  int64_t *count);
 
 int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                             bool ignore_ratelimit, uint64_t timeout_ns,
diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h
index c7bd4a2088..b0a3cfe6b8 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -55,9 +55,14 @@ BlockDriver *bdrv_find_protocol(const char *filename,
                                 bool allow_protocol_prefix,
                                 Error **errp);
 BlockDriver *bdrv_find_format(const char *format_name);
-int bdrv_create(BlockDriver *drv, const char* filename,
-                QemuOpts *opts, Error **errp);
-int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp);
+
+int coroutine_fn bdrv_co_create(BlockDriver *drv, const char *filename,
+                                QemuOpts *opts, Error **errp);
+int co_wrapper bdrv_create(BlockDriver *drv, const char *filename,
+                           QemuOpts *opts, Error **errp);
+
+int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts,
+                                     Error **errp);
 
 BlockDriverState *bdrv_new(void);
 int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
@@ -82,6 +87,9 @@ int bdrv_open_file_child(const char *filename,
 BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp);
 int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
                         Error **errp);
+int bdrv_set_backing_hd_drained(BlockDriverState *bs,
+                                BlockDriverState *backing_hd,
+                                Error **errp);
 int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
                            const char *bdref_key, Error **errp);
 BlockDriverState *bdrv_open(const char *filename, const char *reference,
@@ -144,6 +152,7 @@ int bdrv_inactivate_all(void);
 int bdrv_flush_all(void);
 void bdrv_close_all(void);
 void bdrv_drain_all_begin(void);
+void bdrv_drain_all_begin_nopoll(void);
 void bdrv_drain_all_end(void);
 void bdrv_drain_all(void);
 
diff --git a/include/block/block-io.h b/include/block/block-io.h
index b099d7db45..2ed6214909 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -39,19 +39,24 @@
  * to catch when they are accidentally called by the wrong API.
  */
 
-int generated_co_wrapper bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
-                                            int64_t bytes,
-                                            BdrvRequestFlags flags);
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, int64_t bytes,
+                   BdrvRequestFlags flags);
+
 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags);
-int generated_co_wrapper bdrv_pread(BdrvChild *child, int64_t offset,
-                                    int64_t bytes, void *buf,
-                                    BdrvRequestFlags flags);
-int generated_co_wrapper bdrv_pwrite(BdrvChild *child, int64_t offset,
-                                     int64_t bytes, const void *buf,
-                                     BdrvRequestFlags flags);
-int generated_co_wrapper bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
-                                          int64_t bytes, const void *buf,
-                                          BdrvRequestFlags flags);
+
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_pread(BdrvChild *child, int64_t offset, int64_t bytes, void *buf,
+           BdrvRequestFlags flags);
+
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_pwrite(BdrvChild *child, int64_t offset,int64_t bytes,
+            const void *buf, BdrvRequestFlags flags);
+
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_pwrite_sync(BdrvChild *child, int64_t offset, int64_t bytes,
+                 const void *buf, BdrvRequestFlags flags);
+
 int coroutine_fn bdrv_co_pwrite_sync(BdrvChild *child, int64_t offset,
                                      int64_t bytes, const void *buf,
                                      BdrvRequestFlags flags);
@@ -94,14 +99,29 @@ bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
 int bdrv_block_status(BlockDriverState *bs, int64_t offset,
                       int64_t bytes, int64_t *pnum, int64_t *map,
                       BlockDriverState **file);
+
+int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
+                                            BlockDriverState *base,
+                                            int64_t offset, int64_t bytes,
+                                            int64_t *pnum, int64_t *map,
+                                            BlockDriverState **file);
 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                             int64_t offset, int64_t bytes, int64_t *pnum,
                             int64_t *map, BlockDriverState **file);
+
+int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset,
+                                      int64_t bytes, int64_t *pnum);
 int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
                       int64_t *pnum);
+
+int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
+                                            BlockDriverState *base,
+                                            bool include_base, int64_t offset,
+                                            int64_t bytes, int64_t *pnum);
 int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
                             bool include_base, int64_t offset, int64_t bytes,
                             int64_t *pnum);
+
 int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
                                       int64_t bytes);
 
@@ -200,8 +220,14 @@ AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c);
 void bdrv_io_plug(BlockDriverState *bs);
 void bdrv_io_unplug(BlockDriverState *bs);
 
-bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
-                                     uint32_t granularity, Error **errp);
+bool coroutine_fn bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs,
+                                                     const char *name,
+                                                     uint32_t granularity,
+                                                     Error **errp);
+bool co_wrapper bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs,
+                                                const char *name,
+                                                uint32_t granularity,
+                                                Error **errp);
 
 /**
  *
@@ -237,21 +263,6 @@ int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
                                     int64_t bytes, BdrvRequestFlags read_flags,
                                     BdrvRequestFlags write_flags);
 
-/**
- * bdrv_drained_end_no_poll:
- *
- * Same as bdrv_drained_end(), but do not poll for the subgraph to
- * actually become unquiesced.  Therefore, no graph changes will occur
- * with this function.
- *
- * *drained_end_counter is incremented for every background operation
- * that is scheduled, and will be decremented for every operation once
- * it settles.  The caller must poll until it reaches 0.  The counter
- * should be accessed using atomic operations only.
- */
-void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter);
-
-
 /*
  * "I/O or GS" API functions. These functions can run without
  * the BQL, but only in one specific iothread/main loop.
@@ -281,47 +292,54 @@ void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter);
 
 void bdrv_drain(BlockDriverState *bs);
 
-int generated_co_wrapper
+int co_wrapper_mixed_bdrv_rdlock
 bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
               PreallocMode prealloc, BdrvRequestFlags flags, Error **errp);
 
-int generated_co_wrapper bdrv_check(BlockDriverState *bs, BdrvCheckResult *res,
-                                    BdrvCheckMode fix);
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix);
 
 /* Invalidate any cached metadata used by image formats */
-int generated_co_wrapper bdrv_invalidate_cache(BlockDriverState *bs,
-                                               Error **errp);
-int generated_co_wrapper bdrv_flush(BlockDriverState *bs);
-int generated_co_wrapper bdrv_pdiscard(BdrvChild *child, int64_t offset,
-                                       int64_t bytes);
-int generated_co_wrapper
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_invalidate_cache(BlockDriverState *bs, Error **errp);
+
+int co_wrapper_mixed_bdrv_rdlock bdrv_flush(BlockDriverState *bs);
+
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
+
+int co_wrapper_mixed_bdrv_rdlock
 bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
-int generated_co_wrapper
+
+int co_wrapper_mixed_bdrv_rdlock
 bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 
 /**
  * bdrv_parent_drained_begin_single:
  *
- * Begin a quiesced section for the parent of @c. If @poll is true, wait for
- * any pending activity to cease.
+ * Begin a quiesced section for the parent of @c.
+ */
+void bdrv_parent_drained_begin_single(BdrvChild *c);
+
+/**
+ * bdrv_parent_drained_poll_single:
+ *
+ * Returns true if there is any pending activity to cease before @c can be
+ * called quiesced, false otherwise.
  */
-void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll);
+bool bdrv_parent_drained_poll_single(BdrvChild *c);
 
 /**
  * bdrv_parent_drained_end_single:
  *
  * End a quiesced section for the parent of @c.
- *
- * This polls @bs's AioContext until all scheduled sub-drained_ends
- * have settled, which may result in graph changes.
  */
 void bdrv_parent_drained_end_single(BdrvChild *c);
 
 /**
  * bdrv_drain_poll:
  *
- * Poll for pending requests in @bs, its parents (except for @ignore_parent),
- * and if @recursive is true its children as well (used for subtree drain).
+ * Poll for pending requests in @bs and its parents (except for @ignore_parent).
  *
  * If @ignore_bds_parents is true, parents that are BlockDriverStates must
  * ignore the drain request because they will be drained separately (used for
@@ -329,8 +347,8 @@ void bdrv_parent_drained_end_single(BdrvChild *c);
  *
  * This is part of bdrv_drained_begin.
  */
-bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
-                     BdrvChild *ignore_parent, bool ignore_bds_parents);
+bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent,
+                     bool ignore_bds_parents);
 
 /**
  * bdrv_drained_begin:
@@ -348,31 +366,13 @@ void bdrv_drained_begin(BlockDriverState *bs);
  * Quiesces a BDS like bdrv_drained_begin(), but does not wait for already
  * running requests to complete.
  */
-void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
-                                   BdrvChild *parent, bool ignore_bds_parents);
-
-/**
- * Like bdrv_drained_begin, but recursively begins a quiesced section for
- * exclusive access to all child nodes as well.
- */
-void bdrv_subtree_drained_begin(BlockDriverState *bs);
+void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, BdrvChild *parent);
 
 /**
  * bdrv_drained_end:
  *
  * End a quiescent section started by bdrv_drained_begin().
- *
- * This polls @bs's AioContext until all scheduled sub-drained_ends
- * have settled.  On one hand, that may result in graph changes.  On
- * the other, this requires that the caller either runs in the main
- * loop; or that all involved nodes (@bs and all of its parents) are
- * in the caller's AioContext.
  */
 void bdrv_drained_end(BlockDriverState *bs);
 
-/**
- * End a quiescent section started by bdrv_subtree_drained_begin().
- */
-void bdrv_subtree_drained_end(BlockDriverState *bs);
-
 #endif /* BLOCK_IO_H */
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 31ae91e56e..c34c525fa6 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -641,8 +641,8 @@ struct BlockDriver {
     /*
      * Invalidate any cached meta-data.
      */
-    void coroutine_fn (*bdrv_co_invalidate_cache)(BlockDriverState *bs,
-                                                  Error **errp);
+    void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_invalidate_cache)(
+        BlockDriverState *bs, Error **errp);
 
     /*
      * Flushes all data for all layers by calling bdrv_co_flush for underlying
@@ -701,12 +701,11 @@ struct BlockDriver {
                                                  Error **errp);
     BlockStatsSpecific *(*bdrv_get_specific_stats)(BlockDriverState *bs);
 
-    int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs,
-                                          QEMUIOVector *qiov,
-                                          int64_t pos);
-    int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs,
-                                          QEMUIOVector *qiov,
-                                          int64_t pos);
+    int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_save_vmstate)(
+        BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
+
+    int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_load_vmstate)(
+        BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 
     /* removable device specific */
     bool (*bdrv_is_inserted)(BlockDriverState *bs);
@@ -724,9 +723,8 @@ struct BlockDriver {
      * Returns 0 for completed check, -errno for internal errors.
      * The check results are stored in result.
      */
-    int coroutine_fn (*bdrv_co_check)(BlockDriverState *bs,
-                                      BdrvCheckResult *result,
-                                      BdrvCheckMode fix);
+    int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_check)(
+        BlockDriverState *bs, BdrvCheckResult *result, BdrvCheckMode fix);
 
     void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);
 
@@ -735,17 +733,19 @@ struct BlockDriver {
     void (*bdrv_io_unplug)(BlockDriverState *bs);
 
     /**
-     * bdrv_co_drain_begin is called if implemented in the beginning of a
+     * bdrv_drain_begin is called if implemented in the beginning of a
      * drain operation to drain and stop any internal sources of requests in
      * the driver.
-     * bdrv_co_drain_end is called if implemented at the end of the drain.
+     * bdrv_drain_end is called if implemented at the end of the drain.
      *
      * They should be used by the driver to e.g. manage scheduled I/O
      * requests, or toggle an internal state. After the end of the drain new
      * requests will continue normally.
+     *
+     * Implementations of both functions must not call aio_poll().
      */
-    void coroutine_fn (*bdrv_co_drain_begin)(BlockDriverState *bs);
-    void coroutine_fn (*bdrv_co_drain_end)(BlockDriverState *bs);
+    void (*bdrv_drain_begin)(BlockDriverState *bs);
+    void (*bdrv_drain_end)(BlockDriverState *bs);
 
     bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs);
     bool coroutine_fn (*bdrv_co_can_store_new_dirty_bitmap)(
@@ -896,8 +896,8 @@ struct BdrvChildClass {
     void (*activate)(BdrvChild *child, Error **errp);
     int (*inactivate)(BdrvChild *child);
 
-    void (*attach)(BdrvChild *child);
-    void (*detach)(BdrvChild *child);
+    void GRAPH_WRLOCK_PTR (*attach)(BdrvChild *child);
+    void GRAPH_WRLOCK_PTR (*detach)(BdrvChild *child);
 
     /*
      * Notifies the parent that the filename of its child has changed (e.g.
@@ -937,15 +937,11 @@ struct BdrvChildClass {
      * These functions must not change the graph (and therefore also must not
      * call aio_poll(), which could change the graph indirectly).
      *
-     * If drained_end() schedules background operations, it must atomically
-     * increment *drained_end_counter for each such operation and atomically
-     * decrement it once the operation has settled.
-     *
      * Note that this can be nested. If drained_begin() was called twice, new
      * I/O is allowed only after drained_end() was called twice, too.
      */
     void (*drained_begin)(BdrvChild *child);
-    void (*drained_end)(BdrvChild *child, int *drained_end_counter);
+    void (*drained_end)(BdrvChild *child);
 
     /*
      * Returns whether the parent has pending requests for the child. This
@@ -982,13 +978,13 @@ struct BdrvChild {
     bool frozen;
 
     /*
-     * How many times the parent of this child has been drained
+     * True if the parent of this child has been drained by this BdrvChild
      * (through klass->drained_*).
-     * Usually, this is equal to bs->quiesce_counter (potentially
-     * reduced by bdrv_drain_all_count).  It may differ while the
+     *
+     * It is generally true if bs->quiesce_counter > 0. It may differ while the
      * child is entering or leaving a drained section.
      */
-    int parent_quiesce_counter;
+    bool quiesced_parent;
 
     QLIST_ENTRY(BdrvChild) next;
     QLIST_ENTRY(BdrvChild) next_parent;
@@ -1186,7 +1182,6 @@ struct BlockDriverState {
 
     /* Accessed with atomic ops.  */
     int quiesce_counter;
-    int recursive_quiesce_counter;
 
     unsigned int write_gen;               /* Current data generation */
 
diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
index b49f4eb35b..2f0993f6e9 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
@@ -310,21 +310,4 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
  */
 void bdrv_drain_all_end_quiesce(BlockDriverState *bs);
 
-/**
- * Make sure that the function is running under both drain and BQL.
- * The latter protects from concurrent writings
- * from the GS API, while the former prevents concurrent reads
- * from I/O.
- */
-static inline void assert_bdrv_graph_writable(BlockDriverState *bs)
-{
-    /*
-     * TODO: this function is incomplete. Because the users of this
-     * assert lack the necessary drains, check only for BQL.
-     * Once the necessary drains are added,
-     * assert also for qatomic_read(&bs->quiesce_counter) > 0
-     */
-    assert(qemu_in_main_thread());
-}
-
 #endif /* BLOCK_INT_GLOBAL_STATE_H */
diff --git a/include/block/block_int-io.h b/include/block/block_int-io.h
index 4b0b3e17ef..8bc061ebb8 100644
--- a/include/block/block_int-io.h
+++ b/include/block/block_int-io.h
@@ -179,16 +179,4 @@ void bdrv_bsc_invalidate_range(BlockDriverState *bs,
  */
 void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes);
 
-
-/*
- * "I/O or GS" API functions. These functions can run without
- * the BQL, but only in one specific iothread/main loop.
- *
- * See include/block/block-io.h for more information about
- * the "I/O or GS" API.
- */
-
-void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
-void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
-
 #endif /* BLOCK_INT_IO_H */
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 7d50b6bbd1..b35b0138ed 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -26,6 +26,7 @@
 
 #include "block_int-global-state.h"
 #include "block_int-io.h"
+#include "block/graph-lock.h"
 
 /* DO NOT ADD ANYTHING IN HERE. USE ONE OF THE HEADERS INCLUDED ABOVE */
 
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 6528336c4c..c3700cec76 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -34,8 +34,14 @@ int bdrv_dirty_bitmap_check(const BdrvDirtyBitmap *bitmap, uint32_t flags,
                             Error **errp);
 void bdrv_release_dirty_bitmap(BdrvDirtyBitmap *bitmap);
 void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs);
-int bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name,
-                                        Error **errp);
+
+int coroutine_fn bdrv_co_remove_persistent_dirty_bitmap(BlockDriverState *bs,
+                                                        const char *name,
+                                                        Error **errp);
+int co_wrapper bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs,
+                                                   const char *name,
+                                                   Error **errp);
+
 void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
 void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
 void bdrv_enable_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap);
diff --git a/include/block/graph-lock.h b/include/block/graph-lock.h
new file mode 100644
index 0000000000..4c92cd8edf
--- /dev/null
+++ b/include/block/graph-lock.h
@@ -0,0 +1,280 @@
+/*
+ * Graph lock: rwlock to protect block layer graph manipulations (add/remove
+ * edges and nodes)
+ *
+ *  Copyright (c) 2022 Red Hat
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef GRAPH_LOCK_H
+#define GRAPH_LOCK_H
+
+#include "qemu/osdep.h"
+#include "qemu/clang-tsa.h"
+
+#include "qemu/coroutine.h"
+
+/**
+ * Graph Lock API
+ * This API provides a rwlock used to protect block layer
+ * graph modifications like edge (BdrvChild) and node (BlockDriverState)
+ * addition and removal.
+ * Currently we have 1 writer only, the Main loop, and many
+ * readers, mostly coroutines running in other AioContext thus other threads.
+ *
+ * We distinguish between writer (main loop, under BQL) that modifies the
+ * graph, and readers (all other coroutines running in various AioContext),
+ * that go through the graph edges, reading
+ * BlockDriverState ->parents and->children.
+ *
+ * The writer (main loop)  has an "exclusive" access, so it first waits for
+ * current read to finish, and then prevents incoming ones from
+ * entering while it has the exclusive access.
+ *
+ * The readers (coroutines in multiple AioContext) are free to
+ * access the graph as long the writer is not modifying the graph.
+ * In case it is, they go in a CoQueue and sleep until the writer
+ * is done.
+ *
+ * If a coroutine changes AioContext, the counter in the original and new
+ * AioContext are left intact, since the writer does not care where is the
+ * reader, but only if there is one.
+ * As a result, some AioContexts might have a negative reader count, to
+ * balance the positive count of the AioContext that took the lock.
+ * This also means that when an AioContext is deleted it may have a nonzero
+ * reader count. In that case we transfer the count to a global shared counter
+ * so that the writer is always aware of all readers.
+ */
+typedef struct BdrvGraphRWlock BdrvGraphRWlock;
+
+/* Dummy lock object to use for Thread Safety Analysis (TSA) */
+typedef struct TSA_CAPABILITY("mutex") BdrvGraphLock {
+} BdrvGraphLock;
+
+extern BdrvGraphLock graph_lock;
+
+/*
+ * clang doesn't check consistency in locking annotations between forward
+ * declarations and the function definition. Having the annotation on the
+ * definition, but not the declaration in a header file, may give the reader
+ * a false sense of security because the condition actually remains unchecked
+ * for callers in other source files.
+ *
+ * Therefore, as a convention, for public functions, GRAPH_RDLOCK and
+ * GRAPH_WRLOCK annotations should be present only in the header file.
+ */
+#define GRAPH_WRLOCK TSA_REQUIRES(graph_lock)
+#define GRAPH_RDLOCK TSA_REQUIRES_SHARED(graph_lock)
+
+/*
+ * TSA annotations are not part of function types, so checks are defeated when
+ * using a function pointer. As a workaround, annotate function pointers with
+ * this macro that will require that the lock is at least taken while reading
+ * the pointer. In most cases this is equivalent to actually protecting the
+ * function call.
+ */
+#define GRAPH_RDLOCK_PTR TSA_GUARDED_BY(graph_lock)
+#define GRAPH_WRLOCK_PTR TSA_GUARDED_BY(graph_lock)
+
+/*
+ * register_aiocontext:
+ * Add AioContext @ctx to the list of AioContext.
+ * This list is used to obtain the total number of readers
+ * currently running the graph.
+ */
+void register_aiocontext(AioContext *ctx);
+
+/*
+ * unregister_aiocontext:
+ * Removes AioContext @ctx to the list of AioContext.
+ */
+void unregister_aiocontext(AioContext *ctx);
+
+/*
+ * bdrv_graph_wrlock:
+ * Start an exclusive write operation to modify the graph. This means we are
+ * adding or removing an edge or a node in the block layer graph. Nobody else
+ * is allowed to access the graph.
+ *
+ * Must only be called from outside bdrv_graph_co_rdlock.
+ *
+ * The wrlock can only be taken from the main loop, with BQL held, as only the
+ * main loop is allowed to modify the graph.
+ *
+ * This function polls. Callers must not hold the lock of any AioContext other
+ * than the current one.
+ */
+void bdrv_graph_wrlock(void) TSA_ACQUIRE(graph_lock) TSA_NO_TSA;
+
+/*
+ * bdrv_graph_wrunlock:
+ * Write finished, reset global has_writer to 0 and restart
+ * all readers that are waiting.
+ */
+void bdrv_graph_wrunlock(void) TSA_RELEASE(graph_lock) TSA_NO_TSA;
+
+/*
+ * bdrv_graph_co_rdlock:
+ * Read the bs graph. This usually means traversing all nodes in
+ * the graph, therefore it can't happen while another thread is
+ * modifying it.
+ * Increases the reader counter of the current aiocontext,
+ * and if has_writer is set, it means that the writer is modifying
+ * the graph, therefore wait in a coroutine queue.
+ * The writer will then wake this coroutine once it is done.
+ *
+ * This lock should be taken from Iothreads (IO_CODE() class of functions)
+ * because it signals the writer that there are some
+ * readers currently running, or waits until the current
+ * write is finished before continuing.
+ * Calling this function from the Main Loop with BQL held
+ * is not necessary, since the Main Loop itself is the only
+ * writer, thus won't be able to read and write at the same time.
+ * The only exception to that is when we can't take the lock in the
+ * function/coroutine itself, and need to delegate the caller (usually main
+ * loop) to take it and wait that the coroutine ends, so that
+ * we always signal that a reader is running.
+ */
+void coroutine_fn TSA_ACQUIRE_SHARED(graph_lock) TSA_NO_TSA
+bdrv_graph_co_rdlock(void);
+
+/*
+ * bdrv_graph_rdunlock:
+ * Read terminated, decrease the count of readers in the current aiocontext.
+ * If the writer is waiting for reads to finish (has_writer == 1), signal
+ * the writer that we are done via aio_wait_kick() to let it continue.
+ */
+void coroutine_fn TSA_RELEASE_SHARED(graph_lock) TSA_NO_TSA
+bdrv_graph_co_rdunlock(void);
+
+/*
+ * bdrv_graph_rd{un}lock_main_loop:
+ * Just a placeholder to mark where the graph rdlock should be taken
+ * in the main loop. It is just asserting that we are not
+ * in a coroutine and in GLOBAL_STATE_CODE.
+ */
+void TSA_ACQUIRE_SHARED(graph_lock) TSA_NO_TSA
+bdrv_graph_rdlock_main_loop(void);
+
+void TSA_RELEASE_SHARED(graph_lock) TSA_NO_TSA
+bdrv_graph_rdunlock_main_loop(void);
+
+/*
+ * assert_bdrv_graph_readable:
+ * Make sure that the reader is either the main loop,
+ * or there is at least a reader helding the rdlock.
+ * In this way an incoming writer is aware of the read and waits.
+ */
+void GRAPH_RDLOCK assert_bdrv_graph_readable(void);
+
+/*
+ * assert_bdrv_graph_writable:
+ * Make sure that the writer is the main loop and has set @has_writer,
+ * so that incoming readers will pause.
+ */
+void GRAPH_WRLOCK assert_bdrv_graph_writable(void);
+
+/*
+ * Calling this function tells TSA that we know that the lock is effectively
+ * taken even though we cannot prove it (yet) with GRAPH_RDLOCK. This can be
+ * useful in intermediate stages of a conversion to using the GRAPH_RDLOCK
+ * macro.
+ */
+static inline void TSA_ASSERT_SHARED(graph_lock) TSA_NO_TSA
+assume_graph_lock(void)
+{
+}
+
+typedef struct GraphLockable { } GraphLockable;
+
+/*
+ * In C, compound literals have the lifetime of an automatic variable.
+ * In C++ it would be different, but then C++ wouldn't need QemuLockable
+ * either...
+ */
+#define GML_OBJ_() (&(GraphLockable) { })
+
+/*
+ * This is not marked as TSA_ACQUIRE() because TSA doesn't understand the
+ * cleanup attribute and would therefore complain that the graph is never
+ * unlocked. TSA_ASSERT() makes sure that the following calls know that we
+ * hold the lock while unlocking is left unchecked.
+ */
+static inline GraphLockable * TSA_ASSERT(graph_lock) TSA_NO_TSA
+graph_lockable_auto_lock(GraphLockable *x)
+{
+    bdrv_graph_co_rdlock();
+    return x;
+}
+
+static inline void TSA_NO_TSA
+graph_lockable_auto_unlock(GraphLockable *x)
+{
+    bdrv_graph_co_rdunlock();
+}
+
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(GraphLockable, graph_lockable_auto_unlock)
+
+#define WITH_GRAPH_RDLOCK_GUARD_(var)                                         \
+    for (g_autoptr(GraphLockable) var = graph_lockable_auto_lock(GML_OBJ_()); \
+         var;                                                                 \
+         graph_lockable_auto_unlock(var), var = NULL)
+
+#define WITH_GRAPH_RDLOCK_GUARD() \
+    WITH_GRAPH_RDLOCK_GUARD_(glue(graph_lockable_auto, __COUNTER__))
+
+#define GRAPH_RDLOCK_GUARD(x)                                       \
+    g_autoptr(GraphLockable)                                        \
+    glue(graph_lockable_auto, __COUNTER__) G_GNUC_UNUSED =          \
+            graph_lockable_auto_lock(GML_OBJ_())
+
+
+typedef struct GraphLockableMainloop { } GraphLockableMainloop;
+
+/*
+ * In C, compound literals have the lifetime of an automatic variable.
+ * In C++ it would be different, but then C++ wouldn't need QemuLockable
+ * either...
+ */
+#define GMLML_OBJ_() (&(GraphLockableMainloop) { })
+
+/*
+ * This is not marked as TSA_ACQUIRE() because TSA doesn't understand the
+ * cleanup attribute and would therefore complain that the graph is never
+ * unlocked. TSA_ASSERT() makes sure that the following calls know that we
+ * hold the lock while unlocking is left unchecked.
+ */
+static inline GraphLockableMainloop * TSA_ASSERT(graph_lock) TSA_NO_TSA
+graph_lockable_auto_lock_mainloop(GraphLockableMainloop *x)
+{
+    bdrv_graph_rdlock_main_loop();
+    return x;
+}
+
+static inline void TSA_NO_TSA
+graph_lockable_auto_unlock_mainloop(GraphLockableMainloop *x)
+{
+    bdrv_graph_rdunlock_main_loop();
+}
+
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(GraphLockableMainloop,
+                              graph_lockable_auto_unlock_mainloop)
+
+#define GRAPH_RDLOCK_GUARD_MAINLOOP(x)                              \
+    g_autoptr(GraphLockableMainloop)                                \
+    glue(graph_lockable_auto, __COUNTER__) G_GNUC_UNUSED =          \
+            graph_lockable_auto_lock_mainloop(GMLML_OBJ_())
+
+#endif /* GRAPH_LOCK_H */
+
diff --git a/include/qemu/clang-tsa.h b/include/qemu/clang-tsa.h
new file mode 100644
index 0000000000..ba06fb8c92
--- /dev/null
+++ b/include/qemu/clang-tsa.h
@@ -0,0 +1,114 @@
+#ifndef CLANG_TSA_H
+#define CLANG_TSA_H
+
+/*
+ * Copyright 2018 Jarkko Hietaniemi <jhi@iki.fi>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* http://clang.llvm.org/docs/ThreadSafetyAnalysis.html
+ *
+ * TSA is available since clang 3.6-ish.
+ */
+#ifdef __clang__
+#  define TSA(x)   __attribute__((x))
+#else
+#  define TSA(x)   /* No TSA, make TSA attributes no-ops. */
+#endif
+
+/* TSA_CAPABILITY() is used to annotate typedefs:
+ *
+ * typedef pthread_mutex_t TSA_CAPABILITY("mutex") tsa_mutex;
+ */
+#define TSA_CAPABILITY(x) TSA(capability(x))
+
+/* TSA_GUARDED_BY() is used to annotate global variables,
+ * the data is guarded:
+ *
+ * Foo foo TSA_GUARDED_BY(mutex);
+ */
+#define TSA_GUARDED_BY(x) TSA(guarded_by(x))
+
+/* TSA_PT_GUARDED_BY() is used to annotate global pointers, the data
+ * behind the pointer is guarded.
+ *
+ * Foo* ptr TSA_PT_GUARDED_BY(mutex);
+ */
+#define TSA_PT_GUARDED_BY(x) TSA(pt_guarded_by(x))
+
+/* The TSA_REQUIRES() is used to annotate functions: the caller of the
+ * function MUST hold the resource, the function will NOT release it.
+ *
+ * More than one mutex may be specified, comma-separated.
+ *
+ * void Foo(void) TSA_REQUIRES(mutex);
+ */
+#define TSA_REQUIRES(...) TSA(requires_capability(__VA_ARGS__))
+#define TSA_REQUIRES_SHARED(...) TSA(requires_shared_capability(__VA_ARGS__))
+
+/* TSA_EXCLUDES() is used to annotate functions: the caller of the
+ * function MUST NOT hold resource, the function first acquires the
+ * resource, and then releases it.
+ *
+ * More than one mutex may be specified, comma-separated.
+ *
+ * void Foo(void) TSA_EXCLUDES(mutex);
+ */
+#define TSA_EXCLUDES(...) TSA(locks_excluded(__VA_ARGS__))
+
+/* TSA_ACQUIRE() is used to annotate functions: the caller of the
+ * function MUST NOT hold the resource, the function will acquire the
+ * resource, but NOT release it.
+ *
+ * More than one mutex may be specified, comma-separated.
+ *
+ * void Foo(void) TSA_ACQUIRE(mutex);
+ */
+#define TSA_ACQUIRE(...) TSA(acquire_capability(__VA_ARGS__))
+#define TSA_ACQUIRE_SHARED(...) TSA(acquire_shared_capability(__VA_ARGS__))
+
+/* TSA_RELEASE() is used to annotate functions: the caller of the
+ * function MUST hold the resource, but the function will then release it.
+ *
+ * More than one mutex may be specified, comma-separated.
+ *
+ * void Foo(void) TSA_RELEASE(mutex);
+ */
+#define TSA_RELEASE(...) TSA(release_capability(__VA_ARGS__))
+#define TSA_RELEASE_SHARED(...) TSA(release_shared_capability(__VA_ARGS__))
+
+/* TSA_NO_TSA is used to annotate functions.  Use only when you need to.
+ *
+ * void Foo(void) TSA_NO_TSA;
+ */
+#define TSA_NO_TSA TSA(no_thread_safety_analysis)
+
+/*
+ * TSA_ASSERT() is used to annotate functions: This function will assert that
+ * the lock is held. When it returns, the caller of the function is assumed to
+ * already hold the resource.
+ *
+ * More than one mutex may be specified, comma-separated.
+ */
+#define TSA_ASSERT(...) TSA(assert_capability(__VA_ARGS__))
+#define TSA_ASSERT_SHARED(...) TSA(assert_shared_capability(__VA_ARGS__))
+
+#endif /* #ifndef CLANG_TSA_H */
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index 50f5aa2e07..7ec6d978d4 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -92,6 +92,15 @@ int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
                                    int64_t bytes, BdrvRequestFlags read_flags,
                                    BdrvRequestFlags write_flags);
 
+int coroutine_fn blk_co_block_status_above(BlockBackend *blk,
+                                           BlockDriverState *base,
+                                           int64_t offset, int64_t bytes,
+                                           int64_t *pnum, int64_t *map,
+                                           BlockDriverState **file);
+int coroutine_fn blk_co_is_allocated_above(BlockBackend *blk,
+                                           BlockDriverState *base,
+                                           bool include_base, int64_t offset,
+                                           int64_t bytes, int64_t *pnum);
 
 /*
  * "I/O or GS" API functions. These functions can run without
@@ -101,77 +110,77 @@ int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
  * the "I/O or GS" API.
  */
 
-int generated_co_wrapper blk_pread(BlockBackend *blk, int64_t offset,
-                                   int64_t bytes, void *buf,
-                                   BdrvRequestFlags flags);
+int co_wrapper_mixed blk_pread(BlockBackend *blk, int64_t offset,
+                               int64_t bytes, void *buf,
+                               BdrvRequestFlags flags);
 int coroutine_fn blk_co_pread(BlockBackend *blk, int64_t offset, int64_t bytes,
                               void *buf, BdrvRequestFlags flags);
 
-int generated_co_wrapper blk_preadv(BlockBackend *blk, int64_t offset,
-                                    int64_t bytes, QEMUIOVector *qiov,
-                                    BdrvRequestFlags flags);
+int co_wrapper_mixed blk_preadv(BlockBackend *blk, int64_t offset,
+                                int64_t bytes, QEMUIOVector *qiov,
+                                BdrvRequestFlags flags);
 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                                int64_t bytes, QEMUIOVector *qiov,
                                BdrvRequestFlags flags);
 
-int generated_co_wrapper blk_preadv_part(BlockBackend *blk, int64_t offset,
-                                         int64_t bytes, QEMUIOVector *qiov,
-                                         size_t qiov_offset,
-                                         BdrvRequestFlags flags);
+int co_wrapper_mixed blk_preadv_part(BlockBackend *blk, int64_t offset,
+                                     int64_t bytes, QEMUIOVector *qiov,
+                                     size_t qiov_offset,
+                                     BdrvRequestFlags flags);
 int coroutine_fn blk_co_preadv_part(BlockBackend *blk, int64_t offset,
                                     int64_t bytes, QEMUIOVector *qiov,
                                     size_t qiov_offset, BdrvRequestFlags flags);
 
-int generated_co_wrapper blk_pwrite(BlockBackend *blk, int64_t offset,
-                                    int64_t bytes, const void *buf,
-                                    BdrvRequestFlags flags);
+int co_wrapper_mixed blk_pwrite(BlockBackend *blk, int64_t offset,
+                                int64_t bytes, const void *buf,
+                                BdrvRequestFlags flags);
 int coroutine_fn blk_co_pwrite(BlockBackend *blk, int64_t offset, int64_t bytes,
                                const void *buf, BdrvRequestFlags flags);
 
-int generated_co_wrapper blk_pwritev(BlockBackend *blk, int64_t offset,
-                                     int64_t bytes, QEMUIOVector *qiov,
-                                     BdrvRequestFlags flags);
+int co_wrapper_mixed blk_pwritev(BlockBackend *blk, int64_t offset,
+                                 int64_t bytes, QEMUIOVector *qiov,
+                                 BdrvRequestFlags flags);
 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
                                 int64_t bytes, QEMUIOVector *qiov,
                                 BdrvRequestFlags flags);
 
-int generated_co_wrapper blk_pwritev_part(BlockBackend *blk, int64_t offset,
-                                          int64_t bytes, QEMUIOVector *qiov,
-                                          size_t qiov_offset,
-                                          BdrvRequestFlags flags);
+int co_wrapper_mixed blk_pwritev_part(BlockBackend *blk, int64_t offset,
+                                      int64_t bytes, QEMUIOVector *qiov,
+                                      size_t qiov_offset,
+                                      BdrvRequestFlags flags);
 int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
                                      int64_t bytes,
                                      QEMUIOVector *qiov, size_t qiov_offset,
                                      BdrvRequestFlags flags);
 
-int generated_co_wrapper blk_pwrite_compressed(BlockBackend *blk,
-                                               int64_t offset, int64_t bytes,
-                                               const void *buf);
+int co_wrapper_mixed blk_pwrite_compressed(BlockBackend *blk,
+                                           int64_t offset, int64_t bytes,
+                                           const void *buf);
 int coroutine_fn blk_co_pwrite_compressed(BlockBackend *blk, int64_t offset,
                                           int64_t bytes, const void *buf);
 
-int generated_co_wrapper blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
-                                           int64_t bytes,
-                                           BdrvRequestFlags flags);
+int co_wrapper_mixed blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                       int64_t bytes,
+                                       BdrvRequestFlags flags);
 int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
                                       int64_t bytes, BdrvRequestFlags flags);
 
-int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
-                                      int64_t bytes);
+int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset,
+                                  int64_t bytes);
 int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
                                  int64_t bytes);
 
-int generated_co_wrapper blk_flush(BlockBackend *blk);
+int co_wrapper_mixed blk_flush(BlockBackend *blk);
 int coroutine_fn blk_co_flush(BlockBackend *blk);
 
-int generated_co_wrapper blk_ioctl(BlockBackend *blk, unsigned long int req,
-                                   void *buf);
+int co_wrapper_mixed blk_ioctl(BlockBackend *blk, unsigned long int req,
+                               void *buf);
 int coroutine_fn blk_co_ioctl(BlockBackend *blk, unsigned long int req,
                               void *buf);
 
-int generated_co_wrapper blk_truncate(BlockBackend *blk, int64_t offset,
-                                      bool exact, PreallocMode prealloc,
-                                      BdrvRequestFlags flags, Error **errp);
+int co_wrapper_mixed blk_truncate(BlockBackend *blk, int64_t offset,
+                                  bool exact, PreallocMode prealloc,
+                                  BdrvRequestFlags flags, Error **errp);
 int coroutine_fn blk_co_truncate(BlockBackend *blk, int64_t offset, bool exact,
                                  PreallocMode prealloc, BdrvRequestFlags flags,
                                  Error **errp);
diff --git a/nbd/server.c b/nbd/server.c
index 0570596312..67ed333578 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1988,7 +1988,7 @@ static int coroutine_fn nbd_co_send_structured_error(NBDClient *client,
 }
 
 /* Do a sparse read and send the structured reply to the client.
- * Returns -errno if sending fails. bdrv_block_status_above() failure is
+ * Returns -errno if sending fails. blk_co_block_status_above() failure is
  * reported to the client, at which point this function succeeds.
  */
 static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
@@ -2004,10 +2004,10 @@ static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
 
     while (progress < size) {
         int64_t pnum;
-        int status = bdrv_block_status_above(blk_bs(exp->common.blk), NULL,
-                                             offset + progress,
-                                             size - progress, &pnum, NULL,
-                                             NULL);
+        int status = blk_co_block_status_above(exp->common.blk, NULL,
+                                               offset + progress,
+                                               size - progress, &pnum, NULL,
+                                               NULL);
         bool final;
 
         if (status < 0) {
@@ -2138,14 +2138,15 @@ static int nbd_extent_array_add(NBDExtentArray *ea,
     return 0;
 }
 
-static int blockstatus_to_extents(BlockDriverState *bs, uint64_t offset,
-                                  uint64_t bytes, NBDExtentArray *ea)
+static int coroutine_fn blockstatus_to_extents(BlockBackend *blk,
+                                               uint64_t offset, uint64_t bytes,
+                                               NBDExtentArray *ea)
 {
     while (bytes) {
         uint32_t flags;
         int64_t num;
-        int ret = bdrv_block_status_above(bs, NULL, offset, bytes, &num,
-                                          NULL, NULL);
+        int ret = blk_co_block_status_above(blk, NULL, offset, bytes, &num,
+                                            NULL, NULL);
 
         if (ret < 0) {
             return ret;
@@ -2165,13 +2166,14 @@ static int blockstatus_to_extents(BlockDriverState *bs, uint64_t offset,
     return 0;
 }
 
-static int blockalloc_to_extents(BlockDriverState *bs, uint64_t offset,
-                                 uint64_t bytes, NBDExtentArray *ea)
+static int coroutine_fn blockalloc_to_extents(BlockBackend *blk,
+                                              uint64_t offset, uint64_t bytes,
+                                              NBDExtentArray *ea)
 {
     while (bytes) {
         int64_t num;
-        int ret = bdrv_is_allocated_above(bs, NULL, false, offset, bytes,
-                                          &num);
+        int ret = blk_co_is_allocated_above(blk, NULL, false, offset, bytes,
+                                            &num);
 
         if (ret < 0) {
             return ret;
@@ -2217,20 +2219,21 @@ static int nbd_co_send_extents(NBDClient *client, uint64_t handle,
 }
 
 /* Get block status from the exported device and send it to the client */
-static int nbd_co_send_block_status(NBDClient *client, uint64_t handle,
-                                    BlockDriverState *bs, uint64_t offset,
-                                    uint32_t length, bool dont_fragment,
-                                    bool last, uint32_t context_id,
-                                    Error **errp)
+static int
+coroutine_fn nbd_co_send_block_status(NBDClient *client, uint64_t handle,
+                                      BlockBackend *blk, uint64_t offset,
+                                      uint32_t length, bool dont_fragment,
+                                      bool last, uint32_t context_id,
+                                      Error **errp)
 {
     int ret;
     unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
     g_autoptr(NBDExtentArray) ea = nbd_extent_array_new(nb_extents);
 
     if (context_id == NBD_META_ID_BASE_ALLOCATION) {
-        ret = blockstatus_to_extents(bs, offset, length, ea);
+        ret = blockstatus_to_extents(blk, offset, length, ea);
     } else {
-        ret = blockalloc_to_extents(bs, offset, length, ea);
+        ret = blockalloc_to_extents(blk, offset, length, ea);
     }
     if (ret < 0) {
         return nbd_co_send_structured_error(
@@ -2557,7 +2560,7 @@ static coroutine_fn int nbd_handle_request(NBDClient *client,
 
             if (client->export_meta.base_allocation) {
                 ret = nbd_co_send_block_status(client, request->handle,
-                                               blk_bs(exp->common.blk),
+                                               exp->common.blk,
                                                request->from,
                                                request->len, dont_fragment,
                                                !--contexts_remaining,
@@ -2570,7 +2573,7 @@ static coroutine_fn int nbd_handle_request(NBDClient *client,
 
             if (client->export_meta.allocation_depth) {
                 ret = nbd_co_send_block_status(client, request->handle,
-                                               blk_bs(exp->common.blk),
+                                               exp->common.blk,
                                                request->from, request->len,
                                                dont_fragment,
                                                !--contexts_remaining,
diff --git a/scripts/block-coroutine-wrapper.py b/scripts/block-coroutine-wrapper.py
index 08be813407..6e087fa0b7 100644
--- a/scripts/block-coroutine-wrapper.py
+++ b/scripts/block-coroutine-wrapper.py
@@ -2,7 +2,7 @@
 """Generate coroutine wrappers for block subsystem.
 
 The program parses one or several concatenated c files from stdin,
-searches for functions with the 'generated_co_wrapper' specifier
+searches for functions with the 'co_wrapper' specifier
 and generates corresponding wrappers on stdout.
 
 Usage: block-coroutine-wrapper.py generated-file.c FILE.[ch]...
@@ -62,10 +62,28 @@ class ParamDecl:
 
 
 class FuncDecl:
-    def __init__(self, return_type: str, name: str, args: str) -> None:
+    def __init__(self, return_type: str, name: str, args: str,
+                 variant: str) -> None:
         self.return_type = return_type.strip()
         self.name = name.strip()
+        self.struct_name = snake_to_camel(self.name)
         self.args = [ParamDecl(arg.strip()) for arg in args.split(',')]
+        self.create_only_co = 'mixed' not in variant
+        self.graph_rdlock = 'bdrv_rdlock' in variant
+
+        subsystem, subname = self.name.split('_', 1)
+        self.co_name = f'{subsystem}_co_{subname}'
+
+        t = self.args[0].type
+        if t == 'BlockDriverState *':
+            ctx = 'bdrv_get_aio_context(bs)'
+        elif t == 'BdrvChild *':
+            ctx = 'bdrv_get_aio_context(child->bs)'
+        elif t == 'BlockBackend *':
+            ctx = 'blk_get_aio_context(blk)'
+        else:
+            ctx = 'qemu_get_aio_context()'
+        self.ctx = ctx
 
     def gen_list(self, format: str) -> str:
         return ', '.join(format.format_map(arg.__dict__) for arg in self.args)
@@ -74,17 +92,20 @@ class FuncDecl:
         return '\n'.join(format.format_map(arg.__dict__) for arg in self.args)
 
 
-# Match wrappers declared with a generated_co_wrapper mark
-func_decl_re = re.compile(r'^int\s*generated_co_wrapper\s*'
+# Match wrappers declared with a co_wrapper mark
+func_decl_re = re.compile(r'^(?P<return_type>[a-zA-Z][a-zA-Z0-9_]* [\*]?)'
+                          r'\s*co_wrapper'
+                          r'(?P<variant>(_[a-z][a-z0-9_]*)?)\s*'
                           r'(?P<wrapper_name>[a-z][a-z0-9_]*)'
                           r'\((?P<args>[^)]*)\);$', re.MULTILINE)
 
 
 def func_decl_iter(text: str) -> Iterator:
     for m in func_decl_re.finditer(text):
-        yield FuncDecl(return_type='int',
+        yield FuncDecl(return_type=m.group('return_type'),
                        name=m.group('wrapper_name'),
-                       args=m.group('args'))
+                       args=m.group('args'),
+                       variant=m.group('variant'))
 
 
 def snake_to_camel(func_name: str) -> str:
@@ -97,24 +118,75 @@ def snake_to_camel(func_name: str) -> str:
     return ''.join(words)
 
 
+def create_mixed_wrapper(func: FuncDecl) -> str:
+    """
+    Checks if we are already in coroutine
+    """
+    name = func.co_name
+    struct_name = func.struct_name
+    graph_assume_lock = 'assume_graph_lock();' if func.graph_rdlock else ''
+
+    return f"""\
+{func.return_type} {func.name}({ func.gen_list('{decl}') })
+{{
+    if (qemu_in_coroutine()) {{
+        {graph_assume_lock}
+        return {name}({ func.gen_list('{name}') });
+    }} else {{
+        {struct_name} s = {{
+            .poll_state.ctx = {func.ctx},
+            .poll_state.in_progress = true,
+
+{ func.gen_block('            .{name} = {name},') }
+        }};
+
+        s.poll_state.co = qemu_coroutine_create({name}_entry, &s);
+
+        bdrv_poll_co(&s.poll_state);
+        return s.ret;
+    }}
+}}"""
+
+
+def create_co_wrapper(func: FuncDecl) -> str:
+    """
+    Assumes we are not in coroutine, and creates one
+    """
+    name = func.co_name
+    struct_name = func.struct_name
+    return f"""\
+{func.return_type} {func.name}({ func.gen_list('{decl}') })
+{{
+    {struct_name} s = {{
+        .poll_state.ctx = {func.ctx},
+        .poll_state.in_progress = true,
+
+{ func.gen_block('        .{name} = {name},') }
+    }};
+    assert(!qemu_in_coroutine());
+
+    s.poll_state.co = qemu_coroutine_create({name}_entry, &s);
+
+    bdrv_poll_co(&s.poll_state);
+    return s.ret;
+}}"""
+
+
 def gen_wrapper(func: FuncDecl) -> str:
     assert not '_co_' in func.name
-    assert func.return_type == 'int'
-    assert func.args[0].type in ['BlockDriverState *', 'BdrvChild *',
-                                 'BlockBackend *']
 
-    subsystem, subname = func.name.split('_', 1)
+    name = func.co_name
+    struct_name = func.struct_name
 
-    name = f'{subsystem}_co_{subname}'
+    graph_lock=''
+    graph_unlock=''
+    if func.graph_rdlock:
+        graph_lock='    bdrv_graph_co_rdlock();'
+        graph_unlock='    bdrv_graph_co_rdunlock();'
 
-    t = func.args[0].type
-    if t == 'BlockDriverState *':
-        bs = 'bs'
-    elif t == 'BdrvChild *':
-        bs = 'child->bs'
-    else:
-        bs = 'blk_bs(blk)'
-    struct_name = snake_to_camel(name)
+    creation_function = create_mixed_wrapper
+    if func.create_only_co:
+        creation_function = create_co_wrapper
 
     return f"""\
 /*
@@ -123,6 +195,7 @@ def gen_wrapper(func: FuncDecl) -> str:
 
 typedef struct {struct_name} {{
     BdrvPollCo poll_state;
+    {func.return_type} ret;
 { func.gen_block('    {decl};') }
 }} {struct_name};
 
@@ -130,29 +203,15 @@ static void coroutine_fn {name}_entry(void *opaque)
 {{
     {struct_name} *s = opaque;
 
-    s->poll_state.ret = {name}({ func.gen_list('s->{name}') });
+{graph_lock}
+    s->ret = {name}({ func.gen_list('s->{name}') });
+{graph_unlock}
     s->poll_state.in_progress = false;
 
     aio_wait_kick();
 }}
 
-int {func.name}({ func.gen_list('{decl}') })
-{{
-    if (qemu_in_coroutine()) {{
-        return {name}({ func.gen_list('{name}') });
-    }} else {{
-        {struct_name} s = {{
-            .poll_state.bs = {bs},
-            .poll_state.in_progress = true,
-
-{ func.gen_block('            .{name} = {name},') }
-        }};
-
-        s.poll_state.co = qemu_coroutine_create({name}_entry, &s);
-
-        return bdrv_poll_co(&s.poll_state);
-    }}
-}}"""
+{creation_function(func)}"""
 
 
 def gen_wrappers(input_code: str) -> str:
diff --git a/stubs/graph-lock.c b/stubs/graph-lock.c
new file mode 100644
index 0000000000..177aa0a8ba
--- /dev/null
+++ b/stubs/graph-lock.c
@@ -0,0 +1,10 @@
+#include "qemu/osdep.h"
+#include "block/graph-lock.h"
+
+void register_aiocontext(AioContext *ctx)
+{
+}
+
+void unregister_aiocontext(AioContext *ctx)
+{
+}
diff --git a/stubs/meson.build b/stubs/meson.build
index c96a74f095..981585cbdf 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -13,6 +13,7 @@ stub_ss.add(files('error-printf.c'))
 stub_ss.add(files('fdset.c'))
 stub_ss.add(files('gdbstub.c'))
 stub_ss.add(files('get-vm-name.c'))
+stub_ss.add(files('graph-lock.c'))
 if linux_io_uring.found()
   stub_ss.add(files('io_uring.c'))
 endif
diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c
index 09dc4a4891..8cedea4959 100644
--- a/tests/unit/test-bdrv-drain.c
+++ b/tests/unit/test-bdrv-drain.c
@@ -38,16 +38,26 @@ typedef struct BDRVTestState {
     bool sleep_in_drain_begin;
 } BDRVTestState;
 
-static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
+static void coroutine_fn sleep_in_drain_begin(void *opaque)
+{
+    BlockDriverState *bs = opaque;
+
+    qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+    bdrv_dec_in_flight(bs);
+}
+
+static void bdrv_test_drain_begin(BlockDriverState *bs)
 {
     BDRVTestState *s = bs->opaque;
     s->drain_count++;
     if (s->sleep_in_drain_begin) {
-        qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+        Coroutine *co = qemu_coroutine_create(sleep_in_drain_begin, bs);
+        bdrv_inc_in_flight(bs);
+        aio_co_enter(bdrv_get_aio_context(bs), co);
     }
 }
 
-static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
+static void bdrv_test_drain_end(BlockDriverState *bs)
 {
     BDRVTestState *s = bs->opaque;
     s->drain_count--;
@@ -101,8 +111,8 @@ static BlockDriver bdrv_test = {
     .bdrv_close             = bdrv_test_close,
     .bdrv_co_preadv         = bdrv_test_co_preadv,
 
-    .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
-    .bdrv_co_drain_end      = bdrv_test_co_drain_end,
+    .bdrv_drain_begin       = bdrv_test_drain_begin,
+    .bdrv_drain_end         = bdrv_test_drain_end,
 
     .bdrv_child_perm        = bdrv_default_perms,
 
@@ -146,7 +156,6 @@ static void call_in_coroutine(void (*entry)(void))
 enum drain_type {
     BDRV_DRAIN_ALL,
     BDRV_DRAIN,
-    BDRV_SUBTREE_DRAIN,
     DRAIN_TYPE_MAX,
 };
 
@@ -155,7 +164,6 @@ static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
     switch (drain_type) {
     case BDRV_DRAIN_ALL:        bdrv_drain_all_begin(); break;
     case BDRV_DRAIN:            bdrv_drained_begin(bs); break;
-    case BDRV_SUBTREE_DRAIN:    bdrv_subtree_drained_begin(bs); break;
     default:                    g_assert_not_reached();
     }
 }
@@ -165,7 +173,6 @@ static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
     switch (drain_type) {
     case BDRV_DRAIN_ALL:        bdrv_drain_all_end(); break;
     case BDRV_DRAIN:            bdrv_drained_end(bs); break;
-    case BDRV_SUBTREE_DRAIN:    bdrv_subtree_drained_end(bs); break;
     default:                    g_assert_not_reached();
     }
 }
@@ -261,11 +268,6 @@ static void test_drv_cb_drain(void)
     test_drv_cb_common(BDRV_DRAIN, false);
 }
 
-static void test_drv_cb_drain_subtree(void)
-{
-    test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
-}
-
 static void test_drv_cb_co_drain_all(void)
 {
     call_in_coroutine(test_drv_cb_drain_all);
@@ -276,11 +278,6 @@ static void test_drv_cb_co_drain(void)
     call_in_coroutine(test_drv_cb_drain);
 }
 
-static void test_drv_cb_co_drain_subtree(void)
-{
-    call_in_coroutine(test_drv_cb_drain_subtree);
-}
-
 static void test_quiesce_common(enum drain_type drain_type, bool recursive)
 {
     BlockBackend *blk;
@@ -299,7 +296,11 @@ static void test_quiesce_common(enum drain_type drain_type, bool recursive)
 
     do_drain_begin(drain_type, bs);
 
-    g_assert_cmpint(bs->quiesce_counter, ==, 1);
+    if (drain_type == BDRV_DRAIN_ALL) {
+        g_assert_cmpint(bs->quiesce_counter, ==, 2);
+    } else {
+        g_assert_cmpint(bs->quiesce_counter, ==, 1);
+    }
     g_assert_cmpint(backing->quiesce_counter, ==, !!recursive);
 
     do_drain_end(drain_type, bs);
@@ -322,11 +323,6 @@ static void test_quiesce_drain(void)
     test_quiesce_common(BDRV_DRAIN, false);
 }
 
-static void test_quiesce_drain_subtree(void)
-{
-    test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
-}
-
 static void test_quiesce_co_drain_all(void)
 {
     call_in_coroutine(test_quiesce_drain_all);
@@ -337,11 +333,6 @@ static void test_quiesce_co_drain(void)
     call_in_coroutine(test_quiesce_drain);
 }
 
-static void test_quiesce_co_drain_subtree(void)
-{
-    call_in_coroutine(test_quiesce_drain_subtree);
-}
-
 static void test_nested(void)
 {
     BlockBackend *blk;
@@ -361,8 +352,8 @@ static void test_nested(void)
 
     for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
         for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
-            int backing_quiesce = (outer != BDRV_DRAIN) +
-                                  (inner != BDRV_DRAIN);
+            int backing_quiesce = (outer == BDRV_DRAIN_ALL) +
+                                  (inner == BDRV_DRAIN_ALL);
 
             g_assert_cmpint(bs->quiesce_counter, ==, 0);
             g_assert_cmpint(backing->quiesce_counter, ==, 0);
@@ -372,10 +363,10 @@ static void test_nested(void)
             do_drain_begin(outer, bs);
             do_drain_begin(inner, bs);
 
-            g_assert_cmpint(bs->quiesce_counter, ==, 2);
+            g_assert_cmpint(bs->quiesce_counter, ==, 2 + !!backing_quiesce);
             g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
-            g_assert_cmpint(s->drain_count, ==, 2);
-            g_assert_cmpint(backing_s->drain_count, ==, backing_quiesce);
+            g_assert_cmpint(s->drain_count, ==, 1);
+            g_assert_cmpint(backing_s->drain_count, ==, !!backing_quiesce);
 
             do_drain_end(inner, bs);
             do_drain_end(outer, bs);
@@ -392,158 +383,6 @@ static void test_nested(void)
     blk_unref(blk);
 }
 
-static void test_multiparent(void)
-{
-    BlockBackend *blk_a, *blk_b;
-    BlockDriverState *bs_a, *bs_b, *backing;
-    BDRVTestState *a_s, *b_s, *backing_s;
-
-    blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
-    bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
-                                &error_abort);
-    a_s = bs_a->opaque;
-    blk_insert_bs(blk_a, bs_a, &error_abort);
-
-    blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
-    bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
-                                &error_abort);
-    b_s = bs_b->opaque;
-    blk_insert_bs(blk_b, bs_b, &error_abort);
-
-    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
-    backing_s = backing->opaque;
-    bdrv_set_backing_hd(bs_a, backing, &error_abort);
-    bdrv_set_backing_hd(bs_b, backing, &error_abort);
-
-    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
-    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
-    g_assert_cmpint(backing->quiesce_counter, ==, 0);
-    g_assert_cmpint(a_s->drain_count, ==, 0);
-    g_assert_cmpint(b_s->drain_count, ==, 0);
-    g_assert_cmpint(backing_s->drain_count, ==, 0);
-
-    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
-
-    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
-    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
-    g_assert_cmpint(backing->quiesce_counter, ==, 1);
-    g_assert_cmpint(a_s->drain_count, ==, 1);
-    g_assert_cmpint(b_s->drain_count, ==, 1);
-    g_assert_cmpint(backing_s->drain_count, ==, 1);
-
-    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
-
-    g_assert_cmpint(bs_a->quiesce_counter, ==, 2);
-    g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
-    g_assert_cmpint(backing->quiesce_counter, ==, 2);
-    g_assert_cmpint(a_s->drain_count, ==, 2);
-    g_assert_cmpint(b_s->drain_count, ==, 2);
-    g_assert_cmpint(backing_s->drain_count, ==, 2);
-
-    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
-
-    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
-    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
-    g_assert_cmpint(backing->quiesce_counter, ==, 1);
-    g_assert_cmpint(a_s->drain_count, ==, 1);
-    g_assert_cmpint(b_s->drain_count, ==, 1);
-    g_assert_cmpint(backing_s->drain_count, ==, 1);
-
-    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
-
-    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
-    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
-    g_assert_cmpint(backing->quiesce_counter, ==, 0);
-    g_assert_cmpint(a_s->drain_count, ==, 0);
-    g_assert_cmpint(b_s->drain_count, ==, 0);
-    g_assert_cmpint(backing_s->drain_count, ==, 0);
-
-    bdrv_unref(backing);
-    bdrv_unref(bs_a);
-    bdrv_unref(bs_b);
-    blk_unref(blk_a);
-    blk_unref(blk_b);
-}
-
-static void test_graph_change_drain_subtree(void)
-{
-    BlockBackend *blk_a, *blk_b;
-    BlockDriverState *bs_a, *bs_b, *backing;
-    BDRVTestState *a_s, *b_s, *backing_s;
-
-    blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
-    bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
-                                &error_abort);
-    a_s = bs_a->opaque;
-    blk_insert_bs(blk_a, bs_a, &error_abort);
-
-    blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
-    bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
-                                &error_abort);
-    b_s = bs_b->opaque;
-    blk_insert_bs(blk_b, bs_b, &error_abort);
-
-    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
-    backing_s = backing->opaque;
-    bdrv_set_backing_hd(bs_a, backing, &error_abort);
-
-    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
-    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
-    g_assert_cmpint(backing->quiesce_counter, ==, 0);
-    g_assert_cmpint(a_s->drain_count, ==, 0);
-    g_assert_cmpint(b_s->drain_count, ==, 0);
-    g_assert_cmpint(backing_s->drain_count, ==, 0);
-
-    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
-    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
-    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
-    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
-    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
-
-    bdrv_set_backing_hd(bs_b, backing, &error_abort);
-    g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
-    g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
-    g_assert_cmpint(backing->quiesce_counter, ==, 5);
-    g_assert_cmpint(a_s->drain_count, ==, 5);
-    g_assert_cmpint(b_s->drain_count, ==, 5);
-    g_assert_cmpint(backing_s->drain_count, ==, 5);
-
-    bdrv_set_backing_hd(bs_b, NULL, &error_abort);
-    g_assert_cmpint(bs_a->quiesce_counter, ==, 3);
-    g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
-    g_assert_cmpint(backing->quiesce_counter, ==, 3);
-    g_assert_cmpint(a_s->drain_count, ==, 3);
-    g_assert_cmpint(b_s->drain_count, ==, 2);
-    g_assert_cmpint(backing_s->drain_count, ==, 3);
-
-    bdrv_set_backing_hd(bs_b, backing, &error_abort);
-    g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
-    g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
-    g_assert_cmpint(backing->quiesce_counter, ==, 5);
-    g_assert_cmpint(a_s->drain_count, ==, 5);
-    g_assert_cmpint(b_s->drain_count, ==, 5);
-    g_assert_cmpint(backing_s->drain_count, ==, 5);
-
-    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
-    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
-    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
-    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
-    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
-
-    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
-    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
-    g_assert_cmpint(backing->quiesce_counter, ==, 0);
-    g_assert_cmpint(a_s->drain_count, ==, 0);
-    g_assert_cmpint(b_s->drain_count, ==, 0);
-    g_assert_cmpint(backing_s->drain_count, ==, 0);
-
-    bdrv_unref(backing);
-    bdrv_unref(bs_a);
-    bdrv_unref(bs_b);
-    blk_unref(blk_a);
-    blk_unref(blk_b);
-}
-
 static void test_graph_change_drain_all(void)
 {
     BlockBackend *blk_a, *blk_b;
@@ -763,12 +602,6 @@ static void test_iothread_drain(void)
     test_iothread_common(BDRV_DRAIN, 1);
 }
 
-static void test_iothread_drain_subtree(void)
-{
-    test_iothread_common(BDRV_SUBTREE_DRAIN, 0);
-    test_iothread_common(BDRV_SUBTREE_DRAIN, 1);
-}
-
 
 typedef struct TestBlockJob {
     BlockJob common;
@@ -853,7 +686,6 @@ enum test_job_result {
 enum test_job_drain_node {
     TEST_JOB_DRAIN_SRC,
     TEST_JOB_DRAIN_SRC_CHILD,
-    TEST_JOB_DRAIN_SRC_PARENT,
 };
 
 static void test_blockjob_common_drain_node(enum drain_type drain_type,
@@ -891,9 +723,6 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
     case TEST_JOB_DRAIN_SRC_CHILD:
         drain_bs = src_backing;
         break;
-    case TEST_JOB_DRAIN_SRC_PARENT:
-        drain_bs = src_overlay;
-        break;
     default:
         g_assert_not_reached();
     }
@@ -1045,10 +874,6 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread,
                                     TEST_JOB_DRAIN_SRC);
     test_blockjob_common_drain_node(drain_type, use_iothread, result,
                                     TEST_JOB_DRAIN_SRC_CHILD);
-    if (drain_type == BDRV_SUBTREE_DRAIN) {
-        test_blockjob_common_drain_node(drain_type, use_iothread, result,
-                                        TEST_JOB_DRAIN_SRC_PARENT);
-    }
 }
 
 static void test_blockjob_drain_all(void)
@@ -1061,11 +886,6 @@ static void test_blockjob_drain(void)
     test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_SUCCESS);
 }
 
-static void test_blockjob_drain_subtree(void)
-{
-    test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_SUCCESS);
-}
-
 static void test_blockjob_error_drain_all(void)
 {
     test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_FAIL_RUN);
@@ -1078,12 +898,6 @@ static void test_blockjob_error_drain(void)
     test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_FAIL_PREPARE);
 }
 
-static void test_blockjob_error_drain_subtree(void)
-{
-    test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_FAIL_RUN);
-    test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_FAIL_PREPARE);
-}
-
 static void test_blockjob_iothread_drain_all(void)
 {
     test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_SUCCESS);
@@ -1094,11 +908,6 @@ static void test_blockjob_iothread_drain(void)
     test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_SUCCESS);
 }
 
-static void test_blockjob_iothread_drain_subtree(void)
-{
-    test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_SUCCESS);
-}
-
 static void test_blockjob_iothread_error_drain_all(void)
 {
     test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_FAIL_RUN);
@@ -1111,12 +920,6 @@ static void test_blockjob_iothread_error_drain(void)
     test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_FAIL_PREPARE);
 }
 
-static void test_blockjob_iothread_error_drain_subtree(void)
-{
-    test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_FAIL_RUN);
-    test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_FAIL_PREPARE);
-}
-
 
 typedef struct BDRVTestTopState {
     BdrvChild *wait_child;
@@ -1263,14 +1066,6 @@ static void do_test_delete_by_drain(bool detach_instead_of_delete,
         bdrv_drain(child_bs);
         bdrv_unref(child_bs);
         break;
-    case BDRV_SUBTREE_DRAIN:
-        /* Would have to ref/unref bs here for !detach_instead_of_delete, but
-         * then the whole test becomes pointless because the graph changes
-         * don't occur during the drain any more. */
-        assert(detach_instead_of_delete);
-        bdrv_subtree_drained_begin(bs);
-        bdrv_subtree_drained_end(bs);
-        break;
     case BDRV_DRAIN_ALL:
         bdrv_drain_all_begin();
         bdrv_drain_all_end();
@@ -1305,11 +1100,6 @@ static void test_detach_by_drain(void)
     do_test_delete_by_drain(true, BDRV_DRAIN);
 }
 
-static void test_detach_by_drain_subtree(void)
-{
-    do_test_delete_by_drain(true, BDRV_SUBTREE_DRAIN);
-}
-
 
 struct detach_by_parent_data {
     BlockDriverState *parent_b;
@@ -1317,6 +1107,7 @@ struct detach_by_parent_data {
     BlockDriverState *c;
     BdrvChild *child_c;
     bool by_parent_cb;
+    bool detach_on_drain;
 };
 static struct detach_by_parent_data detach_by_parent_data;
 
@@ -1324,6 +1115,7 @@ static void detach_indirect_bh(void *opaque)
 {
     struct detach_by_parent_data *data = opaque;
 
+    bdrv_dec_in_flight(data->child_b->bs);
     bdrv_unref_child(data->parent_b, data->child_b);
 
     bdrv_ref(data->c);
@@ -1338,12 +1130,21 @@ static void detach_by_parent_aio_cb(void *opaque, int ret)
 
     g_assert_cmpint(ret, ==, 0);
     if (data->by_parent_cb) {
+        bdrv_inc_in_flight(data->child_b->bs);
         detach_indirect_bh(data);
     }
 }
 
 static void detach_by_driver_cb_drained_begin(BdrvChild *child)
 {
+    struct detach_by_parent_data *data = &detach_by_parent_data;
+
+    if (!data->detach_on_drain) {
+        return;
+    }
+    data->detach_on_drain = false;
+
+    bdrv_inc_in_flight(data->child_b->bs);
     aio_bh_schedule_oneshot(qemu_get_current_aio_context(),
                             detach_indirect_bh, &detach_by_parent_data);
     child_of_bds.drained_begin(child);
@@ -1384,8 +1185,14 @@ static void test_detach_indirect(bool by_parent_cb)
         detach_by_driver_cb_class = child_of_bds;
         detach_by_driver_cb_class.drained_begin =
             detach_by_driver_cb_drained_begin;
+        detach_by_driver_cb_class.drained_end = NULL;
+        detach_by_driver_cb_class.drained_poll = NULL;
     }
 
+    detach_by_parent_data = (struct detach_by_parent_data) {
+        .detach_on_drain = false,
+    };
+
     /* Create all involved nodes */
     parent_a = bdrv_new_open_driver(&bdrv_test, "parent-a", BDRV_O_RDWR,
                                     &error_abort);
@@ -1437,12 +1244,16 @@ static void test_detach_indirect(bool by_parent_cb)
         .child_b = child_b,
         .c = c,
         .by_parent_cb = by_parent_cb,
+        .detach_on_drain = true,
     };
     acb = blk_aio_preadv(blk, 0, &qiov, 0, detach_by_parent_aio_cb, NULL);
     g_assert(acb != NULL);
 
     /* Drain and check the expected result */
-    bdrv_subtree_drained_begin(parent_b);
+    bdrv_drained_begin(parent_b);
+    bdrv_drained_begin(a);
+    bdrv_drained_begin(b);
+    bdrv_drained_begin(c);
 
     g_assert(detach_by_parent_data.child_c != NULL);
 
@@ -1457,12 +1268,15 @@ static void test_detach_indirect(bool by_parent_cb)
     g_assert(QLIST_NEXT(child_a, next) == NULL);
 
     g_assert_cmpint(parent_a->quiesce_counter, ==, 1);
-    g_assert_cmpint(parent_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(parent_b->quiesce_counter, ==, 3);
     g_assert_cmpint(a->quiesce_counter, ==, 1);
-    g_assert_cmpint(b->quiesce_counter, ==, 0);
+    g_assert_cmpint(b->quiesce_counter, ==, 1);
     g_assert_cmpint(c->quiesce_counter, ==, 1);
 
-    bdrv_subtree_drained_end(parent_b);
+    bdrv_drained_end(parent_b);
+    bdrv_drained_end(a);
+    bdrv_drained_end(b);
+    bdrv_drained_end(c);
 
     bdrv_unref(parent_b);
     blk_unref(blk);
@@ -1693,6 +1507,7 @@ static void test_blockjob_commit_by_drained_end(void)
     bdrv_drained_begin(bs_child);
     g_assert(!job_has_completed);
     bdrv_drained_end(bs_child);
+    aio_poll(qemu_get_aio_context(), false);
     g_assert(job_has_completed);
 
     bdrv_unref(bs_parents[0]);
@@ -1848,6 +1663,7 @@ static void test_drop_intermediate_poll(void)
 
     g_assert(!job_has_completed);
     ret = bdrv_drop_intermediate(chain[1], chain[0], NULL);
+    aio_poll(qemu_get_aio_context(), false);
     g_assert(ret == 0);
     g_assert(job_has_completed);
 
@@ -1856,6 +1672,7 @@ static void test_drop_intermediate_poll(void)
 
 
 typedef struct BDRVReplaceTestState {
+    bool setup_completed;
     bool was_drained;
     bool was_undrained;
     bool has_read;
@@ -1916,52 +1733,78 @@ static int coroutine_fn bdrv_replace_test_co_preadv(BlockDriverState *bs,
     return 0;
 }
 
+static void coroutine_fn bdrv_replace_test_drain_co(void *opaque)
+{
+    BlockDriverState *bs = opaque;
+    BDRVReplaceTestState *s = bs->opaque;
+
+    /* Keep waking io_co up until it is done */
+    while (s->io_co) {
+        aio_co_wake(s->io_co);
+        s->io_co = NULL;
+        qemu_coroutine_yield();
+    }
+    s->drain_co = NULL;
+    bdrv_dec_in_flight(bs);
+}
+
 /**
  * If .drain_count is 0, wake up .io_co if there is one; and set
  * .was_drained.
  * Increment .drain_count.
  */
-static void coroutine_fn bdrv_replace_test_co_drain_begin(BlockDriverState *bs)
+static void bdrv_replace_test_drain_begin(BlockDriverState *bs)
 {
     BDRVReplaceTestState *s = bs->opaque;
 
-    if (!s->drain_count) {
-        /* Keep waking io_co up until it is done */
-        s->drain_co = qemu_coroutine_self();
-        while (s->io_co) {
-            aio_co_wake(s->io_co);
-            s->io_co = NULL;
-            qemu_coroutine_yield();
-        }
-        s->drain_co = NULL;
+    if (!s->setup_completed) {
+        return;
+    }
 
+    if (!s->drain_count) {
+        s->drain_co = qemu_coroutine_create(bdrv_replace_test_drain_co, bs);
+        bdrv_inc_in_flight(bs);
+        aio_co_enter(bdrv_get_aio_context(bs), s->drain_co);
         s->was_drained = true;
     }
     s->drain_count++;
 }
 
+static void coroutine_fn bdrv_replace_test_read_entry(void *opaque)
+{
+    BlockDriverState *bs = opaque;
+    char data;
+    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, &data, 1);
+    int ret;
+
+    /* Queue a read request post-drain */
+    ret = bdrv_replace_test_co_preadv(bs, 0, 1, &qiov, 0);
+    g_assert(ret >= 0);
+    bdrv_dec_in_flight(bs);
+}
+
 /**
  * Reduce .drain_count, set .was_undrained once it reaches 0.
  * If .drain_count reaches 0 and the node has a backing file, issue a
  * read request.
  */
-static void coroutine_fn bdrv_replace_test_co_drain_end(BlockDriverState *bs)
+static void bdrv_replace_test_drain_end(BlockDriverState *bs)
 {
     BDRVReplaceTestState *s = bs->opaque;
 
+    if (!s->setup_completed) {
+        return;
+    }
+
     g_assert(s->drain_count > 0);
     if (!--s->drain_count) {
-        int ret;
-
         s->was_undrained = true;
 
         if (bs->backing) {
-            char data;
-            QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, &data, 1);
-
-            /* Queue a read request post-drain */
-            ret = bdrv_replace_test_co_preadv(bs, 0, 1, &qiov, 0);
-            g_assert(ret >= 0);
+            Coroutine *co = qemu_coroutine_create(bdrv_replace_test_read_entry,
+                                                  bs);
+            bdrv_inc_in_flight(bs);
+            aio_co_enter(bdrv_get_aio_context(bs), co);
         }
     }
 }
@@ -1974,8 +1817,8 @@ static BlockDriver bdrv_replace_test = {
     .bdrv_close             = bdrv_replace_test_close,
     .bdrv_co_preadv         = bdrv_replace_test_co_preadv,
 
-    .bdrv_co_drain_begin    = bdrv_replace_test_co_drain_begin,
-    .bdrv_co_drain_end      = bdrv_replace_test_co_drain_end,
+    .bdrv_drain_begin       = bdrv_replace_test_drain_begin,
+    .bdrv_drain_end         = bdrv_replace_test_drain_end,
 
     .bdrv_child_perm        = bdrv_default_perms,
 };
@@ -2051,6 +1894,7 @@ static void do_test_replace_child_mid_drain(int old_drain_count,
     bdrv_ref(old_child_bs);
     bdrv_attach_child(parent_bs, old_child_bs, "child", &child_of_bds,
                       BDRV_CHILD_COW, &error_abort);
+    parent_s->setup_completed = true;
 
     for (i = 0; i < old_drain_count; i++) {
         bdrv_drained_begin(old_child_bs);
@@ -2172,70 +2016,47 @@ int main(int argc, char **argv)
 
     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
     g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
-    g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
-                    test_drv_cb_drain_subtree);
 
     g_test_add_func("/bdrv-drain/driver-cb/co/drain_all",
                     test_drv_cb_co_drain_all);
     g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
-    g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
-                    test_drv_cb_co_drain_subtree);
-
 
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
-    g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
-                    test_quiesce_drain_subtree);
 
     g_test_add_func("/bdrv-drain/quiesce/co/drain_all",
                     test_quiesce_co_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
-    g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
-                    test_quiesce_co_drain_subtree);
 
     g_test_add_func("/bdrv-drain/nested", test_nested);
-    g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
 
-    g_test_add_func("/bdrv-drain/graph-change/drain_subtree",
-                    test_graph_change_drain_subtree);
     g_test_add_func("/bdrv-drain/graph-change/drain_all",
                     test_graph_change_drain_all);
 
     g_test_add_func("/bdrv-drain/iothread/drain_all", test_iothread_drain_all);
     g_test_add_func("/bdrv-drain/iothread/drain", test_iothread_drain);
-    g_test_add_func("/bdrv-drain/iothread/drain_subtree",
-                    test_iothread_drain_subtree);
 
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
-    g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
-                    test_blockjob_drain_subtree);
 
     g_test_add_func("/bdrv-drain/blockjob/error/drain_all",
                     test_blockjob_error_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/error/drain",
                     test_blockjob_error_drain);
-    g_test_add_func("/bdrv-drain/blockjob/error/drain_subtree",
-                    test_blockjob_error_drain_subtree);
 
     g_test_add_func("/bdrv-drain/blockjob/iothread/drain_all",
                     test_blockjob_iothread_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/iothread/drain",
                     test_blockjob_iothread_drain);
-    g_test_add_func("/bdrv-drain/blockjob/iothread/drain_subtree",
-                    test_blockjob_iothread_drain_subtree);
 
     g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain_all",
                     test_blockjob_iothread_error_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain",
                     test_blockjob_iothread_error_drain);
-    g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain_subtree",
-                    test_blockjob_iothread_error_drain_subtree);
 
     g_test_add_func("/bdrv-drain/deletion/drain", test_delete_by_drain);
     g_test_add_func("/bdrv-drain/detach/drain_all", test_detach_by_drain_all);
     g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain);
-    g_test_add_func("/bdrv-drain/detach/drain_subtree", test_detach_by_drain_subtree);
     g_test_add_func("/bdrv-drain/detach/parent_cb", test_detach_by_parent_cb);
     g_test_add_func("/bdrv-drain/detach/driver_cb", test_detach_by_driver_cb);
 
diff --git a/util/async.c b/util/async.c
index 63434ddae4..14d63b3091 100644
--- a/util/async.c
+++ b/util/async.c
@@ -27,6 +27,7 @@
 #include "qapi/error.h"
 #include "block/aio.h"
 #include "block/thread-pool.h"
+#include "block/graph-lock.h"
 #include "qemu/main-loop.h"
 #include "qemu/atomic.h"
 #include "qemu/rcu_queue.h"
@@ -376,6 +377,7 @@ aio_ctx_finalize(GSource     *source)
     qemu_rec_mutex_destroy(&ctx->lock);
     qemu_lockcnt_destroy(&ctx->list_lock);
     timerlistgroup_deinit(&ctx->tlg);
+    unregister_aiocontext(ctx);
     aio_context_destroy(ctx);
 }
 
@@ -574,6 +576,8 @@ AioContext *aio_context_new(Error **errp)
     ctx->thread_pool_min = 0;
     ctx->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT;
 
+    register_aiocontext(ctx);
+
     return ctx;
 fail:
     g_source_destroy(&ctx->source);