From f1bf3be14bd5d6e6a2cfbbe64cdd4d58a8595d68 Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:38 +0200 Subject: block: remove outdated comments about AioContext locking AioContext locking was removed in commit b49f4755c7 ("block: remove AioContext locking"). Signed-off-by: Fiona Ebner Reviewed-by: Kevin Wolf Message-ID: <20250530151125.955508-2-f.ebner@proxmox.com> Signed-off-by: Kevin Wolf --- block.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index f222e1a50a..a5399888ba 100644 --- a/block.c +++ b/block.c @@ -4359,8 +4359,6 @@ bdrv_recurse_has_child(BlockDriverState *bs, BlockDriverState *child) * bs_queue, or the existing bs_queue being used. * * bs is drained here and undrained by bdrv_reopen_queue_free(). - * - * To be called with bs->aio_context locked. */ static BlockReopenQueue * GRAPH_RDLOCK bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockDriverState *bs, @@ -4519,7 +4517,6 @@ bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockDriverState *bs, return bs_queue; } -/* To be called with bs->aio_context locked */ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, BlockDriverState *bs, QDict *options, bool keep_old_opts) @@ -7278,10 +7275,6 @@ bool bdrv_op_blocker_is_empty(BlockDriverState *bs) return true; } -/* - * Must not be called while holding the lock of an AioContext other than the - * current one. - */ void bdrv_img_create(const char *filename, const char *fmt, const char *base_filename, const char *base_fmt, char *options, uint64_t img_size, int flags, bool quiet, -- cgit 1.4.1 From e1d681b3e1d8256047dbfc6d2c796028b9694eaf Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:39 +0200 Subject: block: move drain outside of read-locked bdrv_reopen_queue_child() This is in preparation to mark bdrv_drained_begin() as GRAPH_UNLOCKED. 
More granular draining is not trivially possible, because bdrv_reopen_queue_child() can recursively call itself. Signed-off-by: Fiona Ebner Reviewed-by: Kevin Wolf Message-ID: <20250530151125.955508-3-f.ebner@proxmox.com> Signed-off-by: Kevin Wolf --- block.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index a5399888ba..3065d5c91e 100644 --- a/block.c +++ b/block.c @@ -4358,7 +4358,7 @@ bdrv_recurse_has_child(BlockDriverState *bs, BlockDriverState *child) * returns a pointer to bs_queue, which is either the newly allocated * bs_queue, or the existing bs_queue being used. * - * bs is drained here and undrained by bdrv_reopen_queue_free(). + * bs must be drained. */ static BlockReopenQueue * GRAPH_RDLOCK bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockDriverState *bs, @@ -4377,12 +4377,7 @@ bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockDriverState *bs, GLOBAL_STATE_CODE(); - /* - * Strictly speaking, draining is illegal under GRAPH_RDLOCK. We know that - * we've been called with bdrv_graph_rdlock_main_loop(), though, so it's ok - * in practice. - */ - bdrv_drained_begin(bs); + assert(bs->quiesce_counter > 0); if (bs_queue == NULL) { bs_queue = g_new0(BlockReopenQueue, 1); @@ -4522,6 +4517,12 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, QDict *options, bool keep_old_opts) { GLOBAL_STATE_CODE(); + + if (bs_queue == NULL) { + /* Paired with bdrv_drain_all_end() in bdrv_reopen_queue_free(). 
*/ + bdrv_drain_all_begin(); + } + GRAPH_RDLOCK_GUARD_MAINLOOP(); return bdrv_reopen_queue_child(bs_queue, bs, options, NULL, 0, false, @@ -4534,12 +4535,14 @@ void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue) if (bs_queue) { BlockReopenQueueEntry *bs_entry, *next; QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { - bdrv_drained_end(bs_entry->state.bs); qobject_unref(bs_entry->state.explicit_options); qobject_unref(bs_entry->state.options); g_free(bs_entry); } g_free(bs_queue); + + /* Paired with bdrv_drain_all_begin() in bdrv_reopen_queue(). */ + bdrv_drain_all_end(); } } -- cgit 1.4.1 From 841998e08650f5b4476fa2d1eb84a592ab405f51 Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:41 +0200 Subject: block: move drain outside of read-locked bdrv_inactivate_recurse() This is in preparation to mark bdrv_drained_begin() as GRAPH_UNLOCKED. More granular draining is not trivially possible, because bdrv_inactivate_recurse() can recursively call itself. Signed-off-by: Fiona Ebner Reviewed-by: Kevin Wolf Message-ID: <20250530151125.955508-5-f.ebner@proxmox.com> Signed-off-by: Kevin Wolf --- block.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index 3065d5c91e..fa55dfba68 100644 --- a/block.c +++ b/block.c @@ -6989,6 +6989,8 @@ bdrv_inactivate_recurse(BlockDriverState *bs, bool top_level) GLOBAL_STATE_CODE(); + assert(bs->quiesce_counter > 0); + if (!bs->drv) { return -ENOMEDIUM; } @@ -7032,9 +7034,7 @@ bdrv_inactivate_recurse(BlockDriverState *bs, bool top_level) return -EPERM; } - bdrv_drained_begin(bs); bs->open_flags |= BDRV_O_INACTIVE; - bdrv_drained_end(bs); /* * Update permissions, they may differ for inactive nodes. 
@@ -7059,20 +7059,26 @@ int bdrv_inactivate(BlockDriverState *bs, Error **errp) int ret; GLOBAL_STATE_CODE(); - GRAPH_RDLOCK_GUARD_MAINLOOP(); + + bdrv_drain_all_begin(); + bdrv_graph_rdlock_main_loop(); if (bdrv_has_bds_parent(bs, true)) { error_setg(errp, "Node has active parent node"); - return -EPERM; + ret = -EPERM; + goto out; } ret = bdrv_inactivate_recurse(bs, true); if (ret < 0) { error_setg_errno(errp, -ret, "Failed to inactivate node"); - return ret; + goto out; } - return 0; +out: + bdrv_graph_rdunlock_main_loop(); + bdrv_drain_all_end(); + return ret; } int bdrv_inactivate_all(void) @@ -7082,7 +7088,9 @@ int bdrv_inactivate_all(void) int ret = 0; GLOBAL_STATE_CODE(); - GRAPH_RDLOCK_GUARD_MAINLOOP(); + + bdrv_drain_all_begin(); + bdrv_graph_rdlock_main_loop(); for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { /* Nodes with BDS parents are covered by recursion from the last @@ -7098,6 +7106,9 @@ int bdrv_inactivate_all(void) } } + bdrv_graph_rdunlock_main_loop(); + bdrv_drain_all_end(); + return ret; } -- cgit 1.4.1 From 3758733959af93b5eb3283659d868ad5b24152b4 Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:42 +0200 Subject: block: mark bdrv_parent_change_aio_context() GRAPH_RDLOCK This is a small step in preparation to mark bdrv_drained_begin() as GRAPH_UNLOCKED. More concretely, it allows marking the change_aio_ctx() callback GRAPH_RDLOCK_PTR, which is the next step. 
Signed-off-by: Fiona Ebner Reviewed-by: Kevin Wolf Message-ID: <20250530151125.955508-6-f.ebner@proxmox.com> Signed-off-by: Kevin Wolf --- block.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index fa55dfba68..7207978e53 100644 --- a/block.c +++ b/block.c @@ -7575,10 +7575,10 @@ typedef struct BdrvStateSetAioContext { BlockDriverState *bs; } BdrvStateSetAioContext; -static bool bdrv_parent_change_aio_context(BdrvChild *c, AioContext *ctx, - GHashTable *visited, - Transaction *tran, - Error **errp) +static bool GRAPH_RDLOCK +bdrv_parent_change_aio_context(BdrvChild *c, AioContext *ctx, + GHashTable *visited, Transaction *tran, + Error **errp) { GLOBAL_STATE_CODE(); if (g_hash_table_contains(visited, c)) { -- cgit 1.4.1 From 844d550d09ac29ff2b1b49069587ae6a989df31d Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:43 +0200 Subject: block: mark change_aio_ctx() callback and instances as GRAPH_RDLOCK(_PTR) This is a small step in preparation to mark bdrv_drained_begin() as GRAPH_UNLOCKED. More concretely, it is in preparation to move the drain out of bdrv_change_aio_context() and to mark that function as GRAPH_RDLOCK. 
Signed-off-by: Fiona Ebner Reviewed-by: Kevin Wolf Message-ID: <20250530151125.955508-7-f.ebner@proxmox.com> Signed-off-by: Kevin Wolf --- block.c | 7 ++++--- block/block-backend.c | 6 +++--- blockjob.c | 6 +++--- include/block/block_int-common.h | 6 +++--- 4 files changed, 13 insertions(+), 12 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index 7207978e53..01144c895e 100644 --- a/block.c +++ b/block.c @@ -1226,9 +1226,10 @@ static int bdrv_child_cb_inactivate(BdrvChild *child) return 0; } -static bool bdrv_child_cb_change_aio_ctx(BdrvChild *child, AioContext *ctx, - GHashTable *visited, Transaction *tran, - Error **errp) +static bool GRAPH_RDLOCK +bdrv_child_cb_change_aio_ctx(BdrvChild *child, AioContext *ctx, + GHashTable *visited, Transaction *tran, + Error **errp) { BlockDriverState *bs = child->opaque; return bdrv_change_aio_context(bs, ctx, visited, tran, errp); diff --git a/block/block-backend.c b/block/block-backend.c index a402db13f2..6a6949edeb 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -136,9 +136,9 @@ static void blk_root_drained_end(BdrvChild *child); static void blk_root_change_media(BdrvChild *child, bool load); static void blk_root_resize(BdrvChild *child); -static bool blk_root_change_aio_ctx(BdrvChild *child, AioContext *ctx, - GHashTable *visited, Transaction *tran, - Error **errp); +static bool GRAPH_RDLOCK +blk_root_change_aio_ctx(BdrvChild *child, AioContext *ctx, GHashTable *visited, + Transaction *tran, Error **errp); static char *blk_root_get_parent_desc(BdrvChild *child) { diff --git a/blockjob.c b/blockjob.c index 32007f31a9..34185d7715 100644 --- a/blockjob.c +++ b/blockjob.c @@ -144,9 +144,9 @@ static TransactionActionDrv change_child_job_context = { .clean = g_free, }; -static bool child_job_change_aio_ctx(BdrvChild *c, AioContext *ctx, - GHashTable *visited, Transaction *tran, - Error **errp) +static bool GRAPH_RDLOCK +child_job_change_aio_ctx(BdrvChild *c, AioContext *ctx, GHashTable 
*visited, + Transaction *tran, Error **errp) { BlockJob *job = c->opaque; BdrvStateChildJobContext *s; diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 2982dd3118..37466c7841 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -983,9 +983,9 @@ struct BdrvChildClass { bool backing_mask_protocol, Error **errp); - bool (*change_aio_ctx)(BdrvChild *child, AioContext *ctx, - GHashTable *visited, Transaction *tran, - Error **errp); + bool GRAPH_RDLOCK_PTR (*change_aio_ctx)(BdrvChild *child, AioContext *ctx, + GHashTable *visited, + Transaction *tran, Error **errp); /* * I/O API functions. These functions are thread-safe. -- cgit 1.4.1 From 91ba0e1c382bd4a4b9c6a200f8a175d6ff30ab99 Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:45 +0200 Subject: block: move drain outside of bdrv_change_aio_context() and mark GRAPH_RDLOCK This is in preparation to mark bdrv_drained_begin() as GRAPH_UNLOCKED. Note that even if bdrv_drained_begin() were already marked as GRAPH_UNLOCKED, TSA would not complain about the instance in bdrv_change_aio_context() before this change, because it is preceded by a bdrv_graph_rdunlock_main_loop() call. It is not correct to release the lock here, and in case the caller holds a write lock, it wouldn't actually release the lock. In combination with block-stream, there is a deadlock that can happen because of this [0]. In particular, it can happen that: 1. the main thread acquires the write lock; 2. the IO thread, in blk_co_do_preadv_part(), has a non-zero blk->in_flight; 3. the IO thread tries to acquire the read lock; 4. the main thread begins to drain. Steps 3 and 4 might be switched. Draining will poll and get stuck, because it will see the non-zero in_flight counter. But the IO thread will not make any progress either, because it cannot acquire the read lock. After this change, all paths to bdrv_change_aio_context() drain: bdrv_change_aio_context() is called by: 1. 
bdrv_child_cb_change_aio_ctx() which is only called via the change_aio_ctx() callback, see below. 2. bdrv_child_change_aio_context(), see below. 3. bdrv_try_change_aio_context(), where a drained section is introduced. The change_aio_ctx() callback is called by: 1. bdrv_attach_child_common_abort(), where a drained section is introduced. 2. bdrv_attach_child_common(), where a drained section is introduced. 3. bdrv_parent_change_aio_context(), see below. bdrv_child_change_aio_context() is called by: 1. bdrv_change_aio_context(), i.e. recursive, so being in a drained section is invariant. 2. child_job_change_aio_ctx(), which is only called via the change_aio_ctx() callback, see above. bdrv_parent_change_aio_context() is called by: 1. bdrv_change_aio_context(), i.e. recursive, so being in a drained section is invariant. This resolves all code paths. Note that bdrv_attach_child_common() and bdrv_attach_child_common_abort() hold the graph write lock and callers of bdrv_try_change_aio_context() might too, so they are not actually allowed to drain either. This will be addressed in the following commits. More granular draining is not trivially possible, because bdrv_change_aio_context() can recursively call itself e.g. via bdrv_child_change_aio_context(). 
[0]: https://lore.kernel.org/qemu-devel/73839c04-7616-407e-b057-80ca69e63f51@virtuozzo.com/ Reported-by: Andrey Drobyshev Signed-off-by: Fiona Ebner Reviewed-by: Kevin Wolf Message-ID: <20250530151125.955508-9-f.ebner@proxmox.com> Signed-off-by: Kevin Wolf --- block.c | 57 +++++++++++++++++++++++++++++----------- include/block/block_int-common.h | 12 +++++++++ 2 files changed, 53 insertions(+), 16 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index 01144c895e..6f42c0f1ab 100644 --- a/block.c +++ b/block.c @@ -106,9 +106,9 @@ static void bdrv_reopen_abort(BDRVReopenState *reopen_state); static bool bdrv_backing_overridden(BlockDriverState *bs); -static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, - GHashTable *visited, Transaction *tran, - Error **errp); +static bool GRAPH_RDLOCK +bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, + GHashTable *visited, Transaction *tran, Error **errp); /* If non-zero, use only whitelisted block drivers */ static int use_bdrv_whitelist; @@ -3040,8 +3040,10 @@ static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque) /* No need to visit `child`, because it has been detached already */ visited = g_hash_table_new(NULL, NULL); + bdrv_drain_all_begin(); ret = s->child->klass->change_aio_ctx(s->child, s->old_parent_ctx, visited, tran, &error_abort); + bdrv_drain_all_end(); g_hash_table_destroy(visited); /* transaction is supposed to always succeed */ @@ -3122,9 +3124,11 @@ bdrv_attach_child_common(BlockDriverState *child_bs, bool ret_child; g_hash_table_add(visited, new_child); + bdrv_drain_all_begin(); ret_child = child_class->change_aio_ctx(new_child, child_ctx, visited, aio_ctx_tran, NULL); + bdrv_drain_all_end(); if (ret_child == true) { error_free(local_err); ret = 0; @@ -7576,6 +7580,17 @@ typedef struct BdrvStateSetAioContext { BlockDriverState *bs; } BdrvStateSetAioContext; +/* + * Changes the AioContext of @child to @ctx and recursively for the associated + * 
block nodes and all their children and parents. Returns true if the change is + * possible and the transaction @tran can be continued. Returns false and sets + * @errp if not and the transaction must be aborted. + * + * @visited will accumulate all visited BdrvChild objects. The caller is + * responsible for freeing the list afterwards. + * + * Must be called with the affected block nodes drained. + */ static bool GRAPH_RDLOCK bdrv_parent_change_aio_context(BdrvChild *c, AioContext *ctx, GHashTable *visited, Transaction *tran, @@ -7604,6 +7619,17 @@ bdrv_parent_change_aio_context(BdrvChild *c, AioContext *ctx, return true; } +/* + * Changes the AioContext of @c->bs to @ctx and recursively for all its children + * and parents. Returns true if the change is possible and the transaction @tran + * can be continued. Returns false and sets @errp if not and the transaction + * must be aborted. + * + * @visited will accumulate all visited BdrvChild objects. The caller is + * responsible for freeing the list afterwards. + * + * Must be called with the affected block nodes drained. + */ bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx, GHashTable *visited, Transaction *tran, Error **errp) @@ -7619,10 +7645,6 @@ bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx, static void bdrv_set_aio_context_clean(void *opaque) { BdrvStateSetAioContext *state = (BdrvStateSetAioContext *) opaque; - BlockDriverState *bs = (BlockDriverState *) state->bs; - - /* Paired with bdrv_drained_begin in bdrv_change_aio_context() */ - bdrv_drained_end(bs); g_free(state); } @@ -7650,10 +7672,12 @@ static TransactionActionDrv set_aio_context = { * * @visited will accumulate all visited BdrvChild objects. The caller is * responsible for freeing the list afterwards. + * + * @bs must be drained. 
*/ -static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, - GHashTable *visited, Transaction *tran, - Error **errp) +static bool GRAPH_RDLOCK +bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, + GHashTable *visited, Transaction *tran, Error **errp) { BdrvChild *c; BdrvStateSetAioContext *state; @@ -7664,21 +7688,17 @@ static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, return true; } - bdrv_graph_rdlock_main_loop(); QLIST_FOREACH(c, &bs->parents, next_parent) { if (!bdrv_parent_change_aio_context(c, ctx, visited, tran, errp)) { - bdrv_graph_rdunlock_main_loop(); return false; } } QLIST_FOREACH(c, &bs->children, next) { if (!bdrv_child_change_aio_context(c, ctx, visited, tran, errp)) { - bdrv_graph_rdunlock_main_loop(); return false; } } - bdrv_graph_rdunlock_main_loop(); state = g_new(BdrvStateSetAioContext, 1); *state = (BdrvStateSetAioContext) { @@ -7686,8 +7706,7 @@ static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, .bs = bs, }; - /* Paired with bdrv_drained_end in bdrv_set_aio_context_clean() */ - bdrv_drained_begin(bs); + assert(bs->quiesce_counter > 0); tran_add(tran, &set_aio_context, state); @@ -7720,6 +7739,8 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, if (ignore_child) { g_hash_table_add(visited, ignore_child); } + bdrv_drain_all_begin(); + bdrv_graph_rdlock_main_loop(); ret = bdrv_change_aio_context(bs, ctx, visited, tran, errp); g_hash_table_destroy(visited); @@ -7733,10 +7754,14 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, if (!ret) { /* Just run clean() callbacks. No AioContext changed. 
*/ tran_abort(tran); + bdrv_graph_rdunlock_main_loop(); + bdrv_drain_all_end(); return -EPERM; } tran_commit(tran); + bdrv_graph_rdunlock_main_loop(); + bdrv_drain_all_end(); return 0; } diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 37466c7841..168f703fa1 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -983,6 +983,18 @@ struct BdrvChildClass { bool backing_mask_protocol, Error **errp); + /* + * Notifies the parent that the child is trying to change its AioContext. + * The parent may in turn change the AioContext of other nodes in the same + * transaction. Returns true if the change is possible and the transaction + * can be continued. Returns false and sets @errp if not and the transaction + * must be aborted. + * + * @visited will accumulate all visited BdrvChild objects. The caller is + * responsible for freeing the list afterwards. + * + * Must be called with the affected block nodes drained. + */ bool GRAPH_RDLOCK_PTR (*change_aio_ctx)(BdrvChild *child, AioContext *ctx, GHashTable *visited, Transaction *tran, Error **errp); -- cgit 1.4.1 From a1ea8eb5912256c0b2be16fae5d3786aebc80cb1 Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:46 +0200 Subject: block: move drain outside of bdrv_try_change_aio_context() This is part of resolving the deadlock mentioned in commit "block: move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". Convert the function to a _locked() version that has to be called with the graph lock held and add a convenience wrapper that has to be called with the graph unlocked, which drains and takes the lock itself. Since bdrv_try_change_aio_context() is global state code, the wrapper is too. Callers are adapted to use the appropriate variant, depending on whether the caller already holds the lock. In the test_set_aio_context() unit test, prior drains can be removed, because draining already happens inside the new wrapper. 
Note that bdrv_attach_child_common_abort(), bdrv_attach_child_common() and bdrv_root_unref_child() hold the graph lock and are not actually allowed to drain either. This will be addressed in the following commits. Functions like qmp_blockdev_mirror() query the nodes to act on before draining and locking. In theory, draining could invalidate those nodes. This kind of issue is not addressed by these commits. Signed-off-by: Fiona Ebner Reviewed-by: Kevin Wolf Message-ID: <20250530151125.955508-10-f.ebner@proxmox.com> Signed-off-by: Kevin Wolf --- block.c | 58 ++++++++++++++++++++++++++++---------- blockdev.c | 15 ++++++---- include/block/block-global-state.h | 8 ++++-- tests/unit/test-bdrv-drain.c | 4 --- 4 files changed, 59 insertions(+), 26 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index 6f42c0f1ab..3aaacabf7f 100644 --- a/block.c +++ b/block.c @@ -3028,7 +3028,10 @@ static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque) bdrv_replace_child_noperm(s->child, NULL); if (bdrv_get_aio_context(bs) != s->old_child_ctx) { - bdrv_try_change_aio_context(bs, s->old_child_ctx, NULL, &error_abort); + bdrv_drain_all_begin(); + bdrv_try_change_aio_context_locked(bs, s->old_child_ctx, NULL, + &error_abort); + bdrv_drain_all_end(); } if (bdrv_child_get_parent_aio_context(s->child) != s->old_parent_ctx) { @@ -3115,8 +3118,10 @@ bdrv_attach_child_common(BlockDriverState *child_bs, parent_ctx = bdrv_child_get_parent_aio_context(new_child); if (child_ctx != parent_ctx) { Error *local_err = NULL; - int ret = bdrv_try_change_aio_context(child_bs, parent_ctx, NULL, - &local_err); + bdrv_drain_all_begin(); + int ret = bdrv_try_change_aio_context_locked(child_bs, parent_ctx, NULL, + &local_err); + bdrv_drain_all_end(); if (ret < 0 && child_class->change_aio_ctx) { Transaction *aio_ctx_tran = tran_new(); @@ -3319,8 +3324,10 @@ void bdrv_root_unref_child(BdrvChild *child) * When the parent requiring a non-default AioContext is removed, the * node moves 
back to the main AioContext */ - bdrv_try_change_aio_context(child_bs, qemu_get_aio_context(), NULL, - NULL); + bdrv_drain_all_begin(); + bdrv_try_change_aio_context_locked(child_bs, qemu_get_aio_context(), + NULL, NULL); + bdrv_drain_all_end(); } bdrv_schedule_unref(child_bs); @@ -7719,9 +7726,13 @@ bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, * * If ignore_child is not NULL, that child (and its subgraph) will not * be touched. + * + * Called with the graph lock held. + * + * Called while all bs are drained. */ -int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, - BdrvChild *ignore_child, Error **errp) +int bdrv_try_change_aio_context_locked(BlockDriverState *bs, AioContext *ctx, + BdrvChild *ignore_child, Error **errp) { Transaction *tran; GHashTable *visited; @@ -7730,17 +7741,15 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, /* * Recursion phase: go through all nodes of the graph. - * Take care of checking that all nodes support changing AioContext - * and drain them, building a linear list of callbacks to run if everything - * is successful (the transaction itself). + * Take care of checking that all nodes support changing AioContext, + * building a linear list of callbacks to run if everything is successful + * (the transaction itself). */ tran = tran_new(); visited = g_hash_table_new(NULL, NULL); if (ignore_child) { g_hash_table_add(visited, ignore_child); } - bdrv_drain_all_begin(); - bdrv_graph_rdlock_main_loop(); ret = bdrv_change_aio_context(bs, ctx, visited, tran, errp); g_hash_table_destroy(visited); @@ -7754,15 +7763,34 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, if (!ret) { /* Just run clean() callbacks. No AioContext changed. 
*/ tran_abort(tran); - bdrv_graph_rdunlock_main_loop(); - bdrv_drain_all_end(); return -EPERM; } tran_commit(tran); + return 0; +} + +/* + * Change bs's and recursively all of its parents' and children's AioContext + * to the given new context, returning an error if that isn't possible. + * + * If ignore_child is not NULL, that child (and its subgraph) will not + * be touched. + */ +int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, + BdrvChild *ignore_child, Error **errp) +{ + int ret; + + GLOBAL_STATE_CODE(); + + bdrv_drain_all_begin(); + bdrv_graph_rdlock_main_loop(); + ret = bdrv_try_change_aio_context_locked(bs, ctx, ignore_child, errp); bdrv_graph_rdunlock_main_loop(); bdrv_drain_all_end(); - return 0; + + return ret; } void bdrv_add_aio_context_notifier(BlockDriverState *bs, diff --git a/blockdev.c b/blockdev.c index 3982f9776b..750beba41f 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3601,12 +3601,13 @@ void qmp_x_blockdev_set_iothread(const char *node_name, StrOrNull *iothread, AioContext *new_context; BlockDriverState *bs; - GRAPH_RDLOCK_GUARD_MAINLOOP(); + bdrv_drain_all_begin(); + bdrv_graph_rdlock_main_loop(); bs = bdrv_find_node(node_name); if (!bs) { error_setg(errp, "Failed to find node with node-name='%s'", node_name); - return; + goto out; } /* Protects against accidents. 
*/ @@ -3614,14 +3615,14 @@ void qmp_x_blockdev_set_iothread(const char *node_name, StrOrNull *iothread, error_setg(errp, "Node %s is associated with a BlockBackend and could " "be in use (use force=true to override this check)", node_name); - return; + goto out; } if (iothread->type == QTYPE_QSTRING) { IOThread *obj = iothread_by_id(iothread->u.s); if (!obj) { error_setg(errp, "Cannot find iothread %s", iothread->u.s); - return; + goto out; } new_context = iothread_get_aio_context(obj); @@ -3629,7 +3630,11 @@ void qmp_x_blockdev_set_iothread(const char *node_name, StrOrNull *iothread, new_context = qemu_get_aio_context(); } - bdrv_try_change_aio_context(bs, new_context, NULL, errp); + bdrv_try_change_aio_context_locked(bs, new_context, NULL, errp); + +out: + bdrv_graph_rdunlock_main_loop(); + bdrv_drain_all_end(); } QemuOptsList qemu_common_drive_opts = { diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h index aad160956a..91f249b5ad 100644 --- a/include/block/block-global-state.h +++ b/include/block/block-global-state.h @@ -278,8 +278,12 @@ bool GRAPH_RDLOCK bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx, GHashTable *visited, Transaction *tran, Error **errp); -int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, - BdrvChild *ignore_child, Error **errp); +int GRAPH_UNLOCKED +bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, + BdrvChild *ignore_child, Error **errp); +int GRAPH_RDLOCK +bdrv_try_change_aio_context_locked(BlockDriverState *bs, AioContext *ctx, + BdrvChild *ignore_child, Error **errp); int GRAPH_RDLOCK bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz); int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo); diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c index 290cd2a70e..3185f3f429 100644 --- a/tests/unit/test-bdrv-drain.c +++ b/tests/unit/test-bdrv-drain.c @@ -1396,14 +1396,10 @@ static void test_set_aio_context(void) bs = 
bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR, &error_abort); - bdrv_drained_begin(bs); bdrv_try_change_aio_context(bs, ctx_a, NULL, &error_abort); - bdrv_drained_end(bs); - bdrv_drained_begin(bs); bdrv_try_change_aio_context(bs, ctx_b, NULL, &error_abort); bdrv_try_change_aio_context(bs, qemu_get_aio_context(), NULL, &error_abort); - bdrv_drained_end(bs); bdrv_unref(bs); iothread_join(a); -- cgit 1.4.1 From 2b833595aa21679145cfe67ba720113b165c19ef Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:47 +0200 Subject: block: move drain outside of bdrv_attach_child_common(_abort)() This is part of resolving the deadlock mentioned in commit "block: move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". The function bdrv_attach_child_common_abort() is used only as the abort callback in bdrv_attach_child_common_drv transactions, so the tran_finalize() calls of such transactions need to be in drained sections too. All code paths are covered: The bdrv_attach_child_common_drv transactions are only used in bdrv_attach_child_common(), so it is enough to check callers of bdrv_attach_child_common() following the transactions. bdrv_attach_child_common() is called by: 1. bdrv_attach_child_noperm(), which does not finalize the transaction yet. 2. bdrv_root_attach_child(), where a drained section is introduced. bdrv_attach_child_noperm() is called by: 1. bdrv_attach_child(), where a drained section is introduced. 2. bdrv_set_file_or_backing_noperm(), which does not finalize the transaction yet. 3. bdrv_append(), where a drained section is introduced. bdrv_set_file_or_backing_noperm() is called by: 1. bdrv_set_backing_hd_drained(), where a drained section is introduced. 2. bdrv_reopen_parse_file_or_backing(), which does not finalize the transaction yet. Draining the old child bs currently happens under the graph lock there. This is replaced with an assertion, because the drain will be moved further up to the caller. 
bdrv_reopen_parse_file_or_backing() is called by: 1. bdrv_reopen_prepare(), which does not finalize the transaction yet. bdrv_reopen_prepare() is called by: 1. bdrv_reopen_multiple(), which does finalize the transaction. It is called after bdrv_reopen_queue(), which starts a drained section. The drained section ends, when bdrv_reopen_queue_free() is called at the end of bdrv_reopen_multiple(). This resolves all code paths. The functions bdrv_set_backing_hd_drained(), bdrv_attach_child() and bdrv_root_attach_child() run under the graph lock, so they are not actually allowed to drain. This will be addressed in the following commits. Signed-off-by: Fiona Ebner Message-ID: <20250530151125.955508-11-f.ebner@proxmox.com> Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index 3aaacabf7f..46eb2fe449 100644 --- a/block.c +++ b/block.c @@ -3028,10 +3028,8 @@ static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque) bdrv_replace_child_noperm(s->child, NULL); if (bdrv_get_aio_context(bs) != s->old_child_ctx) { - bdrv_drain_all_begin(); bdrv_try_change_aio_context_locked(bs, s->old_child_ctx, NULL, &error_abort); - bdrv_drain_all_end(); } if (bdrv_child_get_parent_aio_context(s->child) != s->old_parent_ctx) { @@ -3043,10 +3041,8 @@ static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque) /* No need to visit `child`, because it has been detached already */ visited = g_hash_table_new(NULL, NULL); - bdrv_drain_all_begin(); ret = s->child->klass->change_aio_ctx(s->child, s->old_parent_ctx, visited, tran, &error_abort); - bdrv_drain_all_end(); g_hash_table_destroy(visited); /* transaction is supposed to always succeed */ @@ -3075,6 +3071,9 @@ static TransactionActionDrv bdrv_attach_child_common_drv = { * * Both @parent_bs and @child_bs can move to a different AioContext in this * function. 
+ * + * All block nodes must be drained before this function is called until after + * the transaction is finalized. */ static BdrvChild * GRAPH_WRLOCK bdrv_attach_child_common(BlockDriverState *child_bs, @@ -3118,10 +3117,8 @@ bdrv_attach_child_common(BlockDriverState *child_bs, parent_ctx = bdrv_child_get_parent_aio_context(new_child); if (child_ctx != parent_ctx) { Error *local_err = NULL; - bdrv_drain_all_begin(); int ret = bdrv_try_change_aio_context_locked(child_bs, parent_ctx, NULL, &local_err); - bdrv_drain_all_end(); if (ret < 0 && child_class->change_aio_ctx) { Transaction *aio_ctx_tran = tran_new(); @@ -3129,11 +3126,9 @@ bdrv_attach_child_common(BlockDriverState *child_bs, bool ret_child; g_hash_table_add(visited, new_child); - bdrv_drain_all_begin(); ret_child = child_class->change_aio_ctx(new_child, child_ctx, visited, aio_ctx_tran, NULL); - bdrv_drain_all_end(); if (ret_child == true) { error_free(local_err); ret = 0; @@ -3189,6 +3184,9 @@ bdrv_attach_child_common(BlockDriverState *child_bs, * * After calling this function, the transaction @tran may only be completed * while holding a writer lock for the graph. + * + * All block nodes must be drained before this function is called until after + * the transaction is finalized. 
*/ static BdrvChild * GRAPH_WRLOCK bdrv_attach_child_noperm(BlockDriverState *parent_bs, @@ -3244,6 +3242,7 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, GLOBAL_STATE_CODE(); + bdrv_drain_all_begin(); child = bdrv_attach_child_common(child_bs, child_name, child_class, child_role, perm, shared_perm, opaque, tran, errp); @@ -3256,6 +3255,7 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, out: tran_finalize(tran, ret); + bdrv_drain_all_end(); bdrv_schedule_unref(child_bs); @@ -3283,6 +3283,7 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, GLOBAL_STATE_CODE(); + bdrv_drain_all_begin(); child = bdrv_attach_child_noperm(parent_bs, child_bs, child_name, child_class, child_role, tran, errp); if (!child) { @@ -3297,6 +3298,7 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, out: tran_finalize(tran, ret); + bdrv_drain_all_end(); bdrv_schedule_unref(child_bs); @@ -3465,6 +3467,9 @@ static BdrvChildRole bdrv_backing_role(BlockDriverState *bs) * * After calling this function, the transaction @tran may only be completed * while holding a writer lock for the graph. + * + * All block nodes must be drained before this function is called until after + * the transaction is finalized. */ static int GRAPH_WRLOCK bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs, @@ -3573,6 +3578,7 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs, assert(bs->backing->bs->quiesce_counter > 0); } + bdrv_drain_all_begin(); ret = bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp); if (ret < 0) { goto out; @@ -3581,6 +3587,7 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs, ret = bdrv_refresh_perms(bs, tran, errp); out: tran_finalize(tran, ret); + bdrv_drain_all_end(); return ret; } @@ -4721,6 +4728,9 @@ int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, * Return 0 on success, otherwise return < 0 and set @errp. * * @reopen_state->bs can move to a different AioContext in this function. 
+ * + * All block nodes must be drained before this function is called until after + * the transaction is finalized. */ static int GRAPH_UNLOCKED bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state, @@ -4814,7 +4824,7 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state, if (old_child_bs) { bdrv_ref(old_child_bs); - bdrv_drained_begin(old_child_bs); + assert(old_child_bs->quiesce_counter > 0); } bdrv_graph_rdunlock_main_loop(); @@ -4826,7 +4836,6 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state, bdrv_graph_wrunlock(); if (old_child_bs) { - bdrv_drained_end(old_child_bs); bdrv_unref(old_child_bs); } @@ -4855,6 +4864,9 @@ out_rdlock: * * After calling this function, the transaction @change_child_tran may only be * completed while holding a writer lock for the graph. + * + * All block nodes must be drained before this function is called until after + * the transaction is finalized. */ static int GRAPH_UNLOCKED bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, @@ -5501,9 +5513,7 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, assert(!bs_new->backing); bdrv_graph_rdunlock_main_loop(); - bdrv_drained_begin(bs_top); - bdrv_drained_begin(bs_new); - + bdrv_drain_all_begin(); bdrv_graph_wrlock(); child = bdrv_attach_child_noperm(bs_new, bs_top, "backing", @@ -5525,9 +5535,7 @@ out: bdrv_refresh_limits(bs_top, NULL, NULL); bdrv_graph_wrunlock(); - - bdrv_drained_end(bs_top); - bdrv_drained_end(bs_new); + bdrv_drain_all_end(); return ret; } -- cgit 1.4.1 From e66dbda11eab2b4a091d470f3508a4d6ca60eaf5 Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:48 +0200 Subject: block: move drain outside of bdrv_set_backing_hd_drained() This is part of resolving the deadlock mentioned in commit "block: move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". The function bdrv_set_backing_hd_drained() holds the graph lock, so it is not allowed to drain. 
It is called by: 1. bdrv_set_backing_hd(), where a drained section is introduced, replacing the previously present bs-specific drains. 2. stream_prepare(), where a drained section is introduced replacing the previously present bs-specific drains. The drain_bs variable in bdrv_set_backing_hd_drained() is now superfluous and thus dropped. Signed-off-by: Fiona Ebner Message-ID: <20250530151125.955508-12-f.ebner@proxmox.com> Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block.c | 16 +++------------- block/stream.c | 6 ++---- 2 files changed, 5 insertions(+), 17 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index 46eb2fe449..e53a88e1b6 100644 --- a/block.c +++ b/block.c @@ -3562,8 +3562,7 @@ out: * Both @bs and @backing_hd can move to a different AioContext in this * function. * - * If a backing child is already present (i.e. we're detaching a node), that - * child node must be drained. + * All block nodes must be drained. */ int bdrv_set_backing_hd_drained(BlockDriverState *bs, BlockDriverState *backing_hd, @@ -3578,7 +3577,6 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs, assert(bs->backing->bs->quiesce_counter > 0); } - bdrv_drain_all_begin(); ret = bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp); if (ret < 0) { goto out; @@ -3587,28 +3585,20 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs, ret = bdrv_refresh_perms(bs, tran, errp); out: tran_finalize(tran, ret); - bdrv_drain_all_end(); return ret; } int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, Error **errp) { - BlockDriverState *drain_bs; int ret; GLOBAL_STATE_CODE(); - bdrv_graph_rdlock_main_loop(); - drain_bs = bs->backing ? 
bs->backing->bs : bs; - bdrv_graph_rdunlock_main_loop(); - - bdrv_ref(drain_bs); - bdrv_drained_begin(drain_bs); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); ret = bdrv_set_backing_hd_drained(bs, backing_hd, errp); bdrv_graph_wrunlock(); - bdrv_drained_end(drain_bs); - bdrv_unref(drain_bs); + bdrv_drain_all_end(); return ret; } diff --git a/block/stream.c b/block/stream.c index 999d9e56d4..6ba49cffd3 100644 --- a/block/stream.c +++ b/block/stream.c @@ -80,11 +80,10 @@ static int stream_prepare(Job *job) * may end up working with the wrong base node (or it might even have gone * away by the time we want to use it). */ - bdrv_drained_begin(unfiltered_bs); if (unfiltered_bs_cow) { bdrv_ref(unfiltered_bs_cow); - bdrv_drained_begin(unfiltered_bs_cow); } + bdrv_drain_all_begin(); bdrv_graph_rdlock_main_loop(); base = bdrv_filter_or_cow_bs(s->above_base); @@ -123,11 +122,10 @@ static int stream_prepare(Job *job) } out: + bdrv_drain_all_end(); if (unfiltered_bs_cow) { - bdrv_drained_end(unfiltered_bs_cow); bdrv_unref(unfiltered_bs_cow); } - bdrv_drained_end(unfiltered_bs); return ret; } -- cgit 1.4.1 From ffdcd081f52544f065020c780a6c522dace6b0af Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:49 +0200 Subject: block: move drain outside of bdrv_root_attach_child() This is part of resolving the deadlock mentioned in commit "block: move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". The function bdrv_root_attach_child() runs under the graph lock, so it is not allowed to drain. It is called by: 1. blk_insert_bs(), where a drained section is introduced. 2. block_job_add_bdrv(), which holds the graph lock itself. block_job_add_bdrv() is called by: 1. mirror_start_job() 2. stream_start() 3. commit_start() 4. backup_job_create() 5. block_job_create() 6. In the test_blockjob_common_drain_node() unit test In all callers, a drained section is introduced. 
Signed-off-by: Fiona Ebner Reviewed-by: Kevin Wolf Message-ID: <20250530151125.955508-13-f.ebner@proxmox.com> Signed-off-by: Kevin Wolf --- block.c | 4 ++-- block/backup.c | 2 ++ block/block-backend.c | 2 ++ block/commit.c | 4 ++++ block/mirror.c | 5 +++++ block/stream.c | 4 ++++ blockjob.c | 4 ++++ include/block/blockjob.h | 2 ++ tests/unit/test-bdrv-drain.c | 2 ++ 9 files changed, 27 insertions(+), 2 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index e53a88e1b6..17c34dc240 100644 --- a/block.c +++ b/block.c @@ -3228,6 +3228,8 @@ bdrv_attach_child_noperm(BlockDriverState *parent_bs, * * On failure NULL is returned, errp is set and the reference to * child_bs is also dropped. + * + * All block nodes must be drained. */ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, const char *child_name, @@ -3242,7 +3244,6 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, GLOBAL_STATE_CODE(); - bdrv_drain_all_begin(); child = bdrv_attach_child_common(child_bs, child_name, child_class, child_role, perm, shared_perm, opaque, tran, errp); @@ -3255,7 +3256,6 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, out: tran_finalize(tran, ret); - bdrv_drain_all_end(); bdrv_schedule_unref(child_bs); diff --git a/block/backup.c b/block/backup.c index 0151e84395..909027c17a 100644 --- a/block/backup.c +++ b/block/backup.c @@ -498,10 +498,12 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, block_copy_set_speed(bcs, speed); /* Required permissions are taken by copy-before-write filter target */ + bdrv_drain_all_begin(); bdrv_graph_wrlock(); block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); return &job->common; diff --git a/block/block-backend.c b/block/block-backend.c index 6a6949edeb..24cae3cb55 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -904,6 +904,7 @@ int blk_insert_bs(BlockBackend *blk, 
BlockDriverState *bs, Error **errp) GLOBAL_STATE_CODE(); bdrv_ref(bs); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); if ((bs->open_flags & BDRV_O_INACTIVE) && blk_can_inactivate(blk)) { @@ -919,6 +920,7 @@ int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp) BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, perm, shared_perm, blk, errp); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); if (blk->root == NULL) { return -EPERM; } diff --git a/block/commit.c b/block/commit.c index 7cc8c0f0df..6c4b736ff8 100644 --- a/block/commit.c +++ b/block/commit.c @@ -392,6 +392,7 @@ void commit_start(const char *job_id, BlockDriverState *bs, * this is the responsibility of the interface (i.e. whoever calls * commit_start()). */ + bdrv_drain_all_begin(); bdrv_graph_wrlock(); s->base_overlay = bdrv_find_overlay(top, base); assert(s->base_overlay); @@ -424,18 +425,21 @@ void commit_start(const char *job_id, BlockDriverState *bs, iter_shared_perms, errp); if (ret < 0) { bdrv_graph_wrunlock(); + bdrv_drain_all_end(); goto fail; } } if (bdrv_freeze_backing_chain(commit_top_bs, base, errp) < 0) { bdrv_graph_wrunlock(); + bdrv_drain_all_end(); goto fail; } s->chain_frozen = true; ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); if (ret < 0) { goto fail; diff --git a/block/mirror.c b/block/mirror.c index c2c5099c95..6e8caf4b49 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -2014,6 +2014,7 @@ static BlockJob *mirror_start_job( */ bdrv_disable_dirty_bitmap(s->dirty_bitmap); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); ret = block_job_add_bdrv(&s->common, "source", bs, 0, BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE | @@ -2021,6 +2022,7 @@ static BlockJob *mirror_start_job( errp); if (ret < 0) { bdrv_graph_wrunlock(); + bdrv_drain_all_end(); goto fail; } @@ -2066,16 +2068,19 @@ static BlockJob *mirror_start_job( iter_shared_perms, errp); if (ret < 0) { bdrv_graph_wrunlock(); + bdrv_drain_all_end(); goto 
fail; } } if (bdrv_freeze_backing_chain(mirror_top_bs, target, errp) < 0) { bdrv_graph_wrunlock(); + bdrv_drain_all_end(); goto fail; } } bdrv_graph_wrunlock(); + bdrv_drain_all_end(); QTAILQ_INIT(&s->ops_in_flight); diff --git a/block/stream.c b/block/stream.c index 6ba49cffd3..f5441f27f4 100644 --- a/block/stream.c +++ b/block/stream.c @@ -371,10 +371,12 @@ void stream_start(const char *job_id, BlockDriverState *bs, * already have our own plans. Also don't allow resize as the image size is * queried only at the job start and then cached. */ + bdrv_drain_all_begin(); bdrv_graph_wrlock(); if (block_job_add_bdrv(&s->common, "active node", bs, 0, basic_flags | BLK_PERM_WRITE, errp)) { bdrv_graph_wrunlock(); + bdrv_drain_all_end(); goto fail; } @@ -395,10 +397,12 @@ void stream_start(const char *job_id, BlockDriverState *bs, basic_flags, errp); if (ret < 0) { bdrv_graph_wrunlock(); + bdrv_drain_all_end(); goto fail; } } bdrv_graph_wrunlock(); + bdrv_drain_all_end(); s->base_overlay = base_overlay; s->above_base = above_base; diff --git a/blockjob.c b/blockjob.c index 34185d7715..44991e3ff7 100644 --- a/blockjob.c +++ b/blockjob.c @@ -496,6 +496,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, int ret; GLOBAL_STATE_CODE(); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); if (job_id == NULL && !(flags & JOB_INTERNAL)) { @@ -506,6 +507,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, flags, cb, opaque, errp); if (job == NULL) { bdrv_graph_wrunlock(); + bdrv_drain_all_end(); return NULL; } @@ -544,10 +546,12 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, } bdrv_graph_wrunlock(); + bdrv_drain_all_end(); return job; fail: bdrv_graph_wrunlock(); + bdrv_drain_all_end(); job_early_fail(&job->job); return NULL; } diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 7061ab7201..990f3e179a 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -137,6 +137,8 @@ 
BlockJob *block_job_get_locked(const char *id); * Add @bs to the list of BlockDriverState that are involved in * @job. This means that all operations will be blocked on @bs while * @job exists. + * + * All block nodes must be drained. */ int GRAPH_WRLOCK block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs, diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c index 3185f3f429..4f3057844b 100644 --- a/tests/unit/test-bdrv-drain.c +++ b/tests/unit/test-bdrv-drain.c @@ -772,9 +772,11 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type, tjob->bs = src; job = &tjob->common; + bdrv_drain_all_begin(); bdrv_graph_wrlock(); block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); switch (result) { case TEST_JOB_SUCCESS: -- cgit 1.4.1 From 77f3965ba7fed5b35212171a1e41c20c05a7ef11 Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:50 +0200 Subject: block: move drain outside of bdrv_attach_child() This is part of resolving the deadlock mentioned in commit "block: move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". The function bdrv_attach_child() runs under the graph lock, so it is not allowed to drain. It is called by: 1. replication_start() 2. quorum_add_child() 3. bdrv_open_child_common() 4. Throughout test-bdrv-graph-mod.c and test-bdrv-drain.c unit tests. In all callers, a drained section is introduced. The function quorum_add_child() runs under the graph lock, so it is not actually allowed to drain. This will be addressed by the following commit. 
Signed-off-by: Fiona Ebner Reviewed-by: Kevin Wolf Message-ID: <20250530151125.955508-14-f.ebner@proxmox.com> Signed-off-by: Kevin Wolf --- block.c | 6 ++++-- block/quorum.c | 2 ++ block/replication.c | 5 +++++ tests/unit/test-bdrv-drain.c | 14 ++++++++++++++ tests/unit/test-bdrv-graph-mod.c | 10 ++++++++++ 5 files changed, 35 insertions(+), 2 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index 17c34dc240..6fc87aa318 100644 --- a/block.c +++ b/block.c @@ -3269,6 +3269,8 @@ out: * * On failure NULL is returned, errp is set and the reference to * child_bs is also dropped. + * + * All block nodes must be drained. */ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, BlockDriverState *child_bs, @@ -3283,7 +3285,6 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, GLOBAL_STATE_CODE(); - bdrv_drain_all_begin(); child = bdrv_attach_child_noperm(parent_bs, child_bs, child_name, child_class, child_role, tran, errp); if (!child) { @@ -3298,7 +3299,6 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, out: tran_finalize(tran, ret); - bdrv_drain_all_end(); bdrv_schedule_unref(child_bs); @@ -3789,10 +3789,12 @@ static BdrvChild *bdrv_open_child_common(const char *filename, return NULL; } + bdrv_drain_all_begin(); bdrv_graph_wrlock(); child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role, errp); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); return child; } diff --git a/block/quorum.c b/block/quorum.c index ed8ce801ee..ea17b0ec13 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -1096,8 +1096,10 @@ quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, Error **errp) /* We can safely add the child now */ bdrv_ref(child_bs); + bdrv_drain_all_begin(); child = bdrv_attach_child(bs, child_bs, indexstr, &child_of_bds, BDRV_CHILD_DATA, errp); + bdrv_drain_all_end(); if (child == NULL) { s->next_child_index--; return; diff --git a/block/replication.c b/block/replication.c index 07f274de9e..54cbd03e00 100644 
--- a/block/replication.c +++ b/block/replication.c @@ -540,6 +540,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, return; } + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_ref(hidden_disk->bs); @@ -549,6 +550,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, if (local_err) { error_propagate(errp, local_err); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); return; } @@ -559,6 +561,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, if (local_err) { error_propagate(errp, local_err); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); return; } @@ -571,12 +574,14 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, !check_top_bs(top_bs, bs)) { error_setg(errp, "No top_bs or it is invalid"); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); reopen_backing_file(bs, false, NULL); return; } bdrv_op_block_all(top_bs, s->blocker); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); s->backup_job = backup_job_create( NULL, s->secondary_disk->bs, s->hidden_disk->bs, diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c index 4f3057844b..ac76525e5a 100644 --- a/tests/unit/test-bdrv-drain.c +++ b/tests/unit/test-bdrv-drain.c @@ -1049,10 +1049,12 @@ static void do_test_delete_by_drain(bool detach_instead_of_delete, null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, &error_abort); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_attach_child(bs, null_bs, "null-child", &child_of_bds, BDRV_CHILD_DATA, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); /* This child will be the one to pass to requests through to, and * it will stall until a drain occurs */ @@ -1060,21 +1062,25 @@ static void do_test_delete_by_drain(bool detach_instead_of_delete, &error_abort); child_bs->total_sectors = 65536 >> BDRV_SECTOR_BITS; /* Takes our reference to child_bs */ + bdrv_drain_all_begin(); bdrv_graph_wrlock(); tts->wait_child = 
bdrv_attach_child(bs, child_bs, "wait-child", &child_of_bds, BDRV_CHILD_DATA | BDRV_CHILD_PRIMARY, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); /* This child is just there to be deleted * (for detach_instead_of_delete == true) */ null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, &error_abort); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_attach_child(bs, null_bs, "null-child", &child_of_bds, BDRV_CHILD_DATA, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL); blk_insert_bs(blk, bs, &error_abort); @@ -1157,6 +1163,7 @@ static void no_coroutine_fn detach_indirect_bh(void *opaque) bdrv_dec_in_flight(data->child_b->bs); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_unref_child(data->parent_b, data->child_b); @@ -1165,6 +1172,7 @@ static void no_coroutine_fn detach_indirect_bh(void *opaque) &child_of_bds, BDRV_CHILD_DATA, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); } static void coroutine_mixed_fn detach_by_parent_aio_cb(void *opaque, int ret) @@ -1262,6 +1270,7 @@ static void TSA_NO_TSA test_detach_indirect(bool by_parent_cb) /* Set child relationships */ bdrv_ref(b); bdrv_ref(a); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); child_b = bdrv_attach_child(parent_b, b, "PB-B", &child_of_bds, BDRV_CHILD_DATA, &error_abort); @@ -1273,6 +1282,7 @@ static void TSA_NO_TSA test_detach_indirect(bool by_parent_cb) by_parent_cb ? 
&child_of_bds : &detach_by_driver_cb_class, BDRV_CHILD_DATA, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); g_assert_cmpint(parent_a->refcnt, ==, 1); g_assert_cmpint(parent_b->refcnt, ==, 1); @@ -1685,6 +1695,7 @@ static void test_drop_intermediate_poll(void) * Establish the chain last, so the chain links are the first * elements in the BDS.parents lists */ + bdrv_drain_all_begin(); bdrv_graph_wrlock(); for (i = 0; i < 3; i++) { if (i) { @@ -1694,6 +1705,7 @@ static void test_drop_intermediate_poll(void) } } bdrv_graph_wrunlock(); + bdrv_drain_all_end(); job = block_job_create("job", &test_simple_job_driver, NULL, job_node, 0, BLK_PERM_ALL, 0, 0, NULL, NULL, &error_abort); @@ -1940,10 +1952,12 @@ static void do_test_replace_child_mid_drain(int old_drain_count, new_child_bs->total_sectors = 1; bdrv_ref(old_child_bs); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_attach_child(parent_bs, old_child_bs, "child", &child_of_bds, BDRV_CHILD_COW, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); parent_s->setup_completed = true; for (i = 0; i < old_drain_count; i++) { diff --git a/tests/unit/test-bdrv-graph-mod.c b/tests/unit/test-bdrv-graph-mod.c index d743abb4bb..7b03ebe4b0 100644 --- a/tests/unit/test-bdrv-graph-mod.c +++ b/tests/unit/test-bdrv-graph-mod.c @@ -137,10 +137,12 @@ static void test_update_perm_tree(void) blk_insert_bs(root, bs, &error_abort); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_attach_child(filter, bs, "child", &child_of_bds, BDRV_CHILD_DATA, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); ret = bdrv_append(filter, bs, NULL); g_assert_cmpint(ret, <, 0); @@ -204,11 +206,13 @@ static void test_should_update_child(void) bdrv_set_backing_hd(target, bs, &error_abort); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); g_assert(target->backing->bs == bs); bdrv_attach_child(filter, target, "target", &child_of_bds, BDRV_CHILD_DATA, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); 
bdrv_append(filter, bs, &error_abort); bdrv_graph_rdlock_main_loop(); @@ -244,6 +248,7 @@ static void test_parallel_exclusive_write(void) bdrv_ref(base); bdrv_ref(fl1); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_attach_child(top, fl1, "backing", &child_of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, @@ -257,6 +262,7 @@ static void test_parallel_exclusive_write(void) bdrv_replace_node(fl1, fl2, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); bdrv_drained_end(fl2); bdrv_drained_end(fl1); @@ -363,6 +369,7 @@ static void test_parallel_perm_update(void) */ bdrv_ref(base); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_attach_child(top, ws, "file", &child_of_bds, BDRV_CHILD_DATA, &error_abort); @@ -377,6 +384,7 @@ static void test_parallel_perm_update(void) BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); /* Select fl1 as first child to be active */ s->selected = c_fl1; @@ -430,11 +438,13 @@ static void test_append_greedy_filter(void) BlockDriverState *base = no_perm_node("base"); BlockDriverState *fl = exclusive_writer_node("fl1"); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_attach_child(top, base, "backing", &child_of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, &error_abort); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); bdrv_append(fl, base, &error_abort); bdrv_unref(fl); -- cgit 1.4.1 From 0414930d3adfa89299eaea5ce92accab15d9fba5 Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:51 +0200 Subject: block: move drain outside of quorum_add_child() This is part of resolving the deadlock mentioned in commit "block: move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". The quorum_add_child() callback runs under the graph lock, so it is not allowed to drain. It is only called as the .bdrv_add_child() callback, which is only called in the bdrv_add_child() function, which also runs under the graph lock. 
The bdrv_add_child() function is called by qmp_x_blockdev_change(), where a drained section is introduced. Signed-off-by: Fiona Ebner Message-ID: <20250530151125.955508-15-f.ebner@proxmox.com> Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block.c | 10 ++++++++-- block/quorum.c | 2 -- blockdev.c | 2 ++ include/block/block_int-common.h | 7 +++++++ 4 files changed, 17 insertions(+), 4 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index 6fc87aa318..f6c2f7e208 100644 --- a/block.c +++ b/block.c @@ -8220,8 +8220,10 @@ char *bdrv_dirname(BlockDriverState *bs, Error **errp) } /* - * Hot add/remove a BDS's child. So the user can take a child offline when - * it is broken and take a new child online + * Hot add a BDS's child. Used in combination with bdrv_del_child, so the user + * can take a child offline when it is broken and take a new child online. + * + * All block nodes must be drained. */ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs, Error **errp) @@ -8261,6 +8263,10 @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs, parent_bs->drv->bdrv_add_child(parent_bs, child_bs, errp); } +/* + * Hot remove a BDS's child. Used in combination with bdrv_add_child, so the + * user can take a child offline when it is broken and take a new child online. 
+ */ void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp) { BdrvChild *tmp; diff --git a/block/quorum.c b/block/quorum.c index ea17b0ec13..ed8ce801ee 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -1096,10 +1096,8 @@ quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, Error **errp) /* We can safely add the child now */ bdrv_ref(child_bs); - bdrv_drain_all_begin(); child = bdrv_attach_child(bs, child_bs, indexstr, &child_of_bds, BDRV_CHILD_DATA, errp); - bdrv_drain_all_end(); if (child == NULL) { s->next_child_index--; return; diff --git a/blockdev.c b/blockdev.c index 750beba41f..bd5ca77619 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3531,6 +3531,7 @@ void qmp_x_blockdev_change(const char *parent, const char *child, BlockDriverState *parent_bs, *new_bs = NULL; BdrvChild *p_child; + bdrv_drain_all_begin(); bdrv_graph_wrlock(); parent_bs = bdrv_lookup_bs(parent, parent, errp); @@ -3568,6 +3569,7 @@ void qmp_x_blockdev_change(const char *parent, const char *child, out: bdrv_graph_wrunlock(); + bdrv_drain_all_end(); } BlockJobInfoList *qmp_query_block_jobs(Error **errp) diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 168f703fa1..f9e742f812 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -396,6 +396,13 @@ struct BlockDriver { int GRAPH_RDLOCK_PTR (*bdrv_probe_geometry)( BlockDriverState *bs, HDGeometry *geo); + /** + * Hot add a BDS's child. Used in combination with bdrv_del_child, so the + * user can take a child offline when it is broken and take a new child + * online. + * + * All block nodes must be drained. 
+ */ void GRAPH_WRLOCK_PTR (*bdrv_add_child)( BlockDriverState *parent, BlockDriverState *child, Error **errp); -- cgit 1.4.1 From b13f54654546cbc0661d3fe9d25f7543535c2bee Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:52 +0200 Subject: block: move drain outside of bdrv_root_unref_child() This is part of resolving the deadlock mentioned in commit "block: move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". bdrv_root_unref_child() is called by: 1. blk_remove_bs(), where a drained section is introduced. 2. bdrv_unref_child(), which runs under the graph lock, so the drain will be moved further up to its callers. 3. block_job_remove_all_bdrv(), where a drained section is introduced. For all callers of bdrv_unref_child() and its generated bdrv_co_unref_child() coroutine variant, a drained section is introduced; they are not explicitly listed here. The caller quorum_del_child() holds the graph lock, so it is not actually allowed to drain. This will be addressed in the next commit. 
Signed-off-by: Fiona Ebner Message-ID: <20250530151125.955508-16-f.ebner@proxmox.com> Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block.c | 18 ++++++++++++++---- block/blklogwrites.c | 4 ++++ block/blkverify.c | 2 ++ block/block-backend.c | 2 ++ block/qcow2.c | 4 ++++ block/quorum.c | 6 ++++++ block/replication.c | 2 ++ block/snapshot.c | 2 ++ block/vmdk.c | 10 ++++++++++ blockjob.c | 2 ++ tests/unit/test-bdrv-drain.c | 4 ++++ 11 files changed, 52 insertions(+), 4 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index f6c2f7e208..15a8ccb822 100644 --- a/block.c +++ b/block.c @@ -1721,12 +1721,14 @@ bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name, open_failed: bs->drv = NULL; + bdrv_drain_all_begin(); bdrv_graph_wrlock(); if (bs->file != NULL) { bdrv_unref_child(bs, bs->file); assert(!bs->file); } bdrv_graph_wrunlock(); + bdrv_drain_all_end(); g_free(bs->opaque); bs->opaque = NULL; @@ -3305,7 +3307,11 @@ out: return ret < 0 ? NULL : child; } -/* Callers must ensure that child->frozen is false. */ +/* + * Callers must ensure that child->frozen is false. + * + * All block nodes must be drained. + */ void bdrv_root_unref_child(BdrvChild *child) { BlockDriverState *child_bs = child->bs; @@ -3326,10 +3332,8 @@ void bdrv_root_unref_child(BdrvChild *child) * When the parent requiring a non-default AioContext is removed, the * node moves back to the main AioContext */ - bdrv_drain_all_begin(); bdrv_try_change_aio_context_locked(child_bs, qemu_get_aio_context(), NULL, NULL); - bdrv_drain_all_end(); } bdrv_schedule_unref(child_bs); @@ -3402,7 +3406,11 @@ bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child, } } -/* Callers must ensure that child->frozen is false. */ +/* + * Callers must ensure that child->frozen is false. + * + * All block nodes must be drained. 
+ */ void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child) { GLOBAL_STATE_CODE(); @@ -5172,6 +5180,7 @@ static void bdrv_close(BlockDriverState *bs) bs->drv = NULL; } + bdrv_drain_all_begin(); bdrv_graph_wrlock(); QLIST_FOREACH_SAFE(child, &bs->children, next, next) { bdrv_unref_child(bs, child); @@ -5180,6 +5189,7 @@ static void bdrv_close(BlockDriverState *bs) assert(!bs->backing); assert(!bs->file); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); g_free(bs->opaque); bs->opaque = NULL; diff --git a/block/blklogwrites.c b/block/blklogwrites.c index b0f78c4bc7..70ac76f401 100644 --- a/block/blklogwrites.c +++ b/block/blklogwrites.c @@ -281,9 +281,11 @@ static int blk_log_writes_open(BlockDriverState *bs, QDict *options, int flags, ret = 0; fail_log: if (ret < 0) { + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_unref_child(bs, s->log_file); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); s->log_file = NULL; qemu_mutex_destroy(&s->mutex); } @@ -296,10 +298,12 @@ static void blk_log_writes_close(BlockDriverState *bs) { BDRVBlkLogWritesState *s = bs->opaque; + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_unref_child(bs, s->log_file); s->log_file = NULL; bdrv_graph_wrunlock(); + bdrv_drain_all_end(); qemu_mutex_destroy(&s->mutex); } diff --git a/block/blkverify.c b/block/blkverify.c index db79a36681..3a71f7498c 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -151,10 +151,12 @@ static void blkverify_close(BlockDriverState *bs) { BDRVBlkverifyState *s = bs->opaque; + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_unref_child(bs, s->test_file); s->test_file = NULL; bdrv_graph_wrunlock(); + bdrv_drain_all_end(); } static int64_t coroutine_fn GRAPH_RDLOCK diff --git a/block/block-backend.c b/block/block-backend.c index 24cae3cb55..68209bb2f7 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -889,9 +889,11 @@ void blk_remove_bs(BlockBackend *blk) root = blk->root; blk->root = NULL; + bdrv_drain_all_begin(); bdrv_graph_wrlock(); 
bdrv_root_unref_child(root); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); } /* diff --git a/block/qcow2.c b/block/qcow2.c index 66fba89b41..45451a7ee8 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1895,7 +1895,9 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, g_free(s->image_data_file); if (open_data_file && has_data_file(bs)) { bdrv_graph_co_rdunlock(); + bdrv_drain_all_begin(); bdrv_co_unref_child(bs, s->data_file); + bdrv_drain_all_end(); bdrv_graph_co_rdlock(); s->data_file = NULL; } @@ -2821,9 +2823,11 @@ qcow2_do_close(BlockDriverState *bs, bool close_data_file) if (close_data_file && has_data_file(bs)) { GLOBAL_STATE_CODE(); bdrv_graph_rdunlock_main_loop(); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_unref_child(bs, s->data_file); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); s->data_file = NULL; bdrv_graph_rdlock_main_loop(); } diff --git a/block/quorum.c b/block/quorum.c index ed8ce801ee..81407a38ee 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -1037,6 +1037,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, close_exit: /* cleanup on error */ + bdrv_drain_all_begin(); bdrv_graph_wrlock(); for (i = 0; i < s->num_children; i++) { if (!opened[i]) { @@ -1045,6 +1046,7 @@ close_exit: bdrv_unref_child(bs, s->children[i]); } bdrv_graph_wrunlock(); + bdrv_drain_all_end(); g_free(s->children); g_free(opened); exit: @@ -1057,11 +1059,13 @@ static void quorum_close(BlockDriverState *bs) BDRVQuorumState *s = bs->opaque; int i; + bdrv_drain_all_begin(); bdrv_graph_wrlock(); for (i = 0; i < s->num_children; i++) { bdrv_unref_child(bs, s->children[i]); } bdrv_graph_wrunlock(); + bdrv_drain_all_end(); g_free(s->children); } @@ -1143,7 +1147,9 @@ quorum_del_child(BlockDriverState *bs, BdrvChild *child, Error **errp) (s->num_children - i - 1) * sizeof(BdrvChild *)); s->children = g_renew(BdrvChild *, s->children, --s->num_children); + bdrv_drain_all_begin(); bdrv_unref_child(bs, child); + 
bdrv_drain_all_end(); quorum_refresh_flags(bs); } diff --git a/block/replication.c b/block/replication.c index 54cbd03e00..0879718854 100644 --- a/block/replication.c +++ b/block/replication.c @@ -656,12 +656,14 @@ static void replication_done(void *opaque, int ret) if (ret == 0) { s->stage = BLOCK_REPLICATION_DONE; + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_unref_child(bs, s->secondary_disk); s->secondary_disk = NULL; bdrv_unref_child(bs, s->hidden_disk); s->hidden_disk = NULL; bdrv_graph_wrunlock(); + bdrv_drain_all_end(); s->error = 0; } else { diff --git a/block/snapshot.c b/block/snapshot.c index 9f300a78bd..28c9c43621 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -291,9 +291,11 @@ int bdrv_snapshot_goto(BlockDriverState *bs, } /* .bdrv_open() will re-attach it */ + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_unref_child(bs, fallback); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); ret = bdrv_snapshot_goto(fallback_bs, snapshot_id, errp); memset(bs->opaque, 0, drv->instance_size); diff --git a/block/vmdk.c b/block/vmdk.c index 9c7ab037e1..89a7250120 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -271,6 +271,7 @@ static void vmdk_free_extents(BlockDriverState *bs) BDRVVmdkState *s = bs->opaque; VmdkExtent *e; + bdrv_drain_all_begin(); bdrv_graph_wrlock(); for (i = 0; i < s->num_extents; i++) { e = &s->extents[i]; @@ -283,6 +284,7 @@ static void vmdk_free_extents(BlockDriverState *bs) } } bdrv_graph_wrunlock(); + bdrv_drain_all_end(); g_free(s->extents); } @@ -1247,9 +1249,11 @@ vmdk_parse_extents(const char *desc, BlockDriverState *bs, QDict *options, 0, 0, 0, 0, 0, &extent, errp); if (ret < 0) { bdrv_graph_rdunlock_main_loop(); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_unref_child(bs, extent_file); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); bdrv_graph_rdlock_main_loop(); goto out; } @@ -1266,9 +1270,11 @@ vmdk_parse_extents(const char *desc, BlockDriverState *bs, QDict *options, g_free(buf); if (ret) { 
bdrv_graph_rdunlock_main_loop(); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_unref_child(bs, extent_file); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); bdrv_graph_rdlock_main_loop(); goto out; } @@ -1277,9 +1283,11 @@ vmdk_parse_extents(const char *desc, BlockDriverState *bs, QDict *options, ret = vmdk_open_se_sparse(bs, extent_file, bs->open_flags, errp); if (ret) { bdrv_graph_rdunlock_main_loop(); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_unref_child(bs, extent_file); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); bdrv_graph_rdlock_main_loop(); goto out; } @@ -1287,9 +1295,11 @@ vmdk_parse_extents(const char *desc, BlockDriverState *bs, QDict *options, } else { error_setg(errp, "Unsupported extent type '%s'", type); bdrv_graph_rdunlock_main_loop(); + bdrv_drain_all_begin(); bdrv_graph_wrlock(); bdrv_unref_child(bs, extent_file); bdrv_graph_wrunlock(); + bdrv_drain_all_end(); bdrv_graph_rdlock_main_loop(); ret = -ENOTSUP; goto out; diff --git a/blockjob.c b/blockjob.c index 44991e3ff7..e68181a35b 100644 --- a/blockjob.c +++ b/blockjob.c @@ -198,6 +198,7 @@ void block_job_remove_all_bdrv(BlockJob *job) * one to make sure that such a concurrent access does not attempt * to process an already freed BdrvChild. 
*/ + bdrv_drain_all_begin(); bdrv_graph_wrlock(); while (job->nodes) { GSList *l = job->nodes; @@ -211,6 +212,7 @@ void block_job_remove_all_bdrv(BlockJob *job) g_slist_free_1(l); } bdrv_graph_wrunlock(); + bdrv_drain_all_end(); } bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs) diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c index ac76525e5a..59c2793725 100644 --- a/tests/unit/test-bdrv-drain.c +++ b/tests/unit/test-bdrv-drain.c @@ -955,11 +955,13 @@ static void bdrv_test_top_close(BlockDriverState *bs) { BdrvChild *c, *next_c; + bdrv_drain_all_begin(); bdrv_graph_wrlock(); QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) { bdrv_unref_child(bs, c); } bdrv_graph_wrunlock(); + bdrv_drain_all_end(); } static int coroutine_fn GRAPH_RDLOCK @@ -1016,7 +1018,9 @@ static void coroutine_fn test_co_delete_by_drain(void *opaque) bdrv_graph_co_rdlock(); QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) { bdrv_graph_co_rdunlock(); + bdrv_drain_all_begin(); bdrv_co_unref_child(bs, c); + bdrv_drain_all_end(); bdrv_graph_co_rdlock(); } bdrv_graph_co_rdunlock(); -- cgit 1.4.1 From d75f8ed1d7fc27cf1643e549cd006a68d3bf6ef1 Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 30 May 2025 17:10:53 +0200 Subject: block: move drain outside of quorum_del_child() The quorum_del_child() callback runs under the graph lock, so it is not allowed to drain. It is only called as the .bdrv_del_child() callback, which is only called in the bdrv_del_child() function, which also runs under the graph lock. The bdrv_del_child() function is called by qmp_x_blockdev_change(). A drained section was already introduced there by commit "block: move drain out of quorum_add_child()". This finally finishes the work, started in "block: move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK", of moving the drain out to places that are not under the graph lock.
Signed-off-by: Fiona Ebner Message-ID: <20250530151125.955508-17-f.ebner@proxmox.com> Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block.c | 2 ++ block/quorum.c | 2 -- include/block/block_int-common.h | 7 +++++++ 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'block.c') diff --git a/block.c b/block.c index 15a8ccb822..bfd4340b24 100644 --- a/block.c +++ b/block.c @@ -8276,6 +8276,8 @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs, /* * Hot remove a BDS's child. Used in combination with bdrv_add_child, so the * user can take a child offline when it is broken and take a new child online. + * + * All block nodes must be drained. */ void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp) { diff --git a/block/quorum.c b/block/quorum.c index 81407a38ee..cc3bc5f4e7 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -1147,9 +1147,7 @@ quorum_del_child(BlockDriverState *bs, BdrvChild *child, Error **errp) (s->num_children - i - 1) * sizeof(BdrvChild *)); s->children = g_renew(BdrvChild *, s->children, --s->num_children); - bdrv_drain_all_begin(); bdrv_unref_child(bs, child); - bdrv_drain_all_end(); quorum_refresh_flags(bs); } diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index f9e742f812..925a3e7353 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -406,6 +406,13 @@ struct BlockDriver { void GRAPH_WRLOCK_PTR (*bdrv_add_child)( BlockDriverState *parent, BlockDriverState *child, Error **errp); + /** + * Hot remove a BDS's child. Used in combination with bdrv_add_child, so the + * user can take a child offline when it is broken and take a new child + * online. + * + * All block nodes must be drained. + */ void GRAPH_WRLOCK_PTR (*bdrv_del_child)( BlockDriverState *parent, BdrvChild *child, Error **errp); -- cgit 1.4.1