From 34dc97b9a0e592bc466bdb0bbfe45d77304a72b6 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 17 Aug 2018 14:53:05 +0200 Subject: blockjob: Wake up BDS when job becomes idle In the context of draining a BDS, the .drained_poll callback of block jobs is called. If this returns true (i.e. there is still some activity pending), the drain operation may call aio_poll() with blocking=true to wait for completion. As soon as the pending activity is completed and the job finally arrives in a quiescent state (i.e. its coroutine either yields with busy=false or terminates), the block job must notify the aio_poll() loop to wake up, otherwise we get a deadlock if both are running in different threads. Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng Reviewed-by: Max Reitz --- include/qemu/job.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/qemu/job.h') diff --git a/include/qemu/job.h b/include/qemu/job.h index 5cb0681834..b4a784d3cc 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -156,6 +156,9 @@ typedef struct Job { /** Notifiers called when the job transitions to READY */ NotifierList on_ready; + /** Notifiers called when the job coroutine yields or terminates */ + NotifierList on_idle; + /** Element of the list of jobs */ QLIST_ENTRY(Job) job_list; -- cgit 1.4.1 From 30c070a547322a5e41ce129d540bca3653b1a9c8 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 17 Aug 2018 17:29:08 +0200 Subject: test-blockjob: Acquire AioContext around job_cancel_sync() All callers in QEMU proper hold the AioContext lock when calling job_finish_sync(). test-blockjob should do the same when it calls the function indirectly through job_cancel_sync(). Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng --- include/qemu/job.h | 6 ++++++ tests/test-blockjob.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'include/qemu/job.h') diff --git a/include/qemu/job.h b/include/qemu/job.h index b4a784d3cc..63c60ef1ba 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -524,6 +524,8 @@ void job_user_cancel(Job *job, bool force, Error **errp); * * Returns the return value from the job if the job actually completed * during the call, or -ECANCELED if it was canceled. + * + * Callers must hold the AioContext lock of job->aio_context. */ int job_cancel_sync(Job *job); @@ -541,6 +543,8 @@ void job_cancel_sync_all(void); * function). * * Returns the return value from the job. + * + * Callers must hold the AioContext lock of job->aio_context. */ int job_complete_sync(Job *job, Error **errp); @@ -566,6 +570,8 @@ void job_dismiss(Job **job, Error **errp); * * Returns 0 if the job is successfully completed, -ECANCELED if the job was * cancelled before completing, and -errno in other error cases. + * + * Callers must hold the AioContext lock of job->aio_context. */ int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error **errp); diff --git a/tests/test-blockjob.c b/tests/test-blockjob.c index de4c1c20aa..652d1e8359 100644 --- a/tests/test-blockjob.c +++ b/tests/test-blockjob.c @@ -223,6 +223,10 @@ static void cancel_common(CancelJob *s) BlockJob *job = &s->common; BlockBackend *blk = s->blk; JobStatus sts = job->job.status; + AioContext *ctx; + + ctx = job->job.aio_context; + aio_context_acquire(ctx); job_cancel_sync(&job->job); if (sts != JOB_STATUS_CREATED && sts != JOB_STATUS_CONCLUDED) { @@ -232,6 +236,8 @@ static void cancel_common(CancelJob *s) assert(job->job.status == JOB_STATUS_NULL); job_unref(&job->job); destroy_blk(blk); + + aio_context_release(ctx); } static void test_cancel_created(void) -- cgit 1.4.1 From b5a7a0573530698ee448b063ac01d485e30446bd Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 7 Sep 2018 15:31:22 +0200 Subject: blockjob: Lie better in child_job_drained_poll() Block jobs claim in .drained_poll() that they are in a quiescent state as soon as job->deferred_to_main_loop is true. This is obviously wrong, they still have a completion BH to run. We only get away with this because commit 91af091f923 added an unconditional aio_poll(false) to the drain functions, but this is bypassing the regular drain mechanisms. However, just removing this and telling that the job is still active doesn't work either: The completion callbacks themselves call drain functions (directly, or indirectly with bdrv_reopen), so they would deadlock then. As a better lie, tell that the job is active as long as the BH is pending, but falsely call it quiescent from the point in the BH when the completion callback is called. At this point, nested drain calls won't deadlock because they ignore the job, and outer drains will wait for the job to really reach a quiescent state because the callback is already running. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- blockjob.c | 2 +- include/qemu/job.h | 3 +++ job.c | 11 ++++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/qemu/job.h') diff --git a/blockjob.c b/blockjob.c index 58dbd87a51..4d5342259c 100644 --- a/blockjob.c +++ b/blockjob.c @@ -164,7 +164,7 @@ static bool child_job_drained_poll(BdrvChild *c) /* An inactive or completed job doesn't have any pending requests. Jobs * with !job->busy are either already paused or have a pause point after * being reentered, so no job driver code will run before they pause. */ - if (!job->busy || job_is_completed(job) || job->deferred_to_main_loop) { + if (!job->busy || job_is_completed(job)) { return false; } diff --git a/include/qemu/job.h b/include/qemu/job.h index 63c60ef1ba..9e7cd1e4a0 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -76,6 +76,9 @@ typedef struct Job { * Set to false by the job while the coroutine has yielded and may be * re-entered by job_enter(). There may still be I/O or event loop activity * pending. Accessed under block_job_mutex (in blockjob.c). + * + * When the job is deferred to the main loop, busy is true as long as the + * bottom half is still pending. */ bool busy; diff --git a/job.c b/job.c index 7ec8c3b969..518f603314 100644 --- a/job.c +++ b/job.c @@ -857,7 +857,16 @@ static void job_exit(void *opaque) AioContext *ctx = job->aio_context; aio_context_acquire(ctx); + + /* This is a lie, we're not quiescent, but still doing the completion + * callbacks. However, completion callbacks tend to involve operations that + * drain block nodes, and if .drained_poll still returned true, we would + * deadlock. */ + job->busy = false; + job_event_idle(job); + job_completed(job); + aio_context_release(ctx); } @@ -872,8 +881,8 @@ static void coroutine_fn job_co_entry(void *opaque) assert(job && job->driver && job->driver->run); job_pause_point(job); job->ret = job->driver->run(job, &job->err); - job_event_idle(job); job->deferred_to_main_loop = true; + job->busy = true; aio_bh_schedule_oneshot(qemu_get_aio_context(), job_exit, job); } -- cgit 1.4.1