Blame SOURCES/kvm-job-take-each-job-s-lock-individually-in-job_txn_app.patch

902636
From 3f16b8a33bd7503cbe857fbeb45fff7301b6bb5f Mon Sep 17 00:00:00 2001
902636
From: Kevin Wolf <kwolf@redhat.com>
902636
Date: Wed, 8 Apr 2020 17:29:12 +0100
902636
Subject: [PATCH 1/6] job: take each job's lock individually in job_txn_apply
902636
902636
RH-Author: Kevin Wolf <kwolf@redhat.com>
902636
Message-id: <20200408172917.18712-2-kwolf@redhat.com>
902636
Patchwork-id: 94597
902636
O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/6] job: take each job's lock individually in job_txn_apply
902636
Bugzilla: 1817621
902636
RH-Acked-by: Eric Blake <eblake@redhat.com>
902636
RH-Acked-by: Danilo de Paula <ddepaula@redhat.com>
902636
RH-Acked-by: Max Reitz <mreitz@redhat.com>
902636
902636
From: Stefan Reiter <s.reiter@proxmox.com>
902636
902636
All callers of job_txn_apply hold a single job's lock, but different
902636
jobs within a transaction can have different contexts, thus we need to
902636
lock each one individually before applying the callback function.
902636
902636
Similar to job_completed_txn_abort this also requires releasing the
902636
caller's context before and reacquiring it after to avoid recursive
902636
locks which might break AIO_WAIT_WHILE in the callback. This is safe, since
902636
existing code would already have to take this into account, lest
902636
job_completed_txn_abort might have broken.
902636
902636
This also brings to light a different issue: When a callback function in
902636
job_txn_apply moves it's job to a different AIO context, callers will
902636
try to release the wrong lock (now that we re-acquire the lock
902636
correctly, previously it would just continue with the old lock, leaving
902636
the job unlocked for the rest of the return path). Fix this by not caching
902636
the job's context.
902636
902636
This is only necessary for qmp_block_job_finalize, qmp_job_finalize and
902636
job_exit, since everyone else calls through job_exit.
902636
902636
One test needed adapting, since it calls job_finalize directly, so it
902636
manually needs to acquire the correct context.
902636
902636
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
902636
Message-Id: <20200407115651.69472-2-s.reiter@proxmox.com>
902636
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
902636
(cherry picked from commit b660a84bbb0eb1a76b505648d31d5e82594fb75e)
902636
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
902636
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
902636
---
902636
 blockdev.c            |  9 +++++++++
902636
 job-qmp.c             |  9 +++++++++
902636
 job.c                 | 50 ++++++++++++++++++++++++++++++++++++++++----------
902636
 tests/test-blockjob.c |  2 ++
902636
 4 files changed, 60 insertions(+), 10 deletions(-)
902636
902636
diff --git a/blockdev.c b/blockdev.c
902636
index c8d4b51..86eb115 100644
902636
--- a/blockdev.c
902636
+++ b/blockdev.c
902636
@@ -4215,7 +4215,16 @@ void qmp_block_job_finalize(const char *id, Error **errp)
902636
     }
902636
 
902636
     trace_qmp_block_job_finalize(job);
902636
+    job_ref(&job->job);
902636
     job_finalize(&job->job, errp);
902636
+
902636
+    /*
902636
+     * Job's context might have changed via job_finalize (and job_txn_apply
902636
+     * automatically acquires the new one), so make sure we release the correct
902636
+     * one.
902636
+     */
902636
+    aio_context = blk_get_aio_context(job->blk);
902636
+    job_unref(&job->job);
902636
     aio_context_release(aio_context);
902636
 }
902636
 
902636
diff --git a/job-qmp.c b/job-qmp.c
902636
index fbfed25..a201220 100644
902636
--- a/job-qmp.c
902636
+++ b/job-qmp.c
902636
@@ -114,7 +114,16 @@ void qmp_job_finalize(const char *id, Error **errp)
902636
     }
902636
 
902636
     trace_qmp_job_finalize(job);
902636
+    job_ref(job);
902636
     job_finalize(job, errp);
902636
+
902636
+    /*
902636
+     * Job's context might have changed via job_finalize (and job_txn_apply
902636
+     * automatically acquires the new one), so make sure we release the correct
902636
+     * one.
902636
+     */
902636
+    aio_context = job->aio_context;
902636
+    job_unref(job);
902636
     aio_context_release(aio_context);
902636
 }
902636
 
902636
diff --git a/job.c b/job.c
902636
index 04409b4..48fc4ad 100644
902636
--- a/job.c
902636
+++ b/job.c
902636
@@ -136,17 +136,38 @@ static void job_txn_del_job(Job *job)
902636
     }
902636
 }
902636
 
902636
-static int job_txn_apply(JobTxn *txn, int fn(Job *))
902636
+static int job_txn_apply(Job *job, int fn(Job *))
902636
 {
902636
-    Job *job, *next;
902636
+    AioContext *inner_ctx;
902636
+    Job *other_job, *next;
902636
+    JobTxn *txn = job->txn;
902636
     int rc = 0;
902636
 
902636
-    QLIST_FOREACH_SAFE(job, &txn->jobs, txn_list, next) {
902636
-        rc = fn(job);
902636
+    /*
902636
+     * Similar to job_completed_txn_abort, we take each job's lock before
902636
+     * applying fn, but since we assume that outer_ctx is held by the caller,
902636
+     * we need to release it here to avoid holding the lock twice - which would
902636
+     * break AIO_WAIT_WHILE from within fn.
902636
+     */
902636
+    job_ref(job);
902636
+    aio_context_release(job->aio_context);
902636
+
902636
+    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
902636
+        inner_ctx = other_job->aio_context;
902636
+        aio_context_acquire(inner_ctx);
902636
+        rc = fn(other_job);
902636
+        aio_context_release(inner_ctx);
902636
         if (rc) {
902636
             break;
902636
         }
902636
     }
902636
+
902636
+    /*
902636
+     * Note that job->aio_context might have been changed by calling fn, so we
902636
+     * can't use a local variable to cache it.
902636
+     */
902636
+    aio_context_acquire(job->aio_context);
902636
+    job_unref(job);
902636
     return rc;
902636
 }
902636
 
902636
@@ -774,11 +795,11 @@ static void job_do_finalize(Job *job)
902636
     assert(job && job->txn);
902636
 
902636
     /* prepare the transaction to complete */
902636
-    rc = job_txn_apply(job->txn, job_prepare);
902636
+    rc = job_txn_apply(job, job_prepare);
902636
     if (rc) {
902636
         job_completed_txn_abort(job);
902636
     } else {
902636
-        job_txn_apply(job->txn, job_finalize_single);
902636
+        job_txn_apply(job, job_finalize_single);
902636
     }
902636
 }
902636
 
902636
@@ -824,10 +845,10 @@ static void job_completed_txn_success(Job *job)
902636
         assert(other_job->ret == 0);
902636
     }
902636
 
902636
-    job_txn_apply(txn, job_transition_to_pending);
902636
+    job_txn_apply(job, job_transition_to_pending);
902636
 
902636
     /* If no jobs need manual finalization, automatically do so */
902636
-    if (job_txn_apply(txn, job_needs_finalize) == 0) {
902636
+    if (job_txn_apply(job, job_needs_finalize) == 0) {
902636
         job_do_finalize(job);
902636
     }
902636
 }
902636
@@ -849,9 +870,10 @@ static void job_completed(Job *job)
902636
 static void job_exit(void *opaque)
902636
 {
902636
     Job *job = (Job *)opaque;
902636
-    AioContext *ctx = job->aio_context;
902636
+    AioContext *ctx;
902636
 
902636
-    aio_context_acquire(ctx);
902636
+    job_ref(job);
902636
+    aio_context_acquire(job->aio_context);
902636
 
902636
     /* This is a lie, we're not quiescent, but still doing the completion
902636
      * callbacks. However, completion callbacks tend to involve operations that
902636
@@ -862,6 +884,14 @@ static void job_exit(void *opaque)
902636
 
902636
     job_completed(job);
902636
 
902636
+    /*
902636
+     * Note that calling job_completed can move the job to a different
902636
+     * aio_context, so we cannot cache from above. job_txn_apply takes care of
902636
+     * acquiring the new lock, and we ref/unref to avoid job_completed freeing
902636
+     * the job underneath us.
902636
+     */
902636
+    ctx = job->aio_context;
902636
+    job_unref(job);
902636
     aio_context_release(ctx);
902636
 }
902636
 
902636
diff --git a/tests/test-blockjob.c b/tests/test-blockjob.c
902636
index 7844c9f..6d857fd 100644
902636
--- a/tests/test-blockjob.c
902636
+++ b/tests/test-blockjob.c
902636
@@ -368,7 +368,9 @@ static void test_cancel_concluded(void)
902636
     aio_poll(qemu_get_aio_context(), true);
902636
     assert(job->status == JOB_STATUS_PENDING);
902636
 
902636
+    aio_context_acquire(job->aio_context);
902636
     job_finalize(job, &error_abort);
902636
+    aio_context_release(job->aio_context);
902636
     assert(job->status == JOB_STATUS_CONCLUDED);
902636
 
902636
     cancel_common(s);
902636
-- 
902636
1.8.3.1
902636