Blame SOURCES/kvm-block-Really-pause-block-jobs-on-drain.patch

1bdc94
From 70365466a45a381ebb54e49cb03579b5fd6c76ef Mon Sep 17 00:00:00 2001
1bdc94
From: Kevin Wolf <kwolf@redhat.com>
1bdc94
Date: Fri, 14 Sep 2018 10:55:05 +0200
1bdc94
Subject: [PATCH 14/49] block: Really pause block jobs on drain
1bdc94
1bdc94
RH-Author: Kevin Wolf <kwolf@redhat.com>
1bdc94
Message-id: <20180914105540.18077-8-kwolf@redhat.com>
1bdc94
Patchwork-id: 82160
1bdc94
O-Subject: [RHV-7.6 qemu-kvm-rhev PATCH 07/42] block: Really pause block jobs on drain
1bdc94
Bugzilla: 1601212
1bdc94
RH-Acked-by: John Snow <jsnow@redhat.com>
1bdc94
RH-Acked-by: Max Reitz <mreitz@redhat.com>
1bdc94
RH-Acked-by: Fam Zheng <famz@redhat.com>
1bdc94
1bdc94
We already requested that block jobs be paused in .bdrv_drained_begin,
1bdc94
but no guarantee was made that the job was actually inactive at the
1bdc94
point where bdrv_drained_begin() returned.
1bdc94
1bdc94
This introduces a new callback BdrvChildRole.bdrv_drained_poll() and
1bdc94
uses it to make bdrv_drain_poll() consider block jobs using the node to
1bdc94
be drained.
1bdc94
1bdc94
For the test case to work as expected, we have to switch from
1bdc94
block_job_sleep_ns() to qemu_co_sleep_ns() so that the test job is even
1bdc94
considered active and must be waited for when draining the node.
1bdc94
1bdc94
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
1bdc94
(cherry picked from commit 89bd030533e3592ca0a995450dcfc5d53e459e20)
1bdc94
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
1bdc94
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
1bdc94
---
1bdc94
 block.c                      |  9 +++++++++
1bdc94
 block/io.c                   | 40 ++++++++++++++++++++++++++++++++++------
1bdc94
 block/mirror.c               |  8 ++++++++
1bdc94
 blockjob.c                   | 23 +++++++++++++++++++++++
1bdc94
 include/block/block.h        |  8 ++++++++
1bdc94
 include/block/block_int.h    |  7 +++++++
1bdc94
 include/block/blockjob_int.h |  8 ++++++++
1bdc94
 tests/test-bdrv-drain.c      | 18 ++++++++++--------
1bdc94
 8 files changed, 107 insertions(+), 14 deletions(-)
1bdc94
1bdc94
diff --git a/block.c b/block.c
1bdc94
index 10a1ece..0d9698a 100644
1bdc94
--- a/block.c
1bdc94
+++ b/block.c
1bdc94
@@ -821,6 +821,12 @@ static void bdrv_child_cb_drained_begin(BdrvChild *child)
1bdc94
     bdrv_drained_begin(bs);
1bdc94
 }
1bdc94
 
1bdc94
+static bool bdrv_child_cb_drained_poll(BdrvChild *child)
1bdc94
+{
1bdc94
+    BlockDriverState *bs = child->opaque;
1bdc94
+    return bdrv_drain_poll(bs, NULL);
1bdc94
+}
1bdc94
+
1bdc94
 static void bdrv_child_cb_drained_end(BdrvChild *child)
1bdc94
 {
1bdc94
     BlockDriverState *bs = child->opaque;
1bdc94
@@ -905,6 +911,7 @@ const BdrvChildRole child_file = {
1bdc94
     .get_parent_desc = bdrv_child_get_parent_desc,
1bdc94
     .inherit_options = bdrv_inherited_options,
1bdc94
     .drained_begin   = bdrv_child_cb_drained_begin,
1bdc94
+    .drained_poll    = bdrv_child_cb_drained_poll,
1bdc94
     .drained_end     = bdrv_child_cb_drained_end,
1bdc94
     .attach          = bdrv_child_cb_attach,
1bdc94
     .detach          = bdrv_child_cb_detach,
1bdc94
@@ -929,6 +936,7 @@ const BdrvChildRole child_format = {
1bdc94
     .get_parent_desc = bdrv_child_get_parent_desc,
1bdc94
     .inherit_options = bdrv_inherited_fmt_options,
1bdc94
     .drained_begin   = bdrv_child_cb_drained_begin,
1bdc94
+    .drained_poll    = bdrv_child_cb_drained_poll,
1bdc94
     .drained_end     = bdrv_child_cb_drained_end,
1bdc94
     .attach          = bdrv_child_cb_attach,
1bdc94
     .detach          = bdrv_child_cb_detach,
1bdc94
@@ -1048,6 +1056,7 @@ const BdrvChildRole child_backing = {
1bdc94
     .detach          = bdrv_backing_detach,
1bdc94
     .inherit_options = bdrv_backing_options,
1bdc94
     .drained_begin   = bdrv_child_cb_drained_begin,
1bdc94
+    .drained_poll    = bdrv_child_cb_drained_poll,
1bdc94
     .drained_end     = bdrv_child_cb_drained_end,
1bdc94
     .inactivate      = bdrv_child_cb_inactivate,
1bdc94
     .update_filename = bdrv_backing_update_filename,
1bdc94
diff --git a/block/io.c b/block/io.c
1bdc94
index 4d332c3..e260394 100644
1bdc94
--- a/block/io.c
1bdc94
+++ b/block/io.c
1bdc94
@@ -69,6 +69,23 @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
1bdc94
     }
1bdc94
 }
1bdc94
 
1bdc94
+static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore)
1bdc94
+{
1bdc94
+    BdrvChild *c, *next;
1bdc94
+    bool busy = false;
1bdc94
+
1bdc94
+    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
1bdc94
+        if (c == ignore) {
1bdc94
+            continue;
1bdc94
+        }
1bdc94
+        if (c->role->drained_poll) {
1bdc94
+            busy |= c->role->drained_poll(c);
1bdc94
+        }
1bdc94
+    }
1bdc94
+
1bdc94
+    return busy;
1bdc94
+}
1bdc94
+
1bdc94
 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
1bdc94
 {
1bdc94
     dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
1bdc94
@@ -182,21 +199,32 @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
1bdc94
 }
1bdc94
 
1bdc94
 /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
1bdc94
-static bool bdrv_drain_poll(BlockDriverState *bs)
1bdc94
+bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent)
1bdc94
+{
1bdc94
+    if (bdrv_parent_drained_poll(bs, ignore_parent)) {
1bdc94
+        return true;
1bdc94
+    }
1bdc94
+
1bdc94
+    return atomic_read(&bs->in_flight);
1bdc94
+}
1bdc94
+
1bdc94
+static bool bdrv_drain_poll_top_level(BlockDriverState *bs,
1bdc94
+                                      BdrvChild *ignore_parent)
1bdc94
 {
1bdc94
     /* Execute pending BHs first and check everything else only after the BHs
1bdc94
      * have executed. */
1bdc94
     while (aio_poll(bs->aio_context, false));
1bdc94
-    return atomic_read(&bs->in_flight);
1bdc94
+
1bdc94
+    return bdrv_drain_poll(bs, ignore_parent);
1bdc94
 }
1bdc94
 
1bdc94
-static bool bdrv_drain_recurse(BlockDriverState *bs)
1bdc94
+static bool bdrv_drain_recurse(BlockDriverState *bs, BdrvChild *parent)
1bdc94
 {
1bdc94
     BdrvChild *child, *tmp;
1bdc94
     bool waited;
1bdc94
 
1bdc94
     /* Wait for drained requests to finish */
1bdc94
-    waited = BDRV_POLL_WHILE(bs, bdrv_drain_poll(bs));
1bdc94
+    waited = BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, parent));
1bdc94
 
1bdc94
     QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
1bdc94
         BlockDriverState *bs = child->bs;
1bdc94
@@ -213,7 +241,7 @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
1bdc94
              */
1bdc94
             bdrv_ref(bs);
1bdc94
         }
1bdc94
-        waited |= bdrv_drain_recurse(bs);
1bdc94
+        waited |= bdrv_drain_recurse(bs, child);
1bdc94
         if (in_main_loop) {
1bdc94
             bdrv_unref(bs);
1bdc94
         }
1bdc94
@@ -289,7 +317,7 @@ void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
1bdc94
 
1bdc94
     bdrv_parent_drained_begin(bs, parent);
1bdc94
     bdrv_drain_invoke(bs, true);
1bdc94
-    bdrv_drain_recurse(bs);
1bdc94
+    bdrv_drain_recurse(bs, parent);
1bdc94
 
1bdc94
     if (recursive) {
1bdc94
         bs->recursive_quiesce_counter++;
1bdc94
diff --git a/block/mirror.c b/block/mirror.c
1bdc94
index 313e6e9..4b27f71 100644
1bdc94
--- a/block/mirror.c
1bdc94
+++ b/block/mirror.c
1bdc94
@@ -976,6 +976,12 @@ static void mirror_pause(Job *job)
1bdc94
     mirror_wait_for_all_io(s);
1bdc94
 }
1bdc94
 
1bdc94
+static bool mirror_drained_poll(BlockJob *job)
1bdc94
+{
1bdc94
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
1bdc94
+    return !!s->in_flight;
1bdc94
+}
1bdc94
+
1bdc94
 static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
1bdc94
 {
1bdc94
     MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
1bdc94
@@ -1011,6 +1017,7 @@ static const BlockJobDriver mirror_job_driver = {
1bdc94
         .pause                  = mirror_pause,
1bdc94
         .complete               = mirror_complete,
1bdc94
     },
1bdc94
+    .drained_poll           = mirror_drained_poll,
1bdc94
     .attached_aio_context   = mirror_attached_aio_context,
1bdc94
     .drain                  = mirror_drain,
1bdc94
 };
1bdc94
@@ -1028,6 +1035,7 @@ static const BlockJobDriver commit_active_job_driver = {
1bdc94
         .pause                  = mirror_pause,
1bdc94
         .complete               = mirror_complete,
1bdc94
     },
1bdc94
+    .drained_poll           = mirror_drained_poll,
1bdc94
     .attached_aio_context   = mirror_attached_aio_context,
1bdc94
     .drain                  = mirror_drain,
1bdc94
 };
1bdc94
diff --git a/blockjob.c b/blockjob.c
1bdc94
index 0306533..be5903a 100644
1bdc94
--- a/blockjob.c
1bdc94
+++ b/blockjob.c
1bdc94
@@ -155,6 +155,28 @@ static void child_job_drained_begin(BdrvChild *c)
1bdc94
     job_pause(&job->job);
1bdc94
 }
1bdc94
 
1bdc94
+static bool child_job_drained_poll(BdrvChild *c)
1bdc94
+{
1bdc94
+    BlockJob *bjob = c->opaque;
1bdc94
+    Job *job = &bjob->job;
1bdc94
+    const BlockJobDriver *drv = block_job_driver(bjob);
1bdc94
+
1bdc94
+    /* An inactive or completed job doesn't have any pending requests. Jobs
1bdc94
+     * with !job->busy are either already paused or have a pause point after
1bdc94
+     * being reentered, so no job driver code will run before they pause. */
1bdc94
+    if (!job->busy || job_is_completed(job) || job->deferred_to_main_loop) {
1bdc94
+        return false;
1bdc94
+    }
1bdc94
+
1bdc94
+    /* Otherwise, assume that it isn't fully stopped yet, but allow the job to
1bdc94
+     * override this assumption. */
1bdc94
+    if (drv->drained_poll) {
1bdc94
+        return drv->drained_poll(bjob);
1bdc94
+    } else {
1bdc94
+        return true;
1bdc94
+    }
1bdc94
+}
1bdc94
+
1bdc94
 static void child_job_drained_end(BdrvChild *c)
1bdc94
 {
1bdc94
     BlockJob *job = c->opaque;
1bdc94
@@ -164,6 +186,7 @@ static void child_job_drained_end(BdrvChild *c)
1bdc94
 static const BdrvChildRole child_job = {
1bdc94
     .get_parent_desc    = child_job_get_parent_desc,
1bdc94
     .drained_begin      = child_job_drained_begin,
1bdc94
+    .drained_poll       = child_job_drained_poll,
1bdc94
     .drained_end        = child_job_drained_end,
1bdc94
     .stay_at_node       = true,
1bdc94
 };
1bdc94
diff --git a/include/block/block.h b/include/block/block.h
1bdc94
index 8f87eea..8c91d4c 100644
1bdc94
--- a/include/block/block.h
1bdc94
+++ b/include/block/block.h
1bdc94
@@ -596,6 +596,14 @@ void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);
1bdc94
 void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
1bdc94
 
1bdc94
 /**
1bdc94
+ * bdrv_drain_poll:
1bdc94
+ *
1bdc94
+ * Poll for pending requests in @bs and its parents (except for
1bdc94
+ * @ignore_parent). This is part of bdrv_drained_begin.
1bdc94
+ */
1bdc94
+bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent);
1bdc94
+
1bdc94
+/**
1bdc94
  * bdrv_drained_begin:
1bdc94
  *
1bdc94
  * Begin a quiesced section for exclusive access to the BDS, by disabling
1bdc94
diff --git a/include/block/block_int.h b/include/block/block_int.h
1bdc94
index 341cbe8..beeacde 100644
1bdc94
--- a/include/block/block_int.h
1bdc94
+++ b/include/block/block_int.h
1bdc94
@@ -610,6 +610,13 @@ struct BdrvChildRole {
1bdc94
     void (*drained_begin)(BdrvChild *child);
1bdc94
     void (*drained_end)(BdrvChild *child);
1bdc94
 
1bdc94
+    /*
1bdc94
+     * Returns whether the parent has pending requests for the child. This
1bdc94
+     * callback is polled after .drained_begin() has been called until all
1bdc94
+     * activity on the child has stopped.
1bdc94
+     */
1bdc94
+    bool (*drained_poll)(BdrvChild *child);
1bdc94
+
1bdc94
     /* Notifies the parent that the child has been activated/inactivated (e.g.
1bdc94
      * when migration is completing) and it can start/stop requesting
1bdc94
      * permissions and doing I/O on it. */
1bdc94
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
1bdc94
index 5cd50c6..e4a318d 100644
1bdc94
--- a/include/block/blockjob_int.h
1bdc94
+++ b/include/block/blockjob_int.h
1bdc94
@@ -39,6 +39,14 @@ struct BlockJobDriver {
1bdc94
     JobDriver job_driver;
1bdc94
 
1bdc94
     /*
1bdc94
+     * Returns whether the job has pending requests for the child or will
1bdc94
+     * submit new requests before the next pause point. This callback is polled
1bdc94
+     * in the context of draining a job node after requesting that the job be
1bdc94
+     * paused, until all activity on the child has stopped.
1bdc94
+     */
1bdc94
+    bool (*drained_poll)(BlockJob *job);
1bdc94
+
1bdc94
+    /*
1bdc94
      * If the callback is not NULL, it will be invoked before the job is
1bdc94
      * resumed in a new AioContext.  This is the place to move any resources
1bdc94
      * besides job->blk to the new AioContext.
1bdc94
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
1bdc94
index f5d85c9..49786ea 100644
1bdc94
--- a/tests/test-bdrv-drain.c
1bdc94
+++ b/tests/test-bdrv-drain.c
1bdc94
@@ -681,7 +681,11 @@ static int coroutine_fn test_job_run(Job *job, Error **errp)
1bdc94
 
1bdc94
     job_transition_to_ready(&s->common.job);
1bdc94
     while (!s->should_complete) {
1bdc94
-        job_sleep_ns(&s->common.job, 100000);
1bdc94
+        /* Avoid block_job_sleep_ns() because it marks the job as !busy. We
1bdc94
+         * want to emulate some actual activity (probably some I/O) here so
1bdc94
+         * that drain has to wait for this acitivity to stop. */
1bdc94
+        qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
1bdc94
+        job_pause_point(&s->common.job);
1bdc94
     }
1bdc94
 
1bdc94
     return 0;
1bdc94
@@ -728,7 +732,7 @@ static void test_blockjob_common(enum drain_type drain_type)
1bdc94
 
1bdc94
     g_assert_cmpint(job->job.pause_count, ==, 0);
1bdc94
     g_assert_false(job->job.paused);
1bdc94
-    g_assert_false(job->job.busy); /* We're in job_sleep_ns() */
1bdc94
+    g_assert_true(job->job.busy); /* We're in job_sleep_ns() */
1bdc94
 
1bdc94
     do_drain_begin(drain_type, src);
1bdc94
 
1bdc94
@@ -738,15 +742,14 @@ static void test_blockjob_common(enum drain_type drain_type)
1bdc94
     } else {
1bdc94
         g_assert_cmpint(job->job.pause_count, ==, 1);
1bdc94
     }
1bdc94
-    /* XXX We don't wait until the job is actually paused. Is this okay? */
1bdc94
-    /* g_assert_true(job->job.paused); */
1bdc94
+    g_assert_true(job->job.paused);
1bdc94
     g_assert_false(job->job.busy); /* The job is paused */
1bdc94
 
1bdc94
     do_drain_end(drain_type, src);
1bdc94
 
1bdc94
     g_assert_cmpint(job->job.pause_count, ==, 0);
1bdc94
     g_assert_false(job->job.paused);
1bdc94
-    g_assert_false(job->job.busy); /* We're in job_sleep_ns() */
1bdc94
+    g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
1bdc94
 
1bdc94
     do_drain_begin(drain_type, target);
1bdc94
 
1bdc94
@@ -756,15 +759,14 @@ static void test_blockjob_common(enum drain_type drain_type)
1bdc94
     } else {
1bdc94
         g_assert_cmpint(job->job.pause_count, ==, 1);
1bdc94
     }
1bdc94
-    /* XXX We don't wait until the job is actually paused. Is this okay? */
1bdc94
-    /* g_assert_true(job->job.paused); */
1bdc94
+    g_assert_true(job->job.paused);
1bdc94
     g_assert_false(job->job.busy); /* The job is paused */
1bdc94
 
1bdc94
     do_drain_end(drain_type, target);
1bdc94
 
1bdc94
     g_assert_cmpint(job->job.pause_count, ==, 0);
1bdc94
     g_assert_false(job->job.paused);
1bdc94
-    g_assert_false(job->job.busy); /* We're in job_sleep_ns() */
1bdc94
+    g_assert_true(job->job.busy); /* We're in job_sleep_ns() */
1bdc94
 
1bdc94
     ret = job_complete_sync(&job->job, &error_abort);
1bdc94
     g_assert_cmpint(ret, ==, 0);
1bdc94
-- 
1bdc94
1.8.3.1
1bdc94