26ba25
From 7946b28c7e68c564b0734869ff6f4e36bd5e2e2d Mon Sep 17 00:00:00 2001
26ba25
From: Kevin Wolf <kwolf@redhat.com>
26ba25
Date: Wed, 10 Oct 2018 20:21:48 +0100
26ba25
Subject: [PATCH 22/49] block: Allow graph changes in bdrv_drain_all_begin/end
26ba25
 sections
26ba25
26ba25
RH-Author: Kevin Wolf <kwolf@redhat.com>
26ba25
Message-id: <20181010202213.7372-10-kwolf@redhat.com>
26ba25
Patchwork-id: 82597
26ba25
O-Subject: [RHEL-8 qemu-kvm PATCH 19/44] block: Allow graph changes in bdrv_drain_all_begin/end sections
26ba25
Bugzilla: 1637976
26ba25
RH-Acked-by: Max Reitz <mreitz@redhat.com>
26ba25
RH-Acked-by: John Snow <jsnow@redhat.com>
26ba25
RH-Acked-by: Thomas Huth <thuth@redhat.com>
26ba25
26ba25
bdrv_drain_all_*() used bdrv_next() to iterate over all root nodes and
26ba25
did a subtree drain for each of them. This works fine as long as the
26ba25
graph is static, but sadly, reality looks different.
26ba25
26ba25
If the graph changes so that root nodes are added or removed, we would
26ba25
have to compensate for this. bdrv_next() returns each root node only
26ba25
once even if it's the root node for multiple BlockBackends or for a
26ba25
monitor-owned block driver tree, which would only complicate things.
26ba25
26ba25
The much easier and more obviously correct way is to fundamentally
26ba25
change the way the functions work: Iterate over all BlockDriverStates,
26ba25
no matter who owns them, and drain them individually. Compensation is
26ba25
only necessary when a new BDS is created inside a drain_all section.
26ba25
Removal of a BDS doesn't require any action because it's gone afterwards
26ba25
anyway.
26ba25
26ba25
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
26ba25
(cherry picked from commit 0f12264e7a41458179ad10276a7c33c72024861a)
26ba25
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
26ba25
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
26ba25
---
26ba25
 block.c                   | 34 ++++++++++++++++++++++++---
26ba25
 block/io.c                | 60 ++++++++++++++++++++++++++++++++++++-----------
26ba25
 include/block/block.h     |  1 +
26ba25
 include/block/block_int.h |  1 +
26ba25
 4 files changed, 79 insertions(+), 17 deletions(-)
26ba25
26ba25
diff --git a/block.c b/block.c
26ba25
index 519be6e..eea9c6f 100644
26ba25
--- a/block.c
26ba25
+++ b/block.c
26ba25
@@ -333,6 +333,10 @@ BlockDriverState *bdrv_new(void)
26ba25
 
26ba25
     qemu_co_queue_init(&bs->flush_queue);
26ba25
 
26ba25
+    for (i = 0; i < bdrv_drain_all_count; i++) {
26ba25
+        bdrv_drained_begin(bs);
26ba25
+    }
26ba25
+
26ba25
     QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
26ba25
 
26ba25
     return bs;
26ba25
@@ -1170,7 +1174,7 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
26ba25
                             int open_flags, Error **errp)
26ba25
 {
26ba25
     Error *local_err = NULL;
26ba25
-    int ret;
26ba25
+    int i, ret;
26ba25
 
26ba25
     bdrv_assign_node_name(bs, node_name, &local_err);
26ba25
     if (local_err) {
26ba25
@@ -1218,6 +1222,12 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
26ba25
     assert(bdrv_min_mem_align(bs) != 0);
26ba25
     assert(is_power_of_2(bs->bl.request_alignment));
26ba25
 
26ba25
+    for (i = 0; i < bs->quiesce_counter; i++) {
26ba25
+        if (drv->bdrv_co_drain_begin) {
26ba25
+            drv->bdrv_co_drain_begin(bs);
26ba25
+        }
26ba25
+    }
26ba25
+
26ba25
     return 0;
26ba25
 open_failed:
26ba25
     bs->drv = NULL;
26ba25
@@ -2039,7 +2049,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
26ba25
             child->role->detach(child);
26ba25
         }
26ba25
         if (old_bs->quiesce_counter && child->role->drained_end) {
26ba25
-            for (i = 0; i < old_bs->quiesce_counter; i++) {
26ba25
+            int num = old_bs->quiesce_counter;
26ba25
+            if (child->role->parent_is_bds) {
26ba25
+                num -= bdrv_drain_all_count;
26ba25
+            }
26ba25
+            assert(num >= 0);
26ba25
+            for (i = 0; i < num; i++) {
26ba25
                 child->role->drained_end(child);
26ba25
             }
26ba25
         }
26ba25
@@ -2051,7 +2066,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
26ba25
     if (new_bs) {
26ba25
         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
26ba25
         if (new_bs->quiesce_counter && child->role->drained_begin) {
26ba25
-            for (i = 0; i < new_bs->quiesce_counter; i++) {
26ba25
+            int num = new_bs->quiesce_counter;
26ba25
+            if (child->role->parent_is_bds) {
26ba25
+                num -= bdrv_drain_all_count;
26ba25
+            }
26ba25
+            assert(num >= 0);
26ba25
+            for (i = 0; i < num; i++) {
26ba25
                 child->role->drained_begin(child);
26ba25
             }
26ba25
         }
26ba25
@@ -3993,6 +4013,14 @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs)
26ba25
     return QTAILQ_NEXT(bs, node_list);
26ba25
 }
26ba25
 
26ba25
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
26ba25
+{
26ba25
+    if (!bs) {
26ba25
+        return QTAILQ_FIRST(&all_bdrv_states);
26ba25
+    }
26ba25
+    return QTAILQ_NEXT(bs, bs_list);
26ba25
+}
26ba25
+
26ba25
 const char *bdrv_get_node_name(const BlockDriverState *bs)
26ba25
 {
26ba25
     return bs->node_name;
26ba25
diff --git a/block/io.c b/block/io.c
26ba25
index 0021fefd..38ae299 100644
26ba25
--- a/block/io.c
26ba25
+++ b/block/io.c
26ba25
@@ -38,6 +38,8 @@
26ba25
 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
26ba25
 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
26ba25
 
26ba25
+static AioWait drain_all_aio_wait;
26ba25
+
26ba25
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
26ba25
     int64_t offset, int bytes, BdrvRequestFlags flags);
26ba25
 
26ba25
@@ -471,6 +473,29 @@ static void bdrv_drain_assert_idle(BlockDriverState *bs)
26ba25
     }
26ba25
 }
26ba25
 
26ba25
+unsigned int bdrv_drain_all_count = 0;
26ba25
+
26ba25
+static bool bdrv_drain_all_poll(void)
26ba25
+{
26ba25
+    BlockDriverState *bs = NULL;
26ba25
+    bool result = false;
26ba25
+
26ba25
+    /* Execute pending BHs first (may modify the graph) and check everything
26ba25
+     * else only after the BHs have executed. */
26ba25
+    while (aio_poll(qemu_get_aio_context(), false));
26ba25
+
26ba25
+    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
26ba25
+     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
26ba25
+    while ((bs = bdrv_next_all_states(bs))) {
26ba25
+        AioContext *aio_context = bdrv_get_aio_context(bs);
26ba25
+        aio_context_acquire(aio_context);
26ba25
+        result |= bdrv_drain_poll(bs, false, NULL, true);
26ba25
+        aio_context_release(aio_context);
26ba25
+    }
26ba25
+
26ba25
+    return result;
26ba25
+}
26ba25
+
26ba25
 /*
26ba25
  * Wait for pending requests to complete across all BlockDriverStates
26ba25
  *
26ba25
@@ -485,45 +510,51 @@ static void bdrv_drain_assert_idle(BlockDriverState *bs)
26ba25
  */
26ba25
 void bdrv_drain_all_begin(void)
26ba25
 {
26ba25
-    BlockDriverState *bs;
26ba25
-    BdrvNextIterator it;
26ba25
+    BlockDriverState *bs = NULL;
26ba25
 
26ba25
     if (qemu_in_coroutine()) {
26ba25
-        bdrv_co_yield_to_drain(NULL, true, false, NULL, false, true);
26ba25
+        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
26ba25
         return;
26ba25
     }
26ba25
 
26ba25
-    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
26ba25
-     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
26ba25
-     * nodes in several different AioContexts, so make sure we're in the main
26ba25
-     * context. */
26ba25
+    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
26ba25
+     * loop AioContext, so make sure we're in the main context. */
26ba25
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
26ba25
+    assert(bdrv_drain_all_count < INT_MAX);
26ba25
+    bdrv_drain_all_count++;
26ba25
 
26ba25
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
26ba25
+    /* Quiesce all nodes, without polling in-flight requests yet. The graph
26ba25
+     * cannot change during this loop. */
26ba25
+    while ((bs = bdrv_next_all_states(bs))) {
26ba25
         AioContext *aio_context = bdrv_get_aio_context(bs);
26ba25
 
26ba25
         aio_context_acquire(aio_context);
26ba25
-        bdrv_do_drained_begin(bs, true, NULL, false, true);
26ba25
+        bdrv_do_drained_begin(bs, false, NULL, true, false);
26ba25
         aio_context_release(aio_context);
26ba25
     }
26ba25
 
26ba25
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
26ba25
+    /* Now poll the in-flight requests */
26ba25
+    AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll());
26ba25
+
26ba25
+    while ((bs = bdrv_next_all_states(bs))) {
26ba25
         bdrv_drain_assert_idle(bs);
26ba25
     }
26ba25
 }
26ba25
 
26ba25
 void bdrv_drain_all_end(void)
26ba25
 {
26ba25
-    BlockDriverState *bs;
26ba25
-    BdrvNextIterator it;
26ba25
+    BlockDriverState *bs = NULL;
26ba25
 
26ba25
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
26ba25
+    while ((bs = bdrv_next_all_states(bs))) {
26ba25
         AioContext *aio_context = bdrv_get_aio_context(bs);
26ba25
 
26ba25
         aio_context_acquire(aio_context);
26ba25
-        bdrv_do_drained_end(bs, true, NULL, false);
26ba25
+        bdrv_do_drained_end(bs, false, NULL, true);
26ba25
         aio_context_release(aio_context);
26ba25
     }
26ba25
+
26ba25
+    assert(bdrv_drain_all_count > 0);
26ba25
+    bdrv_drain_all_count--;
26ba25
 }
26ba25
 
26ba25
 void bdrv_drain_all(void)
26ba25
@@ -658,6 +689,7 @@ void bdrv_inc_in_flight(BlockDriverState *bs)
26ba25
 void bdrv_wakeup(BlockDriverState *bs)
26ba25
 {
26ba25
     aio_wait_kick(bdrv_get_aio_wait(bs));
26ba25
+    aio_wait_kick(&drain_all_aio_wait);
26ba25
 }
26ba25
 
26ba25
 void bdrv_dec_in_flight(BlockDriverState *bs)
26ba25
diff --git a/include/block/block.h b/include/block/block.h
26ba25
index 6e91803..f9079ac 100644
26ba25
--- a/include/block/block.h
26ba25
+++ b/include/block/block.h
26ba25
@@ -449,6 +449,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
26ba25
                                  Error **errp);
26ba25
 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base);
26ba25
 BlockDriverState *bdrv_next_node(BlockDriverState *bs);
26ba25
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs);
26ba25
 
26ba25
 typedef struct BdrvNextIterator {
26ba25
     enum {
26ba25
diff --git a/include/block/block_int.h b/include/block/block_int.h
26ba25
index 0ad8a76..9757d5e 100644
26ba25
--- a/include/block/block_int.h
26ba25
+++ b/include/block/block_int.h
26ba25
@@ -845,6 +845,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
26ba25
     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
26ba25
     BdrvRequestFlags flags);
26ba25
 
26ba25
+extern unsigned int bdrv_drain_all_count;
26ba25
 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
26ba25
 void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
26ba25
 
26ba25
-- 
26ba25
1.8.3.1
26ba25