Blame SOURCES/kvm-block-Allow-graph-changes-in-bdrv_drain_all_begin-en.patch

383d26
From 78fde43fee8eaaa0fd1222844961614f576ebd7c Mon Sep 17 00:00:00 2001
383d26
From: Kevin Wolf <kwolf@redhat.com>
383d26
Date: Fri, 14 Sep 2018 10:55:17 +0200
383d26
Subject: [PATCH 26/49] block: Allow graph changes in bdrv_drain_all_begin/end
383d26
 sections
383d26
383d26
RH-Author: Kevin Wolf <kwolf@redhat.com>
383d26
Message-id: <20180914105540.18077-20-kwolf@redhat.com>
383d26
Patchwork-id: 82171
383d26
O-Subject: [RHV-7.6 qemu-kvm-rhev PATCH 19/42] block: Allow graph changes in bdrv_drain_all_begin/end sections
383d26
Bugzilla: 1601212
383d26
RH-Acked-by: John Snow <jsnow@redhat.com>
383d26
RH-Acked-by: Max Reitz <mreitz@redhat.com>
383d26
RH-Acked-by: Fam Zheng <famz@redhat.com>
383d26
383d26
bdrv_drain_all_*() used bdrv_next() to iterate over all root nodes and
383d26
did a subtree drain for each of them. This works fine as long as the
383d26
graph is static, but sadly, reality looks different.
383d26
383d26
If the graph changes so that root nodes are added or removed, we would
383d26
have to compensate for this. bdrv_next() returns each root node only
383d26
once even if it's the root node for multiple BlockBackends or for a
383d26
monitor-owned block driver tree, which would only complicate things.
383d26
383d26
The much easier and more obviously correct way is to fundamentally
383d26
change the way the functions work: Iterate over all BlockDriverStates,
383d26
no matter who owns them, and drain them individually. Compensation is
383d26
only necessary when a new BDS is created inside a drain_all section.
383d26
Removal of a BDS doesn't require any action because it's gone afterwards
383d26
anyway.
383d26
383d26
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
383d26
(cherry picked from commit 0f12264e7a41458179ad10276a7c33c72024861a)
383d26
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
383d26
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
383d26
---
383d26
 block.c                   | 34 ++++++++++++++++++++++++---
383d26
 block/io.c                | 60 ++++++++++++++++++++++++++++++++++++-----------
383d26
 include/block/block.h     |  1 +
383d26
 include/block/block_int.h |  1 +
383d26
 4 files changed, 79 insertions(+), 17 deletions(-)
383d26
383d26
diff --git a/block.c b/block.c
383d26
index 519be6e..eea9c6f 100644
383d26
--- a/block.c
383d26
+++ b/block.c
383d26
@@ -333,6 +333,10 @@ BlockDriverState *bdrv_new(void)
383d26
 
383d26
     qemu_co_queue_init(&bs->flush_queue);
383d26
 
383d26
+    for (i = 0; i < bdrv_drain_all_count; i++) {
383d26
+        bdrv_drained_begin(bs);
383d26
+    }
383d26
+
383d26
     QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
383d26
 
383d26
     return bs;
383d26
@@ -1170,7 +1174,7 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
383d26
                             int open_flags, Error **errp)
383d26
 {
383d26
     Error *local_err = NULL;
383d26
-    int ret;
383d26
+    int i, ret;
383d26
 
383d26
     bdrv_assign_node_name(bs, node_name, &local_err);
383d26
     if (local_err) {
383d26
@@ -1218,6 +1222,12 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
383d26
     assert(bdrv_min_mem_align(bs) != 0);
383d26
     assert(is_power_of_2(bs->bl.request_alignment));
383d26
 
383d26
+    for (i = 0; i < bs->quiesce_counter; i++) {
383d26
+        if (drv->bdrv_co_drain_begin) {
383d26
+            drv->bdrv_co_drain_begin(bs);
383d26
+        }
383d26
+    }
383d26
+
383d26
     return 0;
383d26
 open_failed:
383d26
     bs->drv = NULL;
383d26
@@ -2039,7 +2049,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
383d26
             child->role->detach(child);
383d26
         }
383d26
         if (old_bs->quiesce_counter && child->role->drained_end) {
383d26
-            for (i = 0; i < old_bs->quiesce_counter; i++) {
383d26
+            int num = old_bs->quiesce_counter;
383d26
+            if (child->role->parent_is_bds) {
383d26
+                num -= bdrv_drain_all_count;
383d26
+            }
383d26
+            assert(num >= 0);
383d26
+            for (i = 0; i < num; i++) {
383d26
                 child->role->drained_end(child);
383d26
             }
383d26
         }
383d26
@@ -2051,7 +2066,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
383d26
     if (new_bs) {
383d26
         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
383d26
         if (new_bs->quiesce_counter && child->role->drained_begin) {
383d26
-            for (i = 0; i < new_bs->quiesce_counter; i++) {
383d26
+            int num = new_bs->quiesce_counter;
383d26
+            if (child->role->parent_is_bds) {
383d26
+                num -= bdrv_drain_all_count;
383d26
+            }
383d26
+            assert(num >= 0);
383d26
+            for (i = 0; i < num; i++) {
383d26
                 child->role->drained_begin(child);
383d26
             }
383d26
         }
383d26
@@ -3993,6 +4013,14 @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs)
383d26
     return QTAILQ_NEXT(bs, node_list);
383d26
 }
383d26
 
383d26
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
383d26
+{
383d26
+    if (!bs) {
383d26
+        return QTAILQ_FIRST(&all_bdrv_states);
383d26
+    }
383d26
+    return QTAILQ_NEXT(bs, bs_list);
383d26
+}
383d26
+
383d26
 const char *bdrv_get_node_name(const BlockDriverState *bs)
383d26
 {
383d26
     return bs->node_name;
383d26
diff --git a/block/io.c b/block/io.c
383d26
index 0021fefd..38ae299 100644
383d26
--- a/block/io.c
383d26
+++ b/block/io.c
383d26
@@ -38,6 +38,8 @@
383d26
 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
383d26
 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
383d26
 
383d26
+static AioWait drain_all_aio_wait;
383d26
+
383d26
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
383d26
     int64_t offset, int bytes, BdrvRequestFlags flags);
383d26
 
383d26
@@ -471,6 +473,29 @@ static void bdrv_drain_assert_idle(BlockDriverState *bs)
383d26
     }
383d26
 }
383d26
 
383d26
+unsigned int bdrv_drain_all_count = 0;
383d26
+
383d26
+static bool bdrv_drain_all_poll(void)
383d26
+{
383d26
+    BlockDriverState *bs = NULL;
383d26
+    bool result = false;
383d26
+
383d26
+    /* Execute pending BHs first (may modify the graph) and check everything
383d26
+     * else only after the BHs have executed. */
383d26
+    while (aio_poll(qemu_get_aio_context(), false));
383d26
+
383d26
+    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
383d26
+     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
383d26
+    while ((bs = bdrv_next_all_states(bs))) {
383d26
+        AioContext *aio_context = bdrv_get_aio_context(bs);
383d26
+        aio_context_acquire(aio_context);
383d26
+        result |= bdrv_drain_poll(bs, false, NULL, true);
383d26
+        aio_context_release(aio_context);
383d26
+    }
383d26
+
383d26
+    return result;
383d26
+}
383d26
+
383d26
 /*
383d26
  * Wait for pending requests to complete across all BlockDriverStates
383d26
  *
383d26
@@ -485,45 +510,51 @@ static void bdrv_drain_assert_idle(BlockDriverState *bs)
383d26
  */
383d26
 void bdrv_drain_all_begin(void)
383d26
 {
383d26
-    BlockDriverState *bs;
383d26
-    BdrvNextIterator it;
383d26
+    BlockDriverState *bs = NULL;
383d26
 
383d26
     if (qemu_in_coroutine()) {
383d26
-        bdrv_co_yield_to_drain(NULL, true, false, NULL, false, true);
383d26
+        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
383d26
         return;
383d26
     }
383d26
 
383d26
-    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
383d26
-     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
383d26
-     * nodes in several different AioContexts, so make sure we're in the main
383d26
-     * context. */
383d26
+    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
383d26
+     * loop AioContext, so make sure we're in the main context. */
383d26
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
383d26
+    assert(bdrv_drain_all_count < INT_MAX);
383d26
+    bdrv_drain_all_count++;
383d26
 
383d26
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
383d26
+    /* Quiesce all nodes, without polling in-flight requests yet. The graph
383d26
+     * cannot change during this loop. */
383d26
+    while ((bs = bdrv_next_all_states(bs))) {
383d26
         AioContext *aio_context = bdrv_get_aio_context(bs);
383d26
 
383d26
         aio_context_acquire(aio_context);
383d26
-        bdrv_do_drained_begin(bs, true, NULL, false, true);
383d26
+        bdrv_do_drained_begin(bs, false, NULL, true, false);
383d26
         aio_context_release(aio_context);
383d26
     }
383d26
 
383d26
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
383d26
+    /* Now poll the in-flight requests */
383d26
+    AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll());
383d26
+
383d26
+    while ((bs = bdrv_next_all_states(bs))) {
383d26
         bdrv_drain_assert_idle(bs);
383d26
     }
383d26
 }
383d26
 
383d26
 void bdrv_drain_all_end(void)
383d26
 {
383d26
-    BlockDriverState *bs;
383d26
-    BdrvNextIterator it;
383d26
+    BlockDriverState *bs = NULL;
383d26
 
383d26
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
383d26
+    while ((bs = bdrv_next_all_states(bs))) {
383d26
         AioContext *aio_context = bdrv_get_aio_context(bs);
383d26
 
383d26
         aio_context_acquire(aio_context);
383d26
-        bdrv_do_drained_end(bs, true, NULL, false);
383d26
+        bdrv_do_drained_end(bs, false, NULL, true);
383d26
         aio_context_release(aio_context);
383d26
     }
383d26
+
383d26
+    assert(bdrv_drain_all_count > 0);
383d26
+    bdrv_drain_all_count--;
383d26
 }
383d26
 
383d26
 void bdrv_drain_all(void)
383d26
@@ -658,6 +689,7 @@ void bdrv_inc_in_flight(BlockDriverState *bs)
383d26
 void bdrv_wakeup(BlockDriverState *bs)
383d26
 {
383d26
     aio_wait_kick(bdrv_get_aio_wait(bs));
383d26
+    aio_wait_kick(&drain_all_aio_wait);
383d26
 }
383d26
 
383d26
 void bdrv_dec_in_flight(BlockDriverState *bs)
383d26
diff --git a/include/block/block.h b/include/block/block.h
383d26
index 6e91803..f9079ac 100644
383d26
--- a/include/block/block.h
383d26
+++ b/include/block/block.h
383d26
@@ -449,6 +449,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
383d26
                                  Error **errp);
383d26
 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base);
383d26
 BlockDriverState *bdrv_next_node(BlockDriverState *bs);
383d26
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs);
383d26
 
383d26
 typedef struct BdrvNextIterator {
383d26
     enum {
383d26
diff --git a/include/block/block_int.h b/include/block/block_int.h
383d26
index 0ad8a76..9757d5e 100644
383d26
--- a/include/block/block_int.h
383d26
+++ b/include/block/block_int.h
383d26
@@ -845,6 +845,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
383d26
     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
383d26
     BdrvRequestFlags flags);
383d26
 
383d26
+extern unsigned int bdrv_drain_all_count;
383d26
 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
383d26
 void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
383d26
 
383d26
-- 
383d26
1.8.3.1
383d26