Blame SOURCES/kvm-block-Allow-graph-changes-in-bdrv_drain_all_begin-en.patch

1bdc94
From 78fde43fee8eaaa0fd1222844961614f576ebd7c Mon Sep 17 00:00:00 2001
1bdc94
From: Kevin Wolf <kwolf@redhat.com>
1bdc94
Date: Fri, 14 Sep 2018 10:55:17 +0200
1bdc94
Subject: [PATCH 26/49] block: Allow graph changes in bdrv_drain_all_begin/end
1bdc94
 sections
1bdc94
1bdc94
RH-Author: Kevin Wolf <kwolf@redhat.com>
1bdc94
Message-id: <20180914105540.18077-20-kwolf@redhat.com>
1bdc94
Patchwork-id: 82171
1bdc94
O-Subject: [RHV-7.6 qemu-kvm-rhev PATCH 19/42] block: Allow graph changes in bdrv_drain_all_begin/end sections
1bdc94
Bugzilla: 1601212
1bdc94
RH-Acked-by: John Snow <jsnow@redhat.com>
1bdc94
RH-Acked-by: Max Reitz <mreitz@redhat.com>
1bdc94
RH-Acked-by: Fam Zheng <famz@redhat.com>
1bdc94
1bdc94
bdrv_drain_all_*() used bdrv_next() to iterate over all root nodes and
1bdc94
did a subtree drain for each of them. This works fine as long as the
1bdc94
graph is static, but sadly, reality looks different.
1bdc94
1bdc94
If the graph changes so that root nodes are added or removed, we would
1bdc94
have to compensate for this. bdrv_next() returns each root node only
1bdc94
once even if it's the root node for multiple BlockBackends or for a
1bdc94
monitor-owned block driver tree, which would only complicate things.
1bdc94
1bdc94
The much easier and more obviously correct way is to fundamentally
1bdc94
change the way the functions work: Iterate over all BlockDriverStates,
1bdc94
no matter who owns them, and drain them individually. Compensation is
1bdc94
only necessary when a new BDS is created inside a drain_all section.
1bdc94
Removal of a BDS doesn't require any action because it's gone afterwards
1bdc94
anyway.
1bdc94
1bdc94
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
1bdc94
(cherry picked from commit 0f12264e7a41458179ad10276a7c33c72024861a)
1bdc94
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
1bdc94
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
1bdc94
---
1bdc94
 block.c                   | 34 ++++++++++++++++++++++++---
1bdc94
 block/io.c                | 60 ++++++++++++++++++++++++++++++++++++-----------
1bdc94
 include/block/block.h     |  1 +
1bdc94
 include/block/block_int.h |  1 +
1bdc94
 4 files changed, 79 insertions(+), 17 deletions(-)
1bdc94
1bdc94
diff --git a/block.c b/block.c
1bdc94
index 519be6e..eea9c6f 100644
1bdc94
--- a/block.c
1bdc94
+++ b/block.c
1bdc94
@@ -333,6 +333,10 @@ BlockDriverState *bdrv_new(void)
1bdc94
 
1bdc94
     qemu_co_queue_init(&bs->flush_queue);
1bdc94
 
1bdc94
+    for (i = 0; i < bdrv_drain_all_count; i++) {
1bdc94
+        bdrv_drained_begin(bs);
1bdc94
+    }
1bdc94
+
1bdc94
     QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
1bdc94
 
1bdc94
     return bs;
1bdc94
@@ -1170,7 +1174,7 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
1bdc94
                             int open_flags, Error **errp)
1bdc94
 {
1bdc94
     Error *local_err = NULL;
1bdc94
-    int ret;
1bdc94
+    int i, ret;
1bdc94
 
1bdc94
     bdrv_assign_node_name(bs, node_name, &local_err);
1bdc94
     if (local_err) {
1bdc94
@@ -1218,6 +1222,12 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
1bdc94
     assert(bdrv_min_mem_align(bs) != 0);
1bdc94
     assert(is_power_of_2(bs->bl.request_alignment));
1bdc94
 
1bdc94
+    for (i = 0; i < bs->quiesce_counter; i++) {
1bdc94
+        if (drv->bdrv_co_drain_begin) {
1bdc94
+            drv->bdrv_co_drain_begin(bs);
1bdc94
+        }
1bdc94
+    }
1bdc94
+
1bdc94
     return 0;
1bdc94
 open_failed:
1bdc94
     bs->drv = NULL;
1bdc94
@@ -2039,7 +2049,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
1bdc94
             child->role->detach(child);
1bdc94
         }
1bdc94
         if (old_bs->quiesce_counter && child->role->drained_end) {
1bdc94
-            for (i = 0; i < old_bs->quiesce_counter; i++) {
1bdc94
+            int num = old_bs->quiesce_counter;
1bdc94
+            if (child->role->parent_is_bds) {
1bdc94
+                num -= bdrv_drain_all_count;
1bdc94
+            }
1bdc94
+            assert(num >= 0);
1bdc94
+            for (i = 0; i < num; i++) {
1bdc94
                 child->role->drained_end(child);
1bdc94
             }
1bdc94
         }
1bdc94
@@ -2051,7 +2066,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
1bdc94
     if (new_bs) {
1bdc94
         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
1bdc94
         if (new_bs->quiesce_counter && child->role->drained_begin) {
1bdc94
-            for (i = 0; i < new_bs->quiesce_counter; i++) {
1bdc94
+            int num = new_bs->quiesce_counter;
1bdc94
+            if (child->role->parent_is_bds) {
1bdc94
+                num -= bdrv_drain_all_count;
1bdc94
+            }
1bdc94
+            assert(num >= 0);
1bdc94
+            for (i = 0; i < num; i++) {
1bdc94
                 child->role->drained_begin(child);
1bdc94
             }
1bdc94
         }
1bdc94
@@ -3993,6 +4013,14 @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs)
1bdc94
     return QTAILQ_NEXT(bs, node_list);
1bdc94
 }
1bdc94
 
1bdc94
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
1bdc94
+{
1bdc94
+    if (!bs) {
1bdc94
+        return QTAILQ_FIRST(&all_bdrv_states);
1bdc94
+    }
1bdc94
+    return QTAILQ_NEXT(bs, bs_list);
1bdc94
+}
1bdc94
+
1bdc94
 const char *bdrv_get_node_name(const BlockDriverState *bs)
1bdc94
 {
1bdc94
     return bs->node_name;
1bdc94
diff --git a/block/io.c b/block/io.c
1bdc94
index 0021fefd..38ae299 100644
1bdc94
--- a/block/io.c
1bdc94
+++ b/block/io.c
1bdc94
@@ -38,6 +38,8 @@
1bdc94
 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
1bdc94
 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
1bdc94
 
1bdc94
+static AioWait drain_all_aio_wait;
1bdc94
+
1bdc94
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1bdc94
     int64_t offset, int bytes, BdrvRequestFlags flags);
1bdc94
 
1bdc94
@@ -471,6 +473,29 @@ static void bdrv_drain_assert_idle(BlockDriverState *bs)
1bdc94
     }
1bdc94
 }
1bdc94
 
1bdc94
+unsigned int bdrv_drain_all_count = 0;
1bdc94
+
1bdc94
+static bool bdrv_drain_all_poll(void)
1bdc94
+{
1bdc94
+    BlockDriverState *bs = NULL;
1bdc94
+    bool result = false;
1bdc94
+
1bdc94
+    /* Execute pending BHs first (may modify the graph) and check everything
1bdc94
+     * else only after the BHs have executed. */
1bdc94
+    while (aio_poll(qemu_get_aio_context(), false));
1bdc94
+
1bdc94
+    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
1bdc94
+     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
1bdc94
+    while ((bs = bdrv_next_all_states(bs))) {
1bdc94
+        AioContext *aio_context = bdrv_get_aio_context(bs);
1bdc94
+        aio_context_acquire(aio_context);
1bdc94
+        result |= bdrv_drain_poll(bs, false, NULL, true);
1bdc94
+        aio_context_release(aio_context);
1bdc94
+    }
1bdc94
+
1bdc94
+    return result;
1bdc94
+}
1bdc94
+
1bdc94
 /*
1bdc94
  * Wait for pending requests to complete across all BlockDriverStates
1bdc94
  *
1bdc94
@@ -485,45 +510,51 @@ static void bdrv_drain_assert_idle(BlockDriverState *bs)
1bdc94
  */
1bdc94
 void bdrv_drain_all_begin(void)
1bdc94
 {
1bdc94
-    BlockDriverState *bs;
1bdc94
-    BdrvNextIterator it;
1bdc94
+    BlockDriverState *bs = NULL;
1bdc94
 
1bdc94
     if (qemu_in_coroutine()) {
1bdc94
-        bdrv_co_yield_to_drain(NULL, true, false, NULL, false, true);
1bdc94
+        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
1bdc94
         return;
1bdc94
     }
1bdc94
 
1bdc94
-    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
1bdc94
-     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
1bdc94
-     * nodes in several different AioContexts, so make sure we're in the main
1bdc94
-     * context. */
1bdc94
+    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
1bdc94
+     * loop AioContext, so make sure we're in the main context. */
1bdc94
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
1bdc94
+    assert(bdrv_drain_all_count < INT_MAX);
1bdc94
+    bdrv_drain_all_count++;
1bdc94
 
1bdc94
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
1bdc94
+    /* Quiesce all nodes, without polling in-flight requests yet. The graph
1bdc94
+     * cannot change during this loop. */
1bdc94
+    while ((bs = bdrv_next_all_states(bs))) {
1bdc94
         AioContext *aio_context = bdrv_get_aio_context(bs);
1bdc94
 
1bdc94
         aio_context_acquire(aio_context);
1bdc94
-        bdrv_do_drained_begin(bs, true, NULL, false, true);
1bdc94
+        bdrv_do_drained_begin(bs, false, NULL, true, false);
1bdc94
         aio_context_release(aio_context);
1bdc94
     }
1bdc94
 
1bdc94
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
1bdc94
+    /* Now poll the in-flight requests */
1bdc94
+    AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll());
1bdc94
+
1bdc94
+    while ((bs = bdrv_next_all_states(bs))) {
1bdc94
         bdrv_drain_assert_idle(bs);
1bdc94
     }
1bdc94
 }
1bdc94
 
1bdc94
 void bdrv_drain_all_end(void)
1bdc94
 {
1bdc94
-    BlockDriverState *bs;
1bdc94
-    BdrvNextIterator it;
1bdc94
+    BlockDriverState *bs = NULL;
1bdc94
 
1bdc94
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
1bdc94
+    while ((bs = bdrv_next_all_states(bs))) {
1bdc94
         AioContext *aio_context = bdrv_get_aio_context(bs);
1bdc94
 
1bdc94
         aio_context_acquire(aio_context);
1bdc94
-        bdrv_do_drained_end(bs, true, NULL, false);
1bdc94
+        bdrv_do_drained_end(bs, false, NULL, true);
1bdc94
         aio_context_release(aio_context);
1bdc94
     }
1bdc94
+
1bdc94
+    assert(bdrv_drain_all_count > 0);
1bdc94
+    bdrv_drain_all_count--;
1bdc94
 }
1bdc94
 
1bdc94
 void bdrv_drain_all(void)
1bdc94
@@ -658,6 +689,7 @@ void bdrv_inc_in_flight(BlockDriverState *bs)
1bdc94
 void bdrv_wakeup(BlockDriverState *bs)
1bdc94
 {
1bdc94
     aio_wait_kick(bdrv_get_aio_wait(bs));
1bdc94
+    aio_wait_kick(&drain_all_aio_wait);
1bdc94
 }
1bdc94
 
1bdc94
 void bdrv_dec_in_flight(BlockDriverState *bs)
1bdc94
diff --git a/include/block/block.h b/include/block/block.h
1bdc94
index 6e91803..f9079ac 100644
1bdc94
--- a/include/block/block.h
1bdc94
+++ b/include/block/block.h
1bdc94
@@ -449,6 +449,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
1bdc94
                                  Error **errp);
1bdc94
 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base);
1bdc94
 BlockDriverState *bdrv_next_node(BlockDriverState *bs);
1bdc94
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs);
1bdc94
 
1bdc94
 typedef struct BdrvNextIterator {
1bdc94
     enum {
1bdc94
diff --git a/include/block/block_int.h b/include/block/block_int.h
1bdc94
index 0ad8a76..9757d5e 100644
1bdc94
--- a/include/block/block_int.h
1bdc94
+++ b/include/block/block_int.h
1bdc94
@@ -845,6 +845,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
1bdc94
     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1bdc94
     BdrvRequestFlags flags);
1bdc94
 
1bdc94
+extern unsigned int bdrv_drain_all_count;
1bdc94
 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
1bdc94
 void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
1bdc94
 
1bdc94
-- 
1bdc94
1.8.3.1
1bdc94