17b94a
From b9b479de2a7fd1c5eefa7aa1142e0a39e0c96ca9 Mon Sep 17 00:00:00 2001
17b94a
From: Xavi Hernandez <xhernandez@redhat.com>
17b94a
Date: Sun, 1 Mar 2020 19:49:04 +0100
17b94a
Subject: [PATCH 419/449] cluster/afr: fix race when bricks come up
17b94a
17b94a
There was a problem when self-heal was sending lookups at the same time
17b94a
that one of the bricks was coming up. In this case there was a chance
17b94a
that the number of 'up' bricks changed in the middle of sending the
17b94a
requests to subvolumes which caused a discrepancy in the expected
17b94a
number of replies and the actual number of sent requests.
17b94a
17b94a
This discrepancy caused AFR to continue executing requests before
17b94a
all requests were complete. Eventually, the frame of the pending
17b94a
request was destroyed when the operation terminated, causing a use-
17b94a
after-free issue when the answer was finally received.
17b94a
17b94a
In theory the same thing could happen in the reverse way, i.e. AFR
17b94a
tries to wait for more replies than sent requests, causing a hang.
17b94a
17b94a
Backport of:
17b94a
> Upstream-patch-link: https://review.gluster.org/24191
17b94a
> Change-Id: I7ed6108554ca379d532efb1a29b2de8085410b70
17b94a
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
17b94a
> Fixes: bz#1808875
17b94a
17b94a
BUG: 1794663
17b94a
Change-Id: I7ed6108554ca379d532efb1a29b2de8085410b70
17b94a
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
17b94a
Reviewed-on: https://code.engineering.redhat.com/gerrit/202489
17b94a
Tested-by: RHGS Build Bot <nigelb@redhat.com>
17b94a
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
17b94a
---
17b94a
 xlators/cluster/afr/src/afr-self-heal-common.c | 6 +++---
17b94a
 xlators/cluster/afr/src/afr-self-heal-name.c   | 4 +++-
17b94a
 xlators/cluster/afr/src/afr-self-heal.h        | 7 +++++--
17b94a
 3 files changed, 11 insertions(+), 6 deletions(-)
17b94a
17b94a
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
17b94a
index ce1ea50..d942ccf 100644
17b94a
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
17b94a
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
17b94a
@@ -1869,12 +1869,12 @@ int
17b94a
 afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
17b94a
                                struct afr_reply *replies)
17b94a
 {
17b94a
-    afr_private_t *priv = NULL;
17b94a
+    afr_local_t *local = NULL;
17b94a
 
17b94a
-    priv = frame->this->private;
17b94a
+    local = frame->local;
17b94a
 
17b94a
     return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies,
17b94a
-                                             priv->child_up);
17b94a
+                                             local->child_up);
17b94a
 }
17b94a
 
17b94a
 unsigned int
17b94a
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
17b94a
index 7d4f208..dace071 100644
17b94a
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
17b94a
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
17b94a
@@ -560,13 +560,15 @@ afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this,
17b94a
     struct afr_reply *replies = NULL;
17b94a
     inode_t *inode = NULL;
17b94a
     int first_idx = -1;
17b94a
+    afr_local_t *local = NULL;
17b94a
 
17b94a
     priv = this->private;
17b94a
+    local = frame->local;
17b94a
 
17b94a
     replies = alloca0(sizeof(*replies) * priv->child_count);
17b94a
 
17b94a
     inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
17b94a
-                                            priv->child_up, NULL);
17b94a
+                                            local->child_up, NULL);
17b94a
     if (!inode)
17b94a
         return -ENOMEM;
17b94a
 
17b94a
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
17b94a
index 8234cec..f7ecf5d 100644
17b94a
--- a/xlators/cluster/afr/src/afr-self-heal.h
17b94a
+++ b/xlators/cluster/afr/src/afr-self-heal.h
17b94a
@@ -46,13 +46,16 @@
17b94a
         afr_local_t *__local = frame->local;                                   \
17b94a
         afr_private_t *__priv = frame->this->private;                          \
17b94a
         int __i = 0;                                                           \
17b94a
-        int __count = AFR_COUNT(list, __priv->child_count);                    \
17b94a
+        int __count = 0;                                                       \
17b94a
+        unsigned char *__list = alloca(__priv->child_count);                   \
17b94a
                                                                                \
17b94a
+        memcpy(__list, list, sizeof(*__list) * __priv->child_count);           \
17b94a
+        __count = AFR_COUNT(__list, __priv->child_count);                      \
17b94a
         __local->barrier.waitfor = __count;                                    \
17b94a
         afr_local_replies_wipe(__local, __priv);                               \
17b94a
                                                                                \
17b94a
         for (__i = 0; __i < __priv->child_count; __i++) {                      \
17b94a
-            if (!list[__i])                                                    \
17b94a
+            if (!__list[__i])                                                  \
17b94a
                 continue;                                                      \
17b94a
             STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i,                   \
17b94a
                               __priv->children[__i],                           \
17b94a
-- 
17b94a
1.8.3.1
17b94a