190130
From b9b479de2a7fd1c5eefa7aa1142e0a39e0c96ca9 Mon Sep 17 00:00:00 2001
190130
From: Xavi Hernandez <xhernandez@redhat.com>
190130
Date: Sun, 1 Mar 2020 19:49:04 +0100
190130
Subject: [PATCH 419/449] cluster/afr: fix race when bricks come up
190130
190130
There was a problem when self-heal was sending lookups at the same time
190130
that one of the bricks was coming up. In this case there was a chance
190130
that the number of 'up' bricks changes in the middle of sending the
190130
requests to subvolumes which caused a discrepancy in the expected
190130
number of replies and the actual number of sent requests.
190130
190130
This discrepancy caused AFR to continue executing requests before
190130
all requests were complete. Eventually, the frame of the pending
190130
request was destroyed when the operation terminated, causing a use-
190130
after-free issue when the answer was finally received.
190130
190130
In theory the same thing could happen in the reverse way, i.e. AFR
190130
tries to wait for more replies than sent requests, causing a hang.
190130
190130
Backport of:
190130
> Upstream-patch-link: https://review.gluster.org/24191
190130
> Change-Id: I7ed6108554ca379d532efb1a29b2de8085410b70
190130
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
190130
> Fixes: bz#1808875
190130
190130
BUG: 1794663
190130
Change-Id: I7ed6108554ca379d532efb1a29b2de8085410b70
190130
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
190130
Reviewed-on: https://code.engineering.redhat.com/gerrit/202489
190130
Tested-by: RHGS Build Bot <nigelb@redhat.com>
190130
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
190130
---
190130
 xlators/cluster/afr/src/afr-self-heal-common.c | 6 +++---
190130
 xlators/cluster/afr/src/afr-self-heal-name.c   | 4 +++-
190130
 xlators/cluster/afr/src/afr-self-heal.h        | 7 +++++--
190130
 3 files changed, 11 insertions(+), 6 deletions(-)
190130
190130
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
190130
index ce1ea50..d942ccf 100644
190130
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
190130
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
190130
@@ -1869,12 +1869,12 @@ int
190130
 afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
190130
                                struct afr_reply *replies)
190130
 {
190130
-    afr_private_t *priv = NULL;
190130
+    afr_local_t *local = NULL;
190130
 
190130
-    priv = frame->this->private;
190130
+    local = frame->local;
190130
 
190130
     return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies,
190130
-                                             priv->child_up);
190130
+                                             local->child_up);
190130
 }
190130
 
190130
 unsigned int
190130
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
190130
index 7d4f208..dace071 100644
190130
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
190130
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
190130
@@ -560,13 +560,15 @@ afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this,
190130
     struct afr_reply *replies = NULL;
190130
     inode_t *inode = NULL;
190130
     int first_idx = -1;
190130
+    afr_local_t *local = NULL;
190130
 
190130
     priv = this->private;
190130
+    local = frame->local;
190130
 
190130
     replies = alloca0(sizeof(*replies) * priv->child_count);
190130
 
190130
     inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
190130
-                                            priv->child_up, NULL);
190130
+                                            local->child_up, NULL);
190130
     if (!inode)
190130
         return -ENOMEM;
190130
 
190130
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
190130
index 8234cec..f7ecf5d 100644
190130
--- a/xlators/cluster/afr/src/afr-self-heal.h
190130
+++ b/xlators/cluster/afr/src/afr-self-heal.h
190130
@@ -46,13 +46,16 @@
190130
         afr_local_t *__local = frame->local;                                   \
190130
         afr_private_t *__priv = frame->this->private;                          \
190130
         int __i = 0;                                                           \
190130
-        int __count = AFR_COUNT(list, __priv->child_count);                    \
190130
+        int __count = 0;                                                       \
190130
+        unsigned char *__list = alloca(__priv->child_count);                   \
190130
                                                                                \
190130
+        memcpy(__list, list, sizeof(*__list) * __priv->child_count);           \
190130
+        __count = AFR_COUNT(__list, __priv->child_count);                      \
190130
         __local->barrier.waitfor = __count;                                    \
190130
         afr_local_replies_wipe(__local, __priv);                               \
190130
                                                                                \
190130
         for (__i = 0; __i < __priv->child_count; __i++) {                      \
190130
-            if (!list[__i])                                                    \
190130
+            if (!__list[__i])                                                  \
190130
                 continue;                                                      \
190130
             STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i,                   \
190130
                               __priv->children[__i],                           \
190130
-- 
190130
1.8.3.1
190130