From 2b2eb846c49caba13ab92ec66af20292e7780fc1 Mon Sep 17 00:00:00 2001
From: Ravishankar N <ravishankar@redhat.com>
Date: Tue, 11 Feb 2020 14:34:48 +0530
Subject: [PATCH 410/449] afr: prevent spurious entry heals leading to gfid
 split-brain

Problem:
In a hyperconverged setup with granular-entry-heal enabled, if a file is
recreated while one of the bricks is down, and an index heal is triggered
(with the brick still down), entry self-heal was doing a spurious heal
with just the 2 good bricks. It was doing a post-op leading to removal
of the filename from .glusterfs/indices/entry-changes as well as
erroneous setting of afr xattrs on the parent. When the brick came up,
the xattrs were cleared, resulting in the recreated file not getting
healed and leading to gfid split-brain and EIO on the mount.
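
To see why two bricks were enough, here is a simplified (not verbatim)
sketch of the pre-patch guard used by the entry self-heal paths; all
identifiers below are taken from the code touched by this patch:

    /* ret is the number of bricks on which the entrylk was acquired */
    ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
                               data_lock);
    if (ret < AFR_SH_MIN_PARTICIPANTS) /* macro was defined to 2 */
        goto unlock;
    /* With one brick of a replica-3 down, ret == 2 still passed the
     * check, so the heal and its post-op ran on the two good bricks. */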

Fix:
Proceed with entry heal only when shd can connect to all bricks of the replica,
just like in data and metadata heal.
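
A minimal sketch of the changed guard (simplified from __afr_selfheal_entry()
and afr_selfheal_name_do() in this patch), where priv is the afr_private_t of
the replica:

    ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
                               data_lock);
    if (ret < priv->child_count) {
        /* Could not lock all bricks: skip the heal instead of doing a
         * partial one, exactly as the data/metadata heal paths do. */
        ret = -ENOTCONN;
        goto unlock;
    }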

BUG: 1804164

> Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24109/
> fixes: bz#1801624
> Change-Id: I916ae26ad1fabf259bc6362da52d433b7223b17e
> Signed-off-by: Ravishankar N <ravishankar@redhat.com>

Change-Id: I23f57e543cff1e3f35eb8dbc60a2babfae6838c7
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/202395
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 .../bug-1433571-undo-pending-only-on-up-bricks.t   | 18 ++-----
 tests/bugs/replicate/bug-1801624-entry-heal.t      | 58 ++++++++++++++++++++++
 xlators/cluster/afr/src/afr-common.c               |  4 +-
 xlators/cluster/afr/src/afr-self-heal-common.c     |  8 +--
 xlators/cluster/afr/src/afr-self-heal-entry.c      |  6 +--
 xlators/cluster/afr/src/afr-self-heal-name.c       |  2 +-
 xlators/cluster/afr/src/afr-self-heal.h            |  2 -
 7 files changed, 69 insertions(+), 29 deletions(-)
 create mode 100644 tests/bugs/replicate/bug-1801624-entry-heal.t

diff --git a/tests/bugs/replicate/bug-1433571-undo-pending-only-on-up-bricks.t b/tests/bugs/replicate/bug-1433571-undo-pending-only-on-up-bricks.t
index 0767f47..10ce013 100644
--- a/tests/bugs/replicate/bug-1433571-undo-pending-only-on-up-bricks.t
+++ b/tests/bugs/replicate/bug-1433571-undo-pending-only-on-up-bricks.t
@@ -49,25 +49,15 @@ TEST $CLI volume start $V0 force
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2
 
-#Kill brick 0 and turn on the client side heal and do ls to trigger the heal.
-#The pending xattrs on bricks 1 & 2 should have pending entry on brick 0.
-TEST kill_brick $V0 $H0 $B0/${V0}0
+# We were killing one brick and checking that entry heal does not reset the
+# pending xattrs for the down brick. Now that we need all bricks to be up for
+# entry heal, I'm removing that test from the .t
+
 TEST $CLI volume set $V0 cluster.data-self-heal on
 TEST $CLI volume set $V0 cluster.metadata-self-heal on
 TEST $CLI volume set $V0 cluster.entry-self-heal on
 
 TEST ls $M0
-EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1
-EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2
-EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}1
-EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2
-
-#Bring back all the bricks and trigger the heal again by doing ls. Now the
-#pending xattrs on all the bricks should be 0.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
-TEST ls $M0
-
 TEST cat $M0/f1
 TEST cat $M0/f2
 TEST cat $M0/f3
diff --git a/tests/bugs/replicate/bug-1801624-entry-heal.t b/tests/bugs/replicate/bug-1801624-entry-heal.t
new file mode 100644
index 0000000..94b4651
--- /dev/null
+++ b/tests/bugs/replicate/bug-1801624-entry-heal.t
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/brick{0,1,2}
+TEST $CLI volume set $V0 heal-timeout 5
+TEST $CLI volume start $V0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0 granular-entry-heal enable
+
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+echo "Data">$M0/FILE
+ret=$?
+TEST [ $ret -eq 0 ]
+
+# Re-create the file when a brick is down.
+TEST kill_brick $V0 $H0 $B0/brick1
+TEST rm $M0/FILE
+echo "New Data">$M0/FILE
+ret=$?
+TEST [ $ret -eq 0 ]
+EXPECT_WITHIN $HEAL_TIMEOUT "4" get_pending_heal_count $V0
+
+# Launching index heal must not reset parent dir afr xattrs or remove granular entry indices.
+$CLI volume heal $V0 # CLI will fail but heal is launched anyway.
+TEST sleep 5 # give index heal a chance to do one run.
+brick0_pending=$(get_hex_xattr trusted.afr.$V0-client-1 $B0/brick0/)
+brick2_pending=$(get_hex_xattr trusted.afr.$V0-client-1 $B0/brick2/)
+TEST [ $brick0_pending -eq "000000000000000000000002" ]
+TEST [ $brick2_pending -eq "000000000000000000000002" ]
+EXPECT "FILE" ls $B0/brick0/.glusterfs/indices/entry-changes/00000000-0000-0000-0000-000000000001/
+EXPECT "FILE" ls $B0/brick2/.glusterfs/indices/entry-changes/00000000-0000-0000-0000-000000000001/
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+$CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
+
+# No gfid-split-brain (i.e. EIO) must be seen. Try on fresh mount to avoid cached values.
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+TEST cat $M0/FILE
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 32127c6..5806556 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -6629,7 +6629,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque)
         ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0,
                                    locked_on);
         {
-            if (ret < AFR_SH_MIN_PARTICIPANTS)
+            if (ret < priv->child_count)
                 goto data_unlock;
             ret = __afr_selfheal_data_prepare(
                 heal_frame, this, inode, locked_on, sources, sinks,
@@ -6646,7 +6646,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque)
         ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name,
                                    LLONG_MAX - 1, 0, locked_on);
         {
-            if (ret < AFR_SH_MIN_PARTICIPANTS)
+            if (ret < priv->child_count)
                 goto mdata_unlock;
             ret = __afr_selfheal_metadata_prepare(
                 heal_frame, this, inode, locked_on, sources, sinks,
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 81ef38a..ce1ea50 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1575,7 +1575,6 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
     char *accused = NULL;      /* Accused others without any self-accusal */
     char *pending = NULL;      /* Have pending operations on others */
     char *self_accused = NULL; /* Accused itself */
-    int min_participants = -1;
 
     priv = this->private;
 
@@ -1599,12 +1598,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) {
-        min_participants = priv->child_count;
-    } else {
-        min_participants = AFR_SH_MIN_PARTICIPANTS;
-    }
-    if (afr_success_count(replies, priv->child_count) < min_participants) {
+    if (afr_success_count(replies, priv->child_count) < priv->child_count) {
         /* Treat this just like locks not being acquired */
         return -ENOTCONN;
     }
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 3ce882e..40be898 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -597,7 +597,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
     ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
                                locked_on);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             gf_msg_debug(this->name, 0,
                          "%s: Skipping "
                          "entry self-heal as only %d sub-volumes "
@@ -991,7 +991,7 @@ __afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd,
     ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
                                data_lock);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             gf_msg_debug(this->name, 0,
                          "%s: Skipping "
                          "entry self-heal as only %d sub-volumes could "
@@ -1115,7 +1115,7 @@ afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode)
     ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain,
                                            NULL, locked_on);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             gf_msg_debug(this->name, 0,
                          "%s: Skipping "
                          "entry self-heal as only %d sub-volumes could "
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index 36640b5..7d4f208 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -514,7 +514,7 @@ afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
     ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname,
                                locked_on);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             ret = -ENOTCONN;
             goto unlock;
         }
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 6555ec5..8234cec 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -11,8 +11,6 @@
 #ifndef _AFR_SELFHEAL_H
 #define _AFR_SELFHEAL_H
 
-#define AFR_SH_MIN_PARTICIPANTS 2
-
 /* Perform fop on all UP subvolumes and wait for all callbacks to return */
 
 #define AFR_ONALL(frame, rfn, fop, args...)                                    \
-- 
1.8.3.1