1df6c8
From 7689fbb4be83f0e0657ec2729c4d66ed341b5751 Mon Sep 17 00:00:00 2001
1df6c8
From: Ravishankar N <ravishankar@redhat.com>
1df6c8
Date: Tue, 11 Feb 2020 14:34:48 +0530
1df6c8
Subject: [PATCH 386/392] afr: prevent spurious entry heals leading to gfid
1df6c8
 split-brain
1df6c8
1df6c8
Problem:
1df6c8
In a hyperconverged setup with granular-entry-heal enabled, if a file is
1df6c8
recreated while one of the bricks is down, and an index heal is triggered
1df6c8
(with the brick still down), entry-self heal was doing a spurious heal
1df6c8
with just the 2 good bricks. It was doing a post-op leading to removal
1df6c8
of the filename from .glusterfs/indices/entry-changes as well as
1df6c8
erroneous setting of afr xattrs on the parent. When the brick came up,
1df6c8
the xattrs were cleared, resulting in the renamed file not getting
1df6c8
healed and leading to gfid split-brain and EIO on the mount.
1df6c8
1df6c8
Fix:
1df6c8
Proceed with entry heal only when shd can connect to all bricks of the replica,
1df6c8
just like in data and metadata heal.
1df6c8
1df6c8
BUG: 1848893
1df6c8
1df6c8
> Upstream patch:https://review.gluster.org/#/c/glusterfs/+/24109/
1df6c8
> fixes: bz#1801624
1df6c8
> Change-Id: I916ae26ad1fabf259bc6362da52d433b7223b17e
1df6c8
> Signed-off-by: Ravishankar N <ravishankar@redhat.com>
1df6c8
1df6c8
Change-Id: I23f57e543cff1e3f35eb8dbc60a2babfae6838c7
1df6c8
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
1df6c8
(cherry picked from commit 2b2eb846c49caba13ab92ec66af20292e7780fc1)
1df6c8
Reviewed-on: https://code.engineering.redhat.com/gerrit/203692
1df6c8
Tested-by: RHGS Build Bot <nigelb@redhat.com>
1df6c8
Tested-by: Karthik Subrahmanya <ksubrahm@redhat.com>
1df6c8
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
1df6c8
---
1df6c8
 .../bug-1433571-undo-pending-only-on-up-bricks.t   | 18 ++-----
1df6c8
 tests/bugs/replicate/bug-1801624-entry-heal.t      | 58 ++++++++++++++++++++++
1df6c8
 xlators/cluster/afr/src/afr-common.c               |  4 +-
1df6c8
 xlators/cluster/afr/src/afr-self-heal-common.c     |  8 +--
1df6c8
 xlators/cluster/afr/src/afr-self-heal-entry.c      |  6 +--
1df6c8
 xlators/cluster/afr/src/afr-self-heal-name.c       |  2 +-
1df6c8
 xlators/cluster/afr/src/afr-self-heal.h            |  2 -
1df6c8
 7 files changed, 69 insertions(+), 29 deletions(-)
1df6c8
 create mode 100644 tests/bugs/replicate/bug-1801624-entry-heal.t
1df6c8
1df6c8
diff --git a/tests/bugs/replicate/bug-1433571-undo-pending-only-on-up-bricks.t b/tests/bugs/replicate/bug-1433571-undo-pending-only-on-up-bricks.t
1df6c8
index 0767f47..10ce013 100644
1df6c8
--- a/tests/bugs/replicate/bug-1433571-undo-pending-only-on-up-bricks.t
1df6c8
+++ b/tests/bugs/replicate/bug-1433571-undo-pending-only-on-up-bricks.t
1df6c8
@@ -49,25 +49,15 @@ TEST $CLI volume start $V0 force
1df6c8
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
1df6c8
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2
1df6c8
 
1df6c8
-#Kill brick 0 and turn on the client side heal and do ls to trigger the heal.
1df6c8
-#The pending xattrs on bricks 1 & 2 should have pending entry on brick 0.
1df6c8
-TEST kill_brick $V0 $H0 $B0/${V0}0
1df6c8
+# We were killing one brick and checking that entry heal does not reset the
1df6c8
+# pending xattrs for the down brick. Now that we need all bricks to be up for
1df6c8
+# entry heal, I'm removing that test from the .t
1df6c8
+
1df6c8
 TEST $CLI volume set $V0 cluster.data-self-heal on
1df6c8
 TEST $CLI volume set $V0 cluster.metadata-self-heal on
1df6c8
 TEST $CLI volume set $V0 cluster.entry-self-heal on
1df6c8
 
1df6c8
 TEST ls $M0
1df6c8
-EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1
1df6c8
-EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2
1df6c8
-EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}1
1df6c8
-EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2
1df6c8
-
1df6c8
-#Bring back all the bricks and trigger the heal again by doing ls. Now the
1df6c8
-#pending xattrs on all the bricks should be 0.
1df6c8
-TEST $CLI volume start $V0 force
1df6c8
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
1df6c8
-TEST ls $M0
1df6c8
-
1df6c8
 TEST cat $M0/f1
1df6c8
 TEST cat $M0/f2
1df6c8
 TEST cat $M0/f3
1df6c8
diff --git a/tests/bugs/replicate/bug-1801624-entry-heal.t b/tests/bugs/replicate/bug-1801624-entry-heal.t
1df6c8
new file mode 100644
1df6c8
index 0000000..94b4651
1df6c8
--- /dev/null
1df6c8
+++ b/tests/bugs/replicate/bug-1801624-entry-heal.t
1df6c8
@@ -0,0 +1,58 @@
1df6c8
+#!/bin/bash
1df6c8
+
1df6c8
+. $(dirname $0)/../../include.rc
1df6c8
+. $(dirname $0)/../../volume.rc
1df6c8
+cleanup;
1df6c8
+
1df6c8
+TEST glusterd
1df6c8
+TEST pidof glusterd
1df6c8
+TEST $CLI volume create $V0 replica 3 $H0:$B0/brick{0,1,2}
1df6c8
+TEST $CLI volume set $V0 heal-timeout 5
1df6c8
+TEST $CLI volume start $V0
1df6c8
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
1df6c8
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
1df6c8
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
1df6c8
+TEST $CLI volume heal $V0 granular-entry-heal enable
1df6c8
+
1df6c8
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
1df6c8
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
1df6c8
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
1df6c8
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
1df6c8
+echo "Data">$M0/FILE
1df6c8
+ret=$?
1df6c8
+TEST [ $ret -eq 0 ]
1df6c8
+
1df6c8
+# Re-create the file when a brick is down.
1df6c8
+TEST kill_brick $V0 $H0 $B0/brick1
1df6c8
+TEST rm $M0/FILE
1df6c8
+echo "New Data">$M0/FILE
1df6c8
+ret=$?
1df6c8
+TEST [ $ret -eq 0 ]
1df6c8
+EXPECT_WITHIN $HEAL_TIMEOUT "4" get_pending_heal_count $V0
1df6c8
+
1df6c8
+# Launching index heal must not reset parent dir afr xattrs or remove granular entry indices.
1df6c8
+$CLI volume heal $V0 # CLI will fail but heal is launched anyway.
1df6c8
+TEST sleep 5 # give index heal a chance to do one run.
1df6c8
+brick0_pending=$(get_hex_xattr trusted.afr.$V0-client-1 $B0/brick0/)
1df6c8
+brick2_pending=$(get_hex_xattr trusted.afr.$V0-client-1 $B0/brick2/)
1df6c8
+TEST [ $brick0_pending -eq "000000000000000000000002" ]
1df6c8
+TEST [ $brick2_pending -eq "000000000000000000000002" ]
1df6c8
+EXPECT "FILE" ls $B0/brick0/.glusterfs/indices/entry-changes/00000000-0000-0000-0000-000000000001/
1df6c8
+EXPECT "FILE" ls $B0/brick2/.glusterfs/indices/entry-changes/00000000-0000-0000-0000-000000000001/
1df6c8
+
1df6c8
+TEST $CLI volume start $V0 force
1df6c8
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1
1df6c8
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
1df6c8
+$CLI volume heal $V0
1df6c8
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
1df6c8
+
1df6c8
+# No gfid-split-brain (i.e. EIO) must be seen. Try on fresh mount to avoid cached values.
1df6c8
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
1df6c8
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
1df6c8
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
1df6c8
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
1df6c8
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
1df6c8
+TEST cat $M0/FILE
1df6c8
+
1df6c8
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
1df6c8
+cleanup;
1df6c8
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
1df6c8
index d6b70e9..939246e 100644
1df6c8
--- a/xlators/cluster/afr/src/afr-common.c
1df6c8
+++ b/xlators/cluster/afr/src/afr-common.c
1df6c8
@@ -6632,7 +6632,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque)
1df6c8
         ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0,
1df6c8
                                    locked_on);
1df6c8
         {
1df6c8
-            if (ret < AFR_SH_MIN_PARTICIPANTS)
1df6c8
+            if (ret < priv->child_count)
1df6c8
                 goto data_unlock;
1df6c8
             ret = __afr_selfheal_data_prepare(
1df6c8
                 heal_frame, this, inode, locked_on, sources, sinks,
1df6c8
@@ -6649,7 +6649,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque)
1df6c8
         ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name,
1df6c8
                                    LLONG_MAX - 1, 0, locked_on);
1df6c8
         {
1df6c8
-            if (ret < AFR_SH_MIN_PARTICIPANTS)
1df6c8
+            if (ret < priv->child_count)
1df6c8
                 goto mdata_unlock;
1df6c8
             ret = __afr_selfheal_metadata_prepare(
1df6c8
                 heal_frame, this, inode, locked_on, sources, sinks,
1df6c8
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
1df6c8
index 81ef38a..ce1ea50 100644
1df6c8
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
1df6c8
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
1df6c8
@@ -1575,7 +1575,6 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
1df6c8
     char *accused = NULL;      /* Accused others without any self-accusal */
1df6c8
     char *pending = NULL;      /* Have pending operations on others */
1df6c8
     char *self_accused = NULL; /* Accused itself */
1df6c8
-    int min_participants = -1;
1df6c8
 
1df6c8
     priv = this->private;
1df6c8
 
1df6c8
@@ -1599,12 +1598,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
1df6c8
         }
1df6c8
     }
1df6c8
 
1df6c8
-    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) {
1df6c8
-        min_participants = priv->child_count;
1df6c8
-    } else {
1df6c8
-        min_participants = AFR_SH_MIN_PARTICIPANTS;
1df6c8
-    }
1df6c8
-    if (afr_success_count(replies, priv->child_count) < min_participants) {
1df6c8
+    if (afr_success_count(replies, priv->child_count) < priv->child_count) {
1df6c8
         /* Treat this just like locks not being acquired */
1df6c8
         return -ENOTCONN;
1df6c8
     }
1df6c8
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
1df6c8
index 3ce882e..40be898 100644
1df6c8
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
1df6c8
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
1df6c8
@@ -597,7 +597,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
1df6c8
     ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
1df6c8
                                locked_on);
1df6c8
     {
1df6c8
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
1df6c8
+        if (ret < priv->child_count) {
1df6c8
             gf_msg_debug(this->name, 0,
1df6c8
                          "%s: Skipping "
1df6c8
                          "entry self-heal as only %d sub-volumes "
1df6c8
@@ -991,7 +991,7 @@ __afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd,
1df6c8
     ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
1df6c8
                                data_lock);
1df6c8
     {
1df6c8
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
1df6c8
+        if (ret < priv->child_count) {
1df6c8
             gf_msg_debug(this->name, 0,
1df6c8
                          "%s: Skipping "
1df6c8
                          "entry self-heal as only %d sub-volumes could "
1df6c8
@@ -1115,7 +1115,7 @@ afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode)
1df6c8
     ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain,
1df6c8
                                            NULL, locked_on);
1df6c8
     {
1df6c8
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
1df6c8
+        if (ret < priv->child_count) {
1df6c8
             gf_msg_debug(this->name, 0,
1df6c8
                          "%s: Skipping "
1df6c8
                          "entry self-heal as only %d sub-volumes could "
1df6c8
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
1df6c8
index 36640b5..7d4f208 100644
1df6c8
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
1df6c8
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
1df6c8
@@ -514,7 +514,7 @@ afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
1df6c8
     ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname,
1df6c8
                                locked_on);
1df6c8
     {
1df6c8
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
1df6c8
+        if (ret < priv->child_count) {
1df6c8
             ret = -ENOTCONN;
1df6c8
             goto unlock;
1df6c8
         }
1df6c8
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
1df6c8
index 6555ec5..8234cec 100644
1df6c8
--- a/xlators/cluster/afr/src/afr-self-heal.h
1df6c8
+++ b/xlators/cluster/afr/src/afr-self-heal.h
1df6c8
@@ -11,8 +11,6 @@
1df6c8
 #ifndef _AFR_SELFHEAL_H
1df6c8
 #define _AFR_SELFHEAL_H
1df6c8
 
1df6c8
-#define AFR_SH_MIN_PARTICIPANTS 2
1df6c8
-
1df6c8
 /* Perform fop on all UP subvolumes and wait for all callbacks to return */
1df6c8
 
1df6c8
 #define AFR_ONALL(frame, rfn, fop, args...)                                    \
1df6c8
-- 
1df6c8
1.8.3.1
1df6c8