From 2b2eb846c49caba13ab92ec66af20292e7780fc1 Mon Sep 17 00:00:00 2001
From: Ravishankar N <ravishankar@redhat.com>
Date: Tue, 11 Feb 2020 14:34:48 +0530
Subject: [PATCH 410/449] afr: prevent spurious entry heals leading to gfid
 split-brain

Problem:
In a hyperconverged setup with granular-entry-heal enabled, if a file is
recreated while one of the bricks is down, and an index heal is triggered
(with the brick still down), entry self-heal was doing a spurious heal
with just the 2 good bricks. It was doing a post-op, leading to removal
of the filename from .glusterfs/indices/entry-changes as well as
erroneous setting of afr xattrs on the parent. When the brick came up,
the xattrs were cleared, resulting in the recreated file not getting
healed and leading to gfid split-brain and EIO on the mount.

Fix:
Proceed with entry heal only when shd can connect to all bricks of the
replica, just like in data and metadata heal.
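
In essence, every entry-heal lock guard changes from "at least
AFR_SH_MIN_PARTICIPANTS (2) bricks locked" to "all bricks locked". A
minimal standalone sketch of that decision, not the actual xlator code
(old_guard/new_guard and the counts below are illustrative):

    #include <stdio.h>

    /* Old guard: heal proceeded once any 2 bricks were locked. */
    static int old_guard(int locked_count) { return locked_count >= 2; }

    /* New guard: heal proceeds only when every brick of the replica
     * is locked, mirroring `ret < priv->child_count` in the patch. */
    static int new_guard(int locked_count, int child_count)
    {
        return locked_count >= child_count;
    }

    int main(void)
    {
        /* Replica 3 with one brick down: only 2 locks are acquired. */
        printf("old: heal %s\n", old_guard(2) ? "runs" : "is skipped");
        printf("new: heal %s\n", new_guard(2, 3) ? "runs" : "is skipped");
        return 0;
    }

With the new guard, the re-create scenario above no longer heals with a
bare majority, so the entry-changes index and the parent's afr xattrs
are left alone until all bricks are reachable.
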
BUG: 1804164

> Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24109/
> fixes: bz#1801624
> Change-Id: I916ae26ad1fabf259bc6362da52d433b7223b17e
> Signed-off-by: Ravishankar N <ravishankar@redhat.com>

Change-Id: I23f57e543cff1e3f35eb8dbc60a2babfae6838c7
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/202395
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 .../bug-1433571-undo-pending-only-on-up-bricks.t   | 18 ++-----
 tests/bugs/replicate/bug-1801624-entry-heal.t      | 58 ++++++++++++++++++++++
 xlators/cluster/afr/src/afr-common.c               |  4 +-
 xlators/cluster/afr/src/afr-self-heal-common.c     |  8 +--
 xlators/cluster/afr/src/afr-self-heal-entry.c      |  6 +--
 xlators/cluster/afr/src/afr-self-heal-name.c       |  2 +-
 xlators/cluster/afr/src/afr-self-heal.h            |  2 -
 7 files changed, 69 insertions(+), 29 deletions(-)
 create mode 100644 tests/bugs/replicate/bug-1801624-entry-heal.t

diff --git a/tests/bugs/replicate/bug-1433571-undo-pending-only-on-up-bricks.t b/tests/bugs/replicate/bug-1433571-undo-pending-only-on-up-bricks.t
index 0767f47..10ce013 100644
--- a/tests/bugs/replicate/bug-1433571-undo-pending-only-on-up-bricks.t
+++ b/tests/bugs/replicate/bug-1433571-undo-pending-only-on-up-bricks.t
@@ -49,25 +49,15 @@ TEST $CLI volume start $V0 force
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2
 
-#Kill brick 0 and turn on the client side heal and do ls to trigger the heal.
-#The pending xattrs on bricks 1 & 2 should have pending entry on brick 0.
-TEST kill_brick $V0 $H0 $B0/${V0}0
+# We were killing one brick and checking that entry heal does not reset the
+# pending xattrs for the down brick. Now that we need all bricks to be up for
+# entry heal, I'm removing that test from the .t
+
 TEST $CLI volume set $V0 cluster.data-self-heal on
 TEST $CLI volume set $V0 cluster.metadata-self-heal on
 TEST $CLI volume set $V0 cluster.entry-self-heal on
 
 TEST ls $M0
-EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1
-EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2
-EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}1
-EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2
-
-#Bring back all the bricks and trigger the heal again by doing ls. Now the
-#pending xattrs on all the bricks should be 0.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
-TEST ls $M0
-
 TEST cat $M0/f1
 TEST cat $M0/f2
 TEST cat $M0/f3
diff --git a/tests/bugs/replicate/bug-1801624-entry-heal.t b/tests/bugs/replicate/bug-1801624-entry-heal.t
new file mode 100644
index 0000000..94b4651
--- /dev/null
+++ b/tests/bugs/replicate/bug-1801624-entry-heal.t
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/brick{0,1,2}
+TEST $CLI volume set $V0 heal-timeout 5
+TEST $CLI volume start $V0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0 granular-entry-heal enable
+
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+echo "Data">$M0/FILE
+ret=$?
+TEST [ $ret -eq 0 ]
+
+# Re-create the file when a brick is down.
+TEST kill_brick $V0 $H0 $B0/brick1
+TEST rm $M0/FILE
+echo "New Data">$M0/FILE
+ret=$?
+TEST [ $ret -eq 0 ]
+EXPECT_WITHIN $HEAL_TIMEOUT "4" get_pending_heal_count $V0
+
+# Launching index heal must not reset parent dir afr xattrs or remove granular entry indices.
+$CLI volume heal $V0 # CLI will fail but heal is launched anyway.
+TEST sleep 5 # give index heal a chance to do one run.
+brick0_pending=$(get_hex_xattr trusted.afr.$V0-client-1 $B0/brick0/)
+brick2_pending=$(get_hex_xattr trusted.afr.$V0-client-1 $B0/brick2/)
+TEST [ $brick0_pending -eq "000000000000000000000002" ]
+TEST [ $brick2_pending -eq "000000000000000000000002" ]
+EXPECT "FILE" ls $B0/brick0/.glusterfs/indices/entry-changes/00000000-0000-0000-0000-000000000001/
+EXPECT "FILE" ls $B0/brick2/.glusterfs/indices/entry-changes/00000000-0000-0000-0000-000000000001/
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+$CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
+
+# No gfid-split-brain (i.e. EIO) must be seen. Try on fresh mount to avoid cached values.
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+TEST cat $M0/FILE
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 32127c6..5806556 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -6629,7 +6629,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque)
         ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0,
                                    locked_on);
         {
-            if (ret < AFR_SH_MIN_PARTICIPANTS)
+            if (ret < priv->child_count)
                 goto data_unlock;
             ret = __afr_selfheal_data_prepare(
                 heal_frame, this, inode, locked_on, sources, sinks,
@@ -6646,7 +6646,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque)
         ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name,
                                    LLONG_MAX - 1, 0, locked_on);
         {
-            if (ret < AFR_SH_MIN_PARTICIPANTS)
+            if (ret < priv->child_count)
                 goto mdata_unlock;
             ret = __afr_selfheal_metadata_prepare(
                 heal_frame, this, inode, locked_on, sources, sinks,
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 81ef38a..ce1ea50 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1575,7 +1575,6 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
     char *accused = NULL;      /* Accused others without any self-accusal */
     char *pending = NULL;      /* Have pending operations on others */
     char *self_accused = NULL; /* Accused itself */
-    int min_participants = -1;
 
     priv = this->private;
 
@@ -1599,12 +1598,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) {
-        min_participants = priv->child_count;
-    } else {
-        min_participants = AFR_SH_MIN_PARTICIPANTS;
-    }
-    if (afr_success_count(replies, priv->child_count) < min_participants) {
+    if (afr_success_count(replies, priv->child_count) < priv->child_count) {
         /* Treat this just like locks not being acquired */
         return -ENOTCONN;
     }
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 3ce882e..40be898 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -597,7 +597,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
     ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
                                locked_on);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             gf_msg_debug(this->name, 0,
                          "%s: Skipping "
                          "entry self-heal as only %d sub-volumes "
@@ -991,7 +991,7 @@ __afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd,
     ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
                                data_lock);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             gf_msg_debug(this->name, 0,
                          "%s: Skipping "
                          "entry self-heal as only %d sub-volumes could "
@@ -1115,7 +1115,7 @@ afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode)
     ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain,
                                            NULL, locked_on);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             gf_msg_debug(this->name, 0,
                          "%s: Skipping "
                          "entry self-heal as only %d sub-volumes could "
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index 36640b5..7d4f208 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -514,7 +514,7 @@ afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
     ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname,
                                locked_on);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             ret = -ENOTCONN;
             goto unlock;
         }
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 6555ec5..8234cec 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -11,8 +11,6 @@
 #ifndef _AFR_SELFHEAL_H
 #define _AFR_SELFHEAL_H
 
-#define AFR_SH_MIN_PARTICIPANTS 2
-
 /* Perform fop on all UP subvolumes and wait for all callbacks to return */
 
 #define AFR_ONALL(frame, rfn, fop, args...)                                    \
-- 
1.8.3.1