From 307074330db6e9f14941dfbabbe6f299cf841533 Mon Sep 17 00:00:00 2001
From: karthik-us <ksubrahm@redhat.com>
Date: Mon, 10 Jun 2019 23:58:16 +0530
Subject: [PATCH 178/178] Cluster/afr: Don't treat all bricks having metadata
 pending as split-brain

Backport of: https://review.gluster.org/#/c/glusterfs/+/22831/

Problem:
We currently do not roll back or undo post-ops if quorum is not met.
Though the FOP is still unwound with failure, the xattrs remain on the disk.
Due to these partial post-ops and partial heals (healing only when 2 bricks
are up), we can end up in metadata split-brain purely from the afr xattrs
point of view, i.e. each brick is blamed by at least one of the others for
metadata. These scenarios are hit when the client/shd frequently connects
to and disconnects from the bricks.
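
As an illustration, a replica-3 volume can end up with each brick
blaming one of the others for metadata in a cycle (hypothetical layout;
the xattr value is the one used in the new test below):

    # brick0: trusted.afr.<vol>-client-1 = 0x000000000000000100000000
    # brick1: trusted.afr.<vol>-client-2 = 0x000000000000000100000000
    # brick2: trusted.afr.<vol>-client-0 = 0x000000000000000100000000

Every brick is blamed by at least one other, so no brick looks like a
clean source.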

Fix:
Pick a source based on the xattr values. If 2 bricks blame one, the
blamed one must be treated as a sink. If there is no majority, all are
sources. Once we pick a source, self-heal will then do the heal instead
of erroring out due to split-brain.
This patch also adds a restriction that all the bricks must be up in
order to perform metadata heal, so as to avoid any metadata loss.
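
A minimal sketch of this source-picking rule (the function name and the
accused[] representation are assumptions for illustration; this is not
the actual afr code):

    #include <stdbool.h>

    /* accused[i] = how many other bricks blame brick i for metadata. */
    static void
    pick_metadata_sources(int child_count, const int *accused, bool *sources)
    {
        int majority = child_count / 2 + 1;
        bool have_sink = false;
        int i;

        for (i = 0; i < child_count; i++) {
            /* A brick blamed by a majority of its peers is a sink. */
            if (accused[i] >= majority)
                have_sink = true;
        }

        for (i = 0; i < child_count; i++) {
            if (have_sink) {
                /* Sources are the bricks not blamed by a majority. */
                sources[i] = (accused[i] < majority);
            } else {
                /* No majority blame anywhere: treat all bricks as
                 * sources instead of declaring split-brain. */
                sources[i] = true;
            }
        }
    }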

Removed the test case tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
as it performed metadata heal even when only 2 of 3 bricks were up.

Change-Id: I02064ecb7d68d498f75a353af64f75249a633508
fixes: bz#1715438
Signed-off-by: karthik-us <ksubrahm@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172935
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 .../bug-1468279-source-not-blaming-sinks.t         |  64 ----------
 .../bug-1717819-metadata-split-brain-detection.t   | 130 +++++++++++++++++++++
 xlators/cluster/afr/src/afr-self-heal-common.c     |   4 +-
 xlators/cluster/afr/src/afr-self-heal-metadata.c   |   2 +-
 4 files changed, 133 insertions(+), 67 deletions(-)
 delete mode 100644 tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
 create mode 100644 tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t

diff --git a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t b/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
deleted file mode 100644
index 054a4ad..0000000
--- a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-. $(dirname $0)/../../include.rc
-. $(dirname $0)/../../volume.rc
-cleanup;
-
-TEST glusterd
-TEST pidof glusterd
-TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
-TEST $CLI volume start $V0
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-TEST $CLI volume set $V0 cluster.metadata-self-heal off
-TEST $GFS --volfile-id=$V0 --volfile-server=$H0 --attribute-timeout=0 --entry-timeout=0 $M0;
-TEST touch $M0/file
-
-# Kill B1, create a pending metadata heal.
-TEST kill_brick $V0 $H0 $B0/${V0}0
-TEST setfattr -n user.xattr -v value1 $M0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
-
-# Kill B2, heal from B3 to B1.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
-TEST kill_brick $V0 $H0 $B0/${V0}1
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
-$CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-0 "metadata"
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-
-# Create another pending metadata heal.
-TEST setfattr -n user.xattr -v value2 $M0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
-
-# Kill B1, heal from B3 to B2
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
-TEST kill_brick $V0 $H0 $B0/${V0}0
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
-$CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-1 "metadata"
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-
-# ALL bricks up again.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
-# B1 and B2 blame each other, B3 doesn't blame anyone.
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-TEST $CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
-
-cleanup;
diff --git a/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
new file mode 100644
index 0000000..94b8bf3
--- /dev/null
+++ b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+
+cleanup;
+
+## Start and create a volume
+TEST glusterd;
+TEST pidof glusterd;
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2};
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+TEST $CLI volume heal $V0 disable
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0
+
+###############################################################################
+# Case of 2 bricks blaming the third and the third blaming the other two.
+
+TEST mkdir $M0/dir
+
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST setfattr -n user.metadata -v 1 $M0/dir
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir trusted.afr.$V0-client-1 metadata
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir trusted.afr.$V0-client-1 metadata
+CLIENT_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $M0/dir)
+
+# B1 must blame B0 and B2
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir)
+
+TEST [ "$CLIENT_XATTR" == "$B0_XATTR" ]
+TEST [ "$CLIENT_XATTR" == "$B1_XATTR" ]
+TEST [ "$CLIENT_XATTR" == "$B2_XATTR" ]
+TEST setfattr -x user.metadata $M0/dir
+
+###############################################################################
+# Case of each brick blaming the next one in a cyclic manner
+
+TEST $CLI volume heal $V0 disable
+TEST `echo "hello" >> $M0/dir/file`
+# Mark cyclic xattrs and modify metadata directly on the bricks.
+setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"2"/dir/file
+
+setfattr -n user.metadata -v 1 $B0/$V0"0"/dir/file
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
+
+# Add entry to xattrop dir to trigger index heal.
+xattrop_dir0=$(afr_get_index_path $B0/$V0"0")
+base_entry_b0=`ls $xattrop_dir0`
+gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/file))
+ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str
+EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0
+
+# Launch heal
+TEST $CLI volume heal $V0 enable
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
+
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
+TEST rm -f $M0/dir/file
+
+###############################################################################
+# Case where two bricks are blamed by a quorum and the third is blamed by only one.
+
+TEST $CLI volume heal $V0 disable
+TEST `echo "hello" >> $M0/dir/file`
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST setfattr -n user.metadata -v 1 $M0/dir/file
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir/file trusted.afr.$V0-client-1 metadata
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir/file trusted.afr.$V0-client-1 metadata
+
+# B1 must blame B0 and B2
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+
+# B0 must blame B2
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
+
+# Modify the metadata directly on the bricks B1 & B2.
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
+
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
+
+###############################################################################
+
+cleanup
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 595bed4..5157e7d 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1590,7 +1590,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION) {
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) {
         min_participants = priv->child_count;
     } else {
         min_participants = AFR_SH_MIN_PARTICIPANTS;
@@ -1656,7 +1656,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION)
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
         afr_selfheal_post_op_failure_accounting(priv, accused, sources,
                                                 locked_on);
 
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index ba43341..ecfa791 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -398,7 +398,7 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
     ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
                                data_lock);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             ret = -ENOTCONN;
             goto unlock;
         }
-- 
1.8.3.1