From 307074330db6e9f14941dfbabbe6f299cf841533 Mon Sep 17 00:00:00 2001
From: karthik-us <ksubrahm@redhat.com>
Date: Mon, 10 Jun 2019 23:58:16 +0530
Subject: [PATCH 178/178] cluster/afr: Don't treat all bricks having metadata
 pending as split-brain

Backport of: https://review.gluster.org/#/c/glusterfs/+/22831/

Problem:
We currently do not have a way to roll back or undo post-ops if quorum is not
met. Though the FOP is still unwound with failure, the xattrs remain on the
disk. Due to these partial post-ops and partial heals (healing only when 2
bricks are up), we can end up in metadata split-brain purely from the afr
xattrs' point of view, i.e., each brick is blamed by at least one of the
others for metadata. These scenarios are hit when there are frequent
connects/disconnects of the client/shd to the bricks.

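As a concrete example, the cyclic case exercised by the new test below leaves
the metadata counters of the trusted.afr.* changelog xattrs as:

    brick0: trusted.afr.<vol>-client-1 = 1   (brick0 blames brick1)
    brick1: trusted.afr.<vol>-client-2 = 1   (brick1 blames brick2)
    brick2: trusted.afr.<vol>-client-0 = 1   (brick2 blames brick0)

Every brick is blamed by one of the others, so the old logic reported
metadata split-brain even though the copies can still be reconciled.
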
Fix:
Pick a source based on the xattr values. If 2 bricks blame one, the blamed
brick must be treated as a sink. If there is no majority, all are sources.
Once we pick a source, self-heal will then do the heal instead of erroring
out due to split-brain.
This patch also adds the restriction that all bricks must be up in order to
perform metadata heal, to avoid any metadata loss.

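As an illustration of the source-picking idea (a minimal sketch only; the
actual changes are in afr_selfheal_find_direction() and
afr_selfheal_post_op_failure_accounting(), and every name below is
hypothetical):

    #include <stdbool.h>

    /* Sketch: blames[i][j] is true when brick i blames brick j for
     * metadata.  Mark a brick as a sink when a majority of the other
     * bricks blame it; if no brick ends up as a sink, treat all bricks
     * as sources instead of declaring split-brain. */
    static void
    pick_metadata_sources(int nbricks, bool blames[nbricks][nbricks],
                          bool sources[nbricks])
    {
        bool have_sink = false;

        for (int j = 0; j < nbricks; j++) {
            int accused = 0;

            for (int i = 0; i < nbricks; i++)
                if (i != j && blames[i][j])
                    accused++;

            /* e.g. for replica 3, a sink is blamed by both other bricks */
            sources[j] = !(accused > (nbricks - 1) / 2);
            if (!sources[j])
                have_sink = true;
        }

        if (!have_sink)
            for (int j = 0; j < nbricks; j++)
                sources[j] = true;
    }

In the real code the existing data-transaction accounting is reused for
metadata transactions, which is what the afr-self-heal-common.c hunks below
enable.
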
Removed the test case tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
as it was performing metadata heal even when only 2 of the 3 bricks were up.

Change-Id: I02064ecb7d68d498f75a353af64f75249a633508
fixes: bz#1715438
Signed-off-by: karthik-us <ksubrahm@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172935
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 .../bug-1468279-source-not-blaming-sinks.t         |  64 ----------
 .../bug-1717819-metadata-split-brain-detection.t   | 130 +++++++++++++++++++++
 xlators/cluster/afr/src/afr-self-heal-common.c     |   4 +-
 xlators/cluster/afr/src/afr-self-heal-metadata.c   |   2 +-
 4 files changed, 133 insertions(+), 67 deletions(-)
 delete mode 100644 tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
 create mode 100644 tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t

diff --git a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t b/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
deleted file mode 100644
index 054a4ad..0000000
--- a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-. $(dirname $0)/../../include.rc
-. $(dirname $0)/../../volume.rc
-cleanup;
-
-TEST glusterd
-TEST pidof glusterd
-TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
-TEST $CLI volume start $V0
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-TEST $CLI volume set $V0 cluster.metadata-self-heal off
-TEST $GFS --volfile-id=$V0 --volfile-server=$H0 --attribute-timeout=0 --entry-timeout=0 $M0;
-TEST touch $M0/file
-
-# Kill B1, create a pending metadata heal.
-TEST kill_brick $V0 $H0 $B0/${V0}0
-TEST setfattr -n user.xattr -v value1 $M0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
-
-# Kill B2, heal from B3 to B1.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
-TEST kill_brick $V0 $H0 $B0/${V0}1
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
-$CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-0 "metadata"
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-
-# Create another pending metadata heal.
-TEST setfattr -n user.xattr -v value2 $M0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
-
-# Kill B1, heal from B3 to B2
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
-TEST kill_brick $V0 $H0 $B0/${V0}0
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
-$CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-1 "metadata"
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-
-# ALL bricks up again.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
-# B1 and B2 blame each other, B3 doesn't blame anyone.
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-TEST $CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
-
-cleanup;
diff --git a/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
new file mode 100644
index 0000000..94b8bf3
--- /dev/null
+++ b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+
+cleanup;
+
+## Start and create a volume
+TEST glusterd;
+TEST pidof glusterd;
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2};
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+TEST $CLI volume heal $V0 disable
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0
+
+###############################################################################
+# Case of 2 bricks blaming the third and the third blaming the other two.
+
+TEST mkdir $M0/dir
+
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST setfattr -n user.metadata -v 1 $M0/dir
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir trusted.afr.$V0-client-1 metadata
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir trusted.afr.$V0-client-1 metadata
+CLIENT_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $M0/dir)
+
+# B1 must blame B0 and B2
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir)
+
+TEST [ "$CLIENT_XATTR" == "$B0_XATTR" ]
+TEST [ "$CLIENT_XATTR" == "$B1_XATTR" ]
+TEST [ "$CLIENT_XATTR" == "$B2_XATTR" ]
+TEST setfattr -x user.metadata $M0/dir
+
+###############################################################################
+# Case of each brick blaming the next one in a cyclic manner
+
+TEST $CLI volume heal $V0 disable
+TEST `echo "hello" >> $M0/dir/file`
+# Mark cyclic xattrs and modify metadata directly on the bricks.
+setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"2"/dir/file
+
+setfattr -n user.metadata -v 1 $B0/$V0"0"/dir/file
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
+
+# Add entry to xattrop dir to trigger index heal.
+xattrop_dir0=$(afr_get_index_path $B0/$V0"0")
+base_entry_b0=`ls $xattrop_dir0`
+gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/file))
+ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str
+EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0
+
+# Launch heal
+TEST $CLI volume heal $V0 enable
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
+
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
+TEST rm -f $M0/dir/file
+
+###############################################################################
+# Case of 2 bricks having quorum blaming and the other having only one blaming.
+
+TEST $CLI volume heal $V0 disable
+TEST `echo "hello" >> $M0/dir/file`
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST setfattr -n user.metadata -v 1 $M0/dir/file
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir/file trusted.afr.$V0-client-1 metadata
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir/file trusted.afr.$V0-client-1 metadata
+
+# B1 must blame B0 and B2
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+
+# B0 must blame B2
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
+
+# Modify the metadata directly on the bricks B1 & B2.
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
+
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
+
+###############################################################################
+
+cleanup
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 595bed4..5157e7d 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1590,7 +1590,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION) {
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) {
         min_participants = priv->child_count;
     } else {
         min_participants = AFR_SH_MIN_PARTICIPANTS;
@@ -1656,7 +1656,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION)
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
         afr_selfheal_post_op_failure_accounting(priv, accused, sources,
                                                 locked_on);
 
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index ba43341..ecfa791 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -398,7 +398,7 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
     ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
                                data_lock);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
            ret = -ENOTCONN;
            goto unlock;
        }
-- 
1.8.3.1