From 307074330db6e9f14941dfbabbe6f299cf841533 Mon Sep 17 00:00:00 2001
From: karthik-us <ksubrahm@redhat.com>
Date: Mon, 10 Jun 2019 23:58:16 +0530
Subject: [PATCH 178/178] cluster/afr: Don't treat all bricks having metadata
 pending as split-brain

Backport of: https://review.gluster.org/#/c/glusterfs/+/22831/

Problem:
We currently do not roll back or undo post-ops if quorum is not met.
Though the FOP is still unwound with failure, the xattrs remain on the disk.
Due to these partial post-ops and partial heals (healing only when 2 bricks
are up), we can end up in metadata split-brain purely from the afr xattrs
point of view, i.e., each brick is blamed by at least one of the others for
metadata. These scenarios are hit when there is frequent connect/disconnect
of the client/shd to the bricks.
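
For example, on a replica 3 volume the bricks can end up blaming each other
for metadata in a cycle (a hypothetical state; in the pending xattr value
the middle 8 hex digits are the metadata counter):

  # on brick 0: trusted.afr.<VOL>-client-1 = 0x000000000000000100000000
  # on brick 1: trusted.afr.<VOL>-client-2 = 0x000000000000000100000000
  # on brick 2: trusted.afr.<VOL>-client-0 = 0x000000000000000100000000

Here every brick is blamed for metadata by one of the others, so the old
logic saw split-brain even though no brick is blamed by a majority.
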
Fix:
Pick a source based on the xattr values. If 2 bricks blame one, the blamed
one must be treated as a sink. If there is no majority, all are sources. Once
we pick a source, self-heal will then do the heal instead of erroring out
due to split-brain.
This patch also adds a restriction that all bricks must be up in order to
perform metadata heal, to avoid any metadata loss.
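
As a minimal sketch (a hypothetical helper, not the actual AFR code), the
source-picking rule reduces to counting how many bricks blame each brick,
assuming pending[i][j] holds the metadata pending count brick i records
against brick j:

    static void
    pick_metadata_sources(int child_count,
                          int pending[child_count][child_count],
                          int *sources)
    {
        for (int j = 0; j < child_count; j++) {
            int accused = 0; /* how many other bricks blame brick j */
            for (int i = 0; i < child_count; i++) {
                if (i != j && pending[i][j] > 0)
                    accused++;
            }
            /* Blamed by a majority => sink (0); otherwise the brick
             * stays a source (1). With no majority blame anywhere
             * (e.g. a blame cycle), all bricks remain sources and
             * heal proceeds instead of reporting split-brain. */
            sources[j] = (2 * accused <= child_count) ? 1 : 0;
        }
    }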

Removed the test case tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
since it was doing metadata heal even when only 2 of the 3 bricks were up.

Change-Id: I02064ecb7d68d498f75a353af64f75249a633508
fixes: bz#1715438
Signed-off-by: karthik-us <ksubrahm@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172935
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 .../bug-1468279-source-not-blaming-sinks.t         |  64 ----------
 .../bug-1717819-metadata-split-brain-detection.t   | 130 +++++++++++++++++++++
 xlators/cluster/afr/src/afr-self-heal-common.c     |   4 +-
 xlators/cluster/afr/src/afr-self-heal-metadata.c   |   2 +-
 4 files changed, 133 insertions(+), 67 deletions(-)
 delete mode 100644 tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
 create mode 100644 tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t

diff --git a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t b/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
deleted file mode 100644
index 054a4ad..0000000
--- a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-. $(dirname $0)/../../include.rc
-. $(dirname $0)/../../volume.rc
-cleanup;
-
-TEST glusterd
-TEST pidof glusterd
-TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
-TEST $CLI volume start $V0
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-TEST $CLI volume set $V0 cluster.metadata-self-heal off
-TEST $GFS --volfile-id=$V0 --volfile-server=$H0 --attribute-timeout=0 --entry-timeout=0 $M0;
-TEST touch $M0/file
-
-# Kill B1, create a pending metadata heal.
-TEST kill_brick $V0 $H0 $B0/${V0}0
-TEST setfattr -n user.xattr -v value1 $M0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
-
-# Kill B2, heal from B3 to B1.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
-TEST kill_brick $V0 $H0 $B0/${V0}1
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
-$CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-0 "metadata"
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-
-# Create another pending metadata heal.
-TEST setfattr -n user.xattr -v value2 $M0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
-
-# Kill B1, heal from B3 to B2
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
-TEST kill_brick $V0 $H0 $B0/${V0}0
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
-$CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-1 "metadata"
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-
-# ALL bricks up again.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
-# B1 and B2 blame each other, B3 doesn't blame anyone.
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-TEST $CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
-
-cleanup;
diff --git a/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
new file mode 100644
index 0000000..94b8bf3
--- /dev/null
+++ b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+
+cleanup;
+
+## Start and create a volume
+TEST glusterd;
+TEST pidof glusterd;
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2};
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+TEST $CLI volume heal $V0 disable
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0
+
+###############################################################################
+# Case of 2 bricks blaming the third and the third blaming the other two.
+
+TEST mkdir $M0/dir
+
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST setfattr -n user.metadata -v 1 $M0/dir
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir trusted.afr.$V0-client-1 metadata
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir trusted.afr.$V0-client-1 metadata
+CLIENT_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $M0/dir)
+
+# B1 must blame B0 and B2
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir)
+
+TEST [ "$CLIENT_XATTR" == "$B0_XATTR" ]
+TEST [ "$CLIENT_XATTR" == "$B1_XATTR" ]
+TEST [ "$CLIENT_XATTR" == "$B2_XATTR" ]
+TEST setfattr -x user.metadata $M0/dir
+
+###############################################################################
+# Case of each brick blaming the next one in a cyclic manner
+
+TEST $CLI volume heal $V0 disable
+TEST `echo "hello" >> $M0/dir/file`
+# Mark cyclic xattrs and modify metadata directly on the bricks.
+setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"2"/dir/file
+
+setfattr -n user.metadata -v 1 $B0/$V0"0"/dir/file
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
+
+# Add entry to xattrop dir to trigger index heal.
+xattrop_dir0=$(afr_get_index_path $B0/$V0"0")
+base_entry_b0=`ls $xattrop_dir0`
+gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/file))
+ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str
+EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0
+
+# Launch heal
+TEST $CLI volume heal $V0 enable
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
+
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
+TEST rm -f $M0/dir/file
+
+###############################################################################
+# Case of 2 bricks having quorum blaming and the other having only one blaming.
+
+TEST $CLI volume heal $V0 disable
+TEST `echo "hello" >> $M0/dir/file`
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST setfattr -n user.metadata -v 1 $M0/dir/file
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir/file trusted.afr.$V0-client-1 metadata
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir/file trusted.afr.$V0-client-1 metadata
+
+# B1 must blame B0 and B2
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+
+# B0 must blame B2
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
+
+# Modify the metadata directly on the bricks B1 & B2.
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
+
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
+
+###############################################################################
+
+cleanup
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 595bed4..5157e7d 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1590,7 +1590,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION) {
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) {
         min_participants = priv->child_count;
     } else {
         min_participants = AFR_SH_MIN_PARTICIPANTS;
@@ -1656,7 +1656,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION)
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
         afr_selfheal_post_op_failure_accounting(priv, accused, sources,
                                                 locked_on);
 
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index ba43341..ecfa791 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -398,7 +398,7 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
     ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
                                data_lock);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             ret = -ENOTCONN;
             goto unlock;
         }
-- 
1.8.3.1