From 307074330db6e9f14941dfbabbe6f299cf841533 Mon Sep 17 00:00:00 2001
From: karthik-us <ksubrahm@redhat.com>
Date: Mon, 10 Jun 2019 23:58:16 +0530
Subject: [PATCH 178/178] cluster/afr: Don't treat all bricks having metadata
 pending as split-brain

Backport of: https://review.gluster.org/#/c/glusterfs/+/22831/

Problem:
We currently do not roll back or undo post-ops if quorum is not met.
Though the FOP is still unwound with failure, the xattrs remain on the
disk. Due to these partial post-ops and partial heals (healing only
when 2 bricks are up), we can end up in metadata split-brain purely
from the point of view of the afr xattrs, i.e. each brick is blamed by
at least one of the others for metadata. These scenarios are hit when
the client/shd frequently connects to and disconnects from the bricks.
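
To make that end state concrete, here is a minimal, self-contained C
sketch (illustrative only, not gluster code) of the kind of blame
matrix such partial post-ops can leave behind; blame[i][j] != 0 stands
for "brick i holds a non-zero metadata pending count for brick j":

    #include <stdio.h>

    #define BRICKS 3

    int main(void)
    {
        /* Hypothetical pending-xattr view after partial post-ops:
         * B0 blames B1, B1 blames B2, B2 blames B0 (cyclic). */
        int blame[BRICKS][BRICKS] = {
            {0, 1, 0},
            {0, 0, 1},
            {1, 0, 0},
        };

        /* Every brick is blamed by at least one other, so the old
         * logic classified this as metadata split-brain. */
        for (int j = 0; j < BRICKS; j++) {
            int accusers = 0;
            for (int i = 0; i < BRICKS; i++)
                if (i != j && blame[i][j])
                    accusers++;
            printf("brick %d is blamed by %d brick(s)\n", j, accusers);
        }
        return 0;
    }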

Fix:
Pick a source based on the xattr values. If two bricks blame the
remaining one, the blamed brick must be treated as a sink. If there is
no majority, all bricks are treated as sources. Once a source is
picked, self-heal then performs the heal instead of erroring out due
to split-brain.
This patch also requires all bricks to be up in order to perform
metadata heal, so that no metadata is lost.
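
The rule above can be sketched as follows (a simplification over a
boolean blame matrix; the actual change below simply extends the
existing data-transaction accounting in afr_selfheal_find_direction()
to the metadata transaction):

    #include <stdbool.h>

    #define BRICKS 3

    /* Illustrative only: a brick blamed by a majority of its peers is
     * a sink; if no brick attracts a majority, keep every brick a
     * source so a plain metadata heal runs instead of declaring
     * split-brain. */
    static void
    pick_metadata_sources(const int blame[BRICKS][BRICKS],
                          bool source[BRICKS])
    {
        bool found_sink = false;

        for (int j = 0; j < BRICKS; j++) {
            int accusers = 0;
            for (int i = 0; i < BRICKS; i++)
                if (i != j && blame[i][j])
                    accusers++;
            /* e.g. 2 of 3 bricks blaming one => that one is a sink. */
            source[j] = (accusers <= (BRICKS - 1) / 2);
            if (!source[j])
                found_sink = true;
        }

        if (!found_sink) /* no majority anywhere: all are sources */
            for (int j = 0; j < BRICKS; j++)
                source[j] = true;
    }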

Removed the test case
tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t as it
performed metadata heal even when only 2 of the 3 bricks were up,
which the new restriction no longer permits.

Change-Id: I02064ecb7d68d498f75a353af64f75249a633508
fixes: bz#1715438
Signed-off-by: karthik-us <ksubrahm@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172935
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 .../bug-1468279-source-not-blaming-sinks.t         |  64 ----------
 .../bug-1717819-metadata-split-brain-detection.t   | 130 +++++++++++++++++++++
 xlators/cluster/afr/src/afr-self-heal-common.c     |   4 +-
 xlators/cluster/afr/src/afr-self-heal-metadata.c   |   2 +-
 4 files changed, 133 insertions(+), 67 deletions(-)
 delete mode 100644 tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
 create mode 100644 tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t

diff --git a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t b/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
deleted file mode 100644
index 054a4ad..0000000
--- a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-. $(dirname $0)/../../include.rc
-. $(dirname $0)/../../volume.rc
-cleanup;
-
-TEST glusterd
-TEST pidof glusterd
-TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
-TEST $CLI volume start $V0
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-TEST $CLI volume set $V0 cluster.metadata-self-heal off
-TEST $GFS --volfile-id=$V0 --volfile-server=$H0 --attribute-timeout=0 --entry-timeout=0 $M0;
-TEST touch $M0/file
-
-# Kill B1, create a pending metadata heal.
-TEST kill_brick $V0 $H0 $B0/${V0}0
-TEST setfattr -n user.xattr -v value1 $M0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
-
-# Kill B2, heal from B3 to B1.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
-TEST kill_brick $V0 $H0 $B0/${V0}1
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
-$CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-0 "metadata"
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-
-# Create another pending metadata heal.
-TEST setfattr -n user.xattr -v value2 $M0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
-
-# Kill B1, heal from B3 to B2
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
-TEST kill_brick $V0 $H0 $B0/${V0}0
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
-$CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-1 "metadata"
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-
-# ALL bricks up again.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
-# B1 and B2 blame each other, B3 doesn't blame anyone.
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-TEST $CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
-
-cleanup;
diff --git a/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
new file mode 100644
index 0000000..94b8bf3
--- /dev/null
+++ b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+
+cleanup;
+
+## Start and create a volume
+TEST glusterd;
+TEST pidof glusterd;
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2};
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+TEST $CLI volume heal $V0 disable
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0
+
+###############################################################################
+# Case of 2 bricks blaming the third and the third blaming the other two.
+
+TEST mkdir $M0/dir
+
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST setfattr -n user.metadata -v 1 $M0/dir
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir trusted.afr.$V0-client-1 metadata
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir trusted.afr.$V0-client-1 metadata
+CLIENT_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $M0/dir)
+
+# B1 must blame B0 and B2
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir)
+
+TEST [ "$CLIENT_XATTR" == "$B0_XATTR" ]
+TEST [ "$CLIENT_XATTR" == "$B1_XATTR" ]
+TEST [ "$CLIENT_XATTR" == "$B2_XATTR" ]
+TEST setfattr -x user.metadata $M0/dir
+
+###############################################################################
+# Case of each brick blaming the next one in a cyclic manner
+
+TEST $CLI volume heal $V0 disable
+TEST `echo "hello" >> $M0/dir/file`
+# Mark cyclic xattrs and modify metadata directly on the bricks.
+setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"2"/dir/file
+
+setfattr -n user.metadata -v 1 $B0/$V0"0"/dir/file
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
+
+# Add entry to xattrop dir to trigger index heal.
+xattrop_dir0=$(afr_get_index_path $B0/$V0"0")
+base_entry_b0=`ls $xattrop_dir0`
+gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/file))
+ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str
+EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0
+
+# Launch heal
+TEST $CLI volume heal $V0 enable
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
+
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
+TEST rm -f $M0/dir/file
+
+###############################################################################
+# Case of 2 bricks having quorum blaming and the other having only one blaming.
+
+TEST $CLI volume heal $V0 disable
+TEST `echo "hello" >> $M0/dir/file`
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST setfattr -n user.metadata -v 1 $M0/dir/file
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir/file trusted.afr.$V0-client-1 metadata
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir/file trusted.afr.$V0-client-1 metadata
+
+# B1 must blame B0 and B2
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+
+# B0 must blame B2
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
+
+# Modify the metadata directly on the bricks B1 & B2.
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
+
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
+
+###############################################################################
+
+cleanup
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 595bed4..5157e7d 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1590,7 +1590,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION) {
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) {
         min_participants = priv->child_count;
     } else {
         min_participants = AFR_SH_MIN_PARTICIPANTS;
@@ -1656,7 +1656,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION)
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
         afr_selfheal_post_op_failure_accounting(priv, accused, sources,
                                                 locked_on);
 
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index ba43341..ecfa791 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -398,7 +398,7 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
     ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
                                data_lock);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             ret = -ENOTCONN;
             goto unlock;
         }
-- 
1.8.3.1