17b94a
From 307074330db6e9f14941dfbabbe6f299cf841533 Mon Sep 17 00:00:00 2001
17b94a
From: karthik-us <ksubrahm@redhat.com>
17b94a
Date: Mon, 10 Jun 2019 23:58:16 +0530
17b94a
Subject: [PATCH 178/178] Cluster/afr: Don't treat all bricks having metadata
17b94a
 pending as split-brain
17b94a
17b94a
Backport of: https://review.gluster.org/#/c/glusterfs/+/22831/
17b94a
17b94a
Problem:
17b94a
We currently don't have a roll-back/undoing of post-ops if quorum is not met.
17b94a
Though the FOP is still unwound with failure, the xattrs remain on the disk.
17b94a
Due to these partial post-ops and partial heals (healing only when 2 bricks
17b94a
are up), we can end up in metadata split-brain purely from the afr xattrs
17b94a
point of view, i.e. each brick is blamed by at least one of the others for
17b94a
metadata. These scenarios are hit when there is frequent connect/disconnect
17b94a
of the client/shd to the bricks.
17b94a
17b94a
Fix:
17b94a
Pick a source based on the xattr values. If 2 bricks blame one, the blamed
17b94a
one must be treated as sink. If there is no majority, all are sources. Once
17b94a
we pick a source, self-heal will then do the heal instead of erroring out
17b94a
due to split-brain.
17b94a
This patch also requires all the bricks to be up in order to perform
17b94a
metadata heal to avoid any metadata loss.
17b94a
17b94a
Removed the test case tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
17b94a
as it was doing metadata heal even when only 2 of 3 bricks were up.
17b94a
17b94a
Change-Id: I02064ecb7d68d498f75a353af64f75249a633508
17b94a
fixes: bz#1715438
17b94a
Signed-off-by: karthik-us <ksubrahm@redhat.com>
17b94a
Reviewed-on: https://code.engineering.redhat.com/gerrit/172935
17b94a
Tested-by: RHGS Build Bot <nigelb@redhat.com>
17b94a
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
17b94a
---
17b94a
 .../bug-1468279-source-not-blaming-sinks.t         |  64 ----------
17b94a
 .../bug-1717819-metadata-split-brain-detection.t   | 130 +++++++++++++++++++++
17b94a
 xlators/cluster/afr/src/afr-self-heal-common.c     |   4 +-
17b94a
 xlators/cluster/afr/src/afr-self-heal-metadata.c   |   2 +-
17b94a
 4 files changed, 133 insertions(+), 67 deletions(-)
17b94a
 delete mode 100644 tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
17b94a
 create mode 100644 tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
17b94a
17b94a
diff --git a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t b/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
17b94a
deleted file mode 100644
17b94a
index 054a4ad..0000000
17b94a
--- a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
17b94a
+++ /dev/null
17b94a
@@ -1,64 +0,0 @@
17b94a
-#!/bin/bash
17b94a
-. $(dirname $0)/../../include.rc
17b94a
-. $(dirname $0)/../../volume.rc
17b94a
-cleanup;
17b94a
-
17b94a
-TEST glusterd
17b94a
-TEST pidof glusterd
17b94a
-TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
17b94a
-TEST $CLI volume start $V0
17b94a
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
17b94a
-TEST $CLI volume set $V0 cluster.metadata-self-heal off
17b94a
-TEST $GFS --volfile-id=$V0 --volfile-server=$H0 --attribute-timeout=0 --entry-timeout=0 $M0;
17b94a
-TEST touch $M0/file
17b94a
-
17b94a
-# Kill B1, create a pending metadata heal.
17b94a
-TEST kill_brick $V0 $H0 $B0/${V0}0
17b94a
-TEST setfattr -n user.xattr -v value1 $M0/file
17b94a
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
17b94a
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
17b94a
-
17b94a
-# Kill B2, heal from B3 to B1.
17b94a
-TEST $CLI volume start $V0 force
17b94a
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
17b94a
-TEST kill_brick $V0 $H0 $B0/${V0}1
17b94a
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
17b94a
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
17b94a
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
17b94a
-$CLI volume heal $V0
17b94a
-EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-0 "metadata"
17b94a
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
17b94a
-
17b94a
-# Create another pending metadata heal.
17b94a
-TEST setfattr -n user.xattr -v value2 $M0/file
17b94a
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
17b94a
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
17b94a
-
17b94a
-# Kill B1, heal from B3 to B2
17b94a
-TEST $CLI volume start $V0 force
17b94a
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
17b94a
-TEST kill_brick $V0 $H0 $B0/${V0}0
17b94a
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
17b94a
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
17b94a
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
17b94a
-$CLI volume heal $V0
17b94a
-EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-1 "metadata"
17b94a
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
17b94a
-
17b94a
-# ALL bricks up again.
17b94a
-TEST $CLI volume start $V0 force
17b94a
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
17b94a
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
17b94a
-# B1 and B2 blame each other, B3 doesn't blame anyone.
17b94a
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
17b94a
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
17b94a
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
17b94a
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
17b94a
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
17b94a
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
17b94a
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
17b94a
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
17b94a
-TEST $CLI volume heal $V0
17b94a
-EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
17b94a
-
17b94a
-cleanup;
17b94a
diff --git a/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
17b94a
new file mode 100644
17b94a
index 0000000..94b8bf3
17b94a
--- /dev/null
17b94a
+++ b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
17b94a
@@ -0,0 +1,130 @@
17b94a
+#!/bin/bash
17b94a
+
17b94a
+. $(dirname $0)/../../include.rc
17b94a
+. $(dirname $0)/../../volume.rc
17b94a
+. $(dirname $0)/../../afr.rc
17b94a
+
17b94a
+cleanup;
17b94a
+
17b94a
+## Start and create a volume
17b94a
+TEST glusterd;
17b94a
+TEST pidof glusterd;
17b94a
+TEST $CLI volume info;
17b94a
+
17b94a
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2};
17b94a
+TEST $CLI volume start $V0;
17b94a
+EXPECT 'Started' volinfo_field $V0 'Status';
17b94a
+TEST $CLI volume heal $V0 disable
17b94a
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0
17b94a
+
17b94a
+###############################################################################
17b94a
+# Case of 2 bricks blaming the third and the third blaming the other two.
17b94a
+
17b94a
+TEST mkdir $M0/dir
17b94a
+
17b94a
+# B0 and B2 must blame B1
17b94a
+TEST kill_brick $V0 $H0 $B0/$V0"1"
17b94a
+TEST setfattr -n user.metadata -v 1 $M0/dir
17b94a
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir trusted.afr.$V0-client-1 metadata
17b94a
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir trusted.afr.$V0-client-1 metadata
17b94a
+CLIENT_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $M0/dir)
17b94a
+
17b94a
+# B1 must blame B0 and B2
17b94a
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir
17b94a
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir
17b94a
+
17b94a
+# Launch heal
17b94a
+TEST $CLI volume start $V0 force
17b94a
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
17b94a
+TEST $CLI volume heal $V0 enable
17b94a
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
17b94a
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
17b94a
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
17b94a
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
17b94a
+TEST $CLI volume heal $V0
17b94a
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
17b94a
+
17b94a
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir)
17b94a
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir)
17b94a
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir)
17b94a
+
17b94a
+TEST [ "$CLIENT_XATTR" == "$B0_XATTR" ]
17b94a
+TEST [ "$CLIENT_XATTR" == "$B1_XATTR" ]
17b94a
+TEST [ "$CLIENT_XATTR" == "$B2_XATTR" ]
17b94a
+TEST setfattr -x user.metadata $M0/dir
17b94a
+
17b94a
+###############################################################################
17b94a
+# Case of each brick blaming the next one in a cyclic manner
17b94a
+
17b94a
+TEST $CLI volume heal $V0 disable
17b94a
+TEST `echo "hello" >> $M0/dir/file`
17b94a
+# Mark cyclic xattrs and modify metadata directly on the bricks.
17b94a
+setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
17b94a
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
17b94a
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"2"/dir/file
17b94a
+
17b94a
+setfattr -n user.metadata -v 1 $B0/$V0"0"/dir/file
17b94a
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
17b94a
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
17b94a
+
17b94a
+# Add entry to xattrop dir to trigger index heal.
17b94a
+xattrop_dir0=$(afr_get_index_path $B0/$V0"0")
17b94a
+base_entry_b0=`ls $xattrop_dir0`
17b94a
+gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/file))
17b94a
+ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str
17b94a
+EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0
17b94a
+
17b94a
+# Launch heal
17b94a
+TEST $CLI volume heal $V0 enable
17b94a
+TEST $CLI volume heal $V0
17b94a
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
17b94a
+
17b94a
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
17b94a
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
17b94a
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
17b94a
+
17b94a
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
17b94a
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
17b94a
+TEST rm -f $M0/dir/file
17b94a
+
17b94a
+###############################################################################
17b94a
+# Case of 2 bricks having quorum blaming and the other having only one blaming.
17b94a
+
17b94a
+TEST $CLI volume heal $V0 disable
17b94a
+TEST `echo "hello" >> $M0/dir/file`
17b94a
+# B0 and B2 must blame B1
17b94a
+TEST kill_brick $V0 $H0 $B0/$V0"1"
17b94a
+TEST setfattr -n user.metadata -v 1 $M0/dir/file
17b94a
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir/file trusted.afr.$V0-client-1 metadata
17b94a
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir/file trusted.afr.$V0-client-1 metadata
17b94a
+
17b94a
+# B1 must blame B0 and B2
17b94a
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
17b94a
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
17b94a
+
17b94a
+# B0 must blame B2
17b94a
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
17b94a
+
17b94a
+# Modify the metadata directly on the bricks B1 & B2.
17b94a
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
17b94a
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
17b94a
+
17b94a
+# Launch heal
17b94a
+TEST $CLI volume start $V0 force
17b94a
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
17b94a
+TEST $CLI volume heal $V0 enable
17b94a
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
17b94a
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
17b94a
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
17b94a
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
17b94a
+
17b94a
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
17b94a
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
17b94a
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
17b94a
+
17b94a
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
17b94a
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
17b94a
+
17b94a
+###############################################################################
17b94a
+
17b94a
+cleanup
17b94a
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
17b94a
index 595bed4..5157e7d 100644
17b94a
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
17b94a
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
17b94a
@@ -1590,7 +1590,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
17b94a
         }
17b94a
     }
17b94a
 
17b94a
-    if (type == AFR_DATA_TRANSACTION) {
17b94a
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) {
17b94a
         min_participants = priv->child_count;
17b94a
     } else {
17b94a
         min_participants = AFR_SH_MIN_PARTICIPANTS;
17b94a
@@ -1656,7 +1656,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
17b94a
         }
17b94a
     }
17b94a
 
17b94a
-    if (type == AFR_DATA_TRANSACTION)
17b94a
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
17b94a
         afr_selfheal_post_op_failure_accounting(priv, accused, sources,
17b94a
                                                 locked_on);
17b94a
 
17b94a
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
17b94a
index ba43341..ecfa791 100644
17b94a
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
17b94a
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
17b94a
@@ -398,7 +398,7 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
17b94a
     ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
17b94a
                                data_lock);
17b94a
     {
17b94a
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
17b94a
+        if (ret < priv->child_count) {
17b94a
             ret = -ENOTCONN;
17b94a
             goto unlock;
17b94a
         }
17b94a
-- 
17b94a
1.8.3.1
17b94a