From 307074330db6e9f14941dfbabbe6f299cf841533 Mon Sep 17 00:00:00 2001
From: karthik-us <ksubrahm@redhat.com>
Date: Mon, 10 Jun 2019 23:58:16 +0530
Subject: [PATCH 178/178] Cluster/afr: Don't treat all bricks having metadata
 pending as split-brain

Backport of: https://review.gluster.org/#/c/glusterfs/+/22831/

Problem:
We currently don't have a roll-back/undoing of post-ops if quorum is not met.
Though the FOP is still unwound with failure, the xattrs remain on the disk.
Due to these partial post-ops and partial heals (healing only when 2 bricks
are up), we can end up in metadata split-brain purely from the afr xattrs
point of view i.e. each brick is blamed by at least one of the others for
metadata. These scenarios are hit when there is frequent connect/disconnect
of the client/shd to the bricks.

Fix:
Pick a source based on the xattr values. If 2 bricks blame one, the blamed
one must be treated as sink. If there is no majority, all are sources. Once
we pick a source, self-heal will then do the heal instead of erroring out
due to split-brain.
This patch also adds restriction of all the bricks to be up to perform
metadata heal to avoid any metadata loss.

Removed the test case tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
as it was doing metadata heal even when only 2 of 3 bricks were up.

Change-Id: I02064ecb7d68d498f75a353af64f75249a633508
fixes: bz#1715438
Signed-off-by: karthik-us <ksubrahm@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172935
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 .../bug-1468279-source-not-blaming-sinks.t         |  64 ----------
 .../bug-1717819-metadata-split-brain-detection.t   | 130 +++++++++++++++++++++
 xlators/cluster/afr/src/afr-self-heal-common.c     |   4 +-
 xlators/cluster/afr/src/afr-self-heal-metadata.c   |   2 +-
 4 files changed, 133 insertions(+), 67 deletions(-)
 delete mode 100644 tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
 create mode 100644 tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t

diff --git a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t b/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
deleted file mode 100644
index 054a4ad..0000000
--- a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-. $(dirname $0)/../../include.rc
-. $(dirname $0)/../../volume.rc
-cleanup;
-
-TEST glusterd
-TEST pidof glusterd
-TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
-TEST $CLI volume start $V0
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-TEST $CLI volume set $V0 cluster.metadata-self-heal off
-TEST $GFS --volfile-id=$V0 --volfile-server=$H0 --attribute-timeout=0 --entry-timeout=0 $M0;
-TEST touch $M0/file
-
-# Kill B1, create a pending metadata heal.
-TEST kill_brick $V0 $H0 $B0/${V0}0
-TEST setfattr -n user.xattr -v value1 $M0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
-
-# Kill B2, heal from B3 to B1.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
-TEST kill_brick $V0 $H0 $B0/${V0}1
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
-$CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-0 "metadata"
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-
-# Create another pending metadata heal.
-TEST setfattr -n user.xattr -v value2 $M0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
-
-# Kill B1, heal from B3 to B2
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
-TEST kill_brick $V0 $H0 $B0/${V0}0
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
-$CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-1 "metadata"
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-
-# ALL bricks up again.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
-# B1 and B2 blame each other, B3 doesn't blame anyone.
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-TEST $CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
-
-cleanup;
diff --git a/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
new file mode 100644
index 0000000..94b8bf3
--- /dev/null
+++ b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+
+cleanup;
+
+## Start and create a volume
+TEST glusterd;
+TEST pidof glusterd;
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2};
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+TEST $CLI volume heal $V0 disable
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0
+
+###############################################################################
+# Case of 2 bricks blaming the third and the third blaming the other two.
+
+TEST mkdir $M0/dir
+
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST setfattr -n user.metadata -v 1 $M0/dir
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir trusted.afr.$V0-client-1 metadata
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir trusted.afr.$V0-client-1 metadata
+CLIENT_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $M0/dir)
+
+# B1 must blame B0 and B2
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir)
+
+TEST [ "$CLIENT_XATTR" == "$B0_XATTR" ]
+TEST [ "$CLIENT_XATTR" == "$B1_XATTR" ]
+TEST [ "$CLIENT_XATTR" == "$B2_XATTR" ]
+TEST setfattr -x user.metadata $M0/dir
+
+###############################################################################
+# Case of each brick blaming the next one in a cyclic manner
+
+TEST $CLI volume heal $V0 disable
+TEST `echo "hello" >> $M0/dir/file`
+# Mark cyclic xattrs and modify metadata directly on the bricks.
+setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"2"/dir/file
+
+setfattr -n user.metadata -v 1 $B0/$V0"0"/dir/file
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
+
+# Add entry to xattrop dir to trigger index heal.
+xattrop_dir0=$(afr_get_index_path $B0/$V0"0")
+base_entry_b0=`ls $xattrop_dir0`
+gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/file))
+ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str
+EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0
+
+# Launch heal
+TEST $CLI volume heal $V0 enable
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
+
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
+TEST rm -f $M0/dir/file
+
+###############################################################################
+# Case of 2 bricks having quorum blaming and the other having only one blaming.
+
+TEST $CLI volume heal $V0 disable
+TEST `echo "hello" >> $M0/dir/file`
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST setfattr -n user.metadata -v 1 $M0/dir/file
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir/file trusted.afr.$V0-client-1 metadata
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir/file trusted.afr.$V0-client-1 metadata
+
+# B1 must blame B0 and B2
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+
+# B0 must blame B2
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
+
+# Modify the metadata directly on the bricks B1 & B2.
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
+
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
+
+###############################################################################
+
+cleanup
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 595bed4..5157e7d 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1590,7 +1590,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION) {
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) {
         min_participants = priv->child_count;
     } else {
         min_participants = AFR_SH_MIN_PARTICIPANTS;
@@ -1656,7 +1656,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION)
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
         afr_selfheal_post_op_failure_accounting(priv, accused, sources,
                                                 locked_on);
 
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index ba43341..ecfa791 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -398,7 +398,7 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
     ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
                                data_lock);
    {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             ret = -ENOTCONN;
             goto unlock;
         }
-- 
1.8.3.1