d1681e
From 6229320bc25ff24bb76f990c8e5411b6f1aa476c Mon Sep 17 00:00:00 2001
d1681e
From: Ravishankar N <ravishankar@redhat.com>
d1681e
Date: Sun, 28 Jan 2018 13:50:47 +0530
d1681e
Subject: [PATCH 155/180] afr: don't treat all cases of all bricks being blamed as
d1681e
 split-brain
d1681e
d1681e
Backport of https://review.gluster.org/#/c/19349/
d1681e
d1681e
Problem:
d1681e
We currently don't have a roll-back/undoing of post-ops if quorum is not
d1681e
met. Though the FOP is still unwound with failure, the xattrs remain on
d1681e
the disk.  Due to these partial post-ops and partial heals (healing only when
d1681e
2 bricks are up), we can end up in split-brain purely from the afr
d1681e
xattrs point of view, i.e. each brick is blamed by at least one of the
d1681e
others. These scenarios are hit when there is frequent
d1681e
connect/disconnect of the client/shd to the bricks while I/O or heal
d1681e
are in progress.
d1681e
d1681e
Fix:
d1681e
Instead of undoing the post-op, pick a source based on the xattr values.
d1681e
If 2 bricks blame one, the blamed one must be treated as sink.
d1681e
If there is no majority, all are sources. Once we pick a source,
d1681e
self-heal will then do the heal instead of erroring out due to
d1681e
split-brain.
d1681e
d1681e
Change-Id: I3d0224b883eb0945785ade0e9697a1c828aec0ae
d1681e
BUG: 1384983
d1681e
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/129245
d1681e
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d1681e
---
d1681e
 tests/basic/afr/arbiter-add-brick.t                | 16 ++++
d1681e
 .../replicate/bug-1539358-split-brain-detection.t  | 89 ++++++++++++++++++++++
d1681e
 tests/bugs/replicate/bug-802417.t                  | 12 +++
d1681e
 xlators/cluster/afr/src/afr-self-heal-common.c     | 51 +++++++++++--
d1681e
 xlators/cluster/afr/src/afr-self-heal-data.c       |  6 +-
d1681e
 5 files changed, 165 insertions(+), 9 deletions(-)
d1681e
 create mode 100755 tests/bugs/replicate/bug-1539358-split-brain-detection.t
d1681e
d1681e
diff --git a/tests/basic/afr/arbiter-add-brick.t b/tests/basic/afr/arbiter-add-brick.t
d1681e
index fe919de..77b93d9 100644
d1681e
--- a/tests/basic/afr/arbiter-add-brick.t
d1681e
+++ b/tests/basic/afr/arbiter-add-brick.t
d1681e
@@ -12,6 +12,8 @@ TEST $CLI volume set $V0 performance.stat-prefetch off
d1681e
 TEST $CLI volume start $V0
d1681e
 TEST $CLI volume set $V0 self-heal-daemon off
d1681e
 TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
d1681e
 TEST mkdir  $M0/dir1
d1681e
 TEST dd if=/dev/urandom of=$M0/file1 bs=1024 count=1
d1681e
 
d1681e
@@ -24,6 +26,7 @@ TEST dd if=/dev/urandom of=$M0/file1 bs=1024 count=1024
d1681e
 #convert replica 2 to arbiter volume
d1681e
 TEST $CLI volume start $V0 force
d1681e
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
d1681e
 
d1681e
 #syntax check for add-brick.
d1681e
 TEST ! $CLI volume add-brick $V0 replica 2 arbiter 1 $H0:$B0/${V0}2
d1681e
@@ -31,6 +34,19 @@ TEST ! $CLI volume add-brick $V0 replica 3 arbiter 2 $H0:$B0/${V0}2
d1681e
 
d1681e
 TEST $CLI volume add-brick $V0 replica 3 arbiter 1 $H0:$B0/${V0}2
d1681e
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
d1681e
+
d1681e
+#Trigger name heals from client. If we just rely on index heal, the first index
d1681e
+#crawl on B0 fails for /, dir2 and /file either due to lock collision or files
d1681e
+#not being present on the other 2 bricks yet. It is getting healed only in the
d1681e
+#next crawl after priv->shd.timeout (600 seconds) or by manually launching
d1681e
+#index heal again.
d1681e
+TEST $CLI volume set $V0 data-self-heal off
d1681e
+TEST $CLI volume set $V0 metadata-self-heal off
d1681e
+TEST $CLI volume set $V0 entry-self-heal off
d1681e
+TEST stat $M0/dir1
d1681e
+TEST stat $M0/dir2
d1681e
+TEST stat $M0/file1
d1681e
 
d1681e
 #Heal files
d1681e
 TEST $CLI volume set $V0 self-heal-daemon on
d1681e
diff --git a/tests/bugs/replicate/bug-1539358-split-brain-detection.t b/tests/bugs/replicate/bug-1539358-split-brain-detection.t
d1681e
new file mode 100755
d1681e
index 0000000..7b71a7a
d1681e
--- /dev/null
d1681e
+++ b/tests/bugs/replicate/bug-1539358-split-brain-detection.t
d1681e
@@ -0,0 +1,89 @@
d1681e
+#!/bin/bash
d1681e
+
d1681e
+. $(dirname $0)/../../include.rc
d1681e
+. $(dirname $0)/../../volume.rc
d1681e
+. $(dirname $0)/../../afr.rc
d1681e
+
d1681e
+cleanup;
d1681e
+
d1681e
+## Start and create a volume
d1681e
+TEST glusterd;
d1681e
+TEST pidof glusterd;
d1681e
+TEST $CLI volume info;
d1681e
+
d1681e
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2};
d1681e
+TEST $CLI volume start $V0;
d1681e
+EXPECT 'Started' volinfo_field $V0 'Status';
d1681e
+TEST $CLI volume set $V0 self-heal-daemon off
d1681e
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
d1681e
+
d1681e
+###############################################################################yy
d1681e
+# Case of 2 bricks blaming the third and the third blaming the other two.
d1681e
+
d1681e
+TEST `echo "hello" >> $M0/file`
d1681e
+
d1681e
+# B0 and B2 must blame B1
d1681e
+TEST kill_brick $V0 $H0 $B0/$V0"1"
d1681e
+TEST `echo "append" >> $M0/file`
d1681e
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/file trusted.afr.$V0-client-1 data
d1681e
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-1 data
d1681e
+CLIENT_MD5=$(md5sum $M0/file | cut -d\  -f1)
d1681e
+
d1681e
+# B1 must blame B0 and B2
d1681e
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000000 $B0/$V0"1"/file
d1681e
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000000 $B0/$V0"1"/file
d1681e
+
d1681e
+# Launch heal
d1681e
+TEST $CLI volume start $V0 force
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
d1681e
+TEST $CLI volume set $V0 self-heal-daemon on
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
d1681e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
d1681e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
d1681e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
d1681e
+TEST $CLI volume heal $V0
d1681e
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
d1681e
+B0_MD5=$(md5sum $B0/${V0}0/file | cut -d\  -f1)
d1681e
+B1_MD5=$(md5sum $B0/${V0}1/file | cut -d\  -f1)
d1681e
+B2_MD5=$(md5sum $B0/${V0}2/file | cut -d\  -f1)
d1681e
+TEST [ "$CLIENT_MD5" == "$B0_MD5" ]
d1681e
+TEST [ "$CLIENT_MD5" == "$B1_MD5" ]
d1681e
+TEST [ "$CLIENT_MD5" == "$B2_MD5" ]
d1681e
+
d1681e
+TEST rm $M0/file
d1681e
+
d1681e
+###############################################################################yy
d1681e
+# Case of each brick blaming the next one in a cyclic manner
d1681e
+
d1681e
+TEST `echo "hello" >> $M0/file`
d1681e
+# Mark cyclic xattrs and modify file content directly on the bricks.
d1681e
+TEST $CLI volume set $V0 self-heal-daemon off
d1681e
+setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000000000000 $B0/$V0"0"/file
d1681e
+setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/$V0"0"/file
d1681e
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000000 $B0/$V0"1"/file
d1681e
+setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/$V0"1"/file
d1681e
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000000 $B0/$V0"2"/file
d1681e
+setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/$V0"2"/file
d1681e
+
d1681e
+TEST `echo "ab" >> $B0/$V0"0"/file`
d1681e
+TEST `echo "cdef" >> $B0/$V0"1"/file`
d1681e
+TEST `echo "ghi" >> $B0/$V0"2"/file`
d1681e
+
d1681e
+# Add entry to xattrop dir to trigger index heal.
d1681e
+xattrop_dir0=$(afr_get_index_path $B0/$V0"0")
d1681e
+base_entry_b0=`ls $xattrop_dir0`
d1681e
+gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/file))
d1681e
+ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str
d1681e
+EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0
d1681e
+
d1681e
+# Launch heal
d1681e
+TEST $CLI volume set $V0 self-heal-daemon on
d1681e
+TEST $CLI volume heal $V0
d1681e
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
d1681e
+B0_MD5=$(md5sum $B0/${V0}0/file | cut -d\  -f1)
d1681e
+B1_MD5=$(md5sum $B0/${V0}1/file | cut -d\  -f1)
d1681e
+B2_MD5=$(md5sum $B0/${V0}2/file | cut -d\  -f1)
d1681e
+TEST [ "$B0_MD5" == "$B1_MD5" ]
d1681e
+TEST [ "$B0_MD5" == "$B2_MD5" ]
d1681e
+###############################################################################yy
d1681e
+cleanup
d1681e
diff --git a/tests/bugs/replicate/bug-802417.t b/tests/bugs/replicate/bug-802417.t
d1681e
index c5ba98b..f213439 100755
d1681e
--- a/tests/bugs/replicate/bug-802417.t
d1681e
+++ b/tests/bugs/replicate/bug-802417.t
d1681e
@@ -10,6 +10,18 @@ function write_file()
d1681e
 }
d1681e
 
d1681e
 cleanup;
d1681e
+
d1681e
+#####################################################
d1681e
+# We are currently not triggering data heal unless all bricks of the replica are
d1681e
+# up. We will need to modify this .t once the fix for preventing stale reads
d1681e
+# being served to clients for files in spurious split-brains is done. Spurious
d1681e
+# split-brains here means afr xattrs indicates sbrain but it is actually not.
d1681e
+# Self-heal will heal such files automatically but before the heal completes,
d1681e
+# reads can be served which needs fixing.
d1681e
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
d1681e
+#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000
d1681e
+######################################################
d1681e
+
d1681e
 TEST glusterd
d1681e
 TEST pidof glusterd
d1681e
 TEST $CLI volume info;
d1681e
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
d1681e
index 26d3860..f61b237 100644
d1681e
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
d1681e
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
d1681e
@@ -1455,6 +1455,36 @@ afr_does_witness_exist (xlator_t *this, uint64_t *witness)
d1681e
         return _gf_false;
d1681e
 }
d1681e
 
d1681e
+unsigned int
d1681e
+afr_get_quorum_count (afr_private_t *priv)
d1681e
+{
d1681e
+        if (priv->quorum_count == AFR_QUORUM_AUTO) {
d1681e
+                return priv->child_count/2 + 1;
d1681e
+        } else {
d1681e
+                return priv->quorum_count;
d1681e
+        }
d1681e
+}
d1681e
+
d1681e
+void
d1681e
+afr_selfheal_post_op_failure_accounting (afr_private_t *priv, char *accused,
d1681e
+                                         unsigned char *sources,
d1681e
+                                         unsigned char *locked_on)
d1681e
+{
d1681e
+        int i = 0;
d1681e
+        unsigned int  quorum_count = 0;
d1681e
+
d1681e
+        if (AFR_COUNT (sources, priv->child_count) != 0)
d1681e
+                return;
d1681e
+
d1681e
+        quorum_count = afr_get_quorum_count (priv);
d1681e
+        for (i = 0; i < priv->child_count; i++) {
d1681e
+                if ((accused[i] < quorum_count) && locked_on[i]) {
d1681e
+                        sources[i] = 1;
d1681e
+                }
d1681e
+        }
d1681e
+        return;
d1681e
+}
d1681e
+
d1681e
 /*
d1681e
  * This function determines if a self-heal is required for a given inode,
d1681e
  * and if needed, in what direction.
d1681e
@@ -1490,6 +1520,7 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
d1681e
         char *accused = NULL;/* Accused others without any self-accusal */
d1681e
         char *pending = NULL;/* Have pending operations on others */
d1681e
         char *self_accused = NULL; /* Accused itself */
d1681e
+        int min_participants = -1;
d1681e
 
d1681e
 	priv = this->private;
d1681e
 
d1681e
@@ -1513,8 +1544,13 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
d1681e
                 }
d1681e
         }
d1681e
 
d1681e
+        if (type == AFR_DATA_TRANSACTION) {
d1681e
+                min_participants = priv->child_count;
d1681e
+        } else {
d1681e
+                min_participants = AFR_SH_MIN_PARTICIPANTS;
d1681e
+        }
d1681e
         if (afr_success_count (replies,
d1681e
-                               priv->child_count) < AFR_SH_MIN_PARTICIPANTS) {
d1681e
+                               priv->child_count) < min_participants) {
d1681e
                 /* Treat this just like locks not being acquired */
d1681e
                 return -ENOTCONN;
d1681e
         }
d1681e
@@ -1530,11 +1566,10 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
d1681e
 	for (i = 0; i < priv->child_count; i++) {
d1681e
 		for (j = 0; j < priv->child_count; j++) {
d1681e
                         if (matrix[i][j]) {
d1681e
-                                 if (!self_accused[i])
d1681e
-                                         accused[j] = 1;
d1681e
-
d1681e
-                                 if (i != j)
d1681e
-                                         pending[i] = 1;
d1681e
+                                if (!self_accused[i])
d1681e
+                                        accused[j] += 1;
d1681e
+                                if (i != j)
d1681e
+                                        pending[i] += 1;
d1681e
                          }
d1681e
 		}
d1681e
 	}
d1681e
@@ -1575,6 +1610,10 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
d1681e
                 }
d1681e
         }
d1681e
 
d1681e
+        if (type == AFR_DATA_TRANSACTION)
d1681e
+                afr_selfheal_post_op_failure_accounting (priv, accused,
d1681e
+                                                         sources, locked_on);
d1681e
+
d1681e
          /* If no sources, all locked nodes are sinks - split brain */
d1681e
          if (AFR_COUNT (sources, priv->child_count) == 0) {
d1681e
                 for (i = 0; i < priv->child_count; i++) {
d1681e
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
d1681e
index 8cf43f2..bcd0dec 100644
d1681e
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
d1681e
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
d1681e
@@ -684,7 +684,7 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
d1681e
 	ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0,
d1681e
 				    data_lock);
d1681e
 	{
d1681e
-		if (ret < AFR_SH_MIN_PARTICIPANTS) {
d1681e
+		if (ret < priv->child_count) {
d1681e
                         gf_msg_debug (this->name, 0, "%s: Skipping "
d1681e
                                       "self-heal as only %d number "
d1681e
                                       "of subvolumes "
d1681e
@@ -749,7 +749,7 @@ restore_time:
d1681e
         if (!is_arbiter_the_only_sink) {
d1681e
                 ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name,
d1681e
                                             0, 0, data_lock);
d1681e
-                if (ret < AFR_SH_MIN_PARTICIPANTS) {
d1681e
+                if (ret < priv->child_count) {
d1681e
                         ret = -ENOTCONN;
d1681e
                         did_sh = _gf_false;
d1681e
                         goto skip_undo_pending;
d1681e
@@ -878,7 +878,7 @@ afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
d1681e
 	                                        priv->sh_domain, 0, 0,
d1681e
 				                locked_on);
d1681e
 	{
d1681e
-		if (ret < AFR_SH_MIN_PARTICIPANTS) {
d1681e
+		if (ret < priv->child_count) {
d1681e
                         gf_msg_debug (this->name, 0, "%s: Skipping "
d1681e
                                       "self-heal as only %d number of "
d1681e
                                       "subvolumes could be locked",
d1681e
-- 
d1681e
1.8.3.1
d1681e