|
|
190130 |
From a75bb15fbe64f14580c44b8a33314c8bbeffdede Mon Sep 17 00:00:00 2001
|
|
|
190130 |
From: Ravishankar N <ravishankar@redhat.com>
|
|
|
190130 |
Date: Thu, 4 Jun 2020 18:54:46 +0530
|
|
|
190130 |
Subject: [PATCH 406/449] afr: support split-brain CLI for replica 3
|
|
|
190130 |
|
|
|
190130 |
Patch in upstream master: https://review.gluster.org/#/c/glusterfs/+/23502/
|
|
|
190130 |
|
|
|
190130 |
Ever since we added quorum checks for lookups in afr via commit
|
|
|
190130 |
bd44d59741bb8c0f5d7a62c5b1094179dd0ce8a4, the split-brain resolution
|
|
|
190130 |
commands would not work for replica 3 because there would be no
|
|
|
190130 |
readables for the lookup fop.
|
|
|
190130 |
|
|
|
190130 |
The argument was that split-brains do not occur in replica 3 but we do
|
|
|
190130 |
see (data/metadata) split-brain cases once in a while which indicate that there are
|
|
|
190130 |
a few bugs/corner cases yet to be discovered and fixed.
|
|
|
190130 |
|
|
|
190130 |
Fortunately, commit 8016d51a3bbd410b0b927ed66be50a09574b7982 added
|
|
|
190130 |
GF_CLIENT_PID_GLFS_HEAL as the pid for all fops made by glfsheal. If we
|
|
|
190130 |
leverage this and allow lookups in afr when pid is GF_CLIENT_PID_GLFS_HEAL,
|
|
|
190130 |
split-brain resolution commands will work for replica 3 volumes too.
|
|
|
190130 |
|
|
|
190130 |
Likewise, the check is added in shard_lookup as well to permit resolving
|
|
|
190130 |
split-brains by specifying "/.shard/shard-file.xx" as the file name
|
|
|
190130 |
(which previously used to fail with EPERM).
|
|
|
190130 |
|
|
|
190130 |
BUG: 1759875
|
|
|
190130 |
Change-Id: I203735b909c7d30fc4faaf3ecd4f5b6b379ab266
|
|
|
190130 |
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
|
|
|
190130 |
Reviewed-on: https://code.engineering.redhat.com/gerrit/202375
|
|
|
190130 |
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
|
|
190130 |
---
|
|
|
190130 |
.../replicate/bug-1756938-replica-3-sbrain-cli.t | 111 +++++++++++++++++++++
|
|
|
190130 |
xlators/cluster/afr/src/afr-common.c | 3 +-
|
|
|
190130 |
xlators/features/shard/src/shard.c | 3 +-
|
|
|
190130 |
3 files changed, 115 insertions(+), 2 deletions(-)
|
|
|
190130 |
create mode 100644 tests/bugs/replicate/bug-1756938-replica-3-sbrain-cli.t
|
|
|
190130 |
|
|
|
190130 |
diff --git a/tests/bugs/replicate/bug-1756938-replica-3-sbrain-cli.t b/tests/bugs/replicate/bug-1756938-replica-3-sbrain-cli.t
|
|
|
190130 |
new file mode 100644
|
|
|
190130 |
index 0000000..c1bdf34
|
|
|
190130 |
--- /dev/null
|
|
|
190130 |
+++ b/tests/bugs/replicate/bug-1756938-replica-3-sbrain-cli.t
|
|
|
190130 |
@@ -0,0 +1,111 @@
|
|
|
190130 |
+#!/bin/bash
|
|
|
190130 |
+
|
|
|
190130 |
+. $(dirname $0)/../../include.rc
|
|
|
190130 |
+. $(dirname $0)/../../volume.rc
|
|
|
190130 |
+. $(dirname $0)/../../afr.rc
|
|
|
190130 |
+
|
|
|
190130 |
+cleanup;
|
|
|
190130 |
+
|
|
|
190130 |
+TEST glusterd;
|
|
|
190130 |
+TEST pidof glusterd;
|
|
|
190130 |
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
|
|
|
190130 |
+TEST $CLI volume set $V0 features.shard enable
|
|
|
190130 |
+TEST $CLI volume set $V0 features.shard-block-size 4MB
|
|
|
190130 |
+
|
|
|
190130 |
+TEST $CLI volume start $V0
|
|
|
190130 |
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
|
|
|
190130 |
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
|
|
|
190130 |
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
|
|
|
190130 |
+TEST glusterfs --volfile-server=$H0 --volfile-id=/$V0 $M0
|
|
|
190130 |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
|
|
|
190130 |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
|
|
|
190130 |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2
|
|
|
190130 |
+
|
|
|
190130 |
+#Create split-brain by setting afr xattrs/gfids manually.
|
|
|
190130 |
+#file1 is non-sharded and will be in data split-brain.
|
|
|
190130 |
+#file2 will have one shard which will be in data split-brain.
|
|
|
190130 |
+#file3 will have one shard which will be in gfid split-brain.
|
|
|
190130 |
+#file4 will have one shard which will be in data & metadata split-brain.
|
|
|
190130 |
+TEST dd if=/dev/zero of=$M0/file1 bs=1024 count=1024 oflag=direct
|
|
|
190130 |
+TEST dd if=/dev/zero of=$M0/file2 bs=1M count=6 oflag=direct
|
|
|
190130 |
+TEST dd if=/dev/zero of=$M0/file3 bs=1M count=6 oflag=direct
|
|
|
190130 |
+TEST dd if=/dev/zero of=$M0/file4 bs=1M count=6 oflag=direct
|
|
|
190130 |
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
|
|
|
190130 |
+
|
|
|
190130 |
+#-------------------------------------------------------------------------------
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000000000000 $B0/${V0}0/file1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000000 $B0/${V0}0/file1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000000 $B0/${V0}1/file1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000000 $B0/${V0}1/file1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000000 $B0/${V0}2/file1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000000000000 $B0/${V0}2/file1
|
|
|
190130 |
+
|
|
|
190130 |
+#-------------------------------------------------------------------------------
|
|
|
190130 |
+gfid_f2=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}0/file2))
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000000000000 $B0/${V0}0/.shard/$gfid_f2.1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000000 $B0/${V0}0/.shard/$gfid_f2.1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000000 $B0/${V0}1/.shard/$gfid_f2.1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000000 $B0/${V0}1/.shard/$gfid_f2.1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000000 $B0/${V0}2/.shard/$gfid_f2.1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000000000000 $B0/${V0}2/.shard/$gfid_f2.1
|
|
|
190130 |
+
|
|
|
190130 |
+#-------------------------------------------------------------------------------
|
|
|
190130 |
+TESTS_EXPECTED_IN_LOOP=5
|
|
|
190130 |
+function assign_new_gfid {
|
|
|
190130 |
+ brickpath=$1
|
|
|
190130 |
+ filename=$2
|
|
|
190130 |
+ gfid=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $brickpath/$filename))
|
|
|
190130 |
+ gfid_shard=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $brickpath/.shard/$gfid.1))
|
|
|
190130 |
+
|
|
|
190130 |
+ TEST rm $brickpath/.glusterfs/${gfid_shard:0:2}/${gfid_shard:2:2}/$gfid_shard
|
|
|
190130 |
+ TEST setfattr -x trusted.gfid $brickpath/.shard/$gfid.1
|
|
|
190130 |
+ new_gfid=$(get_random_gfid)
|
|
|
190130 |
+ new_gfid_str=$(gf_gfid_xattr_to_str $new_gfid)
|
|
|
190130 |
+ TEST setfattr -n trusted.gfid -v $new_gfid $brickpath/.shard/$gfid.1
|
|
|
190130 |
+ TEST mkdir -p $brickpath/.glusterfs/${new_gfid_str:0:2}/${new_gfid_str:2:2}
|
|
|
190130 |
+ TEST ln $brickpath/.shard/$gfid.1 $brickpath/.glusterfs/${new_gfid_str:0:2}/${new_gfid_str:2:2}/$new_gfid_str
|
|
|
190130 |
+}
|
|
|
190130 |
+assign_new_gfid $B0/$V0"1" file3
|
|
|
190130 |
+assign_new_gfid $B0/$V0"2" file3
|
|
|
190130 |
+
|
|
|
190130 |
+#-------------------------------------------------------------------------------
|
|
|
190130 |
+gfid_f4=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}0/file4))
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000100000000 $B0/${V0}0/.shard/$gfid_f4.1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000100000000 $B0/${V0}0/.shard/$gfid_f4.1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000100000000 $B0/${V0}1/.shard/$gfid_f4.1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000100000000 $B0/${V0}1/.shard/$gfid_f4.1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000100000000 $B0/${V0}2/.shard/$gfid_f4.1
|
|
|
190130 |
+TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000100000000 $B0/${V0}2/.shard/$gfid_f4.1
|
|
|
190130 |
+
|
|
|
190130 |
+#-------------------------------------------------------------------------------
|
|
|
190130 |
+#Add entry to xattrop dir on first brick and check for split-brain.
|
|
|
190130 |
+xattrop_dir0=$(afr_get_index_path $B0/$V0"0")
|
|
|
190130 |
+base_entry_b0=`ls $xattrop_dir0`
|
|
|
190130 |
+
|
|
|
190130 |
+gfid_f1=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/file1))
|
|
|
190130 |
+TEST ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_f1
|
|
|
190130 |
+
|
|
|
190130 |
+gfid_f2_shard1=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/.shard/$gfid_f2.1))
|
|
|
190130 |
+TEST ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_f2_shard1
|
|
|
190130 |
+
|
|
|
190130 |
+gfid_f3=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}0/file3))
|
|
|
190130 |
+gfid_f3_shard1=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/.shard/$gfid_f3.1))
|
|
|
190130 |
+TEST ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_f3_shard1
|
|
|
190130 |
+
|
|
|
190130 |
+gfid_f4_shard1=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/.shard/$gfid_f4.1))
|
|
|
190130 |
+TEST ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_f4_shard1
|
|
|
190130 |
+
|
|
|
190130 |
+#-------------------------------------------------------------------------------
|
|
|
190130 |
+#gfid split-brain won't show up in split-brain count.
|
|
|
190130 |
+EXPECT "3" afr_get_split_brain_count $V0
|
|
|
190130 |
+EXPECT_NOT "^0$" get_pending_heal_count $V0
|
|
|
190130 |
+
|
|
|
190130 |
+#Resolve split-brains
|
|
|
190130 |
+TEST $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}1 /file1
|
|
|
190130 |
+GFIDSTR="gfid:$gfid_f2_shard1"
|
|
|
190130 |
+TEST $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}1 $GFIDSTR
|
|
|
190130 |
+TEST $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}1 /.shard/$gfid_f3.1
|
|
|
190130 |
+TEST $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}1 /.shard/$gfid_f4.1
|
|
|
190130 |
+TEST $CLI volume heal $V0
|
|
|
190130 |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
190130 |
+cleanup;
|
|
|
190130 |
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
|
|
|
190130 |
index eef7fd2..32127c6 100644
|
|
|
190130 |
--- a/xlators/cluster/afr/src/afr-common.c
|
|
|
190130 |
+++ b/xlators/cluster/afr/src/afr-common.c
|
|
|
190130 |
@@ -2250,7 +2250,8 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this,
|
|
|
190130 |
if ((spb_choice >= 0) &&
|
|
|
190130 |
(AFR_COUNT(success_replies, child_count) == child_count)) {
|
|
|
190130 |
*read_subvol = spb_choice;
|
|
|
190130 |
- } else if (!priv->quorum_count) {
|
|
|
190130 |
+ } else if (!priv->quorum_count ||
|
|
|
190130 |
+ frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) {
|
|
|
190130 |
*read_subvol = afr_first_up_child(frame, this);
|
|
|
190130 |
} else if (priv->quorum_count &&
|
|
|
190130 |
afr_has_quorum(data_readable, this, NULL)) {
|
|
|
190130 |
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
|
|
|
190130 |
index 2e2ef5d..16d557b 100644
|
|
|
190130 |
--- a/xlators/features/shard/src/shard.c
|
|
|
190130 |
+++ b/xlators/features/shard/src/shard.c
|
|
|
190130 |
@@ -1472,7 +1472,8 @@ int shard_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc,
|
|
|
190130 |
shard_local_t *local = NULL;
|
|
|
190130 |
|
|
|
190130 |
this->itable = loc->inode->table;
|
|
|
190130 |
- if (frame->root->pid != GF_CLIENT_PID_GSYNCD) {
|
|
|
190130 |
+ if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) &&
|
|
|
190130 |
+ (frame->root->pid != GF_CLIENT_PID_GLFS_HEAL)) {
|
|
|
190130 |
SHARD_ENTRY_FOP_CHECK(loc, op_errno, err);
|
|
|
190130 |
}
|
|
|
190130 |
|
|
|
190130 |
--
|
|
|
190130 |
1.8.3.1
|
|
|
190130 |
|