|
|
887953 |
From 8a3c0fb64c8798ecf5a3635fe0922e3cfd476817 Mon Sep 17 00:00:00 2001
|
|
|
887953 |
From: Pranith Kumar K <pkarampu@redhat.com>
|
|
|
887953 |
Date: Mon, 27 Aug 2018 12:40:16 +0530
|
|
|
887953 |
Subject: [PATCH 425/444] cluster/afr: Delegate name-heal when possible
|
|
|
887953 |
|
|
|
887953 |
Problem:
|
|
|
887953 |
When name-self-heal is triggered on the mount, it blocks
|
|
|
887953 |
lookup until name-self-heal completes. But that can lead
|
|
|
887953 |
to hangs when lot of clients are accessing a directory which
|
|
|
887953 |
needs name heal and all of them trigger heals waiting
|
|
|
887953 |
for other clients to complete heal.
|
|
|
887953 |
|
|
|
887953 |
Fix:
|
|
|
887953 |
When a name-heal is needed but quorum number of names have the
|
|
|
887953 |
file and pending xattrs exist on the parent, then better to
|
|
|
887953 |
delegate the heal to SHD which will be completed as part of
|
|
|
887953 |
entry-heal of the parent directory. We could also do the same
|
|
|
887953 |
for quorum-number of names not present but we don't have
|
|
|
887953 |
any known use-case where this is a frequent occurrence so
|
|
|
887953 |
not changing that part at the moment. When there is a gfid
|
|
|
887953 |
mismatch or missing gfid it is important to complete the heal
|
|
|
887953 |
so that next rename doesn't assume everything is fine and
|
|
|
887953 |
perform a rename etc
|
|
|
887953 |
|
|
|
887953 |
BUG: 1619357
|
|
|
887953 |
Upstream Patch: https://review.gluster.org/c/glusterfs/+/21087
|
|
|
887953 |
Change-Id: I8b002c85dffc6eb6f2833e742684a233daefeb2c
|
|
|
887953 |
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
|
|
|
887953 |
Reviewed-on: https://code.engineering.redhat.com/gerrit/155029
|
|
|
887953 |
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
|
887953 |
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
|
|
887953 |
---
|
|
|
887953 |
tests/afr.rc | 8 ++
|
|
|
887953 |
tests/basic/afr/name-self-heal.t | 112 +++++++++++++++++++++++++++
|
|
|
887953 |
xlators/cluster/afr/src/afr-common.c | 100 ++++++++++++++++++------
|
|
|
887953 |
xlators/cluster/afr/src/afr-self-heal-name.c | 12 ++-
|
|
|
887953 |
4 files changed, 205 insertions(+), 27 deletions(-)
|
|
|
887953 |
create mode 100644 tests/basic/afr/name-self-heal.t
|
|
|
887953 |
|
|
|
887953 |
diff --git a/tests/afr.rc b/tests/afr.rc
|
|
|
887953 |
index 1fd0310..a1e8a44 100644
|
|
|
887953 |
--- a/tests/afr.rc
|
|
|
887953 |
+++ b/tests/afr.rc
|
|
|
887953 |
@@ -89,3 +89,11 @@ function count_index_entries()
|
|
|
887953 |
{
|
|
|
887953 |
ls $1/.glusterfs/indices/xattrop | wc -l
|
|
|
887953 |
}
|
|
|
887953 |
+
|
|
|
887953 |
+function get_quorum_type()
|
|
|
887953 |
+{
|
|
|
887953 |
+ local m="$1"
|
|
|
887953 |
+ local v="$2"
|
|
|
887953 |
+ local repl_id="$3"
|
|
|
887953 |
+ cat $m/.meta/graphs/active/$v-replicate-$repl_id/private|grep quorum-type|awk '{print $3}'
|
|
|
887953 |
+}
|
|
|
887953 |
diff --git a/tests/basic/afr/name-self-heal.t b/tests/basic/afr/name-self-heal.t
|
|
|
887953 |
new file mode 100644
|
|
|
887953 |
index 0000000..50fc2ec
|
|
|
887953 |
--- /dev/null
|
|
|
887953 |
+++ b/tests/basic/afr/name-self-heal.t
|
|
|
887953 |
@@ -0,0 +1,112 @@
|
|
|
887953 |
+#!/bin/bash
|
|
|
887953 |
+#Self-heal tests
|
|
|
887953 |
+
|
|
|
887953 |
+. $(dirname $0)/../../include.rc
|
|
|
887953 |
+. $(dirname $0)/../../volume.rc
|
|
|
887953 |
+. $(dirname $0)/../../afr.rc
|
|
|
887953 |
+cleanup;
|
|
|
887953 |
+
|
|
|
887953 |
+#Check that when quorum is not enabled name-heal happens correctly
|
|
|
887953 |
+TEST glusterd
|
|
|
887953 |
+TEST pidof glusterd
|
|
|
887953 |
+TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1}
|
|
|
887953 |
+TEST $CLI volume set $V0 performance.stat-prefetch off
|
|
|
887953 |
+TEST $CLI volume start $V0
|
|
|
887953 |
+TEST $CLI volume heal $V0 disable
|
|
|
887953 |
+TEST $CLI volume set $V0 cluster.entry-self-heal off
|
|
|
887953 |
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
|
|
|
887953 |
+
|
|
|
887953 |
+TEST touch $M0/a
|
|
|
887953 |
+TEST touch $M0/c
|
|
|
887953 |
+TEST kill_brick $V0 $H0 $B0/brick0
|
|
|
887953 |
+TEST touch $M0/b
|
|
|
887953 |
+TEST rm -f $M0/a
|
|
|
887953 |
+TEST rm -f $M0/c
|
|
|
887953 |
+TEST touch $M0/c #gfid mismatch case
|
|
|
887953 |
+c_gfid=$(gf_get_gfid_xattr $B0/brick1/c)
|
|
|
887953 |
+TEST $CLI volume start $V0 force
|
|
|
887953 |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
|
|
|
887953 |
+TEST ! stat $M0/a
|
|
|
887953 |
+TEST ! stat $B0/brick0/a
|
|
|
887953 |
+TEST ! stat $B0/brick1/a
|
|
|
887953 |
+
|
|
|
887953 |
+TEST stat $M0/b
|
|
|
887953 |
+TEST stat $B0/brick0/b
|
|
|
887953 |
+TEST stat $B0/brick1/b
|
|
|
887953 |
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/b)" == "$(gf_get_gfid_xattr $B0/brick1/b)" ]]
|
|
|
887953 |
+
|
|
|
887953 |
+TEST stat $M0/c
|
|
|
887953 |
+TEST stat $B0/brick0/c
|
|
|
887953 |
+TEST stat $B0/brick1/c
|
|
|
887953 |
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
|
|
|
887953 |
+
|
|
|
887953 |
+cleanup;
|
|
|
887953 |
+
|
|
|
887953 |
+#Check that when quorum is enabled name-heal happens as expected
|
|
|
887953 |
+TEST glusterd
|
|
|
887953 |
+TEST pidof glusterd
|
|
|
887953 |
+TEST $CLI volume create $V0 replica 3 $H0:$B0/brick{0,1,2}
|
|
|
887953 |
+TEST $CLI volume set $V0 performance.stat-prefetch off
|
|
|
887953 |
+TEST $CLI volume start $V0
|
|
|
887953 |
+TEST $CLI volume heal $V0 disable
|
|
|
887953 |
+TEST $CLI volume set $V0 cluster.entry-self-heal off
|
|
|
887953 |
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
|
|
|
887953 |
+
|
|
|
887953 |
+TEST touch $M0/a
|
|
|
887953 |
+TEST touch $M0/c
|
|
|
887953 |
+TEST kill_brick $V0 $H0 $B0/brick0
|
|
|
887953 |
+TEST touch $M0/b
|
|
|
887953 |
+TEST rm -f $M0/a
|
|
|
887953 |
+TEST rm -f $M0/c
|
|
|
887953 |
+TEST touch $M0/c #gfid mismatch case
|
|
|
887953 |
+c_gfid=$(gf_get_gfid_xattr $B0/brick1/c)
|
|
|
887953 |
+TEST $CLI volume start $V0 force
|
|
|
887953 |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
|
|
|
887953 |
+TEST ! stat $M0/a
|
|
|
887953 |
+TEST ! stat $B0/brick0/a
|
|
|
887953 |
+TEST ! stat $B0/brick1/a
|
|
|
887953 |
+TEST ! stat $B0/brick2/a
|
|
|
887953 |
+
|
|
|
887953 |
+TEST stat $M0/b
|
|
|
887953 |
+TEST ! stat $B0/brick0/b #Name heal shouldn't be triggered
|
|
|
887953 |
+TEST stat $B0/brick1/b
|
|
|
887953 |
+TEST stat $B0/brick2/b
|
|
|
887953 |
+
|
|
|
887953 |
+TEST stat $M0/c
|
|
|
887953 |
+TEST stat $B0/brick0/c
|
|
|
887953 |
+TEST stat $B0/brick1/c
|
|
|
887953 |
+TEST stat $B0/brick2/c
|
|
|
887953 |
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
|
|
|
887953 |
+
|
|
|
887953 |
+TEST $CLI volume set $V0 cluster.quorum-type none
|
|
|
887953 |
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "none" get_quorum_type $M0 $V0 0
|
|
|
887953 |
+TEST stat $M0/b
|
|
|
887953 |
+TEST stat $B0/brick0/b #Name heal should be triggered
|
|
|
887953 |
+TEST stat $B0/brick1/b
|
|
|
887953 |
+TEST stat $B0/brick2/b
|
|
|
887953 |
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/b)" == "$(gf_get_gfid_xattr $B0/brick1/b)" ]]
|
|
|
887953 |
+TEST $CLI volume set $V0 cluster.quorum-type auto
|
|
|
887953 |
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "auto" get_quorum_type $M0 $V0 0
|
|
|
887953 |
+
|
|
|
887953 |
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
|
|
|
887953 |
+
|
|
|
887953 |
+#Missing parent xattrs cases
|
|
|
887953 |
+TEST $CLI volume heal $V0 enable
|
|
|
887953 |
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
|
|
|
887953 |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
|
|
|
887953 |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
|
|
|
887953 |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
|
|
|
887953 |
+TEST $CLI volume heal $V0
|
|
|
887953 |
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
|
|
|
887953 |
+
|
|
|
887953 |
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
|
|
|
887953 |
+TEST $CLI volume heal $V0 disable
|
|
|
887953 |
+#In cases where a good parent doesn't have pending xattrs and a file,
|
|
|
887953 |
+#name-heal will be triggered
|
|
|
887953 |
+TEST gf_rm_file_and_gfid_link $B0/brick1 c
|
|
|
887953 |
+TEST stat $M0/c
|
|
|
887953 |
+TEST stat $B0/brick0/c
|
|
|
887953 |
+TEST stat $B0/brick1/c
|
|
|
887953 |
+TEST stat $B0/brick2/c
|
|
|
887953 |
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
|
|
|
887953 |
+cleanup
|
|
|
887953 |
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
|
|
|
887953 |
index e74fdec..ce2b17a 100644
|
|
|
887953 |
--- a/xlators/cluster/afr/src/afr-common.c
|
|
|
887953 |
+++ b/xlators/cluster/afr/src/afr-common.c
|
|
|
887953 |
@@ -2302,8 +2302,6 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
|
|
|
887953 |
*/
|
|
|
887953 |
for (i = 0; i < priv->child_count; i++) {
|
|
|
887953 |
if (!replies[i].valid || replies[i].op_ret == -1) {
|
|
|
887953 |
- if (priv->child_up[i])
|
|
|
887953 |
- can_interpret = _gf_false;
|
|
|
887953 |
continue;
|
|
|
887953 |
}
|
|
|
887953 |
|
|
|
887953 |
@@ -2742,21 +2740,52 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
|
|
|
887953 |
afr_private_t *priv = NULL;
|
|
|
887953 |
call_frame_t *heal = NULL;
|
|
|
887953 |
int i = 0, first = -1;
|
|
|
887953 |
- gf_boolean_t need_heal = _gf_false;
|
|
|
887953 |
+ gf_boolean_t name_state_mismatch = _gf_false;
|
|
|
887953 |
struct afr_reply *replies = NULL;
|
|
|
887953 |
int ret = 0;
|
|
|
887953 |
+ unsigned char *par_readables = NULL;
|
|
|
887953 |
+ unsigned char *success = NULL;
|
|
|
887953 |
+ int32_t op_errno = 0;
|
|
|
887953 |
+ uuid_t gfid = {0};
|
|
|
887953 |
|
|
|
887953 |
local = frame->local;
|
|
|
887953 |
replies = local->replies;
|
|
|
887953 |
priv = this->private;
|
|
|
887953 |
+ par_readables = alloca0(priv->child_count);
|
|
|
887953 |
+ success = alloca0(priv->child_count);
|
|
|
887953 |
+
|
|
|
887953 |
+ ret = afr_inode_read_subvol_get (local->loc.parent, this, par_readables,
|
|
|
887953 |
+ NULL, NULL);
|
|
|
887953 |
+ if (ret < 0 || AFR_COUNT (par_readables, priv->child_count) == 0) {
|
|
|
887953 |
+ /* In this case set par_readables to all 1 so that name_heal
|
|
|
887953 |
+ * need checks at the end of this function will flag missing
|
|
|
887953 |
+ * entry when name state mismatches*/
|
|
|
887953 |
+ memset (par_readables, 1, priv->child_count);
|
|
|
887953 |
+ }
|
|
|
887953 |
|
|
|
887953 |
for (i = 0; i < priv->child_count; i++) {
|
|
|
887953 |
if (!replies[i].valid)
|
|
|
887953 |
continue;
|
|
|
887953 |
|
|
|
887953 |
+ if (replies[i].op_ret == 0) {
|
|
|
887953 |
+ if (uuid_is_null (gfid)) {
|
|
|
887953 |
+ gf_uuid_copy (gfid,
|
|
|
887953 |
+ replies[i].poststat.ia_gfid);
|
|
|
887953 |
+ }
|
|
|
887953 |
+ success[i] = 1;
|
|
|
887953 |
+ } else {
|
|
|
887953 |
+ if ((replies[i].op_errno != ENOTCONN) &&
|
|
|
887953 |
+ (replies[i].op_errno != ENOENT) &&
|
|
|
887953 |
+ (replies[i].op_errno != ESTALE)) {
|
|
|
887953 |
+ op_errno = replies[i].op_errno;
|
|
|
887953 |
+ }
|
|
|
887953 |
+ }
|
|
|
887953 |
+
|
|
|
887953 |
+ /*gfid is missing, needs heal*/
|
|
|
887953 |
if ((replies[i].op_ret == -1) &&
|
|
|
887953 |
- (replies[i].op_errno == ENODATA))
|
|
|
887953 |
- need_heal = _gf_true;
|
|
|
887953 |
+ (replies[i].op_errno == ENODATA)) {
|
|
|
887953 |
+ goto name_heal;
|
|
|
887953 |
+ }
|
|
|
887953 |
|
|
|
887953 |
if (first == -1) {
|
|
|
887953 |
first = i;
|
|
|
887953 |
@@ -2764,30 +2793,53 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
|
|
|
887953 |
}
|
|
|
887953 |
|
|
|
887953 |
if (replies[i].op_ret != replies[first].op_ret) {
|
|
|
887953 |
- need_heal = _gf_true;
|
|
|
887953 |
- break;
|
|
|
887953 |
+ name_state_mismatch = _gf_true;
|
|
|
887953 |
}
|
|
|
887953 |
|
|
|
887953 |
- if (gf_uuid_compare (replies[i].poststat.ia_gfid,
|
|
|
887953 |
- replies[first].poststat.ia_gfid)) {
|
|
|
887953 |
- need_heal = _gf_true;
|
|
|
887953 |
- break;
|
|
|
887953 |
- }
|
|
|
887953 |
+ if (replies[i].op_ret == 0) {
|
|
|
887953 |
+ /* Rename after this lookup may succeed if we don't do
|
|
|
887953 |
+ * a name-heal and the destination may not have pending xattrs
|
|
|
887953 |
+ * to indicate which name is good and which is bad so always do
|
|
|
887953 |
+ * this heal*/
|
|
|
887953 |
+ if (gf_uuid_compare (replies[i].poststat.ia_gfid,
|
|
|
887953 |
+ gfid)) {
|
|
|
887953 |
+ goto name_heal;
|
|
|
887953 |
+ }
|
|
|
887953 |
+ }
|
|
|
887953 |
}
|
|
|
887953 |
|
|
|
887953 |
- if (need_heal) {
|
|
|
887953 |
- heal = afr_frame_create (this, NULL);
|
|
|
887953 |
- if (!heal)
|
|
|
887953 |
- goto metadata_heal;
|
|
|
887953 |
-
|
|
|
887953 |
- ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
|
|
|
887953 |
- afr_refresh_selfheal_done, heal, frame);
|
|
|
887953 |
- if (ret) {
|
|
|
887953 |
- AFR_STACK_DESTROY (heal);
|
|
|
887953 |
- goto metadata_heal;
|
|
|
887953 |
+ if (name_state_mismatch) {
|
|
|
887953 |
+ if (!priv->quorum_count)
|
|
|
887953 |
+ goto name_heal;
|
|
|
887953 |
+ if (!afr_has_quorum (success, this))
|
|
|
887953 |
+ goto name_heal;
|
|
|
887953 |
+ if (op_errno)
|
|
|
887953 |
+ goto name_heal;
|
|
|
887953 |
+ for (i = 0; i < priv->child_count; i++) {
|
|
|
887953 |
+ if (!replies[i].valid)
|
|
|
887953 |
+ continue;
|
|
|
887953 |
+ if (par_readables[i] && replies[i].op_ret < 0 &&
|
|
|
887953 |
+ replies[i].op_errno != ENOTCONN) {
|
|
|
887953 |
+ goto name_heal;
|
|
|
887953 |
+ }
|
|
|
887953 |
}
|
|
|
887953 |
- return ret;
|
|
|
887953 |
- }
|
|
|
887953 |
+ }
|
|
|
887953 |
+
|
|
|
887953 |
+ goto metadata_heal;
|
|
|
887953 |
+
|
|
|
887953 |
+name_heal:
|
|
|
887953 |
+ heal = afr_frame_create (this, NULL);
|
|
|
887953 |
+ if (!heal)
|
|
|
887953 |
+ goto metadata_heal;
|
|
|
887953 |
+
|
|
|
887953 |
+ ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
|
|
|
887953 |
+ afr_refresh_selfheal_done, heal, frame);
|
|
|
887953 |
+ if (ret) {
|
|
|
887953 |
+ AFR_STACK_DESTROY (heal);
|
|
|
887953 |
+ goto metadata_heal;
|
|
|
887953 |
+ }
|
|
|
887953 |
+ return ret;
|
|
|
887953 |
+
|
|
|
887953 |
metadata_heal:
|
|
|
887953 |
ret = afr_lookup_metadata_heal_check (frame, this);
|
|
|
887953 |
|
|
|
887953 |
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
|
|
|
887953 |
index bcd0e60..0a5be29 100644
|
|
|
887953 |
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
|
|
|
887953 |
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
|
|
|
887953 |
@@ -634,20 +634,26 @@ afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this,
|
|
|
887953 |
continue;
|
|
|
887953 |
|
|
|
887953 |
if ((replies[i].op_ret == -1) &&
|
|
|
887953 |
- (replies[i].op_errno == ENODATA))
|
|
|
887953 |
+ (replies[i].op_errno == ENODATA)) {
|
|
|
887953 |
*need_heal = _gf_true;
|
|
|
887953 |
+ break;
|
|
|
887953 |
+ }
|
|
|
887953 |
|
|
|
887953 |
if (first_idx == -1) {
|
|
|
887953 |
first_idx = i;
|
|
|
887953 |
continue;
|
|
|
887953 |
}
|
|
|
887953 |
|
|
|
887953 |
- if (replies[i].op_ret != replies[first_idx].op_ret)
|
|
|
887953 |
+ if (replies[i].op_ret != replies[first_idx].op_ret) {
|
|
|
887953 |
*need_heal = _gf_true;
|
|
|
887953 |
+ break;
|
|
|
887953 |
+ }
|
|
|
887953 |
|
|
|
887953 |
if (gf_uuid_compare (replies[i].poststat.ia_gfid,
|
|
|
887953 |
- replies[first_idx].poststat.ia_gfid))
|
|
|
887953 |
+ replies[first_idx].poststat.ia_gfid)) {
|
|
|
887953 |
*need_heal = _gf_true;
|
|
|
887953 |
+ break;
|
|
|
887953 |
+ }
|
|
|
887953 |
}
|
|
|
887953 |
|
|
|
887953 |
if (inode)
|
|
|
887953 |
--
|
|
|
887953 |
1.8.3.1
|
|
|
887953 |
|