From ef36875cd4661de13b74225ac25e9c49914434bb Mon Sep 17 00:00:00 2001 From: Anuradha Talur Date: Tue, 22 Mar 2016 14:03:38 +0530 Subject: [PATCH 45/80] afr : Enable auto heal when replica count increases Backport of: http://review.gluster.org/13807 This patch is part two change to prevent data loss in a replicate volume on doing a add-brick operation. Problem: After doing add-brick, there is a chance that self heal might happen from the newly added brick rather than the source brick, leading to data loss. Solution: Mark pending changelogs on afr children for the new afr-child so that heal is performed in the correct direction. >Change-Id: Iae6af44f97e612cb3ee8c642254ec3d15ac063f5 >BUG: 1320020 >Signed-off-by: Anuradha Talur >Reviewed-on: http://review.gluster.org/13807 >Smoke: Gluster Build System >NetBSD-regression: NetBSD Build System >CentOS-regression: Gluster Build System >Reviewed-by: Raghavendra G >Signed-off-by: Anuradha Talur Change-Id: I216626f860e6e84e274479987216a6cf2d6e5834 Signed-off-by: Anuradha Talur Reviewed-on: https://code.engineering.redhat.com/gerrit/71426 Reviewed-by: Pranith Kumar Karampuri Tested-by: Pranith Kumar Karampuri --- tests/basic/afr/add-brick-self-heal.t | 67 ++++++++++ xlators/cluster/afr/src/afr-common.c | 11 ++ xlators/cluster/afr/src/afr-inode-write.c | 187 +++++++++++++++++----------- xlators/cluster/afr/src/afr-mem-types.h | 2 +- xlators/cluster/afr/src/afr-messages.h | 16 ++- xlators/cluster/afr/src/afr.h | 9 +- 6 files changed, 211 insertions(+), 81 deletions(-) create mode 100644 tests/basic/afr/add-brick-self-heal.t diff --git a/tests/basic/afr/add-brick-self-heal.t b/tests/basic/afr/add-brick-self-heal.t new file mode 100644 index 0000000..748d367 --- /dev/null +++ b/tests/basic/afr/add-brick-self-heal.t @@ -0,0 +1,67 @@ +#!/bin/bash +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} +TEST $CLI volume start $V0 +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off + +TEST $CLI volume set $V0 self-heal-daemon off +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0; + +# Create files +for i in {1..5} +do + echo $i > $M0/file$i.txt +done + +# Metadata changes +TEST setfattr -n user.test -v qwerty $M0/file5.txt + +# Add brick1 +TEST $CLI volume add-brick $V0 replica 3 $H0:$B0/${V0}2 + +# New-brick should accuse the old-bricks (Simulating case for data-loss) +TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/${V0}2/ +TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000001 $B0/${V0}2/ + +# Check if pending xattr and dirty-xattr are set for newly-added-brick +EXPECT "000000000000000100000001" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}0 +EXPECT "000000000000000100000001" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}1 +EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.dirty $B0/${V0}2 + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2 + +TEST $CLI volume set $V0 self-heal-daemon on +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 +TEST $CLI volume heal $V0 + +# Wait for heal to complete +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +# Check if entry-heal has happened +TEST diff <(ls $B0/${V0}0 | sort) <(ls $B0/${V0}2 | sort) +TEST diff <(ls $B0/${V0}1 | sort) <(ls $B0/${V0}2 | sort) + +# Test if data was healed +TEST diff $B0/${V0}0/file1.txt $B0/${V0}2/file1.txt + +# Test if metadata was healed and exists on both the bricks +EXPECT "qwerty" get_text_xattr user.test $B0/${V0}2/file5.txt +EXPECT "qwerty" get_text_xattr user.test $B0/${V0}0/file5.txt + +EXPECT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}0 +EXPECT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}1 +EXPECT "000000000000000000000000" get_hex_xattr trusted.afr.dirty $B0/${V0}2 + +cleanup; diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index b5d07ac..6b393c7 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -5113,3 +5113,14 @@ afr_get_need_heal (xlator_t *this) UNLOCK (&priv->lock); return need_heal; } + +int +afr_get_msg_id (char *op_type) +{ + + if (!strcmp (op_type, GF_AFR_REPLACE_BRICK)) + return AFR_MSG_REPLACE_BRICK_STATUS; + else if (!strcmp (op_type, GF_AFR_ADD_BRICK)) + return AFR_MSG_ADD_BRICK_STATUS; + return -1; +} diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 8069d6c..c86fb49 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -1014,14 +1014,15 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) } int -afr_rb_set_pending_changelog_cbk (call_frame_t *frame, void *cookie, +afr_emptyb_set_pending_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int i = 0; + int i, ret = 0; + char *op_type = NULL; local = frame->local; priv = this->private; @@ -1030,19 +1031,26 @@ afr_rb_set_pending_changelog_cbk (call_frame_t *frame, void *cookie, local->replies[i].valid = 1; local->replies[i].op_ret = op_ret; local->replies[i].op_errno = op_errno; + + ret = dict_get_str (local->xdata_req, "replicate-brick-op", &op_type); + if (ret) + goto out; + gf_msg (this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO, op_ret ? op_errno : 0, - AFR_MSG_REPLACE_BRICK_STATUS, "Set of pending xattr %s on" + afr_get_msg_id (op_type), + "Set of pending xattr %s on" " %s.", op_ret ? "failed" : "succeeded", priv->children[i]->name); +out: syncbarrier_wake (&local->barrier); return 0; } int -afr_rb_set_pending_changelog (call_frame_t *frame, xlator_t *this, - unsigned char *locked_nodes) +afr_emptyb_set_pending_changelog (call_frame_t *frame, xlator_t *this, + unsigned char *locked_nodes) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -1051,9 +1059,9 @@ afr_rb_set_pending_changelog (call_frame_t *frame, xlator_t *this, local = frame->local; priv = this->private; - AFR_ONLIST (locked_nodes, frame, afr_rb_set_pending_changelog_cbk, + AFR_ONLIST (locked_nodes, frame, afr_emptyb_set_pending_changelog_cbk, xattrop, &local->loc, GF_XATTROP_ADD_ARRAY, - local->xdata_req, NULL); + local->xattr_req, NULL); /* It is sufficient if xattrop was successful on one child */ for (i = 0; i < priv->child_count; i++) { @@ -1073,9 +1081,10 @@ out: } int -_afr_handle_replace_brick_type (xlator_t *this, call_frame_t *frame, - loc_t *loc, int rb_index, - afr_transaction_type type) +_afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame, + loc_t *loc, int empty_index, + afr_transaction_type type, + char *op_type) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -1096,13 +1105,21 @@ _afr_handle_replace_brick_type (xlator_t *this, call_frame_t *frame, if (!local->pending) goto out; - local->pending[rb_index][idx] = hton32 (1); + local->pending[empty_index][idx] = hton32 (1); local->xdata_req = dict_new (); if (!local->xdata_req) goto out; - ret = afr_set_pending_dict (priv, local->xdata_req, local->pending); + ret = dict_set_str (local->xdata_req, "replicate-brick-op", op_type); + if (ret) + goto out; + + local->xattr_req = dict_new (); + if (!local->xattr_req) + goto out; + + ret = afr_set_pending_dict (priv, local->xattr_req, local->pending); if (ret < 0) goto out; @@ -1123,7 +1140,7 @@ _afr_handle_replace_brick_type (xlator_t *this, call_frame_t *frame, goto unlock; } - ret = afr_rb_set_pending_changelog (frame, this, locked_nodes); + ret = afr_emptyb_set_pending_changelog (frame, this, locked_nodes); if (ret) goto unlock; ret = 0; @@ -1139,33 +1156,41 @@ out: return ret; } -int -_afr_handle_replace_brick_cbk (int ret, call_frame_t *frame, void *opaque) +void +afr_brick_args_cleanup (void *opaque) { - afr_replace_brick_args_t *data = NULL; + afr_empty_brick_args_t *data = NULL; data = opaque; loc_wipe (&data->loc); GF_FREE (data); +} + +int +_afr_handle_empty_brick_cbk (int ret, call_frame_t *frame, void *opaque) +{ + afr_brick_args_cleanup (opaque); return 0; } int -_afr_handle_replace_brick (void *opaque) +_afr_handle_empty_brick (void *opaque) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int rb_index = -1; + int empty_index = -1; int ret = -1; int op_errno = ENOMEM; call_frame_t *frame = NULL; xlator_t *this = NULL; - afr_replace_brick_args_t *data = NULL; + char *op_type = NULL; + afr_empty_brick_args_t *data = NULL; data = opaque; frame = data->frame; - rb_index = data->rb_index; + empty_index = data->empty_index; + op_type = data->op_type; this = frame->this; priv = this->private; @@ -1175,11 +1200,13 @@ _afr_handle_replace_brick (void *opaque) loc_copy (&local->loc, &data->loc); - gf_msg_debug (this->name, 0, "Child being replaced is : %s", - priv->children[rb_index]->name); + gf_msg_debug (this->name, 0, "New brick is : %s", + priv->children[empty_index]->name); - ret = _afr_handle_replace_brick_type (this, frame, &local->loc, rb_index, - AFR_METADATA_TRANSACTION); + ret = _afr_handle_empty_brick_type (this, frame, &local->loc, + empty_index, + AFR_METADATA_TRANSACTION, + op_type); if (ret) { op_errno = -ret; ret = -1; @@ -1187,12 +1214,15 @@ _afr_handle_replace_brick (void *opaque) } dict_unref (local->xdata_req); + dict_unref (local->xattr_req); afr_matrix_cleanup (local->pending, priv->child_count); local->pending = NULL; + local->xattr_req = NULL; local->xdata_req = NULL; - ret = _afr_handle_replace_brick_type (this, frame, &local->loc, rb_index, - AFR_ENTRY_TRANSACTION); + ret = _afr_handle_empty_brick_type (this, frame, &local->loc, + empty_index, + AFR_ENTRY_TRANSACTION, op_type); if (ret) { op_errno = -ret; ret = -1; @@ -1413,63 +1443,71 @@ afr_handle_spb_choice_timeout (xlator_t *this, call_frame_t *frame, } int -afr_handle_replace_brick (xlator_t *this, call_frame_t *frame, loc_t *loc, - dict_t *dict) +afr_handle_empty_brick (xlator_t *this, call_frame_t *frame, loc_t *loc, + dict_t *dict) { int ret = -1; - int rb_index = -1; + int ab_ret = -1; + int empty_index = -1; int op_errno = EPERM; - char *replace_brick = NULL; - afr_replace_brick_args_t *data = NULL; + char *empty_brick = NULL; + char *op_type = NULL; + afr_empty_brick_args_t *data = NULL; - ret = dict_get_str (dict, GF_AFR_REPLACE_BRICK, &replace_brick); + ret = dict_get_str (dict, GF_AFR_REPLACE_BRICK, &empty_brick); + if (!ret) + op_type = GF_AFR_REPLACE_BRICK; - if (!ret) { - if (frame->root->pid != GF_CLIENT_PID_SELF_HEALD) { - gf_msg (this->name, GF_LOG_ERROR, EPERM, - AFR_MSG_REPLACE_BRICK_STATUS, "'%s' is an " - "internal extended attribute", - GF_AFR_REPLACE_BRICK); + ab_ret = dict_get_str (dict, GF_AFR_ADD_BRICK, &empty_brick); + if (!ab_ret) + op_type = GF_AFR_ADD_BRICK; + + if (ret && ab_ret) + goto out; + + if (frame->root->pid != GF_CLIENT_PID_SELF_HEALD) { + gf_msg (this->name, GF_LOG_ERROR, EPERM, + afr_get_msg_id (op_type), + "'%s' is an internal extended attribute.", + op_type); + ret = 1; + goto out; + } + empty_index = afr_get_child_index_from_name (this, empty_brick); + + if (empty_index < 0) { + /* Didn't belong to this replica pair + * Just do a no-op + */ + AFR_STACK_UNWIND (setxattr, frame, 0, 0, NULL); + return 0; + } else { + data = GF_CALLOC (1, sizeof (*data), + gf_afr_mt_empty_brick_t); + if (!data) { ret = 1; + op_errno = ENOMEM; goto out; } - rb_index = afr_get_child_index_from_name (this, replace_brick); - - if (rb_index < 0) { - /* Didn't belong to this replica pair - * Just do a no-op - */ - AFR_STACK_UNWIND (setxattr, frame, 0, 0, NULL); - return 0; - } else { - data = GF_CALLOC (1, sizeof (*data), - gf_afr_mt_replace_brick_t); - if (!data) { - ret = 1; - op_errno = ENOMEM; - goto out; - } - data->frame = frame; - loc_copy (&data->loc, loc); - data->rb_index = rb_index; - ret = synctask_new (this->ctx->env, - _afr_handle_replace_brick, - _afr_handle_replace_brick_cbk, - NULL, data); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_REPLACE_BRICK_FAILED, - "Failed to create synctask. Unable to " - "perform replace-brick."); - ret = 1; - op_errno = ENOMEM; - loc_wipe (&data->loc); - GF_FREE (data); - goto out; - } + data->frame = frame; + loc_copy (&data->loc, loc); + data->empty_index = empty_index; + data->op_type = op_type; + ret = synctask_new (this->ctx->env, + _afr_handle_empty_brick, + _afr_handle_empty_brick_cbk, + NULL, data); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + afr_get_msg_id (op_type), + "Failed to create synctask."); + ret = 1; + op_errno = ENOMEM; + afr_brick_args_cleanup (data); + goto out; } - ret = 0; } + ret = 0; out: if (ret == 1) { AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); @@ -1492,7 +1530,8 @@ afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc, if (ret == 0) goto out; - ret = afr_handle_replace_brick (this, frame, loc, dict); + /* Applicable for replace-brick and add-brick commands */ + ret = afr_handle_empty_brick (this, frame, loc, dict); out: return ret; } diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 6f1eee9..7f79620 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -45,7 +45,7 @@ enum gf_afr_mem_types_ { gf_afr_mt_subvol_healer_t, gf_afr_mt_spbc_timeout_t, gf_afr_mt_spb_status_t, - gf_afr_mt_replace_brick_t, + gf_afr_mt_empty_brick_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h index d07c7bc..1fb5b51 100644 --- a/xlators/cluster/afr/src/afr-messages.h +++ b/xlators/cluster/afr/src/afr-messages.h @@ -341,11 +341,21 @@ /*! * @messageid 108039 - * @diagnosis - * @recommendedaction + * @diagnosis Setting of pending xattrs succeeded/failed during add-brick + * operation. + * @recommendedaction In case of failure, error number in the log should give + * the reason why it failed. Also observe brick logs for more information. */ -#define AFR_MSG_REPLACE_BRICK_FAILED (GLFS_COMP_BASE_AFR + 39) +#define AFR_MSG_ADD_BRICK_STATUS (GLFS_COMP_BASE_AFR + 39) +/*! + * @messageid 108040 + * @diagnosis AFR was unable to be loaded because the pending-changelog xattrs + * were not found in the volfile. + * @recommendedaction Please ensure cluster op-version is atleast 30707 and the + * volfiles are regenerated. +*/ +#define AFR_MSG_NO_CHANGELOG (GLFS_COMP_BASE_AFR + 40) #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" #endif /* !_AFR_MESSAGES_H_ */ diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 9915344..dec4eaa 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -777,11 +777,12 @@ typedef struct afr_spb_status { loc_t *loc; } afr_spb_status_t; -typedef struct afr_replace_brick_args { +typedef struct afr_empty_brick_args { call_frame_t *frame; loc_t loc; - int rb_index; -} afr_replace_brick_args_t; + int empty_index; + char *op_type; +} afr_empty_brick_args_t; typedef struct afr_read_subvol_args { ia_type_t ia_type; @@ -1117,4 +1118,6 @@ afr_set_need_heal (xlator_t *this, afr_local_t *local); int afr_selfheal_data_open (xlator_t *this, inode_t *inode, fd_t **fd); +int +afr_get_msg_id (char *op_type); #endif /* __AFR_H__ */ -- 1.7.1