From a4f226f50a2ef943b5db095877bab5a3eebf7283 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Thu, 11 Jun 2015 14:58:05 +0530 Subject: [PATCH 114/129] cluster/afr : set pending xattrs for replaced brick This patch is part two change to prevent data loss in a replicate volume on doing a replace-brick commit force operation. Problem: After doing replace-brick commit force, there is a chance that self heal might happen from the replaced (sink) brick rather than the source brick leading to data loss. Solution: Mark pending changelogs on afr children for the replaced afr-child so that heal is performed in the correct direction. Upstream patch links: master : http://review.gluster.org/10448/ 3.7 : http://review.gluster.org/11254/ Change-Id: Icb9807e49b4c1c4f1dcab115318d9a58ccf95675 BUG: 1140649 Signed-off-by: Anuradha Talur Reviewed-on: https://code.engineering.redhat.com/gerrit/51021 Reviewed-by: Krutika Dhananjay Reviewed-by: Ravishankar Narayanankutty Tested-by: Ravishankar Narayanankutty --- tests/basic/afr/replace-brick-self-heal.t | 59 ++++++ xlators/cluster/afr/src/afr-inode-write.c | 255 +++++++++++++++++++++++- xlators/cluster/afr/src/afr-self-heal-common.c | 16 +- xlators/cluster/afr/src/afr-self-heal.h | 8 + 4 files changed, 329 insertions(+), 9 deletions(-) create mode 100644 tests/basic/afr/replace-brick-self-heal.t diff --git a/tests/basic/afr/replace-brick-self-heal.t b/tests/basic/afr/replace-brick-self-heal.t new file mode 100644 index 0000000..1901466 --- /dev/null +++ b/tests/basic/afr/replace-brick-self-heal.t @@ -0,0 +1,59 @@ +#!/bin/bash +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +cleanup; + +function match_dirs { + diff <(ls $1 | sort) <(ls $2 | sort) + if [ $? -eq 0 ]; + then + echo "Y" + else + echo "N" + fi +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} +TEST $CLI volume start $V0 +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off + +TEST $CLI volume set $V0 self-heal-daemon off +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0; + +#Create files +for i in {1..5} +do + echo $i > $M0/file$i.txt +done + +# Replace brick1 +TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}1_new commit force + +# Replaced-brick should accuse the non-replaced-brick (Simulating case for data-loss) +TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/${V0}1_new/ + +# Check if pending xattr and dirty-xattr are set for replaced-brick +EXPECT "000000000000000100000001" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0 +EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.dirty $B0/${V0}1_new + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 + +TEST $CLI volume set $V0 self-heal-daemon on +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +TEST $CLI volume heal $V0 + +# Check if heal has happened +EXPECT_WITHIN $HEAL_TIMEOUT "Y" match_dirs $B0/${V0}0 $B0/${V0}1_new + +# To make sure that data was not lost from brick0 +TEST diff <(ls $B0/${V0}0 | sort) <(ls $B0/${V0}1 | sort) +EXPECT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0 + +# Test if data was healed +TEST diff $B0/${V0}0/file1.txt $B0/${V0}1/file1.txt +cleanup; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 3db4010..ecd2b9d 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -35,7 +35,7 @@ #include "compat-errno.h" #include "compat.h" #include "protocol-common.h" - +#include "byte-order.h" #include "afr-transaction.h" #include "afr-self-heal.h" @@ -972,6 +972,211 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) } int +afr_rb_set_pending_changelog_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) + +{ + afr_local_t *local = NULL; + int i = 0; + + local = frame->local; + i = (long) cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + + syncbarrier_wake (&local->barrier); + return 0; +} + +char * +afr_opret_matrix_generate (afr_private_t *priv, afr_local_t *local) +{ + char *matrix = NULL; + char *ptr = NULL; + int i = 0; + + /* Allocate max amount of chars required, including -ve values + * and spaces */ + matrix = GF_CALLOC (priv->child_count, 3 * sizeof (char), + gf_afr_mt_char); + if (!matrix) + return NULL; + ptr = matrix; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid) + ptr += sprintf (ptr, "%d ", local->replies[i].op_ret); + else + ptr += sprintf (ptr, "-1 "); + } + return matrix; +} + +int +afr_rb_set_pending_changelog (call_frame_t *frame, xlator_t *this, + unsigned char *locked_nodes) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int ret = 0, i = 0; + char *matrix = NULL; + + local = frame->local; + priv = this->private; + + AFR_ONLIST (locked_nodes, frame, afr_rb_set_pending_changelog_cbk, + xattrop, &local->loc, GF_XATTROP_ADD_ARRAY, + local->xdata_req, NULL); + + /* It is sufficient if xattrop was successful on one child */ + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && + local->replies[i].op_ret == 0) { + matrix = afr_opret_matrix_generate (priv, local); + gf_log (this->name, GF_LOG_DEBUG, "Successfully set " + "pending changelog. op_ret matrix : [ %s].", + matrix); + ret = 0; + goto out; + } + ret = afr_higher_errno (ret, local->replies[i].op_errno); + } + gf_log (this->name, GF_LOG_ERROR, "Couldn't set pending xattr " + "on any child. (%s)", strerror (ret)); +out: + if (matrix) + GF_FREE (matrix); + return -ret; +} + +int +_afr_handle_replace_brick_type (xlator_t *this, call_frame_t *frame, + loc_t *loc, int rb_index, + afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *locked_nodes = NULL; + struct gf_flock flock = {0, }; + struct gf_flock unflock = {0, }; + int i = 0; + int count = 0; + int ret = -ENOMEM; + int idx = -1; + + priv = this->private; + local = frame->local; + + locked_nodes = alloca0 (priv->child_count); + + idx = afr_index_for_transaction_type (type); + + local->pending = afr_matrix_create (priv->child_count, + AFR_NUM_CHANGE_LOGS); + if (!local->pending) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (i == rb_index) + local->pending[i][idx] = hton32 (1); + } + + local->xdata_req = dict_new (); + if (!local->xdata_req) + goto out; + + ret = afr_set_pending_dict (priv, local->xdata_req, local->pending); + if (ret < 0) + goto out; + + if (AFR_ENTRY_TRANSACTION == type) { + AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, this->name, + loc, NULL, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } else { + flock.l_type = F_WRLCK; + flock.l_start = LLONG_MAX - 1; + flock.l_len = 0; + AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, this->name, + loc, F_SETLKW, &flock, NULL); + } + count = afr_locked_fill (frame, this, locked_nodes); + + if (!count) { + gf_log (this->name, GF_LOG_ERROR, "Couldn't acquire lock on" + " any child."); + ret = -EAGAIN; + goto unlock; + } + + ret = afr_rb_set_pending_changelog (frame, this, locked_nodes); + if (ret) + goto unlock; + ret = 0; +unlock: + if (AFR_ENTRY_TRANSACTION == type) { + AFR_ONLIST (locked_nodes, frame, afr_selfheal_lock_cbk, + entrylk, this->name, loc, NULL, ENTRYLK_UNLOCK, + ENTRYLK_WRLCK, NULL); + } else { + unflock.l_type = F_UNLCK; + unflock.l_start = LLONG_MAX - 1; + unflock.l_len = 0; + AFR_ONLIST (locked_nodes, frame, afr_selfheal_lock_cbk, + inodelk, this->name, loc, F_SETLK, &unflock, NULL); + } +out: + return ret; +} + +int +_afr_handle_replace_brick (xlator_t *this, call_frame_t *frame, loc_t *loc, + int rb_index) +{ + + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int ret = -1; + int op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + loc_copy (&local->loc, loc); + + gf_log (this->name, GF_LOG_DEBUG, "Child being replaced is : %s", + priv->children[rb_index]->name); + + ret = _afr_handle_replace_brick_type (this, frame, loc, rb_index, + AFR_METADATA_TRANSACTION); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + + dict_unref (local->xdata_req); + afr_matrix_cleanup (local->pending, priv->child_count); + + ret = _afr_handle_replace_brick_type (this, frame, loc, rb_index, + AFR_ENTRY_TRANSACTION); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + ret = 0; +out: + AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); + return 0; +} + + +int afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc, char *data) { @@ -1169,6 +1374,50 @@ afr_handle_spb_choice_timeout (xlator_t *this, call_frame_t *frame, return ret; } +int +afr_handle_replace_brick (xlator_t *this, call_frame_t *frame, loc_t *loc, + dict_t *dict) +{ + int len = 0; + int ret = -1; + int rb_index = -1; + int op_errno = EPERM; + void *value = NULL; + char *replace_brick = NULL; + + ret = dict_get_ptr_and_len (dict, GF_AFR_REPLACE_BRICK, &value, + &len); + + if (value) { + replace_brick = alloca0 (len + 1); + memcpy (replace_brick, value, len); + + if (!(frame->root->pid == GF_CLIENT_PID_AFR_SELF_HEALD)) { + ret = 1; + goto out; + } + rb_index = afr_get_child_index_from_name (this, replace_brick); + + if (rb_index < 0) + /* Didn't belong to this replica pair + * Just do a no-op + */ + AFR_STACK_UNWIND (setxattr, frame, 0, 0, NULL); + else + _afr_handle_replace_brick (this, frame, loc, rb_index); + ret = 0; + } +out: + if (ret == 1) { + gf_log (this->name, GF_LOG_ERROR, "'%s' is an internal" + " extended attribute : %s.", + GF_AFR_REPLACE_BRICK, strerror (op_errno)); + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + ret = 0; + } + return ret; +} + static int afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc, dict_t *dict) @@ -1180,6 +1429,10 @@ afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc, goto out; ret = afr_handle_spb_choice_timeout (this, frame, dict); + if (ret == 0) + goto out; + + ret = afr_handle_replace_brick (this, frame, loc, dict); out: return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 207e9b9..1e46226 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -807,8 +807,8 @@ afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int -afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this, - unsigned char *locked_on) +afr_locked_fill (call_frame_t *frame, xlator_t *this, + unsigned char *locked_on) { int i = 0; afr_private_t *priv = NULL; @@ -851,7 +851,7 @@ afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, loc_wipe (&loc); - return afr_selfheal_locked_fill (frame, this, locked_on); + return afr_locked_fill (frame, this, locked_on); } @@ -882,7 +882,7 @@ afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, for (i = 0; i < priv->child_count; i++) { if (local->replies[i].op_ret == -1 && local->replies[i].op_errno == EAGAIN) { - afr_selfheal_locked_fill (frame, this, locked_on); + afr_locked_fill (frame, this, locked_on); afr_selfheal_uninodelk (frame, this, inode, dom, off, size, locked_on); @@ -894,7 +894,7 @@ afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, loc_wipe (&loc); - return afr_selfheal_locked_fill (frame, this, locked_on); + return afr_locked_fill (frame, this, locked_on); } @@ -937,7 +937,7 @@ afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, loc_wipe (&loc); - return afr_selfheal_locked_fill (frame, this, locked_on); + return afr_locked_fill (frame, this, locked_on); } @@ -962,7 +962,7 @@ afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, for (i = 0; i < priv->child_count; i++) { if (local->replies[i].op_ret == -1 && local->replies[i].op_errno == EAGAIN) { - afr_selfheal_locked_fill (frame, this, locked_on); + afr_locked_fill (frame, this, locked_on); afr_selfheal_unentrylk (frame, this, inode, dom, name, locked_on); @@ -974,7 +974,7 @@ afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, loc_wipe (&loc); - return afr_selfheal_locked_fill (frame, this, locked_on); + return afr_locked_fill (frame, this, locked_on); } diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 956f075..a707e20 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -251,4 +251,12 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, int afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid); + +int +afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata); + +int +afr_locked_fill (call_frame_t *frame, xlator_t *this, + unsigned char *locked_on); #endif /* !_AFR_SELFHEAL_H */ -- 1.7.1