From a4f226f50a2ef943b5db095877bab5a3eebf7283 Mon Sep 17 00:00:00 2001
From: Anuradha <atalur@redhat.com>
Date: Thu, 11 Jun 2015 14:58:05 +0530
Subject: [PATCH 114/129] cluster/afr : set pending xattrs for replaced brick
This patch is part two change to prevent data loss
in a replicate volume on doing a replace-brick commit
force operation.
Problem: After doing replace-brick commit force, there is a
chance that self heal might happen from the replaced (sink) brick
rather than the source brick leading to data loss.
Solution: Mark pending changelogs on afr children for
the replaced afr-child so that heal is performed in the
correct direction.
Upstream patch links:
master : http://review.gluster.org/10448/
3.7 : http://review.gluster.org/11254/
Change-Id: Icb9807e49b4c1c4f1dcab115318d9a58ccf95675
BUG: 1140649
Signed-off-by: Anuradha Talur <atalur@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/51021
Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
Tested-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
---
tests/basic/afr/replace-brick-self-heal.t | 59 ++++++
xlators/cluster/afr/src/afr-inode-write.c | 255 +++++++++++++++++++++++-
xlators/cluster/afr/src/afr-self-heal-common.c | 16 +-
xlators/cluster/afr/src/afr-self-heal.h | 8 +
4 files changed, 329 insertions(+), 9 deletions(-)
create mode 100644 tests/basic/afr/replace-brick-self-heal.t
diff --git a/tests/basic/afr/replace-brick-self-heal.t b/tests/basic/afr/replace-brick-self-heal.t
new file mode 100644
index 0000000..1901466
--- /dev/null
+++ b/tests/basic/afr/replace-brick-self-heal.t
@@ -0,0 +1,59 @@
+#!/bin/bash
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+function match_dirs {
+ diff <(ls $1 | sort) <(ls $2 | sort)
+ if [ $? -eq 0 ];
+ then
+ echo "Y"
+ else
+ echo "N"
+ fi
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume start $V0
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+
+TEST $CLI volume set $V0 self-heal-daemon off
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0;
+
+#Create files
+for i in {1..5}
+do
+ echo $i > $M0/file$i.txt
+done
+
+# Replace brick1
+TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}1_new commit force
+
+# Replaced-brick should accuse the non-replaced-brick (Simulating case for data-loss)
+TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/${V0}1_new/
+
+# Check if pending xattr and dirty-xattr are set for replaced-brick
+EXPECT "000000000000000100000001" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0
+EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.dirty $B0/${V0}1_new
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+
+TEST $CLI volume set $V0 self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+TEST $CLI volume heal $V0
+
+# Check if heal has happened
+EXPECT_WITHIN $HEAL_TIMEOUT "Y" match_dirs $B0/${V0}0 $B0/${V0}1_new
+
+# To make sure that data was not lost from brick0
+TEST diff <(ls $B0/${V0}0 | sort) <(ls $B0/${V0}1 | sort)
+EXPECT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0
+
+# Test if data was healed
+TEST diff $B0/${V0}0/file1.txt $B0/${V0}1/file1.txt
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 3db4010..ecd2b9d 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -35,7 +35,7 @@
#include "compat-errno.h"
#include "compat.h"
#include "protocol-common.h"
-
+#include "byte-order.h"
#include "afr-transaction.h"
#include "afr-self-heal.h"
@@ -972,6 +972,211 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
}
int
+afr_rb_set_pending_changelog_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
+
+{
+ afr_local_t *local = NULL;
+ int i = 0;
+
+ local = frame->local;
+ i = (long) cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+
+ syncbarrier_wake (&local->barrier);
+ return 0;
+}
+
+char *
+afr_opret_matrix_generate (afr_private_t *priv, afr_local_t *local)
+{
+ char *matrix = NULL;
+ char *ptr = NULL;
+ int i = 0;
+
+ /* Allocate max amount of chars required, including -ve values
+ * and spaces */
+ matrix = GF_CALLOC (priv->child_count, 3 * sizeof (char),
+ gf_afr_mt_char);
+ if (!matrix)
+ return NULL;
+ ptr = matrix;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid)
+ ptr += sprintf (ptr, "%d ", local->replies[i].op_ret);
+ else
+ ptr += sprintf (ptr, "-1 ");
+ }
+ return matrix;
+}
+
+int
+afr_rb_set_pending_changelog (call_frame_t *frame, xlator_t *this,
+ unsigned char *locked_nodes)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int ret = 0, i = 0;
+ char *matrix = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ AFR_ONLIST (locked_nodes, frame, afr_rb_set_pending_changelog_cbk,
+ xattrop, &local->loc, GF_XATTROP_ADD_ARRAY,
+ local->xdata_req, NULL);
+
+ /* It is sufficient if xattrop was successful on one child */
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid &&
+ local->replies[i].op_ret == 0) {
+ matrix = afr_opret_matrix_generate (priv, local);
+ gf_log (this->name, GF_LOG_DEBUG, "Successfully set "
+ "pending changelog. op_ret matrix : [ %s].",
+ matrix);
+ ret = 0;
+ goto out;
+ }
+ ret = afr_higher_errno (ret, local->replies[i].op_errno);
+ }
+ gf_log (this->name, GF_LOG_ERROR, "Couldn't set pending xattr "
+ "on any child. (%s)", strerror (ret));
+out:
+ if (matrix)
+ GF_FREE (matrix);
+ return -ret;
+}
+
+int
+_afr_handle_replace_brick_type (xlator_t *this, call_frame_t *frame,
+ loc_t *loc, int rb_index,
+ afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *locked_nodes = NULL;
+ struct gf_flock flock = {0, };
+ struct gf_flock unflock = {0, };
+ int i = 0;
+ int count = 0;
+ int ret = -ENOMEM;
+ int idx = -1;
+
+ priv = this->private;
+ local = frame->local;
+
+ locked_nodes = alloca0 (priv->child_count);
+
+ idx = afr_index_for_transaction_type (type);
+
+ local->pending = afr_matrix_create (priv->child_count,
+ AFR_NUM_CHANGE_LOGS);
+ if (!local->pending)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == rb_index)
+ local->pending[i][idx] = hton32 (1);
+ }
+
+ local->xdata_req = dict_new ();
+ if (!local->xdata_req)
+ goto out;
+
+ ret = afr_set_pending_dict (priv, local->xdata_req, local->pending);
+ if (ret < 0)
+ goto out;
+
+ if (AFR_ENTRY_TRANSACTION == type) {
+ AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, this->name,
+ loc, NULL, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ } else {
+ flock.l_type = F_WRLCK;
+ flock.l_start = LLONG_MAX - 1;
+ flock.l_len = 0;
+ AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, this->name,
+ loc, F_SETLKW, &flock, NULL);
+ }
+ count = afr_locked_fill (frame, this, locked_nodes);
+
+ if (!count) {
+ gf_log (this->name, GF_LOG_ERROR, "Couldn't acquire lock on"
+ " any child.");
+ ret = -EAGAIN;
+ goto unlock;
+ }
+
+ ret = afr_rb_set_pending_changelog (frame, this, locked_nodes);
+ if (ret)
+ goto unlock;
+ ret = 0;
+unlock:
+ if (AFR_ENTRY_TRANSACTION == type) {
+ AFR_ONLIST (locked_nodes, frame, afr_selfheal_lock_cbk,
+ entrylk, this->name, loc, NULL, ENTRYLK_UNLOCK,
+ ENTRYLK_WRLCK, NULL);
+ } else {
+ unflock.l_type = F_UNLCK;
+ unflock.l_start = LLONG_MAX - 1;
+ unflock.l_len = 0;
+ AFR_ONLIST (locked_nodes, frame, afr_selfheal_lock_cbk,
+ inodelk, this->name, loc, F_SETLK, &unflock, NULL);
+ }
+out:
+ return ret;
+}
+
+int
+_afr_handle_replace_brick (xlator_t *this, call_frame_t *frame, loc_t *loc,
+ int rb_index)
+{
+
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy (&local->loc, loc);
+
+ gf_log (this->name, GF_LOG_DEBUG, "Child being replaced is : %s",
+ priv->children[rb_index]->name);
+
+ ret = _afr_handle_replace_brick_type (this, frame, loc, rb_index,
+ AFR_METADATA_TRANSACTION);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ dict_unref (local->xdata_req);
+ afr_matrix_cleanup (local->pending, priv->child_count);
+
+ ret = _afr_handle_replace_brick_type (this, frame, loc, rb_index,
+ AFR_ENTRY_TRANSACTION);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+out:
+ AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
+ return 0;
+}
+
+
+int
afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc,
char *data)
{
@@ -1169,6 +1374,50 @@ afr_handle_spb_choice_timeout (xlator_t *this, call_frame_t *frame,
return ret;
}
+int
+afr_handle_replace_brick (xlator_t *this, call_frame_t *frame, loc_t *loc,
+ dict_t *dict)
+{
+ int len = 0;
+ int ret = -1;
+ int rb_index = -1;
+ int op_errno = EPERM;
+ void *value = NULL;
+ char *replace_brick = NULL;
+
+ ret = dict_get_ptr_and_len (dict, GF_AFR_REPLACE_BRICK, &value,
+ &len);
+
+ if (value) {
+ replace_brick = alloca0 (len + 1);
+ memcpy (replace_brick, value, len);
+
+ if (!(frame->root->pid == GF_CLIENT_PID_AFR_SELF_HEALD)) {
+ ret = 1;
+ goto out;
+ }
+ rb_index = afr_get_child_index_from_name (this, replace_brick);
+
+ if (rb_index < 0)
+ /* Didn't belong to this replica pair
+ * Just do a no-op
+ */
+ AFR_STACK_UNWIND (setxattr, frame, 0, 0, NULL);
+ else
+ _afr_handle_replace_brick (this, frame, loc, rb_index);
+ ret = 0;
+ }
+out:
+ if (ret == 1) {
+ gf_log (this->name, GF_LOG_ERROR, "'%s' is an internal"
+ " extended attribute : %s.",
+ GF_AFR_REPLACE_BRICK, strerror (op_errno));
+ AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+ ret = 0;
+ }
+ return ret;
+}
+
static int
afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc,
dict_t *dict)
@@ -1180,6 +1429,10 @@ afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc,
goto out;
ret = afr_handle_spb_choice_timeout (this, frame, dict);
+ if (ret == 0)
+ goto out;
+
+ ret = afr_handle_replace_brick (this, frame, loc, dict);
out:
return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 207e9b9..1e46226 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -807,8 +807,8 @@ afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int
-afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this,
- unsigned char *locked_on)
+afr_locked_fill (call_frame_t *frame, xlator_t *this,
+ unsigned char *locked_on)
{
int i = 0;
afr_private_t *priv = NULL;
@@ -851,7 +851,7 @@ afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
loc_wipe (&loc);
- return afr_selfheal_locked_fill (frame, this, locked_on);
+ return afr_locked_fill (frame, this, locked_on);
}
@@ -882,7 +882,7 @@ afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
for (i = 0; i < priv->child_count; i++) {
if (local->replies[i].op_ret == -1 &&
local->replies[i].op_errno == EAGAIN) {
- afr_selfheal_locked_fill (frame, this, locked_on);
+ afr_locked_fill (frame, this, locked_on);
afr_selfheal_uninodelk (frame, this, inode, dom, off,
size, locked_on);
@@ -894,7 +894,7 @@ afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
loc_wipe (&loc);
- return afr_selfheal_locked_fill (frame, this, locked_on);
+ return afr_locked_fill (frame, this, locked_on);
}
@@ -937,7 +937,7 @@ afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
loc_wipe (&loc);
- return afr_selfheal_locked_fill (frame, this, locked_on);
+ return afr_locked_fill (frame, this, locked_on);
}
@@ -962,7 +962,7 @@ afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
for (i = 0; i < priv->child_count; i++) {
if (local->replies[i].op_ret == -1 &&
local->replies[i].op_errno == EAGAIN) {
- afr_selfheal_locked_fill (frame, this, locked_on);
+ afr_locked_fill (frame, this, locked_on);
afr_selfheal_unentrylk (frame, this, inode, dom, name,
locked_on);
@@ -974,7 +974,7 @@ afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
loc_wipe (&loc);
- return afr_selfheal_locked_fill (frame, this, locked_on);
+ return afr_locked_fill (frame, this, locked_on);
}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 956f075..a707e20 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -251,4 +251,12 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
int
afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid);
+
+int
+afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata);
+
+int
+afr_locked_fill (call_frame_t *frame, xlator_t *this,
+ unsigned char *locked_on);
#endif /* !_AFR_SELFHEAL_H */
--
1.7.1