Blob Blame History Raw
From a4f226f50a2ef943b5db095877bab5a3eebf7283 Mon Sep 17 00:00:00 2001
From: Anuradha <atalur@redhat.com>
Date: Thu, 11 Jun 2015 14:58:05 +0530
Subject: [PATCH 114/129] cluster/afr : set pending xattrs for replaced brick

This patch is part two change to prevent data loss
in a replicate volume on doing a replace-brick commit
force operation.

Problem: After doing replace-brick commit force, there is a
chance that self heal might happen from the replaced (sink) brick
rather than the source brick leading to data loss.

Solution: Mark pending changelogs on afr children for
the replaced afr-child so that heal is performed in the
correct direction.

Upstream patch links:
master : http://review.gluster.org/10448/
3.7    : http://review.gluster.org/11254/
Change-Id: Icb9807e49b4c1c4f1dcab115318d9a58ccf95675
BUG: 1140649
Signed-off-by: Anuradha Talur <atalur@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/51021
Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
Tested-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
---
 tests/basic/afr/replace-brick-self-heal.t      |   59 ++++++
 xlators/cluster/afr/src/afr-inode-write.c      |  255 +++++++++++++++++++++++-
 xlators/cluster/afr/src/afr-self-heal-common.c |   16 +-
 xlators/cluster/afr/src/afr-self-heal.h        |    8 +
 4 files changed, 329 insertions(+), 9 deletions(-)
 create mode 100644 tests/basic/afr/replace-brick-self-heal.t

diff --git a/tests/basic/afr/replace-brick-self-heal.t b/tests/basic/afr/replace-brick-self-heal.t
new file mode 100644
index 0000000..1901466
--- /dev/null
+++ b/tests/basic/afr/replace-brick-self-heal.t
@@ -0,0 +1,59 @@
+#!/bin/bash
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+function match_dirs {
+        diff <(ls $1 | sort) <(ls $2 | sort)
+        if [ $? -eq 0 ];
+        then
+                echo "Y"
+        else
+                echo "N"
+        fi
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume start $V0
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+
+TEST $CLI volume set $V0 self-heal-daemon off
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0;
+
+#Create files
+for i in {1..5}
+do
+        echo $i > $M0/file$i.txt
+done
+
+# Replace brick1
+TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}1_new commit force
+
+# Replaced-brick should accuse the non-replaced-brick (Simulating case for data-loss)
+TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/${V0}1_new/
+
+# Check if pending xattr and dirty-xattr are set for replaced-brick
+EXPECT "000000000000000100000001" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0
+EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.dirty $B0/${V0}1_new
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+
+TEST $CLI volume set $V0 self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+TEST $CLI volume heal $V0
+
+# Check if heal has happened
+EXPECT_WITHIN $HEAL_TIMEOUT "Y" match_dirs $B0/${V0}0 $B0/${V0}1_new
+
+# To make sure that data was not lost from brick0
+TEST diff <(ls $B0/${V0}0 | sort) <(ls $B0/${V0}1 | sort)
+EXPECT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0
+
+# Test if data was healed
+TEST diff $B0/${V0}0/file1.txt $B0/${V0}1/file1.txt
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 3db4010..ecd2b9d 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -35,7 +35,7 @@
 #include "compat-errno.h"
 #include "compat.h"
 #include "protocol-common.h"
-
+#include "byte-order.h"
 #include "afr-transaction.h"
 #include "afr-self-heal.h"
 
@@ -972,6 +972,211 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
 }
 
 int
+afr_rb_set_pending_changelog_cbk (call_frame_t *frame, void *cookie,
+                                  xlator_t *this, int op_ret, int op_errno,
+                                  dict_t *xattr, dict_t *xdata)
+
+{
+        afr_local_t *local = NULL;
+        int i = 0;
+
+        local = frame->local;
+        i = (long) cookie;
+
+        local->replies[i].valid = 1;
+        local->replies[i].op_ret = op_ret;
+        local->replies[i].op_errno = op_errno;
+
+        syncbarrier_wake (&local->barrier);
+        return 0;
+}
+
+char *
+afr_opret_matrix_generate (afr_private_t *priv, afr_local_t *local)
+{
+        char *matrix = NULL;
+        char *ptr    = NULL;
+        int   i      = 0;
+
+        /* Allocate max amount of chars required, including -ve values
+         * and spaces */
+        matrix = GF_CALLOC (priv->child_count, 3 * sizeof (char),
+                            gf_afr_mt_char);
+        if (!matrix)
+                return NULL;
+        ptr = matrix;
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->replies[i].valid)
+                        ptr += sprintf (ptr, "%d ", local->replies[i].op_ret);
+                else
+                        ptr += sprintf (ptr, "-1 ");
+        }
+        return matrix;
+}
+
+int
+afr_rb_set_pending_changelog (call_frame_t *frame, xlator_t *this,
+                              unsigned char *locked_nodes)
+{
+        afr_local_t *local = NULL;
+        afr_private_t *priv = NULL;
+        int ret = 0, i = 0;
+        char *matrix = NULL;
+
+        local = frame->local;
+        priv = this->private;
+
+        AFR_ONLIST (locked_nodes, frame, afr_rb_set_pending_changelog_cbk,
+                    xattrop, &local->loc, GF_XATTROP_ADD_ARRAY,
+                    local->xdata_req, NULL);
+
+        /* It is sufficient if xattrop was successful on one child */
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->replies[i].valid &&
+                    local->replies[i].op_ret == 0) {
+                        matrix = afr_opret_matrix_generate (priv, local);
+                        gf_log (this->name, GF_LOG_DEBUG, "Successfully set "
+                                "pending changelog. op_ret matrix : [ %s].",
+                                matrix);
+                        ret = 0;
+                        goto out;
+                }
+                ret = afr_higher_errno (ret, local->replies[i].op_errno);
+        }
+        gf_log (this->name, GF_LOG_ERROR, "Couldn't set pending xattr "
+                "on any child. (%s)", strerror (ret));
+out:
+        if (matrix)
+                GF_FREE (matrix);
+        return -ret;
+}
+
+int
+_afr_handle_replace_brick_type (xlator_t *this, call_frame_t *frame,
+                                loc_t *loc, int rb_index,
+                                afr_transaction_type type)
+{
+        afr_local_t     *local            = NULL;
+        afr_private_t   *priv             = NULL;
+        unsigned char   *locked_nodes     = NULL;
+        struct           gf_flock flock   = {0, };
+        struct           gf_flock unflock = {0, };
+        int              i                = 0;
+        int              count            = 0;
+        int              ret              = -ENOMEM;
+        int              idx              = -1;
+
+        priv = this->private;
+        local = frame->local;
+
+        locked_nodes = alloca0 (priv->child_count);
+
+        idx = afr_index_for_transaction_type (type);
+
+        local->pending = afr_matrix_create (priv->child_count,
+                                            AFR_NUM_CHANGE_LOGS);
+        if (!local->pending)
+                goto out;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (i == rb_index)
+                        local->pending[i][idx] = hton32 (1);
+        }
+
+        local->xdata_req = dict_new ();
+        if (!local->xdata_req)
+                goto out;
+
+        ret = afr_set_pending_dict (priv, local->xdata_req, local->pending);
+        if (ret < 0)
+                goto out;
+
+        if (AFR_ENTRY_TRANSACTION == type) {
+                AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, this->name,
+                         loc, NULL, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+        } else {
+                flock.l_type = F_WRLCK;
+                flock.l_start = LLONG_MAX - 1;
+                flock.l_len = 0;
+                AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, this->name,
+                         loc, F_SETLKW, &flock, NULL);
+        }
+        count = afr_locked_fill (frame, this, locked_nodes);
+
+        if (!count) {
+                gf_log (this->name, GF_LOG_ERROR, "Couldn't acquire lock on"
+                        " any child.");
+                ret = -EAGAIN;
+                goto unlock;
+        }
+
+        ret = afr_rb_set_pending_changelog (frame, this, locked_nodes);
+        if (ret)
+                goto unlock;
+        ret = 0;
+unlock:
+        if (AFR_ENTRY_TRANSACTION == type) {
+                AFR_ONLIST (locked_nodes, frame, afr_selfheal_lock_cbk,
+                            entrylk, this->name, loc, NULL, ENTRYLK_UNLOCK,
+                            ENTRYLK_WRLCK, NULL);
+        } else {
+                unflock.l_type = F_UNLCK;
+                unflock.l_start = LLONG_MAX - 1;
+                unflock.l_len = 0;
+                AFR_ONLIST (locked_nodes, frame, afr_selfheal_lock_cbk,
+                            inodelk, this->name, loc, F_SETLK, &unflock, NULL);
+        }
+out:
+        return ret;
+}
+
+int
+_afr_handle_replace_brick (xlator_t *this, call_frame_t *frame, loc_t *loc,
+                           int rb_index)
+{
+
+        afr_local_t     *local          = NULL;
+        afr_private_t   *priv           = NULL;
+        int              ret            = -1;
+        int              op_errno       = ENOMEM;
+
+        priv = this->private;
+
+        local = AFR_FRAME_INIT (frame, op_errno);
+        if (!local)
+                goto out;
+
+        loc_copy (&local->loc, loc);
+
+        gf_log (this->name, GF_LOG_DEBUG, "Child being replaced is : %s",
+                priv->children[rb_index]->name);
+
+        ret = _afr_handle_replace_brick_type (this, frame, loc, rb_index,
+                                              AFR_METADATA_TRANSACTION);
+        if (ret) {
+                op_errno = -ret;
+                ret = -1;
+                goto out;
+        }
+
+        dict_unref (local->xdata_req);
+        afr_matrix_cleanup (local->pending, priv->child_count);
+
+        ret = _afr_handle_replace_brick_type (this, frame, loc, rb_index,
+                                              AFR_ENTRY_TRANSACTION);
+        if (ret) {
+                op_errno = -ret;
+                ret = -1;
+                goto out;
+        }
+        ret = 0;
+out:
+        AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
+        return 0;
+}
+
+
+int
 afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc,
                             char *data)
 {
@@ -1169,6 +1374,50 @@ afr_handle_spb_choice_timeout (xlator_t *this, call_frame_t *frame,
         return ret;
 }
 
+int
+afr_handle_replace_brick (xlator_t *this, call_frame_t *frame, loc_t *loc,
+                          dict_t *dict)
+{
+        int             len               = 0;
+        int             ret               = -1;
+        int             rb_index          = -1;
+        int             op_errno          = EPERM;
+        void           *value             = NULL;
+        char           *replace_brick     = NULL;
+
+        ret =  dict_get_ptr_and_len (dict, GF_AFR_REPLACE_BRICK, &value,
+                                     &len);
+
+        if (value) {
+                replace_brick = alloca0 (len + 1);
+                memcpy (replace_brick, value, len);
+
+                if (!(frame->root->pid == GF_CLIENT_PID_AFR_SELF_HEALD)) {
+                        ret = 1;
+                        goto out;
+                }
+                rb_index = afr_get_child_index_from_name (this, replace_brick);
+
+                if (rb_index < 0)
+                         /* Didn't belong to this replica pair
+                          * Just do a no-op
+                          */
+                        AFR_STACK_UNWIND (setxattr, frame, 0, 0, NULL);
+                else
+                        _afr_handle_replace_brick (this, frame, loc, rb_index);
+                ret = 0;
+        }
+out:
+        if (ret == 1) {
+                gf_log (this->name, GF_LOG_ERROR, "'%s' is an internal"
+                        " extended attribute : %s.",
+                        GF_AFR_REPLACE_BRICK, strerror (op_errno));
+                AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+                ret = 0;
+        }
+        return ret;
+}
+
 static int
 afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc,
                           dict_t *dict)
@@ -1180,6 +1429,10 @@ afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc,
                 goto out;
 
         ret = afr_handle_spb_choice_timeout (this, frame, dict);
+        if (ret == 0)
+                goto out;
+
+        ret = afr_handle_replace_brick (this, frame, loc, dict);
 out:
         return ret;
 }
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 207e9b9..1e46226 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -807,8 +807,8 @@ afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
 
 
 int
-afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this,
-			  unsigned char *locked_on)
+afr_locked_fill (call_frame_t *frame, xlator_t *this,
+                 unsigned char *locked_on)
 {
 	int i = 0;
 	afr_private_t *priv = NULL;
@@ -851,7 +851,7 @@ afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
 
 	loc_wipe (&loc);
 
-	return afr_selfheal_locked_fill (frame, this, locked_on);
+	return afr_locked_fill (frame, this, locked_on);
 }
 
 
@@ -882,7 +882,7 @@ afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
 	for (i = 0; i < priv->child_count; i++) {
 		if (local->replies[i].op_ret == -1 &&
 		    local->replies[i].op_errno == EAGAIN) {
-			afr_selfheal_locked_fill (frame, this, locked_on);
+			afr_locked_fill (frame, this, locked_on);
 			afr_selfheal_uninodelk (frame, this, inode, dom, off,
 						size, locked_on);
 
@@ -894,7 +894,7 @@ afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
 
 	loc_wipe (&loc);
 
-	return afr_selfheal_locked_fill (frame, this, locked_on);
+	return afr_locked_fill (frame, this, locked_on);
 }
 
 
@@ -937,7 +937,7 @@ afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
 
 	loc_wipe (&loc);
 
-	return afr_selfheal_locked_fill (frame, this, locked_on);
+	return afr_locked_fill (frame, this, locked_on);
 }
 
 
@@ -962,7 +962,7 @@ afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
 	for (i = 0; i < priv->child_count; i++) {
 		if (local->replies[i].op_ret == -1 &&
 		    local->replies[i].op_errno == EAGAIN) {
-			afr_selfheal_locked_fill (frame, this, locked_on);
+			afr_locked_fill (frame, this, locked_on);
 			afr_selfheal_unentrylk (frame, this, inode, dom, name,
 						locked_on);
 
@@ -974,7 +974,7 @@ afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
 
 	loc_wipe (&loc);
 
-	return afr_selfheal_locked_fill (frame, this, locked_on);
+	return afr_locked_fill (frame, this, locked_on);
 }
 
 
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 956f075..a707e20 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -251,4 +251,12 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
 
 int
 afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid);
+
+int
+afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+		       int op_ret, int op_errno, dict_t *xdata);
+
+int
+afr_locked_fill (call_frame_t *frame, xlator_t *this,
+                 unsigned char *locked_on);
 #endif /* !_AFR_SELFHEAL_H */
-- 
1.7.1