Blob Blame History Raw
From f398b6b9705f1b75d17d965ffcd72157d5be3daf Mon Sep 17 00:00:00 2001
From: karthik-us <ksubrahm@redhat.com>
Date: Tue, 18 Dec 2018 16:04:42 +0530
Subject: [PATCH 488/493] cluster/afr: Allow lookup on root if it is from
 ADD_REPLICA_MOUNT

Problem: When trying to convert a plain distribute volume to replica-3
or arbiter type it is failing with ENOTCONN error as the lookup on
the root will fail as there is no quorum.

Fix: Allow lookup on root if it is coming from the ADD_REPLICA_MOUNT
which is used while adding bricks to a volume. It will try to set the
pending xattrs for the newly added bricks to allow the heal to happen
in the right direction and avoid data loss scenarios.

Note: This fix will solve the problem of type conversion only in the
case where the volume was mounted at least once. The conversion of
non mounted volumes will still fail since the dht selfheal tries to
set the directory layout will fail as they do that with the PID
GF_CLIENT_PID_NO_ROOT_SQUASH set in the frame->root.

Backport of: https://review.gluster.org/#/c/glusterfs/+/21791/

Change-Id: Ie31d429dfebbfb0f60610c9c5739595c54b19c46
BUG: 1645480
Signed-off-by: karthik-us <ksubrahm@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/158932
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 libglusterfs/src/common-utils.h                    |  3 +-
 ...g-1655854-support-dist-to-rep3-arb-conversion.t | 95 ++++++++++++++++++++++
 xlators/cluster/afr/src/afr-common.c               | 80 +++++++++++++-----
 xlators/cluster/afr/src/afr-inode-write.c          |  2 +-
 xlators/cluster/afr/src/afr-read-txn.c             |  3 +-
 xlators/cluster/afr/src/afr-transaction.c          | 11 ++-
 xlators/cluster/afr/src/afr-transaction.h          |  3 +-
 xlators/cluster/afr/src/afr.c                      |  2 +-
 xlators/cluster/afr/src/afr.h                      |  4 +
 xlators/mgmt/glusterd/src/glusterd-utils.c         |  2 +-
 10 files changed, 175 insertions(+), 30 deletions(-)
 create mode 100644 tests/bugs/replicate/bug-1655854-support-dist-to-rep3-arb-conversion.t

diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h
index c804ed5..50c1f9a 100644
--- a/libglusterfs/src/common-utils.h
+++ b/libglusterfs/src/common-utils.h
@@ -162,7 +162,8 @@ enum _gf_special_pid
         GF_CLIENT_PID_BITD              = -8,
         GF_CLIENT_PID_SCRUB             = -9,
         GF_CLIENT_PID_TIER_DEFRAG       = -10,
-        GF_SERVER_PID_TRASH             = -11
+        GF_SERVER_PID_TRASH             = -11,
+        GF_CLIENT_PID_ADD_REPLICA_MOUNT = -12
 };
 
 enum _gf_xlator_ipc_targets {
diff --git a/tests/bugs/replicate/bug-1655854-support-dist-to-rep3-arb-conversion.t b/tests/bugs/replicate/bug-1655854-support-dist-to-rep3-arb-conversion.t
new file mode 100644
index 0000000..783016d
--- /dev/null
+++ b/tests/bugs/replicate/bug-1655854-support-dist-to-rep3-arb-conversion.t
@@ -0,0 +1,95 @@
+#!/bin/bash
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+
+# Conversion from 2x1 to 2x3
+
+TEST $CLI volume create $V0 $H0:$B0/${V0}{0,1}
+EXPECT 'Created' volinfo_field $V0 'Status';
+TEST $CLI volume start $V0
+EXPECT 'Started' volinfo_field $V0 'Status';
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+TEST mkdir $M0/dir
+TEST dd if=/dev/urandom of=$M0/dir/file bs=100K count=5
+file_md5sum=$(md5sum $M0/dir/file | awk '{print $1}')
+
+TEST $CLI volume add-brick $V0 replica 3 $H0:$B0/${V0}{2..5}
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}3
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}4
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}5
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 3
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 4
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 5
+
+# Trigger heal and wait for for it to complete
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+# Check whether the directory & file are healed to the newly added bricks
+TEST ls $B0/${V0}2/dir
+TEST ls $B0/${V0}3/dir
+TEST ls $B0/${V0}4/dir
+TEST ls $B0/${V0}5/dir
+
+TEST [ $file_md5sum == $(md5sum $B0/${V0}4/dir/file | awk '{print $1}') ]
+TEST [ $file_md5sum == $(md5sum $B0/${V0}5/dir/file | awk '{print $1}') ]
+
+
+# Conversion from 2x1 to 2x(2+1)
+
+TEST $CLI volume create $V1 $H0:$B0/${V1}{0,1}
+EXPECT 'Created' volinfo_field $V1 'Status';
+TEST $CLI volume start $V1
+EXPECT 'Started' volinfo_field $V1 'Status';
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V1 $H0 $B0/${V1}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V1 $H0 $B0/${V1}1
+
+TEST $GFS --volfile-id=$V1 --volfile-server=$H0 $M1;
+TEST mkdir $M1/dir
+TEST dd if=/dev/urandom of=$M1/dir/file bs=100K count=5
+file_md5sum=$(md5sum $M1/dir/file | awk '{print $1}')
+
+TEST $CLI volume add-brick $V1 replica 3 arbiter 1 $H0:$B0/${V1}{2..5}
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V1 $H0 $B0/${V1}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V1 $H0 $B0/${V1}3
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V1 $H0 $B0/${V1}4
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V1 $H0 $B0/${V1}5
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V1 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V1 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V1 2
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V1 3
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V1 4
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V1 5
+
+# Trigger heal and wait for for it to complete
+TEST $CLI volume heal $V1
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V1
+
+# Check whether the directory & file are healed to the newly added bricks
+TEST ls $B0/${V1}2/dir
+TEST ls $B0/${V1}3/dir
+TEST ls $B0/${V1}4/dir
+TEST ls $B0/${V1}5/dir
+
+EXPECT "0" stat -c %s $B0/${V1}5/dir/file
+TEST [ $file_md5sum == $(md5sum $B0/${V1}4/dir/file | awk '{print $1}') ]
+
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 231de9d..322dfbe 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -2265,7 +2265,7 @@ afr_attempt_readsubvol_set (call_frame_t *frame, xlator_t *this,
         } else if (!priv->quorum_count) {
                 *read_subvol = afr_first_up_child (frame, this);
         } else if (priv->quorum_count &&
-                   afr_has_quorum (data_readable, this)) {
+                   afr_has_quorum (data_readable, this, NULL)) {
                 /* read_subvol is guaranteed to be valid if we hit this path. */
                 *read_subvol = afr_first_up_child (frame, this);
         } else {
@@ -2405,7 +2405,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
         read_subvol = -1;
         memset (readable, 0, sizeof (*readable) * priv->child_count);
 	if (can_interpret) {
-                if (!afr_has_quorum (success_replies, this))
+                if (!afr_has_quorum (success_replies, this, NULL))
                         goto cant_interpret;
 		/* It is safe to call afr_replies_interpret() because we have
 		   a response from all the UP subvolumes and all of them resolved
@@ -2887,7 +2887,7 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
         if (name_state_mismatch) {
                 if (!priv->quorum_count)
                         goto name_heal;
-                if (!afr_has_quorum (success, this))
+                if (!afr_has_quorum (success, this, NULL))
                         goto name_heal;
                 if (op_errno)
                         goto name_heal;
@@ -2979,7 +2979,6 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
         afr_private_t       *priv  = NULL;
         afr_local_t         *local = NULL;
 	int                 i = -1;
-	int                 op_errno = 0;
 	int                 read_subvol = -1;
         unsigned char *data_readable = NULL;
         unsigned char *success_replies = NULL;
@@ -2998,15 +2997,13 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
                 }
 	}
 
-	op_errno = afr_final_errno (frame->local, this->private);
-
         if (local->op_ret < 0) {
-                AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL,
-                                  NULL, NULL);
-                return;
-	}
+                local->op_ret = -1;
+                local->op_errno = afr_final_errno (frame->local, this->private);
+                goto error;
+        }
 
-        if (!afr_has_quorum (success_replies, this))
+        if (!afr_has_quorum (success_replies, this, frame))
                 goto unwind;
 
 	afr_replies_interpret (frame, this, local->inode, NULL);
@@ -3017,11 +3014,8 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
 unwind:
         afr_attempt_readsubvol_set (frame, this, success_replies, data_readable,
                                     &read_subvol);
-	if (read_subvol == -1) {
-                AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
-                                  NULL, NULL, NULL, NULL);
-                return;
-        }
+	if (read_subvol == -1)
+                goto error;
 
         if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) {
                 local->op_ret = -1;
@@ -3034,6 +3028,11 @@ unwind:
 			  local->inode, &local->replies[read_subvol].poststat,
 			  local->replies[read_subvol].xdata,
 			  &local->replies[read_subvol].postparent);
+        return;
+
+error:
+        AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, NULL,
+                          NULL, NULL, NULL);
 }
 
 
@@ -3977,7 +3976,8 @@ afr_fop_lock_done (call_frame_t *frame, xlator_t *this)
 
         if (afr_is_conflicting_lock_present (local->op_ret, local->op_errno)) {
                 afr_unlock_locks_and_proceed (frame, this, lock_count);
-        } else if (priv->quorum_count && !afr_has_quorum (success, this)) {
+        } else if (priv->quorum_count &&
+                   !afr_has_quorum (success, this, NULL)) {
                 local->fop_lock_state = AFR_FOP_LOCK_QUORUM_FAILED;
                 local->op_ret = -1;
                 local->op_errno = afr_final_errno (local, priv);
@@ -4485,7 +4485,7 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                                    &local->cont.lk.user_flock,
                                    local->xdata_req);
         } else if (priv->quorum_count &&
-                   !afr_has_quorum (local->cont.lk.locked_nodes, this)) {
+                   !afr_has_quorum (local->cont.lk.locked_nodes, this, NULL)) {
                 local->op_ret   = -1;
                 local->op_errno = afr_final_errno (local, priv);
 
@@ -5199,7 +5199,7 @@ afr_notify (xlator_t *this, int32_t event,
         }
 
         had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,
-                                                           this);
+                                                           this, NULL);
         if (priv->halo_enabled) {
                 halo_max_latency_msec = afr_get_halo_latency (this);
 
@@ -5328,7 +5328,7 @@ afr_notify (xlator_t *this, int32_t event,
         UNLOCK (&priv->lock);
 
         if (priv->quorum_count) {
-                has_quorum = afr_has_quorum (priv->child_up, this);
+                has_quorum = afr_has_quorum (priv->child_up, this, NULL);
                 if (!had_quorum && has_quorum) {
                         gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET,
                                 "Client-quorum is met");
@@ -6543,3 +6543,43 @@ afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode)
         }
         return ret;
 }
+
+gf_boolean_t
+afr_is_add_replica_mount_lookup_on_root (call_frame_t *frame)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+
+        if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT)
+                return _gf_false;
+
+        if (local->op != GF_FOP_LOOKUP)
+                /* TODO:If the replica count is being increased on a plain
+                 * distribute volume that was never mounted, we need to allow
+                 * setxattr on '/' with GF_CLIENT_PID_NO_ROOT_SQUASH to
+                 * accomodate for DHT layout setting */
+                return _gf_false;
+
+        if (local->inode == NULL)
+                return _gf_false;
+
+        if (!__is_root_gfid (local->inode->gfid))
+                return _gf_false;
+
+        return _gf_true;
+}
+
+gf_boolean_t
+afr_lookup_has_quorum (call_frame_t *frame, xlator_t *this,
+                       unsigned char *subvols)
+{
+        afr_private_t *priv = this->private;
+
+        if (frame && afr_is_add_replica_mount_lookup_on_root (frame)) {
+                if (AFR_COUNT (subvols, priv->child_count) > 0)
+                        return _gf_true;
+        }
+
+        return _gf_false;
+}
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 8b1dcfd..ea4755a 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -1539,7 +1539,7 @@ afr_handle_empty_brick (xlator_t *this, call_frame_t *frame, loc_t *loc,
         if (ret && ab_ret)
                 goto out;
 
-        if (frame->root->pid != GF_CLIENT_PID_SELF_HEALD) {
+        if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) {
                 gf_msg (this->name, GF_LOG_ERROR, EPERM,
                         afr_get_msg_id (op_type),
                         "'%s' is an internal extended attribute.",
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index f6c491b..ec322ae 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -193,7 +193,8 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
 	local->inode = inode_ref (inode);
         local->is_read_txn = _gf_true;
 
-        if (priv->quorum_count && !afr_has_quorum (local->child_up, this)) {
+        if (priv->quorum_count &&
+            !afr_has_quorum (local->child_up, this, NULL)) {
                 local->op_ret = -1;
                 local->op_errno = ENOTCONN;
                 read_subvol = -1;
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 0a67a83..0e58e02 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -160,7 +160,7 @@ afr_changelog_has_quorum (afr_local_t *local, xlator_t *this)
                 }
         }
 
-        if (afr_has_quorum (success_children, this)) {
+        if (afr_has_quorum (success_children, this, NULL)) {
                 return _gf_true;
         }
 
@@ -690,7 +690,7 @@ afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this)
 }
 
 gf_boolean_t
-afr_has_quorum (unsigned char *subvols, xlator_t *this)
+afr_has_quorum (unsigned char *subvols, xlator_t *this, call_frame_t *frame)
 {
         unsigned int  quorum_count = 0;
         afr_private_t *priv  = NULL;
@@ -699,6 +699,9 @@ afr_has_quorum (unsigned char *subvols, xlator_t *this)
         priv = this->private;
         up_children_count = AFR_COUNT (subvols, priv->child_count);
 
+        if (afr_lookup_has_quorum (frame, this, subvols))
+                return _gf_true;
+
         if (priv->quorum_count == AFR_QUORUM_AUTO) {
                 /*
                  * Special case for auto-quorum with an even number of nodes.
@@ -753,7 +756,7 @@ afr_has_fop_quorum (call_frame_t *frame)
 
         locked_nodes = afr_locked_nodes_get (local->transaction.type,
                                              &local->internal_lock);
-        return afr_has_quorum (locked_nodes, this);
+        return afr_has_quorum (locked_nodes, this, NULL);
 }
 
 static gf_boolean_t
@@ -771,7 +774,7 @@ afr_has_fop_cbk_quorum (call_frame_t *frame)
                                 success[i] = 1;
         }
 
-        return afr_has_quorum (success, this);
+        return afr_has_quorum (success, this, NULL);
 }
 
 gf_boolean_t
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index a27e9a3..1c42c66 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -38,7 +38,8 @@ int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
 int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol);
 
 call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame);
-gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this);
+gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this,
+                             call_frame_t *frame);
 gf_boolean_t afr_needs_changelog_update (afr_local_t *local);
 void afr_zero_fill_stat (afr_local_t *local);
 
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 1b738c0..d5347e9 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -244,7 +244,7 @@ reconfigure (xlator_t *this, dict_t *options)
         GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options,
                           uint32, out);
         fix_quorum_options (this, priv, qtype, options);
-        if (priv->quorum_count && !afr_has_quorum (priv->child_up, this))
+        if (priv->quorum_count && !afr_has_quorum (priv->child_up, this, NULL))
                 gf_msg (this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
                         "Client-quorum is not met");
 
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 7010e9b..febc509 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1227,4 +1227,8 @@ afr_is_symmetric_error (call_frame_t *frame, xlator_t *this);
 
 gf_boolean_t
 afr_is_pending_set (xlator_t *this, dict_t *xdata, int type);
+
+gf_boolean_t
+afr_lookup_has_quorum (call_frame_t *frame, xlator_t *this,
+                       unsigned char *subvols);
 #endif /* __AFR_H__ */
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 2290343..6468ecb 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -13788,7 +13788,7 @@ glusterd_handle_replicate_brick_ops (glusterd_volinfo_t *volinfo,
                 goto out;
         }
 
-        ret = gf_asprintf (&pid, "%d", GF_CLIENT_PID_SELF_HEALD);
+        ret = gf_asprintf (&pid, "%d", GF_CLIENT_PID_ADD_REPLICA_MOUNT);
         if (ret < 0)
                 goto out;
 
-- 
1.8.3.1