cb8e9e
From 2cb67dea078a2d3f3f650e02eb91cedb3663f5d6 Mon Sep 17 00:00:00 2001
cb8e9e
From: Ravishankar N <ravishankar@redhat.com>
cb8e9e
Date: Mon, 14 Sep 2015 15:43:31 +0530
cb8e9e
Subject: [PATCH 330/330] afr: perform replace-brick in a synctask
cb8e9e
cb8e9e
Patch in master: http://review.gluster.org/#/c/12169/
cb8e9e
Patch in release-3.7: http://review.gluster.org/#/c/12172/
cb8e9e
cb8e9e
Problem:
cb8e9e
replace-brick setxattr is not performed inside a synctask. This can lead
cb8e9e
to hangs if the setxattr is executed by epoll thread, as the epoll
cb8e9e
thread will be waiting for replies to come where as epoll thread is the
cb8e9e
thread that needs to epoll_ctl for reading from socket and listen.
cb8e9e
cb8e9e
Fix:
cb8e9e
Move replace-brick to synctask to prevent epoll thread hang.
cb8e9e
cb8e9e
This patch is in line with the fix performed in
cb8e9e
http://review.gluster.org/#/c/12163/
cb8e9e
cb8e9e
Change-Id: I0e04fe549428a91b8ca5cfe359ce0526c6c0336d
cb8e9e
BUG: 1262291
cb8e9e
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
cb8e9e
Reviewed-on: https://code.engineering.redhat.com/gerrit/57735
cb8e9e
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
cb8e9e
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
cb8e9e
---
cb8e9e
 xlators/cluster/afr/src/afr-inode-write.c |   70 ++++++++++++++++++++++++-----
cb8e9e
 xlators/cluster/afr/src/afr-mem-types.h   |    1 +
cb8e9e
 xlators/cluster/afr/src/afr-messages.h    |   10 +++-
cb8e9e
 xlators/cluster/afr/src/afr.h             |    6 +++
cb8e9e
 4 files changed, 73 insertions(+), 14 deletions(-)
cb8e9e
cb8e9e
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
cb8e9e
index 01390f0..3968b24 100644
cb8e9e
--- a/xlators/cluster/afr/src/afr-inode-write.c
cb8e9e
+++ b/xlators/cluster/afr/src/afr-inode-write.c
cb8e9e
@@ -1099,27 +1099,45 @@ out:
cb8e9e
 }
cb8e9e
 
cb8e9e
 int
cb8e9e
-_afr_handle_replace_brick (xlator_t *this, call_frame_t *frame, loc_t *loc,
cb8e9e
-                           int rb_index)
cb8e9e
+_afr_handle_replace_brick_cbk (int ret, call_frame_t *frame, void *opaque)
cb8e9e
+{
cb8e9e
+        afr_replace_brick_args_t *data = NULL;
cb8e9e
+
cb8e9e
+        data = opaque;
cb8e9e
+        loc_wipe (&data->loc);
cb8e9e
+        GF_FREE (data);
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+int
cb8e9e
+_afr_handle_replace_brick (void *opaque)
cb8e9e
 {
cb8e9e
 
cb8e9e
         afr_local_t     *local          = NULL;
cb8e9e
         afr_private_t   *priv           = NULL;
cb8e9e
+        int              rb_index       = -1;
cb8e9e
         int              ret            = -1;
cb8e9e
         int              op_errno       = ENOMEM;
cb8e9e
+        call_frame_t    *frame          = NULL;
cb8e9e
+        xlator_t        *this           = NULL;
cb8e9e
+        afr_replace_brick_args_t *data  = NULL;
cb8e9e
 
cb8e9e
+        data = opaque;
cb8e9e
+        frame = data->frame;
cb8e9e
+        rb_index = data->rb_index;
cb8e9e
+        this = frame->this;
cb8e9e
         priv = this->private;
cb8e9e
 
cb8e9e
         local = AFR_FRAME_INIT (frame, op_errno);
cb8e9e
         if (!local)
cb8e9e
                 goto out;
cb8e9e
 
cb8e9e
-        loc_copy (&local->loc, loc);
cb8e9e
+        loc_copy (&local->loc, &data->loc);
cb8e9e
 
cb8e9e
         gf_log (this->name, GF_LOG_DEBUG, "Child being replaced is : %s",
cb8e9e
                 priv->children[rb_index]->name);
cb8e9e
 
cb8e9e
-        ret = _afr_handle_replace_brick_type (this, frame, loc, rb_index,
cb8e9e
+        ret = _afr_handle_replace_brick_type (this, frame, &local->loc, rb_index,
cb8e9e
                                               AFR_METADATA_TRANSACTION);
cb8e9e
         if (ret) {
cb8e9e
                 op_errno = -ret;
cb8e9e
@@ -1132,7 +1150,7 @@ _afr_handle_replace_brick (xlator_t *this, call_frame_t *frame, loc_t *loc,
cb8e9e
         local->pending = NULL;
cb8e9e
         local->xdata_req = NULL;
cb8e9e
 
cb8e9e
-        ret = _afr_handle_replace_brick_type (this, frame, loc, rb_index,
cb8e9e
+        ret = _afr_handle_replace_brick_type (this, frame, &local->loc, rb_index,
cb8e9e
                                               AFR_ENTRY_TRANSACTION);
cb8e9e
         if (ret) {
cb8e9e
                 op_errno = -ret;
cb8e9e
@@ -1359,32 +1377,60 @@ afr_handle_replace_brick (xlator_t *this, call_frame_t *frame, loc_t *loc,
cb8e9e
 {
cb8e9e
         int             ret               = -1;
cb8e9e
         int             rb_index          = -1;
cb8e9e
+        int             op_errno          = EPERM;
cb8e9e
         char           *replace_brick     = NULL;
cb8e9e
+        afr_replace_brick_args_t *data    = NULL;
cb8e9e
 
cb8e9e
         ret =  dict_get_str (dict, GF_AFR_REPLACE_BRICK, &replace_brick);
cb8e9e
 
cb8e9e
         if (!ret) {
cb8e9e
                 if (frame->root->pid != GF_CLIENT_PID_AFR_SELF_HEALD) {
cb8e9e
+                        gf_log (this->name, GF_LOG_ERROR, "'%s' is an internal"
cb8e9e
+                                " extended attribute : %s.",
cb8e9e
+                                GF_AFR_REPLACE_BRICK, strerror (EPERM));
cb8e9e
                         ret = 1;
cb8e9e
                         goto out;
cb8e9e
                 }
cb8e9e
                 rb_index = afr_get_child_index_from_name (this, replace_brick);
cb8e9e
 
cb8e9e
-                if (rb_index < 0)
cb8e9e
+                if (rb_index < 0) {
cb8e9e
                          /* Didn't belong to this replica pair
cb8e9e
                           * Just do a no-op
cb8e9e
                           */
cb8e9e
                         AFR_STACK_UNWIND (setxattr, frame, 0, 0, NULL);
cb8e9e
-                else
cb8e9e
-                        _afr_handle_replace_brick (this, frame, loc, rb_index);
cb8e9e
+                        return 0;
cb8e9e
+                } else {
cb8e9e
+                        data = GF_CALLOC (1, sizeof (*data),
cb8e9e
+                                          gf_afr_mt_replace_brick_t);
cb8e9e
+                        if (!data) {
cb8e9e
+                                ret = 1;
cb8e9e
+                                op_errno = ENOMEM;
cb8e9e
+                                goto out;
cb8e9e
+                        }
cb8e9e
+                        data->frame = frame;
cb8e9e
+                        loc_copy (&data->loc, loc);
cb8e9e
+                        data->rb_index = rb_index;
cb8e9e
+                        ret = synctask_new (this->ctx->env,
cb8e9e
+                                            _afr_handle_replace_brick,
cb8e9e
+                                            _afr_handle_replace_brick_cbk,
cb8e9e
+                                            NULL, data);
cb8e9e
+                        if (ret) {
cb8e9e
+                                gf_msg (this->name, GF_LOG_ERROR, 0,
cb8e9e
+                                        AFR_MSG_REPLACE_BRICK_FAILED,
cb8e9e
+                                        "Failed to create synctask. Unable to "
cb8e9e
+                                        "perform replace-brick.");
cb8e9e
+                                ret = 1;
cb8e9e
+                                op_errno = ENOMEM;
cb8e9e
+                                loc_wipe (&data->loc);
cb8e9e
+                                GF_FREE (data);
cb8e9e
+                                goto out;
cb8e9e
+                        }
cb8e9e
+                }
cb8e9e
                 ret = 0;
cb8e9e
         }
cb8e9e
 out:
cb8e9e
         if (ret == 1) {
cb8e9e
-                gf_log (this->name, GF_LOG_ERROR, "'%s' is an internal"
cb8e9e
-                        " extended attribute : %s.",
cb8e9e
-                        GF_AFR_REPLACE_BRICK, strerror (EPERM));
cb8e9e
-                AFR_STACK_UNWIND (setxattr, frame, -1, EPERM, NULL);
cb8e9e
+                AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
cb8e9e
                 ret = 0;
cb8e9e
         }
cb8e9e
         return ret;
cb8e9e
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
cb8e9e
index fd484e4..6f1eee9 100644
cb8e9e
--- a/xlators/cluster/afr/src/afr-mem-types.h
cb8e9e
+++ b/xlators/cluster/afr/src/afr-mem-types.h
cb8e9e
@@ -45,6 +45,7 @@ enum gf_afr_mem_types_ {
cb8e9e
 	gf_afr_mt_subvol_healer_t,
cb8e9e
 	gf_afr_mt_spbc_timeout_t,
cb8e9e
         gf_afr_mt_spb_status_t,
cb8e9e
+        gf_afr_mt_replace_brick_t,
cb8e9e
         gf_afr_mt_end
cb8e9e
 };
cb8e9e
 #endif
cb8e9e
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
cb8e9e
index 4793413..d07c7bc 100644
cb8e9e
--- a/xlators/cluster/afr/src/afr-messages.h
cb8e9e
+++ b/xlators/cluster/afr/src/afr-messages.h
cb8e9e
@@ -45,7 +45,7 @@
cb8e9e
  */
cb8e9e
 
cb8e9e
 #define GLFS_COMP_BASE_AFR      GLFS_MSGID_COMP_AFR
cb8e9e
-#define GLFS_NUM_MESSAGES       38
cb8e9e
+#define GLFS_NUM_MESSAGES       39
cb8e9e
 #define GLFS_MSGID_END          (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1)
cb8e9e
 
cb8e9e
 #define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages"
cb8e9e
@@ -332,7 +332,6 @@
cb8e9e
 */
cb8e9e
 #define AFR_MSG_SELF_HEAL_FAILED                (GLFS_COMP_BASE_AFR + 37)
cb8e9e
 
cb8e9e
-
cb8e9e
 /*!
cb8e9e
  * @messageid 108038
cb8e9e
  * @diagnosis
cb8e9e
@@ -340,6 +339,13 @@
cb8e9e
 */
cb8e9e
 #define AFR_MSG_SPLIT_BRAIN_STATUS      (GLFS_COMP_BASE_AFR + 38)
cb8e9e
 
cb8e9e
+/*!
cb8e9e
+ * @messageid 108039
cb8e9e
+ * @diagnosis
cb8e9e
+ * @recommendedaction
cb8e9e
+*/
cb8e9e
+#define AFR_MSG_REPLACE_BRICK_FAILED                (GLFS_COMP_BASE_AFR + 39)
cb8e9e
+
cb8e9e
 
cb8e9e
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
cb8e9e
 #endif /* !_AFR_MESSAGES_H_ */
cb8e9e
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
cb8e9e
index 316eeea..19e764a 100644
cb8e9e
--- a/xlators/cluster/afr/src/afr.h
cb8e9e
+++ b/xlators/cluster/afr/src/afr.h
cb8e9e
@@ -760,6 +760,12 @@ typedef struct afr_spb_status {
cb8e9e
         loc_t        *loc;
cb8e9e
 } afr_spb_status_t;
cb8e9e
 
cb8e9e
+typedef struct afr_replace_brick_args {
cb8e9e
+        call_frame_t *frame;
cb8e9e
+        loc_t loc;
cb8e9e
+        int rb_index;
cb8e9e
+} afr_replace_brick_args_t;
cb8e9e
+
cb8e9e
 typedef struct afr_read_subvol_args {
cb8e9e
         ia_type_t ia_type;
cb8e9e
         uuid_t gfid;
cb8e9e
-- 
cb8e9e
1.7.1
cb8e9e