21ab4e
From 71f895fb6c0fd6f4146cf0b103be3ed4079d210e Mon Sep 17 00:00:00 2001
21ab4e
From: Mohit Agrawal <moagrawa@redhat.com>
21ab4e
Date: Tue, 8 Aug 2017 14:36:17 +0530
21ab4e
Subject: [PATCH 600/601] glusterd: Block brick attach request till the brick's
21ab4e
 ctx is set
21ab4e
MIME-Version: 1.0
21ab4e
Content-Type: text/plain; charset=UTF-8
21ab4e
Content-Transfer-Encoding: 8bit
21ab4e
21ab4e
Problem: In multiplexing setup in a container environment we hit a race
21ab4e
where before the first brick finishes its handshake with glusterd, the
21ab4e
subsequent attach requests went through and they actually failed and
21ab4e
glusterd has no mechanism to realize it. This resulted in all such
21ab4e
bricks not being active, resulting in clients not being able to connect.
21ab4e
21ab4e
Solution: Introduce a new flag port_registered in glusterd_brickinfo
21ab4e
          to make sure about pmap_signin finish before the subsequent
21ab4e
          attach bricks can be processed.
21ab4e
21ab4e
Test:     To reproduce the issue followed below steps
21ab4e
          1) Create 100 volumes on 3 nodes(1x3) in CNS environment
21ab4e
          2) Enable brick multiplexing
21ab4e
          3) Reboot one container
21ab4e
          4) Run below command
21ab4e
             for v in `gluster v list`
21ab4e
             do
21ab4e
               glfsheal $v | grep -i "transport"
21ab4e
             done
21ab4e
          After applying the patch, the command should not fail.
21ab4e
21ab4e
Note:   A big thanks to Atin for suggesting the fix.
21ab4e
21ab4e
>Reviewed-on: https://review.gluster.org/17984
21ab4e
>Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
21ab4e
>Smoke: Gluster Build System <jenkins@build.gluster.org>
21ab4e
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
21ab4e
>Reviewed-by: Jeff Darcy <jeff@pl.atyp.us>
21ab4e
21ab4e
BUG: 1477024
21ab4e
Change-Id: I8e1bd6132122b3a5b0dd49606cea564122f2609b
21ab4e
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
21ab4e
Reviewed-on: https://code.engineering.redhat.com/gerrit/114764
21ab4e
Tested-by: Atin Mukherjee <amukherj@redhat.com>
21ab4e
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
21ab4e
---
21ab4e
 xlators/mgmt/glusterd/src/glusterd-pmap.c  |  7 +++
21ab4e
 xlators/mgmt/glusterd/src/glusterd-utils.c | 73 ++++++++++++++++++++----------
21ab4e
 xlators/mgmt/glusterd/src/glusterd.h       |  1 +
21ab4e
 3 files changed, 57 insertions(+), 24 deletions(-)
21ab4e
21ab4e
diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c
21ab4e
index 6fead59..7acee19 100644
21ab4e
--- a/xlators/mgmt/glusterd/src/glusterd-pmap.c
21ab4e
+++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c
21ab4e
@@ -532,6 +532,9 @@ __gluster_pmap_signin (rpcsvc_request_t *req)
21ab4e
                                          GF_PMAP_PORT_BRICKSERVER, req->trans);
21ab4e
 
21ab4e
         ret = glusterd_get_brickinfo (THIS, args.brick, args.port, &brickinfo);
21ab4e
+        /* Update portmap status in brickinfo */
21ab4e
+        if (brickinfo)
21ab4e
+                brickinfo->port_registered = _gf_true;
21ab4e
 
21ab4e
 fail:
21ab4e
         glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
21ab4e
@@ -584,6 +587,10 @@ __gluster_pmap_signout (rpcsvc_request_t *req)
21ab4e
                                 brick_path, GF_PMAP_PORT_BRICKSERVER,
21ab4e
                                 req->trans);
21ab4e
         }
21ab4e
+        /* Update portmap status on brickinfo */
21ab4e
+        if (brickinfo)
21ab4e
+                brickinfo->port_registered = _gf_false;
21ab4e
+
21ab4e
         /* Clean up the pidfile for this brick given glusterfsd doesn't clean it
21ab4e
          * any more. This is required to ensure we don't end up with having
21ab4e
          * stale pid files in case a brick is killed from the backend
21ab4e
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
21ab4e
index 3afed1e..407e5b8 100644
21ab4e
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
21ab4e
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
21ab4e
@@ -2057,6 +2057,7 @@ retry:
21ab4e
         brickinfo->port = port;
21ab4e
         brickinfo->rdma_port = rdma_port;
21ab4e
         brickinfo->status = GF_BRICK_STARTING;
21ab4e
+        brickinfo->port_registered = _gf_false;
21ab4e
 
21ab4e
         if (wait) {
21ab4e
                 synclock_unlock (&priv->big_lock);
21ab4e
@@ -5339,13 +5340,6 @@ attach_brick (xlator_t *this,
21ab4e
 
21ab4e
         GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, unslashed);
21ab4e
 
21ab4e
-        ret = pmap_registry_extend (this, other_brick->port,
21ab4e
-                                    brickinfo->path);
21ab4e
-        if (ret != 0) {
21ab4e
-                gf_log (this->name, GF_LOG_ERROR,
21ab4e
-                        "adding brick to process failed");
21ab4e
-                return -1;
21ab4e
-        }
21ab4e
         GLUSTERD_GET_BRICK_PIDFILE (pidfile1, other_vol, other_brick, conf);
21ab4e
         GLUSTERD_GET_BRICK_PIDFILE (pidfile2, volinfo, brickinfo, conf);
21ab4e
 
21ab4e
@@ -5368,6 +5362,14 @@ attach_brick (xlator_t *this,
21ab4e
                                                GLUSTERD_BRICK_ATTACH);
21ab4e
                         rpc_clnt_unref (rpc);
21ab4e
                         if (!ret) {
21ab4e
+                                ret = pmap_registry_extend (this, other_brick->port,
21ab4e
+                                                            brickinfo->path);
21ab4e
+                                if (ret != 0) {
21ab4e
+                                        gf_log (this->name, GF_LOG_ERROR,
21ab4e
+                                                "adding brick to process failed");
21ab4e
+                                        return ret;
21ab4e
+                                }
21ab4e
+
21ab4e
                                 /* PID file is copied once brick has attached
21ab4e
                                   successfully
21ab4e
                                 */
21ab4e
@@ -5376,6 +5378,7 @@ attach_brick (xlator_t *this,
21ab4e
                                 brickinfo->status = GF_BRICK_STARTED;
21ab4e
                                 brickinfo->rpc =
21ab4e
                                         rpc_clnt_ref (other_brick->rpc);
21ab4e
+                                brickinfo->port_registered = _gf_true;
21ab4e
                                 ret = glusterd_brick_process_add_brick (brickinfo,
21ab4e
                                                                         volinfo);
21ab4e
                                 if (ret) {
21ab4e
@@ -5429,6 +5432,7 @@ find_compat_brick_in_vol (glusterd_conf_t *conf,
21ab4e
         int16_t                 retries                 = 15;
21ab4e
         int                     mux_limit               = -1;
21ab4e
         int                     ret                     = -1;
21ab4e
+        gf_boolean_t            brick_status            = _gf_false;
21ab4e
 
21ab4e
         /*
21ab4e
          * If comp_vol is provided, we have to check *volume* compatibility
21ab4e
@@ -5503,31 +5507,45 @@ find_compat_brick_in_vol (glusterd_conf_t *conf,
21ab4e
                                 "set. Continuing with no limit set for "
21ab4e
                                 "brick multiplexing.");
21ab4e
                 }
21ab4e
-
21ab4e
-                GLUSTERD_GET_BRICK_PIDFILE (pidfile2, srch_vol, other_brick,
21ab4e
-                                            conf);
21ab4e
-
21ab4e
-                /* It is possible that the pidfile hasn't yet been populated,
21ab4e
-                 * when bricks are started in "no-wait" mode; for example
21ab4e
-                 * when bricks are started by glusterd_restart_bricks(). So
21ab4e
-                 * wait for the pidfile to be populated with a value before
21ab4e
-                 * checking if the service is running */
21ab4e
+                /* The first brick process might take some time to finish its
21ab4e
+                 * handshake with glusterd and prepare the graph. We can't
21ab4e
+                 * afford to send attach_req for other bricks till that time.
21ab4e
+                 * brick process sends PMAP_SIGNIN event after processing the
21ab4e
+                 * volfile and hence it's safe to assume that if glusterd has
21ab4e
+                 * received a pmap signin request for the same brick, we are
21ab4e
+                 * good for subsequent attach requests.
21ab4e
+                 */
21ab4e
+                retries = 15;
21ab4e
                 while (retries > 0) {
21ab4e
-                        if (sys_access (pidfile2, F_OK) == 0 &&
21ab4e
-                            gf_is_service_running (pidfile2, &pid2)) {
21ab4e
-                                break;
21ab4e
+                        if (other_brick->port_registered) {
21ab4e
+                                GLUSTERD_GET_BRICK_PIDFILE (pidfile2, srch_vol,
21ab4e
+                                                            other_brick, conf);
21ab4e
+                                if (sys_access (pidfile2, F_OK) == 0 &&
21ab4e
+                                    gf_is_service_running (pidfile2, &pid2)) {
21ab4e
+                                        gf_msg_debug (this->name, 0,
21ab4e
+                                                      "brick %s is running as a pid %d ",
21ab4e
+                                                      other_brick->path, pid2);
21ab4e
+                                        brick_status = _gf_true;
21ab4e
+                                        break;
21ab4e
+                                }
21ab4e
                         }
21ab4e
 
21ab4e
-                        sleep (1);
21ab4e
+                        synclock_unlock (&conf->big_lock);
21ab4e
+                        gf_msg_debug (this->name, 0, "brick %s is still"
21ab4e
+                                      " starting, waiting for 2 seconds ",
21ab4e
+                                      other_brick->path);
21ab4e
+                        sleep(2);
21ab4e
+                        synclock_lock (&conf->big_lock);
21ab4e
                         retries--;
21ab4e
                 }
21ab4e
 
21ab4e
-                if (retries == 0) {
21ab4e
+                if (!brick_status) {
21ab4e
                         gf_log (this->name, GF_LOG_INFO,
21ab4e
-                                "cleaning up dead brick %s:%s",
21ab4e
+                                "brick has not come up so cleaning up dead brick %s:%s",
21ab4e
                                 other_brick->hostname, other_brick->path);
21ab4e
                         other_brick->status = GF_BRICK_STOPPED;
21ab4e
-                        sys_unlink (pidfile2);
21ab4e
+                        if (pidfile2[0])
21ab4e
+                                sys_unlink (pidfile2);
21ab4e
                         continue;
21ab4e
                 }
21ab4e
                 return other_brick;
21ab4e
@@ -5803,6 +5821,12 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
21ab4e
                                         brickinfo->path);
21ab4e
                                 goto out;
21ab4e
                         }
21ab4e
+                        /* We need to set the status back to STARTING so that
21ab4e
+                         * while the other (re)start brick requests come in for
21ab4e
+                         * other bricks, this brick can be considered as
21ab4e
+                         * compatible.
21ab4e
+                         */
21ab4e
+                        brickinfo->status = GF_BRICK_STARTING;
21ab4e
                 }
21ab4e
                 return 0;
21ab4e
         }
21ab4e
@@ -5846,7 +5870,8 @@ run:
21ab4e
          *
21ab4e
          * TBD: pray for GlusterD 2 to be ready soon.
21ab4e
          */
21ab4e
-
21ab4e
+        gf_log (this->name, GF_LOG_INFO, "starting a fresh brick process for "
21ab4e
+                "brick %s", brickinfo->path);
21ab4e
         ret = glusterd_volume_start_glusterfs (volinfo, brickinfo, wait);
21ab4e
         if (ret) {
21ab4e
                 gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
21ab4e
index c493773..533f9a8 100644
21ab4e
--- a/xlators/mgmt/glusterd/src/glusterd.h
21ab4e
+++ b/xlators/mgmt/glusterd/src/glusterd.h
21ab4e
@@ -226,6 +226,7 @@ struct glusterd_brickinfo {
21ab4e
          * a replica 3 volume with arbiter enabled.
21ab4e
          */
21ab4e
         uint16_t           group;
21ab4e
+        gf_boolean_t       port_registered;
21ab4e
 };
21ab4e
 
21ab4e
 typedef struct glusterd_brickinfo glusterd_brickinfo_t;
21ab4e
-- 
21ab4e
1.8.3.1
21ab4e