21ab4e
From 615dd37c770519bf3a91445187afcbf6ab9922e5 Mon Sep 17 00:00:00 2001
21ab4e
From: Atin Mukherjee <amukherj@redhat.com>
21ab4e
Date: Thu, 1 Jun 2017 22:05:51 +0530
21ab4e
Subject: [PATCH 489/509] glusterd: fix brick start race
21ab4e
21ab4e
This commit tries to handle a race where we might end up trying to spawn
21ab4e
the brick process twice with two different set of ports resulting into
21ab4e
glusterd portmapper having the same brick entry in two different ports
21ab4e
which will result into clients to fail connect to bricks because of
21ab4e
incorrect ports been communicated back by glusterd.
21ab4e
21ab4e
In glusterd_brick_start () checking brickinfo->status flag to identify
21ab4e
whether a brick has been started by glusterd or not is not sufficient as
21ab4e
there might be cases where while glusterd restarts
21ab4e
glusterd_restart_bricks () will be called through glusterd_spawn_daemons
21ab4e
() in synctask and immediately glusterd_do_volume_quorum_action () with
21ab4e
server-side-quorum set to on will again try to start the brick and in
21ab4e
case if the RPC_CLNT_CONNECT event for the same brick  hasn't been processed by
21ab4e
glusterd by that time, brickinfo->status will still be marked as
21ab4e
GF_BRICK_STOPPED resulting into a reattempt to start the brick with a different
21ab4e
port and that would result portmap go for a toss and resulting clients to fetch
21ab4e
incorrect port.
21ab4e
21ab4e
Fix would be to introduce another enum value called GF_BRICK_STARTING in
21ab4e
brickinfo->status which will be set when a brick start is attempted by
21ab4e
glusterd and will be set to started through RPC_CLNT_CONNECT event. For
21ab4e
brick multiplexing, on attach brick request given the brickinfo->status
21ab4e
flag is marked to started directly this value will not have any effect.
21ab4e
Also this patch removes started_here flag as it looks to be redundant as
21ab4e
brickinfo->status.
21ab4e
21ab4e
>Reviewed-on: https://review.gluster.org/17447
21ab4e
>Smoke: Gluster Build System <jenkins@build.gluster.org>
21ab4e
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
21ab4e
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
21ab4e
>Reviewed-by: Jeff Darcy <jeff@pl.atyp.us>
21ab4e
21ab4e
Change-Id: I9dda1a9a531b67734a6e8c7619677867b520dcb2
21ab4e
BUG: 1451756
21ab4e
Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
21ab4e
Reviewed-on: https://code.engineering.redhat.com/gerrit/108338
21ab4e
---
21ab4e
 tests/volume.rc                              |  5 -----
21ab4e
 xlators/mgmt/glusterd/src/glusterd-handler.c |  3 ---
21ab4e
 xlators/mgmt/glusterd/src/glusterd-op-sm.c   |  2 --
21ab4e
 xlators/mgmt/glusterd/src/glusterd-pmap.c    |  1 -
21ab4e
 xlators/mgmt/glusterd/src/glusterd-utils.c   | 23 +++++++++--------------
21ab4e
 xlators/mgmt/glusterd/src/glusterd.h         |  2 +-
21ab4e
 6 files changed, 10 insertions(+), 26 deletions(-)
21ab4e
21ab4e
diff --git a/tests/volume.rc b/tests/volume.rc
21ab4e
index b3786c3..495d8e5 100644
21ab4e
--- a/tests/volume.rc
21ab4e
+++ b/tests/volume.rc
21ab4e
@@ -271,11 +271,6 @@ function kill_brick {
21ab4e
 	local socket=$(cat $cmdline | tr '\0' '\n' | grep '\.socket$')
21ab4e
 
21ab4e
 	gf_attach -d $socket $brick
21ab4e
-	# Since we're not going through glusterd, we need to clean up the
21ab4e
-	# pidfile ourselves.  However, other state in glusterd (e.g.
21ab4e
-	# started_here) won't be updated.  A "stop-brick" CLI command would
21ab4e
-	# sure be useful.
21ab4e
-	rm -f $pidfile
21ab4e
 
21ab4e
 	# When the last brick in a process is terminated, the process has to
21ab4e
 	# sleep for a second to give the RPC response a chance to get back to
21ab4e
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
21ab4e
index 1fd7813..3c793d9 100644
21ab4e
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
21ab4e
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
21ab4e
@@ -5652,7 +5652,6 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
21ab4e
                         brickinfo->hostname, brickinfo->path);
21ab4e
 
21ab4e
                 glusterd_set_brick_status (brickinfo, GF_BRICK_STARTED);
21ab4e
-                brickinfo->started_here = _gf_true;
21ab4e
 
21ab4e
                 gf_event (EVENT_BRICK_CONNECTED, "peer=%s;volume=%s;brick=%s",
21ab4e
                           brickinfo->hostname, volinfo->volname,
21ab4e
@@ -5684,8 +5683,6 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
21ab4e
                                 "Brick %s:%s has disconnected from glusterd.",
21ab4e
                                 brickinfo->hostname, brickinfo->path);
21ab4e
 
21ab4e
-                        brickinfo->started_here = _gf_false;
21ab4e
-
21ab4e
                         ret = get_volinfo_from_brickid (brickid, &volinfo);
21ab4e
                         if (ret) {
21ab4e
                                 gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
21ab4e
index f82a8f1..5fbf15f 100644
21ab4e
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
21ab4e
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
21ab4e
@@ -6210,7 +6210,6 @@ glusterd_bricks_select_stop_volume (dict_t *dict, char **op_errstr,
21ab4e
                          * TBD: move this to *after* the RPC
21ab4e
                          */
21ab4e
                         brickinfo->status = GF_BRICK_STOPPED;
21ab4e
-                        brickinfo->started_here = _gf_false;
21ab4e
                 }
21ab4e
         }
21ab4e
 
21ab4e
@@ -6311,7 +6310,6 @@ glusterd_bricks_select_remove_brick (dict_t *dict, char **op_errstr,
21ab4e
                          * TBD: move this to *after* the RPC
21ab4e
                          */
21ab4e
                         brickinfo->status = GF_BRICK_STOPPED;
21ab4e
-                        brickinfo->started_here = _gf_false;
21ab4e
                 }
21ab4e
                 i++;
21ab4e
         }
21ab4e
diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c
21ab4e
index 13b347e..f43324c 100644
21ab4e
--- a/xlators/mgmt/glusterd/src/glusterd-pmap.c
21ab4e
+++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c
21ab4e
@@ -593,7 +593,6 @@ __gluster_pmap_signout (rpcsvc_request_t *req)
21ab4e
                         GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo,
21ab4e
                                                     conf);
21ab4e
                         sys_unlink (pidfile);
21ab4e
-                        brickinfo->started_here = _gf_false;
21ab4e
 
21ab4e
                         /* Setting the brick status to GF_BRICK_STOPPED to
21ab4e
                          * ensure correct brick status is maintained on the
21ab4e
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
21ab4e
index 903ee80..5450be4 100644
21ab4e
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
21ab4e
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
21ab4e
@@ -1972,7 +1972,7 @@ retry:
21ab4e
 
21ab4e
         brickinfo->port = port;
21ab4e
         brickinfo->rdma_port = rdma_port;
21ab4e
-        brickinfo->started_here = _gf_true;
21ab4e
+        brickinfo->status = GF_BRICK_STARTING;
21ab4e
 
21ab4e
         if (wait) {
21ab4e
                 synclock_unlock (&priv->big_lock);
21ab4e
@@ -2020,6 +2020,8 @@ connect:
21ab4e
         }
21ab4e
 
21ab4e
 out:
21ab4e
+        if (ret)
21ab4e
+                brickinfo->status = GF_BRICK_STOPPED;
21ab4e
         return ret;
21ab4e
 }
21ab4e
 
21ab4e
@@ -2131,7 +2133,6 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
21ab4e
         gf_msg_debug (this->name,  0, "Unlinking pidfile %s", pidfile);
21ab4e
         (void) sys_unlink (pidfile);
21ab4e
 
21ab4e
-        brickinfo->started_here = _gf_false;
21ab4e
         brickinfo->status = GF_BRICK_STOPPED;
21ab4e
 
21ab4e
         if (del_brick)
21ab4e
@@ -5053,7 +5054,6 @@ attach_brick (xlator_t *this,
21ab4e
 
21ab4e
         brickinfo->port = other_brick->port;
21ab4e
         brickinfo->status = GF_BRICK_STARTED;
21ab4e
-        brickinfo->started_here = _gf_true;
21ab4e
         brickinfo->rpc = rpc_clnt_ref (other_brick->rpc);
21ab4e
 
21ab4e
         GLUSTERD_GET_BRICK_PIDFILE (pidfile1, other_vol, other_brick, conf);
21ab4e
@@ -5200,10 +5200,11 @@ find_compat_brick_in_vol (glusterd_conf_t *conf,
21ab4e
                 if (other_brick == brickinfo) {
21ab4e
                         continue;
21ab4e
                 }
21ab4e
-                if (!other_brick->started_here) {
21ab4e
+                if (strcmp (brickinfo->hostname, other_brick->hostname) != 0) {
21ab4e
                         continue;
21ab4e
                 }
21ab4e
-                if (strcmp (brickinfo->hostname, other_brick->hostname) != 0) {
21ab4e
+                if (other_brick->status != GF_BRICK_STARTED &&
21ab4e
+                    other_brick->status != GF_BRICK_STARTING) {
21ab4e
                         continue;
21ab4e
                 }
21ab4e
 
21ab4e
@@ -5229,7 +5230,7 @@ find_compat_brick_in_vol (glusterd_conf_t *conf,
21ab4e
                         gf_log (this->name, GF_LOG_INFO,
21ab4e
                                 "cleaning up dead brick %s:%s",
21ab4e
                                 other_brick->hostname, other_brick->path);
21ab4e
-                        other_brick->started_here = _gf_false;
21ab4e
+                        other_brick->status = GF_BRICK_STOPPED;
21ab4e
                         sys_unlink (pidfile2);
21ab4e
                         continue;
21ab4e
                 }
21ab4e
@@ -5435,16 +5436,11 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
21ab4e
 
21ab4e
         GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf);
21ab4e
         if (gf_is_service_running (pidfile, &pid)) {
21ab4e
-                /*
21ab4e
-                 * In general, if the pidfile exists and points to a running
21ab4e
-                 * process, this will already be set.  However, that's not the
21ab4e
-                 * case when we're starting up and bricks are already running.
21ab4e
-                 */
21ab4e
-                if (brickinfo->status != GF_BRICK_STARTED) {
21ab4e
+                if (brickinfo->status != GF_BRICK_STARTING &&
21ab4e
+                    brickinfo->status != GF_BRICK_STARTED) {
21ab4e
                         gf_log (this->name, GF_LOG_INFO,
21ab4e
                                 "discovered already-running brick %s",
21ab4e
                                 brickinfo->path);
21ab4e
-                        //brickinfo->status = GF_BRICK_STARTED;
21ab4e
                         (void) pmap_registry_bind (this,
21ab4e
                                         brickinfo->port, brickinfo->path,
21ab4e
                                         GF_PMAP_PORT_BRICKSERVER, NULL);
21ab4e
@@ -5468,7 +5464,6 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
21ab4e
                                 socketpath, brickinfo->path, volinfo->volname);
21ab4e
                         (void) glusterd_brick_connect (volinfo, brickinfo,
21ab4e
                                         socketpath);
21ab4e
-                        brickinfo->started_here = _gf_true;
21ab4e
                 }
21ab4e
                 return 0;
21ab4e
         }
21ab4e
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
21ab4e
index dde06bd..b5a0a3b 100644
21ab4e
--- a/xlators/mgmt/glusterd/src/glusterd.h
21ab4e
+++ b/xlators/mgmt/glusterd/src/glusterd.h
21ab4e
@@ -192,6 +192,7 @@ typedef enum gf_brick_status {
21ab4e
         GF_BRICK_STOPPED,
21ab4e
         GF_BRICK_STARTED,
21ab4e
         GF_BRICK_STOPPING,
21ab4e
+        GF_BRICK_STARTING
21ab4e
 } gf_brick_status_t;
21ab4e
 
21ab4e
 struct glusterd_brickinfo {
21ab4e
@@ -223,7 +224,6 @@ struct glusterd_brickinfo {
21ab4e
          * a replica 3 volume with arbiter enabled.
21ab4e
          */
21ab4e
         uint16_t           group;
21ab4e
-        gf_boolean_t       started_here;
21ab4e
 };
21ab4e
 
21ab4e
 typedef struct glusterd_brickinfo glusterd_brickinfo_t;
21ab4e
-- 
21ab4e
1.8.3.1
21ab4e