Blob Blame History Raw
From 615dd37c770519bf3a91445187afcbf6ab9922e5 Mon Sep 17 00:00:00 2001
From: Atin Mukherjee <amukherj@redhat.com>
Date: Thu, 1 Jun 2017 22:05:51 +0530
Subject: [PATCH 489/509] glusterd: fix brick start race

This commit tries to handle a race where we might end up trying to spawn
the brick process twice with two different set of ports resulting into
glusterd portmapper having the same brick entry in two different ports
which will result into clients to fail connect to bricks because of
incorrect ports been communicated back by glusterd.

In glusterd_brick_start () checking brickinfo->status flag to identify
whether a brick has been started by glusterd or not is not sufficient as
there might be cases where while glusterd restarts
glusterd_restart_bricks () will be called through glusterd_spawn_daemons
() in synctask and immediately glusterd_do_volume_quorum_action () with
server-side-quorum set to on will again try to start the brick and in
case if the RPC_CLNT_CONNECT event for the same brick  hasn't been processed by
glusterd by that time, brickinfo->status will still be marked as
GF_BRICK_STOPPED resulting into a reattempt to start the brick with a different
port and that would result portmap go for a toss and resulting clients to fetch
incorrect port.

Fix would be to introduce another enum value called GF_BRICK_STARTING in
brickinfo->status which will be set when a brick start is attempted by
glusterd and will be set to started through RPC_CLNT_CONNECT event. For
brick multiplexing, on attach brick request given the brickinfo->status
flag is marked to started directly this value will not have any effect.
Also this patch removes started_here flag as it looks to be redundant as
brickinfo->status.

>Reviewed-on: https://review.gluster.org/17447
>Smoke: Gluster Build System <jenkins@build.gluster.org>
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
>Reviewed-by: Jeff Darcy <jeff@pl.atyp.us>

Change-Id: I9dda1a9a531b67734a6e8c7619677867b520dcb2
BUG: 1451756
Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/108338
---
 tests/volume.rc                              |  5 -----
 xlators/mgmt/glusterd/src/glusterd-handler.c |  3 ---
 xlators/mgmt/glusterd/src/glusterd-op-sm.c   |  2 --
 xlators/mgmt/glusterd/src/glusterd-pmap.c    |  1 -
 xlators/mgmt/glusterd/src/glusterd-utils.c   | 23 +++++++++--------------
 xlators/mgmt/glusterd/src/glusterd.h         |  2 +-
 6 files changed, 10 insertions(+), 26 deletions(-)

diff --git a/tests/volume.rc b/tests/volume.rc
index b3786c3..495d8e5 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -271,11 +271,6 @@ function kill_brick {
 	local socket=$(cat $cmdline | tr '\0' '\n' | grep '\.socket$')
 
 	gf_attach -d $socket $brick
-	# Since we're not going through glusterd, we need to clean up the
-	# pidfile ourselves.  However, other state in glusterd (e.g.
-	# started_here) won't be updated.  A "stop-brick" CLI command would
-	# sure be useful.
-	rm -f $pidfile
 
 	# When the last brick in a process is terminated, the process has to
 	# sleep for a second to give the RPC response a chance to get back to
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index 1fd7813..3c793d9 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -5652,7 +5652,6 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
                         brickinfo->hostname, brickinfo->path);
 
                 glusterd_set_brick_status (brickinfo, GF_BRICK_STARTED);
-                brickinfo->started_here = _gf_true;
 
                 gf_event (EVENT_BRICK_CONNECTED, "peer=%s;volume=%s;brick=%s",
                           brickinfo->hostname, volinfo->volname,
@@ -5684,8 +5683,6 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
                                 "Brick %s:%s has disconnected from glusterd.",
                                 brickinfo->hostname, brickinfo->path);
 
-                        brickinfo->started_here = _gf_false;
-
                         ret = get_volinfo_from_brickid (brickid, &volinfo);
                         if (ret) {
                                 gf_msg (this->name, GF_LOG_ERROR, 0,
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
index f82a8f1..5fbf15f 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
@@ -6210,7 +6210,6 @@ glusterd_bricks_select_stop_volume (dict_t *dict, char **op_errstr,
                          * TBD: move this to *after* the RPC
                          */
                         brickinfo->status = GF_BRICK_STOPPED;
-                        brickinfo->started_here = _gf_false;
                 }
         }
 
@@ -6311,7 +6310,6 @@ glusterd_bricks_select_remove_brick (dict_t *dict, char **op_errstr,
                          * TBD: move this to *after* the RPC
                          */
                         brickinfo->status = GF_BRICK_STOPPED;
-                        brickinfo->started_here = _gf_false;
                 }
                 i++;
         }
diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c
index 13b347e..f43324c 100644
--- a/xlators/mgmt/glusterd/src/glusterd-pmap.c
+++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c
@@ -593,7 +593,6 @@ __gluster_pmap_signout (rpcsvc_request_t *req)
                         GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo,
                                                     conf);
                         sys_unlink (pidfile);
-                        brickinfo->started_here = _gf_false;
 
                         /* Setting the brick status to GF_BRICK_STOPPED to
                          * ensure correct brick status is maintained on the
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 903ee80..5450be4 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -1972,7 +1972,7 @@ retry:
 
         brickinfo->port = port;
         brickinfo->rdma_port = rdma_port;
-        brickinfo->started_here = _gf_true;
+        brickinfo->status = GF_BRICK_STARTING;
 
         if (wait) {
                 synclock_unlock (&priv->big_lock);
@@ -2020,6 +2020,8 @@ connect:
         }
 
 out:
+        if (ret)
+                brickinfo->status = GF_BRICK_STOPPED;
         return ret;
 }
 
@@ -2131,7 +2133,6 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
         gf_msg_debug (this->name,  0, "Unlinking pidfile %s", pidfile);
         (void) sys_unlink (pidfile);
 
-        brickinfo->started_here = _gf_false;
         brickinfo->status = GF_BRICK_STOPPED;
 
         if (del_brick)
@@ -5053,7 +5054,6 @@ attach_brick (xlator_t *this,
 
         brickinfo->port = other_brick->port;
         brickinfo->status = GF_BRICK_STARTED;
-        brickinfo->started_here = _gf_true;
         brickinfo->rpc = rpc_clnt_ref (other_brick->rpc);
 
         GLUSTERD_GET_BRICK_PIDFILE (pidfile1, other_vol, other_brick, conf);
@@ -5200,10 +5200,11 @@ find_compat_brick_in_vol (glusterd_conf_t *conf,
                 if (other_brick == brickinfo) {
                         continue;
                 }
-                if (!other_brick->started_here) {
+                if (strcmp (brickinfo->hostname, other_brick->hostname) != 0) {
                         continue;
                 }
-                if (strcmp (brickinfo->hostname, other_brick->hostname) != 0) {
+                if (other_brick->status != GF_BRICK_STARTED &&
+                    other_brick->status != GF_BRICK_STARTING) {
                         continue;
                 }
 
@@ -5229,7 +5230,7 @@ find_compat_brick_in_vol (glusterd_conf_t *conf,
                         gf_log (this->name, GF_LOG_INFO,
                                 "cleaning up dead brick %s:%s",
                                 other_brick->hostname, other_brick->path);
-                        other_brick->started_here = _gf_false;
+                        other_brick->status = GF_BRICK_STOPPED;
                         sys_unlink (pidfile2);
                         continue;
                 }
@@ -5435,16 +5436,11 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
 
         GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf);
         if (gf_is_service_running (pidfile, &pid)) {
-                /*
-                 * In general, if the pidfile exists and points to a running
-                 * process, this will already be set.  However, that's not the
-                 * case when we're starting up and bricks are already running.
-                 */
-                if (brickinfo->status != GF_BRICK_STARTED) {
+                if (brickinfo->status != GF_BRICK_STARTING &&
+                    brickinfo->status != GF_BRICK_STARTED) {
                         gf_log (this->name, GF_LOG_INFO,
                                 "discovered already-running brick %s",
                                 brickinfo->path);
-                        //brickinfo->status = GF_BRICK_STARTED;
                         (void) pmap_registry_bind (this,
                                         brickinfo->port, brickinfo->path,
                                         GF_PMAP_PORT_BRICKSERVER, NULL);
@@ -5468,7 +5464,6 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
                                 socketpath, brickinfo->path, volinfo->volname);
                         (void) glusterd_brick_connect (volinfo, brickinfo,
                                         socketpath);
-                        brickinfo->started_here = _gf_true;
                 }
                 return 0;
         }
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index dde06bd..b5a0a3b 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -192,6 +192,7 @@ typedef enum gf_brick_status {
         GF_BRICK_STOPPED,
         GF_BRICK_STARTED,
         GF_BRICK_STOPPING,
+        GF_BRICK_STARTING
 } gf_brick_status_t;
 
 struct glusterd_brickinfo {
@@ -223,7 +224,6 @@ struct glusterd_brickinfo {
          * a replica 3 volume with arbiter enabled.
          */
         uint16_t           group;
-        gf_boolean_t       started_here;
 };
 
 typedef struct glusterd_brickinfo glusterd_brickinfo_t;
-- 
1.8.3.1