From 8a4daf2701e4689e76f54a8fefb29ddc0ce2a574 Mon Sep 17 00:00:00 2001
From: Atin Mukherjee <amukherj@redhat.com>
Date: Thu, 20 Jul 2017 18:11:14 +0530
Subject: [PATCH 570/576] glusterd: fix brick start race

Problem:

Another race: when glusterd is restarted, glusterd_brick_start () is
called multiple times due to friend handshaking. In one instance, when
one of the bricks was attempted to be attached to the existing brick
process, send_attach_req failed as the first brick itself was still not
up. We then did a synclock_unlock () followed by a sleep of 1 sec;
before the same thread woke up, another thread tried to start the same
brick process and assumed that it had to start a fresh brick process.

Solution:

1. If the brick is in the starting phase (brickinfo->status ==
   GF_BRICK_STARTING), there is no need to reattempt to start the brick.
2. While initiating an attach_req, set brickinfo->status to
   GF_BRICK_STARTING.

>Reviewed-on: https://review.gluster.org/17840
>Reviewed-by: Amar Tumballi <amarts@redhat.com>
>Smoke: Gluster Build System <jenkins@build.gluster.org>
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
>Reviewed-by: Jeff Darcy <jeff@pl.atyp.us>

Change-Id: Ib007b6199ec36fdab4214a1d37f99d7f65ef64da
BUG: 1473327
Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/113212
---
 xlators/mgmt/glusterd/src/glusterd-utils.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 55f7089..3afed1e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -5742,6 +5742,19 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
                 goto out;
         }
 
+        /* If a trigger to start the brick is already initiated then no need for
+         * a reattempt as it's an overkill. With glusterd_brick_start ()
+         * function being used in multiple places, when glusterd restarts we see
+         * three different triggers for an attempt to start the brick process
+         * due to the quorum handling code in glusterd_friend_sm.
+         */
+        if (brickinfo->status == GF_BRICK_STARTING) {
+                gf_msg_debug (this->name, 0, "brick %s is already in starting "
+                              "phase", brickinfo->path);
+                ret = 0;
+                goto out;
+        }
+
         GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf);
         if (gf_is_service_running (pidfile, &pid)) {
                 if (brickinfo->status != GF_BRICK_STARTING &&
@@ -5802,6 +5815,12 @@ run:
         other_brick = find_compatible_brick (conf, volinfo, brickinfo,
                                              &other_vol);
         if (other_brick) {
+                /* mark the brick to starting as send_attach_req might take few
+                 * iterations to successfully attach the brick and we don't want
+                 * to get into a state where another needless trigger to start
+                 * the brick is processed
+                 */
+                brickinfo->status = GF_BRICK_STARTING;
                 ret = attach_brick (this, brickinfo, other_brick,
                                     volinfo, other_vol);
                 if (ret == 0) {
-- 
1.8.3.1