From 8a4daf2701e4689e76f54a8fefb29ddc0ce2a574 Mon Sep 17 00:00:00 2001 From: Atin Mukherjee Date: Thu, 20 Jul 2017 18:11:14 +0530 Subject: [PATCH 570/576] glusterd: fix brick start race Problem: This is another race. When glusterd is restarted, glusterd_brick_start () is called multiple times due to friend handshaking. In one instance, when one of the bricks was being attached to the existing brick process, send_attach_req failed because the first brick itself was still not up, and we then did a synlock_unlock () followed by a sleep of 1 sec. Before the same thread woke up, another thread tried to start the same brick process and assumed that it had to start a fresh brick process. Solution: 1. If the brick is in the starting phase (brickinfo->status == GF_BRICK_STARTING), there is no need for a reattempt to start the brick. 2. While initiating attach_req, set brickinfo->status to GF_BRICK_STARTING. >Reviewed-on: https://review.gluster.org/17840 >Reviewed-by: Amar Tumballi >Smoke: Gluster Build System >CentOS-regression: Gluster Build System >Reviewed-by: Jeff Darcy Change-Id: Ib007b6199ec36fdab4214a1d37f99d7f65ef64da BUG: 1473327 Signed-off-by: Atin Mukherjee Reviewed-on: https://code.engineering.redhat.com/gerrit/113212 --- xlators/mgmt/glusterd/src/glusterd-utils.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 55f7089..3afed1e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -5742,6 +5742,19 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo, goto out; } + /* If a trigger to start the brick is already initiated then no need for + * a reattempt as it's an overkill. With glusterd_brick_start () + * function being used in multiple places, when glusterd restarts we see + * three different triggers for an attempt to start the brick process + * due to the quorum handling code in glusterd_friend_sm. 
+ */ + if (brickinfo->status == GF_BRICK_STARTING) { + gf_msg_debug (this->name, 0, "brick %s is already in starting " + "phase", brickinfo->path); + ret = 0; + goto out; + } + GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf); if (gf_is_service_running (pidfile, &pid)) { if (brickinfo->status != GF_BRICK_STARTING && @@ -5802,6 +5815,12 @@ run: other_brick = find_compatible_brick (conf, volinfo, brickinfo, &other_vol); if (other_brick) { + /* mark the brick to starting as send_attach_req might take few + * iterations to successfully attach the brick and we don't want + * to get into a state where another needless trigger to start + * the brick is processed + */ + brickinfo->status = GF_BRICK_STARTING; ret = attach_brick (this, brickinfo, other_brick, volinfo, other_vol); if (ret == 0) { -- 1.8.3.1