From c1fb83040ecc324c503d93dfd800c5bdc677428c Mon Sep 17 00:00:00 2001 From: Jeff Darcy Date: Thu, 2 Feb 2017 13:08:04 -0500 Subject: [PATCH 324/361] glusterd: double-check brick liveness for remove-brick validation Same problem as https://review.gluster.org/#/c/16509/ in a different place. Tests detach bricks without glusterd's knowledge, so glusterd's internal brick state is out of date and we have to re-check (via the brick's pidfile) as well. mainline: > BUG: 1385758 > Reviewed-on: https://review.gluster.org/16529 > Smoke: Gluster Build System > NetBSD-regression: NetBSD Build System > CentOS-regression: Gluster Build System > Reviewed-by: Atin Mukherjee (cherry picked from commit 13cd11a91ec52af6a7cfcbd7e0c34f1c27904df6) BUG: 1417815 Change-Id: I169538c1c62d72a685a49d57ef65fb6c3db6eab2 Signed-off-by: Jeff Darcy Reviewed-on: https://code.engineering.redhat.com/gerrit/101305 Tested-by: Milind Changire Reviewed-by: Atin Mukherjee --- ...-1225716-brick-online-validation-remove-brick.t | 6 ++++-- xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 22 +++++++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t b/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t index eca1c1a..47403b4 100644 --- a/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t +++ b/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t @@ -12,7 +12,8 @@ TEST $CLI volume create $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 $H0:$B0/${V0}2 TEST $CLI volume start $V0 #kill a brick process -kill -15 `cat $GLUSTERD_WORKDIR/vols/$V0/run/$H0-d-backends-${V0}1.pid`; +kill_brick $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1 #remove-brick start should fail as the brick is down TEST ! $CLI volume remove-brick $V0 $H0:$B0/${V0}1 start @@ -26,7 +27,8 @@ TEST $CLI volume remove-brick $V0 $H0:$B0/${V0}1 start EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" remove_brick_status_completed_field "$V0 $H0:$B0/${V0}1" #kill a brick process -kill -15 `cat $GLUSTERD_WORKDIR/vols/$V0/run/$H0-d-backends-${V0}1.pid`; +kill_brick $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1 #remove-brick commit should pass even if the brick is down TEST $CLI volume remove-brick $V0 $H0:$B0/${V0}1 commit diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index b22a7da..e12d314 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -1947,6 +1947,8 @@ glusterd_remove_brick_validate_bricks (gf1_op_commands cmd, int32_t brick_count, glusterd_peerinfo_t *peerinfo = NULL; int i = 0; int ret = -1; + char pidfile[PATH_MAX+1] = {0,}; + glusterd_conf_t *priv = THIS->private; /* Check whether all the nodes of the bricks to be removed are * up, if not fail the operation */ @@ -1996,15 +1998,29 @@ glusterd_remove_brick_validate_bricks (gf1_op_commands cmd, int32_t brick_count, } if (glusterd_is_local_brick (THIS, volinfo, brickinfo)) { - if (((cmd == GF_OP_CMD_START) || - (cmd == GF_OP_CMD_DETACH_START)) && - brickinfo->status != GF_BRICK_STARTED) { + switch (cmd) { + case GF_OP_CMD_START: + case GF_OP_CMD_DETACH_START: + break; + default: + continue; + } + if (brickinfo->status != GF_BRICK_STARTED) { snprintf (msg, sizeof (msg), "Found stopped " "brick %s", brick); *errstr = gf_strdup (msg); ret = -1; goto out; } + GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, + brickinfo, priv); + if (!gf_is_service_running (pidfile, NULL)) { + snprintf (msg, sizeof (msg), "Found dead " + "brick %s", brick); + *errstr = gf_strdup (msg); + ret = -1; + goto out; + } continue; } -- 1.8.3.1