From c1fb83040ecc324c503d93dfd800c5bdc677428c Mon Sep 17 00:00:00 2001
From: Jeff Darcy <jdarcy@redhat.com>
Date: Thu, 2 Feb 2017 13:08:04 -0500
Subject: [PATCH 324/361] glusterd: double-check brick liveness for
remove-brick validation
This is the same problem as https://review.gluster.org/#/c/16509/, but in a
different place (remove-brick validation). Tests detach bricks without
glusterd's knowledge, so glusterd's internal brick state is out of date;
we therefore have to re-check brick liveness (via the brick's pidfile) as well.
mainline:
> BUG: 1385758
> Reviewed-on: https://review.gluster.org/16529
> Smoke: Gluster Build System <jenkins@build.gluster.org>
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
> Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
(cherry picked from commit 13cd11a91ec52af6a7cfcbd7e0c34f1c27904df6)
BUG: 1417815
Change-Id: I169538c1c62d72a685a49d57ef65fb6c3db6eab2
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/101305
Tested-by: Milind Changire <mchangir@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
...-1225716-brick-online-validation-remove-brick.t | 6 ++++--
xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 22 +++++++++++++++++++---
2 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t b/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t
index eca1c1a..47403b4 100644
--- a/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t
+++ b/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t
@@ -12,7 +12,8 @@ TEST $CLI volume create $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 $H0:$B0/${V0}2
TEST $CLI volume start $V0
#kill a brick process
-kill -15 `cat $GLUSTERD_WORKDIR/vols/$V0/run/$H0-d-backends-${V0}1.pid`;
+kill_brick $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1
#remove-brick start should fail as the brick is down
TEST ! $CLI volume remove-brick $V0 $H0:$B0/${V0}1 start
@@ -26,7 +27,8 @@ TEST $CLI volume remove-brick $V0 $H0:$B0/${V0}1 start
EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" remove_brick_status_completed_field "$V0 $H0:$B0/${V0}1"
#kill a brick process
-kill -15 `cat $GLUSTERD_WORKDIR/vols/$V0/run/$H0-d-backends-${V0}1.pid`;
+kill_brick $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1
#remove-brick commit should pass even if the brick is down
TEST $CLI volume remove-brick $V0 $H0:$B0/${V0}1 commit
diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
index b22a7da..e12d314 100644
--- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
@@ -1947,6 +1947,8 @@ glusterd_remove_brick_validate_bricks (gf1_op_commands cmd, int32_t brick_count,
glusterd_peerinfo_t *peerinfo = NULL;
int i = 0;
int ret = -1;
+ char pidfile[PATH_MAX+1] = {0,};
+ glusterd_conf_t *priv = THIS->private;
/* Check whether all the nodes of the bricks to be removed are
* up, if not fail the operation */
@@ -1996,15 +1998,29 @@ glusterd_remove_brick_validate_bricks (gf1_op_commands cmd, int32_t brick_count,
}
if (glusterd_is_local_brick (THIS, volinfo, brickinfo)) {
- if (((cmd == GF_OP_CMD_START) ||
- (cmd == GF_OP_CMD_DETACH_START)) &&
- brickinfo->status != GF_BRICK_STARTED) {
+ switch (cmd) {
+ case GF_OP_CMD_START:
+ case GF_OP_CMD_DETACH_START:
+ break;
+ default:
+ continue;
+ }
+ if (brickinfo->status != GF_BRICK_STARTED) {
snprintf (msg, sizeof (msg), "Found stopped "
"brick %s", brick);
*errstr = gf_strdup (msg);
ret = -1;
goto out;
}
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo,
+ brickinfo, priv);
+ if (!gf_is_service_running (pidfile, NULL)) {
+ snprintf (msg, sizeof (msg), "Found dead "
+ "brick %s", brick);
+ *errstr = gf_strdup (msg);
+ ret = -1;
+ goto out;
+ }
continue;
}
--
1.8.3.1