Blob Blame History Raw
From c1fb83040ecc324c503d93dfd800c5bdc677428c Mon Sep 17 00:00:00 2001
From: Jeff Darcy <jdarcy@redhat.com>
Date: Thu, 2 Feb 2017 13:08:04 -0500
Subject: [PATCH 324/361] glusterd: double-check brick liveness for
 remove-brick validation

Same problem as https://review.gluster.org/#/c/16509/ in a different
place.  Tests detach bricks without glusterd's knowledge, so
glusterd's internal brick state is out of date and we have to re-check
(via the brick's pidfile) as well.

mainline:
> BUG: 1385758
> Reviewed-on: https://review.gluster.org/16529
> Smoke: Gluster Build System <jenkins@build.gluster.org>
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
> Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
(cherry picked from commit 13cd11a91ec52af6a7cfcbd7e0c34f1c27904df6)

BUG: 1417815
Change-Id: I169538c1c62d72a685a49d57ef65fb6c3db6eab2
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/101305
Tested-by: Milind Changire <mchangir@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 ...-1225716-brick-online-validation-remove-brick.t |  6 ++++--
 xlators/mgmt/glusterd/src/glusterd-brick-ops.c     | 22 +++++++++++++++++++---
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t b/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t
index eca1c1a..47403b4 100644
--- a/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t
+++ b/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t
@@ -12,7 +12,8 @@ TEST $CLI volume create $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 $H0:$B0/${V0}2
 TEST $CLI volume start $V0
 
 #kill a brick process
-kill -15 `cat $GLUSTERD_WORKDIR/vols/$V0/run/$H0-d-backends-${V0}1.pid`;
+kill_brick $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1
 
 #remove-brick start should fail as the brick is down
 TEST ! $CLI volume remove-brick $V0 $H0:$B0/${V0}1 start
@@ -26,7 +27,8 @@ TEST $CLI volume remove-brick $V0 $H0:$B0/${V0}1 start
 EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" remove_brick_status_completed_field "$V0 $H0:$B0/${V0}1"
 
 #kill a brick process
-kill -15 `cat $GLUSTERD_WORKDIR/vols/$V0/run/$H0-d-backends-${V0}1.pid`;
+kill_brick $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1
 
 #remove-brick commit should pass even if the brick is down
 TEST $CLI volume remove-brick $V0 $H0:$B0/${V0}1 commit
diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
index b22a7da..e12d314 100644
--- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
@@ -1947,6 +1947,8 @@ glusterd_remove_brick_validate_bricks (gf1_op_commands cmd, int32_t brick_count,
         glusterd_peerinfo_t    *peerinfo    = NULL;
         int                     i           = 0;
         int                     ret         = -1;
+        char                    pidfile[PATH_MAX+1] = {0,};
+        glusterd_conf_t        *priv        = THIS->private;
 
         /* Check whether all the nodes of the bricks to be removed are
         * up, if not fail the operation */
@@ -1996,15 +1998,29 @@ glusterd_remove_brick_validate_bricks (gf1_op_commands cmd, int32_t brick_count,
                 }
 
                 if (glusterd_is_local_brick (THIS, volinfo, brickinfo)) {
-                        if (((cmd == GF_OP_CMD_START) ||
-                            (cmd == GF_OP_CMD_DETACH_START))  &&
-                            brickinfo->status != GF_BRICK_STARTED) {
+                        switch (cmd) {
+                        case GF_OP_CMD_START:
+                        case GF_OP_CMD_DETACH_START:
+                                break;
+                        default:
+                                continue;
+                        }
+                        if (brickinfo->status != GF_BRICK_STARTED) {
                                 snprintf (msg, sizeof (msg), "Found stopped "
                                           "brick %s", brick);
                                 *errstr = gf_strdup (msg);
                                 ret = -1;
                                 goto out;
                         }
+                        GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo,
+                                                    brickinfo, priv);
+                        if (!gf_is_service_running (pidfile, NULL)) {
+                                snprintf (msg, sizeof (msg), "Found dead "
+                                          "brick %s", brick);
+                                *errstr = gf_strdup (msg);
+                                ret = -1;
+                                goto out;
+                        }
                         continue;
                 }
 
-- 
1.8.3.1