21ab4e
From c1fb83040ecc324c503d93dfd800c5bdc677428c Mon Sep 17 00:00:00 2001
21ab4e
From: Jeff Darcy <jdarcy@redhat.com>
21ab4e
Date: Thu, 2 Feb 2017 13:08:04 -0500
21ab4e
Subject: [PATCH 324/361] glusterd: double-check brick liveness for
21ab4e
 remove-brick validation
21ab4e
21ab4e
Same problem as https://review.gluster.org/#/c/16509/ in a different
21ab4e
place.  Tests detach bricks without glusterd's knowledge, so
21ab4e
glusterd's internal brick state is out of date and we have to re-check
21ab4e
(via the brick's pidfile) as well.
21ab4e
21ab4e
mainline:
21ab4e
> BUG: 1385758
21ab4e
> Reviewed-on: https://review.gluster.org/16529
21ab4e
> Smoke: Gluster Build System <jenkins@build.gluster.org>
21ab4e
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
21ab4e
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
21ab4e
> Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
21ab4e
(cherry picked from commit 13cd11a91ec52af6a7cfcbd7e0c34f1c27904df6)
21ab4e
21ab4e
BUG: 1417815
21ab4e
Change-Id: I169538c1c62d72a685a49d57ef65fb6c3db6eab2
21ab4e
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
21ab4e
Reviewed-on: https://code.engineering.redhat.com/gerrit/101305
21ab4e
Tested-by: Milind Changire <mchangir@redhat.com>
21ab4e
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
21ab4e
---
21ab4e
 ...-1225716-brick-online-validation-remove-brick.t |  6 ++++--
21ab4e
 xlators/mgmt/glusterd/src/glusterd-brick-ops.c     | 22 +++++++++++++++++++---
21ab4e
 2 files changed, 23 insertions(+), 5 deletions(-)
21ab4e
21ab4e
diff --git a/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t b/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t
21ab4e
index eca1c1a..47403b4 100644
21ab4e
--- a/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t
21ab4e
+++ b/tests/bugs/glusterd/bug-1225716-brick-online-validation-remove-brick.t
21ab4e
@@ -12,7 +12,8 @@ TEST $CLI volume create $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 $H0:$B0/${V0}2
21ab4e
 TEST $CLI volume start $V0
21ab4e
 
21ab4e
 #kill a brick process
21ab4e
-kill -15 `cat $GLUSTERD_WORKDIR/vols/$V0/run/$H0-d-backends-${V0}1.pid`;
21ab4e
+kill_brick $V0 $H0 $B0/${V0}1
21ab4e
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1
21ab4e
 
21ab4e
 #remove-brick start should fail as the brick is down
21ab4e
 TEST ! $CLI volume remove-brick $V0 $H0:$B0/${V0}1 start
21ab4e
@@ -26,7 +27,8 @@ TEST $CLI volume remove-brick $V0 $H0:$B0/${V0}1 start
21ab4e
 EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" remove_brick_status_completed_field "$V0 $H0:$B0/${V0}1"
21ab4e
 
21ab4e
 #kill a brick process
21ab4e
-kill -15 `cat $GLUSTERD_WORKDIR/vols/$V0/run/$H0-d-backends-${V0}1.pid`;
21ab4e
+kill_brick $V0 $H0 $B0/${V0}1
21ab4e
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1
21ab4e
 
21ab4e
 #remove-brick commit should pass even if the brick is down
21ab4e
 TEST $CLI volume remove-brick $V0 $H0:$B0/${V0}1 commit
21ab4e
diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
21ab4e
index b22a7da..e12d314 100644
21ab4e
--- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
21ab4e
+++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
21ab4e
@@ -1947,6 +1947,8 @@ glusterd_remove_brick_validate_bricks (gf1_op_commands cmd, int32_t brick_count,
21ab4e
         glusterd_peerinfo_t    *peerinfo    = NULL;
21ab4e
         int                     i           = 0;
21ab4e
         int                     ret         = -1;
21ab4e
+        char                    pidfile[PATH_MAX+1] = {0,};
21ab4e
+        glusterd_conf_t        *priv        = THIS->private;
21ab4e
 
21ab4e
         /* Check whether all the nodes of the bricks to be removed are
21ab4e
         * up, if not fail the operation */
21ab4e
@@ -1996,15 +1998,29 @@ glusterd_remove_brick_validate_bricks (gf1_op_commands cmd, int32_t brick_count,
21ab4e
                 }
21ab4e
 
21ab4e
                 if (glusterd_is_local_brick (THIS, volinfo, brickinfo)) {
21ab4e
-                        if (((cmd == GF_OP_CMD_START) ||
21ab4e
-                            (cmd == GF_OP_CMD_DETACH_START))  &&
21ab4e
-                            brickinfo->status != GF_BRICK_STARTED) {
21ab4e
+                        switch (cmd) {
21ab4e
+                        case GF_OP_CMD_START:
21ab4e
+                        case GF_OP_CMD_DETACH_START:
21ab4e
+                                break;
21ab4e
+                        default:
21ab4e
+                                continue;
21ab4e
+                        }
21ab4e
+                        if (brickinfo->status != GF_BRICK_STARTED) {
21ab4e
                                 snprintf (msg, sizeof (msg), "Found stopped "
21ab4e
                                           "brick %s", brick);
21ab4e
                                 *errstr = gf_strdup (msg);
21ab4e
                                 ret = -1;
21ab4e
                                 goto out;
21ab4e
                         }
21ab4e
+                        GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo,
21ab4e
+                                                    brickinfo, priv);
21ab4e
+                        if (!gf_is_service_running (pidfile, NULL)) {
21ab4e
+                                snprintf (msg, sizeof (msg), "Found dead "
21ab4e
+                                          "brick %s", brick);
21ab4e
+                                *errstr = gf_strdup (msg);
21ab4e
+                                ret = -1;
21ab4e
+                                goto out;
21ab4e
+                        }
21ab4e
                         continue;
21ab4e
                 }
21ab4e
 
21ab4e
-- 
21ab4e
1.8.3.1
21ab4e