From 941241b33f424a6c4b27883482e0c7b101f712c5 Mon Sep 17 00:00:00 2001
From: Jeff Darcy <jdarcy@redhat.com>
Date: Wed, 1 Feb 2017 21:54:30 -0500
Subject: [PATCH 321/361] glusterd: double-check whether brick is alive for
 stats

With multiplexing, our tests detach bricks from their host processes
without glusterd being involved.  Thus, when we ask glusterd to fetch
profile info, it will try to fetch from a brick that is actually no
longer present.  While glusterd can handle the process being dead and
its RPC connection being closed, it fails if it gets a negative
response from a live brick process.  This is not a problem in normal
use, because the brick can't disappear without glusterd seeing it.  The
fix is to double-check that the brick is actually running, by looking
for its pidfile, which the tests *do* clean up as part of killing a
brick.

mainline:
> BUG: 1385758
> Reviewed-on: https://review.gluster.org/16509
> Smoke: Gluster Build System <jenkins@build.gluster.org>
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
> Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
(cherry picked from commit f4b94ddd3034f2ac27890f75ec28aa75b4fc18eb)

BUG: 1417815
Change-Id: I098465b175ecf23538bd7207357c752a2bba8f4e
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/101302
Tested-by: Milind Changire <mchangir@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 xlators/mgmt/glusterd/src/glusterd-op-sm.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
index a3a0462..ef31cdb 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
@@ -6247,15 +6247,14 @@ glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr,
         glusterd_brickinfo_t                    *brickinfo = NULL;
         glusterd_pending_node_t                 *pending_node = NULL;
         char                                    *brick = NULL;
-
-
+        int32_t                                 pid = -1;
+        char                                    pidfile[PATH_MAX] = {0};
 
         this = THIS;
         GF_ASSERT (this);
         priv = this->private;
         GF_ASSERT (priv);
 
-
         ret = dict_get_str (dict, "volname", &volname);
         if (ret) {
                 gf_msg ("glusterd", GF_LOG_ERROR, 0,
@@ -6386,6 +6385,18 @@ glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr,
                 cds_list_for_each_entry (brickinfo, &volinfo->bricks,
                                          brick_list) {
                         if (glusterd_is_brick_started (brickinfo)) {
+                                /*
+                                 * In normal use, glusterd_is_brick_started
+                                 * will give us the answer we need.  However,
+                                 * in our tests the brick gets detached behind
+                                 * our back, so we need to double-check this
+                                 * way.
+                                 */
+                                GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo,
+                                                            brickinfo, priv);
+                                if (!gf_is_service_running (pidfile, &pid)) {
+                                        continue;
+                                }
                                 pending_node = GF_CALLOC (1, sizeof (*pending_node),
                                                           gf_gld_mt_pending_node_t);
                                 if (!pending_node) {
-- 
1.8.3.1