Blob Blame History Raw
From 16b5177ca40cb5f8b37ea5331003ecc7dbb6d992 Mon Sep 17 00:00:00 2001
From: Atin Mukherjee <amukherj@redhat.com>
Date: Tue, 7 Aug 2018 10:25:49 +0530
Subject: [PATCH 342/351] glusterd: more stricter checks of if brick is running
 in multiplex mode

While gf_attach () utility can help in detaching a brick instance from
the brick process which the kill_brick () function in tests/volume.rc
uses it has a caveat which is as follows:
1. It doesn't ensure the respective brick is marked as stopped which
glusterd does from glusterd_brick_stop
2. Sometimes if kill_brick () is executed just after a brick stack is
up, the mgmt_rpc_notify () can take some time before marking
priv->connected to 1 and before it if kill_brick () is executed, brick
will fail to initiate the pmap_signout which would inturn cleans up the
pidfile.

To avoid such possibilities, a more stricter check on if a brick is
running or not in brick multiplexing has been brought in now where it
not only checks for its pid's existance but checks if the respective
process has the brick instance associated with it before checking for
brick's status.

> Change-Id: I98b92df949076663b9686add7aab4ec2f24ad5ab
> Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
> (cherry pick from commit bb9b8f61501cc633e585593de4d9f2fe5494d5ce)
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/20651/)

Change-Id: I47b70927c839ca4828a0499006b5c1f604d3d6a4
BUG: 1612098
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/146874
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
Tested-by: Atin Mukherjee <amukherj@redhat.com>
---
 xlators/mgmt/glusterd/src/glusterd-utils.c | 71 ++++++++++++++++--------------
 1 file changed, 39 insertions(+), 32 deletions(-)

diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 372d5f4..6f7c787 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -6083,6 +6083,7 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
         char                    *brickpath            = NULL;
         glusterd_volinfo_t      *other_vol;
         struct statvfs           brickstat = {0,};
+        gf_boolean_t             is_service_running = _gf_false;
 
         this = THIS;
         GF_ASSERT (this);
@@ -6149,8 +6150,39 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
                         brickinfo->path);
                 goto out;
         }
-
-        if (gf_is_service_running (pidfile, &pid)) {
+        is_service_running = gf_is_service_running (pidfile, &pid);
+        if (is_service_running) {
+                if (is_brick_mx_enabled ()) {
+                        brickpath = search_brick_path_from_proc
+                                                (pid, brickinfo->path);
+                        if (!brickpath) {
+                                gf_log (this->name, GF_LOG_INFO,
+                                        "Either pid %d is not running or brick"
+                                        " path %s is not consumed so cleanup pidfile",
+                                        pid, brickinfo->path);
+                                /* brick isn't running,so unlink stale pidfile
+                                 * if any.
+                                 */
+                                if (sys_access (pidfile , R_OK) == 0) {
+                                        sys_unlink (pidfile);
+                                }
+                                goto run;
+                        }
+                        GF_FREE (brickpath);
+                        ret = glusterd_get_sock_from_brick_pid (pid, socketpath,
+                                                                sizeof(socketpath));
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_INFO,
+                                        "Either pid %d is not running or does "
+                                        "not match with any running brick "
+                                        "processes", pid);
+                                /* Fetch unix socket is failed so unlink pidfile */
+                                if (sys_access (pidfile , R_OK) == 0) {
+                                        sys_unlink (pidfile);
+                                }
+                                goto run;
+                        }
+                }
                 if (brickinfo->status != GF_BRICK_STARTING &&
                     brickinfo->status != GF_BRICK_STARTED) {
                         gf_log (this->name, GF_LOG_INFO,
@@ -6168,36 +6200,11 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
                          * same port (on another brick) and re-use that.
                          * TBD: re-use RPC connection across bricks
                          */
-                        if (is_brick_mx_enabled ()) {
-                                brickpath = search_brick_path_from_proc (pid, brickinfo->path);
-                                if (!brickpath) {
-                                        gf_log (this->name, GF_LOG_INFO,
-                                                "Either pid %d is not running or brick"
-                                                " path %s is not consumed so cleanup pidfile",
-                                                pid, brickinfo->path);
-                                        /* search brick is failed so unlink pidfile */
-                                        if (sys_access (pidfile , R_OK) == 0) {
-                                                sys_unlink (pidfile);
-                                        }
-                                        goto run;
-                                }
-                                GF_FREE (brickpath);
-                                ret = glusterd_get_sock_from_brick_pid (pid, socketpath,
-                                                                        sizeof(socketpath));
-                                if (ret) {
-                                        gf_log (this->name, GF_LOG_INFO,
-                                                "Either pid %d is not running or is not match"
-                                                " with any running brick process ", pid);
-                                        /* Fetch unix socket is failed so unlink pidfile */
-                                        if (sys_access (pidfile , R_OK) == 0) {
-                                                sys_unlink (pidfile);
-                                        }
-                                        goto run;
-                                }
-                        } else {
-                                glusterd_set_brick_socket_filepath (volinfo, brickinfo,
-                                                                    socketpath,
-                                                                    sizeof (socketpath));
+                        if (!is_brick_mx_enabled ()) {
+                                        glusterd_set_brick_socket_filepath
+                                                (volinfo, brickinfo,
+                                                 socketpath,
+                                                 sizeof (socketpath));
                         }
                         gf_log (this->name, GF_LOG_DEBUG,
                                 "Using %s as sockfile for brick %s of volume %s ",
-- 
1.8.3.1