7c2869
From 778a8e338c40103ff7837e30413ed62d3c1a3c8c Mon Sep 17 00:00:00 2001
7c2869
From: Sanju Rakonde <srakonde@redhat.com>
7c2869
Date: Sat, 7 Oct 2017 03:33:40 +0530
7c2869
Subject: [PATCH 635/642] glusterd:Marking all the brick status as stopped when
7c2869
 a process goes down in brick multiplexing
7c2869
7c2869
In brick multiplexing environment, if a brick process goes down
7c2869
i.e., if we kill it with SIGKILL, the status of the brick for which
7c2869
the process came up for the first time is the only one changing to stopped.
7c2869
All other brick statuses remain started. This is happening because
7c2869
the process was killed abruptly using SIGKILL signal and signal
7c2869
handler wasn't invoked and further cleanup wasn't triggered.
7c2869
7c2869
When we try to start a volume using force, it shows error saying
7c2869
"Request timed out", since all the brickinfo->status are still in
7c2869
started state, we're waiting for one of the brick process to come up
7c2869
which is never going to happen since the brick process was killed.
7c2869
7c2869
To resolve this, in the disconnect event, we are checking all the
7c2869
processes to see whether the brick which got disconnected belongs to the
7c2869
process. Once we get the process we are calling a function named
7c2869
glusterd_mark_bricks_stopped_by_proc() and passing the brick_proc_t object as
7c2869
an argument.
7c2869
7c2869
From the glusterd_brick_proc_t we can get all the bricks attached
7c2869
to that process, but these are duplicated ones. To get the original
7c2869
brickinfo we are reading volinfo from brick. In volinfo we will have
7c2869
original brickinfo copies. We are changing brickinfo->status to
7c2869
stopped for all the bricks.
7c2869
7c2869
>upstream patch : https://review.gluster.org/#/c/18444/
7c2869
7c2869
Change-Id: Ifb9054b3ee081ef56b39b2903ae686984fe827e7
7c2869
BUG: 1526373
7c2869
Signed-off-by: Sanju Rakonde <srakonde@redhat.com>
7c2869
Reviewed-on: https://code.engineering.redhat.com/gerrit/125949
7c2869
Tested-by: RHGS Build Bot <nigelb@redhat.com>
7c2869
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
7c2869
---
7c2869
 .../glusterd/bug-1499509-disconnect-in-brick-mux.t | 27 ++++++++++
7c2869
 xlators/mgmt/glusterd/src/glusterd-handler.c       | 59 +++++++++++++++++++++-
7c2869
 2 files changed, 85 insertions(+), 1 deletion(-)
7c2869
 create mode 100644 tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
7c2869
7c2869
diff --git a/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t b/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
7c2869
new file mode 100644
7c2869
index 0000000..3c5bebe
7c2869
--- /dev/null
7c2869
+++ b/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
7c2869
@@ -0,0 +1,27 @@
7c2869
+#!/bin/bash
7c2869
+
7c2869
+. $(dirname $0)/../../include.rc
7c2869
+. $(dirname $0)/../../volume.rc
7c2869
+. $(dirname $0)/../../cluster.rc
7c2869
+
7c2869
+cleanup
7c2869
+
7c2869
+TEST glusterd
7c2869
+TEST pidof glusterd
7c2869
+
7c2869
+## Enable brick multiplexing
7c2869
+TEST $CLI volume set all cluster.brick-multiplex on
7c2869
+
7c2869
+## creating 1x3 replicated volumes
7c2869
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}_{1..3}
7c2869
+TEST $CLI volume create $V1 replica 3 $H0:$B1/${V1}_{1..3}
7c2869
+
7c2869
+## Start the volume
7c2869
+TEST $CLI volume start $V0
7c2869
+TEST $CLI volume start $V1
7c2869
+
7c2869
+kill -9 $(pgrep glusterfsd)
7c2869
+
7c2869
+EXPECT 0 online_brick_count
7c2869
+
7c2869
+cleanup
7c2869
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
7c2869
index ae8ddde..350ef23 100644
7c2869
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
7c2869
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
7c2869
@@ -5987,6 +5987,31 @@ out:
7c2869
 
7c2869
 static int gd_stale_rpc_disconnect_log;
7c2869
 
7c2869
+static int
7c2869
+glusterd_mark_bricks_stopped_by_proc (glusterd_brick_proc_t *brick_proc) {
7c2869
+        glusterd_brickinfo_t     *brickinfo        =  NULL;
7c2869
+        glusterd_brickinfo_t     *brickinfo_tmp    =  NULL;
7c2869
+        glusterd_volinfo_t       *volinfo          =  NULL;
7c2869
+        int                       ret              =  -1;
7c2869
+
7c2869
+        cds_list_for_each_entry (brickinfo, &brick_proc->bricks, brick_list) {
7c2869
+                ret =  glusterd_get_volinfo_from_brick (brickinfo->path, &volinfo);
7c2869
+                if (ret) {
7c2869
+                        gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
7c2869
+                                "Failed to get volinfo from brick(%s)",
7c2869
+                                brickinfo->path);
7c2869
+                        goto out;
7c2869
+                }
7c2869
+                cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks, brick_list) {
7c2869
+                        if (strcmp (brickinfo->path, brickinfo_tmp->path) == 0)
7c2869
+                                glusterd_set_brick_status (brickinfo_tmp, GF_BRICK_STOPPED);
7c2869
+                }
7c2869
+        }
7c2869
+        return 0;
7c2869
+out:
7c2869
+        return ret;
7c2869
+}
7c2869
+
7c2869
 int
7c2869
 __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
7c2869
                              rpc_clnt_event_t event, void *data)
7c2869
@@ -5997,6 +6022,9 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
7c2869
         glusterd_brickinfo_t    *brickinfo         = NULL;
7c2869
         glusterd_volinfo_t      *volinfo           = NULL;
7c2869
         xlator_t                *this              = NULL;
7c2869
+        int                      temp              = 0;
7c2869
+        glusterd_brickinfo_t    *brickinfo_tmp     = NULL;
7c2869
+        glusterd_brick_proc_t   *brick_proc        = NULL;
7c2869
 
7c2869
         brickid = mydata;
7c2869
         if (!brickid)
7c2869
@@ -6097,7 +6125,36 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
7c2869
                                   brickinfo->path);
7c2869
                 }
7c2869
 
7c2869
-                glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
7c2869
+                if (is_brick_mx_enabled()) {
7c2869
+                        cds_list_for_each_entry (brick_proc, &conf->brick_procs,
7c2869
+                                                 brick_proc_list) {
7c2869
+                                cds_list_for_each_entry (brickinfo_tmp,
7c2869
+                                                         &brick_proc->bricks,
7c2869
+                                                         brick_list) {
7c2869
+                                        if (strcmp (brickinfo_tmp->path,
7c2869
+                                                    brickinfo->path) == 0) {
7c2869
+                                                ret  = glusterd_mark_bricks_stopped_by_proc
7c2869
+                                                       (brick_proc);
7c2869
+                                                if (ret) {
7c2869
+                                                        gf_msg(THIS->name,
7c2869
+                                                               GF_LOG_ERROR, 0,
7c2869
+                                                               GD_MSG_BRICK_STOP_FAIL,
7c2869
+                                                               "Unable to stop "
7c2869
+                                                               "bricks of process"
7c2869
+                                                               " to which brick(%s)"
7c2869
+                                                               " belongs",
7c2869
+                                                               brickinfo->path);
7c2869
+                                                        goto out;
7c2869
+                                                }
7c2869
+                                                temp = 1;
7c2869
+                                                break;
7c2869
+                                        }
7c2869
+                                }
7c2869
+                                if (temp == 1)
7c2869
+                                        break;
7c2869
+                        }
7c2869
+                } else
7c2869
+                        glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
7c2869
                 break;
7c2869
 
7c2869
         case RPC_CLNT_DESTROY:
7c2869
-- 
7c2869
2.9.3
7c2869