From 778a8e338c40103ff7837e30413ed62d3c1a3c8c Mon Sep 17 00:00:00 2001
From: Sanju Rakonde <srakonde@redhat.com>
Date: Sat, 7 Oct 2017 03:33:40 +0530
Subject: [PATCH 635/642] glusterd:Marking all the brick status as stopped when
a process goes down in brick multiplexing
In a brick multiplexing environment, if a brick process goes down
abruptly, i.e., if we kill it with SIGKILL, only the brick for which
the process came up for the first time has its status changed to
stopped; all the other brick statuses remain started. This happens
because the process was killed abruptly with the SIGKILL signal, so the
signal handler wasn't invoked and no further cleanup was triggered.
When we then try to start a volume using force, it fails with the error
"Request timed out": since every brickinfo->status is still in started
state, we wait for one of the brick processes to come up, which is
never going to happen because the brick process was killed.
To resolve this, in the disconnect event we check all the brick
processes to find the one that the disconnected brick belongs to.
Once we find that process, we call a function named
glusterd_mark_bricks_stopped_by_proc() and pass the
glusterd_brick_proc_t object as an argument.
From the glusterd_brick_proc_t we can get all the bricks attached
to that process, but these are duplicate copies. To get the original
brickinfo we read the volinfo from the brick; the volinfo holds the
original brickinfo objects. We change brickinfo->status to stopped
for all of those bricks.
>upstream patch : https://review.gluster.org/#/c/18444/
Change-Id: Ifb9054b3ee081ef56b39b2903ae686984fe827e7
BUG: 1526373
Signed-off-by: Sanju Rakonde <srakonde@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/125949
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
.../glusterd/bug-1499509-disconnect-in-brick-mux.t | 27 ++++++++++
xlators/mgmt/glusterd/src/glusterd-handler.c | 59 +++++++++++++++++++++-
2 files changed, 85 insertions(+), 1 deletion(-)
create mode 100644 tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
diff --git a/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t b/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
new file mode 100644
index 0000000..3c5bebe
--- /dev/null
+++ b/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../cluster.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+
+## Enable brick multiplexing
+TEST $CLI volume set all cluster.brick-multiplex on
+
+## Create two 1x3 replicated volumes
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}_{1..3}
+TEST $CLI volume create $V1 replica 3 $H0:$B1/${V1}_{1..3}
+
+## Start both volumes
+TEST $CLI volume start $V0
+TEST $CLI volume start $V1
+
+kill -9 $(pgrep glusterfsd)
+## The SIGKILL above runs no cleanup; glusterd sees only a disconnect, so poll
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 0 online_brick_count
+
+cleanup
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index ae8ddde..350ef23 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -5987,6 +5987,31 @@ out:
static int gd_stale_rpc_disconnect_log;
+/* Set GF_BRICK_STOPPED on the volinfo's brickinfo of every brick attached to brick_proc.
+ * Failed volinfo lookups are logged and skipped; returns 0 or the first such error. */
+static int
+glusterd_mark_bricks_stopped_by_proc (glusterd_brick_proc_t *brick_proc) {
+        glusterd_brickinfo_t *brickinfo = NULL;
+        glusterd_brickinfo_t *brickinfo_tmp = NULL;
+        glusterd_volinfo_t *volinfo = NULL;
+        int ret = 0, err = 0;
+
+        cds_list_for_each_entry (brickinfo, &brick_proc->bricks, brick_list) {
+                err = glusterd_get_volinfo_from_brick (brickinfo->path, &volinfo);
+                if (err) {
+                        gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                                "Failed to get volinfo from brick(%s)", brickinfo->path);
+                        ret = ret ? ret : err; /* remember the first failure only */
+                        continue; /* the other bricks must not be left as started */
+                }
+                cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks, brick_list) {
+                        if (strcmp (brickinfo->path, brickinfo_tmp->path) == 0)
+                                glusterd_set_brick_status (brickinfo_tmp, GF_BRICK_STOPPED);
+                }
+        }
+        return ret;
+}
+
int
__glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
rpc_clnt_event_t event, void *data)
@@ -5997,6 +6022,9 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
glusterd_brickinfo_t *brickinfo = NULL;
glusterd_volinfo_t *volinfo = NULL;
xlator_t *this = NULL;
+ int temp = 0;
+ glusterd_brickinfo_t *brickinfo_tmp = NULL;
+ glusterd_brick_proc_t *brick_proc = NULL;
brickid = mydata;
if (!brickid)
@@ -6097,7 +6125,36 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
brickinfo->path);
}
- glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
+ if (is_brick_mx_enabled()) {
+ cds_list_for_each_entry (brick_proc, &conf->brick_procs,
+ brick_proc_list) {
+ cds_list_for_each_entry (brickinfo_tmp,
+ &brick_proc->bricks,
+ brick_list) {
+ if (strcmp (brickinfo_tmp->path,
+ brickinfo->path) == 0) {
+ ret = glusterd_mark_bricks_stopped_by_proc
+ (brick_proc);
+ if (ret) {
+ gf_msg(THIS->name,
+ GF_LOG_ERROR, 0,
+ GD_MSG_BRICK_STOP_FAIL,
+ "Unable to stop "
+ "bricks of process"
+ " to which brick(%s)"
+ " belongs",
+ brickinfo->path);
+ goto out;
+ }
+ temp = 1;
+ break;
+ }
+ }
+ if (temp == 1)
+ break;
+ }
+ } else
+ glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
break;
case RPC_CLNT_DESTROY:
--
2.9.3