|
|
7c2869 |
From 778a8e338c40103ff7837e30413ed62d3c1a3c8c Mon Sep 17 00:00:00 2001
|
|
|
7c2869 |
From: Sanju Rakonde <srakonde@redhat.com>
|
|
|
7c2869 |
Date: Sat, 7 Oct 2017 03:33:40 +0530
|
|
|
7c2869 |
Subject: [PATCH 635/642] glusterd:Marking all the brick status as stopped when
|
|
|
7c2869 |
a process goes down in brick multiplexing
|
|
|
7c2869 |
|
|
|
7c2869 |
In a brick multiplexing environment, if a brick process goes down
|
|
|
7c2869 |
i.e., if we kill it with SIGKILL, the status of the brick for which
|
|
|
7c2869 |
the process came up for the first time is the only one that changes to stopped.
|
|
|
7c2869 |
All other brick statuses remain started. This happens because
|
|
|
7c2869 |
the process was killed abruptly using the SIGKILL signal, so the signal
|
|
|
7c2869 |
handler wasn't invoked and further cleanup wasn't triggered.
|
|
|
7c2869 |
|
|
|
7c2869 |
When we try to start a volume using force, it shows an error saying
|
|
|
7c2869 |
"Request timed out". Since all the brickinfo->status values are still in
|
|
|
7c2869 |
the started state, we're waiting for one of the brick processes to come up
|
|
|
7c2869 |
which is never going to happen since the brick process was killed.
|
|
|
7c2869 |
|
|
|
7c2869 |
To resolve this, in the disconnect event, we are checking all the
|
|
|
7c2869 |
processes to find whether the brick which got disconnected belongs to the
|
|
|
7c2869 |
process. Once we get the process, we call a function named
|
|
|
7c2869 |
glusterd_mark_bricks_stopped_by_proc(), with the glusterd_brick_proc_t object as
|
|
|
7c2869 |
an argument.
|
|
|
7c2869 |
|
|
|
7c2869 |
From the glusterd_brick_proc_t we can get all the bricks attached
|
|
|
7c2869 |
to that process, but these are duplicated ones. To get the original
|
|
|
7c2869 |
brickinfo, we read the volinfo from the brick. The volinfo holds the
|
|
|
7c2869 |
original brickinfo copies. We are changing brickinfo->status to
|
|
|
7c2869 |
stopped for all the bricks.
|
|
|
7c2869 |
|
|
|
7c2869 |
>upstream patch : https://review.gluster.org/#/c/18444/
|
|
|
7c2869 |
|
|
|
7c2869 |
Change-Id: Ifb9054b3ee081ef56b39b2903ae686984fe827e7
|
|
|
7c2869 |
BUG: 1526373
|
|
|
7c2869 |
Signed-off-by: Sanju Rakonde <srakonde@redhat.com>
|
|
|
7c2869 |
Reviewed-on: https://code.engineering.redhat.com/gerrit/125949
|
|
|
7c2869 |
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
|
7c2869 |
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
7c2869 |
---
|
|
|
7c2869 |
.../glusterd/bug-1499509-disconnect-in-brick-mux.t | 27 ++++++++++
|
|
|
7c2869 |
xlators/mgmt/glusterd/src/glusterd-handler.c | 59 +++++++++++++++++++++-
|
|
|
7c2869 |
2 files changed, 85 insertions(+), 1 deletion(-)
|
|
|
7c2869 |
create mode 100644 tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
|
|
|
7c2869 |
|
|
|
7c2869 |
diff --git a/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t b/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
|
|
|
7c2869 |
new file mode 100644
|
|
|
7c2869 |
index 0000000..3c5bebe
|
|
|
7c2869 |
--- /dev/null
|
|
|
7c2869 |
+++ b/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
|
|
|
7c2869 |
@@ -0,0 +1,27 @@
|
|
|
7c2869 |
+#!/bin/bash
|
|
|
7c2869 |
+
|
|
|
7c2869 |
+. $(dirname $0)/../../include.rc
|
|
|
7c2869 |
+. $(dirname $0)/../../volume.rc
|
|
|
7c2869 |
+. $(dirname $0)/../../cluster.rc
|
|
|
7c2869 |
+
|
|
|
7c2869 |
+cleanup
|
|
|
7c2869 |
+
|
|
|
7c2869 |
+TEST glusterd
|
|
|
7c2869 |
+TEST pidof glusterd
|
|
|
7c2869 |
+
|
|
|
7c2869 |
+## Enable brick multiplexing
|
|
|
7c2869 |
+TEST $CLI volume set all cluster.brick-multiplex on
|
|
|
7c2869 |
+
|
|
|
7c2869 |
+## creating 1x3 replicated volumes
|
|
|
7c2869 |
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}_{1..3}
|
|
|
7c2869 |
+TEST $CLI volume create $V1 replica 3 $H0:$B1/${V1}_{1..3}
|
|
|
7c2869 |
+
|
|
|
7c2869 |
+## Start the volume
|
|
|
7c2869 |
+TEST $CLI volume start $V0
|
|
|
7c2869 |
+TEST $CLI volume start $V1
|
|
|
7c2869 |
+
|
|
|
7c2869 |
+kill -9 $(pgrep glusterfsd)
|
|
|
7c2869 |
+
|
|
|
7c2869 |
+EXPECT 0 online_brick_count
|
|
|
7c2869 |
+
|
|
|
7c2869 |
+cleanup
|
|
|
7c2869 |
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
|
|
|
7c2869 |
index ae8ddde..350ef23 100644
|
|
|
7c2869 |
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
|
|
|
7c2869 |
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
|
|
|
7c2869 |
@@ -5987,6 +5987,31 @@ out:
|
|
|
7c2869 |
|
|
|
7c2869 |
static int gd_stale_rpc_disconnect_log;
|
|
|
7c2869 |
|
|
|
7c2869 |
+static int
|
|
|
7c2869 |
+glusterd_mark_bricks_stopped_by_proc (glusterd_brick_proc_t *brick_proc) {
|
|
|
7c2869 |
+ glusterd_brickinfo_t *brickinfo = NULL;
|
|
|
7c2869 |
+ glusterd_brickinfo_t *brickinfo_tmp = NULL;
|
|
|
7c2869 |
+ glusterd_volinfo_t *volinfo = NULL;
|
|
|
7c2869 |
+ int ret = -1;
|
|
|
7c2869 |
+
|
|
|
7c2869 |
+ cds_list_for_each_entry (brickinfo, &brick_proc->bricks, brick_list) {
|
|
|
7c2869 |
+ ret = glusterd_get_volinfo_from_brick (brickinfo->path, &volinfo);
|
|
|
7c2869 |
+ if (ret) {
|
|
|
7c2869 |
+ gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
|
|
|
7c2869 |
+ "Failed to get volinfo from brick(%s)",
|
|
|
7c2869 |
+ brickinfo->path);
|
|
|
7c2869 |
+ goto out;
|
|
|
7c2869 |
+ }
|
|
|
7c2869 |
+ cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks, brick_list) {
|
|
|
7c2869 |
+ if (strcmp (brickinfo->path, brickinfo_tmp->path) == 0)
|
|
|
7c2869 |
+ glusterd_set_brick_status (brickinfo_tmp, GF_BRICK_STOPPED);
|
|
|
7c2869 |
+ }
|
|
|
7c2869 |
+ }
|
|
|
7c2869 |
+ return 0;
|
|
|
7c2869 |
+out:
|
|
|
7c2869 |
+ return ret;
|
|
|
7c2869 |
+}
|
|
|
7c2869 |
+
|
|
|
7c2869 |
int
|
|
|
7c2869 |
__glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
|
|
|
7c2869 |
rpc_clnt_event_t event, void *data)
|
|
|
7c2869 |
@@ -5997,6 +6022,9 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
|
|
|
7c2869 |
glusterd_brickinfo_t *brickinfo = NULL;
|
|
|
7c2869 |
glusterd_volinfo_t *volinfo = NULL;
|
|
|
7c2869 |
xlator_t *this = NULL;
|
|
|
7c2869 |
+ int temp = 0;
|
|
|
7c2869 |
+ glusterd_brickinfo_t *brickinfo_tmp = NULL;
|
|
|
7c2869 |
+ glusterd_brick_proc_t *brick_proc = NULL;
|
|
|
7c2869 |
|
|
|
7c2869 |
brickid = mydata;
|
|
|
7c2869 |
if (!brickid)
|
|
|
7c2869 |
@@ -6097,7 +6125,36 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
|
|
|
7c2869 |
brickinfo->path);
|
|
|
7c2869 |
}
|
|
|
7c2869 |
|
|
|
7c2869 |
- glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
|
|
|
7c2869 |
+ if (is_brick_mx_enabled()) {
|
|
|
7c2869 |
+ cds_list_for_each_entry (brick_proc, &conf->brick_procs,
|
|
|
7c2869 |
+ brick_proc_list) {
|
|
|
7c2869 |
+ cds_list_for_each_entry (brickinfo_tmp,
|
|
|
7c2869 |
+ &brick_proc->bricks,
|
|
|
7c2869 |
+ brick_list) {
|
|
|
7c2869 |
+ if (strcmp (brickinfo_tmp->path,
|
|
|
7c2869 |
+ brickinfo->path) == 0) {
|
|
|
7c2869 |
+ ret = glusterd_mark_bricks_stopped_by_proc
|
|
|
7c2869 |
+ (brick_proc);
|
|
|
7c2869 |
+ if (ret) {
|
|
|
7c2869 |
+ gf_msg(THIS->name,
|
|
|
7c2869 |
+ GF_LOG_ERROR, 0,
|
|
|
7c2869 |
+ GD_MSG_BRICK_STOP_FAIL,
|
|
|
7c2869 |
+ "Unable to stop "
|
|
|
7c2869 |
+ "bricks of process"
|
|
|
7c2869 |
+ " to which brick(%s)"
|
|
|
7c2869 |
+ " belongs",
|
|
|
7c2869 |
+ brickinfo->path);
|
|
|
7c2869 |
+ goto out;
|
|
|
7c2869 |
+ }
|
|
|
7c2869 |
+ temp = 1;
|
|
|
7c2869 |
+ break;
|
|
|
7c2869 |
+ }
|
|
|
7c2869 |
+ }
|
|
|
7c2869 |
+ if (temp == 1)
|
|
|
7c2869 |
+ break;
|
|
|
7c2869 |
+ }
|
|
|
7c2869 |
+ } else
|
|
|
7c2869 |
+ glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
|
|
|
7c2869 |
break;
|
|
|
7c2869 |
|
|
|
7c2869 |
case RPC_CLNT_DESTROY:
|
|
|
7c2869 |
--
|
|
|
7c2869 |
2.9.3
|
|
|
7c2869 |
|