d1681e
From 975e18d864b0b5c9158abae8752271e4a7fe6299 Mon Sep 17 00:00:00 2001
d1681e
From: Atin Mukherjee <amukherj@redhat.com>
d1681e
Date: Tue, 27 Mar 2018 16:53:33 +0530
d1681e
Subject: [PATCH 213/236] glusterd: mark port_registered to true for all
d1681e
 running bricks with brick mux
d1681e
d1681e
glusterd maintains a boolean flag 'port_registered' which is used to determine
d1681e
if a brick has completed its portmap sign in process. This flag is (re)set in
d1681e
pmap_sigin and pmap_signout events. In case of brick multiplexing this flag is
d1681e
the identifier to determine if the very first brick with which the process is
d1681e
spawned up has completed its sign in process. However in case of glusterd
d1681e
restart when a brick is already identified as running, glusterd does a
d1681e
pmap_registry_bind to ensure its portmap table is updated but this flag isn't set,
d1681e
which is fine in case of non brick multiplex case but causes an issue if
d1681e
the very first brick which came as part of process is replaced and then
d1681e
the subsequent brick attach will fail. One of the ways to validate this
d1681e
is to create and start a volume, remove the first brick and then
d1681e
add-brick a new one. Add-brick operation will take a very long time and
d1681e
post that the volume status will show all other brick status apart from
d1681e
the new brick as down.
d1681e
d1681e
Solution is to set brickinfo->port_registered to true for all the
d1681e
running bricks when brick multiplexing is enabled.
d1681e
d1681e
>upstream mainline patch : https://review.gluster.org/#/c/19800/
d1681e
d1681e
>Change-Id: Ib0662d99d0fa66b1538947fd96b43f1cbc04e4ff
d1681e
>Fixes: bz#1560957
d1681e
>Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
d1681e
d1681e
Change-Id: Ib0662d99d0fa66b1538947fd96b43f1cbc04e4ff
d1681e
BUG: 1560955
d1681e
Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/134827
d1681e
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d1681e
Reviewed-by: Sanju Rakonde <srakonde@redhat.com>
d1681e
---
d1681e
 .../bug-1560955-brick-mux-port-registered-issue.t  | 39 ++++++++++++++++++++++
d1681e
 xlators/mgmt/glusterd/src/glusterd-handler.c       |  2 ++
d1681e
 xlators/mgmt/glusterd/src/glusterd-utils.c         |  1 +
d1681e
 3 files changed, 42 insertions(+)
d1681e
 create mode 100644 tests/bugs/glusterd/bug-1560955-brick-mux-port-registered-issue.t
d1681e
d1681e
diff --git a/tests/bugs/glusterd/bug-1560955-brick-mux-port-registered-issue.t b/tests/bugs/glusterd/bug-1560955-brick-mux-port-registered-issue.t
d1681e
new file mode 100644
d1681e
index 0000000..d1b8f06
d1681e
--- /dev/null
d1681e
+++ b/tests/bugs/glusterd/bug-1560955-brick-mux-port-registered-issue.t
d1681e
@@ -0,0 +1,39 @@
d1681e
+#!/bin/bash
d1681e
+
d1681e
+. $(dirname $0)/../../include.rc
d1681e
+. $(dirname $0)/../../traps.rc
d1681e
+. $(dirname $0)/../../volume.rc
d1681e
+
d1681e
+function count_brick_processes {
d1681e
+        pgrep glusterfsd | wc -l
d1681e
+}
d1681e
+
d1681e
+function count_brick_pids {
d1681e
+        $CLI --xml volume status all | sed -n '/.*<pid>\([^<]*\).*/s//\1/p' \
d1681e
+                                     | grep -v "N/A" | sort | uniq | wc -l
d1681e
+}
d1681e
+
d1681e
+cleanup;
d1681e
+
d1681e
+#bug-1560955 - brick status goes offline after remove-brick followed by add-brick
d1681e
+TEST glusterd
d1681e
+TEST $CLI volume set all cluster.brick-multiplex on
d1681e
+push_trapfunc "$CLI volume set all cluster.brick-multiplex off"
d1681e
+push_trapfunc "cleanup"
d1681e
+
d1681e
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1..3}
d1681e
+TEST $CLI volume start $V0
d1681e
+
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 1 count_brick_processes
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 1 count_brick_pids
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 3 online_brick_count
d1681e
+
d1681e
+
d1681e
+pkill glusterd
d1681e
+TEST glusterd
d1681e
+TEST $CLI volume remove-brick $V0 $H0:$B0/${V0}1 force
d1681e
+TEST $CLI volume add-brick $V0 $H0:$B0/${V0}1_new force
d1681e
+
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 1 count_brick_processes
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 1 count_brick_pids
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 3 online_brick_count
d1681e
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
d1681e
index dbf80a1..cb19321 100644
d1681e
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
d1681e
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
d1681e
@@ -5721,6 +5721,8 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict)
d1681e
                                  count, brickinfo->port);
d1681e
                         fprintf (fp, "Volume%d.Brick%d.rdma_port: %d\n", count_bkp,
d1681e
                                  count, brickinfo->rdma_port);
d1681e
+                        fprintf (fp, "Volume%d.Brick%d.port_registered: %d\n",
d1681e
+                                 count_bkp, count, brickinfo->port_registered);
d1681e
                         fprintf (fp, "Volume%d.Brick%d.status: %s\n", count_bkp,
d1681e
                                  count, brickinfo->status ? "Started" : "Stopped");
d1681e
 
d1681e
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
d1681e
index 49605cc..5e9213c 100644
d1681e
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
d1681e
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
d1681e
@@ -5976,6 +5976,7 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
d1681e
                          * TBD: re-use RPC connection across bricks
d1681e
                          */
d1681e
                         if (is_brick_mx_enabled ()) {
d1681e
+                                brickinfo->port_registered = _gf_true;
d1681e
                                 ret = glusterd_get_sock_from_brick_pid (pid, socketpath,
d1681e
                                                                         sizeof(socketpath));
d1681e
                                 if (ret) {
d1681e
-- 
d1681e
1.8.3.1
d1681e