From a30a5fdef2e252eba9f44a3c671de8f3aa4f17d7 Mon Sep 17 00:00:00 2001
From: Vishal Pandey <vpandey@redhat.com>
Date: Tue, 19 Nov 2019 11:39:22 +0530
Subject: [PATCH 392/449] glusterd: Brick process fails to come up with
 brickmux on

Issue:
1- In a cluster of 3 nodes N1, N2, N3, create 3 volumes vol1,
vol2, vol3 with 3 bricks each (one brick from each node).
2- Set cluster.brick-multiplex on.
3- Start all 3 volumes.
4- Check that all bricks on a node are running on the same port.
5- Kill N1.
6- Set performance.readdir-ahead on for volumes vol1, vol2, vol3.
7- Bring N1 up and check the volume status.
8- Brick processes are not running on N1 (a rough CLI sketch of
these steps follows below).
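
A rough CLI sketch of these steps (node names and brick paths are
illustrative only, not taken from this patch; the regression test added
below automates the same flow):

    # from N1, with N2 and N3 already probed into the cluster
    for v in vol1 vol2 vol3; do
        gluster volume create $v N1:/bricks/$v N2:/bricks/$v N3:/bricks/$v force
    done
    gluster volume set all cluster.brick-multiplex on
    for v in vol1 vol2 vol3; do gluster volume start $v; done
    gluster volume status    # with brick-mux, bricks on a node share one pid/port
    # kill glusterd and the brick process on N1, then from N2:
    for v in vol1 vol2 vol3; do gluster volume set $v performance.readdir-ahead on; done
    # restart glusterd on N1 and re-check:
    gluster volume status    # bricks on N1 stay down / report N/A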

Root Cause -
Since there is a difference in the volfile versions on N1 as compared
to N2 and N3, glusterd_import_friend_volume() is called.
glusterd_import_friend_volume() copies the new_volinfo, deletes the
old_volinfo and then calls glusterd_start_bricks().
glusterd_start_bricks() looks for the volfiles and sends an RPC
request to glusterfs_handle_attach(). Because the volinfo has already
been deleted from the priv->volumes list by
glusterd_delete_stale_volume() before glusterd_start_bricks() runs,
while glusterd_create_volfiles_and_notify_services() and
glusterd_list_add_order() are called only after glusterd_start_bricks(),
the attach RPC request gets an empty volfile path, which causes the
brick to crash.

Fix - Call glusterd_list_add_order() and
glusterd_create_volfiles_and_notify_services() before the
glusterd_start_bricks() call is made in glusterd_import_friend_volume().
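
In outline, the reordered sequence in glusterd_import_friend_volume()
after this change looks roughly as follows (a simplified sketch only;
error handling plus the snapd and quota steps are omitted, the exact
change is in the diff below):

    /* before: glusterd_start_bricks() ran while new_volinfo was not yet on
     * priv->volumes and its volfiles were not generated, so the attach
     * request carried an empty volfile path */
    ret = glusterd_store_volinfo(new_volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);

    ret = glusterd_create_volfiles(new_volinfo);       /* volfiles first */
    glusterd_list_add_order(&new_volinfo->vol_list, &priv->volumes,
                            glusterd_compare_volume_name);  /* re-list volinfo */

    if (glusterd_is_volume_started(new_volinfo))
        (void)glusterd_start_bricks(new_volinfo);      /* only then attach bricks */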

> upstream patch link: https://review.gluster.org/#/c/glusterfs/+/23724/
> Change-Id: Idfe0e8710f7eb77ca3ddfa1cabeb45b2987f41aa
> Fixes: bz#1773856
> Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>

BUG: 1683602
Change-Id: Idfe0e8710f7eb77ca3ddfa1cabeb45b2987f41aa
Signed-off-by: Sanju Rakonde <srakonde@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/202255
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Mohit Agrawal <moagrawa@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 .../glusterd/brick-mux-validation-in-cluster.t     | 61 +++++++++++++++++++++-
 xlators/mgmt/glusterd/src/glusterd-utils.c         | 28 +++++-----
 2 files changed, 75 insertions(+), 14 deletions(-)

diff --git a/tests/bugs/glusterd/brick-mux-validation-in-cluster.t b/tests/bugs/glusterd/brick-mux-validation-in-cluster.t
index 4e57038..f088dbb 100644
--- a/tests/bugs/glusterd/brick-mux-validation-in-cluster.t
+++ b/tests/bugs/glusterd/brick-mux-validation-in-cluster.t
@@ -7,6 +7,20 @@ function count_brick_processes {
         pgrep glusterfsd | wc -l
 }
 
+function count_brick_pids {
+        $CLI_1 --xml volume status all | sed -n '/.*<pid>\([^<]*\).*/s//\1/p' \
+                                     | grep -v "N/A" | sort | uniq | wc -l
+}
+
+function count_N/A_brick_pids {
+        $CLI_1 --xml volume status all | sed -n '/.*<pid>\([^<]*\).*/s//\1/p' \
+                                     | grep -- '\-1' | sort | uniq | wc -l
+}
+
+function check_peers {
+        $CLI_2 peer status | grep 'Peer in Cluster (Connected)' | wc -l
+}
+
 cleanup;
 
 TEST launch_cluster 3
@@ -48,4 +62,49 @@ TEST $CLI_1 volume stop $V1
 
 EXPECT 3 count_brick_processes
 
-cleanup
+TEST $CLI_1 volume stop $META_VOL
+
+TEST $CLI_1 volume delete $META_VOL
+TEST $CLI_1 volume delete $V0
+TEST $CLI_1 volume delete $V1
+
+#bug-1773856 - Brick process fails to come up with brickmux on
+
+TEST $CLI_1 volume create $V0 $H1:$B1/${V0}1 $H2:$B2/${V0}1 $H3:$B3/${V0}1 force
+TEST $CLI_1 volume start $V0
+
+
+EXPECT 3 count_brick_processes
+
+#create and start a new volume
+TEST $CLI_1 volume create $V1 $H1:$B1/${V1}2 $H2:$B2/${V1}2 $H3:$B3/${V1}2 force
+TEST $CLI_1 volume start $V1
+
+EXPECT 3 count_brick_processes
+
+V2=patchy2
+TEST $CLI_1 volume create $V2 $H1:$B1/${V2}3 $H2:$B2/${V2}3 $H3:$B3/${V2}3 force
+TEST $CLI_1 volume start $V2
+
+EXPECT 3 count_brick_processes
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 3 count_brick_pids
+
+TEST kill_node 1
+
+sleep 10
+
+EXPECT_WITHIN $PROBE_TIMEOUT 1 check_peers;
+
+$CLI_2 volume set $V0 performance.readdir-ahead on
+$CLI_2 volume set $V1 performance.readdir-ahead on
+
+TEST $glusterd_1;
+
+sleep 10
+
+EXPECT 4 count_brick_processes
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 4 count_brick_pids
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 0  count_N/A_brick_pids
+
+cleanup;
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 6654741..1b78812 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -4988,16 +4988,6 @@ glusterd_import_friend_volume(dict_t *peer_data, int count)
         glusterd_volinfo_unref(old_volinfo);
     }
 
-    if (glusterd_is_volume_started(new_volinfo)) {
-        (void)glusterd_start_bricks(new_volinfo);
-        if (glusterd_is_snapd_enabled(new_volinfo)) {
-            svc = &(new_volinfo->snapd.svc);
-            if (svc->manager(svc, new_volinfo, PROC_START_NO_WAIT)) {
-                gf_event(EVENT_SVC_MANAGER_FAILED, "svc_name=%s", svc->name);
-            }
-        }
-    }
-
     ret = glusterd_store_volinfo(new_volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
     if (ret) {
         gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_STORE_FAIL,
@@ -5007,19 +4997,31 @@ glusterd_import_friend_volume(dict_t *peer_data, int count)
         goto out;
     }
 
-    ret = glusterd_create_volfiles_and_notify_services(new_volinfo);
+    ret = glusterd_create_volfiles(new_volinfo);
     if (ret)
         goto out;
 
+    glusterd_list_add_order(&new_volinfo->vol_list, &priv->volumes,
+                            glusterd_compare_volume_name);
+
+    if (glusterd_is_volume_started(new_volinfo)) {
+        (void)glusterd_start_bricks(new_volinfo);
+        if (glusterd_is_snapd_enabled(new_volinfo)) {
+            svc = &(new_volinfo->snapd.svc);
+            if (svc->manager(svc, new_volinfo, PROC_START_NO_WAIT)) {
+                gf_event(EVENT_SVC_MANAGER_FAILED, "svc_name=%s", svc->name);
+            }
+        }
+    }
+
     ret = glusterd_import_quota_conf(peer_data, count, new_volinfo, "volume");
     if (ret) {
         gf_event(EVENT_IMPORT_QUOTA_CONF_FAILED, "volume=%s",
                  new_volinfo->volname);
         goto out;
     }
-    glusterd_list_add_order(&new_volinfo->vol_list, &priv->volumes,
-                            glusterd_compare_volume_name);
 
+    ret = glusterd_fetchspec_notify(this);
 out:
     gf_msg_debug("glusterd", 0, "Returning with ret: %d", ret);
     return ret;
-- 
1.8.3.1