From ed98a2c88a54dd14790632f97abf61403557ab7a Mon Sep 17 00:00:00 2001
From: Jeff Darcy <jdarcy@redhat.com>
Date: Mon, 20 Mar 2017 12:32:33 -0400
Subject: [PATCH 366/369] glusterd: hold off volume deletes while still
 restarting bricks

We need to do this because modifying the volume/brick tree while
glusterd_restart_bricks is still walking it can lead to segfaults.
Without waiting, a volume delete could accidentally "slip in" while
attach_brick has released big_lock between retries and make such a
modification.
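
The mechanism is a "blockers" refcount protected by big_lock: the
restart/attach paths bump it, and tree-mutating commit paths spin until
it drains.  As a rough standalone model (plain pthreads instead of
glusterd's synclock; the names here are illustrative, not glusterd API):

    #include <pthread.h>
    #include <unistd.h>

    static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned int    blockers = 0;  /* in-flight restarts/attaches */

    /* restart path: call these under big_lock around the brick walk */
    static void begin_blocker (void) { ++blockers; }
    static void end_blocker   (void) { --blockers; }

    /* delete/stop/add-brick path: called with big_lock held; drain the
     * walkers before mutating the volume/brick tree, dropping big_lock
     * each time around so they can finish */
    static void wait_for_blockers (void)
    {
            while (blockers) {
                    pthread_mutex_unlock (&big_lock);
                    sleep (1);
                    pthread_mutex_lock (&big_lock);
            }
    }
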
>Reviewed-on: https://review.gluster.org/16927
>Smoke: Gluster Build System <jenkins@build.gluster.org>
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
>Reviewed-by: Atin Mukherjee <amukherj@redhat.com>

Change-Id: I30ccc4efa8d286aae847250f5d4fb28956a74b03
BUG: 1438052
Signed-off-by: Jeff Darcy <jeff@pl.atyp.us>
Reviewed-on: https://code.engineering.redhat.com/gerrit/102298
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
Tested-by: Atin Mukherjee <amukherj@redhat.com>
---
 ...ports.t => bug-1421590-brick-mux-reuse-ports.t} |  5 ++
 tests/bugs/core/bug-1432542-mpx-restart-crash.t    | 91 ++++++++++++++++++++++
 xlators/mgmt/glusterd/src/glusterd-op-sm.c         | 15 ++++
 xlators/mgmt/glusterd/src/glusterd-utils.c         | 39 +++++++---
 xlators/mgmt/glusterd/src/glusterd-volume-ops.c    |  3 -
 xlators/mgmt/glusterd/src/glusterd.c               |  2 +-
 xlators/mgmt/glusterd/src/glusterd.h               |  1 +
 7 files changed, 141 insertions(+), 15 deletions(-)
 rename tests/bugs/core/{bug-1421590-brick-mux-resuse-ports.t => bug-1421590-brick-mux-reuse-ports.t} (86%)
 create mode 100644 tests/bugs/core/bug-1432542-mpx-restart-crash.t
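
Note on the attach_brick retry loop below: while big_lock is dropped
between retries, other_brick->rpc can be torn down under us, so each
attempt now pins the rpc object with its own reference first.  A hedged
sketch of that pattern (toy refcount type, not the real rpc_clnt API):

    typedef struct { int refs; } rpc_t;

    static rpc_t *rpc_ref (rpc_t *rpc)
    {
            if (rpc)            /* real rpc_clnt_ref() yields NULL if gone */
                    ++rpc->refs;
            return rpc;
    }

    static void rpc_unref (rpc_t *rpc)
    {
            if (rpc)
                    --rpc->refs;  /* real code frees at zero */
    }

    static int try_attach (rpc_t *shared_rpc)
    {
            rpc_t *rpc = rpc_ref (shared_rpc);  /* pin before use */
            if (!rpc)
                    return -1;    /* peer went away; caller retries */
            /* ... send the attach request over rpc ... */
            rpc_unref (rpc);      /* drop our pin */
            return 0;
    }
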
diff --git a/tests/bugs/core/bug-1421590-brick-mux-resuse-ports.t b/tests/bugs/core/bug-1421590-brick-mux-reuse-ports.t
similarity index 86%
rename from tests/bugs/core/bug-1421590-brick-mux-resuse-ports.t
rename to tests/bugs/core/bug-1421590-brick-mux-reuse-ports.t
index ed401f6..a227f82 100644
--- a/tests/bugs/core/bug-1421590-brick-mux-resuse-ports.t
+++ b/tests/bugs/core/bug-1421590-brick-mux-reuse-ports.t
@@ -21,6 +21,11 @@ push_trapfunc "cleanup"
 TEST $CLI volume create $V0 $H0:$B0/brick{0,1}
 TEST $CLI volume start $V0
 
+# We can't expect a valid port number instantly.  We need to wait for the
+# bricks to finish coming up.  In every other case we use EXPECT_WITHIN, but
+# this first time we need to wait more explicitly.
+sleep $PROCESS_UP_TIMEOUT
+
 port_brick0=$(get_nth_brick_port_for_volume $V0 1)
 
 # restart the volume
diff --git a/tests/bugs/core/bug-1432542-mpx-restart-crash.t b/tests/bugs/core/bug-1432542-mpx-restart-crash.t
new file mode 100644
index 0000000..970a181
--- /dev/null
+++ b/tests/bugs/core/bug-1432542-mpx-restart-crash.t
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../traps.rc
+
+NUM_VOLS=20
+MOUNT_BASE=$(dirname $M0)
+
+# GlusterD reports that bricks are started when in fact their attach requests
+# might still need to be retried.  That's a bit of a hack, but there's no
+# feasible way to wait at that point (in attach_brick) and the rest of the
+# code is unprepared to deal with transient errors so the whole "brick start"
+# would fail.  Meanwhile, glusterfsd can only handle attach requests at a
+# rather slow rate.  After GlusterD tries to start a couple of hundred bricks,
+# glusterfsd can fall behind and we start getting mount failures.  Arguably,
+# those are spurious because we will eventually catch up.  We're just not
+# ready *yet*.  More to the point, even if the errors aren't spurious that's
+# not what we're testing right now.  Therefore, we give glusterfsd a bit more
+# breathing room for this test than we would otherwise.
+MOUNT_TIMEOUT=15
+
+get_brick_base () {
+	printf "%s/vol%02d" $B0 $1
+}
+
+get_mount_point () {
+	printf "%s/vol%02d" $MOUNT_BASE $1
+}
+
+create_volume () {
+
+	local vol_name=$(printf "%s-vol%02d" $V0 $1)
+
+	local brick_base=$(get_brick_base $1)
+	local cmd="$CLI volume create $vol_name replica 2"
+	local b
+	for b in $(seq 0 5); do
+		local this_brick=${brick_base}/brick$b
+		mkdir -p $this_brick
+		cmd="$cmd $H0:$this_brick"
+	done
+	TEST $cmd
+	TEST $CLI volume start $vol_name
+	EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $vol_name "Status"
+	local mount_point=$(get_mount_point $1)
+	mkdir -p $mount_point
+	TEST $GFS -s $H0 --volfile-id=$vol_name $mount_point
+}
+
+cleanup_func () {
+	local v
+	for v in $(seq 1 $NUM_VOLS); do
+		local mount_point=$(get_mount_point $v)
+		force_umount $mount_point
+		rm -rf $mount_point
+		local vol_name=$(printf "%s-vol%02d" $V0 $v)
+		$CLI volume stop $vol_name
+		$CLI volume delete $vol_name
+		rm -rf $(get_brick_base $v) &
+	done &> /dev/null
+	wait
+}
+push_trapfunc cleanup_func
+
+TEST glusterd
+TEST $CLI volume set all cluster.brick-multiplex on
+
+# Our infrastructure can't handle an arithmetic expression here.  The formula
+# is (NUM_VOLS-1)*5 because it sees each TEST/EXPECT once but needs the other
+# NUM_VOLS-1 and there are 5 such statements in each iteration.
+TESTS_EXPECTED_IN_LOOP=95
+for i in $(seq 1 $NUM_VOLS); do
+	create_volume $i
+	TEST dd if=/dev/zero of=$(get_mount_point $i)/a_file bs=4k count=1
+done
+
+# Kill glusterd, and wait a bit for all traces to disappear.
+TEST killall -9 glusterd
+sleep 5
+TEST killall -9 glusterfsd
+sleep 5
+
+# Restart glusterd.  This is where the brick daemon supposedly dumps core,
+# though I (jdarcy) have yet to see that.  Again, give it a while to settle,
+# just to be sure.
+TEST glusterd
+
+cleanup_func
+trap - EXIT
+cleanup
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
index 98ae7b6..dae6c1e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
@@ -5935,6 +5935,15 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr,
         return ret;
 }
 
+static void
+glusterd_wait_for_blockers (glusterd_conf_t *priv)
+{
+        while (priv->blockers) {
+                synclock_unlock (&priv->big_lock);
+                sleep (1);
+                synclock_lock (&priv->big_lock);
+        }
+}
 
 int32_t
 glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr,
@@ -5954,18 +5963,22 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr,
                         break;
 
                 case GD_OP_STOP_VOLUME:
+                        glusterd_wait_for_blockers (this->private);
                         ret = glusterd_op_stop_volume (dict);
                         break;
 
                 case GD_OP_DELETE_VOLUME:
+                        glusterd_wait_for_blockers (this->private);
                         ret = glusterd_op_delete_volume (dict);
                         break;
 
                 case GD_OP_ADD_BRICK:
+                        glusterd_wait_for_blockers (this->private);
                         ret = glusterd_op_add_brick (dict, op_errstr);
                         break;
 
                 case GD_OP_REPLACE_BRICK:
+                        glusterd_wait_for_blockers (this->private);
                         ret = glusterd_op_replace_brick (dict, rsp_dict);
                         break;
 
@@ -5976,11 +5989,13 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr,
                         ret = glusterd_op_set_ganesha (dict, op_errstr);
                         break;
 
+
                 case GD_OP_RESET_VOLUME:
                         ret = glusterd_op_reset_volume (dict, op_errstr);
                         break;
 
                 case GD_OP_REMOVE_BRICK:
+                        glusterd_wait_for_blockers (this->private);
                         ret = glusterd_op_remove_brick (dict, op_errstr);
                         break;
 
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 77f97a5..2adfb47 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -3123,8 +3123,8 @@ out:
 int
 glusterd_spawn_daemons (void *opaque)
 {
-        glusterd_conf_t *conf = THIS->private;
-        int             ret             = -1;
+        glusterd_conf_t *conf   = THIS->private;
+        int             ret     = -1;
 
         synclock_lock (&conf->big_lock);
         glusterd_restart_bricks (conf);
@@ -4891,9 +4891,13 @@ static int32_t
 my_callback (struct rpc_req *req, struct iovec *iov, int count, void *v_frame)
 {
         call_frame_t    *frame  = v_frame;
+        glusterd_conf_t *conf   = frame->this->private;
 
-        STACK_DESTROY (frame->root);
+        synclock_lock (&conf->big_lock);
+        --(conf->blockers);
+        synclock_unlock (&conf->big_lock);
 
+        STACK_DESTROY (frame->root);
         return 0;
 }
 
@@ -4910,6 +4914,7 @@ send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, int op)
         void                            *req = &brick_req;
         void                            *errlbl   = &&err;
         struct rpc_clnt_connection      *conn;
+        glusterd_conf_t                 *conf     = this->private;
         extern struct rpc_clnt_program  gd_brick_prog;
 
         if (!rpc) {
@@ -4969,9 +4974,13 @@ send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, int op)
         iov.iov_len = ret;
 
         /* Send the msg */
+        ++(conf->blockers);
         ret = rpc_clnt_submit (rpc, &gd_brick_prog, op,
-                               my_callback, &iov, 1, NULL, 0, iobref, frame,
-                               NULL, 0, NULL, 0, NULL);
+                               my_callback, &iov, 1, NULL, 0, iobref,
+                               frame, NULL, 0, NULL, 0, NULL);
+        if (ret) {
+                --(conf->blockers);
+        }
         return ret;
 
 free_iobref:
@@ -5003,6 +5012,8 @@ attach_brick (xlator_t *this,
         char            full_id[PATH_MAX]       = {'\0',};
         char            path[PATH_MAX]          = {'\0',};
         int             ret;
+        int             tries;
+        rpc_clnt_t      *rpc;
 
         gf_log (this->name, GF_LOG_INFO,
                 "add brick %s to existing process for %s",
@@ -5039,12 +5050,15 @@ attach_brick (xlator_t *this,
         }
         (void) build_volfile_path (full_id, path, sizeof(path), NULL);
 
-        int tries = 0;
-        while (tries++ <= 15) {
-                ret = send_attach_req (this, other_brick->rpc, path,
-                                       GLUSTERD_BRICK_ATTACH);
-                if (!ret) {
-                        return 0;
+        for (tries = 15; tries > 0; --tries) {
+                rpc = rpc_clnt_ref (other_brick->rpc);
+                if (rpc) {
+                        ret = send_attach_req (this, rpc, path,
+                                               GLUSTERD_BRICK_ATTACH);
+                        rpc_clnt_unref (rpc);
+                        if (!ret) {
+                                return 0;
+                        }
                 }
                 /*
                  * It might not actually be safe to manipulate the lock like
@@ -5410,6 +5424,8 @@ glusterd_restart_bricks (glusterd_conf_t *conf)
         conf = this->private;
         GF_VALIDATE_OR_GOTO (this->name, conf, out);
 
+        ++(conf->blockers);
+
         ret = glusterd_get_quorum_cluster_counts (this, &active_count,
                                                   &quorum_count);
         if (ret)
@@ -5489,6 +5505,7 @@ glusterd_restart_bricks (glusterd_conf_t *conf)
         ret = 0;
 
 out:
+        --(conf->blockers);
         conf->restart_done = _gf_true;
         return ret;
 }
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
index 3941b06..08906ba 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
@@ -2788,14 +2788,11 @@ glusterd_op_delete_volume (dict_t *dict)
 {
         int                                     ret = 0;
         char                                    *volname = NULL;
-        glusterd_conf_t                         *priv = NULL;
         glusterd_volinfo_t                      *volinfo = NULL;
         xlator_t                                *this = NULL;
 
         this = THIS;
         GF_ASSERT (this);
-        priv = this->private;
-        GF_ASSERT (priv);
 
         ret = dict_get_str (dict, "volname", &volname);
         if (ret) {
diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c
index ae81de6..7d66718 100644
--- a/xlators/mgmt/glusterd/src/glusterd.c
+++ b/xlators/mgmt/glusterd/src/glusterd.c
@@ -1869,10 +1869,10 @@ init (xlator_t *this)
         if (ret < 0)
                 goto out;
 
+        conf->blockers = 0;
         /* If there are no 'friends', this would be the best time to
          * spawn process/bricks that may need (re)starting since last
          * time (this) glusterd was up.*/
-
         if (cds_list_empty (&conf->peers)) {
                 glusterd_launch_synctask (glusterd_spawn_daemons, NULL);
         }
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index d80ad20..671ec58 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -184,6 +184,7 @@ typedef struct {
         int                        ping_timeout;
         uint32_t                   generation;
         int32_t                    workers;
+        uint32_t                   blockers;
 } glusterd_conf_t;
 
 
-- 
1.8.3.1