a3470f
From 938ee38c02cce2a743c672f9c03798ebcbb1e348 Mon Sep 17 00:00:00 2001
a3470f
From: Atin Mukherjee <amukherj@redhat.com>
a3470f
Date: Thu, 26 Oct 2017 14:26:30 +0530
a3470f
Subject: [PATCH 44/74] glusterd: fix brick restart parallelism
a3470f
a3470f
glusterd's brick restart logic is not always sequential as there is
a3470f
atleast three different ways how the bricks are restarted.
a3470f
1. through friend-sm and glusterd_spawn_daemons ()
a3470f
2. through friend-sm and handling volume quorum action
a3470f
3. through friend handshaking when there is a mimatch on quorum on
a3470f
friend import.
a3470f
a3470f
In a brick multiplexing setup, glusterd ended up trying to spawn the
a3470f
same brick process couple of times as almost in fraction of milliseconds
a3470f
two threads hit glusterd_brick_start () because of which glusterd didn't
a3470f
have any choice of rejecting any one of them as for both the case brick
a3470f
start criteria met.
a3470f
a3470f
As a solution, it'd be better to control this madness by two different
a3470f
flags, one is a boolean called start_triggered which indicates a brick
a3470f
start has been triggered and it continues to be true till a brick dies
a3470f
or killed, the second is a mutex lock to ensure for a particular brick
a3470f
we don't end up getting into glusterd_brick_start () more than once at
a3470f
same point of time.
a3470f
a3470f
>mainline patch : https://review.gluster.org/#/c/18577
a3470f
a3470f
Change-Id: I292f1e58d6971e111725e1baea1fe98b890b43e2
a3470f
BUG: 1505363
a3470f
Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
a3470f
Reviewed-on: https://code.engineering.redhat.com/gerrit/123056
a3470f
Reviewed-by: Gaurav Yadav <gyadav@redhat.com>
a3470f
---
a3470f
 xlators/mgmt/glusterd/src/glusterd-handler.c       | 24 ++++++++-----
a3470f
 xlators/mgmt/glusterd/src/glusterd-op-sm.c         | 31 ++++++++++-------
a3470f
 xlators/mgmt/glusterd/src/glusterd-server-quorum.c | 15 +++++++--
a3470f
 xlators/mgmt/glusterd/src/glusterd-utils.c         | 39 +++++++++++++++++-----
a3470f
 xlators/mgmt/glusterd/src/glusterd-volume-ops.c    |  8 +++++
a3470f
 xlators/mgmt/glusterd/src/glusterd.h               |  2 ++
a3470f
 6 files changed, 87 insertions(+), 32 deletions(-)
a3470f
a3470f
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
a3470f
index 34e751c..c3b9252 100644
a3470f
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
a3470f
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
a3470f
@@ -5946,16 +5946,22 @@ glusterd_mark_bricks_stopped_by_proc (glusterd_brick_proc_t *brick_proc) {
a3470f
         int                       ret              =  -1;
a3470f
 
a3470f
         cds_list_for_each_entry (brickinfo, &brick_proc->bricks, brick_list) {
a3470f
-                ret =  glusterd_get_volinfo_from_brick (brickinfo->path, &volinfo);
a3470f
+                ret =  glusterd_get_volinfo_from_brick (brickinfo->path,
a3470f
+                                                        &volinfo);
a3470f
                 if (ret) {
a3470f
-                        gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
a3470f
-                                "Failed to get volinfo from brick(%s)",
a3470f
-                                brickinfo->path);
a3470f
+                        gf_msg (THIS->name, GF_LOG_ERROR, 0,
a3470f
+                                GD_MSG_VOLINFO_GET_FAIL, "Failed to get volinfo"
a3470f
+                                " from brick(%s)", brickinfo->path);
a3470f
                         goto out;
a3470f
                 }
a3470f
-                cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks, brick_list) {
a3470f
-                        if (strcmp (brickinfo->path, brickinfo_tmp->path) == 0)
a3470f
-                                glusterd_set_brick_status (brickinfo_tmp, GF_BRICK_STOPPED);
a3470f
+                cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks,
a3470f
+                                         brick_list) {
a3470f
+                        if (strcmp (brickinfo->path,
a3470f
+                                    brickinfo_tmp->path) == 0) {
a3470f
+                                glusterd_set_brick_status (brickinfo_tmp,
a3470f
+                                                           GF_BRICK_STOPPED);
a3470f
+                                brickinfo_tmp->start_triggered = _gf_false;
a3470f
+                        }
a3470f
                 }
a3470f
         }
a3470f
         return 0;
a3470f
@@ -6129,8 +6135,10 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
a3470f
                                 if (temp == 1)
a3470f
                                         break;
a3470f
                         }
a3470f
-                } else
a3470f
+                } else {
a3470f
                         glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
a3470f
+                        brickinfo->start_triggered = _gf_false;
a3470f
+                }
a3470f
                 break;
a3470f
 
a3470f
         case RPC_CLNT_DESTROY:
a3470f
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
a3470f
index 9641b4f..5b8f833 100644
a3470f
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
a3470f
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
a3470f
@@ -2402,18 +2402,25 @@ glusterd_start_bricks (glusterd_volinfo_t *volinfo)
a3470f
         GF_ASSERT (volinfo);
a3470f
 
a3470f
         cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
a3470f
-                ret = glusterd_brick_start (volinfo, brickinfo, _gf_false);
a3470f
-                if (ret) {
a3470f
-                        gf_msg (THIS->name, GF_LOG_ERROR, 0,
a3470f
-                                GD_MSG_BRICK_DISCONNECTED,
a3470f
-                                "Failed to start %s:%s for %s",
a3470f
-                                brickinfo->hostname, brickinfo->path,
a3470f
-                                volinfo->volname);
a3470f
-                        gf_event (EVENT_BRICK_START_FAILED,
a3470f
-                                  "peer=%s;volume=%s;brick=%s",
a3470f
-                                  brickinfo->hostname, volinfo->volname,
a3470f
-                                  brickinfo->path);
a3470f
-                        goto out;
a3470f
+                if (!brickinfo->start_triggered) {
a3470f
+                        pthread_mutex_lock (&brickinfo->restart_mutex);
a3470f
+                        {
a3470f
+                                ret = glusterd_brick_start (volinfo, brickinfo,
a3470f
+                                                            _gf_false);
a3470f
+                        }
a3470f
+                        pthread_mutex_unlock (&brickinfo->restart_mutex);
a3470f
+                        if (ret) {
a3470f
+                                gf_msg (THIS->name, GF_LOG_ERROR, 0,
a3470f
+                                        GD_MSG_BRICK_DISCONNECTED,
a3470f
+                                        "Failed to start %s:%s for %s",
a3470f
+                                        brickinfo->hostname, brickinfo->path,
a3470f
+                                        volinfo->volname);
a3470f
+                                gf_event (EVENT_BRICK_START_FAILED,
a3470f
+                                          "peer=%s;volume=%s;brick=%s",
a3470f
+                                          brickinfo->hostname, volinfo->volname,
a3470f
+                                          brickinfo->path);
a3470f
+                                goto out;
a3470f
+                        }
a3470f
                 }
a3470f
 
a3470f
         }
a3470f
diff --git a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
a3470f
index 4706403..995a568 100644
a3470f
--- a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
a3470f
+++ b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
a3470f
@@ -362,10 +362,19 @@ glusterd_do_volume_quorum_action (xlator_t *this, glusterd_volinfo_t *volinfo,
a3470f
         list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
a3470f
                 if (!glusterd_is_local_brick (this, volinfo, brickinfo))
a3470f
                         continue;
a3470f
-                if (quorum_status == DOESNT_MEET_QUORUM)
a3470f
+                if (quorum_status == DOESNT_MEET_QUORUM) {
a3470f
                         glusterd_brick_stop (volinfo, brickinfo, _gf_false);
a3470f
-                else
a3470f
-                        glusterd_brick_start (volinfo, brickinfo, _gf_false);
a3470f
+                } else {
a3470f
+                        if (!brickinfo->start_triggered) {
a3470f
+                                pthread_mutex_lock (&brickinfo->restart_mutex);
a3470f
+                                {
a3470f
+                                        glusterd_brick_start (volinfo,
a3470f
+                                                              brickinfo,
a3470f
+                                                              _gf_false);
a3470f
+                                }
a3470f
+                                pthread_mutex_unlock (&brickinfo->restart_mutex);
a3470f
+                        }
a3470f
+                }
a3470f
         }
a3470f
         volinfo->quorum_status = quorum_status;
a3470f
         if (quorum_status == MEETS_QUORUM) {
a3470f
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
a3470f
index bb236df..18de517 100644
a3470f
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
a3470f
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
a3470f
@@ -1084,7 +1084,7 @@ glusterd_brickinfo_new (glusterd_brickinfo_t **brickinfo)
a3470f
                 goto out;
a3470f
 
a3470f
         CDS_INIT_LIST_HEAD (&new_brickinfo->brick_list);
a3470f
-
a3470f
+        pthread_mutex_init (&new_brickinfo->restart_mutex, NULL);
a3470f
         *brickinfo = new_brickinfo;
a3470f
 
a3470f
         ret = 0;
a3470f
@@ -2481,7 +2481,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
a3470f
         (void) sys_unlink (pidfile);
a3470f
 
a3470f
         brickinfo->status = GF_BRICK_STOPPED;
a3470f
-
a3470f
+        brickinfo->start_triggered = _gf_false;
a3470f
         if (del_brick)
a3470f
                 glusterd_delete_brick (volinfo, brickinfo);
a3470f
 out:
a3470f
@@ -5817,13 +5817,14 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
a3470f
          * three different triggers for an attempt to start the brick process
a3470f
          * due to the quorum handling code in glusterd_friend_sm.
a3470f
          */
a3470f
-        if (brickinfo->status == GF_BRICK_STARTING) {
a3470f
+        if (brickinfo->status == GF_BRICK_STARTING ||
a3470f
+            brickinfo->start_triggered) {
a3470f
                 gf_msg_debug (this->name, 0, "brick %s is already in starting "
a3470f
                               "phase", brickinfo->path);
a3470f
                 ret = 0;
a3470f
                 goto out;
a3470f
         }
a3470f
-
a3470f
+        brickinfo->start_triggered = _gf_true;
a3470f
         GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf);
a3470f
         if (gf_is_service_running (pidfile, &pid)) {
a3470f
                 if (brickinfo->status != GF_BRICK_STARTING &&
a3470f
@@ -5936,6 +5937,9 @@ run:
a3470f
         }
a3470f
 
a3470f
 out:
a3470f
+        if (ret && brickinfo) {
a3470f
+                brickinfo->start_triggered = _gf_false;
a3470f
+        }
a3470f
         gf_msg_debug (this->name, 0, "returning %d ", ret);
a3470f
         return ret;
a3470f
 }
a3470f
@@ -5997,11 +6001,19 @@ glusterd_restart_bricks (glusterd_conf_t *conf)
a3470f
                                 start_svcs = _gf_true;
a3470f
                                 glusterd_svcs_manager (NULL);
a3470f
                         }
a3470f
-
a3470f
                         cds_list_for_each_entry (brickinfo, &volinfo->bricks,
a3470f
                                                  brick_list) {
a3470f
-                                glusterd_brick_start (volinfo, brickinfo,
a3470f
-                                                     _gf_false);
a3470f
+                                if (!brickinfo->start_triggered) {
a3470f
+                                        pthread_mutex_lock
a3470f
+                                                (&brickinfo->restart_mutex);
a3470f
+                                        {
a3470f
+                                                glusterd_brick_start
a3470f
+                                                         (volinfo, brickinfo,
a3470f
+                                                          _gf_false);
a3470f
+                                        }
a3470f
+                                        pthread_mutex_unlock
a3470f
+                                                (&brickinfo->restart_mutex);
a3470f
+                                }
a3470f
                         }
a3470f
                         ret = glusterd_store_volinfo
a3470f
                                 (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
a3470f
@@ -6040,8 +6052,17 @@ glusterd_restart_bricks (glusterd_conf_t *conf)
a3470f
                                 "volume %s", volinfo->volname);
a3470f
                         cds_list_for_each_entry (brickinfo, &volinfo->bricks,
a3470f
                                                  brick_list) {
a3470f
-                                glusterd_brick_start (volinfo, brickinfo,
a3470f
-                                                      _gf_false);
a3470f
+                                if (!brickinfo->start_triggered) {
a3470f
+                                        pthread_mutex_lock
a3470f
+                                                (&brickinfo->restart_mutex);
a3470f
+                                        {
a3470f
+                                                glusterd_brick_start
a3470f
+                                                         (volinfo, brickinfo,
a3470f
+                                                          _gf_false);
a3470f
+                                        }
a3470f
+                                        pthread_mutex_unlock
a3470f
+                                                (&brickinfo->restart_mutex);
a3470f
+                                }
a3470f
                         }
a3470f
                         ret = glusterd_store_volinfo
a3470f
                                 (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
a3470f
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
a3470f
index 834acab..bec5f72 100644
a3470f
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
a3470f
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
a3470f
@@ -2545,6 +2545,14 @@ glusterd_start_volume (glusterd_volinfo_t *volinfo, int flags,
a3470f
         GF_ASSERT (volinfo);
a3470f
 
a3470f
         cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
a3470f
+                /* Mark start_triggered to false so that in case if this brick
a3470f
+                 * was brought down through gf_attach utility, the
a3470f
+                 * brickinfo->start_triggered wouldn't have been updated to
a3470f
+                 * _gf_false
a3470f
+                 */
a3470f
+                if (flags & GF_CLI_FLAG_OP_FORCE) {
a3470f
+                        brickinfo->start_triggered = _gf_false;
a3470f
+                }
a3470f
                 ret = glusterd_brick_start (volinfo, brickinfo, wait);
a3470f
                 /* If 'force' try to start all bricks regardless of success or
a3470f
                  * failure
a3470f
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
a3470f
index 722d2f8..d4bb236 100644
a3470f
--- a/xlators/mgmt/glusterd/src/glusterd.h
a3470f
+++ b/xlators/mgmt/glusterd/src/glusterd.h
a3470f
@@ -240,6 +240,8 @@ struct glusterd_brickinfo {
a3470f
         uint64_t           statfs_fsid;
a3470f
         uint32_t           fs_share_count;
a3470f
         gf_boolean_t       port_registered;
a3470f
+        gf_boolean_t       start_triggered;
a3470f
+        pthread_mutex_t    restart_mutex;
a3470f
 };
a3470f
 
a3470f
 typedef struct glusterd_brickinfo glusterd_brickinfo_t;
a3470f
-- 
a3470f
1.8.3.1
a3470f