e7a346
From 938ee38c02cce2a743c672f9c03798ebcbb1e348 Mon Sep 17 00:00:00 2001
e7a346
From: Atin Mukherjee <amukherj@redhat.com>
e7a346
Date: Thu, 26 Oct 2017 14:26:30 +0530
e7a346
Subject: [PATCH 44/74] glusterd: fix brick restart parallelism
e7a346
e7a346
glusterd's brick restart logic is not always sequential as there is
e7a346
atleast three different ways how the bricks are restarted.
e7a346
1. through friend-sm and glusterd_spawn_daemons ()
e7a346
2. through friend-sm and handling volume quorum action
e7a346
3. through friend handshaking when there is a mimatch on quorum on
e7a346
friend import.
e7a346
e7a346
In a brick multiplexing setup, glusterd ended up trying to spawn the
e7a346
same brick process couple of times as almost in fraction of milliseconds
e7a346
two threads hit glusterd_brick_start () because of which glusterd didn't
e7a346
have any choice of rejecting any one of them as for both the case brick
e7a346
start criteria met.
e7a346
e7a346
As a solution, it'd be better to control this madness by two different
e7a346
flags, one is a boolean called start_triggered which indicates a brick
e7a346
start has been triggered and it continues to be true till a brick dies
e7a346
or killed, the second is a mutex lock to ensure for a particular brick
e7a346
we don't end up getting into glusterd_brick_start () more than once at
e7a346
same point of time.
e7a346
e7a346
>mainline patch : https://review.gluster.org/#/c/18577
e7a346
e7a346
Change-Id: I292f1e58d6971e111725e1baea1fe98b890b43e2
e7a346
BUG: 1505363
e7a346
Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
e7a346
Reviewed-on: https://code.engineering.redhat.com/gerrit/123056
e7a346
Reviewed-by: Gaurav Yadav <gyadav@redhat.com>
e7a346
---
e7a346
 xlators/mgmt/glusterd/src/glusterd-handler.c       | 24 ++++++++-----
e7a346
 xlators/mgmt/glusterd/src/glusterd-op-sm.c         | 31 ++++++++++-------
e7a346
 xlators/mgmt/glusterd/src/glusterd-server-quorum.c | 15 +++++++--
e7a346
 xlators/mgmt/glusterd/src/glusterd-utils.c         | 39 +++++++++++++++++-----
e7a346
 xlators/mgmt/glusterd/src/glusterd-volume-ops.c    |  8 +++++
e7a346
 xlators/mgmt/glusterd/src/glusterd.h               |  2 ++
e7a346
 6 files changed, 87 insertions(+), 32 deletions(-)
e7a346
e7a346
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
e7a346
index 34e751c..c3b9252 100644
e7a346
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
e7a346
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
e7a346
@@ -5946,16 +5946,22 @@ glusterd_mark_bricks_stopped_by_proc (glusterd_brick_proc_t *brick_proc) {
e7a346
         int                       ret              =  -1;
e7a346
 
e7a346
         cds_list_for_each_entry (brickinfo, &brick_proc->bricks, brick_list) {
e7a346
-                ret =  glusterd_get_volinfo_from_brick (brickinfo->path, &volinfo);
e7a346
+                ret =  glusterd_get_volinfo_from_brick (brickinfo->path,
e7a346
+                                                        &volinfo);
e7a346
                 if (ret) {
e7a346
-                        gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
e7a346
-                                "Failed to get volinfo from brick(%s)",
e7a346
-                                brickinfo->path);
e7a346
+                        gf_msg (THIS->name, GF_LOG_ERROR, 0,
e7a346
+                                GD_MSG_VOLINFO_GET_FAIL, "Failed to get volinfo"
e7a346
+                                " from brick(%s)", brickinfo->path);
e7a346
                         goto out;
e7a346
                 }
e7a346
-                cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks, brick_list) {
e7a346
-                        if (strcmp (brickinfo->path, brickinfo_tmp->path) == 0)
e7a346
-                                glusterd_set_brick_status (brickinfo_tmp, GF_BRICK_STOPPED);
e7a346
+                cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks,
e7a346
+                                         brick_list) {
e7a346
+                        if (strcmp (brickinfo->path,
e7a346
+                                    brickinfo_tmp->path) == 0) {
e7a346
+                                glusterd_set_brick_status (brickinfo_tmp,
e7a346
+                                                           GF_BRICK_STOPPED);
e7a346
+                                brickinfo_tmp->start_triggered = _gf_false;
e7a346
+                        }
e7a346
                 }
e7a346
         }
e7a346
         return 0;
e7a346
@@ -6129,8 +6135,10 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
e7a346
                                 if (temp == 1)
e7a346
                                         break;
e7a346
                         }
e7a346
-                } else
e7a346
+                } else {
e7a346
                         glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
e7a346
+                        brickinfo->start_triggered = _gf_false;
e7a346
+                }
e7a346
                 break;
e7a346
 
e7a346
         case RPC_CLNT_DESTROY:
e7a346
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
e7a346
index 9641b4f..5b8f833 100644
e7a346
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
e7a346
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
e7a346
@@ -2402,18 +2402,25 @@ glusterd_start_bricks (glusterd_volinfo_t *volinfo)
e7a346
         GF_ASSERT (volinfo);
e7a346
 
e7a346
         cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
e7a346
-                ret = glusterd_brick_start (volinfo, brickinfo, _gf_false);
e7a346
-                if (ret) {
e7a346
-                        gf_msg (THIS->name, GF_LOG_ERROR, 0,
e7a346
-                                GD_MSG_BRICK_DISCONNECTED,
e7a346
-                                "Failed to start %s:%s for %s",
e7a346
-                                brickinfo->hostname, brickinfo->path,
e7a346
-                                volinfo->volname);
e7a346
-                        gf_event (EVENT_BRICK_START_FAILED,
e7a346
-                                  "peer=%s;volume=%s;brick=%s",
e7a346
-                                  brickinfo->hostname, volinfo->volname,
e7a346
-                                  brickinfo->path);
e7a346
-                        goto out;
e7a346
+                if (!brickinfo->start_triggered) {
e7a346
+                        pthread_mutex_lock (&brickinfo->restart_mutex);
e7a346
+                        {
e7a346
+                                ret = glusterd_brick_start (volinfo, brickinfo,
e7a346
+                                                            _gf_false);
e7a346
+                        }
e7a346
+                        pthread_mutex_unlock (&brickinfo->restart_mutex);
e7a346
+                        if (ret) {
e7a346
+                                gf_msg (THIS->name, GF_LOG_ERROR, 0,
e7a346
+                                        GD_MSG_BRICK_DISCONNECTED,
e7a346
+                                        "Failed to start %s:%s for %s",
e7a346
+                                        brickinfo->hostname, brickinfo->path,
e7a346
+                                        volinfo->volname);
e7a346
+                                gf_event (EVENT_BRICK_START_FAILED,
e7a346
+                                          "peer=%s;volume=%s;brick=%s",
e7a346
+                                          brickinfo->hostname, volinfo->volname,
e7a346
+                                          brickinfo->path);
e7a346
+                                goto out;
e7a346
+                        }
e7a346
                 }
e7a346
 
e7a346
         }
e7a346
diff --git a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
e7a346
index 4706403..995a568 100644
e7a346
--- a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
e7a346
+++ b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
e7a346
@@ -362,10 +362,19 @@ glusterd_do_volume_quorum_action (xlator_t *this, glusterd_volinfo_t *volinfo,
e7a346
         list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
e7a346
                 if (!glusterd_is_local_brick (this, volinfo, brickinfo))
e7a346
                         continue;
e7a346
-                if (quorum_status == DOESNT_MEET_QUORUM)
e7a346
+                if (quorum_status == DOESNT_MEET_QUORUM) {
e7a346
                         glusterd_brick_stop (volinfo, brickinfo, _gf_false);
e7a346
-                else
e7a346
-                        glusterd_brick_start (volinfo, brickinfo, _gf_false);
e7a346
+                } else {
e7a346
+                        if (!brickinfo->start_triggered) {
e7a346
+                                pthread_mutex_lock (&brickinfo->restart_mutex);
e7a346
+                                {
e7a346
+                                        glusterd_brick_start (volinfo,
e7a346
+                                                              brickinfo,
e7a346
+                                                              _gf_false);
e7a346
+                                }
e7a346
+                                pthread_mutex_unlock (&brickinfo->restart_mutex);
e7a346
+                        }
e7a346
+                }
e7a346
         }
e7a346
         volinfo->quorum_status = quorum_status;
e7a346
         if (quorum_status == MEETS_QUORUM) {
e7a346
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
e7a346
index bb236df..18de517 100644
e7a346
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
e7a346
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
e7a346
@@ -1084,7 +1084,7 @@ glusterd_brickinfo_new (glusterd_brickinfo_t **brickinfo)
e7a346
                 goto out;
e7a346
 
e7a346
         CDS_INIT_LIST_HEAD (&new_brickinfo->brick_list);
e7a346
-
e7a346
+        pthread_mutex_init (&new_brickinfo->restart_mutex, NULL);
e7a346
         *brickinfo = new_brickinfo;
e7a346
 
e7a346
         ret = 0;
e7a346
@@ -2481,7 +2481,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
e7a346
         (void) sys_unlink (pidfile);
e7a346
 
e7a346
         brickinfo->status = GF_BRICK_STOPPED;
e7a346
-
e7a346
+        brickinfo->start_triggered = _gf_false;
e7a346
         if (del_brick)
e7a346
                 glusterd_delete_brick (volinfo, brickinfo);
e7a346
 out:
e7a346
@@ -5817,13 +5817,14 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
e7a346
          * three different triggers for an attempt to start the brick process
e7a346
          * due to the quorum handling code in glusterd_friend_sm.
e7a346
          */
e7a346
-        if (brickinfo->status == GF_BRICK_STARTING) {
e7a346
+        if (brickinfo->status == GF_BRICK_STARTING ||
e7a346
+            brickinfo->start_triggered) {
e7a346
                 gf_msg_debug (this->name, 0, "brick %s is already in starting "
e7a346
                               "phase", brickinfo->path);
e7a346
                 ret = 0;
e7a346
                 goto out;
e7a346
         }
e7a346
-
e7a346
+        brickinfo->start_triggered = _gf_true;
e7a346
         GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf);
e7a346
         if (gf_is_service_running (pidfile, &pid)) {
e7a346
                 if (brickinfo->status != GF_BRICK_STARTING &&
e7a346
@@ -5936,6 +5937,9 @@ run:
e7a346
         }
e7a346
 
e7a346
 out:
e7a346
+        if (ret && brickinfo) {
e7a346
+                brickinfo->start_triggered = _gf_false;
e7a346
+        }
e7a346
         gf_msg_debug (this->name, 0, "returning %d ", ret);
e7a346
         return ret;
e7a346
 }
e7a346
@@ -5997,11 +6001,19 @@ glusterd_restart_bricks (glusterd_conf_t *conf)
e7a346
                                 start_svcs = _gf_true;
e7a346
                                 glusterd_svcs_manager (NULL);
e7a346
                         }
e7a346
-
e7a346
                         cds_list_for_each_entry (brickinfo, &volinfo->bricks,
e7a346
                                                  brick_list) {
e7a346
-                                glusterd_brick_start (volinfo, brickinfo,
e7a346
-                                                     _gf_false);
e7a346
+                                if (!brickinfo->start_triggered) {
e7a346
+                                        pthread_mutex_lock
e7a346
+                                                (&brickinfo->restart_mutex);
e7a346
+                                        {
e7a346
+                                                glusterd_brick_start
e7a346
+                                                         (volinfo, brickinfo,
e7a346
+                                                          _gf_false);
e7a346
+                                        }
e7a346
+                                        pthread_mutex_unlock
e7a346
+                                                (&brickinfo->restart_mutex);
e7a346
+                                }
e7a346
                         }
e7a346
                         ret = glusterd_store_volinfo
e7a346
                                 (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
e7a346
@@ -6040,8 +6052,17 @@ glusterd_restart_bricks (glusterd_conf_t *conf)
e7a346
                                 "volume %s", volinfo->volname);
e7a346
                         cds_list_for_each_entry (brickinfo, &volinfo->bricks,
e7a346
                                                  brick_list) {
e7a346
-                                glusterd_brick_start (volinfo, brickinfo,
e7a346
-                                                      _gf_false);
e7a346
+                                if (!brickinfo->start_triggered) {
e7a346
+                                        pthread_mutex_lock
e7a346
+                                                (&brickinfo->restart_mutex);
e7a346
+                                        {
e7a346
+                                                glusterd_brick_start
e7a346
+                                                         (volinfo, brickinfo,
e7a346
+                                                          _gf_false);
e7a346
+                                        }
e7a346
+                                        pthread_mutex_unlock
e7a346
+                                                (&brickinfo->restart_mutex);
e7a346
+                                }
e7a346
                         }
e7a346
                         ret = glusterd_store_volinfo
e7a346
                                 (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
e7a346
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
e7a346
index 834acab..bec5f72 100644
e7a346
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
e7a346
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
e7a346
@@ -2545,6 +2545,14 @@ glusterd_start_volume (glusterd_volinfo_t *volinfo, int flags,
e7a346
         GF_ASSERT (volinfo);
e7a346
 
e7a346
         cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
e7a346
+                /* Mark start_triggered to false so that in case if this brick
e7a346
+                 * was brought down through gf_attach utility, the
e7a346
+                 * brickinfo->start_triggered wouldn't have been updated to
e7a346
+                 * _gf_false
e7a346
+                 */
e7a346
+                if (flags & GF_CLI_FLAG_OP_FORCE) {
e7a346
+                        brickinfo->start_triggered = _gf_false;
e7a346
+                }
e7a346
                 ret = glusterd_brick_start (volinfo, brickinfo, wait);
e7a346
                 /* If 'force' try to start all bricks regardless of success or
e7a346
                  * failure
e7a346
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
e7a346
index 722d2f8..d4bb236 100644
e7a346
--- a/xlators/mgmt/glusterd/src/glusterd.h
e7a346
+++ b/xlators/mgmt/glusterd/src/glusterd.h
e7a346
@@ -240,6 +240,8 @@ struct glusterd_brickinfo {
e7a346
         uint64_t           statfs_fsid;
e7a346
         uint32_t           fs_share_count;
e7a346
         gf_boolean_t       port_registered;
e7a346
+        gf_boolean_t       start_triggered;
e7a346
+        pthread_mutex_t    restart_mutex;
e7a346
 };
e7a346
 
e7a346
 typedef struct glusterd_brickinfo glusterd_brickinfo_t;
e7a346
-- 
e7a346
1.8.3.1
e7a346