d1681e
From 938ee38c02cce2a743c672f9c03798ebcbb1e348 Mon Sep 17 00:00:00 2001
d1681e
From: Atin Mukherjee <amukherj@redhat.com>
d1681e
Date: Thu, 26 Oct 2017 14:26:30 +0530
d1681e
Subject: [PATCH 44/74] glusterd: fix brick restart parallelism
d1681e
d1681e
glusterd's brick restart logic is not always sequential as there is
d1681e
at least three different ways in which the bricks are restarted.
d1681e
1. through friend-sm and glusterd_spawn_daemons ()
d1681e
2. through friend-sm and handling volume quorum action
d1681e
3. through friend handshaking when there is a mismatch on quorum on
d1681e
friend import.
d1681e
d1681e
In a brick multiplexing setup, glusterd ended up trying to spawn the
d1681e
same brick process couple of times as almost in fraction of milliseconds
d1681e
two threads hit glusterd_brick_start () because of which glusterd didn't
d1681e
have any choice of rejecting either of them, as in both cases the brick
d1681e
start criteria met.
d1681e
d1681e
As a solution, it'd be better to control this madness by two different
d1681e
flags, one is a boolean called start_triggered which indicates a brick
d1681e
start has been triggered and it continues to be true till a brick dies
d1681e
or killed, the second is a mutex lock to ensure for a particular brick
d1681e
we don't end up getting into glusterd_brick_start () more than once at
d1681e
same point of time.
d1681e
d1681e
>mainline patch : https://review.gluster.org/#/c/18577
d1681e
d1681e
Change-Id: I292f1e58d6971e111725e1baea1fe98b890b43e2
d1681e
BUG: 1505363
d1681e
Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/123056
d1681e
Reviewed-by: Gaurav Yadav <gyadav@redhat.com>
d1681e
---
d1681e
 xlators/mgmt/glusterd/src/glusterd-handler.c       | 24 ++++++++-----
d1681e
 xlators/mgmt/glusterd/src/glusterd-op-sm.c         | 31 ++++++++++-------
d1681e
 xlators/mgmt/glusterd/src/glusterd-server-quorum.c | 15 +++++++--
d1681e
 xlators/mgmt/glusterd/src/glusterd-utils.c         | 39 +++++++++++++++++-----
d1681e
 xlators/mgmt/glusterd/src/glusterd-volume-ops.c    |  8 +++++
d1681e
 xlators/mgmt/glusterd/src/glusterd.h               |  2 ++
d1681e
 6 files changed, 87 insertions(+), 32 deletions(-)
d1681e
d1681e
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
d1681e
index 34e751c..c3b9252 100644
d1681e
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
d1681e
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
d1681e
@@ -5946,16 +5946,22 @@ glusterd_mark_bricks_stopped_by_proc (glusterd_brick_proc_t *brick_proc) {
d1681e
         int                       ret              =  -1;
d1681e
 
d1681e
         cds_list_for_each_entry (brickinfo, &brick_proc->bricks, brick_list) {
d1681e
-                ret =  glusterd_get_volinfo_from_brick (brickinfo->path, &volinfo);
d1681e
+                ret =  glusterd_get_volinfo_from_brick (brickinfo->path,
d1681e
+                                                        &volinfo);
d1681e
                 if (ret) {
d1681e
-                        gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
d1681e
-                                "Failed to get volinfo from brick(%s)",
d1681e
-                                brickinfo->path);
d1681e
+                        gf_msg (THIS->name, GF_LOG_ERROR, 0,
d1681e
+                                GD_MSG_VOLINFO_GET_FAIL, "Failed to get volinfo"
d1681e
+                                " from brick(%s)", brickinfo->path);
d1681e
                         goto out;
d1681e
                 }
d1681e
-                cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks, brick_list) {
d1681e
-                        if (strcmp (brickinfo->path, brickinfo_tmp->path) == 0)
d1681e
-                                glusterd_set_brick_status (brickinfo_tmp, GF_BRICK_STOPPED);
d1681e
+                cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks,
d1681e
+                                         brick_list) {
d1681e
+                        if (strcmp (brickinfo->path,
d1681e
+                                    brickinfo_tmp->path) == 0) {
d1681e
+                                glusterd_set_brick_status (brickinfo_tmp,
d1681e
+                                                           GF_BRICK_STOPPED);
d1681e
+                                brickinfo_tmp->start_triggered = _gf_false;
d1681e
+                        }
d1681e
                 }
d1681e
         }
d1681e
         return 0;
d1681e
@@ -6129,8 +6135,10 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
d1681e
                                 if (temp == 1)
d1681e
                                         break;
d1681e
                         }
d1681e
-                } else
d1681e
+                } else {
d1681e
                         glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
d1681e
+                        brickinfo->start_triggered = _gf_false;
d1681e
+                }
d1681e
                 break;
d1681e
 
d1681e
         case RPC_CLNT_DESTROY:
d1681e
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
d1681e
index 9641b4f..5b8f833 100644
d1681e
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
d1681e
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
d1681e
@@ -2402,18 +2402,25 @@ glusterd_start_bricks (glusterd_volinfo_t *volinfo)
d1681e
         GF_ASSERT (volinfo);
d1681e
 
d1681e
         cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
d1681e
-                ret = glusterd_brick_start (volinfo, brickinfo, _gf_false);
d1681e
-                if (ret) {
d1681e
-                        gf_msg (THIS->name, GF_LOG_ERROR, 0,
d1681e
-                                GD_MSG_BRICK_DISCONNECTED,
d1681e
-                                "Failed to start %s:%s for %s",
d1681e
-                                brickinfo->hostname, brickinfo->path,
d1681e
-                                volinfo->volname);
d1681e
-                        gf_event (EVENT_BRICK_START_FAILED,
d1681e
-                                  "peer=%s;volume=%s;brick=%s",
d1681e
-                                  brickinfo->hostname, volinfo->volname,
d1681e
-                                  brickinfo->path);
d1681e
-                        goto out;
d1681e
+                if (!brickinfo->start_triggered) {
d1681e
+                        pthread_mutex_lock (&brickinfo->restart_mutex);
d1681e
+                        {
d1681e
+                                ret = glusterd_brick_start (volinfo, brickinfo,
d1681e
+                                                            _gf_false);
d1681e
+                        }
d1681e
+                        pthread_mutex_unlock (&brickinfo->restart_mutex);
d1681e
+                        if (ret) {
d1681e
+                                gf_msg (THIS->name, GF_LOG_ERROR, 0,
d1681e
+                                        GD_MSG_BRICK_DISCONNECTED,
d1681e
+                                        "Failed to start %s:%s for %s",
d1681e
+                                        brickinfo->hostname, brickinfo->path,
d1681e
+                                        volinfo->volname);
d1681e
+                                gf_event (EVENT_BRICK_START_FAILED,
d1681e
+                                          "peer=%s;volume=%s;brick=%s",
d1681e
+                                          brickinfo->hostname, volinfo->volname,
d1681e
+                                          brickinfo->path);
d1681e
+                                goto out;
d1681e
+                        }
d1681e
                 }
d1681e
 
d1681e
         }
d1681e
diff --git a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
d1681e
index 4706403..995a568 100644
d1681e
--- a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
d1681e
+++ b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
d1681e
@@ -362,10 +362,19 @@ glusterd_do_volume_quorum_action (xlator_t *this, glusterd_volinfo_t *volinfo,
d1681e
         list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
d1681e
                 if (!glusterd_is_local_brick (this, volinfo, brickinfo))
d1681e
                         continue;
d1681e
-                if (quorum_status == DOESNT_MEET_QUORUM)
d1681e
+                if (quorum_status == DOESNT_MEET_QUORUM) {
d1681e
                         glusterd_brick_stop (volinfo, brickinfo, _gf_false);
d1681e
-                else
d1681e
-                        glusterd_brick_start (volinfo, brickinfo, _gf_false);
d1681e
+                } else {
d1681e
+                        if (!brickinfo->start_triggered) {
d1681e
+                                pthread_mutex_lock (&brickinfo->restart_mutex);
d1681e
+                                {
d1681e
+                                        glusterd_brick_start (volinfo,
d1681e
+                                                              brickinfo,
d1681e
+                                                              _gf_false);
d1681e
+                                }
d1681e
+                                pthread_mutex_unlock (&brickinfo->restart_mutex);
d1681e
+                        }
d1681e
+                }
d1681e
         }
d1681e
         volinfo->quorum_status = quorum_status;
d1681e
         if (quorum_status == MEETS_QUORUM) {
d1681e
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
d1681e
index bb236df..18de517 100644
d1681e
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
d1681e
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
d1681e
@@ -1084,7 +1084,7 @@ glusterd_brickinfo_new (glusterd_brickinfo_t **brickinfo)
d1681e
                 goto out;
d1681e
 
d1681e
         CDS_INIT_LIST_HEAD (&new_brickinfo->brick_list);
d1681e
-
d1681e
+        pthread_mutex_init (&new_brickinfo->restart_mutex, NULL);
d1681e
         *brickinfo = new_brickinfo;
d1681e
 
d1681e
         ret = 0;
d1681e
@@ -2481,7 +2481,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
d1681e
         (void) sys_unlink (pidfile);
d1681e
 
d1681e
         brickinfo->status = GF_BRICK_STOPPED;
d1681e
-
d1681e
+        brickinfo->start_triggered = _gf_false;
d1681e
         if (del_brick)
d1681e
                 glusterd_delete_brick (volinfo, brickinfo);
d1681e
 out:
d1681e
@@ -5817,13 +5817,14 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
d1681e
          * three different triggers for an attempt to start the brick process
d1681e
          * due to the quorum handling code in glusterd_friend_sm.
d1681e
          */
d1681e
-        if (brickinfo->status == GF_BRICK_STARTING) {
d1681e
+        if (brickinfo->status == GF_BRICK_STARTING ||
d1681e
+            brickinfo->start_triggered) {
d1681e
                 gf_msg_debug (this->name, 0, "brick %s is already in starting "
d1681e
                               "phase", brickinfo->path);
d1681e
                 ret = 0;
d1681e
                 goto out;
d1681e
         }
d1681e
-
d1681e
+        brickinfo->start_triggered = _gf_true;
d1681e
         GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf);
d1681e
         if (gf_is_service_running (pidfile, &pid)) {
d1681e
                 if (brickinfo->status != GF_BRICK_STARTING &&
d1681e
@@ -5936,6 +5937,9 @@ run:
d1681e
         }
d1681e
 
d1681e
 out:
d1681e
+        if (ret && brickinfo) {
d1681e
+                brickinfo->start_triggered = _gf_false;
d1681e
+        }
d1681e
         gf_msg_debug (this->name, 0, "returning %d ", ret);
d1681e
         return ret;
d1681e
 }
d1681e
@@ -5997,11 +6001,19 @@ glusterd_restart_bricks (glusterd_conf_t *conf)
d1681e
                                 start_svcs = _gf_true;
d1681e
                                 glusterd_svcs_manager (NULL);
d1681e
                         }
d1681e
-
d1681e
                         cds_list_for_each_entry (brickinfo, &volinfo->bricks,
d1681e
                                                  brick_list) {
d1681e
-                                glusterd_brick_start (volinfo, brickinfo,
d1681e
-                                                     _gf_false);
d1681e
+                                if (!brickinfo->start_triggered) {
d1681e
+                                        pthread_mutex_lock
d1681e
+                                                (&brickinfo->restart_mutex);
d1681e
+                                        {
d1681e
+                                                glusterd_brick_start
d1681e
+                                                         (volinfo, brickinfo,
d1681e
+                                                          _gf_false);
d1681e
+                                        }
d1681e
+                                        pthread_mutex_unlock
d1681e
+                                                (&brickinfo->restart_mutex);
d1681e
+                                }
d1681e
                         }
d1681e
                         ret = glusterd_store_volinfo
d1681e
                                 (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
d1681e
@@ -6040,8 +6052,17 @@ glusterd_restart_bricks (glusterd_conf_t *conf)
d1681e
                                 "volume %s", volinfo->volname);
d1681e
                         cds_list_for_each_entry (brickinfo, &volinfo->bricks,
d1681e
                                                  brick_list) {
d1681e
-                                glusterd_brick_start (volinfo, brickinfo,
d1681e
-                                                      _gf_false);
d1681e
+                                if (!brickinfo->start_triggered) {
d1681e
+                                        pthread_mutex_lock
d1681e
+                                                (&brickinfo->restart_mutex);
d1681e
+                                        {
d1681e
+                                                glusterd_brick_start
d1681e
+                                                         (volinfo, brickinfo,
d1681e
+                                                          _gf_false);
d1681e
+                                        }
d1681e
+                                        pthread_mutex_unlock
d1681e
+                                                (&brickinfo->restart_mutex);
d1681e
+                                }
d1681e
                         }
d1681e
                         ret = glusterd_store_volinfo
d1681e
                                 (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
d1681e
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
d1681e
index 834acab..bec5f72 100644
d1681e
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
d1681e
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
d1681e
@@ -2545,6 +2545,14 @@ glusterd_start_volume (glusterd_volinfo_t *volinfo, int flags,
d1681e
         GF_ASSERT (volinfo);
d1681e
 
d1681e
         cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
d1681e
+                /* Mark start_triggered to false so that in case if this brick
d1681e
+                 * was brought down through gf_attach utility, the
d1681e
+                 * brickinfo->start_triggered wouldn't have been updated to
d1681e
+                 * _gf_false
d1681e
+                 */
d1681e
+                if (flags & GF_CLI_FLAG_OP_FORCE) {
d1681e
+                        brickinfo->start_triggered = _gf_false;
d1681e
+                }
d1681e
                 ret = glusterd_brick_start (volinfo, brickinfo, wait);
d1681e
                 /* If 'force' try to start all bricks regardless of success or
d1681e
                  * failure
d1681e
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
d1681e
index 722d2f8..d4bb236 100644
d1681e
--- a/xlators/mgmt/glusterd/src/glusterd.h
d1681e
+++ b/xlators/mgmt/glusterd/src/glusterd.h
d1681e
@@ -240,6 +240,8 @@ struct glusterd_brickinfo {
d1681e
         uint64_t           statfs_fsid;
d1681e
         uint32_t           fs_share_count;
d1681e
         gf_boolean_t       port_registered;
d1681e
+        gf_boolean_t       start_triggered;
d1681e
+        pthread_mutex_t    restart_mutex;
d1681e
 };
d1681e
 
d1681e
 typedef struct glusterd_brickinfo glusterd_brickinfo_t;
d1681e
-- 
d1681e
1.8.3.1
d1681e