Blob Blame History Raw
From 938ee38c02cce2a743c672f9c03798ebcbb1e348 Mon Sep 17 00:00:00 2001
From: Atin Mukherjee <amukherj@redhat.com>
Date: Thu, 26 Oct 2017 14:26:30 +0530
Subject: [PATCH 44/74] glusterd: fix brick restart parallelism

glusterd's brick restart logic is not always sequential, as there are
at least three different ways in which bricks are restarted:
1. through friend-sm and glusterd_spawn_daemons ()
2. through friend-sm and handling volume quorum action
3. through friend handshaking when there is a mismatch on quorum on
friend import.

In a brick multiplexing setup, glusterd ended up trying to spawn the
same brick process a couple of times, as two threads hit
glusterd_brick_start () within a fraction of a millisecond; glusterd
had no way to reject either of them, since the brick start criteria
were met in both cases.

As a solution, this is controlled by two different mechanisms: one is a
boolean called start_triggered, which indicates that a brick start has
been triggered and remains true until the brick dies or is killed; the
second is a mutex lock that ensures, for a particular brick, we don't
end up entering glusterd_brick_start () more than once at the same
point in time.

>mainline patch : https://review.gluster.org/#/c/18577

Change-Id: I292f1e58d6971e111725e1baea1fe98b890b43e2
BUG: 1505363
Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/123056
Reviewed-by: Gaurav Yadav <gyadav@redhat.com>
---
 xlators/mgmt/glusterd/src/glusterd-handler.c       | 24 ++++++++-----
 xlators/mgmt/glusterd/src/glusterd-op-sm.c         | 31 ++++++++++-------
 xlators/mgmt/glusterd/src/glusterd-server-quorum.c | 15 +++++++--
 xlators/mgmt/glusterd/src/glusterd-utils.c         | 39 +++++++++++++++++-----
 xlators/mgmt/glusterd/src/glusterd-volume-ops.c    |  8 +++++
 xlators/mgmt/glusterd/src/glusterd.h               |  2 ++
 6 files changed, 87 insertions(+), 32 deletions(-)

diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index 34e751c..c3b9252 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -5946,16 +5946,22 @@ glusterd_mark_bricks_stopped_by_proc (glusterd_brick_proc_t *brick_proc) {
         int                       ret              =  -1;
 
         cds_list_for_each_entry (brickinfo, &brick_proc->bricks, brick_list) {
-                ret =  glusterd_get_volinfo_from_brick (brickinfo->path, &volinfo);
+                ret =  glusterd_get_volinfo_from_brick (brickinfo->path,
+                                                        &volinfo);
                 if (ret) {
-                        gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
-                                "Failed to get volinfo from brick(%s)",
-                                brickinfo->path);
+                        gf_msg (THIS->name, GF_LOG_ERROR, 0,
+                                GD_MSG_VOLINFO_GET_FAIL, "Failed to get volinfo"
+                                " from brick(%s)", brickinfo->path);
                         goto out;
                 }
-                cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks, brick_list) {
-                        if (strcmp (brickinfo->path, brickinfo_tmp->path) == 0)
-                                glusterd_set_brick_status (brickinfo_tmp, GF_BRICK_STOPPED);
+                cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks,
+                                         brick_list) {
+                        if (strcmp (brickinfo->path,
+                                    brickinfo_tmp->path) == 0) {
+                                glusterd_set_brick_status (brickinfo_tmp,
+                                                           GF_BRICK_STOPPED);
+                                brickinfo_tmp->start_triggered = _gf_false;
+                        }
                 }
         }
         return 0;
@@ -6129,8 +6135,10 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
                                 if (temp == 1)
                                         break;
                         }
-                } else
+                } else {
                         glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
+                        brickinfo->start_triggered = _gf_false;
+                }
                 break;
 
         case RPC_CLNT_DESTROY:
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
index 9641b4f..5b8f833 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
@@ -2402,18 +2402,25 @@ glusterd_start_bricks (glusterd_volinfo_t *volinfo)
         GF_ASSERT (volinfo);
 
         cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
-                ret = glusterd_brick_start (volinfo, brickinfo, _gf_false);
-                if (ret) {
-                        gf_msg (THIS->name, GF_LOG_ERROR, 0,
-                                GD_MSG_BRICK_DISCONNECTED,
-                                "Failed to start %s:%s for %s",
-                                brickinfo->hostname, brickinfo->path,
-                                volinfo->volname);
-                        gf_event (EVENT_BRICK_START_FAILED,
-                                  "peer=%s;volume=%s;brick=%s",
-                                  brickinfo->hostname, volinfo->volname,
-                                  brickinfo->path);
-                        goto out;
+                if (!brickinfo->start_triggered) {
+                        pthread_mutex_lock (&brickinfo->restart_mutex);
+                        {
+                                ret = glusterd_brick_start (volinfo, brickinfo,
+                                                            _gf_false);
+                        }
+                        pthread_mutex_unlock (&brickinfo->restart_mutex);
+                        if (ret) {
+                                gf_msg (THIS->name, GF_LOG_ERROR, 0,
+                                        GD_MSG_BRICK_DISCONNECTED,
+                                        "Failed to start %s:%s for %s",
+                                        brickinfo->hostname, brickinfo->path,
+                                        volinfo->volname);
+                                gf_event (EVENT_BRICK_START_FAILED,
+                                          "peer=%s;volume=%s;brick=%s",
+                                          brickinfo->hostname, volinfo->volname,
+                                          brickinfo->path);
+                                goto out;
+                        }
                 }
 
         }
diff --git a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
index 4706403..995a568 100644
--- a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
+++ b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
@@ -362,10 +362,19 @@ glusterd_do_volume_quorum_action (xlator_t *this, glusterd_volinfo_t *volinfo,
         list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
                 if (!glusterd_is_local_brick (this, volinfo, brickinfo))
                         continue;
-                if (quorum_status == DOESNT_MEET_QUORUM)
+                if (quorum_status == DOESNT_MEET_QUORUM) {
                         glusterd_brick_stop (volinfo, brickinfo, _gf_false);
-                else
-                        glusterd_brick_start (volinfo, brickinfo, _gf_false);
+                } else {
+                        if (!brickinfo->start_triggered) {
+                                pthread_mutex_lock (&brickinfo->restart_mutex);
+                                {
+                                        glusterd_brick_start (volinfo,
+                                                              brickinfo,
+                                                              _gf_false);
+                                }
+                                pthread_mutex_unlock (&brickinfo->restart_mutex);
+                        }
+                }
         }
         volinfo->quorum_status = quorum_status;
         if (quorum_status == MEETS_QUORUM) {
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index bb236df..18de517 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -1084,7 +1084,7 @@ glusterd_brickinfo_new (glusterd_brickinfo_t **brickinfo)
                 goto out;
 
         CDS_INIT_LIST_HEAD (&new_brickinfo->brick_list);
-
+        pthread_mutex_init (&new_brickinfo->restart_mutex, NULL);
         *brickinfo = new_brickinfo;
 
         ret = 0;
@@ -2481,7 +2481,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
         (void) sys_unlink (pidfile);
 
         brickinfo->status = GF_BRICK_STOPPED;
-
+        brickinfo->start_triggered = _gf_false;
         if (del_brick)
                 glusterd_delete_brick (volinfo, brickinfo);
 out:
@@ -5817,13 +5817,14 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
          * three different triggers for an attempt to start the brick process
          * due to the quorum handling code in glusterd_friend_sm.
          */
-        if (brickinfo->status == GF_BRICK_STARTING) {
+        if (brickinfo->status == GF_BRICK_STARTING ||
+            brickinfo->start_triggered) {
                 gf_msg_debug (this->name, 0, "brick %s is already in starting "
                               "phase", brickinfo->path);
                 ret = 0;
                 goto out;
         }
-
+        brickinfo->start_triggered = _gf_true;
         GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf);
         if (gf_is_service_running (pidfile, &pid)) {
                 if (brickinfo->status != GF_BRICK_STARTING &&
@@ -5936,6 +5937,9 @@ run:
         }
 
 out:
+        if (ret && brickinfo) {
+                brickinfo->start_triggered = _gf_false;
+        }
         gf_msg_debug (this->name, 0, "returning %d ", ret);
         return ret;
 }
@@ -5997,11 +6001,19 @@ glusterd_restart_bricks (glusterd_conf_t *conf)
                                 start_svcs = _gf_true;
                                 glusterd_svcs_manager (NULL);
                         }
-
                         cds_list_for_each_entry (brickinfo, &volinfo->bricks,
                                                  brick_list) {
-                                glusterd_brick_start (volinfo, brickinfo,
-                                                     _gf_false);
+                                if (!brickinfo->start_triggered) {
+                                        pthread_mutex_lock
+                                                (&brickinfo->restart_mutex);
+                                        {
+                                                glusterd_brick_start
+                                                         (volinfo, brickinfo,
+                                                          _gf_false);
+                                        }
+                                        pthread_mutex_unlock
+                                                (&brickinfo->restart_mutex);
+                                }
                         }
                         ret = glusterd_store_volinfo
                                 (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
@@ -6040,8 +6052,17 @@ glusterd_restart_bricks (glusterd_conf_t *conf)
                                 "volume %s", volinfo->volname);
                         cds_list_for_each_entry (brickinfo, &volinfo->bricks,
                                                  brick_list) {
-                                glusterd_brick_start (volinfo, brickinfo,
-                                                      _gf_false);
+                                if (!brickinfo->start_triggered) {
+                                        pthread_mutex_lock
+                                                (&brickinfo->restart_mutex);
+                                        {
+                                                glusterd_brick_start
+                                                         (volinfo, brickinfo,
+                                                          _gf_false);
+                                        }
+                                        pthread_mutex_unlock
+                                                (&brickinfo->restart_mutex);
+                                }
                         }
                         ret = glusterd_store_volinfo
                                 (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
index 834acab..bec5f72 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
@@ -2545,6 +2545,14 @@ glusterd_start_volume (glusterd_volinfo_t *volinfo, int flags,
         GF_ASSERT (volinfo);
 
         cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+                /* Mark start_triggered to false so that in case if this brick
+                 * was brought down through gf_attach utility, the
+                 * brickinfo->start_triggered wouldn't have been updated to
+                 * _gf_false
+                 */
+                if (flags & GF_CLI_FLAG_OP_FORCE) {
+                        brickinfo->start_triggered = _gf_false;
+                }
                 ret = glusterd_brick_start (volinfo, brickinfo, wait);
                 /* If 'force' try to start all bricks regardless of success or
                  * failure
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index 722d2f8..d4bb236 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -240,6 +240,8 @@ struct glusterd_brickinfo {
         uint64_t           statfs_fsid;
         uint32_t           fs_share_count;
         gf_boolean_t       port_registered;
+        gf_boolean_t       start_triggered;
+        pthread_mutex_t    restart_mutex;
 };
 
 typedef struct glusterd_brickinfo glusterd_brickinfo_t;
-- 
1.8.3.1