Blob Blame History Raw
From 3be59a9bdebcb6f54ddd0098e5f5038a739c5a7c Mon Sep 17 00:00:00 2001
From: Ravishankar N <ravishankar@redhat.com>
Date: Wed, 23 Dec 2015 13:49:14 +0530
Subject: [PATCH 095/104] afr: propagate child up event after timeout

Patch in master: http://review.gluster.org/11113
Patch in release-3.7: http://review.gluster.org/#/c/14088/

Problem: During mount, afr waits for response from all its children before
notifying the parent xlator. In a 1x2 replica volume , if one of the nodes is
down, the mount will hang for more than a minute until child down is received
from the client xlator for that node.

Fix:
When parent up is received by afr, start a 10 second timer. In the timer call
back, if we receive a successful child up from atleast one brick, propagate the
event to the parent xlator.

Change-Id: I58b7f51943d285ae9797be0a42961a743855e7db
BUG: 1286911
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/73112
---
 xlators/cluster/afr/src/afr-common.c   |  280 +++++++++++++++++++++-----------
 xlators/cluster/afr/src/afr-messages.h |   10 +-
 xlators/cluster/afr/src/afr.c          |    6 +
 xlators/cluster/afr/src/afr.h          |    3 +-
 4 files changed, 200 insertions(+), 99 deletions(-)

diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 1345d4d..9ff3740 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -3922,6 +3922,117 @@ find_child_index (xlator_t *this, xlator_t *child)
         return i;
 }
 
+static int
+__afr_get_up_children_count (afr_private_t *priv)
+{
+        int             up_children         = 0;
+        int             i = 0;
+
+        for (i = 0; i < priv->child_count; i++)
+                if (priv->child_up[i] == 1)
+                        up_children++;
+
+        return up_children;
+}
+
+glusterfs_event_t
+__afr_transform_event_from_state (afr_private_t *priv)
+{
+        int i = 0;
+        int up_children = 0;
+
+        if (AFR_COUNT (priv->last_event, priv->child_count) ==
+                       priv->child_count)
+                /* have_heard_from_all. Let afr_notify() do the propagation. */
+                return GF_EVENT_MAXVAL;
+
+        up_children = __afr_get_up_children_count (priv);
+        if (up_children) {
+                /* We received at least one child up and there are pending
+                 * notifications from some children. Treat these children as
+                 * having sent a GF_EVENT_CHILD_DOWN. i.e. set the event as
+                 * GF_EVENT_CHILD_MODIFIED, as done in afr_notify() */
+                for (i = 0; i < priv->child_count; i++) {
+                        if (priv->last_event[i])
+                                continue;
+                        priv->last_event[i] = GF_EVENT_CHILD_MODIFIED;
+                        priv->child_up[i] = 0;
+                }
+                return GF_EVENT_CHILD_UP;
+        } else {
+                for (i = 0; i < priv->child_count; i++) {
+                        if (priv->last_event[i])
+                                continue;
+                        priv->last_event[i] = GF_EVENT_SOME_CHILD_DOWN;
+                        priv->child_up[i] = 0;
+                }
+                return GF_EVENT_CHILD_DOWN;
+        }
+
+        return GF_EVENT_MAXVAL;
+}
+
+static void
+afr_notify_cbk (void *data)
+{
+        xlator_t *this = data;
+        afr_private_t *priv = this->private;
+        glusterfs_event_t event = GF_EVENT_MAXVAL;
+        gf_boolean_t propagate = _gf_false;
+
+        LOCK (&priv->lock);
+        {
+                if (!priv->timer) {
+                        /*
+                         * Either child_up/child_down is already sent to parent.
+                         * This is a spurious wake up.
+                         */
+                        goto unlock;
+                }
+                priv->timer = NULL;
+                event = __afr_transform_event_from_state (priv);
+                if (event != GF_EVENT_MAXVAL)
+                        propagate = _gf_true;
+        }
+unlock:
+        UNLOCK (&priv->lock);
+        if (propagate)
+                default_notify (this, event, NULL);
+}
+
+static void
+__afr_launch_notify_timer (xlator_t *this, afr_private_t *priv)
+{
+
+        struct timespec delay = {0, };
+
+        gf_msg_debug (this->name, 0, "Initiating child-down timer");
+        delay.tv_sec = 10;
+        delay.tv_nsec = 0;
+        priv->timer = gf_timer_call_after (this->ctx, delay,
+                                           afr_notify_cbk, this);
+        if (priv->timer == NULL) {
+                gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_TIMER_CREATE_FAIL,
+                        "Cannot create timer for delayed initialization");
+        }
+}
+
+int
+__get_heard_from_all_status (xlator_t *this)
+{
+        afr_private_t *priv          = this->private;
+        int           heard_from_all = 1;
+        int           i              = 0;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (!priv->last_event[i]) {
+                        heard_from_all = 0;
+                        break;
+                }
+        }
+        return heard_from_all;
+}
+
 int32_t
 afr_notify (xlator_t *this, int32_t event,
             void *data, void *data2)
@@ -3954,12 +4065,6 @@ afr_notify (xlator_t *this, int32_t event,
          */
         priv->did_discovery = _gf_false;
 
-        had_heard_from_all = 1;
-        for (i = 0; i < priv->child_count; i++) {
-                if (!priv->last_event[i]) {
-                        had_heard_from_all = 0;
-                }
-        }
 
         /* parent xlators dont need to know about every child_up, child_down
          * because of afr ha. If all subvolumes go down, child_down has
@@ -3979,10 +4084,32 @@ afr_notify (xlator_t *this, int32_t event,
 
         had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,
                                                            this);
-        switch (event) {
-        case GF_EVENT_CHILD_UP:
+        if (event == GF_EVENT_TRANSLATOR_OP) {
                 LOCK (&priv->lock);
                 {
+                        had_heard_from_all = __get_heard_from_all_status (this);
+                }
+                UNLOCK (&priv->lock);
+
+                if (!had_heard_from_all) {
+                        ret = -1;
+                } else {
+                        input = data;
+                        output = data2;
+                        ret = afr_xl_op (this, input, output);
+                }
+                goto out;
+        }
+
+        LOCK (&priv->lock);
+        {
+                had_heard_from_all = __get_heard_from_all_status (this);
+                switch (event) {
+                case GF_EVENT_PARENT_UP:
+                        __afr_launch_notify_timer (this, priv);
+                        propagate = 1;
+                        break;
+                case GF_EVENT_CHILD_UP:
                         /*
                          * This only really counts if the child was never up
                          * (value = -1) or had been down (value = 0).  See
@@ -3990,48 +4117,28 @@ afr_notify (xlator_t *this, int32_t event,
                          * explanation.
                          */
                         if (priv->child_up[idx] != 1) {
-                                priv->up_count++;
-				priv->event_generation++;
+                                priv->event_generation++;
                         }
                         priv->child_up[idx] = 1;
 
                         call_psh = 1;
-                        for (i = 0; i < priv->child_count; i++)
-                                if (priv->child_up[i] == 1)
-                                        up_children++;
+                        up_children = __afr_get_up_children_count (priv);
                         if (up_children == 1) {
                                 gf_msg (this->name, GF_LOG_INFO, 0,
                                         AFR_MSG_SUBVOL_UP,
                                         "Subvolume '%s' came back up; "
-                                        "going online.", ((xlator_t *)data)->name);
+                                     "going online.", ((xlator_t *)data)->name);
                         } else {
                                 event = GF_EVENT_CHILD_MODIFIED;
                         }
 
                         priv->last_event[idx] = event;
-                }
-                UNLOCK (&priv->lock);
 
-                break;
+                        break;
 
-        case GF_EVENT_CHILD_DOWN:
-                LOCK (&priv->lock);
-                {
-                        /*
-                         * If a brick is down when we start, we'll get a
-                         * CHILD_DOWN to indicate its initial state.  There
-                         * was never a CHILD_UP in this case, so if we
-                         * increment "down_count" the difference between than
-                         * and "up_count" will no longer be the number of
-                         * children that are currently up.  This has serious
-                         * implications e.g. for quorum enforcement, so we
-                         * don't increment these values unless the event
-                         * represents an actual state transition between "up"
-                         * (value = 1) and anything else.
-                         */
+                case GF_EVENT_CHILD_DOWN:
                         if (priv->child_up[idx] == 1) {
-                                priv->down_count++;
-				priv->event_generation++;
+                                priv->event_generation++;
                         }
                         priv->child_up[idx] = 0;
 
@@ -4041,42 +4148,56 @@ afr_notify (xlator_t *this, int32_t event,
                         if (down_children == priv->child_count) {
                                 gf_msg (this->name, GF_LOG_ERROR, 0,
                                         AFR_MSG_ALL_SUBVOLS_DOWN,
-                                        "All subvolumes are down. Going offline "
-                                        "until atleast one of them comes back up.");
+                                       "All subvolumes are down. Going offline "
+                                    "until atleast one of them comes back up.");
                         } else {
                                 event = GF_EVENT_SOME_CHILD_DOWN;
                         }
 
                         priv->last_event[idx] = event;
-                }
-                UNLOCK (&priv->lock);
 
-                break;
+                        break;
 
-        case GF_EVENT_CHILD_CONNECTING:
-                LOCK (&priv->lock);
-                {
+                case GF_EVENT_CHILD_CONNECTING:
                         priv->last_event[idx] = event;
-                }
-                UNLOCK (&priv->lock);
 
-                break;
+                        break;
 
-        case GF_EVENT_TRANSLATOR_OP:
-                input = data;
-                output = data2;
-                if (!had_heard_from_all) {
-                        ret = -1;
-                        goto out;
+                case GF_EVENT_SOME_CHILD_DOWN:
+                        priv->last_event[idx] = event;
+                        break;
+
+                default:
+                        propagate = 1;
+                        break;
                 }
-                ret = afr_xl_op (this, input, output);
-                goto out;
-                break;
+                have_heard_from_all = __get_heard_from_all_status (this);
+                if (!had_heard_from_all && have_heard_from_all) {
+                        if (priv->timer) {
+                                gf_timer_call_cancel (this->ctx, priv->timer);
+                                priv->timer = NULL;
+                        }
+                        /* This is the first event which completes aggregation
+                           of events from all subvolumes. If at least one subvol
+                           had come up, propagate CHILD_UP, but only this time
+                        */
+                        event = GF_EVENT_CHILD_DOWN;
+                        up_children = __afr_get_up_children_count (priv);
+                        for (i = 0; i < priv->child_count; i++) {
+                                if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
+                                        event = GF_EVENT_CHILD_UP;
+                                        break;
+                                }
 
-        default:
-                propagate = 1;
-                break;
+                                if (priv->last_event[i] ==
+                                                GF_EVENT_CHILD_CONNECTING) {
+                                        event = GF_EVENT_CHILD_CONNECTING;
+                               /* continue to check other events for CHILD_UP */
+                                }
+                        }
+                }
         }
+        UNLOCK (&priv->lock);
 
         if (priv->quorum_count) {
                 has_quorum = afr_has_quorum (priv->child_up, this);
@@ -4089,44 +4210,11 @@ afr_notify (xlator_t *this, int32_t event,
                                 "Client-quorum is not met");
         }
 
-        /* have all subvolumes reported status once by now? */
-        have_heard_from_all = 1;
-        for (i = 0; i < priv->child_count; i++) {
-                if (!priv->last_event[i])
-                        have_heard_from_all = 0;
-        }
-
         /* if all subvols have reported status, no need to hide anything
            or wait for anything else. Just propagate blindly */
         if (have_heard_from_all)
                 propagate = 1;
 
-        if (!had_heard_from_all && have_heard_from_all) {
-                /* This is the first event which completes aggregation
-                   of events from all subvolumes. If at least one subvol
-                   had come up, propagate CHILD_UP, but only this time
-                */
-                event = GF_EVENT_CHILD_DOWN;
-
-                LOCK (&priv->lock);
-                {
-                        up_children = AFR_COUNT (priv->child_up, priv->child_count);
-                        for (i = 0; i < priv->child_count; i++) {
-                                if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
-                                        event = GF_EVENT_CHILD_UP;
-                                        break;
-                                }
-
-                                if (priv->last_event[i] ==
-                                                GF_EVENT_CHILD_CONNECTING) {
-                                        event = GF_EVENT_CHILD_CONNECTING;
-                                        /* continue to check other events for CHILD_UP */
-                                }
-                        }
-                }
-                UNLOCK (&priv->lock);
-        }
-
         ret = 0;
         if (propagate)
                 ret = default_notify (this, event, data);
@@ -4371,18 +4459,18 @@ gf_boolean_t
 afr_have_quorum (char *logname, afr_private_t *priv)
 {
         unsigned int        quorum = 0;
+        unsigned int        up_children = 0;
 
         GF_VALIDATE_OR_GOTO(logname,priv,out);
 
+        up_children = __afr_get_up_children_count (priv);
         quorum = priv->quorum_count;
-        if (quorum != AFR_QUORUM_AUTO) {
-                return (priv->up_count >= (priv->down_count + quorum));
-        }
+        if (quorum != AFR_QUORUM_AUTO)
+                return up_children >= quorum;
 
         quorum = priv->child_count / 2 + 1;
-        if (priv->up_count >= (priv->down_count + quorum)) {
+        if (up_children >= quorum)
                 return _gf_true;
-        }
 
         /*
          * Special case for even numbers of nodes: if we have exactly half
@@ -4393,7 +4481,7 @@ afr_have_quorum (char *logname, afr_private_t *priv)
          */
         if ((priv->child_count % 2) == 0) {
                 quorum = priv->child_count / 2;
-                if (priv->up_count >= (priv->down_count + quorum)) {
+                if (up_children >= quorum) {
                         if (priv->child_up[0]) {
                                 return _gf_true;
                         }
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
index 1fb5b51..f2cd830 100644
--- a/xlators/cluster/afr/src/afr-messages.h
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -45,7 +45,7 @@
  */
 
 #define GLFS_COMP_BASE_AFR      GLFS_MSGID_COMP_AFR
-#define GLFS_NUM_MESSAGES       39
+#define GLFS_NUM_MESSAGES       41
 #define GLFS_MSGID_END          (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1)
 
 #define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages"
@@ -357,5 +357,13 @@
 */
 #define AFR_MSG_NO_CHANGELOG  (GLFS_COMP_BASE_AFR + 40)
 
+/*!
+ * @messageid 108041
+ * @diagnosis Unable to create timer thread for delayed initialization.
+ * @recommendedaction Possibly check process's log file for messages from
+ * timer infra.
+*/
+#define AFR_MSG_TIMER_CREATE_FAIL               (GLFS_COMP_BASE_AFR + 41)
+
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
 #endif /* !_AFR_MESSAGES_H_ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index ad92eaf..dafacee 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -498,6 +498,12 @@ fini (xlator_t *this)
         afr_private_t *priv = NULL;
 
         priv = this->private;
+        LOCK (&priv->lock);
+        if (priv->timer != NULL) {
+                gf_timer_call_cancel(this->ctx, priv->timer);
+                priv->timer = NULL;
+        }
+        UNLOCK (&priv->lock);
         this->private = NULL;
         afr_priv_destroy (priv);
         //if (this->itable);//I dont see any destroy func
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index d3c1e10..c553a1f 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -104,8 +104,7 @@ typedef struct _afr_private {
 
         unsigned int wait_count;      /* # of servers to wait for success */
 
-        uint64_t up_count;      /* number of CHILD_UPs we have seen */
-        uint64_t down_count;    /* number of CHILD_DOWNs we have seen */
+        gf_timer_t *timer;      /* launched when parent up is received */
 
         gf_boolean_t      optimistic_change_log;
         gf_boolean_t      eager_lock;
-- 
1.7.1