From 09698d53b91786c990a0f7bc067e5c13551b0b12 Mon Sep 17 00:00:00 2001
From: Xavi Hernandez <jahernan@redhat.com>
Date: Wed, 21 Feb 2018 17:47:37 +0100
Subject: [PATCH 199/201] cluster/ec: avoid delays in self-heal

Self-heal creates a thread per brick to sweep the index looking for
files that need to be healed. These threads are started before the
volume comes online, so nothing is done but waiting for the next
sweep. This happens once per minute.

When a replace brick command is executed, the new graph is loaded and
all index sweeper threads started. When all bricks have reported, a
getxattr request is sent to the root directory of the volume. This
causes a heal on it (because the new brick doesn't have good data),
and marks its contents as pending to be healed. This is done by the
index sweeper thread on the next round, one minute later.

This patch solves this problem by waking all index sweeper threads
after a successful check on the root directory.

Additionally, the index sweep thread scans the index directory
sequentially, but it might happen that after healing a directory entry
more index entries are created but skipped by the current directory
scan. This causes the remaining entries to be processed on the next
round, one minute later. The same can happen in the next round, so
the heal is running in bursts and taking a lot to finish, specially
on volumes with many directory levels.

This patch solves this problem by immediately restarting the index
sweep if a directory has been healed.

> Upstream patch: https://review.gluster.org/19718
> master patch: https://review.gluster.org/#/c/19609/

Change-Id: I58d9ab6ef17b30f704dc322e1d3d53b904e5f30e
BUG: 1555261
Signed-off-by: Xavi Hernandez <jahernan@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/133570
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
Tested-by: Ashish Pandey <aspandey@redhat.com>
Tested-by: RHGS Build Bot <nigelb@redhat.com>
---
 tests/bugs/ec/bug-1547662.t       |  41 ++++++++++++++++
 xlators/cluster/ec/src/ec-heal.c  |   9 ++++
 xlators/cluster/ec/src/ec-heald.c |  27 +++++++---
 xlators/cluster/ec/src/ec-heald.h |   4 +-
 xlators/cluster/ec/src/ec.c       | 101 ++++++++++++++++++++++----------------
 5 files changed, 134 insertions(+), 48 deletions(-)
 create mode 100644 tests/bugs/ec/bug-1547662.t

diff --git a/tests/bugs/ec/bug-1547662.t b/tests/bugs/ec/bug-1547662.t
new file mode 100644
index 0000000..5748218
--- /dev/null
+++ b/tests/bugs/ec/bug-1547662.t
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+# Immediately after replace-brick, trusted.ec.version will be absent, so if it
+# is present we can assume that heal was started on root
+function root_heal_attempted {
+        if [ -z $(get_hex_xattr trusted.ec.version $1) ]; then
+                echo "N"
+        else
+                echo "Y"
+        fi
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST ${CLI} volume create ${V0} disperse 6 redundancy 2 ${H0}:${B0}/${V0}{0..5}
+TEST ${CLI} volume start ${V0}
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0
+
+TEST mkdir ${M0}/base
+TEST mkdir ${M0}/base/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+
+TEST ${CLI} volume replace-brick ${V0} ${H0}:${B0}/${V0}5 ${H0}:${B0}/${V0}6 commit force
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0
+EXPECT_WITHIN ${PROCESS_UP_TIMEOUT} "Y" glustershd_up_status
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count_shd ${V0} 0
+EXPECT_WITHIN ${HEAL_TIMEOUT} "Y" root_heal_attempted ${B0}/${V0}6
+EXPECT_WITHIN ${HEAL_TIMEOUT} "^0$" get_pending_heal_count ${V0}
+EXPECT "^127$" echo $(find ${B0}/${V0}6/base -type d | wc -l)
+
+cleanup;
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index b8518d6..8e02986 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -25,6 +25,7 @@
 #include "ec-combine.h"
 #include "ec-method.h"
 #include "ec-fops.h"
+#include "ec-heald.h"
 
 #define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr; })
 #define EC_COUNT(array, max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res; })
@@ -2752,6 +2753,14 @@ ec_replace_heal (ec_t *ec, inode_t *inode)
                 gf_msg_debug (ec->xl->name, 0,
                         "Heal failed for replace brick ret = %d", ret);
 
+        /* Once the root inode has been checked, it might have triggered a
+         * self-heal on it after a replace brick command or for some other
+         * reason. It can also happen that the volume already had damaged
+         * files in the index, even if the heal on the root directory failed.
+         * In both cases we need to wake all index healers to continue
+         * healing remaining entries that are marked as dirty. */
+        ec_shd_index_healer_wake(ec);
+
         loc_wipe (&loc);
         return ret;
 }
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
index b4fa6f8..a703379 100644
--- a/xlators/cluster/ec/src/ec-heald.c
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -184,8 +184,19 @@ ec_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name)
 int
 ec_shd_selfheal (struct subvol_healer *healer, int child, loc_t *loc)
 {
-        return syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL,
-                                NULL);
+        int32_t ret;
+
+        ret = syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL,
+                               NULL);
+        if ((ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
+                /* If we have just healed a directory, it's possible that
+                 * other index entries have appeared to be healed. We put a
+                 * mark so that we can check it later and restart a scan
+                 * without delay. */
+                healer->rerun = _gf_true;
+        }
+
+        return ret;
 }
 
 
@@ -472,11 +483,15 @@ ec_shd_index_healer_spawn (xlator_t *this, int subvol)
 }
 
 void
-ec_selfheal_childup (ec_t *ec, int child)
+ec_shd_index_healer_wake(ec_t *ec)
 {
-        if (!ec->shd.iamshd)
-                return;
-        ec_shd_index_healer_spawn (ec->xl, child);
+        int32_t i;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (((ec->xl_up >> i) & 1) != 0) {
+                        ec_shd_index_healer_spawn(ec->xl, i);
+                }
+        }
 }
 
 int
diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h
index 4ae02e2..2a84881 100644
--- a/xlators/cluster/ec/src/ec-heald.h
+++ b/xlators/cluster/ec/src/ec-heald.h
@@ -20,6 +20,8 @@ ec_xl_op (xlator_t *this, dict_t *input, dict_t *output);
 
 int
 ec_selfheal_daemon_init (xlator_t *this);
-void ec_selfheal_childup (ec_t *ec, int child);
+
+void
+ec_shd_index_healer_wake(ec_t *ec);
 
 #endif /* __EC_HEALD_H__ */
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index bfdca64..956b45b 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -322,7 +322,7 @@ ec_get_event_from_state (ec_t *ec)
                 /* If ec is up but some subvolumes are yet to notify, give
                  * grace time for other subvols to notify to prevent start of
                  * I/O which may result in self-heals */
-                if (ec->timer && ec->xl_notify_count < ec->nodes)
+                if (ec->xl_notify_count < ec->nodes)
                         return GF_EVENT_MAXVAL;
 
                 return GF_EVENT_CHILD_UP;
@@ -344,8 +344,8 @@ ec_up (xlator_t *this, ec_t *ec)
         }
 
         ec->up = 1;
-        gf_msg (this->name, GF_LOG_INFO, 0,
-                EC_MSG_EC_UP, "Going UP");
+        gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP");
+
         gf_event (EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name);
 }
 
@@ -358,8 +358,8 @@ ec_down (xlator_t *this, ec_t *ec)
         }
 
         ec->up = 0;
-        gf_msg (this->name, GF_LOG_INFO, 0,
-                EC_MSG_EC_DOWN, "Going DOWN");
+        gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN");
+
         gf_event (EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name);
 }
 
@@ -383,31 +383,38 @@ ec_notify_cbk (void *data)
                 gf_timer_call_cancel (ec->xl->ctx, ec->timer);
                 ec->timer = NULL;
 
+                /* The timeout has expired, so any subvolume that has not
+                 * already reported its state, will be considered to be down.
+                 * We mark as if all bricks had reported. */
+                ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
+                ec->xl_notify_count = ec->nodes;
+
+                /* Since we have marked all subvolumes as notified, it's
+                 * guaranteed that ec_get_event_from_state() will return
+                 * CHILD_UP or CHILD_DOWN, but not MAXVAL. */
                 event = ec_get_event_from_state (ec);
-                /* If event is still MAXVAL then enough subvolumes didn't
-                 * notify, treat it as CHILD_DOWN. */
-                if (event == GF_EVENT_MAXVAL) {
-                        event = GF_EVENT_CHILD_DOWN;
-                        ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
-                        ec->xl_notify_count = ec->nodes;
-                } else if (event == GF_EVENT_CHILD_UP) {
-                        /* Rest of the bricks are still not coming up,
-                         * notify that ec is up. Files/directories will be
-                         * healed as in when they come up. */
+                if (event == GF_EVENT_CHILD_UP) {
+                        /* We are ready to bring the volume up. If there are
+                         * still bricks DOWN, they will be healed when they
+                         * come up. */
                         ec_up (ec->xl, ec);
                 }
 
-                /* CHILD_DOWN should not come here as no grace period is given
-                 * for notifying CHILD_DOWN. */
-
                 propagate = _gf_true;
         }
unlock:
         UNLOCK(&ec->lock);
 
         if (propagate) {
+                if ((event == GF_EVENT_CHILD_UP) && ec->shd.iamshd) {
+                        /* We have just brought the volume UP, so we trigger
+                         * a self-heal check on the root directory. */
+                        ec_launch_replace_heal (ec);
+                }
+
                 default_notify (ec->xl, event, NULL);
         }
+
 }
 
 void
@@ -442,7 +449,7 @@ ec_pending_fops_completed(ec_t *ec)
         }
 }
 
-static void
+static gf_boolean_t
 ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
 {
         uintptr_t current_state = 0;
@@ -455,23 +462,28 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
         if (current_state != new_state) {
                 ec->xl_up ^= index_mask;
                 ec->xl_up_count += (current_state ? -1 : 1);
+
+                return _gf_true;
         }
+
+        return _gf_false;
 }
 
 int32_t
 ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 {
-        ec_t             *ec        = this->private;
-        int32_t           idx       = 0;
-        int32_t           error     = 0;
-        glusterfs_event_t old_event = GF_EVENT_MAXVAL;
-        dict_t            *input    = NULL;
-        dict_t            *output   = NULL;
-        gf_boolean_t      propagate = _gf_true;
-        int32_t           orig_event = event;
+        ec_t             *ec              = this->private;
+        int32_t           idx             = 0;
+        int32_t           error           = 0;
+        glusterfs_event_t old_event       = GF_EVENT_MAXVAL;
+        dict_t            *input          = NULL;
+        dict_t            *output         = NULL;
+        gf_boolean_t      propagate       = _gf_true;
+        gf_boolean_t      needs_shd_check = _gf_false;
+        int32_t           orig_event      = event;
         struct gf_upcall *up_data   = NULL;
         struct gf_upcall_cache_invalidation *up_ci = NULL;
-        uintptr_t mask = 0;
+        uintptr_t         mask            = 0;
 
         gf_msg_trace (this->name, 0, "NOTIFY(%d): %p, %p",
                 event, data, data2);
@@ -498,8 +510,6 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 
         for (idx = 0; idx < ec->nodes; idx++) {
                 if (ec->xl_list[idx] == data) {
-                        if (event == GF_EVENT_CHILD_UP)
-                                ec_selfheal_childup (ec, idx);
                         break;
                 }
         }
@@ -525,17 +535,27 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 
                 mask = 1ULL << idx;
                 if (event == GF_EVENT_CHILD_UP) {
-                    ec_set_up_state(ec, mask, mask);
+                        /* We need to trigger a selfheal if a brick changes
+                         * to UP state. */
+                        needs_shd_check = ec_set_up_state(ec, mask, mask);
                 } else if (event == GF_EVENT_CHILD_DOWN) {
-                    ec_set_up_state(ec, mask, 0);
+                        ec_set_up_state(ec, mask, 0);
                 }
 
                 event = ec_get_event_from_state (ec);
 
-                if (event == GF_EVENT_CHILD_UP && !ec->up) {
-                        ec_up (this, ec);
-                } else if (event == GF_EVENT_CHILD_DOWN && ec->up) {
-                        ec_down (this, ec);
+                if (event == GF_EVENT_CHILD_UP) {
+                        if (!ec->up) {
+                                ec_up (this, ec);
+                        }
+                } else {
+                        /* If the volume is not UP, it's irrelevant if one
+                         * brick has come up. We cannot heal anything. */
+                        needs_shd_check = _gf_false;
+
+                        if ((event == GF_EVENT_CHILD_DOWN) && ec->up) {
+                                ec_down (this, ec);
+                        }
                 }
 
                 if (event != GF_EVENT_MAXVAL) {
@@ -554,14 +574,13 @@ unlock:
 
 done:
         if (propagate) {
+                if (needs_shd_check && ec->shd.iamshd) {
+                        ec_launch_replace_heal (ec);
+                }
+
                 error = default_notify (this, event, data);
         }
 
-        if (ec->shd.iamshd &&
-            ec->xl_notify_count == ec->nodes &&
-            event == GF_EVENT_CHILD_UP) {
-                ec_launch_replace_heal (ec);
-        }
 out:
         return error;
 }
-- 
1.8.3.1