From 09698d53b91786c990a0f7bc067e5c13551b0b12 Mon Sep 17 00:00:00 2001
From: Xavi Hernandez <jahernan@redhat.com>
Date: Wed, 21 Feb 2018 17:47:37 +0100
Subject: [PATCH 199/201] cluster/ec: avoid delays in self-heal

Self-heal creates a thread per brick to sweep the index looking for
files that need to be healed. These threads are started before the
volume comes online, so they do nothing but wait for the next sweep,
which happens once per minute.

When a replace brick command is executed, the new graph is loaded and
all index sweeper threads are started. Once all bricks have reported,
a getxattr request is sent to the root directory of the volume. This
triggers a heal on it (because the new brick doesn't have good data)
and marks its contents as pending to be healed. That healing is only
done by the index sweeper thread on its next round, one minute later.

This patch solves this problem by waking all index sweeper threads
after a successful check on the root directory.

Additionally, the index sweeper thread scans the index directory
sequentially, so index entries created while a directory is being
healed may land behind the current scan position and be skipped. The
remaining entries are then processed on the next round, one minute
later, and the same can happen again on that round, so the heal runs
in bursts and takes a long time to finish, especially on volumes with
many directory levels.

This patch solves this problem by immediately restarting the index
sweep if a directory has been healed.

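The timing change can be pictured with a small standalone sketch
(illustrative only, not part of this patch; healer_t, sweep_index and
healer_wake are hypothetical stand-ins for the real logic in
ec-heald.c). It models a sweeper thread that normally sleeps one
minute between rounds, but starts a new sweep immediately when the
previous round healed a directory (rerun flag) or when another thread
wakes it, e.g. after the root directory check:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

typedef struct {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        bool            rerun;
} healer_t;

/* Stand-in for the index sweep; returns true if it healed a directory,
 * which may have queued new index entries behind the scan position. */
static bool sweep_index(void)
{
        printf("sweeping index...\n");
        return false;
}

static void *healer_loop(void *arg)
{
        healer_t *h = arg;
        struct timespec ts;

        for (;;) {
                bool healed_dir = sweep_index();

                pthread_mutex_lock(&h->lock);
                if (healed_dir)
                        h->rerun = true;        /* rescan right away */
                if (!h->rerun) {
                        /* No pending work: sleep until the next round
                         * or until healer_wake() signals us. */
                        clock_gettime(CLOCK_REALTIME, &ts);
                        ts.tv_sec += 60;
                        pthread_cond_timedwait(&h->cond, &h->lock, &ts);
                }
                h->rerun = false;
                pthread_mutex_unlock(&h->lock);
        }
        return NULL;
}

/* Called from another thread, e.g. after a successful check on the
 * root directory, to start a sweep without waiting a full minute. */
static void healer_wake(healer_t *h)
{
        pthread_mutex_lock(&h->lock);
        h->rerun = true;
        pthread_cond_signal(&h->cond);
        pthread_mutex_unlock(&h->lock);
}

int main(void)
{
        healer_t h = { PTHREAD_MUTEX_INITIALIZER,
                       PTHREAD_COND_INITIALIZER, false };
        pthread_t tid;

        pthread_create(&tid, NULL, healer_loop, &h);
        sleep(2);
        healer_wake(&h);        /* next sweep starts immediately */
        sleep(2);
        return 0;
}

(build with: cc -pthread sketch.c)
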
> Upstream patch: https://review.gluster.org/19718
> master patch: https://review.gluster.org/#/c/19609/

Change-Id: I58d9ab6ef17b30f704dc322e1d3d53b904e5f30e
BUG: 1555261
Signed-off-by: Xavi Hernandez <jahernan@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/133570
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
Tested-by: Ashish Pandey <aspandey@redhat.com>
Tested-by: RHGS Build Bot <nigelb@redhat.com>
---
 tests/bugs/ec/bug-1547662.t       |  41 ++++++++++++++++
 xlators/cluster/ec/src/ec-heal.c  |   9 ++++
 xlators/cluster/ec/src/ec-heald.c |  27 +++++++---
 xlators/cluster/ec/src/ec-heald.h |   4 +-
 xlators/cluster/ec/src/ec.c       | 101 ++++++++++++++++++++++----------------
 5 files changed, 134 insertions(+), 48 deletions(-)
 create mode 100644 tests/bugs/ec/bug-1547662.t

diff --git a/tests/bugs/ec/bug-1547662.t b/tests/bugs/ec/bug-1547662.t
new file mode 100644
index 0000000..5748218
--- /dev/null
+++ b/tests/bugs/ec/bug-1547662.t
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+# Immediately after replace-brick, trusted.ec.version will be absent, so if it
+# is present we can assume that heal was started on root
+function root_heal_attempted {
+        if [ -z $(get_hex_xattr trusted.ec.version $1) ]; then
+                echo "N"
+        else
+                echo "Y"
+        fi
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST ${CLI} volume create ${V0} disperse 6 redundancy 2 ${H0}:${B0}/${V0}{0..5}
+TEST ${CLI} volume start ${V0}
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0
+
+TEST mkdir ${M0}/base
+TEST mkdir ${M0}/base/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+
+TEST ${CLI} volume replace-brick ${V0} ${H0}:${B0}/${V0}5 ${H0}:${B0}/${V0}6 commit force
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0
+EXPECT_WITHIN ${PROCESS_UP_TIMEOUT} "Y" glustershd_up_status
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count_shd ${V0} 0
+EXPECT_WITHIN ${HEAL_TIMEOUT} "Y" root_heal_attempted ${B0}/${V0}6
+EXPECT_WITHIN ${HEAL_TIMEOUT} "^0$" get_pending_heal_count ${V0}
+EXPECT "^127$" echo $(find ${B0}/${V0}6/base -type d | wc -l)
+
+cleanup;
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index b8518d6..8e02986 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -25,6 +25,7 @@
 #include "ec-combine.h"
 #include "ec-method.h"
 #include "ec-fops.h"
+#include "ec-heald.h"
 
 #define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr; })
 #define EC_COUNT(array, max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res; })
@@ -2752,6 +2753,14 @@ ec_replace_heal (ec_t *ec, inode_t *inode)
                 gf_msg_debug (ec->xl->name, 0,
                         "Heal failed for replace brick ret = %d", ret);
 
+        /* Once the root inode has been checked, it might have triggered a
+         * self-heal on it after a replace brick command or for some other
+         * reason. It can also happen that the volume already had damaged
+         * files in the index, even if the heal on the root directory failed.
+         * In both cases we need to wake all index healers to continue
+         * healing remaining entries that are marked as dirty. */
+        ec_shd_index_healer_wake(ec);
+
         loc_wipe (&loc);
         return ret;
 }
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
index b4fa6f8..a703379 100644
--- a/xlators/cluster/ec/src/ec-heald.c
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -184,8 +184,19 @@ ec_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name)
 int
 ec_shd_selfheal (struct subvol_healer *healer, int child, loc_t *loc)
 {
-        return syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL,
-                                NULL);
+        int32_t ret;
+
+        ret = syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL,
+                               NULL);
+        if ((ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
+                /* If we have just healed a directory, it's possible that
+                 * other index entries have appeared to be healed. We put a
+                 * mark so that we can check it later and restart a scan
+                 * without delay. */
+                healer->rerun = _gf_true;
+        }
+
+        return ret;
 }
 
 
@@ -472,11 +483,15 @@ ec_shd_index_healer_spawn (xlator_t *this, int subvol)
 }
 
 void
-ec_selfheal_childup (ec_t *ec, int child)
+ec_shd_index_healer_wake(ec_t *ec)
 {
-        if (!ec->shd.iamshd)
-                return;
-        ec_shd_index_healer_spawn (ec->xl, child);
+        int32_t i;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (((ec->xl_up >> i) & 1) != 0) {
+                        ec_shd_index_healer_spawn(ec->xl, i);
+                }
+        }
 }
 
 int
diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h
index 4ae02e2..2a84881 100644
--- a/xlators/cluster/ec/src/ec-heald.h
+++ b/xlators/cluster/ec/src/ec-heald.h
@@ -20,6 +20,8 @@ ec_xl_op (xlator_t *this, dict_t *input, dict_t *output);
 
 int
 ec_selfheal_daemon_init (xlator_t *this);
-void ec_selfheal_childup (ec_t *ec, int child);
+
+void
+ec_shd_index_healer_wake(ec_t *ec);
 
 #endif /* __EC_HEALD_H__ */
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index bfdca64..956b45b 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -322,7 +322,7 @@ ec_get_event_from_state (ec_t *ec)
                 /* If ec is up but some subvolumes are yet to notify, give
                  * grace time for other subvols to notify to prevent start of
                  * I/O which may result in self-heals */
-                if (ec->timer && ec->xl_notify_count < ec->nodes)
+                if (ec->xl_notify_count < ec->nodes)
                         return GF_EVENT_MAXVAL;
 
                 return GF_EVENT_CHILD_UP;
@@ -344,8 +344,8 @@ ec_up (xlator_t *this, ec_t *ec)
         }
 
         ec->up = 1;
-        gf_msg (this->name, GF_LOG_INFO, 0,
-                EC_MSG_EC_UP, "Going UP");
+        gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP");
+
         gf_event (EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name);
 }
 
@@ -358,8 +358,8 @@ ec_down (xlator_t *this, ec_t *ec)
         }
 
         ec->up = 0;
-        gf_msg (this->name, GF_LOG_INFO, 0,
-                EC_MSG_EC_DOWN, "Going DOWN");
+        gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN");
+
         gf_event (EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name);
 }
 
@@ -383,31 +383,38 @@ ec_notify_cbk (void *data)
                 gf_timer_call_cancel (ec->xl->ctx, ec->timer);
                 ec->timer = NULL;
 
+                /* The timeout has expired, so any subvolume that has not
+                 * already reported its state, will be considered to be down.
+                 * We mark as if all bricks had reported. */
+                ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
+                ec->xl_notify_count = ec->nodes;
+
+                /* Since we have marked all subvolumes as notified, it's
+                 * guaranteed that ec_get_event_from_state() will return
+                 * CHILD_UP or CHILD_DOWN, but not MAXVAL. */
                 event = ec_get_event_from_state (ec);
-                /* If event is still MAXVAL then enough subvolumes didn't
-                 * notify, treat it as CHILD_DOWN. */
-                if (event == GF_EVENT_MAXVAL) {
-                        event = GF_EVENT_CHILD_DOWN;
-                        ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
-                        ec->xl_notify_count = ec->nodes;
-                } else if (event == GF_EVENT_CHILD_UP) {
-                        /* Rest of the bricks are still not coming up,
-                         * notify that ec is up. Files/directories will be
-                         * healed as in when they come up. */
+                if (event == GF_EVENT_CHILD_UP) {
+                        /* We are ready to bring the volume up. If there are
+                         * still bricks DOWN, they will be healed when they
+                         * come up. */
                         ec_up (ec->xl, ec);
                 }
 
-                /* CHILD_DOWN should not come here as no grace period is given
-                 * for notifying CHILD_DOWN. */
-
                 propagate = _gf_true;
         }
 unlock:
         UNLOCK(&ec->lock);
 
         if (propagate) {
+                if ((event == GF_EVENT_CHILD_UP) && ec->shd.iamshd) {
+                        /* We have just brought the volume UP, so we trigger
+                         * a self-heal check on the root directory. */
+                        ec_launch_replace_heal (ec);
+                }
+
                 default_notify (ec->xl, event, NULL);
         }
+
 }
 
 void
@@ -442,7 +449,7 @@ ec_pending_fops_completed(ec_t *ec)
         }
 }
 
-static void
+static gf_boolean_t
 ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
 {
         uintptr_t current_state = 0;
@@ -455,23 +462,28 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
         if (current_state != new_state) {
                 ec->xl_up ^= index_mask;
                 ec->xl_up_count += (current_state ? -1 : 1);
+
+                return _gf_true;
         }
+
+        return _gf_false;
 }
 
 int32_t
 ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 {
-        ec_t             *ec        = this->private;
-        int32_t           idx       = 0;
-        int32_t           error     = 0;
-        glusterfs_event_t old_event = GF_EVENT_MAXVAL;
-        dict_t            *input    = NULL;
-        dict_t            *output   = NULL;
-        gf_boolean_t      propagate = _gf_true;
-        int32_t           orig_event = event;
+        ec_t             *ec              = this->private;
+        int32_t           idx             = 0;
+        int32_t           error           = 0;
+        glusterfs_event_t old_event       = GF_EVENT_MAXVAL;
+        dict_t            *input          = NULL;
+        dict_t            *output         = NULL;
+        gf_boolean_t      propagate       = _gf_true;
+        gf_boolean_t      needs_shd_check = _gf_false;
+        int32_t           orig_event      = event;
         struct gf_upcall *up_data   = NULL;
         struct gf_upcall_cache_invalidation *up_ci = NULL;
-        uintptr_t mask = 0;
+        uintptr_t         mask            = 0;
 
         gf_msg_trace (this->name, 0, "NOTIFY(%d): %p, %p",
                 event, data, data2);
@@ -498,8 +510,6 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 
         for (idx = 0; idx < ec->nodes; idx++) {
                 if (ec->xl_list[idx] == data) {
-                        if (event == GF_EVENT_CHILD_UP)
-                                ec_selfheal_childup (ec, idx);
                         break;
                 }
         }
@@ -525,17 +535,27 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 
                 mask = 1ULL << idx;
                 if (event == GF_EVENT_CHILD_UP) {
-                    ec_set_up_state(ec, mask, mask);
+                        /* We need to trigger a selfheal if a brick changes
+                         * to UP state. */
+                        needs_shd_check = ec_set_up_state(ec, mask, mask);
                 } else if (event == GF_EVENT_CHILD_DOWN) {
-                    ec_set_up_state(ec, mask, 0);
+                        ec_set_up_state(ec, mask, 0);
                 }
 
                 event = ec_get_event_from_state (ec);
 
-                if (event == GF_EVENT_CHILD_UP && !ec->up) {
-                        ec_up (this, ec);
-                } else if (event == GF_EVENT_CHILD_DOWN && ec->up) {
-                        ec_down (this, ec);
+                if (event == GF_EVENT_CHILD_UP) {
+                        if (!ec->up) {
+                                ec_up (this, ec);
+                        }
+                } else {
+                        /* If the volume is not UP, it's irrelevant if one
+                         * brick has come up. We cannot heal anything. */
+                        needs_shd_check = _gf_false;
+
+                        if ((event == GF_EVENT_CHILD_DOWN) && ec->up) {
+                                ec_down (this, ec);
+                        }
                 }
 
                 if (event != GF_EVENT_MAXVAL) {
@@ -554,14 +574,13 @@ unlock:
 
 done:
         if (propagate) {
+                if (needs_shd_check && ec->shd.iamshd) {
+                        ec_launch_replace_heal (ec);
+                }
+
                 error = default_notify (this, event, data);
         }
 
-        if (ec->shd.iamshd &&
-            ec->xl_notify_count == ec->nodes &&
-            event == GF_EVENT_CHILD_UP) {
-                ec_launch_replace_heal (ec);
-        }
 out:
         return error;
 }
-- 
1.8.3.1