Blob Blame History Raw
From 3e8b3a2c2c6f83635486035fc8040c87d89813d2 Mon Sep 17 00:00:00 2001
From: Xavi Hernandez <xhernandez@redhat.com>
Date: Thu, 2 Jul 2020 18:08:52 +0200
Subject: [PATCH 457/465] cluster/ec: Improve detection of new heals

When EC successfully healed a directory it assumed that maybe other
entries inside that directory could have been created, which could
require additional heal cycles. For this reason, when the heal happened
as part of one index heal iteration, it triggered a new iteration.

The problem happened when the directory was healthy, so no new entries
were added, but its index entry was not removed for some reason. In
this case self-heal started an endless loop healing the same directory
continuously, causing high CPU utilization.

This patch improves detection of new files added to the heal index so
that a new index heal iteration is only triggered if there is new work
to do.

>Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24665/
>Fixes: #1354

Change-Id: I2355742b85fbfa6de758bccc5d2e1a283c82b53f
BUG: 1852736
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/208041
Tested-by: Ashish Pandey <aspandey@redhat.com>
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
---
 xlators/cluster/ec/src/ec-common.c     |  2 +-
 xlators/cluster/ec/src/ec-heal.c       | 58 +++++++++++++++++++++++-----------
 xlators/cluster/ec/src/ec-heald.c      | 24 ++++++++++----
 xlators/cluster/ec/src/ec-inode-read.c | 27 ++++++++++++++--
 xlators/cluster/ec/src/ec-types.h      |  4 +--
 xlators/cluster/ec/src/ec.h            |  1 +
 6 files changed, 86 insertions(+), 30 deletions(-)

diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index e580bfb..e3f8769 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -230,7 +230,7 @@ ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
 int32_t
 ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
                int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
-               uintptr_t bad, dict_t *xdata)
+               uintptr_t bad, uint32_t pending, dict_t *xdata)
 {
     if (op_ret < 0) {
         gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 06a7016..e2de879 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -72,6 +72,7 @@ struct ec_name_data {
     char *name;
     inode_t *parent;
     default_args_cbk_t *replies;
+    uint32_t heal_pending;
 };
 
 static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
@@ -996,6 +997,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
         ret = -ENOTCONN;
         goto out;
     }
+
 out:
     if (xattr)
         dict_unref(xattr);
@@ -1164,6 +1166,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
     dict_t *xdata = NULL;
     char *linkname = NULL;
     ec_config_t config;
+
     /* There should be just one gfid key */
     EC_REPLIES_ALLOC(replies, ec->nodes);
     if (gfid_db->count != 1) {
@@ -1408,6 +1411,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
 
     ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
                          participants);
+    if (ret >= 0) {
+        /* If ec_create_name() succeeded we return 1 to indicate that a new
+         * file has been created and it will need to be healed. */
+        ret = 1;
+    }
 out:
     cluster_replies_wipe(replies, ec->nodes);
     loc_wipe(&loc);
@@ -1485,18 +1493,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
     ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
                        name_on);
 
-    if (ret < 0)
+    if (ret < 0) {
         memset(name_on, 0, ec->nodes);
+    } else {
+        name_data->heal_pending += ret;
+    }
 
     for (i = 0; i < ec->nodes; i++)
         if (name_data->participants[i] && !name_on[i])
             name_data->failed_on[i] = 1;
+
     return 0;
 }
 
 int
 ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
-              unsigned char *participants)
+              unsigned char *participants, uint32_t *pending)
 {
     int i = 0;
     int j = 0;
@@ -1509,7 +1521,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
     name_data.frame = frame;
     name_data.participants = participants;
     name_data.failed_on = alloca0(ec->nodes);
-    ;
+    name_data.heal_pending = 0;
 
     for (i = 0; i < ec->nodes; i++) {
         if (!participants[i])
@@ -1528,6 +1540,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
             break;
         }
     }
+    *pending += name_data.heal_pending;
+
     loc_wipe(&loc);
     return ret;
 }
@@ -1535,7 +1549,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
 int
 __ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
                 unsigned char *heal_on, unsigned char *sources,
-                unsigned char *healed_sinks)
+                unsigned char *healed_sinks, uint32_t *pending)
 {
     unsigned char *locked_on = NULL;
     unsigned char *output = NULL;
@@ -1580,7 +1594,7 @@ unlock:
         if (sources[i] || healed_sinks[i])
             participants[i] = 1;
     }
-    ret = ec_heal_names(frame, ec, inode, participants);
+    ret = ec_heal_names(frame, ec, inode, participants, pending);
 
     if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
         goto out;
@@ -1601,7 +1615,8 @@ out:
 
 int
 ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
-              unsigned char *sources, unsigned char *healed_sinks)
+              unsigned char *sources, unsigned char *healed_sinks,
+              uint32_t *pending)
 {
     unsigned char *locked_on = NULL;
     unsigned char *up_subvols = NULL;
@@ -1632,7 +1647,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
             goto unlock;
         }
         ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
-                              healed_sinks);
+                              healed_sinks, pending);
     }
 unlock:
     cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
@@ -1953,14 +1968,14 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
             if (fop->cbks.heal) {
                 fop->cbks.heal(fop->req_frame, fop, fop->xl, 0, 0,
                                (heal->good | heal->bad), heal->good, heal->bad,
-                               NULL);
+                               0, NULL);
             }
 
             return EC_STATE_END;
         case -EC_STATE_REPORT:
             if (fop->cbks.heal) {
-                fop->cbks.heal(fop->req_frame, fop, fop->xl, -1, fop->error, 0,
-                               0, 0, NULL);
+                fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
+                               fop->error, 0, 0, 0, 0, NULL);
             }
 
             return EC_STATE_END;
@@ -1997,14 +2012,15 @@ out:
     if (fop != NULL) {
         ec_manager(fop, error);
     } else {
-        func(frame, NULL, this, -1, error, 0, 0, 0, NULL);
+        func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
     }
 }
 
 int32_t
 ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
                    int32_t op_ret, int32_t op_errno, uintptr_t mask,
-                   uintptr_t good, uintptr_t bad, dict_t *xdata)
+                   uintptr_t good, uintptr_t bad, uint32_t pending,
+                   dict_t *xdata)
 {
     ec_fop_data_t *fop = cookie;
     ec_heal_t *heal = fop->data;
@@ -2489,6 +2505,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
     intptr_t mbad = 0;
     intptr_t good = 0;
     intptr_t bad = 0;
+    uint32_t pending = 0;
     ec_fop_data_t *fop = data;
     gf_boolean_t blocking = _gf_false;
     ec_heal_need_t need_heal = EC_HEAL_NONEED;
@@ -2524,7 +2541,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
     if (loc->name && strlen(loc->name)) {
         ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
                            participants);
-        if (ret == 0) {
+        if (ret >= 0) {
             gf_msg_debug(this->name, 0,
                          "%s: name heal "
                          "successful on %" PRIXPTR,
@@ -2542,7 +2559,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
 
     /* Mount triggers heal only when it detects that it must need heal, shd
      * triggers heals periodically which need not be thorough*/
-    if (ec->shd.iamshd) {
+    if (ec->shd.iamshd && (ret <= 0)) {
         ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
                         &need_heal);
 
@@ -2552,13 +2569,15 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
             goto out;
         }
     }
+
     sources = alloca0(ec->nodes);
     healed_sinks = alloca0(ec->nodes);
     if (IA_ISREG(loc->inode->ia_type)) {
         ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
                            healed_sinks);
     } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
-        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks);
+        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
+                            &pending);
     } else {
         ret = 0;
         memcpy(sources, participants, ec->nodes);
@@ -2588,10 +2607,11 @@ out:
     if (fop->cbks.heal) {
         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
                        ec_char_array_to_mask(participants, ec->nodes),
-                       mgood & good, mbad & bad, NULL);
+                       mgood & good, mbad & bad, pending, NULL);
     }
     if (frame)
         STACK_DESTROY(frame->root);
+
     return;
 }
 
@@ -2638,8 +2658,8 @@ void
 ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
 {
     if (fop->cbks.heal) {
-        fop->cbks.heal(fop->req_frame, NULL, ec->xl, -1, fop->error, 0, 0, 0,
-                       NULL);
+        fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
+                       0, 0, NULL);
     }
     ec_fop_data_release(fop);
 }
@@ -2826,7 +2846,7 @@ fail:
     if (fop)
         ec_fop_data_release(fop);
     if (func)
-        func(frame, NULL, this, -1, err, 0, 0, 0, NULL);
+        func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
 }
 
 int
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
index cba111a..4f4b6aa 100644
--- a/xlators/cluster/ec/src/ec-heald.c
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -156,15 +156,27 @@ int
 ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
                 gf_boolean_t full)
 {
+    dict_t *xdata = NULL;
+    uint32_t count;
     int32_t ret;
 
-    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, NULL);
-    if (!full && (ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
+    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, &xdata);
+    if (!full && (loc->inode->ia_type == IA_IFDIR)) {
         /* If we have just healed a directory, it's possible that
-         * other index entries have appeared to be healed. We put a
-         * mark so that we can check it later and restart a scan
-         * without delay. */
-        healer->rerun = _gf_true;
+         * other index entries have appeared to be healed. */
+        if ((xdata != NULL) &&
+            (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) &&
+            (count > 0)) {
+            /* Force a rerun of the index healer. */
+            gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
+                         count);
+
+            healer->rerun = _gf_true;
+        }
+    }
+
+    if (xdata != NULL) {
+        dict_unref(xdata);
     }
 
     return ret;
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
index f87a94a..e82e8f6 100644
--- a/xlators/cluster/ec/src/ec-inode-read.c
+++ b/xlators/cluster/ec/src/ec-inode-read.c
@@ -393,7 +393,8 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
 int32_t
 ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
                      int32_t op_ret, int32_t op_errno, uintptr_t mask,
-                     uintptr_t good, uintptr_t bad, dict_t *xdata)
+                     uintptr_t good, uintptr_t bad, uint32_t pending,
+                     dict_t *xdata)
 {
     ec_fop_data_t *fop = cookie;
     fop_getxattr_cbk_t func = fop->data;
@@ -402,6 +403,25 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
     char *str;
     char bin1[65], bin2[65];
 
+    /* We try to return the 'pending' information in xdata, but if this cannot
+     * be set, we will ignore it silently. We prefer to report the success or
+     * failure of the heal itself. */
+    if (xdata == NULL) {
+        xdata = dict_new();
+    } else {
+        dict_ref(xdata);
+    }
+    if (xdata != NULL) {
+        if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
+            /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
+             * enforces to check the result in this case. However we don't
+             * really care if it succeeded or not. We'll just do the same.
+             *
+             * This empty 'if' avoids the warning, and it will be removed by
+             * the optimizer. */
+        }
+    }
+
     if (op_ret >= 0) {
         dict = dict_new();
         if (dict == NULL) {
@@ -435,11 +455,14 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
     }
 
 out:
-    func(frame, NULL, xl, op_ret, op_errno, dict, NULL);
+    func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
 
     if (dict != NULL) {
         dict_unref(dict);
     }
+    if (xdata != NULL) {
+        dict_unref(xdata);
+    }
 
     return 0;
 }
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index 34a9768..f15429d 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -186,10 +186,10 @@ struct _ec_inode {
 
 typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
                                   int32_t, uintptr_t, uintptr_t, uintptr_t,
-                                  dict_t *);
+                                  uint32_t, dict_t *);
 typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
                                    int32_t, uintptr_t, uintptr_t, uintptr_t,
-                                   dict_t *);
+                                   uint32_t, dict_t *);
 
 union _ec_cbk {
     fop_access_cbk_t access;
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
index 1b210d9..6f6de6d 100644
--- a/xlators/cluster/ec/src/ec.h
+++ b/xlators/cluster/ec/src/ec.h
@@ -18,6 +18,7 @@
 #define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
 #define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
 #define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
+#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
 #define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
 #define EC_STRIPE_CACHE_MAX_SIZE 10
 #define EC_VERSION_SIZE 2
-- 
1.8.3.1