d84cf8
From 3e8b3a2c2c6f83635486035fc8040c87d89813d2 Mon Sep 17 00:00:00 2001
d84cf8
From: Xavi Hernandez <xhernandez@redhat.com>
d84cf8
Date: Thu, 2 Jul 2020 18:08:52 +0200
d84cf8
Subject: [PATCH 457/465] cluster/ec: Improve detection of new heals
d84cf8
d84cf8
When EC successfully healed a directory it assumed that maybe other
d84cf8
entries inside that directory could have been created, which could
d84cf8
require additional heal cycles. For this reason, when the heal happened
d84cf8
as part of one index heal iteration, it triggered a new iteration.
d84cf8
d84cf8
The problem happened when the directory was healthy, so no new entries
d84cf8
were added, but its index entry was not removed for some reason. In
d84cf8
this case self-heal started an endless loop healing the same directory
d84cf8
continuously, causing high CPU utilization.
d84cf8
d84cf8
This patch improves detection of new files added to the heal index so
d84cf8
that a new index heal iteration is only triggered if there is new work
d84cf8
to do.
d84cf8
d84cf8
>Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24665/
d84cf8
>Fixes: #1354
d84cf8
d84cf8
Change-Id: I2355742b85fbfa6de758bccc5d2e1a283c82b53f
d84cf8
BUG: 1852736
d84cf8
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
d84cf8
Reviewed-on: https://code.engineering.redhat.com/gerrit/208041
d84cf8
Tested-by: Ashish Pandey <aspandey@redhat.com>
d84cf8
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d84cf8
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
d84cf8
---
d84cf8
 xlators/cluster/ec/src/ec-common.c     |  2 +-
d84cf8
 xlators/cluster/ec/src/ec-heal.c       | 58 +++++++++++++++++++++++-----------
d84cf8
 xlators/cluster/ec/src/ec-heald.c      | 24 ++++++++++----
d84cf8
 xlators/cluster/ec/src/ec-inode-read.c | 27 ++++++++++++++--
d84cf8
 xlators/cluster/ec/src/ec-types.h      |  4 +--
d84cf8
 xlators/cluster/ec/src/ec.h            |  1 +
d84cf8
 6 files changed, 86 insertions(+), 30 deletions(-)
d84cf8
d84cf8
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
d84cf8
index e580bfb..e3f8769 100644
d84cf8
--- a/xlators/cluster/ec/src/ec-common.c
d84cf8
+++ b/xlators/cluster/ec/src/ec-common.c
d84cf8
@@ -230,7 +230,7 @@ ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
d84cf8
 int32_t
d84cf8
 ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
d84cf8
                int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
d84cf8
-               uintptr_t bad, dict_t *xdata)
d84cf8
+               uintptr_t bad, uint32_t pending, dict_t *xdata)
d84cf8
 {
d84cf8
     if (op_ret < 0) {
d84cf8
         gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
d84cf8
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
d84cf8
index 06a7016..e2de879 100644
d84cf8
--- a/xlators/cluster/ec/src/ec-heal.c
d84cf8
+++ b/xlators/cluster/ec/src/ec-heal.c
d84cf8
@@ -72,6 +72,7 @@ struct ec_name_data {
d84cf8
     char *name;
d84cf8
     inode_t *parent;
d84cf8
     default_args_cbk_t *replies;
d84cf8
+    uint32_t heal_pending;
d84cf8
 };
d84cf8
 
d84cf8
 static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
d84cf8
@@ -996,6 +997,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
d84cf8
         ret = -ENOTCONN;
d84cf8
         goto out;
d84cf8
     }
d84cf8
+
d84cf8
 out:
d84cf8
     if (xattr)
d84cf8
         dict_unref(xattr);
d84cf8
@@ -1164,6 +1166,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
d84cf8
     dict_t *xdata = NULL;
d84cf8
     char *linkname = NULL;
d84cf8
     ec_config_t config;
d84cf8
+
d84cf8
     /* There should be just one gfid key */
d84cf8
     EC_REPLIES_ALLOC(replies, ec->nodes);
d84cf8
     if (gfid_db->count != 1) {
d84cf8
@@ -1408,6 +1411,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
d84cf8
 
d84cf8
     ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
d84cf8
                          participants);
d84cf8
+    if (ret >= 0) {
d84cf8
+        /* If ec_create_name() succeeded we return 1 to indicate that a new
d84cf8
+         * file has been created and it will need to be healed. */
d84cf8
+        ret = 1;
d84cf8
+    }
d84cf8
 out:
d84cf8
     cluster_replies_wipe(replies, ec->nodes);
d84cf8
     loc_wipe(&loc);
d84cf8
@@ -1485,18 +1493,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
d84cf8
     ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
d84cf8
                        name_on);
d84cf8
 
d84cf8
-    if (ret < 0)
d84cf8
+    if (ret < 0) {
d84cf8
         memset(name_on, 0, ec->nodes);
d84cf8
+    } else {
d84cf8
+        name_data->heal_pending += ret;
d84cf8
+    }
d84cf8
 
d84cf8
     for (i = 0; i < ec->nodes; i++)
d84cf8
         if (name_data->participants[i] && !name_on[i])
d84cf8
             name_data->failed_on[i] = 1;
d84cf8
+
d84cf8
     return 0;
d84cf8
 }
d84cf8
 
d84cf8
 int
d84cf8
 ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
d84cf8
-              unsigned char *participants)
d84cf8
+              unsigned char *participants, uint32_t *pending)
d84cf8
 {
d84cf8
     int i = 0;
d84cf8
     int j = 0;
d84cf8
@@ -1509,7 +1521,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
d84cf8
     name_data.frame = frame;
d84cf8
     name_data.participants = participants;
d84cf8
     name_data.failed_on = alloca0(ec->nodes);
d84cf8
-    ;
d84cf8
+    name_data.heal_pending = 0;
d84cf8
 
d84cf8
     for (i = 0; i < ec->nodes; i++) {
d84cf8
         if (!participants[i])
d84cf8
@@ -1528,6 +1540,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
d84cf8
             break;
d84cf8
         }
d84cf8
     }
d84cf8
+    *pending += name_data.heal_pending;
d84cf8
+
d84cf8
     loc_wipe(&loc);
d84cf8
     return ret;
d84cf8
 }
d84cf8
@@ -1535,7 +1549,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
d84cf8
 int
d84cf8
 __ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
d84cf8
                 unsigned char *heal_on, unsigned char *sources,
d84cf8
-                unsigned char *healed_sinks)
d84cf8
+                unsigned char *healed_sinks, uint32_t *pending)
d84cf8
 {
d84cf8
     unsigned char *locked_on = NULL;
d84cf8
     unsigned char *output = NULL;
d84cf8
@@ -1580,7 +1594,7 @@ unlock:
d84cf8
         if (sources[i] || healed_sinks[i])
d84cf8
             participants[i] = 1;
d84cf8
     }
d84cf8
-    ret = ec_heal_names(frame, ec, inode, participants);
d84cf8
+    ret = ec_heal_names(frame, ec, inode, participants, pending);
d84cf8
 
d84cf8
     if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
d84cf8
         goto out;
d84cf8
@@ -1601,7 +1615,8 @@ out:
d84cf8
 
d84cf8
 int
d84cf8
 ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
d84cf8
-              unsigned char *sources, unsigned char *healed_sinks)
d84cf8
+              unsigned char *sources, unsigned char *healed_sinks,
d84cf8
+              uint32_t *pending)
d84cf8
 {
d84cf8
     unsigned char *locked_on = NULL;
d84cf8
     unsigned char *up_subvols = NULL;
d84cf8
@@ -1632,7 +1647,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
d84cf8
             goto unlock;
d84cf8
         }
d84cf8
         ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
d84cf8
-                              healed_sinks);
d84cf8
+                              healed_sinks, pending);
d84cf8
     }
d84cf8
 unlock:
d84cf8
     cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
d84cf8
@@ -1953,14 +1968,14 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
d84cf8
             if (fop->cbks.heal) {
d84cf8
                 fop->cbks.heal(fop->req_frame, fop, fop->xl, 0, 0,
d84cf8
                                (heal->good | heal->bad), heal->good, heal->bad,
d84cf8
-                               NULL);
d84cf8
+                               0, NULL);
d84cf8
             }
d84cf8
 
d84cf8
             return EC_STATE_END;
d84cf8
         case -EC_STATE_REPORT:
d84cf8
             if (fop->cbks.heal) {
d84cf8
-                fop->cbks.heal(fop->req_frame, fop, fop->xl, -1, fop->error, 0,
d84cf8
-                               0, 0, NULL);
d84cf8
+                fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
d84cf8
+                               fop->error, 0, 0, 0, 0, NULL);
d84cf8
             }
d84cf8
 
d84cf8
             return EC_STATE_END;
d84cf8
@@ -1997,14 +2012,15 @@ out:
d84cf8
     if (fop != NULL) {
d84cf8
         ec_manager(fop, error);
d84cf8
     } else {
d84cf8
-        func(frame, NULL, this, -1, error, 0, 0, 0, NULL);
d84cf8
+        func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
d84cf8
     }
d84cf8
 }
d84cf8
 
d84cf8
 int32_t
d84cf8
 ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
d84cf8
                    int32_t op_ret, int32_t op_errno, uintptr_t mask,
d84cf8
-                   uintptr_t good, uintptr_t bad, dict_t *xdata)
d84cf8
+                   uintptr_t good, uintptr_t bad, uint32_t pending,
d84cf8
+                   dict_t *xdata)
d84cf8
 {
d84cf8
     ec_fop_data_t *fop = cookie;
d84cf8
     ec_heal_t *heal = fop->data;
d84cf8
@@ -2489,6 +2505,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
d84cf8
     intptr_t mbad = 0;
d84cf8
     intptr_t good = 0;
d84cf8
     intptr_t bad = 0;
d84cf8
+    uint32_t pending = 0;
d84cf8
     ec_fop_data_t *fop = data;
d84cf8
     gf_boolean_t blocking = _gf_false;
d84cf8
     ec_heal_need_t need_heal = EC_HEAL_NONEED;
d84cf8
@@ -2524,7 +2541,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
d84cf8
     if (loc->name && strlen(loc->name)) {
d84cf8
         ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
d84cf8
                            participants);
d84cf8
-        if (ret == 0) {
d84cf8
+        if (ret >= 0) {
d84cf8
             gf_msg_debug(this->name, 0,
d84cf8
                          "%s: name heal "
d84cf8
                          "successful on %" PRIXPTR,
d84cf8
@@ -2542,7 +2559,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
d84cf8
 
d84cf8
     /* Mount triggers heal only when it detects that it must need heal, shd
d84cf8
      * triggers heals periodically which need not be thorough*/
d84cf8
-    if (ec->shd.iamshd) {
d84cf8
+    if (ec->shd.iamshd && (ret <= 0)) {
d84cf8
         ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
d84cf8
                         &need_heal);
d84cf8
 
d84cf8
@@ -2552,13 +2569,15 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
d84cf8
             goto out;
d84cf8
         }
d84cf8
     }
d84cf8
+
d84cf8
     sources = alloca0(ec->nodes);
d84cf8
     healed_sinks = alloca0(ec->nodes);
d84cf8
     if (IA_ISREG(loc->inode->ia_type)) {
d84cf8
         ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
d84cf8
                            healed_sinks);
d84cf8
     } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
d84cf8
-        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks);
d84cf8
+        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
d84cf8
+                            &pending);
d84cf8
     } else {
d84cf8
         ret = 0;
d84cf8
         memcpy(sources, participants, ec->nodes);
d84cf8
@@ -2588,10 +2607,11 @@ out:
d84cf8
     if (fop->cbks.heal) {
d84cf8
         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
d84cf8
                        ec_char_array_to_mask(participants, ec->nodes),
d84cf8
-                       mgood & good, mbad & bad, NULL);
d84cf8
+                       mgood & good, mbad & bad, pending, NULL);
d84cf8
     }
d84cf8
     if (frame)
d84cf8
         STACK_DESTROY(frame->root);
d84cf8
+
d84cf8
     return;
d84cf8
 }
d84cf8
 
d84cf8
@@ -2638,8 +2658,8 @@ void
d84cf8
 ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
d84cf8
 {
d84cf8
     if (fop->cbks.heal) {
d84cf8
-        fop->cbks.heal(fop->req_frame, NULL, ec->xl, -1, fop->error, 0, 0, 0,
d84cf8
-                       NULL);
d84cf8
+        fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
d84cf8
+                       0, 0, NULL);
d84cf8
     }
d84cf8
     ec_fop_data_release(fop);
d84cf8
 }
d84cf8
@@ -2826,7 +2846,7 @@ fail:
d84cf8
     if (fop)
d84cf8
         ec_fop_data_release(fop);
d84cf8
     if (func)
d84cf8
-        func(frame, NULL, this, -1, err, 0, 0, 0, NULL);
d84cf8
+        func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
d84cf8
 }
d84cf8
 
d84cf8
 int
d84cf8
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
d84cf8
index cba111a..4f4b6aa 100644
d84cf8
--- a/xlators/cluster/ec/src/ec-heald.c
d84cf8
+++ b/xlators/cluster/ec/src/ec-heald.c
d84cf8
@@ -156,15 +156,27 @@ int
d84cf8
 ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
d84cf8
                 gf_boolean_t full)
d84cf8
 {
d84cf8
+    dict_t *xdata = NULL;
d84cf8
+    uint32_t count;
d84cf8
     int32_t ret;
d84cf8
 
d84cf8
-    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, NULL);
d84cf8
-    if (!full && (ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
d84cf8
+    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, &xdata);
d84cf8
+    if (!full && (loc->inode->ia_type == IA_IFDIR)) {
d84cf8
         /* If we have just healed a directory, it's possible that
d84cf8
-         * other index entries have appeared to be healed. We put a
d84cf8
-         * mark so that we can check it later and restart a scan
d84cf8
-         * without delay. */
d84cf8
-        healer->rerun = _gf_true;
d84cf8
+         * other index entries have appeared to be healed. */
d84cf8
+        if ((xdata != NULL) &&
d84cf8
+            (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) &&
d84cf8
+            (count > 0)) {
d84cf8
+            /* Force a rerun of the index healer. */
d84cf8
+            gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
d84cf8
+                         count);
d84cf8
+
d84cf8
+            healer->rerun = _gf_true;
d84cf8
+        }
d84cf8
+    }
d84cf8
+
d84cf8
+    if (xdata != NULL) {
d84cf8
+        dict_unref(xdata);
d84cf8
     }
d84cf8
 
d84cf8
     return ret;
d84cf8
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
d84cf8
index f87a94a..e82e8f6 100644
d84cf8
--- a/xlators/cluster/ec/src/ec-inode-read.c
d84cf8
+++ b/xlators/cluster/ec/src/ec-inode-read.c
d84cf8
@@ -393,7 +393,8 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
d84cf8
 int32_t
d84cf8
 ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
d84cf8
                      int32_t op_ret, int32_t op_errno, uintptr_t mask,
d84cf8
-                     uintptr_t good, uintptr_t bad, dict_t *xdata)
d84cf8
+                     uintptr_t good, uintptr_t bad, uint32_t pending,
d84cf8
+                     dict_t *xdata)
d84cf8
 {
d84cf8
     ec_fop_data_t *fop = cookie;
d84cf8
     fop_getxattr_cbk_t func = fop->data;
d84cf8
@@ -402,6 +403,25 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
d84cf8
     char *str;
d84cf8
     char bin1[65], bin2[65];
d84cf8
 
d84cf8
+    /* We try to return the 'pending' information in xdata, but if this cannot
d84cf8
+     * be set, we will ignore it silently. We prefer to report the success or
d84cf8
+     * failure of the heal itself. */
d84cf8
+    if (xdata == NULL) {
d84cf8
+        xdata = dict_new();
d84cf8
+    } else {
d84cf8
+        dict_ref(xdata);
d84cf8
+    }
d84cf8
+    if (xdata != NULL) {
d84cf8
+        if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
d84cf8
+            /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
d84cf8
+             * enforces to check the result in this case. However we don't
d84cf8
+             * really care if it succeeded or not. We'll just do the same.
d84cf8
+             *
d84cf8
+             * This empty 'if' avoids the warning, and it will be removed by
d84cf8
+             * the optimizer. */
d84cf8
+        }
d84cf8
+    }
d84cf8
+
d84cf8
     if (op_ret >= 0) {
d84cf8
         dict = dict_new();
d84cf8
         if (dict == NULL) {
d84cf8
@@ -435,11 +455,14 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
d84cf8
     }
d84cf8
 
d84cf8
 out:
d84cf8
-    func(frame, NULL, xl, op_ret, op_errno, dict, NULL);
d84cf8
+    func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
d84cf8
 
d84cf8
     if (dict != NULL) {
d84cf8
         dict_unref(dict);
d84cf8
     }
d84cf8
+    if (xdata != NULL) {
d84cf8
+        dict_unref(xdata);
d84cf8
+    }
d84cf8
 
d84cf8
     return 0;
d84cf8
 }
d84cf8
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
d84cf8
index 34a9768..f15429d 100644
d84cf8
--- a/xlators/cluster/ec/src/ec-types.h
d84cf8
+++ b/xlators/cluster/ec/src/ec-types.h
d84cf8
@@ -186,10 +186,10 @@ struct _ec_inode {
d84cf8
 
d84cf8
 typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
d84cf8
                                   int32_t, uintptr_t, uintptr_t, uintptr_t,
d84cf8
-                                  dict_t *);
d84cf8
+                                  uint32_t, dict_t *);
d84cf8
 typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
d84cf8
                                    int32_t, uintptr_t, uintptr_t, uintptr_t,
d84cf8
-                                   dict_t *);
d84cf8
+                                   uint32_t, dict_t *);
d84cf8
 
d84cf8
 union _ec_cbk {
d84cf8
     fop_access_cbk_t access;
d84cf8
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
d84cf8
index 1b210d9..6f6de6d 100644
d84cf8
--- a/xlators/cluster/ec/src/ec.h
d84cf8
+++ b/xlators/cluster/ec/src/ec.h
d84cf8
@@ -18,6 +18,7 @@
d84cf8
 #define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
d84cf8
 #define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
d84cf8
 #define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
d84cf8
+#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
d84cf8
 #define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
d84cf8
 #define EC_STRIPE_CACHE_MAX_SIZE 10
d84cf8
 #define EC_VERSION_SIZE 2
d84cf8
-- 
d84cf8
1.8.3.1
d84cf8